parsley-ruby 0.0.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,26 @@
1
+ .libs/
2
+ *.o
3
+ *.lo
4
+ dexterc
5
+ dexter
6
+ parsleyc
7
+ parsley
8
+ .deps/
9
+ Makefile
10
+ y.tab.c
11
+ autom4te.cache/
12
+ autoscan.log
13
+ config.log
14
+ configure.scan
15
+ parser.c
16
+ scanner.c
17
+ libparsley.la
18
+ parser.h
19
+ test.log
20
+ parsley*.gem
21
+ ext/cparsley.bundle
22
+ ext/cparsley.so
23
+ ext/Makefile
24
+ ext/conftest.dSYM/
25
+ work
26
+ ext/mkmf.log
data/README ADDED
@@ -0,0 +1,32 @@
1
+ ABOUT
2
+
3
+ Ruby bindings for Parsley.
4
+
5
+ INSTALLATION
6
+
7
+ = Get Parsley and Dependencies =
8
+
9
+ Download Parsley from http://github.com/fizx/parsley/tree/master and follow the installation directions at http://github.com/fizx/parsley/blob/master/INSTALL
10
+
11
+ = Install parsley-ruby =
12
+
13
+ From source:
14
+ sudo rake install
15
+
16
+ From GitHub: DEPRECATED!
17
+
18
+ From GemCutter:
19
+
20
+ Run the following if you haven't already:
21
+ gem sources -a http://gemcutter.org
22
+ Install the gem:
23
+ sudo gem install parsley-ruby
24
+
25
+ PARSLETS.COM INTEGRATION
26
+
27
+ We also recommend installing the free online_parselets rubygem in order to use other people's parselets and to share your own:
28
+ Run the following if you haven't already:
29
+ gem sources -a http://gems.github.com
30
+ Install the gem:
31
+ sudo gem install iterationlabs-online_parslets
32
+
data/Rakefile ADDED
@@ -0,0 +1,56 @@
1
# Rake tasks for the parsley-ruby gem: packaging (Jeweler), unit tests,
# coverage (RCov) and RDoc generation.
require 'rubygems'
require 'rake'

# Glob shared by the :test and :rcov tasks. The suite lives in
# test/test_parsley.rb, so the glob has to follow the test_*.rb naming;
# the previous '*_test.rb' glob matched no file at all.
TEST_PATTERN = 'test/**/test_*.rb'

# Gem packaging/release tasks. When Jeweler cannot be loaded the remaining
# tasks are still defined and only a hint is printed.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "parsley-ruby"
    gem.summary = "Ruby binding for parsley"
    gem.description = "XML/HTML Parser"
    gem.email = "kyle@kylemaxwell.com"
    gem.homepage = "http://github.com/fizx/parsley-ruby"
    gem.authors = ["Kyle Maxwell"]
    gem.add_dependency("json", ["> 0.0.0"])
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = TEST_PATTERN
  test.verbose = true
end

# Coverage task; replaced by an aborting stub when RCov is not installed.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = TEST_PATTERN
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is only defined once the Jeweler tasks loaded; wiring it
# in unconditionally makes `rake test` fail with an unknown-task error on
# machines without Jeweler.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # VERSION ends with a newline; strip it so the title stays on one line.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "parsley-ruby #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.4.1
data/ext/cparsley.c ADDED
@@ -0,0 +1,140 @@
1
+ #include "ruby.h"
2
+ #include <stdio.h>
3
+ #include <libxslt/xslt.h>
4
+ #include <libexslt/exslt.h>
5
+ #include <libxslt/xsltInternals.h>
6
+ #include <libxslt/transform.h>
7
+ #include <libxml/parser.h>
8
+ #include <libxml/HTMLparser.h>
9
+ #include <libxml/HTMLtree.h>
10
+ #include <libxml/xmlwriter.h>
11
+ #include <parsley.h>
12
+ #include <json/json.h>
13
+ #include <xml2json.h>
14
+
15
+ VALUE _new(VALUE, VALUE, VALUE);
16
+ VALUE _parse(VALUE, VALUE);
17
+ VALUE _rb_set_user_agent(VALUE self, VALUE agent);
18
+ VALUE c_parsley_err;
19
+ VALUE c_parsley;
20
+
21
+ void Init_cparsley()
22
+ {
23
+ c_parsley = rb_define_class("CParsley", rb_cObject);
24
+ c_parsley_err = rb_define_class("ParsleyError", rb_eRuntimeError);
25
+ rb_define_singleton_method(c_parsley, "new", _new, 2);
26
+ rb_define_singleton_method(c_parsley, "set_user_agent", _rb_set_user_agent, 1);
27
+ rb_define_method(c_parsley, "parse", _parse, 1);
28
+ }
29
+
30
+ VALUE
31
+ _new(VALUE self, VALUE parsley, VALUE incl){
32
+ parsleyPtr ptr = parsley_compile(STR2CSTR(parsley), STR2CSTR(incl));
33
+ if(ptr->error != NULL) {
34
+ rb_raise(c_parsley_err, ptr->error);
35
+ parsley_free(ptr);
36
+ return Qnil;
37
+ }
38
+
39
+ return Data_Wrap_Struct(c_parsley, 0, parsley_free, ptr);
40
+ }
41
+
42
+ VALUE
43
+ _rb_set_user_agent(VALUE self, VALUE agent) {
44
+ parsley_set_user_agent(STR2CSTR(agent));
45
+ return Qtrue;
46
+ }
47
+
48
+
49
+ static VALUE
50
+ rubify_recurse(xmlNodePtr xml) {
51
+ if(xml == NULL) return NULL;
52
+ xmlNodePtr child;
53
+ VALUE obj = Qnil;
54
+
55
+ switch(xml->type) {
56
+ case XML_ELEMENT_NODE:
57
+ child = xml->children;
58
+ if(xml->ns == NULL) {
59
+ child = xml;
60
+ obj = rb_hash_new();
61
+ while(child != NULL) {
62
+ rb_hash_aset(obj, rb_str_new2(child->name), rubify_recurse(child->children));
63
+ child = child->next;
64
+ }
65
+ } else if(!strcmp(xml->ns->prefix, "parsley")) {
66
+ if(!strcmp(xml->name, "groups")) {
67
+ obj = rb_ary_new();
68
+ while(child != NULL) {
69
+ rb_ary_push(obj, rubify_recurse(child->children));
70
+ child = child->next;
71
+ }
72
+ } else if(!strcmp(xml->name, "group")) {
73
+ // Implicitly handled by parsley:groups handler
74
+ }
75
+ }
76
+ break;
77
+ case XML_TEXT_NODE:
78
+ obj = rb_str_new2(xml->content);
79
+ break;
80
+ }
81
+ // inspect(obj);
82
+ return obj;
83
+ }
84
+
85
+ static VALUE
86
+ _parse_doc(parsedParsleyPtr ptr, VALUE type) {
87
+ if(ptr->error != NULL || ptr->xml == NULL) {
88
+ if(ptr->error == NULL) ptr->error = strdup("Unknown parsley error");
89
+ rb_raise(c_parsley_err, ptr->error);
90
+ parsed_parsley_free(ptr);
91
+ return Qnil;
92
+ }
93
+
94
+ VALUE output;
95
+ if(type == ID2SYM(rb_intern("json"))) {
96
+ struct json_object *json = xml2json(ptr->xml->children->children);
97
+ char* str = json_object_to_json_string(json);
98
+ output = rb_str_new2(str);
99
+ json_object_put(json);
100
+ } else if(type == ID2SYM(rb_intern("xml"))) {
101
+ xmlChar* str;
102
+ int size;
103
+ xmlDocDumpMemory(ptr->xml, &str, &size);
104
+ output = rb_str_new(str, size);
105
+ } else {
106
+ output = rubify_recurse(ptr->xml->children->children);
107
+ if((void*)output == NULL) output = Qnil;
108
+ }
109
+
110
+ parsed_parsley_free(ptr);
111
+
112
+ return output;
113
+ }
114
+
115
+ #define OPT(A) rb_hash_aref(options, ID2SYM(rb_intern(A)))
116
+ #define OPT_BOOL(A) (OPT(A) != Qnil && OPT(A) != Qfalse)
117
+ #define OPT_MATCH(A, B) (rb_hash_aref(options, ID2SYM(rb_intern(A))) == ID2SYM(rb_intern(B)))
118
+
119
+ VALUE _parse(VALUE self, VALUE options){
120
+ parsleyPtr parsley;
121
+ Data_Get_Struct(self, parsleyPtr, parsley);
122
+ int flags = 0;
123
+ char *base = NULL;
124
+ if(OPT_MATCH("input", "html")) flags |= PARSLEY_OPTIONS_HTML;
125
+ if(OPT_BOOL("prune")) flags |= PARSLEY_OPTIONS_PRUNE;
126
+ if(OPT_BOOL("collate")) flags |= PARSLEY_OPTIONS_COLLATE;
127
+ if(OPT_BOOL("allow_net")) flags |= PARSLEY_OPTIONS_ALLOW_NET;
128
+ if(OPT_BOOL("allow_local")) flags |= PARSLEY_OPTIONS_ALLOW_LOCAL;
129
+ if(OPT_BOOL("sgwrap")) flags |= PARSLEY_OPTIONS_SGWRAP;
130
+ if(OPT_BOOL("has_base")) base = STR2CSTR(OPT("base"));
131
+
132
+ // printf("prune: %d\nallow_net: %d\nallow_local: %d\nhas_base: %d\nflags: %d\n", OPT_BOOL("prune"), OPT_BOOL("allow_net"), OPT_BOOL("allow_local"), OPT_BOOL("has_base"), flags);
133
+
134
+ if(OPT_BOOL("is_file")) {
135
+ return _parse_doc(parsley_parse_file(parsley, STR2CSTR(OPT("file")), flags), OPT("output"));
136
+ } else {
137
+ char * str = STR2CSTR(OPT("string"));
138
+ return _parse_doc(parsley_parse_string(parsley, str, strlen(str), base, flags), OPT("output"));
139
+ }
140
+ }
data/ext/extconf.rb ADDED
@@ -0,0 +1,69 @@
1
#!/usr/bin/env ruby
# mkmf script for the cparsley extension: locates libxml2, libxslt, json-c
# and libparsley (headers and libraries) and writes the Makefile.

# NOTE(review): this overrides any ARCHFLAGS from the environment and selects
# i386 on every non-PowerPC host — confirm this is still wanted on 64-bit
# machines.
ENV["ARCHFLAGS"] = "-arch #{`uname -p` =~ /powerpc/ ? 'ppc' : 'i386'}"

require 'mkmf'
require 'rbconfig'

# RbConfig is the supported name of the build configuration module; the old
# top-level Config alias is deprecated and absent from current Rubies.
ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
LIBDIR = RbConfig::CONFIG['libdir']
INCLUDEDIR = RbConfig::CONFIG['includedir']

# True for the mingw32 target, which uses the libraries bundled under cross/.
WINDOWS = RbConfig::CONFIG['target_os'] == 'mingw32'

$CFLAGS << " #{ENV["CFLAGS"]}"
$CFLAGS << (WINDOWS ? " -DXP_WIN -DXP_WIN32" : " -g -DXP_UNIX")
$CFLAGS << " -O3 -Wall -Wextra -Wcast-qual -Wwrite-strings -Wconversion -Wmissing-noreturn -Winline"

if WINDOWS
  find_library('xml2', 'xmlParseDoc',
               File.join(ROOT, 'cross', 'libxml2-2.7.2.win32', 'bin'))
  find_library('xslt', 'xsltParseStylesheetDoc',
               File.join(ROOT, 'cross', 'libxslt-1.1.24.win32', 'bin'))

  header = File.join(ROOT, 'cross', 'libxml2-2.7.2.win32', 'include')
  abort "need libxml" unless find_header('libxml/xmlversion.h', header)

  header = File.join(ROOT, 'cross', 'libxslt-1.1.24.win32', 'include')
  abort "need libxslt" unless find_header('libxslt/libxslt.h', header)

  header = File.join(ROOT, 'cross', 'iconv-1.9.2.win32', 'include')
  abort "need iconv" unless find_header('iconv.h', header)
else
  find_library('xml2', 'xmlParseDoc', LIBDIR)
  find_library('xslt', 'xsltParseStylesheetDoc', LIBDIR)

  unless find_header('libxml/xmlversion.h',
                     File.join(INCLUDEDIR, "libxml2"), '/usr/include/libxml2')
    abort "need libxml"
  end
  abort "need libxslt" unless find_header('libxslt/xslt.h', INCLUDEDIR, '/usr/include')

  # The result is not used; the probe is kept so the mkmf output is unchanged.
  try_constant('LIBXML_VERSION', 'libxml/xmlversion.h')
end

# Extra search locations tried after the directories Ruby was configured with.
myincl = %w[/usr/local/include /opt/local/include /usr/include]
mylib = %w[/usr/local/lib /opt/local/lib /usr/lib]

find_header('ruby.h', INCLUDEDIR, *myincl) or abort "need ruby.h"

find_header('json/json.h', INCLUDEDIR, *myincl) or abort "need json/json.h"
find_library('json', 'json_object_new_string', LIBDIR, *mylib) or abort "need libjson"

find_header('parsley.h', INCLUDEDIR, *myincl) or abort "need parsley.h"
find_library('parsley', 'parsley_compile', LIBDIR, *mylib) or abort "need libparsley"

create_makefile('cparsley')
data/lib/parsley.rb ADDED
@@ -0,0 +1,84 @@
1
+ require File.dirname(__FILE__) + "/../ext/cparsley"
2
+ require "rubygems"
3
+ require "json"
4
+ require "thread"
5
+
6
# Ruby front end for the CParsley extension: compiles a parselet once and
# applies it to input documents.
class Parsley

  # Guards parselet compilation. It is created when the class body is
  # evaluated, so two threads can never end up with different locks (a lazily
  # initialized lock can be created twice under contention).
  COMPILE_MUTEX = Mutex.new

  # Remembers +agent+ and passes its string form to the C extension.
  def self.user_agent=(agent)
    @user_agent = agent
    CParsley.set_user_agent(agent.to_s)
  end

  # The value last assigned through Parsley.user_agent=, or nil.
  def self.user_agent
    @user_agent
  end

  # parsley -- parselet source: either a String handed to the extension as
  #            is, or a Hash that is stringified and serialized to JSON
  # incl    -- extra include text passed through to the extension
  #
  # Errors raised by CParsley.new propagate to the caller.
  def initialize(parsley, incl = "")
    parsley = recursive_stringify(parsley).to_json if parsley.is_a?(Hash)
    COMPILE_MUTEX.synchronize do
      @parsley = CParsley.new(parsley, incl)
    end
  end

  # Valid options:
  #
  # Requires one of:
  # :file -- the input file path or url
  # :string -- the input string
  #
  # And optionally (default is the first listed value):
  # :input => [:html, :xml]
  # :output => [:ruby, :json, :xml]
  # :prune => [true, false]
  # :sgwrap => [false, true]
  # :collate => [true, false]
  # :base => "http://some/base/href"
  # :allow_net => [true, false]
  # :allow_local => [true, false]
  #
  # Raises ParsleyError when neither :file nor :string is given.
  #
  # The caller's hash is left untouched. Normalizing it in place stored
  # :file => "" after a :string parse, and because "" is truthy a second
  # call with the same hash was treated as a file parse.
  def parse(options = {})
    options[:file] || options[:string] || (raise ParsleyError.new("must specify what to parse"))

    opts = options.dup

    # Flags derived from the raw values, before those are coerced to strings.
    opts[:sgwrap]   = !!opts[:sgwrap]
    opts[:is_file]  = !!opts[:file]
    opts[:has_base] = !!opts[:base]

    # The extension reads these three keys as strings unconditionally.
    [:base, :file, :string].each { |key| opts[key] = opts[key].to_s }

    opts[:input]  ||= :html
    opts[:output] ||= :ruby

    # These switches default to true when absent and are normalized to
    # strict booleans.
    [:collate, :prune, :allow_net, :allow_local].each do |key|
      opts[key] = opts.has_key?(key) ? !!opts[key] : true
    end

    @parsley.parse(opts)
  end

  private

  # Returns a copy of +obj+ in which every Hash key and every leaf value is a
  # String; Hashes and Arrays are walked recursively.
  def recursive_stringify(obj)
    case obj
    when Hash
      obj.inject({}) do |memo, (k, v)|
        memo[k.to_s] = recursive_stringify(v)
        memo
      end
    when Array
      obj.map { |e| recursive_stringify(e) }
    else
      obj.to_s
    end
  end

end
@@ -0,0 +1,56 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{parsley-ruby}
8
+ s.version = "0.4.1"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Kyle Maxwell"]
12
+ s.date = %q{2009-11-01}
13
+ s.description = %q{XML/HTML Parser}
14
+ s.email = %q{kyle@kylemaxwell.com}
15
+ s.extra_rdoc_files = [
16
+ "README"
17
+ ]
18
+ s.files = [
19
+ ".gitignore",
20
+ "README",
21
+ "Rakefile",
22
+ "VERSION",
23
+ "ext/cparsley.c",
24
+ "ext/extconf.rb",
25
+ "lib/parsley.rb",
26
+ "parsley-ruby.gemspec",
27
+ "test/test_parsley.rb",
28
+ "test/yelp-benchmark.rb",
29
+ "test/yelp-home.html",
30
+ "test/yelp-home.let",
31
+ "test/yelp.html"
32
+ ]
33
+ s.homepage = %q{http://github.com/fizx/parsley-ruby}
34
+ s.rdoc_options = ["--charset=UTF-8"]
35
+ s.require_paths = ["lib"]
36
+ s.rubygems_version = %q{1.3.5}
37
+ s.summary = %q{Ruby binding for parsley}
38
+ s.test_files = [
39
+ "test/test_parsley.rb",
40
+ "test/yelp-benchmark.rb"
41
+ ]
42
+
43
+ if s.respond_to? :specification_version then
44
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
45
+ s.specification_version = 3
46
+
47
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
48
+ s.add_runtime_dependency(%q<json>, ["> 0.0.0"])
49
+ else
50
+ s.add_dependency(%q<json>, ["> 0.0.0"])
51
+ end
52
+ else
53
+ s.add_dependency(%q<json>, ["> 0.0.0"])
54
+ end
55
+ end
56
+
@@ -0,0 +1,116 @@
1
+ require "test/unit"
2
+ require File.dirname(__FILE__) + "/../lib/parsley"
3
+
4
+ class TestParsley < Test::Unit::TestCase
5
# Resolves the absolute paths of the fixture files that sit next to this
# test file.
def setup
  fixture_dir = File.dirname(__FILE__)
  @page = File.expand_path("yelp.html", fixture_dir)
  @home = File.expand_path("yelp-home.html", fixture_dir)
  @let = File.expand_path("yelp-home.let", fixture_dir)
end
10
+
11
# Regression test named after a segfault: a parselet made of one required
# key and several optional ("?") keys whose selectors match nothing must
# still return the required value and leave the optional ones unset.
def test_segfault_regression
  simple_html = <<-HTML
    <html>
    <body>
    <h1 class="iCIMS_Header_JobTitle">CEO</h1>
    </body>
    </html>
  HTML

  struct = {
    'jobs' => [{
      'title' => ".iCIMS_Header_JobTitle",
      'description?' => "blah",
      'location?' => "blah",
      'experience?' => "blah",
      'education?' => "blah"
    }]
  }
  parselet = Parsley.new(struct)
  result = parselet.parse(:string => simple_html)
  first_job = result['jobs'].first

  assert_equal "CEO", first_job['title']
  # assert_nil reports the offending value on failure, unlike assert(x.nil?).
  assert_nil first_job['description']
end
34
+ #
35
+ # def test_yelp
36
+ # @parsley = Parsley.new(File.read(@let))
37
+ # out = @parsley.parse(:file => @home)
38
+ # assert_equal "/c/sf/shopping", out["categories"][0]["href"]
39
+ # end
40
+ #
41
+ # def test_parsley_should_raise_if_value_syntax_error
42
+ # assert_raises(ParsleyError) do
43
+ # Parsley.new({"foo" => nil})
44
+ # end
45
+ #
46
+ # assert_raises(ParsleyError) do
47
+ # Parsley.new({"foo" => ""})
48
+ # end
49
+ #
50
+ # assert_raises(ParsleyError) do
51
+ # Parsley.new({"foo" => "<<<<<<<<<<<"})
52
+ # end
53
+ # end
54
+ #
55
+ # def test_yelp_xml
56
+ # @parsley = Parsley.new(File.read(@let))
57
+ # out = @parsley.parse(:file => @home, :output => :xml)
58
+ # end
59
+ #
60
+ # def test_broken
61
+ # @parsley = Parsley.new("hi" => "no-ns:match(h1)")
62
+ # assert_raises(ParsleyError) {
63
+ # @parsley.parse(:file => @page)
64
+ # }
65
+ # end
66
+ #
67
+ # def test_simple
68
+ # @parsley = Parsley.new("hi" => "h1")
69
+ # assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:file => @page))
70
+ # end
71
+ #
72
+ # def test_simple_string
73
+ # @parsley = Parsley.new("hi" => "h1")
74
+ # assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:string => "<html><body><h1>Nick's Crispy Tacos</h1></body></html>"))
75
+ # end
76
+ #
77
+ # def test_xml
78
+ # @parsley = Parsley.new("hi" => "h1")
79
+ # xml = "<?xml version=\"1.0\"?>\n<parsley:root xmlns:parsley=\"http://parselets.com/json\"><hi position=\"63\">Nick's Crispy Tacos</hi></parsley:root>\n"
80
+ # assert_equal(xml, @parsley.parse(:file => @page, :output => :xml))
81
+ # end
82
+ #
83
+ # def test_sgwrap
84
+ # @parsley = Parsley.new("hi" => "p sg_wrap")
85
+ # html = "<p><b>hi</b>world</p>"
86
+ # assert_equal({"hi" => "world"}, @parsley.parse(:string => html, :sgwrap => true))
87
+ # end
88
+ #
89
+ # def test_sgwrap_off
90
+ # @parsley = Parsley.new("hi" => "p sg_wrap")
91
+ # html = "<p><b>hi</b>world</p>"
92
+ # assert_raises(ParsleyError) do
93
+ # @parsley.parse(:string => html, :sgwrap => false)
94
+ # end
95
+ # end
96
+ #
97
+ #
98
+ # def test_json
99
+ # @parsley = Parsley.new("hi" => "h1")
100
+ # assert_equal('{ "hi": "Nick\'s Crispy Tacos" }', @parsley.parse(:file => @page, :output => :json))
101
+ # end
102
+ #
103
+ # def test_rescuable_file_error
104
+ # @parsley = Parsley.new("hi" => "h1")
105
+ # @nonexistant_file = File.dirname(__FILE__) + "/../fixtures/yelp.html"
106
+ # assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:file => @nonexistant_file)) rescue nil
107
+ # end
108
+ #
109
+ # def test_array_string
110
+ # @parsley = Parsley.new({"foo" => ["li"]})
111
+ # out = @parsley.parse(:file => @page)
112
+ # assert_kind_of Hash, out
113
+ # assert_kind_of Array, out["foo"], out.inspect
114
+ # assert out["foo"].length > 1
115
+ # end
116
+ end
@@ -0,0 +1,53 @@
1
+ require "rubygems"
2
+ require "nokogiri"
3
+ require "hpricot"
4
+ require "parsley"
5
+ require "benchmark"
6
+ require "pp"
7
+
8
+ YELP_HTML = File.dirname(__FILE__) + "/yelp.html"
9
+
10
# Benchmark subject: extracts the page data through Nokogiri's Hpricot
# compatibility entry point. The block form closes the file handle after each
# run instead of leaving it open.
def noko
  File.open(YELP_HTML) { |html| parse Nokogiri.Hpricot(html) }
end
13
+
14
# Benchmark subject: extracts the page data with Hpricot. The block form
# closes the file handle after each run instead of leaving it open.
def hpri
  File.open(YELP_HTML) { |html| parse Hpricot(html) }
end
17
+
18
# Extracts name, phone, address and reviews from +doc+, any object that
# answers `/ selector` with a list of nodes responding to #inner_text.
#
# Returns the assembled Hash. The method used to end on the "reviews"
# assignment and therefore returned only the reviews Array.
def parse(doc)
  out = {}
  out["name"] = (doc / "h1").first.inner_text
  out["phone"] = (doc / "#bizPhone").first.inner_text
  out["address"] = (doc / "address").first.inner_text
  out["reviews"] = (doc / ".nonfavoriteReview").map do |node|
    {
      "date" => (node / ".ieSucks .smaller").first.inner_text,
      "user_name" => (node / ".reviewer_info a").first.inner_text,
      "comment" => (node / ".review_comment").first.inner_text
    }
  end
  out
end
31
+
32
# Benchmark subject: the same extraction expressed as a parselet, run against
# the fixture file; the parsed result is pretty-printed.
def pars
  review_fields = {
    "date" => ".ieSucks .smaller",
    "user_name" => ".reviewer_info a",
    "comment" => ".review_comment"
  }
  structure = {
    "name" => "h1",
    "phone" => "#bizPhone",
    "address" => "address",
    "reviews(.nonfavoriteReview)" => [review_fields]
  }

  parslet = Parsley.new(structure)
  pp parslet.parse(:file => YELP_HTML)
end
47
+
48
# Times three consecutive runs of each extraction strategy.
Benchmark.bm do |bench|
  [["nokogiri: ", :noko], ["hpricot: ", :hpri], ["parsley: ", :pars]].each do |label, runner|
    bench.report(label) { 3.times { send(runner) } }
  end
end
53
+