parsley-ruby 0.0.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,26 @@
1
+ .libs/
2
+ *.o
3
+ *.lo
4
+ dexterc
5
+ dexter
6
+ parsleyc
7
+ parsley
8
+ .deps/
9
+ Makefile
10
+ y.tab.c
11
+ autom4te.cache/
12
+ autoscan.log
13
+ config.log
14
+ configure.scan
15
+ parser.c
16
+ scanner.c
17
+ libparsley.la
18
+ parser.h
19
+ test.log
20
+ parsley*.gem
21
+ ext/cparsley.bundle
22
+ ext/cparsley.so
23
+ ext/Makefile
24
+ ext/conftest.dSYM/
25
+ work
26
+ ext/mkmf.log
data/README ADDED
@@ -0,0 +1,32 @@
1
+ ABOUT
2
+
3
+ Ruby bindings for Parsley.
4
+
5
+ INSTALLATION
6
+
7
+ = Get Parsley and Dependencies =
8
+
9
+ Download Parsley from http://github.com/fizx/parsley/tree/master following the installation directions located at http://github.com/fizx/parsley/blob/master/INSTALL
10
+
11
+ = Install parsley-ruby =
12
+
13
+ From source:
14
+ sudo rake install
15
+
16
+ From GitHub: DEPRECATED!
17
+
18
+ From GemCutter
19
+
20
+ Run the following if you haven't already:
21
+ gem sources -a http://gemcutter.org
22
+ Install the gem:
23
+ sudo gem install parsley-ruby
24
+
25
+ PARSLETS.COM INTEGRATION
26
+
27
+ We also recommend installing the free online_parselets rubygem in order to use other people's parselets and to share your own:
28
+ Run the following if you haven't already:
29
+ gem sources -a http://gems.github.com
30
+ Install the gem:
31
+ sudo gem install iterationlabs-online_parslets
32
+
data/Rakefile ADDED
@@ -0,0 +1,56 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "parsley-ruby"
8
+ gem.summary = "Ruby binding for parsley"
9
+ gem.description = "XML/HTML Parser"
10
+ gem.email = "kyle@kylemaxwell.com"
11
+ gem.homepage = "http://github.com/fizx/parsley-ruby"
12
+ gem.authors = ["Kyle Maxwell"]
13
+ gem.add_dependency("json", ["> 0.0.0"])
14
+ end
15
+ Jeweler::GemcutterTasks.new
16
+ rescue LoadError
17
+ puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
18
+ end
19
+
20
+ require 'rake/testtask'
21
+ Rake::TestTask.new(:test) do |test|
22
+ test.libs << 'lib' << 'test'
23
+ test.pattern = 'test/**/*_test.rb'
24
+ test.verbose = true
25
+ end
26
+
27
+ begin
28
+ require 'rcov/rcovtask'
29
+ Rcov::RcovTask.new do |test|
30
+ test.libs << 'test'
31
+ test.pattern = 'test/**/*_test.rb'
32
+ test.verbose = true
33
+ end
34
+ rescue LoadError
35
+ task :rcov do
36
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
37
+ end
38
+ end
39
+
40
+ task :test => :check_dependencies
41
+
42
+ task :default => :test
43
+
44
+ require 'rake/rdoctask'
45
+ Rake::RDocTask.new do |rdoc|
46
+ if File.exist?('VERSION')
47
+ version = File.read('VERSION')
48
+ else
49
+ version = ""
50
+ end
51
+
52
+ rdoc.rdoc_dir = 'rdoc'
53
+ rdoc.title = "robots #{version}"
54
+ rdoc.rdoc_files.include('README*')
55
+ rdoc.rdoc_files.include('lib/**/*.rb')
56
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.4.1
data/ext/cparsley.c ADDED
@@ -0,0 +1,140 @@
1
+ #include "ruby.h"
2
+ #include <stdio.h>
3
+ #include <libxslt/xslt.h>
4
+ #include <libexslt/exslt.h>
5
+ #include <libxslt/xsltInternals.h>
6
+ #include <libxslt/transform.h>
7
+ #include <libxml/parser.h>
8
+ #include <libxml/HTMLparser.h>
9
+ #include <libxml/HTMLtree.h>
10
+ #include <libxml/xmlwriter.h>
11
+ #include <parsley.h>
12
+ #include <json/json.h>
13
+ #include <xml2json.h>
14
+
15
+ VALUE _new(VALUE, VALUE, VALUE);
16
+ VALUE _parse(VALUE, VALUE);
17
+ VALUE _rb_set_user_agent(VALUE self, VALUE agent);
18
+ VALUE c_parsley_err;
19
+ VALUE c_parsley;
20
+ /* Extension init function: defines the CParsley class and ParsleyError (a RuntimeError subclass), then binds CParsley.new (2 args), CParsley.set_user_agent (1 arg) and CParsley#parse (1 arg) to their C implementations. */
21
+ void Init_cparsley()
22
+ {
23
+ c_parsley = rb_define_class("CParsley", rb_cObject);
24
+ c_parsley_err = rb_define_class("ParsleyError", rb_eRuntimeError);
25
+ rb_define_singleton_method(c_parsley, "new", _new, 2);
26
+ rb_define_singleton_method(c_parsley, "set_user_agent", _rb_set_user_agent, 1);
27
+ rb_define_method(c_parsley, "parse", _parse, 1);
28
+ }
29
+ /* CParsley.new(parsley, incl): compiles the parselet string with parsley_compile (incl is passed through as its second argument). Raises ParsleyError with the compiler's message on failure; otherwise returns the compiled parselet wrapped in a CParsley object whose free function is parsley_free. */
30
+ VALUE
31
+ _new(VALUE self, VALUE parsley, VALUE incl){
32
+ parsleyPtr ptr = parsley_compile(STR2CSTR(parsley), STR2CSTR(incl));
33
+ if(ptr->error != NULL) {
34
+ VALUE msg = rb_str_new2(ptr->error); /* copy the message into a Ruby string so ptr can be freed before raising */
35
+ parsley_free(ptr); /* must happen before rb_raise, which does not return */
36
+ rb_raise(c_parsley_err, "%s", StringValuePtr(msg)); /* "%s" keeps any '%' in the message from being read as a format directive */
37
+ }
38
+
39
+ return Data_Wrap_Struct(c_parsley, 0, parsley_free, ptr);
40
+ }
41
+ /* CParsley.set_user_agent(agent): passes the agent string to parsley_set_user_agent and always returns true. */
42
+ VALUE
43
+ _rb_set_user_agent(VALUE self, VALUE agent) {
44
+ parsley_set_user_agent(STR2CSTR(agent));
45
+ return Qtrue;
46
+ }
47
+ /* Converts a parsley result tree into Ruby objects: an un-namespaced element becomes a Hash keyed by element name (covering the node and its following siblings), a parsley:groups element becomes an Array with one entry per child, and a text node becomes a String. */
48
+ /* Returns Qnil for a NULL node and for every other node type, so an element without children maps to nil (not false) inside the enclosing Hash or Array. */
49
+ static VALUE
50
+ rubify_recurse(xmlNodePtr xml) {
51
+ if(xml == NULL) return Qnil;
52
+ xmlNodePtr child;
53
+ VALUE obj = Qnil;
54
+
55
+ switch(xml->type) {
56
+ case XML_ELEMENT_NODE:
57
+ child = xml->children;
58
+ if(xml->ns == NULL) {
59
+ // Plain element: build one hash from this node and all of its following siblings
60
+ child = xml;
61
+ obj = rb_hash_new();
62
+ while(child != NULL) {
63
+ rb_hash_aset(obj, rb_str_new2((const char*)child->name), rubify_recurse(child->children));
64
+ child = child->next;
65
+ }
66
+ } else if(xml->ns->prefix != NULL && !strcmp((const char*)xml->ns->prefix, "parsley")) {
67
+ // The loop below descends into each child's children, so parsley:group needs no branch of its own
68
+ if(!strcmp((const char*)xml->name, "groups")) {
69
+ obj = rb_ary_new();
70
+ while(child != NULL) {
71
+ rb_ary_push(obj, rubify_recurse(child->children));
72
+ child = child->next;
73
+ }
74
+ }
75
+ }
76
+ break;
77
+ case XML_TEXT_NODE:
78
+ obj = rb_str_new2((const char*)xml->content);
79
+ break;
80
+ default: break; // every other node type maps to nil
81
+ }
82
+ return obj;
83
+ }
84
+
85
+ static VALUE
86
+ _parse_doc(parsedParsleyPtr ptr, VALUE type) {
87
+ if(ptr->error != NULL || ptr->xml == NULL) {
88
+ if(ptr->error == NULL) ptr->error = strdup("Unknown parsley error");
89
+ rb_raise(c_parsley_err, ptr->error);
90
+ parsed_parsley_free(ptr);
91
+ return Qnil;
92
+ }
93
+
94
+ VALUE output;
95
+ if(type == ID2SYM(rb_intern("json"))) {
96
+ struct json_object *json = xml2json(ptr->xml->children->children);
97
+ char* str = json_object_to_json_string(json);
98
+ output = rb_str_new2(str);
99
+ json_object_put(json);
100
+ } else if(type == ID2SYM(rb_intern("xml"))) {
101
+ xmlChar* str;
102
+ int size;
103
+ xmlDocDumpMemory(ptr->xml, &str, &size);
104
+ output = rb_str_new(str, size);
105
+ } else {
106
+ output = rubify_recurse(ptr->xml->children->children);
107
+ if((void*)output == NULL) output = Qnil;
108
+ }
109
+
110
+ parsed_parsley_free(ptr);
111
+
112
+ return output;
113
+ }
114
+
115
+ #define OPT(A) rb_hash_aref(options, ID2SYM(rb_intern(A)))
116
+ #define OPT_BOOL(A) (OPT(A) != Qnil && OPT(A) != Qfalse)
117
+ #define OPT_MATCH(A, B) (rb_hash_aref(options, ID2SYM(rb_intern(A))) == ID2SYM(rb_intern(B)))
118
+ /* CParsley#parse(options): reads the symbol-keyed options hash, ORs together the PARSLEY_OPTIONS_* flags for :input => :html, :prune, :collate, :allow_net, :allow_local and :sgwrap, then parses the file named by :file (when :is_file is truthy) or the text in :string, and converts the result with _parse_doc according to :output. :base is only used on the string path, and only when :has_base is truthy. */
119
+ VALUE _parse(VALUE self, VALUE options){
120
+ parsleyPtr parsley;
121
+ Data_Get_Struct(self, parsleyPtr, parsley); // NOTE(review): the macro casts to `parsleyPtr *` while `parsley` is a `parsleyPtr` -- confirm the intended type argument
122
+ int flags = 0;
123
+ char *base = NULL;
124
+ if(OPT_MATCH("input", "html")) flags |= PARSLEY_OPTIONS_HTML;
125
+ if(OPT_BOOL("prune")) flags |= PARSLEY_OPTIONS_PRUNE;
126
+ if(OPT_BOOL("collate")) flags |= PARSLEY_OPTIONS_COLLATE;
127
+ if(OPT_BOOL("allow_net")) flags |= PARSLEY_OPTIONS_ALLOW_NET;
128
+ if(OPT_BOOL("allow_local")) flags |= PARSLEY_OPTIONS_ALLOW_LOCAL;
129
+ if(OPT_BOOL("sgwrap")) flags |= PARSLEY_OPTIONS_SGWRAP;
130
+ if(OPT_BOOL("has_base")) base = STR2CSTR(OPT("base"));
131
+
132
+ // printf("prune: %d\nallow_net: %d\nallow_local: %d\nhas_base: %d\nflags: %d\n", OPT_BOOL("prune"), OPT_BOOL("allow_net"), OPT_BOOL("allow_local"), OPT_BOOL("has_base"), flags);
133
+
134
+ if(OPT_BOOL("is_file")) {
135
+ return _parse_doc(parsley_parse_file(parsley, STR2CSTR(OPT("file")), flags), OPT("output"));
136
+ } else {
137
+ char * str = STR2CSTR(OPT("string"));
138
+ return _parse_doc(parsley_parse_string(parsley, str, strlen(str), base, flags), OPT("output")); // length comes from strlen, so input is cut at the first NUL byte
139
+ }
140
+ }
data/ext/extconf.rb ADDED
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+ ENV["ARCHFLAGS"] = "-arch #{`uname -p` =~ /powerpc/ ? 'ppc' : 'i386'}"
3
+
4
+ require 'mkmf'
5
+
6
+ ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
7
+ LIBDIR = Config::CONFIG['libdir']
8
+ INCLUDEDIR = Config::CONFIG['includedir']
9
+
10
+ $CFLAGS << " #{ENV["CFLAGS"]}"
11
+ if Config::CONFIG['target_os'] == 'mingw32'
12
+ $CFLAGS << " -DXP_WIN -DXP_WIN32"
13
+ else
14
+ $CFLAGS << " -g -DXP_UNIX"
15
+ end
16
+
17
+ $CFLAGS << " -O3 -Wall -Wextra -Wcast-qual -Wwrite-strings -Wconversion -Wmissing-noreturn -Winline"
18
+
19
+ if Config::CONFIG['target_os'] == 'mingw32'
20
+ find_library('xml2', 'xmlParseDoc',
21
+ File.join(ROOT, 'cross', 'libxml2-2.7.2.win32', 'bin'))
22
+ find_library('xslt', 'xsltParseStylesheetDoc',
23
+ File.join(ROOT, 'cross', 'libxslt-1.1.24.win32', 'bin'))
24
+ else
25
+ find_library('xml2', 'xmlParseDoc', LIBDIR)
26
+ find_library('xslt', 'xsltParseStylesheetDoc', LIBDIR)
27
+ end
28
+
29
+
30
+ if Config::CONFIG['target_os'] == 'mingw32'
31
+ header = File.join(ROOT, 'cross', 'libxml2-2.7.2.win32', 'include')
32
+ unless find_header('libxml/xmlversion.h', header)
33
+ abort "need libxml"
34
+ end
35
+
36
+ header = File.join(ROOT, 'cross', 'libxslt-1.1.24.win32', 'include')
37
+ unless find_header('libxslt/libxslt.h', header)
38
+ abort "need libxslt"
39
+ end
40
+
41
+ header = File.join(ROOT, 'cross', 'iconv-1.9.2.win32', 'include')
42
+ unless find_header('iconv.h', header)
43
+ abort "need iconv"
44
+ end
45
+ else
46
+ unless find_header('libxml/xmlversion.h',
47
+ File.join(INCLUDEDIR, "libxml2"), '/usr/include/libxml2'
48
+ )
49
+ abort "need libxml"
50
+ end
51
+ unless find_header('libxslt/xslt.h', INCLUDEDIR, '/usr/include')
52
+ abort "need libxslt"
53
+ end
54
+
55
+ version = try_constant('LIBXML_VERSION', 'libxml/xmlversion.h')
56
+ end
57
+
58
+ myincl = %w[/usr/local/include /opt/local/include /usr/include]
59
+ mylib = %w[/usr/local/lib /opt/local/lib /usr/lib]
60
+
61
+ find_header('ruby.h', INCLUDEDIR, *myincl) or abort "need ruby.h"
62
+
63
+ find_header('json/json.h', INCLUDEDIR, *myincl) or abort "need json/json.h"
64
+ find_library('json', 'json_object_new_string', LIBDIR, *mylib) or abort "need libjson"
65
+
66
+ find_header('parsley.h', INCLUDEDIR, *myincl) or abort "need parsley.h"
67
+ find_library('parsley', 'parsley_compile', LIBDIR, *mylib) or abort "need libparsley"
68
+
69
+ create_makefile('cparsley')
data/lib/parsley.rb ADDED
@@ -0,0 +1,84 @@
1
+ require File.dirname(__FILE__) + "/../ext/cparsley"
2
+ require "rubygems"
3
+ require "json"
4
+ require "thread"
5
+
6
+ class Parsley
7
+
8
+ def self.user_agent=(agent)
9
+ @user_agent = agent
10
+ CParsley.set_user_agent(agent.to_s)
11
+ end
12
+
13
+ def self.user_agent
14
+ @user_agent
15
+ end
16
+
17
+ def initialize(parsley, incl = "")
18
+ if(parsley.is_a?(Hash))
19
+ parsley = recursive_stringify(parsley).to_json
20
+ end
21
+ @@mutex ||= Mutex.new
22
+ @@mutex.synchronize do
23
+ @parsley = CParsley.new(parsley, incl)
24
+ end
25
+ end
26
+
27
+ # Valid options:
28
+ #
29
+ # Requires one of:
30
+ # :file -- the input file path or url
31
+ # :string -- the input string
32
+ #
33
+ # And optionally (default is the first listed value):
34
+ # :input => [:html, :xml]
35
+ # :output => [:ruby, :json, :xml]
36
+ # :prune => [true, false]
37
+ # :sgwrap => [false, true]
38
+ # :collate => [true, false]
39
+ # :base => "http://some/base/href"
40
+ # :allow_net => [true, false]
41
+ # :allow_local => [true, false]
42
+ def parse(options = {})
43
+ options[:file] || options[:string] || (raise ParsleyError.new("must specify what to parse"))
44
+
45
+ options[:sgwrap] = !!options[:sgwrap]
46
+ options[:is_file] = !!options[:file]
47
+ options[:has_base] = !!options[:base]
48
+
49
+ options[:base] = options[:base].to_s
50
+ options[:file] = options[:file].to_s
51
+ options[:string] = options[:string].to_s
52
+
53
+ options[:input] ||= :html
54
+ options[:output] ||= :ruby
55
+
56
+ options[:collate] = true unless options.has_key?(:collate)
57
+ options[:prune] = true unless options.has_key?(:prune)
58
+ options[:allow_net] = true unless options.has_key?(:allow_net)
59
+ options[:allow_local] = true unless options.has_key?(:allow_local)
60
+
61
+ options[:collate] = !!options[:collate]
62
+ options[:prune] = !!options[:prune]
63
+ options[:allow_net] = !!options[:allow_net]
64
+ options[:allow_local] = !!options[:allow_local]
65
+
66
+ @parsley.parse(options)
67
+ end
68
+ private
69
+
70
+ def recursive_stringify(obj)
71
+ case obj
72
+ when Hash
73
+ obj.inject({}) do |memo, (k, v)|
74
+ memo[k.to_s] = recursive_stringify(v)
75
+ memo
76
+ end
77
+ when Array
78
+ obj.map{|e| recursive_stringify(e) }
79
+ else
80
+ obj.to_s
81
+ end
82
+ end
83
+
84
+ end
@@ -0,0 +1,56 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{parsley-ruby}
8
+ s.version = "0.4.1"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Kyle Maxwell"]
12
+ s.date = %q{2009-11-01}
13
+ s.description = %q{XML/HTML Parser}
14
+ s.email = %q{kyle@kylemaxwell.com}
15
+ s.extra_rdoc_files = [
16
+ "README"
17
+ ]
18
+ s.files = [
19
+ ".gitignore",
20
+ "README",
21
+ "Rakefile",
22
+ "VERSION",
23
+ "ext/cparsley.c",
24
+ "ext/extconf.rb",
25
+ "lib/parsley.rb",
26
+ "parsley-ruby.gemspec",
27
+ "test/test_parsley.rb",
28
+ "test/yelp-benchmark.rb",
29
+ "test/yelp-home.html",
30
+ "test/yelp-home.let",
31
+ "test/yelp.html"
32
+ ]
33
+ s.homepage = %q{http://github.com/fizx/parsley-ruby}
34
+ s.rdoc_options = ["--charset=UTF-8"]
35
+ s.require_paths = ["lib"]
36
+ s.rubygems_version = %q{1.3.5}
37
+ s.summary = %q{Ruby binding for parsley}
38
+ s.test_files = [
39
+ "test/test_parsley.rb",
40
+ "test/yelp-benchmark.rb"
41
+ ]
42
+
43
+ if s.respond_to? :specification_version then
44
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
45
+ s.specification_version = 3
46
+
47
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
48
+ s.add_runtime_dependency(%q<json>, ["> 0.0.0"])
49
+ else
50
+ s.add_dependency(%q<json>, ["> 0.0.0"])
51
+ end
52
+ else
53
+ s.add_dependency(%q<json>, ["> 0.0.0"])
54
+ end
55
+ end
56
+
@@ -0,0 +1,116 @@
1
+ require "test/unit"
2
+ require File.dirname(__FILE__) + "/../lib/parsley"
3
+
4
+ class TestParsley < Test::Unit::TestCase
5
+ def setup
6
+ @page = File.expand_path(File.dirname(__FILE__) + "/yelp.html")
7
+ @home = File.expand_path(File.dirname(__FILE__) + "/yelp-home.html")
8
+ @let = File.expand_path(File.dirname(__FILE__) + "/yelp-home.let")
9
+ end
10
+
11
+ def test_segfault_regression
12
+ simple_html = <<-HTML
13
+ <html>
14
+ <body>
15
+ <h1 class="iCIMS_Header_JobTitle">CEO</h1>
16
+ </body>
17
+ </html>
18
+ HTML
19
+
20
+ struct = {
21
+ 'jobs' => [{
22
+ 'title' => ".iCIMS_Header_JobTitle",
23
+ 'description?' => "blah",
24
+ 'location?' => "blah",
25
+ 'experience?' => "blah",
26
+ 'education?' => "blah"
27
+ }]
28
+ }
29
+ parselet = Parsley.new(struct)
30
+ result = parselet.parse(:string => simple_html)
31
+ assert_equal "CEO", result['jobs'].first['title']
32
+ assert result['jobs'].first['description'].nil?
33
+ end
34
+ #
35
+ # def test_yelp
36
+ # @parsley = Parsley.new(File.read(@let))
37
+ # out = @parsley.parse(:file => @home)
38
+ # assert_equal "/c/sf/shopping", out["categories"][0]["href"]
39
+ # end
40
+ #
41
+ # def test_parsley_should_raise_if_value_syntax_error
42
+ # assert_raises(ParsleyError) do
43
+ # Parsley.new({"foo" => nil})
44
+ # end
45
+ #
46
+ # assert_raises(ParsleyError) do
47
+ # Parsley.new({"foo" => ""})
48
+ # end
49
+ #
50
+ # assert_raises(ParsleyError) do
51
+ # Parsley.new({"foo" => "<<<<<<<<<<<"})
52
+ # end
53
+ # end
54
+ #
55
+ # def test_yelp_xml
56
+ # @parsley = Parsley.new(File.read(@let))
57
+ # out = @parsley.parse(:file => @home, :output => :xml)
58
+ # end
59
+ #
60
+ # def test_broken
61
+ # @parsley = Parsley.new("hi" => "no-ns:match(h1)")
62
+ # assert_raises(ParsleyError) {
63
+ # @parsley.parse(:file => @page)
64
+ # }
65
+ # end
66
+ #
67
+ # def test_simple
68
+ # @parsley = Parsley.new("hi" => "h1")
69
+ # assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:file => @page))
70
+ # end
71
+ #
72
+ # def test_simple_string
73
+ # @parsley = Parsley.new("hi" => "h1")
74
+ # assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:string => "<html><body><h1>Nick's Crispy Tacos</h1></body></html>"))
75
+ # end
76
+ #
77
+ # def test_xml
78
+ # @parsley = Parsley.new("hi" => "h1")
79
+ # xml = "<?xml version=\"1.0\"?>\n<parsley:root xmlns:parsley=\"http://parselets.com/json\"><hi position=\"63\">Nick's Crispy Tacos</hi></parsley:root>\n"
80
+ # assert_equal(xml, @parsley.parse(:file => @page, :output => :xml))
81
+ # end
82
+ #
83
+ # def test_sgwrap
84
+ # @parsley = Parsley.new("hi" => "p sg_wrap")
85
+ # html = "<p><b>hi</b>world</p>"
86
+ # assert_equal({"hi" => "world"}, @parsley.parse(:string => html, :sgwrap => true))
87
+ # end
88
+ #
89
+ # def test_sgwrap_off
90
+ # @parsley = Parsley.new("hi" => "p sg_wrap")
91
+ # html = "<p><b>hi</b>world</p>"
92
+ # assert_raises(ParsleyError) do
93
+ # @parsley.parse(:string => html, :sgwrap => false)
94
+ # end
95
+ # end
96
+ #
97
+ #
98
+ # def test_json
99
+ # @parsley = Parsley.new("hi" => "h1")
100
+ # assert_equal('{ "hi": "Nick\'s Crispy Tacos" }', @parsley.parse(:file => @page, :output => :json))
101
+ # end
102
+ #
103
+ # def test_rescuable_file_error
104
+ # @parsley = Parsley.new("hi" => "h1")
105
+ # @nonexistant_file = File.dirname(__FILE__) + "/../fixtures/yelp.html"
106
+ # assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:file => @nonexistant_file)) rescue nil
107
+ # end
108
+ #
109
+ # def test_array_string
110
+ # @parsley = Parsley.new({"foo" => ["li"]})
111
+ # out = @parsley.parse(:file => @page)
112
+ # assert_kind_of Hash, out
113
+ # assert_kind_of Array, out["foo"], out.inspect
114
+ # assert out["foo"].length > 1
115
+ # end
116
+ end
@@ -0,0 +1,53 @@
1
+ require "rubygems"
2
+ require "nokogiri"
3
+ require "hpricot"
4
+ require "parsley"
5
+ require "benchmark"
6
+ require "pp"
7
+
8
+ YELP_HTML = File.dirname(__FILE__) + "/yelp.html"
9
+
10
+ def noko
11
+ parse Nokogiri.Hpricot(File.open(YELP_HTML))
12
+ end
13
+
14
+ def hpri
15
+ parse Hpricot(File.open(YELP_HTML))
16
+ end
17
+
18
+ def parse(doc)
19
+ out = {}
20
+ out["name"] = (doc / "h1").first.inner_text
21
+ out["phone"] = (doc / "#bizPhone").first.inner_text
22
+ out["address"] = (doc / "address").first.inner_text
23
+ out["reviews"] = (doc / ".nonfavoriteReview").map do |node|
24
+ review = {}
25
+ review["date"] = (node / ".ieSucks .smaller").first.inner_text
26
+ review["user_name"] = (node / ".reviewer_info a").first.inner_text
27
+ review["comment"] = (node / ".review_comment").first.inner_text
28
+ review
29
+ end
30
+ end
31
+
32
+ def pars
33
+ parslet = Parsley.new({
34
+ "name" => "h1",
35
+ "phone" => "#bizPhone",
36
+ "address" => "address",
37
+ "reviews(.nonfavoriteReview)" => [
38
+ {
39
+ "date" => ".ieSucks .smaller",
40
+ "user_name" => ".reviewer_info a",
41
+ "comment" => ".review_comment"
42
+ }
43
+ ]
44
+ })
45
+ pp parslet.parse(:file => YELP_HTML)
46
+ end
47
+
48
+ Benchmark.bm do |x|
49
+ x.report("nokogiri: ") { 3.times { noko } }
50
+ x.report("hpricot: ") { 3.times { hpri } }
51
+ x.report("parsley: ") { 3.times { pars } }
52
+ end
53
+