fizx-parsley-ruby 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/cparsley.c +118 -0
- data/lib/parsley.rb +39 -0
- data/parsley-ruby.gemspec +28 -0
- data/test/test_parsley.rb +56 -0
- data/test/yelp-benchmark.rb +53 -0
- data/test/yelp-home.html +1004 -0
- data/test/yelp-home.let +6 -0
- data/test/yelp.html +2329 -0
- metadata +10 -11
data/ext/cparsley.c
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <libxslt/xslt.h>
|
4
|
+
#include <libexslt/exslt.h>
|
5
|
+
#include <libxslt/xsltInternals.h>
|
6
|
+
#include <libxslt/transform.h>
|
7
|
+
#include <libxml/parser.h>
|
8
|
+
#include <libxml/HTMLparser.h>
|
9
|
+
#include <libxml/HTMLtree.h>
|
10
|
+
#include <libxml/xmlwriter.h>
|
11
|
+
#include <parsley.h>
|
12
|
+
#include <json/json.h>
|
13
|
+
#include <xml2json.h>
|
14
|
+
|
15
|
+
// Forward declarations for the functions defined later in this file.
// _new, _parse_file and _parse_string are registered as Ruby methods in
// Init_cparsley(); each takes the receiver followed by its Ruby arguments.
VALUE _new(VALUE, VALUE, VALUE);
|
16
|
+
VALUE _parse_file(VALUE, VALUE, VALUE, VALUE);
|
17
|
+
VALUE _parse_string(VALUE, VALUE, VALUE, VALUE);
|
18
|
+
// Shared back end of the two parse methods: turns a parse result into a Ruby value.
VALUE _parse_doc(parsedParsleyPtr, VALUE);
|
19
|
+
// Recursive converter from a libxml2 node to Ruby objects.
VALUE rubify_recurse(xmlNodePtr xml);
|
20
|
+
// Class handles, assigned in Init_cparsley(): ParsleyError and CParsley.
VALUE c_parsley_err;
|
21
|
+
VALUE c_parsley;
|
22
|
+
|
23
|
+
void Init_cparsley()
|
24
|
+
{
|
25
|
+
c_parsley = rb_define_class("CParsley", rb_cObject);
|
26
|
+
c_parsley_err = rb_define_class("ParsleyError", rb_eRuntimeError);
|
27
|
+
rb_define_singleton_method(c_parsley, "new", _new, 2);
|
28
|
+
rb_define_method(c_parsley, "parse_file", _parse_file, 3);
|
29
|
+
rb_define_method(c_parsley, "parse_string", _parse_string, 3);
|
30
|
+
}
|
31
|
+
|
32
|
+
/*
 * CParsley.new(parsley, incl)
 *
 * Compiles the parselet text `parsley` together with the `incl` string via
 * parsley_compile() and wraps the resulting handle in a CParsley object whose
 * free function is parsley_free().
 *
 * Raises ParsleyError, carrying the handle's error text, when the compiled
 * handle reports an error.
 */
VALUE _new(VALUE self, VALUE parsley, VALUE incl){
  parsleyPtr ptr = parsley_compile(STR2CSTR(parsley), STR2CSTR(incl));

  if(ptr->error != NULL) {
    /*
     * Raising does not return to this function, so the handle must be
     * released before the exception is raised or it is never freed. The
     * exception object is created first because the message is only
     * reachable through the handle. rb_exc_new2() takes the text verbatim,
     * whereas rb_raise() would interpret it as a printf-style format string.
     */
    VALUE exc = rb_exc_new2(c_parsley_err, ptr->error);
    parsley_free(ptr);
    rb_exc_raise(exc);
  }

  return Data_Wrap_Struct(c_parsley, 0, parsley_free, ptr);
}
|
42
|
+
|
43
|
+
/*
 * CParsley#parse_file(name, input, output)
 *
 * Runs the wrapped parselet over the file named by `name` using
 * parsley_parse_file() and hands the result to _parse_doc(), which converts
 * it according to `output`. The flag passed to the parser is true only when
 * `input` is the symbol :html.
 */
VALUE _parse_file(VALUE self, VALUE name, VALUE input, VALUE output){
  parsleyPtr compiled;
  char *path;
  int as_html;

  Data_Get_Struct(self, parsleyPtr, compiled);

  path = STR2CSTR(name);
  as_html = (input == ID2SYM(rb_intern("html")));

  return _parse_doc(parsley_parse_file(compiled, path, as_html), output);
}
|
48
|
+
|
49
|
+
/*
 * CParsley#parse_string(string, input, output)
 *
 * Runs the wrapped parselet over the in-memory document `string` using
 * parsley_parse_string() and hands the result to _parse_doc(), which converts
 * it according to `output`. The document length is measured with strlen(),
 * and the flag passed to the parser is true only when `input` is :html.
 */
VALUE _parse_string(VALUE self, VALUE string, VALUE input, VALUE output) {
  parsleyPtr compiled;
  char *text;
  int as_html;

  Data_Get_Struct(self, parsleyPtr, compiled);

  text = STR2CSTR(string);
  as_html = (input == ID2SYM(rb_intern("html")));

  return _parse_doc(parsley_parse_string(compiled, text, strlen(text), as_html), output);
}
|
55
|
+
|
56
|
+
/*
 * Converts a parse result into the Ruby value selected by `type`, then
 * releases the result with parsed_parsley_free().
 *
 *   :json          -> String with the JSON text produced from the result tree
 *   :xml           -> String with the serialized XML document
 *   anything else  -> Ruby objects built by rubify_recurse(), or nil
 *
 * Raises ParsleyError when the result carries an error message or has no XML
 * document; "Unknown parsley error" is used when no message is available.
 */
VALUE _parse_doc(parsedParsleyPtr ptr, VALUE type) {
  VALUE output = Qnil;

  if(ptr->error != NULL || ptr->xml == NULL) {
    /*
     * Raising does not return here, so the result has to be freed first.
     * The exception object is built beforehand because the message is only
     * reachable through the result, and rb_exc_new2() takes the text verbatim
     * instead of treating it as a printf-style format string.
     */
    VALUE exc = rb_exc_new2(c_parsley_err,
        ptr->error != NULL ? ptr->error : "Unknown parsley error");
    parsed_parsley_free(ptr);
    rb_exc_raise(exc);
  }

  if(type == ID2SYM(rb_intern("json"))) {
    struct json_object *json = xml2json(ptr->xml->children->children);
    /* The returned text belongs to `json`; copy it before dropping the reference. */
    const char *str = json_object_to_json_string(json);
    output = rb_str_new2(str);
    json_object_put(json);
  } else if(type == ID2SYM(rb_intern("xml"))) {
    xmlChar *str = NULL;
    int size = 0;
    xmlDocDumpMemory(ptr->xml, &str, &size);
    output = rb_str_new((const char *) str, size);
    /* xmlDocDumpMemory() allocates the buffer; the caller must xmlFree() it. */
    if(str != NULL) xmlFree(str);
  } else {
    output = rubify_recurse(ptr->xml->children->children);
    /* A zero VALUE (no node to convert) is reported to Ruby as nil. */
    if(output == (VALUE) NULL) output = Qnil;
  }

  parsed_parsley_free(ptr);

  return output;
}
|
84
|
+
|
85
|
+
/*
 * Recursively converts a node of the parsley result tree into Ruby objects.
 *
 *   - element without a namespace: a Hash with one entry for `xml` and one for
 *     each of its following siblings, keyed by node name, whose values are the
 *     conversion of that node's first child
 *   - element named "groups" in a namespace with prefix "parsley": an Array
 *     with one entry per child, each the conversion of that child's first child
 *   - text node: a String holding the node content
 *   - anything else, including a NULL node: nil
 */
VALUE rubify_recurse(xmlNodePtr xml) {
  xmlNodePtr child;
  VALUE obj = Qnil;

  /*
   * Qnil, not NULL: NULL is not a Ruby object, and as a VALUE it is 0, which
   * Ruby reads as `false`. Returning it would store false for every empty
   * element in the hashes and arrays built below.
   */
  if(xml == NULL) return Qnil;

  switch(xml->type) {
    case XML_ELEMENT_NODE:
      if(xml->ns == NULL) {
        obj = rb_hash_new();
        /* Starts at `xml` itself so that all siblings land in one Hash. */
        for(child = xml; child != NULL; child = child->next) {
          rb_hash_aset(obj, rb_str_new2((const char *) child->name), rubify_recurse(child->children));
        }
      } else if(xml->ns->prefix != NULL && !strcmp((const char *) xml->ns->prefix, "parsley")) {
        /* prefix is NULL for a default namespace; strcmp() must not see it. */
        if(!strcmp((const char *) xml->name, "groups")) {
          obj = rb_ary_new();
          /*
           * Each child is converted through its own children, so the
           * parsley:group wrappers need no case of their own.
           */
          for(child = xml->children; child != NULL; child = child->next) {
            rb_ary_push(obj, rubify_recurse(child->children));
          }
        }
      }
      break;
    case XML_TEXT_NODE:
      obj = rb_str_new2((const char *) xml->content);
      break;
    default:
      /* Other node types are not converted. */
      break;
  }
  return obj;
}
|
data/lib/parsley.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../ext/cparsley"
|
2
|
+
require "rubygems"
|
3
|
+
require "json"
|
4
|
+
require "thread"
|
5
|
+
|
6
|
+
# Ruby-facing wrapper around the CParsley extension class: builds a CParsley
# from a parselet and applies it to a file or a string.
class Parsley
  # Serializes calls to CParsley.new. Created once when the class is loaded,
  # so concurrent first calls cannot each create their own lock.
  COMPILE_MUTEX = Mutex.new

  # parsley -- the parselet, as a String or as a Hash (a Hash is converted
  #            with to_json first)
  # incl    -- passed unchanged as the second argument of CParsley.new
  def initialize(parsley, incl = "")
    parsley = parsley.to_json if parsley.is_a?(Hash)
    COMPILE_MUTEX.synchronize do
      @parsley = CParsley.new(parsley, incl)
    end
  end

  # Valid options:
  #
  # Requires one of:
  # :file -- the input file path
  # :string -- the input string
  #
  # And optionally:
  # :input => [:xml, :html]
  # :output => [:json, :xml, :ruby]
  # :allow_empty -- If false, throws an exception if any value is empty.
  #                 NOTE: this method does not read :allow_empty or pass it
  #                 on to CParsley, so it has no effect here.
  #
  # Defaults are :input => :html, :output => :ruby
  #
  # When both :file and :string are given, :file wins. The options hash is
  # only read, never modified.
  #
  # Raises ArgumentError when neither :file nor :string is given.
  def parse(options = {})
    unless options[:file] || options[:string]
      raise ArgumentError, "must specify what to parse"
    end

    input  = options[:input]  || :html
    output = options[:output] || :ruby

    if options[:file]
      @parsley.parse_file(options[:file], input, output)
    else
      @parsley.parse_string(options[:string], input, output)
    end
  end
end
|
data/parsley-ruby.gemspec
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# Gem specification for parsley-ruby 0.1.2: metadata, the native extension
# build script, the packaged files, rdoc settings and the json dependency.
Gem::Specification.new do |spec|
  # Identity and contact details
  spec.name        = "parsley-ruby"
  spec.version     = "0.1.2"
  spec.date        = "2008-08-10"
  spec.authors     = ["Kyle Maxwell"]
  spec.email       = "kyle@kylemaxwell.com"
  spec.homepage    = "http://github.com/fizx/parsley-ruby"
  spec.summary     = "Ruby binding for parsley"
  spec.description = "Ruby binding for parsley"

  # Load paths and the extconf script that builds the C extension
  spec.require_paths = ["lib", "ext"]
  spec.extensions    = "ext/extconf.rb"

  # Files shipped in the gem
  spec.files = [
    "ext/cparsley.c",
    "ext/extconf.rb",
    "lib/parsley.rb",
    "parsley-ruby.gemspec",
    "README",
    "test/test_parsley.rb",
    "test/yelp-benchmark.rb",
    "test/yelp-home.html",
    "test/yelp-home.let",
    "test/yelp.html"
  ]

  # Documentation settings
  spec.has_rdoc         = true
  spec.rdoc_options     = ["--main", "README"]
  spec.extra_rdoc_files = ["README"]

  # Runtime dependency: any released version of json
  spec.add_dependency("json", ["> 0.0.0"])
end
|
data/test/test_parsley.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require File.dirname(__FILE__) + "/../lib/parsley"
|
3
|
+
|
4
|
+
# Tests for the Parsley wrapper, run against the Yelp fixtures stored next to
# this file.
class TestParsley < Test::Unit::TestCase
  # Resolves the fixture paths used by the tests below.
  def setup
    @page = File.expand_path(File.dirname(__FILE__) + "/yelp.html")
    @home = File.expand_path(File.dirname(__FILE__) + "/yelp-home.html")
    @let = File.expand_path(File.dirname(__FILE__) + "/yelp-home.let")
  end

  def test_yelp
    @parsley = Parsley.new(File.read(@let))
    out = @parsley.parse(:file => @home)
    assert_equal "/c/sf/shopping", out["categories"][0]["href"]
  end

  # :output => :xml returns the serialized document as a String.
  def test_yelp_xml
    @parsley = Parsley.new(File.read(@let))
    out = @parsley.parse(:file => @home, :output => :xml)
    assert_kind_of String, out
    assert !out.empty?
  end

  def test_simple
    @parsley = Parsley.new("hi" => "h1")
    assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:file => @page))
  end

  def test_simple_string
    @parsley = Parsley.new("hi" => "h1")
    assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:string => "<html><body><h1>Nick's Crispy Tacos</h1></body></html>"))
  end

  def test_xml
    @parsley = Parsley.new("hi" => "h1")
    xml = "<?xml version=\"1.0\"?>\n<parsley:root xmlns:parsley=\"http://parslets.com/json\"><hi>Nick's Crispy Tacos</hi></parsley:root>\n"
    assert_equal(xml, @parsley.parse(:file => @page, :output => :xml))
  end

  def test_json
    @parsley = Parsley.new("hi" => "h1")
    assert_equal('{ "hi": "Nick\'s Crispy Tacos" }', @parsley.parse(:file => @page, :output => :json))
  end

  # Parsing a missing file must surface as a rescuable ParsleyError. A trailing
  # `rescue nil` is deliberately not used: it would also swallow assertion
  # failures, so the test could never fail.
  def test_rescuable_file_error
    @parsley = Parsley.new("hi" => "h1")
    nonexistent_file = File.dirname(__FILE__) + "/../fixtures/yelp.html"
    assert_raise(ParsleyError) { @parsley.parse(:file => nonexistent_file) }
  end

  def test_array_string
    @parsley = Parsley.new({"foo" => ["li"]})
    out = @parsley.parse(:file => @page)
    assert_kind_of Hash, out
    assert_kind_of Array, out["foo"], out.inspect
    assert out["foo"].length > 1
  end
end
|
data/test/yelp-benchmark.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "nokogiri"
|
3
|
+
require "hpricot"
|
4
|
+
require "parsley"
|
5
|
+
require "benchmark"
|
6
|
+
require "pp"
|
7
|
+
|
8
|
+
# Path of the Yelp fixture page, located in the same directory as this script.
YELP_HTML = "#{File.dirname(__FILE__)}/yelp.html"
|
9
|
+
|
10
|
+
# Benchmark case: load the Yelp fixture through Nokogiri.Hpricot and run the
# shared extraction in `parse` on it.
def noko
  # Block form closes the file when the block exits; a bare File.open would
  # leave one handle open per benchmark iteration.
  File.open(YELP_HTML) { |io| parse Nokogiri.Hpricot(io) }
end
|
13
|
+
|
14
|
+
# Benchmark case: load the Yelp fixture through Hpricot and run the shared
# extraction in `parse` on it.
def hpri
  # Block form closes the file when the block exits; a bare File.open would
  # leave one handle open per benchmark iteration.
  File.open(YELP_HTML) { |io| parse Hpricot(io) }
end
|
17
|
+
|
18
|
+
# Extracts the business name, phone, address and reviews from a parsed page.
#
# doc -- a document that answers `/` with a selector string and returns a
#        collection of nodes responding to `inner_text` (and to `/` themselves)
#
# Returns a Hash with the keys "name", "phone", "address" and "reviews"; the
# latter is an Array of Hashes with "date", "user_name" and "comment".
def parse(doc)
  out = {}
  out["name"] = (doc / "h1").first.inner_text
  out["phone"] = (doc / "#bizPhone").first.inner_text
  out["address"] = (doc / "address").first.inner_text
  out["reviews"] = (doc / ".nonfavoriteReview").map do |node|
    {
      "date" => (node / ".ieSucks .smaller").first.inner_text,
      "user_name" => (node / ".reviewer_info a").first.inner_text,
      "comment" => (node / ".review_comment").first.inner_text
    }
  end
  # Return the whole record; ending on the "reviews" assignment would hand
  # back only the reviews array.
  out
end
|
31
|
+
|
32
|
+
# Benchmark case: build a parselet describing the same fields that `parse`
# extracts, apply it to the Yelp fixture and pretty-print the result.
def pars
  review_fields = {
    "date" => ".ieSucks .smaller",
    "user_name" => ".reviewer_info a",
    "comment" => ".review_comment"
  }
  page_fields = {
    "name" => "h1",
    "phone" => "#bizPhone",
    "address" => "address",
    "reviews(.nonfavoriteReview)" => [review_fields]
  }

  extracted = Parsley.new(page_fields).parse(:file => YELP_HTML)
  pp extracted
end
|
47
|
+
|
48
|
+
# Time three runs of each implementation against the same fixture and print
# one labelled report line per implementation.
Benchmark.bm do |bench|
  bench.report("nokogiri: ") do
    3.times { noko }
  end

  bench.report("hpricot: ") do
    3.times { hpri }
  end

  bench.report("parsley: ") do
    3.times { pars }
  end
end
|
53
|
+
|