fizx-parsley-ruby 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/cparsley.c +118 -0
- data/lib/parsley.rb +39 -0
- data/parsley-ruby.gemspec +28 -0
- data/test/test_parsley.rb +56 -0
- data/test/yelp-benchmark.rb +53 -0
- data/test/yelp-home.html +1004 -0
- data/test/yelp-home.let +6 -0
- data/test/yelp.html +2329 -0
- metadata +10 -11
data/ext/cparsley.c
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <libxslt/xslt.h>
|
4
|
+
#include <libexslt/exslt.h>
|
5
|
+
#include <libxslt/xsltInternals.h>
|
6
|
+
#include <libxslt/transform.h>
|
7
|
+
#include <libxml/parser.h>
|
8
|
+
#include <libxml/HTMLparser.h>
|
9
|
+
#include <libxml/HTMLtree.h>
|
10
|
+
#include <libxml/xmlwriter.h>
|
11
|
+
#include <parsley.h>
|
12
|
+
#include <json/json.h>
|
13
|
+
#include <xml2json.h>
|
14
|
+
|
15
|
+
VALUE _new(VALUE, VALUE, VALUE);
|
16
|
+
VALUE _parse_file(VALUE, VALUE, VALUE, VALUE);
|
17
|
+
VALUE _parse_string(VALUE, VALUE, VALUE, VALUE);
|
18
|
+
VALUE _parse_doc(parsedParsleyPtr, VALUE);
|
19
|
+
VALUE rubify_recurse(xmlNodePtr xml);
|
20
|
+
VALUE c_parsley_err;
|
21
|
+
VALUE c_parsley;
|
22
|
+
|
23
|
+
void Init_cparsley()
|
24
|
+
{
|
25
|
+
c_parsley = rb_define_class("CParsley", rb_cObject);
|
26
|
+
c_parsley_err = rb_define_class("ParsleyError", rb_eRuntimeError);
|
27
|
+
rb_define_singleton_method(c_parsley, "new", _new, 2);
|
28
|
+
rb_define_method(c_parsley, "parse_file", _parse_file, 3);
|
29
|
+
rb_define_method(c_parsley, "parse_string", _parse_string, 3);
|
30
|
+
}
|
31
|
+
|
32
|
+
/*
 * CParsley.new(parsley, incl) -> CParsley
 *
 * Compiles the parselet source `parsley` (with include text `incl`) via
 * parsley_compile() and wraps the resulting handle in a Ruby object whose
 * free function is parsley_free().
 *
 * Raises ParsleyError when compilation reports an error.
 */
VALUE _new(VALUE self, VALUE parsley, VALUE incl){
  char *source = StringValueCStr(parsley);
  char *include = StringValueCStr(incl);
  parsleyPtr ptr = parsley_compile(source, include);

  if(ptr == NULL) {
    rb_raise(c_parsley_err, "%s", "Unknown parsley error");
  }

  if(ptr->error != NULL) {
    /*
     * rb_raise()/rb_exc_raise() do not return, so anything placed after them
     * never runs. Copy the message into a Ruby exception object first, free
     * the native handle, and only then raise. Building the exception from
     * the message (rather than passing it as a format string) also keeps any
     * '%' characters in the error text from being interpreted.
     */
    VALUE err = rb_exc_new2(c_parsley_err, ptr->error);
    parsley_free(ptr);
    rb_exc_raise(err);
  }

  return Data_Wrap_Struct(c_parsley, 0, parsley_free, ptr);
}
|
42
|
+
|
43
|
+
/*
 * CParsley#parse_file(name, input, output)
 *
 * Runs the compiled parselet held by `self` against the file at path `name`.
 * `input` selects HTML parsing when it is the symbol :html (any other value
 * selects the non-HTML path of parsley_parse_file). `output` is forwarded to
 * _parse_doc(), which converts the result and may raise ParsleyError.
 */
VALUE _parse_file(VALUE self, VALUE name, VALUE input, VALUE output){
  parsleyPtr parsley;
  char *path;
  int is_html;

  /*
   * Data_Get_Struct(obj, type, var) casts to `type *`; handing it the
   * pointer typedef would produce a parsleyPtr* assigned to a parsleyPtr.
   * Fetch the wrapped pointer directly instead.
   */
  Check_Type(self, T_DATA);
  parsley = (parsleyPtr) DATA_PTR(self);

  path = StringValueCStr(name);
  is_html = (input == ID2SYM(rb_intern("html")));

  return _parse_doc(parsley_parse_file(parsley, path, is_html), output);
}
|
48
|
+
|
49
|
+
/*
 * CParsley#parse_string(string, input, output)
 *
 * Runs the compiled parselet held by `self` against the in-memory document
 * `string`. `input` selects HTML parsing when it is the symbol :html.
 * `output` is forwarded to _parse_doc(), which converts the result and may
 * raise ParsleyError.
 */
VALUE _parse_string(VALUE self, VALUE string, VALUE input, VALUE output) {
  parsleyPtr parsley;
  int is_html;

  /*
   * Data_Get_Struct(obj, type, var) casts to `type *`; handing it the
   * pointer typedef would produce a parsleyPtr* assigned to a parsleyPtr.
   * Fetch the wrapped pointer directly instead.
   */
  Check_Type(self, T_DATA);
  parsley = (parsleyPtr) DATA_PTR(self);

  /* Raises TypeError for non-strings (after trying to_str). */
  StringValue(string);
  is_html = (input == ID2SYM(rb_intern("html")));

  /*
   * Use the length Ruby already tracks: strlen() would stop at the first
   * embedded NUL byte and needlessly rescan the whole buffer.
   */
  return _parse_doc(parsley_parse_string(parsley, RSTRING_PTR(string), RSTRING_LEN(string), is_html), output);
}
|
55
|
+
|
56
|
+
/*
 * Converts a parse result into a Ruby value and releases the result.
 *
 * ptr  - result of parsley_parse_file()/parsley_parse_string(); always freed
 *        here with parsed_parsley_free(), on success and on failure.
 * type - :json returns the JSON text as a String, :xml returns the serialized
 *        XML document as a String, anything else returns nested Ruby
 *        Hash/Array/String objects built by rubify_recurse() (nil when the
 *        result document has no content).
 *
 * Raises ParsleyError when the result carries an error message or has no
 * XML document.
 */
VALUE _parse_doc(parsedParsleyPtr ptr, VALUE type) {
  VALUE output = Qnil;
  xmlNodePtr root;
  xmlNodePtr content;

  if(ptr == NULL) {
    rb_raise(c_parsley_err, "%s", "Unknown parsley error");
  }

  if(ptr->error != NULL || ptr->xml == NULL) {
    /*
     * rb_raise()/rb_exc_raise() never return, so the result must be freed
     * BEFORE raising. The message is copied into the exception object first,
     * and is never used as a printf-style format string.
     */
    VALUE err = rb_exc_new2(c_parsley_err,
                            ptr->error != NULL ? ptr->error : "Unknown parsley error");
    parsed_parsley_free(ptr);
    rb_exc_raise(err);
  }

  /* The payload lives below the document's root element; tolerate a
     document that has no root at all. */
  root = ptr->xml->children;
  content = (root != NULL) ? root->children : NULL;

  if(type == ID2SYM(rb_intern("json"))) {
    struct json_object *json = xml2json(content);
    const char *str = json_object_to_json_string(json);
    /* Copy into a Ruby String before dropping the json object that owns str. */
    output = (str != NULL) ? rb_str_new2(str) : Qnil;
    json_object_put(json);
  } else if(type == ID2SYM(rb_intern("xml"))) {
    xmlChar *str = NULL;
    int size = 0;
    xmlDocDumpMemory(ptr->xml, &str, &size);
    if(str != NULL) {
      output = rb_str_new((const char *) str, size);
      /* The dump buffer is allocated by libxml2 and owned by the caller. */
      xmlFree(str);
    }
  } else {
    output = rubify_recurse(content);
    /* rubify_recurse() has historically signalled "nothing" with a zero
       VALUE, which is Qfalse; surface that to Ruby as nil. */
    if(output == Qfalse) output = Qnil;
  }

  parsed_parsley_free(ptr);

  return output;
}
|
84
|
+
|
85
|
+
/*
 * Recursively converts a parsley result subtree into Ruby objects.
 *
 * - NULL node                      -> nil
 * - element without a namespace    -> Hash mapping the name of this node and
 *                                     of each following sibling to the
 *                                     conversion of that node's children
 * - <parsley:groups>               -> Array with one entry per child, each
 *                                     being the conversion of that child's
 *                                     children
 * - text node                      -> String with the node's content
 * - anything else                  -> nil
 *
 * Previously a NULL node produced a C NULL, which as a VALUE is Qfalse, so
 * empty elements showed up in Ruby as `false`; they are now `nil`.
 */
VALUE rubify_recurse(xmlNodePtr xml) {
  xmlNodePtr child;
  VALUE obj = Qnil;

  if(xml == NULL) return Qnil;

  switch(xml->type) {
    case XML_ELEMENT_NODE:
      if(xml->ns == NULL) {
        /* Walk this node and its siblings: together they form one Hash. */
        obj = rb_hash_new();
        for(child = xml; child != NULL; child = child->next) {
          rb_hash_aset(obj, rb_str_new2((const char *) child->name), rubify_recurse(child->children));
        }
      } else if(xml->ns->prefix != NULL && !strcmp((const char *) xml->ns->prefix, "parsley")) {
        /* A default namespace has a NULL prefix, hence the guard above. */
        if(!strcmp((const char *) xml->name, "groups")) {
          obj = rb_ary_new();
          for(child = xml->children; child != NULL; child = child->next) {
            rb_ary_push(obj, rubify_recurse(child->children));
          }
        }
        /* <parsley:group> needs no case of its own: the groups loop above
           descends straight into each group's children. */
      }
      break;
    case XML_TEXT_NODE:
      obj = rb_str_new2((const char *) xml->content);
      break;
    default:
      break;
  }

  return obj;
}
|
data/lib/parsley.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../ext/cparsley"
|
2
|
+
require "rubygems"
|
3
|
+
require "json"
|
4
|
+
require "thread"
|
5
|
+
|
6
|
+
# Ruby-level wrapper around the native CParsley extension.
class Parsley
  # Serializes calls to CParsley.new. Created once at class-definition time;
  # creating it lazily inside #initialize would let two threads race and end
  # up holding different mutexes.
  COMPILE_MUTEX = Mutex.new

  # Compiles a parselet.
  #
  # parsley - the parselet, either as a JSON String or as a Hash (which is
  #           converted with #to_json)
  # incl    - include text passed through to the compiler (default "")
  def initialize(parsley, incl = "")
    parsley = parsley.to_json if parsley.is_a?(Hash)
    COMPILE_MUTEX.synchronize do
      @parsley = CParsley.new(parsley, incl)
    end
  end

  # Runs the compiled parselet against a document.
  #
  # Requires one of:
  #   :file   -- the input file path
  #   :string -- the input string
  # (:file wins when both are given.)
  #
  # And optionally:
  #   :input  => [:xml, :html]          (default :html)
  #   :output => [:json, :xml, :ruby]   (default :ruby)
  #
  # NOTE: an :allow_empty option used to be documented here, but this method
  # does not read it and nothing forwards it to the extension.
  #
  # The options hash is not modified.
  #
  # Raises ArgumentError when neither :file nor :string is given.
  def parse(options = {})
    unless options[:file] || options[:string]
      raise ArgumentError, "must specify what to parse"
    end

    input  = options[:input]  || :html
    output = options[:output] || :ruby

    if options[:file]
      @parsley.parse_file(options[:file], input, output)
    else
      @parsley.parse_string(options[:string], input, output)
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# Gem specification for parsley-ruby 0.1.2: a native extension (built through
# ext/extconf.rb) plus a thin Ruby wrapper, with a runtime dependency on json.
Gem::Specification.new do |spec|
  # Identity
  spec.name    = "parsley-ruby"
  spec.version = "0.1.2"
  spec.date    = "2008-08-10"

  # Description and contact details
  spec.summary     = "Ruby binding for parsley"
  spec.description = "Ruby binding for parsley"
  spec.authors     = ["Kyle Maxwell"]
  spec.email       = "kyle@kylemaxwell.com"
  spec.homepage    = "http://github.com/fizx/parsley-ruby"

  # Load paths and native build
  spec.require_paths = ["lib", "ext"]
  spec.extensions    = "ext/extconf.rb"

  # Packaged files
  spec.files = [
    "ext/cparsley.c",
    "ext/extconf.rb",
    "lib/parsley.rb",
    "parsley-ruby.gemspec",
    "README",
    "test/test_parsley.rb",
    "test/yelp-benchmark.rb",
    "test/yelp-home.html",
    "test/yelp-home.let",
    "test/yelp.html"
  ]

  # Documentation
  spec.has_rdoc         = true
  spec.rdoc_options     = ["--main", "README"]
  spec.extra_rdoc_files = ["README"]

  # Runtime dependencies
  spec.add_dependency("json", ["> 0.0.0"])
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require File.dirname(__FILE__) + "/../lib/parsley"
|
3
|
+
|
4
|
+
# Exercises the Parsley wrapper against the saved Yelp fixture pages.
class TestParsley < Test::Unit::TestCase
  # Resolves the fixture paths that live next to this test file.
  def setup
    here = File.dirname(__FILE__)
    @page = File.expand_path(here + "/yelp.html")
    @home = File.expand_path(here + "/yelp-home.html")
    @let = File.expand_path(here + "/yelp-home.let")
  end

  # A parselet read from a .let file produces nested Ruby structures.
  def test_yelp
    parsley = Parsley.new(File.read(@let))
    out = parsley.parse(:file => @home)
    assert_equal "/c/sf/shopping", out["categories"][0]["href"]
  end

  # The same parselet can be rendered as XML text. Previously this test made
  # no assertion at all and could only fail by raising.
  def test_yelp_xml
    parsley = Parsley.new(File.read(@let))
    out = parsley.parse(:file => @home, :output => :xml)
    assert_kind_of String, out
    assert_match(/\A<\?xml/, out)
  end

  def test_simple
    parsley = Parsley.new("hi" => "h1")
    assert_equal({"hi" => "Nick's Crispy Tacos"}, parsley.parse(:file => @page))
  end

  def test_simple_string
    parsley = Parsley.new("hi" => "h1")
    html = "<html><body><h1>Nick's Crispy Tacos</h1></body></html>"
    assert_equal({"hi" => "Nick's Crispy Tacos"}, parsley.parse(:string => html))
  end

  def test_xml
    parsley = Parsley.new("hi" => "h1")
    xml = "<?xml version=\"1.0\"?>\n<parsley:root xmlns:parsley=\"http://parslets.com/json\"><hi>Nick's Crispy Tacos</hi></parsley:root>\n"
    assert_equal(xml, parsley.parse(:file => @page, :output => :xml))
  end

  def test_json
    parsley = Parsley.new("hi" => "h1")
    assert_equal('{ "hi": "Nick\'s Crispy Tacos" }', parsley.parse(:file => @page, :output => :json))
  end

  # Parsing a missing file must surface as a rescuable ParsleyError.
  # The old version wrapped an assert_equal in `rescue nil`, which swallowed
  # assertion failures too, so the test could never fail.
  def test_rescuable_file_error
    parsley = Parsley.new("hi" => "h1")
    nonexistent_file = File.dirname(__FILE__) + "/../fixtures/yelp.html"
    assert_raise(ParsleyError) { parsley.parse(:file => nonexistent_file) }
  end

  # An array-valued parselet key yields a Ruby Array with every match.
  def test_array_string
    parsley = Parsley.new({"foo" => ["li"]})
    out = parsley.parse(:file => @page)
    assert_kind_of Hash, out
    assert_kind_of Array, out["foo"], out.inspect
    assert out["foo"].length > 1
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "nokogiri"
|
3
|
+
require "hpricot"
|
4
|
+
require "parsley"
|
5
|
+
require "benchmark"
|
6
|
+
require "pp"
|
7
|
+
|
8
|
+
YELP_HTML = File.dirname(__FILE__) + "/yelp.html"
|
9
|
+
|
10
|
+
# Benchmark subject: extracts the Yelp fields using Nokogiri's Hpricot
# compatibility entry point. The block form of File.open closes the file
# handle once the document has been parsed instead of leaving it to the GC.
def noko
  File.open(YELP_HTML) { |html| parse Nokogiri.Hpricot(html) }
end
|
13
|
+
|
14
|
+
# Benchmark subject: extracts the Yelp fields using Hpricot. The block form
# of File.open closes the file handle once the document has been parsed
# instead of leaving it to the GC.
def hpri
  File.open(YELP_HTML) { |html| parse Hpricot(html) }
end
|
17
|
+
|
18
|
+
# Extracts the business details and reviews from a parsed Yelp page.
#
# doc - a document that answers `/` with a CSS selector and returns a list of
#       nodes responding to #inner_text (and, for reviews, to `/` again)
#
# Returns a Hash with the keys "name", "phone", "address" and "reviews",
# where "reviews" is an Array of Hashes with "date", "user_name" and
# "comment". (The previous version ended on the "reviews" assignment and so
# returned only the reviews Array.)
def parse(doc)
  first_text = lambda { |scope, selector| (scope / selector).first.inner_text }

  {
    "name" => first_text.call(doc, "h1"),
    "phone" => first_text.call(doc, "#bizPhone"),
    "address" => first_text.call(doc, "address"),
    "reviews" => (doc / ".nonfavoriteReview").map do |node|
      {
        "date" => first_text.call(node, ".ieSucks .smaller"),
        "user_name" => first_text.call(node, ".reviewer_info a"),
        "comment" => first_text.call(node, ".review_comment")
      }
    end
  }
end
|
31
|
+
|
32
|
+
# Benchmark subject: compiles a parselet describing the same fields that
# `parse` extracts by hand, runs it against the Yelp fixture and
# pretty-prints the result.
def pars
  review_fields = {
    "date" => ".ieSucks .smaller",
    "user_name" => ".reviewer_info a",
    "comment" => ".review_comment"
  }
  page_fields = {
    "name" => "h1",
    "phone" => "#bizPhone",
    "address" => "address",
    "reviews(.nonfavoriteReview)" => [review_fields]
  }

  extracted = Parsley.new(page_fields).parse(:file => YELP_HTML)
  pp extracted
end
|
47
|
+
|
48
|
+
# Time three consecutive runs of each extractor, one report row per library.
Benchmark.bm do |bench|
  [
    ["nokogiri: ", :noko],
    ["hpricot: ", :hpri],
    ["parsley: ", :pars]
  ].each do |label, extractor|
    bench.report(label) { 3.times { send(extractor) } }
  end
end
|
53
|
+
|