fizx-parsley-ruby 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/ext/cparsley.c ADDED
@@ -0,0 +1,118 @@
1
+ #include "ruby.h"
2
+ #include <stdio.h>
3
+ #include <libxslt/xslt.h>
4
+ #include <libexslt/exslt.h>
5
+ #include <libxslt/xsltInternals.h>
6
+ #include <libxslt/transform.h>
7
+ #include <libxml/parser.h>
8
+ #include <libxml/HTMLparser.h>
9
+ #include <libxml/HTMLtree.h>
10
+ #include <libxml/xmlwriter.h>
11
+ #include <parsley.h>
12
+ #include <json/json.h>
13
+ #include <xml2json.h>
14
+
15
+ VALUE _new(VALUE, VALUE, VALUE);
16
+ VALUE _parse_file(VALUE, VALUE, VALUE, VALUE);
17
+ VALUE _parse_string(VALUE, VALUE, VALUE, VALUE);
18
+ VALUE _parse_doc(parsedParsleyPtr, VALUE);
19
+ VALUE rubify_recurse(xmlNodePtr xml);
20
+ VALUE c_parsley_err;
21
+ VALUE c_parsley;
22
+
23
+ void Init_cparsley()
24
+ {
25
+ c_parsley = rb_define_class("CParsley", rb_cObject);
26
+ c_parsley_err = rb_define_class("ParsleyError", rb_eRuntimeError);
27
+ rb_define_singleton_method(c_parsley, "new", _new, 2);
28
+ rb_define_method(c_parsley, "parse_file", _parse_file, 3);
29
+ rb_define_method(c_parsley, "parse_string", _parse_string, 3);
30
+ }
31
+
32
+ VALUE _new(VALUE self, VALUE parsley, VALUE incl){
33
+ parsleyPtr ptr = parsley_compile(STR2CSTR(parsley), STR2CSTR(incl));
34
+ if(ptr->error != NULL) {
35
+ rb_raise(c_parsley_err, ptr->error);
36
+ parsley_free(ptr);
37
+ return Qnil;
38
+ }
39
+
40
+ return Data_Wrap_Struct(c_parsley, 0, parsley_free, ptr);
41
+ }
42
+
43
+ VALUE _parse_file(VALUE self, VALUE name, VALUE input, VALUE output){
44
+ parsleyPtr parsley;
45
+ Data_Get_Struct(self, parsleyPtr, parsley);
46
+ return _parse_doc(parsley_parse_file(parsley, STR2CSTR(name), input == ID2SYM(rb_intern("html"))), output);
47
+ }
48
+
49
+ VALUE _parse_string(VALUE self, VALUE string, VALUE input, VALUE output) {
50
+ parsleyPtr parsley;
51
+ Data_Get_Struct(self, parsleyPtr, parsley);
52
+ char* cstr = STR2CSTR(string);
53
+ return _parse_doc(parsley_parse_string(parsley, cstr, strlen(cstr), input == ID2SYM(rb_intern("html"))), output);
54
+ }
55
+
56
+ VALUE _parse_doc(parsedParsleyPtr ptr, VALUE type) {
57
+ if(ptr->error != NULL || ptr->xml == NULL) {
58
+ if(ptr->error == NULL) ptr->error = strdup("Unknown parsley error");
59
+ rb_raise(c_parsley_err, ptr->error);
60
+ parsed_parsley_free(ptr);
61
+ return Qnil;
62
+ }
63
+
64
+ VALUE output;
65
+ if(type == ID2SYM(rb_intern("json"))) {
66
+ struct json_object *json = xml2json(ptr->xml->children->children);
67
+ char* str = json_object_to_json_string(json);
68
+ output = rb_str_new2(str);
69
+ json_object_put(json);
70
+ } else if(type == ID2SYM(rb_intern("xml"))) {
71
+ char* str;
72
+ int size;
73
+ xmlDocDumpMemory(ptr->xml, &str, &size);
74
+ output = rb_str_new(str, size);
75
+ } else {
76
+ output = rubify_recurse(ptr->xml->children->children);
77
+ if(output == NULL) output = Qnil;
78
+ }
79
+
80
+ parsed_parsley_free(ptr);
81
+
82
+ return output;
83
+ }
84
+
85
+ VALUE rubify_recurse(xmlNodePtr xml) {
86
+ if(xml == NULL) return NULL;
87
+ xmlNodePtr child;
88
+ VALUE obj = Qnil;
89
+
90
+ switch(xml->type) {
91
+ case XML_ELEMENT_NODE:
92
+ child = xml->children;
93
+ if(xml->ns == NULL) {
94
+ child = xml;
95
+ obj = rb_hash_new();
96
+ while(child != NULL) {
97
+ rb_hash_aset(obj, rb_str_new2(child->name), rubify_recurse(child->children));
98
+ child = child->next;
99
+ }
100
+ } else if(!strcmp(xml->ns->prefix, "parsley")) {
101
+ if(!strcmp(xml->name, "groups")) {
102
+ obj = rb_ary_new();
103
+ while(child != NULL) {
104
+ rb_ary_push(obj, rubify_recurse(child->children));
105
+ child = child->next;
106
+ }
107
+ } else if(!strcmp(xml->name, "group")) {
108
+ // Implicitly handled by parsley:groups handler
109
+ }
110
+ }
111
+ break;
112
+ case XML_TEXT_NODE:
113
+ obj = rb_str_new2(xml->content);
114
+ break;
115
+ }
116
+ // inspect(obj);
117
+ return obj;
118
+ }
data/lib/parsley.rb ADDED
@@ -0,0 +1,39 @@
1
+ require File.dirname(__FILE__) + "/../ext/cparsley"
2
+ require "rubygems"
3
+ require "json"
4
+ require "thread"
5
+
6
# Ruby front end for the CParsley extension: compiles a parslet once and
# applies it to files or strings.
class Parsley
  # Serializes calls to CParsley.new. Created when the class body is
  # evaluated, so every thread shares the same lock (a lazily assigned lock
  # could be created twice by racing threads).
  # NOTE(review): presumably needed because compilation in the native
  # library is not thread-safe -- confirm.
  COMPILE_MUTEX = Mutex.new

  # Compiles a parslet.
  #
  # parsley -- the parslet source as a String, or a Hash, which is converted
  #            with #to_json first
  # incl    -- passed unchanged as the second argument of CParsley.new
  def initialize(parsley, incl = "")
    parsley = parsley.to_json if parsley.is_a?(Hash)
    COMPILE_MUTEX.synchronize do
      @parsley = CParsley.new(parsley, incl)
    end
  end

  # Valid options:
  #
  # Requires one of:
  #   :file -- the input file path
  #   :string -- the input string
  #
  # And optionally:
  #   :input => [:xml, :html]
  #   :output => [:json, :xml, :ruby]
  #   :allow_empty -- If false, throws an exception if any value is empty.
  #                   NOTE: this method does not read :allow_empty; the
  #                   option currently has no effect here.
  #
  # Defaults are :input => :html, :output => :ruby, :allow_empty => false
  #
  # When both :file and :string are given, :file wins. The options hash
  # passed by the caller is not modified.
  #
  # Raises ArgumentError when neither :file nor :string is given.
  def parse(options = {})
    input  = options[:input]  || :html
    output = options[:output] || :ruby

    if options[:file]
      @parsley.parse_file options[:file], input, output
    elsif options[:string]
      @parsley.parse_string options[:string], input, output
    else
      raise ArgumentError, "must specify what to parse"
    end
  end
end
@@ -0,0 +1,28 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = "parsley-ruby"
3
+ s.version = "0.1.2"
4
+ s.date = "2008-08-10"
5
+ s.summary = "Ruby binding for parsley"
6
+ s.email = "kyle@kylemaxwell.com"
7
+ s.homepage = "http://github.com/fizx/parsley-ruby"
8
+ s.description = "Ruby binding for parsley"
9
+ s.has_rdoc = true
10
+ s.require_paths = ["lib", "ext"]
11
+ s.extensions = "ext/extconf.rb"
12
+ s.authors = ["Kyle Maxwell"]
13
+ s.files = %w[
14
+ ext/cparsley.c
15
+ ext/extconf.rb
16
+ lib/parsley.rb
17
+ parsley-ruby.gemspec
18
+ README
19
+ test/test_parsley.rb
20
+ test/yelp-benchmark.rb
21
+ test/yelp-home.html
22
+ test/yelp-home.let
23
+ test/yelp.html
24
+ ]
25
+ s.rdoc_options = ["--main", "README"]
26
+ s.extra_rdoc_files = ["README"]
27
+ s.add_dependency("json", ["> 0.0.0"])
28
+ end
@@ -0,0 +1,56 @@
1
+ require "test/unit"
2
+ require File.dirname(__FILE__) + "/../lib/parsley"
3
+
4
# End-to-end tests for Parsley against the Yelp HTML fixtures that live next
# to this file.
class TestParsley < Test::Unit::TestCase
  def setup
    @page = File.expand_path(File.dirname(__FILE__) + "/yelp.html")
    @home = File.expand_path(File.dirname(__FILE__) + "/yelp-home.html")
    @let = File.expand_path(File.dirname(__FILE__) + "/yelp-home.let")
  end

  def test_yelp
    @parsley = Parsley.new(File.read(@let))
    out = @parsley.parse(:file => @home)
    assert_equal "/c/sf/shopping", out["categories"][0]["href"]
  end

  # The XML output of the larger parslet is not pinned byte-for-byte; this
  # only checks that the call succeeds and hands back a String.
  def test_yelp_xml
    @parsley = Parsley.new(File.read(@let))
    out = @parsley.parse(:file => @home, :output => :xml)
    assert_kind_of String, out
  end

  def test_simple
    @parsley = Parsley.new("hi" => "h1")
    assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:file => @page))
  end

  def test_simple_string
    @parsley = Parsley.new("hi" => "h1")
    assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:string => "<html><body><h1>Nick's Crispy Tacos</h1></body></html>"))
  end

  def test_xml
    @parsley = Parsley.new("hi" => "h1")
    xml = "<?xml version=\"1.0\"?>\n<parsley:root xmlns:parsley=\"http://parslets.com/json\"><hi>Nick's Crispy Tacos</hi></parsley:root>\n"
    assert_equal(xml, @parsley.parse(:file => @page, :output => :xml))
  end

  def test_json
    @parsley = Parsley.new("hi" => "h1")
    assert_equal('{ "hi": "Nick\'s Crispy Tacos" }', @parsley.parse(:file => @page, :output => :json))
  end

  # A missing input file must surface as a rescuable ParsleyError. The
  # previous `assert_equal(...) rescue nil` swallowed every outcome,
  # including assertion failures, so the test could never fail.
  def test_rescuable_file_error
    @parsley = Parsley.new("hi" => "h1")
    nonexistent_file = File.dirname(__FILE__) + "/../fixtures/yelp.html"
    assert_raise(ParsleyError) { @parsley.parse(:file => nonexistent_file) }
  end

  def test_array_string
    @parsley = Parsley.new({"foo" => ["li"]})
    out = @parsley.parse(:file => @page)
    assert_kind_of Hash, out
    assert_kind_of Array, out["foo"], out.inspect
    assert out["foo"].length > 1
  end
end
@@ -0,0 +1,53 @@
1
+ require "rubygems"
2
+ require "nokogiri"
3
+ require "hpricot"
4
+ require "parsley"
5
+ require "benchmark"
6
+ require "pp"
7
+
8
+ YELP_HTML = File.dirname(__FILE__) + "/yelp.html"
9
+
10
# Benchmark subject: loads the Yelp fixture through Nokogiri.Hpricot and runs
# the shared #parse extraction on it. The block form of File.open closes the
# handle after each run instead of leaving it open.
def noko
  File.open(YELP_HTML) { |html| parse Nokogiri.Hpricot(html) }
end
13
+
14
# Benchmark subject: loads the Yelp fixture through Hpricot and runs the
# shared #parse extraction on it. The block form of File.open closes the
# handle after each run instead of leaving it open.
def hpri
  File.open(YELP_HTML) { |html| parse Hpricot(html) }
end
17
+
18
# Extracts the business details from a parsed document.
#
# doc -- any object whose #/ takes a CSS selector and returns a list of
#        nodes that respond to #inner_text and #/
#
# Returns a Hash with "name", "phone", "address" and "reviews"; "reviews" is
# an Array with one Hash ("date", "user_name", "comment") per
# .nonfavoriteReview node.
def parse(doc)
  out = {}
  out["name"] = (doc / "h1").first.inner_text
  out["phone"] = (doc / "#bizPhone").first.inner_text
  out["address"] = (doc / "address").first.inner_text
  out["reviews"] = (doc / ".nonfavoriteReview").map do |node|
    {
      "date" => (node / ".ieSucks .smaller").first.inner_text,
      "user_name" => (node / ".reviewer_info a").first.inner_text,
      "comment" => (node / ".review_comment").first.inner_text
    }
  end
  # Return the whole result; without this line the method's value would be
  # the reviews array from the assignment above.
  out
end
31
+
32
# Benchmark subject: compiles the parslet equivalent of #parse, applies it to
# the Yelp fixture and pretty-prints the result.
def pars
  review_fields = {
    "date" => ".ieSucks .smaller",
    "user_name" => ".reviewer_info a",
    "comment" => ".review_comment"
  }
  template = {
    "name" => "h1",
    "phone" => "#bizPhone",
    "address" => "address",
    "reviews(.nonfavoriteReview)" => [review_fields]
  }
  pp Parsley.new(template).parse(:file => YELP_HTML)
end
47
+
48
# Times three runs of each extraction strategy, in the order listed.
Benchmark.bm do |bench|
  [
    ["nokogiri: ", :noko],
    ["hpricot: ", :hpri],
    ["parsley: ", :pars]
  ].each do |label, runner|
    bench.report(label) { 3.times { send(runner) } }
  end
end
53
+