fizx-parsley-ruby 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/cparsley.c ADDED
@@ -0,0 +1,118 @@
1
+ #include "ruby.h"
2
+ #include <stdio.h>
3
+ #include <libxslt/xslt.h>
4
+ #include <libexslt/exslt.h>
5
+ #include <libxslt/xsltInternals.h>
6
+ #include <libxslt/transform.h>
7
+ #include <libxml/parser.h>
8
+ #include <libxml/HTMLparser.h>
9
+ #include <libxml/HTMLtree.h>
10
+ #include <libxml/xmlwriter.h>
11
+ #include <parsley.h>
12
+ #include <json/json.h>
13
+ #include <xml2json.h>
14
+
15
+ VALUE _new(VALUE, VALUE, VALUE);
16
+ VALUE _parse_file(VALUE, VALUE, VALUE, VALUE);
17
+ VALUE _parse_string(VALUE, VALUE, VALUE, VALUE);
18
+ VALUE _parse_doc(parsedParsleyPtr, VALUE);
19
+ VALUE rubify_recurse(xmlNodePtr xml);
20
+ VALUE c_parsley_err;
21
+ VALUE c_parsley;
22
+
23
+ void Init_cparsley()
24
+ {
25
+ c_parsley = rb_define_class("CParsley", rb_cObject);
26
+ c_parsley_err = rb_define_class("ParsleyError", rb_eRuntimeError);
27
+ rb_define_singleton_method(c_parsley, "new", _new, 2);
28
+ rb_define_method(c_parsley, "parse_file", _parse_file, 3);
29
+ rb_define_method(c_parsley, "parse_string", _parse_string, 3);
30
+ }
31
+
32
+ VALUE _new(VALUE self, VALUE parsley, VALUE incl){
33
+ parsleyPtr ptr = parsley_compile(STR2CSTR(parsley), STR2CSTR(incl));
34
+ if(ptr->error != NULL) {
35
+ rb_raise(c_parsley_err, ptr->error);
36
+ parsley_free(ptr);
37
+ return Qnil;
38
+ }
39
+
40
+ return Data_Wrap_Struct(c_parsley, 0, parsley_free, ptr);
41
+ }
42
+
43
+ VALUE _parse_file(VALUE self, VALUE name, VALUE input, VALUE output){
44
+ parsleyPtr parsley;
45
+ Data_Get_Struct(self, parsleyPtr, parsley);
46
+ return _parse_doc(parsley_parse_file(parsley, STR2CSTR(name), input == ID2SYM(rb_intern("html"))), output);
47
+ }
48
+
49
+ VALUE _parse_string(VALUE self, VALUE string, VALUE input, VALUE output) {
50
+ parsleyPtr parsley;
51
+ Data_Get_Struct(self, parsleyPtr, parsley);
52
+ char* cstr = STR2CSTR(string);
53
+ return _parse_doc(parsley_parse_string(parsley, cstr, strlen(cstr), input == ID2SYM(rb_intern("html"))), output);
54
+ }
55
+
56
+ VALUE _parse_doc(parsedParsleyPtr ptr, VALUE type) {
57
+ if(ptr->error != NULL || ptr->xml == NULL) {
58
+ if(ptr->error == NULL) ptr->error = strdup("Unknown parsley error");
59
+ rb_raise(c_parsley_err, ptr->error);
60
+ parsed_parsley_free(ptr);
61
+ return Qnil;
62
+ }
63
+
64
+ VALUE output;
65
+ if(type == ID2SYM(rb_intern("json"))) {
66
+ struct json_object *json = xml2json(ptr->xml->children->children);
67
+ char* str = json_object_to_json_string(json);
68
+ output = rb_str_new2(str);
69
+ json_object_put(json);
70
+ } else if(type == ID2SYM(rb_intern("xml"))) {
71
+ char* str;
72
+ int size;
73
+ xmlDocDumpMemory(ptr->xml, &str, &size);
74
+ output = rb_str_new(str, size);
75
+ } else {
76
+ output = rubify_recurse(ptr->xml->children->children);
77
+ if(output == NULL) output = Qnil;
78
+ }
79
+
80
+ parsed_parsley_free(ptr);
81
+
82
+ return output;
83
+ }
84
+
85
+ VALUE rubify_recurse(xmlNodePtr xml) {
86
+ if(xml == NULL) return NULL;
87
+ xmlNodePtr child;
88
+ VALUE obj = Qnil;
89
+
90
+ switch(xml->type) {
91
+ case XML_ELEMENT_NODE:
92
+ child = xml->children;
93
+ if(xml->ns == NULL) {
94
+ child = xml;
95
+ obj = rb_hash_new();
96
+ while(child != NULL) {
97
+ rb_hash_aset(obj, rb_str_new2(child->name), rubify_recurse(child->children));
98
+ child = child->next;
99
+ }
100
+ } else if(!strcmp(xml->ns->prefix, "parsley")) {
101
+ if(!strcmp(xml->name, "groups")) {
102
+ obj = rb_ary_new();
103
+ while(child != NULL) {
104
+ rb_ary_push(obj, rubify_recurse(child->children));
105
+ child = child->next;
106
+ }
107
+ } else if(!strcmp(xml->name, "group")) {
108
+ // Implicitly handled by parsley:groups handler
109
+ }
110
+ }
111
+ break;
112
+ case XML_TEXT_NODE:
113
+ obj = rb_str_new2(xml->content);
114
+ break;
115
+ }
116
+ // inspect(obj);
117
+ return obj;
118
+ }
data/lib/parsley.rb ADDED
@@ -0,0 +1,39 @@
1
+ require File.dirname(__FILE__) + "/../ext/cparsley"
2
+ require "rubygems"
3
+ require "json"
4
+ require "thread"
5
+
6
# Ruby front end for the CParsley extension: compiles a parslet once and
# applies it to files or strings.
class Parsley
  # Serialises calls to CParsley.new. Created eagerly at class-definition
  # time so that two threads building their first Parsley at the same moment
  # cannot each create (and lock) a different mutex.
  COMPILE_LOCK = Mutex.new

  # parsley -- the parslet, either as a String or as a Hash (converted to
  #            JSON with #to_json before compilation)
  # incl    -- passed through unchanged as the second argument of CParsley.new
  def initialize(parsley, incl = "")
    parsley = parsley.to_json if parsley.is_a?(Hash)
    COMPILE_LOCK.synchronize do
      @parsley = CParsley.new(parsley, incl)
    end
  end

  # Valid options:
  #
  # Requires one of:
  # :file -- the input file path
  # :string -- the input string
  #
  # And optionally:
  # :input => [:xml, :html]
  # :output => [:json, :xml, :ruby]
  # :allow_empty -- If false, throws an exception if any value is empty.
  #
  # Defaults are :input => :html, :output => :ruby, :allow_empty => false
  #
  # NOTE(review): :allow_empty is documented above but is not read anywhere
  # in this method -- confirm whether it is handled elsewhere.
  #
  # When both :file and :string are given, :file wins. The options hash is
  # only read, never modified. Raises ArgumentError when neither :file nor
  # :string is supplied.
  def parse(options = {})
    unless options[:file] || options[:string]
      raise ArgumentError, "must specify what to parse"
    end

    input  = options[:input]  || :html
    output = options[:output] || :ruby

    if options[:file]
      @parsley.parse_file(options[:file], input, output)
    else
      @parsley.parse_string(options[:string], input, output)
    end
  end
end
@@ -0,0 +1,28 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = "parsley-ruby"
3
+ s.version = "0.1.2"
4
+ s.date = "2008-08-10"
5
+ s.summary = "Ruby binding for parsley"
6
+ s.email = "kyle@kylemaxwell.com"
7
+ s.homepage = "http://github.com/fizx/parsley-ruby"
8
+ s.description = "Ruby binding for parsley"
9
+ s.has_rdoc = true
10
+ s.require_paths = ["lib", "ext"]
11
+ s.extensions = "ext/extconf.rb"
12
+ s.authors = ["Kyle Maxwell"]
13
+ s.files = %w[
14
+ ext/cparsley.c
15
+ ext/extconf.rb
16
+ lib/parsley.rb
17
+ parsley-ruby.gemspec
18
+ README
19
+ test/test_parsley.rb
20
+ test/yelp-benchmark.rb
21
+ test/yelp-home.html
22
+ test/yelp-home.let
23
+ test/yelp.html
24
+ ]
25
+ s.rdoc_options = ["--main", "README"]
26
+ s.extra_rdoc_files = ["README"]
27
+ s.add_dependency("json", ["> 0.0.0"])
28
+ end
@@ -0,0 +1,56 @@
1
+ require "test/unit"
2
+ require File.dirname(__FILE__) + "/../lib/parsley"
3
+
4
# Exercises the Parsley wrapper against the HTML fixtures that sit next to
# this file (yelp.html, yelp-home.html) and the yelp-home.let parslet.
class TestParsley < Test::Unit::TestCase
  def setup
    @page = fixture_path("yelp.html")
    @home = fixture_path("yelp-home.html")
    @let  = fixture_path("yelp-home.let")
  end

  def test_yelp
    @parsley = Parsley.new(File.read(@let))
    out = @parsley.parse(:file => @home)
    assert_equal "/c/sf/shopping", out["categories"][0]["href"]
  end

  # XML output of the larger parslet: only checks that a String comes back.
  def test_yelp_xml
    @parsley = Parsley.new(File.read(@let))
    out = @parsley.parse(:file => @home, :output => :xml)
    assert_kind_of String, out
  end

  def test_simple
    @parsley = Parsley.new("hi" => "h1")
    assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:file => @page))
  end

  def test_simple_string
    @parsley = Parsley.new("hi" => "h1")
    assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:string => "<html><body><h1>Nick's Crispy Tacos</h1></body></html>"))
  end

  def test_xml
    @parsley = Parsley.new("hi" => "h1")
    xml = "<?xml version=\"1.0\"?>\n<parsley:root xmlns:parsley=\"http://parslets.com/json\"><hi>Nick's Crispy Tacos</hi></parsley:root>\n"
    assert_equal(xml, @parsley.parse(:file => @page, :output => :xml))
  end

  def test_json
    @parsley = Parsley.new("hi" => "h1")
    assert_equal('{ "hi": "Nick\'s Crispy Tacos" }', @parsley.parse(:file => @page, :output => :json))
  end

  # Parsing a path that does not exist must surface as a rescuable
  # ParsleyError. A trailing `rescue nil` would also swallow assertion
  # failures, so the expectation is stated explicitly instead.
  def test_rescuable_file_error
    @parsley = Parsley.new("hi" => "h1")
    missing_file = File.dirname(__FILE__) + "/../fixtures/yelp.html"
    assert_raises(ParsleyError) { @parsley.parse(:file => missing_file) }
  end

  def test_array_string
    @parsley = Parsley.new({"foo" => ["li"]})
    out = @parsley.parse(:file => @page)
    assert_kind_of Hash, out
    assert_kind_of Array, out["foo"], out.inspect
    assert out["foo"].length > 1
  end

  private

  # Absolute path of a fixture stored in the same directory as this file.
  def fixture_path(name)
    File.expand_path(File.dirname(__FILE__) + "/" + name)
  end
end
@@ -0,0 +1,53 @@
1
+ require "rubygems"
2
+ require "nokogiri"
3
+ require "hpricot"
4
+ require "parsley"
5
+ require "benchmark"
6
+ require "pp"
7
+
8
# Path of the HTML fixture shared by all three benchmark candidates.
YELP_HTML = "#{File.dirname(__FILE__)}/yelp.html"
9
+
10
# Benchmark candidate: load the fixture through Nokogiri's Hpricot-style
# entry point and extract the fields with #parse.
# The block form of File.open closes the handle after every run.
def noko
  File.open(YELP_HTML) { |io| parse Nokogiri.Hpricot(io) }
end
13
+
14
# Benchmark candidate: load the fixture with Hpricot and extract the fields
# with #parse.
# The block form of File.open closes the handle after every run.
def hpri
  File.open(YELP_HTML) { |io| parse Hpricot(io) }
end
17
+
18
# Extracts the business details from a parsed document.
#
# doc -- any object whose `/` operator takes a CSS selector and returns a
#        list of nodes responding to #inner_text (and to `/` themselves)
#
# Returns a Hash with "name", "phone", "address" and "reviews", where
# "reviews" is an Array of Hashes with "date", "user_name" and "comment".
def parse(doc)
  out = {}
  out["name"]    = (doc / "h1").first.inner_text
  out["phone"]   = (doc / "#bizPhone").first.inner_text
  out["address"] = (doc / "address").first.inner_text
  out["reviews"] = (doc / ".nonfavoriteReview").map do |node|
    {
      "date"      => (node / ".ieSucks .smaller").first.inner_text,
      "user_name" => (node / ".reviewer_info a").first.inner_text,
      "comment"   => (node / ".review_comment").first.inner_text
    }
  end
  # Hand back the whole record, not just the value of the last assignment.
  out
end
31
+
32
# Benchmark candidate: compile the equivalent parslet, run it over the
# fixture file and pretty-print the result.
def pars
  review_fields = {
    "date" => ".ieSucks .smaller",
    "user_name" => ".reviewer_info a",
    "comment" => ".review_comment"
  }
  selectors = {
    "name" => "h1",
    "phone" => "#bizPhone",
    "address" => "address",
    "reviews(.nonfavoriteReview)" => [review_fields]
  }

  result = Parsley.new(selectors).parse(:file => YELP_HTML)
  pp result
end
47
+
48
# Each candidate runs three times under its own report label.
candidates = [
  ["nokogiri: ", method(:noko)],
  ["hpricot: ",  method(:hpri)],
  ["parsley: ",  method(:pars)]
]

Benchmark.bm do |bench|
  candidates.each do |label, work|
    bench.report(label) { 3.times { work.call } }
  end
end
53
+