le1t0-parsley-ruby 0.4.5.001

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ .libs/
2
+ *.o
3
+ *.lo
4
+ dexterc
5
+ dexter
6
+ parsleyc
7
+ parsley
8
+ .deps/
9
+ Makefile
10
+ y.tab.c
11
+ autom4te.cache/
12
+ autoscan.log
13
+ config.log
14
+ configure.scan
15
+ parser.c
16
+ scanner.c
17
+ libparsley.la
18
+ parser.h
19
+ test.log
20
+ parsley*.gem
21
+ ext/cparsley.bundle
22
+ ext/cparsley.so
23
+ ext/Makefile
24
+ ext/conftest.dSYM/
25
+ work
26
+ ext/mkmf.log
27
+ pkg
28
+ le1t0-parsley-ruby.gemspec
@@ -0,0 +1,8 @@
1
+ 0.4.5.001
2
+
3
+ - Changed ext/extconf.rb such that it is much simpler and compiles. Added external dependency for parsley instead of compiling
4
+ inline.
5
+
6
+ 0.4.3
7
+ - Added CHANGELOG
8
+ - Only assigning ARCHFLAGS if not already specified.
data/README ADDED
@@ -0,0 +1,32 @@
1
+ ABOUT
2
+
3
+ Ruby bindings for Parsley.
4
+
5
+ INSTALLATION
6
+
7
+ = Get Parsley and Dependencies =
8
+
9
+ Download Parsley from http://github.com/fizx/parsley/tree/master following the installation directions located at http://github.com/fizx/parsley/blob/master/INSTALL
10
+
11
+ = Install parsley-ruby =
12
+
13
+ From source:
14
+ sudo rake install
15
+
16
+ From GitHub: DEPRECATED!
17
+
18
+ From GemCutter
19
+
20
+ Run the following if you haven't already:
21
+ gem sources -a http://gemcutter.org
22
+ Install the gem:
23
+ sudo gem install parsley-ruby
24
+
25
+ PARSELETS.COM INTEGRATION
26
+
27
+ We also recommend installing the free online_parselets rubygem in order to use other people's parselets and to share your own:
28
+ Run the following if you haven't already:
29
+ gem sources -a http://gems.github.com
30
+ Install the gem:
31
+ sudo gem install iterationlabs-online_parslets
32
+
@@ -0,0 +1,57 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
# Gem packaging tasks. They are only defined when Jeweler can be loaded;
# otherwise an installation hint is printed and the rest of the Rakefile
# keeps loading.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |spec|
    spec.name = "le1t0-parsley-ruby"
    spec.summary = "Ruby binding for parsley"
    spec.description = "XML/HTML Parser"
    spec.email = "dev@ewout.to"
    spec.homepage = "http://github.com/le1t0/parsley-ruby"
    spec.authors = ["Le1t0"]
    spec.add_dependency("json", ["> 0.0.0"])
    spec.require_paths = ["lib", "ext"]
    spec.extensions = "ext/extconf.rb"
  end

  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end
20
+
21
require 'rake/testtask'

# `rake test` runs every test/**/*_test.rb file with lib/ and test/ on the
# load path.
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# `rake rcov` produces coverage when rcov is installed; otherwise the task
# still exists but aborts with an installation hint.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is not defined anywhere in this Rakefile (presumably it
# is supplied by Jeweler's tasks -- confirm). Only depend on it when it really
# exists, so that `rake test` keeps working when Jeweler could not be loaded.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test
44
+
45
# Documentation task. Newer RDoc releases ship the task as RDoc::Task in
# 'rdoc/task'; the legacy 'rake/rdoctask' is kept as a fallback so the
# Rakefile loads with either generation of the tooling.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # VERSION ends with a newline; strip it so it does not leak into the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "parsley-ruby #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.4.5.001
@@ -0,0 +1,140 @@
1
+ #include "ruby.h"
2
+ #include <stdio.h>
3
+ #include <libxslt/xslt.h>
4
+ #include <libexslt/exslt.h>
5
+ #include <libxslt/xsltInternals.h>
6
+ #include <libxslt/transform.h>
7
+ #include <libxml/parser.h>
8
+ #include <libxml/HTMLparser.h>
9
+ #include <libxml/HTMLtree.h>
10
+ #include <libxml/xmlwriter.h>
11
+ #include <parsley.h>
12
+ #include <json/json.h>
13
+ #include <xml2json.h>
14
+
15
+ VALUE _new(VALUE, VALUE, VALUE);
16
+ VALUE _parse(VALUE, VALUE);
17
+ VALUE _rb_set_user_agent(VALUE self, VALUE agent);
18
+ VALUE c_parsley_err;
19
+ VALUE c_parsley;
20
+
21
+ void Init_cparsley()
22
+ {
23
+ c_parsley = rb_define_class("CParsley", rb_cObject);
24
+ c_parsley_err = rb_define_class("ParsleyError", rb_eRuntimeError);
25
+ rb_define_singleton_method(c_parsley, "new", _new, 2);
26
+ rb_define_singleton_method(c_parsley, "set_user_agent", _rb_set_user_agent, 1);
27
+ rb_define_method(c_parsley, "parse", _parse, 1);
28
+ }
29
+
30
+ VALUE
31
+ _new(VALUE self, VALUE parsley, VALUE incl){
32
+ parsleyPtr ptr = parsley_compile(STR2CSTR(parsley), STR2CSTR(incl));
33
+ if(ptr->error != NULL) {
34
+ rb_raise(c_parsley_err, ptr->error);
35
+ parsley_free(ptr);
36
+ return Qnil;
37
+ }
38
+
39
+ return Data_Wrap_Struct(c_parsley, 0, parsley_free, ptr);
40
+ }
41
+
42
+ VALUE
43
+ _rb_set_user_agent(VALUE self, VALUE agent) {
44
+ parsley_set_user_agent(STR2CSTR(agent));
45
+ return Qtrue;
46
+ }
47
+
48
+
49
+ static VALUE
50
+ rubify_recurse(xmlNodePtr xml) {
51
+ if(xml == NULL) return NULL;
52
+ xmlNodePtr child;
53
+ VALUE obj = Qnil;
54
+
55
+ switch(xml->type) {
56
+ case XML_ELEMENT_NODE:
57
+ child = xml->children;
58
+ if(xml->ns == NULL) {
59
+ child = xml;
60
+ obj = rb_hash_new();
61
+ while(child != NULL) {
62
+ rb_hash_aset(obj, rb_str_new2(child->name), rubify_recurse(child->children));
63
+ child = child->next;
64
+ }
65
+ } else if(!strcmp(xml->ns->prefix, "parsley")) {
66
+ if(!strcmp(xml->name, "groups")) {
67
+ obj = rb_ary_new();
68
+ while(child != NULL) {
69
+ rb_ary_push(obj, rubify_recurse(child->children));
70
+ child = child->next;
71
+ }
72
+ } else if(!strcmp(xml->name, "group")) {
73
+ // Implicitly handled by parsley:groups handler
74
+ }
75
+ }
76
+ break;
77
+ case XML_TEXT_NODE:
78
+ obj = rb_str_new2(xml->content);
79
+ break;
80
+ }
81
+ // inspect(obj);
82
+ return obj;
83
+ }
84
+
85
+ static VALUE
86
+ _parse_doc(parsedParsleyPtr ptr, VALUE type) {
87
+ if(ptr->error != NULL || ptr->xml == NULL) {
88
+ if(ptr->error == NULL) ptr->error = strdup("Unknown parsley error");
89
+ rb_raise(c_parsley_err, ptr->error);
90
+ parsed_parsley_free(ptr);
91
+ return Qnil;
92
+ }
93
+
94
+ VALUE output;
95
+ if(type == ID2SYM(rb_intern("json"))) {
96
+ struct json_object *json = xml2json(ptr->xml->children->children);
97
+ char* str = json_object_to_json_string(json);
98
+ output = rb_str_new2(str);
99
+ json_object_put(json);
100
+ } else if(type == ID2SYM(rb_intern("xml"))) {
101
+ xmlChar* str;
102
+ int size;
103
+ xmlDocDumpMemory(ptr->xml, &str, &size);
104
+ output = rb_str_new(str, size);
105
+ } else {
106
+ output = rubify_recurse(ptr->xml->children->children);
107
+ if((void*)output == NULL) output = Qnil;
108
+ }
109
+
110
+ parsed_parsley_free(ptr);
111
+
112
+ return output;
113
+ }
114
+
115
+ #define OPT(A) rb_hash_aref(options, ID2SYM(rb_intern(A)))
116
+ #define OPT_BOOL(A) (OPT(A) != Qnil && OPT(A) != Qfalse)
117
+ #define OPT_MATCH(A, B) (rb_hash_aref(options, ID2SYM(rb_intern(A))) == ID2SYM(rb_intern(B)))
118
+
119
+ VALUE _parse(VALUE self, VALUE options){
120
+ parsleyPtr parsley;
121
+ Data_Get_Struct(self, parsleyPtr, parsley);
122
+ int flags = 0;
123
+ char *base = NULL;
124
+ if(OPT_MATCH("input", "html")) flags |= PARSLEY_OPTIONS_HTML;
125
+ if(OPT_BOOL("prune")) flags |= PARSLEY_OPTIONS_PRUNE;
126
+ if(OPT_BOOL("collate")) flags |= PARSLEY_OPTIONS_COLLATE;
127
+ if(OPT_BOOL("allow_net")) flags |= PARSLEY_OPTIONS_ALLOW_NET;
128
+ if(OPT_BOOL("allow_local")) flags |= PARSLEY_OPTIONS_ALLOW_LOCAL;
129
+ if(OPT_BOOL("sgwrap")) flags |= PARSLEY_OPTIONS_SGWRAP;
130
+ if(OPT_BOOL("has_base")) base = STR2CSTR(OPT("base"));
131
+
132
+ // printf("prune: %d\nallow_net: %d\nallow_local: %d\nhas_base: %d\nflags: %d\n", OPT_BOOL("prune"), OPT_BOOL("allow_net"), OPT_BOOL("allow_local"), OPT_BOOL("has_base"), flags);
133
+
134
+ if(OPT_BOOL("is_file")) {
135
+ return _parse_doc(parsley_parse_file(parsley, STR2CSTR(OPT("file")), flags), OPT("output"));
136
+ } else {
137
+ char * str = STR2CSTR(OPT("string"));
138
+ return _parse_doc(parsley_parse_string(parsley, str, strlen(str), base, flags), OPT("output"));
139
+ }
140
+ }
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'mkmf'
4
+
5
# Header search paths handed to the compiler, in the order they are appended.
include_dirs = ["/usr/local/include", "/usr/include/libxml2"]
$CFLAGS << include_dirs.map { |dir| " -I#{dir}" }.join

# Link against libparsley, additionally searching /usr/local/lib.
$LDFLAGS << " -L/usr/local/lib -lparsley"

# Generate the Makefile that builds the `cparsley` extension.
create_makefile('cparsley')
@@ -0,0 +1,84 @@
1
+ require File.dirname(__FILE__) + "/../ext/cparsley"
2
+ require "rubygems"
3
+ require "json"
4
+ require "thread"
5
+
6
# Ruby front end for the native CParsley extension: compiles a parselet once
# and runs it against files or strings.
class Parsley

  # Lock taken around every CParsley.new call. It is created once, when the
  # class body is evaluated, so two threads constructing their first Parsley
  # at the same time can never end up holding two different locks.
  COMPILE_MUTEX = Mutex.new

  # Remembers +agent+ and hands its string form to CParsley.set_user_agent.
  def self.user_agent=(agent)
    @user_agent = agent
    CParsley.set_user_agent(agent.to_s)
  end

  # The value last assigned through Parsley.user_agent= (nil if never set).
  def self.user_agent
    @user_agent
  end

  # parsley -- the parselet, either as a String or as a Hash. A Hash has all
  #            keys and leaf values converted to strings and is then
  #            serialized to JSON.
  # incl    -- second argument passed through to CParsley.new.
  def initialize(parsley, incl = "")
    if parsley.is_a?(Hash)
      parsley = recursive_stringify(parsley).to_json
    end
    COMPILE_MUTEX.synchronize do
      @parsley = CParsley.new(parsley, incl)
    end
  end

  # Valid options:
  #
  # Requires one of:
  # :file -- the input file path or url
  # :string -- the input string
  #
  # And optionally (default is the first listed value):
  # :input => [:html, :xml]
  # :output => [:ruby, :json, :xml]
  # :prune => [true, false]
  # :sgwrap => [false, true]
  # :collate => [true, false]
  # :base => "http://some/base/href"
  # :allow_net => [true, false]
  # :allow_local => [true, false]
  #
  # Raises ParsleyError when neither :file nor :string is given. The hash
  # passed in is never modified; normalization happens on a private copy.
  def parse(options = {})
    unless options[:file] || options[:string]
      raise ParsleyError.new("must specify what to parse")
    end

    # Work on a copy so the caller's hash (possibly frozen or reused) stays
    # exactly as it was handed in.
    opts = options.dup

    opts[:sgwrap] = !!opts[:sgwrap]
    opts[:is_file] = !!opts[:file]
    opts[:has_base] = !!opts[:base]

    # The presence flags above must be computed first: to_s turns nil into "".
    [:base, :file, :string].each { |key| opts[key] = opts[key].to_s }

    opts[:input] ||= :html
    opts[:output] ||= :ruby

    # These switches default to true when absent; when present (even as nil)
    # they are coerced to a strict boolean.
    [:collate, :prune, :allow_net, :allow_local].each do |key|
      opts[key] = opts.has_key?(key) ? !!opts[key] : true
    end

    @parsley.parse(opts)
  end

  private

  # Returns a copy of +obj+ in which every Hash key and every leaf value has
  # been converted with to_s; nested Hashes and Arrays are walked recursively.
  def recursive_stringify(obj)
    case obj
    when Hash
      obj.inject({}) do |memo, (k, v)|
        memo[k.to_s] = recursive_stringify(v)
        memo
      end
    when Array
      obj.map { |e| recursive_stringify(e) }
    else
      obj.to_s
    end
  end

end
@@ -0,0 +1,116 @@
1
+ require "test/unit"
2
+ require File.dirname(__FILE__) + "/../lib/parsley"
3
+
4
+ class TestParsley < Test::Unit::TestCase
5
# Resolves the absolute paths of the fixture files that sit next to this
# test file.
def setup
  here = File.dirname(__FILE__)
  @page = File.expand_path("yelp.html", here)
  @home = File.expand_path("yelp-home.html", here)
  @let = File.expand_path("yelp-home.let", here)
end
10
+
11
# Parses a tiny document with a parselet whose optional fields ("?" suffix)
# match nothing; the required title must be extracted and the optional
# description must come back as nil.
def test_segfault_regression
  markup = <<-HTML
<html>
<body>
<h1 class="iCIMS_Header_JobTitle">CEO</h1>
</body>
</html>
  HTML

  job_fields = {
    'title' => ".iCIMS_Header_JobTitle",
    'description?' => "blah",
    'location?' => "blah",
    'experience?' => "blah",
    'education?' => "blah"
  }
  parselet = Parsley.new('jobs' => [job_fields])

  first_job = parselet.parse(:string => markup)['jobs'].first

  assert_equal "CEO", first_job['title']
  assert first_job['description'].nil?
end
34
+ #
35
+ # def test_yelp
36
+ # @parsley = Parsley.new(File.read(@let))
37
+ # out = @parsley.parse(:file => @home)
38
+ # assert_equal "/c/sf/shopping", out["categories"][0]["href"]
39
+ # end
40
+ #
41
+ # def test_parsley_should_raise_if_value_syntax_error
42
+ # assert_raises(ParsleyError) do
43
+ # Parsley.new({"foo" => nil})
44
+ # end
45
+ #
46
+ # assert_raises(ParsleyError) do
47
+ # Parsley.new({"foo" => ""})
48
+ # end
49
+ #
50
+ # assert_raises(ParsleyError) do
51
+ # Parsley.new({"foo" => "<<<<<<<<<<<"})
52
+ # end
53
+ # end
54
+ #
55
+ # def test_yelp_xml
56
+ # @parsley = Parsley.new(File.read(@let))
57
+ # out = @parsley.parse(:file => @home, :output => :xml)
58
+ # end
59
+ #
60
+ # def test_broken
61
+ # @parsley = Parsley.new("hi" => "no-ns:match(h1)")
62
+ # assert_raises(ParsleyError) {
63
+ # @parsley.parse(:file => @page)
64
+ # }
65
+ # end
66
+ #
67
+ # def test_simple
68
+ # @parsley = Parsley.new("hi" => "h1")
69
+ # assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:file => @page))
70
+ # end
71
+ #
72
+ # def test_simple_string
73
+ # @parsley = Parsley.new("hi" => "h1")
74
+ # assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:string => "<html><body><h1>Nick's Crispy Tacos</h1></body></html>"))
75
+ # end
76
+ #
77
+ # def test_xml
78
+ # @parsley = Parsley.new("hi" => "h1")
79
+ # xml = "<?xml version=\"1.0\"?>\n<parsley:root xmlns:parsley=\"http://parselets.com/json\"><hi position=\"63\">Nick's Crispy Tacos</hi></parsley:root>\n"
80
+ # assert_equal(xml, @parsley.parse(:file => @page, :output => :xml))
81
+ # end
82
+ #
83
+ # def test_sgwrap
84
+ # @parsley = Parsley.new("hi" => "p sg_wrap")
85
+ # html = "<p><b>hi</b>world</p>"
86
+ # assert_equal({"hi" => "world"}, @parsley.parse(:string => html, :sgwrap => true))
87
+ # end
88
+ #
89
+ # def test_sgwrap_off
90
+ # @parsley = Parsley.new("hi" => "p sg_wrap")
91
+ # html = "<p><b>hi</b>world</p>"
92
+ # assert_raises(ParsleyError) do
93
+ # @parsley.parse(:string => html, :sgwrap => false)
94
+ # end
95
+ # end
96
+ #
97
+ #
98
+ # def test_json
99
+ # @parsley = Parsley.new("hi" => "h1")
100
+ # assert_equal('{ "hi": "Nick\'s Crispy Tacos" }', @parsley.parse(:file => @page, :output => :json))
101
+ # end
102
+ #
103
+ # def test_rescuable_file_error
104
+ # @parsley = Parsley.new("hi" => "h1")
105
+ # @nonexistant_file = File.dirname(__FILE__) + "/../fixtures/yelp.html"
106
+ # assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:file => @nonexistant_file)) rescue nil
107
+ # end
108
+ #
109
+ # def test_array_string
110
+ # @parsley = Parsley.new({"foo" => ["li"]})
111
+ # out = @parsley.parse(:file => @page)
112
+ # assert_kind_of Hash, out
113
+ # assert_kind_of Array, out["foo"], out.inspect
114
+ # assert out["foo"].length > 1
115
+ # end
116
+ end