le1t0-parsley-ruby 0.4.5.001
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +28 -0
- data/CHANGELOG +8 -0
- data/README +32 -0
- data/Rakefile +57 -0
- data/VERSION +1 -0
- data/ext/cparsley.c +140 -0
- data/ext/extconf.rb +8 -0
- data/lib/parsley.rb +84 -0
- data/test/test_parsley.rb +116 -0
- data/test/yelp-benchmark.rb +53 -0
- data/test/yelp-home.html +1004 -0
- data/test/yelp-home.let +6 -0
- data/test/yelp.html +2329 -0
- metadata +376 -0
data/.gitignore
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
.libs/
|
2
|
+
*.o
|
3
|
+
*.lo
|
4
|
+
dexterc
|
5
|
+
dexter
|
6
|
+
parsleyc
|
7
|
+
parsley
|
8
|
+
.deps/
|
9
|
+
Makefile
|
10
|
+
y.tab.c
|
11
|
+
autom4te.cache/
|
12
|
+
autoscan.log
|
13
|
+
config.log
|
14
|
+
configure.scan
|
15
|
+
parser.c
|
16
|
+
scanner.c
|
17
|
+
libparsley.la
|
18
|
+
parser.h
|
19
|
+
test.log
|
20
|
+
parsley*.gem
|
21
|
+
ext/cparsley.bundle
|
22
|
+
ext/cparsley.so
|
23
|
+
ext/Makefile
|
24
|
+
ext/conftest.dSYM/
|
25
|
+
work
|
26
|
+
ext/mkmf.log
|
27
|
+
pkg
|
28
|
+
le1t0-parsley-ruby.gemspec
|
data/CHANGELOG
ADDED
data/README
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
ABOUT
|
2
|
+
|
3
|
+
Ruby bindings for Parsley.
|
4
|
+
|
5
|
+
INSTALLATION
|
6
|
+
|
7
|
+
= Get Parsley and Dependencies =
|
8
|
+
|
9
|
+
Download Parsley from http://github.com/fizx/parsley/tree/master following the installation directions located at http://github.com/fizx/parsley/blob/master/INSTALL
|
10
|
+
|
11
|
+
= Install parsley-ruby =
|
12
|
+
|
13
|
+
From source:
|
14
|
+
sudo rake install
|
15
|
+
|
16
|
+
From GitHub: DEPRECATED!
|
17
|
+
|
18
|
+
From GemCutter
|
19
|
+
|
20
|
+
Run the following if you haven't already:
|
21
|
+
gem sources -a http://gemcutter.org
|
22
|
+
Install the gem:
|
23
|
+
sudo gem install parsley-ruby
|
24
|
+
|
25
|
+
PARSLETS.COM INTEGRATION
|
26
|
+
|
27
|
+
We also recommend installing the free online_parselets rubygem in order to use other people's parselets and to share your own:
|
28
|
+
Run the following if you haven't already:
|
29
|
+
gem sources -a http://gems.github.com
|
30
|
+
Install the gem:
|
31
|
+
sudo gem install iterationlabs-online_parslets
|
32
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

# Gem packaging tasks. Jeweler is optional: when it cannot be loaded we only
# print a hint and carry on so the remaining tasks stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "le1t0-parsley-ruby"
    gem.summary = "Ruby binding for parsley"
    gem.description = "XML/HTML Parser"
    gem.email = "dev@ewout.to"
    gem.homepage = "http://github.com/le1t0/parsley-ruby"
    gem.authors = ["Le1t0"]
    gem.add_dependency("json", ["> 0.0.0"])
    gem.require_paths = ["lib", "ext"]
    gem.extensions = "ext/extconf.rb"
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Unit tests. The suite ships as test/test_parsley.rb, so the pattern has to
# match the "test_*.rb" naming; the previous "*_test.rb" glob matched no file.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage task. When rcov is missing, a stub task explains how to get it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is not defined in this Rakefile (presumably it is
# provided by the Jeweler tasks above). Only depend on it when it exists, so
# `rake test` still works when Jeweler could not be loaded.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# API documentation built from the README and the Ruby sources.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # strip drops the trailing newline of the VERSION file from the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "parsley-ruby #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.4.5.001
|
data/ext/cparsley.c
ADDED
@@ -0,0 +1,140 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <libxslt/xslt.h>
|
4
|
+
#include <libexslt/exslt.h>
|
5
|
+
#include <libxslt/xsltInternals.h>
|
6
|
+
#include <libxslt/transform.h>
|
7
|
+
#include <libxml/parser.h>
|
8
|
+
#include <libxml/HTMLparser.h>
|
9
|
+
#include <libxml/HTMLtree.h>
|
10
|
+
#include <libxml/xmlwriter.h>
|
11
|
+
#include <parsley.h>
|
12
|
+
#include <json/json.h>
|
13
|
+
#include <xml2json.h>
|
14
|
+
|
15
|
+
VALUE _new(VALUE, VALUE, VALUE);
|
16
|
+
VALUE _parse(VALUE, VALUE);
|
17
|
+
VALUE _rb_set_user_agent(VALUE self, VALUE agent);
|
18
|
+
VALUE c_parsley_err;
|
19
|
+
VALUE c_parsley;
|
20
|
+
|
21
|
+
void Init_cparsley()
|
22
|
+
{
|
23
|
+
c_parsley = rb_define_class("CParsley", rb_cObject);
|
24
|
+
c_parsley_err = rb_define_class("ParsleyError", rb_eRuntimeError);
|
25
|
+
rb_define_singleton_method(c_parsley, "new", _new, 2);
|
26
|
+
rb_define_singleton_method(c_parsley, "set_user_agent", _rb_set_user_agent, 1);
|
27
|
+
rb_define_method(c_parsley, "parse", _parse, 1);
|
28
|
+
}
|
29
|
+
|
30
|
+
VALUE
|
31
|
+
_new(VALUE self, VALUE parsley, VALUE incl){
|
32
|
+
parsleyPtr ptr = parsley_compile(STR2CSTR(parsley), STR2CSTR(incl));
|
33
|
+
if(ptr->error != NULL) {
|
34
|
+
rb_raise(c_parsley_err, ptr->error);
|
35
|
+
parsley_free(ptr);
|
36
|
+
return Qnil;
|
37
|
+
}
|
38
|
+
|
39
|
+
return Data_Wrap_Struct(c_parsley, 0, parsley_free, ptr);
|
40
|
+
}
|
41
|
+
|
42
|
+
VALUE
|
43
|
+
_rb_set_user_agent(VALUE self, VALUE agent) {
|
44
|
+
parsley_set_user_agent(STR2CSTR(agent));
|
45
|
+
return Qtrue;
|
46
|
+
}
|
47
|
+
|
48
|
+
|
49
|
+
static VALUE
|
50
|
+
rubify_recurse(xmlNodePtr xml) {
|
51
|
+
if(xml == NULL) return NULL;
|
52
|
+
xmlNodePtr child;
|
53
|
+
VALUE obj = Qnil;
|
54
|
+
|
55
|
+
switch(xml->type) {
|
56
|
+
case XML_ELEMENT_NODE:
|
57
|
+
child = xml->children;
|
58
|
+
if(xml->ns == NULL) {
|
59
|
+
child = xml;
|
60
|
+
obj = rb_hash_new();
|
61
|
+
while(child != NULL) {
|
62
|
+
rb_hash_aset(obj, rb_str_new2(child->name), rubify_recurse(child->children));
|
63
|
+
child = child->next;
|
64
|
+
}
|
65
|
+
} else if(!strcmp(xml->ns->prefix, "parsley")) {
|
66
|
+
if(!strcmp(xml->name, "groups")) {
|
67
|
+
obj = rb_ary_new();
|
68
|
+
while(child != NULL) {
|
69
|
+
rb_ary_push(obj, rubify_recurse(child->children));
|
70
|
+
child = child->next;
|
71
|
+
}
|
72
|
+
} else if(!strcmp(xml->name, "group")) {
|
73
|
+
// Implicitly handled by parsley:groups handler
|
74
|
+
}
|
75
|
+
}
|
76
|
+
break;
|
77
|
+
case XML_TEXT_NODE:
|
78
|
+
obj = rb_str_new2(xml->content);
|
79
|
+
break;
|
80
|
+
}
|
81
|
+
// inspect(obj);
|
82
|
+
return obj;
|
83
|
+
}
|
84
|
+
|
85
|
+
static VALUE
|
86
|
+
_parse_doc(parsedParsleyPtr ptr, VALUE type) {
|
87
|
+
if(ptr->error != NULL || ptr->xml == NULL) {
|
88
|
+
if(ptr->error == NULL) ptr->error = strdup("Unknown parsley error");
|
89
|
+
rb_raise(c_parsley_err, ptr->error);
|
90
|
+
parsed_parsley_free(ptr);
|
91
|
+
return Qnil;
|
92
|
+
}
|
93
|
+
|
94
|
+
VALUE output;
|
95
|
+
if(type == ID2SYM(rb_intern("json"))) {
|
96
|
+
struct json_object *json = xml2json(ptr->xml->children->children);
|
97
|
+
char* str = json_object_to_json_string(json);
|
98
|
+
output = rb_str_new2(str);
|
99
|
+
json_object_put(json);
|
100
|
+
} else if(type == ID2SYM(rb_intern("xml"))) {
|
101
|
+
xmlChar* str;
|
102
|
+
int size;
|
103
|
+
xmlDocDumpMemory(ptr->xml, &str, &size);
|
104
|
+
output = rb_str_new(str, size);
|
105
|
+
} else {
|
106
|
+
output = rubify_recurse(ptr->xml->children->children);
|
107
|
+
if((void*)output == NULL) output = Qnil;
|
108
|
+
}
|
109
|
+
|
110
|
+
parsed_parsley_free(ptr);
|
111
|
+
|
112
|
+
return output;
|
113
|
+
}
|
114
|
+
|
115
|
+
#define OPT(A) rb_hash_aref(options, ID2SYM(rb_intern(A)))
|
116
|
+
#define OPT_BOOL(A) (OPT(A) != Qnil && OPT(A) != Qfalse)
|
117
|
+
#define OPT_MATCH(A, B) (rb_hash_aref(options, ID2SYM(rb_intern(A))) == ID2SYM(rb_intern(B)))
|
118
|
+
|
119
|
+
VALUE _parse(VALUE self, VALUE options){
|
120
|
+
parsleyPtr parsley;
|
121
|
+
Data_Get_Struct(self, parsleyPtr, parsley);
|
122
|
+
int flags = 0;
|
123
|
+
char *base = NULL;
|
124
|
+
if(OPT_MATCH("input", "html")) flags |= PARSLEY_OPTIONS_HTML;
|
125
|
+
if(OPT_BOOL("prune")) flags |= PARSLEY_OPTIONS_PRUNE;
|
126
|
+
if(OPT_BOOL("collate")) flags |= PARSLEY_OPTIONS_COLLATE;
|
127
|
+
if(OPT_BOOL("allow_net")) flags |= PARSLEY_OPTIONS_ALLOW_NET;
|
128
|
+
if(OPT_BOOL("allow_local")) flags |= PARSLEY_OPTIONS_ALLOW_LOCAL;
|
129
|
+
if(OPT_BOOL("sgwrap")) flags |= PARSLEY_OPTIONS_SGWRAP;
|
130
|
+
if(OPT_BOOL("has_base")) base = STR2CSTR(OPT("base"));
|
131
|
+
|
132
|
+
// printf("prune: %d\nallow_net: %d\nallow_local: %d\nhas_base: %d\nflags: %d\n", OPT_BOOL("prune"), OPT_BOOL("allow_net"), OPT_BOOL("allow_local"), OPT_BOOL("has_base"), flags);
|
133
|
+
|
134
|
+
if(OPT_BOOL("is_file")) {
|
135
|
+
return _parse_doc(parsley_parse_file(parsley, STR2CSTR(OPT("file")), flags), OPT("output"));
|
136
|
+
} else {
|
137
|
+
char * str = STR2CSTR(OPT("string"));
|
138
|
+
return _parse_doc(parsley_parse_string(parsley, str, strlen(str), base, flags), OPT("output"));
|
139
|
+
}
|
140
|
+
}
|
data/ext/extconf.rb
ADDED
data/lib/parsley.rb
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../ext/cparsley"
|
2
|
+
require "rubygems"
|
3
|
+
require "json"
|
4
|
+
require "thread"
|
5
|
+
|
6
|
+
# Ruby front end for the native CParsley binding: compiles a parselet once
# and then applies it to files or strings.
class Parsley

  # Guards CParsley.new calls (presumably because native compilation is not
  # thread-safe). Created eagerly while the class body is evaluated:
  # creating it lazily inside #initialize let two threads race and end up
  # holding different mutexes.
  @@mutex ||= Mutex.new

  # Remembers the user agent and hands its string form to the native library.
  def self.user_agent=(agent)
    @user_agent = agent
    CParsley.set_user_agent(agent.to_s)
  end

  # The user agent last assigned through user_agent=, or nil.
  def self.user_agent
    @user_agent
  end

  # parsley -- the parselet, either as a String or as a Hash; a Hash is
  #            stringified recursively and serialized to JSON first
  # incl    -- passed through unchanged as the second argument of CParsley.new
  def initialize(parsley, incl = "")
    if(parsley.is_a?(Hash))
      parsley = recursive_stringify(parsley).to_json
    end
    @@mutex.synchronize do
      @parsley = CParsley.new(parsley, incl)
    end
  end

  # Valid options:
  #
  # Requires one of:
  # :file -- the input file path or url
  # :string -- the input string
  #
  # And optionally (default is the first listed value):
  # :input => [:html, :xml]
  # :output => [:ruby, :json, :xml]
  # :prune => [true, false]
  # :sgwrap => [false, true]
  # :collate => [true, false]
  # :base => "http://some/base/href"
  # :allow_net => [true, false]
  # :allow_local => [true, false]
  #
  # Raises ParsleyError when neither :file nor :string is given.
  # The hash passed in is left untouched; normalization happens on a copy.
  def parse(options = {})
    unless options[:file] || options[:string]
      raise ParsleyError, "must specify what to parse"
    end

    # Work on a shallow copy so the caller's hash is never modified.
    opts = options.dup

    opts[:sgwrap]   = !!opts[:sgwrap]
    opts[:is_file]  = !!opts[:file]
    opts[:has_base] = !!opts[:base]

    # The native side expects strings for these keys.
    opts[:base]   = opts[:base].to_s
    opts[:file]   = opts[:file].to_s
    opts[:string] = opts[:string].to_s

    opts[:input]  ||= :html
    opts[:output] ||= :ruby

    # These flags default to true when absent; otherwise they are coerced to
    # a strict boolean.
    [:collate, :prune, :allow_net, :allow_local].each do |flag|
      opts[flag] = opts.has_key?(flag) ? !!opts[flag] : true
    end

    @parsley.parse(opts)
  end

  private

  # Returns a copy of obj in which every Hash key and every leaf value has
  # been converted with to_s; Hashes and Arrays are walked recursively.
  def recursive_stringify(obj)
    case obj
    when Hash
      obj.inject({}) do |memo, (k, v)|
        memo[k.to_s] = recursive_stringify(v)
        memo
      end
    when Array
      obj.map { |e| recursive_stringify(e) }
    else
      obj.to_s
    end
  end

end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require File.dirname(__FILE__) + "/../lib/parsley"
|
3
|
+
|
4
|
+
# Tests for the Parsley binding. Only the segfault regression is active; the
# remaining cases are kept commented out, exactly as they were.
class TestParsley < Test::Unit::TestCase
  # Absolute paths of the fixtures stored next to this file.
  def setup
    here = File.dirname(__FILE__)
    @page = File.expand_path(here + "/yelp.html")
    @home = File.expand_path(here + "/yelp-home.html")
    @let = File.expand_path(here + "/yelp-home.let")
  end

  # A parselet whose optional keys ("name?") match nothing must still parse
  # and simply leave those keys out of the result.
  def test_segfault_regression
    simple_html = <<-HTML
<html>
<body>
<h1 class="iCIMS_Header_JobTitle">CEO</h1>
</body>
</html>
    HTML

    job_fields = {
      'title' => ".iCIMS_Header_JobTitle",
      'description?' => "blah",
      'location?' => "blah",
      'experience?' => "blah",
      'education?' => "blah"
    }
    struct = { 'jobs' => [job_fields] }

    parselet = Parsley.new(struct)
    result = parselet.parse(:string => simple_html)

    assert_equal "CEO", result['jobs'].first['title']
    assert result['jobs'].first['description'].nil?
  end
  #
  # def test_yelp
  #   @parsley = Parsley.new(File.read(@let))
  #   out = @parsley.parse(:file => @home)
  #   assert_equal "/c/sf/shopping", out["categories"][0]["href"]
  # end
  #
  # def test_parsley_should_raise_if_value_syntax_error
  #   assert_raises(ParsleyError) do
  #     Parsley.new({"foo" => nil})
  #   end
  #
  #   assert_raises(ParsleyError) do
  #     Parsley.new({"foo" => ""})
  #   end
  #
  #   assert_raises(ParsleyError) do
  #     Parsley.new({"foo" => "<<<<<<<<<<<"})
  #   end
  # end
  #
  # def test_yelp_xml
  #   @parsley = Parsley.new(File.read(@let))
  #   out = @parsley.parse(:file => @home, :output => :xml)
  # end
  #
  # def test_broken
  #   @parsley = Parsley.new("hi" => "no-ns:match(h1)")
  #   assert_raises(ParsleyError) {
  #     @parsley.parse(:file => @page)
  #   }
  # end
  #
  # def test_simple
  #   @parsley = Parsley.new("hi" => "h1")
  #   assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:file => @page))
  # end
  #
  # def test_simple_string
  #   @parsley = Parsley.new("hi" => "h1")
  #   assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:string => "<html><body><h1>Nick's Crispy Tacos</h1></body></html>"))
  # end
  #
  # def test_xml
  #   @parsley = Parsley.new("hi" => "h1")
  #   xml = "<?xml version=\"1.0\"?>\n<parsley:root xmlns:parsley=\"http://parselets.com/json\"><hi position=\"63\">Nick's Crispy Tacos</hi></parsley:root>\n"
  #   assert_equal(xml, @parsley.parse(:file => @page, :output => :xml))
  # end
  #
  # def test_sgwrap
  #   @parsley = Parsley.new("hi" => "p sg_wrap")
  #   html = "<p><b>hi</b>world</p>"
  #   assert_equal({"hi" => "world"}, @parsley.parse(:string => html, :sgwrap => true))
  # end
  #
  # def test_sgwrap_off
  #   @parsley = Parsley.new("hi" => "p sg_wrap")
  #   html = "<p><b>hi</b>world</p>"
  #   assert_raises(ParsleyError) do
  #     @parsley.parse(:string => html, :sgwrap => false)
  #   end
  # end
  #
  #
  # def test_json
  #   @parsley = Parsley.new("hi" => "h1")
  #   assert_equal('{ "hi": "Nick\'s Crispy Tacos" }', @parsley.parse(:file => @page, :output => :json))
  # end
  #
  # def test_rescuable_file_error
  #   @parsley = Parsley.new("hi" => "h1")
  #   @nonexistant_file = File.dirname(__FILE__) + "/../fixtures/yelp.html"
  #   assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:file => @nonexistant_file)) rescue nil
  # end
  #
  # def test_array_string
  #   @parsley = Parsley.new({"foo" => ["li"]})
  #   out = @parsley.parse(:file => @page)
  #   assert_kind_of Hash, out
  #   assert_kind_of Array, out["foo"], out.inspect
  #   assert out["foo"].length > 1
  # end
end
|