parsley-ruby 0.0.0 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +26 -0
- data/README +32 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/ext/cparsley.c +140 -0
- data/ext/extconf.rb +69 -0
- data/lib/parsley.rb +84 -0
- data/parsley-ruby.gemspec +56 -0
- data/test/test_parsley.rb +116 -0
- data/test/yelp-benchmark.rb +53 -0
- data/test/yelp-home.html +1004 -0
- data/test/yelp-home.let +6 -0
- data/test/yelp.html +2329 -0
- metadata +39 -17
data/.gitignore
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
.libs/
|
2
|
+
*.o
|
3
|
+
*.lo
|
4
|
+
dexterc
|
5
|
+
dexter
|
6
|
+
parsleyc
|
7
|
+
parsley
|
8
|
+
.deps/
|
9
|
+
Makefile
|
10
|
+
y.tab.c
|
11
|
+
autom4te.cache/
|
12
|
+
autoscan.log
|
13
|
+
config.log
|
14
|
+
configure.scan
|
15
|
+
parser.c
|
16
|
+
scanner.c
|
17
|
+
libparsley.la
|
18
|
+
parser.h
|
19
|
+
test.log
|
20
|
+
parsley*.gem
|
21
|
+
ext/cparsley.bundle
|
22
|
+
ext/cparsley.so
|
23
|
+
ext/Makefile
|
24
|
+
ext/conftest.dSYM/
|
25
|
+
work
|
26
|
+
ext/mkmf.log
|
data/README
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
ABOUT
|
2
|
+
|
3
|
+
Ruby bindings for Parsley.
|
4
|
+
|
5
|
+
INSTALLATION
|
6
|
+
|
7
|
+
= Get Parsley and Dependencies =
|
8
|
+
|
9
|
+
Download Parsley from http://github.com/fizx/parsley/tree/master following the installation directions located at http://github.com/fizx/parsley/blob/master/INSTALL
|
10
|
+
|
11
|
+
= Install parsley-ruby =
|
12
|
+
|
13
|
+
From source:
|
14
|
+
sudo rake install
|
15
|
+
|
16
|
+
From GitHub: DEPRECATED!
|
17
|
+
|
18
|
+
From GemCutter
|
19
|
+
|
20
|
+
Run the following if you haven't already:
|
21
|
+
gem sources -a http://gemcutter.org
|
22
|
+
Install the gem:
|
23
|
+
sudo gem install parsley-ruby
|
24
|
+
|
25
|
+
PARSELETS.COM INTEGRATION
|
26
|
+
|
27
|
+
We also recommend installing the free online_parselets rubygem in order to use other people's parselets and to share your own:
|
28
|
+
Run the following if you haven't already:
|
29
|
+
gem sources -a http://gems.github.com
|
30
|
+
Install the gem:
|
31
|
+
sudo gem install iterationlabs-online_parslets
|
32
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
# Rake tasks for the parsley-ruby gem: packaging (Jeweler), unit tests,
# coverage (RCov) and RDoc generation.
require 'rubygems'
require 'rake'

# Packaging tasks. Jeweler is optional: when it cannot be loaded we only
# print a hint, and the remaining tasks below are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "parsley-ruby"
    gem.summary = "Ruby binding for parsley"
    gem.description = "XML/HTML Parser"
    gem.email = "kyle@kylemaxwell.com"
    gem.homepage = "http://github.com/fizx/parsley-ruby"
    gem.authors = ["Kyle Maxwell"]
    gem.add_dependency("json", ["> 0.0.0"])
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  # Test files in this gem are named test_*.rb (test/test_parsley.rb); a
  # '*_test.rb' glob matches none of them and silently runs zero tests.
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage task. RCov is optional; without it `rake rcov` aborts with
# installation instructions instead of failing with a LoadError.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# NOTE(review): :check_dependencies is not defined in this file; presumably
# it is provided by Jeweler::Tasks -- confirm `rake test` works without Jeweler.
task :test => :check_dependencies

task :default => :test

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip the trailing newline so it does not end up inside the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "parsley-ruby #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.4.1
|
data/ext/cparsley.c
ADDED
@@ -0,0 +1,140 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <libxslt/xslt.h>
|
4
|
+
#include <libexslt/exslt.h>
|
5
|
+
#include <libxslt/xsltInternals.h>
|
6
|
+
#include <libxslt/transform.h>
|
7
|
+
#include <libxml/parser.h>
|
8
|
+
#include <libxml/HTMLparser.h>
|
9
|
+
#include <libxml/HTMLtree.h>
|
10
|
+
#include <libxml/xmlwriter.h>
|
11
|
+
#include <parsley.h>
|
12
|
+
#include <json/json.h>
|
13
|
+
#include <xml2json.h>
|
14
|
+
|
15
|
+
VALUE _new(VALUE, VALUE, VALUE);
|
16
|
+
VALUE _parse(VALUE, VALUE);
|
17
|
+
VALUE _rb_set_user_agent(VALUE self, VALUE agent);
|
18
|
+
VALUE c_parsley_err;
|
19
|
+
VALUE c_parsley;
|
20
|
+
|
21
|
+
void Init_cparsley()
|
22
|
+
{
|
23
|
+
c_parsley = rb_define_class("CParsley", rb_cObject);
|
24
|
+
c_parsley_err = rb_define_class("ParsleyError", rb_eRuntimeError);
|
25
|
+
rb_define_singleton_method(c_parsley, "new", _new, 2);
|
26
|
+
rb_define_singleton_method(c_parsley, "set_user_agent", _rb_set_user_agent, 1);
|
27
|
+
rb_define_method(c_parsley, "parse", _parse, 1);
|
28
|
+
}
|
29
|
+
|
30
|
+
VALUE
|
31
|
+
_new(VALUE self, VALUE parsley, VALUE incl){
|
32
|
+
parsleyPtr ptr = parsley_compile(STR2CSTR(parsley), STR2CSTR(incl));
|
33
|
+
if(ptr->error != NULL) {
|
34
|
+
rb_raise(c_parsley_err, ptr->error);
|
35
|
+
parsley_free(ptr);
|
36
|
+
return Qnil;
|
37
|
+
}
|
38
|
+
|
39
|
+
return Data_Wrap_Struct(c_parsley, 0, parsley_free, ptr);
|
40
|
+
}
|
41
|
+
|
42
|
+
VALUE
|
43
|
+
_rb_set_user_agent(VALUE self, VALUE agent) {
|
44
|
+
parsley_set_user_agent(STR2CSTR(agent));
|
45
|
+
return Qtrue;
|
46
|
+
}
|
47
|
+
|
48
|
+
|
49
|
+
static VALUE
|
50
|
+
rubify_recurse(xmlNodePtr xml) {
|
51
|
+
if(xml == NULL) return NULL;
|
52
|
+
xmlNodePtr child;
|
53
|
+
VALUE obj = Qnil;
|
54
|
+
|
55
|
+
switch(xml->type) {
|
56
|
+
case XML_ELEMENT_NODE:
|
57
|
+
child = xml->children;
|
58
|
+
if(xml->ns == NULL) {
|
59
|
+
child = xml;
|
60
|
+
obj = rb_hash_new();
|
61
|
+
while(child != NULL) {
|
62
|
+
rb_hash_aset(obj, rb_str_new2(child->name), rubify_recurse(child->children));
|
63
|
+
child = child->next;
|
64
|
+
}
|
65
|
+
} else if(!strcmp(xml->ns->prefix, "parsley")) {
|
66
|
+
if(!strcmp(xml->name, "groups")) {
|
67
|
+
obj = rb_ary_new();
|
68
|
+
while(child != NULL) {
|
69
|
+
rb_ary_push(obj, rubify_recurse(child->children));
|
70
|
+
child = child->next;
|
71
|
+
}
|
72
|
+
} else if(!strcmp(xml->name, "group")) {
|
73
|
+
// Implicitly handled by parsley:groups handler
|
74
|
+
}
|
75
|
+
}
|
76
|
+
break;
|
77
|
+
case XML_TEXT_NODE:
|
78
|
+
obj = rb_str_new2(xml->content);
|
79
|
+
break;
|
80
|
+
}
|
81
|
+
// inspect(obj);
|
82
|
+
return obj;
|
83
|
+
}
|
84
|
+
|
85
|
+
static VALUE
|
86
|
+
_parse_doc(parsedParsleyPtr ptr, VALUE type) {
|
87
|
+
if(ptr->error != NULL || ptr->xml == NULL) {
|
88
|
+
if(ptr->error == NULL) ptr->error = strdup("Unknown parsley error");
|
89
|
+
rb_raise(c_parsley_err, ptr->error);
|
90
|
+
parsed_parsley_free(ptr);
|
91
|
+
return Qnil;
|
92
|
+
}
|
93
|
+
|
94
|
+
VALUE output;
|
95
|
+
if(type == ID2SYM(rb_intern("json"))) {
|
96
|
+
struct json_object *json = xml2json(ptr->xml->children->children);
|
97
|
+
char* str = json_object_to_json_string(json);
|
98
|
+
output = rb_str_new2(str);
|
99
|
+
json_object_put(json);
|
100
|
+
} else if(type == ID2SYM(rb_intern("xml"))) {
|
101
|
+
xmlChar* str;
|
102
|
+
int size;
|
103
|
+
xmlDocDumpMemory(ptr->xml, &str, &size);
|
104
|
+
output = rb_str_new(str, size);
|
105
|
+
} else {
|
106
|
+
output = rubify_recurse(ptr->xml->children->children);
|
107
|
+
if((void*)output == NULL) output = Qnil;
|
108
|
+
}
|
109
|
+
|
110
|
+
parsed_parsley_free(ptr);
|
111
|
+
|
112
|
+
return output;
|
113
|
+
}
|
114
|
+
|
115
|
+
#define OPT(A) rb_hash_aref(options, ID2SYM(rb_intern(A)))
|
116
|
+
#define OPT_BOOL(A) (OPT(A) != Qnil && OPT(A) != Qfalse)
|
117
|
+
#define OPT_MATCH(A, B) (rb_hash_aref(options, ID2SYM(rb_intern(A))) == ID2SYM(rb_intern(B)))
|
118
|
+
|
119
|
+
/*
 * CParsley#parse(options)
 *
 * options must be a Hash with Symbol keys. Keys read here:
 *   :input        == :html selects PARSLEY_OPTIONS_HTML
 *   :prune, :collate, :allow_net, :allow_local, :sgwrap
 *                 truthy values set the matching PARSLEY_OPTIONS_* flag
 *   :has_base     when truthy, :base is passed as the base for string input
 *   :is_file      when truthy, :file is parsed; otherwise :string is parsed
 *   :output       forwarded to _parse_doc to choose the result format
 *
 * Raises TypeError when options is not a Hash.
 */
VALUE _parse(VALUE self, VALUE options){
  parsleyPtr parsley;
  int flags = 0;
  char *base = NULL;

  /* The OPT* macros call rb_hash_aref() directly, which must only be handed
     a Hash; reject anything else with a TypeError instead of crashing. */
  Check_Type(options, T_HASH);

  Data_Get_Struct(self, parsleyPtr, parsley);

  if(OPT_MATCH("input", "html")) flags |= PARSLEY_OPTIONS_HTML;
  if(OPT_BOOL("prune"))          flags |= PARSLEY_OPTIONS_PRUNE;
  if(OPT_BOOL("collate"))        flags |= PARSLEY_OPTIONS_COLLATE;
  if(OPT_BOOL("allow_net"))      flags |= PARSLEY_OPTIONS_ALLOW_NET;
  if(OPT_BOOL("allow_local"))    flags |= PARSLEY_OPTIONS_ALLOW_LOCAL;
  if(OPT_BOOL("sgwrap"))         flags |= PARSLEY_OPTIONS_SGWRAP;
  if(OPT_BOOL("has_base"))       base = STR2CSTR(OPT("base"));

  if(OPT_BOOL("is_file")) {
    /* base is not passed along on the file path. */
    return _parse_doc(parsley_parse_file(parsley, STR2CSTR(OPT("file")), flags), OPT("output"));
  } else {
    char * str = STR2CSTR(OPT("string"));
    return _parse_doc(parsley_parse_string(parsley, str, strlen(str), base, flags), OPT("output"));
  }
}
|
data/ext/extconf.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
#!/usr/bin/env ruby
# mkmf build script for the cparsley extension. Locates libxml2, libxslt,
# json-c and libparsley (headers and libraries) and writes the Makefile.

# Keep an ARCHFLAGS value supplied by the user; otherwise fall back to the
# ppc/i386 guess derived from `uname -p`.
ENV["ARCHFLAGS"] ||= "-arch #{`uname -p` =~ /powerpc/ ? 'ppc' : 'i386'}"

require 'mkmf'

ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
# RbConfig is the supported name for the build configuration; the bare
# `Config` constant is a deprecated alias.
LIBDIR = RbConfig::CONFIG['libdir']
INCLUDEDIR = RbConfig::CONFIG['includedir']

mingw = RbConfig::CONFIG['target_os'] == 'mingw32'

$CFLAGS << " #{ENV["CFLAGS"]}"
if mingw
  $CFLAGS << " -DXP_WIN -DXP_WIN32"
else
  $CFLAGS << " -g -DXP_UNIX"
end

$CFLAGS << " -O3 -Wall -Wextra -Wcast-qual -Wwrite-strings -Wconversion -Wmissing-noreturn -Winline"

# Libraries: the mingw build looks in directories under ROOT/cross.
if mingw
  find_library('xml2', 'xmlParseDoc',
               File.join(ROOT, 'cross', 'libxml2-2.7.2.win32', 'bin'))
  find_library('xslt', 'xsltParseStylesheetDoc',
               File.join(ROOT, 'cross', 'libxslt-1.1.24.win32', 'bin'))
else
  find_library('xml2', 'xmlParseDoc', LIBDIR)
  find_library('xslt', 'xsltParseStylesheetDoc', LIBDIR)
end

# Headers.
if mingw
  header = File.join(ROOT, 'cross', 'libxml2-2.7.2.win32', 'include')
  unless find_header('libxml/xmlversion.h', header)
    abort "need libxml"
  end

  header = File.join(ROOT, 'cross', 'libxslt-1.1.24.win32', 'include')
  unless find_header('libxslt/libxslt.h', header)
    abort "need libxslt"
  end

  header = File.join(ROOT, 'cross', 'iconv-1.9.2.win32', 'include')
  unless find_header('iconv.h', header)
    abort "need iconv"
  end
else
  unless find_header('libxml/xmlversion.h',
                     File.join(INCLUDEDIR, "libxml2"), '/usr/include/libxml2')
    abort "need libxml"
  end
  unless find_header('libxslt/xslt.h', INCLUDEDIR, '/usr/include')
    abort "need libxslt"
  end

  # The result is not used anywhere in this script.
  # NOTE(review): presumably kept only so the probe shows up in mkmf.log -- confirm.
  try_constant('LIBXML_VERSION', 'libxml/xmlversion.h')
end

myincl = %w[/usr/local/include /opt/local/include /usr/include]
mylib = %w[/usr/local/lib /opt/local/lib /usr/lib]

find_header('ruby.h', INCLUDEDIR, *myincl) or abort "need ruby.h"

find_header('json/json.h', INCLUDEDIR, *myincl) or abort "need json/json.h"
find_library('json', 'json_object_new_string', LIBDIR, *mylib) or abort "need libjson"

find_header('parsley.h', INCLUDEDIR, *myincl) or abort "need parsley.h"
find_library('parsley', 'parsley_compile', LIBDIR, *mylib) or abort "need libparsley"

create_makefile('cparsley')
|
data/lib/parsley.rb
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../ext/cparsley"
|
2
|
+
require "rubygems"
|
3
|
+
require "json"
|
4
|
+
require "thread"
|
5
|
+
|
6
|
+
# Ruby front end for the CParsley native extension. An instance wraps one
# CParsley object built from a parselet and applies it to documents via #parse.
class Parsley
  # Guards calls to CParsley.new. Created once when the class body is
  # evaluated, so concurrent first-time callers can never end up holding two
  # different locks (which a lazy `||=` inside #initialize would allow).
  COMPILE_MUTEX = Mutex.new

  # Remembers +agent+ and passes its string form to CParsley.set_user_agent.
  def self.user_agent=(agent)
    @user_agent = agent
    CParsley.set_user_agent(agent.to_s)
  end

  # Returns the value last assigned through Parsley.user_agent=.
  def self.user_agent
    @user_agent
  end

  # parsley -- a parselet, either as a String or as a Hash. A Hash has all of
  #            its keys and leaf values converted to strings and is then
  #            serialized to JSON.
  # incl    -- second argument handed to CParsley.new (defaults to "").
  def initialize(parsley, incl = "")
    parsley = recursive_stringify(parsley).to_json if parsley.is_a?(Hash)
    COMPILE_MUTEX.synchronize do
      @parsley = CParsley.new(parsley, incl)
    end
  end

  # Valid options:
  #
  # Requires one of:
  # :file -- the input file path or url
  # :string -- the input string
  #
  # And optionally (default is the first listed value):
  # :input => [:html, :xml]
  # :output => [:ruby, :json, :xml]
  # :prune => [true, false]
  # :sgwrap => [false, true]
  # :collate => [true, false]
  # :base => "http://some/base/href"
  # :allow_net => [true, false]
  # :allow_local => [true, false]
  #
  # Raises ParsleyError when neither :file nor :string is given.
  # The hash passed in by the caller is left unmodified.
  def parse(options = {})
    options[:file] || options[:string] || raise(ParsleyError, "must specify what to parse")

    # Normalise a copy so the caller's hash is not polluted with the
    # bookkeeping keys (:is_file, :has_base) and coerced values.
    opts = options.dup

    opts[:sgwrap]   = !!opts[:sgwrap]
    opts[:is_file]  = !!opts[:file]
    opts[:has_base] = !!opts[:base]

    opts[:base]   = opts[:base].to_s
    opts[:file]   = opts[:file].to_s
    opts[:string] = opts[:string].to_s

    opts[:input]  ||= :html
    opts[:output] ||= :ruby

    # These flags default to true when absent and are otherwise coerced to
    # strict booleans.
    [:collate, :prune, :allow_net, :allow_local].each do |flag|
      opts[flag] = opts.has_key?(flag) ? !!opts[flag] : true
    end

    @parsley.parse(opts)
  end

  private

  # Returns a copy of +obj+ in which every Hash key and every non-collection
  # value has been converted with #to_s; Hashes and Arrays are walked
  # recursively.
  def recursive_stringify(obj)
    case obj
    when Hash
      obj.inject({}) do |memo, (k, v)|
        memo[k.to_s] = recursive_stringify(v)
        memo
      end
    when Array
      obj.map { |e| recursive_stringify(e) }
    else
      obj.to_s
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{parsley-ruby}
|
8
|
+
s.version = "0.4.1"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Kyle Maxwell"]
|
12
|
+
s.date = %q{2009-11-01}
|
13
|
+
s.description = %q{XML/HTML Parser}
|
14
|
+
s.email = %q{kyle@kylemaxwell.com}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"README"
|
17
|
+
]
|
18
|
+
s.files = [
|
19
|
+
".gitignore",
|
20
|
+
"README",
|
21
|
+
"Rakefile",
|
22
|
+
"VERSION",
|
23
|
+
"ext/cparsley.c",
|
24
|
+
"ext/extconf.rb",
|
25
|
+
"lib/parsley.rb",
|
26
|
+
"parsley-ruby.gemspec",
|
27
|
+
"test/test_parsley.rb",
|
28
|
+
"test/yelp-benchmark.rb",
|
29
|
+
"test/yelp-home.html",
|
30
|
+
"test/yelp-home.let",
|
31
|
+
"test/yelp.html"
|
32
|
+
]
|
33
|
+
s.homepage = %q{http://github.com/fizx/parsley-ruby}
|
34
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
35
|
+
s.require_paths = ["lib"]
|
36
|
+
s.rubygems_version = %q{1.3.5}
|
37
|
+
s.summary = %q{Ruby binding for parsley}
|
38
|
+
s.test_files = [
|
39
|
+
"test/test_parsley.rb",
|
40
|
+
"test/yelp-benchmark.rb"
|
41
|
+
]
|
42
|
+
|
43
|
+
if s.respond_to? :specification_version then
|
44
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
45
|
+
s.specification_version = 3
|
46
|
+
|
47
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
48
|
+
s.add_runtime_dependency(%q<json>, ["> 0.0.0"])
|
49
|
+
else
|
50
|
+
s.add_dependency(%q<json>, ["> 0.0.0"])
|
51
|
+
end
|
52
|
+
else
|
53
|
+
s.add_dependency(%q<json>, ["> 0.0.0"])
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require File.dirname(__FILE__) + "/../lib/parsley"
|
3
|
+
|
4
|
+
class TestParsley < Test::Unit::TestCase
|
5
|
+
# Resolves absolute paths to the fixture files that sit next to this test file.
def setup
  fixture_dir = File.dirname(__FILE__)
  @page, @home, @let = ["/yelp.html", "/yelp-home.html", "/yelp-home.let"].map do |name|
    File.expand_path(fixture_dir + name)
  end
end
|
10
|
+
|
11
|
+
# Parses a tiny document with a parselet whose optional keys (the ones ending
# in "?") match nothing, then checks the value that was found and that an
# unmatched optional key reads back as nil.
def test_segfault_regression
  simple_html = <<-HTML
    <html>
    <body>
    <h1 class="iCIMS_Header_JobTitle">CEO</h1>
    </body>
    </html>
  HTML

  struct = {
    'jobs' => [{
      'title' => ".iCIMS_Header_JobTitle",
      'description?' => "blah",
      'location?' => "blah",
      'experience?' => "blah",
      'education?' => "blah"
    }]
  }
  parselet = Parsley.new(struct)
  result = parselet.parse(:string => simple_html)
  assert_equal "CEO", result['jobs'].first['title']
  # assert_nil reports the offending value on failure, unlike `assert x.nil?`.
  assert_nil result['jobs'].first['description']
end
|
34
|
+
#
|
35
|
+
# def test_yelp
|
36
|
+
# @parsley = Parsley.new(File.read(@let))
|
37
|
+
# out = @parsley.parse(:file => @home)
|
38
|
+
# assert_equal "/c/sf/shopping", out["categories"][0]["href"]
|
39
|
+
# end
|
40
|
+
#
|
41
|
+
# def test_parsley_should_raise_if_value_syntax_error
|
42
|
+
# assert_raises(ParsleyError) do
|
43
|
+
# Parsley.new({"foo" => nil})
|
44
|
+
# end
|
45
|
+
#
|
46
|
+
# assert_raises(ParsleyError) do
|
47
|
+
# Parsley.new({"foo" => ""})
|
48
|
+
# end
|
49
|
+
#
|
50
|
+
# assert_raises(ParsleyError) do
|
51
|
+
# Parsley.new({"foo" => "<<<<<<<<<<<"})
|
52
|
+
# end
|
53
|
+
# end
|
54
|
+
#
|
55
|
+
# def test_yelp_xml
|
56
|
+
# @parsley = Parsley.new(File.read(@let))
|
57
|
+
# out = @parsley.parse(:file => @home, :output => :xml)
|
58
|
+
# end
|
59
|
+
#
|
60
|
+
# def test_broken
|
61
|
+
# @parsley = Parsley.new("hi" => "no-ns:match(h1)")
|
62
|
+
# assert_raises(ParsleyError) {
|
63
|
+
# @parsley.parse(:file => @page)
|
64
|
+
# }
|
65
|
+
# end
|
66
|
+
#
|
67
|
+
# def test_simple
|
68
|
+
# @parsley = Parsley.new("hi" => "h1")
|
69
|
+
# assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:file => @page))
|
70
|
+
# end
|
71
|
+
#
|
72
|
+
# def test_simple_string
|
73
|
+
# @parsley = Parsley.new("hi" => "h1")
|
74
|
+
# assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:string => "<html><body><h1>Nick's Crispy Tacos</h1></body></html>"))
|
75
|
+
# end
|
76
|
+
#
|
77
|
+
# def test_xml
|
78
|
+
# @parsley = Parsley.new("hi" => "h1")
|
79
|
+
# xml = "<?xml version=\"1.0\"?>\n<parsley:root xmlns:parsley=\"http://parselets.com/json\"><hi position=\"63\">Nick's Crispy Tacos</hi></parsley:root>\n"
|
80
|
+
# assert_equal(xml, @parsley.parse(:file => @page, :output => :xml))
|
81
|
+
# end
|
82
|
+
#
|
83
|
+
# def test_sgwrap
|
84
|
+
# @parsley = Parsley.new("hi" => "p sg_wrap")
|
85
|
+
# html = "<p><b>hi</b>world</p>"
|
86
|
+
# assert_equal({"hi" => "world"}, @parsley.parse(:string => html, :sgwrap => true))
|
87
|
+
# end
|
88
|
+
#
|
89
|
+
# def test_sgwrap_off
|
90
|
+
# @parsley = Parsley.new("hi" => "p sg_wrap")
|
91
|
+
# html = "<p><b>hi</b>world</p>"
|
92
|
+
# assert_raises(ParsleyError) do
|
93
|
+
# @parsley.parse(:string => html, :sgwrap => false)
|
94
|
+
# end
|
95
|
+
# end
|
96
|
+
#
|
97
|
+
#
|
98
|
+
# def test_json
|
99
|
+
# @parsley = Parsley.new("hi" => "h1")
|
100
|
+
# assert_equal('{ "hi": "Nick\'s Crispy Tacos" }', @parsley.parse(:file => @page, :output => :json))
|
101
|
+
# end
|
102
|
+
#
|
103
|
+
# def test_rescuable_file_error
|
104
|
+
# @parsley = Parsley.new("hi" => "h1")
|
105
|
+
# @nonexistant_file = File.dirname(__FILE__) + "/../fixtures/yelp.html"
|
106
|
+
# assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:file => @nonexistant_file)) rescue nil
|
107
|
+
# end
|
108
|
+
#
|
109
|
+
# def test_array_string
|
110
|
+
# @parsley = Parsley.new({"foo" => ["li"]})
|
111
|
+
# out = @parsley.parse(:file => @page)
|
112
|
+
# assert_kind_of Hash, out
|
113
|
+
# assert_kind_of Array, out["foo"], out.inspect
|
114
|
+
# assert out["foo"].length > 1
|
115
|
+
# end
|
116
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "nokogiri"
|
3
|
+
require "hpricot"
|
4
|
+
require "parsley"
|
5
|
+
require "benchmark"
|
6
|
+
require "pp"
|
7
|
+
|
8
|
+
YELP_HTML = File.dirname(__FILE__) + "/yelp.html"
|
9
|
+
|
10
|
+
# Benchmark subject: parse the Yelp fixture with Nokogiri's Hpricot entry
# point and run the shared extraction in #parse.
def noko
  # Block form closes the file handle once parsing is done.
  File.open(YELP_HTML) { |html| parse Nokogiri.Hpricot(html) }
end
|
13
|
+
|
14
|
+
# Benchmark subject: parse the Yelp fixture with Hpricot and run the shared
# extraction in #parse.
def hpri
  # Block form closes the file handle once parsing is done.
  File.open(YELP_HTML) { |html| parse Hpricot(html) }
end
|
17
|
+
|
18
|
+
# Extracts the business record from a parsed document.
#
# doc -- any object that answers `/` with a selector string and returns a
#        list of nodes responding to #inner_text (and to `/` themselves).
#
# Returns a Hash with "name", "phone", "address" and "reviews", where
# "reviews" is an Array of Hashes with "date", "user_name" and "comment".
def parse(doc)
  out = {}
  out["name"] = (doc / "h1").first.inner_text
  out["phone"] = (doc / "#bizPhone").first.inner_text
  out["address"] = (doc / "address").first.inner_text
  out["reviews"] = (doc / ".nonfavoriteReview").map do |node|
    {
      "date" => (node / ".ieSucks .smaller").first.inner_text,
      "user_name" => (node / ".reviewer_info a").first.inner_text,
      "comment" => (node / ".review_comment").first.inner_text
    }
  end
  # Return the whole record; ending on the assignment above would hand back
  # only the reviews array.
  out
end
|
31
|
+
|
32
|
+
# Benchmark subject: build a parselet describing the same fields that #parse
# extracts, apply it to the Yelp fixture and pretty-print the result.
def pars
  review_fields = {
    "date" => ".ieSucks .smaller",
    "user_name" => ".reviewer_info a",
    "comment" => ".review_comment"
  }
  spec = {
    "name" => "h1",
    "phone" => "#bizPhone",
    "address" => "address",
    "reviews(.nonfavoriteReview)" => [review_fields]
  }
  pp Parsley.new(spec).parse(:file => YELP_HTML)
end
|
47
|
+
|
48
|
+
# Report the time for three runs of each subject, in this order.
Benchmark.bm do |bench|
  [["nokogiri: ", :noko], ["hpricot: ", :hpri], ["parsley: ", :pars]].each do |label, subject|
    bench.report(label) { 3.times { send(subject) } }
  end
end
|
53
|
+
|