hpricot 0.6.164 → 0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +6 -0
- data/Rakefile +31 -33
- data/ext/fast_xs/fast_xs.c +11 -5
- data/ext/hpricot_scan/HpricotScanService.java +10 -6
- data/ext/hpricot_scan/hpricot_css.c +3502 -0
- data/ext/hpricot_scan/hpricot_css.rl +115 -0
- data/ext/hpricot_scan/hpricot_scan.c +1032 -589
- data/ext/hpricot_scan/hpricot_scan.java.rl +5 -1
- data/ext/hpricot_scan/hpricot_scan.rl +493 -50
- data/ext/hpricot_scan/test.rb +1 -2
- data/lib/hpricot/builder.rb +21 -20
- data/lib/hpricot/elements.rb +12 -12
- data/lib/hpricot/htmlinfo.rb +19 -0
- data/lib/hpricot/inspect.rb +27 -31
- data/lib/hpricot/modules.rb +2 -1
- data/lib/hpricot/parse.rb +8 -268
- data/lib/hpricot/tag.rb +65 -99
- data/lib/hpricot/traverse.rb +20 -14
- data/test/files/boingboing.html +1 -1
- data/test/nokogiri-bench.rb +64 -0
- data/test/test_builder.rb +4 -4
- data/test/test_parser.rb +36 -13
- data/test/test_preserved.rb +6 -2
- metadata +51 -51
- data/ext/hpricot_scan/hpricot_gram.c +0 -882
- data/ext/hpricot_scan/hpricot_gram.h +0 -9
data/lib/hpricot/tag.rb
CHANGED
@@ -2,66 +2,55 @@ module Hpricot
|
|
2
2
|
# :stopdoc:
|
3
3
|
|
4
4
|
class Doc
|
5
|
-
attr_accessor :children
|
6
|
-
def initialize(children = [], options = {})
|
7
|
-
@children = children ? children.each { |c| c.parent = self } : []
|
8
|
-
@options = options
|
9
|
-
end
|
10
5
|
def output(out, opts = {})
|
11
|
-
|
6
|
+
children.each do |n|
|
12
7
|
n.output(out, opts)
|
13
|
-
end
|
8
|
+
end if children
|
14
9
|
out
|
15
10
|
end
|
16
11
|
def make(input = nil, &blk)
|
17
|
-
Hpricot.make(input, @options, &blk)
|
12
|
+
Hpricot.make(input, @options, &blk).children
|
18
13
|
end
|
19
14
|
def altered!; end
|
15
|
+
def inspect_tree
|
16
|
+
children.map { |x| x.inspect_tree }.join if children
|
17
|
+
end
|
20
18
|
end
|
21
19
|
|
22
20
|
class BaseEle
|
23
|
-
attr_accessor :raw_string, :parent
|
24
21
|
def html_quote(str)
|
25
22
|
"\"" + str.gsub('"', '\\"') + "\""
|
26
23
|
end
|
27
24
|
def if_output(opts)
|
28
|
-
if opts[:preserve] and not
|
29
|
-
|
25
|
+
if opts[:preserve] and not raw_string.nil?
|
26
|
+
raw_string
|
30
27
|
else
|
31
28
|
yield opts
|
32
29
|
end
|
33
30
|
end
|
34
31
|
def pathname; self.name end
|
35
32
|
def altered!
|
36
|
-
|
37
|
-
end
|
38
|
-
def
|
39
|
-
|
40
|
-
fields.each do |f|
|
41
|
-
define_method("#{f}=") do |v|
|
42
|
-
altered!
|
43
|
-
instance_variable_set("@#{f}", v)
|
44
|
-
end
|
45
|
-
end
|
33
|
+
clear_raw
|
34
|
+
end
|
35
|
+
def inspect_tree(depth = 0)
|
36
|
+
%{#{" " * depth}} + self.class.name.split(/::/).last.downcase + "\n"
|
46
37
|
end
|
47
38
|
end
|
48
39
|
|
49
40
|
class Elem
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
@children = children ? children.each { |c| c.parent = self } : []
|
54
|
-
end
|
55
|
-
def empty?; @children.empty? end
|
56
|
-
[:name, :raw_attributes, :parent, :altered!].each do |m|
|
57
|
-
[m, "#{m}="].each { |m2| define_method(m2) { |*a| [@etag, @stag].inject { |_,t| t.send(m2, *a) if t and t.respond_to?(m2) } } }
|
41
|
+
def initialize tag, attrs = nil, children = nil, etag = nil
|
42
|
+
self.name, self.raw_attributes, self.children, self.etag =
|
43
|
+
tag, attrs, children, etag
|
58
44
|
end
|
45
|
+
def empty?; children.nil? or children.empty? end
|
59
46
|
def attributes
|
60
47
|
if raw_attributes
|
61
48
|
raw_attributes.inject({}) do |hsh, (k, v)|
|
62
49
|
hsh[k] = Hpricot.uxs(v)
|
63
50
|
hsh
|
64
51
|
end
|
52
|
+
else
|
53
|
+
{}
|
65
54
|
end
|
66
55
|
end
|
67
56
|
def to_plain_text
|
@@ -79,151 +68,128 @@ module Hpricot
|
|
79
68
|
end
|
80
69
|
def pathname; self.name end
|
81
70
|
def output(out, opts = {})
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
if @etag
|
88
|
-
@etag.output(out, opts)
|
89
|
-
elsif !opts[:preserve]
|
90
|
-
ETag.new(@stag.name).output(out, opts)
|
71
|
+
out <<
|
72
|
+
if_output(opts) do
|
73
|
+
"<#{name}#{attributes_as_html}" +
|
74
|
+
((empty? and not etag) ? " /" : "") +
|
75
|
+
">"
|
91
76
|
end
|
77
|
+
if children
|
78
|
+
children.each { |n| n.output(out, opts) }
|
79
|
+
end
|
80
|
+
if etag
|
81
|
+
etag.output(out, opts)
|
82
|
+
elsif !opts[:preserve] && !empty?
|
83
|
+
out <<
|
84
|
+
if_output(opts) do
|
85
|
+
"</#{name}>"
|
86
|
+
end
|
92
87
|
end
|
93
88
|
out
|
94
89
|
end
|
95
|
-
end
|
96
|
-
|
97
|
-
class STag < BaseEle
|
98
|
-
def initialize(name, attributes=nil)
|
99
|
-
@name = name.to_s
|
100
|
-
@raw_attributes = attributes || {}
|
101
|
-
end
|
102
|
-
alterable :name, :raw_attributes
|
103
90
|
def attributes_as_html
|
104
|
-
if
|
105
|
-
|
91
|
+
if raw_attributes
|
92
|
+
raw_attributes.map do |aname, aval|
|
106
93
|
" #{aname}" +
|
107
94
|
(aval ? "=#{html_quote aval}" : "")
|
108
95
|
end.join
|
109
96
|
end
|
110
97
|
end
|
111
|
-
def
|
112
|
-
|
113
|
-
|
114
|
-
"<#{@name}#{attributes_as_html}" +
|
115
|
-
(opts[:style] == :empty ? " /" : "") +
|
116
|
-
">"
|
117
|
-
end
|
98
|
+
def inspect_tree(depth = 0)
|
99
|
+
%{#{" " * depth}} + name + "\n" +
|
100
|
+
(children ? children.map { |x| x.inspect_tree(depth + 1) }.join : "")
|
118
101
|
end
|
119
102
|
end
|
120
103
|
|
121
|
-
class ETag
|
122
|
-
def initialize
|
123
|
-
@name = qualified_name.to_s
|
124
|
-
end
|
125
|
-
alterable :name
|
104
|
+
class ETag
|
105
|
+
def initialize name; self.name = name end
|
126
106
|
def output(out, opts = {})
|
127
107
|
out <<
|
128
108
|
if_output(opts) do
|
129
|
-
"</#{
|
109
|
+
"</#{name}>"
|
130
110
|
end
|
131
111
|
end
|
132
112
|
end
|
133
113
|
|
134
|
-
class BogusETag
|
114
|
+
class BogusETag
|
135
115
|
def output(out, opts = {}); out << if_output(opts) { '' }; end
|
136
116
|
end
|
137
117
|
|
138
|
-
class Text
|
139
|
-
def initialize
|
140
|
-
@content = text
|
141
|
-
end
|
142
|
-
alterable :content
|
118
|
+
class Text
|
119
|
+
def initialize content; self.content = content end
|
143
120
|
def pathname; "text()" end
|
144
121
|
def to_s
|
145
|
-
Hpricot.uxs(
|
122
|
+
Hpricot.uxs(content)
|
146
123
|
end
|
147
124
|
alias_method :inner_text, :to_s
|
148
125
|
alias_method :to_plain_text, :to_s
|
126
|
+
def << str; self.content << str end
|
149
127
|
def output(out, opts = {})
|
150
128
|
out <<
|
151
129
|
if_output(opts) do
|
152
|
-
|
130
|
+
content.to_s
|
153
131
|
end
|
154
132
|
end
|
155
133
|
end
|
156
134
|
|
157
|
-
class CData
|
135
|
+
class CData
|
136
|
+
def initialize content; self.content = content end
|
158
137
|
alias_method :to_s, :content
|
159
138
|
alias_method :to_plain_text, :content
|
160
139
|
def output(out, opts = {})
|
161
140
|
out <<
|
162
141
|
if_output(opts) do
|
163
|
-
"<![CDATA[
|
142
|
+
"<![CDATA[#{content}]]>"
|
164
143
|
end
|
165
144
|
end
|
166
145
|
end
|
167
146
|
|
168
|
-
class XMLDecl
|
169
|
-
def initialize(version, encoding, standalone)
|
170
|
-
@version, @encoding, @standalone = version, encoding, standalone
|
171
|
-
end
|
172
|
-
alterable :version, :encoding, :standalone
|
147
|
+
class XMLDecl
|
173
148
|
def pathname; "xmldecl()" end
|
174
149
|
def output(out, opts = {})
|
175
150
|
out <<
|
176
151
|
if_output(opts) do
|
177
|
-
"<?xml version=\"#{
|
178
|
-
(
|
179
|
-
(
|
152
|
+
"<?xml version=\"#{version}\"" +
|
153
|
+
(encoding ? " encoding=\"#{encoding}\"" : "") +
|
154
|
+
(standalone != nil ? " standalone=\"#{standalone ? 'yes' : 'no'}\"" : "") +
|
180
155
|
"?>"
|
181
156
|
end
|
182
157
|
end
|
183
158
|
end
|
184
159
|
|
185
|
-
class DocType
|
186
|
-
def initialize
|
187
|
-
|
160
|
+
class DocType
|
161
|
+
def initialize target, pub, sys
|
162
|
+
self.target, self.public_id, self.system_id = target, pub, sys
|
188
163
|
end
|
189
|
-
alterable :target, :public_id, :system_id
|
190
164
|
def pathname; "doctype()" end
|
191
165
|
def output(out, opts = {})
|
192
166
|
out <<
|
193
167
|
if_output(opts) do
|
194
|
-
"<!DOCTYPE #{
|
195
|
-
(
|
196
|
-
(
|
168
|
+
"<!DOCTYPE #{target} " +
|
169
|
+
(public_id ? "PUBLIC \"#{public_id}\"" : "SYSTEM") +
|
170
|
+
(system_id ? " #{html_quote(system_id)}" : "") + ">"
|
197
171
|
end
|
198
172
|
end
|
199
173
|
end
|
200
174
|
|
201
|
-
class ProcIns
|
202
|
-
def initialize(target, content)
|
203
|
-
@target, @content = target, content
|
204
|
-
end
|
175
|
+
class ProcIns
|
205
176
|
def pathname; "procins()" end
|
206
|
-
alterable :target, :content
|
207
177
|
def output(out, opts = {})
|
208
178
|
out <<
|
209
179
|
if_output(opts) do
|
210
|
-
"<?#{
|
211
|
-
(
|
180
|
+
"<?#{target}" +
|
181
|
+
(content ? " #{content}" : "") +
|
212
182
|
"?>"
|
213
183
|
end
|
214
184
|
end
|
215
185
|
end
|
216
186
|
|
217
|
-
class Comment
|
218
|
-
def initialize(content)
|
219
|
-
@content = content
|
220
|
-
end
|
187
|
+
class Comment
|
221
188
|
def pathname; "comment()" end
|
222
|
-
alterable :content
|
223
189
|
def output(out, opts = {})
|
224
190
|
out <<
|
225
191
|
if_output(opts) do
|
226
|
-
"<!--#{
|
192
|
+
"<!--#{content}-->"
|
227
193
|
end
|
228
194
|
end
|
229
195
|
end
|
data/lib/hpricot/traverse.rb
CHANGED
@@ -26,7 +26,7 @@ module Hpricot
|
|
26
26
|
if parent and parent.respond_to? :make
|
27
27
|
parent.make(input, &blk)
|
28
28
|
else
|
29
|
-
Hpricot.make(input, &blk)
|
29
|
+
Hpricot.make(input, &blk).children
|
30
30
|
end
|
31
31
|
end
|
32
32
|
|
@@ -51,7 +51,7 @@ module Hpricot
|
|
51
51
|
return i if (x.respond_to?(:name) and name == x.name) or
|
52
52
|
(x.text? and name == "text()")
|
53
53
|
i += 1
|
54
|
-
end
|
54
|
+
end if children
|
55
55
|
-1
|
56
56
|
end
|
57
57
|
|
@@ -146,16 +146,20 @@ module Hpricot
|
|
146
146
|
# Builds a string from the text contained in this node. All
|
147
147
|
# HTML elements are removed.
|
148
148
|
def to_plain_text
|
149
|
-
if respond_to?
|
149
|
+
if respond_to?(:children) and children
|
150
150
|
children.map { |x| x.to_plain_text }.join.strip.gsub(/\n{2,}/, "\n\n")
|
151
|
+
else
|
152
|
+
""
|
151
153
|
end
|
152
154
|
end
|
153
155
|
|
154
156
|
# Builds a string from the text contained in this node. All
|
155
157
|
# HTML elements are removed.
|
156
158
|
def inner_text
|
157
|
-
if respond_to?
|
159
|
+
if respond_to?(:children) and children
|
158
160
|
children.map { |x| x.inner_text }.join
|
161
|
+
else
|
162
|
+
""
|
159
163
|
end
|
160
164
|
end
|
161
165
|
alias_method :innerText, :inner_text
|
@@ -172,8 +176,10 @@ module Hpricot
|
|
172
176
|
end
|
173
177
|
reparent self.children
|
174
178
|
else
|
175
|
-
if respond_to?
|
179
|
+
if respond_to?(:children) and children
|
176
180
|
children.map { |x| x.output("") }.join
|
181
|
+
else
|
182
|
+
""
|
177
183
|
end
|
178
184
|
end
|
179
185
|
end
|
@@ -207,7 +213,7 @@ module Hpricot
|
|
207
213
|
parent.children.each do |e|
|
208
214
|
id = sim if e == self
|
209
215
|
sim += 1 if e.pathname == self.pathname
|
210
|
-
end
|
216
|
+
end if parent.children
|
211
217
|
p = File.join(parent.xpath, self.pathname)
|
212
218
|
p += "[#{id+1}]" if sim >= 2
|
213
219
|
p
|
@@ -224,7 +230,7 @@ module Hpricot
|
|
224
230
|
parent.children.each do |e|
|
225
231
|
id = sim if e == self
|
226
232
|
sim += 1 if e.pathname == self.pathname
|
227
|
-
end
|
233
|
+
end if parent.children
|
228
234
|
p = parent.css_path
|
229
235
|
p = p ? "#{p} > #{self.pathname}" : self.pathname
|
230
236
|
p += ":nth(#{id})" if sim >= 2
|
@@ -489,13 +495,13 @@ module Hpricot
|
|
489
495
|
|
490
496
|
# +each_child+ iterates over each child.
|
491
497
|
def each_child(&block) # :yields: child_node
|
492
|
-
children.each(&block)
|
498
|
+
children.each(&block) if children
|
493
499
|
nil
|
494
500
|
end
|
495
501
|
|
496
502
|
# +each_child_with_index+ iterates over each child.
|
497
503
|
def each_child_with_index(&block) # :yields: child_node, index
|
498
|
-
children.each_with_index(&block)
|
504
|
+
children.each_with_index(&block) if children
|
499
505
|
nil
|
500
506
|
end
|
501
507
|
|
@@ -626,7 +632,7 @@ module Hpricot
|
|
626
632
|
# :stopdoc:
|
627
633
|
module Doc::Trav
|
628
634
|
def traverse_all_element(&block)
|
629
|
-
children.each {|c| c.traverse_all_element(&block) }
|
635
|
+
children.each {|c| c.traverse_all_element(&block) } if children
|
630
636
|
end
|
631
637
|
def xpath
|
632
638
|
"/"
|
@@ -639,7 +645,7 @@ module Hpricot
|
|
639
645
|
module Elem::Trav
|
640
646
|
def traverse_all_element(&block)
|
641
647
|
yield self
|
642
|
-
children.each {|c| c.traverse_all_element(&block) }
|
648
|
+
children.each {|c| c.traverse_all_element(&block) } if children
|
643
649
|
end
|
644
650
|
end
|
645
651
|
|
@@ -651,14 +657,14 @@ module Hpricot
|
|
651
657
|
|
652
658
|
module Doc::Trav
|
653
659
|
def traverse_some_element(name_set, &block)
|
654
|
-
children.each {|c| c.traverse_some_element(name_set, &block) }
|
660
|
+
children.each {|c| c.traverse_some_element(name_set, &block) } if children
|
655
661
|
end
|
656
662
|
end
|
657
663
|
|
658
664
|
module Elem::Trav
|
659
665
|
def traverse_some_element(name_set, &block)
|
660
666
|
yield self if name_set.include? self.name
|
661
|
-
children.each {|c| c.traverse_some_element(name_set, &block) }
|
667
|
+
children.each {|c| c.traverse_some_element(name_set, &block) } if children
|
662
668
|
end
|
663
669
|
end
|
664
670
|
|
@@ -797,7 +803,7 @@ module Hpricot
|
|
797
803
|
module Doc::Trav
|
798
804
|
def root
|
799
805
|
es = []
|
800
|
-
children.each {|c| es << c if c.elem? }
|
806
|
+
children.each {|c| es << c if c.elem? } if children
|
801
807
|
raise Hpricot::Error, "no element" if es.empty?
|
802
808
|
raise Hpricot::Error, "multiple top elements" if 1 < es.length
|
803
809
|
es[0]
|
data/test/files/boingboing.html
CHANGED
data/test/nokogiri-bench.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'hpricot'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'benchmark'
|
7
|
+
|
8
|
+
content = File.read("test/files/boingboing.html")
|
9
|
+
|
10
|
+
N = 100
|
11
|
+
|
12
|
+
unless Gem.loaded_specs['hpricot'].version > Gem::Version.new('0.6.161')
|
13
|
+
abort "** Use higher than Hpricot 0.6.161!"
|
14
|
+
end
|
15
|
+
|
16
|
+
puts "Hpricot #{Gem.loaded_specs['hpricot'].version} vs. Nokogiri #{Gem.loaded_specs['nokogiri'].version}"
|
17
|
+
hdoc = Hpricot(content)
|
18
|
+
ndoc = Nokogiri.Hpricot(content)
|
19
|
+
|
20
|
+
Benchmark.bm do |x|
|
21
|
+
x.report('hpricot:doc') do
|
22
|
+
N.times do
|
23
|
+
Hpricot(content)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
x.report('nokogiri:doc') do
|
28
|
+
N.times do
|
29
|
+
Nokogiri.Hpricot(content)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
Benchmark.bm do |x|
|
35
|
+
x.report('hpricot:xpath') do
|
36
|
+
N.times do
|
37
|
+
info = hdoc.search("//a[@name='027906']").first.inner_text
|
38
|
+
url = hdoc.search("h3[text()='College kids reportedly taking more smart drugs']").first.inner_text
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
x.report('nokogiri:xpath') do
|
43
|
+
N.times do
|
44
|
+
info = ndoc.search("//a[@name='027906']").first.inner_text
|
45
|
+
url = ndoc.search("h3[text()='College kids reportedly taking more smart drugs']").first.inner_text
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
Benchmark.bm do |x|
|
51
|
+
x.report('hpricot:css') do
|
52
|
+
N.times do
|
53
|
+
info = hdoc.search('form input[@checked]').first
|
54
|
+
url = hdoc.search('td spacer').first.inner_text
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
x.report('nokogiri:css') do
|
59
|
+
N.times do
|
60
|
+
info = ndoc.search('form input[@checked]').first
|
61
|
+
url = ndoc.search('td spacer').first.inner_text
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|