hpricot 0.4-mswin32 → 0.5-mswin32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +16 -0
- data/README +279 -4
- data/Rakefile +12 -3
- data/ext/hpricot_scan/hpricot_scan.c +3106 -3348
- data/ext/hpricot_scan/hpricot_scan.rl +78 -38
- data/lib/hpricot.rb +19 -0
- data/lib/hpricot/elements.rb +194 -87
- data/lib/hpricot/inspect.rb +13 -0
- data/lib/hpricot/parse.rb +83 -99
- data/lib/hpricot/tag.rb +114 -40
- data/lib/hpricot/traverse.rb +311 -61
- data/lib/hpricot_scan.so +0 -0
- data/test/files/cy0.html +3653 -0
- data/test/files/utf8.html +1054 -0
- data/test/files/week9.html +1723 -0
- data/test/test_parser.rb +160 -10
- data/test/test_paths.rb +16 -0
- data/test/test_preserved.rb +46 -0
- data/test/test_xml.rb +15 -0
- metadata +41 -35
data/lib/hpricot/traverse.rb
CHANGED
@@ -3,20 +3,93 @@ require 'uri'
|
|
3
3
|
|
4
4
|
module Hpricot
|
5
5
|
module Traverse
|
6
|
+
# Is this object the enclosing HTML or XML document?
|
6
7
|
def doc?() Doc::Trav === self end
|
8
|
+
# Is this object an HTML or XML element?
|
7
9
|
def elem?() Elem::Trav === self end
|
10
|
+
# Is this object an HTML text node?
|
8
11
|
def text?() Text::Trav === self end
|
12
|
+
# Is this object an XML declaration?
|
9
13
|
def xmldecl?() XMLDecl::Trav === self end
|
14
|
+
# Is this object a doctype tag?
|
10
15
|
def doctype?() DocType::Trav === self end
|
16
|
+
# Is this object an XML processing instruction?
|
11
17
|
def procins?() ProcIns::Trav === self end
|
18
|
+
# Is this object a comment?
|
12
19
|
def comment?() Comment::Trav === self end
|
20
|
+
# Is this object a stranded end tag?
|
13
21
|
def bogusetag?() BogusETag::Trav === self end
|
14
22
|
|
23
|
+
# Builds an HTML string from this node and its contents.
|
24
|
+
# If you need to write to a stream, try calling <tt>output(io)</tt>
|
25
|
+
# as a method on this object.
|
15
26
|
def to_html
|
16
27
|
output("")
|
17
28
|
end
|
18
29
|
alias_method :to_s, :to_html
|
19
30
|
|
31
|
+
# Attempts to preserve the original HTML of the document, only
|
32
|
+
# outputting new tags for elements which have changed.
|
33
|
+
def to_original_html
|
34
|
+
output("", :preserve => true)
|
35
|
+
end
|
36
|
+
|
37
|
+
# Puts together an array of neighboring nodes based on their proximity
|
38
|
+
# to this node. So, for example, to get the next node, you could use
|
39
|
+
# <tt>nodes_at(1)</tt>. Or, to get the previous node, use <tt>nodes_at(-1)</tt>.
|
40
|
+
#
|
41
|
+
# This method also accepts ranges and sets of numbers.
|
42
|
+
#
|
43
|
+
# ele.nodes_at(-3..-1, 1..3) # gets three nodes before and three after
|
44
|
+
# ele.nodes_at(1, 5, 7) # gets three nodes at offsets below the current node
|
45
|
+
# ele.nodes_at(0, 5..6) # the current node and two others
|
46
|
+
def nodes_at(*pos)
|
47
|
+
sib = parent.children
|
48
|
+
i, si = 0, sib.index(self)
|
49
|
+
Elements[*
|
50
|
+
sib.select do |x|
|
51
|
+
sel = case i - si when *pos
|
52
|
+
true
|
53
|
+
end
|
54
|
+
i += 1
|
55
|
+
sel
|
56
|
+
end
|
57
|
+
]
|
58
|
+
end
|
59
|
+
|
60
|
+
# Returns the node neighboring this node to the south: just below it.
|
61
|
+
# This method includes text nodes and comments and such.
|
62
|
+
def next_node
|
63
|
+
sib = parent.children if parent # guard first: a parentless node has no siblings, so return nil instead of raising on nil
|
64
|
+
sib[sib.index(self) + 1] if parent
|
65
|
+
end
|
66
|
+
|
67
|
+
# Returns the node neighboring this node to the north: just above it.
|
68
|
+
# This method includes text nodes and comments and such.
|
69
|
+
def previous_node
|
70
|
+
sib = parent.children
|
71
|
+
x = sib.index(self) - 1
|
72
|
+
sib[x] if sib and x >= 0
|
73
|
+
end
|
74
|
+
|
75
|
+
# Adds elements immediately after this element, contained in the +html+ string.
|
76
|
+
def after(html)
|
77
|
+
parent.insert_after(Hpricot.make(html), self)
|
78
|
+
end
|
79
|
+
|
80
|
+
# Adds elements immediately before this element, contained in the +html+ string.
|
81
|
+
def before(html)
|
82
|
+
parent.insert_before(Hpricot.make(html), self) # insert ahead of self; insert_after here made #before behave like #after
|
83
|
+
end
|
84
|
+
|
85
|
+
|
86
|
+
# Replace this element and its contents with the nodes contained
|
87
|
+
# in the +html+ string.
|
88
|
+
def swap(html)
|
89
|
+
parent.altered!
|
90
|
+
parent.replace_child(self, Hpricot.make(html))
|
91
|
+
end
|
92
|
+
|
20
93
|
def get_subnode(*indexes)
|
21
94
|
n = self
|
22
95
|
indexes.each {|index|
|
@@ -24,37 +97,36 @@ module Hpricot
|
|
24
97
|
}
|
25
98
|
n
|
26
99
|
end
|
27
|
-
end
|
28
100
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
children[children.index(old), 1] = [*new]
|
35
|
-
end
|
36
|
-
def insert_before(nodes, ele)
|
37
|
-
case nodes
|
38
|
-
when Array
|
39
|
-
nodes.each { |n| insert_before(n, ele) }
|
40
|
-
else
|
41
|
-
children[children.index(ele) || 0, 0] = nodes
|
101
|
+
# Builds a string from the text contained in this node. All
|
102
|
+
# HTML elements are removed; the result is stripped and runs of newlines collapse to one blank line.
|
103
|
+
def to_plain_text
|
104
|
+
if respond_to? :children
|
105
|
+
children.map { |x| x.to_plain_text }.join.strip.gsub(/\n{2,}/, "\n\n")
|
42
106
|
end
|
43
107
|
end
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
children[idx ? idx + 1 : children.length, 0] = nodes
|
108
|
+
|
109
|
+
# Builds a string from the text contained in this node. All
|
110
|
+
# HTML elements are removed.
|
111
|
+
def inner_text
|
112
|
+
if respond_to? :children
|
113
|
+
children.map { |x| x.inner_text }.join
|
51
114
|
end
|
52
115
|
end
|
116
|
+
alias_method :innerText, :inner_text
|
117
|
+
|
118
|
+
# Builds an HTML string from the contents of this node.
|
53
119
|
def inner_html
|
54
|
-
|
120
|
+
if respond_to? :children
|
121
|
+
children.map { |x| x.output("") }.join
|
122
|
+
end
|
55
123
|
end
|
56
124
|
alias_method :innerHTML, :inner_html
|
125
|
+
|
126
|
+
# Inserts new contents into the current node, based on
|
127
|
+
# the HTML contained in string +inner+.
|
57
128
|
def inner_html=(inner)
|
129
|
+
altered!
|
58
130
|
case inner
|
59
131
|
when String, IO
|
60
132
|
self.children = Hpricot.parse(inner).children
|
@@ -63,63 +135,137 @@ module Hpricot
|
|
63
135
|
when nil
|
64
136
|
self.children = []
|
65
137
|
end
|
138
|
+
reparent self.children
|
66
139
|
end
|
67
140
|
alias_method :innerHTML=, :inner_html=
|
141
|
+
|
142
|
+
def reparent(nodes)
|
143
|
+
altered!
|
144
|
+
[*nodes].each { |e| e.parent = self }
|
145
|
+
end
|
146
|
+
private :reparent
|
147
|
+
|
148
|
+
def clean_path(path)
|
149
|
+
path.gsub(/^\s+|\s+$/, '')
|
150
|
+
end
|
151
|
+
|
152
|
+
# Builds a unique XPath string for this node, from the
|
153
|
+
# root of the document containing it.
|
154
|
+
def xpath
|
155
|
+
if elem? and has_attribute? 'id'
|
156
|
+
"//#{self.name}[@id='#{get_attribute('id')}']"
|
157
|
+
else
|
158
|
+
sim, id = 0, 0, 0
|
159
|
+
parent.children.each do |e|
|
160
|
+
id = sim if e == self
|
161
|
+
sim += 1 if e.pathname == self.pathname
|
162
|
+
end
|
163
|
+
p = File.join(parent.xpath, self.pathname)
|
164
|
+
p += "[#{id+1}]" if sim >= 2
|
165
|
+
p
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
169
|
+
# Builds a unique CSS string for this node, from the
|
170
|
+
# root of the document containing it.
|
171
|
+
def css_path
|
172
|
+
if elem? and has_attribute? 'id'
|
173
|
+
"##{get_attribute('id')}"
|
174
|
+
else
|
175
|
+
sim, i, id = 0, 0, 0
|
176
|
+
parent.children.each do |e|
|
177
|
+
id = sim if e == self
|
178
|
+
sim += 1 if e.pathname == self.pathname
|
179
|
+
end
|
180
|
+
p = parent.css_path
|
181
|
+
p = p ? "#{p} > #{self.pathname}" : self.pathname
|
182
|
+
p += ":nth(#{id})" if sim >= 2
|
183
|
+
p
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
187
|
+
def node_position
|
188
|
+
parent.children.index(self)
|
189
|
+
end
|
190
|
+
|
191
|
+
def position
|
192
|
+
parent.children_of_type(self.pathname).index(self)
|
193
|
+
end
|
194
|
+
|
195
|
+
# Searches this node for all elements matching
|
196
|
+
# the CSS or XPath +expr+. Returns an Elements array
|
197
|
+
# containing the matching nodes. If +blk+ is given, it
|
198
|
+
# is used to iterate through the matching set.
|
68
199
|
def search(expr, &blk)
|
69
200
|
last = nil
|
70
201
|
nodes = [self]
|
71
202
|
done = []
|
72
203
|
expr = expr.to_s
|
204
|
+
hist = []
|
73
205
|
until expr.empty?
|
74
206
|
expr = clean_path(expr)
|
75
207
|
expr.gsub!(%r!^//!, '')
|
76
208
|
|
77
209
|
case expr
|
78
210
|
when %r!^/?\.\.!
|
79
|
-
expr = $'
|
211
|
+
last = expr = $'
|
80
212
|
nodes.map! { |node| node.parent }
|
81
213
|
when %r!^[>/]!
|
82
|
-
expr = $'
|
83
|
-
nodes = Elements[*nodes.map { |node| node.
|
214
|
+
last = expr = $'
|
215
|
+
nodes = Elements[*nodes.map { |node| node.children if node.respond_to? :children }.flatten.compact]
|
84
216
|
when %r!^\+!
|
85
|
-
expr = $'
|
217
|
+
last = expr = $'
|
86
218
|
nodes.map! do |node|
|
87
|
-
siblings = node.parent.
|
219
|
+
siblings = node.parent.children
|
88
220
|
siblings[siblings.index(node)+1]
|
89
221
|
end
|
90
222
|
nodes.compact!
|
91
223
|
when %r!^~!
|
92
|
-
expr = $'
|
224
|
+
last = expr = $'
|
93
225
|
nodes.map! do |node|
|
94
|
-
siblings = node.parent.
|
226
|
+
siblings = node.parent.children
|
95
227
|
siblings[(siblings.index(node)+1)..-1]
|
96
228
|
end
|
97
229
|
nodes.flatten!
|
98
230
|
when %r!^[|,]!
|
99
|
-
expr = " #$'"
|
231
|
+
last = expr = " #$'"
|
100
232
|
nodes.shift if nodes.first == self
|
101
233
|
done += nodes
|
102
234
|
nodes = [self]
|
103
235
|
else
|
104
|
-
m = expr.match
|
105
|
-
|
236
|
+
m = expr.match(%r!^([#.]?)([a-z0-9\\*_-]*)!i).to_a
|
237
|
+
after = $'
|
238
|
+
mt = after[%r!:[a-z0-9\\*_-]+!i, 0]
|
239
|
+
oop = false
|
240
|
+
if mt and not (mt == ":not" or Traverse.method_defined? "filter[#{mt}]")
|
241
|
+
after = $'
|
242
|
+
m[2] += mt
|
243
|
+
expr = after
|
244
|
+
end
|
106
245
|
if m[1] == '#'
|
107
246
|
oid = get_element_by_id(m[2])
|
108
247
|
nodes = oid ? [oid] : []
|
248
|
+
expr = after
|
109
249
|
else
|
110
|
-
m[2] = "*" if m[2] == "" || m[1] == "."
|
250
|
+
m[2] = "*" if after =~ /^\(\)/ || m[2] == "" || m[1] == "."
|
111
251
|
ret = []
|
112
252
|
nodes.each do |node|
|
113
253
|
case m[2]
|
114
254
|
when '*'
|
255
|
+
node.traverse_element { |n| ret << n }
|
115
256
|
else
|
116
|
-
|
257
|
+
if node.respond_to? :get_elements_by_tag_name
|
258
|
+
ret += [*node.get_elements_by_tag_name(m[2])] - [*(node unless last)]
|
259
|
+
end
|
117
260
|
end
|
118
261
|
end
|
119
262
|
nodes = ret
|
120
263
|
end
|
264
|
+
last = nil
|
121
265
|
end
|
122
266
|
|
267
|
+
hist << expr
|
268
|
+
break if hist[-1] == hist[-2]
|
123
269
|
nodes, expr = Elements.filter(nodes, expr)
|
124
270
|
end
|
125
271
|
nodes = done + nodes.flatten.uniq
|
@@ -132,35 +278,13 @@ module Hpricot
|
|
132
278
|
end
|
133
279
|
alias_method :/, :search
|
134
280
|
|
135
|
-
|
136
|
-
|
281
|
+
# Find the first matching node for the CSS or XPath
|
282
|
+
# +expr+ string.
|
283
|
+
def at(expr)
|
284
|
+
search(expr).first
|
137
285
|
end
|
138
286
|
alias_method :%, :at
|
139
287
|
|
140
|
-
def clean_path(path)
|
141
|
-
path.gsub(/^\s+|\s+$/, '')
|
142
|
-
end
|
143
|
-
|
144
|
-
# +each_child+ iterates over each child.
|
145
|
-
def each_child(&block) # :yields: child_node
|
146
|
-
children.each(&block)
|
147
|
-
nil
|
148
|
-
end
|
149
|
-
|
150
|
-
# +each_child_with_index+ iterates over each child.
|
151
|
-
def each_child_with_index(&block) # :yields: child_node, index
|
152
|
-
children.each_with_index(&block)
|
153
|
-
nil
|
154
|
-
end
|
155
|
-
|
156
|
-
# +find_element+ searches an element which universal name is specified by
|
157
|
-
# the arguments.
|
158
|
-
# It returns nil if not found.
|
159
|
-
def find_element(*names)
|
160
|
-
traverse_element(*names) {|e| return e }
|
161
|
-
nil
|
162
|
-
end
|
163
|
-
|
164
288
|
# +traverse_element+ traverses elements in the tree.
|
165
289
|
# It yields elements in depth first order.
|
166
290
|
#
|
@@ -200,13 +324,130 @@ module Hpricot
|
|
200
324
|
nil
|
201
325
|
end
|
202
326
|
|
327
|
+
# Find children of a given +tag_name+.
|
328
|
+
#
|
329
|
+
# ele.children_of_type('p')
|
330
|
+
# #=> [...array of paragraphs...]
|
331
|
+
#
|
332
|
+
def children_of_type(tag_name)
|
333
|
+
if respond_to? :children
|
334
|
+
children.find_all do |x|
|
335
|
+
x.respond_to?(:pathname) && x.pathname == tag_name
|
336
|
+
end
|
337
|
+
end
|
338
|
+
end
|
339
|
+
|
340
|
+
end
|
341
|
+
|
342
|
+
module Container::Trav
|
343
|
+
# Return all children of this node which can contain other
|
344
|
+
# nodes. This is a good way to get all HTML elements which
|
345
|
+
# aren't text, comment, doctype or processing instruction nodes.
|
346
|
+
def containers
|
347
|
+
children.grep(Container::Trav)
|
348
|
+
end
|
349
|
+
|
350
|
+
# Returns the container node neighboring this node to the south: just below it.
|
351
|
+
# By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
|
352
|
+
# See Hpricot::Traverse#next_node if you need to hunt out all kinds of nodes.
|
353
|
+
def next_sibling
|
354
|
+
sib = parent.containers if parent # guard first: a parentless node has no siblings, so return nil instead of raising on nil
|
355
|
+
sib[sib.index(self) + 1] if parent
|
356
|
+
end
|
357
|
+
|
358
|
+
# Returns the container node neighboring this node to the north: just above it.
|
359
|
+
# By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
|
360
|
+
# See Hpricot::Traverse#previous_node if you need to hunt out all kinds of nodes.
|
361
|
+
def previous_sibling
|
362
|
+
sib = parent.containers
|
363
|
+
x = sib.index(self) - 1
|
364
|
+
sib[x] if sib and x >= 0
|
365
|
+
end
|
366
|
+
|
367
|
+
# Puts together an array of neighboring sibling elements based on their proximity
|
368
|
+
# to this element.
|
369
|
+
#
|
370
|
+
# This method accepts ranges and sets of numbers.
|
371
|
+
#
|
372
|
+
# ele.siblings_at(-3..-1, 1..3) # gets three elements before and three after
|
373
|
+
# ele.siblings_at(1, 5, 7) # gets three elements at offsets below the current element
|
374
|
+
# ele.siblings_at(0, 5..6) # the current element and two others
|
375
|
+
#
|
376
|
+
# Like the other "sibling" methods, this doesn't find text and comment nodes.
|
377
|
+
# Use nodes_at to include those nodes.
|
378
|
+
def siblings_at(*pos)
|
379
|
+
sib = parent.containers
|
380
|
+
i, si = 0, sib.index(self)
|
381
|
+
Elements[*
|
382
|
+
sib.select do |x|
|
383
|
+
sel = case i - si when *pos
|
384
|
+
true
|
385
|
+
end
|
386
|
+
i += 1
|
387
|
+
sel
|
388
|
+
end
|
389
|
+
]
|
390
|
+
end
|
391
|
+
|
392
|
+
# Replace +old+, a child of the current node, with +new+ node.
|
393
|
+
def replace_child(old, new)
|
394
|
+
reparent new
|
395
|
+
children[children.index(old), 1] = [*new]
|
396
|
+
end
|
397
|
+
|
398
|
+
# Insert +nodes+, an array of HTML elements or a single element,
|
399
|
+
# before the node +ele+, a child of the current node.
|
400
|
+
def insert_before(nodes, ele)
|
401
|
+
case nodes
|
402
|
+
when Array
|
403
|
+
nodes.each { |n| insert_before(n, ele) }
|
404
|
+
else
|
405
|
+
reparent nodes
|
406
|
+
children[children.index(ele) || 0, 0] = nodes
|
407
|
+
end
|
408
|
+
end
|
409
|
+
|
410
|
+
# Insert +nodes+, an array of HTML elements or a single element,
|
411
|
+
# after the node +ele+, a child of the current node.
|
412
|
+
def insert_after(nodes, ele)
|
413
|
+
case nodes
|
414
|
+
when Array
|
415
|
+
nodes.each { |n| insert_after(n, ele) }
|
416
|
+
else
|
417
|
+
reparent nodes
|
418
|
+
idx = children.index(ele)
|
419
|
+
children[idx ? idx + 1 : children.length, 0] = nodes
|
420
|
+
end
|
421
|
+
end
|
422
|
+
|
423
|
+
# +each_child+ iterates over each child.
|
424
|
+
def each_child(&block) # :yields: child_node
|
425
|
+
children.each(&block)
|
426
|
+
nil
|
427
|
+
end
|
428
|
+
|
429
|
+
# +each_child_with_index+ iterates over each child.
|
430
|
+
def each_child_with_index(&block) # :yields: child_node, index
|
431
|
+
children.each_with_index(&block)
|
432
|
+
nil
|
433
|
+
end
|
434
|
+
|
435
|
+
# +find_element+ searches for an element whose universal name is specified by
|
436
|
+
# the arguments.
|
437
|
+
# It returns nil if not found.
|
438
|
+
def find_element(*names)
|
439
|
+
traverse_element(*names) {|e| return e }
|
440
|
+
nil
|
441
|
+
end
|
442
|
+
|
443
|
+
# Returns a list of CSS classes to which this element belongs.
|
203
444
|
def classes
|
204
445
|
get_attribute('class').to_s.strip.split(/\s+/)
|
205
446
|
end
|
206
447
|
|
207
448
|
def get_element_by_id(id)
|
208
449
|
traverse_all_element do |ele|
|
209
|
-
if eid = ele.get_attribute('id')
|
450
|
+
if ele.elem? and eid = ele.get_attribute('id')
|
210
451
|
return ele if eid.to_s == id
|
211
452
|
end
|
212
453
|
end
|
@@ -319,6 +560,12 @@ module Hpricot
|
|
319
560
|
def traverse_all_element(&block)
|
320
561
|
children.each {|c| c.traverse_all_element(&block) }
|
321
562
|
end
|
563
|
+
def xpath
|
564
|
+
"/"
|
565
|
+
end
|
566
|
+
def css_path
|
567
|
+
nil
|
568
|
+
end
|
322
569
|
end
|
323
570
|
|
324
571
|
module Elem::Trav
|
@@ -330,6 +577,7 @@ module Hpricot
|
|
330
577
|
|
331
578
|
module Leaf::Trav
|
332
579
|
def traverse_all_element
|
580
|
+
yield self
|
333
581
|
end
|
334
582
|
end
|
335
583
|
|
@@ -497,12 +745,14 @@ module Hpricot
|
|
497
745
|
end
|
498
746
|
alias_method :[], :get_attribute
|
499
747
|
def set_attribute(name, val)
|
748
|
+
altered!
|
500
749
|
self.attributes ||= {}
|
501
750
|
self.attributes[name.to_s] = val
|
502
751
|
end
|
503
752
|
alias_method :[]=, :set_attribute
|
504
753
|
def remove_attribute(name)
|
505
754
|
if has_attribute? name
|
755
|
+
altered!
|
506
756
|
self.attributes.delete(name)
|
507
757
|
end
|
508
758
|
end
|