hpricot 0.4-mswin32 → 0.5-mswin32
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +16 -0
- data/README +279 -4
- data/Rakefile +12 -3
- data/ext/hpricot_scan/hpricot_scan.c +3106 -3348
- data/ext/hpricot_scan/hpricot_scan.rl +78 -38
- data/lib/hpricot.rb +19 -0
- data/lib/hpricot/elements.rb +194 -87
- data/lib/hpricot/inspect.rb +13 -0
- data/lib/hpricot/parse.rb +83 -99
- data/lib/hpricot/tag.rb +114 -40
- data/lib/hpricot/traverse.rb +311 -61
- data/lib/hpricot_scan.so +0 -0
- data/test/files/cy0.html +3653 -0
- data/test/files/utf8.html +1054 -0
- data/test/files/week9.html +1723 -0
- data/test/test_parser.rb +160 -10
- data/test/test_paths.rb +16 -0
- data/test/test_preserved.rb +46 -0
- data/test/test_xml.rb +15 -0
- metadata +41 -35
data/lib/hpricot/traverse.rb
CHANGED
@@ -3,20 +3,93 @@ require 'uri'
|
|
3
3
|
|
4
4
|
module Hpricot
|
5
5
|
module Traverse
|
6
|
+
# Is this object the enclosing HTML or XML document?
|
6
7
|
def doc?() Doc::Trav === self end
|
8
|
+
# Is this object an HTML or XML element?
|
7
9
|
def elem?() Elem::Trav === self end
|
10
|
+
# Is this object an HTML text node?
|
8
11
|
def text?() Text::Trav === self end
|
12
|
+
# Is this object an XML declaration?
|
9
13
|
def xmldecl?() XMLDecl::Trav === self end
|
14
|
+
# Is this object a doctype tag?
|
10
15
|
def doctype?() DocType::Trav === self end
|
16
|
+
# Is this object an XML processing instruction?
|
11
17
|
def procins?() ProcIns::Trav === self end
|
18
|
+
# Is this object a comment?
|
12
19
|
def comment?() Comment::Trav === self end
|
20
|
+
# Is this object a stranded end tag?
|
13
21
|
def bogusetag?() BogusETag::Trav === self end
|
14
22
|
|
23
|
+
# Builds an HTML string from this node and its contents.
|
24
|
+
# If you need to write to a stream, try calling <tt>output(io)</tt>
|
25
|
+
# as a method on this object.
|
15
26
|
def to_html
|
16
27
|
output("")
|
17
28
|
end
|
18
29
|
alias_method :to_s, :to_html
|
19
30
|
|
31
|
+
# Attempts to preserve the original HTML of the document, only
|
32
|
+
# outputting new tags for elements which have changed.
|
33
|
+
def to_original_html
|
34
|
+
output("", :preserve => true)
|
35
|
+
end
|
36
|
+
|
37
|
+
# Puts together an array of neighboring nodes based on their proximity
|
38
|
+
# to this node. So, for example, to get the next node, you could use
|
39
|
+
# <tt>nodes_at(1)</tt>. Or, to get the previous node, use <tt>nodes_at(-1)</tt>.
|
40
|
+
#
|
41
|
+
# This method also accepts ranges and sets of numbers.
|
42
|
+
#
|
43
|
+
# ele.nodes_at(-3..-1, 1..3) # gets three nodes before and three after
|
44
|
+
# ele.nodes_at(1, 5, 7) # gets three nodes at offsets below the current node
|
45
|
+
# ele.nodes_at(0, 5..6) # the current node and two others
|
46
|
+
def nodes_at(*pos)
|
47
|
+
sib = parent.children
|
48
|
+
i, si = 0, sib.index(self)
|
49
|
+
Elements[*
|
50
|
+
sib.select do |x|
|
51
|
+
sel = case i - si when *pos
|
52
|
+
true
|
53
|
+
end
|
54
|
+
i += 1
|
55
|
+
sel
|
56
|
+
end
|
57
|
+
]
|
58
|
+
end
|
59
|
+
|
60
|
+
# Returns the node neighboring this node to the south: just below it.
# This method includes text nodes and comments and such.
# Returns nil when this node has no parent or is the last child.
def next_node
  # Check the parent before asking it for children; a detached node
  # has no siblings to look through.
  return unless parent
  sib = parent.children
  sib[sib.index(self) + 1]
end
|
66
|
+
|
67
|
+
# Returns the node neighboring this node to the north: just above it.
# This method includes text nodes and comments and such.
# Returns nil when this node has no parent or is the first child.
def previous_node
  # Check the parent before asking it for children; a detached node
  # has no siblings to look through.
  return unless parent
  sib = parent.children
  x = sib.index(self) - 1
  # A negative index would wrap around to the end of the array.
  sib[x] if x >= 0
end
|
74
|
+
|
75
|
+
# Adds elements immediately after this element, contained in the +html+ string.
|
76
|
+
def after(html)
|
77
|
+
parent.insert_after(Hpricot.make(html), self)
|
78
|
+
end
|
79
|
+
|
80
|
+
# Adds elements immediately before this element, contained in the +html+ string.
def before(html)
  # Must use insert_before; insert_after would place the new nodes
  # behind this element.
  parent.insert_before(Hpricot.make(html), self)
end
|
84
|
+
|
85
|
+
|
86
|
+
# Replace this element and its contents with the nodes contained
|
87
|
+
# in the +html+ string.
|
88
|
+
def swap(html)
|
89
|
+
parent.altered!
|
90
|
+
parent.replace_child(self, Hpricot.make(html))
|
91
|
+
end
|
92
|
+
|
20
93
|
def get_subnode(*indexes)
|
21
94
|
n = self
|
22
95
|
indexes.each {|index|
|
@@ -24,37 +97,36 @@ module Hpricot
|
|
24
97
|
}
|
25
98
|
n
|
26
99
|
end
|
27
|
-
end
|
28
100
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
children[children.index(old), 1] = [*new]
|
35
|
-
end
|
36
|
-
def insert_before(nodes, ele)
|
37
|
-
case nodes
|
38
|
-
when Array
|
39
|
-
nodes.each { |n| insert_before(n, ele) }
|
40
|
-
else
|
41
|
-
children[children.index(ele) || 0, 0] = nodes
|
101
|
+
# Builds a string from the text contained in this node. All
|
102
|
+
# HTML elements are removed.
|
103
|
+
def to_plain_text
|
104
|
+
if respond_to? :children
|
105
|
+
children.map { |x| x.to_plain_text }.join.strip.gsub(/\n{2,}/, "\n\n")
|
42
106
|
end
|
43
107
|
end
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
children[idx ? idx + 1 : children.length, 0] = nodes
|
108
|
+
|
109
|
+
# Builds a string from the text contained in this node. All
|
110
|
+
# HTML elements are removed.
|
111
|
+
def inner_text
|
112
|
+
if respond_to? :children
|
113
|
+
children.map { |x| x.inner_text }.join
|
51
114
|
end
|
52
115
|
end
|
116
|
+
alias_method :innerText, :inner_text
|
117
|
+
|
118
|
+
# Builds an HTML string from the contents of this node.
|
53
119
|
def inner_html
|
54
|
-
|
120
|
+
if respond_to? :children
|
121
|
+
children.map { |x| x.output("") }.join
|
122
|
+
end
|
55
123
|
end
|
56
124
|
alias_method :innerHTML, :inner_html
|
125
|
+
|
126
|
+
# Inserts new contents into the current node, based on
|
127
|
+
# the HTML contained in string +inner+.
|
57
128
|
def inner_html=(inner)
|
129
|
+
altered!
|
58
130
|
case inner
|
59
131
|
when String, IO
|
60
132
|
self.children = Hpricot.parse(inner).children
|
@@ -63,63 +135,137 @@ module Hpricot
|
|
63
135
|
when nil
|
64
136
|
self.children = []
|
65
137
|
end
|
138
|
+
reparent self.children
|
66
139
|
end
|
67
140
|
alias_method :innerHTML=, :inner_html=
|
141
|
+
|
142
|
+
def reparent(nodes)
|
143
|
+
altered!
|
144
|
+
[*nodes].each { |e| e.parent = self }
|
145
|
+
end
|
146
|
+
private :reparent
|
147
|
+
|
148
|
+
def clean_path(path)
|
149
|
+
path.gsub(/^\s+|\s+$/, '')
|
150
|
+
end
|
151
|
+
|
152
|
+
# Builds a unique XPath string for this node, from the
|
153
|
+
# root of the document containing it.
|
154
|
+
def xpath
|
155
|
+
if elem? and has_attribute? 'id'
|
156
|
+
"//#{self.name}[@id='#{get_attribute('id')}']"
|
157
|
+
else
|
158
|
+
sim, id = 0, 0, 0
|
159
|
+
parent.children.each do |e|
|
160
|
+
id = sim if e == self
|
161
|
+
sim += 1 if e.pathname == self.pathname
|
162
|
+
end
|
163
|
+
p = File.join(parent.xpath, self.pathname)
|
164
|
+
p += "[#{id+1}]" if sim >= 2
|
165
|
+
p
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
169
|
+
# Builds a unique CSS string for this node, from the
|
170
|
+
# root of the document containing it.
|
171
|
+
def css_path
|
172
|
+
if elem? and has_attribute? 'id'
|
173
|
+
"##{get_attribute('id')}"
|
174
|
+
else
|
175
|
+
sim, i, id = 0, 0, 0
|
176
|
+
parent.children.each do |e|
|
177
|
+
id = sim if e == self
|
178
|
+
sim += 1 if e.pathname == self.pathname
|
179
|
+
end
|
180
|
+
p = parent.css_path
|
181
|
+
p = p ? "#{p} > #{self.pathname}" : self.pathname
|
182
|
+
p += ":nth(#{id})" if sim >= 2
|
183
|
+
p
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
187
|
+
def node_position
|
188
|
+
parent.children.index(self)
|
189
|
+
end
|
190
|
+
|
191
|
+
def position
|
192
|
+
parent.children_of_type(self.pathname).index(self)
|
193
|
+
end
|
194
|
+
|
195
|
+
# Searches this node for all elements matching
|
196
|
+
# the CSS or XPath +expr+. Returns an Elements array
|
197
|
+
# containing the matching nodes. If +blk+ is given, it
|
198
|
+
# is used to iterate through the matching set.
|
68
199
|
def search(expr, &blk)
|
69
200
|
last = nil
|
70
201
|
nodes = [self]
|
71
202
|
done = []
|
72
203
|
expr = expr.to_s
|
204
|
+
hist = []
|
73
205
|
until expr.empty?
|
74
206
|
expr = clean_path(expr)
|
75
207
|
expr.gsub!(%r!^//!, '')
|
76
208
|
|
77
209
|
case expr
|
78
210
|
when %r!^/?\.\.!
|
79
|
-
expr = $'
|
211
|
+
last = expr = $'
|
80
212
|
nodes.map! { |node| node.parent }
|
81
213
|
when %r!^[>/]!
|
82
|
-
expr = $'
|
83
|
-
nodes = Elements[*nodes.map { |node| node.
|
214
|
+
last = expr = $'
|
215
|
+
nodes = Elements[*nodes.map { |node| node.children if node.respond_to? :children }.flatten.compact]
|
84
216
|
when %r!^\+!
|
85
|
-
expr = $'
|
217
|
+
last = expr = $'
|
86
218
|
nodes.map! do |node|
|
87
|
-
siblings = node.parent.
|
219
|
+
siblings = node.parent.children
|
88
220
|
siblings[siblings.index(node)+1]
|
89
221
|
end
|
90
222
|
nodes.compact!
|
91
223
|
when %r!^~!
|
92
|
-
expr = $'
|
224
|
+
last = expr = $'
|
93
225
|
nodes.map! do |node|
|
94
|
-
siblings = node.parent.
|
226
|
+
siblings = node.parent.children
|
95
227
|
siblings[(siblings.index(node)+1)..-1]
|
96
228
|
end
|
97
229
|
nodes.flatten!
|
98
230
|
when %r!^[|,]!
|
99
|
-
expr = " #$'"
|
231
|
+
last = expr = " #$'"
|
100
232
|
nodes.shift if nodes.first == self
|
101
233
|
done += nodes
|
102
234
|
nodes = [self]
|
103
235
|
else
|
104
|
-
m = expr.match
|
105
|
-
|
236
|
+
m = expr.match(%r!^([#.]?)([a-z0-9\\*_-]*)!i).to_a
|
237
|
+
after = $'
|
238
|
+
mt = after[%r!:[a-z0-9\\*_-]+!i, 0]
|
239
|
+
oop = false
|
240
|
+
if mt and not (mt == ":not" or Traverse.method_defined? "filter[#{mt}]")
|
241
|
+
after = $'
|
242
|
+
m[2] += mt
|
243
|
+
expr = after
|
244
|
+
end
|
106
245
|
if m[1] == '#'
|
107
246
|
oid = get_element_by_id(m[2])
|
108
247
|
nodes = oid ? [oid] : []
|
248
|
+
expr = after
|
109
249
|
else
|
110
|
-
m[2] = "*" if m[2] == "" || m[1] == "."
|
250
|
+
m[2] = "*" if after =~ /^\(\)/ || m[2] == "" || m[1] == "."
|
111
251
|
ret = []
|
112
252
|
nodes.each do |node|
|
113
253
|
case m[2]
|
114
254
|
when '*'
|
255
|
+
node.traverse_element { |n| ret << n }
|
115
256
|
else
|
116
|
-
|
257
|
+
if node.respond_to? :get_elements_by_tag_name
|
258
|
+
ret += [*node.get_elements_by_tag_name(m[2])] - [*(node unless last)]
|
259
|
+
end
|
117
260
|
end
|
118
261
|
end
|
119
262
|
nodes = ret
|
120
263
|
end
|
264
|
+
last = nil
|
121
265
|
end
|
122
266
|
|
267
|
+
hist << expr
|
268
|
+
break if hist[-1] == hist[-2]
|
123
269
|
nodes, expr = Elements.filter(nodes, expr)
|
124
270
|
end
|
125
271
|
nodes = done + nodes.flatten.uniq
|
@@ -132,35 +278,13 @@ module Hpricot
|
|
132
278
|
end
|
133
279
|
alias_method :/, :search
|
134
280
|
|
135
|
-
|
136
|
-
|
281
|
+
# Find the first matching node for the CSS or XPath
|
282
|
+
# +expr+ string.
|
283
|
+
def at(expr)
|
284
|
+
search(expr).first
|
137
285
|
end
|
138
286
|
alias_method :%, :at
|
139
287
|
|
140
|
-
def clean_path(path)
|
141
|
-
path.gsub(/^\s+|\s+$/, '')
|
142
|
-
end
|
143
|
-
|
144
|
-
# +each_child+ iterates over each child.
|
145
|
-
def each_child(&block) # :yields: child_node
|
146
|
-
children.each(&block)
|
147
|
-
nil
|
148
|
-
end
|
149
|
-
|
150
|
-
# +each_child_with_index+ iterates over each child.
|
151
|
-
def each_child_with_index(&block) # :yields: child_node, index
|
152
|
-
children.each_with_index(&block)
|
153
|
-
nil
|
154
|
-
end
|
155
|
-
|
156
|
-
# +find_element+ searches an element which universal name is specified by
|
157
|
-
# the arguments.
|
158
|
-
# It returns nil if not found.
|
159
|
-
def find_element(*names)
|
160
|
-
traverse_element(*names) {|e| return e }
|
161
|
-
nil
|
162
|
-
end
|
163
|
-
|
164
288
|
# +traverse_element+ traverses elements in the tree.
|
165
289
|
# It yields elements in depth first order.
|
166
290
|
#
|
@@ -200,13 +324,130 @@ module Hpricot
|
|
200
324
|
nil
|
201
325
|
end
|
202
326
|
|
327
|
+
# Find children of a given +tag_name+.
|
328
|
+
#
|
329
|
+
# ele.children_of_type('p')
|
330
|
+
# #=> [...array of paragraphs...]
|
331
|
+
#
|
332
|
+
def children_of_type(tag_name)
|
333
|
+
if respond_to? :children
|
334
|
+
children.find_all do |x|
|
335
|
+
x.respond_to?(:pathname) && x.pathname == tag_name
|
336
|
+
end
|
337
|
+
end
|
338
|
+
end
|
339
|
+
|
340
|
+
end
|
341
|
+
|
342
|
+
module Container::Trav
|
343
|
+
# Return all children of this node which can contain other
|
344
|
+
# nodes. This is a good way to get all HTML elements which
|
345
|
+
# aren't text, comment, doctype or processing instruction nodes.
|
346
|
+
def containers
|
347
|
+
children.grep(Container::Trav)
|
348
|
+
end
|
349
|
+
|
350
|
+
# Returns the container node neighboring this node to the south: just below it.
# By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
# See Hpricot::Traverse#next_node if you need to hunt out all kinds of nodes.
# Returns nil when this node has no parent or is the last container.
def next_sibling
  # Check the parent before asking it for containers; a detached node
  # has no siblings to look through.
  return unless parent
  sib = parent.containers
  sib[sib.index(self) + 1]
end
|
357
|
+
|
358
|
+
# Returns the container node neighboring this node to the north: just above it.
# By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
# See Hpricot::Traverse#previous_node if you need to hunt out all kinds of nodes.
# Returns nil when this node has no parent or is the first container.
def previous_sibling
  # Check the parent before asking it for containers; a detached node
  # has no siblings to look through.
  return unless parent
  sib = parent.containers
  x = sib.index(self) - 1
  # A negative index would wrap around to the end of the array.
  sib[x] if x >= 0
end
|
366
|
+
|
367
|
+
# Puts together an array of neighboring sibling elements based on their proximity
|
368
|
+
# to this element.
|
369
|
+
#
|
370
|
+
# This method accepts ranges and sets of numbers.
|
371
|
+
#
|
372
|
+
# ele.siblings_at(-3..-1, 1..3) # gets three elements before and three after
|
373
|
+
# ele.siblings_at(1, 5, 7) # gets three elements at offsets below the current element
|
374
|
+
# ele.siblings_at(0, 5..6) # the current element and two others
|
375
|
+
#
|
376
|
+
# Like the other "sibling" methods, this doesn't find text and comment nodes.
|
377
|
+
# Use nodes_at to include those nodes.
|
378
|
+
def siblings_at(*pos)
|
379
|
+
sib = parent.containers
|
380
|
+
i, si = 0, sib.index(self)
|
381
|
+
Elements[*
|
382
|
+
sib.select do |x|
|
383
|
+
sel = case i - si when *pos
|
384
|
+
true
|
385
|
+
end
|
386
|
+
i += 1
|
387
|
+
sel
|
388
|
+
end
|
389
|
+
]
|
390
|
+
end
|
391
|
+
|
392
|
+
# Replace +old+, a child of the current node, with +new+ node.
|
393
|
+
def replace_child(old, new)
|
394
|
+
reparent new
|
395
|
+
children[children.index(old), 1] = [*new]
|
396
|
+
end
|
397
|
+
|
398
|
+
# Insert +nodes+, an array of HTML elements or a single element,
# before the node +ele+, a child of the current node. If +ele+ is
# not among the children, the nodes are placed at the front.
# The nodes keep the order they have in +nodes+.
def insert_before(nodes, ele)
  case nodes
  when Array
    # With a missing anchor every node is spliced in at index 0, so
    # walk the list backwards in that case to keep the given order.
    ordered = children.index(ele) ? nodes : nodes.reverse
    ordered.each { |n| insert_before(n, ele) }
  else
    reparent nodes
    children[children.index(ele) || 0, 0] = nodes
  end
end
|
409
|
+
|
410
|
+
# Insert +nodes+, an array of HTML elements or a single element,
# after the node +ele+, a child of the current node. If +ele+ is
# not among the children, the nodes are appended at the end.
# The nodes keep the order they have in +nodes+.
def insert_after(nodes, ele)
  case nodes
  when Array
    # Each node is spliced in directly behind +ele+, so walk the list
    # backwards to keep the given order. When +ele+ is missing the
    # nodes are appended instead, and forward order is already right.
    ordered = children.index(ele) ? nodes.reverse : nodes
    ordered.each { |n| insert_after(n, ele) }
  else
    reparent nodes
    idx = children.index(ele)
    children[idx ? idx + 1 : children.length, 0] = nodes
  end
end
|
422
|
+
|
423
|
+
# +each_child+ iterates over each child.
|
424
|
+
def each_child(&block) # :yields: child_node
|
425
|
+
children.each(&block)
|
426
|
+
nil
|
427
|
+
end
|
428
|
+
|
429
|
+
# +each_child_with_index+ iterates over each child.
|
430
|
+
def each_child_with_index(&block) # :yields: child_node, index
|
431
|
+
children.each_with_index(&block)
|
432
|
+
nil
|
433
|
+
end
|
434
|
+
|
435
|
+
# +find_element+ searches for an element whose universal name is specified by
|
436
|
+
# the arguments.
|
437
|
+
# It returns nil if not found.
|
438
|
+
def find_element(*names)
|
439
|
+
traverse_element(*names) {|e| return e }
|
440
|
+
nil
|
441
|
+
end
|
442
|
+
|
443
|
+
# Returns a list of CSS classes to which this element belongs.
|
203
444
|
def classes
|
204
445
|
get_attribute('class').to_s.strip.split(/\s+/)
|
205
446
|
end
|
206
447
|
|
207
448
|
def get_element_by_id(id)
|
208
449
|
traverse_all_element do |ele|
|
209
|
-
if eid = ele.get_attribute('id')
|
450
|
+
if ele.elem? and eid = ele.get_attribute('id')
|
210
451
|
return ele if eid.to_s == id
|
211
452
|
end
|
212
453
|
end
|
@@ -319,6 +560,12 @@ module Hpricot
|
|
319
560
|
def traverse_all_element(&block)
|
320
561
|
children.each {|c| c.traverse_all_element(&block) }
|
321
562
|
end
|
563
|
+
def xpath
|
564
|
+
"/"
|
565
|
+
end
|
566
|
+
def css_path
|
567
|
+
nil
|
568
|
+
end
|
322
569
|
end
|
323
570
|
|
324
571
|
module Elem::Trav
|
@@ -330,6 +577,7 @@ module Hpricot
|
|
330
577
|
|
331
578
|
module Leaf::Trav
|
332
579
|
def traverse_all_element
|
580
|
+
yield self
|
333
581
|
end
|
334
582
|
end
|
335
583
|
|
@@ -497,12 +745,14 @@ module Hpricot
|
|
497
745
|
end
|
498
746
|
alias_method :[], :get_attribute
|
499
747
|
def set_attribute(name, val)
|
748
|
+
altered!
|
500
749
|
self.attributes ||= {}
|
501
750
|
self.attributes[name.to_s] = val
|
502
751
|
end
|
503
752
|
alias_method :[]=, :set_attribute
|
504
753
|
def remove_attribute(name)
|
505
754
|
if has_attribute? name
|
755
|
+
altered!
|
506
756
|
self.attributes.delete(name)
|
507
757
|
end
|
508
758
|
end
|