hpricot 0.4-mswin32 → 0.5-mswin32

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,20 +3,93 @@ require 'uri'
3
3
 
4
4
  module Hpricot
5
5
  module Traverse
6
+ # Is this object the enclosing HTML or XML document?
6
7
  def doc?() Doc::Trav === self end
8
+ # Is this object an HTML or XML element?
7
9
  def elem?() Elem::Trav === self end
10
+ # Is this object an HTML text node?
8
11
  def text?() Text::Trav === self end
12
+ # Is this object an XML declaration?
9
13
  def xmldecl?() XMLDecl::Trav === self end
14
+ # Is this object a doctype tag?
10
15
  def doctype?() DocType::Trav === self end
16
+ # Is this object an XML processing instruction?
11
17
  def procins?() ProcIns::Trav === self end
18
+ # Is this object a comment?
12
19
  def comment?() Comment::Trav === self end
20
+ # Is this object a stranded end tag?
13
21
  def bogusetag?() BogusETag::Trav === self end
14
22
 
23
+ # Builds an HTML string from this node and its contents.
24
+ # If you need to write to a stream, try calling <tt>output(io)</tt>
25
+ # as a method on this object.
15
26
  def to_html
16
27
  output("")
17
28
  end
18
29
  alias_method :to_s, :to_html
19
30
 
31
+ # Attempts to preserve the original HTML of the document, only
32
+ # outputting new tags for elements which have changed.
33
+ def to_original_html
34
+ output("", :preserve => true)
35
+ end
36
+
37
+ # Puts together an array of neighboring nodes based on their proximity
38
+ # to this node. So, for example, to get the next node, you could use
39
+ # <tt>nodes_at(1)</tt>. Or, to get the previous node, use <tt>nodes_at(-1)</tt>.
40
+ #
41
+ # This method also accepts ranges and sets of numbers.
42
+ #
43
+ # ele.nodes_at(-3..-1, 1..3) # gets three nodes before and three after
44
+ # ele.nodes_at(1, 5, 7) # gets three nodes at offsets below the current node
45
+ # ele.nodes_at(0, 5..6) # the current node and two others
46
+ def nodes_at(*pos)
47
+ sib = parent.children
48
+ i, si = 0, sib.index(self)
49
+ Elements[*
50
+ sib.select do |x|
51
+ sel = case i - si when *pos
52
+ true
53
+ end
54
+ i += 1
55
+ sel
56
+ end
57
+ ]
58
+ end
59
+
60
+ # Returns the node neighboring this node to the south: just below it.
61
+ # This method includes text nodes and comments and such.
62
+ def next_node
63
+ sib = parent.children if parent # a parentless node has no siblings; yield nil instead of raising
64
+ sib[sib.index(self) + 1] if sib
65
+ end
66
+
67
+ # Returns the node neighboring this node to the north: just above it.
68
+ # This method includes text nodes and comments and such.
69
+ def previous_node
70
+ sib = parent.children if parent # a parentless node has no siblings; yield nil instead of raising
71
+ x = sib.index(self) - 1 if sib
72
+ sib[x] if x and x >= 0 # x < 0 means this is the first child; do not wrap around to the last one
73
+ end
74
+
75
+ # Adds elements immediately after this element, contained in the +html+ string.
76
+ def after(html)
77
+ parent.insert_after(Hpricot.make(html), self)
78
+ end
79
+
80
+ # Adds elements immediately before this element, contained in the +html+ string.
81
+ def before(html)
82
+ parent.insert_before(Hpricot.make(html), self) # insert_before, not insert_after: new nodes go ahead of this one
83
+ end
84
+
85
+
86
+ # Replace this element and its contents with the nodes contained
87
+ # in the +html+ string.
88
+ def swap(html)
89
+ parent.altered!
90
+ parent.replace_child(self, Hpricot.make(html))
91
+ end
92
+
20
93
  def get_subnode(*indexes)
21
94
  n = self
22
95
  indexes.each {|index|
@@ -24,37 +97,36 @@ module Hpricot
24
97
  }
25
98
  n
26
99
  end
27
- end
28
100
 
29
- module Container::Trav
30
- def containers
31
- children.grep(Container::Trav)
32
- end
33
- def replace_child(old, new)
34
- children[children.index(old), 1] = [*new]
35
- end
36
- def insert_before(nodes, ele)
37
- case nodes
38
- when Array
39
- nodes.each { |n| insert_before(n, ele) }
40
- else
41
- children[children.index(ele) || 0, 0] = nodes
101
+ # Builds a string from the text contained in this node. All
102
+ # HTML elements are removed.
103
+ def to_plain_text
104
+ if respond_to? :children
105
+ children.map { |x| x.to_plain_text }.join.strip.gsub(/\n{2,}/, "\n\n")
42
106
  end
43
107
  end
44
- def insert_after(nodes, ele)
45
- case nodes
46
- when Array
47
- nodes.each { |n| insert_after(n, ele) }
48
- else
49
- idx = children.index(ele)
50
- children[idx ? idx + 1 : children.length, 0] = nodes
108
+
109
+ # Builds a string from the text contained in this node. All
110
+ # HTML elements are removed.
111
+ def inner_text
112
+ if respond_to? :children
113
+ children.map { |x| x.inner_text }.join
51
114
  end
52
115
  end
116
+ alias_method :innerText, :inner_text
117
+
118
+ # Builds an HTML string from the contents of this node.
53
119
  def inner_html
54
- children.map { |x| x.output("") }.join
120
+ if respond_to? :children
121
+ children.map { |x| x.output("") }.join
122
+ end
55
123
  end
56
124
  alias_method :innerHTML, :inner_html
125
+
126
+ # Inserts new contents into the current node, based on
127
+ # the HTML contained in string +inner+.
57
128
  def inner_html=(inner)
129
+ altered!
58
130
  case inner
59
131
  when String, IO
60
132
  self.children = Hpricot.parse(inner).children
@@ -63,63 +135,137 @@ module Hpricot
63
135
  when nil
64
136
  self.children = []
65
137
  end
138
+ reparent self.children
66
139
  end
67
140
  alias_method :innerHTML=, :inner_html=
141
+
142
+ def reparent(nodes)
143
+ altered!
144
+ [*nodes].each { |e| e.parent = self }
145
+ end
146
+ private :reparent
147
+
148
+ def clean_path(path)
149
+ path.gsub(/^\s+|\s+$/, '')
150
+ end
151
+
152
+ # Builds a unique XPath string for this node, from the
153
+ # root of the document containing it.
154
+ def xpath
155
+ if elem? and has_attribute? 'id'
156
+ "//#{self.name}[@id='#{get_attribute('id')}']"
157
+ else
158
+ sim, id = 0, 0 # sim: count of same-named siblings; id: this node's index among them
159
+ parent.children.each do |e|
160
+ id = sim if e == self
161
+ sim += 1 if e.pathname == self.pathname
162
+ end
163
+ p = File.join(parent.xpath, self.pathname)
164
+ p += "[#{id+1}]" if sim >= 2
165
+ p
166
+ end
167
+ end
168
+
169
+ # Builds a unique CSS string for this node, from the
170
+ # root of the document containing it.
171
+ def css_path
172
+ if elem? and has_attribute? 'id'
173
+ "##{get_attribute('id')}"
174
+ else
175
+ sim, id = 0, 0 # sim: count of same-named siblings; id: this node's index among them
176
+ parent.children.each do |e|
177
+ id = sim if e == self
178
+ sim += 1 if e.pathname == self.pathname
179
+ end
180
+ p = parent.css_path
181
+ p = p ? "#{p} > #{self.pathname}" : self.pathname
182
+ p += ":nth(#{id})" if sim >= 2
183
+ p
184
+ end
185
+ end
186
+
187
+ def node_position
188
+ parent.children.index(self)
189
+ end
190
+
191
+ def position
192
+ parent.children_of_type(self.pathname).index(self)
193
+ end
194
+
195
+ # Searches this node for all elements matching
196
+ # the CSS or XPath +expr+. Returns an Elements array
197
+ # containing the matching nodes. If +blk+ is given, it
198
+ # is used to iterate through the matching set.
68
199
  def search(expr, &blk)
69
200
  last = nil
70
201
  nodes = [self]
71
202
  done = []
72
203
  expr = expr.to_s
204
+ hist = []
73
205
  until expr.empty?
74
206
  expr = clean_path(expr)
75
207
  expr.gsub!(%r!^//!, '')
76
208
 
77
209
  case expr
78
210
  when %r!^/?\.\.!
79
- expr = $'
211
+ last = expr = $'
80
212
  nodes.map! { |node| node.parent }
81
213
  when %r!^[>/]!
82
- expr = $'
83
- nodes = Elements[*nodes.map { |node| node.containers }.flatten]
214
+ last = expr = $'
215
+ nodes = Elements[*nodes.map { |node| node.children if node.respond_to? :children }.flatten.compact]
84
216
  when %r!^\+!
85
- expr = $'
217
+ last = expr = $'
86
218
  nodes.map! do |node|
87
- siblings = node.parent.containers
219
+ siblings = node.parent.children
88
220
  siblings[siblings.index(node)+1]
89
221
  end
90
222
  nodes.compact!
91
223
  when %r!^~!
92
- expr = $'
224
+ last = expr = $'
93
225
  nodes.map! do |node|
94
- siblings = node.parent.containers
226
+ siblings = node.parent.children
95
227
  siblings[(siblings.index(node)+1)..-1]
96
228
  end
97
229
  nodes.flatten!
98
230
  when %r!^[|,]!
99
- expr = " #$'"
231
+ last = expr = " #$'"
100
232
  nodes.shift if nodes.first == self
101
233
  done += nodes
102
234
  nodes = [self]
103
235
  else
104
- m = expr.match %r!^([#.]?)([a-z0-9\\*_-]*)!i
105
- expr = $'
236
+ m = expr.match(%r!^([#.]?)([a-z0-9\\*_-]*)!i).to_a
237
+ after = $'
238
+ mt = after[%r!:[a-z0-9\\*_-]+!i, 0]
239
+ oop = false
240
+ if mt and not (mt == ":not" or Traverse.method_defined? "filter[#{mt}]")
241
+ after = $'
242
+ m[2] += mt
243
+ expr = after
244
+ end
106
245
  if m[1] == '#'
107
246
  oid = get_element_by_id(m[2])
108
247
  nodes = oid ? [oid] : []
248
+ expr = after
109
249
  else
110
- m[2] = "*" if m[2] == "" || m[1] == "."
250
+ m[2] = "*" if after =~ /^\(\)/ || m[2] == "" || m[1] == "."
111
251
  ret = []
112
252
  nodes.each do |node|
113
253
  case m[2]
114
254
  when '*'
255
+ node.traverse_element { |n| ret << n }
115
256
  else
116
- ret += [*node.get_elements_by_tag_name(m[2])]
257
+ if node.respond_to? :get_elements_by_tag_name
258
+ ret += [*node.get_elements_by_tag_name(m[2])] - [*(node unless last)]
259
+ end
117
260
  end
118
261
  end
119
262
  nodes = ret
120
263
  end
264
+ last = nil
121
265
  end
122
266
 
267
+ hist << expr
268
+ break if hist[-1] == hist[-2]
123
269
  nodes, expr = Elements.filter(nodes, expr)
124
270
  end
125
271
  nodes = done + nodes.flatten.uniq
@@ -132,35 +278,13 @@ module Hpricot
132
278
  end
133
279
  alias_method :/, :search
134
280
 
135
- def at(expr, &blk)
136
- search(expr, &blk).first
281
+ # Find the first matching node for the CSS or XPath
282
+ # +expr+ string.
283
+ def at(expr)
284
+ search(expr).first
137
285
  end
138
286
  alias_method :%, :at
139
287
 
140
- def clean_path(path)
141
- path.gsub(/^\s+|\s+$/, '')
142
- end
143
-
144
- # +each_child+ iterates over each child.
145
- def each_child(&block) # :yields: child_node
146
- children.each(&block)
147
- nil
148
- end
149
-
150
- # +each_child_with_index+ iterates over each child.
151
- def each_child_with_index(&block) # :yields: child_node, index
152
- children.each_with_index(&block)
153
- nil
154
- end
155
-
156
- # +find_element+ searches an element which universal name is specified by
157
- # the arguments.
158
- # It returns nil if not found.
159
- def find_element(*names)
160
- traverse_element(*names) {|e| return e }
161
- nil
162
- end
163
-
164
288
  # +traverse_element+ traverses elements in the tree.
165
289
  # It yields elements in depth first order.
166
290
  #
@@ -200,13 +324,130 @@ module Hpricot
200
324
  nil
201
325
  end
202
326
 
327
+ # Find children of a given +tag_name+.
328
+ #
329
+ # ele.children_of_type('p')
330
+ # #=> [...array of paragraphs...]
331
+ #
332
+ def children_of_type(tag_name)
333
+ if respond_to? :children
334
+ children.find_all do |x|
335
+ x.respond_to?(:pathname) && x.pathname == tag_name
336
+ end
337
+ end
338
+ end
339
+
340
+ end
341
+
342
+ module Container::Trav
343
+ # Return all children of this node which can contain other
344
+ # nodes. This is a good way to get all HTML elements which
345
+ # aren't text, comment, doctype or processing instruction nodes.
346
+ def containers
347
+ children.grep(Container::Trav)
348
+ end
349
+
350
+ # Returns the container node neighboring this node to the south: just below it.
351
+ # By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
352
+ # See Hpricot::Traverse#next_node if you need to hunt out all kinds of nodes.
353
+ def next_sibling
354
+ sib = parent.containers if parent # a parentless node has no siblings; yield nil instead of raising
355
+ sib[sib.index(self) + 1] if sib
356
+ end
357
+
358
+ # Returns the container node neighboring this node to the north: just above it.
359
+ # By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
360
+ # See Hpricot::Traverse#previous_node if you need to hunt out all kinds of nodes.
361
+ def previous_sibling
362
+ sib = parent.containers if parent # a parentless node has no siblings; yield nil instead of raising
363
+ x = sib.index(self) - 1 if sib
364
+ sib[x] if x and x >= 0 # x < 0 means this is the first container; do not wrap around to the last one
365
+ end
366
+
367
+ # Puts together an array of neighboring sibling elements based on their proximity
368
+ # to this element.
369
+ #
370
+ # This method accepts ranges and sets of numbers.
371
+ #
372
+ # ele.siblings_at(-3..-1, 1..3) # gets three elements before and three after
373
+ # ele.siblings_at(1, 5, 7) # gets three elements at offsets below the current element
374
+ # ele.siblings_at(0, 5..6) # the current element and two others
375
+ #
376
+ # Like the other "sibling" methods, this doesn't find text and comment nodes.
377
+ # Use nodes_at to include those nodes.
378
+ def siblings_at(*pos)
379
+ sib = parent.containers
380
+ i, si = 0, sib.index(self)
381
+ Elements[*
382
+ sib.select do |x|
383
+ sel = case i - si when *pos
384
+ true
385
+ end
386
+ i += 1
387
+ sel
388
+ end
389
+ ]
390
+ end
391
+
392
+ # Replace +old+, a child of the current node, with +new+ node.
393
+ def replace_child(old, new)
394
+ reparent new
395
+ children[children.index(old), 1] = [*new]
396
+ end
397
+
398
+ # Insert +nodes+, an array of HTML elements or a single element,
399
+ # before the node +ele+, a child of the current node.
400
+ def insert_before(nodes, ele)
401
+ case nodes
402
+ when Array
403
+ nodes.each { |n| insert_before(n, ele) }
404
+ else
405
+ reparent nodes
406
+ children[children.index(ele) || 0, 0] = nodes
407
+ end
408
+ end
409
+
410
+ # Insert +nodes+, an array of HTML elements or a single element,
411
+ # after the node +ele+, a child of the current node.
412
+ def insert_after(nodes, ele)
413
+ case nodes
414
+ when Array
415
+ nodes.each { |n| insert_after(n, ele) }
416
+ else
417
+ reparent nodes
418
+ idx = children.index(ele)
419
+ children[idx ? idx + 1 : children.length, 0] = nodes
420
+ end
421
+ end
422
+
423
+ # +each_child+ iterates over each child.
424
+ def each_child(&block) # :yields: child_node
425
+ children.each(&block)
426
+ nil
427
+ end
428
+
429
+ # +each_child_with_index+ iterates over each child, also yielding its index.
430
+ def each_child_with_index(&block) # :yields: child_node, index
431
+ children.each_with_index(&block)
432
+ nil
433
+ end
434
+
435
+ # +find_element+ searches for an element whose universal name is specified by
436
+ # the arguments.
437
+ # It returns nil if not found.
438
+ def find_element(*names)
439
+ traverse_element(*names) {|e| return e }
440
+ nil
441
+ end
442
+
443
+ # Returns a list of CSS classes to which this element belongs.
203
444
  def classes
204
445
  get_attribute('class').to_s.strip.split(/\s+/)
205
446
  end
206
447
 
207
448
  def get_element_by_id(id)
208
449
  traverse_all_element do |ele|
209
- if eid = ele.get_attribute('id')
450
+ if ele.elem? and eid = ele.get_attribute('id')
210
451
  return ele if eid.to_s == id
211
452
  end
212
453
  end
@@ -319,6 +560,12 @@ module Hpricot
319
560
  def traverse_all_element(&block)
320
561
  children.each {|c| c.traverse_all_element(&block) }
321
562
  end
563
+ def xpath
564
+ "/"
565
+ end
566
+ def css_path
567
+ nil
568
+ end
322
569
  end
323
570
 
324
571
  module Elem::Trav
@@ -330,6 +577,7 @@ module Hpricot
330
577
 
331
578
  module Leaf::Trav
332
579
  def traverse_all_element
580
+ yield self
333
581
  end
334
582
  end
335
583
 
@@ -497,12 +745,14 @@ module Hpricot
497
745
  end
498
746
  alias_method :[], :get_attribute
499
747
  def set_attribute(name, val)
748
+ altered!
500
749
  self.attributes ||= {}
501
750
  self.attributes[name.to_s] = val
502
751
  end
503
752
  alias_method :[]=, :set_attribute
504
753
  def remove_attribute(name)
505
754
  if has_attribute? name
755
+ altered!
506
756
  self.attributes.delete(name)
507
757
  end
508
758
  end