hpricot 0.8.3-i386-mswin32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. data/CHANGELOG +104 -0
  2. data/COPYING +18 -0
  3. data/README.md +276 -0
  4. data/Rakefile +234 -0
  5. data/ext/fast_xs/FastXsService.java +1123 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +210 -0
  8. data/ext/hpricot_scan/HpricotCss.java +850 -0
  9. data/ext/hpricot_scan/HpricotScanService.java +2099 -0
  10. data/ext/hpricot_scan/extconf.rb +9 -0
  11. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  12. data/ext/hpricot_scan/hpricot_css.c +3511 -0
  13. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  14. data/ext/hpricot_scan/hpricot_css.rl +120 -0
  15. data/ext/hpricot_scan/hpricot_scan.c +7039 -0
  16. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  17. data/ext/hpricot_scan/hpricot_scan.java.rl +1161 -0
  18. data/ext/hpricot_scan/hpricot_scan.rl +896 -0
  19. data/extras/hpricot.png +0 -0
  20. data/lib/fast_xs.rb +1 -0
  21. data/lib/fast_xs/1.8/fast_xs.so +0 -0
  22. data/lib/fast_xs/1.9/fast_xs.so +0 -0
  23. data/lib/hpricot.rb +26 -0
  24. data/lib/hpricot/blankslate.rb +63 -0
  25. data/lib/hpricot/builder.rb +216 -0
  26. data/lib/hpricot/elements.rb +514 -0
  27. data/lib/hpricot/htmlinfo.rb +691 -0
  28. data/lib/hpricot/inspect.rb +103 -0
  29. data/lib/hpricot/modules.rb +40 -0
  30. data/lib/hpricot/parse.rb +38 -0
  31. data/lib/hpricot/tag.rb +219 -0
  32. data/lib/hpricot/tags.rb +164 -0
  33. data/lib/hpricot/traverse.rb +839 -0
  34. data/lib/hpricot/xchar.rb +94 -0
  35. data/lib/hpricot_scan.rb +1 -0
  36. data/lib/hpricot_scan/1.8/hpricot_scan.so +0 -0
  37. data/lib/hpricot_scan/1.9/hpricot_scan.so +0 -0
  38. data/test/files/basic.xhtml +17 -0
  39. data/test/files/boingboing.html +2266 -0
  40. data/test/files/cy0.html +3653 -0
  41. data/test/files/immob.html +400 -0
  42. data/test/files/pace_application.html +1320 -0
  43. data/test/files/tenderlove.html +16 -0
  44. data/test/files/uswebgen.html +220 -0
  45. data/test/files/utf8.html +1054 -0
  46. data/test/files/week9.html +1723 -0
  47. data/test/files/why.xml +19 -0
  48. data/test/load_files.rb +7 -0
  49. data/test/nokogiri-bench.rb +64 -0
  50. data/test/test_alter.rb +96 -0
  51. data/test/test_builder.rb +37 -0
  52. data/test/test_parser.rb +457 -0
  53. data/test/test_paths.rb +25 -0
  54. data/test/test_preserved.rb +88 -0
  55. data/test/test_xml.rb +28 -0
  56. metadata +128 -0
@@ -0,0 +1,839 @@
1
+ require 'hpricot/elements'
2
+ require 'uri'
3
+
4
+ module Hpricot
5
+ module Traverse
6
+ # Is this object the enclosing HTML or XML document?
7
+ def doc?() Doc::Trav === self end
8
+ # Is this object an HTML or XML element?
9
+ def elem?() Elem::Trav === self end
10
+ # Is this object an HTML text node?
11
+ def text?() Text::Trav === self end
12
+ # Is this object an XML declaration?
13
+ def xmldecl?() XMLDecl::Trav === self end
14
+ # Is this object a doctype tag?
15
+ def doctype?() DocType::Trav === self end
16
+ # Is this object an XML processing instruction?
17
+ def procins?() ProcIns::Trav === self end
18
+ # Is this object a comment?
19
+ def comment?() Comment::Trav === self end
20
+ # Is this object a stranded end tag?
21
+ def bogusetag?() BogusETag::Trav === self end
22
+
23
+ # Parses an HTML string, making an HTML fragment based on
24
+ # the options used to create the container document.
25
+ def make(input = nil, &blk)
26
+ if parent and parent.respond_to? :make
27
+ parent.make(input, &blk)
28
+ else
29
+ Hpricot.make(input, &blk).children
30
+ end
31
+ end
32
+
33
+ # Builds an HTML string from this node and its contents.
34
+ # If you need to write to a stream, try calling <tt>output(io)</tt>
35
+ # as a method on this object.
36
+ def to_html
37
+ output("")
38
+ end
39
+ alias_method :to_s, :to_html
40
+
41
+ # Attempts to preserve the original HTML of the document, only
42
+ # outputting new tags for elements which have changed.
43
+ def to_original_html
44
+ output("", :preserve => true)
45
+ end
46
+
47
# Returns the position among +children+ of the first child matching +name+.
# "*" always answers 0, "text()" matches the first text node, and -1 is
# returned when nothing matches or there are no children.
def index(name)
  return 0 if name == "*"
  kids = children
  return -1 unless kids
  kids.each_with_index do |child, offset|
    named = child.respond_to?(:name) && name == child.name
    return offset if named || (child.text? && name == "text()")
  end
  -1
end
57
+
58
# Puts together an array of neighboring nodes based on their proximity
# to this node. So, for example, to get the next node, you could use
# <tt>nodes_at(1)</tt>. Or, to get the previous node, use <tt>nodes_at(-1)</tt>.
#
# This method also accepts ranges and sets of numbers.
#
#   ele.nodes_at(-3..-1, 1..3) # gets three nodes before and three after
#   ele.nodes_at(1, 5, 7) # gets three nodes at offsets below the current node
#   ele.nodes_at(0, 5..6) # the current node and two others
#
# A Range whose endpoints are Strings is resolved through
# <tt>parent.index</tt> into offsets relative to this node.
# Returns an Elements collection of the selected siblings, in document order.
def nodes_at(*pos)
  sib = parent.children
  si = sib.index(self)
  # Translate name-based ranges into numeric offsets relative to this node.
  pos.map! do |r|
    if r.is_a?(Range) && r.begin.is_a?(String)
      r = Range.new(parent.index(r.begin) - si, parent.index(r.end) - si, r.exclude_end?)
    end
    r
  end
  picked = []
  sib.each_with_index do |x, i|
    offset = i - si
    # === lets plain numbers and ranges be matched uniformly.
    picked << x if pos.any? { |r| r === offset }
  end
  Elements[*picked]
end
88
+
89
# Returns the node neighboring this node to the south: just below it.
# This method includes text nodes and comments and such.
# Returns nil when this node has no parent (or the parent has no
# children list) or when it is the last child.
def next
  # Check the parent before touching its children, so orphans answer nil.
  sib = parent && parent.children
  return nil unless sib
  sib[sib.index(self) + 1]
end
95
+ alias_method :next_node, :next
96
+
97
# Returns the node neighboring this node to the north: just above it.
# This method includes text nodes and comments and such.
# Returns nil when this node has no parent (or the parent has no
# children list) or when it is the first child.
def previous
  # Check the parent before touching its children, so orphans answer nil.
  sib = parent && parent.children
  return nil unless sib
  x = sib.index(self) - 1
  # A negative index would wrap around to the end of the array.
  sib[x] if x >= 0
end
104
+ alias_method :previous_node, :previous
105
+
106
+ # Find all preceding nodes.
107
+ def preceding
108
+ sibs = parent.children
109
+ si = sibs.index(self)
110
+ return Elements[*sibs[0...si]]
111
+ end
112
+
113
+ # Find all nodes which follow the current one.
114
+ def following
115
+ sibs = parent.children
116
+ si = sibs.index(self) + 1
117
+ return Elements[*sibs[si...sibs.length]]
118
+ end
119
+
120
+ # Adds elements immediately after this element, contained in the +html+ string.
121
+ def after(html = nil, &blk)
122
+ parent.insert_after(make(html, &blk), self)
123
+ end
124
+
125
+ # Adds elements immediately before this element, contained in the +html+ string.
126
+ def before(html = nil, &blk)
127
+ parent.insert_before(make(html, &blk), self)
128
+ end
129
+
130
+
131
+ # Replace this element and its contents with the nodes contained
132
+ # in the +html+ string.
133
+ def swap(html = nil, &blk)
134
+ parent.altered!
135
+ parent.replace_child(self, make(html, &blk))
136
+ end
137
+
138
# Walks down from this node, calling +get_subnode_internal+ with each of
# +indexes+ in turn, and returns the node reached. With no indexes the
# receiver itself is returned.
def get_subnode(*indexes)
  indexes.inject(self) { |node, index| node.get_subnode_internal(index) }
end
145
+
146
# Concatenates the plain text of every child, trims the surrounding
# whitespace and collapses runs of two or more newlines into exactly two.
# Answers an empty string when this node has no children.
def to_plain_text
  return "" unless respond_to?(:children) && children
  pieces = children.map { |x| x.to_plain_text }
  pieces.join.strip.gsub(/\n{2,}/, "\n\n")
end
155
+
156
# Concatenates the +inner_text+ of every child, without any trimming.
# Answers an empty string when this node has no children.
def inner_text
  return "" unless respond_to?(:children) && children
  children.map { |x| x.inner_text }.join
end
165
+ alias_method :innerText, :inner_text
166
+
167
+ # Builds an HTML string from the contents of this node.
168
+ def html(inner = nil, &blk)
169
+ if inner or blk
170
+ altered!
171
+ case inner
172
+ when Array
173
+ self.children = inner
174
+ else
175
+ self.children = make(inner, &blk)
176
+ end
177
+ reparent self.children
178
+ else
179
+ if respond_to?(:children) and children
180
+ children.map { |x| x.output("") }.join
181
+ else
182
+ ""
183
+ end
184
+ end
185
+ end
186
+ alias_method :inner_html, :html
187
+ alias_method :innerHTML, :inner_html
188
+
189
+ # Inserts new contents into the current node, based on
190
+ # the HTML contained in string +inner+.
191
+ def inner_html=(inner)
192
+ html(inner || [])
193
+ end
194
+ alias_method :innerHTML=, :inner_html=
195
+
196
+ def reparent(nodes)
197
+ return unless nodes
198
+ altered!
199
+ [*nodes].each { |e| e.parent = self }
200
+ end
201
+ private :reparent
202
+
203
# Returns a copy of +path+ with the whitespace at the start and end of
# each line removed.
def clean_path(path)
  edge_whitespace = /^\s+|\s+$/
  path.gsub(edge_whitespace, '')
end
206
+
207
# Builds a unique XPath string for this node, from the
# root of the document containing it.
#
# An element carrying an +id+ attribute is addressed directly by that id.
# Any other node is addressed through its parent's xpath plus its own
# +pathname+, with a 1-based position suffix when several siblings share
# that pathname.
def xpath
  if elem? and has_attribute? 'id'
    "//#{self.name}[@id='#{get_attribute('id')}']"
  else
    # sim counts siblings sharing our pathname; id is how many of them
    # come before this node.
    sim, id = 0, 0
    parent.children.each do |e|
      id = sim if e == self
      sim += 1 if e.pathname == self.pathname
    end if parent.children
    path = File.join(parent.xpath, self.pathname)
    path += "[#{id+1}]" if sim >= 2
    path
  end
end
223
+
224
# Builds a unique CSS string for this node, from the
# root of the document containing it.
#
# An element carrying an +id+ attribute is addressed as <tt>#id</tt>.
# Any other node is addressed as "parent-path > pathname" (just the
# pathname when the parent has no css path), with a 0-based
# <tt>:nth(n)</tt> suffix when several siblings share that pathname.
def css_path
  if elem? and has_attribute? 'id'
    "##{get_attribute('id')}"
  else
    # sim counts siblings sharing our pathname; id is how many of them
    # come before this node.
    sim, id = 0, 0
    parent.children.each do |e|
      id = sim if e == self
      sim += 1 if e.pathname == self.pathname
    end if parent.children
    path = parent.css_path
    path = path ? "#{path} > #{self.pathname}" : self.pathname
    path += ":nth(#{id})" if sim >= 2
    path
  end
end
241
+
242
+ def node_position
243
+ parent.children.index(self)
244
+ end
245
+
246
+ def position
247
+ parent.children_of_type(self.pathname).index(self)
248
+ end
249
+
250
# Searches this node for all elements matching
# the CSS or XPath +expr+. Returns an Elements array
# containing the matching nodes. If +blk+ is given, it
# is used to iterate through the matching set and the
# receiver is returned instead.
#
# A Range +expr+ is expanded via Elements.expand between the nodes found
# by +at+ for its two endpoints.
def search(expr, &blk)
  if Range === expr
    return Elements.expand(at(expr.begin), at(expr.end), expr.exclude_end?)
  end
  last = nil      # remainder left by the most recent combinator, nil after a selector
  nodes = [self]  # working set for the alternative currently being evaluated
  done = []       # results of alternatives already finished (split on "|" or ",")
  expr = expr.to_s
  hist = []       # expr after each pass, used to detect a pass that made no progress
  until expr.empty?
    expr = clean_path(expr)
    expr.gsub!(%r!^//!, '')

    case expr
    when %r!^/?\.\.!
      # ".." : step up to each node's parent.
      last = expr = $'
      nodes.map! { |node| node.parent }
    when %r!^[>/]\s*!
      # ">" or "/" : step down to the direct children.
      last = expr = $'
      nodes = Elements[*nodes.map { |node| node.children if node.respond_to? :children }.flatten.compact]
    when %r!^\+!
      # "+" : the node immediately following each node.
      last = expr = $'
      nodes.map! do |node|
        siblings = node.parent.children
        siblings[siblings.index(node)+1]
      end
      nodes.compact!
    when %r!^~!
      # "~" : every node following each node.
      last = expr = $'
      nodes.map! do |node|
        siblings = node.parent.children
        siblings[(siblings.index(node)+1)..-1]
      end
      nodes.flatten!
    when %r!^[|,]!
      # "|" or "," : bank the current results and restart from self.
      last = expr = " #$'"
      nodes.shift if nodes.first == self
      done += nodes
      nodes = [self]
    else
      m = expr.match(%r!^([#.]?)([a-z0-9\\*_-]*)!i).to_a
      after = $'
      mt = after[%r!:[a-z0-9\\*_-]+!i, 0]
      # A ":name" suffix that is neither ":not" nor a defined filter method
      # is folded back into the tag name.
      if mt and not (mt == ":not" or Traverse.method_defined? "filter[#{mt}]")
        after = $'
        m[2] += mt
        expr = after
      end
      if m[1] == '#'
        oid = get_element_by_id(m[2])
        nodes = oid ? [oid] : []
        expr = after
      else
        m[2] = "*" if after =~ /^\(\)/ || m[2] == "" || m[1] == "."
        ret = []
        nodes.each do |node|
          case m[2]
          when '*'
            node.traverse_element { |n| ret << n }
          else
            if node.respond_to? :get_elements_by_tag_name
              # Without a preceding combinator the node itself is excluded.
              ret += [*node.get_elements_by_tag_name(m[2])] - [*(node unless last)]
            end
          end
        end
        nodes = ret
      end
      last = nil
    end

    hist << expr
    # Stop when the expression was not consumed any further by this pass.
    break if hist[-1] == hist[-2]
    nodes, expr = Elements.filter(nodes, expr)
  end
  nodes = done + nodes.flatten.uniq
  if blk
    nodes.each(&blk)
    self
  else
    Elements[*nodes]
  end
end
337
+ alias_method :/, :search
338
+
339
+ # Find the first matching node for the CSS or XPath
340
+ # +expr+ string.
341
+ def at(expr)
342
+ search(expr).first
343
+ end
344
+ alias_method :%, :at
345
+
346
+ # +traverse_element+ traverses elements in the tree.
347
+ # It yields elements in depth first order.
348
+ #
349
+ # If _names_ are empty, it yields all elements.
350
+ # If non-empty _names_ are given, it should be list of universal names.
351
+ #
352
+ # A nested element is yielded in depth first order as follows.
353
+ #
354
+ # t = Hpricot('<a id=0><b><a id=1 /></b><c id=2 /></a>')
355
+ # t.traverse_element("a", "c") {|e| p e}
356
+ # # =>
357
+ # {elem <a id="0"> {elem <b> {emptyelem <a id="1">} </b>} {emptyelem <c id="2">} </a>}
358
+ # {emptyelem <a id="1">}
359
+ # {emptyelem <c id="2">}
360
+ #
361
+ # Universal names are specified as follows.
362
+ #
363
+ # t = Hpricot(<<'End')
364
+ # <html>
365
+ # <meta name="robots" content="index,nofollow">
366
+ # <meta name="author" content="Who am I?">
367
+ # </html>
368
+ # End
369
+ # t.traverse_element("{http://www.w3.org/1999/xhtml}meta") {|e| p e}
370
+ # # =>
371
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="robots" content="index,nofollow">}
372
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="author" content="Who am I?">}
373
+ #
374
def traverse_element(*names, &block) # :yields: element
  unless names.empty?
    # Build a lookup table keyed by the requested names.
    wanted = names.inject({}) { |set, name| set[name] = true; set }
    traverse_some_element(wanted, &block)
    return nil
  end
  traverse_all_element(&block)
  nil
end
384
+
385
# Find children of a given +tag_name+.
#
#   ele.children_of_type('p')
#     #=> [...array of paragraphs...]
#
# Returns nil when the receiver has no +children+ method, and an empty
# array when +children+ is nil.
def children_of_type(tag_name)
  return nil unless respond_to? :children
  # children may be nil; treat that as "no children".
  (children || []).find_all do |x|
    x.respond_to?(:pathname) && x.pathname == tag_name
  end
end
397
+
398
+ end
399
+
400
+ module Container::Trav
401
+ # Return all children of this node which can contain other
402
+ # nodes. This is a good way to get all HTML elements which
403
+ # aren't text, comment, doctype or processing instruction nodes.
404
+ def containers
405
+ children.grep(Container::Trav)
406
+ end
407
+
408
# Returns the container node neighboring this node to the south: just below it.
# By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
# See Hpricot::Traverse#next_node if you need to hunt out all kinds of nodes.
# Returns nil when this node has no parent or is the last container.
def next_sibling
  # Check the parent before asking it for containers, so orphans answer nil.
  return nil unless parent
  sib = parent.containers
  sib[sib.index(self) + 1]
end
415
+
416
# Returns the container node neighboring this node to the north: just above it.
# By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
# See Hpricot::Traverse#previous_node if you need to hunt out all kinds of nodes.
# Returns nil when this node has no parent or is the first container.
def previous_sibling
  # Check the parent before asking it for containers, so orphans answer nil.
  return nil unless parent
  sib = parent.containers
  x = sib.index(self) - 1
  # A negative index would wrap around to the end of the array.
  sib[x] if x >= 0
end
424
+
425
+ # Find all preceding sibling elements. Like the other "sibling" methods, this weeds
426
+ # out text and comment nodes.
427
+ def preceding_siblings()
428
+ sibs = parent.containers
429
+ si = sibs.index(self)
430
+ return Elements[*sibs[0...si]]
431
+ end
432
+
433
+ # Find sibling elements which follow the current one. Like the other "sibling" methods, this weeds
434
+ # out text and comment nodes.
435
+ def following_siblings()
436
+ sibs = parent.containers
437
+ si = sibs.index(self) + 1
438
+ return Elements[*sibs[si...sibs.length]]
439
+ end
440
+
441
+ # Puts together an array of neighboring sibling elements based on their proximity
442
+ # to this element.
443
+ #
444
+ # This method accepts ranges and sets of numbers.
445
+ #
446
+ # ele.siblings_at(-3..-1, 1..3) # gets three elements before and three after
447
+ # ele.siblings_at(1, 5, 7) # gets three elements at offsets below the current element
448
+ # ele.siblings_at(0, 5..6) # the current element and two others
449
+ #
450
+ # Like the other "sibling" methods, this doesn't find text and comment nodes.
451
+ # Use nodes_at to include those nodes.
452
+ def siblings_at(*pos)
453
+ sib = parent.containers
454
+ i, si = 0, sib.index(self)
455
+ Elements[*
456
+ sib.select do |x|
457
+ sel = case i - si when *pos
458
+ true
459
+ end
460
+ i += 1
461
+ sel
462
+ end
463
+ ]
464
+ end
465
+
466
+ # Replace +old+, a child of the current node, with +new+ node.
467
+ def replace_child(old, new)
468
+ reparent new
469
+ children[children.index(old), 1] = [*new]
470
+ end
471
+
472
+ # Insert +nodes+, an array of HTML elements or a single element,
473
+ # before the node +ele+, a child of the current node.
474
+ def insert_before(nodes, ele)
475
+ case nodes
476
+ when Array
477
+ nodes.each { |n| insert_before(n, ele) }
478
+ else
479
+ reparent nodes
480
+ children[children.index(ele) || 0, 0] = nodes
481
+ end
482
+ end
483
+
484
+ # Insert +nodes+, an array of HTML elements or a single element,
485
+ # after the node +ele+, a child of the current node.
486
+ def insert_after(nodes, ele)
487
+ case nodes
488
+ when Array
489
+ nodes.reverse_each { |n| insert_after(n, ele) }
490
+ else
491
+ reparent nodes
492
+ idx = children.index(ele)
493
+ children[idx ? idx + 1 : children.length, 0] = nodes
494
+ end
495
+ end
496
+
497
+ # +each_child+ iterates over each child.
498
+ def each_child(&block) # :yields: child_node
499
+ children.each(&block) if children
500
+ nil
501
+ end
502
+
503
+ # +each_child_with_index+ iterates over each child.
504
+ def each_child_with_index(&block) # :yields: child_node, index
505
+ children.each_with_index(&block) if children
506
+ nil
507
+ end
508
+
509
+ # +find_element+ searches for an element whose universal name is specified by
510
+ # the arguments.
511
+ # It returns nil if not found.
512
+ def find_element(*names)
513
+ traverse_element(*names) {|e| return e }
514
+ nil
515
+ end
516
+
517
# Returns the whitespace-separated entries of this element's +class+
# attribute as an array (empty when the attribute is missing or blank).
def classes
  class_attr = get_attribute('class').to_s
  class_attr.strip.split(/\s+/)
end
521
+
522
# Returns the first element yielded by +traverse_all_element+ whose +id+
# attribute, converted to a string, equals +id+; nil when there is none.
def get_element_by_id(id)
  traverse_all_element do |ele|
    next unless ele.elem?
    eid = ele.get_attribute('id')
    return ele if eid && eid.to_s == id
  end
  nil
end
530
+
531
+ def get_elements_by_tag_name(*a)
532
+ list = Elements[]
533
+ a.delete("*")
534
+ traverse_element(*a.map { |tag| [tag, "{http://www.w3.org/1999/xhtml}#{tag}"] }.flatten) do |e|
535
+ list << e if e.elem?
536
+ end
537
+ list
538
+ end
539
+
540
# Yields +elem+, the attribute name and the attribute value for every
# hyperlink-carrying attribute that is present on the XHTML-namespaced
# elements traversed beneath this node.
#
# The tag => attributes table below is the single source for both the
# names handed to +traverse_element+ and the attributes inspected on
# each element.
def each_hyperlink_attribute
  ns = '{http://www.w3.org/1999/xhtml}'
  link_attrs = [
    ['a',          ['href']],
    ['area',       ['href']],
    ['link',       ['href']],
    ['img',        ['src', 'longdesc', 'usemap']],
    ['object',     ['classid', 'codebase', 'data', 'usemap']],
    ['q',          ['cite']],
    ['blockquote', ['cite']],
    ['ins',        ['cite']],
    ['del',        ['cite']],
    ['form',       ['action']],
    ['input',      ['src', 'usemap']],
    ['head',       ['profile']],
    ['base',       ['href']],
    ['script',     ['src', 'for']]
  ]
  by_name = {}
  link_attrs.each { |tag, attrs| by_name[ns + tag] = attrs }
  traverse_element(*link_attrs.map { |tag, _attrs| ns + tag }) {|elem|
    # An element whose name is not in the table contributes nothing.
    (by_name[elem.name] || []).each {|attr|
      if hyperlink = elem.get_attribute(attr)
        yield elem, attr, hyperlink
      end
    }
  }
end
581
+ private :each_hyperlink_attribute
582
+
583
# +each_hyperlink_uri+ traverses hyperlinks such as HTML href attribute
# of A element.
#
# It yields Hpricot::Text and URI for each hyperlink.
#
# The URI objects are created with a base URI which is given by
# HTML BASE element or the argument ((|base_uri|)).
# +each_hyperlink_uri+ doesn't yield href of the BASE element.
def each_hyperlink_uri(base_uri=nil) # :yields: hyperlink, uri
  base = String === base_uri ? URI.parse(base_uri) : base_uri
  pending = []
  # Collect every link first: a BASE element met anywhere during the
  # traversal replaces the base used for all of them.
  each_hyperlink_attribute do |elem, _attr, hyperlink|
    if %r{\{http://www.w3.org/1999/xhtml\}(?:base)\z}i =~ elem.name
      base = URI.parse(hyperlink.to_s)
    else
      pending << hyperlink
    end
  end
  pending.each do |hyperlink|
    target = hyperlink.to_s
    yield hyperlink, (base ? base + target : URI.parse(target))
  end
end
607
+
608
# +each_hyperlink+ traverses hyperlinks such as HTML href attribute
# of A element.
#
# It yields Hpricot::Text.
#
# Note that +each_hyperlink+ yields HTML href attribute of BASE element.
def each_hyperlink # :yields: text
  # Only the attribute value is passed on; element and attribute name are dropped.
  each_hyperlink_attribute {|_elem, _attr, hyperlink|
    yield hyperlink
  }
end
620
+
621
+ # +each_uri+ traverses hyperlinks such as HTML href attribute
622
+ # of A element.
623
+ #
624
+ # It yields URI for each hyperlink.
625
+ #
626
+ # The URI objects are created with a base URI which is given by
627
+ # HTML BASE element or the argument ((|base_uri|)).
628
+ def each_uri(base_uri=nil) # :yields: URI
629
+ each_hyperlink_uri(base_uri) {|hyperlink, uri| yield uri }
630
+ end
631
+ end
632
+
633
+ # :stopdoc:
634
+ module Doc::Trav
635
+ def traverse_all_element(&block)
636
+ children.each {|c| c.traverse_all_element(&block) } if children
637
+ end
638
+ def xpath
639
+ "/"
640
+ end
641
+ def css_path
642
+ nil
643
+ end
644
+ end
645
+
646
+ module Elem::Trav
647
+ def traverse_all_element(&block)
648
+ yield self
649
+ children.each {|c| c.traverse_all_element(&block) } if children
650
+ end
651
+ end
652
+
653
+ module Leaf::Trav
654
+ def traverse_all_element
655
+ yield self
656
+ end
657
+ end
658
+
659
+ module Doc::Trav
660
+ def traverse_some_element(name_set, &block)
661
+ children.each {|c| c.traverse_some_element(name_set, &block) } if children
662
+ end
663
+ end
664
+
665
+ module Elem::Trav
666
+ def traverse_some_element(name_set, &block)
667
+ yield self if name_set.include? self.name
668
+ children.each {|c| c.traverse_some_element(name_set, &block) } if children
669
+ end
670
+ end
671
+
672
+ module Leaf::Trav
673
+ def traverse_some_element(name_set)
674
+ end
675
+ end
676
+ # :startdoc:
677
+
678
+ module Traverse
679
+ # +traverse_text+ traverses texts in the tree
680
+ def traverse_text(&block) # :yields: text
681
+ traverse_text_internal(&block)
682
+ nil
683
+ end
684
+ end
685
+
686
+ # :stopdoc:
687
+ module Container::Trav
688
+ def traverse_text_internal(&block)
689
+ each_child {|c| c.traverse_text_internal(&block) }
690
+ end
691
+ end
692
+
693
+ module Leaf::Trav
694
+ def traverse_text_internal
695
+ end
696
+ end
697
+
698
+ module Text::Trav
699
+ def traverse_text_internal
700
+ yield self
701
+ end
702
+ end
703
+ # :startdoc:
704
+
705
+ module Container::Trav
706
+ # +filter+ rebuilds the tree without some components.
707
+ #
708
+ # node.filter {|descendant_node| predicate } -> node
709
+ # loc.filter {|descendant_loc| predicate } -> node
710
+ #
711
+ # +filter+ yields each node except top node.
712
+ # If given block returns false, corresponding node is dropped.
713
+ # If given block returns true, corresponding node is retained and
714
+ # inner nodes are examined.
715
+ #
716
+ # +filter+ returns a node.
717
+ # It doesn't return location object even if self is location object.
718
+ #
719
+ def filter(&block)
720
+ subst = {}
721
+ each_child_with_index {|descendant, i|
722
+ if yield descendant
723
+ if descendant.elem?
724
+ subst[i] = descendant.filter(&block)
725
+ else
726
+ subst[i] = descendant
727
+ end
728
+ else
729
+ subst[i] = nil
730
+ end
731
+ }
732
+ to_node.subst_subnode(subst)
733
+ end
734
+ end
735
+
736
+ module Doc::Trav
737
+ # +title+ searches title and return it as a text.
738
+ # It returns nil if not found.
739
+ #
740
+ # +title+ searches the following information.
741
+ #
742
+ # - <title>...</title> in HTML
743
+ # - <title>...</title> in RSS
744
+ def title
745
+ e = find_element('title',
746
+ '{http://www.w3.org/1999/xhtml}title',
747
+ '{http://purl.org/rss/1.0/}title',
748
+ '{http://my.netscape.com/rdf/simple/0.9/}title')
749
+ e && e.extract_text
750
+ end
751
+
752
+ # +author+ searches author and return it as a text.
753
+ # It returns nil if not found.
754
+ #
755
+ # +author+ searches the following information.
756
+ #
757
+ # - <meta name="author" content="author-name"> in HTML
758
+ # - <link rev="made" title="author-name"> in HTML
759
+ # - <dc:creator>author-name</dc:creator> in RSS
760
+ # - <dc:publisher>author-name</dc:publisher> in RSS
761
+ def author
762
+ traverse_element('meta',
763
+ '{http://www.w3.org/1999/xhtml}meta') {|e|
764
+ begin
765
+ next unless e.fetch_attr('name').downcase == 'author'
766
+ author = e.fetch_attribute('content').strip
767
+ return author if !author.empty?
768
+ rescue IndexError
769
+ end
770
+ }
771
+
772
+ traverse_element('link',
773
+ '{http://www.w3.org/1999/xhtml}link') {|e|
774
+ begin
775
+ next unless e.fetch_attr('rev').downcase == 'made'
776
+ author = e.fetch_attribute('title').strip
777
+ return author if !author.empty?
778
+ rescue IndexError
779
+ end
780
+ }
781
+
782
+ if channel = find_element('{http://purl.org/rss/1.0/}channel')
783
+ channel.traverse_element('{http://purl.org/dc/elements/1.1/}creator') {|e|
784
+ begin
785
+ author = e.extract_text.strip
786
+ return author if !author.empty?
787
+ rescue IndexError
788
+ end
789
+ }
790
+ channel.traverse_element('{http://purl.org/dc/elements/1.1/}publisher') {|e|
791
+ begin
792
+ author = e.extract_text.strip
793
+ return author if !author.empty?
794
+ rescue IndexError
795
+ end
796
+ }
797
+ end
798
+
799
+ nil
800
+ end
801
+
802
+ end
803
+
804
+ module Doc::Trav
805
+ def root
806
+ es = []
807
+ children.each {|c| es << c if c.elem? } if children
808
+ raise Hpricot::Error, "no element" if es.empty?
809
+ raise Hpricot::Error, "multiple top elements" if 1 < es.length
810
+ es[0]
811
+ end
812
+ end
813
+
814
+ module Elem::Trav
815
+ def has_attribute?(name)
816
+ self.raw_attributes && self.raw_attributes.has_key?(name.to_s)
817
+ end
818
+ def get_attribute(name)
819
+ a = self.raw_attributes && self.raw_attributes[name.to_s]
820
+ a = Hpricot.uxs(a) if a
821
+ a
822
+ end
823
+ alias_method :[], :get_attribute
824
+ def set_attribute(name, val)
825
+ altered!
826
+ self.raw_attributes ||= {}
827
+ self.raw_attributes[name.to_s] = val.fast_xs
828
+ end
829
+ alias_method :[]=, :set_attribute
830
+ def remove_attribute(name)
831
+ name = name.to_s
832
+ if has_attribute? name
833
+ altered!
834
+ self.raw_attributes.delete(name)
835
+ end
836
+ end
837
+ end
838
+
839
+ end