thbar-hpricot 0.8.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. data/CHANGELOG +104 -0
  2. data/COPYING +18 -0
  3. data/README.md +276 -0
  4. data/Rakefile +234 -0
  5. data/ext/fast_xs/FastXsService.java +1123 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +210 -0
  8. data/ext/hpricot_scan/HpricotCss.java +850 -0
  9. data/ext/hpricot_scan/HpricotScanService.java +2099 -0
  10. data/ext/hpricot_scan/extconf.rb +9 -0
  11. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  12. data/ext/hpricot_scan/hpricot_css.c +3511 -0
  13. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  14. data/ext/hpricot_scan/hpricot_css.rl +120 -0
  15. data/ext/hpricot_scan/hpricot_scan.c +7045 -0
  16. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  17. data/ext/hpricot_scan/hpricot_scan.java.rl +1161 -0
  18. data/ext/hpricot_scan/hpricot_scan.rl +902 -0
  19. data/extras/hpricot.png +0 -0
  20. data/lib/hpricot.rb +26 -0
  21. data/lib/hpricot/blankslate.rb +63 -0
  22. data/lib/hpricot/builder.rb +216 -0
  23. data/lib/hpricot/elements.rb +514 -0
  24. data/lib/hpricot/htmlinfo.rb +691 -0
  25. data/lib/hpricot/inspect.rb +103 -0
  26. data/lib/hpricot/modules.rb +40 -0
  27. data/lib/hpricot/parse.rb +38 -0
  28. data/lib/hpricot/tag.rb +219 -0
  29. data/lib/hpricot/tags.rb +164 -0
  30. data/lib/hpricot/traverse.rb +839 -0
  31. data/lib/hpricot/xchar.rb +94 -0
  32. data/test/files/basic.xhtml +17 -0
  33. data/test/files/boingboing.html +2266 -0
  34. data/test/files/cy0.html +3653 -0
  35. data/test/files/immob.html +400 -0
  36. data/test/files/pace_application.html +1320 -0
  37. data/test/files/tenderlove.html +16 -0
  38. data/test/files/uswebgen.html +220 -0
  39. data/test/files/utf8.html +1054 -0
  40. data/test/files/week9.html +1723 -0
  41. data/test/files/why.xml +19 -0
  42. data/test/load_files.rb +7 -0
  43. data/test/nokogiri-bench.rb +64 -0
  44. data/test/test_alter.rb +96 -0
  45. data/test/test_builder.rb +37 -0
  46. data/test/test_parser.rb +457 -0
  47. data/test/test_paths.rb +25 -0
  48. data/test/test_preserved.rb +88 -0
  49. data/test/test_xml.rb +28 -0
  50. metadata +124 -0
@@ -0,0 +1,839 @@
1
+ require 'hpricot/elements'
2
+ require 'uri'
3
+
4
+ module Hpricot
5
+ module Traverse
6
# True when this object is the enclosing HTML or XML document.
def doc?
  Doc::Trav === self
end

# True when this object is an HTML or XML element.
def elem?
  Elem::Trav === self
end

# True when this object is an HTML text node.
def text?
  Text::Trav === self
end

# True when this object is an XML declaration.
def xmldecl?
  XMLDecl::Trav === self
end

# True when this object is a doctype tag.
def doctype?
  DocType::Trav === self
end

# True when this object is an XML processing instruction.
def procins?
  ProcIns::Trav === self
end

# True when this object is a comment.
def comment?
  Comment::Trav === self
end

# True when this object is a stranded end tag.
def bogusetag?
  BogusETag::Trav === self
end
22
+
23
+ # Parses an HTML string, making an HTML fragment based on
24
+ # the options used to create the container document.
25
def make(input = nil, &blk)
  container = parent
  # Delegate to the container when it knows how to make fragments;
  # otherwise build a standalone fragment and hand back its children.
  if container && container.respond_to?(:make)
    container.make(input, &blk)
  else
    Hpricot.make(input, &blk).children
  end
end
32
+
33
+ # Builds an HTML string from this node and its contents.
34
+ # If you need to write to a stream, try calling <tt>output(io)</tt>
35
+ # as a method on this object.
36
def to_html
  # Render into a fresh string buffer via output.
  buffer = ""
  output(buffer)
end
39
+ alias_method :to_s, :to_html
40
+
41
+ # Attempts to preserve the original HTML of the document, only
42
+ # outputting new tags for elements which have changed.
43
def to_original_html
  # Same as to_html, but asks output to run with the :preserve option on.
  buffer = ""
  output(buffer, :preserve => true)
end
46
+
47
# Returns the zero-based position of the first child whose name equals
# +name+, or of the first text node when +name+ is "text()".
# "*" always yields 0; -1 means no child matched.
def index(name)
  return 0 if name == "*"
  (children || []).each_with_index do |child, pos|
    named = child.respond_to?(:name) && name == child.name
    return pos if named || (child.text? && name == "text()")
  end
  -1
end
57
+
58
+ # Puts together an array of neighboring nodes based on their proximity
59
+ # to this node. So, for example, to get the next node, you could use
60
+ # <tt>nodes_at(1)</tt>. Or, to get the previous node, use <tt>nodes_at(-1)</tt>.
61
+ #
62
+ # This method also accepts ranges and sets of numbers.
63
+ #
64
+ # ele.nodes_at(-3..-1, 1..3) # gets three nodes before and three after
65
+ # ele.nodes_at(1, 5, 7) # gets three nodes at offsets below the current node
66
+ # ele.nodes_at(0, 5..6) # the current node and two others
67
def nodes_at(*pos)
  sib = parent.children
  si = sib.index(self)
  # Ranges with String endpoints are resolved through parent.index and
  # rewritten as numeric offsets relative to this node, so every entry
  # can be matched with === below.
  offsets = pos.map do |r|
    if r.is_a?(Range) && r.begin.is_a?(String)
      Range.new(parent.index(r.begin) - si, parent.index(r.end) - si, r.exclude_end?)
    else
      r
    end
  end
  i = 0
  picked = sib.select do |_node|
    # i - si is this sibling's offset from the current node.
    hit = offsets.any? { |wanted| wanted === i - si }
    i += 1
    hit
  end
  Elements[*picked]
end
88
+
89
+ # Returns the node neighboring this node to the south: just below it.
90
+ # This method includes text nodes and comments and such.
91
def next
  # A node without a parent has no siblings; return nil instead of
  # calling children on nil.
  return nil unless parent
  sib = parent.children
  sib[sib.index(self) + 1]
end
95
+ alias_method :next_node, :next
96
+
97
+ # Returns the node neighboring this node to the north: just above it.
98
+ # This method includes text nodes and comments and such.
99
def previous
  # Guard before dereferencing: an orphan node has no siblings.
  return nil unless parent
  sib = parent.children
  return nil unless sib
  x = sib.index(self) - 1
  # x is -1 for the first child; a negative index would wrap to the end.
  sib[x] if x >= 0
end
104
+ alias_method :previous_node, :previous
105
+
106
+ # Find all preceding nodes.
107
def preceding
  siblings = parent.children
  here = siblings.index(self)
  # Everything before this node's position.
  Elements[*siblings[0...here]]
end
112
+
113
+ # Find all nodes which follow the current one.
114
def following
  siblings = parent.children
  start = siblings.index(self) + 1
  # Everything after this node's position.
  Elements[*siblings[start...siblings.length]]
end
119
+
120
+ # Adds elements immediately after this element, contained in the +html+ string.
121
def after(html = nil, &blk)
  container = parent
  fragment = make(html, &blk)
  container.insert_after(fragment, self)
end
124
+
125
+ # Adds elements immediately before this element, contained in the +html+ string.
126
def before(html = nil, &blk)
  container = parent
  fragment = make(html, &blk)
  container.insert_before(fragment, self)
end
129
+
130
+
131
+ # Replace this element and its contents with the nodes contained
132
+ # in the +html+ string.
133
def swap(html = nil, &blk)
  container = parent
  # Flag the parent as altered before its child list is rewritten.
  container.altered!
  replacement = make(html, &blk)
  container.replace_child(self, replacement)
end
137
+
138
# Follows +indexes+ one step at a time, starting at this node and calling
# get_subnode_internal on each intermediate result. Returns the final node
# (this node itself when no indexes are given).
def get_subnode(*indexes)
  indexes.inject(self) do |node, index|
    node.get_subnode_internal(index)
  end
end
145
+
146
+ # Builds a string from the text contained in this node. All
147
+ # HTML elements are removed.
148
def to_plain_text
  return "" unless respond_to?(:children) && children
  joined = children.map { |child| child.to_plain_text }.join
  # Trim the ends and collapse runs of blank lines down to one.
  joined.strip.gsub(/\n{2,}/, "\n\n")
end
155
+
156
+ # Builds a string from the text contained in this node. All
157
+ # HTML elements are removed.
158
def inner_text
  return "" unless respond_to?(:children) && children
  children.map { |child| child.inner_text }.join
end
165
+ alias_method :innerText, :inner_text
166
+
167
+ # Builds an HTML string from the contents of this node.
168
def html(inner = nil, &blk)
  unless inner || blk
    # Reader form: serialize the children, or "" when there are none.
    return "" unless respond_to?(:children) && children
    return children.map { |child| child.output("") }.join
  end

  # Writer form: replace the children. An Array is used as-is; anything
  # else goes through make.
  altered!
  self.children = inner.is_a?(Array) ? inner : make(inner, &blk)
  reparent self.children
end
186
+ alias_method :inner_html, :html
187
+ alias_method :innerHTML, :inner_html
188
+
189
+ # Inserts new contents into the current node, based on
190
+ # the HTML contained in string +inner+.
191
def inner_html=(inner)
  # nil/false means "no contents": pass an empty list so html still takes
  # its writer branch.
  contents = inner ? inner : []
  html(contents)
end
194
+ alias_method :innerHTML=, :inner_html=
195
+
196
# Marks this node as altered and points each of +nodes+ (a single node or
# a list) at this node as its parent. Does nothing when +nodes+ is nil.
def reparent(nodes)
  return unless nodes
  altered!
  [*nodes].each do |node|
    node.parent = self
  end
end
201
+ private :reparent
202
+
203
# Returns a copy of +path+ with leading and trailing whitespace removed.
# The anchors are line-based, so this applies to each line of +path+.
def clean_path(path)
  edge_whitespace = /^\s+|\s+$/
  path.gsub(edge_whitespace, '')
end
206
+
207
+ # Builds a unique XPath string for this node, from the
208
+ # root of the document containing it.
209
def xpath
  # An element with an id is addressed directly by it.
  return "//#{self.name}[@id='#{get_attribute('id')}']" if elem? && has_attribute?('id')

  # same:    how many siblings share this node's pathname
  # ordinal: how many of those come before this node
  same, ordinal = 0, 0
  (parent.children || []).each do |e|
    ordinal = same if e == self
    same += 1 if e.pathname == self.pathname
  end
  path = File.join(parent.xpath, self.pathname)
  # The positional predicate is 1-based and only added when ambiguous.
  path += "[#{ordinal + 1}]" if same >= 2
  path
end
223
+
224
+ # Builds a unique CSS string for this node, from the
225
+ # root of the document containing it.
226
def css_path
  # An element with an id is addressed directly by it.
  return "##{get_attribute('id')}" if elem? && has_attribute?('id')

  # same:    how many siblings share this node's pathname
  # ordinal: how many of those come before this node
  same, ordinal = 0, 0
  (parent.children || []).each do |e|
    ordinal = same if e == self
    same += 1 if e.pathname == self.pathname
  end
  prefix = parent.css_path
  # A nil parent path means this node starts the path.
  path = prefix ? "#{prefix} > #{self.pathname}" : self.pathname
  # Unlike xpath, the :nth index is emitted zero-based.
  path += ":nth(#{ordinal})" if same >= 2
  path
end
241
+
242
# Position of this node among all of its parent's children.
def node_position
  siblings = parent.children
  siblings.index(self)
end
245
+
246
# Position of this node among the parent's children that share its pathname.
def position
  same_kind = parent.children_of_type(self.pathname)
  same_kind.index(self)
end
249
+
250
+ # Searches this node for all elements matching
251
+ # the CSS or XPath +expr+. Returns an Elements array
252
+ # containing the matching nodes. If +blk+ is given, it
253
+ # is used to iterate through the matching set.
254
def search(expr, &blk)
  # A Range of expressions is resolved by looking up each endpoint with
  # +at+ and handing both to Elements.expand.
  if Range === expr
    return Elements.expand(at(expr.begin), at(expr.end), expr.exclude_end?)
  end
  last = nil      # remainder left by the most recent combinator step, if any
  nodes = [self]  # current working set
  done = []       # results of alternatives already completed (before "," or "|")
  expr = expr.to_s
  hist = []       # expression as it stood after each pass, for the stall check
  until expr.empty?
    expr = clean_path(expr)
    expr.gsub!(%r!^//!, '')

    # NOTE: the branches below read $' (the text after the last regexp
    # match), so the order of statements matters.
    case expr
    when %r!^/?\.\.!
      # ".." step: move every node to its parent.
      last = expr = $'
      nodes.map! { |node| node.parent }
    when %r!^[>/]\s*!
      # ">" or "/" step: replace the set with the nodes' children.
      last = expr = $'
      nodes = Elements[*nodes.map { |node| node.children if node.respond_to? :children }.flatten.compact]
    when %r!^\+!
      # "+" step: the sibling immediately after each node.
      last = expr = $'
      nodes.map! do |node|
        siblings = node.parent.children
        siblings[siblings.index(node)+1]
      end
      nodes.compact!
    when %r!^~!
      # "~" step: every sibling after each node.
      last = expr = $'
      nodes.map! do |node|
        siblings = node.parent.children
        siblings[(siblings.index(node)+1)..-1]
      end
      nodes.flatten!
    when %r!^[|,]!
      # "," or "|": bank what was found so far and start again from self.
      last = expr = " #$'"
      nodes.shift if nodes.first == self
      done += nodes
      nodes = [self]
    else
      # Leading token: optional "#" or "." marker followed by a name.
      m = expr.match(%r!^([#.]?)([a-z0-9\\*_-]*)!i).to_a
      after = $'
      mt = after[%r!:[a-z0-9\\*_-]+!i, 0]
      # A ":suffix" that is neither ":not" nor a defined filter method on
      # Traverse is folded into the name itself.
      if mt and not (mt == ":not" or Traverse.method_defined? "filter[#{mt}]")
        after = $'
        m[2] += mt
        expr = after
      end
      if m[1] == '#'
        oid = get_element_by_id(m[2])
        nodes = oid ? [oid] : []
        expr = after
      else
        # "name()", an empty name, or a class selector all start from
        # every node; narrowing is left to Elements.filter below.
        m[2] = "*" if after =~ /^\(\)/ || m[2] == "" || m[1] == "."
        ret = []
        nodes.each do |node|
          case m[2]
          when '*'
            node.traverse_element { |n| ret << n }
          else
            if node.respond_to? :get_elements_by_tag_name
              # The node itself is removed from its own results unless
              # the previous step was a combinator.
              ret += [*node.get_elements_by_tag_name(m[2])] - [*(node unless last)]
            end
          end
        end
        nodes = ret
      end
      last = nil
    end

    hist << expr
    # Stop when the expression is the same as after the previous pass.
    break if hist[-1] == hist[-2]
    nodes, expr = Elements.filter(nodes, expr)
  end
  nodes = done + nodes.flatten.uniq
  if blk
    nodes.each(&blk)
    self
  else
    Elements[*nodes]
  end
end
337
+ alias_method :/, :search
338
+
339
+ # Find the first matching node for the CSS or XPath
340
+ # +expr+ string.
341
def at(expr)
  matches = search(expr)
  matches.first
end
344
+ alias_method :%, :at
345
+
346
+ # +traverse_element+ traverses elements in the tree.
347
+ # It yields elements in depth first order.
348
+ #
349
+ # If _names_ are empty, it yields all elements.
350
+ # If non-empty _names_ are given, it should be list of universal names.
351
+ #
352
+ # A nested element is yielded in depth first order as follows.
353
+ #
354
+ # t = Hpricot('<a id=0><b><a id=1 /></b><c id=2 /></a>')
355
+ # t.traverse_element("a", "c") {|e| p e}
356
+ # # =>
357
+ # {elem <a id="0"> {elem <b> {emptyelem <a id="1">} </b>} {emptyelem <c id="2">} </a>}
358
+ # {emptyelem <a id="1">}
359
+ # {emptyelem <c id="2">}
360
+ #
361
+ # Universal names are specified as follows.
362
+ #
363
+ # t = Hpricot(<<'End')
364
+ # <html>
365
+ # <meta name="robots" content="index,nofollow">
366
+ # <meta name="author" content="Who am I?">
367
+ # </html>
368
+ # End
369
+ # t.traverse_element("{http://www.w3.org/1999/xhtml}meta") {|e| p e}
370
+ # # =>
371
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="robots" content="index,nofollow">}
372
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="author" content="Who am I?">}
373
+ #
374
def traverse_element(*names, &block) # :yields: element
  if names.empty?
    traverse_all_element(&block)
  else
    # Use a hash as a set for the name lookups.
    wanted = names.each_with_object({}) { |n, set| set[n] = true }
    traverse_some_element(wanted, &block)
  end
  nil
end
384
+
385
+ # Find children of a given +tag_name+.
386
+ #
387
+ # ele.children_of_type('p')
388
+ # #=> [...array of paragraphs...]
389
+ #
390
def children_of_type(tag_name)
  # Objects without children yield nil rather than an empty list.
  return unless respond_to? :children
  children.find_all do |child|
    child.respond_to?(:pathname) && child.pathname == tag_name
  end
end
397
+
398
+ end
399
+
400
+ module Container::Trav
401
+ # Return all children of this node which can contain other
402
+ # nodes. This is a good way to get all HTML elements which
403
+ # aren't text, comment, doctype or processing instruction nodes.
404
def containers
  kind = Container::Trav
  children.grep(kind)
end
407
+
408
+ # Returns the container node neighboring this node to the south: just below it.
409
+ # By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
410
+ # See Hpricot::Traverse#next_node if you need to hunt out all kinds of nodes.
411
def next_sibling
  # A node without a parent has no siblings; return nil instead of
  # calling containers on nil.
  return nil unless parent
  sib = parent.containers
  sib[sib.index(self) + 1]
end
415
+
416
+ # Returns the container node neighboring this node to the north: just above it.
417
+ # By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
418
+ # See Hpricot::Traverse#previous_node if you need to hunt out all kinds of nodes.
419
def previous_sibling
  # Guard before dereferencing: an orphan node has no siblings.
  return nil unless parent
  sib = parent.containers
  return nil unless sib
  x = sib.index(self) - 1
  # x is -1 for the first container; a negative index would wrap to the end.
  sib[x] if x >= 0
end
424
+
425
+ # Find all preceding sibling elements. Like the other "sibling" methods, this weeds
426
+ # out text and comment nodes.
427
def preceding_siblings()
  siblings = parent.containers
  here = siblings.index(self)
  # Every container before this one.
  Elements[*siblings[0...here]]
end
432
+
433
+ # Find sibling elements which follow the current one. Like the other "sibling" methods, this weeds
434
+ # out text and comment nodes.
435
def following_siblings()
  siblings = parent.containers
  start = siblings.index(self) + 1
  # Every container after this one.
  Elements[*siblings[start...siblings.length]]
end
440
+
441
+ # Puts together an array of neighboring sibling elements based on their proximity
442
+ # to this element.
443
+ #
444
+ # This method accepts ranges and sets of numbers.
445
+ #
446
+ # ele.siblings_at(-3..-1, 1..3) # gets three elements before and three after
447
+ # ele.siblings_at(1, 5, 7) # gets three elements at offsets below the current element
448
+ # ele.siblings_at(0, 5..6) # the current element and two others
449
+ #
450
+ # Like the other "sibling" methods, this doesn't find text and comment nodes.
451
+ # Use nodes_at to include those nodes.
452
def siblings_at(*pos)
  sib = parent.containers
  si = sib.index(self)
  i = 0
  picked = sib.select do |_node|
    # i - si is this sibling's offset from the current element; it is
    # kept when any requested number or range matches it.
    hit = pos.any? { |wanted| wanted === i - si }
    i += 1
    hit
  end
  Elements[*picked]
end
465
+
466
+ # Replace +old+, a child of the current node, with +new+ node.
467
def replace_child(old, new)
  reparent new
  kids = children
  # Splice the replacement (one node or several) over the single old slot.
  kids[kids.index(old), 1] = [*new]
end
471
+
472
+ # Insert +nodes+, an array of HTML elements or a single element,
473
+ # before the node +ele+, a child of the current node.
474
def insert_before(nodes, ele)
  if Array === nodes
    # Insert one at a time; each lands directly before +ele+.
    nodes.each { |n| insert_before(n, ele) }
  else
    reparent nodes
    # Falls back to the front of the list when +ele+ is not a child.
    children[children.index(ele) || 0, 0] = nodes
  end
end
483
+
484
+ # Insert +nodes+, an array of HTML elements or a single element,
485
+ # after the node +ele+, a child of the current node.
486
def insert_after(nodes, ele)
  if Array === nodes
    # Walk the list backwards: each node is placed right after +ele+, so
    # the list ends up in its original order.
    nodes.reverse_each { |n| insert_after(n, ele) }
  else
    reparent nodes
    idx = children.index(ele)
    # Appends at the end when +ele+ is not a child.
    slot = idx ? idx + 1 : children.length
    children[slot, 0] = nodes
  end
end
496
+
497
+ # +each_child+ iterates over each child.
498
def each_child(&block) # :yields: child_node
  kids = children
  kids.each(&block) if kids
  nil
end
502
+
503
+ # +each_child_with_index+ iterates over each child.
504
def each_child_with_index(&block) # :yields: child_node, index
  kids = children
  kids.each_with_index(&block) if kids
  nil
end
508
+
509
+ # +find_element+ searches for an element whose universal name is specified by
510
+ # the arguments.
511
+ # It returns nil if not found.
512
def find_element(*names)
  traverse_element(*names) do |found|
    # Leave the whole method on the first element yielded.
    return found
  end
  nil
end
516
+
517
+ # Returns a list of CSS classes to which this element belongs.
518
def classes
  # to_s turns a missing attribute into "", which splits to an empty list.
  raw = get_attribute('class').to_s
  raw.strip.split(/\s+/)
end
521
+
522
# Returns the first element visited by traverse_all_element whose id
# attribute equals +id+, or nil when there is none.
def get_element_by_id(id)
  traverse_all_element do |ele|
    next unless ele.elem?
    eid = ele.get_attribute('id')
    return ele if eid && eid.to_s == id
  end
  nil
end
530
+
531
# Collects the elements matching any of the given tag names into an
# Elements list. Each name is tried both bare and XHTML-qualified. "*"
# entries are dropped; with no names left, traverse_element is called
# without names.
def get_elements_by_tag_name(*a)
  found = Elements[]
  a.delete("*")
  names = a.map { |tag| [tag, "{http://www.w3.org/1999/xhtml}#{tag}"] }.flatten
  traverse_element(*names) do |e|
    found << e if e.elem?
  end
  found
end
539
+
540
# Visits the XHTML-qualified elements that can carry a link and yields
# (element, attribute name, attribute value) for every link attribute
# that is present.
def each_hyperlink_attribute
  traverse_element(
    '{http://www.w3.org/1999/xhtml}a',
    '{http://www.w3.org/1999/xhtml}area',
    '{http://www.w3.org/1999/xhtml}link',
    '{http://www.w3.org/1999/xhtml}img',
    '{http://www.w3.org/1999/xhtml}object',
    '{http://www.w3.org/1999/xhtml}q',
    '{http://www.w3.org/1999/xhtml}blockquote',
    '{http://www.w3.org/1999/xhtml}ins',
    '{http://www.w3.org/1999/xhtml}del',
    '{http://www.w3.org/1999/xhtml}form',
    '{http://www.w3.org/1999/xhtml}input',
    '{http://www.w3.org/1999/xhtml}head',
    '{http://www.w3.org/1999/xhtml}base',
    '{http://www.w3.org/1999/xhtml}script') do |elem|
    # Pick the link-bearing attribute names for this kind of element.
    attrs =
      case elem.name
      when %r{\{http://www.w3.org/1999/xhtml\}(?:base|a|area|link)\z}i
        ['href']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:img)\z}i
        ['src', 'longdesc', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:object)\z}i
        ['classid', 'codebase', 'data', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:q|blockquote|ins|del)\z}i
        ['cite']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:form)\z}i
        ['action']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:input)\z}i
        ['src', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:head)\z}i
        ['profile']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:script)\z}i
        ['src', 'for']
      end
    attrs.each do |attr|
      hyperlink = elem.get_attribute(attr)
      yield elem, attr, hyperlink if hyperlink
    end
  end
end
581
+ private :each_hyperlink_attribute
582
+
583
+ # +each_hyperlink_uri+ traverses hyperlinks such as HTML href attribute
584
+ # of A element.
585
+ #
586
+ # It yields Hpricot::Text and URI for each hyperlink.
587
+ #
588
+ # The URI objects are created with a base URI which is given by
589
+ # HTML BASE element or the argument ((|base_uri|)).
590
+ # +each_hyperlink_uri+ doesn't yield the href of the BASE element.
591
def each_hyperlink_uri(base_uri=nil) # :yields: hyperlink, uri
  base_uri = URI.parse(base_uri) if String === base_uri
  links = []
  # First pass: gather every link, letting a BASE element replace the
  # base URI instead of being reported.
  each_hyperlink_attribute do |elem, attr, hyperlink|
    if %r{\{http://www.w3.org/1999/xhtml\}(?:base)\z}i =~ elem.name
      base_uri = URI.parse(hyperlink.to_s)
    else
      links << hyperlink
    end
  end
  # Second pass: resolve against the base when there is one.
  links.each do |hyperlink|
    target = hyperlink.to_s
    uri = base_uri ? base_uri + target : URI.parse(target)
    yield hyperlink, uri
  end
end
607
+
608
+ # +each_hyperlink+ traverses hyperlinks such as HTML href attribute
609
+ # of A element.
610
+ #
611
+ # It yields Hpricot::Text.
612
+ #
613
+ # Note that +each_hyperlink+ yields HTML href attribute of BASE element.
614
def each_hyperlink # :yields: text
  # Only the attribute value is passed on; element and attribute name
  # are not needed here.
  each_hyperlink_attribute do |_elem, _attr, hyperlink|
    yield hyperlink
  end
end
620
+
621
+ # +each_uri+ traverses hyperlinks such as HTML href attribute
622
+ # of A element.
623
+ #
624
+ # It yields URI for each hyperlink.
625
+ #
626
+ # The URI objects are created with a base URI which is given by
627
+ # HTML BASE element or the argument ((|base_uri|)).
628
def each_uri(base_uri=nil) # :yields: URI
  each_hyperlink_uri(base_uri) do |_hyperlink, uri|
    yield uri
  end
end
631
+ end
632
+
633
+ # :stopdoc:
634
module Doc::Trav
  # Passes +block+ down into every child; the document itself is not yielded.
  def traverse_all_element(&block)
    if children
      children.each { |child| child.traverse_all_element(&block) }
    end
  end

  # The document's own XPath is the root.
  def xpath
    "/"
  end

  # The document has no CSS path of its own.
  def css_path
    nil
  end
end
645
+
646
module Elem::Trav
  # Yields this element, then passes +block+ down into every child.
  def traverse_all_element(&block)
    yield self
    if children
      children.each { |child| child.traverse_all_element(&block) }
    end
  end
end
652
+
653
module Leaf::Trav
  # A leaf just yields itself.
  def traverse_all_element
    yield self
  end
end
658
+
659
module Doc::Trav
  # Passes the name set and +block+ down into every child; the document
  # itself is never yielded.
  def traverse_some_element(name_set, &block)
    if children
      children.each { |child| child.traverse_some_element(name_set, &block) }
    end
  end
end
664
+
665
module Elem::Trav
  # Yields this element when its name is in +name_set+, then passes the
  # set and +block+ down into every child.
  def traverse_some_element(name_set, &block)
    yield self if name_set.include?(self.name)
    if children
      children.each { |child| child.traverse_some_element(name_set, &block) }
    end
  end
end
671
+
672
module Leaf::Trav
  # Leaves are never yielded by a name-based traversal.
  def traverse_some_element(name_set)
  end
end
676
+ # :startdoc:
677
+
678
module Traverse
  # Hands +block+ to traverse_text_internal and always returns nil.
  def traverse_text(&block) # :yields: text
    traverse_text_internal(&block)
    nil
  end
end
685
+
686
+ # :stopdoc:
687
module Container::Trav
  # Passes +block+ down into every child.
  def traverse_text_internal(&block)
    each_child do |child|
      child.traverse_text_internal(&block)
    end
  end
end
692
+
693
module Leaf::Trav
  # Default for leaves: nothing is yielded.
  def traverse_text_internal
  end
end
697
+
698
module Text::Trav
  # A text node yields itself.
  def traverse_text_internal
    yield self
  end
end
703
+ # :startdoc:
704
+
705
module Container::Trav
  # +filter+ rebuilds the tree without some components.
  #
  #   node.filter {|descendant_node| predicate } -> node
  #   loc.filter {|descendant_loc| predicate } -> node
  #
  # Every node below the top node is yielded to the block.
  # When the block returns false, that node is dropped.
  # When it returns true, the node is kept and, if it is an element,
  # its own children are filtered in turn.
  #
  # +filter+ returns a node, never a location object, even when self is
  # a location object.
  #
  def filter(&block)
    subst = {}
    each_child_with_index do |descendant, i|
      subst[i] =
        if !yield(descendant)
          nil
        elsif descendant.elem?
          descendant.filter(&block)
        else
          descendant
        end
    end
    to_node.subst_subnode(subst)
  end
end
735
+
736
module Doc::Trav
  # +title+ searches for a title and returns it as text.
  # It returns nil if none is found.
  #
  # +title+ looks at the following:
  #
  # - <title>...</title> in HTML
  # - <title>...</title> in RSS
  def title
    e = find_element('title',
      '{http://www.w3.org/1999/xhtml}title',
      '{http://purl.org/rss/1.0/}title',
      '{http://my.netscape.com/rdf/simple/0.9/}title')
    e.extract_text if e
  end

  # +author+ searches for an author and returns it as text.
  # It returns nil if none is found.
  #
  # +author+ looks at the following, in this order:
  #
  # - <meta name="author" content="author-name"> in HTML
  # - <link rev="made" title="author-name"> in HTML
  # - <dc:creator>author-name</dc:creator> in RSS
  # - <dc:publisher>author-name</dc:publisher> in RSS
  #
  # NOTE(review): this method calls both fetch_attr and fetch_attribute on
  # elements; neither is defined in this file -- confirm both exist.
  def author
    traverse_element('meta',
      '{http://www.w3.org/1999/xhtml}meta') do |e|
      begin
        next unless e.fetch_attr('name').downcase == 'author'
        author = e.fetch_attribute('content').strip
        return author unless author.empty?
      rescue IndexError
        # IndexError is ignored; move on to the next candidate.
      end
    end

    traverse_element('link',
      '{http://www.w3.org/1999/xhtml}link') do |e|
      begin
        next unless e.fetch_attr('rev').downcase == 'made'
        author = e.fetch_attribute('title').strip
        return author unless author.empty?
      rescue IndexError
        # IndexError is ignored; move on to the next candidate.
      end
    end

    channel = find_element('{http://purl.org/rss/1.0/}channel')
    if channel
      channel.traverse_element('{http://purl.org/dc/elements/1.1/}creator') do |e|
        begin
          author = e.extract_text.strip
          return author unless author.empty?
        rescue IndexError
        end
      end
      channel.traverse_element('{http://purl.org/dc/elements/1.1/}publisher') do |e|
        begin
          author = e.extract_text.strip
          return author unless author.empty?
        rescue IndexError
        end
      end
    end

    nil
  end

end
803
+
804
module Doc::Trav
  # Returns the document's single top-level element.
  # Raises Hpricot::Error when there is none or more than one.
  def root
    tops = (children || []).select { |c| c.elem? }
    raise Hpricot::Error, "no element" if tops.empty?
    raise Hpricot::Error, "multiple top elements" if tops.length > 1
    tops[0]
  end
end
813
+
814
module Elem::Trav
  # Is there a raw attribute stored under +name+?
  def has_attribute?(name)
    attrs = self.raw_attributes
    attrs && attrs.has_key?(name.to_s)
  end

  # Returns the attribute stored under +name+ after passing it through
  # Hpricot.uxs, or the raw falsy value when it is absent.
  def get_attribute(name)
    attrs = self.raw_attributes
    value = attrs && attrs[name.to_s]
    value ? Hpricot.uxs(value) : value
  end
  alias_method :[], :get_attribute

  # Stores +val+ (run through fast_xs) under +name+, creating the
  # attribute hash on first use, and marks the element as altered.
  def set_attribute(name, val)
    altered!
    self.raw_attributes ||= {}
    self.raw_attributes[name.to_s] = val.fast_xs
  end
  alias_method :[]=, :set_attribute

  # Deletes the attribute +name+ when present, marking the element as
  # altered; does nothing otherwise.
  def remove_attribute(name)
    key = name.to_s
    return unless has_attribute?(key)
    altered!
    self.raw_attributes.delete(key)
  end
end
838
+
839
+ end