jerryvos-hpricot 0.8.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. data/CHANGELOG +75 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +260 -0
  5. data/ext/fast_xs/FastXsService.java +1018 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +201 -0
  8. data/ext/hpricot_scan/HpricotScanService.java +1305 -0
  9. data/ext/hpricot_scan/extconf.rb +6 -0
  10. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  11. data/ext/hpricot_scan/hpricot_css.c +3502 -0
  12. data/ext/hpricot_scan/hpricot_scan.c +6768 -0
  13. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  14. data/ext/hpricot_scan/hpricot_scan.java.rl +373 -0
  15. data/ext/hpricot_scan/hpricot_scan.rl +786 -0
  16. data/extras/mingw-rbconfig.rb +176 -0
  17. data/lib/hpricot.rb +26 -0
  18. data/lib/hpricot/blankslate.rb +63 -0
  19. data/lib/hpricot/builder.rb +216 -0
  20. data/lib/hpricot/elements.rb +510 -0
  21. data/lib/hpricot/htmlinfo.rb +691 -0
  22. data/lib/hpricot/inspect.rb +103 -0
  23. data/lib/hpricot/modules.rb +40 -0
  24. data/lib/hpricot/parse.rb +38 -0
  25. data/lib/hpricot/tag.rb +200 -0
  26. data/lib/hpricot/tags.rb +164 -0
  27. data/lib/hpricot/traverse.rb +838 -0
  28. data/lib/hpricot/xchar.rb +94 -0
  29. data/test/files/basic.xhtml +17 -0
  30. data/test/files/boingboing.html +2266 -0
  31. data/test/files/cy0.html +3653 -0
  32. data/test/files/immob.html +400 -0
  33. data/test/files/pace_application.html +1320 -0
  34. data/test/files/tenderlove.html +16 -0
  35. data/test/files/uswebgen.html +220 -0
  36. data/test/files/utf8.html +1054 -0
  37. data/test/files/week9.html +1723 -0
  38. data/test/files/why.xml +19 -0
  39. data/test/load_files.rb +7 -0
  40. data/test/test_alter.rb +77 -0
  41. data/test/test_builder.rb +37 -0
  42. data/test/test_parser.rb +420 -0
  43. data/test/test_paths.rb +25 -0
  44. data/test/test_preserved.rb +70 -0
  45. data/test/test_xml.rb +28 -0
  46. metadata +107 -0
@@ -0,0 +1,838 @@
1
+ require 'hpricot/elements'
2
+ require 'uri'
3
+
4
+ module Hpricot
5
+ module Traverse
6
+ # Is this object the enclosing HTML or XML document?
7
+ def doc?() Doc::Trav === self end
8
+ # Is this object an HTML or XML element?
9
+ def elem?() Elem::Trav === self end
10
+ # Is this object an HTML text node?
11
+ def text?() Text::Trav === self end
12
+ # Is this object an XML declaration?
13
+ def xmldecl?() XMLDecl::Trav === self end
14
+ # Is this object a doctype tag?
15
+ def doctype?() DocType::Trav === self end
16
+ # Is this object an XML processing instruction?
17
+ def procins?() ProcIns::Trav === self end
18
+ # Is this object a comment?
19
+ def comment?() Comment::Trav === self end
20
+ # Is this object a stranded end tag?
21
+ def bogusetag?() BogusETag::Trav === self end
22
+
23
+ # Parses an HTML string, making an HTML fragment based on
24
+ # the options used to create the container document.
25
+ def make(input = nil, &blk)
26
+ if parent and parent.respond_to? :make
27
+ parent.make(input, &blk)
28
+ else
29
+ Hpricot.make(input, &blk).children
30
+ end
31
+ end
32
+
33
+ # Builds an HTML string from this node and its contents.
34
+ # If you need to write to a stream, try calling <tt>output(io)</tt>
35
+ # as a method on this object.
36
+ def to_html
37
+ output("")
38
+ end
39
+ alias_method :to_s, :to_html
40
+
41
+ # Attempts to preserve the original HTML of the document, only
42
+ # outputting new tags for elements which have changed.
43
+ def to_original_html
44
+ output("", :preserve => true)
45
+ end
46
+
47
# Returns the position, among this node's children, of the first child
# matching +name+. A child matches when it responds to +name+ and its name
# equals +name+, or when it is a text node and +name+ is "text()".
# The wildcard "*" always answers 0. Returns -1 when nothing matches or
# when there is no child list.
def index(name)
  return 0 if name == "*"
  (children || []).each_with_index do |child, pos|
    named = child.respond_to?(:name) && name == child.name
    return pos if named || (child.text? && name == "text()")
  end
  -1
end
57
+
58
# Puts together an array of neighboring nodes based on their proximity
# to this node. So, for example, to get the next node, you could use
# <tt>nodes_at(1)</tt>. Or, to get the previous node, use <tt>nodes_at(-1)</tt>.
#
# This method also accepts ranges and sets of numbers.
#
#   ele.nodes_at(-3..-1, 1..3) # gets three nodes before and three after
#   ele.nodes_at(1, 5, 7) # gets three nodes at offsets below the current node
#   ele.nodes_at(0, 5..6) # the current node and two others
#
# A Range whose endpoints are Strings is translated into offsets by looking
# each endpoint up with <tt>parent.index</tt>.
#
# Returns an Elements list holding the selected siblings in document order.
def nodes_at(*pos)
  sib = parent.children
  si = sib.index(self)
  # Convert name-based ranges into numeric offsets relative to this node.
  offsets = pos.map do |r|
    if r.is_a?(Range) and r.begin.is_a?(String)
      Range.new(parent.index(r.begin) - si, parent.index(r.end) - si, r.exclude_end?)
    else
      r
    end
  end
  picked = []
  sib.each_with_index do |node, i|
    # === lets a plain number and a Range both act as a matcher,
    # exactly as the original `case ... when *pos` did.
    picked << node if offsets.any? { |o| o === (i - si) }
  end
  Elements[*picked]
end
88
+
89
# Returns the node neighboring this node to the south: just below it.
# This method includes text nodes and comments and such.
#
# Returns nil when this node has no parent, when it is the last child,
# or when it cannot be found in its parent's child list.
def next
  # Guard first: a detached node has no siblings to look through.
  return nil unless parent
  sib = parent.children
  pos = sib && sib.index(self)
  sib[pos + 1] if pos
end
95
+ alias_method :next_node, :next
96
+
97
# Returns the node neighboring this node to the north: just above it.
# This method includes text nodes and comments and such.
#
# Returns nil when this node has no parent, when it is the first child,
# or when it cannot be found in its parent's child list.
def previous
  # Guard first: a detached node has no siblings to look through.
  return nil unless parent
  sib = parent.children
  pos = sib && sib.index(self)
  # pos == 0 must answer nil; sib[-1] would wrap around to the last child.
  sib[pos - 1] if pos and pos > 0
end
104
+ alias_method :previous_node, :previous
105
+
106
# Collects every node that comes before this one under the same parent,
# wrapped in an Elements list.
def preceding
  siblings = parent.children
  here = siblings.index(self)
  Elements[*siblings[0...here]]
end
112
+
113
# Collects every node that comes after this one under the same parent,
# wrapped in an Elements list.
def following
  siblings = parent.children
  start = siblings.index(self) + 1
  Elements[*siblings[start...siblings.length]]
end
119
+
120
+ # Adds elements immediately after this element, contained in the +html+ string.
121
+ def after(html = nil, &blk)
122
+ parent.insert_after(make(html, &blk), self)
123
+ end
124
+
125
+ # Adds elements immediately before this element, contained in the +html+ string.
126
+ def before(html = nil, &blk)
127
+ parent.insert_before(make(html, &blk), self)
128
+ end
129
+
130
+
131
+ # Replace this element and its contents with the nodes contained
132
+ # in the +html+ string.
133
+ def swap(html = nil, &blk)
134
+ parent.altered!
135
+ parent.replace_child(self, make(html, &blk))
136
+ end
137
+
138
+ def get_subnode(*indexes)
139
+ n = self
140
+ indexes.each {|index|
141
+ n = n.get_subnode_internal(index)
142
+ }
143
+ n
144
+ end
145
+
146
# Produces the plain-text rendering of this node: the plain text of every
# child is concatenated, surrounding whitespace is stripped, and runs of
# two or more newlines are collapsed to exactly two.
# Nodes without a child list answer an empty string.
def to_plain_text
  return "" unless respond_to?(:children) and children
  pieces = children.map { |child| child.to_plain_text }
  pieces.join.strip.gsub(/\n{2,}/, "\n\n")
end
155
+
156
# Concatenates the inner text of every child, leaving whitespace untouched.
# Nodes without a child list answer an empty string.
def inner_text
  return "" unless respond_to?(:children) and children
  fragments = children.map { |child| child.inner_text }
  fragments.join
end
165
+ alias_method :innerText, :inner_text
166
+
167
+ # Builds an HTML string from the contents of this node.
168
+ def html(inner = nil, &blk)
169
+ if inner or blk
170
+ altered!
171
+ case inner
172
+ when Array
173
+ self.children = inner
174
+ else
175
+ self.children = make(inner, &blk)
176
+ end
177
+ reparent self.children
178
+ else
179
+ if respond_to?(:children) and children
180
+ children.map { |x| x.output("") }.join
181
+ else
182
+ ""
183
+ end
184
+ end
185
+ end
186
+ alias_method :inner_html, :html
187
+ alias_method :innerHTML, :inner_html
188
+
189
+ # Inserts new contents into the current node, based on
190
+ # the HTML contained in string +inner+.
191
+ def inner_html=(inner)
192
+ html(inner || [])
193
+ end
194
+ alias_method :innerHTML=, :inner_html=
195
+
196
+ def reparent(nodes)
197
+ altered!
198
+ [*nodes].each { |e| e.parent = self }
199
+ end
200
+ private :reparent
201
+
202
+ def clean_path(path)
203
+ path.gsub(/^\s+|\s+$/, '')
204
+ end
205
+
206
# Builds a unique XPath string for this node, from the
# root of the document containing it.
#
# An element carrying an +id+ attribute is addressed directly as
# <tt>//name[@id='...']</tt>. Any other node is addressed as its parent's
# xpath plus its own pathname; when two or more siblings share that
# pathname, a 1-based positional predicate such as <tt>[2]</tt> is appended.
def xpath
  if elem? and has_attribute? 'id'
    "//#{self.name}[@id='#{get_attribute('id')}']"
  else
    # sim counts siblings sharing our pathname; id is our rank among them.
    sim, id = 0, 0
    siblings = parent.children
    siblings.each do |e|
      id = sim if e == self
      sim += 1 if e.pathname == self.pathname
    end if siblings
    # File.join is used purely as a "/"-aware string join here.
    path = File.join(parent.xpath, self.pathname)
    path += "[#{id+1}]" if sim >= 2
    path
  end
end
222
+
223
# Builds a unique CSS string for this node, from the
# root of the document containing it.
#
# An element carrying an +id+ attribute is addressed as <tt>#id</tt>.
# Any other node is addressed as its parent's css_path, " > ", and its own
# pathname (or just the pathname when the parent's css_path is nil); when
# two or more siblings share that pathname, a zero-based
# <tt>:nth(n)</tt> suffix is appended.
def css_path
  if elem? and has_attribute? 'id'
    "##{get_attribute('id')}"
  else
    # sim counts siblings sharing our pathname; id is our rank among them.
    sim, id = 0, 0
    siblings = parent.children
    siblings.each do |e|
      id = sim if e == self
      sim += 1 if e.pathname == self.pathname
    end if siblings
    prefix = parent.css_path
    path = prefix ? "#{prefix} > #{self.pathname}" : self.pathname
    path += ":nth(#{id})" if sim >= 2
    path
  end
end
240
+
241
+ def node_position
242
+ parent.children.index(self)
243
+ end
244
+
245
+ def position
246
+ parent.children_of_type(self.pathname).index(self)
247
+ end
248
+
249
# Searches this node for all elements matching
# the CSS or XPath +expr+. Returns an Elements array
# containing the matching nodes. If +blk+ is given, it
# is used to iterate through the matching set and +self+ is
# returned instead.
#
# When +expr+ is a Range, both endpoints are resolved with +at+ and the
# result is produced by <tt>Elements.expand</tt>.
def search(expr, &blk)
  if Range === expr
    return Elements.expand(at(expr.begin), at(expr.end), expr.exclude_end?)
  end
  last = nil      # remainder left by the most recent combinator, nil after a name step
  nodes = [self]  # current working set
  done = []       # results of alternatives already closed by "|" or ","
  expr = expr.to_s
  hist = []       # expressions seen so far, used to detect lack of progress
  until expr.empty?
    expr = clean_path(expr)
    expr.gsub!(%r!^//!, '')

    case expr
    when %r!^/?\.\.!
      # ".." step: move every node to its parent.
      last = expr = $'
      nodes.map! { |node| node.parent }
    when %r!^[>/]\s*!
      # ">" or "/" step: move to the children of every node.
      last = expr = $'
      nodes = Elements[*nodes.map { |node| node.children if node.respond_to? :children }.flatten.compact]
    when %r!^\+!
      # "+" step: the node immediately following each node.
      last = expr = $'
      nodes.map! do |node|
        siblings = node.parent.children
        siblings[siblings.index(node)+1]
      end
      nodes.compact!
    when %r!^~!
      # "~" step: every node following each node under the same parent.
      last = expr = $'
      nodes.map! do |node|
        siblings = node.parent.children
        siblings[(siblings.index(node)+1)..-1]
      end
      nodes.flatten!
    when %r!^[|,]!
      # Alternative separator: bank what we have and restart from self.
      last = expr = " #$'"
      nodes.shift if nodes.first == self
      done += nodes
      nodes = [self]
    else
      # Name step: an optional "#" or "." prefix followed by a name.
      m = expr.match(%r!^([#.]?)([a-z0-9\\*_-]*)!i).to_a
      after = $'
      mt = after[%r!:[a-z0-9\\*_-]+!i, 0]
      # A ":word" that is neither ":not" nor a defined filter method is
      # folded into the name instead of being treated as a filter.
      if mt and not (mt == ":not" or Traverse.method_defined? "filter[#{mt}]")
        # $' now refers to the String#[] match made just above.
        after = $'
        m[2] += mt
        expr = after
      end
      if m[1] == '#'
        oid = get_element_by_id(m[2])
        nodes = oid ? [oid] : []
        expr = after
      else
        m[2] = "*" if after =~ /^\(\)/ || m[2] == "" || m[1] == "."
        ret = []
        nodes.each do |node|
          case m[2]
          when '*'
            node.traverse_element { |n| ret << n }
          else
            if node.respond_to? :get_elements_by_tag_name
              # The node itself is excluded unless a combinator preceded this step.
              ret += [*node.get_elements_by_tag_name(m[2])] - [*(node unless last)]
            end
          end
        end
        nodes = ret
      end
      last = nil
    end

    hist << expr
    # Stop if this pass left the expression unchanged.
    break if hist[-1] == hist[-2]
    nodes, expr = Elements.filter(nodes, expr)
  end
  nodes = done + nodes.flatten.uniq
  if blk
    nodes.each(&blk)
    self
  else
    Elements[*nodes]
  end
end
336
+ alias_method :/, :search
337
+
338
+ # Find the first matching node for the CSS or XPath
339
+ # +expr+ string.
340
+ def at(expr)
341
+ search(expr).first
342
+ end
343
+ alias_method :%, :at
344
+
345
+ # +traverse_element+ traverses elements in the tree.
346
+ # It yields elements in depth first order.
347
+ #
348
+ # If _names_ are empty, it yields all elements.
349
+ # If non-empty _names_ are given, it should be list of universal names.
350
+ #
351
+ # A nested element is yielded in depth first order as follows.
352
+ #
353
+ # t = Hpricot('<a id=0><b><a id=1 /></b><c id=2 /></a>')
354
+ # t.traverse_element("a", "c") {|e| p e}
355
+ # # =>
356
+ # {elem <a id="0"> {elem <b> {emptyelem <a id="1">} </b>} {emptyelem <c id="2">} </a>}
357
+ # {emptyelem <a id="1">}
358
+ # {emptyelem <c id="2">}
359
+ #
360
+ # Universal names are specified as follows.
361
+ #
362
+ # t = Hpricot(<<'End')
363
+ # <html>
364
+ # <meta name="robots" content="index,nofollow">
365
+ # <meta name="author" content="Who am I?">
366
+ # </html>
367
+ # End
368
+ # t.traverse_element("{http://www.w3.org/1999/xhtml}meta") {|e| p e}
369
+ # # =>
370
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="robots" content="index,nofollow">}
371
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="author" content="Who am I?">}
372
+ #
373
# Yields elements to +block+: every element when no +names+ are given,
# otherwise only those whose name is one of +names+. Always returns nil.
def traverse_element(*names, &block) # :yields: element
  if names.empty?
    traverse_all_element(&block)
    return nil
  end
  wanted = {}
  names.each { |n| wanted[n] = true }
  traverse_some_element(wanted, &block)
  nil
end
383
+
384
# Find children of a given +tag_name+.
#
#   ele.children_of_type('p')
#     #=> [...array of paragraphs...]
#
# Only children that respond to +pathname+ are considered. Returns an
# empty array when this node has no child list, and nil when this node
# does not respond to +children+ at all.
def children_of_type(tag_name)
  if respond_to? :children
    # children may be nil; guard it the same way inner_text and html do.
    (children || []).find_all do |x|
      x.respond_to?(:pathname) && x.pathname == tag_name
    end
  end
end
396
+
397
+ end
398
+
399
+ module Container::Trav
400
+ # Return all children of this node which can contain other
401
+ # nodes. This is a good way to get all HTML elements which
402
+ # aren't text, comment, doctype or processing instruction nodes.
403
+ def containers
404
+ children.grep(Container::Trav)
405
+ end
406
+
407
# Returns the container node neighboring this node to the south: just below it.
# By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
# See Hpricot::Traverse#next_node if you need to hunt out all kinds of nodes.
#
# Returns nil when this node has no parent, when it is the last container,
# or when it is not among its parent's containers.
def next_sibling
  # Guard first: a detached node has no siblings to look through.
  return nil unless parent
  sib = parent.containers
  pos = sib.index(self)
  sib[pos + 1] if pos
end
414
+
415
# Returns the container node neighboring this node to the north: just above it.
# By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
# See Hpricot::Traverse#previous_node if you need to hunt out all kinds of nodes.
#
# Returns nil when this node has no parent, when it is the first container,
# or when it is not among its parent's containers.
def previous_sibling
  # Guard first: a detached node has no siblings to look through.
  return nil unless parent
  sib = parent.containers
  pos = sib.index(self)
  # pos == 0 must answer nil; sib[-1] would wrap around to the last container.
  sib[pos - 1] if pos and pos > 0
end
423
+
424
+ # Find all preceding sibling elements. Like the other "sibling" methods, this weeds
425
+ # out text and comment nodes.
426
+ def preceding_siblings()
427
+ sibs = parent.containers
428
+ si = sibs.index(self)
429
+ return Elements[*sibs[0...si]]
430
+ end
431
+
432
+ # Find sibling elements which follow the current one. Like the other "sibling" methods, this weeds
433
+ # out text and comment nodes.
434
+ def following_siblings()
435
+ sibs = parent.containers
436
+ si = sibs.index(self) + 1
437
+ return Elements[*sibs[si...sibs.length]]
438
+ end
439
+
440
+ # Puts together an array of neighboring sibling elements based on their proximity
441
+ # to this element.
442
+ #
443
+ # This method accepts ranges and sets of numbers.
444
+ #
445
+ # ele.siblings_at(-3..-1, 1..3) # gets three elements before and three after
446
+ # ele.siblings_at(1, 5, 7) # gets three elements at offsets below the current element
447
+ # ele.siblings_at(0, 5..6) # the current element and two others
448
+ #
449
+ # Like the other "sibling" methods, this doesn't find text and comment nodes.
450
+ # Use nodes_at to include those nodes.
451
+ def siblings_at(*pos)
452
+ sib = parent.containers
453
+ i, si = 0, sib.index(self)
454
+ Elements[*
455
+ sib.select do |x|
456
+ sel = case i - si when *pos
457
+ true
458
+ end
459
+ i += 1
460
+ sel
461
+ end
462
+ ]
463
+ end
464
+
465
+ # Replace +old+, a child of the current node, with +new+ node.
466
+ def replace_child(old, new)
467
+ reparent new
468
+ children[children.index(old), 1] = [*new]
469
+ end
470
+
471
+ # Insert +nodes+, an array of HTML elements or a single element,
472
+ # before the node +ele+, a child of the current node.
473
+ def insert_before(nodes, ele)
474
+ case nodes
475
+ when Array
476
+ nodes.each { |n| insert_before(n, ele) }
477
+ else
478
+ reparent nodes
479
+ children[children.index(ele) || 0, 0] = nodes
480
+ end
481
+ end
482
+
483
+ # Insert +nodes+, an array of HTML elements or a single element,
484
+ # after the node +ele+, a child of the current node.
485
+ def insert_after(nodes, ele)
486
+ case nodes
487
+ when Array
488
+ nodes.reverse_each { |n| insert_after(n, ele) }
489
+ else
490
+ reparent nodes
491
+ idx = children.index(ele)
492
+ children[idx ? idx + 1 : children.length, 0] = nodes
493
+ end
494
+ end
495
+
496
+ # +each_child+ iterates over each child.
497
+ def each_child(&block) # :yields: child_node
498
+ children.each(&block) if children
499
+ nil
500
+ end
501
+
502
+ # +each_child_with_index+ iterates over each child.
503
+ def each_child_with_index(&block) # :yields: child_node, index
504
+ children.each_with_index(&block) if children
505
+ nil
506
+ end
507
+
508
+ # +find_element+ searches for an element whose universal name is specified by
509
+ # the arguments.
510
+ # It returns nil if not found.
511
+ def find_element(*names)
512
+ traverse_element(*names) {|e| return e }
513
+ nil
514
+ end
515
+
516
# Lists the CSS class names found in this element's +class+ attribute,
# split on whitespace. A missing attribute yields an empty array.
def classes
  raw = get_attribute('class').to_s
  raw.strip.split(/\s+/)
end
520
+
521
# Walks every node below this one and returns the first element whose
# +id+ attribute, converted to a string, equals +id+. Returns nil when
# no element matches.
def get_element_by_id(id)
  traverse_all_element do |node|
    next unless node.elem?
    found = node.get_attribute('id')
    return node if found and found.to_s == id
  end
  nil
end
529
+
530
# Gathers, as an Elements list, every element found by traverse_element
# for the given tag names. Each tag is looked up both bare and qualified
# with the XHTML namespace; a "*" argument is discarded before the lookup.
def get_elements_by_tag_name(*a)
  tags = a.reject { |tag| tag == "*" }
  names = tags.map { |tag| [tag, "{http://www.w3.org/1999/xhtml}#{tag}"] }.flatten
  found = Elements[]
  traverse_element(*names) { |e| found << e if e.elem? }
  found
end
538
+
539
+ def each_hyperlink_attribute
540
+ traverse_element(
541
+ '{http://www.w3.org/1999/xhtml}a',
542
+ '{http://www.w3.org/1999/xhtml}area',
543
+ '{http://www.w3.org/1999/xhtml}link',
544
+ '{http://www.w3.org/1999/xhtml}img',
545
+ '{http://www.w3.org/1999/xhtml}object',
546
+ '{http://www.w3.org/1999/xhtml}q',
547
+ '{http://www.w3.org/1999/xhtml}blockquote',
548
+ '{http://www.w3.org/1999/xhtml}ins',
549
+ '{http://www.w3.org/1999/xhtml}del',
550
+ '{http://www.w3.org/1999/xhtml}form',
551
+ '{http://www.w3.org/1999/xhtml}input',
552
+ '{http://www.w3.org/1999/xhtml}head',
553
+ '{http://www.w3.org/1999/xhtml}base',
554
+ '{http://www.w3.org/1999/xhtml}script') {|elem|
555
+ case elem.name
556
+ when %r{\{http://www.w3.org/1999/xhtml\}(?:base|a|area|link)\z}i
557
+ attrs = ['href']
558
+ when %r{\{http://www.w3.org/1999/xhtml\}(?:img)\z}i
559
+ attrs = ['src', 'longdesc', 'usemap']
560
+ when %r{\{http://www.w3.org/1999/xhtml\}(?:object)\z}i
561
+ attrs = ['classid', 'codebase', 'data', 'usemap']
562
+ when %r{\{http://www.w3.org/1999/xhtml\}(?:q|blockquote|ins|del)\z}i
563
+ attrs = ['cite']
564
+ when %r{\{http://www.w3.org/1999/xhtml\}(?:form)\z}i
565
+ attrs = ['action']
566
+ when %r{\{http://www.w3.org/1999/xhtml\}(?:input)\z}i
567
+ attrs = ['src', 'usemap']
568
+ when %r{\{http://www.w3.org/1999/xhtml\}(?:head)\z}i
569
+ attrs = ['profile']
570
+ when %r{\{http://www.w3.org/1999/xhtml\}(?:script)\z}i
571
+ attrs = ['src', 'for']
572
+ end
573
+ attrs.each {|attr|
574
+ if hyperlink = elem.get_attribute(attr)
575
+ yield elem, attr, hyperlink
576
+ end
577
+ }
578
+ }
579
+ end
580
+ private :each_hyperlink_attribute
581
+
582
# +each_hyperlink_uri+ traverses hyperlinks such as the HTML href attribute
# of an A element.
#
# It yields each hyperlink value together with a URI built from it.
#
# The URI objects are resolved against a base URI, which comes from the
# HTML BASE element or from the argument ((|base_uri|)). Without any base
# URI, each hyperlink is parsed on its own.
# +each_hyperlink_uri+ doesn't yield the href of the BASE element.
def each_hyperlink_uri(base_uri=nil) # :yields: hyperlink, uri
  base_uri = URI.parse(base_uri) if base_uri.is_a?(String)
  pending = []
  each_hyperlink_attribute do |elem, _attr, hyperlink|
    if %r{\{http://www.w3.org/1999/xhtml\}(?:base)\z}i =~ elem.name
      # A BASE element replaces the base URI rather than being reported.
      base_uri = URI.parse(hyperlink.to_s)
    else
      pending << hyperlink
    end
  end
  # Links are resolved only after the whole scan, so a BASE element found
  # late still applies to links seen earlier.
  pending.each do |hyperlink|
    target = hyperlink.to_s
    yield hyperlink, (base_uri ? base_uri + target : URI.parse(target))
  end
end
606
+
607
# +each_hyperlink+ traverses hyperlinks such as the HTML href attribute
# of an A element.
#
# It yields only the hyperlink value for each attribute reported by
# +each_hyperlink_attribute+.
#
# Note that +each_hyperlink+ yields the HTML href attribute of the BASE element.
def each_hyperlink # :yields: text
  each_hyperlink_attribute { |_elem, _attr, hyperlink| yield hyperlink }
end
619
+
620
+ # +each_uri+ traverses hyperlinks such as HTML href attribute
621
+ # of A element.
622
+ #
623
+ # It yields URI for each hyperlink.
624
+ #
625
+ # The URI objects are created with a base URI which is given by
626
+ # HTML BASE element or the argument ((|base_uri|)).
627
+ def each_uri(base_uri=nil) # :yields: URI
628
+ each_hyperlink_uri(base_uri) {|hyperlink, uri| yield uri }
629
+ end
630
+ end
631
+
632
+ # :stopdoc:
633
+ module Doc::Trav
634
+ def traverse_all_element(&block)
635
+ children.each {|c| c.traverse_all_element(&block) } if children
636
+ end
637
+ def xpath
638
+ "/"
639
+ end
640
+ def css_path
641
+ nil
642
+ end
643
+ end
644
+
645
+ module Elem::Trav
646
+ def traverse_all_element(&block)
647
+ yield self
648
+ children.each {|c| c.traverse_all_element(&block) } if children
649
+ end
650
+ end
651
+
652
+ module Leaf::Trav
653
+ def traverse_all_element
654
+ yield self
655
+ end
656
+ end
657
+
658
+ module Doc::Trav
659
+ def traverse_some_element(name_set, &block)
660
+ children.each {|c| c.traverse_some_element(name_set, &block) } if children
661
+ end
662
+ end
663
+
664
+ module Elem::Trav
665
+ def traverse_some_element(name_set, &block)
666
+ yield self if name_set.include? self.name
667
+ children.each {|c| c.traverse_some_element(name_set, &block) } if children
668
+ end
669
+ end
670
+
671
+ module Leaf::Trav
672
+ def traverse_some_element(name_set)
673
+ end
674
+ end
675
+ # :startdoc:
676
+
677
+ module Traverse
678
+ # +traverse_text+ traverses texts in the tree
679
+ def traverse_text(&block) # :yields: text
680
+ traverse_text_internal(&block)
681
+ nil
682
+ end
683
+ end
684
+
685
+ # :stopdoc:
686
+ module Container::Trav
687
+ def traverse_text_internal(&block)
688
+ each_child {|c| c.traverse_text_internal(&block) }
689
+ end
690
+ end
691
+
692
+ module Leaf::Trav
693
+ def traverse_text_internal
694
+ end
695
+ end
696
+
697
+ module Text::Trav
698
+ def traverse_text_internal
699
+ yield self
700
+ end
701
+ end
702
+ # :startdoc:
703
+
704
+ module Container::Trav
705
+ # +filter+ rebuilds the tree without some components.
706
+ #
707
+ # node.filter {|descendant_node| predicate } -> node
708
+ # loc.filter {|descendant_loc| predicate } -> node
709
+ #
710
+ # +filter+ yields each node except top node.
711
+ # If given block returns false, corresponding node is dropped.
712
+ # If given block returns true, corresponding node is retained and
713
+ # inner nodes are examined.
714
+ #
715
+ # +filter+ returns a node.
716
+ # It doesn't return location object even if self is location object.
717
+ #
718
+ def filter(&block)
719
+ subst = {}
720
+ each_child_with_index {|descendant, i|
721
+ if yield descendant
722
+ if descendant.elem?
723
+ subst[i] = descendant.filter(&block)
724
+ else
725
+ subst[i] = descendant
726
+ end
727
+ else
728
+ subst[i] = nil
729
+ end
730
+ }
731
+ to_node.subst_subnode(subst)
732
+ end
733
+ end
734
+
735
+ module Doc::Trav
736
+ # +title+ searches for the title and returns it as text.
737
+ # It returns nil if not found.
738
+ #
739
+ # +title+ searches the following information.
740
+ #
741
+ # - <title>...</title> in HTML
742
+ # - <title>...</title> in RSS
743
+ def title
744
+ e = find_element('title',
745
+ '{http://www.w3.org/1999/xhtml}title',
746
+ '{http://purl.org/rss/1.0/}title',
747
+ '{http://my.netscape.com/rdf/simple/0.9/}title')
748
+ e && e.extract_text
749
+ end
750
+
751
+ # +author+ searches for the author and returns it as text.
752
+ # It returns nil if not found.
753
+ #
754
+ # +author+ searches the following information.
755
+ #
756
+ # - <meta name="author" content="author-name"> in HTML
757
+ # - <link rev="made" title="author-name"> in HTML
758
+ # - <dc:creator>author-name</dc:creator> in RSS
759
+ # - <dc:publisher>author-name</dc:publisher> in RSS
760
+ def author
761
+ traverse_element('meta',
762
+ '{http://www.w3.org/1999/xhtml}meta') {|e|
763
+ begin
764
+ next unless e.fetch_attr('name').downcase == 'author'
765
+ author = e.fetch_attribute('content').strip
766
+ return author if !author.empty?
767
+ rescue IndexError
768
+ end
769
+ }
770
+
771
+ traverse_element('link',
772
+ '{http://www.w3.org/1999/xhtml}link') {|e|
773
+ begin
774
+ next unless e.fetch_attr('rev').downcase == 'made'
775
+ author = e.fetch_attribute('title').strip
776
+ return author if !author.empty?
777
+ rescue IndexError
778
+ end
779
+ }
780
+
781
+ if channel = find_element('{http://purl.org/rss/1.0/}channel')
782
+ channel.traverse_element('{http://purl.org/dc/elements/1.1/}creator') {|e|
783
+ begin
784
+ author = e.extract_text.strip
785
+ return author if !author.empty?
786
+ rescue IndexError
787
+ end
788
+ }
789
+ channel.traverse_element('{http://purl.org/dc/elements/1.1/}publisher') {|e|
790
+ begin
791
+ author = e.extract_text.strip
792
+ return author if !author.empty?
793
+ rescue IndexError
794
+ end
795
+ }
796
+ end
797
+
798
+ nil
799
+ end
800
+
801
+ end
802
+
803
+ module Doc::Trav
804
# Returns the single top-level element among this node's children.
# Raises Hpricot::Error when there is no such element or more than one.
def root
  tops = (children || []).select { |c| c.elem? }
  raise Hpricot::Error, "no element" if tops.empty?
  raise Hpricot::Error, "multiple top elements" if tops.length > 1
  tops.first
end
811
+ end
812
+
813
+ module Elem::Trav
814
+ def has_attribute?(name)
815
+ self.raw_attributes && self.raw_attributes.has_key?(name.to_s)
816
+ end
817
# Looks +name+ up (as a string key) in raw_attributes and returns the
# stored value passed through Hpricot.uxs. Returns a falsy value when
# there are no raw attributes or the attribute is absent.
def get_attribute(name)
  raw = self.raw_attributes && self.raw_attributes[name.to_s]
  raw ? Hpricot.uxs(raw) : raw
end
822
+ alias_method :[], :get_attribute
823
+ def set_attribute(name, val)
824
+ altered!
825
+ self.raw_attributes ||= {}
826
+ self.raw_attributes[name.to_s] = val.fast_xs
827
+ end
828
+ alias_method :[]=, :set_attribute
829
# Deletes the attribute +name+ from raw_attributes, marking the element
# as altered first. Returns the removed raw value, or nil when the
# attribute was not present (in which case nothing is marked).
def remove_attribute(name)
  key = name.to_s
  return nil unless has_attribute? key
  altered!
  self.raw_attributes.delete(key)
end
836
+ end
837
+
838
+ end