hpricot 0.7-x86-mswin32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. data/CHANGELOG +68 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +260 -0
  5. data/ext/fast_xs/FastXsService.java +1018 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +200 -0
  8. data/ext/hpricot_scan/HpricotScanService.java +1305 -0
  9. data/ext/hpricot_scan/extconf.rb +6 -0
  10. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  11. data/ext/hpricot_scan/hpricot_css.c +3502 -0
  12. data/ext/hpricot_scan/hpricot_css.rl +115 -0
  13. data/ext/hpricot_scan/hpricot_scan.c +6704 -0
  14. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  15. data/ext/hpricot_scan/hpricot_scan.java.rl +373 -0
  16. data/ext/hpricot_scan/hpricot_scan.rl +722 -0
  17. data/ext/hpricot_scan/test.rb +4 -0
  18. data/extras/mingw-rbconfig.rb +176 -0
  19. data/lib/fast_xs.so +0 -0
  20. data/lib/hpricot.rb +26 -0
  21. data/lib/hpricot/blankslate.rb +63 -0
  22. data/lib/hpricot/builder.rb +216 -0
  23. data/lib/hpricot/elements.rb +510 -0
  24. data/lib/hpricot/htmlinfo.rb +691 -0
  25. data/lib/hpricot/inspect.rb +103 -0
  26. data/lib/hpricot/modules.rb +38 -0
  27. data/lib/hpricot/parse.rb +38 -0
  28. data/lib/hpricot/tag.rb +198 -0
  29. data/lib/hpricot/tags.rb +164 -0
  30. data/lib/hpricot/traverse.rb +838 -0
  31. data/lib/hpricot/xchar.rb +94 -0
  32. data/lib/hpricot_scan.so +0 -0
  33. data/test/files/basic.xhtml +17 -0
  34. data/test/files/boingboing.html +2266 -0
  35. data/test/files/cy0.html +3653 -0
  36. data/test/files/immob.html +400 -0
  37. data/test/files/pace_application.html +1320 -0
  38. data/test/files/tenderlove.html +16 -0
  39. data/test/files/uswebgen.html +220 -0
  40. data/test/files/utf8.html +1054 -0
  41. data/test/files/week9.html +1723 -0
  42. data/test/files/why.xml +19 -0
  43. data/test/load_files.rb +7 -0
  44. data/test/nokogiri-bench.rb +64 -0
  45. data/test/test_alter.rb +77 -0
  46. data/test/test_builder.rb +37 -0
  47. data/test/test_parser.rb +409 -0
  48. data/test/test_paths.rb +25 -0
  49. data/test/test_preserved.rb +70 -0
  50. data/test/test_xml.rb +28 -0
  51. metadata +111 -0
@@ -0,0 +1,838 @@
1
+ require 'hpricot/elements'
2
+ require 'uri'
3
+
4
+ module Hpricot
5
+ module Traverse
6
+ # Is this object the enclosing HTML or XML document?
7
def doc?
  Doc::Trav === self
end

# Is this object an HTML or XML element?
def elem?
  Elem::Trav === self
end

# Is this object an HTML text node?
def text?
  Text::Trav === self
end

# Is this object an XML declaration?
def xmldecl?
  XMLDecl::Trav === self
end

# Is this object a doctype tag?
def doctype?
  DocType::Trav === self
end

# Is this object an XML processing instruction?
def procins?
  ProcIns::Trav === self
end

# Is this object a comment?
def comment?
  Comment::Trav === self
end

# Is this object a stranded end tag?
def bogusetag?
  BogusETag::Trav === self
end
22
+
23
+ # Parses an HTML string, making an HTML fragment based on
24
+ # the options used to create the container document.
25
def make(input = nil, &blk)
  # Hand the request to the parent whenever it can build fragments itself;
  # only a node without such a parent falls back to Hpricot.make.
  return parent.make(input, &blk) if parent && parent.respond_to?(:make)

  Hpricot.make(input, &blk).children
end
32
+
33
+ # Builds an HTML string from this node and its contents.
34
+ # If you need to write to a stream, try calling <tt>output(io)</tt>
35
+ # as a method on this object.
36
def to_html
  # A fresh String is used as the sink that +output+ writes into.
  buffer = ""
  output(buffer)
end
39
+ alias_method :to_s, :to_html
40
+
41
+ # Attempts to preserve the original HTML of the document, only
42
+ # outputting new tags for elements which have changed.
43
def to_original_html
  # Same as to_html, but asks +output+ to run in :preserve mode.
  output(
    "",
    :preserve => true
  )
end
46
+
47
# Returns the offset of the first child matching +name+: a tag name,
# "text()" for the first text node, or "*" (always 0). Returns -1 when
# nothing matches or there are no children.
def index(name)
  return 0 if name == "*"

  (children || []).each_with_index do |child, offset|
    named = child.respond_to?(:name) && name == child.name
    return offset if named || (child.text? && name == "text()")
  end
  -1
end
57
+
58
+ # Puts together an array of neighboring nodes based on their proximity
59
+ # to this node. So, for example, to get the next node, you could use
60
+ # <tt>nodes_at(1)</tt>. Or, to get the previous node, use <tt>nodes_at(-1)</tt>.
61
+ #
62
+ # This method also accepts ranges and sets of numbers.
63
+ #
64
+ # ele.nodes_at(-3..-1, 1..3) # gets three nodes before and three after
65
+ # ele.nodes_at(1, 5, 7) # gets three nodes at offsets below the current node
66
+ # ele.nodes_at(0, 5..6) # the current node and two others
67
# Collects the siblings of this node (including the node itself at
# offset 0) whose offset relative to this node matches any entry in +pos+.
#
# pos:: Integers and/or Ranges of offsets. A Range whose endpoints are
#       Strings is translated via <tt>parent.index</tt> into an offset range.
#
# Returns an Elements array in document order.
def nodes_at(*pos)
  sib = parent.children
  si = sib.index(self)
  # Translate tag-name ranges into numeric offsets relative to this node.
  offsets = pos.map do |r|
    if r.is_a?(Range) && r.begin.is_a?(String)
      Range.new(parent.index(r.begin) - si, parent.index(r.end) - si, r.exclude_end?)
    else
      r
    end
  end
  # (The original dumped the offsets with a stray debugging `p` call here.)
  picked = []
  sib.each_with_index do |x, i|
    # `===` keeps the case/when semantics: Integers compare, Ranges cover.
    picked << x if offsets.any? { |wanted| wanted === (i - si) }
  end
  Elements[*picked]
end
88
+
89
+ # Returns the node neighboring this node to the south: just below it.
90
+ # This method includes text nodes and comments and such.
91
# Returns the sibling node immediately after this one, or nil when this
# node has no parent, the parent has no children, or this is the last child.
def next
  # Check the parent *before* touching it; the original only tested it
  # after calling parent.children, so orphans raised NoMethodError.
  return nil unless parent

  sib = parent.children
  pos = sib && sib.index(self)
  sib[pos + 1] if pos
end
95
+ alias_method :next_node, :next
96
+
97
+ # Returns the node neighboring this node to the north: just above it.
98
+ # This method includes text nodes and comments and such.
99
# Returns the sibling node immediately before this one, or nil when this
# node has no parent, the parent has no children, or this is the first child.
def previous
  # Guard first: the original dereferenced parent and the sibling list
  # before checking them, so the nil check could never take effect.
  return nil unless parent

  sib = parent.children
  pos = sib && sib.index(self)
  sib[pos - 1] if pos && pos > 0
end
104
+ alias_method :previous_node, :previous
105
+
106
+ # Find all preceding nodes.
107
def preceding
  siblings = parent.children
  own_slot = siblings.index(self)
  # Everything strictly before this node's slot.
  Elements[*siblings[0...own_slot]]
end
112
+
113
+ # Find all nodes which follow the current one.
114
def following
  siblings = parent.children
  first_after = siblings.index(self) + 1
  # Everything from the slot after this node through the end.
  Elements[*siblings[first_after..-1]]
end
119
+
120
+ # Adds elements immediately after this element, contained in the +html+ string.
121
def after(html = nil, &blk)
  owner = parent
  fragment = make(html, &blk)
  owner.insert_after(fragment, self)
end
124
+
125
+ # Adds elements immediately before this element, contained in the +html+ string.
126
def before(html = nil, &blk)
  owner = parent
  fragment = make(html, &blk)
  owner.insert_before(fragment, self)
end
129
+
130
+
131
+ # Replace this element and its contents with the nodes contained
132
+ # in the +html+ string.
133
def swap(html = nil, &blk)
  owner = parent
  # Flag the parent as altered before its child list is rewritten.
  owner.altered!
  owner.replace_child(self, make(html, &blk))
end
137
+
138
# Follows +indexes+ one level at a time, starting from this node, calling
# get_subnode_internal at each step. With no indexes, returns self.
def get_subnode(*indexes)
  indexes.inject(self) { |node, index| node.get_subnode_internal(index) }
end
145
+
146
+ # Builds a string from the text contained in this node. All
147
+ # HTML elements are removed.
148
def to_plain_text
  return "" unless respond_to?(:children) && children

  pieces = children.map { |x| x.to_plain_text }
  # Trim the joined text and collapse runs of blank lines to a single one.
  pieces.join.strip.gsub(/\n{2,}/, "\n\n")
end
155
+
156
+ # Builds a string from the text contained in this node. All
157
+ # HTML elements are removed.
158
def inner_text
  return "" unless respond_to?(:children) && children

  # Unlike to_plain_text, the pieces are joined verbatim (no trimming).
  children.map { |x| x.inner_text }.join
end
165
+ alias_method :innerText, :inner_text
166
+
167
+ # Builds an HTML string from the contents of this node.
168
# Reader and writer in one: with no argument and no block, returns the
# serialized children; otherwise replaces the children with +inner+
# (an Array of nodes, or markup handed to +make+) and reparents them.
def html(inner = nil, &blk)
  unless inner || blk
    return "" unless respond_to?(:children) && children
    return children.map { |x| x.output("") }.join
  end

  altered!
  self.children = (Array === inner) ? inner : make(inner, &blk)
  reparent self.children
end
186
+ alias_method :inner_html, :html
187
+ alias_method :innerHTML, :inner_html
188
+
189
+ # Inserts new contents into the current node, based on
190
+ # the HTML contained in string +inner+.
191
def inner_html=(inner)
  # nil means "clear the contents": substitute an empty node list.
  replacement = inner || []
  html(replacement)
end
194
+ alias_method :innerHTML=, :inner_html=
195
+
196
# Marks this node altered and points every node in +nodes+ (a single node
# or a list) at this node as its parent. Returns the list of nodes.
def reparent(nodes)
  altered!
  adopted = [*nodes]
  adopted.each { |e| e.parent = self }
end
200
+ private :reparent
201
+
202
# Removes whitespace at the start and end of each line of +path+.
# (^ and $ are line anchors in Ruby, so this is not the same as strip.)
def clean_path(path)
  edge_whitespace = /^\s+|\s+$/
  path.gsub(edge_whitespace, '')
end
205
+
206
+ # Builds a unique XPath string for this node, from the
207
+ # root of the document containing it.
208
# Returns an XPath for this node. Elements carrying an id attribute get the
# short form //name[@id='...']; everything else is the parent's XPath plus
# this node's pathname, with a 1-based [n] suffix when two or more siblings
# share that pathname.
def xpath
  if elem? && has_attribute?('id')
    "//#{self.name}[@id='#{get_attribute('id')}']"
  else
    # sim counts siblings sharing our pathname; id is how many of them
    # come before this node.
    sim, id = 0, 0
    siblings = parent.children
    if siblings
      siblings.each do |e|
        id = sim if e == self
        sim += 1 if e.pathname == self.pathname
      end
    end
    # File.join is used purely as a "/"-joiner here.
    path = File.join(parent.xpath, self.pathname)
    path += "[#{id+1}]" if sim >= 2
    path
  end
end
222
+
223
+ # Builds a unique CSS string for this node, from the
224
+ # root of the document containing it.
225
# Returns a CSS selector for this node. Elements carrying an id attribute
# get "#id"; everything else is "<parent path> > <pathname>" (or just the
# pathname when the parent's css_path is nil), with a zero-based :nth(n)
# suffix when two or more siblings share that pathname.
def css_path
  if elem? && has_attribute?('id')
    "##{get_attribute('id')}"
  else
    # sim counts siblings sharing our pathname; id is how many of them
    # come before this node.
    sim, id = 0, 0
    siblings = parent.children
    if siblings
      siblings.each do |e|
        id = sim if e == self
        sim += 1 if e.pathname == self.pathname
      end
    end
    prefix = parent.css_path
    path = prefix ? "#{prefix} > #{self.pathname}" : self.pathname
    path += ":nth(#{id})" if sim >= 2
    path
  end
end
240
+
241
# Offset of this node among all of its parent's children.
def node_position
  siblings = parent.children
  siblings.index(self)
end
244
+
245
# Offset of this node among the parent's children that share its pathname.
def position
  same_kind = parent.children_of_type(self.pathname)
  same_kind.index(self)
end
248
+
249
+ # Searches this node for all elements matching
250
+ # the CSS or XPath +expr+. Returns an Elements array
251
+ # containing the matching nodes. If +blk+ is given, it
252
+ # is used to iterate through the matching set.
253
# Evaluates +expr+ against this node by repeatedly consuming one token
# from the front of the expression and updating the working node set.
#
# expr:: a selector string (anything responding to to_s), or a Range of two
#        selectors, in which case Elements.expand is applied to the first
#        match of each endpoint.
# blk::  when given, every matching node is yielded and +self+ is returned.
#
# Returns an Elements array when no block is given.
def search(expr, &blk)
  if Range === expr
    return Elements.expand(at(expr.begin), at(expr.end), expr.exclude_end?)
  end
  last = nil      # remainder after the most recent combinator, nil after a name step
  nodes = [self]  # current working set
  done = []       # results of alternatives already closed by "," or "|"
  expr = expr.to_s
  hist = []       # expressions seen so far; used to detect lack of progress
  until expr.empty?
    expr = clean_path(expr)
    expr.gsub!(%r!^//!, '')

    case expr
    when %r!^/?\.\.!
      # ".." step: move every node to its parent.
      last = expr = $'
      nodes.map! { |node| node.parent }
    when %r!^[>/]\s*!
      # ">" or "/" step: descend to direct children.
      last = expr = $'
      nodes = Elements[*nodes.map { |node| node.children if node.respond_to? :children }.flatten.compact]
    when %r!^\+!
      # "+" step: the immediately following sibling of each node.
      last = expr = $'
      nodes.map! do |node|
        siblings = node.parent.children
        siblings[siblings.index(node)+1]
      end
      nodes.compact!
    when %r!^~!
      # "~" step: all following siblings of each node.
      last = expr = $'
      nodes.map! do |node|
        siblings = node.parent.children
        siblings[(siblings.index(node)+1)..-1]
      end
      nodes.flatten!
    when %r!^[|,]!
      # Alternative separator: bank the current matches, restart from self.
      last = expr = " #$'"
      nodes.shift if nodes.first == self
      done += nodes
      nodes = [self]
    else
      # Name step: an optional "#" or "." sigil followed by a name.
      m = expr.match(%r!^([#.]?)([a-z0-9\\*_-]*)!i).to_a
      after = $'
      mt = after[%r!:[a-z0-9\\*_-]+!i, 0]
      # A ":suffix" with no matching "filter[:suffix]" method on Traverse
      # (and that is not ":not") is folded into the name itself.
      if mt and not (mt == ":not" or Traverse.method_defined? "filter[#{mt}]")
        after = $'
        m[2] += mt
        expr = after
      end
      if m[1] == '#'
        oid = get_element_by_id(m[2])
        nodes = oid ? [oid] : []
        expr = after
      else
        m[2] = "*" if after =~ /^\(\)/ || m[2] == "" || m[1] == "."
        ret = []
        nodes.each do |node|
          case m[2]
          when '*'
            node.traverse_element { |n| ret << n }
          else
            if node.respond_to? :get_elements_by_tag_name
              # Unless a combinator came just before, exclude the node itself.
              ret += [*node.get_elements_by_tag_name(m[2])] - [*(node unless last)]
            end
          end
        end
        nodes = ret
      end
      last = nil
    end

    hist << expr
    # Stop if the expression did not change since the previous pass.
    break if hist[-1] == hist[-2]
    nodes, expr = Elements.filter(nodes, expr)
  end
  nodes = done + nodes.flatten.uniq
  if blk
    nodes.each(&blk)
    self
  else
    Elements[*nodes]
  end
end
336
+ alias_method :/, :search
337
+
338
+ # Find the first matching node for the CSS or XPath
339
+ # +expr+ string.
340
def at(expr)
  matches = search(expr)
  matches.first
end
343
+ alias_method :%, :at
344
+
345
+ # +traverse_element+ traverses elements in the tree.
346
+ # It yields elements in depth first order.
347
+ #
348
+ # If _names_ are empty, it yields all elements.
349
+ # If non-empty _names_ are given, it should be list of universal names.
350
+ #
351
+ # A nested element is yielded in depth first order as follows.
352
+ #
353
+ # t = Hpricot('<a id=0><b><a id=1 /></b><c id=2 /></a>')
354
+ # t.traverse_element("a", "c") {|e| p e}
355
+ # # =>
356
+ # {elem <a id="0"> {elem <b> {emptyelem <a id="1">} </b>} {emptyelem <c id="2">} </a>}
357
+ # {emptyelem <a id="1">}
358
+ # {emptyelem <c id="2">}
359
+ #
360
+ # Universal names are specified as follows.
361
+ #
362
+ # t = Hpricot(<<'End')
363
+ # <html>
364
+ # <meta name="robots" content="index,nofollow">
365
+ # <meta name="author" content="Who am I?">
366
+ # </html>
367
+ # End
368
+ # t.traverse_element("{http://www.w3.org/1999/xhtml}meta") {|e| p e}
369
+ # # =>
370
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="robots" content="index,nofollow">}
371
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="author" content="Who am I?">}
372
+ #
373
def traverse_element(*names, &block) # :yields: element
  if names.empty?
    traverse_all_element(&block)
    return nil
  end

  # Hash keyed by name, used by traverse_some_element as a lookup set.
  wanted = names.inject({}) { |set, n| set[n] = true; set }
  traverse_some_element(wanted, &block)
  nil
end
383
+
384
+ # Find children of a given +tag_name+.
385
+ #
386
+ # ele.children_of_type('p')
387
+ # #=> [...array of paragraphs...]
388
+ #
389
def children_of_type(tag_name)
  # Nodes without a children accessor yield nil rather than an empty list.
  return unless respond_to? :children

  children.select { |x| x.respond_to?(:pathname) && x.pathname == tag_name }
end
396
+
397
+ end
398
+
399
+ module Container::Trav
400
+ # Return all children of this node which can contain other
401
+ # nodes. This is a good way to get all HTML elements which
402
+ # aren't text, comment, doctype or processing instruction nodes.
403
def containers
  children.select { |child| Container::Trav === child }
end
406
+
407
+ # Returns the container node neighboring this node to the south: just below it.
408
+ # By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
409
+ # See Hpricot::Traverse#next_node if you need to hunt out all kinds of nodes.
410
# Returns the container sibling immediately after this node, or nil when
# this node has no parent, is not among the parent's containers, or is the
# last container.
def next_sibling
  # Check the parent before dereferencing it; the original's trailing
  # `if parent` ran only after parent.containers had already been called.
  return nil unless parent

  sib = parent.containers
  pos = sib && sib.index(self)
  sib[pos + 1] if pos
end
414
+
415
+ # Returns the container node neighboring this node to the north: just above it.
416
+ # By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
417
+ # See Hpricot::Traverse#previous_node if you need to hunt out all kinds of nodes.
418
# Returns the container sibling immediately before this node, or nil when
# this node has no parent, is not among the parent's containers, or is the
# first container.
def previous_sibling
  # Guard first: the original dereferenced parent and the sibling list
  # before its nil check could run.
  return nil unless parent

  sib = parent.containers
  pos = sib && sib.index(self)
  sib[pos - 1] if pos && pos > 0
end
423
+
424
+ # Find all preceding sibling elements. Like the other "sibling" methods, this weeds
425
+ # out text and comment nodes.
426
def preceding_siblings
  peers = parent.containers
  own_slot = peers.index(self)
  Elements[*peers[0...own_slot]]
end
431
+
432
+ # Find sibling elements which follow the current one. Like the other "sibling" methods, this weeds
433
+ # out text and comment nodes.
434
def following_siblings
  peers = parent.containers
  first_after = peers.index(self) + 1
  Elements[*peers[first_after..-1]]
end
439
+
440
+ # Puts together an array of neighboring sibling elements based on their proximity
441
+ # to this element.
442
+ #
443
+ # This method accepts ranges and sets of numbers.
444
+ #
445
+ # ele.siblings_at(-3..-1, 1..3) # gets three elements before and three after
446
+ # ele.siblings_at(1, 5, 7) # gets three elements at offsets below the current element
447
+ # ele.siblings_at(0, 5..6) # the current element and two others
448
+ #
449
+ # Like the other "sibling" methods, this doesn't find text and comment nodes.
450
+ # Use nodes_at to include those nodes.
451
def siblings_at(*pos)
  peers = parent.containers
  own_slot = peers.index(self)
  chosen = []
  peers.each_with_index do |x, i|
    # `===` mirrors case/when: Integers compare, Ranges test coverage.
    chosen << x if pos.any? { |wanted| wanted === (i - own_slot) }
  end
  Elements[*chosen]
end
464
+
465
+ # Replace +old+, a child of the current node, with +new+ node.
466
def replace_child(old, new)
  reparent new
  slot = children.index(old)
  # Splice the replacement (one node or several) into old's single slot.
  children[slot, 1] = [*new]
end
470
+
471
+ # Insert +nodes+, an array of HTML elements or a single element,
472
+ # before the node +ele+, a child of the current node.
473
def insert_before(nodes, ele)
  if Array === nodes
    # Each node goes directly before +ele+, so list order is preserved.
    nodes.each { |n| insert_before(n, ele) }
  else
    reparent nodes
    # When +ele+ is not a child, the node is inserted at the front.
    slot = children.index(ele) || 0
    children[slot, 0] = nodes
  end
end
482
+
483
+ # Insert +nodes+, an array of HTML elements or a single element,
484
+ # after the node +ele+, a child of the current node.
485
def insert_after(nodes, ele)
  if Array === nodes
    # Inserting in reverse keeps the list's order once all are placed.
    nodes.reverse_each { |n| insert_after(n, ele) }
  else
    reparent nodes
    idx = children.index(ele)
    # When +ele+ is not a child, the node is appended at the end.
    slot = idx ? idx + 1 : children.length
    children[slot, 0] = nodes
  end
end
495
+
496
+ # +each_child+ iterates over each child.
497
def each_child(&block) # :yields: child_node
  kids = children
  kids.each(&block) if kids
  nil
end
501
+
502
+ # +each_child_with_index+ iterates over each child.
503
def each_child_with_index(&block) # :yields: child_node, index
  kids = children
  kids.each_with_index(&block) if kids
  nil
end
507
+
508
+ # +find_element+ searches for an element whose universal name is specified by
509
+ # the arguments.
510
+ # It returns nil if not found.
511
def find_element(*names)
  traverse_element(*names) do |found|
    # Leave the method at the first element the traversal yields.
    return found
  end
  nil
end
515
+
516
+ # Returns a list of CSS classes to which this element belongs.
517
def classes
  # to_s turns a missing class attribute (nil) into "" => empty list.
  raw = get_attribute('class').to_s
  raw.strip.split(/\s+/)
end
520
+
521
# Returns the first traversed element whose id attribute, as a string,
# equals +id+, or nil when there is none.
def get_element_by_id(id)
  traverse_all_element do |ele|
    next unless ele.elem?
    eid = ele.get_attribute('id')
    return ele if eid && eid.to_s == id
  end
  nil
end
529
+
530
def get_elements_by_tag_name(*a)
  # "*" is dropped; if nothing else was requested the name list is empty
  # and traverse_element is called with no names at all.
  a.delete("*")
  # Each tag is looked up both bare and under the XHTML namespace.
  qualified = a.map { |tag| [tag, "{http://www.w3.org/1999/xhtml}#{tag}"] }.flatten
  list = Elements[]
  traverse_element(*qualified) { |e| list << e if e.elem? }
  list
end
538
+
539
# Visits every XHTML-namespaced element that can carry a link and yields
# (element, attribute name, attribute value) for each link-bearing
# attribute that is present on it.
def each_hyperlink_attribute
  link_elements = [
    '{http://www.w3.org/1999/xhtml}a',
    '{http://www.w3.org/1999/xhtml}area',
    '{http://www.w3.org/1999/xhtml}link',
    '{http://www.w3.org/1999/xhtml}img',
    '{http://www.w3.org/1999/xhtml}object',
    '{http://www.w3.org/1999/xhtml}q',
    '{http://www.w3.org/1999/xhtml}blockquote',
    '{http://www.w3.org/1999/xhtml}ins',
    '{http://www.w3.org/1999/xhtml}del',
    '{http://www.w3.org/1999/xhtml}form',
    '{http://www.w3.org/1999/xhtml}input',
    '{http://www.w3.org/1999/xhtml}head',
    '{http://www.w3.org/1999/xhtml}base',
    '{http://www.w3.org/1999/xhtml}script'
  ]
  traverse_element(*link_elements) do |elem|
    # Which attributes hold links depends on the element type.
    attrs =
      case elem.name
      when %r{\{http://www.w3.org/1999/xhtml\}(?:base|a|area|link)\z}i
        ['href']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:img)\z}i
        ['src', 'longdesc', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:object)\z}i
        ['classid', 'codebase', 'data', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:q|blockquote|ins|del)\z}i
        ['cite']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:form)\z}i
        ['action']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:input)\z}i
        ['src', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:head)\z}i
        ['profile']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:script)\z}i
        ['src', 'for']
      end
    attrs.each do |attr|
      hyperlink = elem.get_attribute(attr)
      yield elem, attr, hyperlink if hyperlink
    end
  end
end
580
+ private :each_hyperlink_attribute
581
+
582
+ # +each_hyperlink_uri+ traverses hyperlinks such as HTML href attribute
583
+ # of A element.
584
+ #
585
+ # It yields Hpricot::Text and URI for each hyperlink.
586
+ #
587
+ # The URI objects are created with a base URI which is given by
588
+ # HTML BASE element or the argument ((|base_uri|)).
589
+ # +each_hyperlink_uri+ doesn't yield the href of the BASE element.
590
def each_hyperlink_uri(base_uri=nil) # :yields: hyperlink, uri
  base_uri = URI.parse(base_uri) if String === base_uri
  links = []
  # First pass: collect the links; a BASE element overrides the base URI
  # instead of being collected.
  each_hyperlink_attribute do |elem, _attr, hyperlink|
    if %r{\{http://www.w3.org/1999/xhtml\}(?:base)\z}i =~ elem.name
      base_uri = URI.parse(hyperlink.to_s)
    else
      links << hyperlink
    end
  end
  # Second pass: resolve every link against the final base, if there is one.
  links.each do |hyperlink|
    target = hyperlink.to_s
    yield hyperlink, (base_uri ? base_uri + target : URI.parse(target))
  end
end
606
+
607
+ # +each_hyperlink+ traverses hyperlinks such as HTML href attribute
608
+ # of A element.
609
+ #
610
+ # It yields Hpricot::Text.
611
+ #
612
+ # Note that +each_hyperlink+ yields HTML href attribute of BASE element.
613
# Yields the value of every link-bearing attribute found by
# each_hyperlink_attribute, including the href of a BASE element.
def each_hyperlink # :yields: text
  # (The original also allocated a +links+ array that was never used.)
  each_hyperlink_attribute do |_elem, _attr, hyperlink|
    yield hyperlink
  end
end
619
+
620
+ # +each_uri+ traverses hyperlinks such as HTML href attribute
621
+ # of A element.
622
+ #
623
+ # It yields URI for each hyperlink.
624
+ #
625
+ # The URI objects are created with a base URI which is given by
626
+ # HTML BASE element or the argument ((|base_uri|)).
627
def each_uri(base_uri=nil) # :yields: URI
  each_hyperlink_uri(base_uri) do |_hyperlink, uri|
    yield uri
  end
end
630
+ end
631
+
632
+ # :stopdoc:
633
module Doc::Trav
  # The document itself is not yielded; only its children are traversed.
  def traverse_all_element(&block)
    kids = children
    kids.each { |c| c.traverse_all_element(&block) } if kids
  end

  # The document is the root that every node's XPath is built from.
  def xpath
    "/"
  end

  # nil tells Traverse#css_path that there is no prefix to prepend.
  def css_path
    nil
  end
end
644
+
645
module Elem::Trav
  # Yields this element first, then recurses into its children.
  def traverse_all_element(&block)
    yield self
    kids = children
    kids.each { |c| c.traverse_all_element(&block) } if kids
  end
end
651
+
652
module Leaf::Trav
  # A leaf has nothing to recurse into: it only yields itself.
  def traverse_all_element
    yield self
  end
end
657
+
658
module Doc::Trav
  # The document is never yielded; the name filter is applied to its children.
  def traverse_some_element(name_set, &block)
    kids = children
    kids.each { |c| c.traverse_some_element(name_set, &block) } if kids
  end
end
663
+
664
module Elem::Trav
  # Yields this element when its name is in +name_set+, then recurses
  # into the children regardless of whether it matched.
  def traverse_some_element(name_set, &block)
    yield self if name_set.include? self.name
    kids = children
    kids.each { |c| c.traverse_some_element(name_set, &block) } if kids
  end
end
670
+
671
module Leaf::Trav
  # Deliberately empty: a name-filtered traversal never yields a leaf.
  def traverse_some_element(name_set)
  end
end
675
+ # :startdoc:
676
+
677
module Traverse
  # +traverse_text+ traverses texts in the tree.
  # The work is delegated to traverse_text_internal; nil is always returned.
  def traverse_text(&block) # :yields: text
    traverse_text_internal(&block)
    nil
  end
end
684
+
685
+ # :stopdoc:
686
module Container::Trav
  # Containers hold no text of their own here; they forward to each child.
  def traverse_text_internal(&block)
    each_child do |c|
      c.traverse_text_internal(&block)
    end
  end
end
691
+
692
module Leaf::Trav
  # Deliberately empty: this default yields nothing during a text traversal.
  def traverse_text_internal
  end
end
696
+
697
module Text::Trav
  # Text nodes are what a text traversal yields.
  def traverse_text_internal
    yield self
  end
end
702
+ # :startdoc:
703
+
704
+ module Container::Trav
705
+ # +filter+ rebuilds the tree without some components.
706
+ #
707
+ # node.filter {|descendant_node| predicate } -> node
708
+ # loc.filter {|descendant_loc| predicate } -> node
709
+ #
710
+ # +filter+ yields each node except top node.
711
+ # If given block returns false, corresponding node is dropped.
712
+ # If given block returns true, corresponding node is retained and
713
+ # inner nodes are examined.
714
+ #
715
+ # +filter+ returns a node.
716
+ # It doesn't return location object even if self is location object.
717
+ #
718
def filter(&block)
  # Maps child index => replacement: nil for a rejected child, the
  # recursively filtered result for a kept element, the child itself otherwise.
  replacements = {}
  each_child_with_index do |descendant, i|
    replacements[i] =
      if !yield(descendant)
        nil
      elsif descendant.elem?
        descendant.filter(&block)
      else
        descendant
      end
  end
  # NOTE(review): to_node and subst_subnode are not defined in this file —
  # confirm they exist on the including classes.
  to_node.subst_subnode(replacements)
end
733
+ end
734
+
735
+ module Doc::Trav
736
+ # +title+ searches title and return it as a text.
737
+ # It returns nil if not found.
738
+ #
739
+ # +title+ searches the following information.
740
+ #
741
+ # - <title>...</title> in HTML
742
+ # - <title>...</title> in RSS
743
def title
  candidates = [
    'title',
    '{http://www.w3.org/1999/xhtml}title',
    '{http://purl.org/rss/1.0/}title',
    '{http://my.netscape.com/rdf/simple/0.9/}title'
  ]
  element = find_element(*candidates)
  return element unless element

  element.extract_text
end
750
+
751
+ # +author+ searches author and return it as a text.
752
+ # It returns nil if not found.
753
+ #
754
+ # +author+ searches the following information.
755
+ #
756
+ # - <meta name="author" content="author-name"> in HTML
757
+ # - <link rev="made" title="author-name"> in HTML
758
+ # - <dc:creator>author-name</dc:creator> in RSS
759
+ # - <dc:publisher>author-name</dc:publisher> in RSS
760
# Tries four sources in order and returns the first non-empty, stripped
# value: meta[name=author] content, link[rev=made] title, then the
# dc:creator and dc:publisher of an RSS 1.0 channel. Returns nil otherwise.
#
# NOTE(review): this uses both fetch_attr and fetch_attribute, neither of
# which is defined in this file — confirm both exist on elements and that
# they raise IndexError for a missing attribute (that is what is rescued).
def author
  traverse_element('meta', '{http://www.w3.org/1999/xhtml}meta') do |e|
    begin
      next unless e.fetch_attr('name').downcase == 'author'
      author = e.fetch_attribute('content').strip
      return author unless author.empty?
    rescue IndexError
    end
  end

  traverse_element('link', '{http://www.w3.org/1999/xhtml}link') do |e|
    begin
      next unless e.fetch_attr('rev').downcase == 'made'
      author = e.fetch_attribute('title').strip
      return author unless author.empty?
    rescue IndexError
    end
  end

  channel = find_element('{http://purl.org/rss/1.0/}channel')
  if channel
    channel.traverse_element('{http://purl.org/dc/elements/1.1/}creator') do |e|
      begin
        author = e.extract_text.strip
        return author unless author.empty?
      rescue IndexError
      end
    end
    channel.traverse_element('{http://purl.org/dc/elements/1.1/}publisher') do |e|
      begin
        author = e.extract_text.strip
        return author unless author.empty?
      rescue IndexError
      end
    end
  end

  nil
end
800
+
801
+ end
802
+
803
+ module Doc::Trav
804
# Returns the document's single top-level element. Raises Hpricot::Error
# when there is no top-level element or more than one.
def root
  top_elements = (children || []).select { |c| c.elem? }
  raise Hpricot::Error, "no element" if top_elements.empty?
  raise Hpricot::Error, "multiple top elements" if top_elements.length > 1
  top_elements.first
end
811
+ end
812
+
813
+ module Elem::Trav
814
def has_attribute?(name)
  attrs = self.raw_attributes
  # Falsy (nil) when the element has no attribute hash at all.
  attrs && attrs.has_key?(name.to_s)
end
817
def get_attribute(name)
  attrs = self.raw_attributes
  raw = attrs && attrs[name.to_s]
  # Stored values are passed through Hpricot.uxs before being returned.
  raw ? Hpricot.uxs(raw) : raw
end
822
+ alias_method :[], :get_attribute
823
def set_attribute(name, val)
  altered!
  self.raw_attributes ||= {}
  key = name.to_s
  # The value is stored in its fast_xs-processed form.
  self.raw_attributes[key] = val.fast_xs
end
828
+ alias_method :[]=, :set_attribute
829
def remove_attribute(name)
  key = name.to_s
  # Nothing to do (and nothing is marked altered) when the attribute is absent.
  return unless has_attribute? key

  altered!
  self.raw_attributes.delete(key)
end
836
+ end
837
+
838
+ end