hpricot 0.6-jruby

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/CHANGELOG +62 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +211 -0
  5. data/ext/hpricot_scan/HpricotScanService.java +1340 -0
  6. data/ext/hpricot_scan/extconf.rb +6 -0
  7. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  8. data/ext/hpricot_scan/hpricot_scan.c +5976 -0
  9. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  10. data/ext/hpricot_scan/hpricot_scan.java.rl +363 -0
  11. data/ext/hpricot_scan/hpricot_scan.rl +273 -0
  12. data/extras/mingw-rbconfig.rb +176 -0
  13. data/lib/hpricot.rb +26 -0
  14. data/lib/hpricot/blankslate.rb +63 -0
  15. data/lib/hpricot/builder.rb +200 -0
  16. data/lib/hpricot/elements.rb +510 -0
  17. data/lib/hpricot/htmlinfo.rb +672 -0
  18. data/lib/hpricot/inspect.rb +107 -0
  19. data/lib/hpricot/modules.rb +37 -0
  20. data/lib/hpricot/parse.rb +297 -0
  21. data/lib/hpricot/tag.rb +228 -0
  22. data/lib/hpricot/tags.rb +164 -0
  23. data/lib/hpricot/traverse.rb +821 -0
  24. data/lib/hpricot/xchar.rb +94 -0
  25. data/lib/i686-linux/hpricot_scan.jar +0 -0
  26. data/test/files/basic.xhtml +17 -0
  27. data/test/files/boingboing.html +2266 -0
  28. data/test/files/cy0.html +3653 -0
  29. data/test/files/immob.html +400 -0
  30. data/test/files/pace_application.html +1320 -0
  31. data/test/files/tenderlove.html +16 -0
  32. data/test/files/uswebgen.html +220 -0
  33. data/test/files/utf8.html +1054 -0
  34. data/test/files/week9.html +1723 -0
  35. data/test/files/why.xml +19 -0
  36. data/test/load_files.rb +7 -0
  37. data/test/test_alter.rb +65 -0
  38. data/test/test_builder.rb +24 -0
  39. data/test/test_parser.rb +379 -0
  40. data/test/test_paths.rb +16 -0
  41. data/test/test_preserved.rb +66 -0
  42. data/test/test_xml.rb +28 -0
  43. metadata +98 -0
@@ -0,0 +1,164 @@
1
module Hpricot

  # Tags that make up HTML forms.
  FORM_TAGS = [ :form, :input, :select, :textarea ]
  # Tags that never carry content and are written in self-closing form.
  SELF_CLOSING_TAGS = [ :base, :meta, :link, :hr, :br, :param, :img, :area, :input, :col ]

  # Common sets of attributes.
  AttrCore = [:id, :class, :style, :title]
  AttrI18n = [:lang, 'xml:lang'.intern, :dir]
  AttrEvents = [:onclick, :ondblclick, :onmousedown, :onmouseup, :onmouseover, :onmousemove,
                :onmouseout, :onkeypress, :onkeydown, :onkeyup]
  AttrFocus = [:accesskey, :tabindex, :onfocus, :onblur]
  AttrHAlign = [:align, :char, :charoff]
  AttrVAlign = [:valign]
  Attrs = AttrCore + AttrI18n + AttrEvents

  # All the tags and attributes from XHTML 1.0 Strict.
  #
  # Class-level accessors:
  # +doctype+::      public identifier and DTD location
  # +tagset+::       Hash of tag name (Symbol) => Array of allowed attributes
  # +tags+::         every tag name in +tagset+
  # +forms+::        the members of +tags+ that are also in FORM_TAGS
  # +self_closing+:: the members of +tags+ that are also in SELF_CLOSING_TAGS
  class XHTMLStrict
    class << self
      attr_accessor :tags, :tagset, :forms, :self_closing, :doctype
    end
    @doctype = ["-//W3C//DTD XHTML 1.0 Strict//EN", "DTD/xhtml1-strict.dtd"]
    @tagset = {
      :html => AttrI18n + [:id, :xmlns],
      :head => AttrI18n + [:id, :profile],
      :title => AttrI18n + [:id],
      :base => [:href, :id],
      # NOTE(review): :http looks like a fragment of http-equiv; it is kept
      # so the accepted attribute set does not shrink.
      :meta => AttrI18n + [:id, :http, :name, :content, :scheme, 'http-equiv'.intern],
      :link => Attrs + [:charset, :href, :hreflang, :type, :rel, :rev, :media],
      :style => AttrI18n + [:id, :type, :media, :title, 'xml:space'.intern],
      :script => [:id, :charset, :type, :src, :defer, 'xml:space'.intern],
      :noscript => Attrs,
      :body => Attrs + [:onload, :onunload],
      :div => Attrs,
      :p => Attrs,
      :ul => Attrs,
      :ol => Attrs,
      :li => Attrs,
      :dl => Attrs,
      :dt => Attrs,
      :dd => Attrs,
      :address => Attrs,
      :hr => Attrs,
      :pre => Attrs + ['xml:space'.intern],
      :blockquote => Attrs + [:cite],
      :ins => Attrs + [:cite, :datetime],
      :del => Attrs + [:cite, :datetime],
      :a => Attrs + AttrFocus + [:charset, :type, :name, :href, :hreflang, :rel, :rev, :shape, :coords],
      :span => Attrs,
      :bdo => AttrCore + AttrEvents + [:lang, 'xml:lang'.intern, :dir],
      :br => AttrCore,
      :em => Attrs,
      :strong => Attrs,
      :dfn => Attrs,
      :code => Attrs,
      :samp => Attrs,
      :kbd => Attrs,
      :var => Attrs,
      :cite => Attrs,
      :abbr => Attrs,
      :acronym => Attrs,
      :q => Attrs + [:cite],
      :sub => Attrs,
      :sup => Attrs,
      :tt => Attrs,
      :i => Attrs,
      :b => Attrs,
      :big => Attrs,
      :small => Attrs,
      :object => Attrs + [:declare, :classid, :codebase, :data, :type, :codetype, :archive, :standby, :height, :width, :usemap, :name, :tabindex],
      :param => [:id, :name, :value, :valuetype, :type],
      :img => Attrs + [:src, :alt, :longdesc, :height, :width, :usemap, :ismap],
      :map => AttrI18n + AttrEvents + [:id, :class, :style, :title, :name],
      :area => Attrs + AttrFocus + [:shape, :coords, :href, :nohref, :alt],
      # The DTD lists both accept and accept-charset for form; the second
      # entry used to be a duplicate :accept.
      :form => Attrs + [:action, :method, :enctype, :onsubmit, :onreset, :accept, 'accept-charset'.intern],
      :label => Attrs + [:for, :accesskey, :onfocus, :onblur],
      :input => Attrs + AttrFocus + [:type, :name, :value, :checked, :disabled, :readonly, :size, :maxlength, :src, :alt, :usemap, :onselect, :onchange, :accept],
      :select => Attrs + [:name, :size, :multiple, :disabled, :tabindex, :onfocus, :onblur, :onchange],
      :optgroup => Attrs + [:disabled, :label],
      :option => Attrs + [:selected, :disabled, :label, :value],
      :textarea => Attrs + AttrFocus + [:name, :rows, :cols, :disabled, :readonly, :onselect, :onchange],
      :fieldset => Attrs,
      :legend => Attrs + [:accesskey],
      :button => Attrs + AttrFocus + [:name, :value, :type, :disabled],
      :table => Attrs + [:summary, :width, :border, :frame, :rules, :cellspacing, :cellpadding],
      :caption => Attrs,
      :colgroup => Attrs + AttrHAlign + AttrVAlign + [:span, :width],
      :col => Attrs + AttrHAlign + AttrVAlign + [:span, :width],
      :thead => Attrs + AttrHAlign + AttrVAlign,
      :tfoot => Attrs + AttrHAlign + AttrVAlign,
      :tbody => Attrs + AttrHAlign + AttrVAlign,
      :tr => Attrs + AttrHAlign + AttrVAlign,
      :th => Attrs + AttrHAlign + AttrVAlign + [:abbr, :axis, :headers, :scope, :rowspan, :colspan],
      :td => Attrs + AttrHAlign + AttrVAlign + [:abbr, :axis, :headers, :scope, :rowspan, :colspan],
      :h1 => Attrs,
      :h2 => Attrs,
      :h3 => Attrs,
      :h4 => Attrs,
      :h5 => Attrs,
      :h6 => Attrs
    }

    @tags = @tagset.keys
    @forms = @tags & FORM_TAGS
    @self_closing = @tags & SELF_CLOSING_TAGS
  end

  # Additional tags found in XHTML 1.0 Transitional.
  #
  # Starts from a copy of the XHTMLStrict tagset, adds the extra tags, then
  # extends selected tags with extra attributes. The per-tag arrays are
  # rebuilt with +=, so XHTMLStrict.tagset is left unchanged.
  class XHTMLTransitional
    class << self
      attr_accessor :tags, :tagset, :forms, :self_closing, :doctype
    end
    @doctype = ["-//W3C//DTD XHTML 1.0 Transitional//EN", "DTD/xhtml1-transitional.dtd"]
    @tagset = XHTMLStrict.tagset.merge({
      :strike => Attrs,
      :center => Attrs,
      :dir => Attrs + [:compact],
      :noframes => Attrs,
      :basefont => [:id, :size, :color, :face],
      :u => Attrs,
      :menu => Attrs + [:compact],
      :iframe => AttrCore + [:longdesc, :name, :src, :frameborder, :marginwidth, :marginheight, :scrolling, :align, :height, :width],
      :font => AttrCore + AttrI18n + [:size, :color, :face],
      :s => Attrs,
      :applet => AttrCore + [:codebase, :archive, :code, :object, :alt, :name, :width, :height, :align, :hspace, :vspace],
      :isindex => AttrCore + AttrI18n + [:prompt]
    })

    # Additional attributes found in XHTML 1.0 Transitional
    { :script => [:language],
      :a => [:target],
      :td => [:bgcolor, :nowrap, :width, :height],
      :p => [:align],
      :h5 => [:align],
      :h3 => [:align],
      :li => [:type, :value],
      :div => [:align],
      :pre => [:width],
      :body => [:background, :bgcolor, :text, :link, :vlink, :alink],
      :ol => [:type, :compact, :start],
      :h4 => [:align],
      :h2 => [:align],
      :object => [:align, :border, :hspace, :vspace],
      :img => [:name, :align, :border, :hspace, :vspace],
      :link => [:target],
      :legend => [:align],
      :dl => [:compact],
      :input => [:align],
      :h6 => [:align],
      :hr => [:align, :noshade, :size, :width],
      :base => [:target],
      :ul => [:type, :compact],
      :br => [:clear],
      :form => [:name, :target],
      :area => [:target],
      :h1 => [:align]
    }.each do |k, v|
      # += builds a new array, so the array shared with XHTMLStrict is not mutated.
      @tagset[k] += v
    end

    @tags = @tagset.keys
    @forms = @tags & FORM_TAGS
    @self_closing = @tags & SELF_CLOSING_TAGS
  end

end
@@ -0,0 +1,821 @@
1
+ require 'hpricot/elements'
2
+ require 'uri'
3
+
4
+ module Hpricot
5
+ module Traverse
6
# Node-type predicates: each one reports whether this object matches the
# corresponding Trav module (checked with Module#===).

# True when this object is the enclosing HTML or XML document.
def doc?
  Doc::Trav === self
end

# True when this object is an HTML or XML element.
def elem?
  Elem::Trav === self
end

# True when this object is an HTML text node.
def text?
  Text::Trav === self
end

# True when this object is an XML declaration.
def xmldecl?
  XMLDecl::Trav === self
end

# True when this object is a doctype tag.
def doctype?
  DocType::Trav === self
end

# True when this object is an XML processing instruction.
def procins?
  ProcIns::Trav === self
end

# True when this object is a comment.
def comment?
  Comment::Trav === self
end

# True when this object is a stranded end tag.
def bogusetag?
  BogusETag::Trav === self
end
22
+
23
# Renders this node and everything it contains as an HTML string.
# To write to a stream instead, call <tt>output(io)</tt> on this object.
def to_html
  buffer = ""
  output(buffer)
end
29
+ alias_method :to_s, :to_html
30
+
31
# Renders this node while asking +output+ to preserve the original HTML
# of the document (<tt>:preserve => true</tt>), so that new tags are only
# output for elements which have changed.
def to_original_html
  options = { :preserve => true }
  output("", options)
end
36
+
37
# Returns the position, among this node's children, of the first child
# matching +name+.
#
# * <tt>"*"</tt> always yields 0, without looking at the children.
# * <tt>"text()"</tt> matches the first child for which <tt>text?</tt> is true.
# * any other value is compared with the +name+ of each child that has one.
#
# Returns -1 when no child matches.
def index(name)
  return 0 if name == "*"
  children.each_with_index do |child, pos|
    named = child.respond_to?(:name) && name == child.name
    return pos if named || (child.text? && name == "text()")
  end
  -1
end
47
+
48
# Puts together an array of neighboring nodes based on their proximity
# to this node. So, for example, to get the next node, you could use
# <tt>nodes_at(1)</tt>. Or, to get the previous node, use <tt>nodes_at(-1)</tt>.
#
# This method also accepts ranges and sets of numbers.
#
#   ele.nodes_at(-3..-1, 1..3) # gets three nodes before and three after
#   ele.nodes_at(1, 5, 7) # gets three nodes at offsets below the current node
#   ele.nodes_at(0, 5..6) # the current node and two others
#
# A Range whose endpoints are Strings is translated into offsets by
# looking both endpoints up with <tt>parent.index</tt>.
#
# Returns an Elements collection holding the selected siblings in
# document order.
def nodes_at(*pos)
  sib = parent.children
  si = sib.index(self)
  # Convert string-delimited ranges into ranges relative to this node.
  offsets = pos.map do |r|
    if r.is_a?(Range) and r.begin.is_a?(String)
      Range.new(parent.index(r.begin) - si, parent.index(r.end) - si, r.exclude_end?)
    else
      r
    end
  end
  picked = []
  sib.each_with_index do |node, i|
    # === keeps the original case/when semantics: Integers match by
    # equality, Ranges by inclusion.
    picked << node if offsets.any? { |o| o === i - si }
  end
  Elements[*picked]
end
78
+
79
# Returns the node neighboring this node to the south: just below it.
# This method includes text nodes and comments and such.
#
# Returns nil when this node has no parent or is the last child.
def next
  # Check the parent before dereferencing it; the old code called
  # parent.children first, which raised on a detached node.
  return nil unless parent
  sib = parent.children
  sib[sib.index(self) + 1]
end
85
+ alias_method :next_node, :next
86
+
87
# Returns the node neighboring this node to the north: just above it.
# This method includes text nodes and comments and such.
#
# Returns nil when this node has no parent or is the first child.
def previous
  # Check the parent before dereferencing it.
  return nil unless parent
  sib = parent.children
  x = sib.index(self) - 1
  # A negative index would wrap around to the end of the array.
  sib[x] if x >= 0
end
94
+ alias_method :previous_node, :previous
95
+
96
# Collects every sibling node that comes before this one, in document
# order, as an Elements collection.
def preceding
  siblings = parent.children
  here = siblings.index(self)
  Elements[*siblings[0...here]]
end
102
+
103
# Collects every sibling node that comes after this one, in document
# order, as an Elements collection.
def following
  siblings = parent.children
  start = siblings.index(self) + 1
  Elements[*siblings[start...siblings.length]]
end
109
+
110
# Builds nodes from the +html+ string (or from the block) with
# Hpricot.make and asks the parent to insert them right after this element.
def after(html = nil, &blk)
  fresh = Hpricot.make(html, &blk)
  parent.insert_after(fresh, self)
end
114
+
115
# Builds nodes from the +html+ string (or from the block) with
# Hpricot.make and asks the parent to insert them right before this element.
def before(html = nil, &blk)
  fresh = Hpricot.make(html, &blk)
  parent.insert_before(fresh, self)
end
119
+
120
+
121
# Replaces this element and its contents with the nodes built from the
# +html+ string (or from the block). The parent is marked as altered
# before the replacement happens.
def swap(html = nil, &blk)
  parent.altered!
  replacement = Hpricot.make(html, &blk)
  parent.replace_child(self, replacement)
end
127
+
128
# Walks down the tree one level per entry in +indexes+, calling
# +get_subnode_internal+ on each intermediate node, and returns the node
# reached. With no indexes, returns this node itself.
def get_subnode(*indexes)
  indexes.inject(self) do |node, index|
    node.get_subnode_internal(index)
  end
end
135
+
136
# Concatenates the plain text of every child, strips the result and
# collapses runs of two or more newlines into exactly two.
# Returns nil when this node does not respond to +children+.
def to_plain_text
  return unless respond_to? :children
  pieces = children.map { |child| child.to_plain_text }
  pieces.join.strip.gsub(/\n{2,}/, "\n\n")
end
143
+
144
# Concatenates the +inner_text+ of every child, unmodified.
# Returns nil when this node does not respond to +children+.
def inner_text
  return unless respond_to? :children
  pieces = children.map { |child| child.inner_text }
  pieces.join
end
151
+ alias_method :innerText, :inner_text
152
+
153
# Reads or replaces the contents of this node.
#
# With no argument and no block, returns the HTML of all children joined
# into one string (nil when this node does not respond to +children+).
#
# With +inner+ or a block, marks the node as altered and replaces its
# children: an Array is used as the new child list directly, anything
# else is passed to Hpricot.make. The new children are then reparented
# to this node.
def html(inner = nil, &blk)
  if inner or blk
    altered!
    self.children = Array === inner ? inner : Hpricot.make(inner, &blk)
    reparent self.children
  elsif respond_to? :children
    children.map { |child| child.output("") }.join
  end
end
170
+ alias_method :inner_html, :html
171
+ alias_method :innerHTML, :inner_html
172
+
173
# Replaces the contents of the current node with +inner+ by delegating
# to +html+. A nil (or false) +inner+ is passed on as an empty Array.
def inner_html=(inner)
  contents = inner || []
  html(contents)
end
178
+ alias_method :innerHTML=, :inner_html=
179
+
180
# Marks this node as altered, then points the +parent+ of each given node
# at this node. +nodes+ may be a single node or a list of nodes.
def reparent(nodes)
  altered!
  [*nodes].each do |node|
    node.parent = self
  end
end
184
+ private :reparent
185
+
186
# Returns a copy of +path+ with whitespace removed from the start and the
# end of each line (the pattern uses the line anchors ^ and $).
def clean_path(path)
  edge_whitespace = /^\s+|\s+$/
  path.gsub(edge_whitespace, '')
end
189
+
190
# Builds a unique XPath string for this node, from the
# root of the document containing it.
#
# An element with an +id+ attribute is addressed directly as
# <tt>//name[@id='...']</tt>. Any other node is addressed as the parent's
# xpath joined with this node's +pathname+, followed by a 1-based
# <tt>[n]</tt> index when two or more siblings share that pathname.
def xpath
  if elem? and has_attribute? 'id'
    "//#{self.name}[@id='#{get_attribute('id')}']"
  else
    # sim counts the siblings sharing our pathname; id is how many of
    # them come before this node.
    sim, id = 0, 0
    parent.children.each do |e|
      id = sim if e == self
      sim += 1 if e.pathname == self.pathname
    end
    # File.join is used purely as a "/" joiner that avoids doubled slashes.
    path = File.join(parent.xpath, self.pathname)
    path += "[#{id+1}]" if sim >= 2
    path
  end
end
206
+
207
# Builds a unique CSS string for this node, from the
# root of the document containing it.
#
# An element with an +id+ attribute is addressed as <tt>#id</tt>. Any
# other node is addressed as the parent's css_path, <tt> > </tt>, and this
# node's +pathname+ (just the pathname when the parent's css_path is nil),
# followed by a 0-based <tt>:nth(n)</tt> when two or more siblings share
# that pathname.
def css_path
  if elem? and has_attribute? 'id'
    "##{get_attribute('id')}"
  else
    # sim counts the siblings sharing our pathname; id is how many of
    # them come before this node.
    sim, id = 0, 0
    parent.children.each do |e|
      id = sim if e == self
      sim += 1 if e.pathname == self.pathname
    end
    path = parent.css_path
    path = path ? "#{path} > #{self.pathname}" : self.pathname
    path += ":nth(#{id})" if sim >= 2
    path
  end
end
224
+
225
# Returns this node's index among all of its parent's children.
def node_position
  siblings = parent.children
  siblings.index(self)
end
228
+
229
# Returns this node's index among the parent's children that share its
# +pathname+ (as selected by <tt>parent.children_of_type</tt>).
def position
  peers = parent.children_of_type(self.pathname)
  peers.index(self)
end
232
+
233
# Searches this node for all elements matching
# the CSS or XPath +expr+. Returns an Elements array
# containing the matching nodes. If +blk+ is given, it
# is used to iterate through the matching set, and this node
# is returned instead.
#
# When +expr+ is a Range, both endpoints are resolved with +at+ and the
# result comes from Elements.expand.
#
# Otherwise the expression is consumed from the left, one step per loop
# iteration; each step narrows or replaces the working +nodes+ list, and
# whatever is left of the expression is handed to Elements.filter.
def search(expr, &blk)
  if Range === expr
    return Elements.expand(at(expr.begin), at(expr.end), expr.exclude_end?)
  end
  # last holds the remaining expression right after a combinator step and
  # is reset to nil after a name step; it decides below whether the
  # starting node is removed from tag-name matches.
  last = nil
  nodes = [self]
  # Results of alternatives already completed (separated by "|" or ",").
  done = []
  expr = expr.to_s
  # Remaining-expression history, used to detect a step that made no progress.
  hist = []
  until expr.empty?
    expr = clean_path(expr)
    expr.gsub!(%r!^//!, '')

    case expr
    when %r!^/?\.\.!
      # ".." : step up to each node's parent.
      last = expr = $'
      nodes.map! { |node| node.parent }
    when %r!^[>/]\s*!
      # ">" or "/" : step down to the direct children.
      last = expr = $'
      nodes = Elements[*nodes.map { |node| node.children if node.respond_to? :children }.flatten.compact]
    when %r!^\+!
      # "+" : the node immediately following each node.
      last = expr = $'
      nodes.map! do |node|
        siblings = node.parent.children
        siblings[siblings.index(node)+1]
      end
      nodes.compact!
    when %r!^~!
      # "~" : every node following each node.
      last = expr = $'
      nodes.map! do |node|
        siblings = node.parent.children
        siblings[(siblings.index(node)+1)..-1]
      end
      nodes.flatten!
    when %r!^[|,]!
      # "|" or "," : bank the current matches and restart from this node.
      last = expr = " #$'"
      nodes.shift if nodes.first == self
      done += nodes
      nodes = [self]
    else
      # m[1] is an optional "#" or "." prefix, m[2] the name that follows.
      m = expr.match(%r!^([#.]?)([a-z0-9\\*_-]*)!i).to_a
      after = $'
      mt = after[%r!:[a-z0-9\\*_-]+!i, 0]
      # A ":suffix" that is neither ":not" nor a defined filter method is
      # treated as part of the name itself.
      if mt and not (mt == ":not" or Traverse.method_defined? "filter[#{mt}]")
        after = $'
        m[2] += mt
        expr = after
      end
      if m[1] == '#'
        oid = get_element_by_id(m[2])
        nodes = oid ? [oid] : []
        expr = after
      else
        # Function-style steps, empty names and class selectors start
        # from every node; Elements.filter narrows them afterwards.
        m[2] = "*" if after =~ /^\(\)/ || m[2] == "" || m[1] == "."
        ret = []
        nodes.each do |node|
          case m[2]
          when '*'
            node.traverse_element { |n| ret << n }
          else
            if node.respond_to? :get_elements_by_tag_name
              ret += [*node.get_elements_by_tag_name(m[2])] - [*(node unless last)]
            end
          end
        end
        nodes = ret
      end
      last = nil
    end

    hist << expr
    # Stop if this pass left the expression unchanged, to avoid looping forever.
    break if hist[-1] == hist[-2]
    nodes, expr = Elements.filter(nodes, expr)
  end
  nodes = done + nodes.flatten.uniq
  if blk
    nodes.each(&blk)
    self
  else
    Elements[*nodes]
  end
end
320
+ alias_method :/, :search
321
+
322
# Returns the first node matching the CSS or XPath +expr+ string,
# as found by +search+.
def at(expr)
  matches = search(expr)
  matches.first
end
327
+ alias_method :%, :at
328
+
329
+ # +traverse_element+ traverses elements in the tree.
330
+ # It yields elements in depth first order.
331
+ #
332
+ # If _names_ are empty, it yields all elements.
333
+ # If non-empty _names_ are given, they should be a list of universal names.
334
+ #
335
+ # A nested element is yielded in depth first order as follows.
336
+ #
337
+ # t = Hpricot('<a id=0><b><a id=1 /></b><c id=2 /></a>')
338
+ # t.traverse_element("a", "c") {|e| p e}
339
+ # # =>
340
+ # {elem <a id="0"> {elem <b> {emptyelem <a id="1">} </b>} {emptyelem <c id="2">} </a>}
341
+ # {emptyelem <a id="1">}
342
+ # {emptyelem <c id="2">}
343
+ #
344
+ # Universal names are specified as follows.
345
+ #
346
+ # t = Hpricot(<<'End')
347
+ # <html>
348
+ # <meta name="robots" content="index,nofollow">
349
+ # <meta name="author" content="Who am I?">
350
+ # </html>
351
+ # End
352
+ # t.traverse_element("{http://www.w3.org/1999/xhtml}meta") {|e| p e}
353
+ # # =>
354
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="robots" content="index,nofollow">}
355
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="author" content="Who am I?">}
356
+ #
357
# Dispatches to +traverse_all_element+ when no names are given, otherwise
# to +traverse_some_element+ with a Hash whose keys are the requested
# names. Always returns nil.
def traverse_element(*names, &block) # :yields: element
  if names.empty?
    traverse_all_element(&block)
  else
    wanted = names.inject({}) do |set, name|
      set[name] = true
      set
    end
    traverse_some_element(wanted, &block)
  end
  nil
end
367
+
368
# Selects the children whose +pathname+ equals +tag_name+.
#
#   ele.children_of_type('p')
#     #=> [...array of paragraphs...]
#
# Children without a +pathname+ method are skipped. Returns nil when this
# node does not respond to +children+.
def children_of_type(tag_name)
  return unless respond_to? :children
  children.select do |child|
    child.respond_to?(:pathname) && child.pathname == tag_name
  end
end
380
+
381
+ end
382
+
383
+ module Container::Trav
384
# Returns the children of this node that are themselves containers
# (i.e. match Container::Trav). This is a good way to get all HTML
# elements which aren't text, comment, doctype or processing instruction
# nodes.
def containers
  children.select { |child| Container::Trav === child }
end
390
+
391
# Returns the container node neighboring this node to the south: just below it.
# By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
# See Hpricot::Traverse#next_node if you need to hunt out all kinds of nodes.
#
# Returns nil when this node has no parent or is the last container.
def next_sibling
  # Check the parent before dereferencing it.
  return nil unless parent
  sib = parent.containers
  sib[sib.index(self) + 1]
end
398
+
399
# Returns the container node neighboring this node to the north: just above it.
# By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
# See Hpricot::Traverse#previous_node if you need to hunt out all kinds of nodes.
#
# Returns nil when this node has no parent or is the first container.
def previous_sibling
  # Check the parent before dereferencing it.
  return nil unless parent
  sib = parent.containers
  x = sib.index(self) - 1
  # A negative index would wrap around to the end of the array.
  sib[x] if x >= 0
end
407
+
408
# Collects the sibling containers that come before this one. Like the
# other "sibling" methods, this weeds out text and comment nodes.
def preceding_siblings()
  peers = parent.containers
  here = peers.index(self)
  Elements[*peers[0...here]]
end
415
+
416
# Collects the sibling containers that come after this one. Like the
# other "sibling" methods, this weeds out text and comment nodes.
def following_siblings()
  peers = parent.containers
  start = peers.index(self) + 1
  Elements[*peers[start...peers.length]]
end
423
+
424
# Gathers neighboring sibling elements according to their offset from
# this element.
#
# Offsets may be given as numbers, ranges, or any mix of the two.
#
#   ele.siblings_at(-3..-1, 1..3) # gets three elements before and three after
#   ele.siblings_at(1, 5, 7) # gets three elements at offsets below the current element
#   ele.siblings_at(0, 5..6) # the current element and two others
#
# Like the other "sibling" methods, this doesn't find text and comment nodes.
# Use nodes_at to include those nodes.
def siblings_at(*pos)
  peers = parent.containers
  here = peers.index(self)
  chosen = []
  peers.each_with_index do |node, idx|
    offset = idx - here
    # === gives the same matching as case/when: equality for numbers,
    # inclusion for ranges.
    chosen << node if pos.any? { |want| want === offset }
  end
  Elements[*chosen]
end
448
+
449
# Replaces +old+, a child of the current node, with +new+, which may be a
# single node or a list of nodes. The new nodes are reparented to this
# node first.
def replace_child(old, new)
  reparent new
  slot = children.index(old)
  children[slot, 1] = [*new]
end
454
+
455
# Inserts +nodes+, an array of HTML elements or a single element,
# before the node +ele+, a child of the current node. An Array is
# inserted one element at a time, preserving its order. When +ele+ is not
# among the children, insertion happens at the front.
def insert_before(nodes, ele)
  if Array === nodes
    nodes.each { |node| insert_before(node, ele) }
  else
    reparent nodes
    slot = children.index(ele) || 0
    children[slot, 0] = nodes
  end
end
466
+
467
# Inserts +nodes+, an array of HTML elements or a single element,
# after the node +ele+, a child of the current node. An Array is walked
# in reverse so its elements end up in their original order. When +ele+
# is not among the children, insertion happens at the end.
def insert_after(nodes, ele)
  if Array === nodes
    nodes.reverse_each { |node| insert_after(node, ele) }
  else
    reparent nodes
    found = children.index(ele)
    slot = found ? found + 1 : children.length
    children[slot, 0] = nodes
  end
end
479
+
480
# +each_child+ passes every child of this node to the block, in order.
# Always returns nil.
def each_child(&block) # :yields: child_node
  kids = children
  kids.each(&block)
  nil
end
485
+
486
# +each_child_with_index+ passes every child of this node to the block
# together with its index. Always returns nil.
def each_child_with_index(&block) # :yields: child_node, index
  kids = children
  kids.each_with_index(&block)
  nil
end
491
+
492
# +find_element+ returns the first element that +traverse_element+
# yields for the given universal names.
# It returns nil if nothing is found.
def find_element(*names)
  traverse_element(*names) do |element|
    # Non-local return: stop the traversal at the first hit.
    return element
  end
  nil
end
499
+
500
# Returns the list of CSS classes this element belongs to: the +class+
# attribute split on whitespace. An absent attribute gives an empty list.
def classes
  raw = get_attribute('class').to_s
  raw.strip.split(/\s+/)
end
504
+
505
# Returns the first element below this node whose +id+ attribute, as a
# string, equals +id+; nil when there is none.
def get_element_by_id(id)
  traverse_all_element do |node|
    next unless node.elem?
    found = node.get_attribute('id')
    return node if found and found.to_s == id
  end
  nil
end
513
+
514
# Collects, as an Elements list, every element that +traverse_element+
# yields for the given tag names. Each name is looked up both as given
# and qualified with the XHTML namespace.
def get_elements_by_tag_name(*a)
  qualified = a.map { |tag| [tag, "{http://www.w3.org/1999/xhtml}#{tag}"] }.flatten
  matches = Elements[]
  traverse_element(*qualified) { |element| matches << element }
  matches
end
521
+
522
# Visits every XHTML-namespaced element that can carry a hyperlink and
# yields the element, the attribute name and the attribute value for each
# link-bearing attribute that is actually present.
def each_hyperlink_attribute
  traverse_element(
    '{http://www.w3.org/1999/xhtml}a',
    '{http://www.w3.org/1999/xhtml}area',
    '{http://www.w3.org/1999/xhtml}link',
    '{http://www.w3.org/1999/xhtml}img',
    '{http://www.w3.org/1999/xhtml}object',
    '{http://www.w3.org/1999/xhtml}q',
    '{http://www.w3.org/1999/xhtml}blockquote',
    '{http://www.w3.org/1999/xhtml}ins',
    '{http://www.w3.org/1999/xhtml}del',
    '{http://www.w3.org/1999/xhtml}form',
    '{http://www.w3.org/1999/xhtml}input',
    '{http://www.w3.org/1999/xhtml}head',
    '{http://www.w3.org/1999/xhtml}base',
    '{http://www.w3.org/1999/xhtml}script') do |elem|
    # Pick the attributes that may hold a link for this kind of element.
    attrs =
      case elem.name
      when %r{\{http://www.w3.org/1999/xhtml\}(?:base|a|area|link)\z}i
        ['href']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:img)\z}i
        ['src', 'longdesc', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:object)\z}i
        ['classid', 'codebase', 'data', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:q|blockquote|ins|del)\z}i
        ['cite']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:form)\z}i
        ['action']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:input)\z}i
        ['src', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:head)\z}i
        ['profile']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:script)\z}i
        ['src', 'for']
      end
    attrs.each do |attr|
      hyperlink = elem.get_attribute(attr)
      yield elem, attr, hyperlink if hyperlink
    end
  end
end
563
+ private :each_hyperlink_attribute
564
+
565
# +each_hyperlink_uri+ traverses hyperlinks such as the HTML href
# attribute of the A element.
#
# It yields the hyperlink value and a URI for each hyperlink.
#
# The URI objects are resolved against a base URI, taken from the HTML
# BASE element or from the +base_uri+ argument (a String is parsed first).
# Without any base, each hyperlink is parsed on its own.
# +each_hyperlink_uri+ doesn't yield the href of the BASE element.
def each_hyperlink_uri(base_uri=nil) # :yields: hyperlink, uri
  base_uri = URI.parse(base_uri) if String === base_uri
  links = []
  # Collect first: a BASE element anywhere in the document applies to
  # every link, including the ones seen before it.
  each_hyperlink_attribute do |elem, attr, hyperlink|
    if %r{\{http://www.w3.org/1999/xhtml\}(?:base)\z}i =~ elem.name
      base_uri = URI.parse(hyperlink.to_s)
    else
      links << hyperlink
    end
  end
  links.each do |link|
    target = base_uri ? base_uri + link.to_s : URI.parse(link.to_s)
    yield link, target
  end
end
589
+
590
# +each_hyperlink+ traverses hyperlinks such as the HTML href attribute
# of the A element.
#
# It yields the value of each hyperlink attribute.
#
# Note that +each_hyperlink+ also yields the HTML href attribute of the
# BASE element.
def each_hyperlink # :yields: text
  # The element and attribute name are not needed here; only the
  # attribute value is passed on.
  each_hyperlink_attribute {|elem, attr, hyperlink|
    yield hyperlink
  }
end
602
+
603
# +each_uri+ traverses hyperlinks such as the HTML href attribute
# of the A element, yielding only the URI for each one.
#
# The URI objects come from +each_hyperlink_uri+, which builds them
# against the HTML BASE element or the +base_uri+ argument.
def each_uri(base_uri=nil) # :yields: URI
  each_hyperlink_uri(base_uri) do |hyperlink, uri|
    yield uri
  end
end
613
+ end
614
+
615
+ # :stopdoc:
616
module Doc::Trav
  # The document itself is not yielded; the traversal is handed on to
  # every child.
  def traverse_all_element(&block)
    children.each do |child|
      child.traverse_all_element(&block)
    end
  end

  # The document is the root of every XPath.
  def xpath
    "/"
  end

  # The document adds nothing to a CSS path.
  def css_path
    nil
  end
end
627
+
628
module Elem::Trav
  # Yields this element first, then hands the traversal on to every child.
  def traverse_all_element(&block)
    yield self
    children.each do |child|
      child.traverse_all_element(&block)
    end
  end
end
634
+
635
module Leaf::Trav
  # A leaf has nothing below it: it only yields itself.
  def traverse_all_element
    yield self
  end
end
640
+
641
module Doc::Trav
  # The document itself is never yielded; the filtered traversal is
  # handed on to every child with the same +name_set+.
  def traverse_some_element(name_set, &block)
    children.each do |child|
      child.traverse_some_element(name_set, &block)
    end
  end
end
646
+
647
module Elem::Trav
  # Yields this element when its name is in +name_set+, then hands the
  # filtered traversal on to every child.
  def traverse_some_element(name_set, &block)
    yield self if name_set.include? self.name
    children.each do |child|
      child.traverse_some_element(name_set, &block)
    end
  end
end
653
+
654
module Leaf::Trav
  # Leaves are never yielded by a name-filtered traversal; this is a
  # deliberate no-op.
  def traverse_some_element(name_set)
  end
end
658
+ # :startdoc:
659
+
660
module Traverse
  # +traverse_text+ walks the tree through +traverse_text_internal+,
  # passing the given block along. Always returns nil.
  def traverse_text(&block) # :yields: text
    traverse_text_internal(&block)
    nil
  end
end
667
+
668
+ # :stopdoc:
669
module Container::Trav
  # A container has no text of its own: the text traversal is handed on
  # to each child.
  def traverse_text_internal(&block)
    each_child do |child|
      child.traverse_text_internal(&block)
    end
  end
end
674
+
675
module Leaf::Trav
  # Leaves yield nothing during text traversal by default; this is a
  # deliberate no-op.
  def traverse_text_internal
  end
end
679
+
680
module Text::Trav
  # A text node is what text traversal is looking for: yield it.
  def traverse_text_internal
    yield self
  end
end
685
+ # :startdoc:
686
+
687
module Container::Trav
  # +filter+ rebuilds the tree without some components.
  #
  #   node.filter {|descendant_node| predicate } -> node
  #   loc.filter {|descendant_loc| predicate } -> node
  #
  # +filter+ yields each node except the top node.
  # If the given block returns false, the corresponding node is dropped.
  # If the given block returns true, the corresponding node is retained
  # and, when it is an element, its inner nodes are examined in turn.
  #
  # +filter+ returns a node.
  # It doesn't return a location object even if self is a location object.
  #
  def filter(&block)
    subst = {}
    each_child_with_index do |descendant, i|
      # One entry per child index: nil when dropped, the filtered
      # element, or the child itself.
      subst[i] =
        if !yield(descendant)
          nil
        elsif descendant.elem?
          descendant.filter(&block)
        else
          descendant
        end
    end
    to_node.subst_subnode(subst)
  end
end
717
+
718
module Doc::Trav
  # +title+ searches for the title and returns it as text.
  # It returns nil if not found.
  #
  # +title+ searches the following information.
  #
  # - <title>...</title> in HTML
  # - <title>...</title> in RSS
  #
  # NOTE(review): extract_text is not defined in this file - confirm the
  # found element provides it.
  def title
    found = find_element('title',
      '{http://www.w3.org/1999/xhtml}title',
      '{http://purl.org/rss/1.0/}title',
      '{http://my.netscape.com/rdf/simple/0.9/}title')
    found && found.extract_text
  end

  # +author+ searches for the author and returns it as text.
  # It returns nil if not found.
  #
  # +author+ searches the following information, in this order, and
  # returns the first non-empty value.
  #
  # - <meta name="author" content="author-name"> in HTML
  # - <link rev="made" title="author-name"> in HTML
  # - <dc:creator>author-name</dc:creator> in RSS
  # - <dc:publisher>author-name</dc:publisher> in RSS
  #
  # An IndexError raised while inspecting a candidate skips that candidate.
  #
  # NOTE(review): fetch_attr, fetch_attribute and extract_text are not
  # defined in this file - confirm they exist on the yielded elements.
  def author
    traverse_element('meta',
      '{http://www.w3.org/1999/xhtml}meta') do |e|
      begin
        next unless e.fetch_attr('name').downcase == 'author'
        name = e.fetch_attribute('content').strip
        return name unless name.empty?
      rescue IndexError
      end
    end

    traverse_element('link',
      '{http://www.w3.org/1999/xhtml}link') do |e|
      begin
        next unless e.fetch_attr('rev').downcase == 'made'
        name = e.fetch_attribute('title').strip
        return name unless name.empty?
      rescue IndexError
      end
    end

    if channel = find_element('{http://purl.org/rss/1.0/}channel')
      channel.traverse_element('{http://purl.org/dc/elements/1.1/}creator') do |e|
        begin
          name = e.extract_text.strip
          return name unless name.empty?
        rescue IndexError
        end
      end
      channel.traverse_element('{http://purl.org/dc/elements/1.1/}publisher') do |e|
        begin
          name = e.extract_text.strip
          return name unless name.empty?
        rescue IndexError
        end
      end
    end

    nil
  end

end
785
+
786
module Doc::Trav
  # Returns the single top-level element of the document.
  # Raises Hpricot::Error when the document has no top-level element or
  # more than one.
  def root
    tops = children.select { |child| child.elem? }
    raise Hpricot::Error, "no element" if tops.empty?
    raise Hpricot::Error, "multiple top elements" if 1 < tops.length
    tops[0]
  end
end
795
+
796
module Elem::Trav
  # Reports whether the raw attribute table has an entry for +name+
  # (compared as a string). Falsy when there is no attribute table.
  def has_attribute?(name)
    self.raw_attributes && self.raw_attributes.has_key?(name.to_s)
  end

  # Returns the value stored for +name+, passed through Hpricot.uxs.
  # A missing attribute (or a missing attribute table) is returned as is,
  # without the Hpricot.uxs call.
  def get_attribute(name)
    raw = self.raw_attributes && self.raw_attributes[name.to_s]
    raw ? Hpricot.uxs(raw) : raw
  end
  alias_method :[], :get_attribute

  # Marks the element as altered and stores +val+, passed through
  # Hpricot.xs, under +name+. The attribute table is created on demand.
  def set_attribute(name, val)
    altered!
    self.raw_attributes ||= {}
    self.raw_attributes[name.to_s] = Hpricot.xs(val)
  end
  alias_method :[]=, :set_attribute

  # Deletes the attribute +name+ if present, marking the element as
  # altered. Does nothing (and returns nil) when the attribute is absent.
  def remove_attribute(name)
    key = name.to_s
    return unless has_attribute? key
    altered!
    self.raw_attributes.delete(key)
  end
end
820
+
821
+ end