hpricot 0.6-jruby

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. data/CHANGELOG +62 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +211 -0
  5. data/ext/hpricot_scan/HpricotScanService.java +1340 -0
  6. data/ext/hpricot_scan/extconf.rb +6 -0
  7. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  8. data/ext/hpricot_scan/hpricot_scan.c +5976 -0
  9. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  10. data/ext/hpricot_scan/hpricot_scan.java.rl +363 -0
  11. data/ext/hpricot_scan/hpricot_scan.rl +273 -0
  12. data/extras/mingw-rbconfig.rb +176 -0
  13. data/lib/hpricot.rb +26 -0
  14. data/lib/hpricot/blankslate.rb +63 -0
  15. data/lib/hpricot/builder.rb +200 -0
  16. data/lib/hpricot/elements.rb +510 -0
  17. data/lib/hpricot/htmlinfo.rb +672 -0
  18. data/lib/hpricot/inspect.rb +107 -0
  19. data/lib/hpricot/modules.rb +37 -0
  20. data/lib/hpricot/parse.rb +297 -0
  21. data/lib/hpricot/tag.rb +228 -0
  22. data/lib/hpricot/tags.rb +164 -0
  23. data/lib/hpricot/traverse.rb +821 -0
  24. data/lib/hpricot/xchar.rb +94 -0
  25. data/lib/i686-linux/hpricot_scan.jar +0 -0
  26. data/test/files/basic.xhtml +17 -0
  27. data/test/files/boingboing.html +2266 -0
  28. data/test/files/cy0.html +3653 -0
  29. data/test/files/immob.html +400 -0
  30. data/test/files/pace_application.html +1320 -0
  31. data/test/files/tenderlove.html +16 -0
  32. data/test/files/uswebgen.html +220 -0
  33. data/test/files/utf8.html +1054 -0
  34. data/test/files/week9.html +1723 -0
  35. data/test/files/why.xml +19 -0
  36. data/test/load_files.rb +7 -0
  37. data/test/test_alter.rb +65 -0
  38. data/test/test_builder.rb +24 -0
  39. data/test/test_parser.rb +379 -0
  40. data/test/test_paths.rb +16 -0
  41. data/test/test_preserved.rb +66 -0
  42. data/test/test_xml.rb +28 -0
  43. metadata +98 -0
@@ -0,0 +1,164 @@
1
# Tag and attribute tables for the XHTML 1.0 Strict and Transitional
# doctypes, exposed as class-level accessors on XHTMLStrict and
# XHTMLTransitional.
module Hpricot

  FORM_TAGS = [ :form, :input, :select, :textarea ]
  SELF_CLOSING_TAGS = [ :base, :meta, :link, :hr, :br, :param, :img, :area, :input, :col ]

  # Common sets of attributes.
  AttrCore = [:id, :class, :style, :title]
  AttrI18n = [:lang, 'xml:lang'.intern, :dir]
  AttrEvents = [:onclick, :ondblclick, :onmousedown, :onmouseup, :onmouseover, :onmousemove,
                :onmouseout, :onkeypress, :onkeydown, :onkeyup]
  AttrFocus = [:accesskey, :tabindex, :onfocus, :onblur]
  AttrHAlign = [:align, :char, :charoff]
  AttrVAlign = [:valign]
  Attrs = AttrCore + AttrI18n + AttrEvents

  # All the tags and attributes from XHTML 1.0 Strict.
  #
  # Class-level readers/writers:
  # +doctype+::      [public identifier, system identifier]
  # +tagset+::       Hash mapping each tag (Symbol) to its attribute list
  # +tags+::         the keys of +tagset+
  # +forms+::        the members of +tags+ that also appear in FORM_TAGS
  # +self_closing+:: the members of +tags+ that also appear in SELF_CLOSING_TAGS
  class XHTMLStrict
    class << self
      attr_accessor :tags, :tagset, :forms, :self_closing, :doctype
    end
    @doctype = ["-//W3C//DTD XHTML 1.0 Strict//EN", "DTD/xhtml1-strict.dtd"]
    @tagset = {
      :html => AttrI18n + [:id, :xmlns],
      :head => AttrI18n + [:id, :profile],
      :title => AttrI18n + [:id],
      :base => [:href, :id],
      # NOTE(review): :http looks like a leftover fragment of 'http-equiv';
      # kept so existing callers see the same list.
      :meta => AttrI18n + [:id, :http, :name, :content, :scheme, 'http-equiv'.intern],
      :link => Attrs + [:charset, :href, :hreflang, :type, :rel, :rev, :media],
      :style => AttrI18n + [:id, :type, :media, :title, 'xml:space'.intern],
      :script => [:id, :charset, :type, :src, :defer, 'xml:space'.intern],
      :noscript => Attrs,
      :body => Attrs + [:onload, :onunload],
      :div => Attrs,
      :p => Attrs,
      :ul => Attrs,
      :ol => Attrs,
      :li => Attrs,
      :dl => Attrs,
      :dt => Attrs,
      :dd => Attrs,
      :address => Attrs,
      :hr => Attrs,
      :pre => Attrs + ['xml:space'.intern],
      :blockquote => Attrs + [:cite],
      :ins => Attrs + [:cite, :datetime],
      :del => Attrs + [:cite, :datetime],
      :a => Attrs + AttrFocus + [:charset, :type, :name, :href, :hreflang, :rel, :rev, :shape, :coords],
      :span => Attrs,
      :bdo => AttrCore + AttrEvents + [:lang, 'xml:lang'.intern, :dir],
      :br => AttrCore,
      :em => Attrs,
      :strong => Attrs,
      :dfn => Attrs,
      :code => Attrs,
      :samp => Attrs,
      :kbd => Attrs,
      :var => Attrs,
      :cite => Attrs,
      :abbr => Attrs,
      :acronym => Attrs,
      :q => Attrs + [:cite],
      :sub => Attrs,
      :sup => Attrs,
      :tt => Attrs,
      :i => Attrs,
      :b => Attrs,
      :big => Attrs,
      :small => Attrs,
      :object => Attrs + [:declare, :classid, :codebase, :data, :type, :codetype, :archive, :standby, :height, :width, :usemap, :name, :tabindex],
      :param => [:id, :name, :value, :valuetype, :type],
      :img => Attrs + [:src, :alt, :longdesc, :height, :width, :usemap, :ismap],
      :map => AttrI18n + AttrEvents + [:id, :class, :style, :title, :name],
      :area => Attrs + AttrFocus + [:shape, :coords, :href, :nohref, :alt],
      # The list previously carried :accept twice; the second entry is the
      # DTD's accept-charset attribute.
      :form => Attrs + [:action, :method, :enctype, :onsubmit, :onreset, :accept, 'accept-charset'.intern],
      :label => Attrs + [:for, :accesskey, :onfocus, :onblur],
      :input => Attrs + AttrFocus + [:type, :name, :value, :checked, :disabled, :readonly, :size, :maxlength, :src, :alt, :usemap, :onselect, :onchange, :accept],
      :select => Attrs + [:name, :size, :multiple, :disabled, :tabindex, :onfocus, :onblur, :onchange],
      :optgroup => Attrs + [:disabled, :label],
      :option => Attrs + [:selected, :disabled, :label, :value],
      :textarea => Attrs + AttrFocus + [:name, :rows, :cols, :disabled, :readonly, :onselect, :onchange],
      :fieldset => Attrs,
      :legend => Attrs + [:accesskey],
      :button => Attrs + AttrFocus + [:name, :value, :type, :disabled],
      :table => Attrs + [:summary, :width, :border, :frame, :rules, :cellspacing, :cellpadding],
      :caption => Attrs,
      :colgroup => Attrs + AttrHAlign + AttrVAlign + [:span, :width],
      :col => Attrs + AttrHAlign + AttrVAlign + [:span, :width],
      :thead => Attrs + AttrHAlign + AttrVAlign,
      :tfoot => Attrs + AttrHAlign + AttrVAlign,
      :tbody => Attrs + AttrHAlign + AttrVAlign,
      :tr => Attrs + AttrHAlign + AttrVAlign,
      :th => Attrs + AttrHAlign + AttrVAlign + [:abbr, :axis, :headers, :scope, :rowspan, :colspan],
      :td => Attrs + AttrHAlign + AttrVAlign + [:abbr, :axis, :headers, :scope, :rowspan, :colspan],
      :h1 => Attrs,
      :h2 => Attrs,
      :h3 => Attrs,
      :h4 => Attrs,
      :h5 => Attrs,
      :h6 => Attrs
    }

    @tags = @tagset.keys
    @forms = @tags & FORM_TAGS
    @self_closing = @tags & SELF_CLOSING_TAGS
  end

  # Additional tags found in XHTML 1.0 Transitional.
  # Exposes the same class-level accessors as XHTMLStrict; its tagset is a
  # merged copy, so XHTMLStrict.tagset is left untouched.
  class XHTMLTransitional
    class << self
      attr_accessor :tags, :tagset, :forms, :self_closing, :doctype
    end
    @doctype = ["-//W3C//DTD XHTML 1.0 Transitional//EN", "DTD/xhtml1-transitional.dtd"]
    @tagset = XHTMLStrict.tagset.merge(
      :strike => Attrs,
      :center => Attrs,
      :dir => Attrs + [:compact],
      :noframes => Attrs,
      :basefont => [:id, :size, :color, :face],
      :u => Attrs,
      :menu => Attrs + [:compact],
      :iframe => AttrCore + [:longdesc, :name, :src, :frameborder, :marginwidth, :marginheight, :scrolling, :align, :height, :width],
      :font => AttrCore + AttrI18n + [:size, :color, :face],
      :s => Attrs,
      :applet => AttrCore + [:codebase, :archive, :code, :object, :alt, :name, :width, :height, :align, :hspace, :vspace],
      :isindex => AttrCore + AttrI18n + [:prompt]
    )

    # Additional attributes found in XHTML 1.0 Transitional
    { :script => [:language],
      :a => [:target],
      :td => [:bgcolor, :nowrap, :width, :height],
      :p => [:align],
      :h5 => [:align],
      :h3 => [:align],
      :li => [:type, :value],
      :div => [:align],
      :pre => [:width],
      :body => [:background, :bgcolor, :text, :link, :vlink, :alink],
      :ol => [:type, :compact, :start],
      :h4 => [:align],
      :h2 => [:align],
      :object => [:align, :border, :hspace, :vspace],
      :img => [:name, :align, :border, :hspace, :vspace],
      :link => [:target],
      :legend => [:align],
      :dl => [:compact],
      :input => [:align],
      :h6 => [:align],
      :hr => [:align, :noshade, :size, :width],
      :base => [:target],
      :ul => [:type, :compact],
      :br => [:clear],
      :form => [:name, :target],
      :area => [:target],
      :h1 => [:align]
    }.each do |k, v|
      # += builds a new array, so the lists shared with XHTMLStrict stay intact.
      @tagset[k] += v
    end

    @tags = @tagset.keys
    @forms = @tags & FORM_TAGS
    @self_closing = @tags & SELF_CLOSING_TAGS
  end

end
@@ -0,0 +1,821 @@
1
+ require 'hpricot/elements'
2
+ require 'uri'
3
+
4
+ module Hpricot
5
+ module Traverse
6
# Is this object the enclosing HTML or XML document?
def doc?
  Doc::Trav === self
end

# Is this object an HTML or XML element?
def elem?
  Elem::Trav === self
end

# Is this object an HTML text node?
def text?
  Text::Trav === self
end

# Is this object an XML declaration?
def xmldecl?
  XMLDecl::Trav === self
end

# Is this object a doctype tag?
def doctype?
  DocType::Trav === self
end

# Is this object an XML processing instruction?
def procins?
  ProcIns::Trav === self
end

# Is this object a comment?
def comment?
  Comment::Trav === self
end

# Is this object a stranded end tag?
def bogusetag?
  BogusETag::Trav === self
end
22
+
23
# Renders this node and its contents as an HTML string by handing a
# fresh string to +output+.
# If you need to write to a stream, try calling <tt>output(io)</tt>
# as a method on this object.
def to_html
  buffer = ""
  output(buffer)
end
29
+ alias_method :to_s, :to_html
30
+
31
# Attempts to preserve the original HTML of the document, only
# outputting new tags for elements which have changed.
# Calls +output+ with a fresh string and <tt>:preserve => true</tt>.
def to_original_html
  buffer = ""
  output(buffer, :preserve => true)
end
36
+
37
# Returns the position among +children+ of the first child matching
# +name+, or -1 when no child matches.
#
# "*" always yields 0. "text()" matches the first child for which
# <tt>text?</tt> is true. Any other value is compared against the
# child's +name+ (when it responds to +name+).
def index(name)
  return 0 if name == "*"
  children.each_with_index do |child, offset|
    named = child.respond_to?(:name) && name == child.name
    return offset if named || (child.text? && name == "text()")
  end
  -1
end
47
+
48
# Puts together an array of neighboring nodes based on their proximity
# to this node. So, for example, to get the next node, you could use
# <tt>nodes_at(1)</tt>. Or, to get the previous node, use <tt>nodes_at(-1)</tt>.
#
# This method also accepts ranges and sets of numbers.
#
#   ele.nodes_at(-3..-1, 1..3) # gets three nodes before and three after
#   ele.nodes_at(1, 5, 7) # gets three nodes at offsets below the current node
#   ele.nodes_at(0, 5..6) # the current node and two others
#
# A Range whose endpoints are Strings is translated to offsets by looking
# both endpoints up with <tt>parent.index</tt>.
#
# Returns an Elements collection of the siblings (from
# <tt>parent.children</tt>) whose offset from this node matches any of +pos+.
def nodes_at(*pos)
  sib = parent.children
  si = sib.index(self)
  # Convert name-based ranges into numeric offsets relative to this node.
  offsets = pos.map do |r|
    if r.is_a?(Range) and r.begin.is_a?(String)
      Range.new(parent.index(r.begin) - si, parent.index(r.end) - si, r.exclude_end?)
    else
      r
    end
  end
  picked = []
  sib.each_with_index do |x, i|
    offset = i - si
    # === lets plain integers and ranges both act as matchers.
    picked << x if offsets.any? { |o| o === offset }
  end
  Elements[*picked]
end
78
+
79
# Returns the node neighboring this node to the south: just below it.
# This method includes text nodes and comments and such.
#
# Returns nil when this node has no parent, is not found among the
# parent's children, or is the last child.
def next
  # Check the parent before touching it; the old code dereferenced it first.
  return unless parent
  sib = parent.children
  pos = sib.index(self)
  sib[pos + 1] if pos
end
85
+ alias_method :next_node, :next
86
+
87
# Returns the node neighboring this node to the north: just above it.
# This method includes text nodes and comments and such.
#
# Returns nil when this node has no parent, is not found among the
# parent's children, or is the first child.
def previous
  # Check the parent before touching it; the old code dereferenced it first.
  return unless parent
  sib = parent.children
  pos = sib.index(self)
  sib[pos - 1] if pos and pos > 0
end
94
+ alias_method :previous_node, :previous
95
+
96
# Collects, as Elements, every entry of <tt>parent.children</tt> that
# comes before this node.
def preceding
  siblings = parent.children
  here = siblings.index(self)
  Elements[*siblings[0...here]]
end
102
+
103
# Collects, as Elements, every entry of <tt>parent.children</tt> that
# comes after this node.
def following
  siblings = parent.children
  start = siblings.index(self) + 1
  Elements[*siblings.drop(start)]
end
109
+
110
# Adds elements immediately after this element. The new nodes are built
# by Hpricot.make from the +html+ string (or the block) and handed to
# the parent's +insert_after+.
def after(html = nil, &blk)
  owner = parent
  fresh = Hpricot.make(html, &blk)
  owner.insert_after(fresh, self)
end
114
+
115
# Adds elements immediately before this element. The new nodes are built
# by Hpricot.make from the +html+ string (or the block) and handed to
# the parent's +insert_before+.
def before(html = nil, &blk)
  owner = parent
  fresh = Hpricot.make(html, &blk)
  owner.insert_before(fresh, self)
end
119
+
120
+
121
# Replace this element and its contents with the nodes that Hpricot.make
# builds from the +html+ string (or the block). The parent is flagged
# with <tt>altered!</tt> first.
def swap(html = nil, &blk)
  parent.altered!
  replacement = Hpricot.make(html, &blk)
  parent.replace_child(self, replacement)
end
127
+
128
# Follows +indexes+ one step at a time, calling +get_subnode_internal+
# on each intermediate node, and returns the node reached. With no
# indexes it returns self.
def get_subnode(*indexes)
  indexes.inject(self) do |node, index|
    node.get_subnode_internal(index)
  end
end
135
+
136
# Builds a string from the plain text of every child: the children's
# +to_plain_text+ results are joined, stripped, and runs of two or more
# newlines are collapsed to exactly two.
# Returns nil when this object has no +children+ method.
def to_plain_text
  return unless respond_to? :children
  pieces = children.map { |child| child.to_plain_text }
  pieces.join.strip.gsub(/\n{2,}/, "\n\n")
end
143
+
144
# Builds a string by joining the +inner_text+ of every child, with no
# trimming. Returns nil when this object has no +children+ method.
def inner_text
  return unless respond_to? :children
  pieces = children.map { |child| child.inner_text }
  pieces.join
end
151
+ alias_method :innerText, :inner_text
152
+
153
# Reads or replaces the contents of this node.
#
# With no argument and no block, returns the concatenated +output+ of
# every child (nil when this object has no +children+ method).
#
# With +inner+ or a block, marks the node altered and replaces
# +children+: an Array is used as-is, anything else goes through
# Hpricot.make. The new children are then reparented to this node.
def html(inner = nil, &blk)
  unless inner or blk
    return nil unless respond_to? :children
    return children.map { |child| child.output("") }.join
  end
  altered!
  self.children = Array === inner ? inner : Hpricot.make(inner, &blk)
  reparent self.children
end
170
+ alias_method :inner_html, :html
171
+ alias_method :innerHTML, :inner_html
172
+
173
# Inserts new contents into the current node by delegating to +html+.
# A nil or false +inner+ is passed on as an empty Array.
def inner_html=(inner)
  contents = inner ? inner : []
  html(contents)
end
178
+ alias_method :innerHTML=, :inner_html=
179
+
180
# Marks this node altered, then sets +parent+ to this node on +nodes+
# (a single node or a list of them).
def reparent(nodes)
  altered!
  adopted = [*nodes]
  adopted.each do |node|
    node.parent = self
  end
end
184
+ private :reparent
185
+
186
# Returns a copy of +path+ with whitespace removed from the start and
# end of each line (the pattern uses the line anchors ^ and $).
def clean_path(path)
  edge_space = /^\s+|\s+$/
  path.gsub(edge_space, '')
end
189
+
190
# Builds a unique XPath string for this node, from the
# root of the document containing it.
#
# An element with an +id+ attribute yields <tt>//name[@id='...']</tt>.
# Otherwise the parent's xpath is joined with this node's +pathname+,
# plus a 1-based <tt>[n]</tt> suffix when two or more siblings share
# that pathname.
def xpath
  if elem? and has_attribute? 'id'
    "//#{self.name}[@id='#{get_attribute('id')}']"
  else
    # sim counts siblings with the same pathname; id is how many of them
    # precede this node.
    sim, id = 0, 0
    parent.children.each do |e|
      id = sim if e == self
      sim += 1 if e.pathname == self.pathname
    end
    path = File.join(parent.xpath, self.pathname)
    path += "[#{id+1}]" if sim >= 2
    path
  end
end
206
+
207
# Builds a unique CSS string for this node, from the
# root of the document containing it.
#
# An element with an +id+ attribute yields <tt>#id</tt>. Otherwise the
# parent's css_path (if any) is joined to this node's +pathname+ with
# " > ", plus a 0-based <tt>:nth(n)</tt> suffix when two or more
# siblings share that pathname.
def css_path
  if elem? and has_attribute? 'id'
    "##{get_attribute('id')}"
  else
    # sim counts siblings with the same pathname; id is how many of them
    # precede this node.
    sim, id = 0, 0
    parent.children.each do |e|
      id = sim if e == self
      sim += 1 if e.pathname == self.pathname
    end
    path = parent.css_path
    path = path ? "#{path} > #{self.pathname}" : self.pathname
    path += ":nth(#{id})" if sim >= 2
    path
  end
end
224
+
225
# Index of this node within <tt>parent.children</tt>.
def node_position
  siblings = parent.children
  siblings.index(self)
end
228
+
229
# Index of this node among the parent's children that share this
# node's +pathname+ (as returned by +children_of_type+).
def position
  same_kind = parent.children_of_type(self.pathname)
  same_kind.index(self)
end
232
+
233
# Searches this node for all elements matching
# the CSS or XPath +expr+. Returns an Elements array
# containing the matching nodes. If +blk+ is given, it
# is used to iterate through the matching set and +self+ is returned
# instead.
#
# A Range +expr+ is handled separately: both endpoints are looked up
# with +at+ and passed to Elements.expand.
#
# Otherwise +expr+ is consumed from the left, one step per loop pass;
# each step rewrites +nodes+ (the current match set) and shortens +expr+.
def search(expr, &blk)
  if Range === expr
    return Elements.expand(at(expr.begin), at(expr.end), expr.exclude_end?)
  end
  # last:  the remaining expression right after a combinator step; nil after a name step
  # nodes: current match set
  # done:  results stashed by earlier "|" / "," alternatives
  # hist:  expression seen on each pass, used to detect lack of progress
  last = nil
  nodes = [self]
  done = []
  expr = expr.to_s
  hist = []
  until expr.empty?
    expr = clean_path(expr)
    expr.gsub!(%r!^//!, '')

    case expr
    when %r!^/?\.\.!
      # ".." (optionally preceded by "/"): step up to each node's parent.
      last = expr = $'
      nodes.map! { |node| node.parent }
    when %r!^[>/]\s*!
      # ">" or "/": step down to the children of every node that has any.
      last = expr = $'
      nodes = Elements[*nodes.map { |node| node.children if node.respond_to? :children }.flatten.compact]
    when %r!^\+!
      # "+": the node immediately after each node in its parent's children.
      last = expr = $'
      nodes.map! do |node|
        siblings = node.parent.children
        siblings[siblings.index(node)+1]
      end
      nodes.compact!
    when %r!^~!
      # "~": every node after each node in its parent's children.
      last = expr = $'
      nodes.map! do |node|
        siblings = node.parent.children
        siblings[(siblings.index(node)+1)..-1]
      end
      nodes.flatten!
    when %r!^[|,]!
      # "|" or ",": stash the current matches and restart from self.
      last = expr = " #$'"
      nodes.shift if nodes.first == self
      done += nodes
      nodes = [self]
    else
      # m[1] is an optional "#" or "." prefix, m[2] the name that follows.
      m = expr.match(%r!^([#.]?)([a-z0-9\\*_-]*)!i).to_a
      after = $'
      # mt is the first ":word" found in the remainder, if any.
      mt = after[%r!:[a-z0-9\\*_-]+!i, 0]
      # NOTE(review): oop is assigned here and not read anywhere in this method.
      oop = false
      # When mt is neither ":not" nor a defined "filter[...]" method it is
      # appended to the name; $' here is the post-match of the String#[]
      # lookup above.
      if mt and not (mt == ":not" or Traverse.method_defined? "filter[#{mt}]")
        after = $'
        m[2] += mt
        expr = after
      end
      if m[1] == '#'
        oid = get_element_by_id(m[2])
        nodes = oid ? [oid] : []
        expr = after
      else
        # "()" next, an empty name, or a "." prefix all mean "any element".
        m[2] = "*" if after =~ /^\(\)/ || m[2] == "" || m[1] == "."
        ret = []
        nodes.each do |node|
          case m[2]
          when '*'
            node.traverse_element { |n| ret << n }
          else
            if node.respond_to? :get_elements_by_tag_name
              # The node itself is removed from its own results unless the
              # previous step was a combinator (last is set).
              ret += [*node.get_elements_by_tag_name(m[2])] - [*(node unless last)]
            end
          end
        end
        nodes = ret
      end
      last = nil
    end

    hist << expr
    # Stop when the expression did not change since the previous pass.
    break if hist[-1] == hist[-2]
    # Presumably applies leading filters and returns what is left of expr;
    # Elements.filter is defined elsewhere -- verify there.
    nodes, expr = Elements.filter(nodes, expr)
  end
  nodes = done + nodes.flatten.uniq
  if blk
    nodes.each(&blk)
    self
  else
    Elements[*nodes]
  end
end
320
+ alias_method :/, :search
321
+
322
# Find the first matching node for the CSS or XPath
# +expr+ string: the first entry of <tt>search(expr)</tt>.
def at(expr)
  matches = search(expr)
  matches.first
end
327
+ alias_method :%, :at
328
+
329
+ # +traverse_element+ traverses elements in the tree.
330
+ # It yields elements in depth first order.
331
+ #
332
+ # If _names_ are empty, it yields all elements.
333
+ # If non-empty _names_ are given, it should be list of universal names.
334
+ #
335
+ # A nested element is yielded in depth first order as follows.
336
+ #
337
+ # t = Hpricot('<a id=0><b><a id=1 /></b><c id=2 /></a>')
338
+ # t.traverse_element("a", "c") {|e| p e}
339
+ # # =>
340
+ # {elem <a id="0"> {elem <b> {emptyelem <a id="1">} </b>} {emptyelem <c id="2">} </a>}
341
+ # {emptyelem <a id="1">}
342
+ # {emptyelem <c id="2">}
343
+ #
344
+ # Universal names are specified as follows.
345
+ #
346
+ # t = Hpricot(<<'End')
347
+ # <html>
348
+ # <meta name="robots" content="index,nofollow">
349
+ # <meta name="author" content="Who am I?">
350
+ # </html>
351
+ # End
352
+ # t.traverse_element("{http://www.w3.org/1999/xhtml}meta") {|e| p e}
353
+ # # =>
354
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="robots" content="index,nofollow">}
355
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="author" content="Who am I?">}
356
+ #
357
# With no +names+, delegates to +traverse_all_element+; otherwise builds
# a lookup Hash from +names+ and delegates to +traverse_some_element+.
# Always returns nil.
def traverse_element(*names, &block) # :yields: element
  if names.empty?
    traverse_all_element(&block)
  else
    wanted = {}
    names.each do |name|
      wanted[name] = true
    end
    traverse_some_element(wanted, &block)
  end
  nil
end
367
+
368
# Find children whose +pathname+ equals +tag_name+.
#
#   ele.children_of_type('p')
#     #=> [...array of paragraphs...]
#
# Children without a +pathname+ method are skipped. Returns nil when
# this object has no +children+ method.
def children_of_type(tag_name)
  return unless respond_to? :children
  children.select do |child|
    child.respond_to?(:pathname) && child.pathname == tag_name
  end
end
380
+
381
+ end
382
+
383
+ module Container::Trav
384
# Return all children of this node which can contain other
# nodes, i.e. those that are a Container::Trav. This is a good way to
# get all HTML elements which aren't text, comment, doctype or
# processing instruction nodes.
def containers
  children.select { |child| Container::Trav === child }
end
390
+
391
# Returns the container node neighboring this node to the south: just below it.
# By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
# See Hpricot::Traverse#next_node if you need to hunt out all kinds of nodes.
#
# Returns nil when this node has no parent, is not among the parent's
# containers, or is the last container.
def next_sibling
  # Check the parent before touching it; the old code dereferenced it first.
  return unless parent
  sib = parent.containers
  pos = sib.index(self)
  sib[pos + 1] if pos
end
398
+
399
# Returns the container node neighboring this node to the north: just above it.
# By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
# See Hpricot::Traverse#previous_node if you need to hunt out all kinds of nodes.
#
# Returns nil when this node has no parent, is not among the parent's
# containers, or is the first container.
def previous_sibling
  # Check the parent before touching it; the old code dereferenced it first.
  return unless parent
  sib = parent.containers
  pos = sib.index(self)
  sib[pos - 1] if pos and pos > 0
end
407
+
408
# Find all preceding sibling elements, as Elements. Like the other
# "sibling" methods, this works on <tt>parent.containers</tt> and so
# weeds out text and comment nodes.
def preceding_siblings()
  siblings = parent.containers
  here = siblings.index(self)
  Elements[*siblings[0...here]]
end
415
+
416
# Find sibling elements which follow the current one, as Elements. Like
# the other "sibling" methods, this works on <tt>parent.containers</tt>
# and so weeds out text and comment nodes.
def following_siblings()
  siblings = parent.containers
  start = siblings.index(self) + 1
  Elements[*siblings.drop(start)]
end
423
+
424
# Puts together an array of neighboring sibling elements based on their proximity
# to this element.
#
# This method accepts ranges and sets of numbers.
#
#   ele.siblings_at(-3..-1, 1..3) # gets three elements before and three after
#   ele.siblings_at(1, 5, 7) # gets three elements at offsets below the current element
#   ele.siblings_at(0, 5..6) # the current element and two others
#
# Like the other "sibling" methods, this doesn't find text and comment nodes.
# Use nodes_at to include those nodes.
def siblings_at(*pos)
  siblings = parent.containers
  here = siblings.index(self)
  picked = []
  siblings.each_with_index do |candidate, at|
    offset = at - here
    # === lets plain integers and ranges both act as matchers.
    picked << candidate if pos.any? { |matcher| matcher === offset }
  end
  Elements[*picked]
end
448
+
449
# Replace +old+, a child of the current node, with +new+ (a node or a
# list of nodes). The replacement is reparented to this node first.
def replace_child(old, new)
  reparent new
  slot = children.index(old)
  children[slot, 1] = [*new]
end
454
+
455
# Insert +nodes+, an array of HTML elements or a single element,
# before the node +ele+, a child of the current node.
# An Array is inserted one element at a time; when +ele+ is not found
# the node goes to the front of +children+.
def insert_before(nodes, ele)
  if Array === nodes
    nodes.each { |node| insert_before(node, ele) }
  else
    reparent nodes
    slot = children.index(ele) || 0
    children[slot, 0] = nodes
  end
end
466
+
467
# Insert +nodes+, an array of HTML elements or a single element,
# after the node +ele+, a child of the current node.
# An Array is inserted back to front so the original order is kept;
# when +ele+ is not found the node is appended to +children+.
def insert_after(nodes, ele)
  if Array === nodes
    nodes.reverse_each { |node| insert_after(node, ele) }
  else
    reparent nodes
    found = children.index(ele)
    slot = found ? found + 1 : children.length
    children[slot, 0] = nodes
  end
end
479
+
480
# +each_child+ passes every entry of +children+ to the block, in order.
# Always returns nil.
def each_child(&block) # :yields: child_node
  kids = children
  kids.each(&block)
  nil
end
485
+
486
# +each_child_with_index+ passes every entry of +children+ and its
# position to the block. Always returns nil.
def each_child_with_index(&block) # :yields: child_node, index
  kids = children
  kids.each_with_index(&block)
  nil
end
491
+
492
# +find_element+ returns the first element that +traverse_element+
# yields for the given universal +names+.
# It returns nil if not found.
def find_element(*names)
  traverse_element(*names) do |found|
    # Leave the method as soon as the first match shows up.
    return found
  end
  nil
end
499
+
500
# Returns a list of CSS classes to which this element belongs: the
# whitespace-separated words of its +class+ attribute (empty when the
# attribute is missing).
def classes
  raw = get_attribute('class').to_s
  raw.strip.split(/\s+/)
end
504
+
505
# Returns the first element yielded by +traverse_all_element+ whose
# +id+ attribute, as a string, equals +id+; nil when there is none.
def get_element_by_id(id)
  traverse_all_element do |ele|
    next unless ele.elem?
    eid = ele.get_attribute('id')
    return ele if eid && eid.to_s == id
  end
  nil
end
513
+
514
# Collects, as Elements, every element +traverse_element+ yields for
# the given tag names. Each name is looked up both as given and with
# the XHTML namespace prefix.
def get_elements_by_tag_name(*a)
  lookups = a.map { |tag| [tag, "{http://www.w3.org/1999/xhtml}#{tag}"] }.flatten
  found = Elements[]
  traverse_element(*lookups) { |e| found << e }
  found
end
521
+
522
# Visits the XHTML-namespaced elements listed below and, for each of
# their link-bearing attributes that is present, yields the element,
# the attribute name and the attribute value.
def each_hyperlink_attribute
  traverse_element(
      '{http://www.w3.org/1999/xhtml}a',
      '{http://www.w3.org/1999/xhtml}area',
      '{http://www.w3.org/1999/xhtml}link',
      '{http://www.w3.org/1999/xhtml}img',
      '{http://www.w3.org/1999/xhtml}object',
      '{http://www.w3.org/1999/xhtml}q',
      '{http://www.w3.org/1999/xhtml}blockquote',
      '{http://www.w3.org/1999/xhtml}ins',
      '{http://www.w3.org/1999/xhtml}del',
      '{http://www.w3.org/1999/xhtml}form',
      '{http://www.w3.org/1999/xhtml}input',
      '{http://www.w3.org/1999/xhtml}head',
      '{http://www.w3.org/1999/xhtml}base',
      '{http://www.w3.org/1999/xhtml}script') do |elem|
    # Pick the attribute names to inspect from the element's name.
    attrs =
      case elem.name
      when %r{\{http://www.w3.org/1999/xhtml\}(?:base|a|area|link)\z}i
        ['href']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:img)\z}i
        ['src', 'longdesc', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:object)\z}i
        ['classid', 'codebase', 'data', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:q|blockquote|ins|del)\z}i
        ['cite']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:form)\z}i
        ['action']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:input)\z}i
        ['src', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:head)\z}i
        ['profile']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:script)\z}i
        ['src', 'for']
      end
    attrs.each do |attr|
      hyperlink = elem.get_attribute(attr)
      yield elem, attr, hyperlink if hyperlink
    end
  end
end
563
+ private :each_hyperlink_attribute
564
+
565
# +each_hyperlink_uri+ traverses hyperlinks such as HTML href attribute
# of A element.
#
# It yields the hyperlink value and a URI for each hyperlink.
#
# The URI objects are created with a base URI which is given by
# HTML BASE element or the argument ((|base_uri|)); a BASE element
# found in the document replaces the argument.
# +each_hyperlink_uri+ doesn't yield the href of the BASE element.
def each_hyperlink_uri(base_uri=nil) # :yields: hyperlink, uri
  base_uri = URI.parse(base_uri) if String === base_uri
  pending = []
  each_hyperlink_attribute do |elem, _attr, hyperlink|
    if %r{\{http://www.w3.org/1999/xhtml\}(?:base)\z}i =~ elem.name
      base_uri = URI.parse(hyperlink.to_s)
    else
      pending << hyperlink
    end
  end
  # Links are resolved only after the whole document was scanned, so a
  # BASE element applies to links that precede it as well.
  pending.each do |hyperlink|
    target = hyperlink.to_s
    yield hyperlink, (base_uri ? base_uri + target : URI.parse(target))
  end
end
589
+
590
# +each_hyperlink+ traverses hyperlinks such as HTML href attribute
# of A element.
#
# It yields the attribute value of every hyperlink reported by
# +each_hyperlink_attribute+ (documented upstream as Hpricot::Text).
#
# Note that +each_hyperlink+ yields HTML href attribute of BASE element.
def each_hyperlink # :yields: text
  # Element and attribute name are not needed here, only the value.
  each_hyperlink_attribute do |_elem, _attr, hyperlink|
    yield hyperlink
  end
end
602
+
603
# +each_uri+ traverses hyperlinks such as HTML href attribute
# of A element.
#
# It yields only the URI of each pair produced by +each_hyperlink_uri+.
#
# The URI objects are created with a base URI which is given by
# HTML BASE element or the argument ((|base_uri|)).
def each_uri(base_uri=nil) # :yields: URI
  each_hyperlink_uri(base_uri) do |_hyperlink, uri|
    yield uri
  end
end
613
+ end
614
+
615
+ # :stopdoc:
616
module Doc::Trav
  # Walks every child's subtree; the document itself is not yielded.
  def traverse_all_element(&block)
    children.each do |child|
      child.traverse_all_element(&block)
    end
  end

  # The document's own XPath is the root path.
  def xpath
    "/"
  end

  # The document has no CSS path of its own.
  def css_path
    nil
  end
end
627
+
628
module Elem::Trav
  # Yields this element first, then walks every child's subtree.
  def traverse_all_element(&block)
    yield self
    children.each do |child|
      child.traverse_all_element(&block)
    end
  end
end
634
+
635
module Leaf::Trav
  # A leaf has no descendants: it just yields itself.
  def traverse_all_element
    yield(self)
  end
end
640
+
641
module Doc::Trav
  # Forwards the name lookup to every child; the document itself is
  # never yielded.
  def traverse_some_element(name_set, &block)
    children.each do |child|
      child.traverse_some_element(name_set, &block)
    end
  end
end
646
+
647
module Elem::Trav
  # Yields this element when its name is in +name_set+, then forwards
  # the lookup to every child.
  def traverse_some_element(name_set, &block)
    if name_set.include? self.name
      yield self
    end
    children.each do |child|
      child.traverse_some_element(name_set, &block)
    end
  end
end
653
+
654
module Leaf::Trav
  # Leaves never match a name lookup: nothing is yielded, nil is returned.
  def traverse_some_element(name_set)
    nil
  end
end
658
+ # :startdoc:
659
+
660
module Traverse
  # +traverse_text+ traverses texts in the tree by delegating to
  # +traverse_text_internal+. Always returns nil.
  def traverse_text(&block) # :yields: text
    traverse_text_internal(&block)
    return nil
  end
end
667
+
668
+ # :stopdoc:
669
module Container::Trav
  # Forwards the text walk to every child.
  def traverse_text_internal(&block)
    each_child do |child|
      child.traverse_text_internal(&block)
    end
  end
end
674
+
675
module Leaf::Trav
  # Default for leaves: nothing is yielded, nil is returned.
  def traverse_text_internal
    nil
  end
end
679
+
680
module Text::Trav
  # A text node yields itself.
  def traverse_text_internal
    yield(self)
  end
end
685
+ # :startdoc:
686
+
687
module Container::Trav
  # +filter+ rebuilds the tree without some components.
  #
  #   node.filter {|descendant_node| predicate } -> node
  #   loc.filter {|descendant_loc| predicate } -> node
  #
  # +filter+ yields each node except the top node.
  # If the given block returns false, the corresponding node is dropped.
  # If the given block returns true, the corresponding node is retained and,
  # when it is an element, its inner nodes are examined.
  #
  # +filter+ returns a node.
  # It doesn't return a location object even if self is a location object.
  #
  def filter(&block)
    replacements = {}
    each_child_with_index do |descendant, position|
      replacements[position] =
        if !(yield descendant)
          nil
        elsif descendant.elem?
          descendant.filter(&block)
        else
          descendant
        end
    end
    to_node.subst_subnode(replacements)
  end
end
717
+
718
module Doc::Trav
  # +title+ searches title and returns it as a text.
  # It returns nil if not found.
  #
  # +title+ searches the following information.
  #
  # - <title>...</title> in HTML
  # - <title>...</title> in RSS
  #
  # NOTE(review): +extract_text+ is not defined in this file -- confirm the
  # found element responds to it.
  def title
    e = find_element('title',
      '{http://www.w3.org/1999/xhtml}title',
      '{http://purl.org/rss/1.0/}title',
      '{http://my.netscape.com/rdf/simple/0.9/}title')
    e && e.extract_text
  end

  # +author+ searches author and returns it as a text.
  # It returns nil if not found.
  #
  # +author+ searches the following information, in this order, and returns
  # the first non-empty value.
  #
  # - <meta name="author" content="author-name"> in HTML
  # - <link rev="made" title="author-name"> in HTML
  # - <dc:creator>author-name</dc:creator> in RSS
  # - <dc:publisher>author-name</dc:publisher> in RSS
  #
  # NOTE(review): this method calls both +fetch_attr+ and +fetch_attribute+;
  # neither is defined in this file -- confirm both exist on elements and
  # raise IndexError for a missing attribute, as the rescue clauses assume.
  def author
    traverse_element('meta',
      '{http://www.w3.org/1999/xhtml}meta') {|e|
      begin
        next unless e.fetch_attr('name').downcase == 'author'
        author = e.fetch_attribute('content').strip
        return author if !author.empty?
      rescue IndexError
        # IndexError is swallowed so the scan moves on to the next element.
      end
    }

    traverse_element('link',
      '{http://www.w3.org/1999/xhtml}link') {|e|
      begin
        next unless e.fetch_attr('rev').downcase == 'made'
        author = e.fetch_attribute('title').strip
        return author if !author.empty?
      rescue IndexError
        # IndexError is swallowed so the scan moves on to the next element.
      end
    }

    # RSS: look inside the first channel element for creator, then publisher.
    if channel = find_element('{http://purl.org/rss/1.0/}channel')
      channel.traverse_element('{http://purl.org/dc/elements/1.1/}creator') {|e|
        begin
          author = e.extract_text.strip
          return author if !author.empty?
        rescue IndexError
        end
      }
      channel.traverse_element('{http://purl.org/dc/elements/1.1/}publisher') {|e|
        begin
          author = e.extract_text.strip
          return author if !author.empty?
        rescue IndexError
        end
      }
    end

    nil
  end

end
785
+
786
module Doc::Trav
  # Returns the single top-level element of the document.
  # Raises Hpricot::Error when the document has no element child, or
  # more than one.
  def root
    top = children.select { |child| child.elem? }
    raise Hpricot::Error, "no element" if top.empty?
    raise Hpricot::Error, "multiple top elements" if top.length > 1
    top.first
  end
end
795
+
796
module Elem::Trav
  # True when +raw_attributes+ holds an entry for +name+ (compared as a
  # string). Returns nil/false when there are no raw attributes.
  def has_attribute?(name)
    attrs = self.raw_attributes
    attrs && attrs.has_key?(name.to_s)
  end

  # Returns the stored value for +name+ passed through Hpricot.uxs, or
  # the falsy lookup result when nothing is stored.
  def get_attribute(name)
    attrs = self.raw_attributes
    raw = attrs && attrs[name.to_s]
    raw ? Hpricot.uxs(raw) : raw
  end
  alias_method :[], :get_attribute

  # Marks the element altered and stores +val+, passed through
  # Hpricot.xs, under +name+ (as a string).
  def set_attribute(name, val)
    altered!
    self.raw_attributes ||= {}
    key = name.to_s
    self.raw_attributes[key] = Hpricot.xs(val)
  end
  alias_method :[]=, :set_attribute

  # Deletes the attribute +name+ if present, marking the element altered.
  # Returns the deleted raw value, or nil when the attribute was absent.
  def remove_attribute(name)
    key = name.to_s
    return unless has_attribute? key
    altered!
    self.raw_attributes.delete(key)
  end
end
820
+
821
+ end