hpricot 0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,115 @@
1
+ require 'hpricot/modules'
2
+ require 'hpricot/raw_string'
3
+ require 'hpricot/htmlinfo'
4
+ require 'hpricot/encoder'
5
+ require 'hpricot/fstr'
6
+ require 'iconv'
7
+
8
+ module Hpricot
9
+ class Text
10
+ # :stopdoc:
11
+ class << self
12
+ alias new_internal new
13
+ end
14
+ # :startdoc:
15
+
16
# Builds a Text from +arg+.
#
# - A Hpricot::Location is first resolved with +to_node+.
# - A Text is copied through its +rcdata+ and +normalized_rcdata+.
# - A String has every "&" escaped as "&amp;"; the escaped copy is
#   frozen and used only when escaping actually changed something.
#
# Raises TypeError for any other kind of argument.
def Text.new(arg)
  arg = arg.to_node if Hpricot::Location === arg
  case arg
  when Text
    new_internal arg.rcdata, arg.normalized_rcdata
  when String
    escaped = arg.gsub(/&/, '&amp;')
    new_internal(arg == escaped ? arg : escaped.freeze)
  else
    raise TypeError, "cannot initialize Text with #{arg.inspect}"
  end
end
28
+
29
+ def initialize(rcdata, normalized_rcdata=internal_normalize(rcdata)) # :notnew:
30
+ init_raw_string
31
+ @rcdata = rcdata && Hpricot.frozen_string(rcdata)
32
+ @normalized_rcdata = @rcdata == normalized_rcdata ? @rcdata : normalized_rcdata
33
+ end
34
+ attr_reader :rcdata, :normalized_rcdata
35
+
36
+ def internal_normalize(rcdata)
37
+ # - character references are decoded as much as possible.
38
+ # - undecodable character references are converted to decimal numeric character references.
39
+ result = rcdata.gsub(/&(?:#([0-9]+)|#x([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));/o) {|s|
40
+ u = nil
41
+ if $1
42
+ u = $1.to_i
43
+ elsif $2
44
+ u = $2.hex
45
+ elsif $3
46
+ u = NamedCharacters[$3]
47
+ end
48
+ if !u || u < 0 || 0x7fffffff < u
49
+ '?'
50
+ elsif u == 38 # '&' character.
51
+ '&#38;'
52
+ elsif u <= 0x7f
53
+ [u].pack("C")
54
+ else
55
+ begin
56
+ Iconv.conv(Encoder.internal_charset, 'UTF-8', [u].pack("U"))
57
+ rescue Iconv::Failure
58
+ "&##{u};"
59
+ end
60
+ end
61
+ }
62
+ Hpricot.frozen_string(result)
63
+ end
64
+ private :internal_normalize
65
+
66
# Hpricot::Text#to_s converts the text to a string.
# It rewrites the decimal character references ("&#NNN;") found in the
# normalized rcdata:
# - code points 0..0x7f become the corresponding single byte;
# - every other decimal reference becomes a `?' character.
def to_s
  @normalized_rcdata.gsub(/&(?:#([0-9]+));/o) do
    code = $1.to_i
    code.between?(0, 0x7f) ? [code].pack("C") : '?'
  end
end
79
+
80
+ def empty?
81
+ @normalized_rcdata.empty?
82
+ end
83
+
84
# Returns a Text without leading and trailing whitespace, where a
# literal "&nbsp;" also counts as whitespace.
# Returns +self+ when there is nothing to remove; otherwise a new Text
# is built from the (frozen) trimmed string.
def strip
  trimmed = @normalized_rcdata.sub(/\A(?:\s|&nbsp;)+/, '').sub(/(?:\s|&nbsp;)+\z/, '')
  return self if trimmed == @normalized_rcdata
  trimmed.freeze
  Text.new_internal(trimmed, trimmed)
end
95
+
96
# Hpricot::Text.concat returns a text which is the concatenation of its
# arguments.
#
# Each argument should be one of the following.
# - String (every "&" in it is escaped as "&amp;")
# - Hpricot::Text (its rcdata is appended as is)
# - Hpricot::Location which points to a Hpricot::Text
def Text.concat(*args)
  joined = args.inject('') do |buffer, arg|
    arg = arg.to_node if Hpricot::Location === arg
    buffer << (Text === arg ? arg.rcdata : arg.gsub(/&/, '&amp;'))
  end
  new_internal joined
end
114
+ end
115
+ end
@@ -0,0 +1,511 @@
1
+ require 'hpricot/elements'
2
+ require 'uri'
3
+
4
+ module Hpricot
5
+ module Traverse
6
+ def doc?() Doc::Trav === self end
7
+ def elem?() Elem::Trav === self end
8
+ def text?() Text::Trav === self end
9
+ def xmldecl?() XMLDecl::Trav === self end
10
+ def doctype?() DocType::Trav === self end
11
+ def procins?() ProcIns::Trav === self end
12
+ def comment?() Comment::Trav === self end
13
+ def bogusetag?() BogusETag::Trav === self end
14
+
15
+ def to_html
16
+ output("")
17
+ end
18
+ alias_method :to_s, :to_html
19
+
20
# Walks down from this node, calling +get_subnode_internal+ with each
# index in turn on the node reached so far, and returns the final node.
# Without indexes it returns +self+.
def get_subnode(*indexes)
  indexes.inject(self) { |node, index| node.get_subnode_internal(index) }
end
27
+ end
28
+
29
+ module Container::Trav
30
+ def containers
31
+ children.grep(Container::Trav)
32
+ end
33
# Replaces the child +old+ with +new+, which may be a single node or an
# Array of nodes (it is splatted into the child list).
def replace_child(old, new)
  position = children.index(old)
  replacement = [*new]
  children[position, 1] = replacement
end
36
# Inserts +nodes+ (a single node or an Array of nodes, possibly nested)
# immediately before the child +ele+, keeping the given order.
# When +ele+ is not a child, the nodes are placed at the front of the
# child list. Returns +nodes+.
def insert_before(nodes, ele)
  incoming = Array === nodes ? nodes.flatten : [nodes]
  # Splice everything in one step: inserting one node at a time at
  # index 0 (the fallback for a missing +ele+) would reverse the order.
  children[children.index(ele) || 0, 0] = incoming
  nodes
end
44
# Inserts +nodes+ (a single node or an Array of nodes, possibly nested)
# immediately after the child +ele+, keeping the given order.
# When +ele+ is not a child, the nodes are appended to the child list.
# Returns +nodes+.
def insert_after(nodes, ele)
  incoming = Array === nodes ? nodes.flatten : [nodes]
  idx = children.index(ele)
  # Splice everything in one step: inserting one node at a time right
  # after +ele+ would leave the nodes in reverse order.
  children[idx ? idx + 1 : children.length, 0] = incoming
  nodes
end
53
+ def inner_html
54
+ children.map { |x| x.output("") }.join
55
+ end
56
+ alias_method :innerHTML, :inner_html
57
+ def inner_html=(inner)
58
+ case inner
59
+ when String, IO
60
+ self.children = Hpricot.parse(inner).children
61
+ when Array
62
+ self.children = inner
63
+ when nil
64
+ self.children = []
65
+ end
66
+ end
67
+ alias_method :innerHTML=, :inner_html=
68
# Evaluates the selector expression +expr+ against this node.
#
# +expr+ is converted with +to_s+ and consumed from the left, one step
# per loop iteration:
# - a leading "//" is dropped;
# - ".." (optionally preceded by "/") replaces every node by its parent;
# - ">" or "/" replaces the nodes by their child containers;
# - "+" replaces every node by its next sibling container;
# - "~" replaces every node by all of its following sibling containers;
# - "|" or "," stores the nodes found so far and restarts from +self+;
# - "#id" looks the id up with +get_element_by_id+;
# - a tag name collects the matching elements below every node, while
#   "*", a missing tag name or a leading ".class" collects all of them.
# After each step the rest of the expression goes to Elements.filter.
#
# With a block, every match is yielded and +self+ is returned; without
# one, the matches are returned as an Elements.
#
# Raises Hpricot::Error when neither the step nor Elements.filter can
# consume anything from the expression.
def search(expr, &blk)
  nodes = [self]
  done = []
  expr = expr.to_s
  until expr.empty?
    expr = clean_path(expr)
    expr.gsub!(%r!^//!, '')
    # Length of expr when the step below leaves all the consuming to
    # Elements.filter; nil when the step itself consumed something.
    stalled_at = nil

    case expr
    when %r!^/?\.\.!
      expr = $'
      nodes.map! { |node| node.parent }
    when %r!^[>/]!
      expr = $'
      nodes = Elements[*nodes.map { |node| node.containers }.flatten]
    when %r!^\+!
      expr = $'
      nodes.map! do |node|
        siblings = node.parent.containers
        siblings[siblings.index(node)+1]
      end
      nodes.compact!
    when %r!^~!
      expr = $'
      nodes.map! do |node|
        siblings = node.parent.containers
        siblings[(siblings.index(node)+1)..-1]
      end
      nodes.flatten!
    when %r!^[|,]!
      expr = " #$'"
      nodes.shift if nodes.first == self
      done += nodes
      nodes = [self]
    else
      # to_a: the captures are adjusted below and MatchData has no []=.
      _, lead, tag = expr.match(%r!^([#.]?)([a-z0-9\\*_-]*)!i).to_a
      rest = $'
      if lead == '#'
        expr = rest
        oid = get_element_by_id(tag)
        nodes = oid ? [oid] : []
      else
        if lead == '.'
          # Leave ".class" in expr so that Elements.filter applies it.
          # NOTE(review): presumably Elements.filter understands ".class",
          # since it already receives it after a tag name ("p.posted") -
          # confirm against Elements.filter.
          tag = '*'
          stalled_at = expr.length
        else
          expr = rest
          if tag == ''
            tag = '*'
            stalled_at = expr.length
          end
        end
        ret = []
        nodes.each do |node|
          case tag
          when '*'
            # wildcard: every element reported by traverse_element
            node.traverse_element { |n| ret << n }
          else
            ret += [*node.get_elements_by_tag_name(tag)]
          end
        end
        nodes = ret
      end
    end

    nodes, expr = Elements.filter(nodes, expr)
    # Without this guard an unparsable expression would loop forever.
    if stalled_at && !expr.empty? && expr.length >= stalled_at
      raise Hpricot::Error, "cannot parse selector: #{expr.inspect}"
    end
  end
  nodes = done + nodes.flatten.uniq
  if blk
    nodes.each(&blk)
    self
  else
    Elements[*nodes]
  end
end
133
+ alias_method :/, :search
134
+
135
+ def at(expr, &blk)
136
+ search(expr, &blk).first
137
+ end
138
+ alias_method :%, :at
139
+
140
+ def clean_path(path)
141
+ path.gsub(/^\s+|\s+$/, '')
142
+ end
143
+
144
+ # +each_child+ iterates over each child.
145
+ def each_child(&block) # :yields: child_node
146
+ children.each(&block)
147
+ nil
148
+ end
149
+
150
+ # +each_child_with_index+ iterates over each child.
151
+ def each_child_with_index(&block) # :yields: child_node, index
152
+ children.each_with_index(&block)
153
+ nil
154
+ end
155
+
156
+ # +find_element+ searches for an element whose universal name is specified by
157
+ # the arguments.
158
+ # It returns nil if not found.
159
+ def find_element(*names)
160
+ traverse_element(*names) {|e| return e }
161
+ nil
162
+ end
163
+
164
+ # +traverse_element+ traverses elements in the tree.
165
+ # It yields elements in depth first order.
166
+ #
167
+ # If _names_ are empty, it yields all elements.
168
+ # If non-empty _names_ are given, it should be list of universal names.
169
+ #
170
+ # A nested element is yielded in depth first order as follows.
171
+ #
172
+ # t = Hpricot('<a id=0><b><a id=1 /></b><c id=2 /></a>')
173
+ # t.traverse_element("a", "c") {|e| p e}
174
+ # # =>
175
+ # {elem <a id="0"> {elem <b> {emptyelem <a id="1">} </b>} {emptyelem <c id="2">} </a>}
176
+ # {emptyelem <a id="1">}
177
+ # {emptyelem <c id="2">}
178
+ #
179
+ # Universal names are specified as follows.
180
+ #
181
+ # t = Hpricot(<<'End')
182
+ # <html>
183
+ # <meta name="robots" content="index,nofollow">
184
+ # <meta name="author" content="Who am I?">
185
+ # </html>
186
+ # End
187
+ # t.traverse_element("{http://www.w3.org/1999/xhtml}meta") {|e| p e}
188
+ # # =>
189
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="robots" content="index,nofollow">}
190
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="author" content="Who am I?">}
191
+ #
192
def traverse_element(*names, &block) # :yields: element
  # No names given: visit every element.
  if names.empty?
    traverse_all_element(&block)
    return nil
  end
  # Otherwise build a lookup table keyed by the requested names.
  wanted = names.inject({}) { |table, n| table[n] = true; table }
  traverse_some_element(wanted, &block)
  nil
end
202
+
203
# Returns the whitespace-separated entries of the 'class' attribute as
# an Array of strings (empty when the attribute is missing or blank).
def classes
  raw = get_attribute('class').to_s
  raw.strip.split(/\s+/)
end
206
+
207
# Returns the first element yielded by +traverse_all_element+ whose
# 'id' attribute, converted with +to_s+, equals +id+; nil if none does.
def get_element_by_id(id)
  traverse_all_element do |ele|
    eid = ele.get_attribute('id')
    return ele if eid && eid.to_s == id
  end
  nil
end
215
+
216
# Collects, as an Elements, every element yielded by +traverse_element+
# for the given tag names. Each tag is looked up both as given and
# qualified with the XHTML namespace.
def get_elements_by_tag_name(*a)
  names = a.map { |tag| [tag, "{http://www.w3.org/1999/xhtml}#{tag}"] }.flatten
  found = Elements[]
  traverse_element(*names) { |e| found << e }
  found
end
223
+
224
+ def each_hyperlink_attribute
225
+ traverse_element(
226
+ '{http://www.w3.org/1999/xhtml}a',
227
+ '{http://www.w3.org/1999/xhtml}area',
228
+ '{http://www.w3.org/1999/xhtml}link',
229
+ '{http://www.w3.org/1999/xhtml}img',
230
+ '{http://www.w3.org/1999/xhtml}object',
231
+ '{http://www.w3.org/1999/xhtml}q',
232
+ '{http://www.w3.org/1999/xhtml}blockquote',
233
+ '{http://www.w3.org/1999/xhtml}ins',
234
+ '{http://www.w3.org/1999/xhtml}del',
235
+ '{http://www.w3.org/1999/xhtml}form',
236
+ '{http://www.w3.org/1999/xhtml}input',
237
+ '{http://www.w3.org/1999/xhtml}head',
238
+ '{http://www.w3.org/1999/xhtml}base',
239
+ '{http://www.w3.org/1999/xhtml}script') {|elem|
240
+ case elem.name
241
+ when %r{\{http://www.w3.org/1999/xhtml\}(?:base|a|area|link)\z}i
242
+ attrs = ['href']
243
+ when %r{\{http://www.w3.org/1999/xhtml\}(?:img)\z}i
244
+ attrs = ['src', 'longdesc', 'usemap']
245
+ when %r{\{http://www.w3.org/1999/xhtml\}(?:object)\z}i
246
+ attrs = ['classid', 'codebase', 'data', 'usemap']
247
+ when %r{\{http://www.w3.org/1999/xhtml\}(?:q|blockquote|ins|del)\z}i
248
+ attrs = ['cite']
249
+ when %r{\{http://www.w3.org/1999/xhtml\}(?:form)\z}i
250
+ attrs = ['action']
251
+ when %r{\{http://www.w3.org/1999/xhtml\}(?:input)\z}i
252
+ attrs = ['src', 'usemap']
253
+ when %r{\{http://www.w3.org/1999/xhtml\}(?:head)\z}i
254
+ attrs = ['profile']
255
+ when %r{\{http://www.w3.org/1999/xhtml\}(?:script)\z}i
256
+ attrs = ['src', 'for']
257
+ end
258
+ attrs.each {|attr|
259
+ if hyperlink = elem.get_attribute(attr)
260
+ yield elem, attr, hyperlink
261
+ end
262
+ }
263
+ }
264
+ end
265
+ private :each_hyperlink_attribute
266
+
267
# +each_hyperlink_uri+ traverses hyperlinks such as the HTML href
# attribute of an A element.
#
# It yields each hyperlink value together with a URI built from it.
#
# The URIs are resolved against a base URI, which is taken from the
# HTML BASE element or from the argument +base_uri+ (a String is
# parsed first). Without any base URI the hyperlink is parsed as is.
# +each_hyperlink_uri+ doesn't yield the href of the BASE element.
def each_hyperlink_uri(base_uri=nil) # :yields: hyperlink, uri
  base_uri = URI.parse(base_uri) if String === base_uri
  pending = []
  each_hyperlink_attribute do |elem, _attr, hyperlink|
    if %r{\{http://www.w3.org/1999/xhtml\}(?:base)\z}i =~ elem.name
      # A BASE element replaces the base URI and is not reported itself.
      base_uri = URI.parse(hyperlink.to_s)
      next
    end
    pending << hyperlink
  end
  # Links are yielded only after the whole traversal, so a BASE element
  # also applies to the hyperlinks that were collected before it.
  pending.each do |hyperlink|
    target = hyperlink.to_s
    yield hyperlink, (base_uri ? base_uri + target : URI.parse(target))
  end
end
291
+
292
# +each_hyperlink+ traverses hyperlinks such as the HTML href attribute
# of an A element.
#
# It yields the value of every hyperlink attribute reported by
# +each_hyperlink_attribute+ (Hpricot::Text or Hpricot::Loc).
#
# Note that +each_hyperlink+ also yields the href attribute of the
# BASE element.
def each_hyperlink # :yields: text
  each_hyperlink_attribute {|elem, attr, hyperlink|
    yield hyperlink
  }
end
304
+
305
+ # +each_uri+ traverses hyperlinks such as HTML href attribute
306
+ # of A element.
307
+ #
308
+ # It yields URI for each hyperlink.
309
+ #
310
+ # The URI objects are created with a base URI which is given by
311
+ # HTML BASE element or the argument ((|base_uri|)).
312
+ def each_uri(base_uri=nil) # :yields: URI
313
+ each_hyperlink_uri(base_uri) {|hyperlink, uri| yield uri }
314
+ end
315
+ end
316
+
317
+ # :stopdoc:
318
+ module Doc::Trav
319
+ def traverse_all_element(&block)
320
+ children.each {|c| c.traverse_all_element(&block) }
321
+ end
322
+ end
323
+
324
+ module Elem::Trav
325
+ def traverse_all_element(&block)
326
+ yield self
327
+ children.each {|c| c.traverse_all_element(&block) }
328
+ end
329
+ end
330
+
331
+ module Leaf::Trav
332
+ def traverse_all_element
333
+ end
334
+ end
335
+
336
+ module Doc::Trav
337
+ def traverse_some_element(name_set, &block)
338
+ children.each {|c| c.traverse_some_element(name_set, &block) }
339
+ end
340
+ end
341
+
342
+ module Elem::Trav
343
+ def traverse_some_element(name_set, &block)
344
+ yield self if name_set.include? self.name
345
+ children.each {|c| c.traverse_some_element(name_set, &block) }
346
+ end
347
+ end
348
+
349
+ module Leaf::Trav
350
+ def traverse_some_element(name_set)
351
+ end
352
+ end
353
+ # :startdoc:
354
+
355
+ module Traverse
356
+ # +traverse_text+ traverses texts in the tree
357
+ def traverse_text(&block) # :yields: text
358
+ traverse_text_internal(&block)
359
+ nil
360
+ end
361
+ end
362
+
363
+ # :stopdoc:
364
+ module Container::Trav
365
+ def traverse_text_internal(&block)
366
+ each_child {|c| c.traverse_text_internal(&block) }
367
+ end
368
+ end
369
+
370
+ module Leaf::Trav
371
+ def traverse_text_internal
372
+ end
373
+ end
374
+
375
+ module Text::Trav
376
+ def traverse_text_internal
377
+ yield self
378
+ end
379
+ end
380
+ # :startdoc:
381
+
382
+ module Container::Trav
383
# +filter+ rebuilds the tree without some components.
#
#   node.filter {|descendant_node| predicate } -> node
#   loc.filter {|descendant_loc| predicate } -> node
#
# +filter+ yields each node except the top node.
# If the block returns false, the corresponding node is dropped.
# If the block returns true, the corresponding node is retained and,
# when it is an element, its inner nodes are examined in turn.
#
# +filter+ returns a node (the result of +subst_subnode+ on +to_node+).
# It doesn't return a location object even if self is a location object.
def filter(&block)
  subst = {}
  each_child_with_index do |descendant, i|
    subst[i] =
      if !yield(descendant)
        nil
      elsif descendant.elem?
        descendant.filter(&block)
      else
        descendant
      end
  end
  to_node.subst_subnode(subst)
end
411
+ end
412
+
413
+ module Doc::Trav
414
+ # +title+ searches for the title and returns it as a text.
415
+ # It returns nil if not found.
416
+ #
417
+ # +title+ searches the following information.
418
+ #
419
+ # - <title>...</title> in HTML
420
+ # - <title>...</title> in RSS
421
+ def title
422
+ e = find_element('title',
423
+ '{http://www.w3.org/1999/xhtml}title',
424
+ '{http://purl.org/rss/1.0/}title',
425
+ '{http://my.netscape.com/rdf/simple/0.9/}title')
426
+ e && e.extract_text
427
+ end
428
+
429
+ # +author+ searches for the author and returns it as a text.
430
+ # It returns nil if not found.
431
+ #
432
+ # +author+ searches the following information.
433
+ #
434
+ # - <meta name="author" content="author-name"> in HTML
435
+ # - <link rev="made" title="author-name"> in HTML
436
+ # - <dc:creator>author-name</dc:creator> in RSS
437
+ # - <dc:publisher>author-name</dc:publisher> in RSS
438
+ def author
439
+ traverse_element('meta',
440
+ '{http://www.w3.org/1999/xhtml}meta') {|e|
441
+ begin
442
+ next unless e.fetch_attr('name').downcase == 'author'
443
+ author = e.fetch_attribute('content').strip
444
+ return author if !author.empty?
445
+ rescue IndexError
446
+ end
447
+ }
448
+
449
+ traverse_element('link',
450
+ '{http://www.w3.org/1999/xhtml}link') {|e|
451
+ begin
452
+ next unless e.fetch_attr('rev').downcase == 'made'
453
+ author = e.fetch_attribute('title').strip
454
+ return author if !author.empty?
455
+ rescue IndexError
456
+ end
457
+ }
458
+
459
+ if channel = find_element('{http://purl.org/rss/1.0/}channel')
460
+ channel.traverse_element('{http://purl.org/dc/elements/1.1/}creator') {|e|
461
+ begin
462
+ author = e.extract_text.strip
463
+ return author if !author.empty?
464
+ rescue IndexError
465
+ end
466
+ }
467
+ channel.traverse_element('{http://purl.org/dc/elements/1.1/}publisher') {|e|
468
+ begin
469
+ author = e.extract_text.strip
470
+ return author if !author.empty?
471
+ rescue IndexError
472
+ end
473
+ }
474
+ end
475
+
476
+ nil
477
+ end
478
+
479
+ end
480
+
481
+ module Doc::Trav
482
# Returns the single top-level element among the children.
# Raises Hpricot::Error when there is no such element or more than one.
def root
  es = children.select { |c| c.elem? }
  raise Hpricot::Error, "no element" if es.empty?
  raise Hpricot::Error, "multiple top elements" if 1 < es.length
  es[0]
end
489
+ end
490
+
491
+ module Elem::Trav
492
+ def has_attribute?(name)
493
+ self.attributes && self.attributes.has_key?(name.to_s)
494
+ end
495
+ def get_attribute(name)
496
+ self.attributes && self.attributes[name.to_s]
497
+ end
498
+ alias_method :[], :get_attribute
499
+ def set_attribute(name, val)
500
+ self.attributes ||= {}
501
+ self.attributes[name.to_s] = val
502
+ end
503
+ alias_method :[]=, :set_attribute
504
# Removes the attribute +name+ and returns its former value, or nil
# when the element has no such attribute.
def remove_attribute(name)
  if has_attribute? name
    # Attribute keys are strings (has_attribute?, get_attribute and
    # set_attribute all use name.to_s), so delete by the string key too.
    self.attributes.delete(name.to_s)
  end
end
509
+ end
510
+
511
+ end