hpricot 0.4-mswin32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,115 @@
1
+ require 'hpricot/modules'
2
+ require 'hpricot/raw_string'
3
+ require 'hpricot/htmlinfo'
4
+ require 'hpricot/encoder'
5
+ require 'hpricot/fstr'
6
+ require 'iconv'
7
+
8
+ module Hpricot
9
# A text node holding RCDATA, i.e. character data in which character
# references (such as "&amp;" or "&#38;") may appear.
#
# Two forms of the same text are kept:
# - +rcdata+ is the text as handed to the constructor;
# - +normalized_rcdata+ is +rcdata+ with character references decoded as far
#   as possible (see #internal_normalize).
class Text
  # :stopdoc:
  class << self
    # Keep the plain allocating constructor reachable, because Text.new is
    # redefined below as a type-checking front end.
    alias new_internal new
  end
  # :startdoc:

  # Builds a Text from +arg+, which may be
  # - a Hpricot::Location (its +to_node+ is used instead),
  # - another Text (its rcdata and normalized rcdata are reused), or
  # - a String (every "&" is escaped to "&amp;" first).
  #
  # Raises TypeError for any other argument.
  def Text.new(arg)
    arg = arg.to_node if Hpricot::Location === arg
    case arg
    when Text
      new_internal(arg.rcdata, arg.normalized_rcdata)
    when String
      escaped = arg.gsub(/&/, '&amp;')
      # Reuse the caller's string object when nothing had to be escaped.
      new_internal(escaped == arg ? arg : escaped.freeze)
    else
      raise TypeError, "cannot initialize Text with #{arg.inspect}"
    end
  end

  # +rcdata+ is stored through Hpricot.frozen_string (defined elsewhere;
  # presumably it returns a frozen string - TODO confirm).
  # +normalized_rcdata+ defaults to the normalized form of +rcdata+.
  def initialize(rcdata, normalized_rcdata=internal_normalize(rcdata)) # :notnew:
    init_raw_string
    @rcdata = rcdata && Hpricot.frozen_string(rcdata)
    # Share a single string object when normalization changed nothing.
    @normalized_rcdata = @rcdata == normalized_rcdata ? @rcdata : normalized_rcdata
  end
  attr_reader :rcdata, :normalized_rcdata

  # Decodes the character references found in +rcdata+:
  # - character references are decoded as much as possible;
  # - a reference to "&" itself is kept as the numeric reference "&#38;";
  # - references that cannot be resolved to a code point in 0..0x7fffffff
  #   become "?";
  # - references Iconv cannot convert to the internal charset are rewritten
  #   as decimal numeric character references.
  def internal_normalize(rcdata)
    result = rcdata.gsub(/&(?:#([0-9]+)|#x([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));/o) {
      codepoint =
        if $1
          $1.to_i                 # decimal reference
        elsif $2
          $2.hex                  # hexadecimal reference
        elsif $3
          NamedCharacters[$3]     # named reference; table is defined elsewhere
        end

      if !codepoint || codepoint < 0 || 0x7fffffff < codepoint
        '?'
      elsif codepoint == 38 # '&' character.
        '&#38;'
      elsif codepoint <= 0x7f
        [codepoint].pack("C")
      else
        begin
          Iconv.conv(Encoder.internal_charset, 'UTF-8', [codepoint].pack("U"))
        rescue Iconv::Failure
          "&##{codepoint};"
        end
      end
    }
    Hpricot.frozen_string(result)
  end
  private :internal_normalize

  # Hpricot::Text#to_s converts the text to a string.
  # - decimal references to code points 0..0x7f are decoded;
  # - every other remaining decimal reference is converted to a `?' character.
  def to_s
    @normalized_rcdata.gsub(/&(?:#([0-9]+));/o) {
      codepoint = $1.to_i
      (0 <= codepoint && codepoint <= 0x7f) ? [codepoint].pack("C") : '?'
    }
  end

  # True when the normalized text is the empty string.
  def empty?
    @normalized_rcdata.empty?
  end

  # Returns a Text without leading and trailing whitespace (a literal
  # "&nbsp;" counts as whitespace). Returns +self+ when there is nothing to
  # strip.
  def strip
    stripped = @normalized_rcdata.dup
    stripped.sub!(/\A(?:\s|&nbsp;)+/, '')
    stripped.sub!(/(?:\s|&nbsp;)+\z/, '')
    return self if stripped == @normalized_rcdata
    stripped.freeze
    Text.new_internal(stripped, stripped)
  end

  # Hpricot::Text.concat returns a text which is the concatenation of its
  # arguments.
  #
  # Each argument should be one of the following.
  # - String (or any object responding to +gsub+); "&" is escaped to "&amp;"
  # - Hpricot::Text
  # - Hpricot::Location which points to a Hpricot::Text
  #
  # Raises TypeError for any other argument, mirroring Text.new, instead of
  # failing with a NoMethodError from inside the loop.
  def Text.concat(*args)
    rcdata = ''.dup
    args.each {|arg|
      arg = arg.to_node if Hpricot::Location === arg
      if Text === arg
        rcdata << arg.rcdata
      elsif arg.respond_to?(:gsub)
        rcdata << arg.gsub(/&/, '&amp;')
      else
        raise TypeError, "cannot concatenate #{arg.inspect} into Text"
      end
    }
    new_internal rcdata
  end
end
115
+ end
@@ -0,0 +1,511 @@
1
+ require 'hpricot/elements'
2
+ require 'uri'
3
+
4
+ module Hpricot
5
module Traverse
  # Node-kind predicates. Each one reports whether the receiver is an
  # instance of the corresponding Trav module.
  def doc?
    Doc::Trav === self
  end

  def elem?
    Elem::Trav === self
  end

  def text?
    Text::Trav === self
  end

  def xmldecl?
    XMLDecl::Trav === self
  end

  def doctype?
    DocType::Trav === self
  end

  def procins?
    ProcIns::Trav === self
  end

  def comment?
    Comment::Trav === self
  end

  def bogusetag?
    BogusETag::Trav === self
  end

  # Serializes the receiver by calling +output+ with a fresh empty string.
  def to_html
    output("")
  end
  alias_method :to_s, :to_html

  # Walks down the tree: each index is passed to +get_subnode_internal+ on
  # the node reached so far. With no indexes the receiver itself is returned.
  def get_subnode(*indexes)
    indexes.inject(self) {|node, index| node.get_subnode_internal(index) }
  end
end
28
+
29
+ module Container::Trav
30
# Returns the children that are themselves containers
# (instances of Container::Trav), in document order.
def containers
  children.select {|child| Container::Trav === child }
end
33
# Replaces the child +old+ with +new+, which may be one node or a list of
# nodes that is spliced in at the same position.
# Returns the list of inserted nodes.
def replace_child(old, new)
  position = children.index(old)
  replacement = [*new]
  children[position, 1] = replacement
end
36
# Inserts +nodes+ (one node, or an Array of nodes handled one by one) in
# front of the child +ele+. When +ele+ is not a child, insertion happens at
# the start of the child list. Returns +nodes+.
def insert_before(nodes, ele)
  if Array === nodes
    nodes.each {|node| insert_before(node, ele) }
  else
    anchor = children.index(ele) || 0
    children[anchor, 0] = nodes
  end
end
44
# Inserts +nodes+ directly after the child +ele+. When +ele+ is not a child,
# the nodes are appended at the end of the child list. Returns +nodes+.
#
# +nodes+ may be a single node or an Array (possibly nested) of nodes. An
# Array is spliced in as one unit so its elements keep their order; inserting
# them one by one after +ele+ would leave them reversed.
def insert_after(nodes, ele)
  idx = children.index(ele)
  position = idx ? idx + 1 : children.length
  case nodes
  when Array
    children[position, 0] = nodes.flatten
    nodes
  else
    children[position, 0] = nodes
  end
end
53
# Returns the markup of all children, each rendered through +output+ into a
# fresh empty string, joined together.
def inner_html
  fragments = children.map {|child| child.output("") }
  fragments.join
end
56
+ alias_method :innerHTML, :inner_html
57
# Replaces the children of the receiver.
# - String or IO: parsed with Hpricot.parse; the children of the result are
#   adopted.
# - Array: used as the new child list as is.
# - nil: the child list becomes empty.
# Any other argument leaves the children untouched.
def inner_html=(inner)
  if String === inner || IO === inner
    self.children = Hpricot.parse(inner).children
  elsif Array === inner
    self.children = inner
  elsif NilClass === inner
    self.children = []
  end
end
67
+ alias_method :innerHTML=, :inner_html=
68
# Evaluates the selector expression +expr+ against the receiver.
#
# The expression is consumed from the left, one step per loop iteration:
# the step regexes below pick an axis or a tag/id/class name, then
# Elements.filter (defined elsewhere) receives the current node list and the
# remaining expression and hands back both.
#
# Returns an Elements list of the matched nodes; when a block is given, each
# matched node is yielded instead and +self+ is returned.
+ def search(expr, &blk)
69
# `last` is assigned but never read in this method.
+ last = nil
70
+ nodes = [self]
71
+ done = []
72
+ expr = expr.to_s
73
+ until expr.empty?
74
+ expr = clean_path(expr)
75
+ expr.gsub!(%r!^//!, '')
76
+
77
+ case expr
78
# ".." (optionally preceded by "/"): step to each node's parent.
+ when %r!^/?\.\.!
79
+ expr = $'
80
+ nodes.map! { |node| node.parent }
81
# ">" or "/": step to the container children of every current node.
+ when %r!^[>/]!
82
+ expr = $'
83
+ nodes = Elements[*nodes.map { |node| node.containers }.flatten]
84
# "+": step to the next sibling container; nodes without one are dropped
# by compact!.
+ when %r!^\+!
85
+ expr = $'
86
+ nodes.map! do |node|
87
+ siblings = node.parent.containers
88
+ siblings[siblings.index(node)+1]
89
+ end
90
+ nodes.compact!
91
# "~": step to all following sibling containers.
+ when %r!^~!
92
+ expr = $'
93
+ nodes.map! do |node|
94
+ siblings = node.parent.containers
95
+ siblings[(siblings.index(node)+1)..-1]
96
+ end
97
+ nodes.flatten!
98
# "|" or ",": alternative. The results so far are moved to `done` (minus a
# leading +self+) and matching restarts from +self+ on the rest of the
# expression.
+ when %r!^[|,]!
99
+ expr = " #$'"
100
+ nodes.shift if nodes.first == self
101
+ done += nodes
102
+ nodes = [self]
103
+ else
104
# Anything else: an optional "#" or "." marker followed by a name.
+ m = expr.match %r!^([#.]?)([a-z0-9\\*_-]*)!i
105
+ expr = $'
106
+ if m[1] == '#'
107
+ oid = get_element_by_id(m[2])
108
+ nodes = oid ? [oid] : []
109
+ else
110
# NOTE(review): `m` is a MatchData, which defines no `[]=`; when this
# condition is true the assignment raises NoMethodError. Confirm whether the
# match was meant to be converted with to_a first.
+ m[2] = "*" if m[2] == "" || m[1] == "."
111
+ ret = []
112
+ nodes.each do |node|
113
+ case m[2]
114
# NOTE(review): this branch is empty, so a "*" name contributes no nodes
# here - confirm this is intended.
+ when '*'
115
+ else
116
+ ret += [*node.get_elements_by_tag_name(m[2])]
117
+ end
118
+ end
119
+ nodes = ret
120
+ end
121
+ end
122
+
123
# Elements.filter presumably applies the filters at the head of `expr` and
# returns [remaining_nodes, remaining_expr] - verify against its definition.
+ nodes, expr = Elements.filter(nodes, expr)
124
+ end
125
# Only the nodes of the last alternative are flattened and de-duplicated;
# `done` is prepended unchanged.
+ nodes = done + nodes.flatten.uniq
126
+ if blk
127
+ nodes.each(&blk)
128
+ self
129
+ else
130
+ Elements[*nodes]
131
+ end
132
+ end
133
+ alias_method :/, :search
134
+
135
# Runs +search+ with +expr+ (and the optional block) and returns +first+ of
# whatever +search+ returned.
def at(expr, &blk)
  matches = search(expr, &blk)
  matches.first
end
138
+ alias_method :%, :at
139
+
140
# Returns a copy of +path+ with whitespace removed at the start and at the
# end of every line (the anchors are per-line, not per-string).
def clean_path(path)
  edge_whitespace = /^\s+|\s+$/
  path.gsub(edge_whitespace, '')
end
143
+
144
+ # +each_child+ iterates over each child.
145
# Passes every child, in order, to the given block. Always returns nil.
def each_child(&block) # :yields: child_node
  kids = children
  kids.each(&block)
  nil
end
149
+
150
+ # +each_child_with_index+ iterates over each child.
151
# Passes every child together with its position to the given block.
# Always returns nil.
def each_child_with_index(&block) # :yields: child_node, index
  kids = children
  kids.each_with_index(&block)
  nil
end
155
+
156
+ # +find_element+ searches an element which universal name is specified by
157
+ # the arguments.
158
+ # It returns nil if not found.
159
# Returns the first element +traverse_element+ yields for +names+, or nil
# when it yields nothing.
def find_element(*names)
  traverse_element(*names) do |element|
    return element
  end
  nil
end
163
+
164
+ # +traverse_element+ traverses elements in the tree.
165
+ # It yields elements in depth first order.
166
+ #
167
+ # If _names_ are empty, it yields all elements.
168
+ # If non-empty _names_ are given, it should be list of universal names.
169
+ #
170
+ # A nested element is yielded in depth first order as follows.
171
+ #
172
+ # t = Hpricot('<a id=0><b><a id=1 /></b><c id=2 /></a>')
173
+ # t.traverse_element("a", "c") {|e| p e}
174
+ # # =>
175
+ # {elem <a id="0"> {elem <b> {emptyelem <a id="1">} </b>} {emptyelem <c id="2">} </a>}
176
+ # {emptyelem <a id="1">}
177
+ # {emptyelem <c id="2">}
178
+ #
179
+ # Universal names are specified as follows.
180
+ #
181
+ # t = Hpricot(<<'End')
182
+ # <html>
183
+ # <meta name="robots" content="index,nofollow">
184
+ # <meta name="author" content="Who am I?">
185
+ # </html>
186
+ # End
187
+ # t.traverse_element("{http://www.w3.org/1999/xhtml}meta") {|e| p e}
188
+ # # =>
189
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="robots" content="index,nofollow">}
190
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="author" content="Who am I?">}
191
+ #
192
# Yields elements of the tree to the block and returns nil.
# Without +names+ every element is visited (+traverse_all_element+);
# otherwise the names are collected into a lookup hash and handed to
# +traverse_some_element+.
def traverse_element(*names, &block) # :yields: element
  if names.empty?
    traverse_all_element(&block)
  else
    wanted = names.inject({}) {|set, name|
      set[name] = true
      set
    }
    traverse_some_element(wanted, &block)
  end
  nil
end
202
+
203
# Returns the whitespace-separated entries of the "class" attribute as an
# Array of Strings; an absent attribute gives an empty Array.
def classes
  raw = get_attribute('class').to_s
  raw.strip.split(/\s+/)
end
206
+
207
# Returns the first element, in traversal order, whose "id" attribute
# (converted with to_s) equals +id+, or nil when there is none.
def get_element_by_id(id)
  traverse_all_element do |candidate|
    candidate_id = candidate.get_attribute('id')
    next unless candidate_id
    return candidate if candidate_id.to_s == id
  end
  nil
end
215
+
216
# Collects, into an Elements list, every element whose name is one of the
# given tags, either bare or qualified with the XHTML namespace.
def get_elements_by_tag_name(*a)
  names = a.map {|tag| [tag, "{http://www.w3.org/1999/xhtml}#{tag}"] }.flatten
  found = Elements[]
  traverse_element(*names) {|element| found << element }
  found
end
223
+
224
# Visits the XHTML elements that can carry hyperlinks and yields
# (element, attribute name, attribute value) for every link-bearing
# attribute that is present on them.
def each_hyperlink_attribute
  namespace = '{http://www.w3.org/1999/xhtml}'
  tags = %w[a area link img object q blockquote ins del form input head base script]
  traverse_element(*tags.map {|tag| namespace + tag }) do |elem|
    # Which attributes hold URIs depends on the element.
    attrs =
      case elem.name
      when %r{\{http://www.w3.org/1999/xhtml\}(?:base|a|area|link)\z}i
        ['href']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:img)\z}i
        ['src', 'longdesc', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:object)\z}i
        ['classid', 'codebase', 'data', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:q|blockquote|ins|del)\z}i
        ['cite']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:form)\z}i
        ['action']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:input)\z}i
        ['src', 'usemap']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:head)\z}i
        ['profile']
      when %r{\{http://www.w3.org/1999/xhtml\}(?:script)\z}i
        ['src', 'for']
      end
    attrs.each do |attr|
      hyperlink = elem.get_attribute(attr)
      yield elem, attr, hyperlink if hyperlink
    end
  end
end
265
+ private :each_hyperlink_attribute
266
+
267
+ # +each_hyperlink_uri+ traverses hyperlinks such as HTML href attribute
268
+ # of A element.
269
+ #
270
+ # It yields Hpricot::Text (or Hpricot::Loc) and URI for each hyperlink.
271
+ #
272
+ # The URI objects are created with a base URI which is given by
273
+ # HTML BASE element or the argument ((|base_uri|)).
274
+ # +each_hyperlink_uri+ doesn't yields href of the BASE element.
275
# Yields every hyperlink together with its URI object.
#
# +base_uri+ (a URI or a String that is parsed) is the starting base. The
# href of any BASE element found replaces it; the last one wins, because
# all links are collected before any URI is built. With a base, each link is
# resolved against it; without one, each link is parsed on its own.
# The BASE href itself is not yielded.
def each_hyperlink_uri(base_uri=nil) # :yields: hyperlink, uri
  base_uri = URI.parse(base_uri) if String === base_uri
  base_element = %r{\{http://www.w3.org/1999/xhtml\}(?:base)\z}i
  links = []
  each_hyperlink_attribute do |elem, attr, hyperlink|
    if base_element =~ elem.name
      base_uri = URI.parse(hyperlink.to_s)
    else
      links << hyperlink
    end
  end
  links.each do |hyperlink|
    target = hyperlink.to_s
    yield hyperlink, (base_uri ? base_uri + target : URI.parse(target))
  end
end
291
+
292
+ # +each_hyperlink+ traverses hyperlinks such as HTML href attribute
293
+ # of A element.
294
+ #
295
+ # It yields Hpricot::Text or Hpricot::Loc.
296
+ #
297
+ # Note that +each_hyperlink+ yields HTML href attribute of BASE element.
298
# Yields the value of every hyperlink attribute found by
# +each_hyperlink_attribute+, including the href of a BASE element.
# (The unused local accumulator of the earlier version has been dropped.)
def each_hyperlink # :yields: text
  each_hyperlink_attribute {|elem, attr, hyperlink|
    yield hyperlink
  }
end
304
+
305
+ # +each_uri+ traverses hyperlinks such as HTML href attribute
306
+ # of A element.
307
+ #
308
+ # It yields URI for each hyperlink.
309
+ #
310
+ # The URI objects are created with a base URI which is given by
311
+ # HTML BASE element or the argument ((|base_uri|)).
312
# Yields only the URI object of every hyperlink reported by
# +each_hyperlink_uri+ for the given +base_uri+.
def each_uri(base_uri=nil) # :yields: URI
  each_hyperlink_uri(base_uri) do |_hyperlink, uri|
    yield uri
  end
end
315
+ end
316
+
317
+ # :stopdoc:
318
+ module Doc::Trav
319
# Document level: the document itself is not yielded; the traversal is
# simply handed on to every child.
def traverse_all_element(&block)
  children.each do |child|
    child.traverse_all_element(&block)
  end
end
322
+ end
323
+
324
+ module Elem::Trav
325
# Element level: the element is yielded first, then the traversal descends
# into each child (pre-order).
def traverse_all_element(&block)
  yield(self)
  children.each do |child|
    child.traverse_all_element(&block)
  end
end
329
+ end
330
+
331
+ module Leaf::Trav
332
# Leaf level: a leaf yields nothing, so this is deliberately a no-op that
# returns nil.
def traverse_all_element
  nil
end
334
+ end
335
+
336
+ module Doc::Trav
337
# Document level: hands the name-filtered traversal on to every child.
def traverse_some_element(name_set, &block)
  children.each do |child|
    child.traverse_some_element(name_set, &block)
  end
end
340
+ end
341
+
342
+ module Elem::Trav
343
# Element level: the element is yielded when its name is in +name_set+;
# the traversal then descends into each child regardless.
def traverse_some_element(name_set, &block)
  yield(self) if name_set.include?(self.name)
  children.each do |child|
    child.traverse_some_element(name_set, &block)
  end
end
347
+ end
348
+
349
+ module Leaf::Trav
350
# Leaf level: a leaf has no element to report, whatever +name_set+ holds.
# Deliberate no-op returning nil.
def traverse_some_element(name_set)
  nil
end
352
+ end
353
+ # :startdoc:
354
+
355
module Traverse
  # +traverse_text+ yields the text nodes of the tree to the block by
  # delegating to +traverse_text_internal+. It always returns nil.
  def traverse_text(&block) # :yields: text
    traverse_text_internal(&block)
    nil
  end
end
362
+
363
+ # :stopdoc:
364
+ module Container::Trav
365
# Container level: forwards the text traversal to every child via
# +each_child+.
def traverse_text_internal(&block)
  each_child do |child|
    child.traverse_text_internal(&block)
  end
end
368
+ end
369
+
370
+ module Leaf::Trav
371
# Leaf level: leaves other than text nodes contribute no text.
# Deliberate no-op returning nil.
def traverse_text_internal
  nil
end
373
+ end
374
+
375
+ module Text::Trav
376
# Text level: a text node yields itself to the block.
def traverse_text_internal
  yield(self)
end
379
+ end
380
+ # :startdoc:
381
+
382
+ module Container::Trav
383
+ # +filter+ rebuilds the tree without some components.
384
+ #
385
+ # node.filter {|descendant_node| predicate } -> node
386
+ # loc.filter {|descendant_loc| predicate } -> node
387
+ #
388
+ # +filter+ yields each node except top node.
389
+ # If given block returns false, corresponding node is dropped.
390
+ # If given block returns true, corresponding node is retained and
391
+ # inner nodes are examined.
392
+ #
393
+ # +filter+ returns an node.
394
+ # It doesn't return location object even if self is location object.
395
+ #
396
# Builds a substitution table for the direct children and applies it with
# +subst_subnode+ on +to_node+.
#
# For each child, by index:
# - block result false/nil: the entry is nil;
# - block result true and the child is an element: the entry is the child
#   filtered recursively with the same block;
# - block result true otherwise: the entry is the child itself.
def filter(&block)
  subst = {}
  each_child_with_index do |descendant, i|
    subst[i] =
      if !yield(descendant)
        nil
      elsif descendant.elem?
        descendant.filter(&block)
      else
        descendant
      end
  end
  to_node.subst_subnode(subst)
end
411
+ end
412
+
413
+ module Doc::Trav
414
+ # +title+ searches title and return it as a text.
415
+ # It returns nil if not found.
416
+ #
417
+ # +title+ searchs following information.
418
+ #
419
+ # - <title>...</title> in HTML
420
+ # - <title>...</title> in RSS
421
# Looks for the first title element (bare, XHTML, RSS 1.0 or RSS 0.9 name)
# and returns its +extract_text+; returns nil when no such element exists.
def title
  title_names = [
    'title',
    '{http://www.w3.org/1999/xhtml}title',
    '{http://purl.org/rss/1.0/}title',
    '{http://my.netscape.com/rdf/simple/0.9/}title'
  ]
  element = find_element(*title_names)
  element && element.extract_text
end
428
+
429
+ # +author+ searches author and return it as a text.
430
+ # It returns nil if not found.
431
+ #
432
+ # +author+ searchs following information.
433
+ #
434
+ # - <meta name="author" content="author-name"> in HTML
435
+ # - <link rev="made" title="author-name"> in HTML
436
+ # - <dc:creator>author-name</dc:creator> in RSS
437
+ # - <dc:publisher>author-name</dc:publisher> in RSS
438
# Returns the first non-empty author found, trying in this order:
# 1. a meta element whose "name" attribute is "author" (its "content");
# 2. a link element whose "rev" attribute is "made" (its "title");
# 3. inside an RSS 1.0 channel, a dc:creator and then a dc:publisher element.
# An IndexError raised while reading a candidate just skips that candidate.
# Returns nil when nothing is found.
def author
  traverse_element('meta',
                   '{http://www.w3.org/1999/xhtml}meta') do |meta|
    begin
      next unless meta.fetch_attr('name').downcase == 'author'
      found = meta.fetch_attribute('content').strip
      return found unless found.empty?
    rescue IndexError
      # attribute missing on this element; keep looking
    end
  end

  traverse_element('link',
                   '{http://www.w3.org/1999/xhtml}link') do |link|
    begin
      next unless link.fetch_attr('rev').downcase == 'made'
      found = link.fetch_attribute('title').strip
      return found unless found.empty?
    rescue IndexError
      # attribute missing on this element; keep looking
    end
  end

  channel = find_element('{http://purl.org/rss/1.0/}channel')
  if channel
    # Creator takes precedence over publisher.
    ['{http://purl.org/dc/elements/1.1/}creator',
     '{http://purl.org/dc/elements/1.1/}publisher'].each do |dc_name|
      channel.traverse_element(dc_name) do |e|
        begin
          found = e.extract_text.strip
          return found unless found.empty?
        rescue IndexError
        end
      end
    end
  end

  nil
end
478
+
479
+ end
480
+
481
+ module Doc::Trav
482
# Returns the single top-level element of the document.
# Raises Hpricot::Error when the document has no top-level element or more
# than one.
def root
  top_elements = children.select {|child| child.elem? }
  case top_elements.length
  when 0
    raise Hpricot::Error, "no element"
  when 1
    top_elements[0]
  else
    raise Hpricot::Error, "multiple top elements"
  end
end
489
+ end
490
+
491
+ module Elem::Trav
492
# Reports whether the attribute named +name+ (converted with to_s) is set.
# When the element has no attribute table, that nil/false value is returned.
def has_attribute?(name)
  table = self.attributes
  table && table.has_key?(name.to_s)
end
495
# Returns the value stored for the attribute +name+ (converted with to_s),
# or nil when it is not set or the element has no attribute table.
def get_attribute(name)
  table = self.attributes
  table && table[name.to_s]
end
498
+ alias_method :[], :get_attribute
499
# Stores +val+ under the attribute +name+ (converted with to_s), creating
# the attribute table first when the element has none. Returns +val+.
def set_attribute(name, val)
  self.attributes = {} unless self.attributes
  key = name.to_s
  self.attributes[key] = val
end
503
+ alias_method :[]=, :set_attribute
504
# Removes the attribute +name+ and returns its former value, or nil when the
# attribute was not set.
#
# Attribute keys are Strings (every other accessor converts the name with
# to_s), so the name is converted here too. Deleting with the raw +name+
# made a Symbol argument pass the existence check but remove nothing.
def remove_attribute(name)
  key = name.to_s
  self.attributes.delete(key) if has_attribute?(key)
end
509
+ end
510
+
511
+ end