feedtools 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. data/CHANGELOG +11 -0
  2. data/lib/feed_tools.rb +2496 -810
  3. data/lib/feed_tools/vendor/builder.rb +2 -0
  4. data/lib/feed_tools/vendor/builder/blankslate.rb +2 -0
  5. data/lib/feed_tools/vendor/builder/xmlbase.rb +2 -1
  6. data/lib/feed_tools/vendor/builder/xmlevents.rb +2 -0
  7. data/lib/feed_tools/vendor/builder/xmlmarkup.rb +4 -2
  8. data/lib/feed_tools/vendor/htree.rb +97 -0
  9. data/lib/feed_tools/vendor/htree/container.rb +10 -0
  10. data/lib/feed_tools/vendor/htree/context.rb +67 -0
  11. data/lib/feed_tools/vendor/htree/display.rb +27 -0
  12. data/lib/feed_tools/vendor/htree/doc.rb +149 -0
  13. data/lib/feed_tools/vendor/htree/elem.rb +262 -0
  14. data/lib/feed_tools/vendor/htree/encoder.rb +163 -0
  15. data/lib/feed_tools/vendor/htree/equality.rb +218 -0
  16. data/lib/feed_tools/vendor/htree/extract_text.rb +37 -0
  17. data/lib/feed_tools/vendor/htree/fstr.rb +33 -0
  18. data/lib/feed_tools/vendor/htree/gencode.rb +97 -0
  19. data/lib/feed_tools/vendor/htree/htmlinfo.rb +672 -0
  20. data/lib/feed_tools/vendor/htree/inspect.rb +108 -0
  21. data/lib/feed_tools/vendor/htree/leaf.rb +94 -0
  22. data/lib/feed_tools/vendor/htree/loc.rb +367 -0
  23. data/lib/feed_tools/vendor/htree/modules.rb +48 -0
  24. data/lib/feed_tools/vendor/htree/name.rb +124 -0
  25. data/lib/feed_tools/vendor/htree/output.rb +207 -0
  26. data/lib/feed_tools/vendor/htree/parse.rb +407 -0
  27. data/lib/feed_tools/vendor/htree/raw_string.rb +124 -0
  28. data/lib/feed_tools/vendor/htree/regexp-util.rb +15 -0
  29. data/lib/feed_tools/vendor/htree/rexml.rb +130 -0
  30. data/lib/feed_tools/vendor/htree/scan.rb +166 -0
  31. data/lib/feed_tools/vendor/htree/tag.rb +111 -0
  32. data/lib/feed_tools/vendor/htree/template.rb +909 -0
  33. data/lib/feed_tools/vendor/htree/text.rb +115 -0
  34. data/lib/feed_tools/vendor/htree/traverse.rb +465 -0
  35. data/rakefile +1 -1
  36. data/test/rss_test.rb +97 -0
  37. metadata +30 -1
@@ -0,0 +1,115 @@
1
+ # :stopdoc:
2
+ require 'htree/modules'
3
+ require 'htree/raw_string'
4
+ require 'htree/htmlinfo'
5
+ require 'htree/encoder'
6
+ require 'htree/fstr'
7
+ require 'iconv'
8
+
9
module HTree # :nodoc:
  # Character data held as RCDATA (text in which '&' introduces a character
  # reference).  Two forms are kept: the string as given (+rcdata+) and a
  # normalized form (+normalized_rcdata+) in which character references have
  # been decoded where possible.
  class Text # :nodoc:
    class << self
      # Keep the plain allocator reachable; Text.new below is replaced by a
      # coercing front end.
      alias new_internal new
    end

    # Builds a Text from a String, another Text, or an HTree::Location that
    # resolves to one of those.
    #
    # A String is treated as literal text, so every '&' is escaped to
    # '&amp;'.  Any other argument raises TypeError.
    def Text.new(arg)
      arg = arg.to_node if HTree::Location === arg
      case arg
      when Text
        new_internal(arg.rcdata, arg.normalized_rcdata)
      when String
        escaped = arg.gsub(/&/, '&amp;')
        # Hand over the caller's own string when nothing had to be escaped.
        new_internal(arg == escaped ? arg : escaped.freeze)
      else
        raise TypeError, "cannot initialize Text with #{arg.inspect}"
      end
    end

    # +rcdata+ is stored through HTree.frozen_string.  When the normalized
    # form equals it, both instance variables refer to the same string.
    def initialize(rcdata, normalized_rcdata=internal_normalize(rcdata)) # :notnew:
      init_raw_string
      @rcdata = rcdata ? HTree.frozen_string(rcdata) : rcdata
      if @rcdata == normalized_rcdata
        @normalized_rcdata = @rcdata
      else
        @normalized_rcdata = normalized_rcdata
      end
    end
    attr_reader :rcdata, :normalized_rcdata

    # Normalizes the character references found in +rcdata+.
    # - character references are decoded as much as possible.
    # - undecodable character references are converted to decimal numeric
    #   character references.
    # - unknown entity names and out-of-range code points become '?'.
    # - '&' itself always stays encoded, as '&#38;'.
    def internal_normalize(rcdata)
      normalized = rcdata.gsub(/&(?:#([0-9]+)|#x([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));/o) {
        decimal, hexadecimal, entity = $1, $2, $3
        code =
          if decimal
            decimal.to_i
          elsif hexadecimal
            hexadecimal.hex
          elsif entity
            NamedCharacters[entity]
          end

        next '?' if !code || code < 0 || 0x7fffffff < code
        next '&#38;' if code == 38 # '&' character.
        next([code].pack("C")) if code <= 0x7f

        begin
          Iconv.conv(Encoder.internal_charset, 'UTF-8', [code].pack("U"))
        rescue Iconv::Failure
          # Not representable in the internal charset: keep it as a reference.
          "&##{code};"
        end
      }
      HTree.frozen_string(normalized)
    end
    private :internal_normalize

    # HTree::Text#to_s converts the text to a string.
    # - character references are decoded as much as possible.
    # - undecodable character references are converted to the `?' character.
    def to_s
      @normalized_rcdata.gsub(/&(?:#([0-9]+));/o) {
        code = $1.to_i
        code.between?(0, 0x7f) ? [code].pack("C") : '?'
      }
    end

    # True when the normalized text is the empty string.
    def empty?
      @normalized_rcdata.empty?
    end

    # Returns a Text without leading and trailing whitespace, where a literal
    # "&nbsp;" also counts as whitespace.  Returns +self+ when there is
    # nothing to remove.
    def strip
      stripped = @normalized_rcdata.
        sub(/\A(?:\s|&nbsp;)+/, '').
        sub(/(?:\s|&nbsp;)+\z/, '')
      return self if stripped == @normalized_rcdata
      stripped.freeze
      Text.new_internal(stripped, stripped)
    end

    # HTree::Text.concat returns a text which is the concatenation of its
    # arguments.
    #
    # An argument should be one of the following.
    # - String
    # - HTree::Text
    # - HTree::Location which points to an HTree::Text
    def Text.concat(*args)
      joined = args.inject('') {|buffer, arg|
        arg = arg.to_node if HTree::Location === arg
        buffer << (Text === arg ? arg.rcdata : arg.gsub(/&/, '&amp;'))
      }
      new_internal joined
    end
  end
end
115
+ # :startdoc:
@@ -0,0 +1,465 @@
1
+ # :stopdoc:
2
+ require 'htree/doc'
3
+ require 'htree/elem'
4
+ require 'htree/loc'
5
+ require 'htree/extract_text'
6
+ require 'uri'
7
+
8
+ module HTree # :nodoc:
9
module Traverse # :nodoc:
  # Node-kind predicates.  Each one tests +self+ against the traversal
  # module of the corresponding node class.
  def doc?
    Doc::Trav === self
  end

  def elem?
    Elem::Trav === self
  end

  def text?
    Text::Trav === self
  end

  def xmldecl?
    XMLDecl::Trav === self
  end

  def doctype?
    DocType::Trav === self
  end

  def procins?
    ProcIns::Trav === self
  end

  def comment?
    Comment::Trav === self
  end

  def bogusetag?
    BogusETag::Trav === self
  end

  # Follows +indexes+ one step at a time, starting at +self+, by calling
  # +get_subnode_internal+ on each intermediate result.  With no indexes it
  # returns +self+.
  def get_subnode(*indexes)
    indexes.inject(self) {|node, index| node.get_subnode_internal(index) }
  end
end
27
+
28
module Container::Trav # :nodoc:
  # Attributes that carry a hyperlink, keyed by the universal name of the
  # XHTML element they belong to.  This is the single source of truth for
  # both the elements visited and the attributes inspected by
  # +each_hyperlink_attribute+.
  HYPERLINK_ATTRIBUTES = {
    '{http://www.w3.org/1999/xhtml}base'       => ['href'],
    '{http://www.w3.org/1999/xhtml}a'          => ['href'],
    '{http://www.w3.org/1999/xhtml}area'       => ['href'],
    '{http://www.w3.org/1999/xhtml}link'       => ['href'],
    '{http://www.w3.org/1999/xhtml}img'        => ['src', 'longdesc', 'usemap'],
    '{http://www.w3.org/1999/xhtml}object'     => ['classid', 'codebase', 'data', 'usemap'],
    '{http://www.w3.org/1999/xhtml}q'          => ['cite'],
    '{http://www.w3.org/1999/xhtml}blockquote' => ['cite'],
    '{http://www.w3.org/1999/xhtml}ins'        => ['cite'],
    '{http://www.w3.org/1999/xhtml}del'        => ['cite'],
    '{http://www.w3.org/1999/xhtml}form'       => ['action'],
    '{http://www.w3.org/1999/xhtml}input'      => ['src', 'usemap'],
    '{http://www.w3.org/1999/xhtml}head'       => ['profile'],
    '{http://www.w3.org/1999/xhtml}script'     => ['src', 'for']
  }.freeze

  # +each_child+ iterates over each child.
  def each_child(&block) # :yields: child_node
    children.each(&block)
    nil
  end

  # +each_child_with_index+ iterates over each child together with its
  # position among the children.
  def each_child_with_index(&block) # :yields: child_node, index
    children.each_with_index(&block)
    nil
  end

  # +find_element+ searches for the first element whose universal name is
  # one of the arguments.
  # It returns nil if not found.
  def find_element(*names)
    # The first element yielded ends the traversal.
    traverse_element(*names) {|e| return e }
    nil
  end

  # +traverse_element+ traverses elements in the tree.
  # It yields elements in depth first order.
  #
  # If _names_ are empty, it yields all elements.
  # If non-empty _names_ are given, they should be a list of universal names.
  #
  # A nested element is yielded in depth first order as follows.
  #
  #   t = HTree('<a id=0><b><a id=1 /></b><c id=2 /></a>')
  #   t.traverse_element("a", "c") {|e| p e}
  #   # =>
  #   {elem <a id="0"> {elem <b> {emptyelem <a id="1">} </b>} {emptyelem <c id="2">} </a>}
  #   {emptyelem <a id="1">}
  #   {emptyelem <c id="2">}
  #
  # Universal names are specified as follows.
  #
  #   t = HTree(<<'End')
  #   <html>
  #   <meta name="robots" content="index,nofollow">
  #   <meta name="author" content="Who am I?">
  #   </html>
  #   End
  #   t.traverse_element("{http://www.w3.org/1999/xhtml}meta") {|e| p e}
  #   # =>
  #   {emptyelem <{http://www.w3.org/1999/xhtml}meta name="robots" content="index,nofollow">}
  #   {emptyelem <{http://www.w3.org/1999/xhtml}meta name="author" content="Who am I?">}
  #
  # It always returns nil.
  def traverse_element(*names, &block) # :yields: element
    if names.empty?
      traverse_all_element(&block)
    else
      # A hash used as a set, so each visited element costs one lookup.
      name_set = {}
      names.each {|n| name_set[n] = true }
      traverse_some_element(name_set, &block)
    end
    nil
  end

  # Yields +elem+, +attr+ and +hyperlink+ for every hyperlink-bearing
  # attribute that is present on an element listed in HYPERLINK_ATTRIBUTES.
  def each_hyperlink_attribute
    traverse_element(*HYPERLINK_ATTRIBUTES.keys) {|elem|
      attrs = HYPERLINK_ATTRIBUTES[elem.name]
      # Defensive: skip anything the table does not describe.
      next unless attrs
      attrs.each {|attr|
        if hyperlink = elem.get_attribute(attr)
          yield elem, attr, hyperlink
        end
      }
    }
  end
  private :each_hyperlink_attribute

  # +each_hyperlink_uri+ traverses hyperlinks such as the HTML href
  # attribute of the A element.
  #
  # It yields HTree::Text (or HTree::Loc) and URI for each hyperlink.
  #
  # The URI objects are created with a base URI which is given by the
  # HTML BASE element or the argument ((|base_uri|)).  A BASE element found
  # in the tree takes precedence over the argument.
  # +each_hyperlink_uri+ doesn't yield the href of the BASE element.
  def each_hyperlink_uri(base_uri=nil) # :yields: hyperlink, uri
    base_uri = URI.parse(base_uri) if String === base_uri
    links = []
    # Collect first: the base URI is only known after the whole tree has
    # been scanned.
    each_hyperlink_attribute {|elem, attr, hyperlink|
      if elem.name == '{http://www.w3.org/1999/xhtml}base'
        base_uri = URI.parse(hyperlink.to_s)
      else
        links << hyperlink
      end
    }
    if base_uri
      links.each {|hyperlink| yield hyperlink, base_uri + hyperlink.to_s }
    else
      links.each {|hyperlink| yield hyperlink, URI.parse(hyperlink.to_s) }
    end
  end

  # +each_hyperlink+ traverses hyperlinks such as the HTML href attribute
  # of the A element.
  #
  # It yields HTree::Text or HTree::Loc.
  #
  # Note that +each_hyperlink+ also yields the href attribute of the BASE
  # element.
  def each_hyperlink # :yields: text
    each_hyperlink_attribute {|elem, attr, hyperlink|
      yield hyperlink
    }
  end

  # +each_uri+ traverses hyperlinks such as the HTML href attribute
  # of the A element.
  #
  # It yields a URI for each hyperlink.
  #
  # The URI objects are created with a base URI which is given by the
  # HTML BASE element or the argument ((|base_uri|)).
  def each_uri(base_uri=nil) # :yields: URI
    each_hyperlink_uri(base_uri) {|hyperlink, uri| yield uri }
  end
end
180
+
181
module Doc::Trav
  # A document is not an element itself, so it only forwards the traversal
  # to each of its children.
  def traverse_all_element(&visitor)
    children.each do |child|
      child.traverse_all_element(&visitor)
    end
  end
end
186
+
187
module Elem::Trav
  # Yields this element, then continues the traversal into each child.
  def traverse_all_element(&visitor)
    yield self
    children.each do |child|
      child.traverse_all_element(&visitor)
    end
  end
end
193
+
194
module Leaf::Trav
  # Deliberately empty: nothing is yielded for a leaf and there is nothing
  # to descend into.
  def traverse_all_element
  end
end
198
+
199
module Doc::Trav
  # Forwards the name-filtered traversal to each child.  +name_set+ is
  # passed through unchanged.
  def traverse_some_element(name_set, &visitor)
    children.each do |child|
      child.traverse_some_element(name_set, &visitor)
    end
  end
end
204
+
205
module Elem::Trav
  # Yields this element when its universal name is a member of +name_set+,
  # then continues into each child regardless of whether it matched.
  def traverse_some_element(name_set, &visitor)
    yield self if name_set.include?(name)
    children.each do |child|
      child.traverse_some_element(name_set, &visitor)
    end
  end
end
211
+
212
module Leaf::Trav
  # Deliberately empty: a leaf never matches an element name and has no
  # children to visit.  +name_set+ is accepted only to keep the signature
  # uniform with the container versions.
  def traverse_some_element(name_set)
  end
end
216
+
217
module Traverse # :nodoc:
  # +traverse_text+ traverses the texts in the tree.
  #
  # The work is delegated to +traverse_text_internal+; this wrapper only
  # makes sure the return value is always nil.
  def traverse_text(&visitor) # :yields: text
    traverse_text_internal(&visitor)
    nil
  end
end
224
+
225
module Container::Trav # :nodoc:
  # Forwards the text traversal to each child in order.
  def traverse_text_internal(&visitor)
    each_child do |child|
      child.traverse_text_internal(&visitor)
    end
  end
end
230
+
231
module Leaf::Trav # :nodoc:
  # Deliberately empty: this default yields nothing.  Text::Trav defines
  # its own version that yields the text.
  def traverse_text_internal
  end
end
235
+
236
module Text::Trav # :nodoc:
  # A text is the thing being looked for: yield it to the visitor.
  def traverse_text_internal
    yield self
  end
end
241
+
242
module Container::Trav # :nodoc:
  # +filter+ rebuilds the tree without some components.
  #
  #   node.filter {|descendant_node| predicate } -> node
  #   loc.filter {|descendant_loc| predicate } -> node
  #
  # +filter+ yields each node except the top node.
  # If the given block returns false, the corresponding node is dropped.
  # If the given block returns true, the corresponding node is retained and
  # its inner nodes are examined in turn.
  #
  # +filter+ returns a node.
  # It doesn't return a location object even if self is a location object.
  #
  def filter(&block)
    # Maps child index => replacement; nil marks a child to be dropped.
    replacements = {}
    each_child_with_index do |child, index|
      replacements[index] =
        if !yield(child)
          nil
        elsif child.elem?
          # Only elements are filtered recursively.
          child.filter(&block)
        else
          child
        end
    end
    to_node.subst_subnode(replacements)
  end
end
272
+
273
module Doc::Trav # :nodoc:
  # +title+ searches for the title and returns it as a text.
  # It returns nil if not found.
  #
  # +title+ searches the following information.
  #
  # - <title>...</title> in HTML
  # - <title>...</title> in RSS
  def title
    title_elem = find_element('title',
      '{http://www.w3.org/1999/xhtml}title',
      '{http://purl.org/rss/1.0/}title',
      '{http://my.netscape.com/rdf/simple/0.9/}title')
    title_elem ? title_elem.extract_text : title_elem
  end

  # +author+ searches for the author and returns it as a text.
  # It returns nil if not found.
  #
  # +author+ searches the following information, in this order, and
  # returns the first non-empty value.
  #
  # - <meta name="author" content="author-name"> in HTML
  # - <link rev="made" title="author-name"> in HTML
  # - <dc:creator>author-name</dc:creator> in RSS
  # - <dc:publisher>author-name</dc:publisher> in RSS
  def author
    # Each row: element names to visit, the attribute that must equal the
    # marker (case-insensitively), and the attribute holding the author.
    [
      [['meta', '{http://www.w3.org/1999/xhtml}meta'], 'name', 'author', 'content'],
      [['link', '{http://www.w3.org/1999/xhtml}link'], 'rev', 'made', 'title']
    ].each do |unames, marker_attr, marker, value_attr|
      traverse_element(*unames) do |elem|
        begin
          next if elem.fetch_attr(marker_attr).downcase != marker
          found = elem.fetch_attribute(value_attr).strip
          return found if !found.empty?
        rescue IndexError
          # A missing attribute just means this element is not a candidate.
        end
      end
    end

    channel = find_element('{http://purl.org/rss/1.0/}channel')
    if channel
      [
        '{http://purl.org/dc/elements/1.1/}creator',
        '{http://purl.org/dc/elements/1.1/}publisher'
      ].each do |uname|
        channel.traverse_element(uname) do |elem|
          begin
            found = elem.extract_text.strip
            return found if !found.empty?
          rescue IndexError
          end
        end
      end
    end

    nil
  end

end
340
+
341
module Doc::Trav # :nodoc:
  # Returns the single top-level element of the document.
  #
  # Raises HTree::Error when the document has no top-level element, or
  # more than one.
  def root
    top_elements = children.select {|child| child.elem? }
    raise HTree::Error, "no element" if top_elements.empty?
    raise HTree::Error, "multiple top elements" if 1 < top_elements.length
    top_elements.first
  end
end
350
+
351
module Elem::Trav # :nodoc:
  # +name+ returns the universal name of the element as a string.
  #
  #   p HTree('<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>').root.name
  #   # =>
  #   "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF"
  #
  def name() element_name.universal_name end

  # +qualified_name+ returns the qualified name of the element as a string.
  #
  #   p HTree('<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>').root.qualified_name
  #   # =>
  #   "rdf:RDF"
  def qualified_name() element_name.qualified_name end

  # +attributes+ returns attributes as a hash.
  # The hash keys are HTree::Name objects.
  # The hash values are HTree::Text or HTree::Location objects.
  #
  #   p HTree('<a name="xx" href="uu">').root.attributes
  #   # =>
  #   {href=>{text "uu"}, name=>{text "xx"}}
  #
  #   p HTree('<a name="xx" href="uu">').make_loc.root.attributes
  #   # =>
  #   {href=>#<HTree::Location: doc()/a/@href>, name=>#<HTree::Location: doc()/a/@name>}
  #
  def attributes
    result = {}
    each_attribute {|attr_name, text|
      result[attr_name] = text
    }
    result
  end

  # +each_attr+ yields the universal name and the string value of each
  # attribute.
  def each_attr
    each_attribute {|attr_name, text|
      yield attr_name.universal_name, text.to_s
    }
  end

  # call-seq:
  #   elem.fetch_attribute(name) -> text or raise IndexError
  #   elem.fetch_attribute(name, default) -> text or default
  #   elem.fetch_attribute(name) {|uname| default } -> text or default
  #
  # +fetch_attribute+ returns an attribute value as a text.
  #
  # +name+ may be a universal name string or any object responding to
  # +universal_name+.  Passing both a default and a block, or more than one
  # default, raises ArgumentError.
  #
  # elem may be an instance of HTree::Elem or a location pointing to it.
  def fetch_attribute(uname, *rest, &block)
    if 1 < rest.length
      raise ArgumentError, "wrong number of arguments (#{1+rest.length} for 2)"
    end
    if !rest.empty? && block_given?
      raise ArgumentError, "block supersedes default value argument"
    end
    uname = uname.universal_name if uname.respond_to? :universal_name
    update_attribute_hash.fetch(uname) {
      if block_given?
        yield(uname)
      elsif !rest.empty?
        rest[0]
      else
        raise IndexError, "attribute not found: #{uname.inspect}"
      end
    }
  end

  # call-seq:
  #   elem.fetch_attr(name) -> string or raise IndexError
  #   elem.fetch_attr(name, default) -> string or default
  #   elem.fetch_attr(name) {|uname| default } -> string or default
  #
  # +fetch_attr+ returns an attribute value as a string.
  #
  # Only a value actually found on the element is converted with +to_s+;
  # a default (argument or block result) is returned exactly as supplied.
  #
  # elem may be an instance of HTree::Elem or a location pointing to it.
  def fetch_attr(uname, *rest, &block)
    if 1 < rest.length
      raise ArgumentError, "wrong number of arguments (#{1+rest.length} for 2)"
    end
    if !rest.empty? && block_given?
      raise ArgumentError, "block supersedes default value argument"
    end
    uname = uname.universal_name if uname.respond_to? :universal_name
    text = update_attribute_hash.fetch(uname) {
      # These returns leave the method directly, so that defaults bypass
      # the to_s conversion below.
      if block_given?
        return yield(uname)
      elsif !rest.empty?
        return rest[0]
      else
        raise IndexError, "attribute not found: #{uname.inspect}"
      end
    }
    text.to_s
  end

  # +get_attribute+ returns an attribute value as a text, or nil when the
  # element has no such attribute.
  #
  # +uname+ may be a universal name string or any object responding to
  # +universal_name+.
  def get_attribute(uname)
    uname = uname.universal_name if uname.respond_to? :universal_name
    update_attribute_hash[uname]
  end

  # +get_attr+ returns an attribute value as a string, or nil when the
  # element has no such attribute.
  #
  # +uname+ is normalized the same way as in +get_attribute+, so a name
  # object and its universal name string find the same attribute.
  def get_attr(uname)
    uname = uname.universal_name if uname.respond_to? :universal_name
    text = update_attribute_hash[uname]
    text ? text.to_s : nil
  end

end
463
+
464
+ end
465
+ # :startdoc: