feedtools 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. data/CHANGELOG +11 -0
  2. data/lib/feed_tools.rb +2496 -810
  3. data/lib/feed_tools/vendor/builder.rb +2 -0
  4. data/lib/feed_tools/vendor/builder/blankslate.rb +2 -0
  5. data/lib/feed_tools/vendor/builder/xmlbase.rb +2 -1
  6. data/lib/feed_tools/vendor/builder/xmlevents.rb +2 -0
  7. data/lib/feed_tools/vendor/builder/xmlmarkup.rb +4 -2
  8. data/lib/feed_tools/vendor/htree.rb +97 -0
  9. data/lib/feed_tools/vendor/htree/container.rb +10 -0
  10. data/lib/feed_tools/vendor/htree/context.rb +67 -0
  11. data/lib/feed_tools/vendor/htree/display.rb +27 -0
  12. data/lib/feed_tools/vendor/htree/doc.rb +149 -0
  13. data/lib/feed_tools/vendor/htree/elem.rb +262 -0
  14. data/lib/feed_tools/vendor/htree/encoder.rb +163 -0
  15. data/lib/feed_tools/vendor/htree/equality.rb +218 -0
  16. data/lib/feed_tools/vendor/htree/extract_text.rb +37 -0
  17. data/lib/feed_tools/vendor/htree/fstr.rb +33 -0
  18. data/lib/feed_tools/vendor/htree/gencode.rb +97 -0
  19. data/lib/feed_tools/vendor/htree/htmlinfo.rb +672 -0
  20. data/lib/feed_tools/vendor/htree/inspect.rb +108 -0
  21. data/lib/feed_tools/vendor/htree/leaf.rb +94 -0
  22. data/lib/feed_tools/vendor/htree/loc.rb +367 -0
  23. data/lib/feed_tools/vendor/htree/modules.rb +48 -0
  24. data/lib/feed_tools/vendor/htree/name.rb +124 -0
  25. data/lib/feed_tools/vendor/htree/output.rb +207 -0
  26. data/lib/feed_tools/vendor/htree/parse.rb +407 -0
  27. data/lib/feed_tools/vendor/htree/raw_string.rb +124 -0
  28. data/lib/feed_tools/vendor/htree/regexp-util.rb +15 -0
  29. data/lib/feed_tools/vendor/htree/rexml.rb +130 -0
  30. data/lib/feed_tools/vendor/htree/scan.rb +166 -0
  31. data/lib/feed_tools/vendor/htree/tag.rb +111 -0
  32. data/lib/feed_tools/vendor/htree/template.rb +909 -0
  33. data/lib/feed_tools/vendor/htree/text.rb +115 -0
  34. data/lib/feed_tools/vendor/htree/traverse.rb +465 -0
  35. data/rakefile +1 -1
  36. data/test/rss_test.rb +97 -0
  37. metadata +30 -1
@@ -0,0 +1,115 @@
1
+ # :stopdoc:
2
+ require 'htree/modules'
3
+ require 'htree/raw_string'
4
+ require 'htree/htmlinfo'
5
+ require 'htree/encoder'
6
+ require 'htree/fstr'
7
+ require 'iconv'
8
+
9
+ module HTree # :nodoc:
10
+ class Text # :nodoc:
11
class << self
  # Keep the default constructor reachable before Text.new is redefined below.
  alias new_internal new
end

# Builds a Text from a String, another Text, or an HTree::Location.
# A Location is first resolved with +to_node+.  For a String every '&'
# is escaped to '&amp;'.  Any other argument raises TypeError.
def Text.new(arg)
  arg = arg.to_node if HTree::Location === arg
  case arg
  when Text
    new_internal(arg.rcdata, arg.normalized_rcdata)
  when String
    escaped = arg.gsub(/&/, '&amp;')
    # Only switch to the (frozen) escaped copy when escaping changed something.
    arg = escaped.freeze if arg != escaped
    new_internal(arg)
  else
    raise TypeError, "cannot initialize Text with #{arg.inspect}"
  end
end
27
+
28
def initialize(rcdata, normalized_rcdata=internal_normalize(rcdata)) # :notnew:
  init_raw_string
  @rcdata = rcdata ? HTree.frozen_string(rcdata) : rcdata
  # When both forms compare equal, store the same object for both.
  @normalized_rcdata =
    if @rcdata == normalized_rcdata
      @rcdata
    else
      normalized_rcdata
    end
end
attr_reader :rcdata, :normalized_rcdata
34
+
35
def internal_normalize(rcdata)
  # - character references are decoded as much as possible.
  # - undecodable character references are converted to decimal numeric character references.
  normalized = rcdata.gsub(/&(?:#([0-9]+)|#x([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));/o) {|_ref|
    decimal, hexadecimal, entity = $1, $2, $3
    code =
      if decimal
        decimal.to_i
      elsif hexadecimal
        hexadecimal.hex
      elsif entity
        NamedCharacters[entity]
      end
    # Unknown entity names and out-of-range code points become '?'.
    next '?' if !code || code < 0 || 0x7fffffff < code
    # '&' (38) is emitted as a numeric reference, never as a bare character.
    next '&#38;' if code == 38
    next [code].pack("C") if code <= 0x7f
    begin
      Iconv.conv(Encoder.internal_charset, 'UTF-8', [code].pack("U"))
    rescue Iconv::Failure
      # Not representable in the internal charset: keep a decimal reference.
      "&##{code};"
    end
  }
  HTree.frozen_string(normalized)
end
private :internal_normalize
64
+
65
+ # HTree::Text#to_s converts the text to a string.
66
+ # - character references are decoded as much as possible.
67
+ # - undecodable character references are converted to the `?' character.
68
def to_s
  # Only decimal references are matched here; those in the ASCII range are
  # decoded and all others become '?'.
  @normalized_rcdata.gsub(/&(?:#([0-9]+));/o) {|_ref|
    code = $1.to_i
    (0 <= code && code <= 0x7f) ? [code].pack("C") : '?'
  }
end
78
+
79
def empty?
  # Emptiness is judged on the normalized form of the text.
  @normalized_rcdata.empty?
end
82
+
83
def strip
  # Trim leading, then trailing, runs of whitespace and literal "&nbsp;".
  trimmed = @normalized_rcdata.sub(/\A(?:\s|&nbsp;)+/, '')
  trimmed.sub!(/(?:\s|&nbsp;)+\z/, '')
  # Nothing was removed: reuse this object instead of building a new Text.
  return self if trimmed == @normalized_rcdata
  trimmed.freeze
  Text.new_internal(trimmed, trimmed)
end
94
+
95
+ # HTree::Text.concat returns a text which is concatenation of arguments.
96
+ #
97
+ # Each argument should be one of the following.
98
+ # - String
99
+ # - HTree::Text
100
+ # - HTree::Location which points to an HTree::Text
101
def Text.concat(*args)
  joined = args.inject('') {|buffer, arg|
    arg = arg.to_node if HTree::Location === arg
    # A Text contributes its rcdata as-is; anything else has '&' escaped.
    buffer << (Text === arg ? arg.rcdata : arg.gsub(/&/, '&amp;'))
  }
  new_internal joined
end
113
+ end
114
+ end
115
+ # :startdoc:
@@ -0,0 +1,465 @@
1
+ # :stopdoc:
2
+ require 'htree/doc'
3
+ require 'htree/elem'
4
+ require 'htree/loc'
5
+ require 'htree/extract_text'
6
+ require 'uri'
7
+
8
+ module HTree # :nodoc:
9
module Traverse # :nodoc:
  # Node-kind predicates: each one tests self against the matching
  # ::Trav module.
  def doc?
    Doc::Trav === self
  end

  def elem?
    Elem::Trav === self
  end

  def text?
    Text::Trav === self
  end

  def xmldecl?
    XMLDecl::Trav === self
  end

  def doctype?
    DocType::Trav === self
  end

  def procins?
    ProcIns::Trav === self
  end

  def comment?
    Comment::Trav === self
  end

  def bogusetag?
    BogusETag::Trav === self
  end

  # Follows +indexes+ one step at a time, starting from self, and returns
  # the node reached.  With no indexes it returns self.
  def get_subnode(*indexes)
    indexes.inject(self) {|node, index| node.get_subnode_internal(index) }
  end
end
27
+
28
+ module Container::Trav # :nodoc:
29
+ # +each_child+ iterates over each child.
30
def each_child(&block) # :yields: child_node
  # The block is handed straight to children.each; the result is always nil.
  children.each(&block)
  nil
end
34
+
35
+ # +each_child_with_index+ iterates over each child.
36
def each_child_with_index(&block) # :yields: child_node, index
  # The block is handed straight to children.each_with_index; the result
  # is always nil.
  children.each_with_index(&block)
  nil
end
40
+
41
+ # +find_element+ searches for an element whose universal name is specified by
42
+ # the arguments.
43
+ # It returns nil if not found.
44
def find_element(*names)
  # Returning from inside the block stops the traversal at the first match.
  traverse_element(*names) {|found| return found }
  nil
end
48
+
49
+ # +traverse_element+ traverses elements in the tree.
50
+ # It yields elements in depth first order.
51
+ #
52
+ # If _names_ are empty, it yields all elements.
53
+ # If non-empty _names_ are given, it should be list of universal names.
54
+ #
55
+ # A nested element is yielded in depth first order as follows.
56
+ #
57
+ # t = HTree('<a id=0><b><a id=1 /></b><c id=2 /></a>')
58
+ # t.traverse_element("a", "c") {|e| p e}
59
+ # # =>
60
+ # {elem <a id="0"> {elem <b> {emptyelem <a id="1">} </b>} {emptyelem <c id="2">} </a>}
61
+ # {emptyelem <a id="1">}
62
+ # {emptyelem <c id="2">}
63
+ #
64
+ # Universal names are specified as follows.
65
+ #
66
+ # t = HTree(<<'End')
67
+ # <html>
68
+ # <meta name="robots" content="index,nofollow">
69
+ # <meta name="author" content="Who am I?">
70
+ # </html>
71
+ # End
72
+ # t.traverse_element("{http://www.w3.org/1999/xhtml}meta") {|e| p e}
73
+ # # =>
74
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="robots" content="index,nofollow">}
75
+ # {emptyelem <{http://www.w3.org/1999/xhtml}meta name="author" content="Who am I?">}
76
+ #
77
def traverse_element(*names, &block) # :yields: element
  if names.empty?
    traverse_all_element(&block)
  else
    # Build a hash used as a set so that membership tests are hash lookups.
    wanted = names.inject({}) {|set, universal_name|
      set[universal_name] = true
      set
    }
    traverse_some_element(wanted, &block)
  end
  nil
end
87
+
88
# Maps the universal name of each XHTML element that may carry a hyperlink
# to the names of its attributes that hold one.  This single table drives
# both the traversal and the attribute lookup, so the two cannot drift apart.
HYPERLINK_ATTRIBUTES = {
  '{http://www.w3.org/1999/xhtml}a'          => ['href'],
  '{http://www.w3.org/1999/xhtml}area'       => ['href'],
  '{http://www.w3.org/1999/xhtml}link'       => ['href'],
  '{http://www.w3.org/1999/xhtml}base'       => ['href'],
  '{http://www.w3.org/1999/xhtml}img'        => ['src', 'longdesc', 'usemap'],
  '{http://www.w3.org/1999/xhtml}object'     => ['classid', 'codebase', 'data', 'usemap'],
  '{http://www.w3.org/1999/xhtml}q'          => ['cite'],
  '{http://www.w3.org/1999/xhtml}blockquote' => ['cite'],
  '{http://www.w3.org/1999/xhtml}ins'        => ['cite'],
  '{http://www.w3.org/1999/xhtml}del'        => ['cite'],
  '{http://www.w3.org/1999/xhtml}form'       => ['action'],
  '{http://www.w3.org/1999/xhtml}input'      => ['src', 'usemap'],
  '{http://www.w3.org/1999/xhtml}head'       => ['profile'],
  '{http://www.w3.org/1999/xhtml}script'     => ['src', 'for']
}.freeze

# Traverses every element listed in HYPERLINK_ATTRIBUTES and yields
# (element, attribute name, attribute value) for each hyperlink-bearing
# attribute that is present on it.
def each_hyperlink_attribute
  traverse_element(*HYPERLINK_ATTRIBUTES.keys) {|elem|
    # An element outside the table has no hyperlink attributes to report.
    attrs = HYPERLINK_ATTRIBUTES[elem.name] || []
    attrs.each {|attr|
      if hyperlink = elem.get_attribute(attr)
        yield elem, attr, hyperlink
      end
    }
  }
end
private :each_hyperlink_attribute
130
+
131
+ # +each_hyperlink_uri+ traverses hyperlinks such as HTML href attribute
132
+ # of A element.
133
+ #
134
+ # It yields HTree::Text (or HTree::Loc) and URI for each hyperlink.
135
+ #
136
+ # The URI objects are created with a base URI which is given by
137
+ # HTML BASE element or the argument ((|base_uri|)).
138
+ # +each_hyperlink_uri+ doesn't yield the href of the BASE element.
139
def each_hyperlink_uri(base_uri=nil) # :yields: hyperlink, uri
  base_uri = URI.parse(base_uri) if String === base_uri
  collected = []
  # First pass: a BASE element replaces the base URI; every other
  # hyperlink is collected so it can be resolved afterwards.
  each_hyperlink_attribute {|elem, attr, hyperlink|
    if %r{\{http://www.w3.org/1999/xhtml\}(?:base)\z}i =~ elem.name
      base_uri = URI.parse(hyperlink.to_s)
    else
      collected << hyperlink
    end
  }
  # Second pass: resolve against the base URI when there is one.
  collected.each {|hyperlink|
    target = hyperlink.to_s
    yield hyperlink, (base_uri ? base_uri + target : URI.parse(target))
  }
end
155
+
156
+ # +each_hyperlink+ traverses hyperlinks such as HTML href attribute
157
+ # of A element.
158
+ #
159
+ # It yields HTree::Text or HTree::Loc.
160
+ #
161
+ # Note that +each_hyperlink+ yields HTML href attribute of BASE element.
162
def each_hyperlink # :yields: text
  # Only the attribute value is passed on; the element and the attribute
  # name are deliberately ignored.
  each_hyperlink_attribute {|_elem, _attr, hyperlink|
    yield hyperlink
  }
end
168
+
169
+ # +each_uri+ traverses hyperlinks such as HTML href attribute
170
+ # of A element.
171
+ #
172
+ # It yields URI for each hyperlink.
173
+ #
174
+ # The URI objects are created with a base URI which is given by
175
+ # HTML BASE element or the argument ((|base_uri|)).
176
def each_uri(base_uri=nil) # :yields: URI
  # Delegates to each_hyperlink_uri and passes on only the URI.
  each_hyperlink_uri(base_uri) {|_hyperlink, uri| yield uri }
end
179
+ end
180
+
181
module Doc::Trav
  # The document itself is not yielded; the walk is forwarded to its
  # children.
  def traverse_all_element(&block)
    children.each {|child| child.traverse_all_element(&block) }
  end
end

module Elem::Trav
  # Yields this element first, then walks its children.
  def traverse_all_element(&block)
    yield self
    children.each {|child| child.traverse_all_element(&block) }
  end
end

module Leaf::Trav
  # A leaf yields nothing.
  def traverse_all_element
  end
end
198
+
199
module Doc::Trav
  # The document itself is never matched; the walk is forwarded to its
  # children.
  def traverse_some_element(name_set, &block)
    children.each {|child| child.traverse_some_element(name_set, &block) }
  end
end

module Elem::Trav
  # Yields this element when its universal name is in +name_set+, then
  # walks its children regardless.
  def traverse_some_element(name_set, &block)
    yield self if name_set.include?(name)
    children.each {|child| child.traverse_some_element(name_set, &block) }
  end
end

module Leaf::Trav
  # A leaf yields nothing.
  def traverse_some_element(name_set)
  end
end
216
+
217
module Traverse # :nodoc:
  # +traverse_text+ traverses texts in the tree.
  # It always returns nil.
  def traverse_text(&block) # :yields: text
    traverse_text_internal(&block)
    nil
  end
end

module Container::Trav # :nodoc:
  # A container forwards the walk to each of its children.
  def traverse_text_internal(&block)
    each_child {|child| child.traverse_text_internal(&block) }
  end
end

module Leaf::Trav # :nodoc:
  # Default for leaves: yield nothing.
  def traverse_text_internal
  end
end

module Text::Trav # :nodoc:
  # A text node yields itself.
  def traverse_text_internal
    yield self
  end
end
241
+
242
+ module Container::Trav # :nodoc:
243
+ # +filter+ rebuilds the tree without some components.
244
+ #
245
+ # node.filter {|descendant_node| predicate } -> node
246
+ # loc.filter {|descendant_loc| predicate } -> node
247
+ #
248
+ # +filter+ yields each node except top node.
249
+ # If given block returns false, corresponding node is dropped.
250
+ # If given block returns true, corresponding node is retained and
251
+ # inner nodes are examined.
252
+ #
253
+ # +filter+ returns a node.
254
+ # It doesn't return location object even if self is location object.
255
+ #
256
def filter(&block)
  replacements = {}
  each_child_with_index {|child, index|
    replacements[index] =
      if !yield(child)
        nil                   # rejected by the block: recorded as nil
      elsif child.elem?
        child.filter(&block)  # kept element: filter its descendants too
      else
        child                 # kept non-element: used unchanged
      end
  }
  to_node.subst_subnode(replacements)
end
271
+ end
272
+
273
+ module Doc::Trav # :nodoc:
274
+ # +title+ searches for the title and returns it as a text.
275
+ # It returns nil if not found.
276
+ #
277
+ # +title+ searches the following information.
278
+ #
279
+ # - <title>...</title> in HTML
280
+ # - <title>...</title> in RSS
281
def title
  # The first element carrying any of these names is used.
  title_elem = find_element(
    'title',
    '{http://www.w3.org/1999/xhtml}title',
    '{http://purl.org/rss/1.0/}title',
    '{http://my.netscape.com/rdf/simple/0.9/}title')
  title_elem ? title_elem.extract_text : title_elem
end
288
+
289
+ # +author+ searches for the author and returns it as a text.
290
+ # It returns nil if not found.
291
+ #
292
+ # +author+ searches the following information.
293
+ #
294
+ # - <meta name="author" content="author-name"> in HTML
295
+ # - <link rev="made" title="author-name"> in HTML
296
+ # - <dc:creator>author-name</dc:creator> in RSS
297
+ # - <dc:publisher>author-name</dc:publisher> in RSS
298
def author
  # HTML sources, tried in order: <meta name="author" content=...>, then
  # <link rev="made" title=...>.  Each row is
  # [plain name, XHTML name, selector attribute, selector value, value attribute].
  html_sources = [
    ['meta', '{http://www.w3.org/1999/xhtml}meta', 'name', 'author', 'content'],
    ['link', '{http://www.w3.org/1999/xhtml}link', 'rev', 'made', 'title']
  ]
  html_sources.each do |plain, qualified, selector, expected, value_attr|
    traverse_element(plain, qualified) do |e|
      begin
        next unless e.fetch_attr(selector).downcase == expected
        candidate = e.fetch_attribute(value_attr).strip
        return candidate if !candidate.empty?
      rescue IndexError
        # A required attribute is missing on this element: try the next one.
      end
    end
  end

  # RSS sources inside the channel: dc:creator first, then dc:publisher.
  if channel = find_element('{http://purl.org/rss/1.0/}channel')
    dc_names = [
      '{http://purl.org/dc/elements/1.1/}creator',
      '{http://purl.org/dc/elements/1.1/}publisher'
    ]
    dc_names.each do |dc_name|
      channel.traverse_element(dc_name) do |e|
        begin
          candidate = e.extract_text.strip
          return candidate if !candidate.empty?
        rescue IndexError
        end
      end
    end
  end

  nil
end
338
+
339
+ end
340
+
341
module Doc::Trav # :nodoc:
  # Returns the single top-level element of the document.
  # Raises HTree::Error when there is none or more than one.
  def root
    top_elems = children.select {|child| child.elem? }
    case top_elems.length
    when 0
      raise HTree::Error, "no element"
    when 1
      top_elems[0]
    else
      raise HTree::Error, "multiple top elements"
    end
  end
end
350
+
351
+ module Elem::Trav # :nodoc:
352
+ # +name+ returns the universal name of the element as a string.
353
+ #
354
+ # p HTree('<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>').root.name
355
+ # # =>
356
+ # "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF"
357
+ #
358
def name
  element_name.universal_name
end
359
+
360
+ # +qualified_name+ returns the qualified name of the element as a string.
361
+ #
362
+ # p HTree('<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>').root.qualified_name
363
+ # # =>
364
+ # "rdf:RDF"
365
def qualified_name
  element_name.qualified_name
end
366
+
367
+ # +attributes+ returns attributes as a hash.
368
+ # The hash keys are HTree::Name objects.
369
+ # The hash values are HTree::Text or HTree::Location objects.
370
+ #
371
+ # p HTree('<a name="xx" href="uu">').root.attributes
372
+ # # =>
373
+ # {href=>{text "uu"}, name=>{text "xx"}}
374
+ #
375
+ # p HTree('<a name="xx" href="uu">').make_loc.root.attributes
376
+ # # =>
377
+ # {href=>#<HTree::Location: doc()/a/@href>, name=>#<HTree::Location: doc()/a/@name>}
378
+ #
379
def attributes
  # Collect every (name, text) pair yielded by each_attribute into a new hash.
  table = {}
  each_attribute {|attr_name, value| table[attr_name] = value }
  table
end
386
+
387
def each_attr
  # Like each_attribute, but yields the universal name and the value's
  # to_s result instead of the name and text objects.
  each_attribute {|attr_name, value|
    yield attr_name.universal_name, value.to_s
  }
end
394
+
395
+ # call-seq:
396
+ # elem.fetch_attribute(name) -> text or raise IndexError
397
+ # elem.fetch_attribute(name, default) -> text or default
398
+ # elem.fetch_attribute(name) {|uname| default } -> text or default
399
+ #
400
+ # +fetch_attribute+ returns an attribute value as a text.
401
+ #
402
+ # elem may be an instance of HTree::Elem or a location that points to it.
403
def fetch_attribute(uname, *rest, &block)
  # Argument checks: at most one default, and never a default plus a block.
  raise ArgumentError, "wrong number of arguments (#{1+rest.length} for 2)" if rest.length > 1
  raise ArgumentError, "block supersedes default value argument" if block_given? && !rest.empty?
  uname = uname.universal_name if uname.respond_to? :universal_name
  # The fallback block runs only when the attribute is absent.
  update_attribute_hash.fetch(uname) {
    return yield(uname) if block_given?
    return rest[0] unless rest.empty?
    raise IndexError, "attribute not found: #{uname.inspect}"
  }
end
421
+
422
+ # call-seq:
423
+ # elem.fetch_attr(name) -> string or raise IndexError
424
+ # elem.fetch_attr(name, default) -> string or default
425
+ # elem.fetch_attr(name) {|uname| default } -> string or default
426
+ #
427
+ # +fetch_attr+ returns an attribute value as a string.
428
+ #
429
+ # elem may be an instance of HTree::Elem or a location that points to it.
430
def fetch_attr(uname, *rest, &block)
  # Argument checks: at most one default, and never a default plus a block.
  raise ArgumentError, "wrong number of arguments (#{1+rest.length} for 2)" if rest.length > 1
  raise ArgumentError, "block supersedes default value argument" if block_given? && !rest.empty?
  uname = uname.universal_name if uname.respond_to? :universal_name
  # A default (argument or block result) is returned from inside the
  # fallback block, so it bypasses the to_s below; only a found attribute
  # is converted to a string.
  found = update_attribute_hash.fetch(uname) {
    return yield(uname) if block_given?
    return rest[0] unless rest.empty?
    raise IndexError, "attribute not found: #{uname.inspect}"
  }
  found.to_s
end
448
+
449
def get_attribute(uname)
  # Accept either a universal-name string or an object that provides one.
  key = uname.respond_to?(:universal_name) ? uname.universal_name : uname
  update_attribute_hash[key]
end
453
+
454
# Returns the value of the attribute +uname+ as a string, or nil when the
# element has no such attribute.
#
# +uname+ may be a universal-name string or any object responding to
# +universal_name+, matching get_attribute, fetch_attribute and fetch_attr.
def get_attr(uname)
  uname = uname.universal_name if uname.respond_to? :universal_name
  text = update_attribute_hash[uname]
  text ? text.to_s : nil
end
461
+
462
+ end
463
+
464
+ end
465
+ # :startdoc: