yyyc514-syndication 0.6.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,531 @@
1
+ # Provides classes for parsing Atom web syndication feeds.
2
+ # See Syndication class for documentation.
3
+ #
4
+ # Copyright (c) mathew <meta@pobox.com> 2005-2006.
5
+ # Licensed under the same terms as Ruby.
6
+
7
+ require 'uri'
8
+ require 'rexml/parsers/streamparser'
9
+ require 'rexml/streamlistener'
10
+ require 'rexml/document'
11
+ require 'date'
12
+ require 'syndication/common'
13
+
14
+ module Syndication
15
+
16
+ # The Atom syndication format is defined at
17
+ # <URL:http://www.ietf.org/internet-drafts/draft-ietf-atompub-format-11.txt>.
18
+ # It is finalized, and should become an RFC soon.
19
+ #
20
+ # For an introduction, see "An overview of the Atom 1.0 Syndication Format"
21
+ # at <URL:http://www-128.ibm.com/developerworks/xml/library/x-atom10.html>
22
+ #
23
+ # For a comparison of Atom and RSS, see
24
+ # <URL:http://www.tbray.org/atom/RSS-and-Atom>
25
+ #
26
+ # To parse Atom feeds, use Syndication::Atom::Parser.
27
+ #
28
+ # The earlier Atom 0.3 format is partially supported; the 'mode' attribute
29
+ # is ignored and assumed to be 'xml' (as for Atom 1.0).
30
+ #
31
+ # Base64 encoded data in Atom 1.0 feeds is not supported (yet).
32
+ module Atom
33
+
34
# A value in an Atom feed which might be plain text, HTML, XHTML,
# or some other MIME type.

# TODO: Implement base64 support
# See http://ietfreport.isoc.org/all-ids/draft-ietf-atompub-format-11.txt
# section 4.1.3.3.

#:stopdoc:
# This object has to be handled specially; the parser feeds in all the
# REXML events, so the object can reconstruct embedded XML/XHTML.
# (Normally, the parser handles text buffering for a Container and
# calls store() when the container's element is closed.)
#:startdoc:
class Data < Container
  # The decoded data, if the type is not text or XML.
  # NOTE(review): nothing in this class assigns @data, so this reads nil
  # unless a superclass or caller sets it.
  attr_reader :data

  # Named character entities from the XHTML spec (Latin-1 range plus the
  # markup-significant ones), mapped to their Unicode code points.
  ENTITIES = {
    'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194,
    'acirc' => 226, 'acute' => 180, 'AElig' => 198,
    'aelig' => 230, 'Agrave' => 192, 'agrave' => 224,
    'amp' => 38, 'Aring' => 197, 'aring' => 229,
    'Atilde' => 195, 'atilde' => 227, 'Auml' => 196,
    'auml' => 228, 'brvbar' => 166, 'Ccedil' => 199,
    'ccedil' => 231, 'cedil' => 184, 'cent' => 162,
    'copy' => 169, 'curren' => 164, 'deg' => 176,
    'divide' => 247, 'Eacute' => 201, 'eacute' => 233,
    'Ecirc' => 202, 'ecirc' => 234, 'Egrave' => 200,
    'egrave' => 232, 'ETH' => 208, 'eth' => 240,
    'Euml' => 203, 'euml' => 235, 'frac12' => 189,
    'frac14' => 188, 'frac34' => 190, 'gt' => 62,
    'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206,
    'icirc' => 238, 'iexcl' => 161, 'Igrave' => 204,
    'igrave' => 236, 'iquest' => 191, 'Iuml' => 207,
    'iuml' => 239, 'laquo' => 171, 'lt' => 60,
    'macr' => 175, 'micro' => 181, 'middot' => 183,
    'nbsp' => 160, 'not' => 172, 'Ntilde' => 209,
    'ntilde' => 241, 'Oacute' => 211, 'oacute' => 243,
    'Ocirc' => 212, 'ocirc' => 244, 'Ograve' => 210,
    'ograve' => 242, 'ordf' => 170, 'ordm' => 186,
    'Oslash' => 216, 'oslash' => 248, 'Otilde' => 213,
    'otilde' => 245, 'Ouml' => 214, 'ouml' => 246,
    'para' => 182, 'plusmn' => 177, 'pound' => 163,
    'quot' => 34, 'raquo' => 187, 'reg' => 174,
    'sect' => 167, 'shy' => 173, 'sup1' => 185,
    'sup2' => 178, 'sup3' => 179, 'szlig' => 223,
    'THORN' => 222, 'thorn' => 254, 'times' => 215,
    'Uacute' => 218, 'uacute' => 250, 'Ucirc' => 219,
    'ucirc' => 251, 'Ugrave' => 217, 'ugrave' => 249,
    'uml' => 168, 'Uuml' => 220, 'uuml' => 252,
    'Yacute' => 221, 'yacute' => 253, 'yen' => 165,
    'yuml' => 255
  }

  # parent = parent object
  # tag = XML tag which caused creation of this object
  # attrs = XML attributes as a hash (may be nil); only 'type' is read here
  def initialize(parent, tag, attrs = nil)
    @tag = tag
    @parent = parent
    @type = 'text' # the default, as per the standard
    # attrs defaults to nil, so guard before indexing into it.
    @type = attrs['type'] if attrs && attrs['type']
    # Set once the wrapper <div> of XHTML content has been removed.
    @div_stripped = false
    # Only the buffer matching the declared type is created; the accessors
    # below rely on the other buffers being nil.
    case @type
    when 'xhtml'
      @xhtml = ''
    when 'html'
      @html = ''
    when 'text'
      @text = ''
    end
  end

  # Convert a text representation to HTML by escaping &, < and >.
  def text2html(text)
    html = text.gsub('&', '&amp;')
    html.gsub!('<', '&lt;')
    html.gsub!('>', '&gt;')
    return html
  end

  # Convert an HTML representation to text.
  # This is done by throwing away all tags and decoding the named
  # entities listed in ENTITIES (as UTF-8); named entities not in the
  # table are dropped. Numeric character references are left untouched.
  # Not ideal, but I can't think of a better simple approach.
  def html2text(html)
    text = html.gsub(/<[^>]*>/, '')
    text = text.gsub(/&(\w+);/) { |entity|
      # entity is the whole match, e.g. "&amp;"; the table is keyed by the
      # bare name, so strip the leading '&' and trailing ';'.
      code = ENTITIES[entity[1..-2]]
      code ? [code].pack('U') : ''
    }
    return text
  end

  # Return value of Data as HTML.
  # Returns nil if the type is none of html, xhtml or text.
  def html
    return @html if @html
    return @xhtml if @xhtml
    return text2html(@text) if @text
    return nil
  end

  # Return value of Data as plain text.
  # If the field started off as (X)HTML, this is done by ruthlessly
  # discarding markup (see html2text), so it is highly recommended that
  # you use the XHTML or HTML and convert to text in a more intelligent way.
  # Returns nil if the type is none of html, xhtml or text.
  def txt
    return @text if @text
    return html2text(@xhtml) if @xhtml
    return html2text(@html) if @html
    return nil
  end

  # Return value of Data as XHTML.
  # HTML content is returned as-is; no conversion to well-formed XHTML
  # is attempted. Returns nil if the type is none of html, xhtml or text.
  def xhtml
    return @xhtml if @xhtml
    return @html if @html
    return text2html(@text) if @text
    return nil
  end

  # Catch tag start events if we're collecting embedded XHTML.
  # Any "xhtml:" namespace prefix is removed from the tag name.
  # Note that the element's attributes are not reproduced in the output.
  def tag_start(tag, attrs = nil)
    if @type == 'xhtml'
      t = tag.sub(/^xhtml:/, '')
      @xhtml += "<#{t}>"
    else
      super
    end
  end

  # Catch tag end events if we're collecting embedded XHTML.
  # Returns the parent once this object's own element closes, otherwise
  # self (XHTML mode) or whatever the superclass returns.
  def tag_end(endtag, current)
    if @tag == endtag
      if @type == 'xhtml' && !@div_stripped
        # XHTML content arrives wrapped in a single <div>; remove it.
        # Anchor to the very start and end of the buffer so a nested
        # <div> that happens to sit at a line boundary is never touched.
        @xhtml.sub!(/\A\s*<div>\s*/, '')
        @xhtml.sub!(/\s*<\/div>\s*\z/, '')
        @div_stripped = true
      end
      return @parent
    end
    if @type == 'xhtml'
      t = endtag.sub(/^xhtml:/, '')
      @xhtml += "</#{t}>"
      return self
    else
      super
    end
  end

  # Store/buffer text in the appropriate internal field.
  # Text is ignored when the type is none of xhtml, html or text.
  def text(s)
    case @type
    when 'xhtml'
      @xhtml += s
    when 'html'
      @html += s
    when 'text'
      @text += s
    end
  end
end
194
+
195
# A hypertext link from an Atom feed or entry to some other resource,
# for example the rel=self link giving the canonical URL of the feed.
class Link < Container
  # The URI of the link.
  attr_accessor :href
  # The type of relationship the link expresses.
  attr_accessor :rel
  # The type of object at the other end of the link.
  attr_accessor :type
  # The title for the link.
  attr_accessor :title
  # The length of the linked-to object in bytes.
  attr_accessor :length

  # parent = parent object
  # tag = XML tag which caused creation of this object
  # attrs = XML attributes as a hash; each pair is handed to store()
  def initialize(parent, tag, attrs = nil)
    @tag = tag
    @parent = parent
    return unless attrs
    attrs.each_pair do |name, value|
      self.store(name, value)
    end
  end
end
214
+
215
# XML or text content of an entry.
#
# Every parser event received while inside the element is re-serialized
# into #xml. The overrides below do not call the Data implementations, so
# the html/xhtml/txt views inherited from Data are not filled in.
class Content < Data
  attr_accessor :xml # The raw XML contents of the content element.

  # parent = parent object
  # tag = XML tag which caused creation of this object
  # attrs = XML attributes as a hash (may be nil)
  def initialize(parent, tag = nil, attrs = nil)
    @xml = ""
    # Data#initialize indexes into attrs, so never hand it nil.
    super(parent, tag, attrs || {})
  end

  # Child objects are not kept: the content is held only as raw XML,
  # so there is nothing to store.
  def store(tag, obj)
  end

  # Append an opening tag, with its attributes if any, to the raw XML.
  def tag_start(tag, attrs = nil)
    if attrs.nil? || attrs.empty?
      @xml += "<#{tag}>"
    else
      attrlist = attrs.keys.map { |a| "#{a}=\"#{attrs[a]}\"" }.join(" ")
      @xml += "<#{tag} #{attrlist}>"
    end
  end

  # Append a closing tag to the raw XML, or hand control back to the
  # parent when the content element itself is closed.
  def tag_end(endtag, current)
    # Comparison, not assignment: only our own end tag finishes the object.
    return @parent if @tag == endtag
    @xml += "</#{endtag}>"
    return self
  end

  # Append a text fragment to the raw XML.
  def text(s)
    @xml += s
  end

end
260
+
261
# A person, corporation or similar entity within an Atom feed.
class Person < Container
  # Human-readable name of person.
  attr_accessor :name
  # URI associated with the person.
  attr_accessor :uri
  # RFC2822 e-mail address of person.
  attr_accessor :email

  # Atom 0.3 called the element 'url'; treat it as another way of
  # setting #uri.
  alias_method :url=, :uri=
end
272
+
273
# A category (keyword) in an Atom feed.
# For convenience, Category#to_s gives Category#label, falling back to
# Category#term when the category has no label.
class Category < Container
  # The category itself, possibly encoded.
  attr_accessor :term
  # A human-readable version of Category#term.
  attr_accessor :label
  # URI to the schema definition.
  attr_accessor :scheme

  #:stopdoc:
  # parent = parent object
  # tag = XML tag which caused creation of this object
  # attrs = XML attributes as a hash
  def initialize(parent, tag, attrs = nil)
    @tag = tag
    @parent = parent
    if attrs
      attrs.each_pair {|key, value|
        self.store(key, value)
      }
    end
  end

  # Always returns a String: the label when there is one, otherwise the
  # term, otherwise "". (A plain alias of label would return nil for a
  # category without a label.)
  def to_s
    (label || term).to_s
  end
  #:startdoc:
end
300
+
301
# Represents a parsed Atom feed, as returned by Syndication::Atom::Parser.
#
# The collection readers (entries, categories, contributors, links)
# return nil until at least one item of that kind has been added.
class Feed < Container
  # Title of feed as a Syndication::Atom::Data object.
  attr_accessor :title
  # Subtitle of feed as a Syndication::Atom::Data object.
  attr_accessor :subtitle
  # Last update time, accepts an ISO8601 date/time as per the Atom spec.
  attr_writer :updated
  # Software which generated feed as a String.
  attr_accessor :generator
  # URI of icon to represent channel as a String.
  attr_accessor :icon
  # Globally unique ID of feed as a String.
  attr_accessor :id
  # URI of logo for channel as a String.
  attr_accessor :logo
  # Copyright or other rights information as a String.
  attr_accessor :rights
  # Author of feed as a Syndication::Atom::Person object.
  attr_accessor :author
  # Array of Syndication::Atom::Entry objects representing the entries
  # in the feed.
  attr_reader :entries
  # Array of Syndication::Atom::Category objects representing taxonomic
  # categories for the feed.
  attr_reader :categories
  # Array of Syndication::Atom::Person objects representing contributors.
  attr_reader :contributors
  # Array of Syndication::Atom::Link objects representing various types
  # of link.
  attr_reader :links
  # Atom 0.3 info element (obsolete)
  attr_accessor :info

  # Atom 0.3 compatibility: the old element names are extra writers for
  # the Atom 1.0 fields.
  alias_method :tagline=, :subtitle=
  alias_method :copyright=, :rights=
  alias_method :modified=, :updated=

  # Add a Category value to the feed
  def category=(obj)
    (@categories ||= []).push(obj)
  end

  # Add an Entry to the feed
  def entry=(obj)
    (@entries ||= []).push(obj)
  end

  # Add a Person contributor to the feed
  def contributor=(obj)
    (@contributors ||= []).push(obj)
  end

  # Add a Link to the feed
  def link=(obj)
    (@links ||= []).push(obj)
  end

  # Last update date/time, as produced by parse_date from the stored value.
  def updated
    parse_date(@updated)
  end
end
386
+
387
# An entry within an Atom feed.
#
# The collection readers (categories, links, contributors) return nil
# until at least one item of that kind has been added.
class Entry < Container
  # Title of entry.
  attr_accessor :title
  # Summary of content.
  attr_accessor :summary
  # Source feed metadata as Feed object.
  attr_accessor :source
  # Last update date/time; read back through #updated.
  attr_writer :updated
  # Publication date/time; read back through #published.
  attr_writer :published
  # Author of entry as a Person object.
  attr_accessor :author
  # Copyright or other rights information.
  attr_accessor :rights
  # Globally unique ID of Entry.
  attr_accessor :id
  # Array of taxonomic categories for feed.
  attr_reader :categories
  # Array of Link objects.
  attr_reader :links
  # Array of Person objects representing contributors.
  attr_reader :contributors
  # Atom 0.3 creation date/time (obsolete); read back through #created.
  attr_writer :created
  # Content element as Atom::Content object
  attr_accessor :content

  # Atom 0.3 compatibility: the old element names are extra writers for
  # the Atom 1.0 fields.
  alias_method :modified=, :updated=
  alias_method :issued=, :published=
  alias_method :copyright=, :rights=

  # Add a Category object to the entry
  def category=(obj)
    (@categories ||= []).push(obj)
  end

  # Add a Person to the entry to represent a contributor
  def contributor=(obj)
    (@contributors ||= []).push(obj)
  end

  # Add a Link to the entry
  def link=(obj)
    (@links ||= []).push(obj)
  end

  # The last update date/time, run through parse_date
  def updated
    parse_date(@updated)
  end

  # The date/time of publication, run through parse_date
  def published
    parse_date(@published)
  end

  # The date/time of creation (Atom 0.3, obsolete), run through parse_date
  def created
    parse_date(@created)
  end
end
475
+
476
# A parser for Atom feeds.
# This specializes AbstractParser (presumably defined in
# syndication/common -- verify), which supplies everything not
# overridden here.
class Parser < AbstractParser
  include REXML::StreamListener

  #:stopdoc:
  # Maps each tag that needs an object of its own to the class used to
  # build that object.
  CLASS_FOR_TAG = {
    'entry' => Entry,
    'author' => Person,
    'contributor' => Person,
    'title' => Data,
    'subtitle' => Data,
    'summary' => Data,
    'link' => Link,
    'source' => Feed,
    'category' => Category,
    'content' => Content
  }

  # Called when REXML finds a text fragment.
  # Data objects (and subclasses) get the text passed straight through,
  # since they rebuild embedded XHTML from the raw event stream.
  # Otherwise the text is appended to the buffer on top of the text
  # stack, if there is one.
  def text(s)
    if @current_object.is_a?(Data)
      @current_object.text(s)
      return
    end
    buffer = @textstack.last
    buffer << s if buffer
  end
  #:startdoc:

  # Reset the parser ready to parse a new feed.
  def reset
    # Atom-specific state: the tag-to-class table and a fresh, empty Feed
    # as the root of the parse tree.
    @class_for_tag = CLASS_FOR_TAG
    @parsetree = Feed.new(nil)
    # Everything else is common to both kinds of parser
    super
  end

  # The most recently parsed feed as a Syndication::Atom::Feed object.
  def feed
    @parsetree
  end

end
530
+ end
531
+ end