syndication 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,326 @@
1
+ # This module provides classes and methods for parsing RSS web syndication
2
+ # feeds.
3
+ #
4
+ # Copyright © mathew <meta@pobox.com> 2005.
5
+ # Licensed under the same terms as Ruby.
6
+
7
+ require 'uri'
8
+ require 'rexml/parsers/streamparser'
9
+ require 'rexml/streamlistener'
10
+ require 'rexml/document'
11
+ require 'date'
12
+ require 'syndication/common'
13
+
14
+ module Syndication
15
class Container
  # Shared setter for RSS objects that accept <category> elements.
  #
  # A String argument is appended to the @category list, which is created
  # the first time it is needed. Any argument that is not a String is
  # ignored and nil is returned.
  def store_category(cat)
    return unless cat.kind_of?(String)

    @category ||= []
    @category << cat
  end
end
28
+
29
+ # RSS is a method of syndicating web site content.
30
+ #
31
+ # There are nine different versions of RSS; see
32
+ # <URL:http://diveintomark.org/archives/2004/02/04/incompatible-rss>
33
+ #
34
+ # This code attempts to parse all of them, and provide the same API via
35
+ # the same data model regardless of the particular flavor of RSS fed in.
36
+ #
37
+ # One thing to be aware of is that RSS 0.9x and 2.0x have no mechanism for
38
+ # indicating the type of text in a description, whether plain text or HTML.
39
+ # As a result, this library leaves it to you to write code to 'sniff'
40
+ # the data returned and decide whether you think it looks like text or HTML.
41
+ #
42
+ # RSS 1.0 solves the problem via the content module, which is supported
43
+ # via Syndication::Content. Atom solves the problem too.
44
+ module RSS
45
+
46
+ # Represents an individual story or entry in an RSS feed.
47
class Item < Container
  # Title of the item (String).
  attr_accessor :title
  # URL of the item (String).
  attr_accessor :link
  # Textual description of the item (String).
  attr_accessor :description
  # E-mail address of the item's author.
  attr_accessor :author
  # Categories attached to the item, as an Array of Strings.
  # Assignment goes through store_category, so each assigned String is
  # appended to the list rather than replacing it.
  attr_reader :category
  alias category= store_category
  # URL for feedback on this item (String).
  attr_accessor :comments
  # Media object attached to the item, as a Syndication::Enclosure.
  attr_accessor :enclosure
  # Globally unique identifier for this item (String).
  attr_accessor :guid
  # Publication date for this item. Accepts anything DateTime can parse,
  # which includes the RFC822-style dates specified by the RSS standards.
  attr_writer :pubdate
  # RSS channel this item was copied from, used to give credit for copied
  # links. A URL String.
  attr_accessor :source

  # Publication date, obtained by handing the stored value to parse_date
  # (defined outside this file): a DateTime when the value parses,
  # otherwise the original string.
  def pubdate
    parse_date(@pubdate)
  end
end
79
+
80
+ # Used to represent graphical images provided in an RSS feed, with the
81
+ # intent that they be used to represent the channel in a graphical user
82
+ # interface, or on a web page.
83
+ #
84
+ # Typically found via Syndication::Channel#image
85
class Image < Container
  # URL of the image.
  attr_accessor :url
  # Title of the image, for use as ALT text.
  attr_accessor :title
  # Link to follow when the image is clicked.
  attr_accessor :link
  # Image width in pixels, as an integer.
  attr_reader :width
  # Image height in pixels, as an integer.
  attr_reader :height

  # Set the width in pixels. Only String input is accepted (converted with
  # to_i); any other value leaves the stored width untouched.
  def width=(x)
    @width = x.to_i if x.kind_of?(String)
  end

  # Set the height in pixels. Only String input is accepted (converted with
  # to_i); any other value leaves the stored height untouched.
  def height=(x)
    @height = x.to_i if x.kind_of?(String)
  end
end
111
+
112
+ # Represents a text input box to be used in association with an RSS feed, for
113
+ # example a search box or e-mail subscription input box.
114
+ #
115
+ # Typically found via Syndication::Channel#textinput method.
116
class TextInput < Container
  # Label for the Submit button of the text input area.
  attr_accessor :title
  # Label explaining the purpose of the text input area.
  attr_accessor :description
  # Name of the text object in the input area, used for form submission.
  attr_accessor :name
  # URL the data is submitted to via HTTP POST.
  attr_accessor :link
end
125
+
126
+ # Represents metadata about an RSS feed as a whole.
127
+ # Typically found via the Syndication::RSS::Feed#channel method.
128
class Channel < Container
  # The title of the channel.
  attr_accessor :title
  # The URL of the web site this is a channel for.
  attr_accessor :link
  # A textual description of the channel.
  attr_accessor :description
  # Copyright statement for the channel.
  attr_accessor :copyright
  # ISO code for the language the channel is written in.
  attr_accessor :language
  # E-mail address of the person responsible for editorial content.
  attr_accessor :managingeditor
  # E-mail address of the person responsible for technical issues with the feed.
  attr_accessor :webmaster
  # Publication date of content in the channel.
  attr_writer :pubdate
  # Last time content in the channel changed.
  attr_writer :lastbuilddate
  # The graphical image representing the channel, as a Syndication::Image
  # object.
  attr_accessor :image
  # One or more categories for the channel, as an Array of Strings.
  # Only a reader is generated here: the writer is store_category, aliased
  # on the next line, so each assigned String is appended to the list.
  attr_reader :category
  alias category= store_category
  # The software that generated the channel.
  attr_accessor :generator
  # The URL of some documentation on what the RSS format is.
  attr_accessor :docs
  # Time to live for this copy of the channel.
  attr_accessor :ttl
  # rssCloud interface (for Radio UserLand).
  attr_accessor :cloud
  # PICS rating for the channel.
  attr_accessor :rating
  # The TextInput area as a Syndication::TextInput object.
  attr_accessor :textinput
  # Hours when the feed can be skipped (because it will not have new content).
  # Returned as an Array of values in the range 0..23 (even if parsing the
  # UserLand variant of RSS 0.91).
  attr_reader :skiphours
  # Full names (in English) of days when the feed can be skipped.
  attr_reader :skipdays

  # Publication date of content in the channel, obtained by handing the
  # stored value to parse_date (defined outside this file): a DateTime when
  # it can be parsed, otherwise a String.
  def pubdate
    parse_date(@pubdate)
  end

  # Last time content in the channel changed, obtained by handing the
  # stored value to parse_date (defined outside this file): a DateTime when
  # it can be parsed, otherwise a String.
  def lastbuilddate
    parse_date(@lastbuilddate)
  end

  # Add an hour to the list of hours to skip.
  #
  # The <hour> element in fact comes inside <skipHours>, but we don't enforce
  # that; we just make the Channel recognize it and store the values.
  # Non-String arguments are ignored.
  def hour=(hr)
    return unless hr.kind_of?(String)

    h = hr.to_i
    # An hour given as 24 is stored as 0 so values stay within 0..23.
    (@skiphours ||= []) << (h == 24 ? 0 : h)
  end

  # Add a day name to the list of days to skip.
  #
  # The <day> element in fact comes inside <skipDays>, but we don't enforce
  # that; we just make the Channel recognize it and store the values.
  # Non-String arguments are ignored.
  def day=(dayname)
    return unless dayname.kind_of?(String)

    (@skipdays ||= []) << dayname
  end
end
211
+
212
+ # The <cloud> element is very rarely used. It was added to the RSS standards
213
+ # to support the rssCloud protocol of Radio UserLand.
214
class Cloud < Container
  # Hostname to connect to.
  attr_accessor :domain
  # TCP/IP port number.
  attr_reader :port
  # Request path.
  attr_accessor :path
  # Registration method.
  attr_accessor :registerprocedure
  # Protocol to use.
  attr_accessor :protocol

  # Set the port number; the value is converted with to_i.
  def port=(x)
    @port = x.to_i
  end

  # Records the tag and parent, then, when attributes are supplied, passes
  # every key/value pair to store (defined outside this file).
  def initialize(parent, tag, attrs = nil)
    @tag = tag
    @parent = parent
    return unless attrs

    attrs.each_pair do |key, value|
      store(key, value)
    end
  end
end
241
+
242
+ # Represents a multimedia enclosure in an RSS item.
243
+ # Typically found as Syndication::Item#enclosure
244
class Enclosure < Container
  # URL of the multimedia file.
  attr_accessor :url
  # MIME type of the file.
  attr_accessor :type
  # Length of the file, in bytes.
  attr_reader :length

  # Set the length in bytes; the value is converted with to_i.
  def length=(x)
    @length = x.to_i
  end

  # Records the tag and parent, then, when attributes are supplied, passes
  # every key/value pair to store (defined outside this file).
  def initialize(parent, tag, attrs = nil)
    @tag = tag
    @parent = parent
    return unless attrs

    attrs.each_pair do |key, value|
      store(key, value)
    end
  end
end
267
+
268
+ # Represents a parsed RSS feed, as returned by Syndication::RSS::Parser.
269
class Feed < Container
  # Channel metadata and contents of the feed, as a Syndication::Channel
  # object.
  attr_accessor :channel
  # Items in the feed, as an Array of Syndication::Item objects.
  # Remains nil until the first item has been added.
  attr_reader :items
  # Text input area, as a Syndication::TextInput object.
  attr_accessor :textinput
  # Image for the feed, as a Syndication::Image object.
  attr_accessor :image

  # Append an item to the feed, creating the item list on first use.
  def item=(obj)
    (@items ||= []).push(obj)
  end
end
288
+
289
+ # A parser for RSS feeds.
290
+ # See Syndication::Parser in common.rb for the abstract class this
291
+ # specializes.
292
class Parser < AbstractParser
  include REXML::StreamListener

  #:stopdoc:
  # Maps each tag name that requires a new object to the class used to
  # build that object.
  CLASS_FOR_TAG = {
    'item'      => Item,
    'entry'     => Item,
    'image'     => Image,
    'channel'   => Channel,
    'cloud'     => Cloud,
    'textinput' => TextInput,
    'textInput' => TextInput,
    'enclosure' => Enclosure
  }
  #:startdoc:

  # Reset the parser so it is ready to parse a new feed.
  def reset
    # Start from an empty RSS::Feed as the parse tree.
    @parsetree = Feed.new(nil)
    # Install the RSS tag-to-class table.
    @class_for_tag = CLASS_FOR_TAG
    # The remaining reset work is handled by the superclass.
    super
  end

  # The most recently parsed feed, as a Syndication::RSS::Feed object.
  def feed
    @parsetree
  end
end
325
+ end
326
+ end
@@ -0,0 +1,45 @@
1
+
2
+ require 'date'
3
+
4
module Syndication

  # Mixin for RSS 1.0 syndication data (draft standard for RSS 1.0).
  #
  # If you require 'syndication/syndication' these methods are added to the
  # Syndication::Channel class.
  #
  # Access methods are named after the XML elements, prefixed with sy_.
  #
  module Syndication
    # The period over which the channel is updated. Allowed values are
    # 'hourly', 'daily', 'weekly', 'monthly', 'yearly'. If omitted, 'daily'
    # is assumed.
    attr_accessor :sy_updateperiod

    # Frequency of updates, in relation to sy_updateperiod. Indicates how many
    # times in each sy_updateperiod the channel is updated. For example,
    # sy_updateperiod = 'daily' and sy_updatefrequency = 4 means four times
    # per day.
    attr_accessor :sy_updatefrequency

    # Base date used to calculate publishing times. When combined with
    # sy_updateperiod and sy_updatefrequency, the publishing schedule can
    # be derived. Returned as a DateTime if possible, otherwise as a String.
    attr_reader :sy_updatebase

    # Set the base date. The value is parsed with DateTime.parse; when it
    # cannot be parsed the original value is stored unchanged.
    #
    # DateTime.parse signals failure by raising rather than returning nil,
    # so the fallback has to be a rescue: ArgumentError covers unparseable
    # strings, TypeError covers non-String input such as nil.
    def sy_updatebase=(x)
      @sy_updatebase =
        begin
          DateTime.parse(x)
        rescue ArgumentError, TypeError
          x
        end
    end
  end

  #:enddoc:
  class Channel
    include Syndication
  end

end
@@ -0,0 +1,186 @@
1
+
2
+ require 'syndication/atom'
3
+ require 'test/unit'
4
+
5
+ module Syndication
6
+
7
+ # This class contains the unit tests for the Syndication module.
8
+ class Tests < Test::Unit::TestCase
9
+
10
+ # A set of minimal assertions that can be applied to every well-formed parsed
11
+ # feed.
12
def baseline_assertions(feed)
  # Feed-level checks: the parser produced an Atom feed with a title, an
  # id, and an updated timestamp held as a DateTime.
  assert_not_nil(feed, 'Parser returned nil')
  assert_kind_of(Syndication::Atom::Feed, feed)
  assert_not_nil(feed.title, 'Feed#title was nil')
  assert_not_nil(feed.id, 'Feed#id was nil')
  assert_not_nil(feed.updated, 'Feed#updated was nil')
  assert_kind_of(DateTime, feed.updated)
  assert(feed.entries.length > 0, 'No entries in feed')

  # Entry-level checks: each entry has a title, an id, at least one link,
  # and an updated timestamp held as a DateTime.
  feed.entries.each do |entry|
    assert_not_nil(entry.title, 'Entry#title was nil')
    assert_not_nil(entry.id, 'Entry#id was nil')
    assert(entry.links.length > 0, 'No links in entry')
    assert_not_nil(entry.links[0], 'Entry#links[0] was nil')
    assert_not_nil(entry.updated, 'Entry#updated was nil')
    assert_kind_of(DateTime, entry.updated)
  end
end
29
+
30
+ # Minimal test
31
def test_atom_minimal
  # Smallest feed exercised here: feed title/updated/id plus a single entry
  # carrying title, link, id and updated.
  atom_source = <<-ATOM_XML
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title>One good turn usually gets most of the blanket.</title>
  <updated>2005-08-20T21:14:38Z</updated>
  <id>urn:uuid:035d3aa3022c1b1b2a17e37ae2dcc376</id>
  <entry>
    <title>Quidquid latine dictum sit, altum viditur.</title>
    <link href="http://example.com/05/08/20/2114.html"/>
    <id>urn:uuid:89d96d76a99426264f6f1f520c1b93c2</id>
    <updated>2005-08-20T21:14:38Z</updated>
  </entry>
</feed>
ATOM_XML
  parser = Syndication::Atom::Parser.new
  feed = parser.parse(atom_source)
  baseline_assertions(feed)

  # Feed-level values.
  assert(feed.title.txt == 'One good turn usually gets most of the blanket.')
  assert(feed.updated.strftime('%F %T') == '2005-08-20 21:14:38')
  assert(feed.entries.length == 1, 'Wrong number of entries in feed')
  assert(feed.id == 'urn:uuid:035d3aa3022c1b1b2a17e37ae2dcc376')

  # Values of the single entry and its only link.
  entry = feed.entries[0]
  assert(entry.title.txt == 'Quidquid latine dictum sit, altum viditur.')
  assert(entry.links.length == 1, 'Wrong number of links in entry')
  link = entry.links[0]
  assert(link.href == 'http://example.com/05/08/20/2114.html')
  assert(entry.id == 'urn:uuid:89d96d76a99426264f6f1f520c1b93c2')
  assert(entry.updated.strftime('%F %T') == '2005-08-20 21:14:38')
end
60
+
61
+ # Test a well-formed Atom feed with all possible elements
62
def test_atom_wf_full
  # Well-formed feed using feed-level metadata (author, categories,
  # contributors, generator, icon, link, logo, rights, XHTML subtitle) and
  # two entries, the second of which carries a <source> element.
  xml = <<-EOF
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title type="text">It is the quality rather than the quantity that matters.</title>
  <updated>2005-08-20T21:43:44Z</updated>
  <id>urn:uuid:dc03a676cc5f04b9f0c728592270c8b7</id>
  <author>
    <name>mathew</name>
    <email>meta@pobox.com</email>
    <uri>http://www.pobox.com/~meta/</uri>
  </author>
  <category term="test"/>
  <category term="Ruby"/>
  <contributor>
    <name>Phil Space</name>
    <email>space@example.com</email>
  </contributor>
  <contributor>
    <name>Anne Example</name>
    <email>anne@example.com</email>
  </contributor>
  <generator uri="http://example.com/ruby/syndication" version="1.0">
    Ruby Syndication Library
  </generator>
  <icon>http://www.example.com/goatseicon.gif</icon>
  <link rel="self" type="application/ruby" href="file://atom.rb"/>
  <logo>http://www.example.com/goatse.jpg</logo>
  <rights>Copyright (c) meta@pobox.com 2005</rights>
  <subtitle type="xhtml">
    <div xmlns="http://www.w3.org/1999/xhtml">
      <p>This is <b>XHTML</b> content.</p>
    </div>
  </subtitle>
  <entry>
    <title>Cleanliness is next to impossible.</title>
    <summary type="xhtml">
      <xhtml:div xmlns:xhtml="http://www.w3.org/1999/xhtml">
        This is <xhtml:b>XHTML</xhtml:b> content.
      </xhtml:div>
    </summary>
    <link href="http://example.com/05/08/20/2143.html"/>
    <id>urn:uuid:380b651e97c2e6ecc68eaa66c90939b6</id>
    <published>1978-03-12T10:22:11Z</published>
    <updated>2005-08-20T21:43:44Z</updated>
    <author>
      <name>Stu Dapples</name>
      <email>stu@example.com</email>
    </author>
    <category term="fortune"/>
    <category term="aphorism"/>
    <content type="text">
      Cleanliness of code is certainly next to impossible if you have to parse
      Atom feeds with all their features.
    </content>
    <contributor>
      <name>Ben Dover</name>
    </contributor>
    <contributor>
      <name>Eileen Dover</name>
    </contributor>
    <rights>This test entry is in the public domain.</rights>
  </entry>
  <entry>
    <title type="html">&lt;b>WE HAVE TACOS&lt;/b></title>
    <link href="http://www.pobox.com/~meta/"/>
    <id>urn:uuid:13be6c856fac98d9a7fd144b61dee06d</id>
    <updated>2004-12-23T21:22:23-06:00</updated>
    <source>
      <author><name>Rick O'Shea</name></author>
      <category term="example"/>
      <contributor><name>Hugh Cares</name></contributor>
      <generator uri="http://www.pobox.com/~meta/" version="1">
        Typed in by hand by some poor guy.
      </generator>
      <icon>http://www.example.com/icon2.png</icon>
      <id>urn:uuid:1234decafbad7890deadbeef5678304</id>
      <link rel="alternate" type="text/html"
        href="http://www.pobox.com/~meta/"/>
      <logo>http://www.example.com/logo.svg</logo>
      <rights>Some rights reserved, some not</rights>
      <title>More example stuff</title>
      <subtitle>MAKE IT STOP!</subtitle>
      <updated>2005-08-20T22:11-05:00</updated>
    </source>
  </entry>
</feed>
EOF
  f = Syndication::Atom::Parser.new.parse(xml)
  baseline_assertions(f)

  # Feed-level collections.
  assert(f.categories.length == 2)
  assert(f.contributors.length == 2)
  assert(f.contributors[0].name == 'Phil Space', "Feed#contributors name didn't match")
  assert(f.contributors[1].name == 'Anne Example', "Feed#contributors name didn't match")
  # These two checks compare with ==; a single = would assign the term and
  # make the assertion pass unconditionally.
  assert(f.categories[0].term == 'test', "Feed#categories didn't match")
  assert(f.categories[1].term == 'Ruby', "Feed#categories didn't match")

  # Feed-level scalar metadata.
  assert(f.title.txt == 'It is the quality rather than the quantity that matters.')
  assert(f.updated == DateTime.parse('2005-08-20 21:43:44Z'), 'Feed#updated incorrectly parsed')
  assert(f.author.name == 'mathew')
  assert(f.author.email == 'meta@pobox.com')
  assert(f.author.uri == 'http://www.pobox.com/~meta/')
  assert(f.generator == 'Ruby Syndication Library')
  assert(f.icon == 'http://www.example.com/goatseicon.gif')
  assert(f.links.length == 1)
  assert(f.links[0].rel == 'self')
  assert(f.links[0].href == 'file://atom.rb')
  assert(f.links[0].type == 'application/ruby')
  assert(f.logo == 'http://www.example.com/goatse.jpg')
  assert(f.rights == 'Copyright (c) meta@pobox.com 2005')
  assert(f.subtitle.xhtml == '<p>This is <b>XHTML</b> content.</p>')
  assert(f.entries.length == 2)

  # First entry: XHTML summary and categories.
  e1 = f.entries[0]
  assert(e1.summary.xhtml == 'This is <b>XHTML</b> content.')
  assert(e1.categories.length == 2)
  assert(e1.categories[0].term == 'fortune')
  assert(e1.categories[1].term == 'aphorism')

  # Second entry: HTML title and the nested <source> feed.
  e2 = f.entries[1]
  assert(e2.title.html == '<b>WE HAVE TACOS</b>')
  s = e2.source
  assert(s.kind_of?(Syndication::Atom::Feed))
  assert(s.title.txt == 'More example stuff')
  assert(s.updated == DateTime.parse('2005-08-20 22:11:00-0500'))
end
185
+ end
186
+ end