syndication 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,267 @@
1
+ # The file common.rb contains code common to both Atom and RSS parsing.
2
+ #
3
+ # Copyright © mathew <meta@pobox.com> 2005.
4
+ # Licensed under the same terms as Ruby.
5
+
6
+ require 'uri'
7
+ require 'rexml/parsers/streamparser'
8
+ require 'rexml/streamlistener'
9
+ require 'rexml/document'
10
+ require 'date'
11
+
12
+ # To parse Atom feeds, use Syndication::Atom::Parser.
13
+ # To parse RSS feeds, use Syndication::RSS::Parser.
14
+ module Syndication
15
+
16
+ # A Container is an object in the parse tree that stores data, and possibly
17
+ # other objects. Its naming and behavior is an internal detail, not part
18
+ # of the API, and hence subject to change.
19
+ #
20
+ # In other words, to use the library you don't have to know about anything
21
+ # below.
22
class Container

  # Convert a tag (possibly with a namespace prefix) to the name of the
  # setter method used to store its value: the tag is lower-cased, the
  # first ':' becomes '_', and '=' is appended.
  def tag2method(tag)
    "#{tag.downcase.sub(/:/, '_')}="
  end

  # Create a container.
  # parent is the new container's parent object in the final parse tree.
  # tag is the XML tag which caused creation of the container.
  # attrs is a hash of {attr => value} of the XML attributes in the tag;
  # this base implementation ignores it.
  def initialize(parent, tag = nil, attrs = nil)
    @parent = parent
    @tag = tag
  end

  # Handle a start tag and attributes.
  # Checks to see if self has a setter with the appropriate name.
  # If so, the attributes are sent to it whenever attrs is truthy (note
  # that an empty Hash is truthy in Ruby), and the setter name is recorded
  # in @current_method.
  # Returns the setter name, or nil if self has no such setter.
  def tag_start(tag, attrs = nil)
    setter = tag2method(tag)
    return unless respond_to?(setter)
    send(setter, attrs) if attrs
    @current_method = setter
  end

  # Handle an end tag, and return what the new current object should be.
  #
  # If the tag matches the one we were created with, this container is
  # complete and the new current object is its parent.
  #
  # If there's no parent (i.e. this is the top level container in the
  # parse tree), the new current object must be unchanged.
  #
  # Otherwise, pass the end tag up to the parent to see if it can do
  # anything with it.
  def tag_end(endtag, current)
    return @parent if @tag == endtag
    return current if @parent.nil?
    @parent.tag_end(endtag, current)
  end

  # Store an object in the parse tree, either in self, or in one of self's
  # ancestors. If nobody in the chain has a matching setter the object is
  # silently dropped.
  def store(tag, obj)
    setter = tag2method(tag)
    if respond_to?(setter)
      send(setter, obj)
    elsif @parent
      @parent.store(tag, obj)
    end
  end

  # Parse a date field on demand. DateTime.parse is slow, so don't call
  # it unless you really have to.
  #
  # Returns nil for a nil/false field, a DateTime for a String that parses,
  # and the value unchanged otherwise (non-String values, or Strings that
  # DateTime.parse rejects).
  def parse_date(field)
    return nil unless field
    return field unless field.kind_of?(String)
    begin
      DateTime.parse(field)
    rescue ArgumentError
      # DateTime.parse raises (rather than returning nil) on text it cannot
      # interpret; fall back to the raw string instead of aborting.
      field
    end
  end

  # Strip the parent field from a container, used to make a container
  # more amenable to pretty-printing. Returns self.
  def strip
    @parent = nil
    self
  end
end
106
+
107
+ # Shared parts of parser code for Atom and RSS. This is an abstract class;
108
+ # Atom::Parser and RSS::Parser are the concrete classes which actually parse
109
+ # syndication feeds.
110
+ #
111
+ # You don't need to know about anything below in order to use the library.
112
+ #
113
+ # The basic parsing strategy is:
114
+ #
115
+ # - The parser keeps a current_object pointer which represents the object
116
+ # in the parse tree that corresponds to where we are in the XML tree. To
117
+ # use a metaphor, it's the object where parse tree growth is occurring.
118
+ #
119
+ # - REXML dispatches events to the parser representing start and end tags and
120
+ # text. The parser sends the events to the current_object, which replies with
121
+ # what the new current_object should be after the event has been dealt with.
122
+ #
123
+ # - The job of creating child objects when appropriate is handled by the
124
+ # objects of the parse tree.
125
+ #
126
+ # - Reflection is used to store data in the parse tree. Accessor names are
127
+ # derived from tags in a standard way once namespaces have been standardized.
128
class AbstractParser
  include REXML::StreamListener

  # A Hash of namespace URLs the module knows about, returning the standard
  # prefix to remap to.
  KNOWN_NAMESPACES = {
    'http://purl.org/dc/elements/1.1/' => 'dc',
    'http://purl.org/dc/terms/' => 'dcterms',
    'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf',
    'http://purl.org/rss/1.0/modules/content/' => 'content',
    'http://www.itunes.com/DTDs/Podcast-1.0.dtd' => 'itunes',
    'http://www.w3.org/1999/xhtml' => 'xhtml'
  }

  # Create a new AbstractParser. The optional argument consists of text to
  # parse.
  def initialize(text = nil)
    reset
    # Retained from the original code; nothing in this class reads it.
    @tag_to_class = Hash.new
    # Mapping from (lower-cased) tags to the classes to instantiate for
    # them. tag_start reads this hash, so it must exist even before a
    # concrete parser fills it; ||= keeps any mapping already installed.
    @class_for_tag ||= Hash.new
    parse(text) if text
  end

  # Catch any stuff that drops right through the parse tree, and simply
  # ignore it.
  def store(tag, obj)
  end

  # Catch and ignore closing tags that don't match anything open.
  # NOTE(review): Container#tag_end forwards unmatched end tags to
  # @parent.tag_end(endtag, current), not to end_tag -- confirm whether
  # anything actually calls this method.
  def end_tag(tag, current)
    return current
  end

  # Reset the parser ready to parse a new feed.
  def reset
    # @parsetree is set up by the concrete classes
    @current_object = @parsetree
    @tagstack = Array.new
    @textstack = Array.new
    @xhtml = ''
    @xhtmlmode = false
    @namespacemap = Hash.new
  end

  # Parse the text provided. Returns a Syndication::Atom::Feed or
  # Syndication::RSS::Feed object, according to which concrete Parser
  # class is being used.
  def parse(text)
    REXML::Document.parse_stream(text, self)
    @parsetree
  end

  # Handle namespace translation for a raw tag.
  #
  # When attrs are supplied and the tag is rss, *:rdf or *:div, every
  # xmlns:prefix attribute is registered via define_namespace. Then, if the
  # tag carries a prefix we have a mapping for, the prefix is replaced by
  # our canonical one. Returns the (possibly rewritten) tag.
  def handle_namespace(tag, attrs = nil)
    if attrs && tag.match(/^(rss|\w+:rdf|\w+:div)$/i)
      attrs.keys.each do |key|
        declaration = key.match(/xmlns:(\w+)/i)
        define_namespace(declaration[1], attrs[key]) if declaration
      end
    end
    qualified = tag.match(/(\w+):(\w+)/)
    if qualified && @namespacemap[qualified[1]]
      tag = "#{@namespacemap[qualified[1]]}:#{qualified[2]}"
    end
    tag
  end

  # Process a namespace definition for the given prefix and namespace
  # definition URL.
  #
  # If we recognize the URL, we set up a mapping from their prefix to
  # our canonical choice of prefix.
  def define_namespace(prefix, url)
    canonical = KNOWN_NAMESPACES[url]
    @namespacemap[prefix] = canonical if canonical
  end

  # Called when REXML finds the start of an XML element.
  def tag_start(tag, attrs) #:nodoc:
    tag = handle_namespace(tag, attrs)
    # Tolerate a missing table (e.g. a subclass that never ran our
    # initialize) instead of failing with NoMethodError on nil.
    cl = @class_for_tag && @class_for_tag[tag.downcase]
    if cl
      # If the tag requires the creation of an object, we create it as a
      # child of the current object, then ask the current object (if there
      # is one) to store it. It becomes the new current object.
      newobj = cl.new(@current_object, tag, attrs)
      @current_object.store(tag, newobj) if @current_object
      @current_object = newobj
    elsif @current_object
      # Otherwise, we ask the current object to do something with the tag.
      @current_object.tag_start(tag, attrs)
    end
    # We also push to the stacks we use for text buffering. The buffer is
    # appended to and stripped in place, so it must be a mutable string.
    @tagstack.push(tag)
    @textstack.push(''.dup)
  end

  # Called when REXML finds the end of an XML element.
  def tag_end(endtag)
    endtag = handle_namespace(endtag, nil)
    # There are two tasks to perform: 1. store the data from the buffers,
    # and 2. work out if we need to close out any objects in the parse
    # tree and move the current object pointer.
    #
    # Store the top text buffer that's on the stacks by passing it to the
    # current object along with its tag. Repeat (always at least once)
    # until we find a stacked tag which matches the endtag, or run out of
    # buffers.
    loop do
      tag = @tagstack.pop
      text = @textstack.pop
      if text
        text.strip!
        @current_object.store(tag, text) if !text.empty? && @current_object
      end
      break if tag == endtag || @tagstack.empty?
    end
    # Pass the tag end event to the current object to find out what the
    # new current object should be.
    return unless @current_object
    @current_object = @current_object.tag_end(endtag, @current_object)
  end

  # Called when REXML finds a text fragment.
  # Buffers the text on the buffer stacks ready for the end tag.
  def text(s)
    buffer = @textstack.last
    buffer << s if buffer
  end
end
267
+ end
@@ -0,0 +1,37 @@
1
+
2
module Syndication

  # Mixin for RSS 1.0 content module.
  #
  # This is the approved way to include actual HTML text in an RSS feed.
  # To use it, require 'syndication/content' to add the content_encoded
  # and content_decoded methods to the Syndication::RSS::Item class.
  #
  module Content
    # Actual web content, entity encoded or CDATA-escaped.
    attr_accessor :content_encoded

    # Decoded version of content_encoded, as HTML.
    #
    # Returns content_encoded unchanged when it is nil, empty, or not a
    # String. Otherwise returns the body of the first CDATA section if
    # there is one, or else the text with &lt;, &gt; and &amp; decoded.
    def content_decoded
      raw = @content_encoded
      return raw unless raw.kind_of?(String) && !raw.empty?
      # CDATA is the easier case. A CDATA section ends with "]]>"; /m lets
      # the body span several lines, and the lazy match stops at the first
      # terminator.
      cdata = raw.match(/<!\[CDATA\[(.*?)\]\]>/m)
      return cdata[1] if cdata
      # OK, must be entity-encoded. &amp; is decoded last so that an
      # escaped entity such as "&amp;lt;" is not decoded twice.
      raw.gsub(/&lt;/, '<').gsub(/&gt;/, '>').gsub(/&amp;/, '&')
    end
  end

  #:enddoc:
  module RSS
    class Item
      include Content
    end
  end

end
@@ -0,0 +1,92 @@
1
+
2
module Syndication

  # Mixin for Dublin Core metadata in RSS feeds.
  #
  # If you require 'syndication/dublincore' these methods are added to the
  # Syndication::Channel, Syndication::Item, Syndication::Image and
  # Syndication::TextInput classes.
  #
  # The access method names are the Dublin Core element names, prefixed with
  # dc_.
  #
  module DublinCore
    # A name by which the item is formally known.
    attr_accessor :dc_title

    # The entity primarily responsible for making the content of the item.
    attr_accessor :dc_creator

    # The topic of the content of the item, typically as keywords
    # or key phrases.
    attr_accessor :dc_subject

    # A description of the content of the item.
    attr_accessor :dc_description

    # Entity responsible for making the item available.
    attr_accessor :dc_publisher

    # Entity responsible for contributing this item.
    attr_accessor :dc_contributor

    # Date of creation or availability of item.
    # Returned as a DateTime if it will parse; otherwise, returned as a
    # string. (Dublin Core does not require any particular date and time
    # format, so guaranteeing parsing is not possible.)
    #
    # A successful parse is cached, so the string is only parsed once.
    # Values that are not Strings are returned untouched.
    def dc_date
      if @dc_date.kind_of?(String)
        begin
          @dc_date = DateTime.parse(@dc_date)
        rescue ArgumentError
          # DateTime.parse raises on text it cannot interpret; keep the
          # original string, as documented above.
        end
      end
      @dc_date
    end

    # Date of creation or availability of item.
    attr_writer :dc_date

    # Nature or genre of item, usually from a controlled vocabulary.
    attr_accessor :dc_type

    # Physical or digital format of item.
    attr_accessor :dc_format

    # An unambiguous identifier which identifies the item.
    attr_accessor :dc_identifier

    # A reference to a resource from which the item is derived.
    attr_accessor :dc_source

    # The language the item is in, coded as per RFC 1766.
    attr_accessor :dc_language

    # A reference to a related resource.
    attr_accessor :dc_relation

    # The extent or scope of coverage of the item, e.g. a geographical area.
    attr_accessor :dc_coverage

    # Information about rights held over the item, e.g. copyright or patents.
    attr_accessor :dc_rights
  end

  #:enddoc:
  # Now we mix in the DublinCore elements to all the Syndication classes that
  # can contain them. There's probably some clever way to do this via
  # reflection, but there _is_ such a thing as being too clever.
  #
  # NOTE(review): these reopen Syndication::Item, Syndication::Channel, etc.
  # directly under Syndication. Confirm these are the classes the RSS parser
  # actually instantiates (as opposed to classes nested in Syndication::RSS).
  class Item
    include DublinCore
  end

  class Channel
    include DublinCore
  end

  class Image
    include DublinCore
  end

  class TextInput
    include DublinCore
  end

end
@@ -0,0 +1,85 @@
1
module Syndication

  # Mixin for iTunes podcast RSS elements.
  #
  # To use this, require 'syndication/podcast' to add appropriate methods
  # to the Item and Channel classes.
  #
  # See <URL:http://phobos.apple.com/static/iTunesRSS.html> for more
  # information.
  #
  # See Syndication::Podcast::Both for methods added to both Item and
  # Channel RSS objects.
  #
  # See Syndication::Podcast::Channel for methods added to Channel objects.
  #
  # See Syndication::Podcast::Item for methods added to Item objects.
  #
  module Podcast
    # iTunes fields which occur in Items only.
    module Item
      # Artist column in iTunes.
      attr_accessor :itunes_author
      # Duration of item, in seconds.
      attr_reader :itunes_duration

      # Set the duration. Apple specifies four possible formats for the
      # XML data: HH:MM:SS, H:MM:SS, MM:SS, or M:SS. A value consisting
      # only of digits is additionally accepted as a number of seconds.
      #
      # Anything that is not a String is ignored, and a String in none of
      # these formats leaves the stored duration unchanged.
      def itunes_duration=(x)
        return unless x.kind_of?(String)
        if (hms = x.match(/(\d?\d):(\d\d):(\d\d)/))
          @itunes_duration = hms[3].to_i + hms[2].to_i * 60 + hms[1].to_i * 3600
        elsif (ms = x.match(/(\d?\d):(\d\d)/))
          @itunes_duration = ms[2].to_i + ms[1].to_i * 60
        elsif (secs = x.match(/\A\s*(\d+)\s*\z/))
          @itunes_duration = secs[1].to_i
        end
      end

    end

    # iTunes fields which occur in Channels only.
    module Channel
      # Owner, not shown, used for contact only.
      attr_accessor :itunes_owner
    end

    # iTunes fields which occur both in Channels and in Items.
    module Both
      # Prevent this entity from appearing in the iTunes podcast directory?
      attr_accessor :itunes_block
      # Parental advisory graphic?
      attr_accessor :itunes_explicit
      # Keywords, not shown but can be searched via iTunes.
      attr_accessor :itunes_keywords
      # Description column in iTunes.
      attr_accessor :itunes_subtitle
      # Summary, shown when i-in-circle icon is clicked in Description
      # column of iTunes.
      attr_accessor :itunes_summary
      # Category column(s) in iTunes and music store browser, as an array
      # (categories then subcategories), in the order they were added.
      # nil until the first category is added.
      attr_reader :itunes_category

      # Add an iTunes category; they can be nested.
      # Each assignment appends to the list rather than replacing it.
      def itunes_category=(x)
        @itunes_category ||= Array.new
        @itunes_category.push(x)
      end

    end
  end

  #:enddoc:
  module RSS
    class Item
      include Podcast::Item
      include Podcast::Both
    end

    class Channel
      include Podcast::Channel
      include Podcast::Both
    end
  end

end