yyyc514-syndication 0.6.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,289 @@
1
+ # The file common.rb contains code common to both Atom and RSS parsing.
2
+ #
3
+ # Copyright © mathew <meta@pobox.com> 2006.
4
+ # Licensed under the same terms as Ruby.
5
+ #
6
+ # $Header: /var/cvs/syndication/syndication/lib/syndication/common.rb,v 1.4 2005/10/23 22:51:17 meta Exp $
7
+
8
+ require 'uri'
9
+ require 'rexml/parsers/streamparser'
10
+ require 'rexml/streamlistener'
11
+ require 'rexml/document'
12
+ require 'date'
13
+
14
+ # To parse Atom feeds, use Syndication::Atom::Parser.
15
+ # To parse RSS feeds, use Syndication::RSS::Parser.
16
+ module Syndication
17
+
18
+ # A Container is an object in the parse tree that stores data, and possibly
19
+ # other objects. Its naming and behavior is an internal detail, not part
20
+ # of the API, and hence subject to change.
21
+ #
22
+ # In other words, to use the library you don't have to know about anything
23
+ # below.
24
class Container

  # Convert a tag (possibly with a namespace prefix) to the name of the
  # setter method used to store it: the tag is downcased, the first ':' is
  # replaced by '_', and '=' is appended (e.g. 'dc:Title' -> 'dc_title=').
  def tag2method(tag)
    tag.downcase.sub(/:/, '_') + '='
  end

  # Create a container.
  # parent is the new container's parent object in the final parse tree.
  # tag is the XML tag which caused creation of the container.
  # attrs is a hash of {attr => value} of the XML attributes in the tag;
  # it is ignored by this base implementation.
  def initialize(parent, tag = nil, attrs = nil)
    @parent = parent
    @tag = tag
  end

  # Handle a start tag and attributes.
  # If self responds to the setter derived from the tag, the attributes
  # (when given) are sent to that setter and the setter name is recorded
  # as the current method. Tags with no matching setter are ignored.
  def tag_start(tag, attrs = nil)
    method = tag2method(tag)
    return unless respond_to?(method)
    send(method, attrs) if attrs
    @current_method = method
  end

  # Handle an end tag, and return what the new current object should be.
  #
  # If the tag matches the one we were created with, this container is
  # complete and the new current object is its parent.
  #
  # If there's no parent (i.e. this is the top level container in the
  # parse tree), the new current object must be unchanged.
  #
  # Otherwise, pass the end tag up to the parent to see if it can do
  # anything with it.
  def tag_end(endtag, current)
    return @parent if @tag == endtag
    return current if @parent.nil?
    @parent.tag_end(endtag, current)
  end

  # Store an object in the parse tree, either in self, or in one of self's
  # ancestors. The value is silently dropped when neither self nor any
  # ancestor accepts it.
  def store(tag, obj)
    method = tag2method(tag)
    if respond_to?(method)
      send(method, obj)
    elsif @parent
      @parent.store(tag, obj)
    end
  end

  # Parse a date field on demand. DateTime.parse is slow, so don't call
  # it unless you really have to.
  #
  # Returns nil for a nil/false field, a DateTime when a String field
  # parses, and the field itself unchanged otherwise (non-String values,
  # or Strings that are not recognizable dates).
  def parse_date(field)
    return nil unless field
    return field unless field.kind_of?(String)
    begin
      DateTime.parse(field)
    rescue ArgumentError
      # DateTime.parse raises on unparseable text; hand back the raw
      # string rather than aborting the caller.
      field
    end
  end

  # Strip the parent field from a container, used to make a container
  # more amenable to pretty-printing. Returns self.
  def strip
    @parent = nil
    self
  end
end
108
+
109
+ # Shared parts of parser code for Atom and RSS. This is an abstract class;
110
+ # Atom::Parser and RSS::Parser are the concrete classes which actually parse
111
+ # syndication feeds.
112
+ #
113
+ # You don't need to know about anything below in order to use the library.
114
+ #
115
+ # The basic parsing strategy is:
116
+ #
117
+ # - The parser keeps a current_object pointer which represents the object
118
+ # in the parse tree that corresponds to where we are in the XML tree. To
119
+ # use a metaphor, it's the object where parse tree growth is occurring.
120
+ #
121
+ # - REXML dispatches events to the parser representing start and end tags and
122
+ # text. The parser sends the events to the current_object, which replies with
123
+ # what the new current_object should be after the event has been dealt with.
124
+ #
125
+ # - The job of creating child objects when appropriate is handled by the
126
+ # objects of the parse tree.
127
+ #
128
+ # - Reflection is used to store data in the parse tree. Accessor names are
129
+ # derived from tags in a standard way once namespaces have been standardized.
130
class AbstractParser
  include REXML::StreamListener

  # A Hash of namespace URLs the module knows about, returning the standard
  # prefix to remap to.
  KNOWN_NAMESPACES = {
    'http://purl.org/dc/elements/1.1/' => 'dc',
    'http://purl.org/dc/terms/' => 'dcterms',
    'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf',
    'http://purl.org/rss/1.0/modules/content/' => 'content',
    'http://www.itunes.com/DTDs/Podcast-1.0.dtd' => 'itunes',
    'http://www.w3.org/1999/xhtml' => 'xhtml',
    'http://schemas.google.com/g/2005' => 'gd',
    'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner'
  }

  # Create a new AbstractParser. The optional argument consists of text to
  # parse.
  def initialize(text = nil)
    reset
    # Mapping from (lowercased) tags to classes, consulted by tag_start. It
    # only needs to be set up once and is not reset. Concrete classes which
    # do actual parsing fill the hash; a table they have already assigned is
    # left alone, and an empty one is supplied otherwise so tag_start never
    # indexes nil.
    @class_for_tag ||= {}
    # Earlier name for the same table; it now refers to the hash that
    # tag_start actually reads.
    @tag_to_class = @class_for_tag
    parse(text) if text
  end

  # Catch any stuff that drops right through the parse tree, and simply
  # ignore it.
  def store(tag, obj)
  end

  # Catch and ignore closing tags that don't match anything open.
  # Returns current unchanged.
  def end_tag(tag, current)
    current
  end

  # Reset the parser ready to parse a new feed.
  # Clears the tag/text buffer stacks, the XHTML state and the namespace
  # prefix map, and points the current object at @parsetree.
  def reset
    # @parsetree is set up by the concrete classes
    @current_object = @parsetree
    @tagstack = []
    @textstack = []
    @xhtml = ''.dup
    @xhtmlmode = false
    @namespacemap = {}
  end

  # Parse the text provided. Returns a Syndication::Atom::Feed or
  # Syndication::RSS::Feed object, according to which concrete Parser
  # class is being used.
  # The second argument is optional and determines the parser engine to
  # use. The default is REXML. To use TagSoup, pass in the value
  # Syndication::TagSoup
  def parse(text, classname = REXML::Document)
    classname.parse_stream(text, self)
    @parsetree
  end

  # Handle namespace translation for a raw tag.
  #
  # When attributes are supplied and the tag is 'rss', '<prefix>:rdf' or
  # '<prefix>:div', every 'xmlns:<prefix>' attribute is registered through
  # define_namespace. Then, if the tag has a prefix we have a mapping for,
  # the tag is returned rewritten with our canonical prefix; otherwise it
  # is returned unchanged.
  def handle_namespace(tag, attrs = nil)
    if attrs and tag.match(/^(rss|\w+:rdf|\w+:div)$/i)
      attrs.keys.each do |key|
        declared = key.match(/xmlns:(\w+)/i)
        define_namespace(declared[1], attrs[key]) if declared
      end
    end
    parts = tag.match(/(\w+):(\w+)/)
    if parts and @namespacemap[parts[1]]
      tag = "#{@namespacemap[parts[1]]}:#{parts[2]}"
    end
    tag
  end

  # Process a namespace definition for the given prefix and namespace
  # definition URL.
  #
  # If we recognize the URL, we set up a mapping from their prefix to
  # our canonical choice of prefix. Unknown URLs are ignored.
  def define_namespace(prefix, url)
    myprefix = KNOWN_NAMESPACES[url]
    @namespacemap[prefix] = myprefix if myprefix
  end

  # Called when REXML finds the start of an XML element.
  def tag_start(tag, attrs) #:nodoc:
    tag = handle_namespace(tag, attrs)
    cl = @class_for_tag[tag.downcase]
    if cl
      # If the tag requires the creation of an object, we create it as a
      # child of the current object, then ask the current object (if there
      # is one) to store it. It becomes the new current object.
      newobj = cl.new(@current_object, tag, attrs)
      @current_object.store(tag, newobj) if @current_object
      @current_object = newobj
    elsif @current_object
      # Otherwise, we ask the current object to do something with the tag.
      @current_object.tag_start(tag, attrs)
    end
    # We also push to the stacks we use for text buffering. The buffer is
    # appended to in place by text and cdata, so it must be a mutable string.
    @tagstack.push(tag)
    @textstack.push(''.dup)
  end

  # Called when REXML finds the end of an XML element.
  def tag_end(endtag)
    endtag = handle_namespace(endtag, nil)
    # There are two tasks to perform: 1. store the data from the buffers,
    # and 2. work out if we need to close out any objects in the parse
    # tree and move the current object pointer
    loop do
      # Store the top text buffer that's on the stacks by passing it to the
      # current object along with its tag. Repeat until we find a stacked
      # tag which matches the endtag, or run out of buffers.
      tag = @tagstack.pop
      text = @textstack.pop
      if text
        text.strip!
        if text.length > 0 and @current_object
          @current_object.store(tag, text)
        end
      end
      break if tag == endtag or @tagstack.empty?
    end
    # Pass the tag end event to the current object to find out what the
    # new current object should be.
    if @current_object
      @current_object = @current_object.tag_end(endtag, @current_object)
    end
  end

  # Called when REXML finds a text fragment.
  # Buffers the text on the buffer stacks ready for the end tag; text seen
  # while no element is open is discarded.
  def text(s)
    @textstack.last << s if @textstack.last
  end

  # Supposed to be called when REXML finds a CDATA-encoded piece of text.
  def cdata(s)
    # For content_encoded we re-encode, because (a) the API for RSS content
    # module provides both encoded and decoded results to the user, and
    # (b) REXML doesn't always seem to pass CDATA via this callback method.
    # For other elements, we keep the text decoded.
    return unless @textstack.last
    if @tagstack.last == 'content:encoded'
      @textstack.last << "<![CDATA[#{s}]]>"
    else
      @textstack.last << s
    end
  end
end
289
+ end
@@ -0,0 +1,44 @@
1
+ # Copyright © mathew <meta@pobox.com> 2005.
2
+ # Licensed under the same terms as Ruby.
3
+ #
4
+ # $Header$
5
+
6
module Syndication

  # Mixin for RSS 1.0 content module.
  #
  # This is the approved way to include actual HTML text in an RSS feed.
  # To use it, require 'syndication/content' to add the content_encoded
  # and content_decoded methods to the Syndication::Item class.
  #
  module Content
    # Actual web content, entity encoded or CDATA-escaped.
    attr_accessor :content_encoded

    # Decoded version of content_encoded, as HTML.
    #
    # Returns content_encoded itself when it is nil or empty. If the text
    # contains one or more CDATA sections, returns the contents of those
    # sections concatenated in order. Otherwise returns the text with the
    # &lt;, &gt; and &amp; entities decoded.
    def content_decoded
      if !@content_encoded or @content_encoded == ''
        return @content_encoded
      end
      # CDATA is the easier case. The match is non-greedy and repeated so
      # that content split across several adjacent sections (the usual way
      # of embedding a literal "]]>") is stitched back together instead of
      # returning the section markers in between.
      sections = @content_encoded.scan(/<!\[CDATA\[(.*?)\]\]>/m)
      return sections.join unless sections.empty?
      # Decode escaped entities; &amp; goes last so that an escaped
      # "&amp;lt;" comes out as "&lt;" rather than "<".
      x = @content_encoded.gsub(/&lt;/, '<')
      x = x.gsub(/&gt;/, '>')
      return x.gsub(/&amp;/, '&')
    end
  end

  #:enddoc:
  module RSS
    class Item
      include Content
    end
    class Channel
      include Content
    end
  end

end
@@ -0,0 +1,98 @@
1
+ # Copyright © mathew <meta@pobox.com> 2005.
2
+ # Licensed under the same terms as Ruby.
3
+ #
4
+ # $Header$
5
+
6
module Syndication

  # Mixin for Dublin Core metadata in RSS feeds.
  #
  # If you require 'syndication/dublincore' these methods are added to the
  # Syndication::Channel, Syndication::Item, Syndication::Image and
  # Syndication::TextInput classes.
  #
  # The access method names are the Dublin Core element names, prefixed with
  # dc_.
  #
  module DublinCore
    # A name by which the item is formally known.
    attr_accessor :dc_title

    # The entity primarily responsible for making the content of the item.
    attr_accessor :dc_creator

    # The topic of the content of the item, typically as keywords
    # or key phrases.
    attr_accessor :dc_subject

    # A description of the content of the item.
    attr_accessor :dc_description

    # Entity responsible for making the item available.
    attr_accessor :dc_publisher

    # Entity responsible for contributing this item.
    attr_accessor :dc_contributor

    # Date of creation or availability of item.
    # Returned as a DateTime if it will parse; otherwise, returned as
    # stored. (Dublin Core does not require any particular date and time
    # format, so guaranteeing parsing is not possible.)
    #
    # A successful parse is cached, so the conversion cost is paid once.
    def dc_date
      # Only text is worth parsing; anything else (nil, an existing
      # DateTime, an attribute hash) is handed back untouched.
      if @dc_date.kind_of?(String)
        begin
          @dc_date = DateTime.parse(@dc_date)
        rescue ArgumentError
          # DateTime.parse raises for text it cannot interpret; keep the
          # original string so the documented fallback actually happens.
        end
      end
      return @dc_date
    end

    # Date of creation or availability of item.
    attr_writer :dc_date

    # Nature or genre of item, usually from a controlled vocabulary.
    attr_accessor :dc_type

    # Physical or digital format of item.
    attr_accessor :dc_format

    # An unambiguous identifier which identifies the item.
    attr_accessor :dc_identifier

    # A reference to a resource from which the item is derived.
    attr_accessor :dc_source

    # The language the item is in, coded as per RFC 1766.
    attr_accessor :dc_language

    # A reference to a related resource.
    attr_accessor :dc_relation

    # The extent or scope of coverage of the item, e.g. a geographical area.
    attr_accessor :dc_coverage

    # Information about rights held over the item, e.g. copyright or patents.
    attr_accessor :dc_rights
  end

  #:enddoc:
  module RSS
    # Now we mix in the DublinCore elements to all the Syndication classes that
    # can contain them. There's probably some clever way to do this via
    # reflection, but there _is_ such a thing as being too clever.
    class Item
      include DublinCore
    end

    class Channel
      include DublinCore
    end

    class Image
      include DublinCore
    end

    class TextInput
      include DublinCore
    end
  end

end
@@ -0,0 +1,18 @@
1
+
2
module Syndication

  # Mixins for elements in the FeedBurner RSS extension namespace.
  module Feedburner
    # FeedBurner fields which occur in Items.
    module Item
      # The original URL, before feedburner rewrote it for tracking purposes
      attr_reader :feedburner_origlink
      attr_writer :feedburner_origlink
    end
  end

  module RSS
    # Make the FeedBurner fields available on RSS items.
    class Item
      include Feedburner::Item
    end
  end

end
@@ -0,0 +1,58 @@
1
+ # Copyright © mathew <meta@pobox.com> 2006.
2
+ # Licensed under the same terms as Ruby.
3
+
4
module Syndication

  # Mixin for Google Data in Atom feeds.
  #
  # If you require 'syndication/google' these methods are added to the
  # Syndication::Atom::Entry and Syndication::Atom::Feed classes.
  #
  # See http://code.google.com/apis/gdata/calendar.html for more information
  # on Google Calendar Data APIs.
  #
  # See examples/google.rb for a simple example.
  #
  module Google
    # Where the event is to occur
    attr_reader :gd_where

    # Record the location from the element's attributes: the 'valueString'
    # entry is kept when present, otherwise the current value is left alone.
    def gd_where=(attrs)
      place = attrs['valueString']
      @gd_where = place if place
    end

    # Record the raw start and end times from the element's attributes.
    # Only the 'startTime' / 'endTime' entries that are present are updated.
    def gd_when=(attrs)
      starts = attrs['startTime']
      @starttime = starts if starts
      ends = attrs['endTime']
      @endtime = ends if ends
    end

    # When the event is to occur, as an Array of [start DateTime, end DateTime].
    # A time that was never recorded comes back as nil. The stored strings
    # are parsed on every call.
    def gd_when
      [@starttime, @endtime].map do |stamp|
        stamp ? DateTime.parse(stamp) : nil
      end
    end
  end

  module Atom
    class Entry
      include Google
    end

    class Feed
      include Google
    end
  end

end
@@ -0,0 +1,90 @@
1
+ # Copyright © mathew <meta@pobox.com> 2005.
2
+ # Licensed under the same terms as Ruby.
3
+ #
4
+ # $Header$
5
+
6
module Syndication

  # Mixin for iTunes podcast RSS elements.
  #
  # To use this, require 'syndication/podcast' to add appropriate methods
  # to the Item and Channel classes.
  #
  # See <URL:http://phobos.apple.com/static/iTunesRSS.html> for more
  # information.
  #
  # See Syndication::Podcast::Both for methods added to both Item and
  # Channel RSS objects.
  #
  # See Syndication::Podcast::Channel for methods added to Channel objects.
  #
  # See Syndication::Podcast::Item for methods added to Item objects.
  #
  module Podcast
    # iTunes fields which occur in Items only.
    module Item
      # Artist column in iTunes.
      attr_accessor :itunes_author
      # Duration of item, in seconds.
      attr_reader :itunes_duration

      # Set the duration. Apple specifies four possible colon-separated
      # formats for the XML data: HH:MM:SS, H:MM:SS, MM:SS, or M:SS; a bare
      # integer is taken as a number of seconds.
      #
      # Anything that is not a String (for example the attribute hash that
      # is passed when the element's start tag is seen) and any text in an
      # unrecognized format is ignored, leaving the current value as it is.
      def itunes_duration=(x)
        return unless x.kind_of?(String)
        if (m = x.match(/(\d?\d):(\d\d):(\d\d)/))
          @itunes_duration = m[3].to_i + m[2].to_i * 60 + m[1].to_i * 3600
        elsif (m = x.match(/(\d?\d):(\d\d)/))
          @itunes_duration = m[2].to_i + m[1].to_i * 60
        elsif (m = x.match(/\A\s*(\d+)\s*\z/))
          @itunes_duration = m[1].to_i
        end
      end

    end

    # iTunes fields which occur in Channels only.
    module Channel
      # Owner, not shown, used for contact only.
      attr_accessor :itunes_owner
    end

    # iTunes fields which occur both in Channels and in Items.
    module Both
      # Prevent this entity from appearing in the iTunes podcast directory?
      attr_accessor :itunes_block
      # Parental advisory graphic?
      attr_accessor :itunes_explicit
      # Keywords, not shown but can be searched via iTunes.
      attr_accessor :itunes_keywords
      # Description column in iTunes.
      attr_accessor :itunes_subtitle
      # Summary, shown when i-in-circle icon is clicked in Description
      # column of iTunes.
      attr_accessor :itunes_summary
      # Category column(s) in iTunes and music store browser, as an array
      # (categories then subcategories) of the values assigned so far, or
      # nil if none has been assigned.
      attr_reader :itunes_category

      # Add an iTunes category; they can be nested. Each assignment appends
      # to the list rather than replacing it.
      def itunes_category=(x)
        @itunes_category ||= []
        @itunes_category.push(x)
      end

    end
  end

  #:enddoc:
  module RSS
    class Item
      include Podcast::Item
      include Podcast::Both
    end

    class Channel
      include Podcast::Channel
      include Podcast::Both
    end
  end

end