syndication 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/IMPLEMENTATION +33 -0
- data/README +208 -0
- data/examples/yahoo.rb +21 -0
- data/lib/syndication/atom.rb +479 -0
- data/lib/syndication/common.rb +267 -0
- data/lib/syndication/content.rb +37 -0
- data/lib/syndication/dublincore.rb +92 -0
- data/lib/syndication/podcast.rb +85 -0
- data/lib/syndication/rss.rb +326 -0
- data/lib/syndication/syndication.rb +45 -0
- data/test/atomtest.rb +186 -0
- data/test/rsstest.rb +314 -0
- metadata +55 -0
@@ -0,0 +1,267 @@
|
|
1
|
+
# The file common.rb contains code common to both Atom and RSS parsing.
|
2
|
+
#
|
3
|
+
# Copyright (c) mathew <meta@pobox.com> 2005.
|
4
|
+
# Licensed under the same terms as Ruby.
|
5
|
+
|
6
|
+
require 'uri'
|
7
|
+
require 'rexml/parsers/streamparser'
|
8
|
+
require 'rexml/streamlistener'
|
9
|
+
require 'rexml/document'
|
10
|
+
require 'date'
|
11
|
+
|
12
|
+
# To parse Atom feeds, use Syndication::Atom::Parser.
|
13
|
+
# To parse RSS feeds, use Syndication::RSS::Parser.
|
14
|
+
module Syndication
|
15
|
+
|
16
|
+
# A Container is an object in the parse tree that stores data, and possibly
|
17
|
+
# other objects. Its naming and behavior are internal details, not part
|
18
|
+
# of the API, and hence subject to change.
|
19
|
+
#
|
20
|
+
# In other words, to use the library you don't have to know about anything
|
21
|
+
# below.
|
22
|
+
class Container

  # Convert a tag (possibly with a namespace prefix) to the name of the
  # setter method used to store it: the tag is lowercased, the first ':'
  # becomes '_', and '=' is appended (e.g. "dc:Creator" -> "dc_creator=").
  def tag2method(tag)
    "#{tag.downcase.sub(/:/, '_')}="
  end

  # Create a container.
  # parent is the new container's parent object in the final parse tree.
  # tag is the XML tag which caused creation of the container.
  # attrs is a hash of {attr => value} of the XML attributes in the tag;
  # this base implementation ignores it.
  def initialize(parent, tag = nil, attrs = nil)
    @parent = parent
    @tag = tag
  end

  # Handle a start tag and attributes.
  #
  # If self has a setter matching the tag, the attributes (when given) are
  # sent to that setter and the setter name is recorded in @current_method.
  # Tags with no matching setter are ignored.
  #
  # Note that any non-nil attrs value is sent, including an empty hash, so
  # setters must tolerate being handed a hash as well as text.
  def tag_start(tag, attrs = nil)
    setter = tag2method(tag)
    return unless respond_to?(setter)
    send(setter, attrs) if attrs
    @current_method = setter
  end

  # Handle an end tag, and return what the new current object should be.
  #
  # - If the tag matches the one this container was created with, the
  #   container is complete and its parent becomes the current object.
  # - If there is no parent (top of the parse tree), the current object is
  #   left unchanged.
  # - Otherwise the end tag is passed up to the parent to decide.
  def tag_end(endtag, current)
    return @parent if @tag == endtag
    return current if @parent == nil
    @parent.tag_end(endtag, current)
  end

  # Store an object in the parse tree: in self if self has a setter for the
  # tag, otherwise in the nearest ancestor that does. Objects no ancestor
  # can accept are dropped.
  def store(tag, obj)
    setter = tag2method(tag)
    if respond_to?(setter)
      send(setter, obj)
    elsif @parent
      @parent.store(tag, obj)
    end
  end

  # Parse a date field on demand. DateTime.parse is sloooow, so don't call
  # it unless you really have to.
  #
  # Returns nil for a nil/false field, a DateTime when a String field
  # parses, and the field unchanged otherwise (non-String values, or
  # Strings that DateTime.parse rejects).
  def parse_date(field)
    return nil if !field
    return field unless field.kind_of?(String)
    begin
      parsed = DateTime.parse(field)
    rescue ArgumentError
      # DateTime.parse raises for text it cannot interpret; hand the raw
      # string back rather than aborting the caller.
      return field
    end
    parsed.kind_of?(DateTime) ? parsed : field
  end

  # Strip the parent field from a container, used to make a container
  # more amenable to pretty-printing. Returns self.
  def strip
    @parent = nil
    self
  end
end
|
106
|
+
|
107
|
+
# Shared parts of parser code for Atom and RSS. This is an abstract class;
|
108
|
+
# Atom::Parser and RSS::Parser are the concrete classes which actually parse
|
109
|
+
# syndication feeds.
|
110
|
+
#
|
111
|
+
# You don't need to know about anything below in order to use the library.
|
112
|
+
#
|
113
|
+
# The basic parsing strategy is:
|
114
|
+
#
|
115
|
+
# - The parser keeps a current_object pointer which represents the object
|
116
|
+
# in the parse tree that corresponds to where we are in the XML tree. To
|
117
|
+
# use a metaphor, it's the object where parse tree growth is occurring.
|
118
|
+
#
|
119
|
+
# - REXML dispatches events to the parser representing start and end tags and
|
120
|
+
# text. The parser sends the events to the current_object, which replies with
|
121
|
+
# what the new current_object should be after the event has been dealt with.
|
122
|
+
#
|
123
|
+
# - The job of creating child objects when appropriate is handled by the
|
124
|
+
# objects of the parse tree.
|
125
|
+
#
|
126
|
+
# - Reflection is used to store data in the parse tree. Accessor names are
|
127
|
+
# derived from tags in a standard way once namespaces have been standardized.
|
128
|
+
class AbstractParser
  include REXML::StreamListener

  # A Hash of namespace URLs the module knows about, returning the standard
  # prefix to remap to.
  KNOWN_NAMESPACES = {
    'http://purl.org/dc/elements/1.1/' => 'dc',
    'http://purl.org/dc/terms/' => 'dcterms',
    'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf',
    'http://purl.org/rss/1.0/modules/content/' => 'content',
    'http://www.itunes.com/DTDs/Podcast-1.0.dtd' => 'itunes',
    'http://www.w3.org/1999/xhtml' => 'xhtml'
  }

  # Create a new AbstractParser. The optional argument consists of text to
  # parse.
  def initialize(text = nil)
    reset
    # Mapping from tags to classes, which only needs to be set up once and
    # is not touched by reset. Concrete classes fill it in.
    @tag_to_class = Hash.new
    # tag_start looks classes up in @class_for_tag. Keep whatever a
    # concrete class has already put there (e.g. from its own reset), and
    # otherwise share the hash above so that a lookup never hits nil.
    @class_for_tag ||= @tag_to_class
    parse(text) if text
  end

  # Catch any stuff that drops right through the parse tree, and simply
  # ignore it.
  def store(tag, obj)
  end

  # Catch and ignore closing tags that don't match anything open; returns
  # the current object unchanged.
  #
  # NOTE(review): Container#tag_end forwards to its parent's tag_end, not
  # end_tag, so nothing in this file calls this method -- confirm whether
  # the name is intentional.
  def end_tag(tag, current)
    current
  end

  # Reset the parser ready to parse a new feed.
  # @parsetree itself is set up by the concrete classes.
  def reset
    @current_object = @parsetree
    @tagstack = Array.new
    @textstack = Array.new
    @xhtml = ''
    @xhtmlmode = false
    @namespacemap = Hash.new
  end

  # Parse the text provided. Returns a Syndication::Atom::Feed or
  # Syndication::RSS::Feed object, according to which concrete Parser
  # class is being used (i.e. whatever that class stored in @parsetree).
  def parse(text)
    REXML::Document.parse_stream(text, self)
    @parsetree
  end

  # Handle namespace translation for a raw tag.
  #
  # When attributes are supplied and the tag is rss, *:rdf or *:div, any
  # xmlns:prefix attributes are registered via define_namespace. The tag is
  # then returned with its prefix swapped for our canonical prefix when a
  # mapping for it is known; otherwise it is returned unchanged.
  def handle_namespace(tag, attrs = nil)
    if attrs and tag.match(/^(rss|\w+:rdf|\w+:div)$/i)
      attrs.each do |key, url|
        declared = key.match(/xmlns:(\w+)/i)
        define_namespace(declared[1], url) if declared
      end
    end
    parts = tag.match(/(\w+):(\w+)/)
    if parts and @namespacemap[parts[1]]
      tag = "#{@namespacemap[parts[1]]}:#{parts[2]}"
    end
    tag
  end

  # Process a namespace definition for the given prefix and namespace
  # definition URL.
  #
  # If we recognize the URL, we set up a mapping from their prefix to
  # our canonical choice of prefix. Unknown URLs are ignored.
  def define_namespace(prefix, url)
    canonical = KNOWN_NAMESPACES[url]
    @namespacemap[prefix] = canonical if canonical
  end

  # Called when REXML finds the start of an XML element.
  def tag_start(tag, attrs) #:nodoc:
    tag = handle_namespace(tag, attrs)
    cl = @class_for_tag && @class_for_tag[tag.downcase]
    if cl
      # If the tag requires the creation of an object, we create it as a
      # child of the current object, then ask the current object (if any)
      # to store it. It becomes the new current object.
      newobj = cl.new(@current_object, tag, attrs)
      @current_object.store(tag, newobj) if @current_object
      @current_object = newobj
    elsif @current_object
      # Otherwise, we ask the current object to do something with the tag.
      @current_object.tag_start(tag, attrs)
    end
    # We also push to the stacks we use for text buffering. The buffer is
    # appended to and stripped in place later, so it must be mutable.
    @tagstack.push(tag)
    @textstack.push(''.dup)
  end

  # Called when REXML finds the end of an XML element.
  #
  # There are two tasks to perform: 1. store the data from the buffers,
  # and 2. work out if we need to close out any objects in the parse
  # tree and move the current object pointer.
  def tag_end(endtag)
    endtag = handle_namespace(endtag, nil)
    begin
      # Store the top text buffer that's on the stacks by passing it to the
      # current object along with its tag. Repeat until we find a stacked
      # tag which matches the endtag, or run out of buffers.
      tag = @tagstack.pop
      text = @textstack.pop
      if text
        text.strip!
        if text.length > 0 and @current_object
          @current_object.store(tag, text)
        end
      end
    end until tag == endtag or @tagstack.length == 0
    # Pass the tag end event to the current object to find out what the
    # new current object should be.
    if @current_object
      @current_object = @current_object.tag_end(endtag, @current_object)
    end
  end

  # Called when REXML finds a text fragment.
  # Buffers the text on the buffer stacks ready for the end tag; text seen
  # while no element is open is discarded.
  def text(s)
    buffer = @textstack.last
    buffer << s if buffer
  end
end
|
267
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
|
2
|
+
module Syndication

  # Mixin for RSS 1.0 content module.
  #
  # This is the approved way to include actual HTML text in an RSS feed.
  # To use it, require 'syndication/content' to add the content_encoded
  # and content_decoded methods to the Syndication::RSS::Item class.
  #
  module Content
    # Actual web content, entity encoded or CDATA-escaped.
    attr_accessor :content_encoded

    # Decoded version of content_encoded, as HTML.
    #
    # - nil, an empty string, or any non-String value is returned as is.
    # - If the value contains a CDATA section, the text between the CDATA
    #   opening marker and the last closing marker is returned verbatim.
    # - Otherwise the lt, gt and amp entities are decoded in a single pass,
    #   so an encoded ampersand followed by "lt;" is not decoded twice.
    def content_decoded
      raw = @content_encoded
      return raw unless raw.kind_of?(String) and raw != ''
      # CDATA is the easier; /m lets the section span several lines.
      cdata = raw.match(/<!\[CDATA\[(.*)\]\]>/m)
      return cdata[1] if cdata
      # OK, must be entity-encoded
      raw.gsub(/&(lt|gt|amp);/) do
        case $1
        when 'lt' then '<'
        when 'gt' then '>'
        else '&'
        end
      end
    end
  end

  #:enddoc:
  module RSS
    class Item
      include Content
    end
  end

end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
|
2
|
+
module Syndication

  # Mixin for Dublin Core metadata in RSS feeds.
  #
  # If you require 'syndication/dublincore' these methods are added to the
  # Syndication::RSS::Channel, Syndication::RSS::Item,
  # Syndication::RSS::Image and Syndication::RSS::TextInput classes.
  #
  # The access method names are the Dublin Core element names, prefixed with
  # dc_.
  #
  module DublinCore
    # A name by which the item is formally known.
    attr_accessor :dc_title

    # The entity primarily responsible for making the content of the item.
    attr_accessor :dc_creator

    # The topic of the content of the item, typically as keywords
    # or key phrases.
    attr_accessor :dc_subject

    # A description of the content of the item.
    attr_accessor :dc_description

    # Entity responsible for making the item available.
    attr_accessor :dc_publisher

    # Entity responsible for contributing this item.
    attr_accessor :dc_contributor

    # Date of creation or availability of item.
    # Returned as a DateTime if it will parse; otherwise, returned as a
    # string. (Dublin Core does not require any particular date and time
    # format, so guaranteeing parsing is not possible.)
    #
    # A successful parse is cached, so the slow DateTime.parse runs at most
    # once per value. Values that are not Strings are returned untouched.
    def dc_date
      if @dc_date.kind_of?(String)
        begin
          @dc_date = DateTime.parse(@dc_date)
        rescue ArgumentError
          # Not a date DateTime understands: keep the original string.
        end
      end
      @dc_date
    end

    # Date of creation or availability of item.
    attr_writer :dc_date

    # Nature or genre of item, usually from a controlled vocabulary.
    attr_accessor :dc_type

    # Physical or digital format of item.
    attr_accessor :dc_format

    # An unambiguous identifier which identifies the item.
    attr_accessor :dc_identifier

    # A reference to a resource from which the item is derived.
    attr_accessor :dc_source

    # The language the item is in, coded as per RFC 1766.
    attr_accessor :dc_language

    # A reference to a related resource.
    attr_accessor :dc_relation

    # The extent or scope of coverage of the item, e.g. a geographical area.
    attr_accessor :dc_coverage

    # Information about rights held over the item, e.g. copyright or patents.
    attr_accessor :dc_rights
  end

  #:enddoc:
  # Now we mix in the DublinCore elements to all the RSS classes that can
  # contain them, inside the RSS namespace like the content and podcast
  # mixins. There's probably some clever way to do this via reflection,
  # but there _is_ such a thing as being too clever.
  module RSS
    class Item
      include DublinCore
    end

    class Channel
      include DublinCore
    end

    class Image
      include DublinCore
    end

    class TextInput
      include DublinCore
    end
  end

end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
module Syndication

  # Mixin for iTunes podcast RSS elements.
  #
  # To use this, require 'syndication/podcast' to add appropriate methods
  # to the Item and Channel classes.
  #
  # See <URL:http://phobos.apple.com/static/iTunesRSS.html> for more
  # information.
  #
  # See Syndication::Podcast::Both for methods added to both Item and
  # Channel RSS objects.
  #
  # See Syndication::Podcast::Channel for methods added to Channel objects.
  #
  # See Syndication::Podcast::Item for methods added to Item objects.
  #
  module Podcast
    # iTunes fields which occur in Items only.
    module Item
      # Artist column in iTunes.
      attr_accessor :itunes_author
      # Duration of item, in seconds.
      attr_reader :itunes_duration

      # Set the duration. Apple specifies four possible formats for the
      # XML data: HH:MM:SS, H:MM:SS, MM:SS, or M:SS.
      #
      # The value is stored as a number of seconds. Anything that is not a
      # String in one of those formats leaves the current duration
      # unchanged; in particular the attribute hash that
      # Container#tag_start hands to setters is ignored rather than
      # raising.
      def itunes_duration=(x)
        return unless x.kind_of?(String)
        hms = x.match(/(\d?\d):(\d\d):(\d\d)/)
        if hms
          @itunes_duration = hms[3].to_i + hms[2].to_i * 60 + hms[1].to_i * 3600
        else
          ms = x.match(/(\d?\d):(\d\d)/)
          @itunes_duration = ms[2].to_i + ms[1].to_i * 60 if ms
        end
      end

    end

    # iTunes fields which occur in Channels only.
    module Channel
      # Owner, not shown, used for contact only.
      attr_accessor :itunes_owner
    end

    # iTunes fields which occur both in Channels and in Items.
    module Both
      # Prevent this entity from appearing in the iTunes podcast directory?
      attr_accessor :itunes_block
      # Parental advisory graphic?
      attr_accessor :itunes_explicit
      # Keywords, not shown but can be searched via iTunes.
      attr_accessor :itunes_keywords
      # Description column in iTunes.
      attr_accessor :itunes_subtitle
      # Summary, shown when i-in-circle icon is clicked in Description
      # column of iTunes.
      attr_accessor :itunes_summary
      # Category column(s) in iTunes and music store browser, as an array
      # (categories then subcategories), in the order they were added.
      # nil until the first category is added.
      attr_reader :itunes_category

      # Add an iTunes category; they can be nested. Each assignment appends
      # to the list rather than replacing it.
      #
      # NOTE(review): every assigned value is appended as is, so attribute
      # hashes passed by the parser end up in the list alongside strings --
      # confirm that is intended.
      def itunes_category=(x)
        @itunes_category ||= Array.new
        @itunes_category.push(x)
      end

    end
  end

  #:enddoc:
  module RSS
    class Item
      include Podcast::Item
      include Podcast::Both
    end

    class Channel
      include Podcast::Channel
      include Podcast::Both
    end
  end

end
|