yyyc514-syndication 0.6.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +10 -0
- data/DEVELOPER +5 -0
- data/IMPLEMENTATION +55 -0
- data/README +228 -0
- data/examples/apple.rb +24 -0
- data/examples/google.rb +23 -0
- data/examples/yahoo.rb +21 -0
- data/lib/syndication/atom.rb +531 -0
- data/lib/syndication/common.rb +289 -0
- data/lib/syndication/content.rb +44 -0
- data/lib/syndication/dublincore.rb +98 -0
- data/lib/syndication/feedburner.rb +18 -0
- data/lib/syndication/google.rb +58 -0
- data/lib/syndication/podcast.rb +90 -0
- data/lib/syndication/rss.rb +332 -0
- data/lib/syndication/syndication.rb +49 -0
- data/lib/syndication/tagsoup.rb +51 -0
- data/rakefile +60 -0
- data/test/atomtest.rb +190 -0
- data/test/feedburntest.rb +79 -0
- data/test/google.rb +91 -0
- data/test/rsstest.rb +422 -0
- data/test/tagsouptest.rb +86 -0
- metadata +83 -0
data/CHANGES
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
# == Changes in 0.5.1
|
2
|
+
#
|
3
|
+
# - Fixes for handling of CDATA-encoded text.
|
4
|
+
#
|
5
|
+
# == Changes in 0.5
|
6
|
+
#
|
7
|
+
# - Fixed problem with syndication/dublincore reported by Ura Takefumi.
|
8
|
+
#
|
9
|
+
# - Added new TagSoup completely-non-validating parser, tests for same,
|
10
|
+
# and option to use it for parsing feeds.
|
data/DEVELOPER
ADDED
data/IMPLEMENTATION
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# = Implementation notes
|
2
|
+
# == Syndication 0.5
|
3
|
+
#
|
4
|
+
# For this release, I added a parser called TagSoup. The name is taken from
|
5
|
+
# the jargon term used for HTML written without any regard to the rules of
|
6
|
+
# HTML structure, i.e. HTML containing many common authoring mistakes.
|
7
|
+
#
|
8
|
+
# TagSoup is a very small and very dumb parser which implements the stream
|
9
|
+
# API of REXML. The test code compares it against REXML for some simple
|
10
|
+
# example XML and makes sure it calls the same callbacks in the same order
|
11
|
+
# with the same parameters.
|
12
|
+
#
|
13
|
+
# Note that hacking together your own XML parser is, generally speaking, the
|
14
|
+
# wrong thing to do. Using TagSoup as a general replacement for REXML is very
|
15
|
+
# definitely the wrong thing to do. Please don't do it.
|
16
|
+
#
|
17
|
+
# A real XML parser does all kinds of things that TagSoup doesn't, like pay
|
18
|
+
# attention to DTDs, handle quoted special characters in element attributes,
|
19
|
+
# handle whitespace in a documented standard way, and so on. The fact that
|
20
|
+
# TagSoup is defective in many areas is intentional. It's designed to be
|
21
|
+
# used as a last resort, for parsing web syndication feeds which are invalid.
|
22
|
+
#
|
23
|
+
# == Syndication 0.4
|
24
|
+
#
|
25
|
+
# As discussed in the README, this is really my fourth attempt at writing
|
26
|
+
# RSS parsing code. For the record, I thought I'd list the approaches I
|
27
|
+
# tried and abandoned. In a way, that's more interesting than the one I
|
28
|
+
# picked...
|
29
|
+
#
|
30
|
+
# First I used hashes for storage and just looked for matching tags.
|
31
|
+
# That approach works, kinda, but it doesn't really understand nested
|
32
|
+
# elements at all. As a result, it becomes really hard to deal with Atom
|
33
|
+
# feeds, where an <email> element could belong to one of a number of kinds
|
34
|
+
# of person. Plus, I wanted a real object-based approach which would be
|
35
|
+
# amenable to RDoc documentation.
|
36
|
+
#
|
37
|
+
# Next I wrote a classic stack-based parser, with a container stack and a
|
38
|
+
# text buffer stack. That worked well for RSS; I got it parsing every RSS
|
39
|
+
# variant, and even went as far as a test suite. However, as I tried
|
40
|
+
# extending it to deal with Atom, I realized that the parser code was
|
41
|
+
# becoming hard to follow, as the state machine gained more and more
|
42
|
+
# special cases.
|
43
|
+
#
|
44
|
+
# For a third iteration, I tried to generalize the knowledge represented by the
|
45
|
+
# state machine, by placing it in the context stack. That is, I would have a
|
46
|
+
# smart stack that knew which XML elements could go inside other elements.
|
47
|
+
# Actually, there would have been four context stacks, for containers,
|
48
|
+
# attributes, tags and textual data.
|
49
|
+
#
|
50
|
+
# That design never made it past the paper stage, because I realized that I
|
51
|
+
# could move all the knowledge into the classes used to create the objects of
|
52
|
+
# the final parse tree. With the new model--the one used in this code--the
|
53
|
+
# parser really doesn't know anything about Atom or RSS. It just forwards
|
54
|
+
# events to a tree of objects, which construct child objects as appropriate to
|
55
|
+
# grow the tree and represent the feed.
|
data/README
ADDED
@@ -0,0 +1,228 @@
|
|
1
|
+
# = Syndication 0.6
|
2
|
+
#
|
3
|
+
# This module provides classes for parsing web syndication feeds in RSS and
|
4
|
+
# Atom formats.
|
5
|
+
#
|
6
|
+
# To parse RSS, use Syndication::RSS::Parser.
|
7
|
+
#
|
8
|
+
# To parse Atom, use Syndication::Atom::Parser.
|
9
|
+
#
|
10
|
+
# If you want my advice on which to generate, my order of preference would
|
11
|
+
# be:
|
12
|
+
#
|
13
|
+
# 1. Atom 1.0
|
14
|
+
# 2. RSS 1.0
|
15
|
+
# 3. RSS 2.0
|
16
|
+
#
|
17
|
+
# My reasoning is simply that I hate having to sniff for HTML (see
|
18
|
+
# Syndication::RSS).
|
19
|
+
#
|
20
|
+
# == License
|
21
|
+
#
|
22
|
+
# Syndication is Copyright 2005-2006 mathew <meta@pobox.com>, and is licensed
|
23
|
+
# under the same terms as Ruby.
|
24
|
+
#
|
25
|
+
# == Requirements
|
26
|
+
#
|
27
|
+
# Built and tested using Ruby 1.8.4. Needs only the standard library.
|
28
|
+
#
|
29
|
+
# == Rationale
|
30
|
+
#
|
31
|
+
# Ruby already has an RSS library as part of the standard library, so you
|
32
|
+
# might be wondering why I decided to write another one.
|
33
|
+
#
|
34
|
+
# I started out trying to document the standard rss module, but found the
|
35
|
+
# code rather impenetrable. It was also difficult to see how it could be made
|
36
|
+
# documentable via RDoc.
|
37
|
+
#
|
38
|
+
# Then I tried writing code to use the standard RSS library, and discovered
|
39
|
+
# that it had a number of (what I consider to be) defects:
|
40
|
+
#
|
41
|
+
# - It doesn't support RSS 2.0 with extensions (such as iTunes podcast feeds),
|
42
|
+
# and it wasn't clear to me how to extend it to do so.
|
43
|
+
#
|
44
|
+
# - It doesn't support RSS 0.9.
|
45
|
+
#
|
46
|
+
# - It doesn't support Atom.
|
47
|
+
#
|
48
|
+
# - The API is different depending on what kind of RSS feed you are parsing.
|
49
|
+
#
|
50
|
+
# I asked around, and discovered that I wasn't the only person dissatisfied
|
51
|
+
# with the RSS library. Since fixing the problems would have resulted in
|
52
|
+
# breaking existing code that used the RSS module, I opted for an all-new
|
53
|
+
# implementation.
|
54
|
+
#
|
55
|
+
# This is the result. The first release was version 0.4, which was actually my
|
56
|
+
# fourth attempt at putting together a clean, simple, universal API for RSS
|
57
|
+
# and Atom parsing. (The first three never saw public release.)
|
58
|
+
#
|
59
|
+
# == Features
|
60
|
+
#
|
61
|
+
# Here are the key improvements I see over the rss module in the
|
62
|
+
# Ruby standard library:
|
63
|
+
#
|
64
|
+
# - Supports all RSS versions, including RSS 0.9, as well as Atom.
|
65
|
+
#
|
66
|
+
# - Provides a unified API/object model for accessing the decoded data,
|
67
|
+
# with no need to know what format the feed is in.
|
68
|
+
#
|
69
|
+
# - Allows use of extended RSS 2.0 feeds.
|
70
|
+
#
|
71
|
+
# - Simple API, fully documented.
|
72
|
+
#
|
73
|
+
# - Test suite with over 220 test assertions.
|
74
|
+
#
|
75
|
+
# - Commented source code.
|
76
|
+
#
|
77
|
+
# - Less source code than the standard library rss module.
|
78
|
+
#
|
79
|
+
# - Faster than the standard library (at least, in my tests).
|
80
|
+
#
|
81
|
+
# Other features:
|
82
|
+
#
|
83
|
+
# - Optional support for RSS 1.0 Dublin Core, Syndication and Content modules,
|
84
|
+
# Apple iTunes Podcast elements, and Google Calendar.
|
85
|
+
#
|
86
|
+
# - Content module decodes CDATA-escaped or encoded HTML content for you.
|
87
|
+
#
|
88
|
+
# - Supports namespaces, and encoded XHTML/HTML in Atom feeds.
|
89
|
+
#
|
90
|
+
# - Dates decoded to Ruby DateTime objects. Note, however, that this is slow,
|
91
|
+
# so parsing is only performed if you ask for the value.
|
92
|
+
#
|
93
|
+
# - Simple to extend to support your own RSS extensions, uses reflection.
|
94
|
+
#
|
95
|
+
# - Uses REXML fast stream parsing API for speed, or built-in TagSoup parser
|
96
|
+
# for invalid feeds.
|
97
|
+
#
|
98
|
+
# - Non-validating, tries to be as forgiving as possible of structural errors.
|
99
|
+
#
|
100
|
+
# - Remaps namespace prefixes to standard values if it recognizes the module's
|
101
|
+
# URL.
|
102
|
+
#
|
103
|
+
# In the interests of balance, here are some key disadvantages compared to the
|
104
|
+
# standard library RSS support:
|
105
|
+
#
|
106
|
+
# - No support for _generating_ RSS feeds, only for parsing them. If
|
107
|
+
# you're using Rails, you can use RXML; if not, you can use rss/maker.
|
108
|
+
# My feeling is that XML generation isn't a wheel that needs reinventing.
|
109
|
+
#
|
110
|
+
# - Different API, not a drop-in replacement.
|
111
|
+
#
|
112
|
+
# - Incomplete support for Atom 0.3 draft. (Anyone still using it?)
|
113
|
+
#
|
114
|
+
# - No support for base64 data in Atom feeds (yet).
|
115
|
+
#
|
116
|
+
# - No Japanese documentation.
|
117
|
+
#
|
118
|
+
# - No XSL output options.
|
119
|
+
#
|
120
|
+
# - Slower if there are dates in the feed and you ask for their values.
|
121
|
+
#
|
122
|
+
# == Other options
|
123
|
+
#
|
124
|
+
# There are, of course, other Ruby RSS/Atom libraries out there. The ones I
|
125
|
+
# know about:
|
126
|
+
#
|
127
|
+
# === simple-rss
|
128
|
+
#
|
129
|
+
# http://rubyforge.org/projects/simple-rss
|
130
|
+
#
|
131
|
+
# Pros:
|
132
|
+
# - Much smaller than syndication or rss.
|
133
|
+
#
|
134
|
+
# - Completely non-validating.
|
135
|
+
#
|
136
|
+
# - Backwards compatible with rss in standard library.
|
137
|
+
#
|
138
|
+
# Cons:
|
139
|
+
# - Doesn't use a real XML parser.
|
140
|
+
#
|
141
|
+
# - No support for namespaces.
|
142
|
+
#
|
143
|
+
# - Incomplete Atom support (e.g. can't get name and e-mail of <atom:person>
|
144
|
+
# elements as separate fields, you still have to decode XHTML data yourself)
|
145
|
+
#
|
146
|
+
# - No documentation.
|
147
|
+
#
|
148
|
+
# For the record, I started work on my library long before simple-rss was
|
149
|
+
# announced.
|
150
|
+
#
|
151
|
+
# === feedtools
|
152
|
+
#
|
153
|
+
# http://rubyforge.org/projects/feedtools/
|
154
|
+
#
|
155
|
+
# This one solves most of the same problems as Syndication; however the two
|
156
|
+
# were developed in parallel, in ignorance of each other.
|
157
|
+
#
|
158
|
+
# Feedtools builds in database caching and persistence, and HTTP fetching.
|
159
|
+
# Personally, I don't think those belong in a feed parsing library--they
|
160
|
+
# are easily implemented using other standard libraries if you want them.
|
161
|
+
#
|
162
|
+
# Pros:
|
163
|
+
# - Lots of test cases.
|
164
|
+
#
|
165
|
+
# - Used by lots of Rails people.
|
166
|
+
#
|
167
|
+
# - Knows about many more namespaces.
|
168
|
+
#
|
169
|
+
# - Can generate feeds.
|
170
|
+
#
|
171
|
+
# Cons:
|
172
|
+
# - Skimpy documentation.
|
173
|
+
#
|
174
|
+
# - Uses HTree then XPath parsing, rather than a single stream parse.
|
175
|
+
#
|
176
|
+
# - Tries to unify RSS and Atom APIs, at the expense of Atom functionality.
|
177
|
+
# (Which could also be a pro, depending on your viewpoint.)
|
178
|
+
#
|
179
|
+
# == Design philosophy
|
180
|
+
#
|
181
|
+
# Here's my design philosophy for this module:
|
182
|
+
#
|
183
|
+
# - The interface should be via standard Ruby objects and methods; e.g.
|
184
|
+
# feed.channel.item[0].title, rather than (say) a dictionary hash.
|
185
|
+
#
|
186
|
+
# - It should be easier to parse RSS via the module than to hack something
|
187
|
+
# together using REXML, even if all you want is a list of titles and URLs.
|
188
|
+
#
|
189
|
+
# - It should be easy to add support for new RSS extensions without needing
|
190
|
+
# to know anything about reflection or other advanced topics. Just define
|
191
|
+
# a mixin with a bunch of appropriately-named methods, and you're done.
|
192
|
+
#
|
193
|
+
# - The code should be simple to understand.
|
194
|
+
#
|
195
|
+
# - Even so, good complete documentation is extremely important.
|
196
|
+
#
|
197
|
+
# - Be lenient in what you accept.
|
198
|
+
#
|
199
|
+
# - Be conservative in what you generate.
|
200
|
+
#
|
201
|
+
# - Get well-formed feeds parsing reliably, then worry about broken feeds.
|
202
|
+
#
|
203
|
+
# - Atom will hopefully be the future. Provide full support for RSS, but don't
|
204
|
+
# hold Atom back by trying to force it into an RSS data model.
|
205
|
+
#
|
206
|
+
# == Future plans
|
207
|
+
#
|
208
|
+
# Here are some possible improvements:
|
209
|
+
#
|
210
|
+
# - RSS and Atom generation. Create objects, then call Syndication::FeedMaker
|
211
|
+
# to generate XML in various flavors. This probably won't happen until an XML
|
212
|
+
# generator is picked for the Ruby standard library.
|
213
|
+
#
|
214
|
+
# - Faster date parsing. It turns out that when I asked for parsed dates in
|
215
|
+
# my test code, the profiler showed Date.parse chewing up 25% of the total
|
216
|
+
# CPU time used. A dedicated ISO 8601 date parser could cut
|
217
|
+
# that down drastically.
|
218
|
+
#
|
219
|
+
# - Additional Google Data support. I just wanted to be able to display my
|
220
|
+
# upcoming calendar dates, but clearly there is a lot more that could be
|
221
|
+
# implemented. Unfortunately, recurring events don't seem to have a clean
|
222
|
+
# XML representation in Google's data feeds yet.
|
223
|
+
#
|
224
|
+
# == Feedback
|
225
|
+
#
|
226
|
+
# There are doubtless things I could have done better. Comments, suggestions,
|
227
|
+
# etc. are welcome; e-mail <meta@pobox.com>.
|
228
|
+
#
|
data/examples/apple.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# Example of using RSS 1.0 content module in RSS 2.0.
|
2
|
+
# (Naughty, but there you go.)
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'syndication/rss'
|
6
|
+
require 'syndication/content'
|
7
|
+
require 'open-uri'
|
8
|
+
|
9
|
+
url = 'http://docs.info.apple.com/rss/allproducts.rss'
|
10
|
+
|
11
|
+
parser = Syndication::RSS::Parser.new
|
12
|
+
|
13
|
+
xml = nil
|
14
|
+
|
15
|
+
open(url) { |http|
|
16
|
+
xml = http.read
|
17
|
+
}
|
18
|
+
|
19
|
+
feed = parser.parse(xml)
|
20
|
+
|
21
|
+
for i in feed.items
|
22
|
+
puts i.content_encoded
|
23
|
+
puts
|
24
|
+
end
|
data/examples/google.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# Atom syndication example:
|
2
|
+
# Output upcoming events from a Google calendar feed
|
3
|
+
|
4
|
+
require 'open-uri'
|
5
|
+
require 'syndication/atom'
|
6
|
+
require 'syndication/google'
|
7
|
+
|
8
|
+
MY_CALENDAR = 'http://www.google.com/calendar/feeds/j4a3sad66efnj3rm5ou2fbnsbg@group.calendar.google.com/public/full'
|
9
|
+
|
10
|
+
parser = Syndication::Atom::Parser.new
|
11
|
+
feed = nil
|
12
|
+
open(MY_CALENDAR) {|file|
|
13
|
+
text = file.read
|
14
|
+
feed = parser.parse(text)
|
15
|
+
}
|
16
|
+
t = feed.updated.strftime("%H:%M on %A %d %B")
|
17
|
+
puts "#{feed.title.txt}: #{feed.subtitle.txt} (updated #{t})"
|
18
|
+
for e in feed.entries
|
19
|
+
if e.gd_when && e.gd_when.first
|
20
|
+
t = e.gd_when.first.strftime("%d %b %y")
|
21
|
+
puts "#{t}: #{e.title.txt}"
|
22
|
+
end
|
23
|
+
end
|
data/examples/yahoo.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
|
2
|
+
# RSS Syndication example:
|
3
|
+
#
|
4
|
+
# Output Yahoo news headlines, dated.
|
5
|
+
|
6
|
+
require 'open-uri'
|
7
|
+
require 'syndication/rss'
|
8
|
+
|
9
|
+
parser = Syndication::RSS::Parser.new
|
10
|
+
feed = nil
|
11
|
+
open("http://rss.news.yahoo.com/rss/topstories") {|file|
|
12
|
+
text = file.read
|
13
|
+
feed = parser.parse(text)
|
14
|
+
}
|
15
|
+
chan = feed.channel
|
16
|
+
t = chan.lastbuilddate.strftime("%H:%M on %A %d %B")
|
17
|
+
puts "#{chan.title} at #{t}"
|
18
|
+
for i in feed.items
|
19
|
+
t = i.pubdate.strftime("%d %b")
|
20
|
+
puts "#{t}: #{i.title}"
|
21
|
+
end
|