ruby-feedparser 0.7

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,14 @@
1
+ Ruby-Feedparser
2
+ -----------------
3
+ by Lucas Nussbaum <lucas@lucas-nussbaum.net>
4
+
5
+ Currently, all the information is provided on
6
+
7
+ http://home.gna.org/ruby-feedparser/
8
+
9
+ If you need to ask questions, feel free to ask them on the
10
+ ruby-feedparser-devel@gna.org mailing list.
11
+
12
+ Ruby-Feedparser is released under the Ruby license (see the LICENSE file),
13
+ which is compatible with the GNU GPL (see the COPYING file) via an explicit
14
+ dual-licensing clause.
data/Rakefile ADDED
@@ -0,0 +1,84 @@
1
+ require 'rake/testtask'
2
+ require 'rake/rdoctask'
3
+ require 'rake/packagetask'
4
+ require 'rake'
5
+ require 'find'
6
+
7
# Globals: package identity shared by the archive task and the gem task.
PKG_NAME = 'ruby-feedparser'
PKG_VERSION = '0.7'

# Files shipped in a release: a fixed list of top-level files plus everything
# found under lib/, test/ and tools/, except Subversion metadata directories.
PKG_FILES = [ 'ChangeLog', 'README', 'COPYING', 'LICENSE', 'setup.rb', 'Rakefile']
Find.find('lib/', 'test/', 'tools/') do |path|
  # Find.prune skips the rest of this iteration and the directory's contents.
  Find.prune if FileTest.directory?(path) && path =~ /\.svn/
  PKG_FILES << path
end

# Leave the test fixtures (test/source and test/*_output) out of the package.
PKG_FILES.delete_if { |path| path =~ /^test\/(source|.*_output)\// }

task :default => [:package]

# Runs every test/tc_*.rb file with test/ on the load path.
Rake::TestTask.new do |t|
  t.libs << "test"
  t.test_files = FileList['test/tc_*.rb']
end

# Generates the API documentation into rdoc/ from the files under lib/.
Rake::RDocTask.new do |rd|
  sources = []
  Find.find('lib/') do |path|
    if FileTest.directory?(path)
      Find.prune if path =~ /\.svn/
    else
      sources << path
    end
  end
  # hack to document the Feedparser module properly:
  # lib/feedparser.rb has to be the first file handed to rdoc.
  sources.delete('lib/feedparser.rb')
  sources.unshift('lib/feedparser.rb')
  rd.rdoc_files.include(sources)
  ['--all', '--diagram', '--fileboxes', '--inline-source', '--line-numbers'].each do |opt|
    rd.options << opt
  end
  rd.rdoc_dir = 'rdoc'
end

task :doctoweb => [:rdoc] do |t|
  # copies the rdoc to the CVS repository for ruby-feedparser website
  # repository is in $CVSDIR (default: ~/dev/ruby-feedparser-web)
  sh "tools/doctoweb.bash"
end

# Builds the .tar and .zip archives of PKG_FILES.
Rake::PackageTask.new(PKG_NAME, PKG_VERSION) do |pkg|
  pkg.need_tar = true
  pkg.need_zip = true
  pkg.package_files = PKG_FILES
end

# "Gem" part of the Rakefile.
# The gem task is optional: when rake/gempackagetask cannot be loaded, the
# LoadError is caught and only a notice is printed.
begin
  require 'rake/gempackagetask'

  spec = Gem::Specification.new do |gem|
    gem.platform = Gem::Platform::RUBY
    gem.summary = "Ruby library to parse ATOM and RSS feeds"
    gem.name = PKG_NAME
    gem.version = PKG_VERSION
    gem.requirements << 'none'
    gem.require_path = 'lib'
    gem.autorequire = 'feedparser'
    gem.files = PKG_FILES
    gem.description = "Ruby library to parse ATOM and RSS feeds"
  end

  Rake::GemPackageTask.new(spec) do |gem_task|
    gem_task.need_zip = true
    gem_task.need_tar = true
  end
rescue LoadError
  puts "Will not generate gem."
end
data/lib/feedparser.rb ADDED
@@ -0,0 +1,28 @@
1
+ # =Ruby-feedparser - ATOM/RSS feed parser for Ruby
2
+ # License:: Ruby's license (see the LICENSE file) or GNU GPL, at your option.
3
+ # Website:: http://home.gna.org/ruby-feedparser/
4
+ #
5
+ # ==Introduction
6
+ #
7
+ # Ruby-Feedparser is an RSS and Atom parser for Ruby.
8
+ # Ruby-feedparser is:
9
+ # * based on REXML
10
+ # * built for robustness: most feeds are not valid, and a parser can't ignore that
11
+ # * fully unit-tested
12
+ # * easy to use (it can output text or HTML easily)
13
+ #
14
+ # ==Example
15
+ # require 'net/http'
16
+ # require 'feedparser'
17
+ # require 'uri'
18
+ # s = Net::HTTP::get URI::parse('http://rss.slashdot.org/Slashdot/slashdot')
19
+ # f = FeedParser::Feed::new(s)
20
+ # f.title
21
+ # => "Slashdot"
22
+ # f.items.each { |i| puts i.title }
23
+ # [...]
24
+ # require 'feedparser/html-output'
25
+ # f.items.each { |i| puts i.to_html }
26
+ #
27
+
28
+ require 'feedparser/feedparser'
@@ -0,0 +1,343 @@
1
+ require 'rexml/document'
2
+ require 'time'
3
+ require 'feedparser/textconverters'
4
+ require 'feedparser/rexml_patch'
5
+ require 'feedparser/text-output'
6
+ require 'base64'
7
+
8
+ module FeedParser
9
+
10
+ VERSION = "0.7"
11
+
12
# Raised by Feed#parse when the document is neither an RSS nor an Atom feed.
class UnknownFeedTypeException < RuntimeError; end
14
+
15
# An RSS or Atom feed.
#
# Feed.new(str) parses the XML document held in +str+ right away; Feed.new
# without an argument creates an empty feed that can be filled in later with
# #parse. The readers expose the feed-level fields, and #items holds one
# RSSItem or AtomItem per entry.
class Feed
  attr_reader :type, :title, :link, :description, :creator, :encoding, :items

  # REXML::Element for this feed.
  attr_reader :xml

  # Builds a Feed, parsing +str+ when it is given.
  def initialize(str = nil)
    # Start with an empty item list so that #items and #to_s also work on a
    # feed that has not been parsed yet.
    @items = []
    parse(str) if str
  end

  # Determines all the fields using a string containing an XML document.
  #
  # +str+ itself is left unmodified. Raises UnknownFeedTypeException when the
  # document has no root element or is neither an RSS nor an Atom feed.
  def parse(str)
    # Dirty hack: some feeds contain a bare & char. It must be changed to
    # &amp;. gsub (rather than gsub!) keeps the caller's string intact and
    # also accepts frozen strings.
    doc = REXML::Document.new(str.gsub(/&(\s+)/, '&amp;\1'))
    @xml = doc.root
    # get feed info
    @encoding = doc.encoding
    @title = @link = @description = @creator = nil
    @items = []
    # No root element at all: this cannot be a feed.
    raise UnknownFeedTypeException::new if @xml.nil?

    if @xml.elements['channel'] || @xml.elements['rss:channel']
      parse_rss(@xml)
    elsif @xml.elements['/feed']
      parse_atom(@xml)
    else
      raise UnknownFeedTypeException::new
    end
  end

  # Text dump of the feed-level fields followed by every item.
  # +localtime+ is handed to each item's to_s.
  def to_s(localtime = true)
    header = "Type: #{@type}\n" \
             "Encoding: #{@encoding}\n" \
             "Title: #{@title}\n" \
             "Link: #{@link}\n" \
             "Description: #{@description}\n" \
             "Creator: #{@creator}\n" \
             "\n"
    @items.inject(header) { |text, item| text + item.to_s(localtime) }
  end

  private

  # Reads the channel-level fields and the items of an RSS document.
  # Both plain and rss:-prefixed element names are looked up.
  def parse_rss(root)
    @type = "rss"
    # Title
    if (e = root.elements['channel/title'] ||
        root.elements['rss:channel/rss:title']) && e.text
      @title = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
    end
    # Link
    if (e = root.elements['channel/link'] ||
        root.elements['rss:channel/rss:link']) && e.text
      @link = e.text.rmWhiteSpace!
    end
    # Description
    if (e = root.elements['channel/description'] ||
        root.elements['rss:channel/rss:description']) && e.text
      @description = e.text.toUTF8(@encoding).rmWhiteSpace!
    end
    # Creator: dc:creator wins over author
    if ((e = root.elements['channel/dc:creator']) && e.text) ||
       ((e = root.elements['channel/author'] ||
         root.elements['rss:channel/rss:author']) && e.text)
      @creator = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
    end
    # Items: they are either children of the channel or of the root element
    if root.elements['channel/item']
      query = 'channel/item'
    elsif root.elements['item']
      query = 'item'
    elsif root.elements['rss:channel/rss:item']
      query = 'rss:channel/rss:item'
    else
      query = 'rss:item'
    end
    root.each_element(query) { |e| @items << RSSItem::new(e, self) }
  end

  # Reads the feed-level fields and the entries of an Atom document.
  def parse_atom(root)
    @type = "atom"
    # Title
    if (e = root.elements['/feed/title']) && e.text
      @title = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
    end
    # Link: only links that point to an HTML page are considered
    html_types = ['text/html', 'application/xhtml', 'application/xhtml+xml']
    root.each_element('/feed/link') do |e|
      type = e.attribute('type')
      next unless type and html_types.include?(type.value)
      if (h = e.attribute('href'))
        @link = h.value.rmWhiteSpace!
      end
    end
    # Description
    if e = root.elements['/feed/info']
      e = e.elements['div'] || e
      @description = e.to_s.toUTF8(@encoding).rmWhiteSpace!
    end
    # Items
    root.each_element('/feed/entry') do |e|
      @items << AtomItem::new(e, self)
    end
  end
end
119
+
120
# A single entry of a feed. This is the common base class: the format-specific
# subclasses provide #parse, which fills in the fields from the XML element.
class FeedItem
  attr_accessor :title, :link, :content, :date, :creators, :subject,
                :cacheditem

  # The item's categories/tags. An array of strings.
  attr_accessor :categories

  # The item's enclosures childs. An array of (url, length, type) triplets.
  attr_accessor :enclosures

  # The feed that was passed to the constructor.
  attr_reader :feed

  # REXML::Element for this item
  attr_reader :xml

  # Stores +item+ and +feed+, resets every field, then parses +item+ when it
  # is given.
  def initialize(item = nil, feed = nil)
    @xml, @feed = item, feed
    @title = @link = @content = @date = @subject = nil
    @creators, @categories, @enclosures = [], [], []
    parse(item) if item
  end

  # Placeholder: always raises, subclasses have to override it.
  def parse(item)
    raise "parse() should be implemented by subclasses!"
  end

  # All creators as one string ("a", "a and b", "a, b and c"),
  # or nil when there is none.
  def creator
    count = @creators.length
    if count == 0
      nil
    elsif count == 1
      creators[0]
    else
      creators[0...-1].join(", ") + " and " + creators[-1]
    end
  end

  # Multi-line text dump of the item. The date is printed as stored when
  # +localtime+ is true, and converted to UTC otherwise.
  def to_s(localtime = true)
    shown_date = (localtime || @date.nil?) ? @date : @date.getutc
    text = "--------------------------------\n" \
           "Title: #{@title}\nLink: #{@link}\n" \
           "Date: #{shown_date.to_s}\n" \
           "Creator: #{creator}\n" \
           "Subject: #{@subject}\n"
    if defined?(@categories) && @categories.length > 0
      text += "Filed under: " + @categories.join(', ') + "\n"
    end
    text += "Content:\n#{content}\n"
    if defined?(@enclosures) && @enclosures.length > 0
      text += @enclosures.inject("Enclosures:\n") do |listing, enclosure|
        listing + enclosure.join(' ') + "\n"
      end
    end
    text
  end
end
185
+
186
# An item of an RSS feed.
class RSSItem < FeedItem

  # Fills in the item's fields from +item+, the REXML element of one RSS
  # item. Both plain and rss:-prefixed element names are looked up.
  def parse(item)
    # Title. If no title, use the pubDate as fallback.
    if ((e = item.elements['title'] || item.elements['rss:title']) &&
        e.text) ||
       ((e = item.elements['pubDate'] || item.elements['rss:pubDate']) &&
        e.text)
      @title = e.text.unescape_html.toUTF8(@feed.encoding).html2text.rmWhiteSpace!
    end
    # Link. Falls back on the guid unless it is marked isPermaLink="false".
    link_elt = item.elements['link'] || item.elements['rss:link']
    unless link_elt && link_elt.text
      link_elt = item.elements['guid'] || item.elements['rss:guid']
      if link_elt
        permalink = link_elt.attribute('isPermaLink')
        link_elt = nil if permalink && permalink.value == 'false'
      end
    end
    # An element without text (e.g. an empty guid) cannot provide a link.
    @link = link_elt.text.rmWhiteSpace! if link_elt && link_elt.text
    # Content
    e = item.elements['content:encoded'] ||
        item.elements['description'] || item.elements['rss:description']
    @content = FeedParser::getcontent(e, @feed) if e
    # Date: try the stricter formats first; nil when none of them applies.
    if e = item.elements['dc:date'] || item.elements['pubDate'] ||
           item.elements['rss:pubDate']
      parsed = nil
      [:xmlschema, :rfc2822, :parse].each do |format|
        begin
          parsed = Time.send(format, e.text)
          break
        rescue
          next
        end
      end
      @date = parsed
    end
    # Creator
    if (e = item.elements['dc:creator'] || item.elements['author'] ||
        item.elements['rss:author']) && e.text
      @creators << e.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
    end
    # Without a creator of its own, the item inherits the feed's creator.
    @creators << @feed.creator if @creators.empty? and @feed.creator

    # Subject
    if (e = item.elements['dc:subject']) && e.text
      @subject = e.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
    end
    # Categories
    ['dc:category', 'category', 'rss:category'].each do |name|
      item.each_element(name) do |cat|
        next unless cat.text
        @categories << cat.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
      end
    end
    # Enclosures: [url, length, type] triplets; a url is mandatory.
    item.each_element('enclosure') do |enc|
      url = enc.attribute('url').value if enc.attribute('url')
      length = enc.attribute('length').value if enc.attribute('length')
      type = enc.attribute('type').value if enc.attribute('type')
      @enclosures << [ url, length, type ] if url
    end
  end
end
255
+
256
# An entry of an Atom feed.
class AtomItem < FeedItem
  # Fills in the item's fields from +item+, the REXML element of one Atom
  # entry.
  def parse(item)
    # Title
    title_elt = item.elements['title']
    if title_elt && title_elt.text
      @title = title_elt.text.unescape_html.toUTF8(@feed.encoding).html2text.rmWhiteSpace!
    end
    # Link: every link with an href overwrites the previous one
    item.each_element('link') do |link_elt|
      href = link_elt.attribute('href')
      @link = href.value if href && href.value
    end
    # Content
    body = item.elements['content'] || item.elements['summary']
    if body
      mode = body.attribute('mode')
      @content =
        if (mode && mode.value == 'escaped') && body.text
          body.text.toUTF8(@feed.encoding).rmWhiteSpace!
        else
          FeedParser::getcontent(body, @feed)
        end
    end
    # Date: try the stricter formats first; nil when none of them applies.
    stamp = item.elements['issued'] || item.elements['created'] ||
            item.elements['updated'] || item.elements['published']
    if stamp && stamp.text
      parsed = nil
      [:xmlschema, :rfc2822, :parse].each do |format|
        begin
          parsed = Time.send(format, stamp.text)
          break
        rescue
          next
        end
      end
      @date = parsed
    end
    # Creator
    item.each_element('author/name') do |name_elt|
      next unless name_elt.text
      @creators << name_elt.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
    end

    # Without a creator of its own, the item inherits the feed's creator.
    @creators << @feed.creator if @creators.empty? && @feed.creator

    # Categories
    item.each_element('category') do |cat_elt|
      term = cat_elt.attribute('term')
      next unless term && term.value
      # Use human-readable label if it is provided
      label = cat_elt.attribute('label')
      cat = (label && label.value) ? label.value : term.value

      @categories << cat.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
    end
  end
end
317
+
318
# Returns the content of element +e+ as a string run through text2html, or
# nil when nothing usable is found. Text is converted from +feed+'s encoding,
# or from 'utf-8' when no feed is given.
def FeedParser::getcontent(e, feed = nil)
  encoding = feed ? feed.encoding : 'utf-8'
  # Ignore the text nodes that are empty once the trailing newline is removed.
  nodes = e.children.reject do |node|
    node.class == REXML::Text && node.to_s.chomp == ''
  end

  if nodes.length > 1
    # Several children: concatenate all of them, comments excepted.
    markup = nodes.inject('') do |acc, node|
      node.class != REXML::Comment ? acc + node.to_s : acc
    end
    return markup.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
  end
  return nil unless nodes.length == 1

  only = nodes[0]
  kind = only.class
  if kind == REXML::Text
    e.text.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
  elsif kind == REXML::CData
    only.to_s.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
  elsif kind == REXML::Element
    # only one element. recurse.
    getcontent(only, feed)
  elsif only.text
    only.text.toUTF8(encoding).text2html(feed)
  end
end
343
+ end