ruby-feedparser 0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,14 @@
1
+ Ruby-Feedparser
2
+ -----------------
3
+ by Lucas Nussbaum <lucas@lucas-nussbaum.net>
4
+
5
+ Currently, all the information is provided on
6
+
7
+ http://home.gna.org/ruby-feedparser/
8
+
9
+ If you need to ask questions, feel free to ask them on the
10
+ ruby-feedparser-devel@gna.org mailing list.
11
+
12
+ Ruby-Feedparser is released under the Ruby license (see the LICENSE file),
13
+ which is compatible with the GNU GPL (see the COPYING file) via an explicit
14
+ dual-licensing clause.
data/Rakefile ADDED
@@ -0,0 +1,84 @@
1
+ require 'rake/testtask'
2
+ require 'rake/rdoctask'
3
+ require 'rake/packagetask'
4
+ require 'rake'
5
+ require 'find'
6
+
7
# Globals
PKG_NAME = 'ruby-feedparser'
PKG_VERSION = '0.7'

# Files shipped in every package: a fixed list of top-level files plus
# everything found under lib/, test/ and tools/ (Subversion metadata
# directories are pruned from the walk).
PKG_FILES = %w[ChangeLog README COPYING LICENSE setup.rb Rakefile]
Find.find('lib/', 'test/', 'tools/') do |path|
  # Find.prune aborts the current entry, so a pruned directory is never added.
  Find.prune if FileTest.directory?(path) and path =~ /\.svn/
  PKG_FILES << path
end

# Drop the test fixtures (test/source/ and test/*_output/) from the package.
PKG_FILES.delete_if { |path| path =~ /^test\/(source|.*_output)\// }

task :default => [:package]

# Unit tests: every test/tc_*.rb file, with test/ on the load path.
Rake::TestTask.new do |test_task|
  test_task.test_files = FileList['test/tc_*.rb']
  test_task.libs << "test"
end

# API documentation, generated into rdoc/.
Rake::RDocTask.new do |rd|
  sources = []
  Find.find('lib/') do |path|
    Find.prune if FileTest.directory?(path) and path =~ /\.svn/
    sources << path unless FileTest.directory?(path)
  end
  # hack to document the Feedparser module properly:
  # lib/feedparser.rb has to be the first file handed to rdoc.
  sources.delete('lib/feedparser.rb')
  sources.unshift('lib/feedparser.rb')
  rd.rdoc_files.include(sources)
  ['--all', '--diagram', '--fileboxes', '--inline-source',
   '--line-numbers'].each do |option|
    rd.options << option
  end
  rd.rdoc_dir = 'rdoc'
end

task :doctoweb => [:rdoc] do |t|
  # copies the rdoc to the CVS repository for ruby-feedparser website
  # repository is in $CVSDIR (default: ~/dev/ruby-feedparser-web)
  sh "tools/doctoweb.bash"
end

# Source archives (.tgz and .zip) of PKG_FILES.
Rake::PackageTask.new(PKG_NAME, PKG_VERSION) do |pkg|
  pkg.package_files = PKG_FILES
  pkg.need_zip = true
  pkg.need_tar = true
end

# "Gem" part of the Rakefile.
# The gem tasks are only defined when rake's gem support can be loaded;
# otherwise a notice is printed and the other tasks stay usable.
begin
  require 'rake/gempackagetask'

  spec = Gem::Specification.new do |s|
    s.name = PKG_NAME
    s.version = PKG_VERSION
    s.platform = Gem::Platform::RUBY
    s.summary = "Ruby library to parse ATOM and RSS feeds"
    s.description = "Ruby library to parse ATOM and RSS feeds"
    s.files = PKG_FILES
    s.require_path = 'lib'
    s.autorequire = 'feedparser'
    s.requirements << 'none'
  end

  Rake::GemPackageTask.new(spec) do |pkg|
    pkg.need_tar = true
    pkg.need_zip = true
  end
rescue LoadError
  puts "Will not generate gem."
end
data/lib/feedparser.rb ADDED
@@ -0,0 +1,28 @@
1
+ # =Ruby-feedparser - ATOM/RSS feed parser for Ruby
2
+ # License:: Ruby's license (see the LICENSE file) or GNU GPL, at your option.
3
+ # Website:: http://home.gna.org/ruby-feedparser/
4
+ #
5
+ # ==Introduction
6
+ #
7
+ # Ruby-Feedparser is an RSS and Atom parser for Ruby.
8
+ # Ruby-feedparser is :
9
+ # * based on REXML
10
+ # * built for robustness : most feeds are not valid, a parser can't ignore that
11
+ # * fully unit-tested
12
+ # * easy to use (it can output text or HTML easily)
13
+ #
14
+ # ==Example
15
+ # require 'net/http'
16
+ # require 'feedparser'
17
+ # require 'uri'
18
+ # s = Net::HTTP::get URI::parse('http://rss.slashdot.org/Slashdot/slashdot')
19
+ # f = FeedParser::Feed::new(s)
20
+ # f.title
21
+ # => "Slashdot"
22
+ # f.items.each { |i| puts i.title }
23
+ # [...]
24
+ # require 'feedparser/html-output'
25
+ # f.items.each { |i| puts i.to_html }
26
+ #
27
+
28
+ require 'feedparser/feedparser'
@@ -0,0 +1,343 @@
1
+ require 'rexml/document'
2
+ require 'time'
3
+ require 'feedparser/textconverters'
4
+ require 'feedparser/rexml_patch'
5
+ require 'feedparser/text-output'
6
+ require 'base64'
7
+
8
+ module FeedParser
9
+
10
+ VERSION = "0.7"
11
+
12
+ class UnknownFeedTypeException < RuntimeError
13
+ end
14
+
15
+ # an RSS/Atom feed
16
+ class Feed
17
+ attr_reader :type, :title, :link, :description, :creator, :encoding, :items
18
+
19
+ # REXML::Element for this feed.
20
+ attr_reader :xml
21
+
22
+ # parse str to build a Feed
23
+ def initialize(str = nil)
24
+ parse(str) if str
25
+ end
26
+
27
+ # Determines all the fields using a string containing an
28
+ # XML document
29
+ def parse(str)
30
+ # Dirty hack: some feeds contain the & char. It must be changed to &amp;
31
+ str.gsub!(/&(\s+)/, '&amp;\1')
32
+ doc = REXML::Document.new(str)
33
+ @xml = doc.root
34
+ # get feed info
35
+ @encoding = doc.encoding
36
+ @title,@link,@description,@creator = nil
37
+ @items = []
38
+ if doc.root.elements['channel'] || doc.root.elements['rss:channel']
39
+ @type = "rss"
40
+ # We have a RSS feed!
41
+ # Title
42
+ if (e = doc.root.elements['channel/title'] ||
43
+ doc.root.elements['rss:channel/rss:title']) && e.text
44
+ @title = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
45
+ end
46
+ # Link
47
+ if (e = doc.root.elements['channel/link'] ||
48
+ doc.root.elements['rss:channel/rss:link']) && e.text
49
+ @link = e.text.rmWhiteSpace!
50
+ end
51
+ # Description
52
+ if (e = doc.root.elements['channel/description'] ||
53
+ doc.root.elements['rss:channel/rss:description']) && e.text
54
+ @description = e.text.toUTF8(@encoding).rmWhiteSpace!
55
+ end
56
+ # Creator
57
+ if ((e = doc.root.elements['channel/dc:creator']) && e.text) ||
58
+ ((e = doc.root.elements['channel/author'] ||
59
+ doc.root.elements['rss:channel/rss:author']) && e.text)
60
+ @creator = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
61
+ end
62
+ # Items
63
+ if doc.root.elements['channel/item']
64
+ query = 'channel/item'
65
+ elsif doc.root.elements['item']
66
+ query = 'item'
67
+ elsif doc.root.elements['rss:channel/rss:item']
68
+ query = 'rss:channel/rss:item'
69
+ else
70
+ query = 'rss:item'
71
+ end
72
+ doc.root.each_element(query) { |e| @items << RSSItem::new(e, self) }
73
+
74
+ elsif doc.root.elements['/feed']
75
+ # We have an ATOM feed!
76
+ @type = "atom"
77
+ # Title
78
+ if (e = doc.root.elements['/feed/title']) && e.text
79
+ @title = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
80
+ end
81
+ # Link
82
+ doc.root.each_element('/feed/link') do |e|
83
+ if e.attribute('type') and (
84
+ e.attribute('type').value == 'text/html' or
85
+ e.attribute('type').value == 'application/xhtml' or
86
+ e.attribute('type').value == 'application/xhtml+xml')
87
+ if (h = e.attribute('href')) && h
88
+ @link = h.value.rmWhiteSpace!
89
+ end
90
+ end
91
+ end
92
+ # Description
93
+ if e = doc.root.elements['/feed/info']
94
+ e = e.elements['div'] || e
95
+ @description = e.to_s.toUTF8(@encoding).rmWhiteSpace!
96
+ end
97
+ # Items
98
+ doc.root.each_element('/feed/entry') do |e|
99
+ @items << AtomItem::new(e, self)
100
+ end
101
+ else
102
+ raise UnknownFeedTypeException::new
103
+ end
104
+ end
105
+
106
+ def to_s(localtime = true)
107
+ s = ''
108
+ s += "Type: #{@type}\n"
109
+ s += "Encoding: #{@encoding}\n"
110
+ s += "Title: #{@title}\n"
111
+ s += "Link: #{@link}\n"
112
+ s += "Description: #{@description}\n"
113
+ s += "Creator: #{@creator}\n"
114
+ s += "\n"
115
+ @items.each { |i| s += i.to_s(localtime) }
116
+ s
117
+ end
118
+ end
119
+
120
+ # an Item from a feed
121
+ class FeedItem
122
+ attr_accessor :title, :link, :content, :date, :creators, :subject,
123
+ :cacheditem
124
+
125
+ # The item's categories/tags. An array of strings.
126
+ attr_accessor :categories
127
+
128
+ # The item's enclosures childs. An array of (url, length, type) triplets.
129
+ attr_accessor :enclosures
130
+
131
+ attr_reader :feed
132
+
133
+ # REXML::Element for this item
134
+ attr_reader :xml
135
+
136
+ def initialize(item = nil, feed = nil)
137
+ @xml = item
138
+ @feed = feed
139
+ @title, @link, @content, @date, @subject = nil
140
+ @creators = []
141
+ @categories = []
142
+ @enclosures = []
143
+ parse(item) if item
144
+ end
145
+
146
+ def parse(item)
147
+ raise "parse() should be implemented by subclasses!"
148
+ end
149
+
150
+ def creator
151
+ case @creators.length
152
+ when 0
153
+ return nil
154
+ when 1
155
+ return creators[0]
156
+ else
157
+ return creators[0...-1].join(", ")+" and "+creators[-1]
158
+ end
159
+ end
160
+
161
+ def to_s(localtime = true)
162
+ s = "--------------------------------\n" +
163
+ "Title: #{@title}\nLink: #{@link}\n"
164
+ if localtime or @date.nil?
165
+ s += "Date: #{@date.to_s}\n"
166
+ else
167
+ s += "Date: #{@date.getutc.to_s}\n"
168
+ end
169
+ s += "Creator: #{creator}\n" +
170
+ "Subject: #{@subject}\n"
171
+ if defined?(@categories) and @categories.length > 0
172
+ s += "Filed under: " + @categories.join(', ') + "\n"
173
+ end
174
+ s += "Content:\n#{content}\n"
175
+ if defined?(@enclosures) and @enclosures.length > 0
176
+ s2 = "Enclosures:\n"
177
+ @enclosures.each do |e|
178
+ s2 += e.join(' ') + "\n"
179
+ end
180
+ s += s2
181
+ end
182
+ return s
183
+ end
184
+ end
185
+
186
+ class RSSItem < FeedItem
187
+
188
+
189
+ def parse(item)
190
+ # Title. If no title, use the pubDate as fallback.
191
+ if ((e = item.elements['title'] || item.elements['rss:title']) &&
192
+ e.text) ||
193
+ ((e = item.elements['pubDate'] || item.elements['rss:pubDate']) &&
194
+ e.text)
195
+ @title = e.text.unescape_html.toUTF8(@feed.encoding).html2text.rmWhiteSpace!
196
+ end
197
+ # Link
198
+ if ((e = item.elements['link'] || item.elements['rss:link']) && e.text)||
199
+ (e = item.elements['guid'] || item.elements['rss:guid'] and
200
+ not (e.attribute('isPermaLink') and
201
+ e.attribute('isPermaLink').value == 'false'))
202
+ @link = e.text.rmWhiteSpace!
203
+ end
204
+ # Content
205
+ if (e = item.elements['content:encoded']) ||
206
+ (e = item.elements['description'] || item.elements['rss:description'])
207
+ @content = FeedParser::getcontent(e, @feed)
208
+ end
209
+ # Date
210
+ if e = item.elements['dc:date'] || item.elements['pubDate'] ||
211
+ item.elements['rss:pubDate']
212
+ begin
213
+ @date = Time::xmlschema(e.text)
214
+ rescue
215
+ begin
216
+ @date = Time::rfc2822(e.text)
217
+ rescue
218
+ begin
219
+ @date = Time::parse(e.text)
220
+ rescue
221
+ @date = nil
222
+ end
223
+ end
224
+ end
225
+ end
226
+ # Creator
227
+ if (e = item.elements['dc:creator'] || item.elements['author'] ||
228
+ item.elements['rss:author']) && e.text
229
+ @creators << e.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
230
+ end
231
+ @creators << @feed.creator if @creators.empty? and @feed.creator
232
+
233
+ # Subject
234
+ if (e = item.elements['dc:subject']) && e.text
235
+ @subject = e.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
236
+ end
237
+ # Categories
238
+ cat_elts = []
239
+ item.each_element('dc:category') { |e| cat_elts << e if e.text }
240
+ item.each_element('category') { |e| cat_elts << e if e.text }
241
+ item.each_element('rss:category') { |e| cat_elts << e if e.text }
242
+
243
+ cat_elts.each do |e|
244
+ @categories << e.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
245
+ end
246
+ # Enclosures
247
+ item.each_element('enclosure') do |e|
248
+ url = e.attribute('url').value if e.attribute('url')
249
+ length = e.attribute('length').value if e.attribute('length')
250
+ type = e.attribute('type').value if e.attribute('type')
251
+ @enclosures << [ url, length, type ] if url
252
+ end
253
+ end
254
+ end
255
+
256
+ class AtomItem < FeedItem
257
+ def parse(item)
258
+ # Title
259
+ if (e = item.elements['title']) && e.text
260
+ @title = e.text.unescape_html.toUTF8(@feed.encoding).html2text.rmWhiteSpace!
261
+ end
262
+ # Link
263
+ item.each_element('link') do |e|
264
+ if (h = e.attribute('href')) && h.value
265
+ @link = h.value
266
+ end
267
+ end
268
+ # Content
269
+ if e = item.elements['content'] || item.elements['summary']
270
+ if (e.attribute('mode') and e.attribute('mode').value == 'escaped') &&
271
+ e.text
272
+ @content = e.text.toUTF8(@feed.encoding).rmWhiteSpace!
273
+ else
274
+ @content = FeedParser::getcontent(e, @feed)
275
+ end
276
+ end
277
+ # Date
278
+ if (e = item.elements['issued'] || e = item.elements['created'] || e = item.elements['updated'] || e = item.elements['published']) && e.text
279
+ begin
280
+ @date = Time::xmlschema(e.text)
281
+ rescue
282
+ begin
283
+ @date = Time::rfc2822(e.text)
284
+ rescue
285
+ begin
286
+ @date = Time::parse(e.text)
287
+ rescue
288
+ @date = nil
289
+ end
290
+ end
291
+ end
292
+ end
293
+ # Creator
294
+ item.each_element('author/name') do |e|
295
+ if e.text
296
+ @creators << e.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
297
+ end
298
+ end
299
+
300
+ @creators << @feed.creator if @creators.empty? and @feed.creator
301
+
302
+ # Categories
303
+ item.each_element('category') do |e|
304
+ if (h = e.attribute('term')) && h.value
305
+ # Use human-readable label if it is provided
306
+ if (l = e.attribute('label')) && l.value
307
+ cat = l.value
308
+ else
309
+ cat = h.value
310
+ end
311
+
312
+ @categories << cat.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
313
+ end
314
+ end
315
+ end
316
+ end
317
+
318
+ def FeedParser::getcontent(e, feed = nil)
319
+ encoding = feed ? feed.encoding : 'utf-8'
320
+ children = e.children.reject do |i|
321
+ i.class == REXML::Text and i.to_s.chomp == ''
322
+ end
323
+ if children.length > 1
324
+ s = ''
325
+ children.each do |c|
326
+ s += c.to_s if c.class != REXML::Comment
327
+ end
328
+ return s.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
329
+ elsif children.length == 1
330
+ c = children[0]
331
+ if c.class == REXML::Text
332
+ return e.text.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
333
+ elsif c.class == REXML::CData
334
+ return c.to_s.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
335
+ elsif c.class == REXML::Element
336
+ # only one element. recurse.
337
+ return getcontent(c, feed)
338
+ elsif c.text
339
+ return c.text.toUTF8(encoding).text2html(feed)
340
+ end
341
+ end
342
+ end
343
+ end