penso-feedparser 0.8

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,14 @@
1
+ Ruby-Feedparser
2
+ -----------------
3
+ by Lucas Nussbaum <lucas@lucas-nussbaum.net>
4
+
5
+ Currently, all the information is provided on
6
+
7
+ http://home.gna.org/ruby-feedparser/
8
+
9
+ If you need to ask questions, feel free to ask them on the
10
+ ruby-feedparser-devel@gna.org mailing list.
11
+
12
+ Ruby-Feedparser is released under the Ruby license (see the LICENSE file),
13
+ which is compatible with the GNU GPL (see the COPYING file) via an explicit
14
+ dual-licensing clause.
data/Rakefile ADDED
@@ -0,0 +1,85 @@
1
+ require 'rake/testtask'
2
+ require 'rake/rdoctask'
3
+ require 'rake/packagetask'
4
+ require 'rake'
5
+ require 'find'
6
+
7
# Globals: package identity shared by the tar/zip task and the gem specification.
PKG_NAME = 'penso-feedparser'
PKG_VERSION = '0.8'

# Files shipped in every package: the fixed top-level files plus everything
# found while walking lib/, test/ and tools/. Directories whose path matches
# /\.svn/ are pruned from the walk.
PKG_FILES = [ 'ChangeLog', 'README', 'COPYING', 'LICENSE', 'setup.rb', 'Rakefile']
Find.find('lib/', 'test/', 'tools/') do |path|
  # Find.prune does not return: it skips this directory and its content.
  Find.prune if FileTest.directory?(path) && path =~ /\.svn/
  PKG_FILES << path
end

# Entries under test/source/ and test/*_output/ are left out of the packages.
PKG_FILES.reject! { |path| path =~ /^test\/(source|.*_output)\// }

task :default => [:package]

# Runs every test/tc_*.rb file, with test/ added to the load path.
Rake::TestTask.new do |t|
  t.libs << "test"
  t.test_files = FileList['test/tc_*.rb']
end

# Generates the RDoc documentation for every file below lib/ into rdoc/.
Rake::RDocTask.new do |rd|
  sources = []
  Find.find('lib/') do |path|
    Find.prune if FileTest.directory?(path) && path =~ /\.svn/
    sources << path unless FileTest.directory?(path)
  end
  # hack to document the Feedparser module properly:
  # lib/feedparser.rb is moved to the front of the file list.
  sources.delete('lib/feedparser.rb')
  sources.unshift('lib/feedparser.rb')
  rd.rdoc_files.include(sources)
  ['--all', '--diagram', '--fileboxes', '--inline-source', '--line-numbers'].each do |opt|
    rd.options << opt
  end
  rd.rdoc_dir = 'rdoc'
end

task :doctoweb => [:rdoc] do |t|
  # copies the rdoc to the CVS repository for ruby-feedparser website
  # repository is in $CVSDIR (default: ~/dev/ruby-feedparser-web)
  sh "tools/doctoweb.bash"
end

# Builds tar and zip archives of PKG_FILES.
Rake::PackageTask.new(PKG_NAME, PKG_VERSION) do |p|
  p.package_files = PKG_FILES
  p.need_tar = true
  p.need_zip = true
end

# "Gem" part of the Rakefile
# The gem tasks are only defined when rake/gempackagetask can be loaded.
begin
  require 'rake/gempackagetask'

  spec = Gem::Specification.new do |s|
    s.name = PKG_NAME
    s.version = PKG_VERSION
    s.platform = Gem::Platform::RUBY
    s.summary = "Ruby library to parse ATOM and RSS feeds"
    s.description = "Ruby library to parse ATOM and RSS feeds"
    s.author = "gna, penso"
    s.requirements << 'none'
    s.require_path = 'lib'
    s.autorequire = 'feedparser'
    s.files = PKG_FILES
  end

  Rake::GemPackageTask.new(spec) do |pkg|
    pkg.need_zip = true
    pkg.need_tar = true
  end
rescue LoadError
  puts "Will not generate gem."
end
data/lib/feedparser.rb ADDED
@@ -0,0 +1,28 @@
1
+ # =Ruby-feedparser - ATOM/RSS feed parser for Ruby
2
+ # License:: Ruby's license (see the LICENSE file) or GNU GPL, at your option.
3
+ # Website:: http://home.gna.org/ruby-feedparser/
4
+ #
5
+ # ==Introduction
6
+ #
7
+ # Ruby-Feedparser is an RSS and Atom parser for Ruby.
8
+ # Ruby-feedparser is :
9
+ # * based on REXML
10
+ # * built for robustness : most feeds are not valid, a parser can't ignore that
11
+ # * fully unit-tested
12
+ # * easy to use (it can output text or HTML easily)
13
+ #
14
+ # ==Example
15
+ # require 'net/http'
16
+ # require 'feedparser'
17
+ # require 'uri'
18
+ # s = Net::HTTP::get URI::parse('http://rss.slashdot.org/Slashdot/slashdot')
19
+ # f = FeedParser::Feed::new(s)
20
+ # f.title
21
+ # => "Slashdot"
22
+ # f.items.each { |i| puts i.title }
23
+ # [...]
24
+ # require 'feedparser/html-output'
25
+ # f.items.each { |i| puts i.to_html }
26
+ #
27
+
28
+ require 'feedparser/feedparser'
@@ -0,0 +1,372 @@
1
+ require 'rexml/document'
2
+ require 'time'
3
+ require 'feedparser/textconverters'
4
+ require 'feedparser/rexml_patch'
5
+ require 'feedparser/text-output'
6
+ require 'base64'
7
+
8
+ module FeedParser
9
+
10
+ VERSION = "0.8"
11
+
12
# Raised by Feed#parse when the document is neither an RSS nor an Atom feed.
class UnknownFeedTypeException < RuntimeError; end
14
+
15
# An RSS/Atom feed, built from a string containing the feed's XML document.
#
# After #parse, +type+ is "rss" or "atom", +encoding+ is the encoding REXML
# reports for the document and +items+ holds one RSSItem or AtomItem per
# entry. +link+, +description+ and +creator+ stay nil when the feed does not
# provide them.
class Feed
  attr_reader :type, :title, :link, :description, :creator, :encoding, :items

  # REXML::Element for this feed.
  attr_reader :xml

  # Builds a Feed; +str+ is parsed right away when it is given.
  def initialize(str = nil)
    # Start with an empty item list so that #items and #to_s also work on a
    # feed that has not been parsed (yet).
    @items = []
    parse(str) if str
  end

  # Determines all the fields using a string containing an
  # XML document.
  #
  # +str+ is not modified (it may be frozen).
  # Raises UnknownFeedTypeException when the document has no root element or
  # is neither an RSS nor an Atom feed.
  def parse(str)
    # Dirty hack: some feeds contain the & char. It must be changed to &amp;
    # This works on a copy, the caller's string is left untouched.
    str = str.gsub(/&(\s+)/, '&amp;\1')
    doc = REXML::Document.new(str)
    root = doc.root
    @xml = root
    # get feed info
    @encoding = doc.encoding
    @link = @description = @creator = nil
    @title = ""
    @items = []
    # An empty document has no root element: there is nothing to recognize.
    raise UnknownFeedTypeException::new if root.nil?
    if root.elements['channel'] || root.elements['rss:channel']
      parse_rss(root)
    elsif root.elements['/feed']
      parse_atom(root)
    else
      raise UnknownFeedTypeException::new
    end
  end

  # Text dump of the feed followed by the dump of every item.
  # +localtime+ is handed to each item's to_s.
  def to_s(localtime = true)
    header = "Type: #{@type}\n" +
             "Encoding: #{@encoding}\n" +
             "Title: #{@title}\n" +
             "Link: #{@link}\n" +
             "Description: #{@description}\n" +
             "Creator: #{@creator}\n" +
             "\n"
    @items.inject(header) { |out, i| out + i.to_s(localtime) }
  end

  private

  # Reads the channel information and the items of an RSS feed.
  def parse_rss(root)
    @type = "rss"
    # Title
    if (e = root.elements['channel/title'] ||
        root.elements['rss:channel/rss:title']) && e.text
      @title = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
    end
    # Link
    if (e = root.elements['channel/link'] ||
        root.elements['rss:channel/rss:link']) && e.text
      @link = e.text.rmWhiteSpace!
    end
    # Description
    if (e = root.elements['channel/description'] ||
        root.elements['rss:channel/rss:description']) && e.text
      @description = e.text.toUTF8(@encoding).rmWhiteSpace!
    end
    # Creator: dc:creator is tried first, then author
    if ((e = root.elements['channel/dc:creator']) && e.text) ||
       ((e = root.elements['channel/author'] ||
         root.elements['rss:channel/rss:author']) && e.text)
      @creator = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
    end
    # Items: they are either inside the channel or directly below the root,
    # with or without the rss: prefix.
    if root.elements['channel/item']
      query = 'channel/item'
    elsif root.elements['item']
      query = 'item'
    elsif root.elements['rss:channel/rss:item']
      query = 'rss:channel/rss:item'
    else
      query = 'rss:item'
    end
    root.each_element(query) { |e| @items << RSSItem::new(e, self) }
  end

  # Reads the feed information and the entries of an ATOM feed.
  def parse_atom(root)
    @type = "atom"
    # Title
    if (e = root.elements['/feed/title']) && e.text
      @title = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
    end
    # Link: only (X)HTML links are considered, the last one wins.
    html_types = ['text/html', 'application/xhtml', 'application/xhtml+xml']
    root.each_element('/feed/link') do |l|
      type = l.attribute('type')
      next unless type && html_types.include?(type.value)
      if (h = l.attribute('href'))
        @link = h.value.rmWhiteSpace!
      end
    end
    # Description
    if (e = root.elements['/feed/info'])
      e = e.elements['div'] || e
      @description = e.to_s.toUTF8(@encoding).rmWhiteSpace!
    end
    # Items
    root.each_element('/feed/entry') do |e|
      @items << AtomItem::new(e, self)
    end
  end
end
121
+
122
# An item from a feed. This is the common base of the concrete item
# classes, which provide #parse.
class FeedItem
  attr_accessor :title, :link, :content, :date, :creators, :subject,
                :cacheditem, :links, :item_id, :georss

  # The item's categories/tags. An array of strings.
  attr_accessor :categories

  # The item's enclosures childs. An array of (url, length, type) triplets.
  attr_accessor :enclosures

  # The feed handed to the constructor (may be nil).
  attr_reader :feed

  # REXML::Element for this item
  attr_reader :xml

  # Sets every field to its empty default, then parses +item+ when it is given.
  def initialize(item = nil, feed = nil)
    @xml = item
    @feed = feed
    @link = @content = @date = @subject = nil
    @title = ""
    @item_id = ""
    @links = []
    @georss = []
    @creators = []
    @categories = []
    @enclosures = []
    parse(item) if item
  end

  # Fills the fields from +item+. Always raises here: subclasses override it.
  def parse(item)
    raise "parse() should be implemented by subclasses!"
  end

  # The creators as a single string ("a", "a and b", "a, b and c"),
  # or nil when there is no creator.
  def creator
    count = @creators.length
    return nil if count == 0
    return creators[0] if count == 1
    creators[0...-1].join(", ") + " and " + creators[-1]
  end

  # Text dump of the item. The date is converted to UTC when +localtime+
  # is false.
  def to_s(localtime = true)
    shown_date = (localtime || @date.nil?) ? @date : @date.getutc
    out = "--------------------------------\n" +
          "Title: #{@title}\nLink: #{@link}\n" +
          "Date: #{shown_date.to_s}\n" +
          "Creator: #{creator}\n" +
          "Subject: #{@subject}\n"
    if defined?(@categories) and @categories.length > 0
      out += "Filed under: " + @categories.join(', ') + "\n"
    end
    out += "Content:\n#{content}\n"
    if defined?(@enclosures) and @enclosures.length > 0
      out += "Enclosures:\n" + @enclosures.map { |enc| enc.join(' ') + "\n" }.join
    end
    out
  end
end
192
+
193
# An item of an RSS feed.
class RSSItem < FeedItem

  # Fills the fields of this item from +item+, the REXML::Element of the
  # RSS item. Most elements are looked up both without prefix and with the
  # rss: prefix.
  def parse(item)
    encoding = feed_encoding
    # Title. If no title, use the pubDate as fallback.
    if ((e = item.elements['title'] || item.elements['rss:title']) &&
        e.text) ||
       ((e = item.elements['pubDate'] || item.elements['rss:pubDate']) &&
        e.text)
      @title = e.text.unescape_html.toUTF8(encoding).html2text.rmWhiteSpace!
    end
    # Geo
    if (e = item.elements['georss:point']) && e.text
      @georss = e.text.split
    end
    # Link. The guid is used as fallback, unless it says isPermaLink="false".
    # A guid without text is ignored: there is nothing to use as a link.
    if ((e = item.elements['link'] || item.elements['rss:link']) && e.text) ||
       ((e = item.elements['guid'] || item.elements['rss:guid']) && e.text &&
        !(e.attribute('isPermaLink') &&
          e.attribute('isPermaLink').value == 'false'))
      @link = e.text.rmWhiteSpace!
    end
    # Content
    if (e = item.elements['content:encoded']) ||
       (e = item.elements['description'] || item.elements['rss:description'])
      @content = FeedParser::getcontent(e, @feed)
    end
    # Date
    if (e = item.elements['dc:date'] || item.elements['pubDate'] ||
        item.elements['rss:pubDate'])
      @date = parse_date(e.text)
    end
    # Creator
    if (e = item.elements['dc:creator'] || item.elements['author'] ||
        item.elements['rss:author']) && e.text
      @creators << e.text.unescape_html.toUTF8(encoding).rmWhiteSpace!
    end
    # Fall back to the creator of the feed (if there is a feed).
    @creators << @feed.creator if @creators.empty? && @feed && @feed.creator

    # Subject
    if (e = item.elements['dc:subject']) && e.text
      @subject = e.text.unescape_html.toUTF8(encoding).rmWhiteSpace!
    end
    # Categories
    ['dc:category', 'category', 'rss:category'].each do |query|
      item.each_element(query) do |c|
        next unless c.text
        @categories << c.text.unescape_html.toUTF8(encoding).rmWhiteSpace!
      end
    end
    # Enclosures. Those without url are dropped.
    item.each_element('enclosure') do |enc|
      url, length, type = ['url', 'length', 'type'].map do |name|
        enc.attribute(name).value if enc.attribute(name)
      end
      @enclosures << [ url, length, type ] if url
    end
  end

  private

  # Encoding of the feed, or 'utf-8' when the item was built without feed
  # (same default as FeedParser::getcontent).
  def feed_encoding
    @feed ? @feed.encoding : 'utf-8'
  end

  # Converts +text+ to a Time, trying Time::xmlschema, then Time::rfc2822,
  # then Time::parse. Returns nil when none of them accepts +text+.
  def parse_date(text)
    [:xmlschema, :rfc2822, :parse].each do |format|
      begin
        return Time.send(format, text)
      rescue StandardError
        # try the next, more lenient format
      end
    end
    nil
  end
end
266
+
267
# An entry of an ATOM feed.
class AtomItem < FeedItem
  # Fills the fields of this item from +item+, the REXML::Element of the
  # ATOM entry.
  def parse(item)
    encoding = feed_encoding
    # Title
    if (e = item.elements['title']) && e.text
      @title = e.text.unescape_html.toUTF8(encoding).html2text.rmWhiteSpace!
    end
    # Geo
    if (e = item.elements['georss:point']) && e.text
      @georss = e.text.split
    end
    # Link. Every link with a href is recorded in @links; @link ends up being
    # the href of the last one.
    item.each_element('link') do |l|
      next unless (h = l.attribute('href')) && h.value
      @link = h.value
      if l.attribute('rel') && l.attribute('type')
        @links << {:rel => l.attribute('rel').value, :href => h.value, :type => l.attribute('type').value}
      else
        @links << {:href => h.value}
      end
    end
    # Content
    if (e = item.elements['content'] || item.elements['summary'])
      mode = e.attribute('mode')
      if mode && mode.value == 'escaped' && e.text
        @content = e.text.toUTF8(encoding).rmWhiteSpace!
      else
        @content = FeedParser::getcontent(e, @feed)
      end
    end

    # Id. An id element without text leaves the "" default in place.
    if (h = item.elements['id']) && h.text
      @item_id = h.text
    end

    # Date: the first element found among issued, created, updated, published.
    e = item.elements['issued'] || item.elements['created'] ||
        item.elements['updated'] || item.elements['published']
    @date = parse_date(e.text) if e && e.text

    # Creator
    item.each_element('author/name') do |a|
      @creators << a.text.unescape_html.toUTF8(encoding).rmWhiteSpace! if a.text
    end
    # Fall back to the creator of the feed (if there is a feed).
    @creators << @feed.creator if @creators.empty? && @feed && @feed.creator

    # Categories
    item.each_element('category') do |c|
      next unless (term = c.attribute('term')) && term.value
      # Use human-readable label if it is provided
      label = c.attribute('label')
      cat = (label && label.value) ? label.value : term.value
      @categories << cat.unescape_html.toUTF8(encoding).rmWhiteSpace!
    end
  end

  private

  # Encoding of the feed, or 'utf-8' when the item was built without feed
  # (same default as FeedParser::getcontent).
  def feed_encoding
    @feed ? @feed.encoding : 'utf-8'
  end

  # Converts +text+ to a Time, trying Time::xmlschema, then Time::rfc2822,
  # then Time::parse. Returns nil when none of them accepts +text+.
  def parse_date(text)
    [:xmlschema, :rfc2822, :parse].each do |format|
      begin
        return Time.send(format, text)
      rescue StandardError
        # try the next, more lenient format
      end
    end
    nil
  end
end
346
+
347
# Builds the content string for the REXML element +e+.
#
# The children of +e+ are serialized, passed through String#toUTF8 with the
# encoding of +feed+ ('utf-8' when there is no feed), and finally through
# String#text2html. Those String methods (and rmWhiteSpace!) are defined
# outside this file.
# Returns nil when +e+ has no usable child.
def FeedParser::getcontent(e, feed = nil)
  encoding = feed ? feed.encoding : 'utf-8'
  # Text nodes that are empty, or nothing but a line break, do not count.
  nodes = e.children.reject do |n|
    n.instance_of?(REXML::Text) && n.to_s.chomp == ''
  end

  if nodes.length > 1
    # Several children: concatenate them as they are, leaving comments out.
    raw = nodes.inject('') do |acc, n|
      n.instance_of?(REXML::Comment) ? acc : acc + n.to_s
    end
    return raw.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
  end
  return nil unless nodes.length == 1

  only = nodes[0]
  # The checks below are on the exact class: REXML::CData is handled
  # separately from REXML::Text.
  if only.instance_of?(REXML::Text)
    e.text.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
  elsif only.instance_of?(REXML::CData)
    only.to_s.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
  elsif only.instance_of?(REXML::Element)
    # only one element. recurse.
    getcontent(only, feed)
  elsif only.text
    only.text.toUTF8(encoding).text2html(feed)
  end
end
372
+ end