penso-feedparser 0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,14 @@
1
+ Ruby-Feedparser
2
+ -----------------
3
+ by Lucas Nussbaum <lucas@lucas-nussbaum.net>
4
+
5
+ Currently, all the information is provided on
6
+
7
+ http://home.gna.org/ruby-feedparser/
8
+
9
+ If you need to ask questions, feel free to ask them on the
10
+ ruby-feedparser-devel@gna.org mailing list.
11
+
12
+ Ruby-Feedparser is released under the Ruby license (see the LICENSE file),
13
+ which is compatible with the GNU GPL (see the COPYING file) via an explicit
14
+ dual-licensing clause.
data/Rakefile ADDED
@@ -0,0 +1,85 @@
1
+ require 'rake/testtask'
2
+ require 'rake/rdoctask'
3
+ require 'rake/packagetask'
4
+ require 'rake'
5
+ require 'find'
6
+
7
# Package identity shared by the tar/zip package task and the gem task.
PKG_NAME = 'penso-feedparser'
PKG_VERSION = '0.8'

# Files shipped in the packages: a fixed list of top-level files plus every
# path found under lib/, test/ and tools/. Directories whose path matches
# .svn are pruned, so nothing below them is visited.
PKG_FILES = [ 'ChangeLog', 'README', 'COPYING', 'LICENSE', 'setup.rb', 'Rakefile']
Find.find('lib/', 'test/', 'tools/') do |path|
  # Find.prune does not return: it skips this directory and its contents.
  Find.prune if FileTest.directory?(path) && path =~ /\.svn/
  PKG_FILES << path
end

# Drop test/source/ and test/*_output/ entries from the package.
PKG_FILES.reject! { |path| path =~ /^test\/(source|.*_output)\// }

task :default => [:package]

# Runs every test/tc_*.rb file with test/ on the load path.
Rake::TestTask.new do |test|
  test.libs << "test"
  test.test_files = FileList['test/tc_*.rb']
end

# Generates the RDoc for every file under lib/ into rdoc/.
Rake::RDocTask.new do |rd|
  sources = []
  Find.find('lib/') do |path|
    Find.prune if FileTest.directory?(path) && path =~ /\.svn/
    sources << path unless FileTest.directory?(path)
  end
  # hack to document the Feedparser module properly: move it to the front
  sources.delete('lib/feedparser.rb')
  sources.unshift('lib/feedparser.rb')
  rd.rdoc_files.include(sources)
  rd.options.push('--all', '--diagram', '--fileboxes', '--inline-source', '--line-numbers')
  rd.rdoc_dir = 'rdoc'
end

task :doctoweb => [:rdoc] do
  # copies the rdoc to the CVS repository for ruby-feedparser website
  # repository is in $CVSDIR (default: ~/dev/ruby-feedparser-web)
  sh "tools/doctoweb.bash"
end

# Builds the .tar and .zip archives from PKG_FILES.
Rake::PackageTask.new(PKG_NAME, PKG_VERSION) do |archive|
  archive.need_tar = true
  archive.need_zip = true
  archive.package_files = PKG_FILES
end

# "Gem" part of the Rakefile. It is skipped with a notice when the gem
# packaging task cannot be loaded.
begin
  require 'rake/gempackagetask'

  gemspec = Gem::Specification.new do |s|
    s.name = PKG_NAME
    s.version = PKG_VERSION
    s.platform = Gem::Platform::RUBY
    s.summary = "Ruby library to parse ATOM and RSS feeds"
    s.description = "Ruby library to parse ATOM and RSS feeds"
    s.author = "gna, penso"
    s.requirements << 'none'
    s.require_path = 'lib'
    s.autorequire = 'feedparser'
    s.files = PKG_FILES
  end

  Rake::GemPackageTask.new(gemspec) do |gem_task|
    gem_task.need_zip = true
    gem_task.need_tar = true
  end
rescue LoadError
  puts "Will not generate gem."
end
data/lib/feedparser.rb ADDED
@@ -0,0 +1,28 @@
1
+ # =Ruby-feedparser - ATOM/RSS feed parser for Ruby
2
+ # License:: Ruby's license (see the LICENSE file) or GNU GPL, at your option.
3
+ # Website:: http://home.gna.org/ruby-feedparser/
4
+ #
5
+ # ==Introduction
6
+ #
7
+ # Ruby-Feedparser is an RSS and Atom parser for Ruby.
8
+ # Ruby-feedparser is :
9
+ # * based on REXML
10
+ # * built for robustness : most feeds are not valid, a parser can't ignore that
11
+ # * fully unit-tested
12
+ # * easy to use (it can output text or HTML easily)
13
+ #
14
+ # ==Example
15
+ # require 'net/http'
16
+ # require 'feedparser'
17
+ # require 'uri'
18
+ # s = Net::HTTP::get URI::parse('http://rss.slashdot.org/Slashdot/slashdot')
19
+ # f = FeedParser::Feed::new(s)
20
+ # f.title
21
+ # => "Slashdot"
22
+ # f.items.each { |i| puts i.title }
23
+ # [...]
24
+ # require 'feedparser/html-output'
25
+ # f.items.each { |i| puts i.to_html }
26
+ #
27
+
28
+ require 'feedparser/feedparser'
@@ -0,0 +1,372 @@
1
+ require 'rexml/document'
2
+ require 'time'
3
+ require 'feedparser/textconverters'
4
+ require 'feedparser/rexml_patch'
5
+ require 'feedparser/text-output'
6
+ require 'base64'
7
+
8
+ module FeedParser
9
+
10
+ VERSION = "0.8"
11
+
12
+ class UnknownFeedTypeException < RuntimeError
13
+ end
14
+
15
+ # an RSS/Atom feed
16
+ class Feed
17
+ attr_reader :type, :title, :link, :description, :creator, :encoding, :items
18
+
19
+ # REXML::Element for this feed.
20
+ attr_reader :xml
21
+
22
+ # parse str to build a Feed
23
+ def initialize(str = nil)
24
+ parse(str) if str
25
+ end
26
+
27
+ # Determines all the fields using a string containing an
28
+ # XML document
29
+ def parse(str)
30
+ # Dirty hack: some feeds contain the & char. It must be changed to &amp;
31
+ str.gsub!(/&(\s+)/, '&amp;\1')
32
+ doc = REXML::Document.new(str)
33
+ @xml = doc.root
34
+ # get feed info
35
+ @encoding = doc.encoding
36
+ @title,@link,@description,@creator = nil
37
+ @title = ""
38
+ @items = []
39
+ if doc.root.elements['channel'] || doc.root.elements['rss:channel']
40
+ @type = "rss"
41
+ # We have a RSS feed!
42
+ # Title
43
+ if (e = doc.root.elements['channel/title'] ||
44
+ doc.root.elements['rss:channel/rss:title']) && e.text
45
+ @title = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
46
+ end
47
+ # Link
48
+ if (e = doc.root.elements['channel/link'] ||
49
+ doc.root.elements['rss:channel/rss:link']) && e.text
50
+ @link = e.text.rmWhiteSpace!
51
+ end
52
+ # Description
53
+ if (e = doc.root.elements['channel/description'] ||
54
+ doc.root.elements['rss:channel/rss:description']) && e.text
55
+ @description = e.text.toUTF8(@encoding).rmWhiteSpace!
56
+ end
57
+
58
+ # Creator
59
+ if ((e = doc.root.elements['channel/dc:creator']) && e.text) ||
60
+ ((e = doc.root.elements['channel/author'] ||
61
+ doc.root.elements['rss:channel/rss:author']) && e.text)
62
+ @creator = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
63
+ end
64
+ # Items
65
+ if doc.root.elements['channel/item']
66
+ query = 'channel/item'
67
+ elsif doc.root.elements['item']
68
+ query = 'item'
69
+ elsif doc.root.elements['rss:channel/rss:item']
70
+ query = 'rss:channel/rss:item'
71
+ else
72
+ query = 'rss:item'
73
+ end
74
+ doc.root.each_element(query) { |e| @items << RSSItem::new(e, self) }
75
+
76
+ elsif doc.root.elements['/feed']
77
+ # We have an ATOM feed!
78
+ @type = "atom"
79
+ # Title
80
+ if (e = doc.root.elements['/feed/title']) && e.text
81
+ @title = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
82
+ end
83
+ # Link
84
+ doc.root.each_element('/feed/link') do |e|
85
+ if e.attribute('type') and (
86
+ e.attribute('type').value == 'text/html' or
87
+ e.attribute('type').value == 'application/xhtml' or
88
+ e.attribute('type').value == 'application/xhtml+xml')
89
+ if (h = e.attribute('href')) && h
90
+ @link = h.value.rmWhiteSpace!
91
+ end
92
+ end
93
+ end
94
+ # Description
95
+ if e = doc.root.elements['/feed/info']
96
+ e = e.elements['div'] || e
97
+ @description = e.to_s.toUTF8(@encoding).rmWhiteSpace!
98
+ end
99
+ # Items
100
+ doc.root.each_element('/feed/entry') do |e|
101
+ @items << AtomItem::new(e, self)
102
+ end
103
+ else
104
+ raise UnknownFeedTypeException::new
105
+ end
106
+ end
107
+
108
+ def to_s(localtime = true)
109
+ s = ''
110
+ s += "Type: #{@type}\n"
111
+ s += "Encoding: #{@encoding}\n"
112
+ s += "Title: #{@title}\n"
113
+ s += "Link: #{@link}\n"
114
+ s += "Description: #{@description}\n"
115
+ s += "Creator: #{@creator}\n"
116
+ s += "\n"
117
+ @items.each { |i| s += i.to_s(localtime) }
118
+ s
119
+ end
120
+ end
121
+
122
+ # an Item from a feed
123
+ class FeedItem
124
+ attr_accessor :title, :link, :content, :date, :creators, :subject,
125
+ :cacheditem, :links, :item_id, :georss
126
+
127
+ # The item's categories/tags. An array of strings.
128
+ attr_accessor :categories
129
+
130
+ # The item's enclosures childs. An array of (url, length, type) triplets.
131
+ attr_accessor :enclosures
132
+
133
+ attr_reader :feed
134
+
135
+ # REXML::Element for this item
136
+ attr_reader :xml
137
+
138
+ def initialize(item = nil, feed = nil)
139
+ @xml = item
140
+ @feed = feed
141
+ @title, @link, @content, @date, @subject = nil
142
+ @links = []
143
+ @georss = []
144
+ @item_id = ""
145
+ @creators = []
146
+ @categories = []
147
+ @enclosures = []
148
+
149
+ @title = ""
150
+ parse(item) if item
151
+ end
152
+
153
+ def parse(item)
154
+ raise "parse() should be implemented by subclasses!"
155
+ end
156
+
157
+ def creator
158
+ case @creators.length
159
+ when 0
160
+ return nil
161
+ when 1
162
+ return creators[0]
163
+ else
164
+ return creators[0...-1].join(", ")+" and "+creators[-1]
165
+ end
166
+ end
167
+
168
+ def to_s(localtime = true)
169
+ s = "--------------------------------\n" +
170
+ "Title: #{@title}\nLink: #{@link}\n"
171
+ if localtime or @date.nil?
172
+ s += "Date: #{@date.to_s}\n"
173
+ else
174
+ s += "Date: #{@date.getutc.to_s}\n"
175
+ end
176
+ s += "Creator: #{creator}\n" +
177
+ "Subject: #{@subject}\n"
178
+ if defined?(@categories) and @categories.length > 0
179
+ s += "Filed under: " + @categories.join(', ') + "\n"
180
+ end
181
+ s += "Content:\n#{content}\n"
182
+ if defined?(@enclosures) and @enclosures.length > 0
183
+ s2 = "Enclosures:\n"
184
+ @enclosures.each do |e|
185
+ s2 += e.join(' ') + "\n"
186
+ end
187
+ s += s2
188
+ end
189
+ return s
190
+ end
191
+ end
192
+
193
+ class RSSItem < FeedItem
194
+
195
+
196
+ def parse(item)
197
+ # Title. If no title, use the pubDate as fallback.
198
+ if ((e = item.elements['title'] || item.elements['rss:title']) &&
199
+ e.text) ||
200
+ ((e = item.elements['pubDate'] || item.elements['rss:pubDate']) &&
201
+ e.text)
202
+ @title = e.text.unescape_html.toUTF8(@feed.encoding).html2text.rmWhiteSpace!
203
+ end
204
+ # Geo
205
+ if (e = item.elements['georss:point']) && e.text
206
+ @georss = e.text.split
207
+ end
208
+ # Link
209
+ if ((e = item.elements['link'] || item.elements['rss:link']) && e.text)||
210
+ (e = item.elements['guid'] || item.elements['rss:guid'] and
211
+ not (e.attribute('isPermaLink') and
212
+ e.attribute('isPermaLink').value == 'false'))
213
+ @link = e.text.rmWhiteSpace!
214
+ end
215
+ # Content
216
+ if (e = item.elements['content:encoded']) ||
217
+ (e = item.elements['description'] || item.elements['rss:description'])
218
+ @content = FeedParser::getcontent(e, @feed)
219
+ end
220
+ # Date
221
+ if e = item.elements['dc:date'] || item.elements['pubDate'] ||
222
+ item.elements['rss:pubDate']
223
+ begin
224
+ @date = Time::xmlschema(e.text)
225
+ rescue
226
+ begin
227
+ @date = Time::rfc2822(e.text)
228
+ rescue
229
+ begin
230
+ @date = Time::parse(e.text)
231
+ rescue
232
+ @date = nil
233
+ end
234
+ end
235
+ end
236
+ end
237
+ # Creator
238
+ if (e = item.elements['dc:creator'] || item.elements['author'] ||
239
+ item.elements['rss:author']) && e.text
240
+ @creators << e.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
241
+ end
242
+ @creators << @feed.creator if @creators.empty? and @feed.creator
243
+
244
+ # Subject
245
+ if (e = item.elements['dc:subject']) && e.text
246
+ @subject = e.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
247
+ end
248
+ # Categories
249
+ cat_elts = []
250
+ item.each_element('dc:category') { |e| cat_elts << e if e.text }
251
+ item.each_element('category') { |e| cat_elts << e if e.text }
252
+ item.each_element('rss:category') { |e| cat_elts << e if e.text }
253
+
254
+ cat_elts.each do |e|
255
+ @categories << e.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
256
+ end
257
+ # Enclosures
258
+ item.each_element('enclosure') do |e|
259
+ url = e.attribute('url').value if e.attribute('url')
260
+ length = e.attribute('length').value if e.attribute('length')
261
+ type = e.attribute('type').value if e.attribute('type')
262
+ @enclosures << [ url, length, type ] if url
263
+ end
264
+ end
265
+ end
266
+
267
+ class AtomItem < FeedItem
268
+ def parse(item)
269
+ # Title
270
+ if (e = item.elements['title']) && e.text
271
+ @title = e.text.unescape_html.toUTF8(@feed.encoding).html2text.rmWhiteSpace!
272
+ end
273
+ # Geo
274
+ if (e = item.elements['georss:point']) && e.text
275
+ @georss = e.text.split
276
+ end
277
+ # Link
278
+ item.each_element('link') do |e|
279
+
280
+ if (h = e.attribute('href')) && h.value
281
+ @link = h.value
282
+
283
+ if e.attribute('rel') and e.attribute('type')
284
+ @links << {:rel => e.attribute('rel').value, :href => h.value, :type => e.attribute('type').value}
285
+ else
286
+ @links << {:href => h.value}
287
+ end
288
+
289
+ end
290
+ end
291
+ # Content
292
+ if e = item.elements['content'] || item.elements['summary']
293
+ if (e.attribute('mode') and e.attribute('mode').value == 'escaped') &&
294
+ e.text
295
+ @content = e.text.toUTF8(@feed.encoding).rmWhiteSpace!
296
+ else
297
+ @content = FeedParser::getcontent(e, @feed)
298
+ end
299
+ end
300
+
301
+ if (h = item.elements['id'])
302
+ @item_id = h.text
303
+ end
304
+
305
+ # Date
306
+ if (e = item.elements['issued'] || e = item.elements['created'] || e = item.elements['updated'] || e = item.elements['published']) && e.text
307
+ begin
308
+ @date = Time::xmlschema(e.text)
309
+ rescue
310
+ begin
311
+ @date = Time::rfc2822(e.text)
312
+ rescue
313
+ begin
314
+ @date = Time::parse(e.text)
315
+ rescue
316
+ @date = nil
317
+ end
318
+ end
319
+ end
320
+ end
321
+ # Creator
322
+ item.each_element('author/name') do |e|
323
+ if e.text
324
+ @creators << e.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
325
+ end
326
+ end
327
+
328
+ @creators << @feed.creator if @creators.empty? and @feed.creator
329
+
330
+
331
+ # Categories
332
+ item.each_element('category') do |e|
333
+ if (h = e.attribute('term')) && h.value
334
+ # Use human-readable label if it is provided
335
+ if (l = e.attribute('label')) && l.value
336
+ cat = l.value
337
+ else
338
+ cat = h.value
339
+ end
340
+
341
+ @categories << cat.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
342
+ end
343
+ end
344
+ end
345
+ end
346
+
347
+ def FeedParser::getcontent(e, feed = nil)
348
+ encoding = feed ? feed.encoding : 'utf-8'
349
+ children = e.children.reject do |i|
350
+ i.class == REXML::Text and i.to_s.chomp == ''
351
+ end
352
+ if children.length > 1
353
+ s = ''
354
+ children.each do |c|
355
+ s += c.to_s if c.class != REXML::Comment
356
+ end
357
+ return s.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
358
+ elsif children.length == 1
359
+ c = children[0]
360
+ if c.class == REXML::Text
361
+ return e.text.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
362
+ elsif c.class == REXML::CData
363
+ return c.to_s.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
364
+ elsif c.class == REXML::Element
365
+ # only one element. recurse.
366
+ return getcontent(c, feed)
367
+ elsif c.text
368
+ return c.text.toUTF8(encoding).text2html(feed)
369
+ end
370
+ end
371
+ end
372
+ end