ruby-feedparser 0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +340 -0
- data/ChangeLog +59 -0
- data/LICENSE +60 -0
- data/README +14 -0
- data/Rakefile +84 -0
- data/lib/feedparser.rb +28 -0
- data/lib/feedparser/feedparser.rb +343 -0
- data/lib/feedparser/filesizes.rb +14 -0
- data/lib/feedparser/html-output.rb +126 -0
- data/lib/feedparser/html2text-parser.rb +413 -0
- data/lib/feedparser/rexml_patch.rb +28 -0
- data/lib/feedparser/sgml-parser.rb +332 -0
- data/lib/feedparser/text-output.rb +108 -0
- data/lib/feedparser/textconverters.rb +120 -0
- data/setup.rb +1586 -0
- data/test/tc_feed_parse.rb +117 -0
- data/test/tc_htmloutput.rb +52 -0
- data/test/tc_parser.rb +48 -0
- data/test/tc_textoutput.rb +48 -0
- data/test/tc_textwrappedoutput.rb +48 -0
- data/test/ts_feedparser.rb +12 -0
- data/tools/doctoweb.bash +30 -0
- metadata +76 -0
data/README
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
Ruby-Feedparser
|
2
|
+
-----------------
|
3
|
+
by Lucas Nussbaum <lucas@lucas-nussbaum.net>
|
4
|
+
|
5
|
+
Currently, all the information is provided on
|
6
|
+
|
7
|
+
http://home.gna.org/ruby-feedparser/
|
8
|
+
|
9
|
+
If you need to ask questions, feel free to ask them on the
|
10
|
+
ruby-feedparser-devel@gna.org mailing list.
|
11
|
+
|
12
|
+
Ruby-Feedparser is released under the Ruby license (see the LICENSE file),
|
13
|
+
which is compatible with the GNU GPL (see the COPYING file) via an explicit
|
14
|
+
dual-licensing clause.
|
data/Rakefile
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'rake/testtask'
|
2
|
+
require 'rake/rdoctask'
|
3
|
+
require 'rake/packagetask'
|
4
|
+
require 'rake'
|
5
|
+
require 'find'
|
6
|
+
|
7
|
+
# Globals
PKG_NAME = 'ruby-feedparser'
PKG_VERSION = '0.7'

# Files shipped in every package: a fixed set of top-level files plus every
# path found under lib/, test/ and tools/. Subversion metadata directories
# are skipped entirely.
PKG_FILES = [ 'ChangeLog', 'README', 'COPYING', 'LICENSE', 'setup.rb', 'Rakefile']
Find.find('lib/', 'test/', 'tools/') do |path|
  # Find.prune does not return: it abandons this directory and its contents.
  Find.prune if FileTest.directory?(path) and path =~ /\.svn/
  PKG_FILES << path
end

# Test fixtures (test/source and test/*_output) are not packaged.
PKG_FILES.delete_if { |path| path =~ /^test\/(source|.*_output)\// }

task :default => [:package]

# Runs every test/tc_*.rb test case.
Rake::TestTask.new do |t|
  t.libs << "test"
  t.test_files = FileList['test/tc_*.rb']
end

# Generates the API documentation into rdoc/.
Rake::RDocTask.new do |rd|
  sources = []
  Find.find('lib/') do |path|
    Find.prune if FileTest.directory?(path) and path =~ /\.svn/
    sources << path unless FileTest.directory?(path)
  end
  # hack to document the Feedparser module properly:
  # lib/feedparser.rb has to be the first file handed to rdoc.
  sources.delete('lib/feedparser.rb')
  sources.unshift('lib/feedparser.rb')
  rd.rdoc_files.include(sources)
  ['--all', '--diagram', '--fileboxes', '--inline-source', '--line-numbers'].each do |opt|
    rd.options << opt
  end
  rd.rdoc_dir = 'rdoc'
end

# copies the rdoc to the CVS repository for ruby-feedparser website
# repository is in $CVSDIR (default: ~/dev/ruby-feedparser-web)
task :doctoweb => [:rdoc] do
  sh "tools/doctoweb.bash"
end

# Source tarball and zip archive.
Rake::PackageTask.new(PKG_NAME, PKG_VERSION) do |p|
  p.need_tar = true
  p.need_zip = true
  p.package_files = PKG_FILES
end

# "Gem" part of the Rakefile
# Only set up when the gem packaging task can be loaded.
begin
  require 'rake/gempackagetask'

  spec = Gem::Specification.new do |s|
    s.platform = Gem::Platform::RUBY
    s.summary = "Ruby library to parse ATOM and RSS feeds"
    s.name = PKG_NAME
    s.version = PKG_VERSION
    s.requirements << 'none'
    s.require_path = 'lib'
    s.autorequire = 'feedparser'
    s.files = PKG_FILES
    s.description = "Ruby library to parse ATOM and RSS feeds"
  end

  Rake::GemPackageTask.new(spec) do |pkg|
    pkg.need_zip = true
    pkg.need_tar = true
  end
rescue LoadError
  puts "Will not generate gem."
end
|
data/lib/feedparser.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# =Ruby-feedparser - ATOM/RSS feed parser for Ruby
|
2
|
+
# License:: Ruby's license (see the LICENSE file) or GNU GPL, at your option.
|
3
|
+
# Website:: http://home.gna.org/ruby-feedparser/
|
4
|
+
#
|
5
|
+
# ==Introduction
|
6
|
+
#
|
7
|
+
# Ruby-Feedparser is an RSS and Atom parser for Ruby.
|
8
|
+
# Ruby-feedparser is :
|
9
|
+
# * based on REXML
|
10
|
+
# * built for robustness : most feeds are not valid, a parser can't ignore that
|
11
|
+
# * fully unit-tested
|
12
|
+
# * easy to use (it can output text or HTML easily)
|
13
|
+
#
|
14
|
+
# ==Example
|
15
|
+
# require 'net/http'
|
16
|
+
# require 'feedparser'
|
17
|
+
# require 'uri'
|
18
|
+
# s = Net::HTTP::get URI::parse('http://rss.slashdot.org/Slashdot/slashdot')
|
19
|
+
# f = FeedParser::Feed::new(s)
|
20
|
+
# f.title
|
21
|
+
# => "Slashdot"
|
22
|
+
# f.items.each { |i| puts i.title }
|
23
|
+
# [...]
|
24
|
+
# require 'feedparser/html-output'
|
25
|
+
# f.items.each { |i| puts i.to_html }
|
26
|
+
#
|
27
|
+
|
28
|
+
require 'feedparser/feedparser'
|
@@ -0,0 +1,343 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
require 'time'
|
3
|
+
require 'feedparser/textconverters'
|
4
|
+
require 'feedparser/rexml_patch'
|
5
|
+
require 'feedparser/text-output'
|
6
|
+
require 'base64'
|
7
|
+
|
8
|
+
module FeedParser
|
9
|
+
|
10
|
+
VERSION = "0.7"
|
11
|
+
|
12
|
+
# Raised by Feed#parse when the document is neither an RSS nor an Atom feed.
class UnknownFeedTypeException < RuntimeError; end
|
14
|
+
|
15
|
+
# An RSS or Atom feed.
#
# Feed::new(str) parses the XML text immediately; Feed::new without an
# argument builds an empty feed that can be filled later with #parse.
class Feed
  # MIME types of the Atom <link> elements accepted as the feed's link.
  HTML_LINK_TYPES = ['text/html', 'application/xhtml',
                     'application/xhtml+xml'].freeze

  attr_reader :type, :title, :link, :description, :creator, :encoding, :items

  # REXML::Element for this feed.
  attr_reader :xml

  # parse str to build a Feed
  #
  # All fields start out empty so that an unparsed feed can still be
  # inspected and printed with #to_s.
  def initialize(str = nil)
    @xml = @type = @encoding = nil
    @title = @link = @description = @creator = nil
    @items = []
    parse(str) if str
  end

  # Determines all the fields using a string containing an
  # XML document
  #
  # str is left untouched (it may be frozen). Raises
  # UnknownFeedTypeException when the document has no root element or is
  # neither RSS nor Atom.
  def parse(str)
    # Dirty hack: some feeds contain a bare & char followed by whitespace.
    # It must be changed to &amp; or the XML is not well-formed. Work on a
    # copy (gsub, not gsub!) so the caller's string is not modified.
    str = str.gsub(/&(\s+)/, '&amp;\1')
    doc = REXML::Document.new(str)
    root = doc.root
    @xml = root
    # get feed info
    @encoding = doc.encoding
    @title = @link = @description = @creator = nil
    @items = []
    # An empty document has no root element at all.
    raise UnknownFeedTypeException::new if root.nil?
    if root.elements['channel'] || root.elements['rss:channel']
      # We have a RSS feed!
      @type = "rss"
      parse_rss(root)
    elsif root.elements['/feed']
      # We have an ATOM feed!
      @type = "atom"
      parse_atom(root)
    else
      raise UnknownFeedTypeException::new
    end
  end

  # Multi-line summary of the feed followed by the text form of each item.
  # localtime is handed to each item's to_s.
  def to_s(localtime = true)
    s = "Type: #{@type}\n"
    s << "Encoding: #{@encoding}\n"
    s << "Title: #{@title}\n"
    s << "Link: #{@link}\n"
    s << "Description: #{@description}\n"
    s << "Creator: #{@creator}\n"
    s << "\n"
    @items.each { |i| s << i.to_s(localtime) }
    s
  end

  private

  # First element matched under root by one of the XPath expressions in
  # paths (tried in order), or nil when none matches.
  def first_element(root, *paths)
    paths.each do |path|
      found = root.elements[path]
      return found if found
    end
    nil
  end

  # Fills the feed fields and items from an RSS document.
  def parse_rss(root)
    # Title
    if (e = first_element(root, 'channel/title', 'rss:channel/rss:title')) && e.text
      @title = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
    end
    # Link
    if (e = first_element(root, 'channel/link', 'rss:channel/rss:link')) && e.text
      @link = e.text.rmWhiteSpace!
    end
    # Description
    if (e = first_element(root, 'channel/description',
                          'rss:channel/rss:description')) && e.text
      @description = e.text.toUTF8(@encoding).rmWhiteSpace!
    end
    # Creator: dc:creator is preferred, author is the fallback
    if ((e = root.elements['channel/dc:creator']) && e.text) ||
       ((e = first_element(root, 'channel/author',
                           'rss:channel/rss:author')) && e.text)
      @creator = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
    end
    # Items: use the first location that actually holds an item
    query = ['channel/item', 'item', 'rss:channel/rss:item'].find do |q|
      root.elements[q]
    end || 'rss:item'
    root.each_element(query) { |item_xml| @items << RSSItem::new(item_xml, self) }
  end

  # Fills the feed fields and items from an Atom document.
  def parse_atom(root)
    # Title
    if (e = root.elements['/feed/title']) && e.text
      @title = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
    end
    # Link: the last HTML/XHTML link carrying an href wins
    root.each_element('/feed/link') do |link_xml|
      type = link_xml.attribute('type')
      next unless type && HTML_LINK_TYPES.include?(type.value)
      href = link_xml.attribute('href')
      @link = href.value.rmWhiteSpace! if href && href.value
    end
    # Description: the <div> inside <info> if there is one, else <info>
    if (info = root.elements['/feed/info'])
      info = info.elements['div'] || info
      @description = info.to_s.toUTF8(@encoding).rmWhiteSpace!
    end
    # Items
    root.each_element('/feed/entry') do |entry_xml|
      @items << AtomItem::new(entry_xml, self)
    end
  end
end
|
119
|
+
|
120
|
+
# an Item from a feed
#
# Base class for the format-specific items: subclasses implement #parse to
# fill in the fields from a REXML::Element.
class FeedItem
  attr_accessor :title, :link, :content, :date, :creators, :subject,
                :cacheditem

  # The item's categories/tags. An array of strings.
  attr_accessor :categories

  # The item's enclosures childs. An array of (url, length, type) triplets.
  attr_accessor :enclosures

  attr_reader :feed

  # REXML::Element for this item
  attr_reader :xml

  # item is the XML element to parse (parsing is skipped when it is nil)
  # and feed is the feed the item belongs to.
  def initialize(item = nil, feed = nil)
    @xml = item
    @feed = feed
    @title = @link = @content = @date = @subject = @cacheditem = nil
    @creators = []
    @categories = []
    @enclosures = []
    parse(item) if item
  end

  # Fills the item from its XML element. Always raises RuntimeError here.
  def parse(item)
    raise "parse() should be implemented by subclasses!"
  end

  # All creators as one string ("a", "a and b", "a, b and c"), or nil when
  # the item has no creator.
  def creator
    case @creators.length
    when 0
      nil
    when 1
      creators[0]
    else
      creators[0...-1].join(", ") + " and " + creators[-1]
    end
  end

  # Multi-line text form of the item. The date is converted to UTC when
  # localtime is false, and printed as stored otherwise.
  def to_s(localtime = true)
    s = "--------------------------------\nTitle: #{@title}\nLink: #{@link}\n"
    shown_date = (localtime || @date.nil?) ? @date : @date.getutc
    s << "Date: #{shown_date}\n"
    s << "Creator: #{creator}\n"
    s << "Subject: #{@subject}\n"
    # The accessors allow nil to be assigned, so test the value itself.
    if @categories && !@categories.empty?
      s << "Filed under: #{@categories.join(', ')}\n"
    end
    s << "Content:\n#{content}\n"
    if @enclosures && !@enclosures.empty?
      s << "Enclosures:\n"
      @enclosures.each { |e| s << e.join(' ') << "\n" }
    end
    s
  end
end
|
185
|
+
|
186
|
+
# An item of an RSS feed. Elements are looked up both without a prefix and
# with the rss: prefix.
class RSSItem < FeedItem
  # Fills the item fields from item, the REXML::Element of an RSS <item>.
  def parse(item)
    # Title. If no title, use the pubDate as fallback.
    if ((e = item.elements['title'] || item.elements['rss:title']) && e.text) ||
       ((e = item.elements['pubDate'] || item.elements['rss:pubDate']) && e.text)
      @title = e.text.unescape_html.toUTF8(@feed.encoding).html2text.rmWhiteSpace!
    end
    # Link. The guid is the fallback unless it says isPermaLink="false".
    # An empty guid is ignored: there is no text to use as a link.
    if ((e = item.elements['link'] || item.elements['rss:link']) && e.text) ||
       ((e = item.elements['guid'] || item.elements['rss:guid']) && e.text &&
        !(e.attribute('isPermaLink') &&
          e.attribute('isPermaLink').value == 'false'))
      @link = e.text.rmWhiteSpace!
    end
    # Content
    if (e = item.elements['content:encoded']) ||
       (e = item.elements['description'] || item.elements['rss:description'])
      @content = FeedParser::getcontent(e, @feed)
    end
    # Date
    if (e = item.elements['dc:date'] || item.elements['pubDate'] ||
            item.elements['rss:pubDate'])
      @date = parse_date(e.text)
    end
    # Creator: fall back to the feed's creator when the item has none
    if (e = item.elements['dc:creator'] || item.elements['author'] ||
            item.elements['rss:author']) && e.text
      @creators << e.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
    end
    @creators << @feed.creator if @creators.empty? and @feed.creator

    # Subject
    if (e = item.elements['dc:subject']) && e.text
      @subject = e.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
    end
    # Categories: every dc:category, then category, then rss:category
    ['dc:category', 'category', 'rss:category'].each do |query|
      item.each_element(query) do |cat|
        next unless cat.text
        @categories << cat.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
      end
    end
    # Enclosures: only those with a url are kept
    item.each_element('enclosure') do |enc|
      url = enc.attribute('url').value if enc.attribute('url')
      length = enc.attribute('length').value if enc.attribute('length')
      type = enc.attribute('type').value if enc.attribute('type')
      @enclosures << [ url, length, type ] if url
    end
  end

  private

  # Tries Time::xmlschema, Time::rfc2822 and Time::parse in that order and
  # returns the first result, or nil when none of them accepts text.
  def parse_date(text)
    [:xmlschema, :rfc2822, :parse].each do |reader|
      begin
        return Time.send(reader, text)
      rescue StandardError
        next
      end
    end
    nil
  end
end
|
255
|
+
|
256
|
+
# An entry of an Atom feed.
class AtomItem < FeedItem
  # Fills the item fields from item, the REXML::Element of an Atom <entry>.
  def parse(item)
    # Title
    heading = item.elements['title']
    if heading && heading.text
      @title = heading.text.unescape_html.toUTF8(@feed.encoding).html2text.rmWhiteSpace!
    end
    # Link: the last <link> carrying an href wins
    item.each_element('link') do |link_xml|
      href = link_xml.attribute('href')
      @link = href.value if href && href.value
    end
    # Content: <content>, or <summary> when there is no <content>
    body = item.elements['content'] || item.elements['summary']
    if body
      mode = body.attribute('mode')
      if mode && mode.value == 'escaped' && body.text
        @content = body.text.toUTF8(@feed.encoding).rmWhiteSpace!
      else
        @content = FeedParser::getcontent(body, @feed)
      end
    end
    # Date: first of issued, created, updated, published
    stamp = item.elements['issued'] || item.elements['created'] ||
            item.elements['updated'] || item.elements['published']
    if stamp && stamp.text
      # Stays nil when none of the three readers accepts the text.
      @date = nil
      [:xmlschema, :rfc2822, :parse].each do |reader|
        begin
          @date = Time.send(reader, stamp.text)
          break
        rescue
          # not in this format, try the next reader
        end
      end
    end
    # Creator: fall back to the feed's creator when the entry names none
    item.each_element('author/name') do |name_xml|
      next unless name_xml.text
      @creators << name_xml.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
    end

    @creators << @feed.creator if @creators.empty? and @feed.creator

    # Categories
    item.each_element('category') do |cat_xml|
      term = cat_xml.attribute('term')
      next unless term && term.value
      # Use human-readable label if it is provided
      label = cat_xml.attribute('label')
      cat = (label && label.value) ? label.value : term.value

      @categories << cat.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
    end
  end
end
|
317
|
+
|
318
|
+
# Extracts the content of the element e as a string, or returns nil when
# e holds nothing usable.
#
# feed, when given, supplies the source encoding and is handed to
# text2html; without it the encoding defaults to 'utf-8'.
def FeedParser::getcontent(e, feed = nil)
  encoding = feed ? feed.encoding : 'utf-8'
  # Ignore plain text nodes that are empty or a lone line break.
  children = e.children.reject do |i|
    i.instance_of?(REXML::Text) && i.to_s.chomp == ''
  end
  if children.length > 1
    # Several children: concatenate all of them except comments.
    s = ''
    children.each do |c|
      s << c.to_s unless c.instance_of?(REXML::Comment)
    end
    return s.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
  elsif children.length == 1
    c = children[0]
    if c.instance_of?(REXML::Text)
      return e.text.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
    elsif c.instance_of?(REXML::CData)
      return c.to_s.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
    elsif c.instance_of?(REXML::Element)
      # only one element. recurse.
      return getcontent(c, feed)
    elsif c.respond_to?(:text) && c.text
      # Comments and processing instructions have no text method: they
      # fall through and count as "no content".
      return c.text.toUTF8(encoding).text2html(feed)
    end
  end
  nil
end
|
343
|
+
end
|