ruby-feedparser 0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +340 -0
- data/ChangeLog +59 -0
- data/LICENSE +60 -0
- data/README +14 -0
- data/Rakefile +84 -0
- data/lib/feedparser.rb +28 -0
- data/lib/feedparser/feedparser.rb +343 -0
- data/lib/feedparser/filesizes.rb +14 -0
- data/lib/feedparser/html-output.rb +126 -0
- data/lib/feedparser/html2text-parser.rb +413 -0
- data/lib/feedparser/rexml_patch.rb +28 -0
- data/lib/feedparser/sgml-parser.rb +332 -0
- data/lib/feedparser/text-output.rb +108 -0
- data/lib/feedparser/textconverters.rb +120 -0
- data/setup.rb +1586 -0
- data/test/tc_feed_parse.rb +117 -0
- data/test/tc_htmloutput.rb +52 -0
- data/test/tc_parser.rb +48 -0
- data/test/tc_textoutput.rb +48 -0
- data/test/tc_textwrappedoutput.rb +48 -0
- data/test/ts_feedparser.rb +12 -0
- data/tools/doctoweb.bash +30 -0
- metadata +76 -0
data/README
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
Ruby-Feedparser
|
2
|
+
-----------------
|
3
|
+
by Lucas Nussbaum <lucas@lucas-nussbaum.net>
|
4
|
+
|
5
|
+
Currently, all the information is provided on
|
6
|
+
|
7
|
+
http://home.gna.org/ruby-feedparser/
|
8
|
+
|
9
|
+
If you need to ask questions, feel free to ask them on the
|
10
|
+
ruby-feedparser-devel@gna.org mailing list.
|
11
|
+
|
12
|
+
Ruby-Feedparser is released under the Ruby license (see the LICENSE file),
|
13
|
+
which is compatible with the GNU GPL (see the COPYING file) via an explicit
|
14
|
+
dual-licensing clause.
|
data/Rakefile
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'rake/testtask'
|
2
|
+
require 'rake/rdoctask'
|
3
|
+
require 'rake/packagetask'
|
4
|
+
require 'rake'
|
5
|
+
require 'find'
|
6
|
+
|
7
|
+
# Globals
|
8
|
+
PKG_NAME = 'ruby-feedparser'
PKG_VERSION = '0.7'

# Files that go into the package: a fixed list of top-level files plus
# everything found under lib/, test/ and tools/, skipping .svn directories.
PKG_FILES = [ 'ChangeLog', 'README', 'COPYING', 'LICENSE', 'setup.rb', 'Rakefile']
Find.find('lib/', 'test/', 'tools/') do |path|
  # Find.prune does not return: it skips the current directory entirely.
  Find.prune if FileTest.directory?(path) && path =~ /\.svn/
  PKG_FILES << path
end

# Entries below test/source/ and test/*_output/ are left out of the package.
PKG_FILES.reject! { |path| path =~ /^test\/(source|.*_output)\// }

task :default => [:package]

Rake::TestTask.new do |t|
  t.libs << "test"
  t.test_files = FileList['test/tc_*.rb']
end

Rake::RDocTask.new do |rd|
  sources = []
  Find.find('lib/') do |path|
    Find.prune if FileTest.directory?(path) && path =~ /\.svn/
    sources << path unless FileTest.directory?(path)
  end
  # hack to document the Feedparser module properly:
  # lib/feedparser.rb is moved to the front of the list.
  sources.delete('lib/feedparser.rb')
  sources.unshift('lib/feedparser.rb')
  rd.rdoc_files.include(sources)
  ['--all', '--diagram', '--fileboxes', '--inline-source', '--line-numbers'].each do |opt|
    rd.options << opt
  end
  rd.rdoc_dir = 'rdoc'
end

task :doctoweb => [:rdoc] do |t|
  # copies the rdoc to the CVS repository for ruby-feedparser website
  # repository is in $CVSDIR (default: ~/dev/ruby-feedparser-web)
  sh "tools/doctoweb.bash"
end

Rake::PackageTask.new(PKG_NAME, PKG_VERSION) do |p|
  p.need_tar = true
  p.need_zip = true
  p.package_files = PKG_FILES
end

# "Gem" part of the Rakefile.
# The gem task is only defined when rake/gempackagetask can be loaded.
begin
  require 'rake/gempackagetask'

  spec = Gem::Specification.new do |s|
    s.platform = Gem::Platform::RUBY
    s.name = PKG_NAME
    s.version = PKG_VERSION
    s.summary = "Ruby library to parse ATOM and RSS feeds"
    s.description = "Ruby library to parse ATOM and RSS feeds"
    s.requirements << 'none'
    s.require_path = 'lib'
    s.autorequire = 'feedparser'
    s.files = PKG_FILES
  end

  Rake::GemPackageTask.new(spec) do |pkg|
    pkg.need_zip = true
    pkg.need_tar = true
  end
rescue LoadError
  puts "Will not generate gem."
end
|
data/lib/feedparser.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# =Ruby-feedparser - ATOM/RSS feed parser for Ruby
|
2
|
+
# License:: Ruby's license (see the LICENSE file) or GNU GPL, at your option.
|
3
|
+
# Website:: http://home.gna.org/ruby-feedparser/
|
4
|
+
#
|
5
|
+
# ==Introduction
|
6
|
+
#
|
7
|
+
# Ruby-Feedparser is an RSS and Atom parser for Ruby.
|
8
|
+
# Ruby-feedparser is :
|
9
|
+
# * based on REXML
|
10
|
+
# * built for robustness : most feeds are not valid, a parser can't ignore that
|
11
|
+
# * fully unit-tested
|
12
|
+
# * easy to use (it can output text or HTML easily)
|
13
|
+
#
|
14
|
+
# ==Example
|
15
|
+
# require 'net/http'
|
16
|
+
# require 'feedparser'
|
17
|
+
# require 'uri'
|
18
|
+
# s = Net::HTTP::get URI::parse('http://rss.slashdot.org/Slashdot/slashdot')
|
19
|
+
# f = FeedParser::Feed::new(s)
|
20
|
+
# f.title
|
21
|
+
# => "Slashdot"
|
22
|
+
# f.items.each { |i| puts i.title }
|
23
|
+
# [...]
|
24
|
+
# require 'feedparser/html-output'
|
25
|
+
# f.items.each { |i| puts i.to_html }
|
26
|
+
#
|
27
|
+
|
28
|
+
require 'feedparser/feedparser'
|
data/lib/feedparser/feedparser.rb
ADDED
@@ -0,0 +1,343 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
require 'time'
|
3
|
+
require 'feedparser/textconverters'
|
4
|
+
require 'feedparser/rexml_patch'
|
5
|
+
require 'feedparser/text-output'
|
6
|
+
require 'base64'
|
7
|
+
|
8
|
+
module FeedParser
|
9
|
+
|
10
|
+
VERSION = "0.7"
|
11
|
+
|
12
|
+
# Raised by Feed#parse when the document is recognised neither as an RSS
# feed nor as an Atom feed.
class UnknownFeedTypeException < RuntimeError; end
|
14
|
+
|
15
|
+
# an RSS/Atom feed
|
16
|
+
# Parsed representation of an RSS or Atom feed: feed-level metadata plus the
# list of items found in the document.
class Feed
  attr_reader :type, :title, :link, :description, :creator, :encoding, :items

  # REXML::Element for this feed.
  attr_reader :xml

  # Builds a feed and, when +str+ is given, parses it immediately.
  # Every field starts out empty so that the accessors and to_s also work
  # on a feed that has not been parsed (yet).
  def initialize(str = nil)
    @xml = @type = @encoding = nil
    @title = @link = @description = @creator = nil
    @items = []
    parse(str) if str
  end

  # Determines all the fields using a string containing an XML document.
  #
  # +str+ itself is left untouched; a corrected copy is handed to REXML.
  # Raises UnknownFeedTypeException when the document has no root element
  # or is neither RSS nor Atom. Errors raised by REXML are not rescued here.
  def parse(str)
    # Dirty hack: some feeds contain a bare "&" followed by whitespace,
    # which is not well-formed XML. It must be changed to "&amp;".
    # Non-destructive gsub so frozen strings are accepted as well.
    str = str.gsub(/&(\s+)/, '&amp;\1')
    doc = REXML::Document.new(str)
    @xml = doc.root
    # get feed info
    @encoding = doc.encoding
    @title = @link = @description = @creator = nil
    @items = []
    # An empty or non-XML input yields a document without a root element.
    raise UnknownFeedTypeException::new if @xml.nil?
    if @xml.elements['channel'] || @xml.elements['rss:channel']
      parse_rss(@xml)
    elsif @xml.elements['/feed']
      parse_atom(@xml)
    else
      raise UnknownFeedTypeException::new
    end
  end

  # Returns a plain-text dump of the feed header followed by every item.
  # +localtime+ is passed on unchanged to each item's to_s.
  def to_s(localtime = true)
    # Start from an interpolated (hence fresh, mutable) string and append in
    # place instead of re-allocating the whole buffer for every item.
    s = "Type: #{@type}\n"
    s << "Encoding: #{@encoding}\n"
    s << "Title: #{@title}\n"
    s << "Link: #{@link}\n"
    s << "Description: #{@description}\n"
    s << "Creator: #{@creator}\n"
    s << "\n"
    @items.each { |i| s << i.to_s(localtime) }
    s
  end

  private

  # Fills the feed fields from an RSS document rooted at +root+. Both plain
  # element names and "rss:"-prefixed names are tried.
  def parse_rss(root)
    @type = "rss"
    # Title
    if (e = root.elements['channel/title'] ||
        root.elements['rss:channel/rss:title']) && e.text
      @title = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
    end
    # Link
    if (e = root.elements['channel/link'] ||
        root.elements['rss:channel/rss:link']) && e.text
      @link = e.text.rmWhiteSpace!
    end
    # Description
    if (e = root.elements['channel/description'] ||
        root.elements['rss:channel/rss:description']) && e.text
      @description = e.text.toUTF8(@encoding).rmWhiteSpace!
    end
    # Creator: dc:creator is preferred over author.
    if ((e = root.elements['channel/dc:creator']) && e.text) ||
       ((e = root.elements['channel/author'] ||
         root.elements['rss:channel/rss:author']) && e.text)
      @creator = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
    end
    # Items: use the first location at which an item is found.
    query = ['channel/item', 'item', 'rss:channel/rss:item'].find do |q|
      root.elements[q]
    end || 'rss:item'
    root.each_element(query) { |e| @items << RSSItem::new(e, self) }
  end

  # Fills the feed fields from an Atom document rooted at +root+.
  def parse_atom(root)
    @type = "atom"
    # Title
    if (e = root.elements['/feed/title']) && e.text
      @title = e.text.unescape_html.toUTF8(@encoding).rmWhiteSpace!
    end
    # Link: only links declared as (X)HTML are considered; the last one wins.
    root.each_element('/feed/link') do |e|
      type = e.attribute('type')
      next unless type && ['text/html', 'application/xhtml',
                           'application/xhtml+xml'].include?(type.value)
      href = e.attribute('href')
      @link = href.value.rmWhiteSpace! if href && href.value
    end
    # Description: taken from <info>, or from its <div> child when present.
    if e = root.elements['/feed/info']
      e = e.elements['div'] || e
      @description = e.to_s.toUTF8(@encoding).rmWhiteSpace!
    end
    # Items
    root.each_element('/feed/entry') do |e|
      @items << AtomItem::new(e, self)
    end
  end
end
|
119
|
+
|
120
|
+
# an Item from a feed
|
121
|
+
# Base class for the items of a feed. Subclasses implement parse(item) to
# fill the fields from the item's XML element.
class FeedItem
  attr_accessor :title, :link, :content, :date, :creators, :subject,
    :cacheditem

  # The item's categories/tags. An array of strings.
  attr_accessor :categories

  # The item's enclosures childs. An array of (url, length, type) triplets.
  attr_accessor :enclosures

  # The feed object given to the constructor (may be nil).
  attr_reader :feed

  # REXML::Element for this item
  attr_reader :xml

  # Stores +item+ and +feed+, resets every field and, when +item+ is given,
  # calls parse(item).
  def initialize(item = nil, feed = nil)
    @xml = item
    @feed = feed
    @title = @link = @content = @date = @subject = nil
    @cacheditem = nil
    @creators = []
    @categories = []
    @enclosures = []
    parse(item) if item
  end

  # Abstract: always raises a RuntimeError in this base class.
  def parse(item)
    raise "parse() should be implemented by subclasses!"
  end

  # Returns the creators as one string ("a", "a and b", "a, b and c"),
  # or nil when there is no creator.
  def creator
    names = @creators || []
    case names.length
    when 0 then nil
    when 1 then names[0]
    else "#{names[0...-1].join(', ')} and #{names[-1]}"
    end
  end

  # Returns a plain-text dump of the item. The date is printed as stored
  # when +localtime+ is true, and converted with getutc otherwise.
  def to_s(localtime = true)
    # Interpolated strings are fresh objects, so appending with << is safe.
    s = "--------------------------------\nTitle: #{@title}\nLink: #{@link}\n"
    shown_date = (localtime || @date.nil?) ? @date : @date.getutc
    s << "Date: #{shown_date}\n"
    s << "Creator: #{creator}\n"
    s << "Subject: #{@subject}\n"
    # The accessors allow assigning nil, so test for nil as well as empty.
    if @categories && !@categories.empty?
      s << "Filed under: #{@categories.join(', ')}\n"
    end
    s << "Content:\n#{content}\n"
    if @enclosures && !@enclosures.empty?
      s << "Enclosures:\n"
      @enclosures.each { |e| s << e.join(' ') << "\n" }
    end
    s
  end
end
|
185
|
+
|
186
|
+
# An item of an RSS feed. Every lookup is tried with the plain element name
# and, where the original did so, with the "rss:"-prefixed name as well.
class RSSItem < FeedItem

  # Fills the item's fields from +item+, the REXML element of the RSS item.
  # Uses @feed for the encoding and as the fallback source for the creator.
  def parse(item)
    # Title. If no title, use the pubDate as fallback.
    if ((e = item.elements['title'] || item.elements['rss:title']) &&
        e.text) ||
       ((e = item.elements['pubDate'] || item.elements['rss:pubDate']) &&
        e.text)
      @title = e.text.unescape_html.toUTF8(@feed.encoding).html2text.rmWhiteSpace!
    end
    # Link. Falls back on the guid unless it is flagged isPermaLink="false".
    # A guid without text is ignored instead of raising on nil.
    if (e = item.elements['link'] || item.elements['rss:link']) && e.text
      @link = e.text.rmWhiteSpace!
    elsif (e = item.elements['guid'] || item.elements['rss:guid']) && e.text &&
          !((perma = e.attribute('isPermaLink')) && perma.value == 'false')
      @link = e.text.rmWhiteSpace!
    end
    # Content
    if (e = item.elements['content:encoded']) ||
       (e = item.elements['description'] || item.elements['rss:description'])
      @content = FeedParser::getcontent(e, @feed)
    end
    # Date
    if e = item.elements['dc:date'] || item.elements['pubDate'] ||
           item.elements['rss:pubDate']
      @date = parse_date(e.text)
    end
    # Creator. The feed's creator is used when the item has none.
    if (e = item.elements['dc:creator'] || item.elements['author'] ||
        item.elements['rss:author']) && e.text
      @creators << e.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
    end
    @creators << @feed.creator if @creators.empty? && @feed.creator

    # Subject
    if (e = item.elements['dc:subject']) && e.text
      @subject = e.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
    end
    # Categories, in the order dc:category, category, rss:category.
    ['dc:category', 'category', 'rss:category'].each do |name|
      item.each_element(name) do |c|
        next unless c.text
        @categories << c.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
      end
    end
    # Enclosures. Only enclosures that have a url attribute are kept.
    item.each_element('enclosure') do |enc|
      url, length, type = ['url', 'length', 'type'].map do |attr_name|
        attr = enc.attribute(attr_name)
        attr ? attr.value : nil
      end
      @enclosures << [ url, length, type ] if url
    end
  end

  private

  # Tries Time.xmlschema, Time.rfc2822 and Time.parse in turn on +text+ and
  # returns the first result, or nil when all three raise.
  def parse_date(text)
    [:xmlschema, :rfc2822, :parse].each do |format|
      begin
        return Time.send(format, text)
      rescue StandardError
        next
      end
    end
    nil
  end
end
|
255
|
+
|
256
|
+
# An entry of an Atom feed.
class AtomItem < FeedItem
  # Fills the item's fields from +item+, the REXML element of the entry.
  # Uses @feed for the encoding and as the fallback source for the creator.
  def parse(item)
    # Title
    heading = item.elements['title']
    if heading && heading.text
      @title = heading.text.unescape_html.toUTF8(@feed.encoding).html2text.rmWhiteSpace!
    end

    # Link: every <link> with an href overwrites the previous one.
    item.each_element('link') do |node|
      href = node.attribute('href')
      @link = href.value if href && href.value
    end

    # Content: <content>, or <summary> when there is no <content>.
    body = item.elements['content'] || item.elements['summary']
    if body
      mode = body.attribute('mode')
      if mode && mode.value == 'escaped' && body.text
        @content = body.text.toUTF8(@feed.encoding).rmWhiteSpace!
      else
        @content = FeedParser::getcontent(body, @feed)
      end
    end

    # Date: first of issued, created, updated, published that exists.
    stamp = nil
    ['issued', 'created', 'updated', 'published'].each do |name|
      stamp = item.elements[name]
      break if stamp
    end
    if stamp && stamp.text
      # Try the three Time parsers in turn; nil when all of them raise.
      parsed = nil
      [:xmlschema, :rfc2822, :parse].each do |format|
        begin
          parsed = Time.send(format, stamp.text)
          break
        rescue
          next
        end
      end
      @date = parsed
    end

    # Creator. The feed's creator is used when the entry names no author.
    item.each_element('author/name') do |node|
      next unless node.text
      @creators << node.text.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
    end
    @creators << @feed.creator if @creators.empty? && @feed.creator

    # Categories
    item.each_element('category') do |node|
      term = node.attribute('term')
      next unless term && term.value
      # Use human-readable label if it is provided
      label = node.attribute('label')
      cat = (label && label.value) ? label.value : term.value
      @categories << cat.unescape_html.toUTF8(@feed.encoding).rmWhiteSpace!
    end
  end
end
|
317
|
+
|
318
|
+
# Returns the content of element +e+ as a string, or nil when nothing
# usable is found. The text is run through the String helpers toUTF8,
# rmWhiteSpace! and text2html, which are defined outside this file.
# The encoding comes from +feed+, or is 'utf-8' when no feed is given.
def FeedParser::getcontent(e, feed = nil)
  encoding = feed ? feed.encoding : 'utf-8'
  # Drop text nodes that are empty once a trailing newline is removed.
  # instance_of? is deliberate: subclasses of REXML::Text must not match.
  nodes = e.children.reject do |node|
    node.instance_of?(REXML::Text) && node.to_s.chomp == ''
  end
  return nil if nodes.empty?

  if nodes.length > 1
    # Several children: concatenate their serialisation, minus comments.
    markup = nodes.inject('') do |acc, node|
      node.instance_of?(REXML::Comment) ? acc : acc + node.to_s
    end
    return markup.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
  end

  only = nodes[0]
  if only.instance_of?(REXML::Text)
    e.text.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
  elsif only.instance_of?(REXML::CData)
    only.to_s.toUTF8(encoding).rmWhiteSpace!.text2html(feed)
  elsif only.instance_of?(REXML::Element)
    # only one element. recurse.
    getcontent(only, feed)
  elsif only.text
    only.text.toUTF8(encoding).text2html(feed)
  end
end
|
343
|
+
end
|