ruby-feedparser 0.7 → 0.9.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/{ChangeLog → ChangeLog.md} +23 -16
- data/Rakefile +16 -8
- data/lib/feedparser/feedparser.rb +85 -10
- data/lib/feedparser/html-output.rb +4 -4
- data/lib/feedparser/html2text-parser.rb +24 -10
- data/lib/feedparser/rexml_patch.rb +3 -0
- data/lib/feedparser/sgml-parser.rb +3 -4
- data/lib/feedparser/text-output.rb +3 -4
- data/lib/feedparser/textconverters.rb +1 -1
- data/lib/feedparser/version.rb +3 -0
- data/test/tc_feed_parse.rb +52 -1
- data/test/tc_feeditem.rb +47 -0
- data/test/tc_html2text_parser.rb +43 -0
- data/test/tc_htmloutput.rb +13 -13
- data/test/tc_parser.rb +13 -13
- data/test/tc_sgml_parser.rb +22 -0
- data/test/tc_textoutput.rb +13 -13
- data/test/tc_textwrappedoutput.rb +13 -13
- metadata +49 -43
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e198cf8ee7423ff4edf5ba4367ac809ba1fe2a9d6361fcf53d12b984aa138228
|
4
|
+
data.tar.gz: bbbd8c024c4e85c991ae2ceae4494e24d8b0865d2fe6a4df2646007e798e96ac
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ac90154cfa40180e03d4b7b1d631186c6db1d70d79bdbb7f4edb4c54a66eddab3085e04480ed965b7c2e055770976873224b9d12f20be3d282817d6cd34245be
|
7
|
+
data.tar.gz: 4f658dc07c1d692b44f9abd0d400449cb0e1aa3d1cda8c782b052e45b193a8ad462ce7e2d8f951b3d15b301764e651f3781da93cf07f28c1ea8438ab87a1c989
|
data/{ChangeLog → ChangeLog.md}
RENAMED
@@ -1,22 +1,29 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# 0.9.4 (25/03/2016)
|
2
|
+
|
3
|
+
Bug fixes:
|
4
|
+
|
5
|
+
* feedparser: relax exception check for Magic errors; by Eric Wong
|
6
|
+
* Always sort author list to avoid unnecessary invalidation of caches; by Sébastien Dailly
|
7
|
+
|
8
|
+
# 0.7 (27/07/2009)
|
9
|
+
|
3
10
|
* Handled several creators per feed item
|
4
11
|
* Fix bug with urls into tag attributes
|
5
12
|
* Better item categories support
|
6
13
|
* Reworked text output formatting
|
7
14
|
* Ignore &shy;, as some blog software (dotclear2) misuses it.
|
8
15
|
|
9
|
-
|
10
|
-
|
11
|
-
* Moved to_human_readable from class Fixnum to class Integer.
|
16
|
+
# 0.6 (23/07/2008)
|
17
|
+
|
18
|
+
* Moved `to_human_readable` from class Fixnum to class Integer.
|
12
19
|
* Correctly parse http://www.tbray.org/ongoing/ongoing.atom. Thanks
|
13
20
|
to Janico Greifenberg for reporting this.
|
14
21
|
* String#html2text now takes an additional wrapto parameter, allowing
|
15
22
|
to wrap the text to a specified number of chars. Thanks to
|
16
23
|
Maxime Petazzoni for the patch.
|
17
24
|
|
18
|
-
|
19
|
-
|
25
|
+
# 0.5 (26/10/2007)
|
26
|
+
|
20
27
|
* Fixed a bug with items with both non-escaped and escaped HTML. Reported,
|
21
28
|
then patch provided by Gregory Hartman <gghartma@cs.cmu.edu>.
|
22
29
|
* In Atom feeds, use the date provided in <updated>, and use it in
|
@@ -27,33 +34,33 @@ Ruby-Feedparser 0.5 (26/10/2007)
|
|
27
34
|
* Make checks for HTML tags case-insensitive. Broke Dilbert feeds!!
|
28
35
|
Reported by Michal Čihař. Closes gna bug #10199.
|
29
36
|
|
30
|
-
|
31
|
-
|
37
|
+
# 0.4 (01/05/2007)
|
38
|
+
|
32
39
|
* Fixed a problem with html entities in the items' titles.
|
33
40
|
* Date was not fetched for blogspot's atom feeds.
|
34
41
|
Patch from Jason Ling <jason.ling@jeyel.com>.
|
35
42
|
* Tests are now timezone-friendly. (closes GNA bug #8145).
|
36
43
|
* Much nicer text output.
|
37
44
|
|
38
|
-
|
39
|
-
|
45
|
+
# 0.3 (01/12/2006)
|
46
|
+
|
40
47
|
* Much nicer HTML output
|
41
48
|
* Fixed a problem with some feeds with broken enclosures (without url)
|
42
|
-
* Now automatically fixes non-absolute
|
49
|
+
* Now automatically fixes non-absolute `<a href>` or `<img src>`
|
43
50
|
* Fixed small parser bugs
|
44
51
|
* Now displays enclosures in the text and html outputs. Ready for
|
45
52
|
podcasting :-)
|
46
53
|
* Now escape title, creator, subject and category internally. This minor
|
47
54
|
fix avoids &amp; stuff in the titles, for example.
|
48
55
|
|
49
|
-
|
50
|
-
|
56
|
+
# 0.2 (05/06/2006)
|
57
|
+
|
51
58
|
* Fixed a problem when parsing some ATOM feeds with <link> without type
|
52
59
|
attribute. (Thanks Michal Cihar !)
|
53
60
|
* FeedParser::Feed and FeedParser::FeedItem now have an xml attribute to
|
54
61
|
get the related REXML::Element.
|
55
62
|
* <enclosure/> support in RSS.
|
56
63
|
|
57
|
-
|
58
|
-
|
64
|
+
# 0.1 (24/11/2005)
|
65
|
+
|
59
66
|
* first public release.
|
data/Rakefile
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
require 'rake/testtask'
|
2
|
-
require '
|
3
|
-
require '
|
2
|
+
require 'rdoc/task'
|
3
|
+
require 'rubygems/package_task'
|
4
4
|
require 'rake'
|
5
5
|
require 'find'
|
6
|
+
require_relative 'lib/feedparser/version.rb'
|
6
7
|
|
7
8
|
# Globals
|
8
9
|
PKG_NAME = 'ruby-feedparser'
|
9
|
-
PKG_VERSION =
|
10
|
+
PKG_VERSION = FeedParser::VERSION
|
10
11
|
|
11
|
-
PKG_FILES = [ 'ChangeLog', 'README', 'COPYING', 'LICENSE', 'setup.rb', 'Rakefile']
|
12
|
+
PKG_FILES = [ 'ChangeLog.md', 'README', 'COPYING', 'LICENSE', 'setup.rb', 'Rakefile']
|
12
13
|
Find.find('lib/', 'test/', 'tools/') do |f|
|
13
14
|
if FileTest.directory?(f) and f =~ /\.svn/
|
14
15
|
Find.prune
|
@@ -19,7 +20,7 @@ end
|
|
19
20
|
|
20
21
|
PKG_FILES.reject! { |f| f =~ /^test\/(source|.*_output)\// }
|
21
22
|
|
22
|
-
task :default => [:
|
23
|
+
task :default => [:test]
|
23
24
|
|
24
25
|
Rake::TestTask.new do |t|
|
25
26
|
t.libs << "test"
|
@@ -61,8 +62,6 @@ end
|
|
61
62
|
|
62
63
|
# "Gem" part of the Rakefile
|
63
64
|
begin
|
64
|
-
require 'rake/gempackagetask'
|
65
|
-
|
66
65
|
spec = Gem::Specification.new do |s|
|
67
66
|
s.platform = Gem::Platform::RUBY
|
68
67
|
s.summary = "Ruby library to parse ATOM and RSS feeds"
|
@@ -73,12 +72,21 @@ begin
|
|
73
72
|
s.autorequire = 'feedparser'
|
74
73
|
s.files = PKG_FILES
|
75
74
|
s.description = "Ruby library to parse ATOM and RSS feeds"
|
75
|
+
s.authors = ['Lucas Nussbaum']
|
76
|
+
s.add_runtime_dependency 'magic'
|
76
77
|
end
|
77
78
|
|
78
|
-
|
79
|
+
Gem::PackageTask.new(spec) do |pkg|
|
79
80
|
pkg.need_zip = true
|
80
81
|
pkg.need_tar = true
|
81
82
|
end
|
82
83
|
rescue LoadError
|
83
84
|
puts "Will not generate gem."
|
84
85
|
end
|
86
|
+
|
87
|
+
task :release => :repackage do
|
88
|
+
sh 'git', 'tag', 'v' + PKG_VERSION
|
89
|
+
sh 'git', 'push'
|
90
|
+
sh 'git', 'push', '--tags'
|
91
|
+
sh 'gem', 'push', "pkg/#{PKG_NAME}-#{PKG_VERSION}.gem"
|
92
|
+
end
|
@@ -1,17 +1,47 @@
|
|
1
|
+
require 'cgi'
|
1
2
|
require 'rexml/document'
|
2
3
|
require 'time'
|
3
4
|
require 'feedparser/textconverters'
|
4
5
|
require 'feedparser/rexml_patch'
|
5
6
|
require 'feedparser/text-output'
|
7
|
+
require 'feedparser/version'
|
6
8
|
require 'base64'
|
9
|
+
require 'magic'
|
10
|
+
require 'uri'
|
7
11
|
|
8
12
|
module FeedParser
|
9
13
|
|
10
|
-
VERSION = "0.7"
|
11
|
-
|
12
14
|
class UnknownFeedTypeException < RuntimeError
|
13
15
|
end
|
14
16
|
|
17
|
+
def self.recode(str)
|
18
|
+
encoding = nil
|
19
|
+
begin
|
20
|
+
encoding = Magic.guess_string_mime_encoding(str)
|
21
|
+
rescue => e
|
22
|
+
raise unless e.class.to_s =~ /\AMagic::(?:Exception|Error)\z/
|
23
|
+
# this happens when magic does not find any content at all, e.g. with
|
24
|
+
# strings that contain only whitespace. In these cases it *should* be safe
|
25
|
+
# to assume UTF-8
|
26
|
+
encoding = Encoding::UTF_8
|
27
|
+
end
|
28
|
+
if encoding == 'unknown-8bit'
|
29
|
+
# find first substring with a valid encoding that is not us-ascii
|
30
|
+
length = 1 # has to start at 1, magic requires at least 2 bytes
|
31
|
+
while length < str.length && ['us-ascii', 'unknown-8bit'].include?(encoding)
|
32
|
+
encoding = Magic.guess_string_mime_encoding(str[0..length])
|
33
|
+
length = length + 1
|
34
|
+
end
|
35
|
+
# need to remove iso-8859-1 control characters
|
36
|
+
if encoding == 'iso-8859-1'
|
37
|
+
str = str.bytes.select { |c| c < 128 || c > 159 }.map(&:chr).join
|
38
|
+
end
|
39
|
+
end
|
40
|
+
str.force_encoding(encoding)
|
41
|
+
str = str.chars.select { |c| c.valid_encoding? }.join
|
42
|
+
str.encode('UTF-8')
|
43
|
+
end
|
44
|
+
|
15
45
|
# an RSS/Atom feed
|
16
46
|
class Feed
|
17
47
|
attr_reader :type, :title, :link, :description, :creator, :encoding, :items
|
@@ -20,13 +50,16 @@ module FeedParser
|
|
20
50
|
attr_reader :xml
|
21
51
|
|
22
52
|
# parse str to build a Feed
|
23
|
-
def initialize(str = nil)
|
53
|
+
def initialize(str = nil, uri = nil)
|
24
54
|
parse(str) if str
|
55
|
+
parse_origin(uri) if uri
|
25
56
|
end
|
26
57
|
|
27
58
|
# Determines all the fields using a string containing an
|
28
59
|
# XML document
|
29
60
|
def parse(str)
|
61
|
+
str = FeedParser.recode(str)
|
62
|
+
|
30
63
|
# Dirty hack: some feeds contain the & char. It must be changed to &amp;
|
31
64
|
str.gsub!(/&(\s+)/, '&\1')
|
32
65
|
doc = REXML::Document.new(str)
|
@@ -34,6 +67,7 @@ module FeedParser
|
|
34
67
|
# get feed info
|
35
68
|
@encoding = doc.encoding
|
36
69
|
@title,@link,@description,@creator = nil
|
70
|
+
@title = ""
|
37
71
|
@items = []
|
38
72
|
if doc.root.elements['channel'] || doc.root.elements['rss:channel']
|
39
73
|
@type = "rss"
|
@@ -108,19 +142,28 @@ module FeedParser
|
|
108
142
|
s += "Type: #{@type}\n"
|
109
143
|
s += "Encoding: #{@encoding}\n"
|
110
144
|
s += "Title: #{@title}\n"
|
111
|
-
s += "Link: #{
|
145
|
+
s += "Link: #{link}\n"
|
112
146
|
s += "Description: #{@description}\n"
|
113
147
|
s += "Creator: #{@creator}\n"
|
114
148
|
s += "\n"
|
115
149
|
@items.each { |i| s += i.to_s(localtime) }
|
116
150
|
s
|
117
151
|
end
|
152
|
+
|
153
|
+
def parse_origin(uri)
|
154
|
+
uri = URI.parse(uri)
|
155
|
+
if uri.hostname && uri.scheme
|
156
|
+
@origin = "#{uri.scheme}://#{uri.hostname}"
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
160
|
+
attr_reader :origin
|
118
161
|
end
|
119
162
|
|
120
163
|
# an Item from a feed
|
121
164
|
class FeedItem
|
122
|
-
attr_accessor :title, :
|
123
|
-
:cacheditem
|
165
|
+
attr_accessor :title, :content, :date, :creators, :subject,
|
166
|
+
:cacheditem, :links
|
124
167
|
|
125
168
|
# The item's categories/tags. An array of strings.
|
126
169
|
attr_accessor :categories
|
@@ -137,9 +180,12 @@ module FeedParser
|
|
137
180
|
@xml = item
|
138
181
|
@feed = feed
|
139
182
|
@title, @link, @content, @date, @subject = nil
|
183
|
+
@links = []
|
140
184
|
@creators = []
|
141
185
|
@categories = []
|
142
186
|
@enclosures = []
|
187
|
+
|
188
|
+
@title = ""
|
143
189
|
parse(item) if item
|
144
190
|
end
|
145
191
|
|
@@ -154,13 +200,14 @@ module FeedParser
|
|
154
200
|
when 1
|
155
201
|
return creators[0]
|
156
202
|
else
|
157
|
-
|
203
|
+
sorted_creators = creators.sort
|
204
|
+
return sorted_creators[0...-1].join(", ") + " and " + sorted_creators[-1]
|
158
205
|
end
|
159
206
|
end
|
160
207
|
|
161
208
|
def to_s(localtime = true)
|
162
209
|
s = "--------------------------------\n" +
|
163
|
-
"Title: #{@title}\nLink: #{
|
210
|
+
"Title: #{@title}\nLink: #{link}\n"
|
164
211
|
if localtime or @date.nil?
|
165
212
|
s += "Date: #{@date.to_s}\n"
|
166
213
|
else
|
@@ -181,6 +228,26 @@ module FeedParser
|
|
181
228
|
end
|
182
229
|
return s
|
183
230
|
end
|
231
|
+
|
232
|
+
attr_writer :link
|
233
|
+
|
234
|
+
def link
|
235
|
+
if @link
|
236
|
+
begin
|
237
|
+
uri = URI.parse(@link)
|
238
|
+
rescue URI::InvalidURIError
|
239
|
+
return @link
|
240
|
+
end
|
241
|
+
if uri.hostname && uri.scheme
|
242
|
+
@link
|
243
|
+
elsif feed && feed.origin
|
244
|
+
[feed.origin, @link].compact.join
|
245
|
+
else
|
246
|
+
@link
|
247
|
+
end
|
248
|
+
end
|
249
|
+
end
|
250
|
+
|
184
251
|
end
|
185
252
|
|
186
253
|
class RSSItem < FeedItem
|
@@ -199,7 +266,7 @@ module FeedParser
|
|
199
266
|
(e = item.elements['guid'] || item.elements['rss:guid'] and
|
200
267
|
not (e.attribute('isPermaLink') and
|
201
268
|
e.attribute('isPermaLink').value == 'false'))
|
202
|
-
|
269
|
+
self.link = e.text.rmWhiteSpace!
|
203
270
|
end
|
204
271
|
# Content
|
205
272
|
if (e = item.elements['content:encoded']) ||
|
@@ -261,8 +328,16 @@ module FeedParser
|
|
261
328
|
end
|
262
329
|
# Link
|
263
330
|
item.each_element('link') do |e|
|
331
|
+
|
264
332
|
if (h = e.attribute('href')) && h.value
|
265
|
-
|
333
|
+
self.link = h.value
|
334
|
+
|
335
|
+
if e.attribute('type')
|
336
|
+
@links << {:href => h.value, :type => e.attribute('type').value}
|
337
|
+
else
|
338
|
+
@links << {:href => h.value, :type => ''}
|
339
|
+
end
|
340
|
+
|
266
341
|
end
|
267
342
|
end
|
268
343
|
# Content
|
@@ -80,13 +80,13 @@ module FeedParser
|
|
80
80
|
s += (headline % ["Feed:", r])
|
81
81
|
|
82
82
|
r = ""
|
83
|
-
r += "<a href=\"#{
|
83
|
+
r += "<a href=\"#{link}\">" if link
|
84
84
|
if @title
|
85
85
|
r += "<b>#{@title.escape_html}</b>\n"
|
86
|
-
elsif
|
87
|
-
r += "<b>#{
|
86
|
+
elsif link
|
87
|
+
r += "<b>#{link.escape_html}</b>\n"
|
88
88
|
end
|
89
|
-
r += "</a>\n" if
|
89
|
+
r += "</a>\n" if link
|
90
90
|
s += (headline % ["Item:", r])
|
91
91
|
s += "</table></td></tr></table>\n"
|
92
92
|
s += "\n"
|
@@ -11,16 +11,16 @@ module FeedParser
|
|
11
11
|
@pre = false
|
12
12
|
@href = nil
|
13
13
|
@links = []
|
14
|
+
@curlink = []
|
14
15
|
@imgs = []
|
15
|
-
@img_index = '
|
16
|
+
@img_index = 'A'
|
16
17
|
super(verbose)
|
17
18
|
end
|
18
19
|
|
19
20
|
def next_img_index
|
20
|
-
|
21
|
-
@img_index =
|
22
|
-
|
23
|
-
return @img_index
|
21
|
+
idx = @img_index
|
22
|
+
@img_index = @img_index.next
|
23
|
+
idx
|
24
24
|
end
|
25
25
|
|
26
26
|
def handle_data(data)
|
@@ -29,7 +29,8 @@ module FeedParser
|
|
29
29
|
data.gsub!(/\n/, ' ')
|
30
30
|
data.gsub!(/( )+/, ' ')
|
31
31
|
end
|
32
|
-
|
32
|
+
data = FeedParser.recode(data)
|
33
|
+
@savedata << data.encode(Encoding::UTF_8)
|
33
34
|
end
|
34
35
|
|
35
36
|
def unknown_starttag(tag, attrs)
|
@@ -70,7 +71,14 @@ module FeedParser
|
|
70
71
|
end
|
71
72
|
end
|
72
73
|
if @href
|
73
|
-
@
|
74
|
+
@href.gsub!(/^("|'|)(.*)("|')$/,'\2')
|
75
|
+
@curlink = @links.find_index(@href)
|
76
|
+
if @curlink.nil?
|
77
|
+
@links << @href
|
78
|
+
@curlink = @links.length
|
79
|
+
else
|
80
|
+
@curlink += 1
|
81
|
+
end
|
74
82
|
end
|
75
83
|
when 'img'
|
76
84
|
# find src in args
|
@@ -81,8 +89,14 @@ module FeedParser
|
|
81
89
|
end
|
82
90
|
end
|
83
91
|
if src
|
84
|
-
|
85
|
-
@imgs
|
92
|
+
src.gsub!(/^("|'|)(.*)("|')$/,'\2')
|
93
|
+
i = @imgs.index { |e| e[1] == src }
|
94
|
+
if i.nil?
|
95
|
+
idx = next_img_index
|
96
|
+
@imgs << [ idx, src ]
|
97
|
+
else
|
98
|
+
idx = @imgs[i][0]
|
99
|
+
end
|
86
100
|
@savedata << "[#{idx}]"
|
87
101
|
end
|
88
102
|
else
|
@@ -125,7 +139,7 @@ module FeedParser
|
|
125
139
|
@pre = false
|
126
140
|
when 'a'
|
127
141
|
if @href
|
128
|
-
@savedata << "[#{@
|
142
|
+
@savedata << "[#{@curlink}]"
|
129
143
|
@href = nil
|
130
144
|
end
|
131
145
|
end
|
@@ -6,14 +6,17 @@ require 'feedparser/textconverters'
|
|
6
6
|
# With those changes, it uses unpack/pack with some error handling
|
7
7
|
module REXML
|
8
8
|
module Encoding
|
9
|
+
alias rexml_decode decode
|
9
10
|
def decode(str)
|
10
11
|
return str.toUTF8(@encoding)
|
11
12
|
end
|
12
13
|
|
14
|
+
alias rexml_encode encode
|
13
15
|
def encode(str)
|
14
16
|
return str
|
15
17
|
end
|
16
18
|
|
19
|
+
alias rexml_encoding= encoding=
|
17
20
|
def encoding=(enc)
|
18
21
|
return if defined? @encoding and enc == @encoding
|
19
22
|
@encoding = enc || 'utf-8'
|
@@ -293,12 +293,11 @@ module FeedParser
|
|
293
293
|
end
|
294
294
|
|
295
295
|
def handle_charref(name)
|
296
|
-
|
297
|
-
if !(0 <= n && n <= 255)
|
296
|
+
if name =~ /[0-9]+/
|
298
297
|
unknown_charref(name)
|
299
|
-
|
298
|
+
else
|
299
|
+
handle_data(name)
|
300
300
|
end
|
301
|
-
handle_data(n.chr)
|
302
301
|
end
|
303
302
|
|
304
303
|
def handle_entityref(name)
|
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'feedparser'
|
2
1
|
require 'feedparser/html2text-parser'
|
3
2
|
require 'feedparser/filesizes'
|
4
3
|
|
@@ -61,7 +60,7 @@ module FeedParser
|
|
61
60
|
if header
|
62
61
|
s += "Item: "
|
63
62
|
s += @title if @title
|
64
|
-
s += "\n<#{
|
63
|
+
s += "\n<#{link}>" if link
|
65
64
|
if @date
|
66
65
|
if localtime
|
67
66
|
s += "\nDate: #{@date.to_s}"
|
@@ -71,7 +70,7 @@ module FeedParser
|
|
71
70
|
end
|
72
71
|
s += "\n"
|
73
72
|
else
|
74
|
-
s += "<#{
|
73
|
+
s += "<#{link}>\n\n" if link
|
75
74
|
end
|
76
75
|
s += "#{@content.html2text(wrapto).chomp}\n" if @content
|
77
76
|
if @enclosures and @enclosures.length > 0
|
@@ -89,7 +88,7 @@ module FeedParser
|
|
89
88
|
if not header
|
90
89
|
s += "\nItem: "
|
91
90
|
s += @title if @title
|
92
|
-
s += "\n<#{
|
91
|
+
s += "\n<#{link}>" if link
|
93
92
|
if @date
|
94
93
|
if localtime
|
95
94
|
s += "\nDate: #{@date.to_s}"
|
@@ -59,7 +59,7 @@ end
|
|
59
59
|
text.gsub!(/\A\s*(.*)\Z/m, '<p>\1</p>')
|
60
60
|
text.gsub!(/\s*\n(\s*\n)+\s*/, "</p>\n<p>")
|
61
61
|
# uris
|
62
|
-
text.gsub!(/([^'"])(#{URI::
|
62
|
+
text.gsub!(/([^'"])(#{URI::DEFAULT_PARSER.make_regexp(['http','ftp','https'])})/,
|
63
63
|
'\1<a href="\2">\2</a>')
|
64
64
|
end
|
65
65
|
# Handle broken hrefs in <a> and <img>
|
data/test/tc_feed_parse.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
# encoding: UTF-8
|
2
2
|
|
3
3
|
$:.unshift File.join(File.dirname(__FILE__), '..', 'lib')
|
4
4
|
|
@@ -114,4 +114,55 @@ class FeedParserTest < Test::Unit::TestCase
|
|
114
114
|
# the third one should be removed because an enclosure should have an url, or it's useless.
|
115
115
|
assert_equal([["url1", "1", "type1"], ["url2", nil, "type2"], ["url1", "1", nil]], ch.items[0].enclosures)
|
116
116
|
end
|
117
|
+
|
118
|
+
def test_recode_utf8
|
119
|
+
assert_equal 'UTF-8', FeedParser.recode("áéíóú").encoding.name
|
120
|
+
end
|
121
|
+
|
122
|
+
def test_recode_blank
|
123
|
+
assert_equal 'UTF-8', FeedParser.recode('').encoding.name
|
124
|
+
end
|
125
|
+
|
126
|
+
def test_recode_iso88519
|
127
|
+
assert_equal 'UTF-8', FeedParser.recode("áéíóú".encode('iso-8859-1')).encoding.name
|
128
|
+
end
|
129
|
+
|
130
|
+
def test_recode_utf8_mixed_with_ASCIIBIT
|
131
|
+
recoded = FeedParser.recode("áé\x8Díóú")
|
132
|
+
assert_equal'UTF-8', recoded.encoding.name
|
133
|
+
assert_equal 'áéíóú', recoded
|
134
|
+
end
|
135
|
+
|
136
|
+
def test_recode_unicode_char
|
137
|
+
assert_equal "1280×1024", FeedParser.recode("1280×1024")
|
138
|
+
end
|
139
|
+
|
140
|
+
def test_almost_valid_iso88591
|
141
|
+
input = "Codifica\xE7\xE3o \x96 quase v\xE1lida"
|
142
|
+
assert_equal "Codificação quase válida", FeedParser.recode(input)
|
143
|
+
end
|
144
|
+
|
145
|
+
def test_feed_origin
|
146
|
+
feed = FeedParser::Feed.new(nil, 'http://foo.com/feed')
|
147
|
+
assert_equal "http://foo.com", feed.origin
|
148
|
+
end
|
149
|
+
|
150
|
+
def test_item_origin
|
151
|
+
feed = FeedParser::Feed.new(nil, 'http://foo.com/feed')
|
152
|
+
item = FeedParser::FeedItem.new(nil, feed)
|
153
|
+
item.link = '/foo/bar'
|
154
|
+
assert_equal 'http://foo.com/foo/bar', item.link
|
155
|
+
end
|
156
|
+
|
157
|
+
def test_item_origin_no_link
|
158
|
+
item = FeedParser::FeedItem.new(nil, nil)
|
159
|
+
assert_nil item.link
|
160
|
+
end
|
161
|
+
|
162
|
+
def test_item_no_feed
|
163
|
+
item = FeedParser::FeedItem.new(nil, nil)
|
164
|
+
item.link = '/foo/bar'
|
165
|
+
assert_equal '/foo/bar', item.link
|
166
|
+
end
|
167
|
+
|
117
168
|
end
|
data/test/tc_feeditem.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'feedparser/feedparser'
|
2
|
+
require 'test/unit'
|
3
|
+
|
4
|
+
|
5
|
+
class FeedItemTest < Test::Unit::TestCase
|
6
|
+
def setup
|
7
|
+
@item = FeedParser::FeedItem.new(nil, nil)
|
8
|
+
end
|
9
|
+
|
10
|
+
########################################################################
|
11
|
+
|
12
|
+
def test_link_no_link
|
13
|
+
assert @item.link.nil?
|
14
|
+
end
|
15
|
+
|
16
|
+
def test_link_basic
|
17
|
+
@item.instance_variable_set('@link', 'https://www.example.com/')
|
18
|
+
assert_equal "https://www.example.com/", @item.link
|
19
|
+
end
|
20
|
+
|
21
|
+
def test_link_path_only
|
22
|
+
@item.instance_variable_set('@link', '/foo/bar/')
|
23
|
+
assert_equal "/foo/bar/", @item.link
|
24
|
+
end
|
25
|
+
|
26
|
+
def test_link_path_only_with_feed_origin
|
27
|
+
@item.instance_variable_set('@link', '/foo/bar/')
|
28
|
+
feed = FeedParser::Feed.new
|
29
|
+
feed.instance_variable_set('@origin', 'https://www.exampleorigin.com')
|
30
|
+
@item.instance_variable_set('@feed', feed)
|
31
|
+
assert_equal "https://www.exampleorigin.com/foo/bar/", @item.link
|
32
|
+
end
|
33
|
+
|
34
|
+
def test_link_full_link_with_feed_origin
|
35
|
+
@item.instance_variable_set('@link', 'https://www.exampleorigin.com/foo/bar/')
|
36
|
+
feed = FeedParser::Feed.new
|
37
|
+
feed.instance_variable_set('@origin', 'https://www.exampleorigin.com')
|
38
|
+
@item.instance_variable_set('@feed', feed)
|
39
|
+
assert_equal "https://www.exampleorigin.com/foo/bar/", @item.link
|
40
|
+
end
|
41
|
+
|
42
|
+
def test_link_with_non_ascii
|
43
|
+
@item.instance_variable_set('@link', 'https://www.example.people/☭/')
|
44
|
+
assert_equal "https://www.example.people/☭/", @item.link
|
45
|
+
end
|
46
|
+
|
47
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
require 'feedparser/feedparser'
|
6
|
+
|
7
|
+
class Html2TextParserTest < Test::Unit::TestCase
|
8
|
+
|
9
|
+
def test_next_img_index
|
10
|
+
parser = FeedParser::HTML2TextParser.new
|
11
|
+
assert_equal 'A', parser.next_img_index
|
12
|
+
assert_equal 'B', parser.next_img_index
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_numerical_entity
|
16
|
+
parser = FeedParser::HTML2TextParser.new
|
17
|
+
parser.feed('1280×1024')
|
18
|
+
parser.close
|
19
|
+
assert_equal "1280×1024", parser.savedata
|
20
|
+
end
|
21
|
+
|
22
|
+
def test_numerical_entity_large_known
|
23
|
+
parser = FeedParser::HTML2TextParser.new
|
24
|
+
parser.feed('→')
|
25
|
+
parser.close
|
26
|
+
assert_equal "→", parser.savedata
|
27
|
+
end
|
28
|
+
|
29
|
+
def test_numerical_entity_large
|
30
|
+
parser = FeedParser::HTML2TextParser.new
|
31
|
+
parser.feed('✐')
|
32
|
+
parser.close
|
33
|
+
assert_equal "✐", parser.savedata
|
34
|
+
end
|
35
|
+
|
36
|
+
def test_non_numerical_entity
|
37
|
+
parser = FeedParser::HTML2TextParser.new
|
38
|
+
parser.feed('HTML&CO')
|
39
|
+
parser.close
|
40
|
+
assert_equal "HTML&CO", parser.savedata
|
41
|
+
end
|
42
|
+
|
43
|
+
end
|
data/test/tc_htmloutput.rb
CHANGED
@@ -19,12 +19,10 @@ class HTMLOutputTest < Test::Unit::TestCase
|
|
19
19
|
else
|
20
20
|
raise 'source directory not found.'
|
21
21
|
end
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
next if ENV['SOURCE'] != nil and ENV['SOURCE'] != f
|
27
|
-
puts "Checking #{f}"
|
22
|
+
Dir.foreach(SRCDIR) do |f|
|
23
|
+
next if f !~ /.xml$/
|
24
|
+
testname = 'test_' + File.basename(f).gsub(/\W/, '_')
|
25
|
+
define_method(testname) do
|
28
26
|
str = File::read(SRCDIR + '/' + f)
|
29
27
|
chan = FeedParser::Feed::new(str)
|
30
28
|
chanstr = chan.to_html(false)
|
@@ -34,19 +32,21 @@ class HTMLOutputTest < Test::Unit::TestCase
|
|
34
32
|
File::open(DSTDIR + '/' + f.gsub(/.xml$/, '.output.new'), "w") do |fd|
|
35
33
|
fd.print(chanstr)
|
36
34
|
end
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
35
|
+
assert(
|
36
|
+
false,
|
37
|
+
[
|
38
|
+
"Test failed for #{f}.",
|
39
|
+
" Check: diff -u #{DSTDIR + '/' + f.gsub(/.xml$/, '.output')}{,.new}",
|
40
|
+
" Commit: mv -f #{DSTDIR + '/' + f.gsub(/.xml$/, '.output')}{.new,}",
|
41
|
+
].join("\n")
|
42
|
+
)
|
41
43
|
end
|
42
44
|
else
|
43
|
-
puts "Missing #{DSTDIR + '/' + f.gsub(/.xml$/, '.output')}. Writing it, but check manually!"
|
44
45
|
File::open(DSTDIR + '/' + f.gsub(/.xml$/, '.output'), "w") do |f|
|
45
46
|
f.print(chanstr)
|
46
47
|
end
|
47
|
-
|
48
|
+
assert(false, "Missing #{DSTDIR + '/' + f.gsub(/.xml$/, '.output')}. Writing it, but check manually!")
|
48
49
|
end
|
49
50
|
end
|
50
|
-
assert(allok)
|
51
51
|
end
|
52
52
|
end
|
data/test/tc_parser.rb
CHANGED
@@ -15,12 +15,10 @@ class ParserTest < Test::Unit::TestCase
|
|
15
15
|
else
|
16
16
|
raise 'source directory not found.'
|
17
17
|
end
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
next if ENV['SOURCE'] != nil and ENV['SOURCE'] != f
|
23
|
-
puts "Checking #{f}"
|
18
|
+
Dir.foreach(SRCDIR) do |f|
|
19
|
+
next if f !~ /.xml$/
|
20
|
+
testname = 'test_' + File.basename(f).gsub(/\W/, '_')
|
21
|
+
define_method(testname) do
|
24
22
|
str = File::read(SRCDIR + '/' + f)
|
25
23
|
chan = FeedParser::Feed::new(str)
|
26
24
|
chanstr = chan.to_s(false)
|
@@ -30,19 +28,21 @@ class ParserTest < Test::Unit::TestCase
|
|
30
28
|
File::open(DSTDIR + '/' + f.gsub(/.xml$/, '.output.new'), "w") do |fd|
|
31
29
|
fd.print(chanstr)
|
32
30
|
end
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
31
|
+
assert(
|
32
|
+
false,
|
33
|
+
[
|
34
|
+
"Test failed for #{f}.",
|
35
|
+
" Check: diff -u #{DSTDIR + '/' + f.gsub(/.xml$/, '.output')}{,.new}",
|
36
|
+
" Commit: mv -f #{DSTDIR + '/' + f.gsub(/.xml$/, '.output')}{.new,}",
|
37
|
+
].join("\n")
|
38
|
+
)
|
37
39
|
end
|
38
40
|
else
|
39
|
-
puts "Missing #{DSTDIR + '/' + f.gsub(/.xml$/, '.output')}. Writing it, but check manually!"
|
40
41
|
File::open(DSTDIR + '/' + f.gsub(/.xml$/, '.output'), "w") do |f|
|
41
42
|
f.print(chanstr)
|
42
43
|
end
|
43
|
-
|
44
|
+
assert(false, "Missing #{DSTDIR + '/' + f.gsub(/.xml$/, '.output')}. Writing it, but check manually!")
|
44
45
|
end
|
45
46
|
end
|
46
|
-
assert(allok)
|
47
47
|
end
|
48
48
|
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'mocha/setup'
|
5
|
+
|
6
|
+
require 'feedparser/sgml-parser'
|
7
|
+
|
8
|
+
class SGMLParserTest < Test::Unit::TestCase
|
9
|
+
|
10
|
+
def test_numerical_charref
|
11
|
+
parser = FeedParser::SGMLParser.new
|
12
|
+
parser.expects(:unknown_charref).with('215')
|
13
|
+
parser.handle_charref('215')
|
14
|
+
end
|
15
|
+
|
16
|
+
def test_non_numerical_charref
|
17
|
+
parser = FeedParser::SGMLParser.new
|
18
|
+
parser.expects(:handle_data).with('amp')
|
19
|
+
parser.handle_charref('amp')
|
20
|
+
end
|
21
|
+
|
22
|
+
end
|
data/test/tc_textoutput.rb
CHANGED
@@ -15,12 +15,10 @@ class TextOutputTest < Test::Unit::TestCase
|
|
15
15
|
else
|
16
16
|
raise 'source directory not found.'
|
17
17
|
end
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
next if ENV['SOURCE'] != nil and ENV['SOURCE'] != f
|
23
|
-
puts "Checking #{f}"
|
18
|
+
Dir.foreach(SRCDIR) do |f|
|
19
|
+
next if f !~ /.xml$/
|
20
|
+
testname = 'test_' + File.basename(f).gsub(/\W/, '_')
|
21
|
+
define_method(testname) do
|
24
22
|
str = File::read(SRCDIR + '/' + f)
|
25
23
|
chan = FeedParser::Feed::new(str)
|
26
24
|
chanstr = chan.to_text(false) # localtime set to false
|
@@ -30,19 +28,21 @@ class TextOutputTest < Test::Unit::TestCase
|
|
30
28
|
File::open(DSTDIR + '/' + f.gsub(/.xml$/, '.output.new'), "w") do |fd|
|
31
29
|
fd.print(chanstr)
|
32
30
|
end
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
31
|
+
assert(
|
32
|
+
false,
|
33
|
+
[
|
34
|
+
"Test failed for #{f}.",
|
35
|
+
" Check: diff -u #{DSTDIR + '/' + f.gsub(/.xml$/, '.output')}{,.new}",
|
36
|
+
" Commit: mv -f #{DSTDIR + '/' + f.gsub(/.xml$/, '.output')}{.new,}",
|
37
|
+
].join("\n")
|
38
|
+
)
|
37
39
|
end
|
38
40
|
else
|
39
|
-
puts "Missing #{DSTDIR + '/' + f.gsub(/.xml$/, '.output')}. Writing it, but check manually!"
|
40
41
|
File::open(DSTDIR + '/' + f.gsub(/.xml$/, '.output'), "w") do |f|
|
41
42
|
f.print(chanstr)
|
42
43
|
end
|
43
|
-
|
44
|
+
assert(false, "Missing #{DSTDIR + '/' + f.gsub(/.xml$/, '.output')}. Writing it, but check manually!")
|
44
45
|
end
|
45
46
|
end
|
46
|
-
assert(allok)
|
47
47
|
end
|
48
48
|
end
|
@@ -15,12 +15,10 @@ class TextWrappedOutputTest < Test::Unit::TestCase
|
|
15
15
|
else
|
16
16
|
raise 'source directory not found.'
|
17
17
|
end
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
next if ENV['SOURCE'] != nil and ENV['SOURCE'] != f
|
23
|
-
puts "Checking #{f}"
|
18
|
+
Dir.foreach(SRCDIR) do |f|
|
19
|
+
next if f !~ /.xml$/
|
20
|
+
testname = 'test_' + File.basename(f).gsub(/\W/, '_')
|
21
|
+
define_method(testname) do
|
24
22
|
str = File::read(SRCDIR + '/' + f)
|
25
23
|
chan = FeedParser::Feed::new(str)
|
26
24
|
chanstr = chan.to_text(false, 72) # localtime set to false
|
@@ -30,19 +28,21 @@ class TextWrappedOutputTest < Test::Unit::TestCase
|
|
30
28
|
File::open(DSTDIR + '/' + f.gsub(/.xml$/, '.output.new'), "w") do |fd|
|
31
29
|
fd.print(chanstr)
|
32
30
|
end
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
31
|
+
assert(
|
32
|
+
false,
|
33
|
+
[
|
34
|
+
"Test failed for #{f}.",
|
35
|
+
" Check: diff -u #{DSTDIR + '/' + f.gsub(/.xml$/, '.output')}{,.new}",
|
36
|
+
" Commit: mv -f #{DSTDIR + '/' + f.gsub(/.xml$/, '.output')}{.new,}",
|
37
|
+
].join("\n")
|
38
|
+
)
|
37
39
|
end
|
38
40
|
else
|
39
|
-
puts "Missing #{DSTDIR + '/' + f.gsub(/.xml$/, '.output')}. Writing it, but check manually!"
|
40
41
|
File::open(DSTDIR + '/' + f.gsub(/.xml$/, '.output'), "w") do |f|
|
41
42
|
f.print(chanstr)
|
42
43
|
end
|
43
|
-
|
44
|
+
assert(false, "Missing #{DSTDIR + '/' + f.gsub(/.xml$/, '.output')}. Writing it, but check manually!")
|
44
45
|
end
|
45
46
|
end
|
46
|
-
assert(allok)
|
47
47
|
end
|
48
48
|
end
|
metadata
CHANGED
@@ -1,76 +1,82 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-feedparser
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.9.7
|
5
5
|
platform: ruby
|
6
|
-
authors:
|
7
|
-
|
6
|
+
authors:
|
7
|
+
- Lucas Nussbaum
|
8
8
|
autorequire: feedparser
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
11
|
+
date: 2021-02-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: magic
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
16
27
|
description: Ruby library to parse ATOM and RSS feeds
|
17
28
|
email:
|
18
29
|
executables: []
|
19
|
-
|
20
30
|
extensions: []
|
21
|
-
|
22
31
|
extra_rdoc_files: []
|
23
|
-
|
24
|
-
files:
|
25
|
-
- ChangeLog
|
26
|
-
- README
|
32
|
+
files:
|
27
33
|
- COPYING
|
34
|
+
- ChangeLog.md
|
28
35
|
- LICENSE
|
29
|
-
-
|
36
|
+
- README
|
30
37
|
- Rakefile
|
31
|
-
- lib/feedparser
|
38
|
+
- lib/feedparser.rb
|
39
|
+
- lib/feedparser/feedparser.rb
|
32
40
|
- lib/feedparser/filesizes.rb
|
33
41
|
- lib/feedparser/html-output.rb
|
34
|
-
- lib/feedparser/rexml_patch.rb
|
35
42
|
- lib/feedparser/html2text-parser.rb
|
36
|
-
- lib/feedparser/
|
37
|
-
- lib/feedparser/feedparser.rb
|
43
|
+
- lib/feedparser/rexml_patch.rb
|
38
44
|
- lib/feedparser/sgml-parser.rb
|
39
|
-
- lib/feedparser.rb
|
45
|
+
- lib/feedparser/text-output.rb
|
46
|
+
- lib/feedparser/textconverters.rb
|
47
|
+
- lib/feedparser/version.rb
|
48
|
+
- setup.rb
|
40
49
|
- test/tc_feed_parse.rb
|
41
|
-
- test/
|
50
|
+
- test/tc_feeditem.rb
|
51
|
+
- test/tc_html2text_parser.rb
|
42
52
|
- test/tc_htmloutput.rb
|
53
|
+
- test/tc_parser.rb
|
54
|
+
- test/tc_sgml_parser.rb
|
55
|
+
- test/tc_textoutput.rb
|
43
56
|
- test/tc_textwrappedoutput.rb
|
44
57
|
- test/ts_feedparser.rb
|
45
|
-
- test/tc_parser.rb
|
46
58
|
- tools/doctoweb.bash
|
47
|
-
has_rdoc: true
|
48
59
|
homepage:
|
49
60
|
licenses: []
|
50
|
-
|
61
|
+
metadata: {}
|
51
62
|
post_install_message:
|
52
63
|
rdoc_options: []
|
53
|
-
|
54
|
-
require_paths:
|
64
|
+
require_paths:
|
55
65
|
- lib
|
56
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
-
requirements:
|
66
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
58
68
|
- - ">="
|
59
|
-
- !ruby/object:Gem::Version
|
60
|
-
version:
|
61
|
-
|
62
|
-
|
63
|
-
requirements:
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '0'
|
71
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
64
73
|
- - ">="
|
65
|
-
- !ruby/object:Gem::Version
|
66
|
-
version:
|
67
|
-
|
68
|
-
requirements:
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
requirements:
|
69
77
|
- none
|
70
|
-
|
71
|
-
rubygems_version: 1.3.4
|
78
|
+
rubygems_version: 3.2.5
|
72
79
|
signing_key:
|
73
|
-
specification_version:
|
80
|
+
specification_version: 4
|
74
81
|
summary: Ruby library to parse ATOM and RSS feeds
|
75
82
|
test_files: []
|
76
|
-
|