feed-normalizer 1.4.0 → 1.5.0
- data/History.txt +10 -0
- data/Rakefile +2 -2
- data/lib/html-cleaner.rb +3 -2
- data/lib/parsers/rss.rb +15 -3
- data/lib/parsers/simple-rss.rb +43 -4
- data/lib/structures.rb +76 -4
- data/test/data/atom03.xml +1 -1
- data/test/data/atom10.xml +2 -2
- data/test/data/rss20.xml +13 -2
- data/test/data/rss20diff.xml +11 -0
- data/test/data/rss20diff_short.xml +11 -0
- data/test/test_feednormalizer.rb +69 -9
- metadata +4 -4
data/History.txt
CHANGED
@@ -1,3 +1,13 @@
+1.5.0
+* Add support for new fields:
+  * Atom 0.3: issued is now available through entry.date_published.
+  * RSS: feed.skip_hours, feed.skip_days, feed.ttl [joshpeek]
+  * All: entry.last_updated, this is an alias to entry.date_published for RSS.
+* Rewrite relative links in content [joshpeek]
+* Handle CDATA sections consistently across all formats. [sam.lown]
+* Prevent SimpleRSS from doing its own escaping. [reported by: paul.stadig, lionel.bouton]
+* Reparse Time classes [reported by: sam.lown]
+
 1.4.0
 
 * Support content:encoded. Accessible via Entry#content.
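
The new fields listed above surface directly on the normalized Feed and Entry objects. A minimal usage sketch (the feed file name is a placeholder; the example values mirror the test fixture assertions further down, and are not guaranteed for every feed):

    require 'rubygems'
    require 'feed-normalizer'

    # 'feed.xml' is a placeholder for any RSS 2.0 document.
    feed = FeedNormalizer::FeedNormalizer.parse(File.read('feed.xml'))

    # Channel-level fields new in 1.5.0 (RSS only; nil/empty for Atom feeds).
    feed.ttl         # => e.g. 15
    feed.skip_hours  # => e.g. [6, 7, 8, 9, 10, 11]
    feed.skip_days   # => e.g. ["Sunday"]

    # Entry-level field new in 1.5.0; for RSS this is an alias of date_published.
    feed.entries.first.last_updated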
data/Rakefile
CHANGED
@@ -1,11 +1,11 @@
 require 'hoe'
 
-Hoe.new("feed-normalizer", "1.
+Hoe.new("feed-normalizer", "1.5.0") do |s|
   s.author = "Andrew A. Smith"
   s.email = "andy@tinnedfruit.org"
   s.url = "http://feed-normalizer.rubyforge.org/"
   s.summary = "Extensible Ruby wrapper for Atom and RSS parsers"
-  s.description = s.paragraphs_of('
+  s.description = s.paragraphs_of('README.txt', 1..2).join("\n\n")
   s.changes = s.paragraphs_of('History.txt', 0..1).join("\n\n")
   s.extra_deps << ["simple-rss", ">= 1.1"]
   s.extra_deps << ["hpricot", ">= 0.6"]
data/lib/html-cleaner.rb
CHANGED
@@ -1,3 +1,4 @@
+require 'rubygems'
 require 'hpricot'
 require 'cgi'
 
@@ -99,10 +100,10 @@ module FeedNormalizer
       doc = Hpricot(str, :xhtml_strict => true)
       doc = subtree(doc, :body)
 
-      out =
+      out = []
       doc.traverse_text {|t| out << add_entities(t.to_html)}
 
-      return out
+      return out.join
     end
 
     # Returns true if the given string contains a suspicious URL,
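
The second hunk above switches the text-collection step from a bare (truncated) assignment to an array that is joined once at the end. A standalone sketch of the same Hpricot pattern, outside the gem (the helper name and input are illustrative only, not part of the gem's API):

    require 'rubygems'
    require 'hpricot'

    # Collect every text node into an array, then join once --
    # the out = [] / out.join pattern from the hunk above.
    def flatten_text(html)
      doc = Hpricot(html)
      out = []
      doc.traverse_text { |t| out << t.to_html }
      out.join
    end

    flatten_text("<p>Hello <b>world</b></p>")  # => "Hello world"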
data/lib/parsers/rss.rb
CHANGED
@@ -42,7 +42,8 @@ module FeedNormalizer
         :copyright => :copyright,
         :authors => :managingEditor,
         :last_updated => [:lastBuildDate, :pubDate, :dc_date],
-        :id => :guid
+        :id => :guid,
+        :ttl => :ttl
       }
 
       # make two passes, to catch all possible root elements
@@ -51,6 +52,8 @@ module FeedNormalizer
 
       # custom channel elements
       feed.image = rss.image ? rss.image.url : nil
+      feed.skip_hours = skip(rss, :skipHours)
+      feed.skip_days = skip(rss, :skipDays)
 
       # item elements
       item_mapping = {
@@ -59,7 +62,8 @@ module FeedNormalizer
         :description => :description,
         :content => [:content_encoded, :description],
         :title => :title,
-        :authors => [:author, :dc_creator]
+        :authors => [:author, :dc_creator],
+        :last_updated => [:pubDate, :dc_date] # This is effectively an alias for date_published for this parser.
       }
 
       rss.items.each do |rss_item|
@@ -79,6 +83,14 @@ module FeedNormalizer
       feed
     end
 
+    def self.skip(parser, attribute)
+      attributes = case attribute
+        when :skipHours: :hours
+        when :skipDays: :days
+      end
+      channel = parser.channel
+      channel.respond_to?(attribute) && channel.send(attribute).send(attributes).map { |e| e.content }
+    end
+
   end
 end
-
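
The new self.skip helper walks the structures that Ruby's bundled RSS parser builds for skipHours/skipDays. A sketch of the equivalent call chain against the stdlib directly (the XML snippet is a trimmed stand-in for the rss20 test fixture, not part of the gem):

    require 'rss'

    xml = <<-XML
    <rss version="2.0"><channel>
      <title>t</title><link>http://example.org/</link><description>d</description>
      <skipHours><hour>6</hour><hour>7</hour></skipHours>
      <skipDays><day>Sunday</day></skipDays>
    </channel></rss>
    XML

    channel = RSS::Parser.parse(xml, false).channel

    # Same chain as self.skip(rss, :skipHours) / self.skip(rss, :skipDays) above.
    channel.respond_to?(:skipHours) && channel.skipHours.hours.map { |e| e.content }  # => [6, 7]
    channel.respond_to?(:skipDays)  && channel.skipDays.days.map  { |e| e.content }   # => ["Sunday"]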
data/lib/parsers/simple-rss.rb
CHANGED
@@ -1,5 +1,42 @@
 require 'simple-rss'
 
+# Monkey patches for outstanding issues logged in the simple-rss project.
+# * Add support for issued time field:
+#   http://rubyforge.org/tracker/index.php?func=detail&aid=13980&group_id=893&atid=3517
+# * The '+' symbol is lost when escaping fields.
+#   http://rubyforge.org/tracker/index.php?func=detail&aid=10852&group_id=893&atid=3517
+#
+class SimpleRSS
+  @@item_tags << :issued
+
+  undef clean_content
+  def clean_content(tag, attrs, content)
+    content = content.to_s
+    case tag
+      when :pubDate, :lastBuildDate, :published, :updated, :expirationDate, :modified, :'dc:date', :issued
+        Time.parse(content) rescue unescape(content)
+      when :author, :contributor, :skipHours, :skipDays
+        unescape(content.gsub(/<.*?>/,''))
+      else
+        content.empty? && "#{attrs} " =~ /href=['"]?([^'"]*)['" ]/mi ? $1.strip : unescape(content)
+    end
+  end
+
+  undef unescape
+  def unescape(s)
+    if s =~ /^(<!\[CDATA\[|\]\]>)/
+      # Raw HTML is inside the CDATA, so just remove the CDATA wrapper.
+      s.gsub(/(<!\[CDATA\[|\]\]>)/,'').strip
+    elsif s =~ /[<>]/
+      # Already looks like HTML.
+      s
+    else
+      # Make it HTML.
+      FeedNormalizer::HtmlCleaner.unescapeHTML(s)
+    end
+  end
+end
+
 module FeedNormalizer
 
   # The SimpleRSS parser can handle both RSS and Atom feeds.
@@ -38,7 +75,8 @@ module FeedNormalizer
         :copyright => [:copyright, :rights],
         :authors => [:author, :webMaster, :managingEditor, :contributor],
         :urls => :link,
-        :description => [:description, :subtitle]
+        :description => [:description, :subtitle],
+        :ttl => :ttl
       }
 
       map_functions!(feed_mapping, atomrss, feed)
@@ -50,13 +88,14 @@ module FeedNormalizer
 
       # entry elements
       entry_mapping = {
-        :date_published => [:pubDate, :published, :dc_date],
+        :date_published => [:pubDate, :published, :dc_date, :issued],
         :urls => :link,
         :description => [:description, :summary],
         :content => [:content, :content_encoded, :description],
         :title => :title,
         :authors => [:author, :contributor, :dc_creator],
-        :categories => :category
+        :categories => :category,
+        :last_updated => [:updated, :dc_date, :pubDate]
       }
 
       atomrss.entries.each do |atomrss_entry|
@@ -76,7 +115,7 @@ module FeedNormalizer
     def self.image(parser)
       if parser.respond_to?(:image) && parser.image
         if parser.image =~ /<url>/ # RSS image contains an <url> spec
-          parser.image.scan(/<url>(
+          parser.image.scan(/<url>(.*?)<\/url>/).to_s
         else
           parser.image # Atom contains just the url
         end
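
The patched unescape chooses between three cases: strip a CDATA wrapper, pass through strings that already look like HTML, or unescape plain escaped text. A standalone restatement of those branches for illustration (CGI stands in for FeedNormalizer::HtmlCleaner.unescapeHTML; this is a sketch, not the gem's public API):

    require 'cgi'

    # Illustrative restatement of the patched SimpleRSS#unescape branches.
    def unescape_like(s)
      if s =~ /^(<!\[CDATA\[|\]\]>)/
        # Raw HTML inside CDATA: drop the wrapper.
        s.gsub(/(<!\[CDATA\[|\]\]>)/, '').strip
      elsif s =~ /[<>]/
        # Already looks like HTML: leave untouched (this also preserves '+').
        s
      else
        # Escaped text: unescape it (the gem uses HtmlCleaner.unescapeHTML here).
        CGI.unescapeHTML(s)
      end
    end

    unescape_like("<![CDATA[<p>1 + 1</p>]]>")   # => "<p>1 + 1</p>"
    unescape_like("<p>already html</p>")        # => "<p>already html</p>"
    unescape_like("&lt;p&gt;escaped&lt;/p&gt;") # => "<p>escaped</p>"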
data/lib/structures.rb
CHANGED
@@ -115,13 +115,55 @@ module FeedNormalizer
     end
   end
 
+  module TimeFix
+    # Reparse any Time instances, due to RSS::Parser's redefinition of
+    # certain aspects of the Time class that creates unexpected behaviour
+    # when extending the Time class, as some common third party libraries do.
+    # See http://code.google.com/p/feed-normalizer/issues/detail?id=13.
+    def reparse(obj)
+      @parsed ||= false
+
+      return obj if @parsed
+
+      if obj.is_a?(Time)
+        @parsed = true
+        Time.at(obj) rescue obj
+      end
+    end
+  end
+
+  module RewriteRelativeLinks
+    def rewrite_relative_links(text, url)
+      if host = url_host(url)
+        text.to_s.gsub(/(href|src)=('|")\//, '\1=\2http://' + host + '/')
+      else
+        text
+      end
+    end
+
+    private
+    def url_host(url)
+      URI.parse(url).host rescue nil
+    end
+  end
+
 
   # Represents a feed item entry.
+  # Available fields are:
+  # * content
+  # * description
+  # * title
+  # * date_published
+  # * urls / url
+  # * id
+  # * authors / author
+  # * copyright
+  # * categories
   class Entry
-    include Singular, ElementEquality, ElementCleaner
+    include Singular, ElementEquality, ElementCleaner, TimeFix, RewriteRelativeLinks
 
     HTML_ELEMENTS = [:content, :description, :title]
-    SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories]
+    SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories, :last_updated]
     BLENDED_ELEMENTS = []
 
     ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
@@ -132,19 +174,41 @@ module FeedNormalizer
       @urls = []
       @authors = []
      @categories = []
+      @date_published, @content = nil
+    end
+
+    undef date_published
+    def date_published
+      @date_published = reparse(@date_published)
+    end
+
+    undef content
+    def content
+      @content = rewrite_relative_links(@content, url)
     end
 
   end
 
   # Represents the root element of a feed.
+  # Available fields are:
+  # * title
+  # * description
+  # * id
+  # * last_updated
+  # * copyright
+  # * authors / author
+  # * urls / url
+  # * image
+  # * generator
+  # * items / channel
   class Feed
-    include Singular, ElementEquality, ElementCleaner
+    include Singular, ElementEquality, ElementCleaner, TimeFix
 
     # Elements that can contain HTML fragments.
     HTML_ELEMENTS = [:title, :description]
 
     # Elements that contain 'plain' Strings, with HTML escaped.
-    SIMPLE_ELEMENTS = [:id, :last_updated, :copyright, :authors, :urls, :image, :generator]
+    SIMPLE_ELEMENTS = [:id, :last_updated, :copyright, :authors, :urls, :image, :generator, :ttl, :skip_hours, :skip_days]
 
     # Elements that contain both HTML and escaped HTML.
     BLENDED_ELEMENTS = [:items]
@@ -160,8 +224,16 @@ module FeedNormalizer
       # set up associations (i.e. arrays where needed)
       @urls = []
       @authors = []
+      @skip_hours = []
+      @skip_days = []
       @items = []
       @parser = wrapper.parser.to_s
+      @last_updated = nil
+    end
+
+    undef last_updated
+    def last_updated
+      @last_updated = reparse(@last_updated)
     end
 
     def channel() self end
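
RewriteRelativeLinks turns root-relative href/src attributes into absolute URLs using the entry's own url, while TimeFix#reparse simply rebuilds a Time via Time.at so that RSS::Parser's changes to the Time class do not leak out. A standalone sketch of the link rewriting (same gsub as above, extracted here purely for illustration):

    require 'uri'

    # Illustrative restatement of RewriteRelativeLinks#rewrite_relative_links.
    def rewrite_relative_links(text, url)
      host = URI.parse(url).host rescue nil
      if host
        text.to_s.gsub(/(href|src)=('|")\//, '\1=\2http://' + host + '/')
      else
        text
      end
    end

    html = '<a href="/link/x"><img src="/assets/y.jpg" /></a>'
    rewrite_relative_links(html, "http://www.cheapstingybargains.com/24557/some-post/")
    # => '<a href="http://www.cheapstingybargains.com/link/x"><img src="http://www.cheapstingybargains.com/assets/y.jpg" /></a>'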
data/test/data/atom03.xml
CHANGED
@@ -24,7 +24,7 @@
 Kmart has the Levi Strauss Signature Girl’s Low Rise Flare Jean for $10 after $5 instant savings (ends 9/2)
 Slim fit through hip and thigh, with zip-fly with button-through closure. Machine washable 99% Cotton/1% Spandex
 ]]></summary>
-<content type="text/html" mode="escaped" xml:base="http://www.cheapstingybargains.com/24557/levi-strauss-signature-girls-low-rise-slim-fit-flare-jeans-10/"><![CDATA[<p><a href="
+<content type="text/html" mode="escaped" xml:base="http://www.cheapstingybargains.com/24557/levi-strauss-signature-girls-low-rise-slim-fit-flare-jeans-10/"><![CDATA[<p><a href="/link/tplclick?lid=41000000011334249&pubid=21000000000053626" target=_"blank"><img src="/assets/images/product/productDetail/9990000058546711.jpg" width="150" height="150" border="0" style="float: right; margin: 0px 0px 5px 5px;" /></a><br />
 <strong>Kmart has the <a href="http://clickserve.cc-dt.com/link/tplclick?lid=41000000011334249&pubid=21000000000053626" target=_"blank">Levi Strauss Signature Girl’s Low Rise Flare Jean</a> for $10 after $5 instant savings (ends 9/2)</strong></p>
 <p>Slim fit through hip and thigh, with zip-fly with button-through closure. Machine washable 99% Cotton/1% Spandex</p>
 ]]></content>
data/test/data/atom10.xml
CHANGED
@@ -17,8 +17,8 @@
 <link href="http://habtm.com/articles/2006/08/16/a-forum-on-rails" rel="alternate" type="text/html"/>
 <category term="rails" scheme="http://habtm.com/articles/category/rails" label="rails"/>
 <category term="ruby" scheme="http://habtm.com/articles/category/ruby" label="ruby"/>
-<summary type="html"><p>Josh Goebel and I took an evening to bang out a little project: <a href="http://beast.caboo.se/">Beast</a>. It&#8217;s our minimal no-fluff Rails forum. It&#8217;s no beast of an application either, clocking in at 285 LOC and a 1:1.5 test ratio. <a href="http://svn.techno-weenie.net/projects/beast/">Check it out</a>!</p></summary>
-<content type="html"><p>Josh Goebel and I took an evening to bang out a little project: <a href="http://beast.caboo.se/">Beast</a>. It&#8217;s our minimal no-fluff Rails forum. It&#8217;s no beast of an application either, clocking in at 285 LOC and a 1:1.5 test ratio. <a href="http://svn.techno-weenie.net/projects/beast/">Check it out</a>!</p></content>
+<summary type="html"><plaintext><p>Josh Goebel and I took an evening to bang out a little project: <a href="http://beast.caboo.se/">Beast</a>. It&#8217;s our minimal no-fluff Rails forum. It&#8217;s no beast of an application either, clocking in at 285 LOC and a 1:1.5 test ratio. <a href="http://svn.techno-weenie.net/projects/beast/">Check it out</a>!</p></summary>
+<content type="html"><plaintext><p>Josh Goebel and I took an evening to bang out a little project: <a href="http://beast.caboo.se/">Beast</a>. It&#8217;s our minimal no-fluff Rails forum. It&#8217;s no beast of an application either, clocking in at 285 LOC and a 1:1.5 test ratio. <a href="http://svn.techno-weenie.net/projects/beast/">Check it out</a>!</p></content>
 </entry>
 <entry>
 <author>
data/test/data/rss20.xml
CHANGED
@@ -10,6 +10,17 @@
 <copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
 <docs>http://www.bbc.co.uk/syndication/</docs>
 <ttl>15</ttl>
+<skipHours>
+  <hour>6</hour>
+  <hour>7</hour>
+  <hour>8</hour>
+  <hour>9</hour>
+  <hour>10</hour>
+  <hour>11</hour>
+</skipHours>
+<skipDays>
+  <day>Sunday</day>
+</skipDays>
 
 <image>
 <title>BBC News</title>
@@ -19,7 +30,7 @@
 
 <item>
 <title>Concerns over security software</title>
-<description
+<description><![CDATA[BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.]]></description>
 <content:encoded><![CDATA[<p>test1</p>]]></content:encoded>
 <link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
 <guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
@@ -40,7 +51,7 @@
 
 <item>
 <title>MP3 player court order overturned</title>
-<description
+<description><b>SanDisk</b> puts its MP3 players back on display at a German electronics show after overturning a court injunction.</description>
 <content:encoded><![CDATA[<p>test3</p>]]></content:encoded>
 <link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5326660.stm</link>
 <guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5326660.stm</guid>
data/test/data/rss20diff.xml
CHANGED
@@ -10,6 +10,17 @@
 <copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
 <docs>http://www.bbc.co.uk/syndication/</docs>
 <ttl>15</ttl>
+<skipHours>
+  <hour>6</hour>
+  <hour>7</hour>
+  <hour>8</hour>
+  <hour>9</hour>
+  <hour>10</hour>
+  <hour>11</hour>
+</skipHours>
+<skipDays>
+  <day>Sunday</day>
+</skipDays>
 
 <image>
 <title>BBC News</title>
data/test/data/rss20diff_short.xml
CHANGED
@@ -10,6 +10,17 @@
 <copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
 <docs>http://www.bbc.co.uk/syndication/</docs>
 <ttl>15</ttl>
+<skipHours>
+  <hour>6</hour>
+  <hour>7</hour>
+  <hour>8</hour>
+  <hour>9</hour>
+  <hour>10</hour>
+  <hour>11</hour>
+</skipHours>
+<skipDays>
+  <day>Sunday</day>
+</skipDays>
 
 <image>
 <title>BBC News</title>
data/test/test_feednormalizer.rb
CHANGED
@@ -1,4 +1,6 @@
 require 'test/unit'
+$:.unshift(File.dirname(__FILE__))
+$:.unshift(File.dirname(__FILE__) + '/../lib')
 require 'feed-normalizer'
 require 'yaml'
 
@@ -66,8 +68,11 @@ class FeedNormalizerTest < Test::Unit::TestCase
 
     assert_equal "BBC News | Technology | UK Edition", feed.title
     assert_equal ["http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm"], feed.urls
+    assert_equal 15, feed.ttl
+    assert_equal [6, 7, 8, 9, 10, 11], feed.skip_hours
+    assert_equal ["Sunday"], feed.skip_days
     assert_equal "MP3 player court order overturned", feed.entries.last.title
-    assert_equal "SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.description
+    assert_equal "<b>SanDisk</b> puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.description
     assert_match(/test\d/, feed.entries.last.content)
     assert_instance_of Time, feed.entries.last.date_published
   end
@@ -77,6 +82,9 @@ class FeedNormalizerTest < Test::Unit::TestCase
 
     assert_equal "~:caboose", feed.title
     assert_equal "http://habtm.com/xml/atom10/feed.xml", feed.url
+    assert_equal nil, feed.ttl
+    assert_equal [], feed.skip_hours
+    assert_equal [], feed.skip_days
     assert_equal "Starfish - Easy Distribution of Site Maintenance", feed.entries.last.title
     assert_equal "urn:uuid:6c028f36-f87a-4f53-b7e3-1f943d2341f0", feed.entries.last.id
 
@@ -134,9 +142,11 @@ class FeedNormalizerTest < Test::Unit::TestCase
   def test_clean
     feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
 
-
+    assert_match(/<plaintext>/, feed.entries.first.content)
+    assert_match(/<plaintext>/, feed.entries.first.description)
     feed.clean!
-
+    assert_no_match(/<plaintext>/, feed.entries.first.content)
+    assert_no_match(/<plaintext>/, feed.entries.first.description)
   end
 
   def test_malformed_feed
@@ -145,25 +155,21 @@ class FeedNormalizerTest < Test::Unit::TestCase
 
   def test_dublin_core_date_ruby_rss
     feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser, :try_others => false)
-    assert_equal 'RSS::Parser', feed.parser
     assert_instance_of Time, feed.entries.first.date_published
   end
 
   def test_dublin_core_date_simple_rss
     feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser, :try_others => false)
-    assert_equal 'SimpleRSS', feed.parser
     assert_instance_of Time, feed.entries.first.date_published
   end
 
   def test_dublin_core_creator_ruby_rss
     feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser, :try_others => false)
-    assert_equal 'RSS::Parser', feed.parser
     assert_equal 'Jeff Hecht', feed.entries.last.author
   end
 
   def test_dublin_core_creator_simple_rss
     feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser, :try_others => false)
-    assert_equal 'SimpleRSS', feed.parser
     assert_equal 'Jeff Hecht', feed.entries.last.author
   end
 
@@ -191,7 +197,7 @@ class FeedNormalizerTest < Test::Unit::TestCase
     feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false)
 
     feed.entries.each_with_index do |e, i|
-
+      assert_equal("<p>test#{i+1}</p>", e.content)
     end
   end
 
@@ -199,9 +205,63 @@ class FeedNormalizerTest < Test::Unit::TestCase
     feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
 
     feed.entries.each_with_index do |e, i|
-
+      assert_equal("<p>test#{i+1}</p>", e.content)
     end
   end
 
+  def test_atom_content_contains_pluses
+    feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], :force_parser => SimpleRssParser, :try_others => false)
+
+    assert_equal 2, feed.entries.last.content.scan(/\+/).size
+  end
+
+  # http://code.google.com/p/feed-normalizer/issues/detail?id=13
+  def test_times_are_reparsed
+    feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
+
+    Time.class_eval "alias :old_to_s :to_s; def to_s(x=1); old_to_s; end"
+
+    assert_equal "Sat Sep 09 07:57:06 -0700 2006", feed.last_updated.to_s(:foo)
+    assert_equal "Sat Sep 09 05:45:35 -0700 2006", feed.entries.first.date_published.to_s(:foo)
+  end
+
+  def test_atom03_has_issued
+    SimpleRSS.class_eval "@@item_tags.delete(:issued)"
+    feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03], :force_parser => SimpleRssParser, :try_others => false)
+    assert_nil feed.entries.first.date_published
+
+    SimpleRSS.class_eval "@@item_tags << :issued"
+    feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03], :force_parser => SimpleRssParser, :try_others => false)
+    assert_equal "Tue Aug 29 02:31:03 UTC 2006", feed.entries.first.date_published.to_s
+  end
+
+  def test_html_should_be_escaped_by_default
+    feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
+    assert_match "<b>SanDisk</b>", feed.items.last.description
+
+    feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false)
+    assert_match "<b>SanDisk</b>", feed.items.last.description
+  end
+
+  def test_relative_links_and_images_should_be_rewritten_with_url_base
+    feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03])
+    assert_match '<a href="http://www.cheapstingybargains.com/link/tplclick?lid=41000000011334249&pubid=21000000000053626"' +
+      ' target=_"blank"><img src="http://www.cheapstingybargains.com/assets/images/product/productDetail/9990000058546711.jpg"' +
+      ' width="150" height="150" border="0" style="float: right; margin: 0px 0px 5px 5px;" /></a>',
+      feed.items.first.content
+  end
+
+  def test_last_updated_simple_rss
+    feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], :force_parser => SimpleRssParser, :try_others => false)
+
+    assert_equal Time.parse("Wed Aug 16 09:59:44 -0700 2006"), feed.entries.first.last_updated
+  end
+
+  def test_last_updated_ruby_rss
+    feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
+
+    assert_equal feed.entries.first.date_published, feed.entries.first.last_updated
+  end
+
 end
 
metadata
CHANGED
@@ -1,10 +1,10 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.9.
+rubygems_version: 0.9.4
 specification_version: 1
 name: feed-normalizer
 version: !ruby/object:Gem::Version
-  version: 1.
-date:
+  version: 1.5.0
+date: 2008-02-05 00:00:00 -08:00
 summary: Extensible Ruby wrapper for Atom and RSS parsers
 require_paths:
 - lib
@@ -90,5 +90,5 @@ dependencies:
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 1.
+      version: 1.5.0
   version: