feed-normalizer 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +10 -0
- data/Rakefile +2 -2
- data/lib/html-cleaner.rb +3 -2
- data/lib/parsers/rss.rb +15 -3
- data/lib/parsers/simple-rss.rb +43 -4
- data/lib/structures.rb +76 -4
- data/test/data/atom03.xml +1 -1
- data/test/data/atom10.xml +2 -2
- data/test/data/rss20.xml +13 -2
- data/test/data/rss20diff.xml +11 -0
- data/test/data/rss20diff_short.xml +11 -0
- data/test/test_feednormalizer.rb +69 -9
- metadata +4 -4
data/History.txt
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
1.5.0
|
2
|
+
* Add support for new fields:
|
3
|
+
* Atom 0.3: issued is now available through entry.date_published.
|
4
|
+
* RSS: feed.skip_hours, feed.skip_days, feed.ttl [joshpeek]
|
5
|
+
* All: entry.last_updated, this is an alias to entry.date_published for RSS.
|
6
|
+
* Rewrite relative links in content [joshpeek]
|
7
|
+
* Handle CDATA sections consistently across all formats. [sam.lown]
|
8
|
+
* Prevent SimpleRSS from doing its own escaping. [reported by: paul.stadig, lionel.bouton]
|
9
|
+
* Reparse Time classes [reported by: sam.lown]
|
10
|
+
|
1
11
|
1.4.0
|
2
12
|
|
3
13
|
* Support content:encoded. Accessible via Entry#content.
|
data/Rakefile
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
require 'hoe'
|
2
2
|
|
3
|
-
Hoe.new("feed-normalizer", "1.4.0") do |s|
|
3
|
+
Hoe.new("feed-normalizer", "1.5.0") do |s|
|
4
4
|
s.author = "Andrew A. Smith"
|
5
5
|
s.email = "andy@tinnedfruit.org"
|
6
6
|
s.url = "http://feed-normalizer.rubyforge.org/"
|
7
7
|
s.summary = "Extensible Ruby wrapper for Atom and RSS parsers"
|
8
|
-
s.description = s.paragraphs_of('
|
8
|
+
s.description = s.paragraphs_of('README.txt', 1..2).join("\n\n")
|
9
9
|
s.changes = s.paragraphs_of('History.txt', 0..1).join("\n\n")
|
10
10
|
s.extra_deps << ["simple-rss", ">= 1.1"]
|
11
11
|
s.extra_deps << ["hpricot", ">= 0.6"]
|
data/lib/html-cleaner.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'rubygems'
|
1
2
|
require 'hpricot'
|
2
3
|
require 'cgi'
|
3
4
|
|
@@ -99,10 +100,10 @@ module FeedNormalizer
|
|
99
100
|
doc = Hpricot(str, :xhtml_strict => true)
|
100
101
|
doc = subtree(doc, :body)
|
101
102
|
|
102
|
-
out =
|
103
|
+
out = []
|
103
104
|
doc.traverse_text {|t| out << add_entities(t.to_html)}
|
104
105
|
|
105
|
-
return out
|
106
|
+
return out.join
|
106
107
|
end
|
107
108
|
|
108
109
|
# Returns true if the given string contains a suspicious URL,
|
data/lib/parsers/rss.rb
CHANGED
@@ -42,7 +42,8 @@ module FeedNormalizer
|
|
42
42
|
:copyright => :copyright,
|
43
43
|
:authors => :managingEditor,
|
44
44
|
:last_updated => [:lastBuildDate, :pubDate, :dc_date],
|
45
|
-
:id => :guid
|
45
|
+
:id => :guid,
|
46
|
+
:ttl => :ttl
|
46
47
|
}
|
47
48
|
|
48
49
|
# make two passes, to catch all possible root elements
|
@@ -51,6 +52,8 @@ module FeedNormalizer
|
|
51
52
|
|
52
53
|
# custom channel elements
|
53
54
|
feed.image = rss.image ? rss.image.url : nil
|
55
|
+
feed.skip_hours = skip(rss, :skipHours)
|
56
|
+
feed.skip_days = skip(rss, :skipDays)
|
54
57
|
|
55
58
|
# item elements
|
56
59
|
item_mapping = {
|
@@ -59,7 +62,8 @@ module FeedNormalizer
|
|
59
62
|
:description => :description,
|
60
63
|
:content => [:content_encoded, :description],
|
61
64
|
:title => :title,
|
62
|
-
:authors => [:author, :dc_creator]
|
65
|
+
:authors => [:author, :dc_creator],
|
66
|
+
:last_updated => [:pubDate, :dc_date] # This is effectively an alias for date_published for this parser.
|
63
67
|
}
|
64
68
|
|
65
69
|
rss.items.each do |rss_item|
|
@@ -79,6 +83,14 @@ module FeedNormalizer
|
|
79
83
|
feed
|
80
84
|
end
|
81
85
|
|
86
|
+
def self.skip(parser, attribute)
|
87
|
+
attributes = case attribute
|
88
|
+
when :skipHours: :hours
|
89
|
+
when :skipDays: :days
|
90
|
+
end
|
91
|
+
channel = parser.channel
|
92
|
+
channel.respond_to?(attribute) && channel.send(attribute).send(attributes).map { |e| e.content }
|
93
|
+
end
|
94
|
+
|
82
95
|
end
|
83
96
|
end
|
84
|
-
|
data/lib/parsers/simple-rss.rb
CHANGED
@@ -1,5 +1,42 @@
|
|
1
1
|
require 'simple-rss'
|
2
2
|
|
3
|
+
# Monkey patches for outstanding issues logged in the simple-rss project.
|
4
|
+
# * Add support for issued time field:
|
5
|
+
# http://rubyforge.org/tracker/index.php?func=detail&aid=13980&group_id=893&atid=3517
|
6
|
+
# * The '+' symbol is lost when escaping fields.
|
7
|
+
# http://rubyforge.org/tracker/index.php?func=detail&aid=10852&group_id=893&atid=3517
|
8
|
+
#
|
9
|
+
class SimpleRSS
|
10
|
+
@@item_tags << :issued
|
11
|
+
|
12
|
+
undef clean_content
|
13
|
+
def clean_content(tag, attrs, content)
|
14
|
+
content = content.to_s
|
15
|
+
case tag
|
16
|
+
when :pubDate, :lastBuildDate, :published, :updated, :expirationDate, :modified, :'dc:date', :issued
|
17
|
+
Time.parse(content) rescue unescape(content)
|
18
|
+
when :author, :contributor, :skipHours, :skipDays
|
19
|
+
unescape(content.gsub(/<.*?>/,''))
|
20
|
+
else
|
21
|
+
content.empty? && "#{attrs} " =~ /href=['"]?([^'"]*)['" ]/mi ? $1.strip : unescape(content)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
undef unescape
|
26
|
+
def unescape(s)
|
27
|
+
if s =~ /^(<!\[CDATA\[|\]\]>)/
|
28
|
+
# Raw HTML is inside the CDATA, so just remove the CDATA wrapper.
|
29
|
+
s.gsub(/(<!\[CDATA\[|\]\]>)/,'').strip
|
30
|
+
elsif s =~ /[<>]/
|
31
|
+
# Already looks like HTML.
|
32
|
+
s
|
33
|
+
else
|
34
|
+
# Make it HTML.
|
35
|
+
FeedNormalizer::HtmlCleaner.unescapeHTML(s)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
3
40
|
module FeedNormalizer
|
4
41
|
|
5
42
|
# The SimpleRSS parser can handle both RSS and Atom feeds.
|
@@ -38,7 +75,8 @@ module FeedNormalizer
|
|
38
75
|
:copyright => [:copyright, :rights],
|
39
76
|
:authors => [:author, :webMaster, :managingEditor, :contributor],
|
40
77
|
:urls => :link,
|
41
|
-
:description => [:description, :subtitle]
|
78
|
+
:description => [:description, :subtitle],
|
79
|
+
:ttl => :ttl
|
42
80
|
}
|
43
81
|
|
44
82
|
map_functions!(feed_mapping, atomrss, feed)
|
@@ -50,13 +88,14 @@ module FeedNormalizer
|
|
50
88
|
|
51
89
|
# entry elements
|
52
90
|
entry_mapping = {
|
53
|
-
:date_published => [:pubDate, :published, :dc_date],
|
91
|
+
:date_published => [:pubDate, :published, :dc_date, :issued],
|
54
92
|
:urls => :link,
|
55
93
|
:description => [:description, :summary],
|
56
94
|
:content => [:content, :content_encoded, :description],
|
57
95
|
:title => :title,
|
58
96
|
:authors => [:author, :contributor, :dc_creator],
|
59
|
-
:categories => :category
|
97
|
+
:categories => :category,
|
98
|
+
:last_updated => [:updated, :dc_date, :pubDate]
|
60
99
|
}
|
61
100
|
|
62
101
|
atomrss.entries.each do |atomrss_entry|
|
@@ -76,7 +115,7 @@ module FeedNormalizer
|
|
76
115
|
def self.image(parser)
|
77
116
|
if parser.respond_to?(:image) && parser.image
|
78
117
|
if parser.image =~ /<url>/ # RSS image contains an <url> spec
|
79
|
-
parser.image.scan(/<url>(
|
118
|
+
parser.image.scan(/<url>(.*?)<\/url>/).to_s
|
80
119
|
else
|
81
120
|
parser.image # Atom contains just the url
|
82
121
|
end
|
data/lib/structures.rb
CHANGED
@@ -115,13 +115,55 @@ module FeedNormalizer
|
|
115
115
|
end
|
116
116
|
end
|
117
117
|
|
118
|
+
module TimeFix
|
119
|
+
# Reparse any Time instances, due to RSS::Parser's redefinition of
|
120
|
+
# certain aspects of the Time class that creates unexpected behaviour
|
121
|
+
# when extending the Time class, as some common third party libraries do.
|
122
|
+
# See http://code.google.com/p/feed-normalizer/issues/detail?id=13.
|
123
|
+
def reparse(obj)
|
124
|
+
@parsed ||= false
|
125
|
+
|
126
|
+
return obj if @parsed
|
127
|
+
|
128
|
+
if obj.is_a?(Time)
|
129
|
+
@parsed = true
|
130
|
+
Time.at(obj) rescue obj
|
131
|
+
end
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
module RewriteRelativeLinks
|
136
|
+
def rewrite_relative_links(text, url)
|
137
|
+
if host = url_host(url)
|
138
|
+
text.to_s.gsub(/(href|src)=('|")\//, '\1=\2http://' + host + '/')
|
139
|
+
else
|
140
|
+
text
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
private
|
145
|
+
def url_host(url)
|
146
|
+
URI.parse(url).host rescue nil
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
118
150
|
|
119
151
|
# Represents a feed item entry.
|
152
|
+
# Available fields are:
|
153
|
+
# * content
|
154
|
+
# * description
|
155
|
+
# * title
|
156
|
+
# * date_published
|
157
|
+
# * urls / url
|
158
|
+
# * id
|
159
|
+
# * authors / author
|
160
|
+
# * copyright
|
161
|
+
# * categories
|
120
162
|
class Entry
|
121
|
-
include Singular, ElementEquality, ElementCleaner
|
163
|
+
include Singular, ElementEquality, ElementCleaner, TimeFix, RewriteRelativeLinks
|
122
164
|
|
123
165
|
HTML_ELEMENTS = [:content, :description, :title]
|
124
|
-
SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories]
|
166
|
+
SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories, :last_updated]
|
125
167
|
BLENDED_ELEMENTS = []
|
126
168
|
|
127
169
|
ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
|
@@ -132,19 +174,41 @@ module FeedNormalizer
|
|
132
174
|
@urls = []
|
133
175
|
@authors = []
|
134
176
|
@categories = []
|
177
|
+
@date_published, @content = nil
|
178
|
+
end
|
179
|
+
|
180
|
+
undef date_published
|
181
|
+
def date_published
|
182
|
+
@date_published = reparse(@date_published)
|
183
|
+
end
|
184
|
+
|
185
|
+
undef content
|
186
|
+
def content
|
187
|
+
@content = rewrite_relative_links(@content, url)
|
135
188
|
end
|
136
189
|
|
137
190
|
end
|
138
191
|
|
139
192
|
# Represents the root element of a feed.
|
193
|
+
# Available fields are:
|
194
|
+
# * title
|
195
|
+
# * description
|
196
|
+
# * id
|
197
|
+
# * last_updated
|
198
|
+
# * copyright
|
199
|
+
# * authors / author
|
200
|
+
# * urls / url
|
201
|
+
# * image
|
202
|
+
# * generator
|
203
|
+
# * items / channel
|
140
204
|
class Feed
|
141
|
-
include Singular, ElementEquality, ElementCleaner
|
205
|
+
include Singular, ElementEquality, ElementCleaner, TimeFix
|
142
206
|
|
143
207
|
# Elements that can contain HTML fragments.
|
144
208
|
HTML_ELEMENTS = [:title, :description]
|
145
209
|
|
146
210
|
# Elements that contain 'plain' Strings, with HTML escaped.
|
147
|
-
SIMPLE_ELEMENTS = [:id, :last_updated, :copyright, :authors, :urls, :image, :generator]
|
211
|
+
SIMPLE_ELEMENTS = [:id, :last_updated, :copyright, :authors, :urls, :image, :generator, :ttl, :skip_hours, :skip_days]
|
148
212
|
|
149
213
|
# Elements that contain both HTML and escaped HTML.
|
150
214
|
BLENDED_ELEMENTS = [:items]
|
@@ -160,8 +224,16 @@ module FeedNormalizer
|
|
160
224
|
# set up associations (i.e. arrays where needed)
|
161
225
|
@urls = []
|
162
226
|
@authors = []
|
227
|
+
@skip_hours = []
|
228
|
+
@skip_days = []
|
163
229
|
@items = []
|
164
230
|
@parser = wrapper.parser.to_s
|
231
|
+
@last_updated = nil
|
232
|
+
end
|
233
|
+
|
234
|
+
undef last_updated
|
235
|
+
def last_updated
|
236
|
+
@last_updated = reparse(@last_updated)
|
165
237
|
end
|
166
238
|
|
167
239
|
def channel() self end
|
data/test/data/atom03.xml
CHANGED
@@ -24,7 +24,7 @@
|
|
24
24
|
Kmart has the Levi Strauss Signature Girl’s Low Rise Flare Jean for $10 after $5 instant savings (ends 9/2)
|
25
25
|
Slim fit through hip and thigh, with zip-fly with button-through closure. Machine washable 99% Cotton/1% Spandex
|
26
26
|
]]></summary>
|
27
|
-
<content type="text/html" mode="escaped" xml:base="http://www.cheapstingybargains.com/24557/levi-strauss-signature-girls-low-rise-slim-fit-flare-jeans-10/"><![CDATA[<p><a href="
|
27
|
+
<content type="text/html" mode="escaped" xml:base="http://www.cheapstingybargains.com/24557/levi-strauss-signature-girls-low-rise-slim-fit-flare-jeans-10/"><![CDATA[<p><a href="/link/tplclick?lid=41000000011334249&pubid=21000000000053626" target=_"blank"><img src="/assets/images/product/productDetail/9990000058546711.jpg" width="150" height="150" border="0" style="float: right; margin: 0px 0px 5px 5px;" /></a><br />
|
28
28
|
<strong>Kmart has the <a href="http://clickserve.cc-dt.com/link/tplclick?lid=41000000011334249&pubid=21000000000053626" target=_"blank">Levi Strauss Signature Girl’s Low Rise Flare Jean</a> for $10 after $5 instant savings (ends 9/2)</strong></p>
|
29
29
|
<p>Slim fit through hip and thigh, with zip-fly with button-through closure. Machine washable 99% Cotton/1% Spandex</p>
|
30
30
|
]]></content>
|
data/test/data/atom10.xml
CHANGED
@@ -17,8 +17,8 @@
|
|
17
17
|
<link href="http://habtm.com/articles/2006/08/16/a-forum-on-rails" rel="alternate" type="text/html"/>
|
18
18
|
<category term="rails" scheme="http://habtm.com/articles/category/rails" label="rails"/>
|
19
19
|
<category term="ruby" scheme="http://habtm.com/articles/category/ruby" label="ruby"/>
|
20
|
-
<summary type="html"><p>Josh Goebel and I took an evening to bang out a little project: <a href="http://beast.caboo.se/">Beast</a>. It&#8217;s our minimal no-fluff Rails forum. It&#8217;s no beast of an application either, clocking in at 285 LOC and a 1:1.5 test ratio. <a href="http://svn.techno-weenie.net/projects/beast/">Check it out</a>!</p></summary>
|
21
|
-
<content type="html"><p>Josh Goebel and I took an evening to bang out a little project: <a href="http://beast.caboo.se/">Beast</a>. It&#8217;s our minimal no-fluff Rails forum. It&#8217;s no beast of an application either, clocking in at 285 LOC and a 1:1.5 test ratio. <a href="http://svn.techno-weenie.net/projects/beast/">Check it out</a>!</p></content>
|
20
|
+
<summary type="html"><plaintext><p>Josh Goebel and I took an evening to bang out a little project: <a href="http://beast.caboo.se/">Beast</a>. It&#8217;s our minimal no-fluff Rails forum. It&#8217;s no beast of an application either, clocking in at 285 LOC and a 1:1.5 test ratio. <a href="http://svn.techno-weenie.net/projects/beast/">Check it out</a>!</p></summary>
|
21
|
+
<content type="html"><plaintext><p>Josh Goebel and I took an evening to bang out a little project: <a href="http://beast.caboo.se/">Beast</a>. It&#8217;s our minimal no-fluff Rails forum. It&#8217;s no beast of an application either, clocking in at 285 LOC and a 1:1.5 test ratio. <a href="http://svn.techno-weenie.net/projects/beast/">Check it out</a>!</p></content>
|
22
22
|
</entry>
|
23
23
|
<entry>
|
24
24
|
<author>
|
data/test/data/rss20.xml
CHANGED
@@ -10,6 +10,17 @@
|
|
10
10
|
<copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
|
11
11
|
<docs>http://www.bbc.co.uk/syndication/</docs>
|
12
12
|
<ttl>15</ttl>
|
13
|
+
<skipHours>
|
14
|
+
<hour>6</hour>
|
15
|
+
<hour>7</hour>
|
16
|
+
<hour>8</hour>
|
17
|
+
<hour>9</hour>
|
18
|
+
<hour>10</hour>
|
19
|
+
<hour>11</hour>
|
20
|
+
</skipHours>
|
21
|
+
<skipDays>
|
22
|
+
<day>Sunday</day>
|
23
|
+
</skipDays>
|
13
24
|
|
14
25
|
<image>
|
15
26
|
<title>BBC News</title>
|
@@ -19,7 +30,7 @@
|
|
19
30
|
|
20
31
|
<item>
|
21
32
|
<title>Concerns over security software</title>
|
22
|
-
<description
|
33
|
+
<description><![CDATA[BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.]]></description>
|
23
34
|
<content:encoded><![CDATA[<p>test1</p>]]></content:encoded>
|
24
35
|
<link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
|
25
36
|
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
|
@@ -40,7 +51,7 @@
|
|
40
51
|
|
41
52
|
<item>
|
42
53
|
<title>MP3 player court order overturned</title>
|
43
|
-
<description
|
54
|
+
<description><b>SanDisk</b> puts its MP3 players back on display at a German electronics show after overturning a court injunction.</description>
|
44
55
|
<content:encoded><![CDATA[<p>test3</p>]]></content:encoded>
|
45
56
|
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5326660.stm</link>
|
46
57
|
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5326660.stm</guid>
|
data/test/data/rss20diff.xml
CHANGED
@@ -10,6 +10,17 @@
|
|
10
10
|
<copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
|
11
11
|
<docs>http://www.bbc.co.uk/syndication/</docs>
|
12
12
|
<ttl>15</ttl>
|
13
|
+
<skipHours>
|
14
|
+
<hour>6</hour>
|
15
|
+
<hour>7</hour>
|
16
|
+
<hour>8</hour>
|
17
|
+
<hour>9</hour>
|
18
|
+
<hour>10</hour>
|
19
|
+
<hour>11</hour>
|
20
|
+
</skipHours>
|
21
|
+
<skipDays>
|
22
|
+
<day>Sunday</day>
|
23
|
+
</skipDays>
|
13
24
|
|
14
25
|
<image>
|
15
26
|
<title>BBC News</title>
|
data/test/data/rss20diff_short.xml
CHANGED
@@ -10,6 +10,17 @@
|
|
10
10
|
<copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
|
11
11
|
<docs>http://www.bbc.co.uk/syndication/</docs>
|
12
12
|
<ttl>15</ttl>
|
13
|
+
<skipHours>
|
14
|
+
<hour>6</hour>
|
15
|
+
<hour>7</hour>
|
16
|
+
<hour>8</hour>
|
17
|
+
<hour>9</hour>
|
18
|
+
<hour>10</hour>
|
19
|
+
<hour>11</hour>
|
20
|
+
</skipHours>
|
21
|
+
<skipDays>
|
22
|
+
<day>Sunday</day>
|
23
|
+
</skipDays>
|
13
24
|
|
14
25
|
<image>
|
15
26
|
<title>BBC News</title>
|
data/test/test_feednormalizer.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
require 'test/unit'
|
2
|
+
$:.unshift(File.dirname(__FILE__))
|
3
|
+
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
2
4
|
require 'feed-normalizer'
|
3
5
|
require 'yaml'
|
4
6
|
|
@@ -66,8 +68,11 @@ class FeedNormalizerTest < Test::Unit::TestCase
|
|
66
68
|
|
67
69
|
assert_equal "BBC News | Technology | UK Edition", feed.title
|
68
70
|
assert_equal ["http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm"], feed.urls
|
71
|
+
assert_equal 15, feed.ttl
|
72
|
+
assert_equal [6, 7, 8, 9, 10, 11], feed.skip_hours
|
73
|
+
assert_equal ["Sunday"], feed.skip_days
|
69
74
|
assert_equal "MP3 player court order overturned", feed.entries.last.title
|
70
|
-
assert_equal "SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.description
|
75
|
+
assert_equal "<b>SanDisk</b> puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.description
|
71
76
|
assert_match(/test\d/, feed.entries.last.content)
|
72
77
|
assert_instance_of Time, feed.entries.last.date_published
|
73
78
|
end
|
@@ -77,6 +82,9 @@ class FeedNormalizerTest < Test::Unit::TestCase
|
|
77
82
|
|
78
83
|
assert_equal "~:caboose", feed.title
|
79
84
|
assert_equal "http://habtm.com/xml/atom10/feed.xml", feed.url
|
85
|
+
assert_equal nil, feed.ttl
|
86
|
+
assert_equal [], feed.skip_hours
|
87
|
+
assert_equal [], feed.skip_days
|
80
88
|
assert_equal "Starfish - Easy Distribution of Site Maintenance", feed.entries.last.title
|
81
89
|
assert_equal "urn:uuid:6c028f36-f87a-4f53-b7e3-1f943d2341f0", feed.entries.last.id
|
82
90
|
|
@@ -134,9 +142,11 @@ class FeedNormalizerTest < Test::Unit::TestCase
|
|
134
142
|
def test_clean
|
135
143
|
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
|
136
144
|
|
137
|
-
|
145
|
+
assert_match(/<plaintext>/, feed.entries.first.content)
|
146
|
+
assert_match(/<plaintext>/, feed.entries.first.description)
|
138
147
|
feed.clean!
|
139
|
-
|
148
|
+
assert_no_match(/<plaintext>/, feed.entries.first.content)
|
149
|
+
assert_no_match(/<plaintext>/, feed.entries.first.description)
|
140
150
|
end
|
141
151
|
|
142
152
|
def test_malformed_feed
|
@@ -145,25 +155,21 @@ class FeedNormalizerTest < Test::Unit::TestCase
|
|
145
155
|
|
146
156
|
def test_dublin_core_date_ruby_rss
|
147
157
|
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser, :try_others => false)
|
148
|
-
assert_equal 'RSS::Parser', feed.parser
|
149
158
|
assert_instance_of Time, feed.entries.first.date_published
|
150
159
|
end
|
151
160
|
|
152
161
|
def test_dublin_core_date_simple_rss
|
153
162
|
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser, :try_others => false)
|
154
|
-
assert_equal 'SimpleRSS', feed.parser
|
155
163
|
assert_instance_of Time, feed.entries.first.date_published
|
156
164
|
end
|
157
165
|
|
158
166
|
def test_dublin_core_creator_ruby_rss
|
159
167
|
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser, :try_others => false)
|
160
|
-
assert_equal 'RSS::Parser', feed.parser
|
161
168
|
assert_equal 'Jeff Hecht', feed.entries.last.author
|
162
169
|
end
|
163
170
|
|
164
171
|
def test_dublin_core_creator_simple_rss
|
165
172
|
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser, :try_others => false)
|
166
|
-
assert_equal 'SimpleRSS', feed.parser
|
167
173
|
assert_equal 'Jeff Hecht', feed.entries.last.author
|
168
174
|
end
|
169
175
|
|
@@ -191,7 +197,7 @@ class FeedNormalizerTest < Test::Unit::TestCase
|
|
191
197
|
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false)
|
192
198
|
|
193
199
|
feed.entries.each_with_index do |e, i|
|
194
|
-
|
200
|
+
assert_equal("<p>test#{i+1}</p>", e.content)
|
195
201
|
end
|
196
202
|
end
|
197
203
|
|
@@ -199,9 +205,63 @@ class FeedNormalizerTest < Test::Unit::TestCase
|
|
199
205
|
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
|
200
206
|
|
201
207
|
feed.entries.each_with_index do |e, i|
|
202
|
-
|
208
|
+
assert_equal("<p>test#{i+1}</p>", e.content)
|
203
209
|
end
|
204
210
|
end
|
205
211
|
|
212
|
+
def test_atom_content_contains_pluses
|
213
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], :force_parser => SimpleRssParser, :try_others => false)
|
214
|
+
|
215
|
+
assert_equal 2, feed.entries.last.content.scan(/\+/).size
|
216
|
+
end
|
217
|
+
|
218
|
+
# http://code.google.com/p/feed-normalizer/issues/detail?id=13
|
219
|
+
def test_times_are_reparsed
|
220
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
|
221
|
+
|
222
|
+
Time.class_eval "alias :old_to_s :to_s; def to_s(x=1); old_to_s; end"
|
223
|
+
|
224
|
+
assert_equal "Sat Sep 09 07:57:06 -0700 2006", feed.last_updated.to_s(:foo)
|
225
|
+
assert_equal "Sat Sep 09 05:45:35 -0700 2006", feed.entries.first.date_published.to_s(:foo)
|
226
|
+
end
|
227
|
+
|
228
|
+
def test_atom03_has_issued
|
229
|
+
SimpleRSS.class_eval "@@item_tags.delete(:issued)"
|
230
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03], :force_parser => SimpleRssParser, :try_others => false)
|
231
|
+
assert_nil feed.entries.first.date_published
|
232
|
+
|
233
|
+
SimpleRSS.class_eval "@@item_tags << :issued"
|
234
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03], :force_parser => SimpleRssParser, :try_others => false)
|
235
|
+
assert_equal "Tue Aug 29 02:31:03 UTC 2006", feed.entries.first.date_published.to_s
|
236
|
+
end
|
237
|
+
|
238
|
+
def test_html_should_be_escaped_by_default
|
239
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
|
240
|
+
assert_match "<b>SanDisk</b>", feed.items.last.description
|
241
|
+
|
242
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false)
|
243
|
+
assert_match "<b>SanDisk</b>", feed.items.last.description
|
244
|
+
end
|
245
|
+
|
246
|
+
def test_relative_links_and_images_should_be_rewritten_with_url_base
|
247
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03])
|
248
|
+
assert_match '<a href="http://www.cheapstingybargains.com/link/tplclick?lid=41000000011334249&pubid=21000000000053626"' +
|
249
|
+
' target=_"blank"><img src="http://www.cheapstingybargains.com/assets/images/product/productDetail/9990000058546711.jpg"' +
|
250
|
+
' width="150" height="150" border="0" style="float: right; margin: 0px 0px 5px 5px;" /></a>',
|
251
|
+
feed.items.first.content
|
252
|
+
end
|
253
|
+
|
254
|
+
def test_last_updated_simple_rss
|
255
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], :force_parser => SimpleRssParser, :try_others => false)
|
256
|
+
|
257
|
+
assert_equal Time.parse("Wed Aug 16 09:59:44 -0700 2006"), feed.entries.first.last_updated
|
258
|
+
end
|
259
|
+
|
260
|
+
def test_last_updated_ruby_rss
|
261
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
|
262
|
+
|
263
|
+
assert_equal feed.entries.first.date_published, feed.entries.first.last_updated
|
264
|
+
end
|
265
|
+
|
206
266
|
end
|
207
267
|
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.
|
2
|
+
rubygems_version: 0.9.4
|
3
3
|
specification_version: 1
|
4
4
|
name: feed-normalizer
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 1.4.0
|
7
|
-
date:
|
6
|
+
version: 1.5.0
|
7
|
+
date: 2008-02-05 00:00:00 -08:00
|
8
8
|
summary: Extensible Ruby wrapper for Atom and RSS parsers
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -90,5 +90,5 @@ dependencies:
|
|
90
90
|
requirements:
|
91
91
|
- - ">="
|
92
92
|
- !ruby/object:Gem::Version
|
93
|
-
version: 1.
|
93
|
+
version: 1.5.0
|
94
94
|
version:
|