feed-normalizer 1.4.0 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,13 @@
1
+ 1.5.0
2
+ * Add support for new fields:
3
+ * Atom 0.3: issued is now available through entry.date_published.
4
+ * RSS: feed.skip_hours, feed.skip_days, feed.ttl [joshpeek]
5
+ * All: entry.last_updated, this is an alias to entry.date_published for RSS.
6
+ * Rewrite relative links in content [joshpeek]
7
+ * Handle CDATA sections consistently across all formats. [sam.lown]
8
+ * Prevent SimpleRSS from doing its own escaping. [reported by: paul.stadig, lionel.bouton]
9
+ * Reparse Time classes [reported by: sam.lown]
10
+
1
11
  1.4.0
2
12
 
3
13
  * Support content:encoded. Accessible via Entry#content.
data/Rakefile CHANGED
@@ -1,11 +1,11 @@
1
1
  require 'hoe'
2
2
 
3
- Hoe.new("feed-normalizer", "1.4.0") do |s|
3
+ Hoe.new("feed-normalizer", "1.5.0") do |s|
4
4
  s.author = "Andrew A. Smith"
5
5
  s.email = "andy@tinnedfruit.org"
6
6
  s.url = "http://feed-normalizer.rubyforge.org/"
7
7
  s.summary = "Extensible Ruby wrapper for Atom and RSS parsers"
8
- s.description = s.paragraphs_of('Readme.txt', 1..2).join("\n\n")
8
+ s.description = s.paragraphs_of('README.txt', 1..2).join("\n\n")
9
9
  s.changes = s.paragraphs_of('History.txt', 0..1).join("\n\n")
10
10
  s.extra_deps << ["simple-rss", ">= 1.1"]
11
11
  s.extra_deps << ["hpricot", ">= 0.6"]
data/lib/html-cleaner.rb CHANGED
@@ -1,3 +1,4 @@
1
+ require 'rubygems'
1
2
  require 'hpricot'
2
3
  require 'cgi'
3
4
 
@@ -99,10 +100,10 @@ module FeedNormalizer
99
100
  doc = Hpricot(str, :xhtml_strict => true)
100
101
  doc = subtree(doc, :body)
101
102
 
102
- out = ""
103
+ out = []
103
104
  doc.traverse_text {|t| out << add_entities(t.to_html)}
104
105
 
105
- return out
106
+ return out.join
106
107
  end
107
108
 
108
109
  # Returns true if the given string contains a suspicious URL,
data/lib/parsers/rss.rb CHANGED
@@ -42,7 +42,8 @@ module FeedNormalizer
42
42
  :copyright => :copyright,
43
43
  :authors => :managingEditor,
44
44
  :last_updated => [:lastBuildDate, :pubDate, :dc_date],
45
- :id => :guid
45
+ :id => :guid,
46
+ :ttl => :ttl
46
47
  }
47
48
 
48
49
  # make two passes, to catch all possible root elements
@@ -51,6 +52,8 @@ module FeedNormalizer
51
52
 
52
53
  # custom channel elements
53
54
  feed.image = rss.image ? rss.image.url : nil
55
+ feed.skip_hours = skip(rss, :skipHours)
56
+ feed.skip_days = skip(rss, :skipDays)
54
57
 
55
58
  # item elements
56
59
  item_mapping = {
@@ -59,7 +62,8 @@ module FeedNormalizer
59
62
  :description => :description,
60
63
  :content => [:content_encoded, :description],
61
64
  :title => :title,
62
- :authors => [:author, :dc_creator]
65
+ :authors => [:author, :dc_creator],
66
+ :last_updated => [:pubDate, :dc_date] # This is effectively an alias for date_published for this parser.
63
67
  }
64
68
 
65
69
  rss.items.each do |rss_item|
@@ -79,6 +83,14 @@ module FeedNormalizer
79
83
  feed
80
84
  end
81
85
 
86
+ def self.skip(parser, attribute)
87
+ attributes = case attribute
88
+ when :skipHours: :hours
89
+ when :skipDays: :days
90
+ end
91
+ channel = parser.channel
92
+ channel.respond_to?(attribute) && channel.send(attribute).send(attributes).map { |e| e.content }
93
+ end
94
+
82
95
  end
83
96
  end
84
-
@@ -1,5 +1,42 @@
1
1
  require 'simple-rss'
2
2
 
3
+ # Monkey patches for outstanding issues logged in the simple-rss project.
4
+ # * Add support for issued time field:
5
+ # http://rubyforge.org/tracker/index.php?func=detail&aid=13980&group_id=893&atid=3517
6
+ # * The '+' symbol is lost when escaping fields.
7
+ # http://rubyforge.org/tracker/index.php?func=detail&aid=10852&group_id=893&atid=3517
8
+ #
9
+ class SimpleRSS
10
+ @@item_tags << :issued
11
+
12
+ undef clean_content
13
+ def clean_content(tag, attrs, content)
14
+ content = content.to_s
15
+ case tag
16
+ when :pubDate, :lastBuildDate, :published, :updated, :expirationDate, :modified, :'dc:date', :issued
17
+ Time.parse(content) rescue unescape(content)
18
+ when :author, :contributor, :skipHours, :skipDays
19
+ unescape(content.gsub(/<.*?>/,''))
20
+ else
21
+ content.empty? && "#{attrs} " =~ /href=['"]?([^'"]*)['" ]/mi ? $1.strip : unescape(content)
22
+ end
23
+ end
24
+
25
+ undef unescape
26
+ def unescape(s)
27
+ if s =~ /^(<!\[CDATA\[|\]\]>)/
28
+ # Raw HTML is inside the CDATA, so just remove the CDATA wrapper.
29
+ s.gsub(/(<!\[CDATA\[|\]\]>)/,'').strip
30
+ elsif s =~ /[<>]/
31
+ # Already looks like HTML.
32
+ s
33
+ else
34
+ # Make it HTML.
35
+ FeedNormalizer::HtmlCleaner.unescapeHTML(s)
36
+ end
37
+ end
38
+ end
39
+
3
40
  module FeedNormalizer
4
41
 
5
42
  # The SimpleRSS parser can handle both RSS and Atom feeds.
@@ -38,7 +75,8 @@ module FeedNormalizer
38
75
  :copyright => [:copyright, :rights],
39
76
  :authors => [:author, :webMaster, :managingEditor, :contributor],
40
77
  :urls => :link,
41
- :description => [:description, :subtitle]
78
+ :description => [:description, :subtitle],
79
+ :ttl => :ttl
42
80
  }
43
81
 
44
82
  map_functions!(feed_mapping, atomrss, feed)
@@ -50,13 +88,14 @@ module FeedNormalizer
50
88
 
51
89
  # entry elements
52
90
  entry_mapping = {
53
- :date_published => [:pubDate, :published, :dc_date],
91
+ :date_published => [:pubDate, :published, :dc_date, :issued],
54
92
  :urls => :link,
55
93
  :description => [:description, :summary],
56
94
  :content => [:content, :content_encoded, :description],
57
95
  :title => :title,
58
96
  :authors => [:author, :contributor, :dc_creator],
59
- :categories => :category
97
+ :categories => :category,
98
+ :last_updated => [:updated, :dc_date, :pubDate]
60
99
  }
61
100
 
62
101
  atomrss.entries.each do |atomrss_entry|
@@ -76,7 +115,7 @@ module FeedNormalizer
76
115
  def self.image(parser)
77
116
  if parser.respond_to?(:image) && parser.image
78
117
  if parser.image =~ /<url>/ # RSS image contains an <url> spec
79
- parser.image.scan(/<url>(.*)<\/url>/).to_s
118
+ parser.image.scan(/<url>(.*?)<\/url>/).to_s
80
119
  else
81
120
  parser.image # Atom contains just the url
82
121
  end
data/lib/structures.rb CHANGED
@@ -115,13 +115,55 @@ module FeedNormalizer
115
115
  end
116
116
  end
117
117
 
118
+ module TimeFix
119
+ # Reparse any Time instances, due to RSS::Parser's redefinition of
120
+ # certain aspects of the Time class that creates unexpected behaviour
121
+ # when extending the Time class, as some common third party libraries do.
122
+ # See http://code.google.com/p/feed-normalizer/issues/detail?id=13.
123
+ def reparse(obj)
124
+ @parsed ||= false
125
+
126
+ return obj if @parsed
127
+
128
+ if obj.is_a?(Time)
129
+ @parsed = true
130
+ Time.at(obj) rescue obj
131
+ end
132
+ end
133
+ end
134
+
135
+ module RewriteRelativeLinks
136
+ def rewrite_relative_links(text, url)
137
+ if host = url_host(url)
138
+ text.to_s.gsub(/(href|src)=('|")\//, '\1=\2http://' + host + '/')
139
+ else
140
+ text
141
+ end
142
+ end
143
+
144
+ private
145
+ def url_host(url)
146
+ URI.parse(url).host rescue nil
147
+ end
148
+ end
149
+
118
150
 
119
151
  # Represents a feed item entry.
152
+ # Available fields are:
153
+ # * content
154
+ # * description
155
+ # * title
156
+ # * date_published
157
+ # * urls / url
158
+ # * id
159
+ # * authors / author
160
+ # * copyright
161
+ # * categories
120
162
  class Entry
121
- include Singular, ElementEquality, ElementCleaner
163
+ include Singular, ElementEquality, ElementCleaner, TimeFix, RewriteRelativeLinks
122
164
 
123
165
  HTML_ELEMENTS = [:content, :description, :title]
124
- SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories]
166
+ SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories, :last_updated]
125
167
  BLENDED_ELEMENTS = []
126
168
 
127
169
  ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
@@ -132,19 +174,41 @@ module FeedNormalizer
132
174
  @urls = []
133
175
  @authors = []
134
176
  @categories = []
177
+ @date_published, @content = nil
178
+ end
179
+
180
+ undef date_published
181
+ def date_published
182
+ @date_published = reparse(@date_published)
183
+ end
184
+
185
+ undef content
186
+ def content
187
+ @content = rewrite_relative_links(@content, url)
135
188
  end
136
189
 
137
190
  end
138
191
 
139
192
  # Represents the root element of a feed.
193
+ # Available fields are:
194
+ # * title
195
+ # * description
196
+ # * id
197
+ # * last_updated
198
+ # * copyright
199
+ # * authors / author
200
+ # * urls / url
201
+ # * image
202
+ # * generator
203
+ # * items / channel
140
204
  class Feed
141
- include Singular, ElementEquality, ElementCleaner
205
+ include Singular, ElementEquality, ElementCleaner, TimeFix
142
206
 
143
207
  # Elements that can contain HTML fragments.
144
208
  HTML_ELEMENTS = [:title, :description]
145
209
 
146
210
  # Elements that contain 'plain' Strings, with HTML escaped.
147
- SIMPLE_ELEMENTS = [:id, :last_updated, :copyright, :authors, :urls, :image, :generator]
211
+ SIMPLE_ELEMENTS = [:id, :last_updated, :copyright, :authors, :urls, :image, :generator, :ttl, :skip_hours, :skip_days]
148
212
 
149
213
  # Elements that contain both HTML and escaped HTML.
150
214
  BLENDED_ELEMENTS = [:items]
@@ -160,8 +224,16 @@ module FeedNormalizer
160
224
  # set up associations (i.e. arrays where needed)
161
225
  @urls = []
162
226
  @authors = []
227
+ @skip_hours = []
228
+ @skip_days = []
163
229
  @items = []
164
230
  @parser = wrapper.parser.to_s
231
+ @last_updated = nil
232
+ end
233
+
234
+ undef last_updated
235
+ def last_updated
236
+ @last_updated = reparse(@last_updated)
165
237
  end
166
238
 
167
239
  def channel() self end
data/test/data/atom03.xml CHANGED
@@ -24,7 +24,7 @@
24
24
  Kmart has the Levi Strauss Signature Girl&#8217;s Low Rise Flare Jean for $10 after $5 instant savings (ends 9/2)
25
25
  Slim fit through hip and thigh, with zip-fly with button-through closure. Machine washable 99% Cotton/1% Spandex
26
26
  ]]></summary>
27
- <content type="text/html" mode="escaped" xml:base="http://www.cheapstingybargains.com/24557/levi-strauss-signature-girls-low-rise-slim-fit-flare-jeans-10/"><![CDATA[<p><a href="http://clickserve.cc-dt.com/link/tplclick?lid=41000000011334249&#038;pubid=21000000000053626" target=_"blank"><img src="http://images.kmart.com/assets/images/product/productDetail/9990000058546711.jpg" width="150" height="150" border="0" style="float: right; margin: 0px 0px 5px 5px;" /></a><br />
27
+ <content type="text/html" mode="escaped" xml:base="http://www.cheapstingybargains.com/24557/levi-strauss-signature-girls-low-rise-slim-fit-flare-jeans-10/"><![CDATA[<p><a href="/link/tplclick?lid=41000000011334249&#038;pubid=21000000000053626" target=_"blank"><img src="/assets/images/product/productDetail/9990000058546711.jpg" width="150" height="150" border="0" style="float: right; margin: 0px 0px 5px 5px;" /></a><br />
28
28
  <strong>Kmart has the <a href="http://clickserve.cc-dt.com/link/tplclick?lid=41000000011334249&#038;pubid=21000000000053626" target=_"blank">Levi Strauss Signature Girl&#8217;s Low Rise Flare Jean</a> for $10 after $5 instant savings (ends 9/2)</strong></p>
29
29
  <p>Slim fit through hip and thigh, with zip-fly with button-through closure. Machine washable 99% Cotton/1% Spandex</p>
30
30
  ]]></content>
data/test/data/atom10.xml CHANGED
@@ -17,8 +17,8 @@
17
17
  <link href="http://habtm.com/articles/2006/08/16/a-forum-on-rails" rel="alternate" type="text/html"/>
18
18
  <category term="rails" scheme="http://habtm.com/articles/category/rails" label="rails"/>
19
19
  <category term="ruby" scheme="http://habtm.com/articles/category/ruby" label="ruby"/>
20
- <summary type="html">&lt;p&gt;Josh Goebel and I took an evening to bang out a little project: &lt;a href="http://beast.caboo.se/"&gt;Beast&lt;/a&gt;. It&amp;#8217;s our minimal no-fluff Rails forum. It&amp;#8217;s no beast of an application either, clocking in at 285 LOC and a 1:1.5 test ratio. &lt;a href="http://svn.techno-weenie.net/projects/beast/"&gt;Check it out&lt;/a&gt;!&lt;/p&gt;</summary>
21
- <content type="html">&lt;p&gt;Josh Goebel and I took an evening to bang out a little project: &lt;a href="http://beast.caboo.se/"&gt;Beast&lt;/a&gt;. It&amp;#8217;s our minimal no-fluff Rails forum. It&amp;#8217;s no beast of an application either, clocking in at 285 LOC and a 1:1.5 test ratio. &lt;a href="http://svn.techno-weenie.net/projects/beast/"&gt;Check it out&lt;/a&gt;!&lt;/p&gt;</content>
20
+ <summary type="html">&lt;plaintext&gt;&lt;p&gt;Josh Goebel and I took an evening to bang out a little project: &lt;a href="http://beast.caboo.se/"&gt;Beast&lt;/a&gt;. It&amp;#8217;s our minimal no-fluff Rails forum. It&amp;#8217;s no beast of an application either, clocking in at 285 LOC and a 1:1.5 test ratio. &lt;a href="http://svn.techno-weenie.net/projects/beast/"&gt;Check it out&lt;/a&gt;!&lt;/p&gt;</summary>
21
+ <content type="html">&lt;plaintext&gt;&lt;p&gt;Josh Goebel and I took an evening to bang out a little project: &lt;a href="http://beast.caboo.se/"&gt;Beast&lt;/a&gt;. It&amp;#8217;s our minimal no-fluff Rails forum. It&amp;#8217;s no beast of an application either, clocking in at 285 LOC and a 1:1.5 test ratio. &lt;a href="http://svn.techno-weenie.net/projects/beast/"&gt;Check it out&lt;/a&gt;!&lt;/p&gt;</content>
22
22
  </entry>
23
23
  <entry>
24
24
  <author>
data/test/data/rss20.xml CHANGED
@@ -10,6 +10,17 @@
10
10
  <copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
11
11
  <docs>http://www.bbc.co.uk/syndication/</docs>
12
12
  <ttl>15</ttl>
13
+ <skipHours>
14
+ <hour>6</hour>
15
+ <hour>7</hour>
16
+ <hour>8</hour>
17
+ <hour>9</hour>
18
+ <hour>10</hour>
19
+ <hour>11</hour>
20
+ </skipHours>
21
+ <skipDays>
22
+ <day>Sunday</day>
23
+ </skipDays>
13
24
 
14
25
  <image>
15
26
  <title>BBC News</title>
@@ -19,7 +30,7 @@
19
30
 
20
31
  <item>
21
32
  <title>Concerns over security software</title>
22
- <description>BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.</description>
33
+ <description><![CDATA[BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.]]></description>
23
34
  <content:encoded><![CDATA[<p>test1</p>]]></content:encoded>
24
35
  <link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
25
36
  <guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
@@ -40,7 +51,7 @@
40
51
 
41
52
  <item>
42
53
  <title>MP3 player court order overturned</title>
43
- <description>SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.</description>
54
+ <description>&lt;b&gt;SanDisk&lt;/b&gt; puts its MP3 players back on display at a German electronics show after overturning a court injunction.</description>
44
55
  <content:encoded><![CDATA[<p>test3</p>]]></content:encoded>
45
56
  <link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5326660.stm</link>
46
57
  <guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5326660.stm</guid>
@@ -10,6 +10,17 @@
10
10
  <copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
11
11
  <docs>http://www.bbc.co.uk/syndication/</docs>
12
12
  <ttl>15</ttl>
13
+ <skipHours>
14
+ <hour>6</hour>
15
+ <hour>7</hour>
16
+ <hour>8</hour>
17
+ <hour>9</hour>
18
+ <hour>10</hour>
19
+ <hour>11</hour>
20
+ </skipHours>
21
+ <skipDays>
22
+ <day>Sunday</day>
23
+ </skipDays>
13
24
 
14
25
  <image>
15
26
  <title>BBC News</title>
@@ -10,6 +10,17 @@
10
10
  <copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
11
11
  <docs>http://www.bbc.co.uk/syndication/</docs>
12
12
  <ttl>15</ttl>
13
+ <skipHours>
14
+ <hour>6</hour>
15
+ <hour>7</hour>
16
+ <hour>8</hour>
17
+ <hour>9</hour>
18
+ <hour>10</hour>
19
+ <hour>11</hour>
20
+ </skipHours>
21
+ <skipDays>
22
+ <day>Sunday</day>
23
+ </skipDays>
13
24
 
14
25
  <image>
15
26
  <title>BBC News</title>
@@ -1,4 +1,6 @@
1
1
  require 'test/unit'
2
+ $:.unshift(File.dirname(__FILE__))
3
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
2
4
  require 'feed-normalizer'
3
5
  require 'yaml'
4
6
 
@@ -66,8 +68,11 @@ class FeedNormalizerTest < Test::Unit::TestCase
66
68
 
67
69
  assert_equal "BBC News | Technology | UK Edition", feed.title
68
70
  assert_equal ["http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm"], feed.urls
71
+ assert_equal 15, feed.ttl
72
+ assert_equal [6, 7, 8, 9, 10, 11], feed.skip_hours
73
+ assert_equal ["Sunday"], feed.skip_days
69
74
  assert_equal "MP3 player court order overturned", feed.entries.last.title
70
- assert_equal "SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.description
75
+ assert_equal "<b>SanDisk</b> puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.description
71
76
  assert_match(/test\d/, feed.entries.last.content)
72
77
  assert_instance_of Time, feed.entries.last.date_published
73
78
  end
@@ -77,6 +82,9 @@ class FeedNormalizerTest < Test::Unit::TestCase
77
82
 
78
83
  assert_equal "~:caboose", feed.title
79
84
  assert_equal "http://habtm.com/xml/atom10/feed.xml", feed.url
85
+ assert_equal nil, feed.ttl
86
+ assert_equal [], feed.skip_hours
87
+ assert_equal [], feed.skip_days
80
88
  assert_equal "Starfish - Easy Distribution of Site Maintenance", feed.entries.last.title
81
89
  assert_equal "urn:uuid:6c028f36-f87a-4f53-b7e3-1f943d2341f0", feed.entries.last.id
82
90
 
@@ -134,9 +142,11 @@ class FeedNormalizerTest < Test::Unit::TestCase
134
142
  def test_clean
135
143
  feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
136
144
 
137
- assert feed.entries.first.content !~ /\<p\>/
145
+ assert_match(/<plaintext>/, feed.entries.first.content)
146
+ assert_match(/<plaintext>/, feed.entries.first.description)
138
147
  feed.clean!
139
- assert feed.entries.first.content =~ /\<p\>/
148
+ assert_no_match(/<plaintext>/, feed.entries.first.content)
149
+ assert_no_match(/<plaintext>/, feed.entries.first.description)
140
150
  end
141
151
 
142
152
  def test_malformed_feed
@@ -145,25 +155,21 @@ class FeedNormalizerTest < Test::Unit::TestCase
145
155
 
146
156
  def test_dublin_core_date_ruby_rss
147
157
  feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser, :try_others => false)
148
- assert_equal 'RSS::Parser', feed.parser
149
158
  assert_instance_of Time, feed.entries.first.date_published
150
159
  end
151
160
 
152
161
  def test_dublin_core_date_simple_rss
153
162
  feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser, :try_others => false)
154
- assert_equal 'SimpleRSS', feed.parser
155
163
  assert_instance_of Time, feed.entries.first.date_published
156
164
  end
157
165
 
158
166
  def test_dublin_core_creator_ruby_rss
159
167
  feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser, :try_others => false)
160
- assert_equal 'RSS::Parser', feed.parser
161
168
  assert_equal 'Jeff Hecht', feed.entries.last.author
162
169
  end
163
170
 
164
171
  def test_dublin_core_creator_simple_rss
165
172
  feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser, :try_others => false)
166
- assert_equal 'SimpleRSS', feed.parser
167
173
  assert_equal 'Jeff Hecht', feed.entries.last.author
168
174
  end
169
175
 
@@ -191,7 +197,7 @@ class FeedNormalizerTest < Test::Unit::TestCase
191
197
  feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false)
192
198
 
193
199
  feed.entries.each_with_index do |e, i|
194
- assert_match(/test#{i+1}/, e.content)
200
+ assert_equal("<p>test#{i+1}</p>", e.content)
195
201
  end
196
202
  end
197
203
 
@@ -199,9 +205,63 @@ class FeedNormalizerTest < Test::Unit::TestCase
199
205
  feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
200
206
 
201
207
  feed.entries.each_with_index do |e, i|
202
- assert_match(/test#{i+1}/, e.content)
208
+ assert_equal("<p>test#{i+1}</p>", e.content)
203
209
  end
204
210
  end
205
211
 
212
+ def test_atom_content_contains_pluses
213
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], :force_parser => SimpleRssParser, :try_others => false)
214
+
215
+ assert_equal 2, feed.entries.last.content.scan(/\+/).size
216
+ end
217
+
218
+ # http://code.google.com/p/feed-normalizer/issues/detail?id=13
219
+ def test_times_are_reparsed
220
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
221
+
222
+ Time.class_eval "alias :old_to_s :to_s; def to_s(x=1); old_to_s; end"
223
+
224
+ assert_equal "Sat Sep 09 07:57:06 -0700 2006", feed.last_updated.to_s(:foo)
225
+ assert_equal "Sat Sep 09 05:45:35 -0700 2006", feed.entries.first.date_published.to_s(:foo)
226
+ end
227
+
228
+ def test_atom03_has_issued
229
+ SimpleRSS.class_eval "@@item_tags.delete(:issued)"
230
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03], :force_parser => SimpleRssParser, :try_others => false)
231
+ assert_nil feed.entries.first.date_published
232
+
233
+ SimpleRSS.class_eval "@@item_tags << :issued"
234
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03], :force_parser => SimpleRssParser, :try_others => false)
235
+ assert_equal "Tue Aug 29 02:31:03 UTC 2006", feed.entries.first.date_published.to_s
236
+ end
237
+
238
+ def test_html_should_be_escaped_by_default
239
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
240
+ assert_match "<b>SanDisk</b>", feed.items.last.description
241
+
242
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false)
243
+ assert_match "<b>SanDisk</b>", feed.items.last.description
244
+ end
245
+
246
+ def test_relative_links_and_images_should_be_rewritten_with_url_base
247
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03])
248
+ assert_match '<a href="http://www.cheapstingybargains.com/link/tplclick?lid=41000000011334249&#038;pubid=21000000000053626"' +
249
+ ' target=_"blank"><img src="http://www.cheapstingybargains.com/assets/images/product/productDetail/9990000058546711.jpg"' +
250
+ ' width="150" height="150" border="0" style="float: right; margin: 0px 0px 5px 5px;" /></a>',
251
+ feed.items.first.content
252
+ end
253
+
254
+ def test_last_updated_simple_rss
255
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], :force_parser => SimpleRssParser, :try_others => false)
256
+
257
+ assert_equal Time.parse("Wed Aug 16 09:59:44 -0700 2006"), feed.entries.first.last_updated
258
+ end
259
+
260
+ def test_last_updated_ruby_rss
261
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
262
+
263
+ assert_equal feed.entries.first.date_published, feed.entries.first.last_updated
264
+ end
265
+
206
266
  end
207
267
 
metadata CHANGED
@@ -1,10 +1,10 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.2
2
+ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: feed-normalizer
5
5
  version: !ruby/object:Gem::Version
6
- version: 1.4.0
7
- date: 2007-07-10 00:00:00 -07:00
6
+ version: 1.5.0
7
+ date: 2008-02-05 00:00:00 -08:00
8
8
  summary: Extensible Ruby wrapper for Atom and RSS parsers
9
9
  require_paths:
10
10
  - lib
@@ -90,5 +90,5 @@ dependencies:
90
90
  requirements:
91
91
  - - ">="
92
92
  - !ruby/object:Gem::Version
93
- version: 1.2.1
93
+ version: 1.5.0
94
94
  version: