feed-normalizer 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,13 @@
1
+ 1.5.0
2
+ * Add support for new fields:
3
+ * Atom 0.3: issued is now available through entry.date_published.
4
+ * RSS: feed.skip_hours, feed.skip_days, feed.ttl [joshpeek]
5
+ * All: entry.last_updated, this is an alias to entry.date_published for RSS.
6
+ * Rewrite relative links in content [joshpeek]
7
+ * Handle CDATA sections consistently across all formats. [sam.lown]
8
+ * Prevent SimpleRSS from doing its own escaping. [reported by: paul.stadig, lionel.bouton]
9
+ * Reparse Time classes [reported by: sam.lown]
10
+
1
11
  1.4.0
2
12
 
3
13
  * Support content:encoded. Accessible via Entry#content.
data/Rakefile CHANGED
@@ -1,11 +1,11 @@
1
1
  require 'hoe'
2
2
 
3
- Hoe.new("feed-normalizer", "1.4.0") do |s|
3
+ Hoe.new("feed-normalizer", "1.5.0") do |s|
4
4
  s.author = "Andrew A. Smith"
5
5
  s.email = "andy@tinnedfruit.org"
6
6
  s.url = "http://feed-normalizer.rubyforge.org/"
7
7
  s.summary = "Extensible Ruby wrapper for Atom and RSS parsers"
8
- s.description = s.paragraphs_of('Readme.txt', 1..2).join("\n\n")
8
+ s.description = s.paragraphs_of('README.txt', 1..2).join("\n\n")
9
9
  s.changes = s.paragraphs_of('History.txt', 0..1).join("\n\n")
10
10
  s.extra_deps << ["simple-rss", ">= 1.1"]
11
11
  s.extra_deps << ["hpricot", ">= 0.6"]
data/lib/html-cleaner.rb CHANGED
@@ -1,3 +1,4 @@
1
+ require 'rubygems'
1
2
  require 'hpricot'
2
3
  require 'cgi'
3
4
 
@@ -99,10 +100,10 @@ module FeedNormalizer
99
100
  doc = Hpricot(str, :xhtml_strict => true)
100
101
  doc = subtree(doc, :body)
101
102
 
102
- out = ""
103
+ out = []
103
104
  doc.traverse_text {|t| out << add_entities(t.to_html)}
104
105
 
105
- return out
106
+ return out.join
106
107
  end
107
108
 
108
109
  # Returns true if the given string contains a suspicious URL,
data/lib/parsers/rss.rb CHANGED
@@ -42,7 +42,8 @@ module FeedNormalizer
42
42
  :copyright => :copyright,
43
43
  :authors => :managingEditor,
44
44
  :last_updated => [:lastBuildDate, :pubDate, :dc_date],
45
- :id => :guid
45
+ :id => :guid,
46
+ :ttl => :ttl
46
47
  }
47
48
 
48
49
  # make two passes, to catch all possible root elements
@@ -51,6 +52,8 @@ module FeedNormalizer
51
52
 
52
53
  # custom channel elements
53
54
  feed.image = rss.image ? rss.image.url : nil
55
+ feed.skip_hours = skip(rss, :skipHours)
56
+ feed.skip_days = skip(rss, :skipDays)
54
57
 
55
58
  # item elements
56
59
  item_mapping = {
@@ -59,7 +62,8 @@ module FeedNormalizer
59
62
  :description => :description,
60
63
  :content => [:content_encoded, :description],
61
64
  :title => :title,
62
- :authors => [:author, :dc_creator]
65
+ :authors => [:author, :dc_creator],
66
+ :last_updated => [:pubDate, :dc_date] # This is effectively an alias for date_published for this parser.
63
67
  }
64
68
 
65
69
  rss.items.each do |rss_item|
@@ -79,6 +83,14 @@ module FeedNormalizer
79
83
  feed
80
84
  end
81
85
 
86
+ def self.skip(parser, attribute)
87
+ attributes = case attribute
88
+ when :skipHours: :hours
89
+ when :skipDays: :days
90
+ end
91
+ channel = parser.channel
92
+ channel.respond_to?(attribute) && channel.send(attribute).send(attributes).map { |e| e.content }
93
+ end
94
+
82
95
  end
83
96
  end
84
-
@@ -1,5 +1,42 @@
1
1
  require 'simple-rss'
2
2
 
3
+ # Monkey patches for outstanding issues logged in the simple-rss project.
4
+ # * Add support for issued time field:
5
+ # http://rubyforge.org/tracker/index.php?func=detail&aid=13980&group_id=893&atid=3517
6
+ # * The '+' symbol is lost when escaping fields.
7
+ # http://rubyforge.org/tracker/index.php?func=detail&aid=10852&group_id=893&atid=3517
8
+ #
9
+ class SimpleRSS
10
+ @@item_tags << :issued
11
+
12
+ undef clean_content
13
+ def clean_content(tag, attrs, content)
14
+ content = content.to_s
15
+ case tag
16
+ when :pubDate, :lastBuildDate, :published, :updated, :expirationDate, :modified, :'dc:date', :issued
17
+ Time.parse(content) rescue unescape(content)
18
+ when :author, :contributor, :skipHours, :skipDays
19
+ unescape(content.gsub(/<.*?>/,''))
20
+ else
21
+ content.empty? && "#{attrs} " =~ /href=['"]?([^'"]*)['" ]/mi ? $1.strip : unescape(content)
22
+ end
23
+ end
24
+
25
+ undef unescape
26
+ def unescape(s)
27
+ if s =~ /^(<!\[CDATA\[|\]\]>)/
28
+ # Raw HTML is inside the CDATA, so just remove the CDATA wrapper.
29
+ s.gsub(/(<!\[CDATA\[|\]\]>)/,'').strip
30
+ elsif s =~ /[<>]/
31
+ # Already looks like HTML.
32
+ s
33
+ else
34
+ # Make it HTML.
35
+ FeedNormalizer::HtmlCleaner.unescapeHTML(s)
36
+ end
37
+ end
38
+ end
39
+
3
40
  module FeedNormalizer
4
41
 
5
42
  # The SimpleRSS parser can handle both RSS and Atom feeds.
@@ -38,7 +75,8 @@ module FeedNormalizer
38
75
  :copyright => [:copyright, :rights],
39
76
  :authors => [:author, :webMaster, :managingEditor, :contributor],
40
77
  :urls => :link,
41
- :description => [:description, :subtitle]
78
+ :description => [:description, :subtitle],
79
+ :ttl => :ttl
42
80
  }
43
81
 
44
82
  map_functions!(feed_mapping, atomrss, feed)
@@ -50,13 +88,14 @@ module FeedNormalizer
50
88
 
51
89
  # entry elements
52
90
  entry_mapping = {
53
- :date_published => [:pubDate, :published, :dc_date],
91
+ :date_published => [:pubDate, :published, :dc_date, :issued],
54
92
  :urls => :link,
55
93
  :description => [:description, :summary],
56
94
  :content => [:content, :content_encoded, :description],
57
95
  :title => :title,
58
96
  :authors => [:author, :contributor, :dc_creator],
59
- :categories => :category
97
+ :categories => :category,
98
+ :last_updated => [:updated, :dc_date, :pubDate]
60
99
  }
61
100
 
62
101
  atomrss.entries.each do |atomrss_entry|
@@ -76,7 +115,7 @@ module FeedNormalizer
76
115
  def self.image(parser)
77
116
  if parser.respond_to?(:image) && parser.image
78
117
  if parser.image =~ /<url>/ # RSS image contains an <url> spec
79
- parser.image.scan(/<url>(.*)<\/url>/).to_s
118
+ parser.image.scan(/<url>(.*?)<\/url>/).to_s
80
119
  else
81
120
  parser.image # Atom contains just the url
82
121
  end
data/lib/structures.rb CHANGED
@@ -115,13 +115,55 @@ module FeedNormalizer
115
115
  end
116
116
  end
117
117
 
118
+ module TimeFix
119
+ # Reparse any Time instances, due to RSS::Parser's redefinition of
120
+ # certain aspects of the Time class that creates unexpected behaviour
121
+ # when extending the Time class, as some common third party libraries do.
122
+ # See http://code.google.com/p/feed-normalizer/issues/detail?id=13.
123
+ def reparse(obj)
124
+ @parsed ||= false
125
+
126
+ return obj if @parsed
127
+
128
+ if obj.is_a?(Time)
129
+ @parsed = true
130
+ Time.at(obj) rescue obj
131
+ end
132
+ end
133
+ end
134
+
135
+ module RewriteRelativeLinks
136
+ def rewrite_relative_links(text, url)
137
+ if host = url_host(url)
138
+ text.to_s.gsub(/(href|src)=('|")\//, '\1=\2http://' + host + '/')
139
+ else
140
+ text
141
+ end
142
+ end
143
+
144
+ private
145
+ def url_host(url)
146
+ URI.parse(url).host rescue nil
147
+ end
148
+ end
149
+
118
150
 
119
151
  # Represents a feed item entry.
152
+ # Available fields are:
153
+ # * content
154
+ # * description
155
+ # * title
156
+ # * date_published
157
+ # * urls / url
158
+ # * id
159
+ # * authors / author
160
+ # * copyright
161
+ # * categories
120
162
  class Entry
121
- include Singular, ElementEquality, ElementCleaner
163
+ include Singular, ElementEquality, ElementCleaner, TimeFix, RewriteRelativeLinks
122
164
 
123
165
  HTML_ELEMENTS = [:content, :description, :title]
124
- SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories]
166
+ SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories, :last_updated]
125
167
  BLENDED_ELEMENTS = []
126
168
 
127
169
  ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
@@ -132,19 +174,41 @@ module FeedNormalizer
132
174
  @urls = []
133
175
  @authors = []
134
176
  @categories = []
177
+ @date_published, @content = nil
178
+ end
179
+
180
+ undef date_published
181
+ def date_published
182
+ @date_published = reparse(@date_published)
183
+ end
184
+
185
+ undef content
186
+ def content
187
+ @content = rewrite_relative_links(@content, url)
135
188
  end
136
189
 
137
190
  end
138
191
 
139
192
  # Represents the root element of a feed.
193
+ # Available fields are:
194
+ # * title
195
+ # * description
196
+ # * id
197
+ # * last_updated
198
+ # * copyright
199
+ # * authors / author
200
+ # * urls / url
201
+ # * image
202
+ # * generator
203
+ # * items / channel
140
204
  class Feed
141
- include Singular, ElementEquality, ElementCleaner
205
+ include Singular, ElementEquality, ElementCleaner, TimeFix
142
206
 
143
207
  # Elements that can contain HTML fragments.
144
208
  HTML_ELEMENTS = [:title, :description]
145
209
 
146
210
  # Elements that contain 'plain' Strings, with HTML escaped.
147
- SIMPLE_ELEMENTS = [:id, :last_updated, :copyright, :authors, :urls, :image, :generator]
211
+ SIMPLE_ELEMENTS = [:id, :last_updated, :copyright, :authors, :urls, :image, :generator, :ttl, :skip_hours, :skip_days]
148
212
 
149
213
  # Elements that contain both HTML and escaped HTML.
150
214
  BLENDED_ELEMENTS = [:items]
@@ -160,8 +224,16 @@ module FeedNormalizer
160
224
  # set up associations (i.e. arrays where needed)
161
225
  @urls = []
162
226
  @authors = []
227
+ @skip_hours = []
228
+ @skip_days = []
163
229
  @items = []
164
230
  @parser = wrapper.parser.to_s
231
+ @last_updated = nil
232
+ end
233
+
234
+ undef last_updated
235
+ def last_updated
236
+ @last_updated = reparse(@last_updated)
165
237
  end
166
238
 
167
239
  def channel() self end
data/test/data/atom03.xml CHANGED
@@ -24,7 +24,7 @@
24
24
  Kmart has the Levi Strauss Signature Girl&#8217;s Low Rise Flare Jean for $10 after $5 instant savings (ends 9/2)
25
25
  Slim fit through hip and thigh, with zip-fly with button-through closure. Machine washable 99% Cotton/1% Spandex
26
26
  ]]></summary>
27
- <content type="text/html" mode="escaped" xml:base="http://www.cheapstingybargains.com/24557/levi-strauss-signature-girls-low-rise-slim-fit-flare-jeans-10/"><![CDATA[<p><a href="http://clickserve.cc-dt.com/link/tplclick?lid=41000000011334249&#038;pubid=21000000000053626" target=_"blank"><img src="http://images.kmart.com/assets/images/product/productDetail/9990000058546711.jpg" width="150" height="150" border="0" style="float: right; margin: 0px 0px 5px 5px;" /></a><br />
27
+ <content type="text/html" mode="escaped" xml:base="http://www.cheapstingybargains.com/24557/levi-strauss-signature-girls-low-rise-slim-fit-flare-jeans-10/"><![CDATA[<p><a href="/link/tplclick?lid=41000000011334249&#038;pubid=21000000000053626" target=_"blank"><img src="/assets/images/product/productDetail/9990000058546711.jpg" width="150" height="150" border="0" style="float: right; margin: 0px 0px 5px 5px;" /></a><br />
28
28
  <strong>Kmart has the <a href="http://clickserve.cc-dt.com/link/tplclick?lid=41000000011334249&#038;pubid=21000000000053626" target=_"blank">Levi Strauss Signature Girl&#8217;s Low Rise Flare Jean</a> for $10 after $5 instant savings (ends 9/2)</strong></p>
29
29
  <p>Slim fit through hip and thigh, with zip-fly with button-through closure. Machine washable 99% Cotton/1% Spandex</p>
30
30
  ]]></content>
data/test/data/atom10.xml CHANGED
@@ -17,8 +17,8 @@
17
17
  <link href="http://habtm.com/articles/2006/08/16/a-forum-on-rails" rel="alternate" type="text/html"/>
18
18
  <category term="rails" scheme="http://habtm.com/articles/category/rails" label="rails"/>
19
19
  <category term="ruby" scheme="http://habtm.com/articles/category/ruby" label="ruby"/>
20
- <summary type="html">&lt;p&gt;Josh Goebel and I took an evening to bang out a little project: &lt;a href="http://beast.caboo.se/"&gt;Beast&lt;/a&gt;. It&amp;#8217;s our minimal no-fluff Rails forum. It&amp;#8217;s no beast of an application either, clocking in at 285 LOC and a 1:1.5 test ratio. &lt;a href="http://svn.techno-weenie.net/projects/beast/"&gt;Check it out&lt;/a&gt;!&lt;/p&gt;</summary>
21
- <content type="html">&lt;p&gt;Josh Goebel and I took an evening to bang out a little project: &lt;a href="http://beast.caboo.se/"&gt;Beast&lt;/a&gt;. It&amp;#8217;s our minimal no-fluff Rails forum. It&amp;#8217;s no beast of an application either, clocking in at 285 LOC and a 1:1.5 test ratio. &lt;a href="http://svn.techno-weenie.net/projects/beast/"&gt;Check it out&lt;/a&gt;!&lt;/p&gt;</content>
20
+ <summary type="html">&lt;plaintext&gt;&lt;p&gt;Josh Goebel and I took an evening to bang out a little project: &lt;a href="http://beast.caboo.se/"&gt;Beast&lt;/a&gt;. It&amp;#8217;s our minimal no-fluff Rails forum. It&amp;#8217;s no beast of an application either, clocking in at 285 LOC and a 1:1.5 test ratio. &lt;a href="http://svn.techno-weenie.net/projects/beast/"&gt;Check it out&lt;/a&gt;!&lt;/p&gt;</summary>
21
+ <content type="html">&lt;plaintext&gt;&lt;p&gt;Josh Goebel and I took an evening to bang out a little project: &lt;a href="http://beast.caboo.se/"&gt;Beast&lt;/a&gt;. It&amp;#8217;s our minimal no-fluff Rails forum. It&amp;#8217;s no beast of an application either, clocking in at 285 LOC and a 1:1.5 test ratio. &lt;a href="http://svn.techno-weenie.net/projects/beast/"&gt;Check it out&lt;/a&gt;!&lt;/p&gt;</content>
22
22
  </entry>
23
23
  <entry>
24
24
  <author>
data/test/data/rss20.xml CHANGED
@@ -10,6 +10,17 @@
10
10
  <copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
11
11
  <docs>http://www.bbc.co.uk/syndication/</docs>
12
12
  <ttl>15</ttl>
13
+ <skipHours>
14
+ <hour>6</hour>
15
+ <hour>7</hour>
16
+ <hour>8</hour>
17
+ <hour>9</hour>
18
+ <hour>10</hour>
19
+ <hour>11</hour>
20
+ </skipHours>
21
+ <skipDays>
22
+ <day>Sunday</day>
23
+ </skipDays>
13
24
 
14
25
  <image>
15
26
  <title>BBC News</title>
@@ -19,7 +30,7 @@
19
30
 
20
31
  <item>
21
32
  <title>Concerns over security software</title>
22
- <description>BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.</description>
33
+ <description><![CDATA[BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.]]></description>
23
34
  <content:encoded><![CDATA[<p>test1</p>]]></content:encoded>
24
35
  <link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
25
36
  <guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
@@ -40,7 +51,7 @@
40
51
 
41
52
  <item>
42
53
  <title>MP3 player court order overturned</title>
43
- <description>SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.</description>
54
+ <description>&lt;b&gt;SanDisk&lt;/b&gt; puts its MP3 players back on display at a German electronics show after overturning a court injunction.</description>
44
55
  <content:encoded><![CDATA[<p>test3</p>]]></content:encoded>
45
56
  <link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5326660.stm</link>
46
57
  <guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5326660.stm</guid>
@@ -10,6 +10,17 @@
10
10
  <copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
11
11
  <docs>http://www.bbc.co.uk/syndication/</docs>
12
12
  <ttl>15</ttl>
13
+ <skipHours>
14
+ <hour>6</hour>
15
+ <hour>7</hour>
16
+ <hour>8</hour>
17
+ <hour>9</hour>
18
+ <hour>10</hour>
19
+ <hour>11</hour>
20
+ </skipHours>
21
+ <skipDays>
22
+ <day>Sunday</day>
23
+ </skipDays>
13
24
 
14
25
  <image>
15
26
  <title>BBC News</title>
@@ -10,6 +10,17 @@
10
10
  <copyright>Copyright: (C) British Broadcasting Corporation, see http://news.bbc.co.uk/1/hi/help/rss/4498287.stm for terms and conditions of reuse</copyright>
11
11
  <docs>http://www.bbc.co.uk/syndication/</docs>
12
12
  <ttl>15</ttl>
13
+ <skipHours>
14
+ <hour>6</hour>
15
+ <hour>7</hour>
16
+ <hour>8</hour>
17
+ <hour>9</hour>
18
+ <hour>10</hour>
19
+ <hour>11</hour>
20
+ </skipHours>
21
+ <skipDays>
22
+ <day>Sunday</day>
23
+ </skipDays>
13
24
 
14
25
  <image>
15
26
  <title>BBC News</title>
@@ -1,4 +1,6 @@
1
1
  require 'test/unit'
2
+ $:.unshift(File.dirname(__FILE__))
3
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
2
4
  require 'feed-normalizer'
3
5
  require 'yaml'
4
6
 
@@ -66,8 +68,11 @@ class FeedNormalizerTest < Test::Unit::TestCase
66
68
 
67
69
  assert_equal "BBC News | Technology | UK Edition", feed.title
68
70
  assert_equal ["http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm"], feed.urls
71
+ assert_equal 15, feed.ttl
72
+ assert_equal [6, 7, 8, 9, 10, 11], feed.skip_hours
73
+ assert_equal ["Sunday"], feed.skip_days
69
74
  assert_equal "MP3 player court order overturned", feed.entries.last.title
70
- assert_equal "SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.description
75
+ assert_equal "<b>SanDisk</b> puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.description
71
76
  assert_match(/test\d/, feed.entries.last.content)
72
77
  assert_instance_of Time, feed.entries.last.date_published
73
78
  end
@@ -77,6 +82,9 @@ class FeedNormalizerTest < Test::Unit::TestCase
77
82
 
78
83
  assert_equal "~:caboose", feed.title
79
84
  assert_equal "http://habtm.com/xml/atom10/feed.xml", feed.url
85
+ assert_equal nil, feed.ttl
86
+ assert_equal [], feed.skip_hours
87
+ assert_equal [], feed.skip_days
80
88
  assert_equal "Starfish - Easy Distribution of Site Maintenance", feed.entries.last.title
81
89
  assert_equal "urn:uuid:6c028f36-f87a-4f53-b7e3-1f943d2341f0", feed.entries.last.id
82
90
 
@@ -134,9 +142,11 @@ class FeedNormalizerTest < Test::Unit::TestCase
134
142
  def test_clean
135
143
  feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10])
136
144
 
137
- assert feed.entries.first.content !~ /\<p\>/
145
+ assert_match(/<plaintext>/, feed.entries.first.content)
146
+ assert_match(/<plaintext>/, feed.entries.first.description)
138
147
  feed.clean!
139
- assert feed.entries.first.content =~ /\<p\>/
148
+ assert_no_match(/<plaintext>/, feed.entries.first.content)
149
+ assert_no_match(/<plaintext>/, feed.entries.first.description)
140
150
  end
141
151
 
142
152
  def test_malformed_feed
@@ -145,25 +155,21 @@ class FeedNormalizerTest < Test::Unit::TestCase
145
155
 
146
156
  def test_dublin_core_date_ruby_rss
147
157
  feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser, :try_others => false)
148
- assert_equal 'RSS::Parser', feed.parser
149
158
  assert_instance_of Time, feed.entries.first.date_published
150
159
  end
151
160
 
152
161
  def test_dublin_core_date_simple_rss
153
162
  feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser, :try_others => false)
154
- assert_equal 'SimpleRSS', feed.parser
155
163
  assert_instance_of Time, feed.entries.first.date_published
156
164
  end
157
165
 
158
166
  def test_dublin_core_creator_ruby_rss
159
167
  feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser, :try_others => false)
160
- assert_equal 'RSS::Parser', feed.parser
161
168
  assert_equal 'Jeff Hecht', feed.entries.last.author
162
169
  end
163
170
 
164
171
  def test_dublin_core_creator_simple_rss
165
172
  feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser, :try_others => false)
166
- assert_equal 'SimpleRSS', feed.parser
167
173
  assert_equal 'Jeff Hecht', feed.entries.last.author
168
174
  end
169
175
 
@@ -191,7 +197,7 @@ class FeedNormalizerTest < Test::Unit::TestCase
191
197
  feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false)
192
198
 
193
199
  feed.entries.each_with_index do |e, i|
194
- assert_match(/test#{i+1}/, e.content)
200
+ assert_equal("<p>test#{i+1}</p>", e.content)
195
201
  end
196
202
  end
197
203
 
@@ -199,9 +205,63 @@ class FeedNormalizerTest < Test::Unit::TestCase
199
205
  feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
200
206
 
201
207
  feed.entries.each_with_index do |e, i|
202
- assert_match(/test#{i+1}/, e.content)
208
+ assert_equal("<p>test#{i+1}</p>", e.content)
203
209
  end
204
210
  end
205
211
 
212
+ def test_atom_content_contains_pluses
213
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], :force_parser => SimpleRssParser, :try_others => false)
214
+
215
+ assert_equal 2, feed.entries.last.content.scan(/\+/).size
216
+ end
217
+
218
+ # http://code.google.com/p/feed-normalizer/issues/detail?id=13
219
+ def test_times_are_reparsed
220
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
221
+
222
+ Time.class_eval "alias :old_to_s :to_s; def to_s(x=1); old_to_s; end"
223
+
224
+ assert_equal "Sat Sep 09 07:57:06 -0700 2006", feed.last_updated.to_s(:foo)
225
+ assert_equal "Sat Sep 09 05:45:35 -0700 2006", feed.entries.first.date_published.to_s(:foo)
226
+ end
227
+
228
+ def test_atom03_has_issued
229
+ SimpleRSS.class_eval "@@item_tags.delete(:issued)"
230
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03], :force_parser => SimpleRssParser, :try_others => false)
231
+ assert_nil feed.entries.first.date_published
232
+
233
+ SimpleRSS.class_eval "@@item_tags << :issued"
234
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03], :force_parser => SimpleRssParser, :try_others => false)
235
+ assert_equal "Tue Aug 29 02:31:03 UTC 2006", feed.entries.first.date_published.to_s
236
+ end
237
+
238
+ def test_html_should_be_escaped_by_default
239
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
240
+ assert_match "<b>SanDisk</b>", feed.items.last.description
241
+
242
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false)
243
+ assert_match "<b>SanDisk</b>", feed.items.last.description
244
+ end
245
+
246
+ def test_relative_links_and_images_should_be_rewritten_with_url_base
247
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom03])
248
+ assert_match '<a href="http://www.cheapstingybargains.com/link/tplclick?lid=41000000011334249&#038;pubid=21000000000053626"' +
249
+ ' target=_"blank"><img src="http://www.cheapstingybargains.com/assets/images/product/productDetail/9990000058546711.jpg"' +
250
+ ' width="150" height="150" border="0" style="float: right; margin: 0px 0px 5px 5px;" /></a>',
251
+ feed.items.first.content
252
+ end
253
+
254
+ def test_last_updated_simple_rss
255
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:atom10], :force_parser => SimpleRssParser, :try_others => false)
256
+
257
+ assert_equal Time.parse("Wed Aug 16 09:59:44 -0700 2006"), feed.entries.first.last_updated
258
+ end
259
+
260
+ def test_last_updated_ruby_rss
261
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
262
+
263
+ assert_equal feed.entries.first.date_published, feed.entries.first.last_updated
264
+ end
265
+
206
266
  end
207
267
 
metadata CHANGED
@@ -1,10 +1,10 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.2
2
+ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: feed-normalizer
5
5
  version: !ruby/object:Gem::Version
6
- version: 1.4.0
7
- date: 2007-07-10 00:00:00 -07:00
6
+ version: 1.5.0
7
+ date: 2008-02-05 00:00:00 -08:00
8
8
  summary: Extensible Ruby wrapper for Atom and RSS parsers
9
9
  require_paths:
10
10
  - lib
@@ -90,5 +90,5 @@ dependencies:
90
90
  requirements:
91
91
  - - ">="
92
92
  - !ruby/object:Gem::Version
93
- version: 1.2.1
93
+ version: 1.5.0
94
94
  version: