feed-normalizer 1.3.2 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,13 @@
1
+ 1.4.0
2
+
3
+ * Support content:encoded. Accessible via Entry#content.
4
+ * Support categories. Accessible via Entry#categories.
5
+ * Introduces a new parsing feature 'loose parsing'. Use :loose => true
6
+ when parsing if the required output should retain extra data, rather
7
+ than drop it in the interests of 'lowest common denominator' normalization.
8
+ Currently affects how categories work. See the documentation in
9
+ FeedNormalizer#parse for more details.
10
+
1
11
  1.3.2
2
12
 
3
13
  * Add support for applicable dublin core elements. (dc:date and dc:creator)
data/License.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2006, Andrew A. Smith
1
+ Copyright (c) 2006-2007, Andrew A. Smith
2
2
  All rights reserved.
3
3
 
4
4
  Redistribution and use in source and binary forms, with or without modification,
data/Manifest.txt CHANGED
@@ -2,7 +2,7 @@ History.txt
2
2
  License.txt
3
3
  Manifest.txt
4
4
  Rakefile
5
- Readme.txt
5
+ README.txt
6
6
  lib/feed-normalizer.rb
7
7
  lib/html-cleaner.rb
8
8
  lib/parsers/rss.rb
@@ -23,7 +23,7 @@ object graph, regardless of the underlying feed format.
23
23
  feed.entries.first.url # => "http://www.iht.com/articles/2006/10/03/frontpage/web.1003UN.php"
24
24
 
25
25
  feed.class # => FeedNormalizer::Feed
26
- feed.parser # => RSS::Parser
26
+ feed.parser # => "RSS::Parser"
27
27
 
28
28
  Now read an Atom feed, and the same class is returned, and the same terminology applies:
29
29
 
@@ -36,7 +36,7 @@ Now read an Atom feed, and the same class is returned, and the same terminology
36
36
  The feed representation stays the same, even though a different parser was used.
37
37
 
38
38
  feed.class # => FeedNormalizer::Feed
39
- feed.parser # => SimpleRSS
39
+ feed.parser # => "SimpleRSS"
40
40
 
41
41
  == Cleaning / Sanitizing
42
42
 
data/Rakefile CHANGED
@@ -1,6 +1,6 @@
1
1
  require 'hoe'
2
2
 
3
- Hoe.new("feed-normalizer", "1.3.2") do |s|
3
+ Hoe.new("feed-normalizer", "1.4.0") do |s|
4
4
  s.author = "Andrew A. Smith"
5
5
  s.email = "andy@tinnedfruit.org"
6
6
  s.url = "http://feed-normalizer.rubyforge.org/"
@@ -13,7 +13,7 @@ module FeedNormalizer
13
13
 
14
14
  # Parses the given feed, and returns a normalized representation.
15
15
  # Returns nil if the feed could not be parsed.
16
- def self.parse(feed)
16
+ def self.parse(feed, loose)
17
17
  nil
18
18
  end
19
19
 
@@ -41,7 +41,10 @@ module FeedNormalizer
41
41
  src[src_function]
42
42
  end
43
43
 
44
- append_or_set!(value, dest, dest_function) if value
44
+ unless value.to_s.empty?
45
+ append_or_set!(value, dest, dest_function)
46
+ break
47
+ end
45
48
  end
46
49
 
47
50
  end
@@ -85,24 +88,46 @@ module FeedNormalizer
85
88
  class FeedNormalizer
86
89
 
87
90
  # Parses the given xml and attempts to return a normalized Feed object.
88
- # Setting forced parser to a suitable parser will mean that parser is
89
- # used first, and if try_others is false, it is the only parser used,
90
- # otherwise all parsers in the ParserRegistry are attempted next, in
91
+ # Setting +force_parser+ to a suitable parser will mean that parser is
92
+ # used first, and if +try_others+ is false, it is the only parser used,
93
+ # otherwise all parsers in the ParserRegistry are attempted, in
91
94
  # order of priority.
95
+ #
96
+ # ===Available options
97
+ #
98
+ # * <tt>:force_parser</tt> - instruct feed-normalizer to try the specified
99
+ # parser first. Takes a class, such as RubyRssParser, or SimpleRssParser.
100
+ #
101
+ # * <tt>:try_others</tt> - +true+ or +false+, defaults to +true+.
102
+ # If +true+, other parsers will be used as described above. The option
103
+ # is useful if combined with +force_parser+ to only use a single parser.
104
+ #
105
+ # * <tt>:loose</tt> - +true+ or +false+, defaults to +false+.
106
+ #
107
+ # Specifies parsing should be done loosely. This means that when
108
+ # feed-normalizer would usually throw away data in order to meet
109
+ # the requirement of keeping resulting feed outputs the same regardless
110
+ # of the underlying parser, the data will instead be kept. This currently
111
+ # affects the following items:
112
+ # * <em>Categories:</em> RSS allows for multiple categories per feed item.
113
+ # * <em>Limitation:</em> SimpleRSS can only return the first category
114
+ # for an item.
115
+ # * <em>Result:</em> When loose is true, the extra categories are kept,
116
+ # of course, only if the parser is not SimpleRSS.
92
117
  def self.parse(xml, opts = {})
93
118
 
94
119
  # Get a string ASAP, as multiple read()'s will start returning nil..
95
120
  xml = xml.respond_to?(:read) ? xml.read : xml.to_s
96
121
 
97
122
  if opts[:force_parser]
98
- result = opts[:force_parser].parse(xml)
123
+ result = opts[:force_parser].parse(xml, opts[:loose])
99
124
 
100
125
  return result if result
101
126
  return nil if opts[:try_others] == false
102
127
  end
103
128
 
104
129
  ParserRegistry.parsers.each do |parser|
105
- result = parser.parse(xml)
130
+ result = parser.parse(xml, opts[:loose])
106
131
  return result if result
107
132
  end
108
133
 
data/lib/html-cleaner.rb CHANGED
@@ -165,7 +165,7 @@ module FeedNormalizer
165
165
  end
166
166
 
167
167
 
168
- module Enumerable
168
+ module Enumerable #:nodoc:
169
169
  def build_hash
170
170
  result = {}
171
171
  self.each do |elt|
@@ -180,7 +180,7 @@ end
180
180
  # Subject: A simple Hpricot text setter
181
181
  # From: Chris Gehlker <canyonrat mac.com>
182
182
  # Date: Fri, 11 Aug 2006 03:19:13 +0900
183
- class Hpricot::Text
183
+ class Hpricot::Text #:nodoc:
184
184
  def set(string)
185
185
  @content = string
186
186
  self.raw_string = string
data/lib/parsers/rss.rb CHANGED
@@ -1,5 +1,10 @@
1
1
  require 'rss'
2
2
 
3
+ # For some reason, this is only included in the RDF Item by default.
4
+ class RSS::Rss::Channel::Item # :nodoc:
5
+ include RSS::ContentModel
6
+ end
7
+
3
8
  module FeedNormalizer
4
9
  class RubyRssParser < Parser
5
10
 
@@ -7,7 +12,7 @@ module FeedNormalizer
7
12
  RSS::Parser
8
13
  end
9
14
 
10
- def self.parse(xml)
15
+ def self.parse(xml, loose)
11
16
  begin
12
17
  rss = parser.parse(xml)
13
18
  rescue Exception => e
@@ -15,7 +20,7 @@ module FeedNormalizer
15
20
  return nil
16
21
  end
17
22
 
18
- rss ? package(rss) : nil
23
+ rss ? package(rss, loose) : nil
19
24
  end
20
25
 
21
26
  # Fairly high priority; a fast and strict parser.
@@ -25,7 +30,7 @@ module FeedNormalizer
25
30
 
26
31
  protected
27
32
 
28
- def self.package(rss)
33
+ def self.package(rss, loose)
29
34
  feed = Feed.new(self)
30
35
 
31
36
  # channel elements
@@ -52,7 +57,7 @@ module FeedNormalizer
52
57
  :date_published => [:pubDate, :dc_date],
53
58
  :urls => :link,
54
59
  :description => :description,
55
- :content => :description,
60
+ :content => [:content_encoded, :description],
56
61
  :title => :title,
57
62
  :authors => [:author, :dc_creator]
58
63
  }
@@ -64,6 +69,9 @@ module FeedNormalizer
64
69
  # custom item elements
65
70
  feed_entry.id = rss_item.guid.content if rss_item.respond_to?(:guid) && rss_item.guid
66
71
  feed_entry.copyright = rss.copyright if rss_item.respond_to? :copyright
72
+ feed_entry.categories = loose ?
73
+ rss_item.categories.collect{|c|c.content} :
74
+ [rss_item.categories.first.content] rescue []
67
75
 
68
76
  feed.entries << feed_entry
69
77
  end
@@ -9,7 +9,7 @@ module FeedNormalizer
9
9
  SimpleRSS
10
10
  end
11
11
 
12
- def self.parse(xml)
12
+ def self.parse(xml, loose)
13
13
  begin
14
14
  atomrss = parser.parse(xml)
15
15
  rescue Exception => e
@@ -53,9 +53,10 @@ module FeedNormalizer
53
53
  :date_published => [:pubDate, :published, :dc_date],
54
54
  :urls => :link,
55
55
  :description => [:description, :summary],
56
- :content => [:content, :description],
56
+ :content => [:content, :content_encoded, :description],
57
57
  :title => :title,
58
- :authors => [:author, :contributor, :dc_creator]
58
+ :authors => [:author, :contributor, :dc_creator],
59
+ :categories => :category
59
60
  }
60
61
 
61
62
  atomrss.entries.each do |atomrss_entry|
@@ -95,4 +96,3 @@ module FeedNormalizer
95
96
 
96
97
  end
97
98
  end
98
-
data/lib/structures.rb CHANGED
@@ -121,7 +121,7 @@ module FeedNormalizer
121
121
  include Singular, ElementEquality, ElementCleaner
122
122
 
123
123
  HTML_ELEMENTS = [:content, :description, :title]
124
- SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright]
124
+ SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories]
125
125
  BLENDED_ELEMENTS = []
126
126
 
127
127
  ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
@@ -131,6 +131,7 @@ module FeedNormalizer
131
131
  def initialize
132
132
  @urls = []
133
133
  @authors = []
134
+ @categories = []
134
135
  end
135
136
 
136
137
  end
data/test/data/rss20.xml CHANGED
@@ -1,6 +1,6 @@
1
1
  <?xml version="1.0" encoding="ISO-8859-1" ?>
2
2
  <?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?>
3
- <rss version="2.0">
3
+ <rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
4
4
  <channel>
5
5
  <title>BBC News | Technology | UK Edition</title>
6
6
  <link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
@@ -20,6 +20,7 @@
20
20
  <item>
21
21
  <title>Concerns over security software</title>
22
22
  <description>BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.</description>
23
+ <content:encoded><![CDATA[<p>test1</p>]]></content:encoded>
23
24
  <link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
24
25
  <guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
25
26
  <pubDate>Sat, 09 Sep 2006 12:45:35 GMT</pubDate>
@@ -29,19 +30,21 @@
29
30
  <item>
30
31
  <title>Top prize for 'light' inventor</title>
31
32
  <description>A Japanese scientist who invented a sustainable form of light is awarded the Millennium Technology Prize.</description>
33
+ <content:encoded><![CDATA[<p>test2</p>]]></content:encoded>
32
34
  <link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5328446.stm</link>
33
35
  <guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5328446.stm</guid>
34
36
  <pubDate>Fri, 08 Sep 2006 16:18:08 GMT</pubDate>
35
37
  <category>Technology</category>
38
+ <category>Japan</category>
36
39
  </item>
37
40
 
38
41
  <item>
39
42
  <title>MP3 player court order overturned</title>
40
43
  <description>SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.</description>
44
+ <content:encoded><![CDATA[<p>test3</p>]]></content:encoded>
41
45
  <link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5326660.stm</link>
42
46
  <guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5326660.stm</guid>
43
47
  <pubDate>Fri, 08 Sep 2006 10:14:41 GMT</pubDate>
44
- <category>Technology</category>
45
48
  </item>
46
49
 
47
50
  </channel>
@@ -41,7 +41,6 @@
41
41
  <link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5326660.stm</link>
42
42
  <guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5326660.stm</guid>
43
43
  <pubDate>Fri, 08 Sep 2006 10:14:41 GMT</pubDate>
44
- <category>Technology</category>
45
44
  </item>
46
45
 
47
46
  </channel>
@@ -68,7 +68,7 @@ class FeedNormalizerTest < Test::Unit::TestCase
68
68
  assert_equal ["http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm"], feed.urls
69
69
  assert_equal "MP3 player court order overturned", feed.entries.last.title
70
70
  assert_equal "SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.description
71
- assert_equal "SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.content
71
+ assert_match(/test\d/, feed.entries.last.content)
72
72
  assert_instance_of Time, feed.entries.last.date_published
73
73
  end
74
74
 
@@ -108,7 +108,7 @@ class FeedNormalizerTest < Test::Unit::TestCase
108
108
  no_diff = feed.diff(feed)
109
109
 
110
110
  assert diff.keys.all? {|key| [:title, :items].include?(key)}
111
- assert_equal 2, diff[:items].size
111
+ assert_equal 3, diff[:items].size
112
112
 
113
113
  assert diff_short.keys.all? {|key| [:title, :items].include?(key)}
114
114
  assert_equal [3,2], diff_short[:items]
@@ -144,28 +144,64 @@ class FeedNormalizerTest < Test::Unit::TestCase
144
144
  end
145
145
 
146
146
  def test_dublin_core_date_ruby_rss
147
- feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser)
147
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser, :try_others => false)
148
148
  assert_equal 'RSS::Parser', feed.parser
149
149
  assert_instance_of Time, feed.entries.first.date_published
150
150
  end
151
151
 
152
152
  def test_dublin_core_date_simple_rss
153
- feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser)
153
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser, :try_others => false)
154
154
  assert_equal 'SimpleRSS', feed.parser
155
155
  assert_instance_of Time, feed.entries.first.date_published
156
156
  end
157
157
 
158
158
  def test_dublin_core_creator_ruby_rss
159
- feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser)
159
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser, :try_others => false)
160
160
  assert_equal 'RSS::Parser', feed.parser
161
161
  assert_equal 'Jeff Hecht', feed.entries.last.author
162
162
  end
163
163
 
164
164
  def test_dublin_core_creator_simple_rss
165
- feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser)
165
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser, :try_others => false)
166
166
  assert_equal 'SimpleRSS', feed.parser
167
167
  assert_equal 'Jeff Hecht', feed.entries.last.author
168
168
  end
169
169
 
170
+ def test_entry_categories_ruby_rss
171
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
172
+ assert_equal [['Click'],['Technology'],[]], feed.items.collect {|i|i.categories}
173
+ end
174
+
175
+ def test_entry_categories_simple_rss
176
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false)
177
+ assert_equal [['Click'],['Technology'],[]], feed.items.collect {|i|i.categories}
178
+ end
179
+
180
+ def test_loose_categories_ruby_rss
181
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false, :loose => true)
182
+ assert_equal [1,2,0], feed.entries.collect{|e|e.categories.size}
183
+ end
184
+
185
+ def test_loose_categories_simple_rss
186
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false, :loose => true)
187
+ assert_equal [1,1,0], feed.entries.collect{|e|e.categories.size}
188
+ end
189
+
190
+ def test_content_encoded_simple_rss
191
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false)
192
+
193
+ feed.entries.each_with_index do |e, i|
194
+ assert_match(/test#{i+1}/, e.content)
195
+ end
196
+ end
197
+
198
+ def test_content_encoded_ruby_rss
199
+ feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
200
+
201
+ feed.entries.each_with_index do |e, i|
202
+ assert_match(/test#{i+1}/, e.content)
203
+ end
204
+ end
205
+
170
206
  end
171
207
 
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
3
3
  specification_version: 1
4
4
  name: feed-normalizer
5
5
  version: !ruby/object:Gem::Version
6
- version: 1.3.2
7
- date: 2007-07-02 00:00:00 -07:00
6
+ version: 1.4.0
7
+ date: 2007-07-10 00:00:00 -07:00
8
8
  summary: Extensible Ruby wrapper for Atom and RSS parsers
9
9
  require_paths:
10
10
  - lib
@@ -33,7 +33,7 @@ files:
33
33
  - License.txt
34
34
  - Manifest.txt
35
35
  - Rakefile
36
- - Readme.txt
36
+ - README.txt
37
37
  - lib/feed-normalizer.rb
38
38
  - lib/html-cleaner.rb
39
39
  - lib/parsers/rss.rb
@@ -57,7 +57,7 @@ extra_rdoc_files:
57
57
  - History.txt
58
58
  - License.txt
59
59
  - Manifest.txt
60
- - Readme.txt
60
+ - README.txt
61
61
  executables: []
62
62
 
63
63
  extensions: []