feed-normalizer 1.3.2 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +10 -0
- data/License.txt +1 -1
- data/Manifest.txt +1 -1
- data/{Readme.txt → README.txt} +2 -2
- data/Rakefile +1 -1
- data/lib/feed-normalizer.rb +32 -7
- data/lib/html-cleaner.rb +2 -2
- data/lib/parsers/rss.rb +12 -4
- data/lib/parsers/simple-rss.rb +4 -4
- data/lib/structures.rb +2 -1
- data/test/data/rss20.xml +5 -2
- data/test/data/rss20diff.xml +0 -1
- data/test/test_feednormalizer.rb +42 -6
- metadata +4 -4
data/History.txt
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
1.4.0
|
2
|
+
|
3
|
+
* Support content:encoded. Accessible via Entry#content.
|
4
|
+
* Support categories. Accessible via Entry#categories.
|
5
|
+
* Introduces a new parsing feature 'loose parsing'. Use :loose => true
|
6
|
+
when parsing if the required output should retain extra data, rather
|
7
|
+
than drop it in the interests of 'lowest common denominator' normalization.
|
8
|
+
Currently affects how categories work. See the documentation in
|
9
|
+
FeedNormalizer#parse for more details.
|
10
|
+
|
1
11
|
1.3.2
|
2
12
|
|
3
13
|
* Add support for applicable dublin core elements. (dc:date and dc:creator)
|
data/License.txt
CHANGED
data/Manifest.txt
CHANGED
data/{Readme.txt → README.txt}
RENAMED
@@ -23,7 +23,7 @@ object graph, regardless of the underlying feed format.
|
|
23
23
|
feed.entries.first.url # => "http://www.iht.com/articles/2006/10/03/frontpage/web.1003UN.php"
|
24
24
|
|
25
25
|
feed.class # => FeedNormalizer::Feed
|
26
|
-
feed.parser # => RSS::Parser
|
26
|
+
feed.parser # => "RSS::Parser"
|
27
27
|
|
28
28
|
Now read an Atom feed, and the same class is returned, and the same terminology applies:
|
29
29
|
|
@@ -36,7 +36,7 @@ Now read an Atom feed, and the same class is returned, and the same terminology
|
|
36
36
|
The feed representation stays the same, even though a different parser was used.
|
37
37
|
|
38
38
|
feed.class # => FeedNormalizer::Feed
|
39
|
-
feed.parser # => SimpleRSS
|
39
|
+
feed.parser # => "SimpleRSS"
|
40
40
|
|
41
41
|
== Cleaning / Sanitizing
|
42
42
|
|
data/Rakefile
CHANGED
data/lib/feed-normalizer.rb
CHANGED
@@ -13,7 +13,7 @@ module FeedNormalizer
|
|
13
13
|
|
14
14
|
# Parses the given feed, and returns a normalized representation.
|
15
15
|
# Returns nil if the feed could not be parsed.
|
16
|
-
def self.parse(feed)
|
16
|
+
def self.parse(feed, loose)
|
17
17
|
nil
|
18
18
|
end
|
19
19
|
|
@@ -41,7 +41,10 @@ module FeedNormalizer
|
|
41
41
|
src[src_function]
|
42
42
|
end
|
43
43
|
|
44
|
-
|
44
|
+
unless value.to_s.empty?
|
45
|
+
append_or_set!(value, dest, dest_function)
|
46
|
+
break
|
47
|
+
end
|
45
48
|
end
|
46
49
|
|
47
50
|
end
|
@@ -85,24 +88,46 @@ module FeedNormalizer
|
|
85
88
|
class FeedNormalizer
|
86
89
|
|
87
90
|
# Parses the given xml and attempts to return a normalized Feed object.
|
88
|
-
# Setting
|
89
|
-
# used first, and if try_others is false, it is the only parser used,
|
90
|
-
# otherwise all parsers in the ParserRegistry are attempted
|
91
|
+
# Setting +force_parser+ to a suitable parser will mean that parser is
|
92
|
+
# used first, and if +try_others+ is false, it is the only parser used,
|
93
|
+
# otherwise all parsers in the ParserRegistry are attempted, in
|
91
94
|
# order of priority.
|
95
|
+
#
|
96
|
+
# ===Available options
|
97
|
+
#
|
98
|
+
# * <tt>:force_parser</tt> - instruct feed-normalizer to try the specified
|
99
|
+
# parser first. Takes a class, such as RubyRssParser, or SimpleRssParser.
|
100
|
+
#
|
101
|
+
# * <tt>:try_others</tt> - +true+ or +false+, defaults to +true+.
|
102
|
+
# If +true+, other parsers will be used as described above. The option
|
103
|
+
# is useful if combined with +force_parser+ to only use a single parser.
|
104
|
+
#
|
105
|
+
# * <tt>:loose</tt> - +true+ or +false+, defaults to +false+.
|
106
|
+
#
|
107
|
+
# Specifies parsing should be done loosely. This means that when
|
108
|
+
# feed-normalizer would usually throw away data in order to meet
|
109
|
+
# the requirement of keeping resulting feed outputs the same regardless
|
110
|
+
# of the underlying parser, the data will instead be kept. This currently
|
111
|
+
# affects the following items:
|
112
|
+
# * <em>Categories:</em> RSS allows for multiple categories per feed item.
|
113
|
+
# * <em>Limitation:</em> SimpleRSS can only return the first category
|
114
|
+
# for an item.
|
115
|
+
# * <em>Result:</em> When loose is true, the extra categories are kept,
|
116
|
+
# of course, only if the parser is not SimpleRSS.
|
92
117
|
def self.parse(xml, opts = {})
|
93
118
|
|
94
119
|
# Get a string ASAP, as multiple read()'s will start returning nil..
|
95
120
|
xml = xml.respond_to?(:read) ? xml.read : xml.to_s
|
96
121
|
|
97
122
|
if opts[:force_parser]
|
98
|
-
result = opts[:force_parser].parse(xml)
|
123
|
+
result = opts[:force_parser].parse(xml, opts[:loose])
|
99
124
|
|
100
125
|
return result if result
|
101
126
|
return nil if opts[:try_others] == false
|
102
127
|
end
|
103
128
|
|
104
129
|
ParserRegistry.parsers.each do |parser|
|
105
|
-
result = parser.parse(xml)
|
130
|
+
result = parser.parse(xml, opts[:loose])
|
106
131
|
return result if result
|
107
132
|
end
|
108
133
|
|
data/lib/html-cleaner.rb
CHANGED
@@ -165,7 +165,7 @@ module FeedNormalizer
|
|
165
165
|
end
|
166
166
|
|
167
167
|
|
168
|
-
module Enumerable
|
168
|
+
module Enumerable #:nodoc:
|
169
169
|
def build_hash
|
170
170
|
result = {}
|
171
171
|
self.each do |elt|
|
@@ -180,7 +180,7 @@ end
|
|
180
180
|
# Subject: A simple Hpricot text setter
|
181
181
|
# From: Chris Gehlker <canyonrat mac.com>
|
182
182
|
# Date: Fri, 11 Aug 2006 03:19:13 +0900
|
183
|
-
class Hpricot::Text
|
183
|
+
class Hpricot::Text #:nodoc:
|
184
184
|
def set(string)
|
185
185
|
@content = string
|
186
186
|
self.raw_string = string
|
data/lib/parsers/rss.rb
CHANGED
@@ -1,5 +1,10 @@
|
|
1
1
|
require 'rss'
|
2
2
|
|
3
|
+
# For some reason, this is only included in the RDF Item by default.
|
4
|
+
class RSS::Rss::Channel::Item # :nodoc:
|
5
|
+
include RSS::ContentModel
|
6
|
+
end
|
7
|
+
|
3
8
|
module FeedNormalizer
|
4
9
|
class RubyRssParser < Parser
|
5
10
|
|
@@ -7,7 +12,7 @@ module FeedNormalizer
|
|
7
12
|
RSS::Parser
|
8
13
|
end
|
9
14
|
|
10
|
-
def self.parse(xml)
|
15
|
+
def self.parse(xml, loose)
|
11
16
|
begin
|
12
17
|
rss = parser.parse(xml)
|
13
18
|
rescue Exception => e
|
@@ -15,7 +20,7 @@ module FeedNormalizer
|
|
15
20
|
return nil
|
16
21
|
end
|
17
22
|
|
18
|
-
rss ? package(rss) : nil
|
23
|
+
rss ? package(rss, loose) : nil
|
19
24
|
end
|
20
25
|
|
21
26
|
# Fairly high priority; a fast and strict parser.
|
@@ -25,7 +30,7 @@ module FeedNormalizer
|
|
25
30
|
|
26
31
|
protected
|
27
32
|
|
28
|
-
def self.package(rss)
|
33
|
+
def self.package(rss, loose)
|
29
34
|
feed = Feed.new(self)
|
30
35
|
|
31
36
|
# channel elements
|
@@ -52,7 +57,7 @@ module FeedNormalizer
|
|
52
57
|
:date_published => [:pubDate, :dc_date],
|
53
58
|
:urls => :link,
|
54
59
|
:description => :description,
|
55
|
-
:content => :description,
|
60
|
+
:content => [:content_encoded, :description],
|
56
61
|
:title => :title,
|
57
62
|
:authors => [:author, :dc_creator]
|
58
63
|
}
|
@@ -64,6 +69,9 @@ module FeedNormalizer
|
|
64
69
|
# custom item elements
|
65
70
|
feed_entry.id = rss_item.guid.content if rss_item.respond_to?(:guid) && rss_item.guid
|
66
71
|
feed_entry.copyright = rss.copyright if rss_item.respond_to? :copyright
|
72
|
+
feed_entry.categories = loose ?
|
73
|
+
rss_item.categories.collect{|c|c.content} :
|
74
|
+
[rss_item.categories.first.content] rescue []
|
67
75
|
|
68
76
|
feed.entries << feed_entry
|
69
77
|
end
|
data/lib/parsers/simple-rss.rb
CHANGED
@@ -9,7 +9,7 @@ module FeedNormalizer
|
|
9
9
|
SimpleRSS
|
10
10
|
end
|
11
11
|
|
12
|
-
def self.parse(xml)
|
12
|
+
def self.parse(xml, loose)
|
13
13
|
begin
|
14
14
|
atomrss = parser.parse(xml)
|
15
15
|
rescue Exception => e
|
@@ -53,9 +53,10 @@ module FeedNormalizer
|
|
53
53
|
:date_published => [:pubDate, :published, :dc_date],
|
54
54
|
:urls => :link,
|
55
55
|
:description => [:description, :summary],
|
56
|
-
:content => [:content, :description],
|
56
|
+
:content => [:content, :content_encoded, :description],
|
57
57
|
:title => :title,
|
58
|
-
:authors => [:author, :contributor, :dc_creator]
|
58
|
+
:authors => [:author, :contributor, :dc_creator],
|
59
|
+
:categories => :category
|
59
60
|
}
|
60
61
|
|
61
62
|
atomrss.entries.each do |atomrss_entry|
|
@@ -95,4 +96,3 @@ module FeedNormalizer
|
|
95
96
|
|
96
97
|
end
|
97
98
|
end
|
98
|
-
|
data/lib/structures.rb
CHANGED
@@ -121,7 +121,7 @@ module FeedNormalizer
|
|
121
121
|
include Singular, ElementEquality, ElementCleaner
|
122
122
|
|
123
123
|
HTML_ELEMENTS = [:content, :description, :title]
|
124
|
-
SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright]
|
124
|
+
SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories]
|
125
125
|
BLENDED_ELEMENTS = []
|
126
126
|
|
127
127
|
ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
|
@@ -131,6 +131,7 @@ module FeedNormalizer
|
|
131
131
|
def initialize
|
132
132
|
@urls = []
|
133
133
|
@authors = []
|
134
|
+
@categories = []
|
134
135
|
end
|
135
136
|
|
136
137
|
end
|
data/test/data/rss20.xml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
<?xml version="1.0" encoding="ISO-8859-1" ?>
|
2
2
|
<?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?>
|
3
|
-
<rss version="2.0">
|
3
|
+
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
4
4
|
<channel>
|
5
5
|
<title>BBC News | Technology | UK Edition</title>
|
6
6
|
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
@@ -20,6 +20,7 @@
|
|
20
20
|
<item>
|
21
21
|
<title>Concerns over security software</title>
|
22
22
|
<description>BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.</description>
|
23
|
+
<content:encoded><![CDATA[<p>test1</p>]]></content:encoded>
|
23
24
|
<link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
|
24
25
|
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
|
25
26
|
<pubDate>Sat, 09 Sep 2006 12:45:35 GMT</pubDate>
|
@@ -29,19 +30,21 @@
|
|
29
30
|
<item>
|
30
31
|
<title>Top prize for 'light' inventor</title>
|
31
32
|
<description>A Japanese scientist who invented a sustainable form of light is awarded the Millennium Technology Prize.</description>
|
33
|
+
<content:encoded><![CDATA[<p>test2</p>]]></content:encoded>
|
32
34
|
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5328446.stm</link>
|
33
35
|
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5328446.stm</guid>
|
34
36
|
<pubDate>Fri, 08 Sep 2006 16:18:08 GMT</pubDate>
|
35
37
|
<category>Technology</category>
|
38
|
+
<category>Japan</category>
|
36
39
|
</item>
|
37
40
|
|
38
41
|
<item>
|
39
42
|
<title>MP3 player court order overturned</title>
|
40
43
|
<description>SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.</description>
|
44
|
+
<content:encoded><![CDATA[<p>test3</p>]]></content:encoded>
|
41
45
|
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5326660.stm</link>
|
42
46
|
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5326660.stm</guid>
|
43
47
|
<pubDate>Fri, 08 Sep 2006 10:14:41 GMT</pubDate>
|
44
|
-
<category>Technology</category>
|
45
48
|
</item>
|
46
49
|
|
47
50
|
</channel>
|
data/test/data/rss20diff.xml
CHANGED
@@ -41,7 +41,6 @@
|
|
41
41
|
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5326660.stm</link>
|
42
42
|
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5326660.stm</guid>
|
43
43
|
<pubDate>Fri, 08 Sep 2006 10:14:41 GMT</pubDate>
|
44
|
-
<category>Technology</category>
|
45
44
|
</item>
|
46
45
|
|
47
46
|
</channel>
|
data/test/test_feednormalizer.rb
CHANGED
@@ -68,7 +68,7 @@ class FeedNormalizerTest < Test::Unit::TestCase
|
|
68
68
|
assert_equal ["http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm"], feed.urls
|
69
69
|
assert_equal "MP3 player court order overturned", feed.entries.last.title
|
70
70
|
assert_equal "SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.description
|
71
|
-
|
71
|
+
assert_match(/test\d/, feed.entries.last.content)
|
72
72
|
assert_instance_of Time, feed.entries.last.date_published
|
73
73
|
end
|
74
74
|
|
@@ -108,7 +108,7 @@ class FeedNormalizerTest < Test::Unit::TestCase
|
|
108
108
|
no_diff = feed.diff(feed)
|
109
109
|
|
110
110
|
assert diff.keys.all? {|key| [:title, :items].include?(key)}
|
111
|
-
assert_equal
|
111
|
+
assert_equal 3, diff[:items].size
|
112
112
|
|
113
113
|
assert diff_short.keys.all? {|key| [:title, :items].include?(key)}
|
114
114
|
assert_equal [3,2], diff_short[:items]
|
@@ -144,28 +144,64 @@ class FeedNormalizerTest < Test::Unit::TestCase
|
|
144
144
|
end
|
145
145
|
|
146
146
|
def test_dublin_core_date_ruby_rss
|
147
|
-
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser)
|
147
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser, :try_others => false)
|
148
148
|
assert_equal 'RSS::Parser', feed.parser
|
149
149
|
assert_instance_of Time, feed.entries.first.date_published
|
150
150
|
end
|
151
151
|
|
152
152
|
def test_dublin_core_date_simple_rss
|
153
|
-
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser)
|
153
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser, :try_others => false)
|
154
154
|
assert_equal 'SimpleRSS', feed.parser
|
155
155
|
assert_instance_of Time, feed.entries.first.date_published
|
156
156
|
end
|
157
157
|
|
158
158
|
def test_dublin_core_creator_ruby_rss
|
159
|
-
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser)
|
159
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser, :try_others => false)
|
160
160
|
assert_equal 'RSS::Parser', feed.parser
|
161
161
|
assert_equal 'Jeff Hecht', feed.entries.last.author
|
162
162
|
end
|
163
163
|
|
164
164
|
def test_dublin_core_creator_simple_rss
|
165
|
-
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser)
|
165
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser, :try_others => false)
|
166
166
|
assert_equal 'SimpleRSS', feed.parser
|
167
167
|
assert_equal 'Jeff Hecht', feed.entries.last.author
|
168
168
|
end
|
169
169
|
|
170
|
+
def test_entry_categories_ruby_rss
|
171
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
|
172
|
+
assert_equal [['Click'],['Technology'],[]], feed.items.collect {|i|i.categories}
|
173
|
+
end
|
174
|
+
|
175
|
+
def test_entry_categories_simple_rss
|
176
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false)
|
177
|
+
assert_equal [['Click'],['Technology'],[]], feed.items.collect {|i|i.categories}
|
178
|
+
end
|
179
|
+
|
180
|
+
def test_loose_categories_ruby_rss
|
181
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false, :loose => true)
|
182
|
+
assert_equal [1,2,0], feed.entries.collect{|e|e.categories.size}
|
183
|
+
end
|
184
|
+
|
185
|
+
def test_loose_categories_simple_rss
|
186
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false, :loose => true)
|
187
|
+
assert_equal [1,1,0], feed.entries.collect{|e|e.categories.size}
|
188
|
+
end
|
189
|
+
|
190
|
+
def test_content_encoded_simple_rss
|
191
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false)
|
192
|
+
|
193
|
+
feed.entries.each_with_index do |e, i|
|
194
|
+
assert_match(/test#{i+1}/, e.content)
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
198
|
+
def test_content_encoded_ruby_rss
|
199
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
|
200
|
+
|
201
|
+
feed.entries.each_with_index do |e, i|
|
202
|
+
assert_match(/test#{i+1}/, e.content)
|
203
|
+
end
|
204
|
+
end
|
205
|
+
|
170
206
|
end
|
171
207
|
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
|
|
3
3
|
specification_version: 1
|
4
4
|
name: feed-normalizer
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 1.3.2
|
7
|
-
date: 2007-07-
|
6
|
+
version: 1.4.0
|
7
|
+
date: 2007-07-10 00:00:00 -07:00
|
8
8
|
summary: Extensible Ruby wrapper for Atom and RSS parsers
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -33,7 +33,7 @@ files:
|
|
33
33
|
- License.txt
|
34
34
|
- Manifest.txt
|
35
35
|
- Rakefile
|
36
|
-
- Readme.txt
|
36
|
+
- README.txt
|
37
37
|
- lib/feed-normalizer.rb
|
38
38
|
- lib/html-cleaner.rb
|
39
39
|
- lib/parsers/rss.rb
|
@@ -57,7 +57,7 @@ extra_rdoc_files:
|
|
57
57
|
- History.txt
|
58
58
|
- License.txt
|
59
59
|
- Manifest.txt
|
60
|
-
- Readme.txt
|
60
|
+
- README.txt
|
61
61
|
executables: []
|
62
62
|
|
63
63
|
extensions: []
|