feed-normalizer 1.3.2 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +10 -0
- data/License.txt +1 -1
- data/Manifest.txt +1 -1
- data/{Readme.txt → README.txt} +2 -2
- data/Rakefile +1 -1
- data/lib/feed-normalizer.rb +32 -7
- data/lib/html-cleaner.rb +2 -2
- data/lib/parsers/rss.rb +12 -4
- data/lib/parsers/simple-rss.rb +4 -4
- data/lib/structures.rb +2 -1
- data/test/data/rss20.xml +5 -2
- data/test/data/rss20diff.xml +0 -1
- data/test/test_feednormalizer.rb +42 -6
- metadata +4 -4
data/History.txt
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
1.4.0
|
2
|
+
|
3
|
+
* Support content:encoded. Accessible via Entry#content.
|
4
|
+
* Support categories. Accessible via Entry#categories.
|
5
|
+
* Introduces a new parsing feature 'loose parsing'. Use :loose => true
|
6
|
+
when parsing if the required output should retain extra data, rather
|
7
|
+
than drop it in the interests of 'lowest common denominator' normalization.
|
8
|
+
Currently affects how categories works. See the documentation in
|
9
|
+
FeedNormalizer#parse for more details.
|
10
|
+
|
1
11
|
1.3.2
|
2
12
|
|
3
13
|
* Add support for applicable dublin core elements. (dc:date and dc:creator)
|
data/License.txt
CHANGED
data/Manifest.txt
CHANGED
data/{Readme.txt → README.txt}
RENAMED
@@ -23,7 +23,7 @@ object graph, regardless of the underlying feed format.
|
|
23
23
|
feed.entries.first.url # => "http://www.iht.com/articles/2006/10/03/frontpage/web.1003UN.php"
|
24
24
|
|
25
25
|
feed.class # => FeedNormalizer::Feed
|
26
|
-
feed.parser # => RSS::Parser
|
26
|
+
feed.parser # => "RSS::Parser"
|
27
27
|
|
28
28
|
Now read an Atom feed, and the same class is returned, and the same terminology applies:
|
29
29
|
|
@@ -36,7 +36,7 @@ Now read an Atom feed, and the same class is returned, and the same terminology
|
|
36
36
|
The feed representation stays the same, even though a different parser was used.
|
37
37
|
|
38
38
|
feed.class # => FeedNormalizer::Feed
|
39
|
-
feed.parser # => SimpleRSS
|
39
|
+
feed.parser # => "SimpleRSS"
|
40
40
|
|
41
41
|
== Cleaning / Sanitizing
|
42
42
|
|
data/Rakefile
CHANGED
data/lib/feed-normalizer.rb
CHANGED
@@ -13,7 +13,7 @@ module FeedNormalizer
|
|
13
13
|
|
14
14
|
# Parses the given feed, and returns a normalized representation.
|
15
15
|
# Returns nil if the feed could not be parsed.
|
16
|
-
def self.parse(feed)
|
16
|
+
def self.parse(feed, loose)
|
17
17
|
nil
|
18
18
|
end
|
19
19
|
|
@@ -41,7 +41,10 @@ module FeedNormalizer
|
|
41
41
|
src[src_function]
|
42
42
|
end
|
43
43
|
|
44
|
-
|
44
|
+
unless value.to_s.empty?
|
45
|
+
append_or_set!(value, dest, dest_function)
|
46
|
+
break
|
47
|
+
end
|
45
48
|
end
|
46
49
|
|
47
50
|
end
|
@@ -85,24 +88,46 @@ module FeedNormalizer
|
|
85
88
|
class FeedNormalizer
|
86
89
|
|
87
90
|
# Parses the given xml and attempts to return a normalized Feed object.
|
88
|
-
# Setting force_parser to a suitable parser will mean that parser is
|
89
|
-
# used first, and if try_others is false, it is the only parser used,
|
90
|
-
# otherwise all parsers in the ParserRegistry are attempted
|
91
|
+
# Setting +force_parser+ to a suitable parser will mean that parser is
|
92
|
+
# used first, and if +try_others+ is false, it is the only parser used,
|
93
|
+
# otherwise all parsers in the ParserRegistry are attempted, in
|
91
94
|
# order of priority.
|
95
|
+
#
|
96
|
+
# ===Available options
|
97
|
+
#
|
98
|
+
# * <tt>:force_parser</tt> - instruct feed-normalizer to try the specified
|
99
|
+
# parser first. Takes a class, such as RubyRssParser, or SimpleRssParser.
|
100
|
+
#
|
101
|
+
# * <tt>:try_others</tt> - +true+ or +false+, defaults to +true+.
|
102
|
+
# If +true+, other parsers will be used as described above. The option
|
103
|
+
# is useful if combined with +force_parser+ to only use a single parser.
|
104
|
+
#
|
105
|
+
# * <tt>:loose</tt> - +true+ or +false+, defaults to +false+.
|
106
|
+
#
|
107
|
+
# Specifies parsing should be done loosely. This means that when
|
108
|
+
# feed-normalizer would usually throw away data in order to meet
|
109
|
+
# the requirement of keeping resulting feed outputs the same regardless
|
110
|
+
# of the underlying parser, the data will instead be kept. This currently
|
111
|
+
# affects the following items:
|
112
|
+
# * <em>Categories:</em> RSS allows for multiple categories per feed item.
|
113
|
+
# * <em>Limitation:</em> SimpleRSS can only return the first category
|
114
|
+
# for an item.
|
115
|
+
# * <em>Result:</em> When loose is true, the extra categories are kept,
|
116
|
+
# of course, only if the parser is not SimpleRSS.
|
92
117
|
def self.parse(xml, opts = {})
|
93
118
|
|
94
119
|
# Get a string ASAP, as multiple read()'s will start returning nil..
|
95
120
|
xml = xml.respond_to?(:read) ? xml.read : xml.to_s
|
96
121
|
|
97
122
|
if opts[:force_parser]
|
98
|
-
result = opts[:force_parser].parse(xml)
|
123
|
+
result = opts[:force_parser].parse(xml, opts[:loose])
|
99
124
|
|
100
125
|
return result if result
|
101
126
|
return nil if opts[:try_others] == false
|
102
127
|
end
|
103
128
|
|
104
129
|
ParserRegistry.parsers.each do |parser|
|
105
|
-
result = parser.parse(xml)
|
130
|
+
result = parser.parse(xml, opts[:loose])
|
106
131
|
return result if result
|
107
132
|
end
|
108
133
|
|
data/lib/html-cleaner.rb
CHANGED
@@ -165,7 +165,7 @@ module FeedNormalizer
|
|
165
165
|
end
|
166
166
|
|
167
167
|
|
168
|
-
module Enumerable
|
168
|
+
module Enumerable #:nodoc:
|
169
169
|
def build_hash
|
170
170
|
result = {}
|
171
171
|
self.each do |elt|
|
@@ -180,7 +180,7 @@ end
|
|
180
180
|
# Subject: A simple Hpricot text setter
|
181
181
|
# From: Chris Gehlker <canyonrat mac.com>
|
182
182
|
# Date: Fri, 11 Aug 2006 03:19:13 +0900
|
183
|
-
class Hpricot::Text
|
183
|
+
class Hpricot::Text #:nodoc:
|
184
184
|
def set(string)
|
185
185
|
@content = string
|
186
186
|
self.raw_string = string
|
data/lib/parsers/rss.rb
CHANGED
@@ -1,5 +1,10 @@
|
|
1
1
|
require 'rss'
|
2
2
|
|
3
|
+
# For some reason, this is only included in the RDF Item by default.
|
4
|
+
class RSS::Rss::Channel::Item # :nodoc:
|
5
|
+
include RSS::ContentModel
|
6
|
+
end
|
7
|
+
|
3
8
|
module FeedNormalizer
|
4
9
|
class RubyRssParser < Parser
|
5
10
|
|
@@ -7,7 +12,7 @@ module FeedNormalizer
|
|
7
12
|
RSS::Parser
|
8
13
|
end
|
9
14
|
|
10
|
-
def self.parse(xml)
|
15
|
+
def self.parse(xml, loose)
|
11
16
|
begin
|
12
17
|
rss = parser.parse(xml)
|
13
18
|
rescue Exception => e
|
@@ -15,7 +20,7 @@ module FeedNormalizer
|
|
15
20
|
return nil
|
16
21
|
end
|
17
22
|
|
18
|
-
rss ? package(rss) : nil
|
23
|
+
rss ? package(rss, loose) : nil
|
19
24
|
end
|
20
25
|
|
21
26
|
# Fairly high priority; a fast and strict parser.
|
@@ -25,7 +30,7 @@ module FeedNormalizer
|
|
25
30
|
|
26
31
|
protected
|
27
32
|
|
28
|
-
def self.package(rss)
|
33
|
+
def self.package(rss, loose)
|
29
34
|
feed = Feed.new(self)
|
30
35
|
|
31
36
|
# channel elements
|
@@ -52,7 +57,7 @@ module FeedNormalizer
|
|
52
57
|
:date_published => [:pubDate, :dc_date],
|
53
58
|
:urls => :link,
|
54
59
|
:description => :description,
|
55
|
-
:content => :description,
|
60
|
+
:content => [:content_encoded, :description],
|
56
61
|
:title => :title,
|
57
62
|
:authors => [:author, :dc_creator]
|
58
63
|
}
|
@@ -64,6 +69,9 @@ module FeedNormalizer
|
|
64
69
|
# custom item elements
|
65
70
|
feed_entry.id = rss_item.guid.content if rss_item.respond_to?(:guid) && rss_item.guid
|
66
71
|
feed_entry.copyright = rss.copyright if rss_item.respond_to? :copyright
|
72
|
+
feed_entry.categories = loose ?
|
73
|
+
rss_item.categories.collect{|c|c.content} :
|
74
|
+
[rss_item.categories.first.content] rescue []
|
67
75
|
|
68
76
|
feed.entries << feed_entry
|
69
77
|
end
|
data/lib/parsers/simple-rss.rb
CHANGED
@@ -9,7 +9,7 @@ module FeedNormalizer
|
|
9
9
|
SimpleRSS
|
10
10
|
end
|
11
11
|
|
12
|
-
def self.parse(xml)
|
12
|
+
def self.parse(xml, loose)
|
13
13
|
begin
|
14
14
|
atomrss = parser.parse(xml)
|
15
15
|
rescue Exception => e
|
@@ -53,9 +53,10 @@ module FeedNormalizer
|
|
53
53
|
:date_published => [:pubDate, :published, :dc_date],
|
54
54
|
:urls => :link,
|
55
55
|
:description => [:description, :summary],
|
56
|
-
:content => [:content, :description],
|
56
|
+
:content => [:content, :content_encoded, :description],
|
57
57
|
:title => :title,
|
58
|
-
:authors => [:author, :contributor, :dc_creator]
|
58
|
+
:authors => [:author, :contributor, :dc_creator],
|
59
|
+
:categories => :category
|
59
60
|
}
|
60
61
|
|
61
62
|
atomrss.entries.each do |atomrss_entry|
|
@@ -95,4 +96,3 @@ module FeedNormalizer
|
|
95
96
|
|
96
97
|
end
|
97
98
|
end
|
98
|
-
|
data/lib/structures.rb
CHANGED
@@ -121,7 +121,7 @@ module FeedNormalizer
|
|
121
121
|
include Singular, ElementEquality, ElementCleaner
|
122
122
|
|
123
123
|
HTML_ELEMENTS = [:content, :description, :title]
|
124
|
-
SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright]
|
124
|
+
SIMPLE_ELEMENTS = [:date_published, :urls, :id, :authors, :copyright, :categories]
|
125
125
|
BLENDED_ELEMENTS = []
|
126
126
|
|
127
127
|
ELEMENTS = HTML_ELEMENTS + SIMPLE_ELEMENTS + BLENDED_ELEMENTS
|
@@ -131,6 +131,7 @@ module FeedNormalizer
|
|
131
131
|
def initialize
|
132
132
|
@urls = []
|
133
133
|
@authors = []
|
134
|
+
@categories = []
|
134
135
|
end
|
135
136
|
|
136
137
|
end
|
data/test/data/rss20.xml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
<?xml version="1.0" encoding="ISO-8859-1" ?>
|
2
2
|
<?xml-stylesheet title="XSL_formatting" type="text/xsl" href="/shared/bsp/xsl/rss/nolsol.xsl"?>
|
3
|
-
<rss version="2.0">
|
3
|
+
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
4
4
|
<channel>
|
5
5
|
<title>BBC News | Technology | UK Edition</title>
|
6
6
|
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm</link>
|
@@ -20,6 +20,7 @@
|
|
20
20
|
<item>
|
21
21
|
<title>Concerns over security software</title>
|
22
22
|
<description>BBC Click investigates free security software and finds out who will protect PCs when Microsoft launches Vista.</description>
|
23
|
+
<content:encoded><![CDATA[<p>test1</p>]]></content:encoded>
|
23
24
|
<link>http://news.bbc.co.uk/go/rss/-/1/hi/programmes/click_online/5326654.stm</link>
|
24
25
|
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/programmes/click_online/5326654.stm</guid>
|
25
26
|
<pubDate>Sat, 09 Sep 2006 12:45:35 GMT</pubDate>
|
@@ -29,19 +30,21 @@
|
|
29
30
|
<item>
|
30
31
|
<title>Top prize for 'light' inventor</title>
|
31
32
|
<description>A Japanese scientist who invented a sustainable form of light is awarded the Millennium Technology Prize.</description>
|
33
|
+
<content:encoded><![CDATA[<p>test2</p>]]></content:encoded>
|
32
34
|
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5328446.stm</link>
|
33
35
|
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5328446.stm</guid>
|
34
36
|
<pubDate>Fri, 08 Sep 2006 16:18:08 GMT</pubDate>
|
35
37
|
<category>Technology</category>
|
38
|
+
<category>Japan</category>
|
36
39
|
</item>
|
37
40
|
|
38
41
|
<item>
|
39
42
|
<title>MP3 player court order overturned</title>
|
40
43
|
<description>SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.</description>
|
44
|
+
<content:encoded><![CDATA[<p>test3</p>]]></content:encoded>
|
41
45
|
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5326660.stm</link>
|
42
46
|
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5326660.stm</guid>
|
43
47
|
<pubDate>Fri, 08 Sep 2006 10:14:41 GMT</pubDate>
|
44
|
-
<category>Technology</category>
|
45
48
|
</item>
|
46
49
|
|
47
50
|
</channel>
|
data/test/data/rss20diff.xml
CHANGED
@@ -41,7 +41,6 @@
|
|
41
41
|
<link>http://news.bbc.co.uk/go/rss/-/1/hi/technology/5326660.stm</link>
|
42
42
|
<guid isPermaLink="false">http://news.bbc.co.uk/1/hi/technology/5326660.stm</guid>
|
43
43
|
<pubDate>Fri, 08 Sep 2006 10:14:41 GMT</pubDate>
|
44
|
-
<category>Technology</category>
|
45
44
|
</item>
|
46
45
|
|
47
46
|
</channel>
|
data/test/test_feednormalizer.rb
CHANGED
@@ -68,7 +68,7 @@ class FeedNormalizerTest < Test::Unit::TestCase
|
|
68
68
|
assert_equal ["http://news.bbc.co.uk/go/rss/-/1/hi/technology/default.stm"], feed.urls
|
69
69
|
assert_equal "MP3 player court order overturned", feed.entries.last.title
|
70
70
|
assert_equal "SanDisk puts its MP3 players back on display at a German electronics show after overturning a court injunction.", feed.entries.last.description
|
71
|
-
|
71
|
+
assert_match(/test\d/, feed.entries.last.content)
|
72
72
|
assert_instance_of Time, feed.entries.last.date_published
|
73
73
|
end
|
74
74
|
|
@@ -108,7 +108,7 @@ class FeedNormalizerTest < Test::Unit::TestCase
|
|
108
108
|
no_diff = feed.diff(feed)
|
109
109
|
|
110
110
|
assert diff.keys.all? {|key| [:title, :items].include?(key)}
|
111
|
-
assert_equal
|
111
|
+
assert_equal 3, diff[:items].size
|
112
112
|
|
113
113
|
assert diff_short.keys.all? {|key| [:title, :items].include?(key)}
|
114
114
|
assert_equal [3,2], diff_short[:items]
|
@@ -144,28 +144,64 @@ class FeedNormalizerTest < Test::Unit::TestCase
|
|
144
144
|
end
|
145
145
|
|
146
146
|
def test_dublin_core_date_ruby_rss
|
147
|
-
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser)
|
147
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser, :try_others => false)
|
148
148
|
assert_equal 'RSS::Parser', feed.parser
|
149
149
|
assert_instance_of Time, feed.entries.first.date_published
|
150
150
|
end
|
151
151
|
|
152
152
|
def test_dublin_core_date_simple_rss
|
153
|
-
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser)
|
153
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser, :try_others => false)
|
154
154
|
assert_equal 'SimpleRSS', feed.parser
|
155
155
|
assert_instance_of Time, feed.entries.first.date_published
|
156
156
|
end
|
157
157
|
|
158
158
|
def test_dublin_core_creator_ruby_rss
|
159
|
-
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser)
|
159
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => RubyRssParser, :try_others => false)
|
160
160
|
assert_equal 'RSS::Parser', feed.parser
|
161
161
|
assert_equal 'Jeff Hecht', feed.entries.last.author
|
162
162
|
end
|
163
163
|
|
164
164
|
def test_dublin_core_creator_simple_rss
|
165
|
-
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser)
|
165
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rdf10], :force_parser => SimpleRssParser, :try_others => false)
|
166
166
|
assert_equal 'SimpleRSS', feed.parser
|
167
167
|
assert_equal 'Jeff Hecht', feed.entries.last.author
|
168
168
|
end
|
169
169
|
|
170
|
+
def test_entry_categories_ruby_rss
|
171
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
|
172
|
+
assert_equal [['Click'],['Technology'],[]], feed.items.collect {|i|i.categories}
|
173
|
+
end
|
174
|
+
|
175
|
+
def test_entry_categories_simple_rss
|
176
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false)
|
177
|
+
assert_equal [['Click'],['Technology'],[]], feed.items.collect {|i|i.categories}
|
178
|
+
end
|
179
|
+
|
180
|
+
def test_loose_categories_ruby_rss
|
181
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false, :loose => true)
|
182
|
+
assert_equal [1,2,0], feed.entries.collect{|e|e.categories.size}
|
183
|
+
end
|
184
|
+
|
185
|
+
def test_loose_categories_simple_rss
|
186
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false, :loose => true)
|
187
|
+
assert_equal [1,1,0], feed.entries.collect{|e|e.categories.size}
|
188
|
+
end
|
189
|
+
|
190
|
+
def test_content_encoded_simple_rss
|
191
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => SimpleRssParser, :try_others => false)
|
192
|
+
|
193
|
+
feed.entries.each_with_index do |e, i|
|
194
|
+
assert_match(/test#{i+1}/, e.content)
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
198
|
+
def test_content_encoded_ruby_rss
|
199
|
+
feed = FeedNormalizer::FeedNormalizer.parse(XML_FILES[:rss20], :force_parser => RubyRssParser, :try_others => false)
|
200
|
+
|
201
|
+
feed.entries.each_with_index do |e, i|
|
202
|
+
assert_match(/test#{i+1}/, e.content)
|
203
|
+
end
|
204
|
+
end
|
205
|
+
|
170
206
|
end
|
171
207
|
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
|
|
3
3
|
specification_version: 1
|
4
4
|
name: feed-normalizer
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 1.3.2
|
7
|
-
date: 2007-07-
|
6
|
+
version: 1.4.0
|
7
|
+
date: 2007-07-10 00:00:00 -07:00
|
8
8
|
summary: Extensible Ruby wrapper for Atom and RSS parsers
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -33,7 +33,7 @@ files:
|
|
33
33
|
- License.txt
|
34
34
|
- Manifest.txt
|
35
35
|
- Rakefile
|
36
|
-
-
|
36
|
+
- README.txt
|
37
37
|
- lib/feed-normalizer.rb
|
38
38
|
- lib/html-cleaner.rb
|
39
39
|
- lib/parsers/rss.rb
|
@@ -57,7 +57,7 @@ extra_rdoc_files:
|
|
57
57
|
- History.txt
|
58
58
|
- License.txt
|
59
59
|
- Manifest.txt
|
60
|
-
-
|
60
|
+
- README.txt
|
61
61
|
executables: []
|
62
62
|
|
63
63
|
extensions: []
|