feed_parser 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +5 -2
- data/Rakefile +12 -4
- data/feed_parser.gemspec +3 -4
- data/lib/feed_parser.rb +31 -3
- data/lib/feed_parser/feed.rb +2 -33
- data/spec/feed_parser_spec.rb +81 -83
- metadata +20 -9
data/README.md
CHANGED
@@ -23,7 +23,7 @@ Add to Gemfile
|
|
23
23
|
# sanitizing custom field set
|
24
24
|
fp = FeedParser.new(:url => "http://example.com/feed/", :sanitizer => MyBestestSanitizer.new, :fields_to_sanitize => [:title, :content])
|
25
25
|
|
26
|
-
#
|
26
|
+
# retrieve the feed xml and parse it
|
27
27
|
feed = fp.parse
|
28
28
|
|
29
29
|
# using parsed feed in your code
|
@@ -42,7 +42,10 @@ If the fetched XML is not a valid RSS or an ATOM feed, a FeedParser::UnknownFeed
|
|
42
42
|
|
43
43
|
## Running tests
|
44
44
|
|
45
|
-
Install dependencies
|
45
|
+
Install dependencies:
|
46
|
+
|
47
|
+
$ gem install bundler
|
48
|
+
$ bundle install
|
46
49
|
|
47
50
|
Run rspec tests:
|
48
51
|
|
data/Rakefile
CHANGED
@@ -19,9 +19,17 @@ end
|
|
19
19
|
desc "Default: Run specs"
|
20
20
|
task :default => :spec
|
21
21
|
|
22
|
-
namespace :
|
23
|
-
|
24
|
-
|
25
|
-
|
22
|
+
namespace :rubies do
|
23
|
+
rvm_rubies_command = "rvm 1.8.7-p302@feed_parser,1.9.3-p194@feed_parser do"
|
24
|
+
|
25
|
+
desc "Update dependencies for all Ruby versions"
|
26
|
+
task :update_dependencies do
|
27
|
+
system("#{rvm_rubies_command} bundle install")
|
28
|
+
system("#{rvm_rubies_command} bundle update")
|
29
|
+
end
|
30
|
+
|
31
|
+
desc "Run tests with Ruby versions 1.8.7 and 1.9.3"
|
32
|
+
task :spec do
|
33
|
+
system("#{rvm_rubies_command} bundle exec rake spec")
|
26
34
|
end
|
27
35
|
end
|
data/feed_parser.gemspec
CHANGED
@@ -7,19 +7,18 @@ require 'feed_parser'
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = 'feed_parser'
|
9
9
|
s.version = FeedParser::VERSION
|
10
|
-
s.platform = Gem::Platform::RUBY
|
11
10
|
s.authors = ['Arttu Tervo']
|
12
11
|
s.email = ['arttu.tervo@gmail.com']
|
13
|
-
s.homepage = '
|
12
|
+
s.homepage = 'https://github.com/arttu/feed_parser'
|
14
13
|
s.summary = %q{Rss and Atom feed parser}
|
15
14
|
s.description = %q{Rss and Atom feed parser with sanitizer support built on top of Nokogiri.}
|
16
15
|
|
17
16
|
s.add_dependency 'nokogiri'
|
18
17
|
|
19
|
-
s.add_development_dependency '
|
18
|
+
s.add_development_dependency 'rake', '>= 0.9'
|
19
|
+
s.add_development_dependency 'rspec', '>= 2.10'
|
20
20
|
|
21
21
|
s.extra_rdoc_files = %w[README.md]
|
22
|
-
s.require_paths = %w[lib]
|
23
22
|
|
24
23
|
s.files = `git ls-files`.split("\n")
|
25
24
|
s.test_files = `git ls-files -- spec/*`.split("\n")
|
data/lib/feed_parser.rb
CHANGED
@@ -3,15 +3,16 @@ require 'nokogiri'
|
|
3
3
|
|
4
4
|
class FeedParser
|
5
5
|
|
6
|
-
VERSION = "0.3.2"
|
6
|
+
VERSION = "0.3.3"
|
7
7
|
|
8
8
|
USER_AGENT = "Ruby / FeedParser gem"
|
9
9
|
|
10
10
|
class FeedParser::UnknownFeedType < Exception ; end
|
11
|
+
class FeedParser::InvalidURI < Exception ; end
|
11
12
|
|
12
13
|
def initialize(opts)
|
13
14
|
@url = opts[:url]
|
14
|
-
@http_options = opts[:http] || {}
|
15
|
+
@http_options = {"User-Agent" => FeedParser::USER_AGENT}.merge(opts[:http] || {})
|
15
16
|
@@sanitizer = (opts[:sanitizer] || SelfSanitizer.new)
|
16
17
|
@@fields_to_sanitize = (opts[:fields_to_sanitize] || [:content])
|
17
18
|
self
|
@@ -26,7 +27,34 @@ class FeedParser
|
|
26
27
|
end
|
27
28
|
|
28
29
|
def parse
|
29
|
-
|
30
|
+
feed_xml = open_or_follow_redirect(@url)
|
31
|
+
@feed ||= Feed.new(feed_xml)
|
32
|
+
end
|
33
|
+
|
34
|
+
private
|
35
|
+
|
36
|
+
def open_or_follow_redirect(feed_url)
|
37
|
+
uri = URI.parse(feed_url)
|
38
|
+
|
39
|
+
if uri.userinfo
|
40
|
+
@http_options[:http_basic_authentication] = [uri.user, uri.password].compact
|
41
|
+
uri.userinfo = uri.user = uri.password = nil
|
42
|
+
end
|
43
|
+
|
44
|
+
@http_options[:redirect] = true if RUBY_VERSION >= '1.9'
|
45
|
+
|
46
|
+
if ['http', 'https'].include?(uri.scheme)
|
47
|
+
open(uri.to_s, @http_options)
|
48
|
+
else
|
49
|
+
raise FeedParser::InvalidURI.new("Only URIs with http or https protocol are supported")
|
50
|
+
end
|
51
|
+
rescue RuntimeError => ex
|
52
|
+
redirect_url = ex.to_s.split(" ").last
|
53
|
+
if URI.parse(feed_url).scheme == "http" && URI.parse(redirect_url).scheme == "https"
|
54
|
+
open_or_follow_redirect(redirect_url)
|
55
|
+
else
|
56
|
+
raise ex
|
57
|
+
end
|
30
58
|
end
|
31
59
|
end
|
32
60
|
|
data/lib/feed_parser/feed.rb
CHANGED
@@ -2,10 +2,8 @@ class FeedParser
|
|
2
2
|
class Feed
|
3
3
|
attr_reader :type
|
4
4
|
|
5
|
-
def initialize(
|
6
|
-
@
|
7
|
-
raw_feed = open_or_follow_redirect(feed_url)
|
8
|
-
@feed = Nokogiri::XML(raw_feed)
|
5
|
+
def initialize(feed_xml)
|
6
|
+
@feed = Nokogiri::XML(feed_xml)
|
9
7
|
@feed.remove_namespaces!
|
10
8
|
@type = ((@feed.xpath('/rss')[0] && :rss) || (@feed.xpath('/feed')[0] && :atom))
|
11
9
|
raise FeedParser::UnknownFeedType.new("Feed is not an RSS feed or an ATOM feed") unless @type
|
@@ -44,34 +42,5 @@ class FeedParser
|
|
44
42
|
:items => items.map(&:as_json)
|
45
43
|
}
|
46
44
|
end
|
47
|
-
|
48
|
-
private
|
49
|
-
|
50
|
-
# Some feeds
|
51
|
-
def open_or_follow_redirect(feed_url)
|
52
|
-
parsed_url = URI.parse(feed_url)
|
53
|
-
|
54
|
-
connection_options = {"User-Agent" => FeedParser::USER_AGENT}
|
55
|
-
connection_options.merge!(@http_options)
|
56
|
-
if parsed_url.userinfo
|
57
|
-
connection_options[:http_basic_authentication] = [parsed_url.user, parsed_url.password].compact
|
58
|
-
parsed_url.userinfo = parsed_url.user = parsed_url.password = nil
|
59
|
-
end
|
60
|
-
|
61
|
-
connection_options[:redirect] = true if RUBY_VERSION >= '1.9'
|
62
|
-
|
63
|
-
if parsed_url.scheme
|
64
|
-
open(parsed_url.to_s, connection_options)
|
65
|
-
else
|
66
|
-
open(parsed_url.to_s)
|
67
|
-
end
|
68
|
-
rescue RuntimeError => ex
|
69
|
-
redirect_url = ex.to_s.split(" ").last
|
70
|
-
if URI.split(feed_url).first == "http" && URI.split(redirect_url).first == "https"
|
71
|
-
open_or_follow_redirect(redirect_url)
|
72
|
-
else
|
73
|
-
raise ex
|
74
|
-
end
|
75
|
-
end
|
76
45
|
end
|
77
46
|
end
|
data/spec/feed_parser_spec.rb
CHANGED
@@ -20,96 +20,104 @@ describe FeedParser do
|
|
20
20
|
|
21
21
|
describe "#new" do
|
22
22
|
it "should forward given http options to the OpenURI" do
|
23
|
-
FeedParser
|
23
|
+
FeedParser.any_instance.should_receive(:open).with("http://blog.example.com/feed/", http_connection_options.merge(:ssl_verify_mode => OpenSSL::SSL::VERIFY_NONE)).and_return(feed_xml)
|
24
24
|
fp = FeedParser.new(:url => "http://blog.example.com/feed/", :http => {:ssl_verify_mode => OpenSSL::SSL::VERIFY_NONE})
|
25
25
|
fp.parse
|
26
26
|
end
|
27
|
-
end
|
28
27
|
|
29
|
-
describe FeedParser::Feed, "#new" do
|
30
28
|
it "should fetch a feed by url" do
|
31
|
-
FeedParser
|
32
|
-
FeedParser
|
29
|
+
FeedParser.any_instance.should_receive(:open).with("http://blog.example.com/feed/", http_connection_options).and_return(feed_xml)
|
30
|
+
fp = FeedParser.new({:url => "http://blog.example.com/feed/"}.merge(http_connection_options))
|
31
|
+
fp.parse
|
33
32
|
end
|
34
33
|
|
35
34
|
it "should fetch a feed using basic auth if auth embedded to the url" do
|
36
|
-
FeedParser
|
37
|
-
FeedParser
|
35
|
+
FeedParser.any_instance.should_receive(:open).with("http://blog.example.com/feed/", http_connection_options.merge(:http_basic_authentication => ["user", "pass"])).and_return(feed_xml)
|
36
|
+
fp = FeedParser.new({:url => "http://user:pass@blog.example.com/feed/"}.merge(http_connection_options))
|
37
|
+
fp.parse
|
38
38
|
end
|
39
39
|
|
40
40
|
it "should fetch a feed with only a user name embedded to the url" do
|
41
|
-
FeedParser
|
42
|
-
FeedParser
|
41
|
+
FeedParser.any_instance.should_receive(:open).with("http://blog.example.com/feed/", http_connection_options.merge(:http_basic_authentication => ["user"])).and_return(feed_xml)
|
42
|
+
fp = FeedParser.new({:url => "http://user@blog.example.com/feed/"}.merge(http_connection_options))
|
43
|
+
fp.parse
|
43
44
|
end
|
44
45
|
|
45
|
-
it "should follow redirect based on the exception message" do
|
46
|
-
FeedParser
|
47
|
-
FeedParser
|
48
|
-
FeedParser
|
46
|
+
it "should follow redirect based on the exception message (even if OpenURI don't want to do it)" do
|
47
|
+
FeedParser.any_instance.should_receive(:open).with("http://example.com/feed", http_connection_options).and_raise(RuntimeError.new("redirection forbidden: http://example.com/feed -> https://example.com/feed"))
|
48
|
+
FeedParser.any_instance.should_receive(:open).with("https://example.com/feed", http_connection_options).and_return(feed_xml)
|
49
|
+
fp = FeedParser.new({:url => "http://example.com/feed"}.merge(http_connection_options))
|
50
|
+
fp.parse
|
49
51
|
end
|
50
52
|
|
51
|
-
it "should not follow redirect from secure connection to non-secure one" do
|
52
|
-
FeedParser
|
53
|
-
FeedParser
|
53
|
+
it "should not follow redirect from a secure connection to a non-secure one" do
|
54
|
+
FeedParser.any_instance.should_receive(:open).with("https://example.com/feed", http_connection_options).and_raise(RuntimeError.new("redirection forbidden: https://example.com/feed -> http://example.com/feed"))
|
55
|
+
FeedParser.any_instance.should_not_receive(:open).with("http://example.com/feed", http_connection_options)
|
54
56
|
lambda {
|
55
|
-
FeedParser
|
57
|
+
fp = FeedParser.new({:url => "https://example.com/feed"}.merge(http_connection_options))
|
58
|
+
fp.parse
|
56
59
|
}.should raise_error(RuntimeError, "redirection forbidden: https://example.com/feed -> http://example.com/feed")
|
57
60
|
end
|
58
61
|
|
59
|
-
it "should use alternate url if there is no valid self url in the received feed xml" do
|
60
|
-
FeedParser::Feed.any_instance.should_receive(:open).with("https://developers.facebook.com/blog/feed", http_connection_options).and_return(feed_xml('facebook.atom.xml'))
|
61
|
-
lambda {
|
62
|
-
feed = FeedParser::Feed.new("https://developers.facebook.com/blog/feed")
|
63
|
-
feed.url.should == "https://developers.facebook.com/blog/feed"
|
64
|
-
}.should_not raise_error
|
65
|
-
end
|
66
|
-
|
67
62
|
it "should raise an error unless retrieved XML is not an RSS or an ATOM feed" do
|
68
|
-
FeedParser
|
63
|
+
FeedParser.any_instance.should_receive(:open).with("http://example.com/blog/feed/invalid.xml", http_connection_options).and_return("foo bar")
|
69
64
|
lambda {
|
70
|
-
FeedParser
|
65
|
+
fp = FeedParser.new({:url => "http://example.com/blog/feed/invalid.xml"}.merge(http_connection_options))
|
66
|
+
fp.parse
|
71
67
|
}.should raise_error(FeedParser::UnknownFeedType, "Feed is not an RSS feed or an ATOM feed")
|
72
68
|
end
|
73
|
-
end
|
74
69
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
}.should_not raise_error
|
81
|
-
end
|
82
|
-
|
83
|
-
it "should populate every item" do
|
84
|
-
@feed = @feed_parser.parse
|
85
|
-
@feed.items.each do |item|
|
86
|
-
[:guid, :link, :title, :categories, :author, :content].each do |attribute|
|
87
|
-
item.send(attribute).should_not be_nil
|
88
|
-
item.send(attribute).should_not be_empty
|
89
|
-
end
|
90
|
-
end
|
91
|
-
end
|
70
|
+
it "should not allow feeds without http(s) protocol" do
|
71
|
+
lambda {
|
72
|
+
fp = FeedParser.new({:url => "feed://example.com/feed"}.merge(http_connection_options))
|
73
|
+
fp.parse
|
74
|
+
}.should raise_error(FeedParser::InvalidURI, "Only URIs with http or https protocol are supported")
|
92
75
|
end
|
76
|
+
end
|
93
77
|
|
94
|
-
|
78
|
+
describe "::Feed" do
|
79
|
+
def case_tester(feed, test_cases)
|
95
80
|
test_cases.each do |test_case|
|
96
81
|
if test_case.last.is_a?(Array)
|
97
82
|
test_case.last.each do |_case|
|
98
|
-
|
83
|
+
feed.as_json[test_case.first].should include(_case)
|
99
84
|
end
|
100
85
|
else
|
101
|
-
|
86
|
+
feed.send(test_case.first).should include(test_case.last)
|
102
87
|
end
|
103
88
|
end
|
104
89
|
end
|
105
90
|
|
91
|
+
describe "sanitizer" do
|
92
|
+
it "should sanitize with custom sanitizer" do
|
93
|
+
FeedParser.new(:url => "https://example.com/feed", :sanitizer => NotSaneSanitizer.new)
|
94
|
+
|
95
|
+
feed = FeedParser::Feed.new(feed_xml('sanitize.me.rss.xml'))
|
96
|
+
feed.items.first.content.should_not =~ (/flowdock/i)
|
97
|
+
end
|
98
|
+
|
99
|
+
it "should sanitize custom fields" do
|
100
|
+
FeedParser.new(:url => "https://example.com/feed", :sanitizer => NotSaneSanitizer.new, :fields_to_sanitize => [:author, :content])
|
101
|
+
|
102
|
+
feed = FeedParser::Feed.new(feed_xml('sanitize.me.rss.xml'))
|
103
|
+
feed.items.first.author.should == 'Sanitized'
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
106
107
|
describe "rss feeds" do
|
107
|
-
|
108
|
-
|
108
|
+
it "should be an rss feed" do
|
109
|
+
feed = FeedParser::Feed.new(feed_xml('nodeta.rss.xml'))
|
110
|
+
feed.type.should == :rss
|
109
111
|
end
|
110
112
|
|
111
|
-
|
112
|
-
|
113
|
+
it "should populate every item" do
|
114
|
+
feed = FeedParser::Feed.new(feed_xml('nodeta.rss.xml'))
|
115
|
+
feed.items.each do |item|
|
116
|
+
[:guid, :link, :title, :categories, :author, :content].each do |attribute|
|
117
|
+
item.send(attribute).should_not be_nil
|
118
|
+
item.send(attribute).should_not be_empty
|
119
|
+
end
|
120
|
+
end
|
113
121
|
end
|
114
122
|
|
115
123
|
{
|
@@ -165,40 +173,27 @@ describe FeedParser do
|
|
165
173
|
},
|
166
174
|
}.each do |rss_fixture, test_cases|
|
167
175
|
it "should parse #{rss_fixture}" do
|
168
|
-
|
169
|
-
|
170
|
-
@feed = @feed_parser.parse
|
176
|
+
feed = FeedParser::Feed.new(feed_xml(rss_fixture))
|
171
177
|
|
172
|
-
case_tester(test_cases)
|
178
|
+
case_tester(feed, test_cases)
|
173
179
|
end
|
174
180
|
end
|
175
|
-
|
176
|
-
it "should sanitize with custom sanitizer" do
|
177
|
-
@feed_parser = FeedParser.new(:url => File.join(File.dirname(__FILE__), 'fixtures', 'sanitize.me.rss.xml'), :sanitizer => NotSaneSanitizer.new)
|
178
|
-
|
179
|
-
@feed = @feed_parser.parse
|
180
|
-
|
181
|
-
@feed.items.first.content.should_not =~ (/flowdock/i)
|
182
|
-
end
|
183
|
-
|
184
|
-
it "should sanitize custom fields" do
|
185
|
-
@feed_parser = FeedParser.new(:url => File.join(File.dirname(__FILE__), 'fixtures', 'sanitize.me.rss.xml'), :sanitizer => NotSaneSanitizer.new, :fields_to_sanitize => [:author, :content])
|
186
|
-
|
187
|
-
@feed = @feed_parser.parse
|
188
|
-
|
189
|
-
@feed.items.first.author.should == 'Sanitized'
|
190
|
-
end
|
191
|
-
|
192
|
-
it_should_behave_like "feed parser"
|
193
181
|
end
|
194
182
|
|
195
183
|
describe "atom feeds" do
|
196
|
-
|
197
|
-
|
184
|
+
it "should be an atom feed" do
|
185
|
+
feed = FeedParser::Feed.new(feed_xml('smashingmagazine.atom.xml'))
|
186
|
+
feed.type.should == :atom
|
198
187
|
end
|
199
188
|
|
200
|
-
|
201
|
-
|
189
|
+
it "should populate every item" do
|
190
|
+
feed = FeedParser::Feed.new(feed_xml('smashingmagazine.atom.xml'))
|
191
|
+
feed.items.each do |item|
|
192
|
+
[:guid, :link, :title, :categories, :author, :content].each do |attribute|
|
193
|
+
item.send(attribute).should_not be_nil
|
194
|
+
item.send(attribute).should_not be_empty
|
195
|
+
end
|
196
|
+
end
|
202
197
|
end
|
203
198
|
|
204
199
|
{
|
@@ -230,15 +225,18 @@ describe FeedParser do
|
|
230
225
|
}
|
231
226
|
}.each do |atom_fixture, test_cases|
|
232
227
|
it "should parse #{atom_fixture}" do
|
233
|
-
|
234
|
-
|
235
|
-
@feed = @feed_parser.parse
|
228
|
+
feed = FeedParser::Feed.new(feed_xml(atom_fixture))
|
236
229
|
|
237
|
-
case_tester(test_cases)
|
230
|
+
case_tester(feed, test_cases)
|
238
231
|
end
|
239
232
|
end
|
240
233
|
|
241
|
-
|
234
|
+
it "should use alternate url if there is no valid self url in the received feed xml" do
|
235
|
+
lambda {
|
236
|
+
feed = FeedParser::Feed.new(feed_xml('facebook.atom.xml'))
|
237
|
+
feed.url.should == "https://developers.facebook.com/blog/feed"
|
238
|
+
}.should_not raise_error
|
239
|
+
end
|
242
240
|
end
|
243
241
|
end
|
244
242
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feed_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.3.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-06-26 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
16
|
-
requirement: &
|
16
|
+
requirement: &2153222140 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,18 +21,29 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2153222140
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
|
-
name:
|
27
|
-
requirement: &
|
26
|
+
name: rake
|
27
|
+
requirement: &2153221640 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
|
-
- -
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0.9'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *2153221640
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: rspec
|
38
|
+
requirement: &2153221140 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
31
42
|
- !ruby/object:Gem::Version
|
32
|
-
version: '2.
|
43
|
+
version: '2.10'
|
33
44
|
type: :development
|
34
45
|
prerelease: false
|
35
|
-
version_requirements: *
|
46
|
+
version_requirements: *2153221140
|
36
47
|
description: Rss and Atom feed parser with sanitizer support built on top of Nokogiri.
|
37
48
|
email:
|
38
49
|
- arttu.tervo@gmail.com
|
@@ -62,7 +73,7 @@ files:
|
|
62
73
|
- spec/fixtures/sanitize.me.rss.xml
|
63
74
|
- spec/fixtures/scrumalliance.rss.xml
|
64
75
|
- spec/fixtures/smashingmagazine.atom.xml
|
65
|
-
homepage:
|
76
|
+
homepage: https://github.com/arttu/feed_parser
|
66
77
|
licenses: []
|
67
78
|
post_install_message:
|
68
79
|
rdoc_options: []
|