feed_parser 0.3.2 → 0.3.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +5 -2
- data/Rakefile +12 -4
- data/feed_parser.gemspec +3 -4
- data/lib/feed_parser.rb +31 -3
- data/lib/feed_parser/feed.rb +2 -33
- data/spec/feed_parser_spec.rb +81 -83
- metadata +20 -9
data/README.md
CHANGED
@@ -23,7 +23,7 @@ Add to Gemfile
|
|
23
23
|
# sanitizing custom field set
|
24
24
|
fp = FeedParser.new(:url => "http://example.com/feed/", :sanitizer => MyBestestSanitizer.new, :fields_to_sanitize => [:title, :content])
|
25
25
|
|
26
|
-
#
|
26
|
+
# retrieve the feed xml and parse it
|
27
27
|
feed = fp.parse
|
28
28
|
|
29
29
|
# using parsed feed in your code
|
@@ -42,7 +42,10 @@ If the fetched XML is not a valid RSS or an ATOM feed, a FeedParser::UnknownFeed
|
|
42
42
|
|
43
43
|
## Running tests
|
44
44
|
|
45
|
-
Install dependencies
|
45
|
+
Install dependencies:
|
46
|
+
|
47
|
+
$ gem install bundler
|
48
|
+
$ bundle install
|
46
49
|
|
47
50
|
Run rspec tests:
|
48
51
|
|
data/Rakefile
CHANGED
@@ -19,9 +19,17 @@ end
|
|
19
19
|
desc "Default: Run specs"
|
20
20
|
task :default => :spec
|
21
21
|
|
22
|
-
namespace :
|
23
|
-
|
24
|
-
|
25
|
-
|
22
|
+
namespace :rubies do
|
23
|
+
rvm_rubies_command = "rvm 1.8.7-p302@feed_parser,1.9.3-p194@feed_parser do"
|
24
|
+
|
25
|
+
desc "Update dependencies for all Ruby versions"
|
26
|
+
task :update_dependencies do
|
27
|
+
system("#{rvm_rubies_command} bundle install")
|
28
|
+
system("#{rvm_rubies_command} bundle update")
|
29
|
+
end
|
30
|
+
|
31
|
+
desc "Run tests with Ruby versions 1.8.7 and 1.9.3"
|
32
|
+
task :spec do
|
33
|
+
system("#{rvm_rubies_command} bundle exec rake spec")
|
26
34
|
end
|
27
35
|
end
|
data/feed_parser.gemspec
CHANGED
@@ -7,19 +7,18 @@ require 'feed_parser'
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = 'feed_parser'
|
9
9
|
s.version = FeedParser::VERSION
|
10
|
-
s.platform = Gem::Platform::RUBY
|
11
10
|
s.authors = ['Arttu Tervo']
|
12
11
|
s.email = ['arttu.tervo@gmail.com']
|
13
|
-
s.homepage = '
|
12
|
+
s.homepage = 'https://github.com/arttu/feed_parser'
|
14
13
|
s.summary = %q{Rss and Atom feed parser}
|
15
14
|
s.description = %q{Rss and Atom feed parser with sanitizer support built on top of Nokogiri.}
|
16
15
|
|
17
16
|
s.add_dependency 'nokogiri'
|
18
17
|
|
19
|
-
s.add_development_dependency '
|
18
|
+
s.add_development_dependency 'rake', '>= 0.9'
|
19
|
+
s.add_development_dependency 'rspec', '>= 2.10'
|
20
20
|
|
21
21
|
s.extra_rdoc_files = %w[README.md]
|
22
|
-
s.require_paths = %w[lib]
|
23
22
|
|
24
23
|
s.files = `git ls-files`.split("\n")
|
25
24
|
s.test_files = `git ls-files -- spec/*`.split("\n")
|
data/lib/feed_parser.rb
CHANGED
@@ -3,15 +3,16 @@ require 'nokogiri'
|
|
3
3
|
|
4
4
|
class FeedParser
|
5
5
|
|
6
|
-
VERSION = "0.3.2"
|
6
|
+
VERSION = "0.3.3"
|
7
7
|
|
8
8
|
USER_AGENT = "Ruby / FeedParser gem"
|
9
9
|
|
10
10
|
class FeedParser::UnknownFeedType < Exception ; end
|
11
|
+
class FeedParser::InvalidURI < Exception ; end
|
11
12
|
|
12
13
|
def initialize(opts)
|
13
14
|
@url = opts[:url]
|
14
|
-
@http_options = opts[:http] || {}
|
15
|
+
@http_options = {"User-Agent" => FeedParser::USER_AGENT}.merge(opts[:http] || {})
|
15
16
|
@@sanitizer = (opts[:sanitizer] || SelfSanitizer.new)
|
16
17
|
@@fields_to_sanitize = (opts[:fields_to_sanitize] || [:content])
|
17
18
|
self
|
@@ -26,7 +27,34 @@ class FeedParser
|
|
26
27
|
end
|
27
28
|
|
28
29
|
def parse
|
29
|
-
|
30
|
+
feed_xml = open_or_follow_redirect(@url)
|
31
|
+
@feed ||= Feed.new(feed_xml)
|
32
|
+
end
|
33
|
+
|
34
|
+
private
|
35
|
+
|
36
|
+
def open_or_follow_redirect(feed_url)
|
37
|
+
uri = URI.parse(feed_url)
|
38
|
+
|
39
|
+
if uri.userinfo
|
40
|
+
@http_options[:http_basic_authentication] = [uri.user, uri.password].compact
|
41
|
+
uri.userinfo = uri.user = uri.password = nil
|
42
|
+
end
|
43
|
+
|
44
|
+
@http_options[:redirect] = true if RUBY_VERSION >= '1.9'
|
45
|
+
|
46
|
+
if ['http', 'https'].include?(uri.scheme)
|
47
|
+
open(uri.to_s, @http_options)
|
48
|
+
else
|
49
|
+
raise FeedParser::InvalidURI.new("Only URIs with http or https protocol are supported")
|
50
|
+
end
|
51
|
+
rescue RuntimeError => ex
|
52
|
+
redirect_url = ex.to_s.split(" ").last
|
53
|
+
if URI.parse(feed_url).scheme == "http" && URI.parse(redirect_url).scheme == "https"
|
54
|
+
open_or_follow_redirect(redirect_url)
|
55
|
+
else
|
56
|
+
raise ex
|
57
|
+
end
|
30
58
|
end
|
31
59
|
end
|
32
60
|
|
data/lib/feed_parser/feed.rb
CHANGED
@@ -2,10 +2,8 @@ class FeedParser
|
|
2
2
|
class Feed
|
3
3
|
attr_reader :type
|
4
4
|
|
5
|
-
def initialize(
|
6
|
-
@
|
7
|
-
raw_feed = open_or_follow_redirect(feed_url)
|
8
|
-
@feed = Nokogiri::XML(raw_feed)
|
5
|
+
def initialize(feed_xml)
|
6
|
+
@feed = Nokogiri::XML(feed_xml)
|
9
7
|
@feed.remove_namespaces!
|
10
8
|
@type = ((@feed.xpath('/rss')[0] && :rss) || (@feed.xpath('/feed')[0] && :atom))
|
11
9
|
raise FeedParser::UnknownFeedType.new("Feed is not an RSS feed or an ATOM feed") unless @type
|
@@ -44,34 +42,5 @@ class FeedParser
|
|
44
42
|
:items => items.map(&:as_json)
|
45
43
|
}
|
46
44
|
end
|
47
|
-
|
48
|
-
private
|
49
|
-
|
50
|
-
# Some feeds
|
51
|
-
def open_or_follow_redirect(feed_url)
|
52
|
-
parsed_url = URI.parse(feed_url)
|
53
|
-
|
54
|
-
connection_options = {"User-Agent" => FeedParser::USER_AGENT}
|
55
|
-
connection_options.merge!(@http_options)
|
56
|
-
if parsed_url.userinfo
|
57
|
-
connection_options[:http_basic_authentication] = [parsed_url.user, parsed_url.password].compact
|
58
|
-
parsed_url.userinfo = parsed_url.user = parsed_url.password = nil
|
59
|
-
end
|
60
|
-
|
61
|
-
connection_options[:redirect] = true if RUBY_VERSION >= '1.9'
|
62
|
-
|
63
|
-
if parsed_url.scheme
|
64
|
-
open(parsed_url.to_s, connection_options)
|
65
|
-
else
|
66
|
-
open(parsed_url.to_s)
|
67
|
-
end
|
68
|
-
rescue RuntimeError => ex
|
69
|
-
redirect_url = ex.to_s.split(" ").last
|
70
|
-
if URI.split(feed_url).first == "http" && URI.split(redirect_url).first == "https"
|
71
|
-
open_or_follow_redirect(redirect_url)
|
72
|
-
else
|
73
|
-
raise ex
|
74
|
-
end
|
75
|
-
end
|
76
45
|
end
|
77
46
|
end
|
data/spec/feed_parser_spec.rb
CHANGED
@@ -20,96 +20,104 @@ describe FeedParser do
|
|
20
20
|
|
21
21
|
describe "#new" do
|
22
22
|
it "should forward given http options to the OpenURI" do
|
23
|
-
FeedParser
|
23
|
+
FeedParser.any_instance.should_receive(:open).with("http://blog.example.com/feed/", http_connection_options.merge(:ssl_verify_mode => OpenSSL::SSL::VERIFY_NONE)).and_return(feed_xml)
|
24
24
|
fp = FeedParser.new(:url => "http://blog.example.com/feed/", :http => {:ssl_verify_mode => OpenSSL::SSL::VERIFY_NONE})
|
25
25
|
fp.parse
|
26
26
|
end
|
27
|
-
end
|
28
27
|
|
29
|
-
describe FeedParser::Feed, "#new" do
|
30
28
|
it "should fetch a feed by url" do
|
31
|
-
FeedParser
|
32
|
-
FeedParser
|
29
|
+
FeedParser.any_instance.should_receive(:open).with("http://blog.example.com/feed/", http_connection_options).and_return(feed_xml)
|
30
|
+
fp = FeedParser.new({:url => "http://blog.example.com/feed/"}.merge(http_connection_options))
|
31
|
+
fp.parse
|
33
32
|
end
|
34
33
|
|
35
34
|
it "should fetch a feed using basic auth if auth embedded to the url" do
|
36
|
-
FeedParser
|
37
|
-
FeedParser
|
35
|
+
FeedParser.any_instance.should_receive(:open).with("http://blog.example.com/feed/", http_connection_options.merge(:http_basic_authentication => ["user", "pass"])).and_return(feed_xml)
|
36
|
+
fp = FeedParser.new({:url => "http://user:pass@blog.example.com/feed/"}.merge(http_connection_options))
|
37
|
+
fp.parse
|
38
38
|
end
|
39
39
|
|
40
40
|
it "should fetch a feed with only a user name embedded to the url" do
|
41
|
-
FeedParser
|
42
|
-
FeedParser
|
41
|
+
FeedParser.any_instance.should_receive(:open).with("http://blog.example.com/feed/", http_connection_options.merge(:http_basic_authentication => ["user"])).and_return(feed_xml)
|
42
|
+
fp = FeedParser.new({:url => "http://user@blog.example.com/feed/"}.merge(http_connection_options))
|
43
|
+
fp.parse
|
43
44
|
end
|
44
45
|
|
45
|
-
it "should follow redirect based on the exception message" do
|
46
|
-
FeedParser
|
47
|
-
FeedParser
|
48
|
-
FeedParser
|
46
|
+
it "should follow redirect based on the exception message (even if OpenURI don't want to do it)" do
|
47
|
+
FeedParser.any_instance.should_receive(:open).with("http://example.com/feed", http_connection_options).and_raise(RuntimeError.new("redirection forbidden: http://example.com/feed -> https://example.com/feed"))
|
48
|
+
FeedParser.any_instance.should_receive(:open).with("https://example.com/feed", http_connection_options).and_return(feed_xml)
|
49
|
+
fp = FeedParser.new({:url => "http://example.com/feed"}.merge(http_connection_options))
|
50
|
+
fp.parse
|
49
51
|
end
|
50
52
|
|
51
|
-
it "should not follow redirect from secure connection to non-secure one" do
|
52
|
-
FeedParser
|
53
|
-
FeedParser
|
53
|
+
it "should not follow redirect from a secure connection to a non-secure one" do
|
54
|
+
FeedParser.any_instance.should_receive(:open).with("https://example.com/feed", http_connection_options).and_raise(RuntimeError.new("redirection forbidden: https://example.com/feed -> http://example.com/feed"))
|
55
|
+
FeedParser.any_instance.should_not_receive(:open).with("http://example.com/feed", http_connection_options)
|
54
56
|
lambda {
|
55
|
-
FeedParser
|
57
|
+
fp = FeedParser.new({:url => "https://example.com/feed"}.merge(http_connection_options))
|
58
|
+
fp.parse
|
56
59
|
}.should raise_error(RuntimeError, "redirection forbidden: https://example.com/feed -> http://example.com/feed")
|
57
60
|
end
|
58
61
|
|
59
|
-
it "should use alternate url if there is no valid self url in the received feed xml" do
|
60
|
-
FeedParser::Feed.any_instance.should_receive(:open).with("https://developers.facebook.com/blog/feed", http_connection_options).and_return(feed_xml('facebook.atom.xml'))
|
61
|
-
lambda {
|
62
|
-
feed = FeedParser::Feed.new("https://developers.facebook.com/blog/feed")
|
63
|
-
feed.url.should == "https://developers.facebook.com/blog/feed"
|
64
|
-
}.should_not raise_error
|
65
|
-
end
|
66
|
-
|
67
62
|
it "should raise an error unless retrieved XML is not an RSS or an ATOM feed" do
|
68
|
-
FeedParser
|
63
|
+
FeedParser.any_instance.should_receive(:open).with("http://example.com/blog/feed/invalid.xml", http_connection_options).and_return("foo bar")
|
69
64
|
lambda {
|
70
|
-
FeedParser
|
65
|
+
fp = FeedParser.new({:url => "http://example.com/blog/feed/invalid.xml"}.merge(http_connection_options))
|
66
|
+
fp.parse
|
71
67
|
}.should raise_error(FeedParser::UnknownFeedType, "Feed is not an RSS feed or an ATOM feed")
|
72
68
|
end
|
73
|
-
end
|
74
69
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
}.should_not raise_error
|
81
|
-
end
|
82
|
-
|
83
|
-
it "should populate every item" do
|
84
|
-
@feed = @feed_parser.parse
|
85
|
-
@feed.items.each do |item|
|
86
|
-
[:guid, :link, :title, :categories, :author, :content].each do |attribute|
|
87
|
-
item.send(attribute).should_not be_nil
|
88
|
-
item.send(attribute).should_not be_empty
|
89
|
-
end
|
90
|
-
end
|
91
|
-
end
|
70
|
+
it "should not allow feeds without http(s) protocol" do
|
71
|
+
lambda {
|
72
|
+
fp = FeedParser.new({:url => "feed://example.com/feed"}.merge(http_connection_options))
|
73
|
+
fp.parse
|
74
|
+
}.should raise_error(FeedParser::InvalidURI, "Only URIs with http or https protocol are supported")
|
92
75
|
end
|
76
|
+
end
|
93
77
|
|
94
|
-
|
78
|
+
describe "::Feed" do
|
79
|
+
def case_tester(feed, test_cases)
|
95
80
|
test_cases.each do |test_case|
|
96
81
|
if test_case.last.is_a?(Array)
|
97
82
|
test_case.last.each do |_case|
|
98
|
-
|
83
|
+
feed.as_json[test_case.first].should include(_case)
|
99
84
|
end
|
100
85
|
else
|
101
|
-
|
86
|
+
feed.send(test_case.first).should include(test_case.last)
|
102
87
|
end
|
103
88
|
end
|
104
89
|
end
|
105
90
|
|
91
|
+
describe "sanitizer" do
|
92
|
+
it "should sanitize with custom sanitizer" do
|
93
|
+
FeedParser.new(:url => "https://example.com/feed", :sanitizer => NotSaneSanitizer.new)
|
94
|
+
|
95
|
+
feed = FeedParser::Feed.new(feed_xml('sanitize.me.rss.xml'))
|
96
|
+
feed.items.first.content.should_not =~ (/flowdock/i)
|
97
|
+
end
|
98
|
+
|
99
|
+
it "should sanitize custom fields" do
|
100
|
+
FeedParser.new(:url => "https://example.com/feed", :sanitizer => NotSaneSanitizer.new, :fields_to_sanitize => [:author, :content])
|
101
|
+
|
102
|
+
feed = FeedParser::Feed.new(feed_xml('sanitize.me.rss.xml'))
|
103
|
+
feed.items.first.author.should == 'Sanitized'
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
106
107
|
describe "rss feeds" do
|
107
|
-
|
108
|
-
|
108
|
+
it "should be an rss feed" do
|
109
|
+
feed = FeedParser::Feed.new(feed_xml('nodeta.rss.xml'))
|
110
|
+
feed.type.should == :rss
|
109
111
|
end
|
110
112
|
|
111
|
-
|
112
|
-
|
113
|
+
it "should populate every item" do
|
114
|
+
feed = FeedParser::Feed.new(feed_xml('nodeta.rss.xml'))
|
115
|
+
feed.items.each do |item|
|
116
|
+
[:guid, :link, :title, :categories, :author, :content].each do |attribute|
|
117
|
+
item.send(attribute).should_not be_nil
|
118
|
+
item.send(attribute).should_not be_empty
|
119
|
+
end
|
120
|
+
end
|
113
121
|
end
|
114
122
|
|
115
123
|
{
|
@@ -165,40 +173,27 @@ describe FeedParser do
|
|
165
173
|
},
|
166
174
|
}.each do |rss_fixture, test_cases|
|
167
175
|
it "should parse #{rss_fixture}" do
|
168
|
-
|
169
|
-
|
170
|
-
@feed = @feed_parser.parse
|
176
|
+
feed = FeedParser::Feed.new(feed_xml(rss_fixture))
|
171
177
|
|
172
|
-
case_tester(test_cases)
|
178
|
+
case_tester(feed, test_cases)
|
173
179
|
end
|
174
180
|
end
|
175
|
-
|
176
|
-
it "should sanitize with custom sanitizer" do
|
177
|
-
@feed_parser = FeedParser.new(:url => File.join(File.dirname(__FILE__), 'fixtures', 'sanitize.me.rss.xml'), :sanitizer => NotSaneSanitizer.new)
|
178
|
-
|
179
|
-
@feed = @feed_parser.parse
|
180
|
-
|
181
|
-
@feed.items.first.content.should_not =~ (/flowdock/i)
|
182
|
-
end
|
183
|
-
|
184
|
-
it "should sanitize custom fields" do
|
185
|
-
@feed_parser = FeedParser.new(:url => File.join(File.dirname(__FILE__), 'fixtures', 'sanitize.me.rss.xml'), :sanitizer => NotSaneSanitizer.new, :fields_to_sanitize => [:author, :content])
|
186
|
-
|
187
|
-
@feed = @feed_parser.parse
|
188
|
-
|
189
|
-
@feed.items.first.author.should == 'Sanitized'
|
190
|
-
end
|
191
|
-
|
192
|
-
it_should_behave_like "feed parser"
|
193
181
|
end
|
194
182
|
|
195
183
|
describe "atom feeds" do
|
196
|
-
|
197
|
-
|
184
|
+
it "should be an atom feed" do
|
185
|
+
feed = FeedParser::Feed.new(feed_xml('smashingmagazine.atom.xml'))
|
186
|
+
feed.type.should == :atom
|
198
187
|
end
|
199
188
|
|
200
|
-
|
201
|
-
|
189
|
+
it "should populate every item" do
|
190
|
+
feed = FeedParser::Feed.new(feed_xml('smashingmagazine.atom.xml'))
|
191
|
+
feed.items.each do |item|
|
192
|
+
[:guid, :link, :title, :categories, :author, :content].each do |attribute|
|
193
|
+
item.send(attribute).should_not be_nil
|
194
|
+
item.send(attribute).should_not be_empty
|
195
|
+
end
|
196
|
+
end
|
202
197
|
end
|
203
198
|
|
204
199
|
{
|
@@ -230,15 +225,18 @@ describe FeedParser do
|
|
230
225
|
}
|
231
226
|
}.each do |atom_fixture, test_cases|
|
232
227
|
it "should parse #{atom_fixture}" do
|
233
|
-
|
234
|
-
|
235
|
-
@feed = @feed_parser.parse
|
228
|
+
feed = FeedParser::Feed.new(feed_xml(atom_fixture))
|
236
229
|
|
237
|
-
case_tester(test_cases)
|
230
|
+
case_tester(feed, test_cases)
|
238
231
|
end
|
239
232
|
end
|
240
233
|
|
241
|
-
|
234
|
+
it "should use alternate url if there is no valid self url in the received feed xml" do
|
235
|
+
lambda {
|
236
|
+
feed = FeedParser::Feed.new(feed_xml('facebook.atom.xml'))
|
237
|
+
feed.url.should == "https://developers.facebook.com/blog/feed"
|
238
|
+
}.should_not raise_error
|
239
|
+
end
|
242
240
|
end
|
243
241
|
end
|
244
242
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feed_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.3.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-06-26 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
16
|
-
requirement: &
|
16
|
+
requirement: &2153222140 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,18 +21,29 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2153222140
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
|
-
name:
|
27
|
-
requirement: &
|
26
|
+
name: rake
|
27
|
+
requirement: &2153221640 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
|
-
- -
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0.9'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *2153221640
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: rspec
|
38
|
+
requirement: &2153221140 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
31
42
|
- !ruby/object:Gem::Version
|
32
|
-
version: '2.
|
43
|
+
version: '2.10'
|
33
44
|
type: :development
|
34
45
|
prerelease: false
|
35
|
-
version_requirements: *
|
46
|
+
version_requirements: *2153221140
|
36
47
|
description: Rss and Atom feed parser with sanitizer support built on top of Nokogiri.
|
37
48
|
email:
|
38
49
|
- arttu.tervo@gmail.com
|
@@ -62,7 +73,7 @@ files:
|
|
62
73
|
- spec/fixtures/sanitize.me.rss.xml
|
63
74
|
- spec/fixtures/scrumalliance.rss.xml
|
64
75
|
- spec/fixtures/smashingmagazine.atom.xml
|
65
|
-
homepage:
|
76
|
+
homepage: https://github.com/arttu/feed_parser
|
66
77
|
licenses: []
|
67
78
|
post_install_message:
|
68
79
|
rdoc_options: []
|