spix_parser 1.6.1 → 1.6.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/spix_parser/tools/feed_discovery/base.rb +3 -23
- data/lib/spix_parser/tools/feed_discovery/document.rb +106 -38
- data/lib/spix_parser/tools/feed_discovery/feed.rb +18 -4
- data/lib/spix_parser/tools/feed_discovery.rb +4 -2
- data/lib/spix_parser/version.rb +1 -1
- data/spec/spix_parser/tools/feed_discovery/document_spec.rb +42 -21
- data/spec/spix_parser/tools/feed_discovery_spec.rb +25 -9
- metadata +3 -4
- data/spec/spix_parser/tools/feed_list_spec.rb +0 -17
@@ -27,11 +27,8 @@ module Spix
|
|
27
27
|
end
|
28
28
|
private :document
|
29
29
|
|
30
|
-
def list
|
31
|
-
|
32
|
-
extract_feeds_from_links
|
33
|
-
include_it_self if feed?
|
34
|
-
items
|
30
|
+
def list &block
|
31
|
+
@document.feed_uris &block
|
35
32
|
end
|
36
33
|
|
37
34
|
def html?
|
@@ -39,24 +36,7 @@ module Spix
|
|
39
36
|
end
|
40
37
|
private :html?
|
41
38
|
|
42
|
-
def
|
43
|
-
document.feed_uris_from_anchors +
|
44
|
-
document.generic_uris_from_anchors.map { |uri|
|
45
|
-
FeedDiscovery::Document.new(uri).feed_uris_from_anchors
|
46
|
-
}.flatten.each { |uri|
|
47
|
-
items << Feed.new(uri, document.favicon)
|
48
|
-
}
|
49
|
-
end
|
50
|
-
private :extract_feeds_from_anchors
|
51
|
-
|
52
|
-
def extract_feeds_from_links
|
53
|
-
document.feed_uris_from_links.each { |uri|
|
54
|
-
items << Feed.new(uri, document.favicon)
|
55
|
-
}
|
56
|
-
end
|
57
|
-
private :extract_feeds_from_links
|
58
|
-
|
59
|
-
def include_it_self
|
39
|
+
def include_it_self &block
|
60
40
|
items << Feed.new(uri.to_s, document.favicon)
|
61
41
|
end
|
62
42
|
|
@@ -16,53 +16,81 @@ module Spix
|
|
16
16
|
open(@uri.to_s, req_headers).read
|
17
17
|
end
|
18
18
|
end
|
19
|
-
|
19
|
+
protected :content
|
20
20
|
|
21
|
-
def
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
uri = @uri.merge node.get_attribute 'href'
|
26
|
-
uri.to_s
|
27
|
-
}
|
28
|
-
end
|
29
|
-
|
30
|
-
def generic_uris_from_anchors
|
31
|
-
@document.search("a").select { |anchor|
|
32
|
-
not rss_or_atom_content_type_in? anchor
|
33
|
-
}.map { |node|
|
34
|
-
uri = @uri.merge node.get_attribute 'href'
|
35
|
-
uri.to_s
|
36
|
-
}
|
21
|
+
def feed_uris &block
|
22
|
+
items = feed_uris_from_links(&block) + feed_uris_from_anchors(&block)
|
23
|
+
items << feed_unsing_address(@uri, &block) if feed?
|
24
|
+
items
|
37
25
|
end
|
38
26
|
|
39
|
-
def feed_uris_from_links
|
27
|
+
def feed_uris_from_links &block
|
40
28
|
@document.search(
|
41
29
|
"link[@type='application/atom+xml']",
|
42
30
|
"link[@type='application/rss+xml']"
|
43
|
-
).map { |node|
|
44
|
-
|
45
|
-
|
31
|
+
).map { |node| feed_from node, &block }
|
32
|
+
end
|
33
|
+
private :feed_uris_from_links
|
34
|
+
|
35
|
+
def feed_uris_from_anchors &block
|
36
|
+
@document.search('a').select { |node|
|
37
|
+
valid_url_in? node
|
38
|
+
}.select { |node|
|
39
|
+
rss_or_atom_content_type_in? node
|
40
|
+
}.map { |node|
|
41
|
+
feed_from node, &block
|
46
42
|
}
|
47
43
|
end
|
44
|
+
private :feed_uris_from_anchors
|
45
|
+
|
46
|
+
def feed_from node, &block
|
47
|
+
uri = @uri.merge node.get_attribute 'href'
|
48
|
+
feed_unsing_address uri, &block
|
49
|
+
end
|
50
|
+
private :feed_from
|
51
|
+
|
52
|
+
def feed_unsing_address uri, &block
|
53
|
+
begin
|
54
|
+
Feed.new(uri, favicon).tap do |feed|
|
55
|
+
block.call feed if block_given?
|
56
|
+
end
|
57
|
+
rescue => error
|
58
|
+
error.tap do |e|
|
59
|
+
block.call e if block_given?
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
private :feed_unsing_address
|
48
64
|
|
65
|
+
def valid_url_in? anchor
|
66
|
+
uri = address_from anchor
|
67
|
+
uri.scheme != "javascript" and uri.to_s != "#" and (uri.relative? or uri.host == @uri.host)
|
68
|
+
end
|
69
|
+
private :valid_url_in?
|
70
|
+
|
49
71
|
def rss_or_atom_content_type_in? anchor
|
50
72
|
req, path = request_and_path_using address_from anchor
|
51
73
|
resp = req.request_head path
|
52
74
|
resp['content-type'] =~ /rss|atom/
|
75
|
+
rescue
|
76
|
+
true
|
53
77
|
end
|
54
78
|
private :rss_or_atom_content_type_in?
|
55
79
|
|
56
|
-
def request_and_path_using
|
57
|
-
uri = @uri.merge URI.parse address
|
80
|
+
def request_and_path_using uri
|
58
81
|
req = Net::HTTP.new uri.host, uri.port
|
59
|
-
|
60
|
-
return req, path.to_s
|
82
|
+
return req, path_from(uri) || uri.to_s
|
61
83
|
end
|
62
84
|
private :request_and_path_using
|
63
85
|
|
86
|
+
def path_from uri
|
87
|
+
path = uri - uri.select(:scheme, :host).join("://")
|
88
|
+
path.to_s unless path.to_s.blank?
|
89
|
+
end
|
90
|
+
private :path_from
|
91
|
+
|
64
92
|
def address_from node
|
65
|
-
node.get_attribute("href")
|
93
|
+
@uri.merge URI.parse node.get_attribute("href")
|
66
94
|
end
|
67
95
|
private :address_from
|
68
96
|
|
@@ -75,30 +103,70 @@ module Spix
|
|
75
103
|
end
|
76
104
|
|
77
105
|
def favicon
|
78
|
-
|
106
|
+
shortcut_in_document or shortcut_from_original_page or shortcut_from(base_uri) or default_favico_if_exist
|
79
107
|
end
|
80
108
|
|
81
|
-
def
|
82
|
-
shortcuts = @document
|
83
|
-
shortcuts.any? ? shortcuts : nil
|
109
|
+
def shortcut_in_document
|
110
|
+
shortcuts = find_shortcut_in @document
|
111
|
+
shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
|
84
112
|
end
|
85
|
-
private :
|
113
|
+
private :shortcut_in_document
|
114
|
+
|
115
|
+
def shortcut_from_original_page
|
116
|
+
if feed?
|
117
|
+
if node = @document.search('link').first
|
118
|
+
path = URI.parse node.content.strip
|
119
|
+
shortcut_from URI.parse path.select(:scheme, :host).join("://") rescue nil
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
123
|
+
private :shortcut_from_original_page
|
86
124
|
|
87
|
-
def
|
88
|
-
doc =
|
89
|
-
doc
|
125
|
+
def shortcut_from base_uri
|
126
|
+
doc = get base_uri
|
127
|
+
shortcuts = find_shortcut_in doc
|
128
|
+
shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
|
90
129
|
|
91
130
|
rescue Net::HTTPError, Net::HTTPFatalError
|
92
131
|
logger.warn "error opening favicon: #{$!}"
|
93
132
|
nil
|
94
133
|
end
|
95
|
-
private :
|
134
|
+
private :shortcut_from
|
135
|
+
|
136
|
+
def find_shortcut_in doc
|
137
|
+
doc.xpath(
|
138
|
+
'//link[contains(translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "shortcut")]',
|
139
|
+
'//link[contains(translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "ico")]'
|
140
|
+
).map { |node| node.get_attribute "href" }
|
141
|
+
end
|
96
142
|
|
97
|
-
def
|
98
|
-
|
143
|
+
def default_favico_if_exist
|
144
|
+
http = Net::HTTP.new base_uri.host, base_uri.port
|
145
|
+
resp = http.request_head 'favicon.ico'
|
146
|
+
base_uri.merge('favicon.ico').to_s unless resp.kind_of? Net::HTTPError
|
147
|
+
rescue
|
148
|
+
nil
|
149
|
+
end
|
150
|
+
private :default_favico_if_exist
|
151
|
+
|
152
|
+
def get uri
|
153
|
+
resp = Net::HTTP.get_response uri
|
154
|
+
doc = Nokogiri::HTML(resp.body)
|
155
|
+
if resp.kind_of?(Net::HTTPRedirection) or (refresh_metatags = Nokogiri::HTML(resp.body).search('meta[@http-equiv=REFRESH]')).any?
|
156
|
+
path = resp['location'] || refresh_metatags.first.get_attribute('content')[/http:\/\/.*/]
|
157
|
+
get URI.parse path
|
158
|
+
else
|
159
|
+
doc
|
160
|
+
end
|
161
|
+
rescue
|
162
|
+
Nokogiri::HTML('')
|
99
163
|
end
|
100
|
-
private :base_path
|
101
164
|
|
165
|
+
def base_uri
|
166
|
+
@base_uri ||= URI.parse @uri.select(:scheme, :host).join("://")
|
167
|
+
end
|
168
|
+
private :base_uri
|
169
|
+
|
102
170
|
end
|
103
171
|
end
|
104
172
|
end
|
@@ -1,15 +1,26 @@
|
|
1
1
|
module Spix
|
2
2
|
module FeedDiscovery
|
3
|
-
class Feed <
|
3
|
+
class Feed < Hash
|
4
4
|
|
5
5
|
def initialize url, favicon
|
6
|
-
self.url = url
|
6
|
+
self.url = url.to_s
|
7
7
|
self.favicon = favicon
|
8
8
|
self.title = get_title
|
9
9
|
end
|
10
10
|
|
11
|
+
%w[url favicon title].each do |attr|
|
12
|
+
define_method attr do
|
13
|
+
self[attr.to_sym]
|
14
|
+
end
|
15
|
+
|
16
|
+
define_method "#{attr}=" do |value|
|
17
|
+
self[attr.to_sym] = value
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
11
21
|
def get_title
|
12
|
-
content.search('title').first
|
22
|
+
node = content.search('title').first
|
23
|
+
node.content if node
|
13
24
|
end
|
14
25
|
private :get_title
|
15
26
|
|
@@ -22,9 +33,12 @@ module Spix
|
|
22
33
|
private :content
|
23
34
|
|
24
35
|
def uri
|
25
|
-
URI.parse url
|
36
|
+
@uri ||= URI.parse url
|
26
37
|
end
|
27
38
|
private :uri
|
39
|
+
|
40
|
+
def to_hash
|
41
|
+
end
|
28
42
|
end
|
29
43
|
end
|
30
44
|
end
|
data/lib/spix_parser/version.rb
CHANGED
@@ -4,22 +4,27 @@ describe Spix::FeedDiscovery::Document do
|
|
4
4
|
|
5
5
|
let(:document) { Spix::FeedDiscovery::Document.new @rss_uri }
|
6
6
|
|
7
|
-
describe '#
|
8
|
-
it 'should return only uris from anchors' do
|
9
|
-
document.feed_uris_from_anchors.should eql expected_uris_inside('a.has_feed')
|
10
|
-
end
|
11
|
-
end
|
7
|
+
describe '#feed_uris' do
|
12
8
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
9
|
+
context 'when the uri exists' do
|
10
|
+
|
11
|
+
it 'should return only feed uris' do
|
12
|
+
document.should_receive(:feed_uris).and_return expected_feeds
|
13
|
+
document.feed_uris
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'should yield feed with correct uri content' do
|
17
|
+
document.feed_uris do |feed|
|
18
|
+
expected_feeds.should include(feed)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
before :each do
|
23
|
+
stub_requests
|
24
|
+
end
|
18
25
|
|
19
|
-
describe '#generic_uris' do
|
20
|
-
it 'should return only ordinary uris from anchors' do
|
21
|
-
document.generic_uris_from_anchors.should eql expected_uris_inside('a.generic')
|
22
26
|
end
|
27
|
+
|
23
28
|
end
|
24
29
|
|
25
30
|
describe '#html?' do
|
@@ -34,6 +39,11 @@ describe Spix::FeedDiscovery::Document do
|
|
34
39
|
Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
|
35
40
|
document.html?.should eql false
|
36
41
|
end
|
42
|
+
|
43
|
+
before :each do
|
44
|
+
stub_requests
|
45
|
+
end
|
46
|
+
|
37
47
|
end
|
38
48
|
|
39
49
|
describe '#feed?' do
|
@@ -48,21 +58,32 @@ describe Spix::FeedDiscovery::Document do
|
|
48
58
|
Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
|
49
59
|
document.feed?.should eql false
|
50
60
|
end
|
61
|
+
|
62
|
+
before :each do
|
63
|
+
stub_requests
|
64
|
+
end
|
51
65
|
end
|
52
66
|
|
53
67
|
before :all do
|
54
|
-
@
|
68
|
+
@domain = 'http://diveintomark.org'
|
69
|
+
@rss_uri = @domain + '/rss_list.html'
|
55
70
|
@content = load_fixture("rss_list.html")
|
56
71
|
@document = Nokogiri::XML(@content)
|
57
72
|
end
|
58
73
|
|
59
|
-
before :each do
|
60
|
-
Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(@content)
|
61
|
-
FakeWeb.register_uri(:head, 'http://myfeed.com/has_feed.html', :content_type => 'application/atom+xml')
|
62
|
-
FakeWeb.register_uri(:head, 'http://myfeed.com/generic.html', :content_type => 'text/html' )
|
63
|
-
end
|
64
74
|
end
|
65
75
|
|
66
|
-
def
|
67
|
-
|
76
|
+
def stub_requests
|
77
|
+
Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(@content)
|
78
|
+
%w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].each { |path|
|
79
|
+
FakeWeb.register_uri(:head, @domain + '/' + path, :content_type => 'application/atom+xml')
|
80
|
+
FakeWeb.register_uri(:get, @domain + '/' + path, :body => load_fixture(path))
|
81
|
+
}
|
82
|
+
FakeWeb.register_uri(:head, @domain + '/generic.html', :content_type => 'text/html' )
|
83
|
+
end
|
84
|
+
|
85
|
+
def expected_feeds
|
86
|
+
@feeds ||= %w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].map { |path|
|
87
|
+
Spix::FeedDiscovery::Feed.new(@domain + '/' + path, @domain + '/images/favicon.ico')
|
88
|
+
}
|
68
89
|
end
|
@@ -4,7 +4,8 @@ describe Spix::FeedDiscovery, "#list" do
|
|
4
4
|
|
5
5
|
describe "when the feed have an absolute URI" do
|
6
6
|
it "should return the feed url" do
|
7
|
-
fake_requests_for :
|
7
|
+
fake_requests_for :ignore => ['/html4-002.html'],
|
8
|
+
:accept => ['/tests/client/autodiscovery/html4-001.xml'],
|
8
9
|
:resource_path => @domain_url,
|
9
10
|
:content => load_fixture("absolute_uri.html")
|
10
11
|
Spix::FeedDiscovery.list(@domain_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
|
@@ -14,7 +15,8 @@ describe Spix::FeedDiscovery, "#list" do
|
|
14
15
|
describe "when the feed have a relative URI" do
|
15
16
|
describe "which is relative to a path" do
|
16
17
|
it "should return the feed url when the URI is at the top domain" do
|
17
|
-
fake_requests_for :
|
18
|
+
fake_requests_for :ignore => ['/html4-003.html'],
|
19
|
+
:accept => ['/html4-002.xml'],
|
18
20
|
:resource_path => @domain_url,
|
19
21
|
:content => load_fixture("relative_uri.html")
|
20
22
|
Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/" + "html4-002.xml"
|
@@ -23,7 +25,8 @@ describe Spix::FeedDiscovery, "#list" do
|
|
23
25
|
it "should return the feed url when the URI is inside a path" do
|
24
26
|
@path_url = "/foo/bar/"
|
25
27
|
@feed_url = @domain_url + @path_url
|
26
|
-
fake_requests_for :
|
28
|
+
fake_requests_for :ignore => ['html4-003.html'],
|
29
|
+
:accept => ['html4-002.xml'],
|
27
30
|
:resource_path => @feed_url,
|
28
31
|
:content => load_fixture('relative_uri.html')
|
29
32
|
Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url + "html4-002.xml"
|
@@ -32,7 +35,8 @@ describe Spix::FeedDiscovery, "#list" do
|
|
32
35
|
|
33
36
|
describe "which is relative to the top domain" do
|
34
37
|
it "should return the feed url when the URI is at the top domain" do
|
35
|
-
fake_requests_for :
|
38
|
+
fake_requests_for :ignore => ['/html4-004.html'],
|
39
|
+
:accept => ['/html4-003.xml'],
|
36
40
|
:resource_path => @domain_url,
|
37
41
|
:content => load_fixture("relative_uri_top_domain.html")
|
38
42
|
Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
|
@@ -42,7 +46,8 @@ describe Spix::FeedDiscovery, "#list" do
|
|
42
46
|
@path_url = "/foo/bar/"
|
43
47
|
@feed_url = @domain_url + @path_url
|
44
48
|
|
45
|
-
fake_requests_for :
|
49
|
+
fake_requests_for :ignore => ['/html4-004.html'],
|
50
|
+
:accept => ['/html4-003.xml'],
|
46
51
|
:resource_path => @feed_url,
|
47
52
|
:content => load_fixture("relative_uri_top_domain.html")
|
48
53
|
Spix::FeedDiscovery.list(@feed_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
|
@@ -100,16 +105,27 @@ describe Spix::FeedDiscovery, "#list" do
|
|
100
105
|
end
|
101
106
|
|
102
107
|
before(:all) do
|
103
|
-
@domain_url = "http://
|
108
|
+
@domain_url = "http://diveintomark.org"
|
104
109
|
end
|
105
110
|
|
106
111
|
end
|
107
112
|
|
108
113
|
def fake_requests_for options = {}
|
114
|
+
|
109
115
|
content = options.delete(:content)
|
110
|
-
|
116
|
+
ignore = options.delete(:ignore)
|
117
|
+
accept = options.delete(:accept)
|
111
118
|
resource_path = options.delete(:resource_path)
|
119
|
+
|
112
120
|
FakeWeb.register_uri(:get, resource_path, :body => content)
|
113
|
-
|
114
|
-
|
121
|
+
|
122
|
+
ignore.each do |path|
|
123
|
+
FakeWeb.register_uri(:head, resource_path + path, :content => 'text/html')
|
124
|
+
end
|
125
|
+
|
126
|
+
accept.each do |path|
|
127
|
+
FakeWeb.register_uri(:head, resource_path + path, :content => 'application/atom+xml')
|
128
|
+
FakeWeb.register_uri(:get, resource_path + path, :body => load_fixture('feed.rss'))
|
129
|
+
end
|
130
|
+
|
115
131
|
end
|
metadata
CHANGED
@@ -2,9 +2,10 @@
|
|
2
2
|
name: spix_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.6.
|
5
|
+
version: 1.6.4
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
|
+
- Marcio Lopes de Faria
|
8
9
|
- Marcelo Eden
|
9
10
|
- Fabio Mont'Alegre
|
10
11
|
- "Lucas H\xC3\xBAngaro"
|
@@ -13,7 +14,7 @@ autorequire:
|
|
13
14
|
bindir: bin
|
14
15
|
cert_chain: []
|
15
16
|
|
16
|
-
date: 2011-
|
17
|
+
date: 2011-06-03 00:00:00 -03:00
|
17
18
|
default_executable:
|
18
19
|
dependencies:
|
19
20
|
- !ruby/object:Gem::Dependency
|
@@ -117,7 +118,6 @@ files:
|
|
117
118
|
- spec/spix_parser/tools/feed_discovery/document_spec.rb
|
118
119
|
- spec/spix_parser/tools/feed_discovery/feed_spec.rb
|
119
120
|
- spec/spix_parser/tools/feed_discovery_spec.rb
|
120
|
-
- spec/spix_parser/tools/feed_list_spec.rb
|
121
121
|
- spec/spix_parser/utils_spec.rb
|
122
122
|
has_rdoc: true
|
123
123
|
homepage: http://github.com/busk/spix_parser
|
@@ -153,5 +153,4 @@ test_files:
|
|
153
153
|
- spec/spix_parser/tools/feed_discovery/document_spec.rb
|
154
154
|
- spec/spix_parser/tools/feed_discovery/feed_spec.rb
|
155
155
|
- spec/spix_parser/tools/feed_discovery_spec.rb
|
156
|
-
- spec/spix_parser/tools/feed_list_spec.rb
|
157
156
|
- spec/spix_parser/utils_spec.rb
|
@@ -1,17 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Spix::FeedDiscoveryList do
|
4
|
-
|
5
|
-
let(:feed_discovery_list) { Spix::FeedDiscoveryList.new }
|
6
|
-
|
7
|
-
it "should inherit from array" do
|
8
|
-
feed_discovery_list.class.superclass.should == Array
|
9
|
-
end
|
10
|
-
|
11
|
-
describe "#invalids" do
|
12
|
-
it "should return an empty array from invalids accessor method" do
|
13
|
-
feed_discovery_list.invalids.should == []
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
end
|