spix_parser 1.6.1 → 1.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/spix_parser/tools/feed_discovery/base.rb +3 -23
- data/lib/spix_parser/tools/feed_discovery/document.rb +106 -38
- data/lib/spix_parser/tools/feed_discovery/feed.rb +18 -4
- data/lib/spix_parser/tools/feed_discovery.rb +4 -2
- data/lib/spix_parser/version.rb +1 -1
- data/spec/spix_parser/tools/feed_discovery/document_spec.rb +42 -21
- data/spec/spix_parser/tools/feed_discovery_spec.rb +25 -9
- metadata +3 -4
- data/spec/spix_parser/tools/feed_list_spec.rb +0 -17
@@ -27,11 +27,8 @@ module Spix
|
|
27
27
|
end
|
28
28
|
private :document
|
29
29
|
|
30
|
-
def list
|
31
|
-
|
32
|
-
extract_feeds_from_links
|
33
|
-
include_it_self if feed?
|
34
|
-
items
|
30
|
+
def list &block
|
31
|
+
@document.feed_uris &block
|
35
32
|
end
|
36
33
|
|
37
34
|
def html?
|
@@ -39,24 +36,7 @@ module Spix
|
|
39
36
|
end
|
40
37
|
private :html?
|
41
38
|
|
42
|
-
def extract_feeds_from_anchors
|
43
|
-
document.feed_uris_from_anchors +
|
44
|
-
document.generic_uris_from_anchors.map { |uri|
|
45
|
-
FeedDiscovery::Document.new(uri).feed_uris_from_anchors
|
46
|
-
}.flatten.each { |uri|
|
47
|
-
items << Feed.new(uri, document.favicon)
|
48
|
-
}
|
49
|
-
end
|
50
|
-
private :extract_feeds_from_anchors
|
51
|
-
|
52
|
-
def extract_feeds_from_links
|
53
|
-
document.feed_uris_from_links.each { |uri|
|
54
|
-
items << Feed.new(uri, document.favicon)
|
55
|
-
}
|
56
|
-
end
|
57
|
-
private :extract_feeds_from_links
|
58
|
-
|
59
|
-
def include_it_self
|
39
|
+
def include_it_self &block
|
60
40
|
items << Feed.new(uri.to_s, document.favicon)
|
61
41
|
end
|
62
42
|
|
@@ -16,53 +16,81 @@ module Spix
|
|
16
16
|
open(@uri.to_s, req_headers).read
|
17
17
|
end
|
18
18
|
end
|
19
|
-
|
19
|
+
protected :content
|
20
20
|
|
21
|
-
def
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
uri = @uri.merge node.get_attribute 'href'
|
26
|
-
uri.to_s
|
27
|
-
}
|
28
|
-
end
|
29
|
-
|
30
|
-
def generic_uris_from_anchors
|
31
|
-
@document.search("a").select { |anchor|
|
32
|
-
not rss_or_atom_content_type_in? anchor
|
33
|
-
}.map { |node|
|
34
|
-
uri = @uri.merge node.get_attribute 'href'
|
35
|
-
uri.to_s
|
36
|
-
}
|
21
|
+
def feed_uris &block
|
22
|
+
items = feed_uris_from_links(&block) + feed_uris_from_anchors(&block)
|
23
|
+
items << feed_unsing_address(@uri, &block) if feed?
|
24
|
+
items
|
37
25
|
end
|
38
26
|
|
39
|
-
def feed_uris_from_links
|
27
|
+
def feed_uris_from_links &block
|
40
28
|
@document.search(
|
41
29
|
"link[@type='application/atom+xml']",
|
42
30
|
"link[@type='application/rss+xml']"
|
43
|
-
).map { |node|
|
44
|
-
|
45
|
-
|
31
|
+
).map { |node| feed_from node, &block }
|
32
|
+
end
|
33
|
+
private :feed_uris_from_links
|
34
|
+
|
35
|
+
def feed_uris_from_anchors &block
|
36
|
+
@document.search('a').select { |node|
|
37
|
+
valid_url_in? node
|
38
|
+
}.select { |node|
|
39
|
+
rss_or_atom_content_type_in? node
|
40
|
+
}.map { |node|
|
41
|
+
feed_from node, &block
|
46
42
|
}
|
47
43
|
end
|
44
|
+
private :feed_uris_from_anchors
|
45
|
+
|
46
|
+
def feed_from node, &block
|
47
|
+
uri = @uri.merge node.get_attribute 'href'
|
48
|
+
feed_unsing_address uri, &block
|
49
|
+
end
|
50
|
+
private :feed_from
|
51
|
+
|
52
|
+
def feed_unsing_address uri, &block
|
53
|
+
begin
|
54
|
+
Feed.new(uri, favicon).tap do |feed|
|
55
|
+
block.call feed if block_given?
|
56
|
+
end
|
57
|
+
rescue => error
|
58
|
+
error.tap do |e|
|
59
|
+
block.call e if block_given?
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
private :feed_unsing_address
|
48
64
|
|
65
|
+
def valid_url_in? anchor
|
66
|
+
uri = address_from anchor
|
67
|
+
uri.scheme != "javascript" and uri.to_s != "#" and (uri.relative? or uri.host == @uri.host)
|
68
|
+
end
|
69
|
+
private :valid_url_in?
|
70
|
+
|
49
71
|
def rss_or_atom_content_type_in? anchor
|
50
72
|
req, path = request_and_path_using address_from anchor
|
51
73
|
resp = req.request_head path
|
52
74
|
resp['content-type'] =~ /rss|atom/
|
75
|
+
rescue
|
76
|
+
true
|
53
77
|
end
|
54
78
|
private :rss_or_atom_content_type_in?
|
55
79
|
|
56
|
-
def request_and_path_using
|
57
|
-
uri = @uri.merge URI.parse address
|
80
|
+
def request_and_path_using uri
|
58
81
|
req = Net::HTTP.new uri.host, uri.port
|
59
|
-
|
60
|
-
return req, path.to_s
|
82
|
+
return req, path_from(uri) || uri.to_s
|
61
83
|
end
|
62
84
|
private :request_and_path_using
|
63
85
|
|
86
|
+
def path_from uri
|
87
|
+
path = uri - uri.select(:scheme, :host).join("://")
|
88
|
+
path.to_s unless path.to_s.blank?
|
89
|
+
end
|
90
|
+
private :path_from
|
91
|
+
|
64
92
|
def address_from node
|
65
|
-
node.get_attribute("href")
|
93
|
+
@uri.merge URI.parse node.get_attribute("href")
|
66
94
|
end
|
67
95
|
private :address_from
|
68
96
|
|
@@ -75,30 +103,70 @@ module Spix
|
|
75
103
|
end
|
76
104
|
|
77
105
|
def favicon
|
78
|
-
|
106
|
+
shortcut_in_document or shortcut_from_original_page or shortcut_from(base_uri) or default_favico_if_exist
|
79
107
|
end
|
80
108
|
|
81
|
-
def
|
82
|
-
shortcuts = @document
|
83
|
-
shortcuts.any? ? shortcuts : nil
|
109
|
+
def shortcut_in_document
|
110
|
+
shortcuts = find_shortcut_in @document
|
111
|
+
shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
|
84
112
|
end
|
85
|
-
private :
|
113
|
+
private :shortcut_in_document
|
114
|
+
|
115
|
+
def shortcut_from_original_page
|
116
|
+
if feed?
|
117
|
+
if node = @document.search('link').first
|
118
|
+
path = URI.parse node.content.strip
|
119
|
+
shortcut_from URI.parse path.select(:scheme, :host).join("://") rescue nil
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
123
|
+
private :shortcut_from_original_page
|
86
124
|
|
87
|
-
def
|
88
|
-
doc =
|
89
|
-
doc
|
125
|
+
def shortcut_from base_uri
|
126
|
+
doc = get base_uri
|
127
|
+
shortcuts = find_shortcut_in doc
|
128
|
+
shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
|
90
129
|
|
91
130
|
rescue Net::HTTPError, Net::HTTPFatalError
|
92
131
|
logger.warn "error opening favicon: #{$!}"
|
93
132
|
nil
|
94
133
|
end
|
95
|
-
private :
|
134
|
+
private :shortcut_from
|
135
|
+
|
136
|
+
def find_shortcut_in doc
|
137
|
+
doc.xpath(
|
138
|
+
'//link[contains(translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "shortcut")]',
|
139
|
+
'//link[contains(translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "ico")]'
|
140
|
+
).map { |node| node.get_attribute "href" }
|
141
|
+
end
|
96
142
|
|
97
|
-
def base_path
|
98
|
-
|
143
|
+
def default_favico_if_exist
|
144
|
+
http = Net::HTTP.new base_uri.host, base_uri.port
|
145
|
+
resp = http.request_head 'favicon.ico'
|
146
|
+
base_uri.merge('favicon.ico').to_s unless resp.kind_of? Net::HTTPError
|
147
|
+
rescue
|
148
|
+
nil
|
149
|
+
end
|
150
|
+
private :default_favico_if_exist
|
151
|
+
|
152
|
+
def get uri
|
153
|
+
resp = Net::HTTP.get_response uri
|
154
|
+
doc = Nokogiri::HTML(resp.body)
|
155
|
+
if resp.kind_of?(Net::HTTPRedirection) or (refresh_metatags = Nokogiri::HTML(resp.body).search('meta[@http-equiv=REFRESH]')).any?
|
156
|
+
path = resp['location'] || refresh_metatags.first.get_attribute('content')[/http:\/\/.*/]
|
157
|
+
get URI.parse path
|
158
|
+
else
|
159
|
+
doc
|
160
|
+
end
|
161
|
+
rescue
|
162
|
+
Nokogiri::HTML('')
|
99
163
|
end
|
100
|
-
private :base_path
|
101
164
|
|
165
|
+
def base_uri
|
166
|
+
@base_uri ||= URI.parse @uri.select(:scheme, :host).join("://")
|
167
|
+
end
|
168
|
+
private :base_uri
|
169
|
+
|
102
170
|
end
|
103
171
|
end
|
104
172
|
end
|
@@ -1,15 +1,26 @@
|
|
1
1
|
module Spix
|
2
2
|
module FeedDiscovery
|
3
|
-
class Feed <
|
3
|
+
class Feed < Hash
|
4
4
|
|
5
5
|
def initialize url, favicon
|
6
|
-
self.url = url
|
6
|
+
self.url = url.to_s
|
7
7
|
self.favicon = favicon
|
8
8
|
self.title = get_title
|
9
9
|
end
|
10
10
|
|
11
|
+
%w[url favicon title].each do |attr|
|
12
|
+
define_method attr do
|
13
|
+
self[attr.to_sym]
|
14
|
+
end
|
15
|
+
|
16
|
+
define_method "#{attr}=" do |value|
|
17
|
+
self[attr.to_sym] = value
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
11
21
|
def get_title
|
12
|
-
content.search('title').first
|
22
|
+
node = content.search('title').first
|
23
|
+
node.content if node
|
13
24
|
end
|
14
25
|
private :get_title
|
15
26
|
|
@@ -22,9 +33,12 @@ module Spix
|
|
22
33
|
private :content
|
23
34
|
|
24
35
|
def uri
|
25
|
-
URI.parse url
|
36
|
+
@uri ||= URI.parse url
|
26
37
|
end
|
27
38
|
private :uri
|
39
|
+
|
40
|
+
def to_hash
|
41
|
+
end
|
28
42
|
end
|
29
43
|
end
|
30
44
|
end
|
data/lib/spix_parser/version.rb
CHANGED
@@ -4,22 +4,27 @@ describe Spix::FeedDiscovery::Document do
|
|
4
4
|
|
5
5
|
let(:document) { Spix::FeedDiscovery::Document.new @rss_uri }
|
6
6
|
|
7
|
-
describe '#
|
8
|
-
it 'should return only uris from anchors' do
|
9
|
-
document.feed_uris_from_anchors.should eql expected_uris_inside('a.has_feed')
|
10
|
-
end
|
11
|
-
end
|
7
|
+
describe '#feed_uris' do
|
12
8
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
9
|
+
context 'when the uri exists' do
|
10
|
+
|
11
|
+
it 'should return only feed uris' do
|
12
|
+
document.should_receive(:feed_uris).and_return expected_feeds
|
13
|
+
document.feed_uris
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'should yield feed with correct uri content' do
|
17
|
+
document.feed_uris do |feed|
|
18
|
+
expected_feeds.should include(feed)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
before :each do
|
23
|
+
stub_requests
|
24
|
+
end
|
18
25
|
|
19
|
-
describe '#generic_uris' do
|
20
|
-
it 'should return only ordinary uris from anchors' do
|
21
|
-
document.generic_uris_from_anchors.should eql expected_uris_inside('a.generic')
|
22
26
|
end
|
27
|
+
|
23
28
|
end
|
24
29
|
|
25
30
|
describe '#html?' do
|
@@ -34,6 +39,11 @@ describe Spix::FeedDiscovery::Document do
|
|
34
39
|
Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
|
35
40
|
document.html?.should eql false
|
36
41
|
end
|
42
|
+
|
43
|
+
before :each do
|
44
|
+
stub_requests
|
45
|
+
end
|
46
|
+
|
37
47
|
end
|
38
48
|
|
39
49
|
describe '#feed?' do
|
@@ -48,21 +58,32 @@ describe Spix::FeedDiscovery::Document do
|
|
48
58
|
Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(content)
|
49
59
|
document.feed?.should eql false
|
50
60
|
end
|
61
|
+
|
62
|
+
before :each do
|
63
|
+
stub_requests
|
64
|
+
end
|
51
65
|
end
|
52
66
|
|
53
67
|
before :all do
|
54
|
-
@
|
68
|
+
@domain = 'http://diveintomark.org'
|
69
|
+
@rss_uri = @domain + '/rss_list.html'
|
55
70
|
@content = load_fixture("rss_list.html")
|
56
71
|
@document = Nokogiri::XML(@content)
|
57
72
|
end
|
58
73
|
|
59
|
-
before :each do
|
60
|
-
Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(@content)
|
61
|
-
FakeWeb.register_uri(:head, 'http://myfeed.com/has_feed.html', :content_type => 'application/atom+xml')
|
62
|
-
FakeWeb.register_uri(:head, 'http://myfeed.com/generic.html', :content_type => 'text/html' )
|
63
|
-
end
|
64
74
|
end
|
65
75
|
|
66
|
-
def
|
67
|
-
|
76
|
+
def stub_requests
|
77
|
+
Spix::FeedDiscovery::Document.any_instance.stub(:content).and_return(@content)
|
78
|
+
%w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].each { |path|
|
79
|
+
FakeWeb.register_uri(:head, @domain + '/' + path, :content_type => 'application/atom+xml')
|
80
|
+
FakeWeb.register_uri(:get, @domain + '/' + path, :body => load_fixture(path))
|
81
|
+
}
|
82
|
+
FakeWeb.register_uri(:head, @domain + '/generic.html', :content_type => 'text/html' )
|
83
|
+
end
|
84
|
+
|
85
|
+
def expected_feeds
|
86
|
+
@feeds ||= %w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].map { |path|
|
87
|
+
Spix::FeedDiscovery::Feed.new(@domain + '/' + path, @domain + '/images/favicon.ico')
|
88
|
+
}
|
68
89
|
end
|
@@ -4,7 +4,8 @@ describe Spix::FeedDiscovery, "#list" do
|
|
4
4
|
|
5
5
|
describe "when the feed have an absolute URI" do
|
6
6
|
it "should return the feed url" do
|
7
|
-
fake_requests_for :
|
7
|
+
fake_requests_for :ignore => ['/html4-002.html'],
|
8
|
+
:accept => ['/tests/client/autodiscovery/html4-001.xml'],
|
8
9
|
:resource_path => @domain_url,
|
9
10
|
:content => load_fixture("absolute_uri.html")
|
10
11
|
Spix::FeedDiscovery.list(@domain_url).first[:url].should == "http://diveintomark.org/tests/client/autodiscovery/html4-001.xml"
|
@@ -14,7 +15,8 @@ describe Spix::FeedDiscovery, "#list" do
|
|
14
15
|
describe "when the feed have a relative URI" do
|
15
16
|
describe "which is relative to a path" do
|
16
17
|
it "should return the feed url when the URI is at the top domain" do
|
17
|
-
fake_requests_for :
|
18
|
+
fake_requests_for :ignore => ['/html4-003.html'],
|
19
|
+
:accept => ['/html4-002.xml'],
|
18
20
|
:resource_path => @domain_url,
|
19
21
|
:content => load_fixture("relative_uri.html")
|
20
22
|
Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/" + "html4-002.xml"
|
@@ -23,7 +25,8 @@ describe Spix::FeedDiscovery, "#list" do
|
|
23
25
|
it "should return the feed url when the URI is inside a path" do
|
24
26
|
@path_url = "/foo/bar/"
|
25
27
|
@feed_url = @domain_url + @path_url
|
26
|
-
fake_requests_for :
|
28
|
+
fake_requests_for :ignore => ['html4-003.html'],
|
29
|
+
:accept => ['html4-002.xml'],
|
27
30
|
:resource_path => @feed_url,
|
28
31
|
:content => load_fixture('relative_uri.html')
|
29
32
|
Spix::FeedDiscovery.list(@feed_url).first[:url].should == @feed_url + "html4-002.xml"
|
@@ -32,7 +35,8 @@ describe Spix::FeedDiscovery, "#list" do
|
|
32
35
|
|
33
36
|
describe "which is relative to the top domain" do
|
34
37
|
it "should return the feed url when the URI is at the top domain" do
|
35
|
-
fake_requests_for :
|
38
|
+
fake_requests_for :ignore => ['/html4-004.html'],
|
39
|
+
:accept => ['/html4-003.xml'],
|
36
40
|
:resource_path => @domain_url,
|
37
41
|
:content => load_fixture("relative_uri_top_domain.html")
|
38
42
|
Spix::FeedDiscovery.list(@domain_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
|
@@ -42,7 +46,8 @@ describe Spix::FeedDiscovery, "#list" do
|
|
42
46
|
@path_url = "/foo/bar/"
|
43
47
|
@feed_url = @domain_url + @path_url
|
44
48
|
|
45
|
-
fake_requests_for :
|
49
|
+
fake_requests_for :ignore => ['/html4-004.html'],
|
50
|
+
:accept => ['/html4-003.xml'],
|
46
51
|
:resource_path => @feed_url,
|
47
52
|
:content => load_fixture("relative_uri_top_domain.html")
|
48
53
|
Spix::FeedDiscovery.list(@feed_url).first[:url].should == @domain_url + "/tests/client/autodiscovery/html4-003.xml"
|
@@ -100,16 +105,27 @@ describe Spix::FeedDiscovery, "#list" do
|
|
100
105
|
end
|
101
106
|
|
102
107
|
before(:all) do
|
103
|
-
@domain_url = "http://
|
108
|
+
@domain_url = "http://diveintomark.org"
|
104
109
|
end
|
105
110
|
|
106
111
|
end
|
107
112
|
|
108
113
|
def fake_requests_for options = {}
|
114
|
+
|
109
115
|
content = options.delete(:content)
|
110
|
-
|
116
|
+
ignore = options.delete(:ignore)
|
117
|
+
accept = options.delete(:accept)
|
111
118
|
resource_path = options.delete(:resource_path)
|
119
|
+
|
112
120
|
FakeWeb.register_uri(:get, resource_path, :body => content)
|
113
|
-
|
114
|
-
|
121
|
+
|
122
|
+
ignore.each do |path|
|
123
|
+
FakeWeb.register_uri(:head, resource_path + path, :content => 'text/html')
|
124
|
+
end
|
125
|
+
|
126
|
+
accept.each do |path|
|
127
|
+
FakeWeb.register_uri(:head, resource_path + path, :content => 'application/atom+xml')
|
128
|
+
FakeWeb.register_uri(:get, resource_path + path, :body => load_fixture('feed.rss'))
|
129
|
+
end
|
130
|
+
|
115
131
|
end
|
metadata
CHANGED
@@ -2,9 +2,10 @@
|
|
2
2
|
name: spix_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.6.1
|
5
|
+
version: 1.6.4
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
|
+
- Marcio Lopes de Faria
|
8
9
|
- Marcelo Eden
|
9
10
|
- Fabio Mont'Alegre
|
10
11
|
- "Lucas H\xC3\xBAngaro"
|
@@ -13,7 +14,7 @@ autorequire:
|
|
13
14
|
bindir: bin
|
14
15
|
cert_chain: []
|
15
16
|
|
16
|
-
date: 2011-
|
17
|
+
date: 2011-06-03 00:00:00 -03:00
|
17
18
|
default_executable:
|
18
19
|
dependencies:
|
19
20
|
- !ruby/object:Gem::Dependency
|
@@ -117,7 +118,6 @@ files:
|
|
117
118
|
- spec/spix_parser/tools/feed_discovery/document_spec.rb
|
118
119
|
- spec/spix_parser/tools/feed_discovery/feed_spec.rb
|
119
120
|
- spec/spix_parser/tools/feed_discovery_spec.rb
|
120
|
-
- spec/spix_parser/tools/feed_list_spec.rb
|
121
121
|
- spec/spix_parser/utils_spec.rb
|
122
122
|
has_rdoc: true
|
123
123
|
homepage: http://github.com/busk/spix_parser
|
@@ -153,5 +153,4 @@ test_files:
|
|
153
153
|
- spec/spix_parser/tools/feed_discovery/document_spec.rb
|
154
154
|
- spec/spix_parser/tools/feed_discovery/feed_spec.rb
|
155
155
|
- spec/spix_parser/tools/feed_discovery_spec.rb
|
156
|
-
- spec/spix_parser/tools/feed_list_spec.rb
|
157
156
|
- spec/spix_parser/utils_spec.rb
|
@@ -1,17 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Spix::FeedDiscoveryList do
|
4
|
-
|
5
|
-
let(:feed_discovery_list) { Spix::FeedDiscoveryList.new }
|
6
|
-
|
7
|
-
it "should inherit from array" do
|
8
|
-
feed_discovery_list.class.superclass.should == Array
|
9
|
-
end
|
10
|
-
|
11
|
-
describe "#invalids" do
|
12
|
-
it "should return an empty array from invalids accessor method" do
|
13
|
-
feed_discovery_list.invalids.should == []
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
end
|