spix_parser 1.6.6 → 1.6.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/spix_parser/tools/feed_discovery.rb +5 -2
- data/lib/spix_parser/tools/feed_discovery/base.rb +4 -0
- data/lib/spix_parser/tools/feed_discovery/document.rb +100 -36
- data/lib/spix_parser/tools/feed_discovery/feed.rb +33 -22
- data/lib/spix_parser/version.rb +1 -1
- data/spec/spix_parser/tools/feed_discovery/document_spec.rb +17 -13
- data/spec/spix_parser/tools/feed_discovery/feed_spec.rb +4 -7
- metadata +2 -2
@@ -2,86 +2,146 @@ module Spix
|
|
2
2
|
module FeedDiscovery
|
3
3
|
class Document
|
4
4
|
|
5
|
-
def initialize
|
6
|
-
@uri =
|
7
|
-
@document = Nokogiri::XML(content)
|
5
|
+
def initialize uri_name
|
6
|
+
@uri = parse_uri uri_name
|
8
7
|
end
|
9
8
|
|
10
|
-
def
|
11
|
-
|
12
|
-
|
9
|
+
def load_content
|
10
|
+
@document ||= content_from @uri.to_s
|
11
|
+
end
|
12
|
+
|
13
|
+
def content_from uri_name, limit=10
|
14
|
+
raise ArgumentError, 'HTTP redirect too deep' if limit == 0
|
15
|
+
uri = parse_uri uri_name
|
16
|
+
connection, path = connection_and_path_using uri
|
17
|
+
response = connection.request_get path
|
18
|
+
content = Nokogiri::XML response.body
|
19
|
+
if response.kind_of? Net::HTTPRedirection
|
20
|
+
content_from response['location'], limit - 1
|
21
|
+
elsif meta_refresh = content.search('//meta[contains(translate(@http-equiv, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "refresh")]').first
|
22
|
+
content_from meta_refresh.get_attribute('content')[/http:\/\/.*/], limit - 1;
|
13
23
|
else
|
14
|
-
|
15
|
-
req_headers["User-Agent"] = USER_AGENT
|
16
|
-
open(@uri.to_s, req_headers).read
|
24
|
+
content
|
17
25
|
end
|
18
26
|
end
|
19
|
-
protected :
|
27
|
+
protected :content_from
|
20
28
|
|
21
29
|
def feed_uris &block
|
22
|
-
items =
|
30
|
+
items = []
|
31
|
+
load_content
|
32
|
+
items = uris_from_links(&block) + uris_from_anchors(&block)
|
33
|
+
items << Feed.new(@uri).tap { |uri|
|
34
|
+
block.call uri if block_given?
|
35
|
+
} if feed?
|
36
|
+
rescue => error
|
37
|
+
items << feed_exception_from(error)
|
38
|
+
ensure
|
39
|
+
return items
|
40
|
+
end
|
41
|
+
|
42
|
+
def feeds &block
|
43
|
+
items = []
|
44
|
+
load_content
|
45
|
+
items = feeds_from_links(&block) + feeds_from_anchors(&block)
|
23
46
|
items << feed_unsing_address(@uri, &block) if feed?
|
24
|
-
|
47
|
+
rescue => error
|
48
|
+
items << feed_exception_from(error)
|
49
|
+
ensure
|
50
|
+
return items
|
51
|
+
end
|
52
|
+
|
53
|
+
def feed_exception_from error, &block
|
54
|
+
Feed.new(@uri).tap { |item|
|
55
|
+
item.exceptions = [error.to_s]
|
56
|
+
block.call item if block_given?
|
57
|
+
}
|
25
58
|
end
|
26
59
|
|
27
|
-
def
|
60
|
+
def feeds_from_links &block
|
61
|
+
from_links.map { |node| feed_from node, &block }
|
62
|
+
end
|
63
|
+
private :feeds_from_links
|
64
|
+
|
65
|
+
def uris_from_links &block
|
66
|
+
from_links.map { |node|
|
67
|
+
Feed.new(node.get_attribute('href')).tap do |item|
|
68
|
+
block.call item if block_given?
|
69
|
+
end
|
70
|
+
}
|
71
|
+
end
|
72
|
+
private :uris_from_links
|
73
|
+
|
74
|
+
def from_links
|
28
75
|
@document.search(
|
29
76
|
"link[@type='application/atom+xml']",
|
30
77
|
"link[@type='application/rss+xml']"
|
31
|
-
)
|
78
|
+
)
|
32
79
|
end
|
33
|
-
private :
|
80
|
+
private :from_links
|
34
81
|
|
35
|
-
def
|
82
|
+
def feeds_from_anchors &block
|
83
|
+
from_anchors.map { |node|
|
84
|
+
feed_from node, &block
|
85
|
+
}
|
86
|
+
end
|
87
|
+
private :feeds_from_anchors
|
88
|
+
|
89
|
+
def uris_from_anchors &block
|
90
|
+
from_anchors.map { |node|
|
91
|
+
Feed.new(node.get_attribute('href')).tap do |item|
|
92
|
+
block.call item if block_given?
|
93
|
+
end
|
94
|
+
}
|
95
|
+
end
|
96
|
+
private :uris_from_anchors
|
97
|
+
|
98
|
+
def from_anchors
|
36
99
|
@document.search('a').select { |node|
|
37
100
|
valid_url_in? node
|
38
101
|
}.select { |node|
|
39
102
|
rss_or_atom_content_type_in? node
|
40
|
-
}
|
41
|
-
feed_from node, &block
|
42
|
-
}
|
103
|
+
}
|
43
104
|
end
|
44
|
-
private :
|
105
|
+
private :from_anchors
|
45
106
|
|
46
107
|
def feed_from node, &block
|
47
|
-
uri = @uri.merge node.get_attribute
|
108
|
+
uri = @uri.merge node.get_attribute("href").to_s
|
48
109
|
feed_unsing_address uri, &block
|
49
110
|
end
|
50
111
|
private :feed_from
|
51
112
|
|
52
113
|
def feed_unsing_address uri, &block
|
53
114
|
begin
|
54
|
-
Feed.new(uri)
|
115
|
+
Feed.new(uri) { |feed|
|
116
|
+
feed.set_title
|
117
|
+
feed.set_favicon
|
118
|
+
}.tap { |feed|
|
55
119
|
block.call feed if block_given?
|
56
|
-
|
57
|
-
rescue => error
|
58
|
-
error.tap do |e|
|
59
|
-
block.call e if block_given?
|
60
|
-
end
|
120
|
+
}
|
61
121
|
end
|
62
122
|
end
|
63
123
|
private :feed_unsing_address
|
64
124
|
|
65
125
|
def valid_url_in? anchor
|
66
126
|
uri = address_from anchor
|
67
|
-
uri.scheme != "javascript" and uri.to_s != "#" and (uri.relative? or uri.host == @uri.host)
|
127
|
+
uri.scheme != "javascript" and uri.to_s != "#" and (uri.relative? or uri.host == @uri.host) and uri.to_s.present?
|
68
128
|
end
|
69
129
|
private :valid_url_in?
|
70
130
|
|
71
131
|
def rss_or_atom_content_type_in? anchor
|
72
|
-
|
73
|
-
|
74
|
-
|
132
|
+
connection, path = connection_and_path_using address_from anchor
|
133
|
+
response = connection.request_head path
|
134
|
+
response['content-type'] =~ /rss|atom/
|
75
135
|
rescue
|
76
136
|
true
|
77
137
|
end
|
78
138
|
private :rss_or_atom_content_type_in?
|
79
139
|
|
80
|
-
def
|
81
|
-
|
82
|
-
return
|
140
|
+
def connection_and_path_using uri
|
141
|
+
connection = Net::HTTP.new uri.host, uri.port
|
142
|
+
return connection, path_from(uri) || uri.to_s
|
83
143
|
end
|
84
|
-
private :
|
144
|
+
private :connection_and_path_using
|
85
145
|
|
86
146
|
def path_from uri
|
87
147
|
path = uri - uri.select(:scheme, :host).join("://")
|
@@ -90,7 +150,7 @@ module Spix
|
|
90
150
|
private :path_from
|
91
151
|
|
92
152
|
def address_from node
|
93
|
-
@uri.merge
|
153
|
+
@uri.merge parse_uri node.get_attribute("href").to_s
|
94
154
|
end
|
95
155
|
private :address_from
|
96
156
|
|
@@ -101,6 +161,10 @@ module Spix
|
|
101
161
|
def feed?
|
102
162
|
%w[rss feed].include? @document.root.name
|
103
163
|
end
|
164
|
+
|
165
|
+
def parse_uri path
|
166
|
+
URI.parse URI.encode path.to_s
|
167
|
+
end
|
104
168
|
|
105
169
|
end
|
106
170
|
end
|
@@ -4,11 +4,14 @@ module Spix
|
|
4
4
|
|
5
5
|
def initialize url
|
6
6
|
self.url = url.to_s
|
7
|
-
self.
|
8
|
-
self.
|
7
|
+
self.similars = []
|
8
|
+
self.exceptions = []
|
9
|
+
yield self if block_given?
|
10
|
+
rescue => error
|
11
|
+
self.errors = [error]
|
9
12
|
end
|
10
13
|
|
11
|
-
%w[url favicon title].each do |attr|
|
14
|
+
%w[url favicon title exceptions similars].each do |attr|
|
12
15
|
define_method attr do
|
13
16
|
self[attr.to_sym]
|
14
17
|
end
|
@@ -18,21 +21,20 @@ module Spix
|
|
18
21
|
end
|
19
22
|
end
|
20
23
|
|
21
|
-
def
|
24
|
+
def set_title
|
22
25
|
node = content.search('title').first
|
23
|
-
node.content if node
|
26
|
+
self.title = node.content if node
|
24
27
|
end
|
25
|
-
private :get_title
|
26
28
|
|
27
|
-
def
|
29
|
+
def set_favicon
|
28
30
|
if node = content.search('link').first
|
29
|
-
path =
|
30
|
-
shortcut_from
|
31
|
+
path = parse_uri node.content.strip
|
32
|
+
self.favicon = shortcut_from parse_uri path.select(:scheme, :host).join("://") rescue nil
|
31
33
|
end
|
32
34
|
end
|
33
35
|
|
34
36
|
def shortcut_from base_uri
|
35
|
-
doc =
|
37
|
+
doc = fetch_html base_uri
|
36
38
|
shortcuts = find_shortcut_in doc
|
37
39
|
shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
|
38
40
|
|
@@ -49,21 +51,31 @@ module Spix
|
|
49
51
|
).map { |node| node.get_attribute "href" }
|
50
52
|
end
|
51
53
|
|
52
|
-
def
|
54
|
+
def fetch uri, limit = 10
|
55
|
+
raise ArgumentError, 'HTTP redirect too deep' if limit == 0
|
53
56
|
resp = Net::HTTP.get_response uri
|
54
|
-
doc = Nokogiri::HTML(resp.body)
|
55
57
|
if resp.kind_of?(Net::HTTPRedirection) or (refresh_metatags = Nokogiri::HTML(resp.body).search('meta[@http-equiv=REFRESH]')).any?
|
56
58
|
path = resp['location'] || refresh_metatags.first.get_attribute('content')[/http:\/\/.*/]
|
57
|
-
|
59
|
+
from_redirect = parse_uri path
|
60
|
+
self.url = from_redirect.to_s
|
61
|
+
fetch from_redirect, limit - 1
|
58
62
|
else
|
59
|
-
|
63
|
+
resp.body
|
60
64
|
end
|
61
65
|
rescue
|
62
|
-
|
66
|
+
String.new
|
67
|
+
end
|
68
|
+
|
69
|
+
def fetch_xml uri
|
70
|
+
Nokogiri::XML fetch uri
|
71
|
+
end
|
72
|
+
|
73
|
+
def fetch_html uri
|
74
|
+
Nokogiri::HTML fetch uri
|
63
75
|
end
|
64
76
|
|
65
77
|
def base_uri
|
66
|
-
@base_uri ||=
|
78
|
+
@base_uri ||= parse_uri uri.select(:scheme, :host).join("://")
|
67
79
|
end
|
68
80
|
private :base_uri
|
69
81
|
|
@@ -73,20 +85,19 @@ module Spix
|
|
73
85
|
private :content
|
74
86
|
|
75
87
|
def load_content
|
76
|
-
|
77
|
-
path = uri - uri.select(:scheme, :host).join("://")
|
78
|
-
resp = req.request_get path.to_s
|
79
|
-
Nokogiri::XML(resp.body)
|
88
|
+
fetch_xml uri
|
80
89
|
end
|
81
90
|
private :load_content
|
82
91
|
|
83
92
|
def uri
|
84
|
-
@uri ||=
|
93
|
+
@uri ||= parse_uri url
|
85
94
|
end
|
86
95
|
private :uri
|
87
96
|
|
88
|
-
def
|
97
|
+
def parse_uri path
|
98
|
+
URI.parse URI.encode path
|
89
99
|
end
|
100
|
+
|
90
101
|
end
|
91
102
|
end
|
92
103
|
end
|
data/lib/spix_parser/version.rb
CHANGED
@@ -4,17 +4,17 @@ describe Spix::FeedDiscovery::Document do
|
|
4
4
|
|
5
5
|
let(:document) { Spix::FeedDiscovery::Document.new @rss_uri }
|
6
6
|
|
7
|
-
describe '#
|
7
|
+
describe '#feed' do
|
8
8
|
|
9
9
|
context 'when the uri exists' do
|
10
10
|
|
11
11
|
it 'should return only feed uris' do
|
12
|
-
document.should_receive(:
|
13
|
-
document.
|
12
|
+
document.should_receive(:feeds).and_return expected_feeds
|
13
|
+
document.feeds
|
14
14
|
end
|
15
15
|
|
16
16
|
it 'should yield feed with correct uri content' do
|
17
|
-
document.
|
17
|
+
document.feeds do |feed|
|
18
18
|
expected_feeds.should include(feed)
|
19
19
|
end
|
20
20
|
end
|
@@ -29,14 +29,16 @@ describe Spix::FeedDiscovery::Document do
|
|
29
29
|
|
30
30
|
describe '#html?' do
|
31
31
|
it 'should return true if is a html document' do
|
32
|
-
content = load_fixture('rss_list.html')
|
33
|
-
Spix::FeedDiscovery::Document.any_instance.stub(:
|
32
|
+
content = Nokogiri::XML load_fixture('rss_list.html')
|
33
|
+
Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
|
34
|
+
document.load_content
|
34
35
|
document.html?.should eql true
|
35
36
|
end
|
36
37
|
|
37
38
|
it 'should return false if is a rss/feed document' do
|
38
|
-
content = load_fixture('feed.rss')
|
39
|
-
Spix::FeedDiscovery::Document.any_instance.stub(:
|
39
|
+
content = Nokogiri::XML load_fixture('feed.rss')
|
40
|
+
Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
|
41
|
+
document.load_content
|
40
42
|
document.html?.should eql false
|
41
43
|
end
|
42
44
|
|
@@ -48,14 +50,16 @@ describe Spix::FeedDiscovery::Document do
|
|
48
50
|
|
49
51
|
describe '#feed?' do
|
50
52
|
it 'should return true if a feed document' do
|
51
|
-
content = load_fixture('feed.rss')
|
52
|
-
Spix::FeedDiscovery::Document.any_instance.stub(:
|
53
|
+
content = Nokogiri::XML load_fixture('feed.rss')
|
54
|
+
Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
|
55
|
+
document.load_content
|
53
56
|
document.feed?.should eql true
|
54
57
|
end
|
55
58
|
|
56
59
|
it 'should return false if hot a html document' do
|
57
|
-
content = load_fixture('rss_list.html')
|
58
|
-
|
60
|
+
content = Nokogiri::XML load_fixture('rss_list.html')
|
61
|
+
document.load_content
|
62
|
+
Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
|
59
63
|
document.feed?.should eql false
|
60
64
|
end
|
61
65
|
|
@@ -84,6 +88,6 @@ end
|
|
84
88
|
|
85
89
|
def expected_feeds
|
86
90
|
@feeds ||= %w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].map { |path|
|
87
|
-
Spix::FeedDiscovery::Feed.new(@domain + '/' + path
|
91
|
+
Spix::FeedDiscovery::Feed.new(@domain + '/' + path)
|
88
92
|
}
|
89
93
|
end
|
@@ -4,23 +4,20 @@ describe Spix::FeedDiscovery::Feed do
|
|
4
4
|
|
5
5
|
context 'given an expecific uri' do
|
6
6
|
|
7
|
-
let(:feed) { Spix::FeedDiscovery::Feed.new @feed_uri, @favicon_uri }
|
8
|
-
|
9
|
-
it 'should set the favicon' do
|
10
|
-
feed.favicon.should == @favicon_uri
|
11
|
-
end
|
12
|
-
|
13
7
|
it 'should set the url' do
|
8
|
+
feed = described_class.new @feed_uri
|
14
9
|
feed.url.should == @feed_uri
|
15
10
|
end
|
16
11
|
|
17
12
|
it 'should set title' do
|
13
|
+
feed = described_class.new @feed_uri do |f|
|
14
|
+
f.set_title
|
15
|
+
end
|
18
16
|
feed.title.should == @document.search('title').first.content
|
19
17
|
end
|
20
18
|
|
21
19
|
before :all do
|
22
20
|
@feed_uri = "http://myfeed.com/feed.rss"
|
23
|
-
@favicon_uri = "http://myfeed.com/images/favicon.ico"
|
24
21
|
@document = Nokogiri::XML load_fixture 'feed.rss'
|
25
22
|
FakeWeb.register_uri(:get, @feed_uri, :body => @document.to_s)
|
26
23
|
end
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: spix_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.6.
|
5
|
+
version: 1.6.7
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Marcio Lopes de Faria
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-06-
|
17
|
+
date: 2011-06-06 00:00:00 -03:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|