spix_parser 1.6.6 → 1.6.7
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/spix_parser/tools/feed_discovery.rb +5 -2
- data/lib/spix_parser/tools/feed_discovery/base.rb +4 -0
- data/lib/spix_parser/tools/feed_discovery/document.rb +100 -36
- data/lib/spix_parser/tools/feed_discovery/feed.rb +33 -22
- data/lib/spix_parser/version.rb +1 -1
- data/spec/spix_parser/tools/feed_discovery/document_spec.rb +17 -13
- data/spec/spix_parser/tools/feed_discovery/feed_spec.rb +4 -7
- metadata +2 -2
@@ -2,86 +2,146 @@ module Spix
|
|
2
2
|
module FeedDiscovery
|
3
3
|
class Document
|
4
4
|
|
5
|
-
def initialize
|
6
|
-
@uri =
|
7
|
-
@document = Nokogiri::XML(content)
|
5
|
+
def initialize uri_name
|
6
|
+
@uri = parse_uri uri_name
|
8
7
|
end
|
9
8
|
|
10
|
-
def
|
11
|
-
|
12
|
-
|
9
|
+
def load_content
|
10
|
+
@document ||= content_from @uri.to_s
|
11
|
+
end
|
12
|
+
|
13
|
+
def content_from uri_name, limit=10
|
14
|
+
raise ArgumentError, 'HTTP redirect too deep' if limit == 0
|
15
|
+
uri = parse_uri uri_name
|
16
|
+
connection, path = connection_and_path_using uri
|
17
|
+
response = connection.request_get path
|
18
|
+
content = Nokogiri::XML response.body
|
19
|
+
if response.kind_of? Net::HTTPRedirection
|
20
|
+
content_from response['location'], limit - 1
|
21
|
+
elsif meta_refresh = content.search('//meta[contains(translate(@http-equiv, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "refresh")]').first
|
22
|
+
content_from meta_refresh.get_attribute('content')[/http:\/\/.*/], limit - 1;
|
13
23
|
else
|
14
|
-
|
15
|
-
req_headers["User-Agent"] = USER_AGENT
|
16
|
-
open(@uri.to_s, req_headers).read
|
24
|
+
content
|
17
25
|
end
|
18
26
|
end
|
19
|
-
protected :
|
27
|
+
protected :content_from
|
20
28
|
|
21
29
|
def feed_uris &block
|
22
|
-
items =
|
30
|
+
items = []
|
31
|
+
load_content
|
32
|
+
items = uris_from_links(&block) + uris_from_anchors(&block)
|
33
|
+
items << Feed.new(@uri).tap { |uri|
|
34
|
+
block.call uri if block_given?
|
35
|
+
} if feed?
|
36
|
+
rescue => error
|
37
|
+
items << feed_exception_from(error)
|
38
|
+
ensure
|
39
|
+
return items
|
40
|
+
end
|
41
|
+
|
42
|
+
def feeds &block
|
43
|
+
items = []
|
44
|
+
load_content
|
45
|
+
items = feeds_from_links(&block) + feeds_from_anchors(&block)
|
23
46
|
items << feed_unsing_address(@uri, &block) if feed?
|
24
|
-
|
47
|
+
rescue => error
|
48
|
+
items << feed_exception_from(error)
|
49
|
+
ensure
|
50
|
+
return items
|
51
|
+
end
|
52
|
+
|
53
|
+
def feed_exception_from error, &block
|
54
|
+
Feed.new(@uri).tap { |item|
|
55
|
+
item.exceptions = [error.to_s]
|
56
|
+
block.call item if block_given?
|
57
|
+
}
|
25
58
|
end
|
26
59
|
|
27
|
-
def
|
60
|
+
def feeds_from_links &block
|
61
|
+
from_links.map { |node| feed_from node, &block }
|
62
|
+
end
|
63
|
+
private :feeds_from_links
|
64
|
+
|
65
|
+
def uris_from_links &block
|
66
|
+
from_links.map { |node|
|
67
|
+
Feed.new(node.get_attribute('href')).tap do |item|
|
68
|
+
block.call item if block_given?
|
69
|
+
end
|
70
|
+
}
|
71
|
+
end
|
72
|
+
private :uris_from_links
|
73
|
+
|
74
|
+
def from_links
|
28
75
|
@document.search(
|
29
76
|
"link[@type='application/atom+xml']",
|
30
77
|
"link[@type='application/rss+xml']"
|
31
|
-
)
|
78
|
+
)
|
32
79
|
end
|
33
|
-
private :
|
80
|
+
private :from_links
|
34
81
|
|
35
|
-
def
|
82
|
+
def feeds_from_anchors &block
|
83
|
+
from_anchors.map { |node|
|
84
|
+
feed_from node, &block
|
85
|
+
}
|
86
|
+
end
|
87
|
+
private :feeds_from_anchors
|
88
|
+
|
89
|
+
def uris_from_anchors &block
|
90
|
+
from_anchors.map { |node|
|
91
|
+
Feed.new(node.get_attribute('href')).tap do |item|
|
92
|
+
block.call item if block_given?
|
93
|
+
end
|
94
|
+
}
|
95
|
+
end
|
96
|
+
private :uris_from_anchors
|
97
|
+
|
98
|
+
def from_anchors
|
36
99
|
@document.search('a').select { |node|
|
37
100
|
valid_url_in? node
|
38
101
|
}.select { |node|
|
39
102
|
rss_or_atom_content_type_in? node
|
40
|
-
}
|
41
|
-
feed_from node, &block
|
42
|
-
}
|
103
|
+
}
|
43
104
|
end
|
44
|
-
private :
|
105
|
+
private :from_anchors
|
45
106
|
|
46
107
|
def feed_from node, &block
|
47
|
-
uri = @uri.merge node.get_attribute
|
108
|
+
uri = @uri.merge node.get_attribute("href").to_s
|
48
109
|
feed_unsing_address uri, &block
|
49
110
|
end
|
50
111
|
private :feed_from
|
51
112
|
|
52
113
|
def feed_unsing_address uri, &block
|
53
114
|
begin
|
54
|
-
Feed.new(uri)
|
115
|
+
Feed.new(uri) { |feed|
|
116
|
+
feed.set_title
|
117
|
+
feed.set_favicon
|
118
|
+
}.tap { |feed|
|
55
119
|
block.call feed if block_given?
|
56
|
-
|
57
|
-
rescue => error
|
58
|
-
error.tap do |e|
|
59
|
-
block.call e if block_given?
|
60
|
-
end
|
120
|
+
}
|
61
121
|
end
|
62
122
|
end
|
63
123
|
private :feed_unsing_address
|
64
124
|
|
65
125
|
def valid_url_in? anchor
|
66
126
|
uri = address_from anchor
|
67
|
-
uri.scheme != "javascript" and uri.to_s != "#" and (uri.relative? or uri.host == @uri.host)
|
127
|
+
uri.scheme != "javascript" and uri.to_s != "#" and (uri.relative? or uri.host == @uri.host) and uri.to_s.present?
|
68
128
|
end
|
69
129
|
private :valid_url_in?
|
70
130
|
|
71
131
|
def rss_or_atom_content_type_in? anchor
|
72
|
-
|
73
|
-
|
74
|
-
|
132
|
+
connection, path = connection_and_path_using address_from anchor
|
133
|
+
response = connection.request_head path
|
134
|
+
response['content-type'] =~ /rss|atom/
|
75
135
|
rescue
|
76
136
|
true
|
77
137
|
end
|
78
138
|
private :rss_or_atom_content_type_in?
|
79
139
|
|
80
|
-
def
|
81
|
-
|
82
|
-
return
|
140
|
+
def connection_and_path_using uri
|
141
|
+
connection = Net::HTTP.new uri.host, uri.port
|
142
|
+
return connection, path_from(uri) || uri.to_s
|
83
143
|
end
|
84
|
-
private :
|
144
|
+
private :connection_and_path_using
|
85
145
|
|
86
146
|
def path_from uri
|
87
147
|
path = uri - uri.select(:scheme, :host).join("://")
|
@@ -90,7 +150,7 @@ module Spix
|
|
90
150
|
private :path_from
|
91
151
|
|
92
152
|
def address_from node
|
93
|
-
@uri.merge
|
153
|
+
@uri.merge parse_uri node.get_attribute("href").to_s
|
94
154
|
end
|
95
155
|
private :address_from
|
96
156
|
|
@@ -101,6 +161,10 @@ module Spix
|
|
101
161
|
def feed?
|
102
162
|
%w[rss feed].include? @document.root.name
|
103
163
|
end
|
164
|
+
|
165
|
+
def parse_uri path
|
166
|
+
URI.parse URI.encode path.to_s
|
167
|
+
end
|
104
168
|
|
105
169
|
end
|
106
170
|
end
|
@@ -4,11 +4,14 @@ module Spix
|
|
4
4
|
|
5
5
|
def initialize url
|
6
6
|
self.url = url.to_s
|
7
|
-
self.
|
8
|
-
self.
|
7
|
+
self.similars = []
|
8
|
+
self.exceptions = []
|
9
|
+
yield self if block_given?
|
10
|
+
rescue => error
|
11
|
+
self.errors = [error]
|
9
12
|
end
|
10
13
|
|
11
|
-
%w[url favicon title].each do |attr|
|
14
|
+
%w[url favicon title exceptions similars].each do |attr|
|
12
15
|
define_method attr do
|
13
16
|
self[attr.to_sym]
|
14
17
|
end
|
@@ -18,21 +21,20 @@ module Spix
|
|
18
21
|
end
|
19
22
|
end
|
20
23
|
|
21
|
-
def
|
24
|
+
def set_title
|
22
25
|
node = content.search('title').first
|
23
|
-
node.content if node
|
26
|
+
self.title = node.content if node
|
24
27
|
end
|
25
|
-
private :get_title
|
26
28
|
|
27
|
-
def
|
29
|
+
def set_favicon
|
28
30
|
if node = content.search('link').first
|
29
|
-
path =
|
30
|
-
shortcut_from
|
31
|
+
path = parse_uri node.content.strip
|
32
|
+
self.favicon = shortcut_from parse_uri path.select(:scheme, :host).join("://") rescue nil
|
31
33
|
end
|
32
34
|
end
|
33
35
|
|
34
36
|
def shortcut_from base_uri
|
35
|
-
doc =
|
37
|
+
doc = fetch_html base_uri
|
36
38
|
shortcuts = find_shortcut_in doc
|
37
39
|
shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
|
38
40
|
|
@@ -49,21 +51,31 @@ module Spix
|
|
49
51
|
).map { |node| node.get_attribute "href" }
|
50
52
|
end
|
51
53
|
|
52
|
-
def
|
54
|
+
def fetch uri, limit = 10
|
55
|
+
raise ArgumentError, 'HTTP redirect too deep' if limit == 0
|
53
56
|
resp = Net::HTTP.get_response uri
|
54
|
-
doc = Nokogiri::HTML(resp.body)
|
55
57
|
if resp.kind_of?(Net::HTTPRedirection) or (refresh_metatags = Nokogiri::HTML(resp.body).search('meta[@http-equiv=REFRESH]')).any?
|
56
58
|
path = resp['location'] || refresh_metatags.first.get_attribute('content')[/http:\/\/.*/]
|
57
|
-
|
59
|
+
from_redirect = parse_uri path
|
60
|
+
self.url = from_redirect.to_s
|
61
|
+
fetch from_redirect, limit - 1
|
58
62
|
else
|
59
|
-
|
63
|
+
resp.body
|
60
64
|
end
|
61
65
|
rescue
|
62
|
-
|
66
|
+
String.new
|
67
|
+
end
|
68
|
+
|
69
|
+
def fetch_xml uri
|
70
|
+
Nokogiri::XML fetch uri
|
71
|
+
end
|
72
|
+
|
73
|
+
def fetch_html uri
|
74
|
+
Nokogiri::HTML fetch uri
|
63
75
|
end
|
64
76
|
|
65
77
|
def base_uri
|
66
|
-
@base_uri ||=
|
78
|
+
@base_uri ||= parse_uri uri.select(:scheme, :host).join("://")
|
67
79
|
end
|
68
80
|
private :base_uri
|
69
81
|
|
@@ -73,20 +85,19 @@ module Spix
|
|
73
85
|
private :content
|
74
86
|
|
75
87
|
def load_content
|
76
|
-
|
77
|
-
path = uri - uri.select(:scheme, :host).join("://")
|
78
|
-
resp = req.request_get path.to_s
|
79
|
-
Nokogiri::XML(resp.body)
|
88
|
+
fetch_xml uri
|
80
89
|
end
|
81
90
|
private :load_content
|
82
91
|
|
83
92
|
def uri
|
84
|
-
@uri ||=
|
93
|
+
@uri ||= parse_uri url
|
85
94
|
end
|
86
95
|
private :uri
|
87
96
|
|
88
|
-
def
|
97
|
+
def parse_uri path
|
98
|
+
URI.parse URI.encode path
|
89
99
|
end
|
100
|
+
|
90
101
|
end
|
91
102
|
end
|
92
103
|
end
|
data/lib/spix_parser/version.rb
CHANGED
@@ -4,17 +4,17 @@ describe Spix::FeedDiscovery::Document do
|
|
4
4
|
|
5
5
|
let(:document) { Spix::FeedDiscovery::Document.new @rss_uri }
|
6
6
|
|
7
|
-
describe '#
|
7
|
+
describe '#feed' do
|
8
8
|
|
9
9
|
context 'when the uri exists' do
|
10
10
|
|
11
11
|
it 'should return only feed uris' do
|
12
|
-
document.should_receive(:
|
13
|
-
document.
|
12
|
+
document.should_receive(:feeds).and_return expected_feeds
|
13
|
+
document.feeds
|
14
14
|
end
|
15
15
|
|
16
16
|
it 'should yield feed with correct uri content' do
|
17
|
-
document.
|
17
|
+
document.feeds do |feed|
|
18
18
|
expected_feeds.should include(feed)
|
19
19
|
end
|
20
20
|
end
|
@@ -29,14 +29,16 @@ describe Spix::FeedDiscovery::Document do
|
|
29
29
|
|
30
30
|
describe '#html?' do
|
31
31
|
it 'should return true if is a html document' do
|
32
|
-
content = load_fixture('rss_list.html')
|
33
|
-
Spix::FeedDiscovery::Document.any_instance.stub(:
|
32
|
+
content = Nokogiri::XML load_fixture('rss_list.html')
|
33
|
+
Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
|
34
|
+
document.load_content
|
34
35
|
document.html?.should eql true
|
35
36
|
end
|
36
37
|
|
37
38
|
it 'should return false if is a rss/feed document' do
|
38
|
-
content = load_fixture('feed.rss')
|
39
|
-
Spix::FeedDiscovery::Document.any_instance.stub(:
|
39
|
+
content = Nokogiri::XML load_fixture('feed.rss')
|
40
|
+
Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
|
41
|
+
document.load_content
|
40
42
|
document.html?.should eql false
|
41
43
|
end
|
42
44
|
|
@@ -48,14 +50,16 @@ describe Spix::FeedDiscovery::Document do
|
|
48
50
|
|
49
51
|
describe '#feed?' do
|
50
52
|
it 'should return true if a feed document' do
|
51
|
-
content = load_fixture('feed.rss')
|
52
|
-
Spix::FeedDiscovery::Document.any_instance.stub(:
|
53
|
+
content = Nokogiri::XML load_fixture('feed.rss')
|
54
|
+
Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
|
55
|
+
document.load_content
|
53
56
|
document.feed?.should eql true
|
54
57
|
end
|
55
58
|
|
56
59
|
it 'should return false if hot a html document' do
|
57
|
-
content = load_fixture('rss_list.html')
|
58
|
-
|
60
|
+
content = Nokogiri::XML load_fixture('rss_list.html')
|
61
|
+
document.load_content
|
62
|
+
Spix::FeedDiscovery::Document.any_instance.stub(:content_from).and_return(content)
|
59
63
|
document.feed?.should eql false
|
60
64
|
end
|
61
65
|
|
@@ -84,6 +88,6 @@ end
|
|
84
88
|
|
85
89
|
def expected_feeds
|
86
90
|
@feeds ||= %w[feed.atom carnaval-rss.xml cultura-rss.xml feed.rss].map { |path|
|
87
|
-
Spix::FeedDiscovery::Feed.new(@domain + '/' + path
|
91
|
+
Spix::FeedDiscovery::Feed.new(@domain + '/' + path)
|
88
92
|
}
|
89
93
|
end
|
@@ -4,23 +4,20 @@ describe Spix::FeedDiscovery::Feed do
|
|
4
4
|
|
5
5
|
context 'given an expecific uri' do
|
6
6
|
|
7
|
-
let(:feed) { Spix::FeedDiscovery::Feed.new @feed_uri, @favicon_uri }
|
8
|
-
|
9
|
-
it 'should set the favicon' do
|
10
|
-
feed.favicon.should == @favicon_uri
|
11
|
-
end
|
12
|
-
|
13
7
|
it 'should set the url' do
|
8
|
+
feed = described_class.new @feed_uri
|
14
9
|
feed.url.should == @feed_uri
|
15
10
|
end
|
16
11
|
|
17
12
|
it 'should set title' do
|
13
|
+
feed = described_class.new @feed_uri do |f|
|
14
|
+
f.set_title
|
15
|
+
end
|
18
16
|
feed.title.should == @document.search('title').first.content
|
19
17
|
end
|
20
18
|
|
21
19
|
before :all do
|
22
20
|
@feed_uri = "http://myfeed.com/feed.rss"
|
23
|
-
@favicon_uri = "http://myfeed.com/images/favicon.ico"
|
24
21
|
@document = Nokogiri::XML load_fixture 'feed.rss'
|
25
22
|
FakeWeb.register_uri(:get, @feed_uri, :body => @document.to_s)
|
26
23
|
end
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: spix_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.6.
|
5
|
+
version: 1.6.7
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Marcio Lopes de Faria
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-06-
|
17
|
+
date: 2011-06-06 00:00:00 -03:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|