spix_parser 1.6.4 → 1.6.5
Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'ruby-debug'
|
1
2
|
module Spix
|
2
3
|
module FeedDiscovery
|
3
4
|
class Document
|
@@ -51,7 +52,7 @@ module Spix
|
|
51
52
|
|
52
53
|
def feed_unsing_address uri, &block
|
53
54
|
begin
|
54
|
-
Feed.new(uri
|
55
|
+
Feed.new(uri).tap do |feed|
|
55
56
|
block.call feed if block_given?
|
56
57
|
end
|
57
58
|
rescue => error
|
@@ -101,71 +102,6 @@ module Spix
|
|
101
102
|
def feed?
|
102
103
|
%w[rss feed].include? @document.root.name
|
103
104
|
end
|
104
|
-
|
105
|
-
def favicon
|
106
|
-
shortcut_in_document or shortcut_from_original_page or shortcut_from(base_uri) or default_favico_if_exist
|
107
|
-
end
|
108
|
-
|
109
|
-
def shortcut_in_document
|
110
|
-
shortcuts = find_shortcut_in @document
|
111
|
-
shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
|
112
|
-
end
|
113
|
-
private :shortcut_in_document
|
114
|
-
|
115
|
-
def shortcut_from_original_page
|
116
|
-
if feed?
|
117
|
-
if node = @document.search('link').first
|
118
|
-
path = URI.parse node.content.strip
|
119
|
-
shortcut_from URI.parse path.select(:scheme, :host).join("://") rescue nil
|
120
|
-
end
|
121
|
-
end
|
122
|
-
end
|
123
|
-
private :shortcut_from_original_page
|
124
|
-
|
125
|
-
def shortcut_from base_uri
|
126
|
-
doc = get base_uri
|
127
|
-
shortcuts = find_shortcut_in doc
|
128
|
-
shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
|
129
|
-
|
130
|
-
rescue Net::HTTPError, Net::HTTPFatalError
|
131
|
-
logger.warn "error opening favicon: #{$!}"
|
132
|
-
nil
|
133
|
-
end
|
134
|
-
private :shortcut_from
|
135
|
-
|
136
|
-
def find_shortcut_in doc
|
137
|
-
doc.xpath(
|
138
|
-
'//link[contains(translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "shortcut")]',
|
139
|
-
'//link[contains(translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "ico")]'
|
140
|
-
).map { |node| node.get_attribute "href" }
|
141
|
-
end
|
142
|
-
|
143
|
-
def default_favico_if_exist
|
144
|
-
http = Net::HTTP.new base_uri.host, base_uri.port
|
145
|
-
resp = http.request_head 'favicon.ico'
|
146
|
-
base_uri.merge('favicon.ico').to_s unless resp.kind_of? Net::HTTPError
|
147
|
-
rescue
|
148
|
-
nil
|
149
|
-
end
|
150
|
-
private :default_favico_if_exist
|
151
|
-
|
152
|
-
def get uri
|
153
|
-
resp = Net::HTTP.get_response uri
|
154
|
-
doc = Nokogiri::HTML(resp.body)
|
155
|
-
if resp.kind_of?(Net::HTTPRedirection) or (refresh_metatags = Nokogiri::HTML(resp.body).search('meta[@http-equiv=REFRESH]')).any?
|
156
|
-
path = resp['location'] || refresh_metatags.first.get_attribute('content')[/http:\/\/.*/]
|
157
|
-
get URI.parse path
|
158
|
-
else
|
159
|
-
doc
|
160
|
-
end
|
161
|
-
rescue
|
162
|
-
Nokogiri::HTML('')
|
163
|
-
end
|
164
|
-
|
165
|
-
def base_uri
|
166
|
-
@base_uri ||= URI.parse @uri.select(:scheme, :host).join("://")
|
167
|
-
end
|
168
|
-
private :base_uri
|
169
105
|
|
170
106
|
end
|
171
107
|
end
|
@@ -2,9 +2,9 @@ module Spix
|
|
2
2
|
module FeedDiscovery
|
3
3
|
class Feed < Hash
|
4
4
|
|
5
|
-
def initialize url
|
5
|
+
def initialize url
|
6
6
|
self.url = url.to_s
|
7
|
-
self.favicon =
|
7
|
+
self.favicon = get_favicon
|
8
8
|
self.title = get_title
|
9
9
|
end
|
10
10
|
|
@@ -24,13 +24,61 @@ module Spix
|
|
24
24
|
end
|
25
25
|
private :get_title
|
26
26
|
|
27
|
+
def get_favicon
|
28
|
+
if node = content.search('link').first
|
29
|
+
path = URI.parse node.content.strip
|
30
|
+
shortcut_from URI.parse path.select(:scheme, :host).join("://") rescue nil
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def shortcut_from base_uri
|
35
|
+
doc = get base_uri
|
36
|
+
shortcuts = find_shortcut_in doc
|
37
|
+
shortcuts.any? ? base_uri.merge(shortcuts.first.to_s).to_s : nil
|
38
|
+
|
39
|
+
rescue Net::HTTPError, Net::HTTPFatalError
|
40
|
+
logger.warn "error opening favicon: #{$!}"
|
41
|
+
nil
|
42
|
+
end
|
43
|
+
private :shortcut_from
|
44
|
+
|
45
|
+
def find_shortcut_in doc
|
46
|
+
doc.xpath(
|
47
|
+
'//link[contains(translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "shortcut")]',
|
48
|
+
'//link[contains(translate(@rel, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "ico")]'
|
49
|
+
).map { |node| node.get_attribute "href" }
|
50
|
+
end
|
51
|
+
|
52
|
+
def get uri
|
53
|
+
resp = Net::HTTP.get_response uri
|
54
|
+
doc = Nokogiri::HTML(resp.body)
|
55
|
+
if resp.kind_of?(Net::HTTPRedirection) or (refresh_metatags = Nokogiri::HTML(resp.body).search('meta[@http-equiv=REFRESH]')).any?
|
56
|
+
path = resp['location'] || refresh_metatags.first.get_attribute('content')[/http:\/\/.*/]
|
57
|
+
get URI.parse path
|
58
|
+
else
|
59
|
+
doc
|
60
|
+
end
|
61
|
+
rescue
|
62
|
+
Nokogiri::HTML('')
|
63
|
+
end
|
64
|
+
|
65
|
+
def base_uri
|
66
|
+
@base_uri ||= URI.parse uri.select(:scheme, :host).join("://")
|
67
|
+
end
|
68
|
+
private :base_uri
|
69
|
+
|
27
70
|
def content
|
71
|
+
@content ||= load_content
|
72
|
+
end
|
73
|
+
private :content
|
74
|
+
|
75
|
+
def load_content
|
28
76
|
req = Net::HTTP.new uri.host, uri.port
|
29
77
|
path = uri - uri.select(:scheme, :host).join("://")
|
30
78
|
resp = req.request_get path.to_s
|
31
79
|
Nokogiri::XML(resp.body)
|
32
80
|
end
|
33
|
-
private :content
|
81
|
+
private :load_content
|
34
82
|
|
35
83
|
def uri
|
36
84
|
@uri ||= URI.parse url
|
data/lib/spix_parser/version.rb
CHANGED