rfeedfinder 0.9.12 → 0.9.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/lib/rfeedfinder.rb +370 -164
- data/lib/rfeedfinder/version.rb +3 -3
- data/test/test_rfeedfinder.rb +3 -3
- metadata +75 -56
data/History.txt
CHANGED
data/lib/rfeedfinder.rb
CHANGED
@@ -1,140 +1,102 @@
|
|
1
1
|
require 'net/http'
|
2
2
|
require 'rubygems'
|
3
|
-
require 'htmlentities'
|
4
3
|
require 'open-uri'
|
5
4
|
require 'hpricot'
|
6
5
|
require 'timeout'
|
7
6
|
|
8
|
-
require
|
7
|
+
require File.dirname(__FILE__) + "/rfeedfinder/version"
|
9
8
|
|
10
|
-
module Rfeedfinder
|
11
9
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
end
|
36
|
-
|
37
|
-
def searchLinks(data, baseuri, regexp)
|
38
|
-
links = []
|
39
|
-
data.search(regexp).map!{|link|
|
40
|
-
if !link.to_s.strip.empty? and link.kind_of? Hpricot::Elem and !(link.kind_of? Hpricot::Text)
|
41
|
-
uri = link[:href].to_s
|
42
|
-
uri = link[:HREF].to_s if uri.empty?
|
43
|
-
uri = link[:src].to_s if uri.empty?
|
44
|
-
uri = link[:SRC].to_s if uri.empty?
|
45
|
-
if !uri.strip.empty? and uri !~ /^javascript/
|
46
|
-
uri = URI.join(baseuri, uri).to_s if uri !~ /^http:\/\//
|
47
|
-
links << uri
|
48
|
-
end
|
49
|
-
end
|
50
|
-
}
|
51
|
-
#links.each{|link| puts "searchLinks: #{link}"}
|
52
|
-
return links.uniq
|
53
|
-
end
|
54
|
-
|
55
|
-
def getLocalLinks(links, baseuri)
|
56
|
-
locallinks = []
|
57
|
-
links.each do |link|
|
58
|
-
locallinks << URI.join(baseuri, link).to_s if link =~ /^\//
|
59
|
-
end
|
60
|
-
links = links.select{|link| link !~ /^\//} #remove local links from link array
|
61
|
-
return [links, locallinks]
|
10
|
+
class Rfeedfinder
|
11
|
+
#
|
12
|
+
# Takes:
|
13
|
+
# * +init_values+ (hash)
|
14
|
+
# * +:proxy+: (string) proxy information to use. Defaults to a blank string
|
15
|
+
# * +:user_agent+: (string) user agent to identify as. Defaults to Ruby/#{RUBY_VERSION} - Rfeedfinder VERSION
|
16
|
+
# * +:from+: (string) contact info to the responsible person. FIXME: Is this correct? Defaults to rfeedfinder@googlegroups.com
|
17
|
+
# * +:keep_data+: (boolean) if the data downloaded for the feeds should be returned along with the URLs. Defaults to false
|
18
|
+
# * +:use_google+: (boolean) tries to find a URL using a google "I'm feeling lucky" search. Defaults to false
|
19
|
+
#
|
20
|
+
#
|
21
|
+
# Example:
|
22
|
+
#
|
23
|
+
# Rfeedfinder.new({:proxy => "http://127.0.0.1:1234",
|
24
|
+
# :user_agent => "MyApp",
|
25
|
+
# :from => "contact@domain.com",
|
26
|
+
# :referer => "http://domain.com"})
|
27
|
+
#
|
28
|
+
#
|
29
|
+
# Returns a new instance of Rfeedfinder
|
30
|
+
#
|
31
|
+
def initialize(init_values = {})
|
32
|
+
@options = init_values
|
62
33
|
end
|
63
34
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
35
|
+
#
|
36
|
+
# Takes:
|
37
|
+
# * +uri+ (string)
|
38
|
+
#
|
39
|
+
# Returns:
|
40
|
+
# * array of urls
|
41
|
+
#
|
42
|
+
def feeds(uri)
|
43
|
+
Rfeedfinder.feeds(uri, @options.dup)
|
70
44
|
end
|
71
45
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
def
|
80
|
-
|
81
|
-
begin
|
82
|
-
response = Net::HTTP.get_response(URI.parse(feed))
|
83
|
-
#puts "Verify #{feed} - code: #{response.code}"
|
84
|
-
if response.code == "302"
|
85
|
-
newuri = response.body.match(/<a href=\"([^>]+)\">/)[1]
|
86
|
-
|
87
|
-
feedlist.delete(feed)
|
88
|
-
feedlist << newuri
|
89
|
-
feedlist.uniq!
|
90
|
-
end
|
91
|
-
rescue
|
92
|
-
# rescue net error
|
93
|
-
end
|
94
|
-
end
|
95
|
-
return feedlist
|
96
|
-
end
|
97
|
-
|
98
|
-
def isFeedData?(data)
|
99
|
-
# if no html tag and rss, rdf or feed tag, it's a feed
|
100
|
-
return ((data/"html|HTML").empty? and (!(data/:rss).nil? or !(data/:rdf).nil? or !(data/:feed).nil?))
|
46
|
+
#
|
47
|
+
# Takes:
|
48
|
+
# * +uri+ (string)
|
49
|
+
#
|
50
|
+
# Returns:
|
51
|
+
# * url (string)
|
52
|
+
#
|
53
|
+
def feed(uri)
|
54
|
+
result = Rfeedfinder.feed(uri, @options.dup)
|
101
55
|
end
|
102
56
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
57
|
+
#
|
58
|
+
# Takes:
|
59
|
+
# * +uri+ (string): The URI to check
|
60
|
+
# * +options+ (hash)
|
61
|
+
# * +:proxy+: (string) proxy information to use. Defaults to a blank string
|
62
|
+
# * +:user_agent+: (string) user agent to identify as. Defaults to Ruby/#{RUBY_VERSION} - Rfeedfinder VERSION
|
63
|
+
# * +:from+: (string) contact info to the responsible person. FIXME: Is this correct? Defaults to rfeedfinder@googlegroups.com
|
64
|
+
# * +:keep_data+: (boolean) if the data downloaded for the feeds should be returned along with the URLs. Defaults to false
|
65
|
+
# * +:use_google+: (boolean) tries to find a URL using a google "I'm feeling lucky" search. Defaults to false
|
66
|
+
#
|
67
|
+
#
|
68
|
+
# Example:
|
69
|
+
#
|
70
|
+
# Rfeedfinder.feeds("www.google.com", {:proxy => "http://127.0.0.1:1234",
|
71
|
+
# :user_agent => "MyApp",
|
72
|
+
# :from => "contact@domain.com",
|
73
|
+
# :referer => "http://domain.com"})
|
74
|
+
#
|
75
|
+
#
|
76
|
+
# Returns:
|
77
|
+
# * array of urls
|
78
|
+
# * array of hashes if the :keep_data option is true
|
79
|
+
# Example:
|
80
|
+
# [{:url => "url1", :data => "some data"},{:url => "url2", :data => "feed data"}]
|
81
|
+
#
|
82
|
+
# Raises:
|
83
|
+
# * ArgumentError if +uri+ is not a valid URL, and :use_google => false
|
84
|
+
# * ArgumentError if :use_google => true but it's not your lucky day
|
85
|
+
#
|
86
|
+
def self.feeds(uri, options = {})
|
112
87
|
|
113
|
-
|
114
|
-
|
88
|
+
# We have to create a hash for the data
|
89
|
+
# if the user has asked us to keep the data
|
90
|
+
options[:data] = {} if options[:keep_data]
|
91
|
+
|
92
|
+
options[:original_uri] = uri if !Rfeedfinder.isAValidURL?(uri) and options[:use_google]
|
115
93
|
|
116
|
-
|
117
|
-
|
94
|
+
uri = URI.decode(uri)
|
95
|
+
options[:recurs] = [uri] if options[:recurs].nil?
|
96
|
+
fulluri = Rfeedfinder.makeFullURI(uri)
|
118
97
|
|
119
|
-
|
120
|
-
|
121
|
-
begin
|
122
|
-
server = Syndic8.new
|
123
|
-
feedids = server.find_feeds(uri)
|
124
|
-
infolist = server.feed_info(feedids, ['headlines_rank','status','dataurl'])
|
125
|
-
infolist.sort_by{|feedInfo| feedInfo[:headlines_rank]}
|
126
|
-
infolist.each do |feed|
|
127
|
-
feeds << feed[:dataurl] if feed[:status]=='Syndicated'
|
128
|
-
end
|
129
|
-
rescue
|
130
|
-
end
|
131
|
-
return feeds
|
132
|
-
end
|
133
|
-
|
134
|
-
def feeds(uri, all=false, querySyndic8=false, _recurs=nil)
|
135
|
-
uri = HTMLEntities.decode_entities(uri)
|
136
|
-
_recurs = [uri] if _recurs.nil?
|
137
|
-
fulluri = makeFullURI(uri)
|
98
|
+
raise ArgumentError, "#{fulluri} is not a valid URI." \
|
99
|
+
if !Rfeedfinder.isAValidURL?(fulluri) and !options[:use_google]
|
138
100
|
|
139
101
|
# Add youtube support
|
140
102
|
if fulluri =~ /youtube\.com\/user\/(.*[^\/])/
|
@@ -143,70 +105,77 @@ module Rfeedfinder
|
|
143
105
|
if fulluri =~ /youtube\.com\/tag\/(.*[^\/])/
|
144
106
|
fulluri = "http://www.youtube.com/rss/tag/#{$1}/videos.rss"
|
145
107
|
end
|
146
|
-
|
147
|
-
data = open_doc(fulluri)
|
108
|
+
|
109
|
+
data = Rfeedfinder.open_doc(fulluri, options)
|
148
110
|
return [] if data.nil?
|
149
111
|
|
112
|
+
# If we used the google link finder, then we should set the new URL
|
113
|
+
fulluri = options[:google_link] if options[:google_link]
|
114
|
+
|
150
115
|
# is this already a feed?
|
151
|
-
if isFeedData?(data)
|
116
|
+
if Rfeedfinder.isFeedData?(data)
|
152
117
|
feedlist = [fulluri]
|
153
|
-
verifyRedirect(feedlist)
|
118
|
+
Rfeedfinder.verifyRedirect(feedlist)
|
154
119
|
return feedlist
|
155
120
|
end
|
156
121
|
|
157
122
|
#verify redirection
|
158
|
-
newuri = tryBrokenRedirect(data)
|
123
|
+
newuri = Rfeedfinder.tryBrokenRedirect(data)
|
159
124
|
if !newuri.nil? and !newuri.empty?
|
160
|
-
unless
|
161
|
-
|
162
|
-
|
125
|
+
options[:recurs] = [] unless options[:recurs]
|
126
|
+
unless options[:recurs].include?(newuri)
|
127
|
+
options[:recurs] << newuri
|
128
|
+
return feeds(newuri, options)
|
163
129
|
end
|
164
130
|
end
|
165
131
|
|
166
132
|
#verify frameset
|
167
|
-
frames = getFrameLinks(data, fulluri)
|
133
|
+
frames = Rfeedfinder.getFrameLinks(data, fulluri)
|
168
134
|
frames.each {|newuri|
|
169
135
|
if !newuri.nil? and !newuri.empty?
|
170
|
-
unless
|
171
|
-
|
172
|
-
|
136
|
+
options[:recurs] = [] unless options[:recurs]
|
137
|
+
unless options[:recurs].include?(newuri)
|
138
|
+
options[:recurs] << newuri
|
139
|
+
return feeds(newuri, options)
|
173
140
|
end
|
174
141
|
end
|
175
142
|
}
|
176
|
-
|
177
|
-
# nope, it's a page, try LINK tags first
|
178
|
-
outfeeds = getLinks(data, fulluri)
|
179
|
-
outfeeds.select {|link| isFeed?(link)}
|
180
143
|
|
144
|
+
# nope, it's a page, try LINK tags first
|
145
|
+
outfeeds = Rfeedfinder.getLinks(data, fulluri).select {|link| Rfeedfinder.isFeed?(link, options)}
|
146
|
+
|
181
147
|
#_debuglog('found %s feeds through LINK tags' % len(outfeeds))
|
182
148
|
if outfeeds.empty?
|
183
149
|
# no LINK tags, look for regular <A> links that point to feeds
|
184
150
|
begin
|
185
|
-
links = getALinks(data, fulluri)
|
151
|
+
links = Rfeedfinder.getALinks(data, fulluri)
|
186
152
|
rescue
|
187
153
|
links = []
|
188
154
|
end
|
189
155
|
|
190
156
|
# Get local links
|
191
|
-
links, locallinks = getLocalLinks(links, fulluri)
|
157
|
+
links, locallinks = Rfeedfinder.getLocalLinks(links, fulluri)
|
158
|
+
|
159
|
+
# TODO:
|
160
|
+
# implement support for :only_first down here
|
192
161
|
|
193
162
|
# look for obvious feed links on the same server
|
194
|
-
selected_feeds = locallinks.select{|link| isFeedLink?(link) and isFeed?(link)}
|
163
|
+
selected_feeds = locallinks.select{|link| Rfeedfinder.isFeedLink?(link) and Rfeedfinder.isFeed?(link, options)}
|
195
164
|
outfeeds << selected_feeds unless selected_feeds.empty?
|
196
165
|
# outfeeds.each{|link| puts "1 #{link}"}
|
197
166
|
|
198
167
|
# look harder for feed links on the same server
|
199
|
-
selected_feeds = locallinks.select{|link| isXMLRelatedLink?(link) and isFeed?(link)} if outfeeds.empty?
|
168
|
+
selected_feeds = locallinks.select{|link| Rfeedfinder.isXMLRelatedLink?(link) and Rfeedfinder.isFeed?(link, options)} if outfeeds.empty?
|
200
169
|
outfeeds << selected_feeds unless selected_feeds.empty?
|
201
170
|
# outfeeds.each{|link| puts "2 #{link}"}
|
202
171
|
|
203
172
|
# look for obvious feed links on another server
|
204
|
-
selected_feeds = links.select {|link| isFeedLink?(link) and isFeed?(link)} if outfeeds.empty?
|
173
|
+
selected_feeds = links.select {|link| Rfeedfinder.isFeedLink?(link) and Rfeedfinder.isFeed?(link, options)} if outfeeds.empty?
|
205
174
|
outfeeds << selected_feeds unless selected_feeds.empty?
|
206
175
|
# outfeeds.each{|link| puts "3 #{link}"}
|
207
176
|
|
208
177
|
# look harder for feed links on another server
|
209
|
-
selected_feeds = links.select {|link| isXMLRelatedLink?(link) and isFeed?(link)} if outfeeds.empty?
|
178
|
+
selected_feeds = links.select {|link| Rfeedfinder.isXMLRelatedLink?(link) and Rfeedfinder.isFeed?(link, options)} if outfeeds.empty?
|
210
179
|
outfeeds << selected_feeds unless selected_feeds.empty?
|
211
180
|
# outfeeds.each{|link| puts "4 #{link}"}
|
212
181
|
end
|
@@ -226,63 +195,300 @@ module Rfeedfinder
|
|
226
195
|
|
227
196
|
guesses.each { |guess|
|
228
197
|
uri = URI.join(fulluri, guess).to_s
|
229
|
-
outfeeds << uri if isFeed?(uri)
|
198
|
+
outfeeds << uri if Rfeedfinder.isFeed?(uri, options)
|
230
199
|
}
|
231
200
|
end
|
232
201
|
|
233
202
|
# try with adding ending slash
|
234
203
|
if outfeeds.empty? and fulluri !~ /\/$/
|
235
|
-
outfeeds = feeds(fulluri + "/",
|
204
|
+
outfeeds = Rfeedfinder.feeds(fulluri + "/", options)
|
236
205
|
end
|
237
|
-
|
238
|
-
# still no luck, search Syndic8 for feeds (requires xmlrpclib)
|
239
|
-
#_debuglog('still no luck, searching Syndic8')
|
240
|
-
outfeeds << getFeedsFromSyndic8(uri) if querySyndic8 and outfeeds.empty?
|
241
|
-
#outfeeds = list(set(outfeeds)) if hasattr(__builtins__, 'set') or __builtins__.has_key('set')
|
242
|
-
|
206
|
+
|
243
207
|
# Verify redirection
|
244
|
-
verifyRedirect(outfeeds)
|
208
|
+
Rfeedfinder.verifyRedirect(outfeeds)
|
209
|
+
|
210
|
+
# This has to be used until proper :only_first support has been built in
|
211
|
+
outfeeds = outfeeds.first if options[:only_first] and outfeeds.size > 1
|
245
212
|
|
246
|
-
|
213
|
+
if options[:keep_data]
|
214
|
+
output = []
|
215
|
+
outfeeds.each do |feed|
|
216
|
+
output << {:url => feed, :data => options[:data][feed]}
|
217
|
+
end
|
218
|
+
return output
|
219
|
+
else
|
220
|
+
return outfeeds
|
221
|
+
end
|
247
222
|
end
|
248
223
|
|
249
|
-
|
250
|
-
|
251
|
-
|
224
|
+
|
225
|
+
#
|
226
|
+
# Takes:
|
227
|
+
# * +uri+ (string): The URI to check
|
228
|
+
# * +options+ (hash)
|
229
|
+
# * +:proxy+: (string) proxy information to use. Defaults to a blank string
|
230
|
+
# * +:user_agent+: (string) user agent to identify as. Defaults to Ruby/#{RUBY_VERSION} - Rfeedfinder VERSION
|
231
|
+
# * +:from+: (string) contact info to the responsible person. FIXME: Is this correct? Defaults to rfeedfinder@googlegroups.com
|
232
|
+
# * +:keep_data+: (boolean) if the data downloaded for the feeds should be returned along with the URLs. Defaults to false
|
233
|
+
# * +:use_google+: (boolean) tries to find a URL using a google "I'm feeling lucky" search. Defaults to false
|
234
|
+
#
|
235
|
+
#
|
236
|
+
# Example:
|
237
|
+
#
|
238
|
+
# Rfeedfinder.feed("www.google.com", {:proxy => "http://127.0.0.1:1234",
|
239
|
+
# :user_agent => "MyApp",
|
240
|
+
# :from => "contact@domain.com",
|
241
|
+
# :referer => "http://domain.com"})
|
242
|
+
#
|
243
|
+
#
|
244
|
+
# Returns:
|
245
|
+
# * one URL as a string or nil
|
246
|
+
# * one hash if the :keep_data option is true
|
247
|
+
# Example:
|
248
|
+
# {:url => "url1", :data => "some data"}
|
249
|
+
#
|
250
|
+
# Raises:
|
251
|
+
# * ArgumentError if +uri+ is not a valid URL, and :use_google => false
|
252
|
+
# * ArgumentError if :use_google => true but it's not your lucky day
|
253
|
+
#
|
254
|
+
def self.feed(uri, options = {})
|
255
|
+
options[:only_first] = true
|
256
|
+
feedlist = Rfeedfinder.feeds(uri, options)
|
252
257
|
unless feedlist.empty?
|
253
258
|
return feedlist[0]
|
254
259
|
else
|
255
260
|
return nil
|
256
261
|
end
|
257
262
|
end
|
263
|
+
|
264
|
+
#
|
265
|
+
# Takes:
|
266
|
+
# * +data+ (string)
|
267
|
+
#
|
268
|
+
# Returns:
|
269
|
+
# * +true+ if the data has a rss, rdf or feed tag
|
270
|
+
# * +false+ if the data has a html tag
|
271
|
+
#
|
272
|
+
def self.isFeedData?(data)
|
273
|
+
# if no html tag and rss, rdf or feed tag, it's a feed
|
274
|
+
# puts data
|
275
|
+
return ((data/"html|HTML").empty? and (!(data/:rss).nil? or !(data/:rdf).nil? or !(data/:feed).nil?))
|
276
|
+
end
|
277
|
+
|
278
|
+
#
|
279
|
+
# Takes:
|
280
|
+
# * +uri+ (string)
|
281
|
+
#
|
282
|
+
# Downloads the URI and checks the content
|
283
|
+
# with the +isFeedData?+ class method
|
284
|
+
#
|
285
|
+
# Returns:
|
286
|
+
# * +true+ if the uri points to a feed
|
287
|
+
# * +false+ if not
|
288
|
+
#
|
289
|
+
def self.isFeed?(uri, options)
|
290
|
+
# We return false if the user only wants one result
|
291
|
+
# and we have already found it, so that no
|
292
|
+
# additional external calls are made
|
293
|
+
return false if options[:only_first] and options[:already_found_one]
|
294
|
+
|
295
|
+
uri.gsub!(/\/\/www\d\./, "//www.")
|
296
|
+
begin
|
297
|
+
protocol = URI.split(uri)
|
298
|
+
return false if !protocol[0].index(/^[http|https]/)
|
299
|
+
rescue
|
300
|
+
# URI error
|
301
|
+
return false
|
302
|
+
end
|
303
|
+
|
304
|
+
data = Rfeedfinder.open_doc(uri, options)
|
305
|
+
return false if data.nil?
|
306
|
+
|
307
|
+
if Rfeedfinder.isFeedData?(data)
|
308
|
+
options[:already_found_one] = true if options[:only_first]
|
309
|
+
return true
|
310
|
+
else
|
311
|
+
return false
|
312
|
+
end
|
313
|
+
end
|
314
|
+
|
315
|
+
protected
|
316
|
+
def self.makeFullURI(uri)
|
317
|
+
uri = uri.strip.sub(/^feed(.*)/, 'http\1').downcase
|
318
|
+
if /^http|https/.match(uri)
|
319
|
+
return uri
|
320
|
+
else
|
321
|
+
return "http://" << uri
|
322
|
+
end
|
323
|
+
end
|
324
|
+
|
325
|
+
def self.getLinks(data, baseuri)
|
326
|
+
return Rfeedfinder.searchLinks(data, baseuri, "[@rel='alternate'][@type*='xml'][@href*='http']")
|
327
|
+
end
|
328
|
+
|
329
|
+
def self.getALinks(data, baseuri)
|
330
|
+
return Rfeedfinder.searchLinks(data, baseuri, "a")
|
331
|
+
end
|
332
|
+
|
333
|
+
def self.getFrameLinks(data, baseuri)
|
334
|
+
links = Rfeedfinder.searchLinks(data, baseuri, "frame")
|
335
|
+
links += Rfeedfinder.searchLinks(data, baseuri, "FRAME")
|
336
|
+
return links
|
337
|
+
end
|
258
338
|
|
259
|
-
def
|
339
|
+
def self.searchLinks(data, baseuri, regexp)
|
340
|
+
links = []
|
341
|
+
data.search(regexp).map!{|link|
|
342
|
+
if !link.to_s.strip.empty? and link.kind_of? Hpricot::Elem and !(link.kind_of? Hpricot::Text)
|
343
|
+
uri = link[:href].to_s
|
344
|
+
uri = link[:HREF].to_s if uri.empty?
|
345
|
+
uri = link[:src].to_s if uri.empty?
|
346
|
+
uri = link[:SRC].to_s if uri.empty?
|
347
|
+
if !uri.strip.empty? and uri !~ /^javascript/
|
348
|
+
uri = URI.join(baseuri, uri).to_s if uri !~ /^http:\/\//
|
349
|
+
links << uri
|
350
|
+
end
|
351
|
+
end
|
352
|
+
}
|
353
|
+
#links.each{|link| puts "Rfeedfinder.searchLinks: #{link}"}
|
354
|
+
return links.uniq
|
355
|
+
end
|
356
|
+
|
357
|
+
def self.getLocalLinks(links, baseuri)
|
358
|
+
locallinks = []
|
359
|
+
links.each do |link|
|
360
|
+
locallinks << URI.join(baseuri, link).to_s if link =~ /^\//
|
361
|
+
end
|
362
|
+
links = links.select{|link| link !~ /^\//} #remove local links from link array
|
363
|
+
return [links, locallinks]
|
364
|
+
end
|
365
|
+
|
366
|
+
def self.isFeedLink?(link)
|
367
|
+
return link.downcase =~ /\.rss$|\.rdf$|\.xml$|\.atom$/
|
368
|
+
end
|
369
|
+
|
370
|
+
def self.isXMLRelatedLink?(link)
|
371
|
+
return link.downcase =~ /rss|rdf|xml|atom/
|
372
|
+
end
|
373
|
+
|
374
|
+
def self.tryBrokenRedirect(data)
|
375
|
+
newuris = (data/:newLocation)
|
376
|
+
if !newuris.empty?
|
377
|
+
return newuris[0].strip
|
378
|
+
end
|
379
|
+
end
|
380
|
+
|
381
|
+
def self.verifyRedirect(feedlist)
|
382
|
+
feedlist.each do |feed|
|
383
|
+
begin
|
384
|
+
response = Net::HTTP.get_response(URI.parse(feed))
|
385
|
+
#puts "Verify #{feed} - code: #{response.code}"
|
386
|
+
if response.code == "302"
|
387
|
+
newuri = response.body.match(/<a href=\"([^>]+)\">/)[1]
|
388
|
+
|
389
|
+
feedlist.delete(feed)
|
390
|
+
feedlist << newuri
|
391
|
+
feedlist.uniq!
|
392
|
+
end
|
393
|
+
rescue
|
394
|
+
# rescue net error
|
395
|
+
end
|
396
|
+
end
|
397
|
+
return feedlist
|
398
|
+
end
|
399
|
+
|
400
|
+
def self.open_doc(link, options)
|
401
|
+
|
402
|
+
# Setting default values for missing options
|
403
|
+
options[:proxy] = URI.parse(options[:proxy]) if options[:proxy]
|
404
|
+
options[:user_agent] = options[:user_agent] || "Ruby/#{RUBY_VERSION} - " + \
|
405
|
+
"Rfeedfinder #{Rfeedfinder::VERSION::STRING}"
|
406
|
+
options[:from] = options[:from] || "rfeedfinder@googlegroups.com"
|
407
|
+
options[:referer] = options[:referer] || "http://rfeedfinder.rubyforge.org/"
|
408
|
+
|
260
409
|
data = nil
|
410
|
+
|
411
|
+
if !Rfeedfinder.isAValidURL?(link) and options[:use_google]
|
412
|
+
# Used google lucky script as found on
|
413
|
+
# http://www.leancrew.com/all-this/2006/07/lucky-linking/
|
414
|
+
# It doesn't work too well...
|
415
|
+
# TODO: Improve it somehow. The real google function works a lot better!
|
416
|
+
# TODO: Build in support for languages through parameter "hl" (=> "en" by default)
|
417
|
+
prefix = "http://www.google.com/search?q="
|
418
|
+
suffix = "&btnI=I'm+Feeling+Lucky"
|
419
|
+
goodURL = URI.escape(prefix + options[:original_uri] + suffix)
|
420
|
+
puts "Checking #{goodURL}"
|
421
|
+
response = Net::HTTP.get_response(URI.parse(goodURL))
|
422
|
+
link = response.to_hash['location'].first
|
423
|
+
options[:google_link] = link
|
424
|
+
raise ArgumentError, "Google couldn't save us. We couldn't find anything for #{options[:original_uri]}" if link.nil?
|
425
|
+
end
|
426
|
+
|
261
427
|
begin
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
"
|
267
|
-
|
428
|
+
|
429
|
+
Timeout::timeout(20) do
|
430
|
+
|
431
|
+
data = Hpricot(open(link, {
|
432
|
+
"User-Agent" => options[:user_agent],
|
433
|
+
"From" => options[:from],
|
434
|
+
"Referer" => options[:referer],
|
435
|
+
:proxy => options[:proxy]
|
436
|
+
}), :xml => true)
|
437
|
+
|
438
|
+
end
|
439
|
+
|
268
440
|
rescue OpenURI::HTTPError
|
441
|
+
|
269
442
|
begin
|
270
|
-
|
443
|
+
|
444
|
+
Timeout::timeout(20) do
|
445
|
+
|
271
446
|
html = Net::HTTP.get(URI.parse(link))
|
272
447
|
data = Hpricot(html, :xml => true) if html.to_s !~ /404 Not Found/
|
273
|
-
|
448
|
+
|
449
|
+
end
|
450
|
+
|
274
451
|
rescue Timeout::Error
|
275
452
|
return nil
|
453
|
+
|
276
454
|
rescue => err
|
277
455
|
puts "Error while opening #{link} with Hpricot: #{err.class} " << $!
|
278
456
|
return nil
|
457
|
+
|
279
458
|
end
|
459
|
+
|
280
460
|
rescue Timeout::Error
|
281
461
|
return nil
|
462
|
+
|
282
463
|
rescue => err
|
283
464
|
puts "Error while opening #{link} with Hpricot: #{err.class} " << $!
|
284
465
|
return nil
|
466
|
+
|
285
467
|
end
|
468
|
+
|
469
|
+
# Store the data for the URL if the user has requested it
|
470
|
+
options[:data][link] = data.to_original_html if options[:keep_data]
|
471
|
+
|
286
472
|
return data
|
287
473
|
end
|
474
|
+
|
475
|
+
def self.isAValidURL?(url_to_check)
|
476
|
+
return false if url_to_check == nil
|
477
|
+
|
478
|
+
# The protocols that we allow are the following
|
479
|
+
protocol_whitelist = ["http", "https"]
|
480
|
+
# I guess we could have included some more, but that doesn't really
|
481
|
+
# make sense anyway as these are the ones that should be used.
|
482
|
+
# We'll see if the need arises and then add more later if needed.
|
483
|
+
|
484
|
+
re = Regexp.new("(#{protocol_whitelist.join('|')}):" + \
|
485
|
+
"\/\/([[:alpha:][:digit:].]{2,})([.]{1})([[:alpha:]]{2,4})(\/)")
|
486
|
+
|
487
|
+
# For the sake of the regular expression check we add a forward slash
|
488
|
+
# at the end of the URL
|
489
|
+
url_to_check += "/"
|
490
|
+
return true unless (re =~ url_to_check) == nil
|
491
|
+
false
|
492
|
+
end
|
493
|
+
|
288
494
|
end
|
data/lib/rfeedfinder/version.rb
CHANGED
data/test/test_rfeedfinder.rb
CHANGED
@@ -6,12 +6,12 @@ class TestRfeedfinder < Test::Unit::TestCase
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def test_feed
|
9
|
-
feed_finder "scripting.com",
|
9
|
+
feed_finder "http://scripting.com",
|
10
10
|
"http://www.scripting.com/rss.xml"
|
11
11
|
end
|
12
12
|
|
13
13
|
def test_feeds
|
14
|
-
feeds = Rfeedfinder.feeds("flickr.com/photos/alx")
|
14
|
+
feeds = Rfeedfinder.feeds("http://flickr.com/photos/alx")
|
15
15
|
assert_equal 2, feeds.size
|
16
16
|
end
|
17
17
|
|
@@ -143,6 +143,6 @@ class TestRfeedfinder < Test::Unit::TestCase
|
|
143
143
|
end
|
144
144
|
|
145
145
|
def test_nytimes
|
146
|
-
feed_finder "http://www.nytimes.com/"
|
146
|
+
feed_finder "http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml"
|
147
147
|
end
|
148
148
|
end
|
metadata
CHANGED
@@ -1,33 +1,59 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.4
|
3
|
-
specification_version: 1
|
4
2
|
name: rfeedfinder
|
5
3
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.9.
|
7
|
-
date: 2008-04-03 00:00:00 +02:00
|
8
|
-
summary: rFeedFinder uses RSS autodiscovery, Atom autodiscovery, spidering, URL correction, and Web service queries -- whatever it takes -- to find the feed.
|
9
|
-
require_paths:
|
10
|
-
- lib
|
11
|
-
email: alx.girard@gmail.com
|
12
|
-
homepage: http://rfeedfinder.rubyforge.org
|
13
|
-
rubyforge_project: rfeedfinder
|
14
|
-
description: rFeedFinder uses RSS autodiscovery, Atom autodiscovery, spidering, URL correction, and Web service queries -- whatever it takes -- to find the feed.
|
15
|
-
autorequire:
|
16
|
-
default_executable:
|
17
|
-
bindir: bin
|
18
|
-
has_rdoc: true
|
19
|
-
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">"
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 0.0.0
|
24
|
-
version:
|
4
|
+
version: 0.9.13
|
25
5
|
platform: ruby
|
26
|
-
signing_key:
|
27
|
-
cert_chain:
|
28
|
-
post_install_message:
|
29
6
|
authors:
|
30
7
|
- Alexandre Girard
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-10-11 00:00:00 +02:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hpricot
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0.6"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: htmlentities
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 4.0.0
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: hoe
|
37
|
+
type: :development
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.7.0
|
44
|
+
version:
|
45
|
+
description: rFeedFinder uses RSS autodiscovery, Atom autodiscovery, spidering, URL correction, and Web service queries -- whatever it takes -- to find the feed.
|
46
|
+
email: alx.girard@gmail.com
|
47
|
+
executables: []
|
48
|
+
|
49
|
+
extensions: []
|
50
|
+
|
51
|
+
extra_rdoc_files:
|
52
|
+
- History.txt
|
53
|
+
- License.txt
|
54
|
+
- Manifest.txt
|
55
|
+
- README.txt
|
56
|
+
- website/index.txt
|
31
57
|
files:
|
32
58
|
- History.txt
|
33
59
|
- License.txt
|
@@ -45,40 +71,33 @@ files:
|
|
45
71
|
- website/javascripts/rounded_corners_lite.inc.js
|
46
72
|
- website/stylesheets/screen.css
|
47
73
|
- website/template.rhtml
|
48
|
-
|
49
|
-
|
50
|
-
|
74
|
+
has_rdoc: true
|
75
|
+
homepage: http://rfeedfinder.rubyforge.org
|
76
|
+
post_install_message:
|
51
77
|
rdoc_options:
|
52
78
|
- --main
|
53
79
|
- README.txt
|
54
|
-
|
55
|
-
-
|
56
|
-
|
57
|
-
|
58
|
-
-
|
59
|
-
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
80
|
+
require_paths:
|
81
|
+
- lib
|
82
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - ">="
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: "0"
|
87
|
+
version:
|
88
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: "0"
|
93
|
+
version:
|
64
94
|
requirements: []
|
65
95
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
version: "0.6"
|
75
|
-
version:
|
76
|
-
- !ruby/object:Gem::Dependency
|
77
|
-
name: htmlentities
|
78
|
-
version_requirement:
|
79
|
-
version_requirements: !ruby/object:Gem::Version::Requirement
|
80
|
-
requirements:
|
81
|
-
- - ">="
|
82
|
-
- !ruby/object:Gem::Version
|
83
|
-
version: 4.0.0
|
84
|
-
version:
|
96
|
+
rubyforge_project: rfeedfinder
|
97
|
+
rubygems_version: 1.2.0
|
98
|
+
signing_key:
|
99
|
+
specification_version: 2
|
100
|
+
summary: rFeedFinder uses RSS autodiscovery, Atom autodiscovery, spidering, URL correction, and Web service queries -- whatever it takes -- to find the feed.
|
101
|
+
test_files:
|
102
|
+
- test/test_helper.rb
|
103
|
+
- test/test_rfeedfinder.rb
|