rfeedfinder 0.9.12 → 0.9.13
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/lib/rfeedfinder.rb +370 -164
- data/lib/rfeedfinder/version.rb +3 -3
- data/test/test_rfeedfinder.rb +3 -3
- metadata +75 -56
data/History.txt
CHANGED
data/lib/rfeedfinder.rb
CHANGED
@@ -1,140 +1,102 @@
|
|
1
1
|
require 'net/http'
|
2
2
|
require 'rubygems'
|
3
|
-
require 'htmlentities'
|
4
3
|
require 'open-uri'
|
5
4
|
require 'hpricot'
|
6
5
|
require 'timeout'
|
7
6
|
|
8
|
-
require
|
7
|
+
require File.dirname(__FILE__) + "/rfeedfinder/version"
|
9
8
|
|
10
|
-
module Rfeedfinder
|
11
9
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
end
|
36
|
-
|
37
|
-
def searchLinks(data, baseuri, regexp)
|
38
|
-
links = []
|
39
|
-
data.search(regexp).map!{|link|
|
40
|
-
if !link.to_s.strip.empty? and link.kind_of? Hpricot::Elem and !(link.kind_of? Hpricot::Text)
|
41
|
-
uri = link[:href].to_s
|
42
|
-
uri = link[:HREF].to_s if uri.empty?
|
43
|
-
uri = link[:src].to_s if uri.empty?
|
44
|
-
uri = link[:SRC].to_s if uri.empty?
|
45
|
-
if !uri.strip.empty? and uri !~ /^javascript/
|
46
|
-
uri = URI.join(baseuri, uri).to_s if uri !~ /^http:\/\//
|
47
|
-
links << uri
|
48
|
-
end
|
49
|
-
end
|
50
|
-
}
|
51
|
-
#links.each{|link| puts "searchLinks: #{link}"}
|
52
|
-
return links.uniq
|
53
|
-
end
|
54
|
-
|
55
|
-
def getLocalLinks(links, baseuri)
|
56
|
-
locallinks = []
|
57
|
-
links.each do |link|
|
58
|
-
locallinks << URI.join(baseuri, link).to_s if link =~ /^\//
|
59
|
-
end
|
60
|
-
links = links.select{|link| link !~ /^\//} #remove local links from link array
|
61
|
-
return [links, locallinks]
|
10
|
+
class Rfeedfinder
|
11
|
+
#
|
12
|
+
# Takes:
|
13
|
+
# * +init_values+ (hash)
|
14
|
+
# * +:proxy+: (string) proxy information to use. Defaults to a blank string
|
15
|
+
# * +:user_agent+: (string) user agent to identify as. Defaults to Ruby/#{RUBY_VERSION} - Rfeedfinder VERSION
|
16
|
+
# * +:from+: (string) contact info to the responsible person. FIXME: Is this correct? Defaults to rfeedfinder@googlegroups.com
|
17
|
+
# * +:keep_data+: (boolean) if the data downloaded for the feeds should be returned along with the URLs. Defaults to false
|
18
|
+
# * +:use_google+: (boolean) tries to find a URL using a google "I'm feeling lucky" search. Defaults to false
|
19
|
+
#
|
20
|
+
#
|
21
|
+
# Example:
|
22
|
+
#
|
23
|
+
# Rfeedfinder.new({:proxy => "http://127.0.0.1:1234",
|
24
|
+
# :user_agent => "MyApp",
|
25
|
+
# :from => "contact@domain.com",
|
26
|
+
# :referer => "http://domain.com"})
|
27
|
+
#
|
28
|
+
#
|
29
|
+
# Returns a new instance of Rfeedfinder
|
30
|
+
#
|
31
|
+
def initialize(init_values = {})
|
32
|
+
@options = init_values
|
62
33
|
end
|
63
34
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
35
|
+
#
|
36
|
+
# Takes:
|
37
|
+
# * +uri+ (string)
|
38
|
+
#
|
39
|
+
# Returns:
|
40
|
+
# * array of urls
|
41
|
+
#
|
42
|
+
def feeds(uri)
|
43
|
+
Rfeedfinder.feeds(uri, @options.dup)
|
70
44
|
end
|
71
45
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
def
|
80
|
-
|
81
|
-
begin
|
82
|
-
response = Net::HTTP.get_response(URI.parse(feed))
|
83
|
-
#puts "Verify #{feed} - code: #{response.code}"
|
84
|
-
if response.code == "302"
|
85
|
-
newuri = response.body.match(/<a href=\"([^>]+)\">/)[1]
|
86
|
-
|
87
|
-
feedlist.delete(feed)
|
88
|
-
feedlist << newuri
|
89
|
-
feedlist.uniq!
|
90
|
-
end
|
91
|
-
rescue
|
92
|
-
# rescue net error
|
93
|
-
end
|
94
|
-
end
|
95
|
-
return feedlist
|
96
|
-
end
|
97
|
-
|
98
|
-
def isFeedData?(data)
|
99
|
-
# if no html tag and rss, rdf or feed tag, it's a feed
|
100
|
-
return ((data/"html|HTML").empty? and (!(data/:rss).nil? or !(data/:rdf).nil? or !(data/:feed).nil?))
|
46
|
+
#
|
47
|
+
# Takes:
|
48
|
+
# * +uri+ (string)
|
49
|
+
#
|
50
|
+
# Returns:
|
51
|
+
# * url (string)
|
52
|
+
#
|
53
|
+
def feed(uri)
|
54
|
+
result = Rfeedfinder.feed(uri, @options.dup)
|
101
55
|
end
|
102
56
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
57
|
+
#
|
58
|
+
# Takes:
|
59
|
+
# * +uri+ (string): The URI to check
|
60
|
+
# * +options+ (hash)
|
61
|
+
# * +:proxy+: (string) proxy information to use. Defaults to a blank string
|
62
|
+
# * +:user_agent+: (string) user agent to identify as. Defaults to Ruby/#{RUBY_VERSION} - Rfeedfinder VERSION
|
63
|
+
# * +:from+: (string) contact info to the responsible person. FIXME: Is this correct? Defaults to rfeedfinder@googlegroups.com
|
64
|
+
# * +:keep_data+: (boolean) if the data downloaded for the feeds should be returned along with the URLs. Defaults to false
|
65
|
+
# * +:use_google+: (boolean) tries to find a URL using a google "I'm feeling lucky" search. Defaults to false
|
66
|
+
#
|
67
|
+
#
|
68
|
+
# Example:
|
69
|
+
#
|
70
|
+
# Rfeedfinder.feeds("www.google.com", {:proxy => "http://127.0.0.1:1234",
|
71
|
+
# :user_agent => "MyApp",
|
72
|
+
# :from => "contact@domain.com",
|
73
|
+
# :referer => "http://domain.com"})
|
74
|
+
#
|
75
|
+
#
|
76
|
+
# Returns:
|
77
|
+
# * array of urls
|
78
|
+
# * array of hashes if the :keep_data option is true
|
79
|
+
# Example:
|
80
|
+
# [{:url => "url1", :data => "some data"},{:url => "url2", :data => "feed data"}]
|
81
|
+
#
|
82
|
+
# Raises:
|
83
|
+
# * ArgumentError if +uri+ is not a valid URL, and :use_google => false
|
84
|
+
# * ArgumentError if :use_google => true but it's not your lucky day
|
85
|
+
#
|
86
|
+
def self.feeds(uri, options = {})
|
112
87
|
|
113
|
-
|
114
|
-
|
88
|
+
# We have to create a hash for the data
|
89
|
+
# if the user has asked us to keep the data
|
90
|
+
options[:data] = {} if options[:keep_data]
|
91
|
+
|
92
|
+
options[:original_uri] = uri if !Rfeedfinder.isAValidURL?(uri) and options[:use_google]
|
115
93
|
|
116
|
-
|
117
|
-
|
94
|
+
uri = URI.decode(uri)
|
95
|
+
options[:recurs] = [uri] if options[:recurs].nil?
|
96
|
+
fulluri = Rfeedfinder.makeFullURI(uri)
|
118
97
|
|
119
|
-
|
120
|
-
|
121
|
-
begin
|
122
|
-
server = Syndic8.new
|
123
|
-
feedids = server.find_feeds(uri)
|
124
|
-
infolist = server.feed_info(feedids, ['headlines_rank','status','dataurl'])
|
125
|
-
infolist.sort_by{|feedInfo| feedInfo[:headlines_rank]}
|
126
|
-
infolist.each do |feed|
|
127
|
-
feeds << feed[:dataurl] if feed[:status]=='Syndicated'
|
128
|
-
end
|
129
|
-
rescue
|
130
|
-
end
|
131
|
-
return feeds
|
132
|
-
end
|
133
|
-
|
134
|
-
def feeds(uri, all=false, querySyndic8=false, _recurs=nil)
|
135
|
-
uri = HTMLEntities.decode_entities(uri)
|
136
|
-
_recurs = [uri] if _recurs.nil?
|
137
|
-
fulluri = makeFullURI(uri)
|
98
|
+
raise ArgumentError, "#{fulluri} is not a valid URI." \
|
99
|
+
if !Rfeedfinder.isAValidURL?(fulluri) and !options[:use_google]
|
138
100
|
|
139
101
|
# Add youtube support
|
140
102
|
if fulluri =~ /youtube\.com\/user\/(.*[^\/])/
|
@@ -143,70 +105,77 @@ module Rfeedfinder
|
|
143
105
|
if fulluri =~ /youtube\.com\/tag\/(.*[^\/])/
|
144
106
|
fulluri = "http://www.youtube.com/rss/tag/#{$1}/videos.rss"
|
145
107
|
end
|
146
|
-
|
147
|
-
data = open_doc(fulluri)
|
108
|
+
|
109
|
+
data = Rfeedfinder.open_doc(fulluri, options)
|
148
110
|
return [] if data.nil?
|
149
111
|
|
112
|
+
# If we used the google link finder, then we should set the new URL
|
113
|
+
fulluri = options[:google_link] if options[:google_link]
|
114
|
+
|
150
115
|
# is this already a feed?
|
151
|
-
if isFeedData?(data)
|
116
|
+
if Rfeedfinder.isFeedData?(data)
|
152
117
|
feedlist = [fulluri]
|
153
|
-
verifyRedirect(feedlist)
|
118
|
+
Rfeedfinder.verifyRedirect(feedlist)
|
154
119
|
return feedlist
|
155
120
|
end
|
156
121
|
|
157
122
|
#verify redirection
|
158
|
-
newuri = tryBrokenRedirect(data)
|
123
|
+
newuri = Rfeedfinder.tryBrokenRedirect(data)
|
159
124
|
if !newuri.nil? and !newuri.empty?
|
160
|
-
unless
|
161
|
-
|
162
|
-
|
125
|
+
options[:recurs] = [] unless options[:recurs]
|
126
|
+
unless options[:recurs].include?(newuri)
|
127
|
+
options[:recurs] << newuri
|
128
|
+
return feeds(newuri, options)
|
163
129
|
end
|
164
130
|
end
|
165
131
|
|
166
132
|
#verify frameset
|
167
|
-
frames = getFrameLinks(data, fulluri)
|
133
|
+
frames = Rfeedfinder.getFrameLinks(data, fulluri)
|
168
134
|
frames.each {|newuri|
|
169
135
|
if !newuri.nil? and !newuri.empty?
|
170
|
-
unless
|
171
|
-
|
172
|
-
|
136
|
+
options[:recurs] = [] unless options[:recurs]
|
137
|
+
unless options[:recurs].include?(newuri)
|
138
|
+
options[:recurs] << newuri
|
139
|
+
return feeds(newuri, options)
|
173
140
|
end
|
174
141
|
end
|
175
142
|
}
|
176
|
-
|
177
|
-
# nope, it's a page, try LINK tags first
|
178
|
-
outfeeds = getLinks(data, fulluri)
|
179
|
-
outfeeds.select {|link| isFeed?(link)}
|
180
143
|
|
144
|
+
# nope, it's a page, try LINK tags first
|
145
|
+
outfeeds = Rfeedfinder.getLinks(data, fulluri).select {|link| Rfeedfinder.isFeed?(link, options)}
|
146
|
+
|
181
147
|
#_debuglog('found %s feeds through LINK tags' % len(outfeeds))
|
182
148
|
if outfeeds.empty?
|
183
149
|
# no LINK tags, look for regular <A> links that point to feeds
|
184
150
|
begin
|
185
|
-
links = getALinks(data, fulluri)
|
151
|
+
links = Rfeedfinder.getALinks(data, fulluri)
|
186
152
|
rescue
|
187
153
|
links = []
|
188
154
|
end
|
189
155
|
|
190
156
|
# Get local links
|
191
|
-
links, locallinks = getLocalLinks(links, fulluri)
|
157
|
+
links, locallinks = Rfeedfinder.getLocalLinks(links, fulluri)
|
158
|
+
|
159
|
+
# TODO:
|
160
|
+
# implement support for :only_first down here
|
192
161
|
|
193
162
|
# look for obvious feed links on the same server
|
194
|
-
selected_feeds = locallinks.select{|link| isFeedLink?(link) and isFeed?(link)}
|
163
|
+
selected_feeds = locallinks.select{|link| Rfeedfinder.isFeedLink?(link) and Rfeedfinder.isFeed?(link, options)}
|
195
164
|
outfeeds << selected_feeds unless selected_feeds.empty?
|
196
165
|
# outfeeds.each{|link| puts "1 #{link}"}
|
197
166
|
|
198
167
|
# look harder for feed links on the same server
|
199
|
-
selected_feeds = locallinks.select{|link| isXMLRelatedLink?(link) and isFeed?(link)} if outfeeds.empty?
|
168
|
+
selected_feeds = locallinks.select{|link| Rfeedfinder.isXMLRelatedLink?(link) and Rfeedfinder.isFeed?(link, options)} if outfeeds.empty?
|
200
169
|
outfeeds << selected_feeds unless selected_feeds.empty?
|
201
170
|
# outfeeds.each{|link| puts "2 #{link}"}
|
202
171
|
|
203
172
|
# look for obvious feed links on another server
|
204
|
-
selected_feeds = links.select {|link| isFeedLink?(link) and isFeed?(link)} if outfeeds.empty?
|
173
|
+
selected_feeds = links.select {|link| Rfeedfinder.isFeedLink?(link) and Rfeedfinder.isFeed?(link, options)} if outfeeds.empty?
|
205
174
|
outfeeds << selected_feeds unless selected_feeds.empty?
|
206
175
|
# outfeeds.each{|link| puts "3 #{link}"}
|
207
176
|
|
208
177
|
# look harder for feed links on another server
|
209
|
-
selected_feeds = links.select {|link| isXMLRelatedLink?(link) and isFeed?(link)} if outfeeds.empty?
|
178
|
+
selected_feeds = links.select {|link| Rfeedfinder.isXMLRelatedLink?(link) and Rfeedfinder.isFeed?(link, options)} if outfeeds.empty?
|
210
179
|
outfeeds << selected_feeds unless selected_feeds.empty?
|
211
180
|
# outfeeds.each{|link| puts "4 #{link}"}
|
212
181
|
end
|
@@ -226,63 +195,300 @@ module Rfeedfinder
|
|
226
195
|
|
227
196
|
guesses.each { |guess|
|
228
197
|
uri = URI.join(fulluri, guess).to_s
|
229
|
-
outfeeds << uri if isFeed?(uri)
|
198
|
+
outfeeds << uri if Rfeedfinder.isFeed?(uri, options)
|
230
199
|
}
|
231
200
|
end
|
232
201
|
|
233
202
|
# try with adding ending slash
|
234
203
|
if outfeeds.empty? and fulluri !~ /\/$/
|
235
|
-
outfeeds = feeds(fulluri + "/",
|
204
|
+
outfeeds = Rfeedfinder.feeds(fulluri + "/", options)
|
236
205
|
end
|
237
|
-
|
238
|
-
# still no luck, search Syndic8 for feeds (requires xmlrpclib)
|
239
|
-
#_debuglog('still no luck, searching Syndic8')
|
240
|
-
outfeeds << getFeedsFromSyndic8(uri) if querySyndic8 and outfeeds.empty?
|
241
|
-
#outfeeds = list(set(outfeeds)) if hasattr(__builtins__, 'set') or __builtins__.has_key('set')
|
242
|
-
|
206
|
+
|
243
207
|
# Verify redirection
|
244
|
-
verifyRedirect(outfeeds)
|
208
|
+
Rfeedfinder.verifyRedirect(outfeeds)
|
209
|
+
|
210
|
+
# This has to be used until proper :only_first support has been built in
|
211
|
+
outfeeds = outfeeds.first if options[:only_first] and outfeeds.size > 1
|
245
212
|
|
246
|
-
|
213
|
+
if options[:keep_data]
|
214
|
+
output = []
|
215
|
+
outfeeds.each do |feed|
|
216
|
+
output << {:url => feed, :data => options[:data][feed]}
|
217
|
+
end
|
218
|
+
return output
|
219
|
+
else
|
220
|
+
return outfeeds
|
221
|
+
end
|
247
222
|
end
|
248
223
|
|
249
|
-
|
250
|
-
|
251
|
-
|
224
|
+
|
225
|
+
#
|
226
|
+
# Takes:
|
227
|
+
# * +uri+ (string): The URI to check
|
228
|
+
# * +options+ (hash)
|
229
|
+
# * +:proxy+: (string) proxy information to use. Defaults to a blank string
|
230
|
+
# * +:user_agent+: (string) user agent to identify as. Defaults to Ruby/#{RUBY_VERSION} - Rfeedfinder VERSION
|
231
|
+
# * +:from+: (string) contact info to the responsible person. FIXME: Is this correct? Defaults to rfeedfinder@googlegroups.com
|
232
|
+
# * +:keep_data+: (boolean) if the data downloaded for the feeds should be returned along with the URLs. Defaults to false
|
233
|
+
# * +:use_google+: (boolean) tries to find a URL using a google "I'm feeling lucky" search. Defaults to false
|
234
|
+
#
|
235
|
+
#
|
236
|
+
# Example:
|
237
|
+
#
|
238
|
+
# Rfeedfinder.feed("www.google.com", {:proxy => "http://127.0.0.1:1234",
|
239
|
+
# :user_agent => "MyApp",
|
240
|
+
# :from => "contact@domain.com",
|
241
|
+
# :referer => "http://domain.com"})
|
242
|
+
#
|
243
|
+
#
|
244
|
+
# Returns:
|
245
|
+
# * one URL as a string or nil
|
246
|
+
# * one hash if the :keep_data option is true
|
247
|
+
# Example:
|
248
|
+
# {:url => "url1", :data => "some data"}
|
249
|
+
#
|
250
|
+
# Raises:
|
251
|
+
# * ArgumentError if +uri+ is not a valid URL, and :use_google => false
|
252
|
+
# * ArgumentError if :use_google => true but it's not your lucky day
|
253
|
+
#
|
254
|
+
def self.feed(uri, options = {})
|
255
|
+
options[:only_first] = true
|
256
|
+
feedlist = Rfeedfinder.feeds(uri, options)
|
252
257
|
unless feedlist.empty?
|
253
258
|
return feedlist[0]
|
254
259
|
else
|
255
260
|
return nil
|
256
261
|
end
|
257
262
|
end
|
263
|
+
|
264
|
+
#
|
265
|
+
# Takes:
|
266
|
+
# * +data+ (string)
|
267
|
+
#
|
268
|
+
# Returns:
|
269
|
+
# * +true+ if the data has a rss, rdf or feed tag
|
270
|
+
# * +false+ if the data has a html tag
|
271
|
+
#
|
272
|
+
def self.isFeedData?(data)
|
273
|
+
# if no html tag and rss, rdf or feed tag, it's a feed
|
274
|
+
# puts data
|
275
|
+
return ((data/"html|HTML").empty? and (!(data/:rss).nil? or !(data/:rdf).nil? or !(data/:feed).nil?))
|
276
|
+
end
|
277
|
+
|
278
|
+
#
|
279
|
+
# Takes:
|
280
|
+
# * +uri+ (string)
|
281
|
+
#
|
282
|
+
# Downloads the URI and checks the content
|
283
|
+
# with the +isFeedData?+ class method
|
284
|
+
#
|
285
|
+
# Returns:
|
286
|
+
# * +true+ if the uri points to a feed
|
287
|
+
# * +false+ if not
|
288
|
+
#
|
289
|
+
def self.isFeed?(uri, options)
|
290
|
+
# We return false if the user only wants one result
|
291
|
+
# and we have already found it, so that we don't make
|
292
|
+
# any additional external calls
|
293
|
+
return false if options[:only_first] and options[:already_found_one]
|
294
|
+
|
295
|
+
uri.gsub!(/\/\/www\d\./, "//www.")
|
296
|
+
begin
|
297
|
+
protocol = URI.split(uri)
|
298
|
+
return false if !protocol[0].index(/^[http|https]/)
|
299
|
+
rescue
|
300
|
+
# URI error
|
301
|
+
return false
|
302
|
+
end
|
303
|
+
|
304
|
+
data = Rfeedfinder.open_doc(uri, options)
|
305
|
+
return false if data.nil?
|
306
|
+
|
307
|
+
if Rfeedfinder.isFeedData?(data)
|
308
|
+
options[:already_found_one] = true if options[:only_first]
|
309
|
+
return true
|
310
|
+
else
|
311
|
+
return false
|
312
|
+
end
|
313
|
+
end
|
314
|
+
|
315
|
+
protected
|
316
|
+
def self.makeFullURI(uri)
|
317
|
+
uri = uri.strip.sub(/^feed(.*)/, 'http\1').downcase
|
318
|
+
if /^http|https/.match(uri)
|
319
|
+
return uri
|
320
|
+
else
|
321
|
+
return "http://" << uri
|
322
|
+
end
|
323
|
+
end
|
324
|
+
|
325
|
+
def self.getLinks(data, baseuri)
|
326
|
+
return Rfeedfinder.searchLinks(data, baseuri, "[@rel='alternate'][@type*='xml'][@href*='http']")
|
327
|
+
end
|
328
|
+
|
329
|
+
def self.getALinks(data, baseuri)
|
330
|
+
return Rfeedfinder.searchLinks(data, baseuri, "a")
|
331
|
+
end
|
332
|
+
|
333
|
+
def self.getFrameLinks(data, baseuri)
|
334
|
+
links = Rfeedfinder.searchLinks(data, baseuri, "frame")
|
335
|
+
links += Rfeedfinder.searchLinks(data, baseuri, "FRAME")
|
336
|
+
return links
|
337
|
+
end
|
258
338
|
|
259
|
-
def
|
339
|
+
def self.searchLinks(data, baseuri, regexp)
|
340
|
+
links = []
|
341
|
+
data.search(regexp).map!{|link|
|
342
|
+
if !link.to_s.strip.empty? and link.kind_of? Hpricot::Elem and !(link.kind_of? Hpricot::Text)
|
343
|
+
uri = link[:href].to_s
|
344
|
+
uri = link[:HREF].to_s if uri.empty?
|
345
|
+
uri = link[:src].to_s if uri.empty?
|
346
|
+
uri = link[:SRC].to_s if uri.empty?
|
347
|
+
if !uri.strip.empty? and uri !~ /^javascript/
|
348
|
+
uri = URI.join(baseuri, uri).to_s if uri !~ /^http:\/\//
|
349
|
+
links << uri
|
350
|
+
end
|
351
|
+
end
|
352
|
+
}
|
353
|
+
#links.each{|link| puts "Rfeedfinder.searchLinks: #{link}"}
|
354
|
+
return links.uniq
|
355
|
+
end
|
356
|
+
|
357
|
+
def self.getLocalLinks(links, baseuri)
|
358
|
+
locallinks = []
|
359
|
+
links.each do |link|
|
360
|
+
locallinks << URI.join(baseuri, link).to_s if link =~ /^\//
|
361
|
+
end
|
362
|
+
links = links.select{|link| link !~ /^\//} #remove local links from link array
|
363
|
+
return [links, locallinks]
|
364
|
+
end
|
365
|
+
|
366
|
+
def self.isFeedLink?(link)
|
367
|
+
return link.downcase =~ /\.rss$|\.rdf$|\.xml$|\.atom$/
|
368
|
+
end
|
369
|
+
|
370
|
+
def self.isXMLRelatedLink?(link)
|
371
|
+
return link.downcase =~ /rss|rdf|xml|atom/
|
372
|
+
end
|
373
|
+
|
374
|
+
def self.tryBrokenRedirect(data)
|
375
|
+
newuris = (data/:newLocation)
|
376
|
+
if !newuris.empty?
|
377
|
+
return newuris[0].strip
|
378
|
+
end
|
379
|
+
end
|
380
|
+
|
381
|
+
def self.verifyRedirect(feedlist)
|
382
|
+
feedlist.each do |feed|
|
383
|
+
begin
|
384
|
+
response = Net::HTTP.get_response(URI.parse(feed))
|
385
|
+
#puts "Verify #{feed} - code: #{response.code}"
|
386
|
+
if response.code == "302"
|
387
|
+
newuri = response.body.match(/<a href=\"([^>]+)\">/)[1]
|
388
|
+
|
389
|
+
feedlist.delete(feed)
|
390
|
+
feedlist << newuri
|
391
|
+
feedlist.uniq!
|
392
|
+
end
|
393
|
+
rescue
|
394
|
+
# rescue net error
|
395
|
+
end
|
396
|
+
end
|
397
|
+
return feedlist
|
398
|
+
end
|
399
|
+
|
400
|
+
def self.open_doc(link, options)
|
401
|
+
|
402
|
+
# Setting default values for missing options
|
403
|
+
options[:proxy] = URI.parse(options[:proxy]) if options[:proxy]
|
404
|
+
options[:user_agent] = options[:user_agent] || "Ruby/#{RUBY_VERSION} - " + \
|
405
|
+
"Rfeedfinder #{Rfeedfinder::VERSION::STRING}"
|
406
|
+
options[:from] = options[:from] || "rfeedfinder@googlegroups.com"
|
407
|
+
options[:referer] = options[:referer] || "http://rfeedfinder.rubyforge.org/"
|
408
|
+
|
260
409
|
data = nil
|
410
|
+
|
411
|
+
if !Rfeedfinder.isAValidURL?(link) and options[:use_google]
|
412
|
+
# Used google lucky script as found on
|
413
|
+
# http://www.leancrew.com/all-this/2006/07/lucky-linking/
|
414
|
+
# It doesn't work too well...
|
415
|
+
# TODO: Improve it somehow. The real google function works a lot better!
|
416
|
+
# TODO: Build in support for languages through parameter "hl" (=> "en" by default)
|
417
|
+
prefix = "http://www.google.com/search?q="
|
418
|
+
suffix = "&btnI=I'm+Feeling+Lucky"
|
419
|
+
goodURL = URI.escape(prefix + options[:original_uri] + suffix)
|
420
|
+
puts "Checking #{goodURL}"
|
421
|
+
response = Net::HTTP.get_response(URI.parse(goodURL))
|
422
|
+
link = response.to_hash['location'].first
|
423
|
+
options[:google_link] = link
|
424
|
+
raise ArgumentError, "Google couldn't save us. We couldn't find anything for #{options[:original_uri]}" if link.nil?
|
425
|
+
end
|
426
|
+
|
261
427
|
begin
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
"
|
267
|
-
|
428
|
+
|
429
|
+
Timeout::timeout(20) do
|
430
|
+
|
431
|
+
data = Hpricot(open(link, {
|
432
|
+
"User-Agent" => options[:user_agent],
|
433
|
+
"From" => options[:from],
|
434
|
+
"Referer" => options[:referer],
|
435
|
+
:proxy => options[:proxy]
|
436
|
+
}), :xml => true)
|
437
|
+
|
438
|
+
end
|
439
|
+
|
268
440
|
rescue OpenURI::HTTPError
|
441
|
+
|
269
442
|
begin
|
270
|
-
|
443
|
+
|
444
|
+
Timeout::timeout(20) do
|
445
|
+
|
271
446
|
html = Net::HTTP.get(URI.parse(link))
|
272
447
|
data = Hpricot(html, :xml => true) if html.to_s !~ /404 Not Found/
|
273
|
-
|
448
|
+
|
449
|
+
end
|
450
|
+
|
274
451
|
rescue Timeout::Error
|
275
452
|
return nil
|
453
|
+
|
276
454
|
rescue => err
|
277
455
|
puts "Error while opening #{link} with Hpricot: #{err.class} " << $!
|
278
456
|
return nil
|
457
|
+
|
279
458
|
end
|
459
|
+
|
280
460
|
rescue Timeout::Error
|
281
461
|
return nil
|
462
|
+
|
282
463
|
rescue => err
|
283
464
|
puts "Error while opening #{link} with Hpricot: #{err.class} " << $!
|
284
465
|
return nil
|
466
|
+
|
285
467
|
end
|
468
|
+
|
469
|
+
# Store the data for the URL if the user has requested it
|
470
|
+
options[:data][link] = data.to_original_html if options[:keep_data]
|
471
|
+
|
286
472
|
return data
|
287
473
|
end
|
474
|
+
|
475
|
+
def self.isAValidURL?(url_to_check)
|
476
|
+
return false if url_to_check == nil
|
477
|
+
|
478
|
+
# The protocols that we allow are the following
|
479
|
+
protocol_whitelist = ["http", "https"]
|
480
|
+
# I guess we could have included some more, but that doesn't really
|
481
|
+
# make sense anyway as these are the ones that should be used.
|
482
|
+
# We'll see if the need arises and then add more later if needed.
|
483
|
+
|
484
|
+
re = Regexp.new("(#{protocol_whitelist.join('|')}):" + \
|
485
|
+
"\/\/([[:alpha:][:digit:].]{2,})([.]{1})([[:alpha:]]{2,4})(\/)")
|
486
|
+
|
487
|
+
# For the sake of the regular expression check we add a forward slash
|
488
|
+
# at the end of the URL
|
489
|
+
url_to_check += "/"
|
490
|
+
return true unless (re =~ url_to_check) == nil
|
491
|
+
false
|
492
|
+
end
|
493
|
+
|
288
494
|
end
|
data/lib/rfeedfinder/version.rb
CHANGED
data/test/test_rfeedfinder.rb
CHANGED
@@ -6,12 +6,12 @@ class TestRfeedfinder < Test::Unit::TestCase
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def test_feed
|
9
|
-
feed_finder "scripting.com",
|
9
|
+
feed_finder "http://scripting.com",
|
10
10
|
"http://www.scripting.com/rss.xml"
|
11
11
|
end
|
12
12
|
|
13
13
|
def test_feeds
|
14
|
-
feeds = Rfeedfinder.feeds("flickr.com/photos/alx")
|
14
|
+
feeds = Rfeedfinder.feeds("http://flickr.com/photos/alx")
|
15
15
|
assert_equal 2, feeds.size
|
16
16
|
end
|
17
17
|
|
@@ -143,6 +143,6 @@ class TestRfeedfinder < Test::Unit::TestCase
|
|
143
143
|
end
|
144
144
|
|
145
145
|
def test_nytimes
|
146
|
-
feed_finder "http://www.nytimes.com/"
|
146
|
+
feed_finder "http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml"
|
147
147
|
end
|
148
148
|
end
|
metadata
CHANGED
@@ -1,33 +1,59 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.4
|
3
|
-
specification_version: 1
|
4
2
|
name: rfeedfinder
|
5
3
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.9.12
|
7
|
-
date: 2008-04-03 00:00:00 +02:00
|
8
|
-
summary: rFeedFinder uses RSS autodiscovery, Atom autodiscovery, spidering, URL correction, and Web service queries -- whatever it takes -- to find the feed.
|
9
|
-
require_paths:
|
10
|
-
- lib
|
11
|
-
email: alx.girard@gmail.com
|
12
|
-
homepage: http://rfeedfinder.rubyforge.org
|
13
|
-
rubyforge_project: rfeedfinder
|
14
|
-
description: rFeedFinder uses RSS autodiscovery, Atom autodiscovery, spidering, URL correction, and Web service queries -- whatever it takes -- to find the feed.
|
15
|
-
autorequire:
|
16
|
-
default_executable:
|
17
|
-
bindir: bin
|
18
|
-
has_rdoc: true
|
19
|
-
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">"
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 0.0.0
|
24
|
-
version:
|
4
|
+
version: 0.9.13
|
25
5
|
platform: ruby
|
26
|
-
signing_key:
|
27
|
-
cert_chain:
|
28
|
-
post_install_message:
|
29
6
|
authors:
|
30
7
|
- Alexandre Girard
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-10-11 00:00:00 +02:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hpricot
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0.6"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: htmlentities
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 4.0.0
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: hoe
|
37
|
+
type: :development
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.7.0
|
44
|
+
version:
|
45
|
+
description: rFeedFinder uses RSS autodiscovery, Atom autodiscovery, spidering, URL correction, and Web service queries -- whatever it takes -- to find the feed.
|
46
|
+
email: alx.girard@gmail.com
|
47
|
+
executables: []
|
48
|
+
|
49
|
+
extensions: []
|
50
|
+
|
51
|
+
extra_rdoc_files:
|
52
|
+
- History.txt
|
53
|
+
- License.txt
|
54
|
+
- Manifest.txt
|
55
|
+
- README.txt
|
56
|
+
- website/index.txt
|
31
57
|
files:
|
32
58
|
- History.txt
|
33
59
|
- License.txt
|
@@ -45,40 +71,33 @@ files:
|
|
45
71
|
- website/javascripts/rounded_corners_lite.inc.js
|
46
72
|
- website/stylesheets/screen.css
|
47
73
|
- website/template.rhtml
|
48
|
-
|
49
|
-
|
50
|
-
|
74
|
+
has_rdoc: true
|
75
|
+
homepage: http://rfeedfinder.rubyforge.org
|
76
|
+
post_install_message:
|
51
77
|
rdoc_options:
|
52
78
|
- --main
|
53
79
|
- README.txt
|
54
|
-
|
55
|
-
-
|
56
|
-
|
57
|
-
|
58
|
-
-
|
59
|
-
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
80
|
+
require_paths:
|
81
|
+
- lib
|
82
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - ">="
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: "0"
|
87
|
+
version:
|
88
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: "0"
|
93
|
+
version:
|
64
94
|
requirements: []
|
65
95
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
version: "0.6"
|
75
|
-
version:
|
76
|
-
- !ruby/object:Gem::Dependency
|
77
|
-
name: htmlentities
|
78
|
-
version_requirement:
|
79
|
-
version_requirements: !ruby/object:Gem::Version::Requirement
|
80
|
-
requirements:
|
81
|
-
- - ">="
|
82
|
-
- !ruby/object:Gem::Version
|
83
|
-
version: 4.0.0
|
84
|
-
version:
|
96
|
+
rubyforge_project: rfeedfinder
|
97
|
+
rubygems_version: 1.2.0
|
98
|
+
signing_key:
|
99
|
+
specification_version: 2
|
100
|
+
summary: rFeedFinder uses RSS autodiscovery, Atom autodiscovery, spidering, URL correction, and Web service queries -- whatever it takes -- to find the feed.
|
101
|
+
test_files:
|
102
|
+
- test/test_helper.rb
|
103
|
+
- test/test_rfeedfinder.rb
|