rfeedfinder 0.9.12 → 0.9.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,7 @@
1
+ == 0.9.13 2008-10-11
2
+
3
+ * Correct getLinks method, thanks kdoya
4
+
1
5
  == 0.9.12 2008-03-04
2
6
 
3
7
  * Correct nytimes.com bug, thanks Sebastian
@@ -1,140 +1,102 @@
1
1
  require 'net/http'
2
2
  require 'rubygems'
3
- require 'htmlentities'
4
3
  require 'open-uri'
5
4
  require 'hpricot'
6
5
  require 'timeout'
7
6
 
8
- require 'rfeedfinder/version'
7
+ require File.dirname(__FILE__) + "/rfeedfinder/version"
9
8
 
10
- module Rfeedfinder
11
9
 
12
- module_function
13
-
14
- def makeFullURI(uri)
15
- uri = uri.strip.sub(/^feed(.*)/, 'http\1').downcase
16
- if /^http|https/.match(uri)
17
- return uri
18
- else
19
- return "http://" << uri
20
- end
21
- end
22
-
23
- def getLinks(data, baseuri)
24
- return searchLinks(data, baseuri, "[@rel=alternate]&[@type=xml]&[@href=http]")
25
- end
26
-
27
- def getALinks(data, baseuri)
28
- return searchLinks(data, baseuri, "a")
29
- end
30
-
31
- def getFrameLinks(data, baseuri)
32
- links = searchLinks(data, baseuri, "frame")
33
- links += searchLinks(data, baseuri, "FRAME")
34
- return links
35
- end
36
-
37
- def searchLinks(data, baseuri, regexp)
38
- links = []
39
- data.search(regexp).map!{|link|
40
- if !link.to_s.strip.empty? and link.kind_of? Hpricot::Elem and !(link.kind_of? Hpricot::Text)
41
- uri = link[:href].to_s
42
- uri = link[:HREF].to_s if uri.empty?
43
- uri = link[:src].to_s if uri.empty?
44
- uri = link[:SRC].to_s if uri.empty?
45
- if !uri.strip.empty? and uri !~ /^javascript/
46
- uri = URI.join(baseuri, uri).to_s if uri !~ /^http:\/\//
47
- links << uri
48
- end
49
- end
50
- }
51
- #links.each{|link| puts "searchLinks: #{link}"}
52
- return links.uniq
53
- end
54
-
55
- def getLocalLinks(links, baseuri)
56
- locallinks = []
57
- links.each do |link|
58
- locallinks << URI.join(baseuri, link).to_s if link =~ /^\//
59
- end
60
- links = links.select{|link| link !~ /^\//} #remove local links from link array
61
- return [links, locallinks]
10
+ class Rfeedfinder
11
+ #
12
+ # Takes:
13
+ # * +init_values+ (hash)
14
+ # * +:proxy+: (string) proxy information to use. Defaults to a blank string
15
+ # * +:user_agent+: (string) user agent to identify as. Defaults to Ruby/#{RUBY_VERSION} - Rfeedfinder VERSION
16
+ # * +:from+: (string) contact info to the responsible person. FIXME: Is this correct? Defaults to rfeedfinder@googlegroups.com
17
+ # * +:keep_data+: (boolean) if the data downloaded for the feeds should be returned along with the URLs. Defaults to false
18
+ # * +:use_google+: (boolean) tries to find a URL using a google "I'm feeling lucky" search. Defaults to false
19
+ #
20
+ #
21
+ # Example:
22
+ #
23
+ # Rfeedfinder.new({:proxy => "http://127.0.0.1:1234",
24
+ # :user_agent => "MyApp",
25
+ #                  :from => "contact@domain.com",
26
+ # :referer => "http://domain.com"})
27
+ #
28
+ #
29
+ # Returns a new instance of Rfeedfinder
30
+ #
31
+ def initialize(init_values = {})
32
+ @options = init_values
62
33
  end
63
34
 
64
- def isFeedLink?(link)
65
- return link.downcase =~ /\.rss$|\.rdf$|\.xml$|\.atom$/
66
- end
67
-
68
- def isXMLRelatedLink?(link)
69
- return link.downcase =~ /rss|rdf|xml|atom/
35
+ #
36
+ # Takes:
37
+ # * +uri+ (string)
38
+ #
39
+ # Returns:
40
+ # * array of urls
41
+ #
42
+ def feeds(uri)
43
+ Rfeedfinder.feeds(uri, @options.dup)
70
44
  end
71
45
 
72
- def tryBrokenRedirect(data)
73
- newuris = (data/:newLocation)
74
- if !newuris.empty?
75
- return newuris[0].strip
76
- end
77
- end
78
-
79
- def verifyRedirect(feedlist)
80
- feedlist.each do |feed|
81
- begin
82
- response = Net::HTTP.get_response(URI.parse(feed))
83
- #puts "Verify #{feed} - code: #{response.code}"
84
- if response.code == "302"
85
- newuri = response.body.match(/<a href=\"([^>]+)\">/)[1]
86
-
87
- feedlist.delete(feed)
88
- feedlist << newuri
89
- feedlist.uniq!
90
- end
91
- rescue
92
- # rescue net error
93
- end
94
- end
95
- return feedlist
96
- end
97
-
98
- def isFeedData?(data)
99
- # if no html tag and rss, rdf or feed tag, it's a feed
100
- return ((data/"html|HTML").empty? and (!(data/:rss).nil? or !(data/:rdf).nil? or !(data/:feed).nil?))
46
+ #
47
+ # Takes:
48
+ # * +uri+ (string)
49
+ #
50
+ # Returns:
51
+ # * url (string)
52
+ #
53
+ def feed(uri)
54
+ result = Rfeedfinder.feed(uri, @options.dup)
101
55
  end
102
56
 
103
- def isFeed?(uri)
104
- uri.gsub!(/\/\/www\d\./, "//www.")
105
- begin
106
- protocol = URI.split(uri)
107
- return false if !protocol[0].index(/^[http|https]/)
108
- rescue
109
- # URI error
110
- return false
111
- end
57
+ #
58
+ # Takes:
59
+ # * +uri+ (string): The URI to check
60
+ # * +options+ (hash)
61
+ # * +:proxy+: (string) proxy information to use. Defaults to a blank string
62
+ # * +:user_agent+: (string) user agent to identify as. Defaults to Ruby/#{RUBY_VERSION} - Rfeedfinder VERSION
63
+ # * +:from+: (string) contact info to the responsible person. FIXME: Is this correct? Defaults to rfeedfinder@googlegroups.com
64
+ # * +:keep_data+: (boolean) if the data downloaded for the feeds should be returned along with the URLs. Defaults to false
65
+ # * +:use_google+: (boolean) tries to find a URL using a google "I'm feeling lucky" search. Defaults to false
66
+ #
67
+ #
68
+ # Example:
69
+ #
70
+ # Rfeedfinder.feeds("www.google.com", {:proxy => "http://127.0.0.1:1234",
71
+ # :user_agent => "MyApp",
72
+ #                    :from => "contact@domain.com",
73
+ # :referer => "http://domain.com"})
74
+ #
75
+ #
76
+ # Returns:
77
+ # * array of urls
78
+ # * array of hashes if the :keep_data option is true
79
+ # Example:
80
+ # [{:url => "url1", :data => "some data"},{:url => "url2", :data => "feed data"}]
81
+ #
82
+ # Raises:
83
+ # * ArgumentError if +uri+ is not a valid URL, and :use_google => false
84
+ # * ArgumentError if :use_google => true but it's not your lucky day
85
+ #
86
+ def self.feeds(uri, options = {})
112
87
 
113
- data = open_doc(uri)
114
- return false if data.nil?
88
+ # We have to create a hash for the data
89
+ # if the user has asked us to keep the data
90
+ options[:data] = {} if options[:keep_data]
91
+
92
+ options[:original_uri] = uri if !Rfeedfinder.isAValidURL?(uri) and options[:use_google]
115
93
 
116
- return isFeedData?(data)
117
- end
94
+ uri = URI.decode(uri)
95
+ options[:recurs] = [uri] if options[:recurs].nil?
96
+ fulluri = Rfeedfinder.makeFullURI(uri)
118
97
 
119
- def getFeedsFromSyndic8(uri)
120
- feeds = []
121
- begin
122
- server = Syndic8.new
123
- feedids = server.find_feeds(uri)
124
- infolist = server.feed_info(feedids, ['headlines_rank','status','dataurl'])
125
- infolist.sort_by{|feedInfo| feedInfo[:headlines_rank]}
126
- infolist.each do |feed|
127
- feeds << feed[:dataurl] if feed[:status]=='Syndicated'
128
- end
129
- rescue
130
- end
131
- return feeds
132
- end
133
-
134
- def feeds(uri, all=false, querySyndic8=false, _recurs=nil)
135
- uri = HTMLEntities.decode_entities(uri)
136
- _recurs = [uri] if _recurs.nil?
137
- fulluri = makeFullURI(uri)
98
+ raise ArgumentError, "#{fulluri} is not a valid URI." \
99
+ if !Rfeedfinder.isAValidURL?(fulluri) and !options[:use_google]
138
100
 
139
101
  # Add youtube support
140
102
  if fulluri =~ /youtube\.com\/user\/(.*[^\/])/
@@ -143,70 +105,77 @@ module Rfeedfinder
143
105
  if fulluri =~ /youtube\.com\/tag\/(.*[^\/])/
144
106
  fulluri = "http://www.youtube.com/rss/tag/#{$1}/videos.rss"
145
107
  end
146
-
147
- data = open_doc(fulluri)
108
+
109
+ data = Rfeedfinder.open_doc(fulluri, options)
148
110
  return [] if data.nil?
149
111
 
112
+ # If we used the google link finder, then we should set the new URL
113
+ fulluri = options[:google_link] if options[:google_link]
114
+
150
115
  # is this already a feed?
151
- if isFeedData?(data)
116
+ if Rfeedfinder.isFeedData?(data)
152
117
  feedlist = [fulluri]
153
- verifyRedirect(feedlist)
118
+ Rfeedfinder.verifyRedirect(feedlist)
154
119
  return feedlist
155
120
  end
156
121
 
157
122
  #verify redirection
158
- newuri = tryBrokenRedirect(data)
123
+ newuri = Rfeedfinder.tryBrokenRedirect(data)
159
124
  if !newuri.nil? and !newuri.empty?
160
- unless _recurs.include?(newuri)
161
- _recurs << newuri
162
- return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs)
125
+ options[:recurs] = [] unless options[:recurs]
126
+ unless options[:recurs].include?(newuri)
127
+ options[:recurs] << newuri
128
+ return feeds(newuri, options)
163
129
  end
164
130
  end
165
131
 
166
132
  #verify frameset
167
- frames = getFrameLinks(data, fulluri)
133
+ frames = Rfeedfinder.getFrameLinks(data, fulluri)
168
134
  frames.each {|newuri|
169
135
  if !newuri.nil? and !newuri.empty?
170
- unless _recurs.include?(newuri)
171
- _recurs << newuri
172
- return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs)
136
+ options[:recurs] = [] unless options[:recurs]
137
+ unless options[:recurs].include?(newuri)
138
+ options[:recurs] << newuri
139
+ return feeds(newuri, options)
173
140
  end
174
141
  end
175
142
  }
176
-
177
- # nope, it's a page, try LINK tags first
178
- outfeeds = getLinks(data, fulluri)
179
- outfeeds.select {|link| isFeed?(link)}
180
143
 
144
+ # nope, it's a page, try LINK tags first
145
+ outfeeds = Rfeedfinder.getLinks(data, fulluri).select {|link| Rfeedfinder.isFeed?(link, options)}
146
+
181
147
  #_debuglog('found %s feeds through LINK tags' % len(outfeeds))
182
148
  if outfeeds.empty?
183
149
  # no LINK tags, look for regular <A> links that point to feeds
184
150
  begin
185
- links = getALinks(data, fulluri)
151
+ links = Rfeedfinder.getALinks(data, fulluri)
186
152
  rescue
187
153
  links = []
188
154
  end
189
155
 
190
156
  # Get local links
191
- links, locallinks = getLocalLinks(links, fulluri)
157
+ links, locallinks = Rfeedfinder.getLocalLinks(links, fulluri)
158
+
159
+ # TODO:
160
+ # implement support for :only_first down here
192
161
 
193
162
  # look for obvious feed links on the same server
194
- selected_feeds = locallinks.select{|link| isFeedLink?(link) and isFeed?(link)}
163
+ selected_feeds = locallinks.select{|link| Rfeedfinder.isFeedLink?(link) and Rfeedfinder.isFeed?(link, options)}
195
164
  outfeeds << selected_feeds unless selected_feeds.empty?
196
165
  # outfeeds.each{|link| puts "1 #{link}"}
197
166
 
198
167
  # look harder for feed links on the same server
199
- selected_feeds = locallinks.select{|link| isXMLRelatedLink?(link) and isFeed?(link)} if outfeeds.empty?
168
+ selected_feeds = locallinks.select{|link| Rfeedfinder.isXMLRelatedLink?(link) and Rfeedfinder.isFeed?(link, options)} if outfeeds.empty?
200
169
  outfeeds << selected_feeds unless selected_feeds.empty?
201
170
  # outfeeds.each{|link| puts "2 #{link}"}
202
171
 
203
172
  # look for obvious feed links on another server
204
- selected_feeds = links.select {|link| isFeedLink?(link) and isFeed?(link)} if outfeeds.empty?
173
+ selected_feeds = links.select {|link| Rfeedfinder.isFeedLink?(link) and Rfeedfinder.isFeed?(link, options)} if outfeeds.empty?
205
174
  outfeeds << selected_feeds unless selected_feeds.empty?
206
175
  # outfeeds.each{|link| puts "3 #{link}"}
207
176
 
208
177
  # look harder for feed links on another server
209
- selected_feeds = links.select {|link| isXMLRelatedLink?(link) and isFeed?(link)} if outfeeds.empty?
178
+ selected_feeds = links.select {|link| Rfeedfinder.isXMLRelatedLink?(link) and Rfeedfinder.isFeed?(link, options)} if outfeeds.empty?
210
179
  outfeeds << selected_feeds unless selected_feeds.empty?
211
180
  # outfeeds.each{|link| puts "4 #{link}"}
212
181
  end
@@ -226,63 +195,300 @@ module Rfeedfinder
226
195
 
227
196
  guesses.each { |guess|
228
197
  uri = URI.join(fulluri, guess).to_s
229
- outfeeds << uri if isFeed?(uri)
198
+ outfeeds << uri if Rfeedfinder.isFeed?(uri, options)
230
199
  }
231
200
  end
232
201
 
233
202
  # try with adding ending slash
234
203
  if outfeeds.empty? and fulluri !~ /\/$/
235
- outfeeds = feeds(fulluri + "/", all=all, querySyndic8=querySyndic8, _recurs=_recurs)
204
+ outfeeds = Rfeedfinder.feeds(fulluri + "/", options)
236
205
  end
237
-
238
- # still no luck, search Syndic8 for feeds (requires xmlrpclib)
239
- #_debuglog('still no luck, searching Syndic8')
240
- outfeeds << getFeedsFromSyndic8(uri) if querySyndic8 and outfeeds.empty?
241
- #outfeeds = list(set(outfeeds)) if hasattr(__builtins__, 'set') or __builtins__.has_key('set')
242
-
206
+
243
207
  # Verify redirection
244
- verifyRedirect(outfeeds)
208
+ Rfeedfinder.verifyRedirect(outfeeds)
209
+
210
+ # This has to be used until proper :only_first support has been built in
211
+ outfeeds = outfeeds.first if options[:only_first] and outfeeds.size > 1
245
212
 
246
- return outfeeds.flatten
213
+ if options[:keep_data]
214
+ output = []
215
+ outfeeds.each do |feed|
216
+ output << {:url => feed, :data => options[:data][feed]}
217
+ end
218
+ return output
219
+ else
220
+ return outfeeds
221
+ end
247
222
  end
248
223
 
249
- def feed(uri)
250
- #todo: give preference to certain feed formats
251
- feedlist = feeds(uri)
224
+
225
+ #
226
+ # Takes:
227
+ # * +uri+ (string): The URI to check
228
+ # * +options+ (hash)
229
+ # * +:proxy+: (string) proxy information to use. Defaults to a blank string
230
+ # * +:user_agent+: (string) user agent to identify as. Defaults to Ruby/#{RUBY_VERSION} - Rfeedfinder VERSION
231
+ # * +:from+: (string) contact info to the responsible person. FIXME: Is this correct? Defaults to rfeedfinder@googlegroups.com
232
+ # * +:keep_data+: (boolean) if the data downloaded for the feeds should be returned along with the URLs. Defaults to false
233
+ # * +:use_google+: (boolean) tries to find a URL using a google "I'm feeling lucky" search. Defaults to false
234
+ #
235
+ #
236
+ # Example:
237
+ #
238
+ # Rfeedfinder.feeds("www.google.com", {:proxy => "http://127.0.0.1:1234",
239
+ # :user_agent => "MyApp",
240
+ # :from => "contant@domain.com",
241
+ # :referer => "http://domain.com"})
242
+ #
243
+ #
244
+ # Returns:
245
+ # * one URL as a string or nil
246
+ # * one hash if the :keep_data option is true
247
+ # Example:
248
+ # {:url => "url1", :data => "some data"}
249
+ #
250
+ # Raises:
251
+ # * ArgumentError if +uri+ is not a valid URL, and :use_google => false
252
+ # * ArgumentError if :use_google => true but it's not your lucky day
253
+ #
254
+ def self.feed(uri, options = {})
255
+ options[:only_first] = true
256
+ feedlist = Rfeedfinder.feeds(uri, options)
252
257
  unless feedlist.empty?
253
258
  return feedlist[0]
254
259
  else
255
260
  return nil
256
261
  end
257
262
  end
263
+
264
+ #
265
+ # Takes:
266
+ # * +data+ (string)
267
+ #
268
+ # Returns:
269
+ # * +true+ if the data has a rss, rdf or feed tag
270
+ # * +false+ if the data has a html tag
271
+ #
272
+ def self.isFeedData?(data)
273
+ # if no html tag and rss, rdf or feed tag, it's a feed
274
+ # puts data
275
+ return ((data/"html|HTML").empty? and (!(data/:rss).nil? or !(data/:rdf).nil? or !(data/:feed).nil?))
276
+ end
277
+
278
+ #
279
+ # Takes:
280
+ # * +uri+ (string)
281
+ #
282
+ # Downloads the URI and checks the content
283
+ # with the +isFeedData?+ class method
284
+ #
285
+ # Returns:
286
+ # * +true+ if the uri points to a feed
287
+ # * +false+ if not
288
+ #
289
+ def self.isFeed?(uri, options)
290
+ # We return false if the user only wants one result
291
+ # and we already have found it so there aren't made
292
+ # any additional external calls
293
+ return false if options[:only_first] and options[:already_found_one]
294
+
295
+ uri.gsub!(/\/\/www\d\./, "//www.")
296
+ begin
297
+ protocol = URI.split(uri)
298
+ return false if !protocol[0].index(/^[http|https]/)
299
+ rescue
300
+ # URI error
301
+ return false
302
+ end
303
+
304
+ data = Rfeedfinder.open_doc(uri, options)
305
+ return false if data.nil?
306
+
307
+ if Rfeedfinder.isFeedData?(data)
308
+ options[:already_found_one] = true if options[:only_first]
309
+ return true
310
+ else
311
+ return false
312
+ end
313
+ end
314
+
315
+ protected
316
+ def self.makeFullURI(uri)
317
+ uri = uri.strip.sub(/^feed(.*)/, 'http\1').downcase
318
+ if /^http|https/.match(uri)
319
+ return uri
320
+ else
321
+ return "http://" << uri
322
+ end
323
+ end
324
+
325
+ def self.getLinks(data, baseuri)
326
+ return Rfeedfinder.searchLinks(data, baseuri, "[@rel='alternate'][@type*='xml'][@href*='http']")
327
+ end
328
+
329
+ def self.getALinks(data, baseuri)
330
+ return Rfeedfinder.searchLinks(data, baseuri, "a")
331
+ end
332
+
333
+ def self.getFrameLinks(data, baseuri)
334
+ links = Rfeedfinder.searchLinks(data, baseuri, "frame")
335
+ links += Rfeedfinder.searchLinks(data, baseuri, "FRAME")
336
+ return links
337
+ end
258
338
 
259
- def open_doc(link)
339
+ def self.searchLinks(data, baseuri, regexp)
340
+ links = []
341
+ data.search(regexp).map!{|link|
342
+ if !link.to_s.strip.empty? and link.kind_of? Hpricot::Elem and !(link.kind_of? Hpricot::Text)
343
+ uri = link[:href].to_s
344
+ uri = link[:HREF].to_s if uri.empty?
345
+ uri = link[:src].to_s if uri.empty?
346
+ uri = link[:SRC].to_s if uri.empty?
347
+ if !uri.strip.empty? and uri !~ /^javascript/
348
+ uri = URI.join(baseuri, uri).to_s if uri !~ /^http:\/\//
349
+ links << uri
350
+ end
351
+ end
352
+ }
353
+ #links.each{|link| puts "Rfeedfinder.searchLinks: #{link}"}
354
+ return links.uniq
355
+ end
356
+
357
+ def self.getLocalLinks(links, baseuri)
358
+ locallinks = []
359
+ links.each do |link|
360
+ locallinks << URI.join(baseuri, link).to_s if link =~ /^\//
361
+ end
362
+ links = links.select{|link| link !~ /^\//} #remove local links from link array
363
+ return [links, locallinks]
364
+ end
365
+
366
+ def self.isFeedLink?(link)
367
+ return link.downcase =~ /\.rss$|\.rdf$|\.xml$|\.atom$/
368
+ end
369
+
370
+ def self.isXMLRelatedLink?(link)
371
+ return link.downcase =~ /rss|rdf|xml|atom/
372
+ end
373
+
374
+ def self.tryBrokenRedirect(data)
375
+ newuris = (data/:newLocation)
376
+ if !newuris.empty?
377
+ return newuris[0].strip
378
+ end
379
+ end
380
+
381
+ def self.verifyRedirect(feedlist)
382
+ feedlist.each do |feed|
383
+ begin
384
+ response = Net::HTTP.get_response(URI.parse(feed))
385
+ #puts "Verify #{feed} - code: #{response.code}"
386
+ if response.code == "302"
387
+ newuri = response.body.match(/<a href=\"([^>]+)\">/)[1]
388
+
389
+ feedlist.delete(feed)
390
+ feedlist << newuri
391
+ feedlist.uniq!
392
+ end
393
+ rescue
394
+ # rescue net error
395
+ end
396
+ end
397
+ return feedlist
398
+ end
399
+
400
+ def self.open_doc(link, options)
401
+
402
+ # Setting default values for missing options
403
+ options[:proxy] = URI.parse(options[:proxy]) if options[:proxy]
404
+ options[:user_agent] = options[:user_agent] || "Ruby/#{RUBY_VERSION} - " + \
405
+ "Rfeedfinder #{Rfeedfinder::VERSION::STRING}"
406
+ options[:from] = options[:from] || "rfeedfinder@googlegroups.com"
407
+ options[:referer] = options[:referer] || "http://rfeedfinder.rubyforge.org/"
408
+
260
409
  data = nil
410
+
411
+ if !Rfeedfinder.isAValidURL?(link) and options[:use_google]
412
+ # Used google lucky script as found on
413
+ # http://www.leancrew.com/all-this/2006/07/lucky-linking/
414
+ # It doesn't work to well...
415
+ # TODO: Improve it somehow. The real google function works a lot better!
416
+ # TODO: Build in support for languages through parameter "hl" (=> "en" by default)
417
+ prefix = "http://www.google.com/search?q="
418
+ suffix = "&btnI=I'm+Feeling+Lucky"
419
+ goodURL = URI.escape(prefix + options[:original_uri] + suffix)
420
+ puts "Checking #{goodURL}"
421
+ response = Net::HTTP.get_response(URI.parse(goodURL))
422
+ link = response.to_hash['location'].first
423
+ options[:google_link] = link
424
+ raise ArgumentError, "Google couldn't save us. We couldn't find anything for #{options[:original_uri]}" if link.nil?
425
+ end
426
+
261
427
  begin
262
- Timeout::timeout(20) {
263
- data = Hpricot(open(link,
264
- "User-Agent" => "Ruby/#{RUBY_VERSION} - Rfeedfinder",
265
- "From" => "rfeedfinder@googlegroups.com",
266
- "Referer" => "http://rfeedfinder.rubyforge.org/"), :xml => true)
267
- }
428
+
429
+ Timeout::timeout(20) do
430
+
431
+ data = Hpricot(open(link, {
432
+ "User-Agent" => options[:user_agent],
433
+ "From" => options[:from],
434
+ "Referer" => options[:referer],
435
+ :proxy => options[:proxy]
436
+ }), :xml => true)
437
+
438
+ end
439
+
268
440
  rescue OpenURI::HTTPError
441
+
269
442
  begin
270
- Timeout::timeout(20) {
443
+
444
+ Timeout::timeout(20) do
445
+
271
446
  html = Net::HTTP.get(URI.parse(link))
272
447
  data = Hpricot(html, :xml => true) if html.to_s !~ /404 Not Found/
273
- }
448
+
449
+ end
450
+
274
451
  rescue Timeout::Error
275
452
  return nil
453
+
276
454
  rescue => err
277
455
  puts "Error while opening #{link} with Hpricot: #{err.class} " << $!
278
456
  return nil
457
+
279
458
  end
459
+
280
460
  rescue Timeout::Error
281
461
  return nil
462
+
282
463
  rescue => err
283
464
  puts "Error while opening #{link} with Hpricot: #{err.class} " << $!
284
465
  return nil
466
+
285
467
  end
468
+
469
+ # Store the data for the URL if the user has requested it
470
+ options[:data][link] = data.to_original_html if options[:keep_data]
471
+
286
472
  return data
287
473
  end
474
+
475
+ def self.isAValidURL?(url_to_check)
476
+ return false if url_to_check == nil
477
+
478
+ # The protocols that we allow are the following
479
+ protocol_whitelist = ["http", "https"]
480
+ # I guess we could have included some more, but that doesn't really
481
+ # make sense anyway as these are the ones that should be used.
482
+ # We'll see if the need arises and then add more later if needed.
483
+
484
+ re = Regexp.new("(#{protocol_whitelist.join('|')}):" + \
485
+ "\/\/([[:alpha:][:digit:].]{2,})([.]{1})([[:alpha:]]{2,4})(\/)")
486
+
487
+ # For the sake of the regular expression check we add a back slash
488
+ # at the end of the URL
489
+ url_to_check += "/"
490
+ return true unless (re =~ url_to_check) == nil
491
+ false
492
+ end
493
+
288
494
  end
@@ -1,9 +1,9 @@
1
- module Rfeedfinder #:nodoc:
1
+ class Rfeedfinder #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 9
5
- TINY = 12
5
+ TINY = 13
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
9
- end
9
+ end
@@ -6,12 +6,12 @@ class TestRfeedfinder < Test::Unit::TestCase
6
6
  end
7
7
 
8
8
  def test_feed
9
- feed_finder "scripting.com",
9
+ feed_finder "http://scripting.com",
10
10
  "http://www.scripting.com/rss.xml"
11
11
  end
12
12
 
13
13
  def test_feeds
14
- feeds = Rfeedfinder.feeds("flickr.com/photos/alx")
14
+ feeds = Rfeedfinder.feeds("http://flickr.com/photos/alx")
15
15
  assert_equal 2, feeds.size
16
16
  end
17
17
 
@@ -143,6 +143,6 @@ class TestRfeedfinder < Test::Unit::TestCase
143
143
  end
144
144
 
145
145
  def test_nytimes
146
- feed_finder "http://www.nytimes.com/"
146
+ feed_finder "http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml"
147
147
  end
148
148
  end
metadata CHANGED
@@ -1,33 +1,59 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.4
3
- specification_version: 1
4
2
  name: rfeedfinder
5
3
  version: !ruby/object:Gem::Version
6
- version: 0.9.12
7
- date: 2008-04-03 00:00:00 +02:00
8
- summary: rFeedFinder uses RSS autodiscovery, Atom autodiscovery, spidering, URL correction, and Web service queries -- whatever it takes -- to find the feed.
9
- require_paths:
10
- - lib
11
- email: alx.girard@gmail.com
12
- homepage: http://rfeedfinder.rubyforge.org
13
- rubyforge_project: rfeedfinder
14
- description: rFeedFinder uses RSS autodiscovery, Atom autodiscovery, spidering, URL correction, and Web service queries -- whatever it takes -- to find the feed.
15
- autorequire:
16
- default_executable:
17
- bindir: bin
18
- has_rdoc: true
19
- required_ruby_version: !ruby/object:Gem::Version::Requirement
20
- requirements:
21
- - - ">"
22
- - !ruby/object:Gem::Version
23
- version: 0.0.0
24
- version:
4
+ version: 0.9.13
25
5
  platform: ruby
26
- signing_key:
27
- cert_chain:
28
- post_install_message:
29
6
  authors:
30
7
  - Alexandre Girard
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-10-11 00:00:00 +02:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0.6"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: htmlentities
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 4.0.0
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: hoe
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 1.7.0
44
+ version:
45
+ description: rFeedFinder uses RSS autodiscovery, Atom autodiscovery, spidering, URL correction, and Web service queries -- whatever it takes -- to find the feed.
46
+ email: alx.girard@gmail.com
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ extra_rdoc_files:
52
+ - History.txt
53
+ - License.txt
54
+ - Manifest.txt
55
+ - README.txt
56
+ - website/index.txt
31
57
  files:
32
58
  - History.txt
33
59
  - License.txt
@@ -45,40 +71,33 @@ files:
45
71
  - website/javascripts/rounded_corners_lite.inc.js
46
72
  - website/stylesheets/screen.css
47
73
  - website/template.rhtml
48
- test_files:
49
- - test/test_helper.rb
50
- - test/test_rfeedfinder.rb
74
+ has_rdoc: true
75
+ homepage: http://rfeedfinder.rubyforge.org
76
+ post_install_message:
51
77
  rdoc_options:
52
78
  - --main
53
79
  - README.txt
54
- extra_rdoc_files:
55
- - History.txt
56
- - License.txt
57
- - Manifest.txt
58
- - README.txt
59
- - website/index.txt
60
- executables: []
61
-
62
- extensions: []
63
-
80
+ require_paths:
81
+ - lib
82
+ required_ruby_version: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - ">="
85
+ - !ruby/object:Gem::Version
86
+ version: "0"
87
+ version:
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: "0"
93
+ version:
64
94
  requirements: []
65
95
 
66
- dependencies:
67
- - !ruby/object:Gem::Dependency
68
- name: hpricot
69
- version_requirement:
70
- version_requirements: !ruby/object:Gem::Version::Requirement
71
- requirements:
72
- - - ">="
73
- - !ruby/object:Gem::Version
74
- version: "0.6"
75
- version:
76
- - !ruby/object:Gem::Dependency
77
- name: htmlentities
78
- version_requirement:
79
- version_requirements: !ruby/object:Gem::Version::Requirement
80
- requirements:
81
- - - ">="
82
- - !ruby/object:Gem::Version
83
- version: 4.0.0
84
- version:
96
+ rubyforge_project: rfeedfinder
97
+ rubygems_version: 1.2.0
98
+ signing_key:
99
+ specification_version: 2
100
+ summary: rFeedFinder uses RSS autodiscovery, Atom autodiscovery, spidering, URL correction, and Web service queries -- whatever it takes -- to find the feed.
101
+ test_files:
102
+ - test/test_helper.rb
103
+ - test/test_rfeedfinder.rb