rfeedfinder 0.9.12 → 0.9.13

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,7 @@
1
+ == 0.9.13 2008-10-11
2
+
3
+ * Correct getLinks method, thanks kdoya
4
+
1
5
  == 0.9.12 2008-03-04
2
6
 
3
7
  * Correct nytimes.com bug, thanks Sebastian
@@ -1,140 +1,102 @@
1
1
  require 'net/http'
2
2
  require 'rubygems'
3
- require 'htmlentities'
4
3
  require 'open-uri'
5
4
  require 'hpricot'
6
5
  require 'timeout'
7
6
 
8
- require 'rfeedfinder/version'
7
+ require File.dirname(__FILE__) + "/rfeedfinder/version"
9
8
 
10
- module Rfeedfinder
11
9
 
12
- module_function
13
-
14
- def makeFullURI(uri)
15
- uri = uri.strip.sub(/^feed(.*)/, 'http\1').downcase
16
- if /^http|https/.match(uri)
17
- return uri
18
- else
19
- return "http://" << uri
20
- end
21
- end
22
-
23
- def getLinks(data, baseuri)
24
- return searchLinks(data, baseuri, "[@rel=alternate]&[@type=xml]&[@href=http]")
25
- end
26
-
27
- def getALinks(data, baseuri)
28
- return searchLinks(data, baseuri, "a")
29
- end
30
-
31
- def getFrameLinks(data, baseuri)
32
- links = searchLinks(data, baseuri, "frame")
33
- links += searchLinks(data, baseuri, "FRAME")
34
- return links
35
- end
36
-
37
- def searchLinks(data, baseuri, regexp)
38
- links = []
39
- data.search(regexp).map!{|link|
40
- if !link.to_s.strip.empty? and link.kind_of? Hpricot::Elem and !(link.kind_of? Hpricot::Text)
41
- uri = link[:href].to_s
42
- uri = link[:HREF].to_s if uri.empty?
43
- uri = link[:src].to_s if uri.empty?
44
- uri = link[:SRC].to_s if uri.empty?
45
- if !uri.strip.empty? and uri !~ /^javascript/
46
- uri = URI.join(baseuri, uri).to_s if uri !~ /^http:\/\//
47
- links << uri
48
- end
49
- end
50
- }
51
- #links.each{|link| puts "searchLinks: #{link}"}
52
- return links.uniq
53
- end
54
-
55
- def getLocalLinks(links, baseuri)
56
- locallinks = []
57
- links.each do |link|
58
- locallinks << URI.join(baseuri, link).to_s if link =~ /^\//
59
- end
60
- links = links.select{|link| link !~ /^\//} #remove local links from link array
61
- return [links, locallinks]
10
+ class Rfeedfinder
11
+ #
12
+ # Takes:
13
+ # * +init_values+ (hash)
14
+ # * +:proxy+: (string) proxy information to use. Defaults to a blank string
15
+ # * +:user_agent+: (string) user agent to identify as. Defaults to Ruby/#{RUBY_VERSION} - Rfeedfinder VERSION
16
+ # * +:from+: (string) contact info to the responsible person. FIXME: Is this correct? Defaults to rfeedfinder@googlegroups.com
17
+ # * +:keep_data+: (boolean) if the data downloaded for the feeds should be returned along with the URLs. Defaults to false
18
+ # * +:use_google+: (boolean) tries to find a URL using a google "I'm feeling lucky" search. Defaults to false
19
+ #
20
+ #
21
+ # Example:
22
+ #
23
+ # Rfeedfinder.new({:proxy => "http://127.0.0.1:1234",
24
+ # :user_agent => "MyApp",
25
+ # :from => "contact@domain.com",
26
+ # :referer => "http://domain.com"})
27
+ #
28
+ #
29
+ # Returns a new instance of Rfeedfinder
30
+ #
31
+ def initialize(init_values = {})
32
+ @options = init_values
62
33
  end
63
34
 
64
- def isFeedLink?(link)
65
- return link.downcase =~ /\.rss$|\.rdf$|\.xml$|\.atom$/
66
- end
67
-
68
- def isXMLRelatedLink?(link)
69
- return link.downcase =~ /rss|rdf|xml|atom/
35
+ #
36
+ # Takes:
37
+ # * +uri+ (string)
38
+ #
39
+ # Returns:
40
+ # * array of urls
41
+ #
42
+ def feeds(uri)
43
+ Rfeedfinder.feeds(uri, @options.dup)
70
44
  end
71
45
 
72
- def tryBrokenRedirect(data)
73
- newuris = (data/:newLocation)
74
- if !newuris.empty?
75
- return newuris[0].strip
76
- end
77
- end
78
-
79
- def verifyRedirect(feedlist)
80
- feedlist.each do |feed|
81
- begin
82
- response = Net::HTTP.get_response(URI.parse(feed))
83
- #puts "Verify #{feed} - code: #{response.code}"
84
- if response.code == "302"
85
- newuri = response.body.match(/<a href=\"([^>]+)\">/)[1]
86
-
87
- feedlist.delete(feed)
88
- feedlist << newuri
89
- feedlist.uniq!
90
- end
91
- rescue
92
- # rescue net error
93
- end
94
- end
95
- return feedlist
96
- end
97
-
98
- def isFeedData?(data)
99
- # if no html tag and rss, rdf or feed tag, it's a feed
100
- return ((data/"html|HTML").empty? and (!(data/:rss).nil? or !(data/:rdf).nil? or !(data/:feed).nil?))
46
+ #
47
+ # Takes:
48
+ # * +uri+ (string)
49
+ #
50
+ # Returns:
51
+ # * url (string)
52
+ #
53
+ def feed(uri)
54
+ result = Rfeedfinder.feed(uri, @options.dup)
101
55
  end
102
56
 
103
- def isFeed?(uri)
104
- uri.gsub!(/\/\/www\d\./, "//www.")
105
- begin
106
- protocol = URI.split(uri)
107
- return false if !protocol[0].index(/^[http|https]/)
108
- rescue
109
- # URI error
110
- return false
111
- end
57
+ #
58
+ # Takes:
59
+ # * +uri+ (string): The URI to check
60
+ # * +options+ (hash)
61
+ # * +:proxy+: (string) proxy information to use. Defaults to a blank string
62
+ # * +:user_agent+: (string) user agent to identify as. Defaults to Ruby/#{RUBY_VERSION} - Rfeedfinder VERSION
63
+ # * +:from+: (string) contact info to the responsible person. FIXME: Is this correct? Defaults to rfeedfinder@googlegroups.com
64
+ # * +:keep_data+: (boolean) if the data downloaded for the feeds should be returned along with the URLs. Defaults to false
65
+ # * +:use_google+: (boolean) tries to find a URL using a google "I'm feeling lucky" search. Defaults to false
66
+ #
67
+ #
68
+ # Example:
69
+ #
70
+ # Rfeedfinder.feeds("www.google.com", {:proxy => "http://127.0.0.1:1234",
71
+ # :user_agent => "MyApp",
72
+ # :from => "contact@domain.com",
73
+ # :referer => "http://domain.com"})
74
+ #
75
+ #
76
+ # Returns:
77
+ # * array of urls
78
+ # * array of hashes if the :keep_data option is true
79
+ # Example:
80
+ # [{:url => "url1", :data => "some data"},{:url => "url2", :data => "feed data"}]
81
+ #
82
+ # Raises:
83
+ # * ArgumentError if +uri+ is not a valid URL, and :use_google => false
84
+ # * ArgumentError if :use_google => true but it's not your lucky day
85
+ #
86
+ def self.feeds(uri, options = {})
112
87
 
113
- data = open_doc(uri)
114
- return false if data.nil?
88
+ # We have to create a hash for the data
89
+ # if the user has asked us to keep the data
90
+ options[:data] = {} if options[:keep_data]
91
+
92
+ options[:original_uri] = uri if !Rfeedfinder.isAValidURL?(uri) and options[:use_google]
115
93
 
116
- return isFeedData?(data)
117
- end
94
+ uri = URI.decode(uri)
95
+ options[:recurs] = [uri] if options[:recurs].nil?
96
+ fulluri = Rfeedfinder.makeFullURI(uri)
118
97
 
119
- def getFeedsFromSyndic8(uri)
120
- feeds = []
121
- begin
122
- server = Syndic8.new
123
- feedids = server.find_feeds(uri)
124
- infolist = server.feed_info(feedids, ['headlines_rank','status','dataurl'])
125
- infolist.sort_by{|feedInfo| feedInfo[:headlines_rank]}
126
- infolist.each do |feed|
127
- feeds << feed[:dataurl] if feed[:status]=='Syndicated'
128
- end
129
- rescue
130
- end
131
- return feeds
132
- end
133
-
134
- def feeds(uri, all=false, querySyndic8=false, _recurs=nil)
135
- uri = HTMLEntities.decode_entities(uri)
136
- _recurs = [uri] if _recurs.nil?
137
- fulluri = makeFullURI(uri)
98
+ raise ArgumentError, "#{fulluri} is not a valid URI." \
99
+ if !Rfeedfinder.isAValidURL?(fulluri) and !options[:use_google]
138
100
 
139
101
  # Add youtube support
140
102
  if fulluri =~ /youtube\.com\/user\/(.*[^\/])/
@@ -143,70 +105,77 @@ module Rfeedfinder
143
105
  if fulluri =~ /youtube\.com\/tag\/(.*[^\/])/
144
106
  fulluri = "http://www.youtube.com/rss/tag/#{$1}/videos.rss"
145
107
  end
146
-
147
- data = open_doc(fulluri)
108
+
109
+ data = Rfeedfinder.open_doc(fulluri, options)
148
110
  return [] if data.nil?
149
111
 
112
+ # If we used the google link finder, then we should set the new URL
113
+ fulluri = options[:google_link] if options[:google_link]
114
+
150
115
  # is this already a feed?
151
- if isFeedData?(data)
116
+ if Rfeedfinder.isFeedData?(data)
152
117
  feedlist = [fulluri]
153
- verifyRedirect(feedlist)
118
+ Rfeedfinder.verifyRedirect(feedlist)
154
119
  return feedlist
155
120
  end
156
121
 
157
122
  #verify redirection
158
- newuri = tryBrokenRedirect(data)
123
+ newuri = Rfeedfinder.tryBrokenRedirect(data)
159
124
  if !newuri.nil? and !newuri.empty?
160
- unless _recurs.include?(newuri)
161
- _recurs << newuri
162
- return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs)
125
+ options[:recurs] = [] unless options[:recurs]
126
+ unless options[:recurs].include?(newuri)
127
+ options[:recurs] << newuri
128
+ return feeds(newuri, options)
163
129
  end
164
130
  end
165
131
 
166
132
  #verify frameset
167
- frames = getFrameLinks(data, fulluri)
133
+ frames = Rfeedfinder.getFrameLinks(data, fulluri)
168
134
  frames.each {|newuri|
169
135
  if !newuri.nil? and !newuri.empty?
170
- unless _recurs.include?(newuri)
171
- _recurs << newuri
172
- return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs)
136
+ options[:recurs] = [] unless options[:recurs]
137
+ unless options[:recurs].include?(newuri)
138
+ options[:recurs] << newuri
139
+ return feeds(newuri, options)
173
140
  end
174
141
  end
175
142
  }
176
-
177
- # nope, it's a page, try LINK tags first
178
- outfeeds = getLinks(data, fulluri)
179
- outfeeds.select {|link| isFeed?(link)}
180
143
 
144
+ # nope, it's a page, try LINK tags first
145
+ outfeeds = Rfeedfinder.getLinks(data, fulluri).select {|link| Rfeedfinder.isFeed?(link, options)}
146
+
181
147
  #_debuglog('found %s feeds through LINK tags' % len(outfeeds))
182
148
  if outfeeds.empty?
183
149
  # no LINK tags, look for regular <A> links that point to feeds
184
150
  begin
185
- links = getALinks(data, fulluri)
151
+ links = Rfeedfinder.getALinks(data, fulluri)
186
152
  rescue
187
153
  links = []
188
154
  end
189
155
 
190
156
  # Get local links
191
- links, locallinks = getLocalLinks(links, fulluri)
157
+ links, locallinks = Rfeedfinder.getLocalLinks(links, fulluri)
158
+
159
+ # TODO:
160
+ # implement support for :only_first down here
192
161
 
193
162
  # look for obvious feed links on the same server
194
- selected_feeds = locallinks.select{|link| isFeedLink?(link) and isFeed?(link)}
163
+ selected_feeds = locallinks.select{|link| Rfeedfinder.isFeedLink?(link) and Rfeedfinder.isFeed?(link, options)}
195
164
  outfeeds << selected_feeds unless selected_feeds.empty?
196
165
  # outfeeds.each{|link| puts "1 #{link}"}
197
166
 
198
167
  # look harder for feed links on the same server
199
- selected_feeds = locallinks.select{|link| isXMLRelatedLink?(link) and isFeed?(link)} if outfeeds.empty?
168
+ selected_feeds = locallinks.select{|link| Rfeedfinder.isXMLRelatedLink?(link) and Rfeedfinder.isFeed?(link, options)} if outfeeds.empty?
200
169
  outfeeds << selected_feeds unless selected_feeds.empty?
201
170
  # outfeeds.each{|link| puts "2 #{link}"}
202
171
 
203
172
  # look for obvious feed links on another server
204
- selected_feeds = links.select {|link| isFeedLink?(link) and isFeed?(link)} if outfeeds.empty?
173
+ selected_feeds = links.select {|link| Rfeedfinder.isFeedLink?(link) and Rfeedfinder.isFeed?(link, options)} if outfeeds.empty?
205
174
  outfeeds << selected_feeds unless selected_feeds.empty?
206
175
  # outfeeds.each{|link| puts "3 #{link}"}
207
176
 
208
177
  # look harder for feed links on another server
209
- selected_feeds = links.select {|link| isXMLRelatedLink?(link) and isFeed?(link)} if outfeeds.empty?
178
+ selected_feeds = links.select {|link| Rfeedfinder.isXMLRelatedLink?(link) and Rfeedfinder.isFeed?(link, options)} if outfeeds.empty?
210
179
  outfeeds << selected_feeds unless selected_feeds.empty?
211
180
  # outfeeds.each{|link| puts "4 #{link}"}
212
181
  end
@@ -226,63 +195,300 @@ module Rfeedfinder
226
195
 
227
196
  guesses.each { |guess|
228
197
  uri = URI.join(fulluri, guess).to_s
229
- outfeeds << uri if isFeed?(uri)
198
+ outfeeds << uri if Rfeedfinder.isFeed?(uri, options)
230
199
  }
231
200
  end
232
201
 
233
202
  # try with adding ending slash
234
203
  if outfeeds.empty? and fulluri !~ /\/$/
235
- outfeeds = feeds(fulluri + "/", all=all, querySyndic8=querySyndic8, _recurs=_recurs)
204
+ outfeeds = Rfeedfinder.feeds(fulluri + "/", options)
236
205
  end
237
-
238
- # still no luck, search Syndic8 for feeds (requires xmlrpclib)
239
- #_debuglog('still no luck, searching Syndic8')
240
- outfeeds << getFeedsFromSyndic8(uri) if querySyndic8 and outfeeds.empty?
241
- #outfeeds = list(set(outfeeds)) if hasattr(__builtins__, 'set') or __builtins__.has_key('set')
242
-
206
+
243
207
  # Verify redirection
244
- verifyRedirect(outfeeds)
208
+ Rfeedfinder.verifyRedirect(outfeeds)
209
+
210
+ # This has to be used until proper :only_first support has been built in
211
+ outfeeds = outfeeds.first if options[:only_first] and outfeeds.size > 1
245
212
 
246
- return outfeeds.flatten
213
+ if options[:keep_data]
214
+ output = []
215
+ outfeeds.each do |feed|
216
+ output << {:url => feed, :data => options[:data][feed]}
217
+ end
218
+ return output
219
+ else
220
+ return outfeeds
221
+ end
247
222
  end
248
223
 
249
- def feed(uri)
250
- #todo: give preference to certain feed formats
251
- feedlist = feeds(uri)
224
+
225
+ #
226
+ # Takes:
227
+ # * +uri+ (string): The URI to check
228
+ # * +options+ (hash)
229
+ # * +:proxy+: (string) proxy information to use. Defaults to a blank string
230
+ # * +:user_agent+: (string) user agent to identify as. Defaults to Ruby/#{RUBY_VERSION} - Rfeedfinder VERSION
231
+ # * +:from+: (string) contact info to the responsible person. FIXME: Is this correct? Defaults to rfeedfinder@googlegroups.com
232
+ # * +:keep_data+: (boolean) if the data downloaded for the feeds should be returned along with the URLs. Defaults to false
233
+ # * +:use_google+: (boolean) tries to find a URL using a google "I'm feeling lucky" search. Defaults to false
234
+ #
235
+ #
236
+ # Example:
237
+ #
238
+ # Rfeedfinder.feed("www.google.com", {:proxy => "http://127.0.0.1:1234",
239
+ # :user_agent => "MyApp",
240
+ # :from => "contact@domain.com",
241
+ # :referer => "http://domain.com"})
242
+ #
243
+ #
244
+ # Returns:
245
+ # * one URL as a string or nil
246
+ # * one hash if the :keep_data option is true
247
+ # Example:
248
+ # {:url => "url1", :data => "some data"}
249
+ #
250
+ # Raises:
251
+ # * ArgumentError if +uri+ is not a valid URL, and :use_google => false
252
+ # * ArgumentError if :use_google => true but it's not your lucky day
253
+ #
254
+ def self.feed(uri, options = {})
255
+ options[:only_first] = true
256
+ feedlist = Rfeedfinder.feeds(uri, options)
252
257
  unless feedlist.empty?
253
258
  return feedlist[0]
254
259
  else
255
260
  return nil
256
261
  end
257
262
  end
263
+
264
+ #
265
+ # Takes:
266
+ # * +data+ (string)
267
+ #
268
+ # Returns:
269
+ # * +true+ if the data has a rss, rdf or feed tag
270
+ # * +false+ if the data has a html tag
271
+ #
272
+ def self.isFeedData?(data)
273
+ # if no html tag and rss, rdf or feed tag, it's a feed
274
+ # puts data
275
+ return ((data/"html|HTML").empty? and (!(data/:rss).nil? or !(data/:rdf).nil? or !(data/:feed).nil?))
276
+ end
277
+
278
+ #
279
+ # Takes:
280
+ # * +uri+ (string)
281
+ #
282
+ # Downloads the URI and checks the content
283
+ # with the +isFeedData?+ class method
284
+ #
285
+ # Returns:
286
+ # * +true+ if the uri points to a feed
287
+ # * +false+ if not
288
+ #
289
+ def self.isFeed?(uri, options)
290
+ # We return false if the user only wants one result
291
+ # and we have already found it, so that no
292
+ # additional external calls are made
293
+ return false if options[:only_first] and options[:already_found_one]
294
+
295
+ uri.gsub!(/\/\/www\d\./, "//www.")
296
+ begin
297
+ protocol = URI.split(uri)
298
+ return false if !protocol[0].index(/^[http|https]/)
299
+ rescue
300
+ # URI error
301
+ return false
302
+ end
303
+
304
+ data = Rfeedfinder.open_doc(uri, options)
305
+ return false if data.nil?
306
+
307
+ if Rfeedfinder.isFeedData?(data)
308
+ options[:already_found_one] = true if options[:only_first]
309
+ return true
310
+ else
311
+ return false
312
+ end
313
+ end
314
+
315
+ protected
316
+ def self.makeFullURI(uri)
317
+ uri = uri.strip.sub(/^feed(.*)/, 'http\1').downcase
318
+ if /^http|https/.match(uri)
319
+ return uri
320
+ else
321
+ return "http://" << uri
322
+ end
323
+ end
324
+
325
+ def self.getLinks(data, baseuri)
326
+ return Rfeedfinder.searchLinks(data, baseuri, "[@rel='alternate'][@type*='xml'][@href*='http']")
327
+ end
328
+
329
+ def self.getALinks(data, baseuri)
330
+ return Rfeedfinder.searchLinks(data, baseuri, "a")
331
+ end
332
+
333
+ def self.getFrameLinks(data, baseuri)
334
+ links = Rfeedfinder.searchLinks(data, baseuri, "frame")
335
+ links += Rfeedfinder.searchLinks(data, baseuri, "FRAME")
336
+ return links
337
+ end
258
338
 
259
- def open_doc(link)
339
+ def self.searchLinks(data, baseuri, regexp)
340
+ links = []
341
+ data.search(regexp).map!{|link|
342
+ if !link.to_s.strip.empty? and link.kind_of? Hpricot::Elem and !(link.kind_of? Hpricot::Text)
343
+ uri = link[:href].to_s
344
+ uri = link[:HREF].to_s if uri.empty?
345
+ uri = link[:src].to_s if uri.empty?
346
+ uri = link[:SRC].to_s if uri.empty?
347
+ if !uri.strip.empty? and uri !~ /^javascript/
348
+ uri = URI.join(baseuri, uri).to_s if uri !~ /^http:\/\//
349
+ links << uri
350
+ end
351
+ end
352
+ }
353
+ #links.each{|link| puts "Rfeedfinder.searchLinks: #{link}"}
354
+ return links.uniq
355
+ end
356
+
357
+ def self.getLocalLinks(links, baseuri)
358
+ locallinks = []
359
+ links.each do |link|
360
+ locallinks << URI.join(baseuri, link).to_s if link =~ /^\//
361
+ end
362
+ links = links.select{|link| link !~ /^\//} #remove local links from link array
363
+ return [links, locallinks]
364
+ end
365
+
366
+ def self.isFeedLink?(link)
367
+ return link.downcase =~ /\.rss$|\.rdf$|\.xml$|\.atom$/
368
+ end
369
+
370
+ def self.isXMLRelatedLink?(link)
371
+ return link.downcase =~ /rss|rdf|xml|atom/
372
+ end
373
+
374
+ def self.tryBrokenRedirect(data)
375
+ newuris = (data/:newLocation)
376
+ if !newuris.empty?
377
+ return newuris[0].strip
378
+ end
379
+ end
380
+
381
+ def self.verifyRedirect(feedlist)
382
+ feedlist.each do |feed|
383
+ begin
384
+ response = Net::HTTP.get_response(URI.parse(feed))
385
+ #puts "Verify #{feed} - code: #{response.code}"
386
+ if response.code == "302"
387
+ newuri = response.body.match(/<a href=\"([^>]+)\">/)[1]
388
+
389
+ feedlist.delete(feed)
390
+ feedlist << newuri
391
+ feedlist.uniq!
392
+ end
393
+ rescue
394
+ # rescue net error
395
+ end
396
+ end
397
+ return feedlist
398
+ end
399
+
400
+ def self.open_doc(link, options)
401
+
402
+ # Setting default values for missing options
403
+ options[:proxy] = URI.parse(options[:proxy]) if options[:proxy]
404
+ options[:user_agent] = options[:user_agent] || "Ruby/#{RUBY_VERSION} - " + \
405
+ "Rfeedfinder #{Rfeedfinder::VERSION::STRING}"
406
+ options[:from] = options[:from] || "rfeedfinder@googlegroups.com"
407
+ options[:referer] = options[:referer] || "http://rfeedfinder.rubyforge.org/"
408
+
260
409
  data = nil
410
+
411
+ if !Rfeedfinder.isAValidURL?(link) and options[:use_google]
412
+ # Used google lucky script as found on
413
+ # http://www.leancrew.com/all-this/2006/07/lucky-linking/
414
+ # It doesn't work too well...
415
+ # TODO: Improve it somehow. The real google function works a lot better!
416
+ # TODO: Build in support for languages through parameter "hl" (=> "en" by default)
417
+ prefix = "http://www.google.com/search?q="
418
+ suffix = "&btnI=I'm+Feeling+Lucky"
419
+ goodURL = URI.escape(prefix + options[:original_uri] + suffix)
420
+ puts "Checking #{goodURL}"
421
+ response = Net::HTTP.get_response(URI.parse(goodURL))
422
+ link = response.to_hash['location'].first
423
+ options[:google_link] = link
424
+ raise ArgumentError, "Google couldn't save us. We couldn't find anything for #{options[:original_uri]}" if link.nil?
425
+ end
426
+
261
427
  begin
262
- Timeout::timeout(20) {
263
- data = Hpricot(open(link,
264
- "User-Agent" => "Ruby/#{RUBY_VERSION} - Rfeedfinder",
265
- "From" => "rfeedfinder@googlegroups.com",
266
- "Referer" => "http://rfeedfinder.rubyforge.org/"), :xml => true)
267
- }
428
+
429
+ Timeout::timeout(20) do
430
+
431
+ data = Hpricot(open(link, {
432
+ "User-Agent" => options[:user_agent],
433
+ "From" => options[:from],
434
+ "Referer" => options[:referer],
435
+ :proxy => options[:proxy]
436
+ }), :xml => true)
437
+
438
+ end
439
+
268
440
  rescue OpenURI::HTTPError
441
+
269
442
  begin
270
- Timeout::timeout(20) {
443
+
444
+ Timeout::timeout(20) do
445
+
271
446
  html = Net::HTTP.get(URI.parse(link))
272
447
  data = Hpricot(html, :xml => true) if html.to_s !~ /404 Not Found/
273
- }
448
+
449
+ end
450
+
274
451
  rescue Timeout::Error
275
452
  return nil
453
+
276
454
  rescue => err
277
455
  puts "Error while opening #{link} with Hpricot: #{err.class} " << $!
278
456
  return nil
457
+
279
458
  end
459
+
280
460
  rescue Timeout::Error
281
461
  return nil
462
+
282
463
  rescue => err
283
464
  puts "Error while opening #{link} with Hpricot: #{err.class} " << $!
284
465
  return nil
466
+
285
467
  end
468
+
469
+ # Store the data for the URL if the user has requested it
470
+ options[:data][link] = data.to_original_html if options[:keep_data]
471
+
286
472
  return data
287
473
  end
474
+
475
+ def self.isAValidURL?(url_to_check)
476
+ return false if url_to_check == nil
477
+
478
+ # The protocols that we allow are the following
479
+ protocol_whitelist = ["http", "https"]
480
+ # I guess we could have included some more, but that doesn't really
481
+ # make sense anyway as these are the ones that should be used.
482
+ # We'll see if the need arises and then add more later if needed.
483
+
484
+ re = Regexp.new("(#{protocol_whitelist.join('|')}):" + \
485
+ "\/\/([[:alpha:][:digit:].]{2,})([.]{1})([[:alpha:]]{2,4})(\/)")
486
+
487
+ # For the sake of the regular expression check we add a trailing slash
488
+ # at the end of the URL
489
+ url_to_check += "/"
490
+ return true unless (re =~ url_to_check) == nil
491
+ false
492
+ end
493
+
288
494
  end
@@ -1,9 +1,9 @@
1
- module Rfeedfinder #:nodoc:
1
+ class Rfeedfinder #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 9
5
- TINY = 12
5
+ TINY = 13
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
9
- end
9
+ end
@@ -6,12 +6,12 @@ class TestRfeedfinder < Test::Unit::TestCase
6
6
  end
7
7
 
8
8
  def test_feed
9
- feed_finder "scripting.com",
9
+ feed_finder "http://scripting.com",
10
10
  "http://www.scripting.com/rss.xml"
11
11
  end
12
12
 
13
13
  def test_feeds
14
- feeds = Rfeedfinder.feeds("flickr.com/photos/alx")
14
+ feeds = Rfeedfinder.feeds("http://flickr.com/photos/alx")
15
15
  assert_equal 2, feeds.size
16
16
  end
17
17
 
@@ -143,6 +143,6 @@ class TestRfeedfinder < Test::Unit::TestCase
143
143
  end
144
144
 
145
145
  def test_nytimes
146
- feed_finder "http://www.nytimes.com/"
146
+ feed_finder "http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml"
147
147
  end
148
148
  end
metadata CHANGED
@@ -1,33 +1,59 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.4
3
- specification_version: 1
4
2
  name: rfeedfinder
5
3
  version: !ruby/object:Gem::Version
6
- version: 0.9.12
7
- date: 2008-04-03 00:00:00 +02:00
8
- summary: rFeedFinder uses RSS autodiscovery, Atom autodiscovery, spidering, URL correction, and Web service queries -- whatever it takes -- to find the feed.
9
- require_paths:
10
- - lib
11
- email: alx.girard@gmail.com
12
- homepage: http://rfeedfinder.rubyforge.org
13
- rubyforge_project: rfeedfinder
14
- description: rFeedFinder uses RSS autodiscovery, Atom autodiscovery, spidering, URL correction, and Web service queries -- whatever it takes -- to find the feed.
15
- autorequire:
16
- default_executable:
17
- bindir: bin
18
- has_rdoc: true
19
- required_ruby_version: !ruby/object:Gem::Version::Requirement
20
- requirements:
21
- - - ">"
22
- - !ruby/object:Gem::Version
23
- version: 0.0.0
24
- version:
4
+ version: 0.9.13
25
5
  platform: ruby
26
- signing_key:
27
- cert_chain:
28
- post_install_message:
29
6
  authors:
30
7
  - Alexandre Girard
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-10-11 00:00:00 +02:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0.6"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: htmlentities
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 4.0.0
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: hoe
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 1.7.0
44
+ version:
45
+ description: rFeedFinder uses RSS autodiscovery, Atom autodiscovery, spidering, URL correction, and Web service queries -- whatever it takes -- to find the feed.
46
+ email: alx.girard@gmail.com
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ extra_rdoc_files:
52
+ - History.txt
53
+ - License.txt
54
+ - Manifest.txt
55
+ - README.txt
56
+ - website/index.txt
31
57
  files:
32
58
  - History.txt
33
59
  - License.txt
@@ -45,40 +71,33 @@ files:
45
71
  - website/javascripts/rounded_corners_lite.inc.js
46
72
  - website/stylesheets/screen.css
47
73
  - website/template.rhtml
48
- test_files:
49
- - test/test_helper.rb
50
- - test/test_rfeedfinder.rb
74
+ has_rdoc: true
75
+ homepage: http://rfeedfinder.rubyforge.org
76
+ post_install_message:
51
77
  rdoc_options:
52
78
  - --main
53
79
  - README.txt
54
- extra_rdoc_files:
55
- - History.txt
56
- - License.txt
57
- - Manifest.txt
58
- - README.txt
59
- - website/index.txt
60
- executables: []
61
-
62
- extensions: []
63
-
80
+ require_paths:
81
+ - lib
82
+ required_ruby_version: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - ">="
85
+ - !ruby/object:Gem::Version
86
+ version: "0"
87
+ version:
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: "0"
93
+ version:
64
94
  requirements: []
65
95
 
66
- dependencies:
67
- - !ruby/object:Gem::Dependency
68
- name: hpricot
69
- version_requirement:
70
- version_requirements: !ruby/object:Gem::Version::Requirement
71
- requirements:
72
- - - ">="
73
- - !ruby/object:Gem::Version
74
- version: "0.6"
75
- version:
76
- - !ruby/object:Gem::Dependency
77
- name: htmlentities
78
- version_requirement:
79
- version_requirements: !ruby/object:Gem::Version::Requirement
80
- requirements:
81
- - - ">="
82
- - !ruby/object:Gem::Version
83
- version: 4.0.0
84
- version:
96
+ rubyforge_project: rfeedfinder
97
+ rubygems_version: 1.2.0
98
+ signing_key:
99
+ specification_version: 2
100
+ summary: rFeedFinder uses RSS autodiscovery, Atom autodiscovery, spidering, URL correction, and Web service queries -- whatever it takes -- to find the feed.
101
+ test_files:
102
+ - test/test_helper.rb
103
+ - test/test_rfeedfinder.rb