url_scrubber 0.8.0 → 0.8.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- YjU1ZDhmYTczNWEzMDk2MjQ4M2U2NDkzMGNiZmQ1Y2U2YTMxZDk0MQ==
4
+ ZDlkYzlmMDBiOTY5MzU0N2ExYTZjMjI4YzUyZDQ0MzFiZWNiYmM1NA==
5
5
  data.tar.gz: !binary |-
6
- NDE2YzlhOWFjMmU3NWFiNjIzNmE4OTJlYjQ3ZmYwMDExMTJjZGNmMw==
6
+ Y2NiYzUzMDAwZWRlZjA3MTYzZmM5ZTI2MjkyNDI2ZmQ3ZjRhMzcyMg==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MDRlZTJlNzFiOTMwYWFkZWY4YzZiZWM5ZjAxMWVhMzMwZjNmYzg4ZmZkMjc2
10
- NGRmZTc0Nzc0NGU4M2MzNTk2NGQ5YzhkODRhYTk1YTU3ZWE5YmY1MDMyMTQ4
11
- ZjFiNGI4MTAyMmM0MzQ1Y2Y5NWMyMTYwMmQyZTQyNTE1ZTQ4ZDQ=
9
+ NTIyOGY2ZTc3YTNiMDQwMmE5NzNiZDhlOWMxMTcyYThhZDBmMTBhNDAxNTc5
10
+ MTIzM2RiMzVmM2E4ZTJmMDBlNjdjNmM2MDJiNWZhMWJjZjM0MDE5NjNlNTQ3
11
+ ZmFiYWYwYjJiNDRlMDg0MWUxMTdhNmY3MjdjYmQ2ZjI0YWFlNmQ=
12
12
  data.tar.gz: !binary |-
13
- MTQ4MDcxMDFlM2QzM2Q2ODAwMWUxOGRhMzg2OTBmMjhiNTEzNmNmYTY5ZDg4
14
- NjM4YjhmOGNkMTJmNzQ1ZGQxMDI3ODgxNDc0NGNiN2QyNmExMTk5NzRjOGUy
15
- NWMzOWRjNTMwM2YxZjFjYjUzYmMyYjU1YjVkNGNjOWFiMTRkOTE=
13
+ YTQwYmIwMGYzOGU2ZGZjOTA1NGE3MTNkNDk4NGMzNDkyOTBlMmM3MWFjYTRi
14
+ ODdhMDhjNmM0ZTIwYWY3MzZkY2UyMjQ2N2I4YThjNDQ1OGFkMDFlODU1OTU5
15
+ NTY2YzQxNzk0ZTk0MTE5ZTdlZGY2MzNmY2VlNTg4NTA5NTA3ZmY=
@@ -1,3 +1,3 @@
1
1
  module UrlScrubber
2
- VERSION = "0.8.0"
2
+ VERSION = "0.8.2"
3
3
  end
data/lib/url_scrubber.rb CHANGED
@@ -153,6 +153,7 @@ module UrlScrubber
153
153
  end
154
154
 
155
155
  def self.special_cases(url)
156
+ #puts "special_cases"
156
157
  case service_of(url)
157
158
  when :youtube then return sc_youtube(url)
158
159
  when :twitter then return sc_twitter(url)
@@ -191,6 +192,7 @@ module UrlScrubber
191
192
  end
192
193
 
193
194
  def self.drop_anchor!(url)
195
+ #puts "drop anchor"
194
196
  url.sub!(/#.*$/, '')
195
197
  url
196
198
  end
@@ -232,7 +234,7 @@ module UrlScrubber
232
234
  end
233
235
 
234
236
  def self.sc_facebook(url)
235
- # puts "sc_facebook: #{url}"
237
+ #puts "sc_facebook: #{url}"
236
238
  regex1 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/(pages\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
237
239
  regex2 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
238
240
  regex2a = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
@@ -253,13 +255,13 @@ module UrlScrubber
253
255
  elsif mdata = url.match(regex2)
254
256
  # puts "regex2"
255
257
  # "https://www.facebook.com/profile.php?id=100009574328879"
256
- url, http_response = check_for_redirection(mdata[:url])
258
+ url, http_response = check_for_facebook_redirection(mdata[:url])
257
259
  uid = mdata[:uid]
258
260
  elsif mdata = url.match(regex2a)
259
261
  # puts "regex2a"
260
262
  # "https://www.facebook.com/profile.php?_rdr=p&id=100009574328879"
261
263
  url = "http://facebook.com/profile.php?id=" + mdata[:uid]
262
- url, http_response = check_for_redirection(url)
264
+ url, http_response = check_for_facebook_redirection(url)
263
265
  uid = mdata[:uid]
264
266
  elsif mdata = url.match(regex4)
265
267
  # puts "#{url} - #{mdata[:uname]}"
@@ -278,7 +280,7 @@ module UrlScrubber
278
280
  # puts "profile.php"
279
281
  # these were being truncated, they do redirect, but typically a 301 response is generated
280
282
  # so the url is returned unchanged. Better than truncation.
281
- url, http_response = check_for_redirection(url)
283
+ url, http_response = check_for_facebook_redirection(url)
282
284
  else
283
285
  # puts "else"
284
286
  url = drop_url_query!(url)
@@ -355,7 +357,9 @@ module UrlScrubber
355
357
  url
356
358
  end
357
359
 
358
- def self.check_for_redirection(uri_str, limit = 5)
360
+ def self.check_for_facebook_redirection(uri_str, limit = 5)
361
+ #puts "check_for_facebook_redirection called! uri=#{uri_str}, limit=#{limit.to_s}"
362
+ # finds any redirects intended for facebook URLs only!!!!
359
363
  login_patterns = [
360
364
  # pages that require user logins
361
365
  %r{^.*/login[^/]*$}
@@ -367,14 +371,20 @@ module UrlScrubber
367
371
  ]
368
372
 
369
373
  raise 'Too many HTTP redirects' if limit == 0
374
+
375
+ uri_str_new = uri_str.sub('http://', 'https://')
376
+ uri_str_new = uri_str_new.sub('https://', 'https://www.') if !uri_str_new.include?("https://www.")
370
377
 
371
378
  begin
372
- url = URI.parse(uri_str)
379
+ url = URI.parse(uri_str_new)
373
380
  rescue URI::InvalidURIError => e
374
- return [uri_str, CustomError.new(786, "Invalid URI #{uri_str} : #{e.message}") ]
381
+ return [uri_str_new, CustomError.new(786, "Invalid URI #{uri_str_new} : #{e.message}") ]
375
382
  end
376
383
 
377
384
  http = Net::HTTP.new(url.host, url.port)
385
+ http = Net::HTTP.new(url.host, url.port)
386
+ http.open_timeout = 7 # only wait up to 7 seconds for a the connection to be established
387
+ http.read_timeout = 10 # and up to 10 seconds for a response
378
388
  if url.port == 443
379
389
  http.use_ssl = true
380
390
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
@@ -385,35 +395,43 @@ module UrlScrubber
385
395
 
386
396
  begin
387
397
  response = http.request(request)
398
+ rescue Timeout::Error
399
+ #Rails.logger.error("UrlScrubber.check_for_facebook_redirection - http.request Timeout, URL=#{uri_str_new}")
400
+ failure_response = Net::HTTPClientError.new('1.1', '400', 'Unreachable')
401
+ return [uri_str_new, failure_response]
388
402
  rescue Exception => e
389
403
  failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
390
- return [uri_str, failure_response]
391
- end
404
+ return [uri_str_new, failure_response]
405
+ end
392
406
 
393
407
  if response.is_a? Net::HTTPRedirection
394
408
  if response['location'][0,4] == "http"
395
409
  if failure_patterns.any? { |pattern| response['location'].match(pattern) }
396
410
  # got redirected to a page indicating failure, so act like it's a 404
397
411
  failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
398
- return [uri_str, failure_response]
412
+ #puts "check_for_facebook_redirection 404"
413
+ return [uri_str_new, failure_response]
399
414
  end
400
415
 
401
- redirected_url, base_response = check_for_redirection(response['location'], limit - 1)
402
-
403
416
  if login_patterns.any? { |pattern| redirected_url.match(pattern) }
404
417
  # got redirected to a login page. return the ultimate response, but the previous url
405
- return [uri_str, base_response]
406
- else
407
- return [redirected_url, base_response]
418
+ failure_response = Net::HTTPClientError.new('1.1', '401', 'Inaccessible')
419
+ #puts "check_for_facebook_redirection 401"
420
+ return [uri_str_new, failure_response]
408
421
  end
422
+ #puts "check_for_facebook_redirection 1 limit=#{limit.to_s}"
423
+ redirected_url, base_response = check_for_facebook_redirection(response['location'], limit - 1)
424
+ return [redirected_url, base_response]
409
425
 
410
426
  else
411
427
  redir_url = "http://#{url.host}#{response['location']}"
412
- redirected_url, base_response = check_for_redirection(redir_url, limit - 1)
428
+ #puts "check_for_facebook_redirection recalled limit =#{limit.to_s}"
429
+ redirected_url, base_response = check_for_facebook_redirection(redir_url, limit - 1)
413
430
  return [redirected_url, base_response]
414
431
  end
415
432
  else
416
- return [uri_str, response]
433
+ #puts "check_for_facebook_redirection return code #{response.code.to_s}"
434
+ return [uri_str_new, response]
417
435
  end
418
436
  end
419
437
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_scrubber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.0
4
+ version: 0.8.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Colin Langton
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2016-04-08 00:00:00.000000000 Z
14
+ date: 2016-09-26 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rspec
@@ -124,7 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
124
124
  version: '0'
125
125
  requirements: []
126
126
  rubyforge_project:
127
- rubygems_version: 2.4.8
127
+ rubygems_version: 2.6.6
128
128
  signing_key:
129
129
  specification_version: 4
130
130
  summary: Clean up URLs.