url_scrubber 0.8.0 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- YjU1ZDhmYTczNWEzMDk2MjQ4M2U2NDkzMGNiZmQ1Y2U2YTMxZDk0MQ==
4
+ ZDlkYzlmMDBiOTY5MzU0N2ExYTZjMjI4YzUyZDQ0MzFiZWNiYmM1NA==
5
5
  data.tar.gz: !binary |-
6
- NDE2YzlhOWFjMmU3NWFiNjIzNmE4OTJlYjQ3ZmYwMDExMTJjZGNmMw==
6
+ Y2NiYzUzMDAwZWRlZjA3MTYzZmM5ZTI2MjkyNDI2ZmQ3ZjRhMzcyMg==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MDRlZTJlNzFiOTMwYWFkZWY4YzZiZWM5ZjAxMWVhMzMwZjNmYzg4ZmZkMjc2
10
- NGRmZTc0Nzc0NGU4M2MzNTk2NGQ5YzhkODRhYTk1YTU3ZWE5YmY1MDMyMTQ4
11
- ZjFiNGI4MTAyMmM0MzQ1Y2Y5NWMyMTYwMmQyZTQyNTE1ZTQ4ZDQ=
9
+ NTIyOGY2ZTc3YTNiMDQwMmE5NzNiZDhlOWMxMTcyYThhZDBmMTBhNDAxNTc5
10
+ MTIzM2RiMzVmM2E4ZTJmMDBlNjdjNmM2MDJiNWZhMWJjZjM0MDE5NjNlNTQ3
11
+ ZmFiYWYwYjJiNDRlMDg0MWUxMTdhNmY3MjdjYmQ2ZjI0YWFlNmQ=
12
12
  data.tar.gz: !binary |-
13
- MTQ4MDcxMDFlM2QzM2Q2ODAwMWUxOGRhMzg2OTBmMjhiNTEzNmNmYTY5ZDg4
14
- NjM4YjhmOGNkMTJmNzQ1ZGQxMDI3ODgxNDc0NGNiN2QyNmExMTk5NzRjOGUy
15
- NWMzOWRjNTMwM2YxZjFjYjUzYmMyYjU1YjVkNGNjOWFiMTRkOTE=
13
+ YTQwYmIwMGYzOGU2ZGZjOTA1NGE3MTNkNDk4NGMzNDkyOTBlMmM3MWFjYTRi
14
+ ODdhMDhjNmM0ZTIwYWY3MzZkY2UyMjQ2N2I4YThjNDQ1OGFkMDFlODU1OTU5
15
+ NTY2YzQxNzk0ZTk0MTE5ZTdlZGY2MzNmY2VlNTg4NTA5NTA3ZmY=
@@ -1,3 +1,3 @@
1
1
  module UrlScrubber
2
- VERSION = "0.8.0"
2
+ VERSION = "0.8.2"
3
3
  end
data/lib/url_scrubber.rb CHANGED
@@ -153,6 +153,7 @@ module UrlScrubber
153
153
  end
154
154
 
155
155
  def self.special_cases(url)
156
+ #puts "special_cases"
156
157
  case service_of(url)
157
158
  when :youtube then return sc_youtube(url)
158
159
  when :twitter then return sc_twitter(url)
@@ -191,6 +192,7 @@ module UrlScrubber
191
192
  end
192
193
 
193
194
  def self.drop_anchor!(url)
195
+ #puts "drop anchor"
194
196
  url.sub!(/#.*$/, '')
195
197
  url
196
198
  end
@@ -232,7 +234,7 @@ module UrlScrubber
232
234
  end
233
235
 
234
236
  def self.sc_facebook(url)
235
- # puts "sc_facebook: #{url}"
237
+ #puts "sc_facebook: #{url}"
236
238
  regex1 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/(pages\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
237
239
  regex2 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
238
240
  regex2a = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
@@ -253,13 +255,13 @@ module UrlScrubber
253
255
  elsif mdata = url.match(regex2)
254
256
  # puts "regex2"
255
257
  # "https://www.facebook.com/profile.php?id=100009574328879"
256
- url, http_response = check_for_redirection(mdata[:url])
258
+ url, http_response = check_for_facebook_redirection(mdata[:url])
257
259
  uid = mdata[:uid]
258
260
  elsif mdata = url.match(regex2a)
259
261
  # puts "regex2a"
260
262
  # "https://www.facebook.com/profile.php?_rdr=p&id=100009574328879"
261
263
  url = "http://facebook.com/profile.php?id=" + mdata[:uid]
262
- url, http_response = check_for_redirection(url)
264
+ url, http_response = check_for_facebook_redirection(url)
263
265
  uid = mdata[:uid]
264
266
  elsif mdata = url.match(regex4)
265
267
  # puts "#{url} - #{mdata[:uname]}"
@@ -278,7 +280,7 @@ module UrlScrubber
278
280
  # puts "profile.php"
279
281
  # these were being truncated, they do redirect, but typically a 301 response is generated
280
282
  # so the url is returned unchanged. Better than truncation.
281
- url, http_response = check_for_redirection(url)
283
+ url, http_response = check_for_facebook_redirection(url)
282
284
  else
283
285
  # puts "else"
284
286
  url = drop_url_query!(url)
@@ -355,7 +357,9 @@ module UrlScrubber
355
357
  url
356
358
  end
357
359
 
358
- def self.check_for_redirection(uri_str, limit = 5)
360
+ def self.check_for_facebook_redirection(uri_str, limit = 5)
361
+ #puts "check_for_facebook_redirection called! uri=#{uri_str}, limit=#{limit.to_s}"
362
+ # finds any redirects intended for facebook URLs only!!!!
359
363
  login_patterns = [
360
364
  # pages that require user logins
361
365
  %r{^.*/login[^/]*$}
@@ -367,14 +371,20 @@ module UrlScrubber
367
371
  ]
368
372
 
369
373
  raise 'Too many HTTP redirects' if limit == 0
374
+
375
+ uri_str_new = uri_str.sub('http://', 'https://')
376
+ uri_str_new = uri_str_new.sub('https://', 'https://www.') if !uri_str_new.include?("https://www.")
370
377
 
371
378
  begin
372
- url = URI.parse(uri_str)
379
+ url = URI.parse(uri_str_new)
373
380
  rescue URI::InvalidURIError => e
374
- return [uri_str, CustomError.new(786, "Invalid URI #{uri_str} : #{e.message}") ]
381
+ return [uri_str_new, CustomError.new(786, "Invalid URI #{uri_str_new} : #{e.message}") ]
375
382
  end
376
383
 
377
384
  http = Net::HTTP.new(url.host, url.port)
385
+ http = Net::HTTP.new(url.host, url.port)
386
+ http.open_timeout = 7 # only wait up to 7 seconds for a the connection to be established
387
+ http.read_timeout = 10 # and up to 10 seconds for a response
378
388
  if url.port == 443
379
389
  http.use_ssl = true
380
390
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
@@ -385,35 +395,43 @@ module UrlScrubber
385
395
 
386
396
  begin
387
397
  response = http.request(request)
398
+ rescue Timeout::Error
399
+ #Rails.logger.error("UrlScrubber.check_for_facebook_redirection - http.request Timeout, URL=#{uri_str_new}")
400
+ failure_response = Net::HTTPClientError.new('1.1', '400', 'Unreachable')
401
+ return [uri_str_new, failure_response]
388
402
  rescue Exception => e
389
403
  failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
390
- return [uri_str, failure_response]
391
- end
404
+ return [uri_str_new, failure_response]
405
+ end
392
406
 
393
407
  if response.is_a? Net::HTTPRedirection
394
408
  if response['location'][0,4] == "http"
395
409
  if failure_patterns.any? { |pattern| response['location'].match(pattern) }
396
410
  # got redirected to a page indicating failure, so act like it's a 404
397
411
  failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
398
- return [uri_str, failure_response]
412
+ #puts "check_for_facebook_redirection 404"
413
+ return [uri_str_new, failure_response]
399
414
  end
400
415
 
401
- redirected_url, base_response = check_for_redirection(response['location'], limit - 1)
402
-
403
416
  if login_patterns.any? { |pattern| redirected_url.match(pattern) }
404
417
  # got redirected to a login page. return the ultimate response, but the previous url
405
- return [uri_str, base_response]
406
- else
407
- return [redirected_url, base_response]
418
+ failure_response = Net::HTTPClientError.new('1.1', '401', 'Inaccessible')
419
+ #puts "check_for_facebook_redirection 401"
420
+ return [uri_str_new, failure_response]
408
421
  end
422
+ #puts "check_for_facebook_redirection 1 limit=#{limit.to_s}"
423
+ redirected_url, base_response = check_for_facebook_redirection(response['location'], limit - 1)
424
+ return [redirected_url, base_response]
409
425
 
410
426
  else
411
427
  redir_url = "http://#{url.host}#{response['location']}"
412
- redirected_url, base_response = check_for_redirection(redir_url, limit - 1)
428
+ #puts "check_for_facebook_redirection recalled limit =#{limit.to_s}"
429
+ redirected_url, base_response = check_for_facebook_redirection(redir_url, limit - 1)
413
430
  return [redirected_url, base_response]
414
431
  end
415
432
  else
416
- return [uri_str, response]
433
+ #puts "check_for_facebook_redirection return code #{response.code.to_s}"
434
+ return [uri_str_new, response]
417
435
  end
418
436
  end
419
437
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_scrubber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.0
4
+ version: 0.8.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Colin Langton
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2016-04-08 00:00:00.000000000 Z
14
+ date: 2016-09-26 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rspec
@@ -124,7 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
124
124
  version: '0'
125
125
  requirements: []
126
126
  rubyforge_project:
127
- rubygems_version: 2.4.8
127
+ rubygems_version: 2.6.6
128
128
  signing_key:
129
129
  specification_version: 4
130
130
  summary: Clean up URLs.