url_scrubber 0.8.0 → 0.8.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/lib/url_scrubber/version.rb +1 -1
- data/lib/url_scrubber.rb +35 -17
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
ZDlkYzlmMDBiOTY5MzU0N2ExYTZjMjI4YzUyZDQ0MzFiZWNiYmM1NA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
Y2NiYzUzMDAwZWRlZjA3MTYzZmM5ZTI2MjkyNDI2ZmQ3ZjRhMzcyMg==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NTIyOGY2ZTc3YTNiMDQwMmE5NzNiZDhlOWMxMTcyYThhZDBmMTBhNDAxNTc5
|
10
|
+
MTIzM2RiMzVmM2E4ZTJmMDBlNjdjNmM2MDJiNWZhMWJjZjM0MDE5NjNlNTQ3
|
11
|
+
ZmFiYWYwYjJiNDRlMDg0MWUxMTdhNmY3MjdjYmQ2ZjI0YWFlNmQ=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
YTQwYmIwMGYzOGU2ZGZjOTA1NGE3MTNkNDk4NGMzNDkyOTBlMmM3MWFjYTRi
|
14
|
+
ODdhMDhjNmM0ZTIwYWY3MzZkY2UyMjQ2N2I4YThjNDQ1OGFkMDFlODU1OTU5
|
15
|
+
NTY2YzQxNzk0ZTk0MTE5ZTdlZGY2MzNmY2VlNTg4NTA5NTA3ZmY=
|
data/lib/url_scrubber/version.rb
CHANGED
data/lib/url_scrubber.rb
CHANGED
@@ -153,6 +153,7 @@ module UrlScrubber
|
|
153
153
|
end
|
154
154
|
|
155
155
|
def self.special_cases(url)
|
156
|
+
#puts "special_cases"
|
156
157
|
case service_of(url)
|
157
158
|
when :youtube then return sc_youtube(url)
|
158
159
|
when :twitter then return sc_twitter(url)
|
@@ -191,6 +192,7 @@ module UrlScrubber
|
|
191
192
|
end
|
192
193
|
|
193
194
|
def self.drop_anchor!(url)
|
195
|
+
#puts "drop anchor"
|
194
196
|
url.sub!(/#.*$/, '')
|
195
197
|
url
|
196
198
|
end
|
@@ -232,7 +234,7 @@ module UrlScrubber
|
|
232
234
|
end
|
233
235
|
|
234
236
|
def self.sc_facebook(url)
|
235
|
-
#
|
237
|
+
#puts "sc_facebook: #{url}"
|
236
238
|
regex1 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/(pages\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
|
237
239
|
regex2 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
238
240
|
regex2a = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
@@ -253,13 +255,13 @@ module UrlScrubber
|
|
253
255
|
elsif mdata = url.match(regex2)
|
254
256
|
# puts "regex2"
|
255
257
|
# "https://www.facebook.com/profile.php?id=100009574328879"
|
256
|
-
url, http_response =
|
258
|
+
url, http_response = check_for_facebook_redirection(mdata[:url])
|
257
259
|
uid = mdata[:uid]
|
258
260
|
elsif mdata = url.match(regex2a)
|
259
261
|
# puts "regex2a"
|
260
262
|
# "https://www.facebook.com/profile.php?_rdr=p&id=100009574328879"
|
261
263
|
url = "http://facebook.com/profile.php?id=" + mdata[:uid]
|
262
|
-
url, http_response =
|
264
|
+
url, http_response = check_for_facebook_redirection(url)
|
263
265
|
uid = mdata[:uid]
|
264
266
|
elsif mdata = url.match(regex4)
|
265
267
|
# puts "#{url} - #{mdata[:uname]}"
|
@@ -278,7 +280,7 @@ module UrlScrubber
|
|
278
280
|
# puts "profile.php"
|
279
281
|
# these were being truncated, they do redirect, but typically a 301 response is generated
|
280
282
|
# so the url is returned unchanged. Better than truncation.
|
281
|
-
url, http_response =
|
283
|
+
url, http_response = check_for_facebook_redirection(url)
|
282
284
|
else
|
283
285
|
# puts "else"
|
284
286
|
url = drop_url_query!(url)
|
@@ -355,7 +357,9 @@ module UrlScrubber
|
|
355
357
|
url
|
356
358
|
end
|
357
359
|
|
358
|
-
def self.
|
360
|
+
def self.check_for_facebook_redirection(uri_str, limit = 5)
|
361
|
+
#puts "check_for_facebook_redirection called! uri=#{uri_str}, limit=#{limit.to_s}"
|
362
|
+
# finds any redirects intended for facebook URLs only!!!!
|
359
363
|
login_patterns = [
|
360
364
|
# pages that require user logins
|
361
365
|
%r{^.*/login[^/]*$}
|
@@ -367,14 +371,20 @@ module UrlScrubber
|
|
367
371
|
]
|
368
372
|
|
369
373
|
raise 'Too many HTTP redirects' if limit == 0
|
374
|
+
|
375
|
+
uri_str_new = uri_str.sub('http://', 'https://')
|
376
|
+
uri_str_new = uri_str_new.sub('https://', 'https://www.') if !uri_str_new.include?("https://www.")
|
370
377
|
|
371
378
|
begin
|
372
|
-
url = URI.parse(
|
379
|
+
url = URI.parse(uri_str_new)
|
373
380
|
rescue URI::InvalidURIError => e
|
374
|
-
return [
|
381
|
+
return [uri_str_new, CustomError.new(786, "Invalid URI #{uri_str_new} : #{e.message}") ]
|
375
382
|
end
|
376
383
|
|
377
384
|
http = Net::HTTP.new(url.host, url.port)
|
385
|
+
http = Net::HTTP.new(url.host, url.port)
|
386
|
+
http.open_timeout = 7 # only wait up to 7 seconds for the connection to be established
|
387
|
+
http.read_timeout = 10 # and up to 10 seconds for a response
|
378
388
|
if url.port == 443
|
379
389
|
http.use_ssl = true
|
380
390
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
@@ -385,35 +395,43 @@ module UrlScrubber
|
|
385
395
|
|
386
396
|
begin
|
387
397
|
response = http.request(request)
|
398
|
+
rescue Timeout::Error
|
399
|
+
#Rails.logger.error("UrlScrubber.check_for_facebook_redirection - http.request Timeout, URL=#{uri_str_new}")
|
400
|
+
failure_response = Net::HTTPClientError.new('1.1', '400', 'Unreachable')
|
401
|
+
return [uri_str_new, failure_response]
|
388
402
|
rescue Exception => e
|
389
403
|
failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
|
390
|
-
return [
|
391
|
-
|
404
|
+
return [uri_str_new, failure_response]
|
405
|
+
end
|
392
406
|
|
393
407
|
if response.is_a? Net::HTTPRedirection
|
394
408
|
if response['location'][0,4] == "http"
|
395
409
|
if failure_patterns.any? { |pattern| response['location'].match(pattern) }
|
396
410
|
# got redirected to a page indicating failure, so act like it's a 404
|
397
411
|
failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
|
398
|
-
|
412
|
+
#puts "check_for_facebook_redirection 404"
|
413
|
+
return [uri_str_new, failure_response]
|
399
414
|
end
|
400
415
|
|
401
|
-
redirected_url, base_response = check_for_redirection(response['location'], limit - 1)
|
402
|
-
|
403
416
|
if login_patterns.any? { |pattern| redirected_url.match(pattern) }
|
404
417
|
# got redirected to a login page. return the ultimate response, but the previous url
|
405
|
-
|
406
|
-
|
407
|
-
return [
|
418
|
+
failure_response = Net::HTTPClientError.new('1.1', '401', 'Inaccessible')
|
419
|
+
#puts "check_for_facebook_redirection 401"
|
420
|
+
return [uri_str_new, failure_response]
|
408
421
|
end
|
422
|
+
#puts "check_for_facebook_redirection 1 limit=#{limit.to_s}"
|
423
|
+
redirected_url, base_response = check_for_facebook_redirection(response['location'], limit - 1)
|
424
|
+
return [redirected_url, base_response]
|
409
425
|
|
410
426
|
else
|
411
427
|
redir_url = "http://#{url.host}#{response['location']}"
|
412
|
-
|
428
|
+
#puts "check_for_facebook_redirection recalled limit =#{limit.to_s}"
|
429
|
+
redirected_url, base_response = check_for_facebook_redirection(redir_url, limit - 1)
|
413
430
|
return [redirected_url, base_response]
|
414
431
|
end
|
415
432
|
else
|
416
|
-
return
|
433
|
+
#puts "check_for_facebook_redirection return code #{response.code.to_s}"
|
434
|
+
return [uri_str_new, response]
|
417
435
|
end
|
418
436
|
end
|
419
437
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_scrubber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.
|
4
|
+
version: 0.8.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Colin Langton
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2016-
|
14
|
+
date: 2016-09-26 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rspec
|
@@ -124,7 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
124
124
|
version: '0'
|
125
125
|
requirements: []
|
126
126
|
rubyforge_project:
|
127
|
-
rubygems_version: 2.
|
127
|
+
rubygems_version: 2.6.6
|
128
128
|
signing_key:
|
129
129
|
specification_version: 4
|
130
130
|
summary: Clean up URLs.
|