url_scrubber 0.8.0 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/lib/url_scrubber/version.rb +1 -1
- data/lib/url_scrubber.rb +35 -17
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
ZDlkYzlmMDBiOTY5MzU0N2ExYTZjMjI4YzUyZDQ0MzFiZWNiYmM1NA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
Y2NiYzUzMDAwZWRlZjA3MTYzZmM5ZTI2MjkyNDI2ZmQ3ZjRhMzcyMg==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NTIyOGY2ZTc3YTNiMDQwMmE5NzNiZDhlOWMxMTcyYThhZDBmMTBhNDAxNTc5
|
10
|
+
MTIzM2RiMzVmM2E4ZTJmMDBlNjdjNmM2MDJiNWZhMWJjZjM0MDE5NjNlNTQ3
|
11
|
+
ZmFiYWYwYjJiNDRlMDg0MWUxMTdhNmY3MjdjYmQ2ZjI0YWFlNmQ=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
YTQwYmIwMGYzOGU2ZGZjOTA1NGE3MTNkNDk4NGMzNDkyOTBlMmM3MWFjYTRi
|
14
|
+
ODdhMDhjNmM0ZTIwYWY3MzZkY2UyMjQ2N2I4YThjNDQ1OGFkMDFlODU1OTU5
|
15
|
+
NTY2YzQxNzk0ZTk0MTE5ZTdlZGY2MzNmY2VlNTg4NTA5NTA3ZmY=
|
data/lib/url_scrubber/version.rb
CHANGED
data/lib/url_scrubber.rb
CHANGED
@@ -153,6 +153,7 @@ module UrlScrubber
|
|
153
153
|
end
|
154
154
|
|
155
155
|
def self.special_cases(url)
|
156
|
+
#puts "special_cases"
|
156
157
|
case service_of(url)
|
157
158
|
when :youtube then return sc_youtube(url)
|
158
159
|
when :twitter then return sc_twitter(url)
|
@@ -191,6 +192,7 @@ module UrlScrubber
|
|
191
192
|
end
|
192
193
|
|
193
194
|
def self.drop_anchor!(url)
|
195
|
+
#puts "drop anchor"
|
194
196
|
url.sub!(/#.*$/, '')
|
195
197
|
url
|
196
198
|
end
|
@@ -232,7 +234,7 @@ module UrlScrubber
|
|
232
234
|
end
|
233
235
|
|
234
236
|
def self.sc_facebook(url)
|
235
|
-
#
|
237
|
+
#puts "sc_facebook: #{url}"
|
236
238
|
regex1 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/(pages\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
|
237
239
|
regex2 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
238
240
|
regex2a = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
@@ -253,13 +255,13 @@ module UrlScrubber
|
|
253
255
|
elsif mdata = url.match(regex2)
|
254
256
|
# puts "regex2"
|
255
257
|
# "https://www.facebook.com/profile.php?id=100009574328879"
|
256
|
-
url, http_response =
|
258
|
+
url, http_response = check_for_facebook_redirection(mdata[:url])
|
257
259
|
uid = mdata[:uid]
|
258
260
|
elsif mdata = url.match(regex2a)
|
259
261
|
# puts "regex2a"
|
260
262
|
# "https://www.facebook.com/profile.php?_rdr=p&id=100009574328879"
|
261
263
|
url = "http://facebook.com/profile.php?id=" + mdata[:uid]
|
262
|
-
url, http_response =
|
264
|
+
url, http_response = check_for_facebook_redirection(url)
|
263
265
|
uid = mdata[:uid]
|
264
266
|
elsif mdata = url.match(regex4)
|
265
267
|
# puts "#{url} - #{mdata[:uname]}"
|
@@ -278,7 +280,7 @@ module UrlScrubber
|
|
278
280
|
# puts "profile.php"
|
279
281
|
# these were being truncated, they do redirect, but typically a 301 response is generated
|
280
282
|
# so the url is returned unchanged. Better than truncation.
|
281
|
-
url, http_response =
|
283
|
+
url, http_response = check_for_facebook_redirection(url)
|
282
284
|
else
|
283
285
|
# puts "else"
|
284
286
|
url = drop_url_query!(url)
|
@@ -355,7 +357,9 @@ module UrlScrubber
|
|
355
357
|
url
|
356
358
|
end
|
357
359
|
|
358
|
-
def self.
|
360
|
+
def self.check_for_facebook_redirection(uri_str, limit = 5)
|
361
|
+
#puts "check_for_facebook_redirection called! uri=#{uri_str}, limit=#{limit.to_s}"
|
362
|
+
# finds any redirects intended for facebook URLs only!!!!
|
359
363
|
login_patterns = [
|
360
364
|
# pages that require user logins
|
361
365
|
%r{^.*/login[^/]*$}
|
@@ -367,14 +371,20 @@ module UrlScrubber
|
|
367
371
|
]
|
368
372
|
|
369
373
|
raise 'Too many HTTP redirects' if limit == 0
|
374
|
+
|
375
|
+
uri_str_new = uri_str.sub('http://', 'https://')
|
376
|
+
uri_str_new = uri_str_new.sub('https://', 'https://www.') if !uri_str_new.include?("https://www.")
|
370
377
|
|
371
378
|
begin
|
372
|
-
url = URI.parse(
|
379
|
+
url = URI.parse(uri_str_new)
|
373
380
|
rescue URI::InvalidURIError => e
|
374
|
-
return [
|
381
|
+
return [uri_str_new, CustomError.new(786, "Invalid URI #{uri_str_new} : #{e.message}") ]
|
375
382
|
end
|
376
383
|
|
377
384
|
http = Net::HTTP.new(url.host, url.port)
|
385
|
+
http = Net::HTTP.new(url.host, url.port)
|
386
|
+
http.open_timeout = 7 # only wait up to 7 seconds for the connection to be established
|
387
|
+
http.read_timeout = 10 # and up to 10 seconds for a response
|
378
388
|
if url.port == 443
|
379
389
|
http.use_ssl = true
|
380
390
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
@@ -385,35 +395,43 @@ module UrlScrubber
|
|
385
395
|
|
386
396
|
begin
|
387
397
|
response = http.request(request)
|
398
|
+
rescue Timeout::Error
|
399
|
+
#Rails.logger.error("UrlScrubber.check_for_facebook_redirection - http.request Timeout, URL=#{uri_str_new}")
|
400
|
+
failure_response = Net::HTTPClientError.new('1.1', '400', 'Unreachable')
|
401
|
+
return [uri_str_new, failure_response]
|
388
402
|
rescue Exception => e
|
389
403
|
failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
|
390
|
-
return [
|
391
|
-
|
404
|
+
return [uri_str_new, failure_response]
|
405
|
+
end
|
392
406
|
|
393
407
|
if response.is_a? Net::HTTPRedirection
|
394
408
|
if response['location'][0,4] == "http"
|
395
409
|
if failure_patterns.any? { |pattern| response['location'].match(pattern) }
|
396
410
|
# got redirected to a page indicating failure, so act like it's a 404
|
397
411
|
failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
|
398
|
-
|
412
|
+
#puts "check_for_facebook_redirection 404"
|
413
|
+
return [uri_str_new, failure_response]
|
399
414
|
end
|
400
415
|
|
401
|
-
redirected_url, base_response = check_for_redirection(response['location'], limit - 1)
|
402
|
-
|
403
416
|
if login_patterns.any? { |pattern| redirected_url.match(pattern) }
|
404
417
|
# got redirected to a login page. return the ultimate response, but the previous url
|
405
|
-
|
406
|
-
|
407
|
-
return [
|
418
|
+
failure_response = Net::HTTPClientError.new('1.1', '401', 'Inaccessible')
|
419
|
+
#puts "check_for_facebook_redirection 401"
|
420
|
+
return [uri_str_new, failure_response]
|
408
421
|
end
|
422
|
+
#puts "check_for_facebook_redirection 1 limit=#{limit.to_s}"
|
423
|
+
redirected_url, base_response = check_for_facebook_redirection(response['location'], limit - 1)
|
424
|
+
return [redirected_url, base_response]
|
409
425
|
|
410
426
|
else
|
411
427
|
redir_url = "http://#{url.host}#{response['location']}"
|
412
|
-
|
428
|
+
#puts "check_for_facebook_redirection recalled limit =#{limit.to_s}"
|
429
|
+
redirected_url, base_response = check_for_facebook_redirection(redir_url, limit - 1)
|
413
430
|
return [redirected_url, base_response]
|
414
431
|
end
|
415
432
|
else
|
416
|
-
return
|
433
|
+
#puts "check_for_facebook_redirection return code #{response.code.to_s}"
|
434
|
+
return [uri_str_new, response]
|
417
435
|
end
|
418
436
|
end
|
419
437
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_scrubber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.
|
4
|
+
version: 0.8.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Colin Langton
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2016-
|
14
|
+
date: 2016-09-26 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rspec
|
@@ -124,7 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
124
124
|
version: '0'
|
125
125
|
requirements: []
|
126
126
|
rubyforge_project:
|
127
|
-
rubygems_version: 2.
|
127
|
+
rubygems_version: 2.6.6
|
128
128
|
signing_key:
|
129
129
|
specification_version: 4
|
130
130
|
summary: Clean up URLs.
|