url_scrubber 0.8.12 → 0.8.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -13
- data/lib/url_scrubber.rb +44 -31
- data/lib/url_scrubber/version.rb +1 -1
- data/spec/url_scrubber_spec.rb +4 -0
- metadata +17 -17
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
NmYzMDIzNTE1OGIxZWM3ZTRhZmE0NGE2NzI2YmIzZWIzZDJhNWQ0ZQ==
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 5b0e2a469b840847dde026eaaeb18f48938e49080a737ba2f70b53333f889aeb
|
4
|
+
data.tar.gz: df9c7c316151a830d000f1adffcf043a95fe46e71822824a9526a9a309522b11
|
7
5
|
SHA512:
|
8
|
-
metadata.gz:
|
9
|
-
|
10
|
-
MWY2Yzk1Y2FmZTMyODVhNjFlMDg2OWU0ODc5OTQ0N2ViZGEyMTNhZmQyNDU2
|
11
|
-
NmI2MmE5NzRmMzQ2YWYyY2I1N2IyMTU3ZWQzYzU1ZGM3YWZiMTY=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
YzBkOTY2YjQ0ZDFiOGViNGEzNDMxYzM2YmVhYWZkMWFkMWZkOTg3ZGY4YTFj
|
14
|
-
NWIyZmI0N2E3ZDZlZGQ4OWZlYThiMzhlMWFiNWZkOTBhMzYxZTdmMjkxM2U0
|
15
|
-
ZjZkNDAxZjU2NDY3OGUwZjk2MDQxN2IyZjE0YzRiYjUwMTRmYmU=
|
6
|
+
metadata.gz: 0c45b738609ad89ffbf7e69cb46378710fe7fdb69f88abcb61df71e112a02194493eb1175650a446d81a51601ee36f9acabf8d5f68853f561ead495025eb9e3f
|
7
|
+
data.tar.gz: 8a293e4f32b7cf355bd3a2e8570ac1b5e5569b116ab92e0b98c6eeff92d028ac25952c8df4d8a49a1d0f829919b9fdc716fbade35063922f3157c1f26710b45b
|
data/lib/url_scrubber.rb
CHANGED
@@ -9,9 +9,9 @@ module UrlScrubber
|
|
9
9
|
def self.scrub(url)
|
10
10
|
return url if url.blank?
|
11
11
|
return url if /^app:\/\//.match(url) # Do not scrub app-only URLs
|
12
|
-
|
12
|
+
|
13
13
|
url = url.clone # don't modify the original argument
|
14
|
-
|
14
|
+
|
15
15
|
m = url.match(/(htt?ps?:\/\/\S*)/i)
|
16
16
|
return nil unless m
|
17
17
|
|
@@ -25,6 +25,7 @@ module UrlScrubber
|
|
25
25
|
url = downcase_domain(url)
|
26
26
|
remove_subdomain!(url)
|
27
27
|
remove_html_tags!(url)
|
28
|
+
# CHANGED we depend on the special case methods to decide if and when to drop the query string part of the URL
|
28
29
|
url = drop_anchor!(special_cases(url))
|
29
30
|
url.sub!(/,+$/, "") # remove one or more trailing commas at the end of the URL
|
30
31
|
url.gsub!(/\/+$/, '') # remove any trailing slashes (/) in the resulting URL
|
@@ -33,16 +34,16 @@ module UrlScrubber
|
|
33
34
|
|
34
35
|
|
35
36
|
def self.service_of(url)
|
36
|
-
|
37
|
+
|
37
38
|
domain_match = url.match(%r{https?://([^/]+)})
|
38
|
-
|
39
|
+
|
39
40
|
if domain_match
|
40
41
|
domain = domain_match[1]
|
41
42
|
first_dot = domain.index(".")
|
42
|
-
|
43
|
+
|
43
44
|
#first_dot_position = domain.index(".")
|
44
45
|
#first_dot_position += 1 if first_dot_position
|
45
|
-
|
46
|
+
|
46
47
|
#Rails.logger.debug "domain = #{domain}, first dot = #{first_dot ? first_dot : 'none'}, first dot 1= #{first_dot ? domain[first_dot+1..domain.size] : 'NIL'}"
|
47
48
|
if first_dot
|
48
49
|
# tumblr is a unique format
|
@@ -69,7 +70,7 @@ module UrlScrubber
|
|
69
70
|
end
|
70
71
|
|
71
72
|
:other
|
72
|
-
end
|
73
|
+
end
|
73
74
|
|
74
75
|
|
75
76
|
def self.ideal_form?(url)
|
@@ -114,14 +115,14 @@ module UrlScrubber
|
|
114
115
|
return url.include?('http://linkedin.com/company/')
|
115
116
|
end
|
116
117
|
|
117
|
-
|
118
|
+
|
118
119
|
def self.linkedin_personal_url?(url)
|
119
120
|
url = scrub(url)
|
120
121
|
return false unless url
|
121
|
-
return url.include?('http://linkedin.com/in/') || url.include?('http://linkedin.com/pub/')
|
122
|
+
return url.include?('http://linkedin.com/in/') || url.include?('http://linkedin.com/pub/')
|
122
123
|
end
|
123
124
|
|
124
|
-
|
125
|
+
|
125
126
|
def self.find_identity_from_url(url)
|
126
127
|
UrlScrubber.scrub(url).split("/").last unless url.nil?
|
127
128
|
end
|
@@ -129,7 +130,7 @@ module UrlScrubber
|
|
129
130
|
|
130
131
|
def self.find_linkedin_identity_from_url(url)
|
131
132
|
return nil if url.nil?
|
132
|
-
scrubbed_url = scrub(url)
|
133
|
+
scrubbed_url = scrub(url)
|
133
134
|
if scrubbed_url && linkedin_company_url?(scrubbed_url)
|
134
135
|
scrubbed_url.split("/").last
|
135
136
|
elsif scrubbed_url && scrubbed_url.include?('http://linkedin.com/in/')
|
@@ -173,7 +174,7 @@ module UrlScrubber
|
|
173
174
|
public_url
|
174
175
|
end
|
175
176
|
|
176
|
-
|
177
|
+
|
177
178
|
################################################################################
|
178
179
|
private
|
179
180
|
################################################################################
|
@@ -197,6 +198,8 @@ module UrlScrubber
|
|
197
198
|
when :pinterest then return sc_pinterest(url)
|
198
199
|
when :vimeo then return sc_vimeo(url)
|
199
200
|
when :yelp then return sc_yelp(url)
|
201
|
+
else
|
202
|
+
sc_generic(url)
|
200
203
|
end
|
201
204
|
|
202
205
|
url
|
@@ -225,7 +228,7 @@ module UrlScrubber
|
|
225
228
|
url
|
226
229
|
end
|
227
230
|
|
228
|
-
|
231
|
+
|
229
232
|
def self.drop_url_query!(url)
|
230
233
|
url.sub!(/\?.*$/, '')
|
231
234
|
url
|
@@ -244,15 +247,16 @@ module UrlScrubber
|
|
244
247
|
# which is not a separate channel with its own customUrl.
|
245
248
|
# url.sub!('youtube.com/user/', 'youtube.com/')
|
246
249
|
url.sub!('youtube.com/profile?user=', 'youtube.com/')
|
250
|
+
drop_url_query!(url)
|
247
251
|
url
|
248
252
|
end
|
249
253
|
|
250
254
|
|
251
255
|
def self.sc_vimeo(url)
|
252
|
-
if url.include?('vimeo.com/groups/')
|
256
|
+
if url.include?('vimeo.com/groups/')
|
253
257
|
groups_partition = url.partition('vimeo.com/groups/')
|
254
258
|
if !groups_partition.nil? && !groups_partition[2].nil? && groups_partition[2] != ""
|
255
|
-
extraneous_slash_partition = groups_partition[2].partition('/')
|
259
|
+
extraneous_slash_partition = groups_partition[2].partition('/')
|
256
260
|
if !extraneous_slash_partition.nil? && !extraneous_slash_partition[1].nil? && extraneous_slash_partition[1] != ""
|
257
261
|
# need to trim off the sub page stuff
|
258
262
|
return "http://vimeo.com/groups/" + extraneous_slash_partition[0]
|
@@ -276,18 +280,20 @@ module UrlScrubber
|
|
276
280
|
url = "http://twitter.com/#{search_match[1]}"
|
277
281
|
end
|
278
282
|
|
283
|
+
url = drop_url_query!(url)
|
284
|
+
|
279
285
|
url
|
280
286
|
end
|
281
287
|
|
282
288
|
|
283
289
|
def self.sc_facebook(url)
|
284
290
|
#puts "sc_facebook: #{url}"
|
285
|
-
regex1
|
286
|
-
regex2
|
291
|
+
regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
|
292
|
+
regex2 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
287
293
|
regex2a = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
288
|
-
regex3
|
289
|
-
regex4
|
290
|
-
|
294
|
+
regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
|
295
|
+
regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
|
296
|
+
|
291
297
|
# If the user gives us a path to a Post, "http://facebook.com/LoansByJanet/posts/1691075027771418"
|
292
298
|
# then drop the post part, "/posts/1691075027771418" to get the base url, "http://facebook.com/LoansByJanet/"
|
293
299
|
if mdata = /^(?<base_url>.+)\/posts\/(?<postid>[0-9]+).*$/.match(url)
|
@@ -332,14 +338,14 @@ module UrlScrubber
|
|
332
338
|
url = drop_url_query!(url)
|
333
339
|
elsif url.include?("facebook.com/profile.php?id=")
|
334
340
|
# puts "profile.php"
|
335
|
-
# these were being truncated, they do redirect, but typically a 301 response is generated
|
341
|
+
# these were being truncated, they do redirect, but typically a 301 response is generated
|
336
342
|
# so the url is returned unchanged. Better than truncation.
|
337
343
|
url, http_response = check_for_facebook_redirection(url)
|
338
344
|
else
|
339
345
|
# puts "else"
|
340
346
|
url = drop_url_query!(url)
|
341
347
|
end
|
342
|
-
|
348
|
+
|
343
349
|
# Due to the redirection check, "https" and "www." can be re-introduced
|
344
350
|
url = url.sub(%r{^https?://www.}i, 'http://')
|
345
351
|
url = url.sub(/\?_rdr.*/, '')
|
@@ -348,7 +354,6 @@ module UrlScrubber
|
|
348
354
|
|
349
355
|
|
350
356
|
def self.sc_linkedin(url)
|
351
|
-
|
352
357
|
url.sub!('linkedin.com/companies/', 'linkedin.com/company/')
|
353
358
|
if !!url.match(%r{com/company/})
|
354
359
|
drop_url_query!(url)
|
@@ -388,10 +393,10 @@ module UrlScrubber
|
|
388
393
|
|
389
394
|
|
390
395
|
def self.sc_flickr(url)
|
391
|
-
if url.include?('flickr.com/groups/')
|
396
|
+
if url.include?('flickr.com/groups/')
|
392
397
|
groups_partition = url.partition('flickr.com/groups/')
|
393
398
|
if !groups_partition.nil? && !groups_partition[2].nil? && groups_partition[2] != ""
|
394
|
-
extraneous_slash_partition = groups_partition[2].partition('/')
|
399
|
+
extraneous_slash_partition = groups_partition[2].partition('/')
|
395
400
|
if !extraneous_slash_partition.nil? && !extraneous_slash_partition[1].nil? && extraneous_slash_partition[1] != ""
|
396
401
|
# need to trim off the sub page stuff
|
397
402
|
return "http://flickr.com/groups/" + extraneous_slash_partition[0]
|
@@ -408,14 +413,22 @@ module UrlScrubber
|
|
408
413
|
|
409
414
|
|
410
415
|
def self.sc_pinterest(url)
|
416
|
+
drop_url_query!(url)
|
411
417
|
url
|
412
418
|
end
|
413
419
|
|
414
420
|
|
415
421
|
def self.sc_yelp(url)
|
422
|
+
drop_url_query!(url)
|
416
423
|
url
|
417
424
|
end
|
418
|
-
|
425
|
+
|
426
|
+
|
427
|
+
def self.sc_generic(url)
|
428
|
+
drop_url_query!(url)
|
429
|
+
url
|
430
|
+
end
|
431
|
+
|
419
432
|
|
420
433
|
def self.check_for_facebook_redirection(uri_str, limit = 5)
|
421
434
|
#puts "check_for_facebook_redirection called! uri=#{uri_str}, limit=#{limit.to_s}"
|
@@ -434,9 +447,9 @@ module UrlScrubber
|
|
434
447
|
|
435
448
|
uri_str_new = uri_str.sub('http://', 'https://')
|
436
449
|
uri_str_new = uri_str_new.sub('https://', 'https://www.') if !uri_str_new.include?("https://www.")
|
437
|
-
|
450
|
+
|
438
451
|
begin
|
439
|
-
url = URI.parse(URI.escape(uri_str_new))
|
452
|
+
url = URI.parse(URI.escape(uri_str_new))
|
440
453
|
rescue URI::InvalidURIError => e
|
441
454
|
return [uri_str_new, CustomError.new(786, "Invalid URI #{uri_str_new} : #{e.message}") ]
|
442
455
|
end
|
@@ -444,7 +457,7 @@ module UrlScrubber
|
|
444
457
|
http = Net::HTTP.new(url.host, url.port)
|
445
458
|
http = Net::HTTP.new(url.host, url.port)
|
446
459
|
http.open_timeout = 7 # only wait up to 7 seconds for the connection to be established
|
447
|
-
http.read_timeout = 10 # and up to 10 seconds for a response
|
460
|
+
http.read_timeout = 10 # and up to 10 seconds for a response
|
448
461
|
if url.port == 443
|
449
462
|
http.use_ssl = true
|
450
463
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
@@ -454,7 +467,7 @@ module UrlScrubber
|
|
454
467
|
request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })
|
455
468
|
|
456
469
|
begin
|
457
|
-
response = http.request(request)
|
470
|
+
response = http.request(request)
|
458
471
|
rescue Timeout::Error
|
459
472
|
#Rails.logger.error("UrlScrubber.check_for_facebook_redirection - http.request Timeout, URL=#{uri_str_new}")
|
460
473
|
failure_response = Net::HTTPClientError.new('1.1', '400', 'Unreachable')
|
@@ -494,5 +507,5 @@ module UrlScrubber
|
|
494
507
|
return [uri_str_new, response]
|
495
508
|
end
|
496
509
|
end
|
497
|
-
|
510
|
+
|
498
511
|
end
|
data/lib/url_scrubber/version.rb
CHANGED
data/spec/url_scrubber_spec.rb
CHANGED
@@ -69,6 +69,10 @@ describe UrlScrubber do
|
|
69
69
|
it "should transform user statuses into that user's profile" do
|
70
70
|
UrlScrubber.scrub('http://twitter.com/absolutely/statuses/135243243261312').should eq('http://twitter.com/absolutely')
|
71
71
|
end
|
72
|
+
|
73
|
+
it "should drop the query part of the url" do
|
74
|
+
UrlScrubber.scrub('http://twitter.com/novartisuk?lang=en').should eq('http://twitter.com/novartisuk')
|
75
|
+
end
|
72
76
|
end
|
73
77
|
|
74
78
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_scrubber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.12
|
4
|
+
version: 0.8.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Colin Langton
|
@@ -11,76 +11,76 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2018-
|
14
|
+
date: 2018-11-20 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rspec
|
18
18
|
requirement: !ruby/object:Gem::Requirement
|
19
19
|
requirements:
|
20
|
-
- - ~>
|
20
|
+
- - "~>"
|
21
21
|
- !ruby/object:Gem::Version
|
22
22
|
version: 2.11.0
|
23
23
|
type: :development
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
|
-
- - ~>
|
27
|
+
- - "~>"
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: 2.11.0
|
30
30
|
- !ruby/object:Gem::Dependency
|
31
31
|
name: guard-bundler
|
32
32
|
requirement: !ruby/object:Gem::Requirement
|
33
33
|
requirements:
|
34
|
-
- - ~>
|
34
|
+
- - "~>"
|
35
35
|
- !ruby/object:Gem::Version
|
36
36
|
version: 0.1.3
|
37
37
|
type: :development
|
38
38
|
prerelease: false
|
39
39
|
version_requirements: !ruby/object:Gem::Requirement
|
40
40
|
requirements:
|
41
|
-
- - ~>
|
41
|
+
- - "~>"
|
42
42
|
- !ruby/object:Gem::Version
|
43
43
|
version: 0.1.3
|
44
44
|
- !ruby/object:Gem::Dependency
|
45
45
|
name: guard-rspec
|
46
46
|
requirement: !ruby/object:Gem::Requirement
|
47
47
|
requirements:
|
48
|
-
- - ~>
|
48
|
+
- - "~>"
|
49
49
|
- !ruby/object:Gem::Version
|
50
50
|
version: 0.4.3
|
51
51
|
type: :development
|
52
52
|
prerelease: false
|
53
53
|
version_requirements: !ruby/object:Gem::Requirement
|
54
54
|
requirements:
|
55
|
-
- - ~>
|
55
|
+
- - "~>"
|
56
56
|
- !ruby/object:Gem::Version
|
57
57
|
version: 0.4.3
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: terminal-notifier-guard
|
60
60
|
requirement: !ruby/object:Gem::Requirement
|
61
61
|
requirements:
|
62
|
-
- -
|
62
|
+
- - ">="
|
63
63
|
- !ruby/object:Gem::Version
|
64
64
|
version: '0'
|
65
65
|
type: :development
|
66
66
|
prerelease: false
|
67
67
|
version_requirements: !ruby/object:Gem::Requirement
|
68
68
|
requirements:
|
69
|
-
- -
|
69
|
+
- - ">="
|
70
70
|
- !ruby/object:Gem::Version
|
71
71
|
version: '0'
|
72
72
|
- !ruby/object:Gem::Dependency
|
73
73
|
name: rb-fsevent
|
74
74
|
requirement: !ruby/object:Gem::Requirement
|
75
75
|
requirements:
|
76
|
-
- - ~>
|
76
|
+
- - "~>"
|
77
77
|
- !ruby/object:Gem::Version
|
78
78
|
version: 0.9.1
|
79
79
|
type: :development
|
80
80
|
prerelease: false
|
81
81
|
version_requirements: !ruby/object:Gem::Requirement
|
82
82
|
requirements:
|
83
|
-
- - ~>
|
83
|
+
- - "~>"
|
84
84
|
- !ruby/object:Gem::Version
|
85
85
|
version: 0.9.1
|
86
86
|
description: Remove extraneous bits from URLs, follow redirects, identify social media
|
@@ -94,8 +94,8 @@ executables: []
|
|
94
94
|
extensions: []
|
95
95
|
extra_rdoc_files: []
|
96
96
|
files:
|
97
|
-
- .gitignore
|
98
|
-
- .rvmrc
|
97
|
+
- ".gitignore"
|
98
|
+
- ".rvmrc"
|
99
99
|
- Gemfile
|
100
100
|
- Guardfile
|
101
101
|
- README.md
|
@@ -114,17 +114,17 @@ require_paths:
|
|
114
114
|
- lib
|
115
115
|
required_ruby_version: !ruby/object:Gem::Requirement
|
116
116
|
requirements:
|
117
|
-
- -
|
117
|
+
- - ">="
|
118
118
|
- !ruby/object:Gem::Version
|
119
119
|
version: '0'
|
120
120
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
|
-
- -
|
122
|
+
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
125
|
requirements: []
|
126
126
|
rubyforge_project:
|
127
|
-
rubygems_version: 2.
|
127
|
+
rubygems_version: 2.7.7
|
128
128
|
signing_key:
|
129
129
|
specification_version: 4
|
130
130
|
summary: Clean up URLs.
|