url_scrubber 0.8.12 → 0.8.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -13
- data/lib/url_scrubber.rb +44 -31
- data/lib/url_scrubber/version.rb +1 -1
- data/spec/url_scrubber_spec.rb +4 -0
- metadata +17 -17
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
NmYzMDIzNTE1OGIxZWM3ZTRhZmE0NGE2NzI2YmIzZWIzZDJhNWQ0ZQ==
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 5b0e2a469b840847dde026eaaeb18f48938e49080a737ba2f70b53333f889aeb
|
4
|
+
data.tar.gz: df9c7c316151a830d000f1adffcf043a95fe46e71822824a9526a9a309522b11
|
7
5
|
SHA512:
|
8
|
-
metadata.gz:
|
9
|
-
|
10
|
-
MWY2Yzk1Y2FmZTMyODVhNjFlMDg2OWU0ODc5OTQ0N2ViZGEyMTNhZmQyNDU2
|
11
|
-
NmI2MmE5NzRmMzQ2YWYyY2I1N2IyMTU3ZWQzYzU1ZGM3YWZiMTY=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
YzBkOTY2YjQ0ZDFiOGViNGEzNDMxYzM2YmVhYWZkMWFkMWZkOTg3ZGY4YTFj
|
14
|
-
NWIyZmI0N2E3ZDZlZGQ4OWZlYThiMzhlMWFiNWZkOTBhMzYxZTdmMjkxM2U0
|
15
|
-
ZjZkNDAxZjU2NDY3OGUwZjk2MDQxN2IyZjE0YzRiYjUwMTRmYmU=
|
6
|
+
metadata.gz: 0c45b738609ad89ffbf7e69cb46378710fe7fdb69f88abcb61df71e112a02194493eb1175650a446d81a51601ee36f9acabf8d5f68853f561ead495025eb9e3f
|
7
|
+
data.tar.gz: 8a293e4f32b7cf355bd3a2e8570ac1b5e5569b116ab92e0b98c6eeff92d028ac25952c8df4d8a49a1d0f829919b9fdc716fbade35063922f3157c1f26710b45b
|
data/lib/url_scrubber.rb
CHANGED
@@ -9,9 +9,9 @@ module UrlScrubber
|
|
9
9
|
def self.scrub(url)
|
10
10
|
return url if url.blank?
|
11
11
|
return url if /^app:\/\//.match(url) # Do not scrub app-only URLs
|
12
|
-
|
12
|
+
|
13
13
|
url = url.clone # don't modify the original argument
|
14
|
-
|
14
|
+
|
15
15
|
m = url.match(/(htt?ps?:\/\/\S*)/i)
|
16
16
|
return nil unless m
|
17
17
|
|
@@ -25,6 +25,7 @@ module UrlScrubber
|
|
25
25
|
url = downcase_domain(url)
|
26
26
|
remove_subdomain!(url)
|
27
27
|
remove_html_tags!(url)
|
28
|
+
# CHANGED we depend on the special case methods to decide if and when to drop the query string part of the URL
|
28
29
|
url = drop_anchor!(special_cases(url))
|
29
30
|
url.sub!(/,+$/, "") # remove one or more trailing commas at the end of the URL
|
30
31
|
url.gsub!(/\/+$/, '') # remove any trailing slashes (/) in the resulting URL
|
@@ -33,16 +34,16 @@ module UrlScrubber
|
|
33
34
|
|
34
35
|
|
35
36
|
def self.service_of(url)
|
36
|
-
|
37
|
+
|
37
38
|
domain_match = url.match(%r{https?://([^/]+)})
|
38
|
-
|
39
|
+
|
39
40
|
if domain_match
|
40
41
|
domain = domain_match[1]
|
41
42
|
first_dot = domain.index(".")
|
42
|
-
|
43
|
+
|
43
44
|
#first_dot_position = domain.index(".")
|
44
45
|
#first_dot_position += 1 if first_dot_position
|
45
|
-
|
46
|
+
|
46
47
|
#Rails.logger.debug "domain = #{domain}, first dot = #{first_dot ? first_dot : 'none'}, first dot 1= #{first_dot ? domain[first_dot+1..domain.size] : 'NIL'}"
|
47
48
|
if first_dot
|
48
49
|
# tumblr is a unique format
|
@@ -69,7 +70,7 @@ module UrlScrubber
|
|
69
70
|
end
|
70
71
|
|
71
72
|
:other
|
72
|
-
end
|
73
|
+
end
|
73
74
|
|
74
75
|
|
75
76
|
def self.ideal_form?(url)
|
@@ -114,14 +115,14 @@ module UrlScrubber
|
|
114
115
|
return url.include?('http://linkedin.com/company/')
|
115
116
|
end
|
116
117
|
|
117
|
-
|
118
|
+
|
118
119
|
def self.linkedin_personal_url?(url)
|
119
120
|
url = scrub(url)
|
120
121
|
return false unless url
|
121
|
-
return url.include?('http://linkedin.com/in/') || url.include?('http://linkedin.com/pub/')
|
122
|
+
return url.include?('http://linkedin.com/in/') || url.include?('http://linkedin.com/pub/')
|
122
123
|
end
|
123
124
|
|
124
|
-
|
125
|
+
|
125
126
|
def self.find_identity_from_url(url)
|
126
127
|
UrlScrubber.scrub(url).split("/").last unless url.nil?
|
127
128
|
end
|
@@ -129,7 +130,7 @@ module UrlScrubber
|
|
129
130
|
|
130
131
|
def self.find_linkedin_identity_from_url(url)
|
131
132
|
return nil if url.nil?
|
132
|
-
scrubbed_url = scrub(url)
|
133
|
+
scrubbed_url = scrub(url)
|
133
134
|
if scrubbed_url && linkedin_company_url?(scrubbed_url)
|
134
135
|
scrubbed_url.split("/").last
|
135
136
|
elsif scrubbed_url && scrubbed_url.include?('http://linkedin.com/in/')
|
@@ -173,7 +174,7 @@ module UrlScrubber
|
|
173
174
|
public_url
|
174
175
|
end
|
175
176
|
|
176
|
-
|
177
|
+
|
177
178
|
################################################################################
|
178
179
|
private
|
179
180
|
################################################################################
|
@@ -197,6 +198,8 @@ module UrlScrubber
|
|
197
198
|
when :pinterest then return sc_pinterest(url)
|
198
199
|
when :vimeo then return sc_vimeo(url)
|
199
200
|
when :yelp then return sc_yelp(url)
|
201
|
+
else
|
202
|
+
sc_generic(url)
|
200
203
|
end
|
201
204
|
|
202
205
|
url
|
@@ -225,7 +228,7 @@ module UrlScrubber
|
|
225
228
|
url
|
226
229
|
end
|
227
230
|
|
228
|
-
|
231
|
+
|
229
232
|
def self.drop_url_query!(url)
|
230
233
|
url.sub!(/\?.*$/, '')
|
231
234
|
url
|
@@ -244,15 +247,16 @@ module UrlScrubber
|
|
244
247
|
# which is not a separate channel with its own customUrl.
|
245
248
|
# url.sub!('youtube.com/user/', 'youtube.com/')
|
246
249
|
url.sub!('youtube.com/profile?user=', 'youtube.com/')
|
250
|
+
drop_url_query!(url)
|
247
251
|
url
|
248
252
|
end
|
249
253
|
|
250
254
|
|
251
255
|
def self.sc_vimeo(url)
|
252
|
-
if url.include?('vimeo.com/groups/')
|
256
|
+
if url.include?('vimeo.com/groups/')
|
253
257
|
groups_partition = url.partition('vimeo.com/groups/')
|
254
258
|
if !groups_partition.nil? && !groups_partition[2].nil? && groups_partition[2] != ""
|
255
|
-
extraneous_slash_partition = groups_partition[2].partition('/')
|
259
|
+
extraneous_slash_partition = groups_partition[2].partition('/')
|
256
260
|
if !extraneous_slash_partition.nil? && !extraneous_slash_partition[1].nil? && extraneous_slash_partition[1] != ""
|
257
261
|
# need to trim off the sub page stuff
|
258
262
|
return "http://vimeo.com/groups/" + extraneous_slash_partition[0]
|
@@ -276,18 +280,20 @@ module UrlScrubber
|
|
276
280
|
url = "http://twitter.com/#{search_match[1]}"
|
277
281
|
end
|
278
282
|
|
283
|
+
url = drop_url_query!(url)
|
284
|
+
|
279
285
|
url
|
280
286
|
end
|
281
287
|
|
282
288
|
|
283
289
|
def self.sc_facebook(url)
|
284
290
|
#puts "sc_facebook: #{url}"
|
285
|
-
regex1
|
286
|
-
regex2
|
291
|
+
regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
|
292
|
+
regex2 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
287
293
|
regex2a = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
288
|
-
regex3
|
289
|
-
regex4
|
290
|
-
|
294
|
+
regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
|
295
|
+
regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
|
296
|
+
|
291
297
|
# If the user gives us a path to a Post, "http://facebook.com/LoansByJanet/posts/1691075027771418"
|
292
298
|
# then drop the post part, "/posts/1691075027771418" to get the base url, "http://facebook.com/LoansByJanet/"
|
293
299
|
if mdata = /^(?<base_url>.+)\/posts\/(?<postid>[0-9]+).*$/.match(url)
|
@@ -332,14 +338,14 @@ module UrlScrubber
|
|
332
338
|
url = drop_url_query!(url)
|
333
339
|
elsif url.include?("facebook.com/profile.php?id=")
|
334
340
|
# puts "profile.php"
|
335
|
-
# these were being truncated, they do redirect, but typically a 301 response is generated
|
341
|
+
# these were being truncated, they do redirect, but typically a 301 response is generated
|
336
342
|
# so the url is returned unchanged. Better than truncation.
|
337
343
|
url, http_response = check_for_facebook_redirection(url)
|
338
344
|
else
|
339
345
|
# puts "else"
|
340
346
|
url = drop_url_query!(url)
|
341
347
|
end
|
342
|
-
|
348
|
+
|
343
349
|
# Due to the redirection check, "https" and "www." can be re-introduced
|
344
350
|
url = url.sub(%r{^https?://www.}i, 'http://')
|
345
351
|
url = url.sub(/\?_rdr.*/, '')
|
@@ -348,7 +354,6 @@ module UrlScrubber
|
|
348
354
|
|
349
355
|
|
350
356
|
def self.sc_linkedin(url)
|
351
|
-
|
352
357
|
url.sub!('linkedin.com/companies/', 'linkedin.com/company/')
|
353
358
|
if !!url.match(%r{com/company/})
|
354
359
|
drop_url_query!(url)
|
@@ -388,10 +393,10 @@ module UrlScrubber
|
|
388
393
|
|
389
394
|
|
390
395
|
def self.sc_flickr(url)
|
391
|
-
if url.include?('flickr.com/groups/')
|
396
|
+
if url.include?('flickr.com/groups/')
|
392
397
|
groups_partition = url.partition('flickr.com/groups/')
|
393
398
|
if !groups_partition.nil? && !groups_partition[2].nil? && groups_partition[2] != ""
|
394
|
-
extraneous_slash_partition = groups_partition[2].partition('/')
|
399
|
+
extraneous_slash_partition = groups_partition[2].partition('/')
|
395
400
|
if !extraneous_slash_partition.nil? && !extraneous_slash_partition[1].nil? && extraneous_slash_partition[1] != ""
|
396
401
|
# need to trim off the sub page stuff
|
397
402
|
return "http://flickr.com/groups/" + extraneous_slash_partition[0]
|
@@ -408,14 +413,22 @@ module UrlScrubber
|
|
408
413
|
|
409
414
|
|
410
415
|
def self.sc_pinterest(url)
|
416
|
+
drop_url_query!(url)
|
411
417
|
url
|
412
418
|
end
|
413
419
|
|
414
420
|
|
415
421
|
def self.sc_yelp(url)
|
422
|
+
drop_url_query!(url)
|
416
423
|
url
|
417
424
|
end
|
418
|
-
|
425
|
+
|
426
|
+
|
427
|
+
def self.sc_generic(url)
|
428
|
+
drop_url_query!(url)
|
429
|
+
url
|
430
|
+
end
|
431
|
+
|
419
432
|
|
420
433
|
def self.check_for_facebook_redirection(uri_str, limit = 5)
|
421
434
|
#puts "check_for_facebook_redirection called! uri=#{uri_str}, limit=#{limit.to_s}"
|
@@ -434,9 +447,9 @@ module UrlScrubber
|
|
434
447
|
|
435
448
|
uri_str_new = uri_str.sub('http://', 'https://')
|
436
449
|
uri_str_new = uri_str_new.sub('https://', 'https://www.') if !uri_str_new.include?("https://www.")
|
437
|
-
|
450
|
+
|
438
451
|
begin
|
439
|
-
url = URI.parse(URI.escape(uri_str_new))
|
452
|
+
url = URI.parse(URI.escape(uri_str_new))
|
440
453
|
rescue URI::InvalidURIError => e
|
441
454
|
return [uri_str_new, CustomError.new(786, "Invalid URI #{uri_str_new} : #{e.message}") ]
|
442
455
|
end
|
@@ -444,7 +457,7 @@ module UrlScrubber
|
|
444
457
|
http = Net::HTTP.new(url.host, url.port)
|
445
458
|
http = Net::HTTP.new(url.host, url.port)
|
446
459
|
http.open_timeout = 7 # only wait up to 7 seconds for the connection to be established
|
447
|
-
http.read_timeout = 10 # and up to 10 seconds for a response
|
460
|
+
http.read_timeout = 10 # and up to 10 seconds for a response
|
448
461
|
if url.port == 443
|
449
462
|
http.use_ssl = true
|
450
463
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
@@ -454,7 +467,7 @@ module UrlScrubber
|
|
454
467
|
request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })
|
455
468
|
|
456
469
|
begin
|
457
|
-
response = http.request(request)
|
470
|
+
response = http.request(request)
|
458
471
|
rescue Timeout::Error
|
459
472
|
#Rails.logger.error("UrlScrubber.check_for_facebook_redirection - http.request Timeout, URL=#{uri_str_new}")
|
460
473
|
failure_response = Net::HTTPClientError.new('1.1', '400', 'Unreachable')
|
@@ -494,5 +507,5 @@ module UrlScrubber
|
|
494
507
|
return [uri_str_new, response]
|
495
508
|
end
|
496
509
|
end
|
497
|
-
|
510
|
+
|
498
511
|
end
|
data/lib/url_scrubber/version.rb
CHANGED
data/spec/url_scrubber_spec.rb
CHANGED
@@ -69,6 +69,10 @@ describe UrlScrubber do
|
|
69
69
|
it "should transform user statuses into that user's profile" do
|
70
70
|
UrlScrubber.scrub('http://twitter.com/absolutely/statuses/135243243261312').should eq('http://twitter.com/absolutely')
|
71
71
|
end
|
72
|
+
|
73
|
+
it "should drop the query part of the url" do
|
74
|
+
UrlScrubber.scrub('http://twitter.com/novartisuk?lang=en').should eq('http://twitter.com/novartisuk')
|
75
|
+
end
|
72
76
|
end
|
73
77
|
|
74
78
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_scrubber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.12
|
4
|
+
version: 0.8.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Colin Langton
|
@@ -11,76 +11,76 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2018-
|
14
|
+
date: 2018-11-20 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rspec
|
18
18
|
requirement: !ruby/object:Gem::Requirement
|
19
19
|
requirements:
|
20
|
-
- - ~>
|
20
|
+
- - "~>"
|
21
21
|
- !ruby/object:Gem::Version
|
22
22
|
version: 2.11.0
|
23
23
|
type: :development
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
|
-
- - ~>
|
27
|
+
- - "~>"
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: 2.11.0
|
30
30
|
- !ruby/object:Gem::Dependency
|
31
31
|
name: guard-bundler
|
32
32
|
requirement: !ruby/object:Gem::Requirement
|
33
33
|
requirements:
|
34
|
-
- - ~>
|
34
|
+
- - "~>"
|
35
35
|
- !ruby/object:Gem::Version
|
36
36
|
version: 0.1.3
|
37
37
|
type: :development
|
38
38
|
prerelease: false
|
39
39
|
version_requirements: !ruby/object:Gem::Requirement
|
40
40
|
requirements:
|
41
|
-
- - ~>
|
41
|
+
- - "~>"
|
42
42
|
- !ruby/object:Gem::Version
|
43
43
|
version: 0.1.3
|
44
44
|
- !ruby/object:Gem::Dependency
|
45
45
|
name: guard-rspec
|
46
46
|
requirement: !ruby/object:Gem::Requirement
|
47
47
|
requirements:
|
48
|
-
- - ~>
|
48
|
+
- - "~>"
|
49
49
|
- !ruby/object:Gem::Version
|
50
50
|
version: 0.4.3
|
51
51
|
type: :development
|
52
52
|
prerelease: false
|
53
53
|
version_requirements: !ruby/object:Gem::Requirement
|
54
54
|
requirements:
|
55
|
-
- - ~>
|
55
|
+
- - "~>"
|
56
56
|
- !ruby/object:Gem::Version
|
57
57
|
version: 0.4.3
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: terminal-notifier-guard
|
60
60
|
requirement: !ruby/object:Gem::Requirement
|
61
61
|
requirements:
|
62
|
-
- -
|
62
|
+
- - ">="
|
63
63
|
- !ruby/object:Gem::Version
|
64
64
|
version: '0'
|
65
65
|
type: :development
|
66
66
|
prerelease: false
|
67
67
|
version_requirements: !ruby/object:Gem::Requirement
|
68
68
|
requirements:
|
69
|
-
- -
|
69
|
+
- - ">="
|
70
70
|
- !ruby/object:Gem::Version
|
71
71
|
version: '0'
|
72
72
|
- !ruby/object:Gem::Dependency
|
73
73
|
name: rb-fsevent
|
74
74
|
requirement: !ruby/object:Gem::Requirement
|
75
75
|
requirements:
|
76
|
-
- - ~>
|
76
|
+
- - "~>"
|
77
77
|
- !ruby/object:Gem::Version
|
78
78
|
version: 0.9.1
|
79
79
|
type: :development
|
80
80
|
prerelease: false
|
81
81
|
version_requirements: !ruby/object:Gem::Requirement
|
82
82
|
requirements:
|
83
|
-
- - ~>
|
83
|
+
- - "~>"
|
84
84
|
- !ruby/object:Gem::Version
|
85
85
|
version: 0.9.1
|
86
86
|
description: Remove extraneous bits from URLs, follow redirects, identify social media
|
@@ -94,8 +94,8 @@ executables: []
|
|
94
94
|
extensions: []
|
95
95
|
extra_rdoc_files: []
|
96
96
|
files:
|
97
|
-
- .gitignore
|
98
|
-
- .rvmrc
|
97
|
+
- ".gitignore"
|
98
|
+
- ".rvmrc"
|
99
99
|
- Gemfile
|
100
100
|
- Guardfile
|
101
101
|
- README.md
|
@@ -114,17 +114,17 @@ require_paths:
|
|
114
114
|
- lib
|
115
115
|
required_ruby_version: !ruby/object:Gem::Requirement
|
116
116
|
requirements:
|
117
|
-
- -
|
117
|
+
- - ">="
|
118
118
|
- !ruby/object:Gem::Version
|
119
119
|
version: '0'
|
120
120
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
|
-
- -
|
122
|
+
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
125
|
requirements: []
|
126
126
|
rubyforge_project:
|
127
|
-
rubygems_version: 2.
|
127
|
+
rubygems_version: 2.7.7
|
128
128
|
signing_key:
|
129
129
|
specification_version: 4
|
130
130
|
summary: Clean up URLs.
|