url_scrubber 0.8.12 → 0.8.13

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- ZmJkNTc2MWQ5NmU0NWNkMTZhMmE1NDZmOTFmMGRjOGE4NDc3ZjJkMA==
5
- data.tar.gz: !binary |-
6
- NmYzMDIzNTE1OGIxZWM3ZTRhZmE0NGE2NzI2YmIzZWIzZDJhNWQ0ZQ==
2
+ SHA256:
3
+ metadata.gz: 5b0e2a469b840847dde026eaaeb18f48938e49080a737ba2f70b53333f889aeb
4
+ data.tar.gz: df9c7c316151a830d000f1adffcf043a95fe46e71822824a9526a9a309522b11
7
5
  SHA512:
8
- metadata.gz: !binary |-
9
- ZWZjMGY0YmVhZDM3OThhZjExNDAzMWUyYzlkYjdlMzg3N2Q0MWM0MGM3ZTkx
10
- MWY2Yzk1Y2FmZTMyODVhNjFlMDg2OWU0ODc5OTQ0N2ViZGEyMTNhZmQyNDU2
11
- NmI2MmE5NzRmMzQ2YWYyY2I1N2IyMTU3ZWQzYzU1ZGM3YWZiMTY=
12
- data.tar.gz: !binary |-
13
- YzBkOTY2YjQ0ZDFiOGViNGEzNDMxYzM2YmVhYWZkMWFkMWZkOTg3ZGY4YTFj
14
- NWIyZmI0N2E3ZDZlZGQ4OWZlYThiMzhlMWFiNWZkOTBhMzYxZTdmMjkxM2U0
15
- ZjZkNDAxZjU2NDY3OGUwZjk2MDQxN2IyZjE0YzRiYjUwMTRmYmU=
6
+ metadata.gz: 0c45b738609ad89ffbf7e69cb46378710fe7fdb69f88abcb61df71e112a02194493eb1175650a446d81a51601ee36f9acabf8d5f68853f561ead495025eb9e3f
7
+ data.tar.gz: 8a293e4f32b7cf355bd3a2e8570ac1b5e5569b116ab92e0b98c6eeff92d028ac25952c8df4d8a49a1d0f829919b9fdc716fbade35063922f3157c1f26710b45b
data/lib/url_scrubber.rb CHANGED
@@ -9,9 +9,9 @@ module UrlScrubber
9
9
  def self.scrub(url)
10
10
  return url if url.blank?
11
11
  return url if /^app:\/\//.match(url) # Do not scrub app-only URLs
12
-
12
+
13
13
  url = url.clone # don't modify the original argument
14
-
14
+
15
15
  m = url.match(/(htt?ps?:\/\/\S*)/i)
16
16
  return nil unless m
17
17
 
@@ -25,6 +25,7 @@ module UrlScrubber
25
25
  url = downcase_domain(url)
26
26
  remove_subdomain!(url)
27
27
  remove_html_tags!(url)
28
+ # CHANGED we depend on the special case methods to decide if and when to drop the query string part of the URL
28
29
  url = drop_anchor!(special_cases(url))
29
30
  url.sub!(/,+$/, "") # remove one or more trailing commas at the end of the URL
30
31
  url.gsub!(/\/+$/, '') # remove any trailing slashes (/) in the resulting URL
@@ -33,16 +34,16 @@ module UrlScrubber
33
34
 
34
35
 
35
36
  def self.service_of(url)
36
-
37
+
37
38
  domain_match = url.match(%r{https?://([^/]+)})
38
-
39
+
39
40
  if domain_match
40
41
  domain = domain_match[1]
41
42
  first_dot = domain.index(".")
42
-
43
+
43
44
  #first_dot_position = domain.index(".")
44
45
  #first_dot_position += 1 if first_dot_position
45
-
46
+
46
47
  #Rails.logger.debug "domain = #{domain}, first dot = #{first_dot ? first_dot : 'none'}, first dot 1= #{first_dot ? domain[first_dot+1..domain.size] : 'NIL'}"
47
48
  if first_dot
48
49
  # tumblr is a unique format
@@ -69,7 +70,7 @@ module UrlScrubber
69
70
  end
70
71
 
71
72
  :other
72
- end
73
+ end
73
74
 
74
75
 
75
76
  def self.ideal_form?(url)
@@ -114,14 +115,14 @@ module UrlScrubber
114
115
  return url.include?('http://linkedin.com/company/')
115
116
  end
116
117
 
117
-
118
+
118
119
  def self.linkedin_personal_url?(url)
119
120
  url = scrub(url)
120
121
  return false unless url
121
- return url.include?('http://linkedin.com/in/') || url.include?('http://linkedin.com/pub/')
122
+ return url.include?('http://linkedin.com/in/') || url.include?('http://linkedin.com/pub/')
122
123
  end
123
124
 
124
-
125
+
125
126
  def self.find_identity_from_url(url)
126
127
  UrlScrubber.scrub(url).split("/").last unless url.nil?
127
128
  end
@@ -129,7 +130,7 @@ module UrlScrubber
129
130
 
130
131
  def self.find_linkedin_identity_from_url(url)
131
132
  return nil if url.nil?
132
- scrubbed_url = scrub(url)
133
+ scrubbed_url = scrub(url)
133
134
  if scrubbed_url && linkedin_company_url?(scrubbed_url)
134
135
  scrubbed_url.split("/").last
135
136
  elsif scrubbed_url && scrubbed_url.include?('http://linkedin.com/in/')
@@ -173,7 +174,7 @@ module UrlScrubber
173
174
  public_url
174
175
  end
175
176
 
176
-
177
+
177
178
  ################################################################################
178
179
  private
179
180
  ################################################################################
@@ -197,6 +198,8 @@ module UrlScrubber
197
198
  when :pinterest then return sc_pinterest(url)
198
199
  when :vimeo then return sc_vimeo(url)
199
200
  when :yelp then return sc_yelp(url)
201
+ else
202
+ sc_generic(url)
200
203
  end
201
204
 
202
205
  url
@@ -225,7 +228,7 @@ module UrlScrubber
225
228
  url
226
229
  end
227
230
 
228
-
231
+
229
232
  def self.drop_url_query!(url)
230
233
  url.sub!(/\?.*$/, '')
231
234
  url
@@ -244,15 +247,16 @@ module UrlScrubber
244
247
  # which is not a separate channel with its own customUrl.
245
248
  # url.sub!('youtube.com/user/', 'youtube.com/')
246
249
  url.sub!('youtube.com/profile?user=', 'youtube.com/')
250
+ drop_url_query!(url)
247
251
  url
248
252
  end
249
253
 
250
254
 
251
255
  def self.sc_vimeo(url)
252
- if url.include?('vimeo.com/groups/')
256
+ if url.include?('vimeo.com/groups/')
253
257
  groups_partition = url.partition('vimeo.com/groups/')
254
258
  if !groups_partition.nil? && !groups_partition[2].nil? && groups_partition[2] != ""
255
- extraneous_slash_partition = groups_partition[2].partition('/')
259
+ extraneous_slash_partition = groups_partition[2].partition('/')
256
260
  if !extraneous_slash_partition.nil? && !extraneous_slash_partition[1].nil? && extraneous_slash_partition[1] != ""
257
261
  # need to trim off the sub page stuff
258
262
  return "http://vimeo.com/groups/" + extraneous_slash_partition[0]
@@ -276,18 +280,20 @@ module UrlScrubber
276
280
  url = "http://twitter.com/#{search_match[1]}"
277
281
  end
278
282
 
283
+ url = drop_url_query!(url)
284
+
279
285
  url
280
286
  end
281
287
 
282
288
 
283
289
  def self.sc_facebook(url)
284
290
  #puts "sc_facebook: #{url}"
285
- regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
286
- regex2 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
291
+ regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
292
+ regex2 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
287
293
  regex2a = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
288
- regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(pg\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
289
- regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
290
-
294
+ regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
295
+ regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
296
+
291
297
  # If the user gives us a path to a Post, "http://facebook.com/LoansByJanet/posts/1691075027771418"
292
298
  # then drop the post part, "/posts/1691075027771418" to get the base url, "http://facebook.com/LoansByJanet/"
293
299
  if mdata = /^(?<base_url>.+)\/posts\/(?<postid>[0-9]+).*$/.match(url)
@@ -332,14 +338,14 @@ module UrlScrubber
332
338
  url = drop_url_query!(url)
333
339
  elsif url.include?("facebook.com/profile.php?id=")
334
340
  # puts "profile.php"
335
- # these were being truncated, they do redirect, but typically a 301 response is generated
341
+ # these were being truncated, they do redirect, but typically a 301 response is generated
336
342
  # so the url is returned unchanged. Better than truncation.
337
343
  url, http_response = check_for_facebook_redirection(url)
338
344
  else
339
345
  # puts "else"
340
346
  url = drop_url_query!(url)
341
347
  end
342
-
348
+
343
349
  # Due to the redirection check, "https" and "www." can be re-introduced
344
350
  url = url.sub(%r{^https?://www.}i, 'http://')
345
351
  url = url.sub(/\?_rdr.*/, '')
@@ -348,7 +354,6 @@ module UrlScrubber
348
354
 
349
355
 
350
356
  def self.sc_linkedin(url)
351
-
352
357
  url.sub!('linkedin.com/companies/', 'linkedin.com/company/')
353
358
  if !!url.match(%r{com/company/})
354
359
  drop_url_query!(url)
@@ -388,10 +393,10 @@ module UrlScrubber
388
393
 
389
394
 
390
395
  def self.sc_flickr(url)
391
- if url.include?('flickr.com/groups/')
396
+ if url.include?('flickr.com/groups/')
392
397
  groups_partition = url.partition('flickr.com/groups/')
393
398
  if !groups_partition.nil? && !groups_partition[2].nil? && groups_partition[2] != ""
394
- extraneous_slash_partition = groups_partition[2].partition('/')
399
+ extraneous_slash_partition = groups_partition[2].partition('/')
395
400
  if !extraneous_slash_partition.nil? && !extraneous_slash_partition[1].nil? && extraneous_slash_partition[1] != ""
396
401
  # need to trim off the sub page stuff
397
402
  return "http://flickr.com/groups/" + extraneous_slash_partition[0]
@@ -408,14 +413,22 @@ module UrlScrubber
408
413
 
409
414
 
410
415
  def self.sc_pinterest(url)
416
+ drop_url_query!(url)
411
417
  url
412
418
  end
413
419
 
414
420
 
415
421
  def self.sc_yelp(url)
422
+ drop_url_query!(url)
416
423
  url
417
424
  end
418
-
425
+
426
+
427
+ def self.sc_generic(url)
428
+ drop_url_query!(url)
429
+ url
430
+ end
431
+
419
432
 
420
433
  def self.check_for_facebook_redirection(uri_str, limit = 5)
421
434
  #puts "check_for_facebook_redirection called! uri=#{uri_str}, limit=#{limit.to_s}"
@@ -434,9 +447,9 @@ module UrlScrubber
434
447
 
435
448
  uri_str_new = uri_str.sub('http://', 'https://')
436
449
  uri_str_new = uri_str_new.sub('https://', 'https://www.') if !uri_str_new.include?("https://www.")
437
-
450
+
438
451
  begin
439
- url = URI.parse(URI.escape(uri_str_new))
452
+ url = URI.parse(URI.escape(uri_str_new))
440
453
  rescue URI::InvalidURIError => e
441
454
  return [uri_str_new, CustomError.new(786, "Invalid URI #{uri_str_new} : #{e.message}") ]
442
455
  end
@@ -444,7 +457,7 @@ module UrlScrubber
444
457
  http = Net::HTTP.new(url.host, url.port)
445
458
  http = Net::HTTP.new(url.host, url.port)
446
459
  http.open_timeout = 7 # only wait up to 7 seconds for the connection to be established
447
- http.read_timeout = 10 # and up to 10 seconds for a response
460
+ http.read_timeout = 10 # and up to 10 seconds for a response
448
461
  if url.port == 443
449
462
  http.use_ssl = true
450
463
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
@@ -454,7 +467,7 @@ module UrlScrubber
454
467
  request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })
455
468
 
456
469
  begin
457
- response = http.request(request)
470
+ response = http.request(request)
458
471
  rescue Timeout::Error
459
472
  #Rails.logger.error("UrlScrubber.check_for_facebook_redirection - http.request Timeout, URL=#{uri_str_new}")
460
473
  failure_response = Net::HTTPClientError.new('1.1', '400', 'Unreachable')
@@ -494,5 +507,5 @@ module UrlScrubber
494
507
  return [uri_str_new, response]
495
508
  end
496
509
  end
497
-
510
+
498
511
  end
@@ -1,3 +1,3 @@
1
1
  module UrlScrubber
2
- VERSION = "0.8.12"
2
+ VERSION = "0.8.13"
3
3
  end
@@ -69,6 +69,10 @@ describe UrlScrubber do
69
69
  it "should transform user statuses into that user's profile" do
70
70
  UrlScrubber.scrub('http://twitter.com/absolutely/statuses/135243243261312').should eq('http://twitter.com/absolutely')
71
71
  end
72
+
73
+ it "should drop the query part of the url" do
74
+ UrlScrubber.scrub('http://twitter.com/novartisuk?lang=en').should eq('http://twitter.com/novartisuk')
75
+ end
72
76
  end
73
77
 
74
78
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_scrubber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.12
4
+ version: 0.8.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Colin Langton
@@ -11,76 +11,76 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2018-05-20 00:00:00.000000000 Z
14
+ date: 2018-11-20 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rspec
18
18
  requirement: !ruby/object:Gem::Requirement
19
19
  requirements:
20
- - - ~>
20
+ - - "~>"
21
21
  - !ruby/object:Gem::Version
22
22
  version: 2.11.0
23
23
  type: :development
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
- - - ~>
27
+ - - "~>"
28
28
  - !ruby/object:Gem::Version
29
29
  version: 2.11.0
30
30
  - !ruby/object:Gem::Dependency
31
31
  name: guard-bundler
32
32
  requirement: !ruby/object:Gem::Requirement
33
33
  requirements:
34
- - - ~>
34
+ - - "~>"
35
35
  - !ruby/object:Gem::Version
36
36
  version: 0.1.3
37
37
  type: :development
38
38
  prerelease: false
39
39
  version_requirements: !ruby/object:Gem::Requirement
40
40
  requirements:
41
- - - ~>
41
+ - - "~>"
42
42
  - !ruby/object:Gem::Version
43
43
  version: 0.1.3
44
44
  - !ruby/object:Gem::Dependency
45
45
  name: guard-rspec
46
46
  requirement: !ruby/object:Gem::Requirement
47
47
  requirements:
48
- - - ~>
48
+ - - "~>"
49
49
  - !ruby/object:Gem::Version
50
50
  version: 0.4.3
51
51
  type: :development
52
52
  prerelease: false
53
53
  version_requirements: !ruby/object:Gem::Requirement
54
54
  requirements:
55
- - - ~>
55
+ - - "~>"
56
56
  - !ruby/object:Gem::Version
57
57
  version: 0.4.3
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: terminal-notifier-guard
60
60
  requirement: !ruby/object:Gem::Requirement
61
61
  requirements:
62
- - - ! '>='
62
+ - - ">="
63
63
  - !ruby/object:Gem::Version
64
64
  version: '0'
65
65
  type: :development
66
66
  prerelease: false
67
67
  version_requirements: !ruby/object:Gem::Requirement
68
68
  requirements:
69
- - - ! '>='
69
+ - - ">="
70
70
  - !ruby/object:Gem::Version
71
71
  version: '0'
72
72
  - !ruby/object:Gem::Dependency
73
73
  name: rb-fsevent
74
74
  requirement: !ruby/object:Gem::Requirement
75
75
  requirements:
76
- - - ~>
76
+ - - "~>"
77
77
  - !ruby/object:Gem::Version
78
78
  version: 0.9.1
79
79
  type: :development
80
80
  prerelease: false
81
81
  version_requirements: !ruby/object:Gem::Requirement
82
82
  requirements:
83
- - - ~>
83
+ - - "~>"
84
84
  - !ruby/object:Gem::Version
85
85
  version: 0.9.1
86
86
  description: Remove extraneous bits from URLs, follow redirects, identify social media
@@ -94,8 +94,8 @@ executables: []
94
94
  extensions: []
95
95
  extra_rdoc_files: []
96
96
  files:
97
- - .gitignore
98
- - .rvmrc
97
+ - ".gitignore"
98
+ - ".rvmrc"
99
99
  - Gemfile
100
100
  - Guardfile
101
101
  - README.md
@@ -114,17 +114,17 @@ require_paths:
114
114
  - lib
115
115
  required_ruby_version: !ruby/object:Gem::Requirement
116
116
  requirements:
117
- - - ! '>='
117
+ - - ">="
118
118
  - !ruby/object:Gem::Version
119
119
  version: '0'
120
120
  required_rubygems_version: !ruby/object:Gem::Requirement
121
121
  requirements:
122
- - - ! '>='
122
+ - - ">="
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
125
  requirements: []
126
126
  rubyforge_project:
127
- rubygems_version: 2.4.8
127
+ rubygems_version: 2.7.7
128
128
  signing_key:
129
129
  specification_version: 4
130
130
  summary: Clean up URLs.