url_scrubber 0.8.12 → 0.8.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- ZmJkNTc2MWQ5NmU0NWNkMTZhMmE1NDZmOTFmMGRjOGE4NDc3ZjJkMA==
5
- data.tar.gz: !binary |-
6
- NmYzMDIzNTE1OGIxZWM3ZTRhZmE0NGE2NzI2YmIzZWIzZDJhNWQ0ZQ==
2
+ SHA256:
3
+ metadata.gz: 5b0e2a469b840847dde026eaaeb18f48938e49080a737ba2f70b53333f889aeb
4
+ data.tar.gz: df9c7c316151a830d000f1adffcf043a95fe46e71822824a9526a9a309522b11
7
5
  SHA512:
8
- metadata.gz: !binary |-
9
- ZWZjMGY0YmVhZDM3OThhZjExNDAzMWUyYzlkYjdlMzg3N2Q0MWM0MGM3ZTkx
10
- MWY2Yzk1Y2FmZTMyODVhNjFlMDg2OWU0ODc5OTQ0N2ViZGEyMTNhZmQyNDU2
11
- NmI2MmE5NzRmMzQ2YWYyY2I1N2IyMTU3ZWQzYzU1ZGM3YWZiMTY=
12
- data.tar.gz: !binary |-
13
- YzBkOTY2YjQ0ZDFiOGViNGEzNDMxYzM2YmVhYWZkMWFkMWZkOTg3ZGY4YTFj
14
- NWIyZmI0N2E3ZDZlZGQ4OWZlYThiMzhlMWFiNWZkOTBhMzYxZTdmMjkxM2U0
15
- ZjZkNDAxZjU2NDY3OGUwZjk2MDQxN2IyZjE0YzRiYjUwMTRmYmU=
6
+ metadata.gz: 0c45b738609ad89ffbf7e69cb46378710fe7fdb69f88abcb61df71e112a02194493eb1175650a446d81a51601ee36f9acabf8d5f68853f561ead495025eb9e3f
7
+ data.tar.gz: 8a293e4f32b7cf355bd3a2e8570ac1b5e5569b116ab92e0b98c6eeff92d028ac25952c8df4d8a49a1d0f829919b9fdc716fbade35063922f3157c1f26710b45b
data/lib/url_scrubber.rb CHANGED
@@ -9,9 +9,9 @@ module UrlScrubber
9
9
  def self.scrub(url)
10
10
  return url if url.blank?
11
11
  return url if /^app:\/\//.match(url) # Do not scrub app-only URLs
12
-
12
+
13
13
  url = url.clone # don't modify the original argument
14
-
14
+
15
15
  m = url.match(/(htt?ps?:\/\/\S*)/i)
16
16
  return nil unless m
17
17
 
@@ -25,6 +25,7 @@ module UrlScrubber
25
25
  url = downcase_domain(url)
26
26
  remove_subdomain!(url)
27
27
  remove_html_tags!(url)
28
+ # CHANGED: we depend on the special-case methods to decide if and when to drop the query string part of the URL
28
29
  url = drop_anchor!(special_cases(url))
29
30
  url.sub!(/,+$/, "") # remove one or more trailing commas at the end of the URL
30
31
  url.gsub!(/\/+$/, '') # remove any trailing slashes (/) in the resulting URL
@@ -33,16 +34,16 @@ module UrlScrubber
33
34
 
34
35
 
35
36
  def self.service_of(url)
36
-
37
+
37
38
  domain_match = url.match(%r{https?://([^/]+)})
38
-
39
+
39
40
  if domain_match
40
41
  domain = domain_match[1]
41
42
  first_dot = domain.index(".")
42
-
43
+
43
44
  #first_dot_position = domain.index(".")
44
45
  #first_dot_position += 1 if first_dot_position
45
-
46
+
46
47
  #Rails.logger.debug "domain = #{domain}, first dot = #{first_dot ? first_dot : 'none'}, first dot 1= #{first_dot ? domain[first_dot+1..domain.size] : 'NIL'}"
47
48
  if first_dot
48
49
  # tumblr is a unique format
@@ -69,7 +70,7 @@ module UrlScrubber
69
70
  end
70
71
 
71
72
  :other
72
- end
73
+ end
73
74
 
74
75
 
75
76
  def self.ideal_form?(url)
@@ -114,14 +115,14 @@ module UrlScrubber
114
115
  return url.include?('http://linkedin.com/company/')
115
116
  end
116
117
 
117
-
118
+
118
119
  def self.linkedin_personal_url?(url)
119
120
  url = scrub(url)
120
121
  return false unless url
121
- return url.include?('http://linkedin.com/in/') || url.include?('http://linkedin.com/pub/')
122
+ return url.include?('http://linkedin.com/in/') || url.include?('http://linkedin.com/pub/')
122
123
  end
123
124
 
124
-
125
+
125
126
  def self.find_identity_from_url(url)
126
127
  UrlScrubber.scrub(url).split("/").last unless url.nil?
127
128
  end
@@ -129,7 +130,7 @@ module UrlScrubber
129
130
 
130
131
  def self.find_linkedin_identity_from_url(url)
131
132
  return nil if url.nil?
132
- scrubbed_url = scrub(url)
133
+ scrubbed_url = scrub(url)
133
134
  if scrubbed_url && linkedin_company_url?(scrubbed_url)
134
135
  scrubbed_url.split("/").last
135
136
  elsif scrubbed_url && scrubbed_url.include?('http://linkedin.com/in/')
@@ -173,7 +174,7 @@ module UrlScrubber
173
174
  public_url
174
175
  end
175
176
 
176
-
177
+
177
178
  ################################################################################
178
179
  private
179
180
  ################################################################################
@@ -197,6 +198,8 @@ module UrlScrubber
197
198
  when :pinterest then return sc_pinterest(url)
198
199
  when :vimeo then return sc_vimeo(url)
199
200
  when :yelp then return sc_yelp(url)
201
+ else
202
+ sc_generic(url)
200
203
  end
201
204
 
202
205
  url
@@ -225,7 +228,7 @@ module UrlScrubber
225
228
  url
226
229
  end
227
230
 
228
-
231
+
229
232
  def self.drop_url_query!(url)
230
233
  url.sub!(/\?.*$/, '')
231
234
  url
@@ -244,15 +247,16 @@ module UrlScrubber
244
247
  # which is not a separate channel with its own customUrl.
245
248
  # url.sub!('youtube.com/user/', 'youtube.com/')
246
249
  url.sub!('youtube.com/profile?user=', 'youtube.com/')
250
+ drop_url_query!(url)
247
251
  url
248
252
  end
249
253
 
250
254
 
251
255
  def self.sc_vimeo(url)
252
- if url.include?('vimeo.com/groups/')
256
+ if url.include?('vimeo.com/groups/')
253
257
  groups_partition = url.partition('vimeo.com/groups/')
254
258
  if !groups_partition.nil? && !groups_partition[2].nil? && groups_partition[2] != ""
255
- extraneous_slash_partition = groups_partition[2].partition('/')
259
+ extraneous_slash_partition = groups_partition[2].partition('/')
256
260
  if !extraneous_slash_partition.nil? && !extraneous_slash_partition[1].nil? && extraneous_slash_partition[1] != ""
257
261
  # need to trim off the sub page stuff
258
262
  return "http://vimeo.com/groups/" + extraneous_slash_partition[0]
@@ -276,18 +280,20 @@ module UrlScrubber
276
280
  url = "http://twitter.com/#{search_match[1]}"
277
281
  end
278
282
 
283
+ url = drop_url_query!(url)
284
+
279
285
  url
280
286
  end
281
287
 
282
288
 
283
289
  def self.sc_facebook(url)
284
290
  #puts "sc_facebook: #{url}"
285
- regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
286
- regex2 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
291
+ regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
292
+ regex2 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
287
293
  regex2a = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
288
- regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(pg\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
289
- regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
290
-
294
+ regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
295
+ regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
296
+
291
297
  # If the user gives us a path to a Post, "http://facebook.com/LoansByJanet/posts/1691075027771418"
292
298
  # then drop the post part, "/posts/1691075027771418" to get the base url, "http://facebook.com/LoansByJanet/"
293
299
  if mdata = /^(?<base_url>.+)\/posts\/(?<postid>[0-9]+).*$/.match(url)
@@ -332,14 +338,14 @@ module UrlScrubber
332
338
  url = drop_url_query!(url)
333
339
  elsif url.include?("facebook.com/profile.php?id=")
334
340
  # puts "profile.php"
335
- # these were being truncated, they do redirect, but typically a 301 response is generated
341
+ # these were being truncated, they do redirect, but typically a 301 response is generated
336
342
  # so the url is returned unchanged. Better than truncation.
337
343
  url, http_response = check_for_facebook_redirection(url)
338
344
  else
339
345
  # puts "else"
340
346
  url = drop_url_query!(url)
341
347
  end
342
-
348
+
343
349
  # Due to the redirection check, "https" and "www." can be re-introduced
344
350
  url = url.sub(%r{^https?://www.}i, 'http://')
345
351
  url = url.sub(/\?_rdr.*/, '')
@@ -348,7 +354,6 @@ module UrlScrubber
348
354
 
349
355
 
350
356
  def self.sc_linkedin(url)
351
-
352
357
  url.sub!('linkedin.com/companies/', 'linkedin.com/company/')
353
358
  if !!url.match(%r{com/company/})
354
359
  drop_url_query!(url)
@@ -388,10 +393,10 @@ module UrlScrubber
388
393
 
389
394
 
390
395
  def self.sc_flickr(url)
391
- if url.include?('flickr.com/groups/')
396
+ if url.include?('flickr.com/groups/')
392
397
  groups_partition = url.partition('flickr.com/groups/')
393
398
  if !groups_partition.nil? && !groups_partition[2].nil? && groups_partition[2] != ""
394
- extraneous_slash_partition = groups_partition[2].partition('/')
399
+ extraneous_slash_partition = groups_partition[2].partition('/')
395
400
  if !extraneous_slash_partition.nil? && !extraneous_slash_partition[1].nil? && extraneous_slash_partition[1] != ""
396
401
  # need to trim off the sub page stuff
397
402
  return "http://flickr.com/groups/" + extraneous_slash_partition[0]
@@ -408,14 +413,22 @@ module UrlScrubber
408
413
 
409
414
 
410
415
  def self.sc_pinterest(url)
416
+ drop_url_query!(url)
411
417
  url
412
418
  end
413
419
 
414
420
 
415
421
  def self.sc_yelp(url)
422
+ drop_url_query!(url)
416
423
  url
417
424
  end
418
-
425
+
426
+
427
+ def self.sc_generic(url)
428
+ drop_url_query!(url)
429
+ url
430
+ end
431
+
419
432
 
420
433
  def self.check_for_facebook_redirection(uri_str, limit = 5)
421
434
  #puts "check_for_facebook_redirection called! uri=#{uri_str}, limit=#{limit.to_s}"
@@ -434,9 +447,9 @@ module UrlScrubber
434
447
 
435
448
  uri_str_new = uri_str.sub('http://', 'https://')
436
449
  uri_str_new = uri_str_new.sub('https://', 'https://www.') if !uri_str_new.include?("https://www.")
437
-
450
+
438
451
  begin
439
- url = URI.parse(URI.escape(uri_str_new))
452
+ url = URI.parse(URI.escape(uri_str_new))
440
453
  rescue URI::InvalidURIError => e
441
454
  return [uri_str_new, CustomError.new(786, "Invalid URI #{uri_str_new} : #{e.message}") ]
442
455
  end
@@ -444,7 +457,7 @@ module UrlScrubber
444
457
  http = Net::HTTP.new(url.host, url.port)
445
458
  http = Net::HTTP.new(url.host, url.port)
446
459
  http.open_timeout = 7 # only wait up to 7 seconds for the connection to be established
447
- http.read_timeout = 10 # and up to 10 seconds for a response
460
+ http.read_timeout = 10 # and up to 10 seconds for a response
448
461
  if url.port == 443
449
462
  http.use_ssl = true
450
463
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
@@ -454,7 +467,7 @@ module UrlScrubber
454
467
  request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })
455
468
 
456
469
  begin
457
- response = http.request(request)
470
+ response = http.request(request)
458
471
  rescue Timeout::Error
459
472
  #Rails.logger.error("UrlScrubber.check_for_facebook_redirection - http.request Timeout, URL=#{uri_str_new}")
460
473
  failure_response = Net::HTTPClientError.new('1.1', '400', 'Unreachable')
@@ -494,5 +507,5 @@ module UrlScrubber
494
507
  return [uri_str_new, response]
495
508
  end
496
509
  end
497
-
510
+
498
511
  end
@@ -1,3 +1,3 @@
1
1
  module UrlScrubber
2
- VERSION = "0.8.12"
2
+ VERSION = "0.8.13"
3
3
  end
@@ -69,6 +69,10 @@ describe UrlScrubber do
69
69
  it "should transform user statuses into that user's profile" do
70
70
  UrlScrubber.scrub('http://twitter.com/absolutely/statuses/135243243261312').should eq('http://twitter.com/absolutely')
71
71
  end
72
+
73
+ it "should drop the query part of the url" do
74
+ UrlScrubber.scrub('http://twitter.com/novartisuk?lang=en').should eq('http://twitter.com/novartisuk')
75
+ end
72
76
  end
73
77
 
74
78
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_scrubber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.12
4
+ version: 0.8.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Colin Langton
@@ -11,76 +11,76 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2018-05-20 00:00:00.000000000 Z
14
+ date: 2018-11-20 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rspec
18
18
  requirement: !ruby/object:Gem::Requirement
19
19
  requirements:
20
- - - ~>
20
+ - - "~>"
21
21
  - !ruby/object:Gem::Version
22
22
  version: 2.11.0
23
23
  type: :development
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
- - - ~>
27
+ - - "~>"
28
28
  - !ruby/object:Gem::Version
29
29
  version: 2.11.0
30
30
  - !ruby/object:Gem::Dependency
31
31
  name: guard-bundler
32
32
  requirement: !ruby/object:Gem::Requirement
33
33
  requirements:
34
- - - ~>
34
+ - - "~>"
35
35
  - !ruby/object:Gem::Version
36
36
  version: 0.1.3
37
37
  type: :development
38
38
  prerelease: false
39
39
  version_requirements: !ruby/object:Gem::Requirement
40
40
  requirements:
41
- - - ~>
41
+ - - "~>"
42
42
  - !ruby/object:Gem::Version
43
43
  version: 0.1.3
44
44
  - !ruby/object:Gem::Dependency
45
45
  name: guard-rspec
46
46
  requirement: !ruby/object:Gem::Requirement
47
47
  requirements:
48
- - - ~>
48
+ - - "~>"
49
49
  - !ruby/object:Gem::Version
50
50
  version: 0.4.3
51
51
  type: :development
52
52
  prerelease: false
53
53
  version_requirements: !ruby/object:Gem::Requirement
54
54
  requirements:
55
- - - ~>
55
+ - - "~>"
56
56
  - !ruby/object:Gem::Version
57
57
  version: 0.4.3
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: terminal-notifier-guard
60
60
  requirement: !ruby/object:Gem::Requirement
61
61
  requirements:
62
- - - ! '>='
62
+ - - ">="
63
63
  - !ruby/object:Gem::Version
64
64
  version: '0'
65
65
  type: :development
66
66
  prerelease: false
67
67
  version_requirements: !ruby/object:Gem::Requirement
68
68
  requirements:
69
- - - ! '>='
69
+ - - ">="
70
70
  - !ruby/object:Gem::Version
71
71
  version: '0'
72
72
  - !ruby/object:Gem::Dependency
73
73
  name: rb-fsevent
74
74
  requirement: !ruby/object:Gem::Requirement
75
75
  requirements:
76
- - - ~>
76
+ - - "~>"
77
77
  - !ruby/object:Gem::Version
78
78
  version: 0.9.1
79
79
  type: :development
80
80
  prerelease: false
81
81
  version_requirements: !ruby/object:Gem::Requirement
82
82
  requirements:
83
- - - ~>
83
+ - - "~>"
84
84
  - !ruby/object:Gem::Version
85
85
  version: 0.9.1
86
86
  description: Remove extraneous bits from URLs, follow redirects, identify social media
@@ -94,8 +94,8 @@ executables: []
94
94
  extensions: []
95
95
  extra_rdoc_files: []
96
96
  files:
97
- - .gitignore
98
- - .rvmrc
97
+ - ".gitignore"
98
+ - ".rvmrc"
99
99
  - Gemfile
100
100
  - Guardfile
101
101
  - README.md
@@ -114,17 +114,17 @@ require_paths:
114
114
  - lib
115
115
  required_ruby_version: !ruby/object:Gem::Requirement
116
116
  requirements:
117
- - - ! '>='
117
+ - - ">="
118
118
  - !ruby/object:Gem::Version
119
119
  version: '0'
120
120
  required_rubygems_version: !ruby/object:Gem::Requirement
121
121
  requirements:
122
- - - ! '>='
122
+ - - ">="
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
125
  requirements: []
126
126
  rubyforge_project:
127
- rubygems_version: 2.4.8
127
+ rubygems_version: 2.7.7
128
128
  signing_key:
129
129
  specification_version: 4
130
130
  summary: Clean up URLs.