url_scrubber 0.8.13 → 0.8.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5b0e2a469b840847dde026eaaeb18f48938e49080a737ba2f70b53333f889aeb
4
- data.tar.gz: df9c7c316151a830d000f1adffcf043a95fe46e71822824a9526a9a309522b11
3
+ metadata.gz: 608b40e9de605ac987e39f8fa5b1640d6543c8dc8a83553f2689f2fbe716b50a
4
+ data.tar.gz: d6d1c7905a4875ef9fb6f2ce7ab03f18ff31e524b180c3e13bd10467513c79b8
5
5
  SHA512:
6
- metadata.gz: 0c45b738609ad89ffbf7e69cb46378710fe7fdb69f88abcb61df71e112a02194493eb1175650a446d81a51601ee36f9acabf8d5f68853f561ead495025eb9e3f
7
- data.tar.gz: 8a293e4f32b7cf355bd3a2e8570ac1b5e5569b116ab92e0b98c6eeff92d028ac25952c8df4d8a49a1d0f829919b9fdc716fbade35063922f3157c1f26710b45b
6
+ metadata.gz: f6f43dd74cec24acd3f7e2a376b89476f00eb09e627a96c55ba5ec0e97c7ccc994385858183c13230a98066d2b687a462225a3347f6aa27d36f8a668eaef087c
7
+ data.tar.gz: 395a1561434cc85197f10211393cfdaabea3e35fb50a2559f0c0c3dd669aea3cd13b8e68192a4eda105a2abce1c7476bde167ad00c8704d0e62052ce92ca94ba
data/lib/url_scrubber/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module UrlScrubber
2
- VERSION = "0.8.13"
2
+ VERSION = "0.8.15"
3
3
  end
data/lib/url_scrubber.rb CHANGED
@@ -34,39 +34,30 @@ module UrlScrubber
34
34
 
35
35
 
36
36
  def self.service_of(url)
37
-
38
- domain_match = url.match(%r{https?://([^/]+)})
39
-
40
- if domain_match
41
- domain = domain_match[1]
42
- first_dot = domain.index(".")
43
-
44
- #first_dot_position = domain.index(".")
45
- #first_dot_position += 1 if first_dot_position
46
-
47
- #Rails.logger.debug "domain = #{domain}, first dot = #{first_dot ? first_dot : 'none'}, first dot 1= #{first_dot ? domain[first_dot+1..domain.size] : 'NIL'}"
48
- if first_dot
49
- # tumblr is a unique format
50
- return :tumblr if domain[first_dot+1..domain.size].index("tumblr.com") == 0
37
+ url_parts = Domainatrix.parse(url)
38
+ if url_parts.host.present?
39
+
40
+ case url_parts.domain
41
+ when 'facebook' then return :facebook
42
+ when 'fb' then return :facebook
43
+ when 'flickr' then return :flickr
44
+ when 'instagram' then return :instagram
45
+ when 'linkedin' then return :linkedin
46
+ when 'pinterest' then return :pinterest
47
+ when 'slideshare' then return :slideshare
48
+ when 'tumblr' then return :tumblr
49
+ when 'twitter' then return :twitter
50
+ when 'vimeo' then return :vimeo
51
+ when 'yelp' then return :yelp
52
+ when 'youtube' then return :youtube
51
53
  end
52
54
 
53
- case domain
54
- when /\byoutube\.com$/ then return :youtube
55
- when /\btwitter\.com$/ then return :twitter
56
- when /\bfacebook\.com$/ then return :facebook
57
- when /\bbusiness.facebook\.com$/ then return :facebook
58
- when /\blinkedin\.com$/ then return :linkedin
59
- when /\bplus\.google\.com$/ then return :google
60
- when /\bbusiness\.google\.com$/ then return :google
61
- when /\bslideshare\.net$/ then return :slideshare
62
- when /\bflickr\.com$/ then return :flickr
63
- when /\bpinterest\.com$/ then return :pinterest
64
- when /\bvimeo\.com$/ then return :vimeo
65
- when /\binstagram\.com$/ then return :instagram
66
- when /\byelp\.com$/ then return :yelp
55
+ case url_parts.host
56
+ when /\bplus\.google\.com$/ then return :google
67
57
  end
58
+
68
59
  else
69
- Rails.logger.debug "No Domain Match"
60
+ Rails.logger.debug "No Domain Match"
70
61
  end
71
62
 
72
63
  :other
@@ -286,6 +277,7 @@ module UrlScrubber
286
277
  end
287
278
 
288
279
 
280
+ # TODO This needs to be rewritten to be independent of the Facebook domain and public suffix used: e.g. facebook.com vs fb.com vs. fb.me
289
281
  def self.sc_facebook(url)
290
282
  #puts "sc_facebook: #{url}"
291
283
  regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
@@ -293,6 +285,7 @@ module UrlScrubber
293
285
  regex2a = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
294
286
  regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
295
287
  regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
288
+ regex5 = /^(?<url>(https?:\/\/)((business|www)\.)?facebook\.com\/(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
296
289
 
297
290
  # If the user gives us a path to a Post, "http://facebook.com/LoansByJanet/posts/1691075027771418"
298
291
  # then drop the post part, "/posts/1691075027771418" to get the base url, "http://facebook.com/LoansByJanet/"
@@ -306,32 +299,31 @@ module UrlScrubber
306
299
  elsif url.include?('facebook.com/groups/')
307
300
  url = drop_url_query!(url)
308
301
  elsif mdata = url.match(regex1)
309
- # puts "regex1"
310
302
  # "http://facebook.com/pages/Command-Canada/1434248516885065/timeline"
311
303
  url = mdata[:url]
312
304
  uname = mdata[:uname]
313
305
  uid = mdata[:uid]
314
306
  elsif mdata = url.match(regex2)
315
- # puts "regex2"
316
307
  # "https://www.facebook.com/profile.php?id=100009574328879"
317
308
  url, http_response = check_for_facebook_redirection(mdata[:url])
318
309
  uid = mdata[:uid]
319
310
  elsif mdata = url.match(regex2a)
320
- # puts "regex2a"
321
311
  # "https://www.facebook.com/profile.php?_rdr=p&id=100009574328879"
322
312
  url = "http://facebook.com/profile.php?id=" + mdata[:uid]
323
313
  url, http_response = check_for_facebook_redirection(url)
324
314
  uid = mdata[:uid]
325
315
  elsif mdata = url.match(regex4)
326
- # puts "#{url} - #{mdata[:uname]}"
327
316
  # "http://facebook.com/home.php?#!/person.name"
328
317
  url = mdata[:url] + mdata[:uname]
329
318
  url = drop_url_query!(url)
319
+ elsif mdata = url.match(regex5)
320
+ # "https://www.facebook.com/100009574328879"
321
+ url = "http://facebook.com/" + mdata[:uid]
322
+ uid = mdata[:uid]
330
323
  elsif mdata = url.match(regex3)
331
- # puts "regex3"
332
324
  # "http://facebook.com/TonyMollHomeLoans/timeline"
333
325
  # "http://facebook.com/pg/TonyMollHomeLoans/timeline"
334
- if ["pages", "pg"].exclude?(mdata[:uname])
326
+ if ["page", "pages", "pg"].exclude?(mdata[:uname])
335
327
  url = "http://facebook.com/" + mdata[:uname]
336
328
  uname = mdata[:uname]
337
329
  end
@@ -353,6 +345,7 @@ module UrlScrubber
353
345
  end
354
346
 
355
347
 
348
+ # TODO This needs to be rewritten to be independent of the LinkedIn domain and public suffix used: e.g. linkedin.com vs lnkd.in vs linkedin.ca
356
349
  def self.sc_linkedin(url)
357
350
  url.sub!('linkedin.com/companies/', 'linkedin.com/company/')
358
351
  if !!url.match(%r{com/company/})
data/url_scrubber.gemspec CHANGED
@@ -2,8 +2,8 @@
2
2
  require File.expand_path('../lib/url_scrubber/version', __FILE__)
3
3
 
4
4
  Gem::Specification.new do |gem|
5
- gem.authors = ["Colin Langton", "Christopher Maujean", "David Hillard", "Edgar Abadines"]
6
- gem.email = ["colin@hoteldelta.net", "cmaujean@brandle.net", "dhillard@brandle.net", "ed@brandle.net"]
5
+ gem.authors = ["Colin Langton", "Christopher Maujean", "David Hillard", "Edgar Abadines", "Chip Roberson"]
6
+ gem.email = ["colin@hoteldelta.net", "cmaujean@brandle.net", "dhillard@brandle.net", "ed@brandle.net", "chip@brandle.net"]
7
7
  gem.description = %q{Remove extraneous bits from URLs, follow redirects, identify social media urls, etc.}
8
8
  gem.summary = %q{Clean up URLs.}
9
9
  gem.homepage = "http://brandle.net"
metadata CHANGED
@@ -1,17 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_scrubber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.13
4
+ version: 0.8.15
5
5
  platform: ruby
6
6
  authors:
7
7
  - Colin Langton
8
8
  - Christopher Maujean
9
9
  - David Hillard
10
10
  - Edgar Abadines
11
+ - Chip Roberson
11
12
  autorequire:
12
13
  bindir: bin
13
14
  cert_chain: []
14
- date: 2018-11-20 00:00:00.000000000 Z
15
+ date: 2018-11-22 00:00:00.000000000 Z
15
16
  dependencies:
16
17
  - !ruby/object:Gem::Dependency
17
18
  name: rspec
@@ -90,6 +91,7 @@ email:
90
91
  - cmaujean@brandle.net
91
92
  - dhillard@brandle.net
92
93
  - ed@brandle.net
94
+ - chip@brandle.net
93
95
  executables: []
94
96
  extensions: []
95
97
  extra_rdoc_files: []