url_scrubber 0.8.13 → 0.8.15

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5b0e2a469b840847dde026eaaeb18f48938e49080a737ba2f70b53333f889aeb
4
- data.tar.gz: df9c7c316151a830d000f1adffcf043a95fe46e71822824a9526a9a309522b11
3
+ metadata.gz: 608b40e9de605ac987e39f8fa5b1640d6543c8dc8a83553f2689f2fbe716b50a
4
+ data.tar.gz: d6d1c7905a4875ef9fb6f2ce7ab03f18ff31e524b180c3e13bd10467513c79b8
5
5
  SHA512:
6
- metadata.gz: 0c45b738609ad89ffbf7e69cb46378710fe7fdb69f88abcb61df71e112a02194493eb1175650a446d81a51601ee36f9acabf8d5f68853f561ead495025eb9e3f
7
- data.tar.gz: 8a293e4f32b7cf355bd3a2e8570ac1b5e5569b116ab92e0b98c6eeff92d028ac25952c8df4d8a49a1d0f829919b9fdc716fbade35063922f3157c1f26710b45b
6
+ metadata.gz: f6f43dd74cec24acd3f7e2a376b89476f00eb09e627a96c55ba5ec0e97c7ccc994385858183c13230a98066d2b687a462225a3347f6aa27d36f8a668eaef087c
7
+ data.tar.gz: 395a1561434cc85197f10211393cfdaabea3e35fb50a2559f0c0c3dd669aea3cd13b8e68192a4eda105a2abce1c7476bde167ad00c8704d0e62052ce92ca94ba
@@ -1,3 +1,3 @@
1
1
  module UrlScrubber
2
- VERSION = "0.8.13"
2
+ VERSION = "0.8.15"
3
3
  end
data/lib/url_scrubber.rb CHANGED
@@ -34,39 +34,30 @@ module UrlScrubber
34
34
 
35
35
 
36
36
  def self.service_of(url)
37
-
38
- domain_match = url.match(%r{https?://([^/]+)})
39
-
40
- if domain_match
41
- domain = domain_match[1]
42
- first_dot = domain.index(".")
43
-
44
- #first_dot_position = domain.index(".")
45
- #first_dot_position += 1 if first_dot_position
46
-
47
- #Rails.logger.debug "domain = #{domain}, first dot = #{first_dot ? first_dot : 'none'}, first dot 1= #{first_dot ? domain[first_dot+1..domain.size] : 'NIL'}"
48
- if first_dot
49
- # tumblr is a unique format
50
- return :tumblr if domain[first_dot+1..domain.size].index("tumblr.com") == 0
37
+ url_parts = Domainatrix.parse(url)
38
+ if url_parts.host.present?
39
+
40
+ case url_parts.domain
41
+ when 'facebook' then return :facebook
42
+ when 'fb' then return :facebook
43
+ when 'flickr' then return :flickr
44
+ when 'instagram' then return :instagram
45
+ when 'linkedin' then return :linkedin
46
+ when 'pinterest' then return :pinterest
47
+ when 'slideshare' then return :slideshare
48
+ when 'tumblr' then return :tumblr
49
+ when 'twitter' then return :twitter
50
+ when 'vimeo' then return :vimeo
51
+ when 'yelp' then return :yelp
52
+ when 'youtube' then return :youtube
51
53
  end
52
54
 
53
- case domain
54
- when /\byoutube\.com$/ then return :youtube
55
- when /\btwitter\.com$/ then return :twitter
56
- when /\bfacebook\.com$/ then return :facebook
57
- when /\bbusiness.facebook\.com$/ then return :facebook
58
- when /\blinkedin\.com$/ then return :linkedin
59
- when /\bplus\.google\.com$/ then return :google
60
- when /\bbusiness\.google\.com$/ then return :google
61
- when /\bslideshare\.net$/ then return :slideshare
62
- when /\bflickr\.com$/ then return :flickr
63
- when /\bpinterest\.com$/ then return :pinterest
64
- when /\bvimeo\.com$/ then return :vimeo
65
- when /\binstagram\.com$/ then return :instagram
66
- when /\byelp\.com$/ then return :yelp
55
+ case url_parts.host
56
+ when /\bplus\.google\.com$/ then return :google
67
57
  end
58
+
68
59
  else
69
- Rails.logger.debug "No Domain Match"
60
+ Rails.logger.debug "No Domain Match"
70
61
  end
71
62
 
72
63
  :other
@@ -286,6 +277,7 @@ module UrlScrubber
286
277
  end
287
278
 
288
279
 
280
+ # TODO: Rewrite this to be independent of the Facebook domain and public suffix used, e.g. facebook.com vs. fb.com vs. fb.me
289
281
  def self.sc_facebook(url)
290
282
  #puts "sc_facebook: #{url}"
291
283
  regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
@@ -293,6 +285,7 @@ module UrlScrubber
293
285
  regex2a = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
294
286
  regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
295
287
  regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
288
+ regex5 = /^(?<url>(https?:\/\/)((business|www)\.)?facebook\.com\/(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
296
289
 
297
290
  # If the user gives us a path to a Post, "http://facebook.com/LoansByJanet/posts/1691075027771418"
298
291
  # then drop the post part, "/posts/1691075027771418" to get the base url, "http://facebook.com/LoansByJanet/"
@@ -306,32 +299,31 @@ module UrlScrubber
306
299
  elsif url.include?('facebook.com/groups/')
307
300
  url = drop_url_query!(url)
308
301
  elsif mdata = url.match(regex1)
309
- # puts "regex1"
310
302
  # "http://facebook.com/pages/Command-Canada/1434248516885065/timeline"
311
303
  url = mdata[:url]
312
304
  uname = mdata[:uname]
313
305
  uid = mdata[:uid]
314
306
  elsif mdata = url.match(regex2)
315
- # puts "regex2"
316
307
  # "https://www.facebook.com/profile.php?id=100009574328879"
317
308
  url, http_response = check_for_facebook_redirection(mdata[:url])
318
309
  uid = mdata[:uid]
319
310
  elsif mdata = url.match(regex2a)
320
- # puts "regex2a"
321
311
  # "https://www.facebook.com/profile.php?_rdr=p&id=100009574328879"
322
312
  url = "http://facebook.com/profile.php?id=" + mdata[:uid]
323
313
  url, http_response = check_for_facebook_redirection(url)
324
314
  uid = mdata[:uid]
325
315
  elsif mdata = url.match(regex4)
326
- # puts "#{url} - #{mdata[:uname]}"
327
316
  # "http://facebook.com/home.php?#!/person.name"
328
317
  url = mdata[:url] + mdata[:uname]
329
318
  url = drop_url_query!(url)
319
+ elsif mdata = url.match(regex5)
320
+ # "https://www.facebook.com/100009574328879"
321
+ url = "http://facebook.com/" + mdata[:uid]
322
+ uid = mdata[:uid]
330
323
  elsif mdata = url.match(regex3)
331
- # puts "regex3"
332
324
  # "http://facebook.com/TonyMollHomeLoans/timeline"
333
325
  # "http://facebook.com/pg/TonyMollHomeLoans/timeline"
334
- if ["pages", "pg"].exclude?(mdata[:uname])
326
+ if ["page", "pages", "pg"].exclude?(mdata[:uname])
335
327
  url = "http://facebook.com/" + mdata[:uname]
336
328
  uname = mdata[:uname]
337
329
  end
@@ -353,6 +345,7 @@ module UrlScrubber
353
345
  end
354
346
 
355
347
 
348
+ # TODO: Rewrite this to be independent of the LinkedIn domain and public suffix used, e.g. linkedin.com vs. lnkd.in vs. linkedin.ca
356
349
  def self.sc_linkedin(url)
357
350
  url.sub!('linkedin.com/companies/', 'linkedin.com/company/')
358
351
  if !!url.match(%r{com/company/})
data/url_scrubber.gemspec CHANGED
@@ -2,8 +2,8 @@
2
2
  require File.expand_path('../lib/url_scrubber/version', __FILE__)
3
3
 
4
4
  Gem::Specification.new do |gem|
5
- gem.authors = ["Colin Langton", "Christopher Maujean", "David Hillard", "Edgar Abadines"]
6
- gem.email = ["colin@hoteldelta.net", "cmaujean@brandle.net", "dhillard@brandle.net", "ed@brandle.net"]
5
+ gem.authors = ["Colin Langton", "Christopher Maujean", "David Hillard", "Edgar Abadines", "Chip Roberson"]
6
+ gem.email = ["colin@hoteldelta.net", "cmaujean@brandle.net", "dhillard@brandle.net", "ed@brandle.net", "chip@brandle.net"]
7
7
  gem.description = %q{Remove extraneous bits from URLs, follow redirects, identify social media urls, etc.}
8
8
  gem.summary = %q{Clean up URLs.}
9
9
  gem.homepage = "http://brandle.net"
metadata CHANGED
@@ -1,17 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_scrubber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.13
4
+ version: 0.8.15
5
5
  platform: ruby
6
6
  authors:
7
7
  - Colin Langton
8
8
  - Christopher Maujean
9
9
  - David Hillard
10
10
  - Edgar Abadines
11
+ - Chip Roberson
11
12
  autorequire:
12
13
  bindir: bin
13
14
  cert_chain: []
14
- date: 2018-11-20 00:00:00.000000000 Z
15
+ date: 2018-11-22 00:00:00.000000000 Z
15
16
  dependencies:
16
17
  - !ruby/object:Gem::Dependency
17
18
  name: rspec
@@ -90,6 +91,7 @@ email:
90
91
  - cmaujean@brandle.net
91
92
  - dhillard@brandle.net
92
93
  - ed@brandle.net
94
+ - chip@brandle.net
93
95
  executables: []
94
96
  extensions: []
95
97
  extra_rdoc_files: []