url_scrubber 0.8.13 → 0.8.15
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/url_scrubber/version.rb +1 -1
- data/lib/url_scrubber.rb +28 -35
- data/url_scrubber.gemspec +2 -2
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 608b40e9de605ac987e39f8fa5b1640d6543c8dc8a83553f2689f2fbe716b50a
|
4
|
+
data.tar.gz: d6d1c7905a4875ef9fb6f2ce7ab03f18ff31e524b180c3e13bd10467513c79b8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f6f43dd74cec24acd3f7e2a376b89476f00eb09e627a96c55ba5ec0e97c7ccc994385858183c13230a98066d2b687a462225a3347f6aa27d36f8a668eaef087c
|
7
|
+
data.tar.gz: 395a1561434cc85197f10211393cfdaabea3e35fb50a2559f0c0c3dd669aea3cd13b8e68192a4eda105a2abce1c7476bde167ad00c8704d0e62052ce92ca94ba
|
data/lib/url_scrubber/version.rb
CHANGED
data/lib/url_scrubber.rb
CHANGED
@@ -34,39 +34,30 @@ module UrlScrubber
|
|
34
34
|
|
35
35
|
|
36
36
|
def self.service_of(url)
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
37
|
+
url_parts = Domainatrix.parse(url)
|
38
|
+
if url_parts.host.present?
|
39
|
+
|
40
|
+
case url_parts.domain
|
41
|
+
when 'facebook' then return :facebook
|
42
|
+
when 'fb' then return :facebook
|
43
|
+
when 'flickr' then return :flickr
|
44
|
+
when 'instagram' then return :instagram
|
45
|
+
when 'linkedin' then return :linkedin
|
46
|
+
when 'pinterest' then return :pinterest
|
47
|
+
when 'slideshare' then return :slideshare
|
48
|
+
when 'tumblr' then return :tumblr
|
49
|
+
when 'twitter' then return :twitter
|
50
|
+
when 'vimeo' then return :vimeo
|
51
|
+
when 'yelp' then return :yelp
|
52
|
+
when 'youtube' then return :youtube
|
51
53
|
end
|
52
54
|
|
53
|
-
case
|
54
|
-
when /\
|
55
|
-
when /\btwitter\.com$/ then return :twitter
|
56
|
-
when /\bfacebook\.com$/ then return :facebook
|
57
|
-
when /\bbusiness.facebook\.com$/ then return :facebook
|
58
|
-
when /\blinkedin\.com$/ then return :linkedin
|
59
|
-
when /\bplus\.google\.com$/ then return :google
|
60
|
-
when /\bbusiness\.google\.com$/ then return :google
|
61
|
-
when /\bslideshare\.net$/ then return :slideshare
|
62
|
-
when /\bflickr\.com$/ then return :flickr
|
63
|
-
when /\bpinterest\.com$/ then return :pinterest
|
64
|
-
when /\bvimeo\.com$/ then return :vimeo
|
65
|
-
when /\binstagram\.com$/ then return :instagram
|
66
|
-
when /\byelp\.com$/ then return :yelp
|
55
|
+
case url_parts.host
|
56
|
+
when /\bplus\.google\.com$/ then return :google
|
67
57
|
end
|
58
|
+
|
68
59
|
else
|
69
|
-
|
60
|
+
Rails.logger.debug "No Domain Match"
|
70
61
|
end
|
71
62
|
|
72
63
|
:other
|
@@ -286,6 +277,7 @@ module UrlScrubber
|
|
286
277
|
end
|
287
278
|
|
288
279
|
|
280
|
+
# TODO: Rewrite this to be independent of the Facebook domain and public suffix used, e.g. facebook.com vs. fb.com vs. fb.me
|
289
281
|
def self.sc_facebook(url)
|
290
282
|
#puts "sc_facebook: #{url}"
|
291
283
|
regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
|
@@ -293,6 +285,7 @@ module UrlScrubber
|
|
293
285
|
regex2a = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
294
286
|
regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
|
295
287
|
regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
|
288
|
+
regex5 = /^(?<url>(https?:\/\/)((business|www)\.)?facebook\.com\/(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
296
289
|
|
297
290
|
# If the user gives us a path to a Post, "http://facebook.com/LoansByJanet/posts/1691075027771418"
|
298
291
|
# then drop the post part, "/posts/1691075027771418" to get the base url, "http://facebook.com/LoansByJanet/"
|
@@ -306,32 +299,31 @@ module UrlScrubber
|
|
306
299
|
elsif url.include?('facebook.com/groups/')
|
307
300
|
url = drop_url_query!(url)
|
308
301
|
elsif mdata = url.match(regex1)
|
309
|
-
# puts "regex1"
|
310
302
|
# "http://facebook.com/pages/Command-Canada/1434248516885065/timeline"
|
311
303
|
url = mdata[:url]
|
312
304
|
uname = mdata[:uname]
|
313
305
|
uid = mdata[:uid]
|
314
306
|
elsif mdata = url.match(regex2)
|
315
|
-
# puts "regex2"
|
316
307
|
# "https://www.facebook.com/profile.php?id=100009574328879"
|
317
308
|
url, http_response = check_for_facebook_redirection(mdata[:url])
|
318
309
|
uid = mdata[:uid]
|
319
310
|
elsif mdata = url.match(regex2a)
|
320
|
-
# puts "regex2a"
|
321
311
|
# "https://www.facebook.com/profile.php?_rdr=p&id=100009574328879"
|
322
312
|
url = "http://facebook.com/profile.php?id=" + mdata[:uid]
|
323
313
|
url, http_response = check_for_facebook_redirection(url)
|
324
314
|
uid = mdata[:uid]
|
325
315
|
elsif mdata = url.match(regex4)
|
326
|
-
# puts "#{url} - #{mdata[:uname]}"
|
327
316
|
# "http://facebook.com/home.php?#!/person.name"
|
328
317
|
url = mdata[:url] + mdata[:uname]
|
329
318
|
url = drop_url_query!(url)
|
319
|
+
elsif mdata = url.match(regex5)
|
320
|
+
# "https://www.facebook.com/100009574328879"
|
321
|
+
url = "http://facebook.com/" + mdata[:uid]
|
322
|
+
uid = mdata[:uid]
|
330
323
|
elsif mdata = url.match(regex3)
|
331
|
-
# puts "regex3"
|
332
324
|
# "http://facebook.com/TonyMollHomeLoans/timeline"
|
333
325
|
# "http://facebook.com/pg/TonyMollHomeLoans/timeline"
|
334
|
-
if ["pages", "pg"].exclude?(mdata[:uname])
|
326
|
+
if ["page", "pages", "pg"].exclude?(mdata[:uname])
|
335
327
|
url = "http://facebook.com/" + mdata[:uname]
|
336
328
|
uname = mdata[:uname]
|
337
329
|
end
|
@@ -353,6 +345,7 @@ module UrlScrubber
|
|
353
345
|
end
|
354
346
|
|
355
347
|
|
348
|
+
# TODO: Rewrite this to be independent of the LinkedIn domain and public suffix used, e.g. linkedin.com vs. lnkd.in vs. linkedin.ca
|
356
349
|
def self.sc_linkedin(url)
|
357
350
|
url.sub!('linkedin.com/companies/', 'linkedin.com/company/')
|
358
351
|
if !!url.match(%r{com/company/})
|
data/url_scrubber.gemspec
CHANGED
@@ -2,8 +2,8 @@
|
|
2
2
|
require File.expand_path('../lib/url_scrubber/version', __FILE__)
|
3
3
|
|
4
4
|
Gem::Specification.new do |gem|
|
5
|
-
gem.authors = ["Colin Langton", "Christopher Maujean", "David Hillard", "Edgar Abadines"]
|
6
|
-
gem.email = ["colin@hoteldelta.net", "cmaujean@brandle.net", "dhillard@brandle.net", "ed@brandle.net"]
|
5
|
+
gem.authors = ["Colin Langton", "Christopher Maujean", "David Hillard", "Edgar Abadines", "Chip Roberson"]
|
6
|
+
gem.email = ["colin@hoteldelta.net", "cmaujean@brandle.net", "dhillard@brandle.net", "ed@brandle.net", "chip@brandle.net"]
|
7
7
|
gem.description = %q{Remove extraneous bits from URLs, follow redirects, identify social media urls, etc.}
|
8
8
|
gem.summary = %q{Clean up URLs.}
|
9
9
|
gem.homepage = "http://brandle.net"
|
metadata
CHANGED
@@ -1,17 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_scrubber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.13
|
4
|
+
version: 0.8.15
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Colin Langton
|
8
8
|
- Christopher Maujean
|
9
9
|
- David Hillard
|
10
10
|
- Edgar Abadines
|
11
|
+
- Chip Roberson
|
11
12
|
autorequire:
|
12
13
|
bindir: bin
|
13
14
|
cert_chain: []
|
14
|
-
date: 2018-11-
|
15
|
+
date: 2018-11-22 00:00:00.000000000 Z
|
15
16
|
dependencies:
|
16
17
|
- !ruby/object:Gem::Dependency
|
17
18
|
name: rspec
|
@@ -90,6 +91,7 @@ email:
|
|
90
91
|
- cmaujean@brandle.net
|
91
92
|
- dhillard@brandle.net
|
92
93
|
- ed@brandle.net
|
94
|
+
- chip@brandle.net
|
93
95
|
executables: []
|
94
96
|
extensions: []
|
95
97
|
extra_rdoc_files: []
|