url_scrubber 0.8.13 → 0.8.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/url_scrubber/version.rb +1 -1
- data/lib/url_scrubber.rb +28 -35
- data/url_scrubber.gemspec +2 -2
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 608b40e9de605ac987e39f8fa5b1640d6543c8dc8a83553f2689f2fbe716b50a
|
4
|
+
data.tar.gz: d6d1c7905a4875ef9fb6f2ce7ab03f18ff31e524b180c3e13bd10467513c79b8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f6f43dd74cec24acd3f7e2a376b89476f00eb09e627a96c55ba5ec0e97c7ccc994385858183c13230a98066d2b687a462225a3347f6aa27d36f8a668eaef087c
|
7
|
+
data.tar.gz: 395a1561434cc85197f10211393cfdaabea3e35fb50a2559f0c0c3dd669aea3cd13b8e68192a4eda105a2abce1c7476bde167ad00c8704d0e62052ce92ca94ba
|
data/lib/url_scrubber/version.rb
CHANGED
data/lib/url_scrubber.rb
CHANGED
@@ -34,39 +34,30 @@ module UrlScrubber
|
|
34
34
|
|
35
35
|
|
36
36
|
def self.service_of(url)
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
37
|
+
url_parts = Domainatrix.parse(url)
|
38
|
+
if url_parts.host.present?
|
39
|
+
|
40
|
+
case url_parts.domain
|
41
|
+
when 'facebook' then return :facebook
|
42
|
+
when 'fb' then return :facebook
|
43
|
+
when 'flickr' then return :flickr
|
44
|
+
when 'instagram' then return :instagram
|
45
|
+
when 'linkedin' then return :linkedin
|
46
|
+
when 'pinterest' then return :pinterest
|
47
|
+
when 'slideshare' then return :slideshare
|
48
|
+
when 'tumblr' then return :tumblr
|
49
|
+
when 'twitter' then return :twitter
|
50
|
+
when 'vimeo' then return :vimeo
|
51
|
+
when 'yelp' then return :yelp
|
52
|
+
when 'youtube' then return :youtube
|
51
53
|
end
|
52
54
|
|
53
|
-
case
|
54
|
-
when /\
|
55
|
-
when /\btwitter\.com$/ then return :twitter
|
56
|
-
when /\bfacebook\.com$/ then return :facebook
|
57
|
-
when /\bbusiness.facebook\.com$/ then return :facebook
|
58
|
-
when /\blinkedin\.com$/ then return :linkedin
|
59
|
-
when /\bplus\.google\.com$/ then return :google
|
60
|
-
when /\bbusiness\.google\.com$/ then return :google
|
61
|
-
when /\bslideshare\.net$/ then return :slideshare
|
62
|
-
when /\bflickr\.com$/ then return :flickr
|
63
|
-
when /\bpinterest\.com$/ then return :pinterest
|
64
|
-
when /\bvimeo\.com$/ then return :vimeo
|
65
|
-
when /\binstagram\.com$/ then return :instagram
|
66
|
-
when /\byelp\.com$/ then return :yelp
|
55
|
+
case url_parts.host
|
56
|
+
when /\bplus\.google\.com$/ then return :google
|
67
57
|
end
|
58
|
+
|
68
59
|
else
|
69
|
-
|
60
|
+
Rails.logger.debug "No Domain Match"
|
70
61
|
end
|
71
62
|
|
72
63
|
:other
|
@@ -286,6 +277,7 @@ module UrlScrubber
|
|
286
277
|
end
|
287
278
|
|
288
279
|
|
280
|
+
# TODO This needs to be rewritten to be independent of the Facebook domain and public suffix used: e.g. facebook.com vs fb.com vs. fb.me
|
289
281
|
def self.sc_facebook(url)
|
290
282
|
#puts "sc_facebook: #{url}"
|
291
283
|
regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
|
@@ -293,6 +285,7 @@ module UrlScrubber
|
|
293
285
|
regex2a = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
294
286
|
regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
|
295
287
|
regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
|
288
|
+
regex5 = /^(?<url>(https?:\/\/)((business|www)\.)?facebook\.com\/(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
296
289
|
|
297
290
|
# If the user gives us a path to a Post, "http://facebook.com/LoansByJanet/posts/1691075027771418"
|
298
291
|
# then drop the post part, "/posts/1691075027771418" to get the base url, "http://facebook.com/LoansByJanet/"
|
@@ -306,32 +299,31 @@ module UrlScrubber
|
|
306
299
|
elsif url.include?('facebook.com/groups/')
|
307
300
|
url = drop_url_query!(url)
|
308
301
|
elsif mdata = url.match(regex1)
|
309
|
-
# puts "regex1"
|
310
302
|
# "http://facebook.com/pages/Command-Canada/1434248516885065/timeline"
|
311
303
|
url = mdata[:url]
|
312
304
|
uname = mdata[:uname]
|
313
305
|
uid = mdata[:uid]
|
314
306
|
elsif mdata = url.match(regex2)
|
315
|
-
# puts "regex2"
|
316
307
|
# "https://www.facebook.com/profile.php?id=100009574328879"
|
317
308
|
url, http_response = check_for_facebook_redirection(mdata[:url])
|
318
309
|
uid = mdata[:uid]
|
319
310
|
elsif mdata = url.match(regex2a)
|
320
|
-
# puts "regex2a"
|
321
311
|
# "https://www.facebook.com/profile.php?_rdr=p&id=100009574328879"
|
322
312
|
url = "http://facebook.com/profile.php?id=" + mdata[:uid]
|
323
313
|
url, http_response = check_for_facebook_redirection(url)
|
324
314
|
uid = mdata[:uid]
|
325
315
|
elsif mdata = url.match(regex4)
|
326
|
-
# puts "#{url} - #{mdata[:uname]}"
|
327
316
|
# "http://facebook.com/home.php?#!/person.name"
|
328
317
|
url = mdata[:url] + mdata[:uname]
|
329
318
|
url = drop_url_query!(url)
|
319
|
+
elsif mdata = url.match(regex5)
|
320
|
+
# "https://www.facebook.com/100009574328879"
|
321
|
+
url = "http://facebook.com/" + mdata[:uid]
|
322
|
+
uid = mdata[:uid]
|
330
323
|
elsif mdata = url.match(regex3)
|
331
|
-
# puts "regex3"
|
332
324
|
# "http://facebook.com/TonyMollHomeLoans/timeline"
|
333
325
|
# "http://facebook.com/pg/TonyMollHomeLoans/timeline"
|
334
|
-
if ["pages", "pg"].exclude?(mdata[:uname])
|
326
|
+
if ["page", "pages", "pg"].exclude?(mdata[:uname])
|
335
327
|
url = "http://facebook.com/" + mdata[:uname]
|
336
328
|
uname = mdata[:uname]
|
337
329
|
end
|
@@ -353,6 +345,7 @@ module UrlScrubber
|
|
353
345
|
end
|
354
346
|
|
355
347
|
|
348
|
+
# TODO This needs to be rewritten to be independent of the LinkedIn domain and public suffix used: e.g. linkedin.com vs lnkd.in vs linkedin.ca
|
356
349
|
def self.sc_linkedin(url)
|
357
350
|
url.sub!('linkedin.com/companies/', 'linkedin.com/company/')
|
358
351
|
if !!url.match(%r{com/company/})
|
data/url_scrubber.gemspec
CHANGED
@@ -2,8 +2,8 @@
|
|
2
2
|
require File.expand_path('../lib/url_scrubber/version', __FILE__)
|
3
3
|
|
4
4
|
Gem::Specification.new do |gem|
|
5
|
-
gem.authors = ["Colin Langton", "Christopher Maujean", "David Hillard", "Edgar Abadines"]
|
6
|
-
gem.email = ["colin@hoteldelta.net", "cmaujean@brandle.net", "dhillard@brandle.net", "ed@brandle.net"]
|
5
|
+
gem.authors = ["Colin Langton", "Christopher Maujean", "David Hillard", "Edgar Abadines", "Chip Roberson"]
|
6
|
+
gem.email = ["colin@hoteldelta.net", "cmaujean@brandle.net", "dhillard@brandle.net", "ed@brandle.net", "chip@brandle.net"]
|
7
7
|
gem.description = %q{Remove extraneous bits from URLs, follow redirects, identify social media urls, etc.}
|
8
8
|
gem.summary = %q{Clean up URLs.}
|
9
9
|
gem.homepage = "http://brandle.net"
|
metadata
CHANGED
@@ -1,17 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_scrubber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.13
|
4
|
+
version: 0.8.15
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Colin Langton
|
8
8
|
- Christopher Maujean
|
9
9
|
- David Hillard
|
10
10
|
- Edgar Abadines
|
11
|
+
- Chip Roberson
|
11
12
|
autorequire:
|
12
13
|
bindir: bin
|
13
14
|
cert_chain: []
|
14
|
-
date: 2018-11-
|
15
|
+
date: 2018-11-22 00:00:00.000000000 Z
|
15
16
|
dependencies:
|
16
17
|
- !ruby/object:Gem::Dependency
|
17
18
|
name: rspec
|
@@ -90,6 +91,7 @@ email:
|
|
90
91
|
- cmaujean@brandle.net
|
91
92
|
- dhillard@brandle.net
|
92
93
|
- ed@brandle.net
|
94
|
+
- chip@brandle.net
|
93
95
|
executables: []
|
94
96
|
extensions: []
|
95
97
|
extra_rdoc_files: []
|