url_scrubber 0.8.16 → 0.8.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/url_scrubber.rb +19 -16
- data/lib/url_scrubber/version.rb +1 -1
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dfe7609c65d93e0b93cdc01f8b9ff08b6abbd417a4be183a19993f2df17a5451
|
4
|
+
data.tar.gz: 7f85340db035fa6330cc894ae0290261633d3c626ba76210ddea2be1c0a57352
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2751402307e2edb719e12279c11301d3a7084920e19dc7dc7ec18b37bf19a684b69b5ad8f7dcca7867667a463012d6f08c339fa4842458edc5c8003d303d69e1
|
7
|
+
data.tar.gz: d9156d4cb46a3a232b4e80df2759bba624d2366ec691cece5f0c025a24ccc4faf5706a7e48f6ef35d4bde8b50b354fb3965ff54f056cca59f29beac926f71a73
|
data/lib/url_scrubber.rb
CHANGED
@@ -12,7 +12,7 @@ module UrlScrubber
|
|
12
12
|
|
13
13
|
url = url.clone # don't modify the original argument
|
14
14
|
|
15
|
-
m = url.match(/(htt?ps?:\/\/\S
|
15
|
+
m = url.match(/(htt?ps?:\/\/\S+)/i)
|
16
16
|
return nil unless m
|
17
17
|
|
18
18
|
url = m[1]
|
@@ -179,7 +179,11 @@ module UrlScrubber
|
|
179
179
|
|
180
180
|
def self.downcase_domain(url)
|
181
181
|
domain_match = url.match(%r{http://[^/]+}i)
|
182
|
-
|
182
|
+
if domain_match
|
183
|
+
domain_match[0].downcase + domain_match.post_match
|
184
|
+
else
|
185
|
+
url
|
186
|
+
end
|
183
187
|
end
|
184
188
|
|
185
189
|
|
@@ -285,13 +289,15 @@ module UrlScrubber
|
|
285
289
|
|
286
290
|
# TODO This needs to be rewritten to be independent of the Facebook domain and public suffix used: e.g. facebook.com vs fb.com vs. fb.me
|
287
291
|
def self.sc_facebook(url)
|
288
|
-
|
289
|
-
|
292
|
+
|
293
|
+
url = url.gsub(/(_rdr=.+&)|(&_rdr=.+$)/,"")
|
294
|
+
|
295
|
+
regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(((?<group>groups?)|pages?|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
|
290
296
|
regex2 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
291
|
-
|
292
|
-
regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
|
297
|
+
regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(((?<group>groups?)|pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
|
293
298
|
regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
|
294
299
|
regex5 = /^(?<url>(https?:\/\/)((business|www)\.)?facebook\.com\/(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
300
|
+
regex6 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/home\/accounts\?business_id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
295
301
|
|
296
302
|
# If the user gives us a path to a Post, "http://facebook.com/LoansByJanet/posts/1691075027771418"
|
297
303
|
# then drop the post part, "/posts/1691075027771418" to get the base url, "http://facebook.com/LoansByJanet/"
|
@@ -300,10 +306,7 @@ module UrlScrubber
|
|
300
306
|
end
|
301
307
|
|
302
308
|
if url.match("/media/albums") || url.match("/media/set")
|
303
|
-
# puts "media"
|
304
309
|
url = url.match('\&') ? url.split('&',2)[0] : url
|
305
|
-
elsif url.include?('facebook.com/groups/')
|
306
|
-
url = drop_url_query!(url)
|
307
310
|
elsif mdata = url.match(regex1)
|
308
311
|
# "http://facebook.com/pages/Command-Canada/1434248516885065/timeline"
|
309
312
|
url = mdata[:url]
|
@@ -313,11 +316,6 @@ module UrlScrubber
|
|
313
316
|
# "https://www.facebook.com/profile.php?id=100009574328879"
|
314
317
|
url, http_response = check_for_facebook_redirection(mdata[:url])
|
315
318
|
uid = mdata[:uid]
|
316
|
-
elsif mdata = url.match(regex2a)
|
317
|
-
# "https://www.facebook.com/profile.php?_rdr=p&id=100009574328879"
|
318
|
-
url = "http://facebook.com/profile.php?id=" + mdata[:uid]
|
319
|
-
url, http_response = check_for_facebook_redirection(url)
|
320
|
-
uid = mdata[:uid]
|
321
319
|
elsif mdata = url.match(regex4)
|
322
320
|
# "http://facebook.com/home.php?#!/person.name"
|
323
321
|
url = mdata[:url] + mdata[:uname]
|
@@ -326,11 +324,16 @@ module UrlScrubber
|
|
326
324
|
# "https://www.facebook.com/100009574328879"
|
327
325
|
url = "http://facebook.com/" + mdata[:uid]
|
328
326
|
uid = mdata[:uid]
|
327
|
+
elsif mdata = url.match(regex6)
|
328
|
+
# "http://business.facebook.com/home/accounts?business_id=1145724702268347"
|
329
|
+
url = mdata[:url]
|
330
|
+
uid = mdata[:uid]
|
329
331
|
elsif mdata = url.match(regex3)
|
330
332
|
# "http://facebook.com/TonyMollHomeLoans/timeline"
|
331
333
|
# "http://facebook.com/pg/TonyMollHomeLoans/timeline"
|
332
|
-
|
333
|
-
|
334
|
+
# "https://www.facebook.com/groups/practicewithclaritygroup"
|
335
|
+
if ["group", "groups", "page", "pages", "pg"].exclude?(mdata[:uname])
|
336
|
+
url = (mdata[:group] ? "http://facebook.com/groups/" : "http://facebook.com/") + mdata[:uname]
|
334
337
|
uname = mdata[:uname]
|
335
338
|
end
|
336
339
|
url = drop_url_query!(url)
|
data/lib/url_scrubber/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_scrubber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.16
|
4
|
+
version: 0.8.21
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Colin Langton
|
@@ -12,7 +12,7 @@ authors:
|
|
12
12
|
autorequire:
|
13
13
|
bindir: bin
|
14
14
|
cert_chain: []
|
15
|
-
date:
|
15
|
+
date: 2021-01-08 00:00:00.000000000 Z
|
16
16
|
dependencies:
|
17
17
|
- !ruby/object:Gem::Dependency
|
18
18
|
name: rspec
|
@@ -125,8 +125,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
125
125
|
- !ruby/object:Gem::Version
|
126
126
|
version: '0'
|
127
127
|
requirements: []
|
128
|
-
|
129
|
-
rubygems_version: 2.7.8
|
128
|
+
rubygems_version: 3.0.6
|
130
129
|
signing_key:
|
131
130
|
specification_version: 4
|
132
131
|
summary: Clean up URLs.
|