url_scrubber 0.8.16 → 0.8.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 455a7dccc82ca65d302205f525100c2c59700cbf41950533cae59269e47355c0
4
- data.tar.gz: 558a14779d4edcb5766896cab6ed21b1b7ad2f5cc924d30a007aa670d4fe7bd7
3
+ metadata.gz: dfe7609c65d93e0b93cdc01f8b9ff08b6abbd417a4be183a19993f2df17a5451
4
+ data.tar.gz: 7f85340db035fa6330cc894ae0290261633d3c626ba76210ddea2be1c0a57352
5
5
  SHA512:
6
- metadata.gz: fba24e3059a04408972ef8fc52013b726aff9b2995adc1bf153bb9723673b65f9add9334a1c4fd2f93c7a92fa9db960b1e9469b81590f20a60a4c830e92632ea
7
- data.tar.gz: 2d30bdcaf8d61e516cacc933dc2cfd2dd8010cea0edaab786a6d8b2c4a0f94c657d97c6b63ace038cc0f12a14e0ee0d2979e8bda43d9d814b74a56d0f7e3ae02
6
+ metadata.gz: 2751402307e2edb719e12279c11301d3a7084920e19dc7dc7ec18b37bf19a684b69b5ad8f7dcca7867667a463012d6f08c339fa4842458edc5c8003d303d69e1
7
+ data.tar.gz: d9156d4cb46a3a232b4e80df2759bba624d2366ec691cece5f0c025a24ccc4faf5706a7e48f6ef35d4bde8b50b354fb3965ff54f056cca59f29beac926f71a73
data/lib/url_scrubber.rb CHANGED
@@ -12,7 +12,7 @@ module UrlScrubber
12
12
 
13
13
  url = url.clone # don't modify the original argument
14
14
 
15
- m = url.match(/(htt?ps?:\/\/\S*)/i)
15
+ m = url.match(/(htt?ps?:\/\/\S+)/i)
16
16
  return nil unless m
17
17
 
18
18
  url = m[1]
@@ -179,7 +179,11 @@ module UrlScrubber
179
179
 
180
180
  def self.downcase_domain(url)
181
181
  domain_match = url.match(%r{http://[^/]+}i)
182
- domain_match[0].downcase + domain_match.post_match
182
+ if domain_match
183
+ domain_match[0].downcase + domain_match.post_match
184
+ else
185
+ url
186
+ end
183
187
  end
184
188
 
185
189
 
@@ -285,13 +289,15 @@ module UrlScrubber
285
289
 
286
290
  # TODO This needs to be rewritten to be independent of the Facebook domain and public suffix used: e.g. facebook.com vs fb.com vs. fb.me
287
291
  def self.sc_facebook(url)
288
- #puts "sc_facebook: #{url}"
289
- regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
292
+
293
+ url = url.gsub(/(_rdr=.+&)|(&_rdr=.+$)/,"")
294
+
295
+ regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(((?<group>groups?)|pages?|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
290
296
  regex2 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
291
- regex2a = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
292
- regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/((pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
297
+ regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(((?<group>groups?)|pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
293
298
  regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
294
299
  regex5 = /^(?<url>(https?:\/\/)((business|www)\.)?facebook\.com\/(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
300
+ regex6 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/home\/accounts\?business_id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
295
301
 
296
302
  # If the user gives us a path to a Post, "http://facebook.com/LoansByJanet/posts/1691075027771418"
297
303
  # then drop the post part, "/posts/1691075027771418" to get the base url, "http://facebook.com/LoansByJanet/"
@@ -300,10 +306,7 @@ module UrlScrubber
300
306
  end
301
307
 
302
308
  if url.match("/media/albums") || url.match("/media/set")
303
- # puts "media"
304
309
  url = url.match('\&') ? url.split('&',2)[0] : url
305
- elsif url.include?('facebook.com/groups/')
306
- url = drop_url_query!(url)
307
310
  elsif mdata = url.match(regex1)
308
311
  # "http://facebook.com/pages/Command-Canada/1434248516885065/timeline"
309
312
  url = mdata[:url]
@@ -313,11 +316,6 @@ module UrlScrubber
313
316
  # "https://www.facebook.com/profile.php?id=100009574328879"
314
317
  url, http_response = check_for_facebook_redirection(mdata[:url])
315
318
  uid = mdata[:uid]
316
- elsif mdata = url.match(regex2a)
317
- # "https://www.facebook.com/profile.php?_rdr=p&id=100009574328879"
318
- url = "http://facebook.com/profile.php?id=" + mdata[:uid]
319
- url, http_response = check_for_facebook_redirection(url)
320
- uid = mdata[:uid]
321
319
  elsif mdata = url.match(regex4)
322
320
  # "http://facebook.com/home.php?#!/person.name"
323
321
  url = mdata[:url] + mdata[:uname]
@@ -326,11 +324,16 @@ module UrlScrubber
326
324
  # "https://www.facebook.com/100009574328879"
327
325
  url = "http://facebook.com/" + mdata[:uid]
328
326
  uid = mdata[:uid]
327
+ elsif mdata = url.match(regex6)
328
+ # "http://business.facebook.com/home/accounts?business_id=1145724702268347"
329
+ url = mdata[:url]
330
+ uid = mdata[:uid]
329
331
  elsif mdata = url.match(regex3)
330
332
  # "http://facebook.com/TonyMollHomeLoans/timeline"
331
333
  # "http://facebook.com/pg/TonyMollHomeLoans/timeline"
332
- if ["page", "pages", "pg"].exclude?(mdata[:uname])
333
- url = "http://facebook.com/" + mdata[:uname]
334
+ # "https://www.facebook.com/groups/practicewithclaritygroup"
335
+ if ["group", "groups", "page", "pages", "pg"].exclude?(mdata[:uname])
336
+ url = (mdata[:group] ? "http://facebook.com/groups/" : "http://facebook.com/") + mdata[:uname]
334
337
  uname = mdata[:uname]
335
338
  end
336
339
  url = drop_url_query!(url)
@@ -1,3 +1,3 @@
1
1
  module UrlScrubber
2
- VERSION = "0.8.16"
2
+ VERSION = "0.8.21"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_scrubber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.16
4
+ version: 0.8.21
5
5
  platform: ruby
6
6
  authors:
7
7
  - Colin Langton
@@ -12,7 +12,7 @@ authors:
12
12
  autorequire:
13
13
  bindir: bin
14
14
  cert_chain: []
15
- date: 2019-02-12 00:00:00.000000000 Z
15
+ date: 2021-01-08 00:00:00.000000000 Z
16
16
  dependencies:
17
17
  - !ruby/object:Gem::Dependency
18
18
  name: rspec
@@ -125,8 +125,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
125
125
  - !ruby/object:Gem::Version
126
126
  version: '0'
127
127
  requirements: []
128
- rubyforge_project:
129
- rubygems_version: 2.7.8
128
+ rubygems_version: 3.0.6
130
129
  signing_key:
131
130
  specification_version: 4
132
131
  summary: Clean up URLs.