url_scrubber 0.8.5 → 0.8.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MWNkMWE5MDM0ZTliMzg1NDcwN2Y5Zjg4Mzc1ZWE4MWY1ZmVlMWUwMQ==
4
+ MDM4ZTYzMzA2NTRjNzhjOTI2MjI4MGI3MDdlZjIzYjZmYjQ4ZDUwOA==
5
5
  data.tar.gz: !binary |-
6
- ZTg4Y2ZkNTU2NWZjMjhmYjczZGM4OTQ3MWRmYjNkMDc5ZDMxMTJmNg==
6
+ NDczYzkxOTc4OGFkNmQzNTI2M2E4Y2E0MGM2NmRhZmQwNWRmZDYwNw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- ZWM2M2VlZWM5N2UyMmFhODU4NmE0MjAwYWZiMzk1OTBkODA3ZDQ4NzA2ZWRj
10
- Mzc0ZTk2MjY5YTA2YzVjY2NiY2RjMWEyZTQyZjc3NzFlNzE4YTU3YWRhZTQy
11
- NWE3MDBlNzc5YTM4NDEzNjYwYzJiMjUxYjY3MDRjZjUzNzc1NTA=
9
+ MmMxY2U0ZmMwMDYyZjUwYjQ4YjI3OTU2NDZiZjgzYTZhZDY5ODQ3ZTAzM2E4
10
+ NTNkNWM0NWMwNTcyYTkwMDlkYzk2MzkyMDY1NjYzMTNlODg0Y2FhYWIxNmUy
11
+ ZDgwMjUxNDI3YTk5ZTQ2MTdjZWJkNzcxZGViMzYyZTBjYTI5NDY=
12
12
  data.tar.gz: !binary |-
13
- NGQ2ZDQ3NjUwY2RlOGYxMzljNDMyMTU0ZWJmZGRkN2YwMmJjNmIwMzY4OGQ4
14
- NWU0NmE5OGE3Mjk4NTgxNDk1M2E2YzdkNTVmNTEwMzQ1NDEzOGY2NmZjMzA2
15
- ZTMzNzUwNDIyMWM5YzIzMjg2NTJhMGRlMWNkMWE5NTdiN2E5ZGI=
13
+ ZDRkYjJmNjY2ZWQ5N2IxNzE2MWYxNGNmODJhNDJmNjc0OTkyZjIzOTYxMzMx
14
+ NGVkZjE4MTYzN2E4OWE2MTNhMDUzOGFiMWY1Mzg1MmNkNWUyNWQyNjlhYjE0
15
+ M2NmYjY2ZDg3MzAxOTNkZTZmZTQ0NDFlY2FmODVjOWI1OTEyYzA=
data/Gemfile CHANGED
@@ -1,4 +1,6 @@
1
1
  source "http://rubygems.org"
2
2
 
3
3
  # Specify your gem's dependencies in url_scrubber.gemspec
4
- gemspec
4
+ gemspec
5
+
6
+ gem 'domainatrix'
@@ -1,3 +1,3 @@
1
1
  module UrlScrubber
2
- VERSION = "0.8.5"
2
+ VERSION = "0.8.6"
3
3
  end
data/lib/url_scrubber.rb CHANGED
@@ -5,6 +5,7 @@ require 'uri'
5
5
  USER_AGENT = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-us) AppleWebKit/534.1+ (KHTML, like Gecko) Version/5.0 Safari/533.16'
6
6
 
7
7
  module UrlScrubber
8
+
8
9
  def self.scrub(url)
9
10
  url = url.clone # don't modify the original argument
10
11
 
@@ -27,6 +28,7 @@ module UrlScrubber
27
28
  return url
28
29
  end
29
30
 
31
+
30
32
  def self.service_of(url)
31
33
 
32
34
  domain_match = url.match(%r{https?://([^/]+)})
@@ -66,6 +68,7 @@ module UrlScrubber
66
68
  :other
67
69
  end
68
70
 
71
+
69
72
  def self.ideal_form?(url)
70
73
  url = scrub(url)
71
74
  return false unless url
@@ -100,23 +103,27 @@ module UrlScrubber
100
103
  true
101
104
  end
102
105
  end
103
-
106
+
107
+
104
108
  def self.linkedin_company_url?(url)
105
109
  url = scrub(url)
106
110
  return false unless url
107
111
  return url.include?('http://linkedin.com/company/')
108
112
  end
113
+
109
114
 
110
115
  def self.linkedin_personal_url?(url)
111
116
  url = scrub(url)
112
117
  return false unless url
113
118
  return url.include?('http://linkedin.com/in/') || url.include?('http://linkedin.com/pub/')
114
119
  end
120
+
115
121
 
116
122
  def self.find_identity_from_url(url)
117
123
  UrlScrubber.scrub(url).split("/").last unless url.nil?
118
124
  end
119
125
 
126
+
120
127
  def self.find_linkedin_identity_from_url(url)
121
128
  return nil if url.nil?
122
129
  scrubbed_url = scrub(url)
@@ -135,6 +142,7 @@ module UrlScrubber
135
142
  end
136
143
  end
137
144
 
145
+
138
146
  # Requirements:
139
147
  # 1. must have http/https scheme
140
148
  # 2. no "@" in any of the passed in url string
@@ -146,14 +154,34 @@ module UrlScrubber
146
154
  rescue URI::InvalidURIError
147
155
  false
148
156
  end
157
+
158
+
159
+ def self.maps_to_public_url(url)
160
+ scrubbed = scrub(url)
161
+ parsed = URI.parse(url) or return nil
162
+ host = Domainatrix.parse(parsed.host)
163
+ if host.domain == "facebook" && host.subdomain == "business"
164
+ public_url = scrubbed.sub("http://business.facebook.com", "http://facebook.com")
165
+ elsif host.domain == "google" && host.subdomain == "business"
166
+ public_url = scrubbed.sub("http://business.google.com", "http://plus.google.com")
167
+ else
168
+ public_url = nil
169
+ end
170
+ public_url
171
+ end
172
+
149
173
 
174
+ ################################################################################
150
175
  private
176
+ ################################################################################
177
+
151
178
 
152
179
  def self.downcase_domain(url)
153
180
  domain_match = url.match(%r{http://[^/]+}i)
154
181
  domain_match[0].downcase + domain_match.post_match
155
182
  end
156
183
 
184
+
157
185
  def self.special_cases(url)
158
186
  #puts "special_cases"
159
187
  case service_of(url)
@@ -171,6 +199,7 @@ module UrlScrubber
171
199
  url
172
200
  end
173
201
 
202
+
174
203
  def self.remove_www!(url)
175
204
  # url.sub!(%r{://www\d*\.}, '://')
176
205
  url.sub!(%r{^https?://www?w?\d*\.}i, 'http://')
@@ -179,27 +208,32 @@ module UrlScrubber
179
208
  url
180
209
  end
181
210
 
211
+
182
212
  def self.remove_html_tags!(url)
183
213
  url.gsub!(/<\/?[^>]+>/, '')
184
214
  url
185
215
  end
186
216
 
217
+
187
218
  def self.drop_url_ampersand!(url)
188
219
  url.sub!(/\&.*$/, '')
189
220
  url
190
221
  end
222
+
191
223
 
192
224
  def self.drop_url_query!(url)
193
225
  url.sub!(/\?.*$/, '')
194
226
  url
195
227
  end
196
228
 
229
+
197
230
  def self.drop_anchor!(url)
198
231
  #puts "drop anchor"
199
232
  url.sub!(/#.*$/, '')
200
233
  url
201
234
  end
202
235
 
236
+
203
237
  def self.sc_youtube(url)
204
238
  # We need to allow the /user version of the URL due to how YouTube allows users to have their own URL
205
239
  # which is not a separate channel with its own customUrl.
@@ -208,6 +242,7 @@ module UrlScrubber
208
242
  url
209
243
  end
210
244
 
245
+
211
246
  def self.sc_vimeo(url)
212
247
  if url.include?('vimeo.com/groups/')
213
248
  groups_partition = url.partition('vimeo.com/groups/')
@@ -222,6 +257,7 @@ module UrlScrubber
222
257
  url
223
258
  end
224
259
 
260
+
225
261
  def self.sc_twitter(url)
226
262
  url.sub!('twitter.com/@', 'twitter.com/')
227
263
 
@@ -238,13 +274,14 @@ module UrlScrubber
238
274
  url
239
275
  end
240
276
 
277
+
241
278
  def self.sc_facebook(url)
242
279
  #puts "sc_facebook: #{url}"
243
- regex1 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/(pages\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
244
- regex2 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
245
- regex2a = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
246
- regex3 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
247
- regex4 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
280
+ regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(pages\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
281
+ regex2 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
282
+ regex2a = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
283
+ regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
284
+ regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
248
285
 
249
286
  if url.match("/media/albums") || url.match("/media/set")
250
287
  # puts "media"
@@ -296,7 +333,8 @@ module UrlScrubber
296
333
  url = url.sub(/\?_rdr.*/, '')
297
334
  url
298
335
  end
299
-
336
+
337
+
300
338
  def self.sc_linkedin(url)
301
339
 
302
340
  url.sub!('linkedin.com/companies/', 'linkedin.com/company/')
@@ -320,6 +358,7 @@ module UrlScrubber
320
358
  url
321
359
  end
322
360
 
361
+
323
362
  def self.sc_google_plus(url)
324
363
  url.sub!('com/u/0/b/', 'com/')
325
364
  url.sub!('com/u/0/', 'com/')
@@ -335,6 +374,7 @@ module UrlScrubber
335
374
  community_page ? "http://plus.google.com/communities/#{path_match[1]}" : "http://plus.google.com/#{path_match[1]}"
336
375
  end
337
376
 
377
+
338
378
  def self.sc_flickr(url)
339
379
  if url.include?('flickr.com/groups/')
340
380
  groups_partition = url.partition('flickr.com/groups/')
@@ -354,14 +394,17 @@ module UrlScrubber
354
394
  "http://flickr.com/#{user_match[2]}"
355
395
  end
356
396
 
397
+
357
398
  def self.sc_pinterest(url)
358
399
  url
359
400
  end
360
401
 
402
+
361
403
  def self.sc_yelp(url)
362
404
  url
363
405
  end
364
406
 
407
+
365
408
  def self.check_for_facebook_redirection(uri_str, limit = 5)
366
409
  #puts "check_for_facebook_redirection called! uri=#{uri_str}, limit=#{limit.to_s}"
367
410
  # finds any redirects intended for facebook URLs only!!!!
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_scrubber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.5
4
+ version: 0.8.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Colin Langton
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2016-12-08 00:00:00.000000000 Z
14
+ date: 2016-12-10 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rspec