url_scrubber 0.8.5 → 0.8.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MWNkMWE5MDM0ZTliMzg1NDcwN2Y5Zjg4Mzc1ZWE4MWY1ZmVlMWUwMQ==
4
+ MDM4ZTYzMzA2NTRjNzhjOTI2MjI4MGI3MDdlZjIzYjZmYjQ4ZDUwOA==
5
5
  data.tar.gz: !binary |-
6
- ZTg4Y2ZkNTU2NWZjMjhmYjczZGM4OTQ3MWRmYjNkMDc5ZDMxMTJmNg==
6
+ NDczYzkxOTc4OGFkNmQzNTI2M2E4Y2E0MGM2NmRhZmQwNWRmZDYwNw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- ZWM2M2VlZWM5N2UyMmFhODU4NmE0MjAwYWZiMzk1OTBkODA3ZDQ4NzA2ZWRj
10
- Mzc0ZTk2MjY5YTA2YzVjY2NiY2RjMWEyZTQyZjc3NzFlNzE4YTU3YWRhZTQy
11
- NWE3MDBlNzc5YTM4NDEzNjYwYzJiMjUxYjY3MDRjZjUzNzc1NTA=
9
+ MmMxY2U0ZmMwMDYyZjUwYjQ4YjI3OTU2NDZiZjgzYTZhZDY5ODQ3ZTAzM2E4
10
+ NTNkNWM0NWMwNTcyYTkwMDlkYzk2MzkyMDY1NjYzMTNlODg0Y2FhYWIxNmUy
11
+ ZDgwMjUxNDI3YTk5ZTQ2MTdjZWJkNzcxZGViMzYyZTBjYTI5NDY=
12
12
  data.tar.gz: !binary |-
13
- NGQ2ZDQ3NjUwY2RlOGYxMzljNDMyMTU0ZWJmZGRkN2YwMmJjNmIwMzY4OGQ4
14
- NWU0NmE5OGE3Mjk4NTgxNDk1M2E2YzdkNTVmNTEwMzQ1NDEzOGY2NmZjMzA2
15
- ZTMzNzUwNDIyMWM5YzIzMjg2NTJhMGRlMWNkMWE5NTdiN2E5ZGI=
13
+ ZDRkYjJmNjY2ZWQ5N2IxNzE2MWYxNGNmODJhNDJmNjc0OTkyZjIzOTYxMzMx
14
+ NGVkZjE4MTYzN2E4OWE2MTNhMDUzOGFiMWY1Mzg1MmNkNWUyNWQyNjlhYjE0
15
+ M2NmYjY2ZDg3MzAxOTNkZTZmZTQ0NDFlY2FmODVjOWI1OTEyYzA=
data/Gemfile CHANGED
@@ -1,4 +1,6 @@
1
1
  source "http://rubygems.org"
2
2
 
3
3
  # Specify your gem's dependencies in url_scrubber.gemspec
4
- gemspec
4
+ gemspec
5
+
6
+ gem 'domainatrix'
@@ -1,3 +1,3 @@
1
1
  module UrlScrubber
2
- VERSION = "0.8.5"
2
+ VERSION = "0.8.6"
3
3
  end
data/lib/url_scrubber.rb CHANGED
@@ -5,6 +5,7 @@ require 'uri'
5
5
  USER_AGENT = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-us) AppleWebKit/534.1+ (KHTML, like Gecko) Version/5.0 Safari/533.16'
6
6
 
7
7
  module UrlScrubber
8
+
8
9
  def self.scrub(url)
9
10
  url = url.clone # don't modify the original argument
10
11
 
@@ -27,6 +28,7 @@ module UrlScrubber
27
28
  return url
28
29
  end
29
30
 
31
+
30
32
  def self.service_of(url)
31
33
 
32
34
  domain_match = url.match(%r{https?://([^/]+)})
@@ -66,6 +68,7 @@ module UrlScrubber
66
68
  :other
67
69
  end
68
70
 
71
+
69
72
  def self.ideal_form?(url)
70
73
  url = scrub(url)
71
74
  return false unless url
@@ -100,23 +103,27 @@ module UrlScrubber
100
103
  true
101
104
  end
102
105
  end
103
-
106
+
107
+
104
108
  def self.linkedin_company_url?(url)
105
109
  url = scrub(url)
106
110
  return false unless url
107
111
  return url.include?('http://linkedin.com/company/')
108
112
  end
113
+
109
114
 
110
115
  def self.linkedin_personal_url?(url)
111
116
  url = scrub(url)
112
117
  return false unless url
113
118
  return url.include?('http://linkedin.com/in/') || url.include?('http://linkedin.com/pub/')
114
119
  end
120
+
115
121
 
116
122
  def self.find_identity_from_url(url)
117
123
  UrlScrubber.scrub(url).split("/").last unless url.nil?
118
124
  end
119
125
 
126
+
120
127
  def self.find_linkedin_identity_from_url(url)
121
128
  return nil if url.nil?
122
129
  scrubbed_url = scrub(url)
@@ -135,6 +142,7 @@ module UrlScrubber
135
142
  end
136
143
  end
137
144
 
145
+
138
146
  # Requirements:
139
147
  # 1. must have http/https scheme
140
148
  # 2. no "@" in any of the passed in url string
@@ -146,14 +154,34 @@ module UrlScrubber
146
154
  rescue URI::InvalidURIError
147
155
  false
148
156
  end
157
+
158
+
159
+ def self.maps_to_public_url(url)
160
+ scrubbed = scrub(url)
161
+ parsed = URI.parse(url) or return nil
162
+ host = Domainatrix.parse(parsed.host)
163
+ if host.domain == "facebook" && host.subdomain == "business"
164
+ public_url = scrubbed.sub("http://business.facebook.com", "http://facebook.com")
165
+ elsif host.domain == "google" && host.subdomain == "business"
166
+ public_url = scrubbed.sub("http://business.google.com", "http://plus.google.com")
167
+ else
168
+ public_url = nil
169
+ end
170
+ public_url
171
+ end
172
+
149
173
 
174
+ ################################################################################
150
175
  private
176
+ ################################################################################
177
+
151
178
 
152
179
  def self.downcase_domain(url)
153
180
  domain_match = url.match(%r{http://[^/]+}i)
154
181
  domain_match[0].downcase + domain_match.post_match
155
182
  end
156
183
 
184
+
157
185
  def self.special_cases(url)
158
186
  #puts "special_cases"
159
187
  case service_of(url)
@@ -171,6 +199,7 @@ module UrlScrubber
171
199
  url
172
200
  end
173
201
 
202
+
174
203
  def self.remove_www!(url)
175
204
  # url.sub!(%r{://www\d*\.}, '://')
176
205
  url.sub!(%r{^https?://www?w?\d*\.}i, 'http://')
@@ -179,27 +208,32 @@ module UrlScrubber
179
208
  url
180
209
  end
181
210
 
211
+
182
212
  def self.remove_html_tags!(url)
183
213
  url.gsub!(/<\/?[^>]+>/, '')
184
214
  url
185
215
  end
186
216
 
217
+
187
218
  def self.drop_url_ampersand!(url)
188
219
  url.sub!(/\&.*$/, '')
189
220
  url
190
221
  end
222
+
191
223
 
192
224
  def self.drop_url_query!(url)
193
225
  url.sub!(/\?.*$/, '')
194
226
  url
195
227
  end
196
228
 
229
+
197
230
  def self.drop_anchor!(url)
198
231
  #puts "drop anchor"
199
232
  url.sub!(/#.*$/, '')
200
233
  url
201
234
  end
202
235
 
236
+
203
237
  def self.sc_youtube(url)
204
238
  # We need to allow the /user version of the URL due to how YouTube allows users to have their own URL
205
239
  # which is not separate channel with it's own customUrl.
@@ -208,6 +242,7 @@ module UrlScrubber
208
242
  url
209
243
  end
210
244
 
245
+
211
246
  def self.sc_vimeo(url)
212
247
  if url.include?('vimeo.com/groups/')
213
248
  groups_partition = url.partition('vimeo.com/groups/')
@@ -222,6 +257,7 @@ module UrlScrubber
222
257
  url
223
258
  end
224
259
 
260
+
225
261
  def self.sc_twitter(url)
226
262
  url.sub!('twitter.com/@', 'twitter.com/')
227
263
 
@@ -238,13 +274,14 @@ module UrlScrubber
238
274
  url
239
275
  end
240
276
 
277
+
241
278
  def self.sc_facebook(url)
242
279
  #puts "sc_facebook: #{url}"
243
- regex1 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/(pages\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
244
- regex2 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
245
- regex2a = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
246
- regex3 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
247
- regex4 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
280
+ regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(pages\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
281
+ regex2 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
282
+ regex2a = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
283
+ regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
284
+ regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
248
285
 
249
286
  if url.match("/media/albums") || url.match("/media/set")
250
287
  # puts "media"
@@ -296,7 +333,8 @@ module UrlScrubber
296
333
  url = url.sub(/\?_rdr.*/, '')
297
334
  url
298
335
  end
299
-
336
+
337
+
300
338
  def self.sc_linkedin(url)
301
339
 
302
340
  url.sub!('linkedin.com/companies/', 'linkedin.com/company/')
@@ -320,6 +358,7 @@ module UrlScrubber
320
358
  url
321
359
  end
322
360
 
361
+
323
362
  def self.sc_google_plus(url)
324
363
  url.sub!('com/u/0/b/', 'com/')
325
364
  url.sub!('com/u/0/', 'com/')
@@ -335,6 +374,7 @@ module UrlScrubber
335
374
  community_page ? "http://plus.google.com/communities/#{path_match[1]}" : "http://plus.google.com/#{path_match[1]}"
336
375
  end
337
376
 
377
+
338
378
  def self.sc_flickr(url)
339
379
  if url.include?('flickr.com/groups/')
340
380
  groups_partition = url.partition('flickr.com/groups/')
@@ -354,14 +394,17 @@ module UrlScrubber
354
394
  "http://flickr.com/#{user_match[2]}"
355
395
  end
356
396
 
397
+
357
398
  def self.sc_pinterest(url)
358
399
  url
359
400
  end
360
401
 
402
+
361
403
  def self.sc_yelp(url)
362
404
  url
363
405
  end
364
406
 
407
+
365
408
  def self.check_for_facebook_redirection(uri_str, limit = 5)
366
409
  #puts "check_for_facebook_redirection called! uri=#{uri_str}, limit=#{limit.to_s}"
367
410
  # finds any redirects intended for facebook URLs only!!!!
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_scrubber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.5
4
+ version: 0.8.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Colin Langton
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2016-12-08 00:00:00.000000000 Z
14
+ date: 2016-12-10 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rspec