url_scrubber 0.8.5 → 0.8.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/Gemfile +3 -1
- data/lib/url_scrubber/version.rb +1 -1
- data/lib/url_scrubber.rb +50 -7
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MDM4ZTYzMzA2NTRjNzhjOTI2MjI4MGI3MDdlZjIzYjZmYjQ4ZDUwOA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NDczYzkxOTc4OGFkNmQzNTI2M2E4Y2E0MGM2NmRhZmQwNWRmZDYwNw==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MmMxY2U0ZmMwMDYyZjUwYjQ4YjI3OTU2NDZiZjgzYTZhZDY5ODQ3ZTAzM2E4
|
10
|
+
NTNkNWM0NWMwNTcyYTkwMDlkYzk2MzkyMDY1NjYzMTNlODg0Y2FhYWIxNmUy
|
11
|
+
ZDgwMjUxNDI3YTk5ZTQ2MTdjZWJkNzcxZGViMzYyZTBjYTI5NDY=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZDRkYjJmNjY2ZWQ5N2IxNzE2MWYxNGNmODJhNDJmNjc0OTkyZjIzOTYxMzMx
|
14
|
+
NGVkZjE4MTYzN2E4OWE2MTNhMDUzOGFiMWY1Mzg1MmNkNWUyNWQyNjlhYjE0
|
15
|
+
M2NmYjY2ZDg3MzAxOTNkZTZmZTQ0NDFlY2FmODVjOWI1OTEyYzA=
|
data/Gemfile
CHANGED
data/lib/url_scrubber/version.rb
CHANGED
data/lib/url_scrubber.rb
CHANGED
@@ -5,6 +5,7 @@ require 'uri'
|
|
5
5
|
USER_AGENT = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-us) AppleWebKit/534.1+ (KHTML, like Gecko) Version/5.0 Safari/533.16'
|
6
6
|
|
7
7
|
module UrlScrubber
|
8
|
+
|
8
9
|
def self.scrub(url)
|
9
10
|
url = url.clone # don't modify the original argument
|
10
11
|
|
@@ -27,6 +28,7 @@ module UrlScrubber
|
|
27
28
|
return url
|
28
29
|
end
|
29
30
|
|
31
|
+
|
30
32
|
def self.service_of(url)
|
31
33
|
|
32
34
|
domain_match = url.match(%r{https?://([^/]+)})
|
@@ -66,6 +68,7 @@ module UrlScrubber
|
|
66
68
|
:other
|
67
69
|
end
|
68
70
|
|
71
|
+
|
69
72
|
def self.ideal_form?(url)
|
70
73
|
url = scrub(url)
|
71
74
|
return false unless url
|
@@ -100,23 +103,27 @@ module UrlScrubber
|
|
100
103
|
true
|
101
104
|
end
|
102
105
|
end
|
103
|
-
|
106
|
+
|
107
|
+
|
104
108
|
def self.linkedin_company_url?(url)
|
105
109
|
url = scrub(url)
|
106
110
|
return false unless url
|
107
111
|
return url.include?('http://linkedin.com/company/')
|
108
112
|
end
|
113
|
+
|
109
114
|
|
110
115
|
def self.linkedin_personal_url?(url)
|
111
116
|
url = scrub(url)
|
112
117
|
return false unless url
|
113
118
|
return url.include?('http://linkedin.com/in/') || url.include?('http://linkedin.com/pub/')
|
114
119
|
end
|
120
|
+
|
115
121
|
|
116
122
|
def self.find_identity_from_url(url)
|
117
123
|
UrlScrubber.scrub(url).split("/").last unless url.nil?
|
118
124
|
end
|
119
125
|
|
126
|
+
|
120
127
|
def self.find_linkedin_identity_from_url(url)
|
121
128
|
return nil if url.nil?
|
122
129
|
scrubbed_url = scrub(url)
|
@@ -135,6 +142,7 @@ module UrlScrubber
|
|
135
142
|
end
|
136
143
|
end
|
137
144
|
|
145
|
+
|
138
146
|
# Requirements:
|
139
147
|
# 1. must have http/https scheme
|
140
148
|
# 2. no "@" in any of the passed in url string
|
@@ -146,14 +154,34 @@ module UrlScrubber
|
|
146
154
|
rescue URI::InvalidURIError
|
147
155
|
false
|
148
156
|
end
|
157
|
+
|
158
|
+
|
159
|
+
def self.maps_to_public_url(url)
|
160
|
+
scrubbed = scrub(url)
|
161
|
+
parsed = URI.parse(url) or return nil
|
162
|
+
host = Domainatrix.parse(parsed.host)
|
163
|
+
if host.domain == "facebook" && host.subdomain == "business"
|
164
|
+
public_url = scrubbed.sub("http://business.facebook.com", "http://facebook.com")
|
165
|
+
elsif host.domain == "google" && host.subdomain == "business"
|
166
|
+
public_url = scrubbed.sub("http://business.google.com", "http://plus.google.com")
|
167
|
+
else
|
168
|
+
public_url = nil
|
169
|
+
end
|
170
|
+
public_url
|
171
|
+
end
|
172
|
+
|
149
173
|
|
174
|
+
################################################################################
|
150
175
|
private
|
176
|
+
################################################################################
|
177
|
+
|
151
178
|
|
152
179
|
def self.downcase_domain(url)
|
153
180
|
domain_match = url.match(%r{http://[^/]+}i)
|
154
181
|
domain_match[0].downcase + domain_match.post_match
|
155
182
|
end
|
156
183
|
|
184
|
+
|
157
185
|
def self.special_cases(url)
|
158
186
|
#puts "special_cases"
|
159
187
|
case service_of(url)
|
@@ -171,6 +199,7 @@ module UrlScrubber
|
|
171
199
|
url
|
172
200
|
end
|
173
201
|
|
202
|
+
|
174
203
|
def self.remove_www!(url)
|
175
204
|
# url.sub!(%r{://www\d*\.}, '://')
|
176
205
|
url.sub!(%r{^https?://www?w?\d*\.}i, 'http://')
|
@@ -179,27 +208,32 @@ module UrlScrubber
|
|
179
208
|
url
|
180
209
|
end
|
181
210
|
|
211
|
+
|
182
212
|
def self.remove_html_tags!(url)
|
183
213
|
url.gsub!(/<\/?[^>]+>/, '')
|
184
214
|
url
|
185
215
|
end
|
186
216
|
|
217
|
+
|
187
218
|
def self.drop_url_ampersand!(url)
|
188
219
|
url.sub!(/\&.*$/, '')
|
189
220
|
url
|
190
221
|
end
|
222
|
+
|
191
223
|
|
192
224
|
def self.drop_url_query!(url)
|
193
225
|
url.sub!(/\?.*$/, '')
|
194
226
|
url
|
195
227
|
end
|
196
228
|
|
229
|
+
|
197
230
|
def self.drop_anchor!(url)
|
198
231
|
#puts "drop anchor"
|
199
232
|
url.sub!(/#.*$/, '')
|
200
233
|
url
|
201
234
|
end
|
202
235
|
|
236
|
+
|
203
237
|
def self.sc_youtube(url)
|
204
238
|
# We need to allow the /user version of the URL due to how YouTube allows users to have their own URL
|
205
239
|
# which is not separate channel with it's own customUrl.
|
@@ -208,6 +242,7 @@ module UrlScrubber
|
|
208
242
|
url
|
209
243
|
end
|
210
244
|
|
245
|
+
|
211
246
|
def self.sc_vimeo(url)
|
212
247
|
if url.include?('vimeo.com/groups/')
|
213
248
|
groups_partition = url.partition('vimeo.com/groups/')
|
@@ -222,6 +257,7 @@ module UrlScrubber
|
|
222
257
|
url
|
223
258
|
end
|
224
259
|
|
260
|
+
|
225
261
|
def self.sc_twitter(url)
|
226
262
|
url.sub!('twitter.com/@', 'twitter.com/')
|
227
263
|
|
@@ -238,13 +274,14 @@ module UrlScrubber
|
|
238
274
|
url
|
239
275
|
end
|
240
276
|
|
277
|
+
|
241
278
|
def self.sc_facebook(url)
|
242
279
|
#puts "sc_facebook: #{url}"
|
243
|
-
regex1 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/(pages\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
|
244
|
-
regex2 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
245
|
-
regex2a = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
246
|
-
regex3 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
|
247
|
-
regex4 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
|
280
|
+
regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(pages\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
|
281
|
+
regex2 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
282
|
+
regex2a = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
283
|
+
regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
|
284
|
+
regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
|
248
285
|
|
249
286
|
if url.match("/media/albums") || url.match("/media/set")
|
250
287
|
# puts "media"
|
@@ -296,7 +333,8 @@ module UrlScrubber
|
|
296
333
|
url = url.sub(/\?_rdr.*/, '')
|
297
334
|
url
|
298
335
|
end
|
299
|
-
|
336
|
+
|
337
|
+
|
300
338
|
def self.sc_linkedin(url)
|
301
339
|
|
302
340
|
url.sub!('linkedin.com/companies/', 'linkedin.com/company/')
|
@@ -320,6 +358,7 @@ module UrlScrubber
|
|
320
358
|
url
|
321
359
|
end
|
322
360
|
|
361
|
+
|
323
362
|
def self.sc_google_plus(url)
|
324
363
|
url.sub!('com/u/0/b/', 'com/')
|
325
364
|
url.sub!('com/u/0/', 'com/')
|
@@ -335,6 +374,7 @@ module UrlScrubber
|
|
335
374
|
community_page ? "http://plus.google.com/communities/#{path_match[1]}" : "http://plus.google.com/#{path_match[1]}"
|
336
375
|
end
|
337
376
|
|
377
|
+
|
338
378
|
def self.sc_flickr(url)
|
339
379
|
if url.include?('flickr.com/groups/')
|
340
380
|
groups_partition = url.partition('flickr.com/groups/')
|
@@ -354,14 +394,17 @@ module UrlScrubber
|
|
354
394
|
"http://flickr.com/#{user_match[2]}"
|
355
395
|
end
|
356
396
|
|
397
|
+
|
357
398
|
def self.sc_pinterest(url)
|
358
399
|
url
|
359
400
|
end
|
360
401
|
|
402
|
+
|
361
403
|
def self.sc_yelp(url)
|
362
404
|
url
|
363
405
|
end
|
364
406
|
|
407
|
+
|
365
408
|
def self.check_for_facebook_redirection(uri_str, limit = 5)
|
366
409
|
#puts "check_for_facebook_redirection called! uri=#{uri_str}, limit=#{limit.to_s}"
|
367
410
|
# finds any redirects intended for facebook URLs only!!!!
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_scrubber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.5
|
4
|
+
version: 0.8.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Colin Langton
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2016-12-
|
14
|
+
date: 2016-12-10 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rspec
|