url_scrubber 0.8.5 → 0.8.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/Gemfile +3 -1
- data/lib/url_scrubber/version.rb +1 -1
- data/lib/url_scrubber.rb +50 -7
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MDM4ZTYzMzA2NTRjNzhjOTI2MjI4MGI3MDdlZjIzYjZmYjQ4ZDUwOA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NDczYzkxOTc4OGFkNmQzNTI2M2E4Y2E0MGM2NmRhZmQwNWRmZDYwNw==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MmMxY2U0ZmMwMDYyZjUwYjQ4YjI3OTU2NDZiZjgzYTZhZDY5ODQ3ZTAzM2E4
|
10
|
+
NTNkNWM0NWMwNTcyYTkwMDlkYzk2MzkyMDY1NjYzMTNlODg0Y2FhYWIxNmUy
|
11
|
+
ZDgwMjUxNDI3YTk5ZTQ2MTdjZWJkNzcxZGViMzYyZTBjYTI5NDY=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZDRkYjJmNjY2ZWQ5N2IxNzE2MWYxNGNmODJhNDJmNjc0OTkyZjIzOTYxMzMx
|
14
|
+
NGVkZjE4MTYzN2E4OWE2MTNhMDUzOGFiMWY1Mzg1MmNkNWUyNWQyNjlhYjE0
|
15
|
+
M2NmYjY2ZDg3MzAxOTNkZTZmZTQ0NDFlY2FmODVjOWI1OTEyYzA=
|
data/Gemfile
CHANGED
data/lib/url_scrubber/version.rb
CHANGED
data/lib/url_scrubber.rb
CHANGED
@@ -5,6 +5,7 @@ require 'uri'
|
|
5
5
|
USER_AGENT = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-us) AppleWebKit/534.1+ (KHTML, like Gecko) Version/5.0 Safari/533.16'
|
6
6
|
|
7
7
|
module UrlScrubber
|
8
|
+
|
8
9
|
def self.scrub(url)
|
9
10
|
url = url.clone # don't modify the original argument
|
10
11
|
|
@@ -27,6 +28,7 @@ module UrlScrubber
|
|
27
28
|
return url
|
28
29
|
end
|
29
30
|
|
31
|
+
|
30
32
|
def self.service_of(url)
|
31
33
|
|
32
34
|
domain_match = url.match(%r{https?://([^/]+)})
|
@@ -66,6 +68,7 @@ module UrlScrubber
|
|
66
68
|
:other
|
67
69
|
end
|
68
70
|
|
71
|
+
|
69
72
|
def self.ideal_form?(url)
|
70
73
|
url = scrub(url)
|
71
74
|
return false unless url
|
@@ -100,23 +103,27 @@ module UrlScrubber
|
|
100
103
|
true
|
101
104
|
end
|
102
105
|
end
|
103
|
-
|
106
|
+
|
107
|
+
|
104
108
|
def self.linkedin_company_url?(url)
|
105
109
|
url = scrub(url)
|
106
110
|
return false unless url
|
107
111
|
return url.include?('http://linkedin.com/company/')
|
108
112
|
end
|
113
|
+
|
109
114
|
|
110
115
|
def self.linkedin_personal_url?(url)
|
111
116
|
url = scrub(url)
|
112
117
|
return false unless url
|
113
118
|
return url.include?('http://linkedin.com/in/') || url.include?('http://linkedin.com/pub/')
|
114
119
|
end
|
120
|
+
|
115
121
|
|
116
122
|
def self.find_identity_from_url(url)
|
117
123
|
UrlScrubber.scrub(url).split("/").last unless url.nil?
|
118
124
|
end
|
119
125
|
|
126
|
+
|
120
127
|
def self.find_linkedin_identity_from_url(url)
|
121
128
|
return nil if url.nil?
|
122
129
|
scrubbed_url = scrub(url)
|
@@ -135,6 +142,7 @@ module UrlScrubber
|
|
135
142
|
end
|
136
143
|
end
|
137
144
|
|
145
|
+
|
138
146
|
# Requirements:
|
139
147
|
# 1. must have http/https scheme
|
140
148
|
# 2. no "@" in any of the passed in url string
|
@@ -146,14 +154,34 @@ module UrlScrubber
|
|
146
154
|
rescue URI::InvalidURIError
|
147
155
|
false
|
148
156
|
end
|
157
|
+
|
158
|
+
|
159
|
+
def self.maps_to_public_url(url)
|
160
|
+
scrubbed = scrub(url)
|
161
|
+
parsed = URI.parse(url) or return nil
|
162
|
+
host = Domainatrix.parse(parsed.host)
|
163
|
+
if host.domain == "facebook" && host.subdomain == "business"
|
164
|
+
public_url = scrubbed.sub("http://business.facebook.com", "http://facebook.com")
|
165
|
+
elsif host.domain == "google" && host.subdomain == "business"
|
166
|
+
public_url = scrubbed.sub("http://business.google.com", "http://plus.google.com")
|
167
|
+
else
|
168
|
+
public_url = nil
|
169
|
+
end
|
170
|
+
public_url
|
171
|
+
end
|
172
|
+
|
149
173
|
|
174
|
+
################################################################################
|
150
175
|
private
|
176
|
+
################################################################################
|
177
|
+
|
151
178
|
|
152
179
|
def self.downcase_domain(url)
|
153
180
|
domain_match = url.match(%r{http://[^/]+}i)
|
154
181
|
domain_match[0].downcase + domain_match.post_match
|
155
182
|
end
|
156
183
|
|
184
|
+
|
157
185
|
def self.special_cases(url)
|
158
186
|
#puts "special_cases"
|
159
187
|
case service_of(url)
|
@@ -171,6 +199,7 @@ module UrlScrubber
|
|
171
199
|
url
|
172
200
|
end
|
173
201
|
|
202
|
+
|
174
203
|
def self.remove_www!(url)
|
175
204
|
# url.sub!(%r{://www\d*\.}, '://')
|
176
205
|
url.sub!(%r{^https?://www?w?\d*\.}i, 'http://')
|
@@ -179,27 +208,32 @@ module UrlScrubber
|
|
179
208
|
url
|
180
209
|
end
|
181
210
|
|
211
|
+
|
182
212
|
def self.remove_html_tags!(url)
|
183
213
|
url.gsub!(/<\/?[^>]+>/, '')
|
184
214
|
url
|
185
215
|
end
|
186
216
|
|
217
|
+
|
187
218
|
def self.drop_url_ampersand!(url)
|
188
219
|
url.sub!(/\&.*$/, '')
|
189
220
|
url
|
190
221
|
end
|
222
|
+
|
191
223
|
|
192
224
|
def self.drop_url_query!(url)
|
193
225
|
url.sub!(/\?.*$/, '')
|
194
226
|
url
|
195
227
|
end
|
196
228
|
|
229
|
+
|
197
230
|
def self.drop_anchor!(url)
|
198
231
|
#puts "drop anchor"
|
199
232
|
url.sub!(/#.*$/, '')
|
200
233
|
url
|
201
234
|
end
|
202
235
|
|
236
|
+
|
203
237
|
def self.sc_youtube(url)
|
204
238
|
# We need to allow the /user version of the URL due to how YouTube allows users to have their own URL
|
205
239
|
# which is not a separate channel with its own customUrl.
|
@@ -208,6 +242,7 @@ module UrlScrubber
|
|
208
242
|
url
|
209
243
|
end
|
210
244
|
|
245
|
+
|
211
246
|
def self.sc_vimeo(url)
|
212
247
|
if url.include?('vimeo.com/groups/')
|
213
248
|
groups_partition = url.partition('vimeo.com/groups/')
|
@@ -222,6 +257,7 @@ module UrlScrubber
|
|
222
257
|
url
|
223
258
|
end
|
224
259
|
|
260
|
+
|
225
261
|
def self.sc_twitter(url)
|
226
262
|
url.sub!('twitter.com/@', 'twitter.com/')
|
227
263
|
|
@@ -238,13 +274,14 @@ module UrlScrubber
|
|
238
274
|
url
|
239
275
|
end
|
240
276
|
|
277
|
+
|
241
278
|
def self.sc_facebook(url)
|
242
279
|
#puts "sc_facebook: #{url}"
|
243
|
-
regex1 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/(pages\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
|
244
|
-
regex2 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
245
|
-
regex2a = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
246
|
-
regex3 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
|
247
|
-
regex4 = /^(?<url>(https?:\/\/)(www\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
|
280
|
+
regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(pages\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
|
281
|
+
regex2 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
282
|
+
regex2a = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?_rdr=p&id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
|
283
|
+
regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
|
284
|
+
regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
|
248
285
|
|
249
286
|
if url.match("/media/albums") || url.match("/media/set")
|
250
287
|
# puts "media"
|
@@ -296,7 +333,8 @@ module UrlScrubber
|
|
296
333
|
url = url.sub(/\?_rdr.*/, '')
|
297
334
|
url
|
298
335
|
end
|
299
|
-
|
336
|
+
|
337
|
+
|
300
338
|
def self.sc_linkedin(url)
|
301
339
|
|
302
340
|
url.sub!('linkedin.com/companies/', 'linkedin.com/company/')
|
@@ -320,6 +358,7 @@ module UrlScrubber
|
|
320
358
|
url
|
321
359
|
end
|
322
360
|
|
361
|
+
|
323
362
|
def self.sc_google_plus(url)
|
324
363
|
url.sub!('com/u/0/b/', 'com/')
|
325
364
|
url.sub!('com/u/0/', 'com/')
|
@@ -335,6 +374,7 @@ module UrlScrubber
|
|
335
374
|
community_page ? "http://plus.google.com/communities/#{path_match[1]}" : "http://plus.google.com/#{path_match[1]}"
|
336
375
|
end
|
337
376
|
|
377
|
+
|
338
378
|
def self.sc_flickr(url)
|
339
379
|
if url.include?('flickr.com/groups/')
|
340
380
|
groups_partition = url.partition('flickr.com/groups/')
|
@@ -354,14 +394,17 @@ module UrlScrubber
|
|
354
394
|
"http://flickr.com/#{user_match[2]}"
|
355
395
|
end
|
356
396
|
|
397
|
+
|
357
398
|
def self.sc_pinterest(url)
|
358
399
|
url
|
359
400
|
end
|
360
401
|
|
402
|
+
|
361
403
|
def self.sc_yelp(url)
|
362
404
|
url
|
363
405
|
end
|
364
406
|
|
407
|
+
|
365
408
|
def self.check_for_facebook_redirection(uri_str, limit = 5)
|
366
409
|
#puts "check_for_facebook_redirection called! uri=#{uri_str}, limit=#{limit.to_s}"
|
367
410
|
# finds any redirects intended for facebook URLs only!!!!
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_scrubber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.5
|
4
|
+
version: 0.8.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Colin Langton
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2016-12-
|
14
|
+
date: 2016-12-10 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rspec
|