url_scrubber 0.7.4 → 0.7.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/url_scrubber/version.rb +1 -1
- data/lib/url_scrubber.rb +73 -0
- metadata +2 -2
data/lib/url_scrubber/version.rb
CHANGED
data/lib/url_scrubber.rb
CHANGED
@@ -1,4 +1,8 @@
|
|
1
1
|
require "url_scrubber/version"
|
2
|
+
require 'net/http'
|
3
|
+
require 'uri'
|
4
|
+
|
5
|
+
# User-Agent header value sent with the GET requests issued while following redirects.
# Frozen so the shared constant cannot be mutated in place by any caller.
USER_AGENT = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-us) AppleWebKit/534.1+ (KHTML, like Gecko) Version/5.0 Safari/533.16'.freeze
|
2
6
|
|
3
7
|
module UrlScrubber
|
4
8
|
def self.scrub(url)
|
@@ -149,6 +153,10 @@ module UrlScrubber
|
|
149
153
|
def self.facebook(url)
|
150
154
|
if url.match("/media/albums") || url.match("/media/set")
|
151
155
|
url = url.match('\&') ? url.split('&',2)[0] : url
|
156
|
+
elsif url.include?("facebook.com/profile.php?id=")
|
157
|
+
# these were being truncated, they do redirect, but typically a 301 response is generated
|
158
|
+
# so the url is returned unchanged. Better than truncation.
|
159
|
+
url, http_response = check_for_redirection(url)
|
152
160
|
else
|
153
161
|
url.sub!(/facebook\.com\/home\.php[\?#!\/]+/, 'facebook.com/')
|
154
162
|
url = drop_url_query!(url)
|
@@ -188,4 +196,69 @@ module UrlScrubber
|
|
188
196
|
# Returns the given Yelp URL unchanged; no Yelp-specific scrubbing is applied here.
def self.yelp(url)
|
189
197
|
url
|
190
198
|
end
|
199
|
+
|
200
|
+
private
|
201
|
+
|
202
|
+
# Follows HTTP redirects starting at +uri_str+ and reports where they end up.
#
# Issues a GET (sending USER_AGENT) and, while the response is a redirect that
# carries a Location header, recurses on the redirect target with +limit+
# reduced by one.
#
# @param uri_str [String] the URL to request
# @param limit [Integer] number of requests still allowed in the redirect chain
# @return [Array] a two-element array +[url, response]+:
#   * the final URL and its response when the chain ends normally;
#   * +uri_str+ and a synthetic 404 response when the request fails, the URI is
#     not an HTTP(S) URL with a host, or the redirect target matches a known
#     "soft failure" page;
#   * +uri_str+ (not the login URL) and the final response when the chain ends
#     on a login page;
#   * +uri_str+ and a CustomError (code 786) when +uri_str+ cannot be parsed.
# @raise [RuntimeError] 'Too many HTTP redirects' once +limit+ reaches 0
#
# NOTE(review): the bare `private` preceding this definition does not apply to
# `def self.` methods, so this helper remains publicly callable.
# NOTE(review): CustomError is not defined in the visible part of this file --
# confirm it is available wherever this gem is loaded.
def self.check_for_redirection(uri_str, limit = 5)
  login_patterns = [
    # pages that require user logins
    %r{^.*/login[^/]*$}
  ]

  failure_patterns = [
    # pages that give 200 codes but actually indicate a not found
    %r{linkedin\.com/home\?report%2Efailure}i
  ]

  raise 'Too many HTTP redirects' if limit == 0

  begin
    url = URI.parse(uri_str)
  rescue URI::InvalidURIError => e
    return [uri_str, CustomError.new(786, "Invalid URI #{uri_str} : #{e.message}")]
  end

  # Builds the synthetic "treat it as a 404" response used for every failure path.
  not_found = lambda { Net::HTTPClientError.new('1.1', '404', 'Not Found') }

  # Only http/https URLs with a host can be requested; anything else (relative
  # paths, ftp:, mailto:, ...) has no request_uri and is reported as not found.
  unless url.is_a?(URI::HTTP) && url.host && !url.host.empty?
    return [uri_str, not_found.call]
  end

  http = Net::HTTP.new(url.host, url.port)
  # Decide on TLS from the scheme, not the port, so https on a non-standard
  # port works and plain http on 443 is not forced through TLS.
  http.use_ssl = url.is_a?(URI::HTTPS)
  # NOTE(review): certificate verification is deliberately left disabled to keep
  # the original best-effort behaviour; this allows a man-in-the-middle to
  # influence the redirect chain. Consider VERIFY_PEER.
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE if http.use_ssl?

  request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })

  begin
    response = http.request(request)
  rescue StandardError
    # Network, DNS, TLS and timeout errors all mean "could not reach the page".
    # StandardError (not Exception) so Interrupt/SystemExit still propagate.
    return [uri_str, not_found.call]
  end

  # Redirect-class responses without a Location header (e.g. 304 Not Modified)
  # give us nothing to follow, so they are returned like any other response.
  location = response['location']
  unless response.is_a?(Net::HTTPRedirection) && location && !location.empty?
    return [uri_str, response]
  end

  if location =~ %r{\Ahttps?://}i
    target = location
  else
    # Resolve relative targets against the requested URL so that the scheme,
    # port and base path are preserved.
    begin
      target = URI.join(url.to_s, location).to_s
    rescue URI::Error
      # Unresolvable target: hand the raw value on and let the recursive call
      # report it as an invalid or unusable URI.
      target = location
    end
  end

  if failure_patterns.any? { |pattern| target =~ pattern }
    # got redirected to a page indicating failure, so act like it's a 404
    return [uri_str, not_found.call]
  end

  redirected_url, base_response = check_for_redirection(target, limit - 1)

  if login_patterns.any? { |pattern| redirected_url =~ pattern }
    # got redirected to a login page. return the ultimate response, but the previous url
    [uri_str, base_response]
  else
    [redirected_url, base_response]
  end
end
|
263
|
+
|
191
264
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_scrubber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.4
|
4
|
+
version: 0.7.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2013-
|
14
|
+
date: 2013-09-09 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rspec
|