url_scrubber 0.7.4 → 0.7.5
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/url_scrubber/version.rb +1 -1
- data/lib/url_scrubber.rb +73 -0
- metadata +2 -2
data/lib/url_scrubber/version.rb
CHANGED
data/lib/url_scrubber.rb
CHANGED
@@ -1,4 +1,8 @@
|
|
1
1
|
require "url_scrubber/version"
|
2
|
+
require 'net/http'
|
3
|
+
require 'uri'
|
4
|
+
|
5
|
+
# Browser-like User-Agent header value sent with outgoing HTTP requests.
# Frozen so the shared constant cannot be mutated in place by any caller.
USER_AGENT = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-us) AppleWebKit/534.1+ (KHTML, like Gecko) Version/5.0 Safari/533.16'.freeze
|
2
6
|
|
3
7
|
module UrlScrubber
|
4
8
|
def self.scrub(url)
|
@@ -149,6 +153,10 @@ module UrlScrubber
|
|
149
153
|
def self.facebook(url)
|
150
154
|
if url.match("/media/albums") || url.match("/media/set")
|
151
155
|
url = url.match('\&') ? url.split('&',2)[0] : url
|
156
|
+
elsif url.include?("facebook.com/profile.php?id=")
|
157
|
+
# these were being truncated, they do redirect, but typically a 301 response is generated
|
158
|
+
# so the url is returned unchanged. Better than truncation.
|
159
|
+
url, http_response = check_for_redirection(url)
|
152
160
|
else
|
153
161
|
url.sub!(/facebook\.com\/home\.php[\?#!\/]+/, 'facebook.com/')
|
154
162
|
url = drop_url_query!(url)
|
@@ -188,4 +196,69 @@ module UrlScrubber
|
|
188
196
|
# Yelp handler: performs no scrubbing and returns the given url unchanged.
def self.yelp(url)
|
189
197
|
url
|
190
198
|
end
|
199
|
+
|
200
|
+
private
|
201
|
+
|
202
|
+
# Requests +uri_str+ with a GET (sending USER_AGENT) and follows HTTP
# redirects, recursing on each Location target with +limit+ reduced by one.
#
# @param uri_str [String] the URL to request
# @param limit [Integer] how many more requests may be issued
# @return [Array] a two-element array +[url, response]+:
#   * no redirect: the original +uri_str+ and the response received
#   * redirect chain followed: the final URL and its response
#   * chain ending on a login page (absolute redirects only): the original
#     +uri_str+ with the final response
#   * redirect to a known "soft failure" page, a request that raised, or a
#     URI that is not fetchable over HTTP: the original +uri_str+ with a
#     synthetic 404 response
#   * unparseable URI: the original +uri_str+ with a CustomError
# @raise [RuntimeError] 'Too many HTTP redirects' once +limit+ is used up
#
# NOTE(review): a bare `private` does not apply to methods defined with
# `def self.`; use `private_class_method` if this is meant to be hidden.
def self.check_for_redirection(uri_str, limit = 5)
  login_patterns = [
    # pages that require user logins
    %r{^.*/login[^/]*$}
  ]

  failure_patterns = [
    # pages that give 200 codes but actually indicate a not found
    %r{linkedin\.com/home\?report%2Efailure}i
  ]

  # `<=` rather than `==` so a negative limit cannot recurse without bound.
  raise 'Too many HTTP redirects' if limit <= 0

  # Builds the synthetic 404 returned whenever the target cannot be fetched.
  not_found = lambda { Net::HTTPClientError.new('1.1', '404', 'Not Found') }

  begin
    url = URI.parse(uri_str)
  rescue URI::InvalidURIError => e
    # NOTE(review): CustomError is not defined in this method; presumably it
    # is declared elsewhere in the project -- confirm.
    return [uri_str, CustomError.new(786, "Invalid URI #{uri_str} : #{e.message}")]
  end

  # Only http/https URIs with a host respond to #request_uri; anything else
  # (mailto:, bare words, ...) cannot be fetched, so report it as not found.
  unless url.is_a?(URI::HTTP) && url.host && !url.host.empty?
    return [uri_str, not_found.call]
  end

  http = Net::HTTP.new(url.host, url.port)
  # Decide on TLS from the scheme so https on a non-standard port works;
  # the port-443 check is kept for backward compatibility.
  if url.scheme == 'https' || url.port == 443
    http.use_ssl = true
    # NOTE(review): certificate verification is disabled here, so the peer
    # is not authenticated. Left as-is; confirm this is intentional.
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  else
    http.use_ssl = false
  end
  request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })

  begin
    response = http.request(request)
  rescue StandardError
    # Network/protocol failures are treated as "not found". StandardError
    # (not Exception) so interrupts and SystemExit still propagate.
    return [uri_str, not_found.call]
  end

  return [uri_str, response] unless response.is_a?(Net::HTTPRedirection)

  location = response['location']
  # A redirect without a usable Location header cannot be followed.
  return [uri_str, response] if location.nil? || location.empty?

  if location[0, 4] == 'http'
    if failure_patterns.any? { |pattern| location.match(pattern) }
      # got redirected to a page indicating failure, so act like it's a 404
      return [uri_str, not_found.call]
    end

    redirected_url, base_response = check_for_redirection(location, limit - 1)

    if login_patterns.any? { |pattern| redirected_url.match(pattern) }
      # got redirected to a login page. return the ultimate response, but the previous url
      return [uri_str, base_response]
    end

    [redirected_url, base_response]
  else
    begin
      # Resolve against the current URL so scheme, port and relative paths
      # are preserved instead of forcing "http://host".
      redir_url = url.merge(location).to_s
    rescue URI::Error
      return [uri_str, response]
    end
    check_for_redirection(redir_url, limit - 1)
  end
end
|
263
|
+
|
191
264
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_scrubber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.4
|
4
|
+
version: 0.7.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2013-
|
14
|
+
date: 2013-09-09 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rspec
|