url_scrubber 0.7.4 → 0.7.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,3 @@
1
1
  module UrlScrubber
2
- VERSION = "0.7.4"
2
+ VERSION = "0.7.5"
3
3
  end
data/lib/url_scrubber.rb CHANGED
@@ -1,4 +1,8 @@
1
1
  require "url_scrubber/version"
2
+ require 'net/http'
3
+ require 'uri'
4
+
5
+ USER_AGENT = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-us) AppleWebKit/534.1+ (KHTML, like Gecko) Version/5.0 Safari/533.16'
2
6
 
3
7
  module UrlScrubber
4
8
  def self.scrub(url)
@@ -149,6 +153,10 @@ module UrlScrubber
149
153
  def self.facebook(url)
150
154
  if url.match("/media/albums") || url.match("/media/set")
151
155
  url = url.match('\&') ? url.split('&',2)[0] : url
156
+ elsif url.include?("facebook.com/profile.php?id=")
157
+ # these were being truncated, they do redirect, but typically a 301 response is generated
158
+ # so the url is returned unchanged. Better than truncation.
159
+ url, http_response = check_for_redirection(url)
152
160
  else
153
161
  url.sub!(/facebook\.com\/home\.php[\?#!\/]+/, 'facebook.com/')
154
162
  url = drop_url_query!(url)
@@ -188,4 +196,69 @@ module UrlScrubber
188
196
  def self.yelp(url)
189
197
  url
190
198
  end
199
+
200
+ private
201
+
202
# Issues a GET for +uri_str+ and follows HTTP redirects (at most +limit+ hops),
# reporting where the chain ends.
#
# @param uri_str [String] absolute http(s) URL to probe
# @param limit [Integer] remaining number of requests allowed in the chain
# @return [Array] a two-element array +[url, response]+:
#   * +url+ is the final URL reached, or the URL of the hop *before* a login
#     page when the chain ends on one (see +login_patterns+);
#   * +response+ is the last Net::HTTPResponse, a synthetic 404
#     Net::HTTPClientError when the request failed or the redirect target
#     matches +failure_patterns+, or a CustomError (code 786) when +uri_str+
#     is not a usable HTTP(S) URL.
# @raise [RuntimeError] 'Too many HTTP redirects' once +limit+ reaches 0
#
# NOTE(review): a bare `private` does not apply to `def self.` methods, so this
# method stays publicly callable; use `private_class_method` if it must be hidden.
def self.check_for_redirection(uri_str, limit = 5)
  login_patterns = [
    # pages that require user logins
    %r{^.*/login[^/]*$}
  ]

  failure_patterns = [
    # pages that give 200 codes but actually indicate a not found
    %r{linkedin\.com/home\?report%2Efailure}i
  ]

  raise 'Too many HTTP redirects' if limit == 0

  begin
    url = URI.parse(uri_str)
  rescue URI::InvalidURIError => e
    return [uri_str, CustomError.new(786, "Invalid URI #{uri_str} : #{e.message}")]
  end

  # URI.parse accepts relative references and non-HTTP schemes; those have no
  # host/request_uri to send a GET to, so report them like any other bad URI.
  unless url.is_a?(URI::HTTP) && !url.host.to_s.empty?
    return [uri_str, CustomError.new(786, "Invalid URI #{uri_str} : not an absolute HTTP(S) URL")]
  end

  http = Net::HTTP.new(url.host, url.port)
  # Decide on TLS from the scheme; the port-443 check is kept for backward
  # compatibility with callers passing "http://host:443/...".
  http.use_ssl = (url.scheme == 'https' || url.port == 443)
  # NOTE(review): certificate verification is disabled, as in the original.
  # Only the status line and Location header are consumed here, but consider
  # switching to VERIFY_PEER.
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE if http.use_ssl?
  request = Net::HTTP::Get.new(url.request_uri, 'User-Agent' => USER_AGENT)

  begin
    response = http.request(request)
  rescue StandardError, Timeout::Error
    # Network-level failure (DNS, refused connection, timeout, TLS error...):
    # act like it's a 404. Signals and SystemExit are deliberately not rescued.
    return [uri_str, Net::HTTPClientError.new('1.1', '404', 'Not Found')]
  end

  return [uri_str, response] unless response.is_a?(Net::HTTPRedirection)

  # 3xx responses such as 304 Not Modified carry no Location header.
  location = response['location']
  return [uri_str, response] if location.nil? || location.empty?

  if location =~ %r{\Ahttps?://}i
    target = location
  else
    # Resolve relative redirects against the current URL so that scheme, port
    # and path-relative locations are preserved.
    begin
      target = URI.join(uri_str, location).to_s
    rescue URI::Error
      return [uri_str, response]
    end
  end

  if failure_patterns.any? { |pattern| target =~ pattern }
    # got redirected to a page indicating failure, so act like it's a 404
    return [uri_str, Net::HTTPClientError.new('1.1', '404', 'Not Found')]
  end

  redirected_url, base_response = check_for_redirection(target, limit - 1)

  if login_patterns.any? { |pattern| redirected_url =~ pattern }
    # got redirected to a login page. return the ultimate response, but the previous url
    [uri_str, base_response]
  else
    [redirected_url, base_response]
  end
end
263
+
191
264
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_scrubber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.4
4
+ version: 0.7.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2013-08-28 00:00:00.000000000 Z
14
+ date: 2013-09-09 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rspec