url_scrubber 0.7.4 → 0.7.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,3 @@
1
1
  module UrlScrubber
2
- VERSION = "0.7.4"
2
+ VERSION = "0.7.5"
3
3
  end
data/lib/url_scrubber.rb CHANGED
@@ -1,4 +1,8 @@
1
1
  require "url_scrubber/version"
2
+ require 'net/http'
3
+ require 'uri'
4
+
5
+ USER_AGENT = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-us) AppleWebKit/534.1+ (KHTML, like Gecko) Version/5.0 Safari/533.16'
2
6
 
3
7
  module UrlScrubber
4
8
  def self.scrub(url)
@@ -149,6 +153,10 @@ module UrlScrubber
149
153
  def self.facebook(url)
150
154
  if url.match("/media/albums") || url.match("/media/set")
151
155
  url = url.match('\&') ? url.split('&',2)[0] : url
156
+ elsif url.include?("facebook.com/profile.php?id=")
157
+ # these were being truncated, they do redirect, but typically a 301 response is generated
158
+ # so the url is returned unchanged. Better than truncation.
159
+ url, http_response = check_for_redirection(url)
152
160
  else
153
161
  url.sub!(/facebook\.com\/home\.php[\?#!\/]+/, 'facebook.com/')
154
162
  url = drop_url_query!(url)
@@ -188,4 +196,69 @@ module UrlScrubber
188
196
  def self.yelp(url)
189
197
  url
190
198
  end
199
+
200
+ private
201
+
202
# Issues a GET for +uri_str+ and follows HTTP redirects, reporting where the
# chain ends.
#
# uri_str - String URL to request.
# limit   - Integer number of requests that may still be made (default 5).
#
# Returns a two-element Array [url, response]:
#   * url is the final URL reached, or the URL passed in when the chain ends
#     on a login page, cannot be followed, or the request fails.
#   * response is the last Net::HTTPResponse received, a synthetic
#     404 Net::HTTPClientError when the request fails or the redirect points at
#     a known "soft failure" page, or a CustomError for an unparsable URI.
#
# Raises RuntimeError ('Too many HTTP redirects') once the limit is used up.
def self.check_for_redirection(uri_str, limit = 5)
  login_patterns = [
    # pages that require user logins
    %r{^.*/login[^/]*$}
  ]

  failure_patterns = [
    # pages that give 200 codes but actually indicate a not found
    %r{linkedin\.com/home\?report%2Efailure}i
  ]

  # <= rather than == so a negative limit cannot recurse without bound.
  raise 'Too many HTTP redirects' if limit <= 0

  # Built on demand so every failure path hands back its own response object.
  not_found = lambda { Net::HTTPClientError.new('1.1', '404', 'Not Found') }

  begin
    url = URI.parse(uri_str)
  rescue URI::InvalidURIError => e
    # NOTE(review): CustomError is not defined in the visible part of this
    # file -- confirm it is available wherever this method is loaded.
    return [uri_str, CustomError.new(786, "Invalid URI #{uri_str} : #{e.message}")]
  end

  # Only http(s) URLs with a host can be requested; anything else (mailto:,
  # scheme-less strings, ...) is reported as not found instead of crashing
  # on the missing #request_uri.
  unless url.is_a?(URI::HTTP) && !url.host.to_s.empty?
    return [uri_str, not_found.call]
  end

  http = Net::HTTP.new(url.host, url.port)
  # Decide on TLS from the scheme so https on a non-default port still works;
  # the port check is kept for URLs that relied on the old behaviour.
  if url.scheme == 'https' || url.port == 443
    http.use_ssl = true
    # NOTE(review): certificate verification is disabled, so the peer is not
    # authenticated. Kept as-is to avoid changing which URLs resolve.
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  else
    http.use_ssl = false
  end
  request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })

  begin
    response = http.request(request)
  rescue StandardError
    # Network, TLS and timeout errors are treated as "not found". Signals and
    # SystemExit are deliberately no longer swallowed.
    return [uri_str, not_found.call]
  end

  return [uri_str, response] unless response.is_a?(Net::HTTPRedirection)

  # Some redirection-class responses (e.g. 304) carry no Location header.
  location = response['location']
  return [uri_str, response] if location.nil? || location.empty?

  if location[0, 4] == 'http'
    target = location
  else
    # Resolve relative locations against the current URL so that the scheme,
    # port and directory of the request are preserved.
    begin
      target = URI.join(uri_str, location).to_s
    rescue URI::Error
      return [uri_str, response]
    end
  end

  if failure_patterns.any? { |pattern| target.match(pattern) }
    # got redirected to a page indicating failure, so act like it's a 404
    return [uri_str, not_found.call]
  end

  redirected_url, base_response = check_for_redirection(target, limit - 1)

  if login_patterns.any? { |pattern| redirected_url.match(pattern) }
    # got redirected to a login page. return the ultimate response, but the previous url
    [uri_str, base_response]
  else
    [redirected_url, base_response]
  end
end
263
+
191
264
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_scrubber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.4
4
+ version: 0.7.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2013-08-28 00:00:00.000000000 Z
14
+ date: 2013-09-09 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rspec