url_scrubber 0.7.10 → 0.7.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: dc1cea704c5440cf1114fe0ab1a013963bb0184f
4
+ data.tar.gz: 15e202c908bc49f2dd65aa1ab8bdb482bdc0fa71
5
+ SHA512:
6
+ metadata.gz: e12356b808e46f5e2bb292105eb69b001b9a7ef42c65908351bfac100fd5bcbc29cffe8740ad75e0e55ea1b2fd0a8981f101333ad3198718ce743a3fb2721f7f
7
+ data.tar.gz: 4b0961d123a5962dc027e82e11b43ac6135bebf94ac92e6462af1634e4734fb90d63dea6edd146ec09f76903416522d535a2f8f3845500b71b1bc213c0d21e0c
data/lib/url_scrubber.rb CHANGED
@@ -112,6 +112,18 @@ module UrlScrubber
112
112
  end
113
113
  end
114
114
 
115
+ # Requirements:
116
+ # 1. must have http/https scheme
117
+ # 2. no "@" anywhere in the passed-in url string
118
+ # 3. valid uri as determined by URI.parse (i.e. it does not raise URI::InvalidURIError)
119
+ def self.valid_url?(url)
120
+ schemes = %w(http https)
121
+ parsed = URI.parse(url) or return false
122
+ schemes.include?(parsed.scheme) && !url.include?("@")
123
+ rescue URI::InvalidURIError
124
+ false
125
+ end
126
+
115
127
  private
116
128
 
117
129
  def self.downcase_domain(url)
@@ -238,68 +250,66 @@ module UrlScrubber
238
250
  url
239
251
  end
240
252
 
241
- private
242
-
243
- def self.check_for_redirection(uri_str, limit = 5)
244
- login_patterns = [
245
- # pages that require user logins
246
- %r{^.*/login[^/]*$}
247
- ]
248
-
249
- failure_patterns = [
250
- # pages that give 200 codes but actually indicate a not found
251
- %r{linkedin\.com/home\?report%2Efailure}i
252
- ]
253
-
254
- raise 'Too many HTTP redirects' if limit == 0
255
-
256
- begin
257
- url = URI.parse(uri_str)
258
- rescue URI::InvalidURIError => e
259
- return [uri_str, CustomError.new(786, "Invalid URI #{uri_str} : #{e.message}") ]
260
- end
253
+ def self.check_for_redirection(uri_str, limit = 5)
254
+ login_patterns = [
255
+ # pages that require user logins
256
+ %r{^.*/login[^/]*$}
257
+ ]
258
+
259
+ failure_patterns = [
260
+ # pages that give 200 codes but actually indicate a not found
261
+ %r{linkedin\.com/home\?report%2Efailure}i
262
+ ]
263
+
264
+ raise 'Too many HTTP redirects' if limit == 0
265
+
266
+ begin
267
+ url = URI.parse(uri_str)
268
+ rescue URI::InvalidURIError => e
269
+ return [uri_str, CustomError.new(786, "Invalid URI #{uri_str} : #{e.message}") ]
270
+ end
261
271
 
262
- http = Net::HTTP.new(url.host, url.port)
263
- if url.port == 443
264
- http.use_ssl = true
265
- http.verify_mode = OpenSSL::SSL::VERIFY_NONE
266
- else
267
- http.use_ssl = false
272
+ http = Net::HTTP.new(url.host, url.port)
273
+ if url.port == 443
274
+ http.use_ssl = true
275
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
276
+ else
277
+ http.use_ssl = false
278
+ end
279
+ request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })
280
+
281
+ begin
282
+ response = http.request(request)
283
+ rescue Exception => e
284
+ failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
285
+ return [uri_str, failure_response]
268
286
  end
269
- request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })
270
287
 
271
- begin
272
- response = http.request(request)
273
- rescue Exception => e
274
- failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
275
- return [uri_str, failure_response]
288
+ if response.is_a? Net::HTTPRedirection
289
+ if response['location'][0,4] == "http"
290
+ if failure_patterns.any? { |pattern| response['location'].match(pattern) }
291
+ # got redirected to a page indicating failure, so act like it's a 404
292
+ failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
293
+ return [uri_str, failure_response]
276
294
  end
277
295
 
278
- if response.is_a? Net::HTTPRedirection
279
- if response['location'][0,4] == "http"
280
- if failure_patterns.any? { |pattern| response['location'].match(pattern) }
281
- # got redirected to a page indicating failure, so act like it's a 404
282
- failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
283
- return [uri_str, failure_response]
284
- end
285
-
286
- redirected_url, base_response = check_for_redirection(response['location'], limit - 1)
287
-
288
- if login_patterns.any? { |pattern| redirected_url.match(pattern) }
289
- # got redirected to a login page. return the ultimate response, but the previous url
290
- return [uri_str, base_response]
291
- else
292
- return [redirected_url, base_response]
293
- end
296
+ redirected_url, base_response = check_for_redirection(response['location'], limit - 1)
294
297
 
298
+ if login_patterns.any? { |pattern| redirected_url.match(pattern) }
299
+ # got redirected to a login page. return the ultimate response, but the previous url
300
+ return [uri_str, base_response]
295
301
  else
296
- redir_url = "http://#{url.host}#{response['location']}"
297
- redirected_url, base_response = check_for_redirection(redir_url, limit - 1)
298
302
  return [redirected_url, base_response]
299
303
  end
304
+
300
305
  else
301
- return [uri_str, response]
306
+ redir_url = "http://#{url.host}#{response['location']}"
307
+ redirected_url, base_response = check_for_redirection(redir_url, limit - 1)
308
+ return [redirected_url, base_response]
302
309
  end
310
+ else
311
+ return [uri_str, response]
303
312
  end
313
+ end
304
314
 
305
315
  end
@@ -1,3 +1,3 @@
1
1
  module UrlScrubber
2
- VERSION = "0.7.10"
2
+ VERSION = "0.7.11"
3
3
  end
data/url_scrubber.gemspec CHANGED
@@ -2,8 +2,8 @@
2
2
  require File.expand_path('../lib/url_scrubber/version', __FILE__)
3
3
 
4
4
  Gem::Specification.new do |gem|
5
- gem.authors = ["Colin Langton", "Christopher Maujean", "David Hillard"]
6
- gem.email = ["colin@hoteldelta.net", "cmaujean@brandle.net", "dhillard@brandle.net"]
5
+ gem.authors = ["Colin Langton", "Christopher Maujean", "David Hillard", "Edgar Abadines"]
6
+ gem.email = ["colin@hoteldelta.net", "cmaujean@brandle.net", "dhillard@brandle.net", "ed@brandle.net"]
7
7
  gem.description = %q{Remove extraneous bits from URLs, follow redirects, identify social media urls, etc.}
8
8
  gem.summary = %q{Clean up URLs.}
9
9
  gem.homepage = "http://brandle.net"
metadata CHANGED
@@ -1,22 +1,21 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: url_scrubber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.10
5
- prerelease:
4
+ version: 0.7.11
6
5
  platform: ruby
7
6
  authors:
8
7
  - Colin Langton
9
8
  - Christopher Maujean
10
9
  - David Hillard
10
+ - Edgar Abadines
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2013-09-27 00:00:00.000000000 Z
14
+ date: 2014-02-03 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rspec
18
18
  requirement: !ruby/object:Gem::Requirement
19
- none: false
20
19
  requirements:
21
20
  - - ~>
22
21
  - !ruby/object:Gem::Version
@@ -24,7 +23,6 @@ dependencies:
24
23
  type: :development
25
24
  prerelease: false
26
25
  version_requirements: !ruby/object:Gem::Requirement
27
- none: false
28
26
  requirements:
29
27
  - - ~>
30
28
  - !ruby/object:Gem::Version
@@ -32,7 +30,6 @@ dependencies:
32
30
  - !ruby/object:Gem::Dependency
33
31
  name: guard-bundler
34
32
  requirement: !ruby/object:Gem::Requirement
35
- none: false
36
33
  requirements:
37
34
  - - ~>
38
35
  - !ruby/object:Gem::Version
@@ -40,7 +37,6 @@ dependencies:
40
37
  type: :development
41
38
  prerelease: false
42
39
  version_requirements: !ruby/object:Gem::Requirement
43
- none: false
44
40
  requirements:
45
41
  - - ~>
46
42
  - !ruby/object:Gem::Version
@@ -48,7 +44,6 @@ dependencies:
48
44
  - !ruby/object:Gem::Dependency
49
45
  name: guard-rspec
50
46
  requirement: !ruby/object:Gem::Requirement
51
- none: false
52
47
  requirements:
53
48
  - - ~>
54
49
  - !ruby/object:Gem::Version
@@ -56,7 +51,6 @@ dependencies:
56
51
  type: :development
57
52
  prerelease: false
58
53
  version_requirements: !ruby/object:Gem::Requirement
59
- none: false
60
54
  requirements:
61
55
  - - ~>
62
56
  - !ruby/object:Gem::Version
@@ -64,7 +58,6 @@ dependencies:
64
58
  - !ruby/object:Gem::Dependency
65
59
  name: terminal-notifier-guard
66
60
  requirement: !ruby/object:Gem::Requirement
67
- none: false
68
61
  requirements:
69
62
  - - ! '>='
70
63
  - !ruby/object:Gem::Version
@@ -72,7 +65,6 @@ dependencies:
72
65
  type: :development
73
66
  prerelease: false
74
67
  version_requirements: !ruby/object:Gem::Requirement
75
- none: false
76
68
  requirements:
77
69
  - - ! '>='
78
70
  - !ruby/object:Gem::Version
@@ -80,7 +72,6 @@ dependencies:
80
72
  - !ruby/object:Gem::Dependency
81
73
  name: rb-fsevent
82
74
  requirement: !ruby/object:Gem::Requirement
83
- none: false
84
75
  requirements:
85
76
  - - ~>
86
77
  - !ruby/object:Gem::Version
@@ -88,7 +79,6 @@ dependencies:
88
79
  type: :development
89
80
  prerelease: false
90
81
  version_requirements: !ruby/object:Gem::Requirement
91
- none: false
92
82
  requirements:
93
83
  - - ~>
94
84
  - !ruby/object:Gem::Version
@@ -99,6 +89,7 @@ email:
99
89
  - colin@hoteldelta.net
100
90
  - cmaujean@brandle.net
101
91
  - dhillard@brandle.net
92
+ - ed@brandle.net
102
93
  executables: []
103
94
  extensions: []
104
95
  extra_rdoc_files: []
@@ -116,27 +107,26 @@ files:
116
107
  - url_scrubber.gemspec
117
108
  homepage: http://brandle.net
118
109
  licenses: []
110
+ metadata: {}
119
111
  post_install_message:
120
112
  rdoc_options: []
121
113
  require_paths:
122
114
  - lib
123
115
  required_ruby_version: !ruby/object:Gem::Requirement
124
- none: false
125
116
  requirements:
126
117
  - - ! '>='
127
118
  - !ruby/object:Gem::Version
128
119
  version: '0'
129
120
  required_rubygems_version: !ruby/object:Gem::Requirement
130
- none: false
131
121
  requirements:
132
122
  - - ! '>='
133
123
  - !ruby/object:Gem::Version
134
124
  version: '0'
135
125
  requirements: []
136
126
  rubyforge_project:
137
- rubygems_version: 1.8.21
127
+ rubygems_version: 2.1.10
138
128
  signing_key:
139
- specification_version: 3
129
+ specification_version: 4
140
130
  summary: Clean up URLs.
141
131
  test_files:
142
132
  - spec/spec_helper.rb