url_scrubber 0.7.10 → 0.7.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/url_scrubber.rb +61 -51
- data/lib/url_scrubber/version.rb +1 -1
- data/url_scrubber.gemspec +2 -2
- metadata +7 -17
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: dc1cea704c5440cf1114fe0ab1a013963bb0184f
|
4
|
+
data.tar.gz: 15e202c908bc49f2dd65aa1ab8bdb482bdc0fa71
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e12356b808e46f5e2bb292105eb69b001b9a7ef42c65908351bfac100fd5bcbc29cffe8740ad75e0e55ea1b2fd0a8981f101333ad3198718ce743a3fb2721f7f
|
7
|
+
data.tar.gz: 4b0961d123a5962dc027e82e11b43ac6135bebf94ac92e6462af1634e4734fb90d63dea6edd146ec09f76903416522d535a2f8f3845500b71b1bc213c0d21e0c
|
data/lib/url_scrubber.rb
CHANGED
@@ -112,6 +112,18 @@ module UrlScrubber
|
|
112
112
|
end
|
113
113
|
end
|
114
114
|
|
115
|
+
# Requirements:
|
116
|
+
# 1. must have http/https scheme
|
117
|
+
# 2. no "@" in any of the passed in url string
|
118
|
+
# 3. valid uri as determined by Addressable::URI
|
119
|
+
def self.valid_url?(url)
|
120
|
+
schemes = %w(http https)
|
121
|
+
parsed = URI.parse(url) or return false
|
122
|
+
schemes.include?(parsed.scheme) && !url.include?("@")
|
123
|
+
rescue URI::InvalidURIError
|
124
|
+
false
|
125
|
+
end
|
126
|
+
|
115
127
|
private
|
116
128
|
|
117
129
|
def self.downcase_domain(url)
|
@@ -238,68 +250,66 @@ module UrlScrubber
|
|
238
250
|
url
|
239
251
|
end
|
240
252
|
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
return [uri_str, CustomError.new(786, "Invalid URI #{uri_str} : #{e.message}") ]
|
260
|
-
end
|
253
|
+
def self.check_for_redirection(uri_str, limit = 5)
|
254
|
+
login_patterns = [
|
255
|
+
# pages that require user logins
|
256
|
+
%r{^.*/login[^/]*$}
|
257
|
+
]
|
258
|
+
|
259
|
+
failure_patterns = [
|
260
|
+
# pages that give 200 codes but actually indicate a not found
|
261
|
+
%r{linkedin\.com/home\?report%2Efailure}i
|
262
|
+
]
|
263
|
+
|
264
|
+
raise 'Too many HTTP redirects' if limit == 0
|
265
|
+
|
266
|
+
begin
|
267
|
+
url = URI.parse(uri_str)
|
268
|
+
rescue URI::InvalidURIError => e
|
269
|
+
return [uri_str, CustomError.new(786, "Invalid URI #{uri_str} : #{e.message}") ]
|
270
|
+
end
|
261
271
|
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
272
|
+
http = Net::HTTP.new(url.host, url.port)
|
273
|
+
if url.port == 443
|
274
|
+
http.use_ssl = true
|
275
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
276
|
+
else
|
277
|
+
http.use_ssl = false
|
278
|
+
end
|
279
|
+
request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })
|
280
|
+
|
281
|
+
begin
|
282
|
+
response = http.request(request)
|
283
|
+
rescue Exception => e
|
284
|
+
failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
|
285
|
+
return [uri_str, failure_response]
|
268
286
|
end
|
269
|
-
request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })
|
270
287
|
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
288
|
+
if response.is_a? Net::HTTPRedirection
|
289
|
+
if response['location'][0,4] == "http"
|
290
|
+
if failure_patterns.any? { |pattern| response['location'].match(pattern) }
|
291
|
+
# got redirected to a page indicating failure, so act like it's a 404
|
292
|
+
failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
|
293
|
+
return [uri_str, failure_response]
|
276
294
|
end
|
277
295
|
|
278
|
-
|
279
|
-
if response['location'][0,4] == "http"
|
280
|
-
if failure_patterns.any? { |pattern| response['location'].match(pattern) }
|
281
|
-
# got redirected to a page indicating failure, so act like it's a 404
|
282
|
-
failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
|
283
|
-
return [uri_str, failure_response]
|
284
|
-
end
|
285
|
-
|
286
|
-
redirected_url, base_response = check_for_redirection(response['location'], limit - 1)
|
287
|
-
|
288
|
-
if login_patterns.any? { |pattern| redirected_url.match(pattern) }
|
289
|
-
# got redirected to a login page. return the ultimate response, but the previous url
|
290
|
-
return [uri_str, base_response]
|
291
|
-
else
|
292
|
-
return [redirected_url, base_response]
|
293
|
-
end
|
296
|
+
redirected_url, base_response = check_for_redirection(response['location'], limit - 1)
|
294
297
|
|
298
|
+
if login_patterns.any? { |pattern| redirected_url.match(pattern) }
|
299
|
+
# got redirected to a login page. return the ultimate response, but the previous url
|
300
|
+
return [uri_str, base_response]
|
295
301
|
else
|
296
|
-
redir_url = "http://#{url.host}#{response['location']}"
|
297
|
-
redirected_url, base_response = check_for_redirection(redir_url, limit - 1)
|
298
302
|
return [redirected_url, base_response]
|
299
303
|
end
|
304
|
+
|
300
305
|
else
|
301
|
-
|
306
|
+
redir_url = "http://#{url.host}#{response['location']}"
|
307
|
+
redirected_url, base_response = check_for_redirection(redir_url, limit - 1)
|
308
|
+
return [redirected_url, base_response]
|
302
309
|
end
|
310
|
+
else
|
311
|
+
return [uri_str, response]
|
303
312
|
end
|
313
|
+
end
|
304
314
|
|
305
315
|
end
|
data/lib/url_scrubber/version.rb
CHANGED
data/url_scrubber.gemspec
CHANGED
@@ -2,8 +2,8 @@
|
|
2
2
|
require File.expand_path('../lib/url_scrubber/version', __FILE__)
|
3
3
|
|
4
4
|
Gem::Specification.new do |gem|
|
5
|
-
gem.authors = ["Colin Langton", "Christopher Maujean", "David Hillard"]
|
6
|
-
gem.email = ["colin@hoteldelta.net", "cmaujean@brandle.net", "dhillard@brandle.net"]
|
5
|
+
gem.authors = ["Colin Langton", "Christopher Maujean", "David Hillard", "Edgar Abadines"]
|
6
|
+
gem.email = ["colin@hoteldelta.net", "cmaujean@brandle.net", "dhillard@brandle.net", "ed@brandle.net"]
|
7
7
|
gem.description = %q{Remove extraneous bits from URLs, follow redirects, identify social media urls, etc.}
|
8
8
|
gem.summary = %q{Clean up URLs.}
|
9
9
|
gem.homepage = "http://brandle.net"
|
metadata
CHANGED
@@ -1,22 +1,21 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_scrubber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.10
|
5
|
-
prerelease:
|
4
|
+
version: 0.7.11
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Colin Langton
|
9
8
|
- Christopher Maujean
|
10
9
|
- David Hillard
|
10
|
+
- Edgar Abadines
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date:
|
14
|
+
date: 2014-02-03 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rspec
|
18
18
|
requirement: !ruby/object:Gem::Requirement
|
19
|
-
none: false
|
20
19
|
requirements:
|
21
20
|
- - ~>
|
22
21
|
- !ruby/object:Gem::Version
|
@@ -24,7 +23,6 @@ dependencies:
|
|
24
23
|
type: :development
|
25
24
|
prerelease: false
|
26
25
|
version_requirements: !ruby/object:Gem::Requirement
|
27
|
-
none: false
|
28
26
|
requirements:
|
29
27
|
- - ~>
|
30
28
|
- !ruby/object:Gem::Version
|
@@ -32,7 +30,6 @@ dependencies:
|
|
32
30
|
- !ruby/object:Gem::Dependency
|
33
31
|
name: guard-bundler
|
34
32
|
requirement: !ruby/object:Gem::Requirement
|
35
|
-
none: false
|
36
33
|
requirements:
|
37
34
|
- - ~>
|
38
35
|
- !ruby/object:Gem::Version
|
@@ -40,7 +37,6 @@ dependencies:
|
|
40
37
|
type: :development
|
41
38
|
prerelease: false
|
42
39
|
version_requirements: !ruby/object:Gem::Requirement
|
43
|
-
none: false
|
44
40
|
requirements:
|
45
41
|
- - ~>
|
46
42
|
- !ruby/object:Gem::Version
|
@@ -48,7 +44,6 @@ dependencies:
|
|
48
44
|
- !ruby/object:Gem::Dependency
|
49
45
|
name: guard-rspec
|
50
46
|
requirement: !ruby/object:Gem::Requirement
|
51
|
-
none: false
|
52
47
|
requirements:
|
53
48
|
- - ~>
|
54
49
|
- !ruby/object:Gem::Version
|
@@ -56,7 +51,6 @@ dependencies:
|
|
56
51
|
type: :development
|
57
52
|
prerelease: false
|
58
53
|
version_requirements: !ruby/object:Gem::Requirement
|
59
|
-
none: false
|
60
54
|
requirements:
|
61
55
|
- - ~>
|
62
56
|
- !ruby/object:Gem::Version
|
@@ -64,7 +58,6 @@ dependencies:
|
|
64
58
|
- !ruby/object:Gem::Dependency
|
65
59
|
name: terminal-notifier-guard
|
66
60
|
requirement: !ruby/object:Gem::Requirement
|
67
|
-
none: false
|
68
61
|
requirements:
|
69
62
|
- - ! '>='
|
70
63
|
- !ruby/object:Gem::Version
|
@@ -72,7 +65,6 @@ dependencies:
|
|
72
65
|
type: :development
|
73
66
|
prerelease: false
|
74
67
|
version_requirements: !ruby/object:Gem::Requirement
|
75
|
-
none: false
|
76
68
|
requirements:
|
77
69
|
- - ! '>='
|
78
70
|
- !ruby/object:Gem::Version
|
@@ -80,7 +72,6 @@ dependencies:
|
|
80
72
|
- !ruby/object:Gem::Dependency
|
81
73
|
name: rb-fsevent
|
82
74
|
requirement: !ruby/object:Gem::Requirement
|
83
|
-
none: false
|
84
75
|
requirements:
|
85
76
|
- - ~>
|
86
77
|
- !ruby/object:Gem::Version
|
@@ -88,7 +79,6 @@ dependencies:
|
|
88
79
|
type: :development
|
89
80
|
prerelease: false
|
90
81
|
version_requirements: !ruby/object:Gem::Requirement
|
91
|
-
none: false
|
92
82
|
requirements:
|
93
83
|
- - ~>
|
94
84
|
- !ruby/object:Gem::Version
|
@@ -99,6 +89,7 @@ email:
|
|
99
89
|
- colin@hoteldelta.net
|
100
90
|
- cmaujean@brandle.net
|
101
91
|
- dhillard@brandle.net
|
92
|
+
- ed@brandle.net
|
102
93
|
executables: []
|
103
94
|
extensions: []
|
104
95
|
extra_rdoc_files: []
|
@@ -116,27 +107,26 @@ files:
|
|
116
107
|
- url_scrubber.gemspec
|
117
108
|
homepage: http://brandle.net
|
118
109
|
licenses: []
|
110
|
+
metadata: {}
|
119
111
|
post_install_message:
|
120
112
|
rdoc_options: []
|
121
113
|
require_paths:
|
122
114
|
- lib
|
123
115
|
required_ruby_version: !ruby/object:Gem::Requirement
|
124
|
-
none: false
|
125
116
|
requirements:
|
126
117
|
- - ! '>='
|
127
118
|
- !ruby/object:Gem::Version
|
128
119
|
version: '0'
|
129
120
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
130
|
-
none: false
|
131
121
|
requirements:
|
132
122
|
- - ! '>='
|
133
123
|
- !ruby/object:Gem::Version
|
134
124
|
version: '0'
|
135
125
|
requirements: []
|
136
126
|
rubyforge_project:
|
137
|
-
rubygems_version: 1.
|
127
|
+
rubygems_version: 2.1.10
|
138
128
|
signing_key:
|
139
|
-
specification_version:
|
129
|
+
specification_version: 4
|
140
130
|
summary: Clean up URLs.
|
141
131
|
test_files:
|
142
132
|
- spec/spec_helper.rb
|