url_scrubber 0.7.10 → 0.7.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/url_scrubber.rb +61 -51
- data/lib/url_scrubber/version.rb +1 -1
- data/url_scrubber.gemspec +2 -2
- metadata +7 -17
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: dc1cea704c5440cf1114fe0ab1a013963bb0184f
|
4
|
+
data.tar.gz: 15e202c908bc49f2dd65aa1ab8bdb482bdc0fa71
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e12356b808e46f5e2bb292105eb69b001b9a7ef42c65908351bfac100fd5bcbc29cffe8740ad75e0e55ea1b2fd0a8981f101333ad3198718ce743a3fb2721f7f
|
7
|
+
data.tar.gz: 4b0961d123a5962dc027e82e11b43ac6135bebf94ac92e6462af1634e4734fb90d63dea6edd146ec09f76903416522d535a2f8f3845500b71b1bc213c0d21e0c
|
data/lib/url_scrubber.rb
CHANGED
@@ -112,6 +112,18 @@ module UrlScrubber
|
|
112
112
|
end
|
113
113
|
end
|
114
114
|
|
115
|
+
# Requirements:
|
116
|
+
# 1. must have http/https scheme
|
117
|
+
# 2. no "@" anywhere in the passed-in url string
|
118
|
+
# 3. valid uri as determined by URI.parse
|
119
|
+
def self.valid_url?(url)
|
120
|
+
schemes = %w(http https)
|
121
|
+
parsed = URI.parse(url) or return false
|
122
|
+
schemes.include?(parsed.scheme) && !url.include?("@")
|
123
|
+
rescue URI::InvalidURIError
|
124
|
+
false
|
125
|
+
end
|
126
|
+
|
115
127
|
private
|
116
128
|
|
117
129
|
def self.downcase_domain(url)
|
@@ -238,68 +250,66 @@ module UrlScrubber
|
|
238
250
|
url
|
239
251
|
end
|
240
252
|
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
return [uri_str, CustomError.new(786, "Invalid URI #{uri_str} : #{e.message}") ]
|
260
|
-
end
|
253
|
+
def self.check_for_redirection(uri_str, limit = 5)
|
254
|
+
login_patterns = [
|
255
|
+
# pages that require user logins
|
256
|
+
%r{^.*/login[^/]*$}
|
257
|
+
]
|
258
|
+
|
259
|
+
failure_patterns = [
|
260
|
+
# pages that give 200 codes but actually indicate a not found
|
261
|
+
%r{linkedin\.com/home\?report%2Efailure}i
|
262
|
+
]
|
263
|
+
|
264
|
+
raise 'Too many HTTP redirects' if limit == 0
|
265
|
+
|
266
|
+
begin
|
267
|
+
url = URI.parse(uri_str)
|
268
|
+
rescue URI::InvalidURIError => e
|
269
|
+
return [uri_str, CustomError.new(786, "Invalid URI #{uri_str} : #{e.message}") ]
|
270
|
+
end
|
261
271
|
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
272
|
+
http = Net::HTTP.new(url.host, url.port)
|
273
|
+
if url.port == 443
|
274
|
+
http.use_ssl = true
|
275
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
276
|
+
else
|
277
|
+
http.use_ssl = false
|
278
|
+
end
|
279
|
+
request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })
|
280
|
+
|
281
|
+
begin
|
282
|
+
response = http.request(request)
|
283
|
+
rescue Exception => e
|
284
|
+
failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
|
285
|
+
return [uri_str, failure_response]
|
268
286
|
end
|
269
|
-
request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })
|
270
287
|
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
288
|
+
if response.is_a? Net::HTTPRedirection
|
289
|
+
if response['location'][0,4] == "http"
|
290
|
+
if failure_patterns.any? { |pattern| response['location'].match(pattern) }
|
291
|
+
# got redirected to a page indicating failure, so act like it's a 404
|
292
|
+
failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
|
293
|
+
return [uri_str, failure_response]
|
276
294
|
end
|
277
295
|
|
278
|
-
|
279
|
-
if response['location'][0,4] == "http"
|
280
|
-
if failure_patterns.any? { |pattern| response['location'].match(pattern) }
|
281
|
-
# got redirected to a page indicating failure, so act like it's a 404
|
282
|
-
failure_response = Net::HTTPClientError.new('1.1', '404', 'Not Found')
|
283
|
-
return [uri_str, failure_response]
|
284
|
-
end
|
285
|
-
|
286
|
-
redirected_url, base_response = check_for_redirection(response['location'], limit - 1)
|
287
|
-
|
288
|
-
if login_patterns.any? { |pattern| redirected_url.match(pattern) }
|
289
|
-
# got redirected to a login page. return the ultimate response, but the previous url
|
290
|
-
return [uri_str, base_response]
|
291
|
-
else
|
292
|
-
return [redirected_url, base_response]
|
293
|
-
end
|
296
|
+
redirected_url, base_response = check_for_redirection(response['location'], limit - 1)
|
294
297
|
|
298
|
+
if login_patterns.any? { |pattern| redirected_url.match(pattern) }
|
299
|
+
# got redirected to a login page. return the ultimate response, but the previous url
|
300
|
+
return [uri_str, base_response]
|
295
301
|
else
|
296
|
-
redir_url = "http://#{url.host}#{response['location']}"
|
297
|
-
redirected_url, base_response = check_for_redirection(redir_url, limit - 1)
|
298
302
|
return [redirected_url, base_response]
|
299
303
|
end
|
304
|
+
|
300
305
|
else
|
301
|
-
|
306
|
+
redir_url = "http://#{url.host}#{response['location']}"
|
307
|
+
redirected_url, base_response = check_for_redirection(redir_url, limit - 1)
|
308
|
+
return [redirected_url, base_response]
|
302
309
|
end
|
310
|
+
else
|
311
|
+
return [uri_str, response]
|
303
312
|
end
|
313
|
+
end
|
304
314
|
|
305
315
|
end
|
data/lib/url_scrubber/version.rb
CHANGED
data/url_scrubber.gemspec
CHANGED
@@ -2,8 +2,8 @@
|
|
2
2
|
require File.expand_path('../lib/url_scrubber/version', __FILE__)
|
3
3
|
|
4
4
|
Gem::Specification.new do |gem|
|
5
|
-
gem.authors = ["Colin Langton", "Christopher Maujean", "David Hillard"]
|
6
|
-
gem.email = ["colin@hoteldelta.net", "cmaujean@brandle.net", "dhillard@brandle.net"]
|
5
|
+
gem.authors = ["Colin Langton", "Christopher Maujean", "David Hillard", "Edgar Abadines"]
|
6
|
+
gem.email = ["colin@hoteldelta.net", "cmaujean@brandle.net", "dhillard@brandle.net", "ed@brandle.net"]
|
7
7
|
gem.description = %q{Remove extraneous bits from URLs, follow redirects, identify social media urls, etc.}
|
8
8
|
gem.summary = %q{Clean up URLs.}
|
9
9
|
gem.homepage = "http://brandle.net"
|
metadata
CHANGED
@@ -1,22 +1,21 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: url_scrubber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.10
|
5
|
-
prerelease:
|
4
|
+
version: 0.7.11
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Colin Langton
|
9
8
|
- Christopher Maujean
|
10
9
|
- David Hillard
|
10
|
+
- Edgar Abadines
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date:
|
14
|
+
date: 2014-02-03 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: rspec
|
18
18
|
requirement: !ruby/object:Gem::Requirement
|
19
|
-
none: false
|
20
19
|
requirements:
|
21
20
|
- - ~>
|
22
21
|
- !ruby/object:Gem::Version
|
@@ -24,7 +23,6 @@ dependencies:
|
|
24
23
|
type: :development
|
25
24
|
prerelease: false
|
26
25
|
version_requirements: !ruby/object:Gem::Requirement
|
27
|
-
none: false
|
28
26
|
requirements:
|
29
27
|
- - ~>
|
30
28
|
- !ruby/object:Gem::Version
|
@@ -32,7 +30,6 @@ dependencies:
|
|
32
30
|
- !ruby/object:Gem::Dependency
|
33
31
|
name: guard-bundler
|
34
32
|
requirement: !ruby/object:Gem::Requirement
|
35
|
-
none: false
|
36
33
|
requirements:
|
37
34
|
- - ~>
|
38
35
|
- !ruby/object:Gem::Version
|
@@ -40,7 +37,6 @@ dependencies:
|
|
40
37
|
type: :development
|
41
38
|
prerelease: false
|
42
39
|
version_requirements: !ruby/object:Gem::Requirement
|
43
|
-
none: false
|
44
40
|
requirements:
|
45
41
|
- - ~>
|
46
42
|
- !ruby/object:Gem::Version
|
@@ -48,7 +44,6 @@ dependencies:
|
|
48
44
|
- !ruby/object:Gem::Dependency
|
49
45
|
name: guard-rspec
|
50
46
|
requirement: !ruby/object:Gem::Requirement
|
51
|
-
none: false
|
52
47
|
requirements:
|
53
48
|
- - ~>
|
54
49
|
- !ruby/object:Gem::Version
|
@@ -56,7 +51,6 @@ dependencies:
|
|
56
51
|
type: :development
|
57
52
|
prerelease: false
|
58
53
|
version_requirements: !ruby/object:Gem::Requirement
|
59
|
-
none: false
|
60
54
|
requirements:
|
61
55
|
- - ~>
|
62
56
|
- !ruby/object:Gem::Version
|
@@ -64,7 +58,6 @@ dependencies:
|
|
64
58
|
- !ruby/object:Gem::Dependency
|
65
59
|
name: terminal-notifier-guard
|
66
60
|
requirement: !ruby/object:Gem::Requirement
|
67
|
-
none: false
|
68
61
|
requirements:
|
69
62
|
- - ! '>='
|
70
63
|
- !ruby/object:Gem::Version
|
@@ -72,7 +65,6 @@ dependencies:
|
|
72
65
|
type: :development
|
73
66
|
prerelease: false
|
74
67
|
version_requirements: !ruby/object:Gem::Requirement
|
75
|
-
none: false
|
76
68
|
requirements:
|
77
69
|
- - ! '>='
|
78
70
|
- !ruby/object:Gem::Version
|
@@ -80,7 +72,6 @@ dependencies:
|
|
80
72
|
- !ruby/object:Gem::Dependency
|
81
73
|
name: rb-fsevent
|
82
74
|
requirement: !ruby/object:Gem::Requirement
|
83
|
-
none: false
|
84
75
|
requirements:
|
85
76
|
- - ~>
|
86
77
|
- !ruby/object:Gem::Version
|
@@ -88,7 +79,6 @@ dependencies:
|
|
88
79
|
type: :development
|
89
80
|
prerelease: false
|
90
81
|
version_requirements: !ruby/object:Gem::Requirement
|
91
|
-
none: false
|
92
82
|
requirements:
|
93
83
|
- - ~>
|
94
84
|
- !ruby/object:Gem::Version
|
@@ -99,6 +89,7 @@ email:
|
|
99
89
|
- colin@hoteldelta.net
|
100
90
|
- cmaujean@brandle.net
|
101
91
|
- dhillard@brandle.net
|
92
|
+
- ed@brandle.net
|
102
93
|
executables: []
|
103
94
|
extensions: []
|
104
95
|
extra_rdoc_files: []
|
@@ -116,27 +107,26 @@ files:
|
|
116
107
|
- url_scrubber.gemspec
|
117
108
|
homepage: http://brandle.net
|
118
109
|
licenses: []
|
110
|
+
metadata: {}
|
119
111
|
post_install_message:
|
120
112
|
rdoc_options: []
|
121
113
|
require_paths:
|
122
114
|
- lib
|
123
115
|
required_ruby_version: !ruby/object:Gem::Requirement
|
124
|
-
none: false
|
125
116
|
requirements:
|
126
117
|
- - ! '>='
|
127
118
|
- !ruby/object:Gem::Version
|
128
119
|
version: '0'
|
129
120
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
130
|
-
none: false
|
131
121
|
requirements:
|
132
122
|
- - ! '>='
|
133
123
|
- !ruby/object:Gem::Version
|
134
124
|
version: '0'
|
135
125
|
requirements: []
|
136
126
|
rubyforge_project:
|
137
|
-
rubygems_version: 1.
|
127
|
+
rubygems_version: 2.1.10
|
138
128
|
signing_key:
|
139
|
-
specification_version:
|
129
|
+
specification_version: 4
|
140
130
|
summary: Clean up URLs.
|
141
131
|
test_files:
|
142
132
|
- spec/spec_helper.rb
|