wmap 2.7.0 → 2.7.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/wmap/url_checker.rb +0 -104
- data/lib/wmap/utils/url_magic.rb +189 -214
- data/version.txt +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fbbbf1b71d804f4682d5827d07aa6079a27953b04643a7479db78b2a8c68f865
|
4
|
+
data.tar.gz: 471585e726794ca17d72ec92d8f9963d8c10433f5df2fb941b0ad930880f9bcb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 222487917ebee32bf3f1ebed8356fcd12b2c21fead45f6ade6e95e1cc714a9bff2a70bd7fea4ce70551fdc6589cc8b92b8625e45919b18a02ff972f2e43c65d4
|
7
|
+
data.tar.gz: a96bec97937ec3ff93e6f65081e54608f80dd78916a7fb600fe82f7b41e0458f66890c551eb9f2987dc35e4f09d75cdeeb4068e91ce01139b73f3f4cdf7c11e3
|
data/lib/wmap/url_checker.rb
CHANGED
@@ -119,110 +119,6 @@ class Wmap::UrlChecker
|
|
119
119
|
end
|
120
120
|
alias_method :checks, :url_workers
|
121
121
|
|
122
|
-
=begin
|
123
|
-
# Test the URL and return the response code
|
124
|
-
def response_code (url)
|
125
|
-
puts "Check the http response code on the url: #{url}" if @verbose
|
126
|
-
code = 10000 # All unknown url connection exceptions go here
|
127
|
-
raise "Invalid url: #{url}" unless is_url?(url)
|
128
|
-
url=url.strip.downcase
|
129
|
-
timeo = @http_timeout/1000.0
|
130
|
-
uri = URI.parse(url)
|
131
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
132
|
-
http.open_timeout = timeo
|
133
|
-
http.read_timeout = timeo
|
134
|
-
if (url =~ /https\:/i)
|
135
|
-
http.use_ssl = true
|
136
|
-
#http.ssl_version = :SSLv3
|
137
|
-
# Bypass the remote web server cert validation test
|
138
|
-
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
139
|
-
end
|
140
|
-
request = Net::HTTP::Get.new(uri.request_uri)
|
141
|
-
response = http.request(request)
|
142
|
-
puts "Server response the following: #{response}" if @verbose
|
143
|
-
code = response.code.to_i
|
144
|
-
#response.finish if response.started?()
|
145
|
-
@url_code[url]=code
|
146
|
-
puts "Response code on #{url}: #{code}" if @verbose
|
147
|
-
return code
|
148
|
-
rescue Exception => ee
|
149
|
-
puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
150
|
-
case ee
|
151
|
-
# rescue "Connection reset by peer" error type
|
152
|
-
when Errno::ECONNRESET
|
153
|
-
code=104
|
154
|
-
when Errno::ECONNABORTED,Errno::ETIMEDOUT
|
155
|
-
#code=10000
|
156
|
-
when Timeout::Error # Quick fix
|
157
|
-
if (url =~ /https\:/i) # try again for ssl timeout session, in case of default :TLSv1 failure
|
158
|
-
http.ssl_version = :SSLv3
|
159
|
-
response = http.request(request)
|
160
|
-
code = response.code.to_i
|
161
|
-
unless code.nil?
|
162
|
-
@ssl_version = http.ssl_version
|
163
|
-
end
|
164
|
-
end
|
165
|
-
else
|
166
|
-
#code=10000
|
167
|
-
end
|
168
|
-
@url_code[url]=code
|
169
|
-
return code
|
170
|
-
end
|
171
|
-
|
172
|
-
# Test the URL / site and return the redirection location (3xx response code only)
|
173
|
-
def redirect_location (url)
|
174
|
-
puts "Test the redirection location for the url: #{url}" if @verbose
|
175
|
-
location=""
|
176
|
-
raise "Invalid url: #{url}" unless is_url?(url)
|
177
|
-
url=url.strip.downcase
|
178
|
-
timeo = @http_timeout/1000.0
|
179
|
-
uri = URI.parse(url)
|
180
|
-
code = response_code (url)
|
181
|
-
if code >= 300 && code < 400
|
182
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
183
|
-
http.open_timeout = timeo
|
184
|
-
http.read_timeout = timeo
|
185
|
-
if (url =~ /https\:/i)
|
186
|
-
http.use_ssl = true
|
187
|
-
# Bypass the remote web server cert validation test
|
188
|
-
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
189
|
-
http.ssl_version = @ssl_version
|
190
|
-
end
|
191
|
-
request = Net::HTTP::Get.new(uri.request_uri)
|
192
|
-
response = http.request(request)
|
193
|
-
puts "Response: #{response}" if @verbose
|
194
|
-
case response
|
195
|
-
when Net::HTTPRedirection then
|
196
|
-
location = response['location']
|
197
|
-
end
|
198
|
-
end
|
199
|
-
@url_redirection[url]=location
|
200
|
-
return location
|
201
|
-
rescue Exception => ee
|
202
|
-
puts "Exception on method redirect_location for URL #{url}: #{ee}" if @verbose
|
203
|
-
return ""
|
204
|
-
end
|
205
|
-
alias_method :location, :redirect_location
|
206
|
-
|
207
|
-
# Test the URL / Site and return the landing url location (recursive with the depth = 4 )
|
208
|
-
def landing_location (depth=5, url)
|
209
|
-
depth -= 1
|
210
|
-
return url if depth < 1
|
211
|
-
timeo = @http_timeout/1000.0
|
212
|
-
uri = URI.parse(url)
|
213
|
-
code = response_code (url)
|
214
|
-
if code >= 300 && code < 400
|
215
|
-
url = redirect_location (url)
|
216
|
-
url = landing_location(depth,url)
|
217
|
-
else
|
218
|
-
return url
|
219
|
-
end
|
220
|
-
return url
|
221
|
-
rescue Exception => ee
|
222
|
-
puts "Exception on method #{__method__} on URL #{url}: #{ee}" if @verbose
|
223
|
-
end
|
224
|
-
=end
|
225
|
-
|
226
122
|
# Test the URL / site and return the web server type from the HTTP header "server" field
|
227
123
|
def get_server_header (url)
|
228
124
|
puts "Retrieve the server header field from the url: #{url}" if @verbose
|
data/lib/wmap/utils/url_magic.rb
CHANGED
@@ -14,176 +14,161 @@ module Wmap
|
|
14
14
|
extend self
|
15
15
|
|
16
16
|
# set hard stop limit of http time-out to 8 seconds, in order to avoid severe performance penalty for certain 'weird' site(s)
|
17
|
-
Max_http_timeout=
|
17
|
+
Max_http_timeout=15000
|
18
18
|
|
19
19
|
# Simple sanity check on a 'claimed' URL string.
|
20
20
|
def is_url?(url)
|
21
21
|
puts "Validate the URL format is valid: #{url}" if @verbose
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
return true
|
28
|
-
else
|
29
|
-
return false
|
30
|
-
end
|
22
|
+
if url =~ /(http|https)\:\/\/((.)+)/i
|
23
|
+
host=$2.split('/')[0]
|
24
|
+
host=host.split(':')[0]
|
25
|
+
if is_ip?(host) or is_fqdn?(host)
|
26
|
+
return true
|
31
27
|
else
|
32
|
-
puts "Unknown URL format: #{url}" if @verbose
|
33
28
|
return false
|
34
29
|
end
|
35
|
-
|
36
|
-
puts "
|
30
|
+
else
|
31
|
+
puts "Unknown URL format: #{url}" if @verbose
|
37
32
|
return false
|
38
33
|
end
|
34
|
+
rescue => ee
|
35
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
36
|
+
return false
|
39
37
|
end
|
40
38
|
|
41
39
|
# Simple sanity check on a 'claimed' SSL enabled URL string
|
42
40
|
def is_ssl?(url)
|
43
41
|
puts "Validate if SSL is enabled on: #{url}" if @verbose
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
else
|
49
|
-
return false
|
50
|
-
end
|
51
|
-
rescue => ee
|
52
|
-
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
42
|
+
url=url.strip
|
43
|
+
if is_url?(url) && url =~ /https/i
|
44
|
+
return true
|
45
|
+
else
|
53
46
|
return false
|
54
47
|
end
|
48
|
+
rescue => ee
|
49
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
50
|
+
return false
|
55
51
|
end
|
56
52
|
alias_method :is_https?, :is_ssl?
|
57
53
|
|
58
54
|
# Simple sanity check on a 'claimed' web site base string.
|
59
55
|
def is_site?(url)
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
if
|
64
|
-
|
65
|
-
return true
|
66
|
-
else
|
67
|
-
return false
|
68
|
-
end
|
56
|
+
puts "Validate the website string format for: #{url}" if @verbose
|
57
|
+
url=url.strip.downcase
|
58
|
+
if is_url?(url)
|
59
|
+
if url == url_2_site(url)
|
60
|
+
return true
|
69
61
|
else
|
70
|
-
puts "Unknown site format: #{url}" if @verbose
|
71
62
|
return false
|
72
63
|
end
|
73
|
-
|
74
|
-
puts "
|
75
|
-
return
|
64
|
+
else
|
65
|
+
puts "Unknown site format: #{url}" if @verbose
|
66
|
+
return false
|
76
67
|
end
|
68
|
+
rescue => ee
|
69
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
70
|
+
return nil
|
77
71
|
end
|
78
72
|
|
79
73
|
# Extract the web server host's Fully Qualified Domain Name (FQDN) from the url. For example: "https://login.yahoo.com/email/help" -> "login.yahoo.com"
|
80
74
|
def url_2_host (url)
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
puts "Error process url: #{url}"
|
86
|
-
return nil
|
87
|
-
else
|
88
|
-
record2 = record1[0].split(':')
|
89
|
-
return record2[0]
|
90
|
-
end
|
91
|
-
rescue => ee
|
92
|
-
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
75
|
+
url = url.strip.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
|
76
|
+
record1 = url.split('/')
|
77
|
+
if record1[0].nil?
|
78
|
+
puts "Error process url: #{url}"
|
93
79
|
return nil
|
80
|
+
else
|
81
|
+
record2 = record1[0].split(':')
|
82
|
+
return record2[0]
|
94
83
|
end
|
84
|
+
rescue => ee
|
85
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
86
|
+
return nil
|
95
87
|
end
|
96
88
|
|
97
89
|
# Extract web service port from the url. For example: "https://login.yahoo.com/email/help" -> 443
|
98
90
|
def url_2_port (url)
|
99
91
|
puts "Retrieve service port on URL: #{url}" if @verbose
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
return 80
|
114
|
-
end
|
115
|
-
rescue => ee
|
116
|
-
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
117
|
-
return nil
|
92
|
+
ssl = (url =~ /https/i)
|
93
|
+
url = url.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
|
94
|
+
record1 = url.split('/')
|
95
|
+
record2 = record1[0].split(':')
|
96
|
+
if (record2.length == 2)
|
97
|
+
puts "The service port: #{record2[1]}" if @verbose
|
98
|
+
return record2[1].to_i
|
99
|
+
elsif ssl
|
100
|
+
puts "The service port: 443" if @verbose
|
101
|
+
return 443
|
102
|
+
else
|
103
|
+
puts "The service port: 80" if @verbose
|
104
|
+
return 80
|
118
105
|
end
|
106
|
+
rescue => ee
|
107
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
108
|
+
return nil
|
119
109
|
end
|
120
110
|
|
121
111
|
# Extract site in (host:port) format from a url: "https://login.yahoo.com:8443/email/help" -> "http://login.yahoo.com:8443/"
|
122
112
|
def url_2_site (url)
|
123
113
|
puts "Retrieve the web site base for url: #{url}" if @verbose
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
#do nothing
|
152
|
-
end
|
153
|
-
end
|
154
|
-
# step 2, put the host:port pair back to the normal site format
|
155
|
-
prot="https:" if port==443
|
156
|
-
if port==80 || port==443
|
157
|
-
site=prot+"//"+host+"/"
|
158
|
-
else
|
159
|
-
site=prot+"//"+host+":"+port.to_s+"/"
|
160
|
-
end
|
161
|
-
if site=~ /http/i
|
162
|
-
#puts "Base found: #{site}" if @verbose
|
163
|
-
return site
|
164
|
-
else
|
165
|
-
raise "Problem encountered on method url_2_site: Unable to convert #{url}"
|
166
|
-
return nil
|
114
|
+
url = url.downcase
|
115
|
+
url = url.sub(/^(.*?)http/i,'http')
|
116
|
+
entry = url.split(%r{\/\/})
|
117
|
+
prot=entry[0]
|
118
|
+
# step 1, extract the host:port pair from the url
|
119
|
+
host_port=entry[1].split(%r{\/})[0]
|
120
|
+
if host_port =~ /\:/
|
121
|
+
host=host_port.split(%r{\:})[0]
|
122
|
+
port=host_port.split(%r{\:})[1].to_i
|
123
|
+
elsif prot =~ /https/i
|
124
|
+
host=host_port
|
125
|
+
port=443
|
126
|
+
elsif prot =~ /http/i
|
127
|
+
host=host_port
|
128
|
+
port=80
|
129
|
+
else
|
130
|
+
host=host_port
|
131
|
+
#raise "Unknown url format: #{url}"
|
132
|
+
end
|
133
|
+
# additional logic to handle uncommon url base structures
|
134
|
+
unless is_fqdn?(host)
|
135
|
+
case host
|
136
|
+
# "https://letmechoose.barclays.co.uk?source=btorganic/" => "https://letmechoose.barclays.co.uk"
|
137
|
+
when /\?|\#/
|
138
|
+
host=host.split(%r{\?|\#})[0]
|
139
|
+
else
|
140
|
+
#do nothing
|
167
141
|
end
|
168
|
-
|
169
|
-
|
142
|
+
end
|
143
|
+
# step 2, put the host:port pair back to the normal site format
|
144
|
+
prot="https:" if port==443
|
145
|
+
if port==80 || port==443
|
146
|
+
site=prot+"//"+host+"/"
|
147
|
+
else
|
148
|
+
site=prot+"//"+host+":"+port.to_s+"/"
|
149
|
+
end
|
150
|
+
if site=~ /http/i
|
151
|
+
#puts "Base found: #{site}" if @verbose
|
152
|
+
return site
|
153
|
+
else
|
154
|
+
raise "Problem encountered on method url_2_site: Unable to convert #{url}"
|
170
155
|
return nil
|
171
156
|
end
|
157
|
+
rescue => ee
|
158
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
159
|
+
return nil
|
172
160
|
end
|
173
161
|
|
174
162
|
# Wrapper to return relative path component of the URL. i.e. http://www.yahoo.com/login.html => /login.html
|
175
163
|
def url_2_path(url)
|
176
164
|
#puts "Retrieve the relative path component of the url: #{url}" if @verbose
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
185
|
-
end
|
186
|
-
|
165
|
+
url.strip!
|
166
|
+
base = url_2_site(url).chop
|
167
|
+
path=url.sub(base,'')
|
168
|
+
#puts "Path component found: #{path}" if @verbose
|
169
|
+
return path
|
170
|
+
rescue => ee
|
171
|
+
puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
187
172
|
end
|
188
173
|
|
189
174
|
# Test if the two URLs are both under the same domain: http://login.yahoo.com, http://mail.yahoo.com => true
|
@@ -200,121 +185,111 @@ module Wmap
|
|
200
185
|
# Input is host and open port, output is a URL for valid http response code or nil
|
201
186
|
def host_2_url (host,port=80)
|
202
187
|
puts "Perform simple http(s) service detection on host #{host}, port #{port}" if @verbose
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
puts "No http(s) service found on: #{host}:#{port}" if @verbose
|
223
|
-
return nil
|
224
|
-
end
|
225
|
-
rescue => ee
|
226
|
-
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
188
|
+
host=host.strip
|
189
|
+
if port.to_i == 80
|
190
|
+
url_1 = "http://" + host + "/"
|
191
|
+
elsif port.to_i ==443
|
192
|
+
url_1 = "https://" + host + "/"
|
193
|
+
else
|
194
|
+
url_1 = "http://" + host + ":" + port.to_s + "/"
|
195
|
+
url_2 = "https://" + host + ":" + port.to_s + "/"
|
196
|
+
end
|
197
|
+
puts "Please ensure your internet connection is active before running this method: #{__method__}" if @verbose
|
198
|
+
checker=Wmap::UrlChecker.new
|
199
|
+
if checker.response_code(url_1) != 10000
|
200
|
+
puts "Found URL: #{url_1}" if @verbose
|
201
|
+
return url_1
|
202
|
+
elsif checker.response_code(url_2) != 10000
|
203
|
+
puts "Found URL: #{url_2}" if @verbose
|
204
|
+
return url_2
|
205
|
+
else
|
206
|
+
puts "No http(s) service found on: #{host}:#{port}" if @verbose
|
227
207
|
return nil
|
228
208
|
end
|
209
|
+
rescue => ee
|
210
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
211
|
+
return nil
|
229
212
|
end
|
230
213
|
|
231
214
|
# Convert a relative URL to an absolute one. For example, from URL base 'http://games.yahoo.com/' and file path '/game/the-magic-snowman-flash.html' => 'http://games.yahoo.com/game/the-magic-snowman-flash.html'
|
232
215
|
def make_absolute(base, relative_url)
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
end
|
247
|
-
end
|
216
|
+
puts "Determine and return the absolute URL:\n Base: #{base}, Relative: #{relative_url} " if @verbose
|
217
|
+
absolute_url = nil;
|
218
|
+
if relative_url =~ /^\//
|
219
|
+
absolute_url = create_absolute_url_from_base(base, relative_url)
|
220
|
+
else
|
221
|
+
absolute_url = create_absolute_url_from_context(base, relative_url)
|
222
|
+
end
|
223
|
+
puts "Found absolute URL: #{absolute_url}" if @verbose
|
224
|
+
return absolute_url
|
225
|
+
rescue => ee
|
226
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
227
|
+
return nil
|
228
|
+
end
|
248
229
|
|
249
230
|
# Create / construct the absolute URL from a known URL and relative file path. For example, 'http://images.search.yahoo.com/images' + '/search/images?p=raiders' => 'http://images.search.yahoo.com/search/images?p=raiders'
|
250
231
|
def create_absolute_url_from_base(potential_base, relative_url)
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
end
|
260
|
-
end
|
232
|
+
#puts "Determine the absolute URL from potential base #{potential_base} and relative URL #{relative_url}" if @verbose
|
233
|
+
naked_base = url_2_site(potential_base).strip.chop
|
234
|
+
puts "Found absolute URL: #{naked_base+relative_url}" if @verbose
|
235
|
+
return naked_base + relative_url
|
236
|
+
rescue => ee
|
237
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
238
|
+
return nil
|
239
|
+
end
|
261
240
|
|
262
241
|
# Construct the absolute URL by comparing a known URL and the relative file path
|
263
242
|
def create_absolute_url_from_context(potential_base, relative_url)
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
243
|
+
puts "Determine the absolute URL from context:\n Known base: #{potential_base}, Relative path: #{relative_url}" if @verbose
|
244
|
+
absolute_url = nil
|
245
|
+
# make relative URL naked by removing the beginning '/'
|
246
|
+
relative_url.sub!(/^\//,'')
|
247
|
+
if potential_base =~ /\/$/
|
248
|
+
absolute_url = potential_base+relative_url.strip
|
249
|
+
else
|
250
|
+
last_index_of_slash = potential_base.rindex('/')
|
251
|
+
if potential_base[last_index_of_slash-2, 2] == ':/'
|
252
|
+
absolute_url = potential_base+relative_url
|
271
253
|
else
|
272
|
-
|
273
|
-
if
|
274
|
-
absolute_url = potential_base+relative_url
|
254
|
+
last_index_of_dot = potential_base.rindex('.')
|
255
|
+
if last_index_of_dot < last_index_of_slash
|
256
|
+
absolute_url = potential_base.strip.chop+relative_url
|
275
257
|
else
|
276
|
-
|
277
|
-
if last_index_of_dot < last_index_of_slash
|
278
|
-
absolute_url = potential_base.strip.chop+relative_url
|
279
|
-
else
|
280
|
-
absolute_url = potential_base[0, last_index_of_slash+1] + relative_url
|
281
|
-
end
|
258
|
+
absolute_url = potential_base[0, last_index_of_slash+1] + relative_url
|
282
259
|
end
|
283
260
|
end
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
261
|
+
end
|
262
|
+
puts "Found absolute URL: #{absolute_url}" if @verbose
|
263
|
+
return absolute_url
|
264
|
+
rescue => ee
|
265
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
266
|
+
return nil
|
267
|
+
end
|
291
268
|
|
292
269
|
# Normalize the URL to a consistent manner in order to determine if a link has been visited or cached before
|
293
270
|
# See http://en.wikipedia.org/wiki/URL_normalization for more explanation
|
294
271
|
def normalize_url(url)
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
return base+path
|
313
|
-
end
|
314
|
-
rescue => ee
|
315
|
-
puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
316
|
-
return url
|
272
|
+
url.strip!
|
273
|
+
# Converting the scheme and host to lower case in the process, i.e. 'HTTP://www.Example.com/' => 'http://www.example.com/'
|
274
|
+
# Normalize the base
|
275
|
+
base=url_2_site(url)
|
276
|
+
# Case#1, remove the trailing dot after the hostname, i.e, 'http://www.yahoo.com./' => 'http://www.yahoo.com/'
|
277
|
+
base=base.sub(/\.\/$/,'/')
|
278
|
+
# Normalize the relative path, case#1
|
279
|
+
# retrieve the file path and remove the first '/' or '.',
|
280
|
+
# i.e. 'http://www.example.com/mypath' or 'http://www.example.com/./mypath' => 'mypath'
|
281
|
+
path=url_2_path(url).sub(/^(\/|\.)*/,'')
|
282
|
+
# Normalize the relative path, case#2
|
283
|
+
# Replace dot-segments. "/../" and "/./" with "/", i.e. 'http://www.example.com/../a/b/../c/./d.html" => 'http://www.example.com/a/c/d.html'
|
284
|
+
path=path.gsub(/\/\.{1,2}\//,'/')
|
285
|
+
if path.nil?
|
286
|
+
return base
|
287
|
+
else
|
288
|
+
return base+path
|
317
289
|
end
|
290
|
+
rescue => ee
|
291
|
+
puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
292
|
+
return url
|
318
293
|
end
|
319
294
|
|
320
295
|
|
data/version.txt
CHANGED