wmap 2.7.0 → 2.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wmap/url_checker.rb +0 -104
- data/lib/wmap/utils/url_magic.rb +189 -214
- data/version.txt +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fbbbf1b71d804f4682d5827d07aa6079a27953b04643a7479db78b2a8c68f865
|
4
|
+
data.tar.gz: 471585e726794ca17d72ec92d8f9963d8c10433f5df2fb941b0ad930880f9bcb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 222487917ebee32bf3f1ebed8356fcd12b2c21fead45f6ade6e95e1cc714a9bff2a70bd7fea4ce70551fdc6589cc8b92b8625e45919b18a02ff972f2e43c65d4
|
7
|
+
data.tar.gz: a96bec97937ec3ff93e6f65081e54608f80dd78916a7fb600fe82f7b41e0458f66890c551eb9f2987dc35e4f09d75cdeeb4068e91ce01139b73f3f4cdf7c11e3
|
data/lib/wmap/url_checker.rb
CHANGED
@@ -119,110 +119,6 @@ class Wmap::UrlChecker
|
|
119
119
|
end
|
120
120
|
alias_method :checks, :url_workers
|
121
121
|
|
122
|
-
=begin
|
123
|
-
# Test the URL and return the response code
|
124
|
-
def response_code (url)
|
125
|
-
puts "Check the http response code on the url: #{url}" if @verbose
|
126
|
-
code = 10000 # All unknown url connection exceptions go here
|
127
|
-
raise "Invalid url: #{url}" unless is_url?(url)
|
128
|
-
url=url.strip.downcase
|
129
|
-
timeo = @http_timeout/1000.0
|
130
|
-
uri = URI.parse(url)
|
131
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
132
|
-
http.open_timeout = timeo
|
133
|
-
http.read_timeout = timeo
|
134
|
-
if (url =~ /https\:/i)
|
135
|
-
http.use_ssl = true
|
136
|
-
#http.ssl_version = :SSLv3
|
137
|
-
# Bypass the remote web server cert validation test
|
138
|
-
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
139
|
-
end
|
140
|
-
request = Net::HTTP::Get.new(uri.request_uri)
|
141
|
-
response = http.request(request)
|
142
|
-
puts "Server response the following: #{response}" if @verbose
|
143
|
-
code = response.code.to_i
|
144
|
-
#response.finish if response.started?()
|
145
|
-
@url_code[url]=code
|
146
|
-
puts "Response code on #{url}: #{code}" if @verbose
|
147
|
-
return code
|
148
|
-
rescue Exception => ee
|
149
|
-
puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
150
|
-
case ee
|
151
|
-
# rescue "Connection reset by peer" error type
|
152
|
-
when Errno::ECONNRESET
|
153
|
-
code=104
|
154
|
-
when Errno::ECONNABORTED,Errno::ETIMEDOUT
|
155
|
-
#code=10000
|
156
|
-
when Timeout::Error # Quick fix
|
157
|
-
if (url =~ /https\:/i) # try again for ssl timeout session, in case of default :TLSv1 failure
|
158
|
-
http.ssl_version = :SSLv3
|
159
|
-
response = http.request(request)
|
160
|
-
code = response.code.to_i
|
161
|
-
unless code.nil?
|
162
|
-
@ssl_version = http.ssl_version
|
163
|
-
end
|
164
|
-
end
|
165
|
-
else
|
166
|
-
#code=10000
|
167
|
-
end
|
168
|
-
@url_code[url]=code
|
169
|
-
return code
|
170
|
-
end
|
171
|
-
|
172
|
-
# Test the URL / site and return the redirection location (3xx response code only)
|
173
|
-
def redirect_location (url)
|
174
|
-
puts "Test the redirection location for the url: #{url}" if @verbose
|
175
|
-
location=""
|
176
|
-
raise "Invalid url: #{url}" unless is_url?(url)
|
177
|
-
url=url.strip.downcase
|
178
|
-
timeo = @http_timeout/1000.0
|
179
|
-
uri = URI.parse(url)
|
180
|
-
code = response_code (url)
|
181
|
-
if code >= 300 && code < 400
|
182
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
183
|
-
http.open_timeout = timeo
|
184
|
-
http.read_timeout = timeo
|
185
|
-
if (url =~ /https\:/i)
|
186
|
-
http.use_ssl = true
|
187
|
-
# Bypass the remote web server cert validation test
|
188
|
-
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
189
|
-
http.ssl_version = @ssl_version
|
190
|
-
end
|
191
|
-
request = Net::HTTP::Get.new(uri.request_uri)
|
192
|
-
response = http.request(request)
|
193
|
-
puts "Response: #{response}" if @verbose
|
194
|
-
case response
|
195
|
-
when Net::HTTPRedirection then
|
196
|
-
location = response['location']
|
197
|
-
end
|
198
|
-
end
|
199
|
-
@url_redirection[url]=location
|
200
|
-
return location
|
201
|
-
rescue Exception => ee
|
202
|
-
puts "Exception on method redirect_location for URL #{url}: #{ee}" if @verbose
|
203
|
-
return ""
|
204
|
-
end
|
205
|
-
alias_method :location, :redirect_location
|
206
|
-
|
207
|
-
# Test the URL / Site and return the landing url location (recursive with the depth = 4 )
|
208
|
-
def landing_location (depth=5, url)
|
209
|
-
depth -= 1
|
210
|
-
return url if depth < 1
|
211
|
-
timeo = @http_timeout/1000.0
|
212
|
-
uri = URI.parse(url)
|
213
|
-
code = response_code (url)
|
214
|
-
if code >= 300 && code < 400
|
215
|
-
url = redirect_location (url)
|
216
|
-
url = landing_location(depth,url)
|
217
|
-
else
|
218
|
-
return url
|
219
|
-
end
|
220
|
-
return url
|
221
|
-
rescue Exception => ee
|
222
|
-
puts "Exception on method #{__method__} on URL #{url}: #{ee}" if @verbose
|
223
|
-
end
|
224
|
-
=end
|
225
|
-
|
226
122
|
# Test the URL / site and return the web server type from the HTTP header "server" field
|
227
123
|
def get_server_header (url)
|
228
124
|
puts "Retrieve the server header field from the url: #{url}" if @verbose
|
data/lib/wmap/utils/url_magic.rb
CHANGED
@@ -14,176 +14,161 @@ module Wmap
|
|
14
14
|
extend self
|
15
15
|
|
16
16
|
# set hard stop limit of http time-out to 8 seconds, in order to avoid severe performance penalty for certain 'weird' site(s)
|
17
|
-
Max_http_timeout=
|
17
|
+
Max_http_timeout=15000
|
18
18
|
|
19
19
|
# Simple sanity check on a 'claimed' URL string.
|
20
20
|
def is_url?(url)
|
21
21
|
puts "Validate the URL format is valid: #{url}" if @verbose
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
return true
|
28
|
-
else
|
29
|
-
return false
|
30
|
-
end
|
22
|
+
if url =~ /(http|https)\:\/\/((.)+)/i
|
23
|
+
host=$2.split('/')[0]
|
24
|
+
host=host.split(':')[0]
|
25
|
+
if is_ip?(host) or is_fqdn?(host)
|
26
|
+
return true
|
31
27
|
else
|
32
|
-
puts "Unknown URL format: #{url}" if @verbose
|
33
28
|
return false
|
34
29
|
end
|
35
|
-
|
36
|
-
puts "
|
30
|
+
else
|
31
|
+
puts "Unknown URL format: #{url}" if @verbose
|
37
32
|
return false
|
38
33
|
end
|
34
|
+
rescue => ee
|
35
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
36
|
+
return false
|
39
37
|
end
|
40
38
|
|
41
39
|
# Simple sanity check on a 'claimed' SSL enabled URL string
|
42
40
|
def is_ssl?(url)
|
43
41
|
puts "Validate if SSL is enabled on: #{url}" if @verbose
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
else
|
49
|
-
return false
|
50
|
-
end
|
51
|
-
rescue => ee
|
52
|
-
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
42
|
+
url=url.strip
|
43
|
+
if is_url?(url) && url =~ /https/i
|
44
|
+
return true
|
45
|
+
else
|
53
46
|
return false
|
54
47
|
end
|
48
|
+
rescue => ee
|
49
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
50
|
+
return false
|
55
51
|
end
|
56
52
|
alias_method :is_https?, :is_ssl?
|
57
53
|
|
58
54
|
# Simple sanity check on a 'claimed' web site base string.
|
59
55
|
def is_site?(url)
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
if
|
64
|
-
|
65
|
-
return true
|
66
|
-
else
|
67
|
-
return false
|
68
|
-
end
|
56
|
+
puts "Validate the website string format for: #{url}" if @verbose
|
57
|
+
url=url.strip.downcase
|
58
|
+
if is_url?(url)
|
59
|
+
if url == url_2_site(url)
|
60
|
+
return true
|
69
61
|
else
|
70
|
-
puts "Unknown site format: #{url}" if @verbose
|
71
62
|
return false
|
72
63
|
end
|
73
|
-
|
74
|
-
puts "
|
75
|
-
return
|
64
|
+
else
|
65
|
+
puts "Unknown site format: #{url}" if @verbose
|
66
|
+
return false
|
76
67
|
end
|
68
|
+
rescue => ee
|
69
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
70
|
+
return nil
|
77
71
|
end
|
78
72
|
|
79
73
|
# Extract the web server host's Fully Qualified Domain Name (FQDN) from the url. For example: "https://login.yahoo.com/email/help" -> "login.yahoo.com"
|
80
74
|
def url_2_host (url)
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
puts "Error process url: #{url}"
|
86
|
-
return nil
|
87
|
-
else
|
88
|
-
record2 = record1[0].split(':')
|
89
|
-
return record2[0]
|
90
|
-
end
|
91
|
-
rescue => ee
|
92
|
-
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
75
|
+
url = url.strip.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
|
76
|
+
record1 = url.split('/')
|
77
|
+
if record1[0].nil?
|
78
|
+
puts "Error process url: #{url}"
|
93
79
|
return nil
|
80
|
+
else
|
81
|
+
record2 = record1[0].split(':')
|
82
|
+
return record2[0]
|
94
83
|
end
|
84
|
+
rescue => ee
|
85
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
86
|
+
return nil
|
95
87
|
end
|
96
88
|
|
97
89
|
# Extract web service port from the url. For example: "https://login.yahoo.com/email/help" -> 443
|
98
90
|
def url_2_port (url)
|
99
91
|
puts "Retrieve service port on URL: #{url}" if @verbose
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
return 80
|
114
|
-
end
|
115
|
-
rescue => ee
|
116
|
-
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
117
|
-
return nil
|
92
|
+
ssl = (url =~ /https/i)
|
93
|
+
url = url.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
|
94
|
+
record1 = url.split('/')
|
95
|
+
record2 = record1[0].split(':')
|
96
|
+
if (record2.length == 2)
|
97
|
+
puts "The service port: #{record2[1]}" if @verbose
|
98
|
+
return record2[1].to_i
|
99
|
+
elsif ssl
|
100
|
+
puts "The service port: 443" if @verbose
|
101
|
+
return 443
|
102
|
+
else
|
103
|
+
puts "The service port: 80" if @verbose
|
104
|
+
return 80
|
118
105
|
end
|
106
|
+
rescue => ee
|
107
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
108
|
+
return nil
|
119
109
|
end
|
120
110
|
|
121
111
|
# Extract site in (host:port) format from a url: "https://login.yahoo.com:8443/email/help" -> "http://login.yahoo.com:8443/"
|
122
112
|
def url_2_site (url)
|
123
113
|
puts "Retrieve the web site base for url: #{url}" if @verbose
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
#do nothing
|
152
|
-
end
|
153
|
-
end
|
154
|
-
# step 2, put the host:port pair back to the normal site format
|
155
|
-
prot="https:" if port==443
|
156
|
-
if port==80 || port==443
|
157
|
-
site=prot+"//"+host+"/"
|
158
|
-
else
|
159
|
-
site=prot+"//"+host+":"+port.to_s+"/"
|
160
|
-
end
|
161
|
-
if site=~ /http/i
|
162
|
-
#puts "Base found: #{site}" if @verbose
|
163
|
-
return site
|
164
|
-
else
|
165
|
-
raise "Problem encountered on method url_2_site: Unable to convert #{url}"
|
166
|
-
return nil
|
114
|
+
url = url.downcase
|
115
|
+
url = url.sub(/^(.*?)http/i,'http')
|
116
|
+
entry = url.split(%r{\/\/})
|
117
|
+
prot=entry[0]
|
118
|
+
# step 1, extract the host:port pair from the url
|
119
|
+
host_port=entry[1].split(%r{\/})[0]
|
120
|
+
if host_port =~ /\:/
|
121
|
+
host=host_port.split(%r{\:})[0]
|
122
|
+
port=host_port.split(%r{\:})[1].to_i
|
123
|
+
elsif prot =~ /https/i
|
124
|
+
host=host_port
|
125
|
+
port=443
|
126
|
+
elsif prot =~ /http/i
|
127
|
+
host=host_port
|
128
|
+
port=80
|
129
|
+
else
|
130
|
+
host=host_port
|
131
|
+
#raise "Unknown url format: #{url}"
|
132
|
+
end
|
133
|
+
# additional logic to handle uncommon url base structures
|
134
|
+
unless is_fqdn?(host)
|
135
|
+
case host
|
136
|
+
# "https://letmechoose.barclays.co.uk?source=btorganic/" => "https://letmechoose.barclays.co.uk"
|
137
|
+
when /\?|\#/
|
138
|
+
host=host.split(%r{\?|\#})[0]
|
139
|
+
else
|
140
|
+
#do nothing
|
167
141
|
end
|
168
|
-
|
169
|
-
|
142
|
+
end
|
143
|
+
# step 2, put the host:port pair back to the normal site format
|
144
|
+
prot="https:" if port==443
|
145
|
+
if port==80 || port==443
|
146
|
+
site=prot+"//"+host+"/"
|
147
|
+
else
|
148
|
+
site=prot+"//"+host+":"+port.to_s+"/"
|
149
|
+
end
|
150
|
+
if site=~ /http/i
|
151
|
+
#puts "Base found: #{site}" if @verbose
|
152
|
+
return site
|
153
|
+
else
|
154
|
+
raise "Problem encountered on method url_2_site: Unable to convert #{url}"
|
170
155
|
return nil
|
171
156
|
end
|
157
|
+
rescue => ee
|
158
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
159
|
+
return nil
|
172
160
|
end
|
173
161
|
|
174
162
|
# Wrapper to return relative path component of the URL. i.e. http://www.yahoo.com/login.html => /login.html
|
175
163
|
def url_2_path(url)
|
176
164
|
#puts "Retrieve the relative path component of the url: #{url}" if @verbose
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
185
|
-
end
|
186
|
-
|
165
|
+
url.strip!
|
166
|
+
base = url_2_site(url).chop
|
167
|
+
path=url.sub(base,'')
|
168
|
+
#puts "Path component found: #{path}" if @verbose
|
169
|
+
return path
|
170
|
+
rescue => ee
|
171
|
+
puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
187
172
|
end
|
188
173
|
|
189
174
|
# Test if the two URLs are both under the same domain: http://login.yahoo.com, http://mail.yahoo.com => true
|
@@ -200,121 +185,111 @@ module Wmap
|
|
200
185
|
# Input is host and open port, output is a URL for valid http response code or nil
|
201
186
|
def host_2_url (host,port=80)
|
202
187
|
puts "Perform simple http(s) service detection on host #{host}, port #{port}" if @verbose
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
puts "No http(s) service found on: #{host}:#{port}" if @verbose
|
223
|
-
return nil
|
224
|
-
end
|
225
|
-
rescue => ee
|
226
|
-
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
188
|
+
host=host.strip
|
189
|
+
if port.to_i == 80
|
190
|
+
url_1 = "http://" + host + "/"
|
191
|
+
elsif port.to_i ==443
|
192
|
+
url_1 = "https://" + host + "/"
|
193
|
+
else
|
194
|
+
url_1 = "http://" + host + ":" + port.to_s + "/"
|
195
|
+
url_2 = "https://" + host + ":" + port.to_s + "/"
|
196
|
+
end
|
197
|
+
puts "Please ensure your internet connection is active before running this method: #{__method__}" if @verbose
|
198
|
+
checker=Wmap::UrlChecker.new
|
199
|
+
if checker.response_code(url_1) != 10000
|
200
|
+
puts "Found URL: #{url_1}" if @verbose
|
201
|
+
return url_1
|
202
|
+
elsif checker.response_code(url_2) != 10000
|
203
|
+
puts "Found URL: #{url_2}" if @verbose
|
204
|
+
return url_2
|
205
|
+
else
|
206
|
+
puts "No http(s) service found on: #{host}:#{port}" if @verbose
|
227
207
|
return nil
|
228
208
|
end
|
209
|
+
rescue => ee
|
210
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
211
|
+
return nil
|
229
212
|
end
|
230
213
|
|
231
214
|
# Convert a relative URL to an absolute one. For example, from URL base 'http://games.yahoo.com/' and file path '/game/the-magic-snowman-flash.html' => 'http://games.yahoo.com/game/the-magic-snowman-flash.html'
|
232
215
|
def make_absolute(base, relative_url)
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
end
|
247
|
-
end
|
216
|
+
puts "Determine and return the absolute URL:\n Base: #{base}, Relative: #{relative_url} " if @verbose
|
217
|
+
absolute_url = nil;
|
218
|
+
if relative_url =~ /^\//
|
219
|
+
absolute_url = create_absolute_url_from_base(base, relative_url)
|
220
|
+
else
|
221
|
+
absolute_url = create_absolute_url_from_context(base, relative_url)
|
222
|
+
end
|
223
|
+
puts "Found absolute URL: #{absolute_url}" if @verbose
|
224
|
+
return absolute_url
|
225
|
+
rescue => ee
|
226
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
227
|
+
return nil
|
228
|
+
end
|
248
229
|
|
249
230
|
# Create / construct the absolute URL from a known URL and relative file path. For example, 'http://images.search.yahoo.com/images' + '/search/images?p=raiders' => 'http://images.search.yahoo.com/search/images?p=raiders'
|
250
231
|
def create_absolute_url_from_base(potential_base, relative_url)
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
end
|
260
|
-
end
|
232
|
+
#puts "Determine the absolute URL from potential base #{potential_base} and relative URL #{relative_url}" if @verbose
|
233
|
+
naked_base = url_2_site(potential_base).strip.chop
|
234
|
+
puts "Found absolute URL: #{naked_base+relative_url}" if @verbose
|
235
|
+
return naked_base + relative_url
|
236
|
+
rescue => ee
|
237
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
238
|
+
return nil
|
239
|
+
end
|
261
240
|
|
262
241
|
# Construct the absolute URL by comparing a known URL and the relative file path
|
263
242
|
def create_absolute_url_from_context(potential_base, relative_url)
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
243
|
+
puts "Determine the absolute URL from context:\n Known base: #{potential_base}, Relative path: #{relative_url}" if @verbose
|
244
|
+
absolute_url = nil
|
245
|
+
# make relative URL naked by removing the beginning '/'
|
246
|
+
relative_url.sub!(/^\//,'')
|
247
|
+
if potential_base =~ /\/$/
|
248
|
+
absolute_url = potential_base+relative_url.strip
|
249
|
+
else
|
250
|
+
last_index_of_slash = potential_base.rindex('/')
|
251
|
+
if potential_base[last_index_of_slash-2, 2] == ':/'
|
252
|
+
absolute_url = potential_base+relative_url
|
271
253
|
else
|
272
|
-
|
273
|
-
if
|
274
|
-
absolute_url = potential_base+relative_url
|
254
|
+
last_index_of_dot = potential_base.rindex('.')
|
255
|
+
if last_index_of_dot < last_index_of_slash
|
256
|
+
absolute_url = potential_base.strip.chop+relative_url
|
275
257
|
else
|
276
|
-
|
277
|
-
if last_index_of_dot < last_index_of_slash
|
278
|
-
absolute_url = potential_base.strip.chop+relative_url
|
279
|
-
else
|
280
|
-
absolute_url = potential_base[0, last_index_of_slash+1] + relative_url
|
281
|
-
end
|
258
|
+
absolute_url = potential_base[0, last_index_of_slash+1] + relative_url
|
282
259
|
end
|
283
260
|
end
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
261
|
+
end
|
262
|
+
puts "Found absolute URL: #{absolute_url}" if @verbose
|
263
|
+
return absolute_url
|
264
|
+
rescue => ee
|
265
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
266
|
+
return nil
|
267
|
+
end
|
291
268
|
|
292
269
|
# Normalize the URL to a consistent manner in order to determine if a link has been visited or cached before
|
293
270
|
# See http://en.wikipedia.org/wiki/URL_normalization for more explanation
|
294
271
|
def normalize_url(url)
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
return base+path
|
313
|
-
end
|
314
|
-
rescue => ee
|
315
|
-
puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
316
|
-
return url
|
272
|
+
url.strip!
|
273
|
+
# Converting the scheme and host to lower case in the process, i.e. 'HTTP://www.Example.com/' => 'http://www.example.com/'
|
274
|
+
# Normalize the base
|
275
|
+
base=url_2_site(url)
|
276
|
+
# Case#1, remove the trailing dot after the hostname, i.e, 'http://www.yahoo.com./' => 'http://www.yahoo.com/'
|
277
|
+
base=base.sub(/\.\/$/,'/')
|
278
|
+
# Normalize the relative path, case#1
|
279
|
+
# retrieve the file path and remove the first '/' or '.',
|
280
|
+
# i.e. 'http://www.example.com/mypath' or 'http://www.example.com/./mypath' => 'mypath'
|
281
|
+
path=url_2_path(url).sub(/^(\/|\.)*/,'')
|
282
|
+
# Normalize the relative path, case#2
|
283
|
+
# Replace dot-segments. "/../" and "/./" with "/", i.e. 'http://www.example.com/../a/b/../c/./d.html" => 'http://www.example.com/a/c/d.html'
|
284
|
+
path=path.gsub(/\/\.{1,2}\//,'/')
|
285
|
+
if path.nil?
|
286
|
+
return base
|
287
|
+
else
|
288
|
+
return base+path
|
317
289
|
end
|
290
|
+
rescue => ee
|
291
|
+
puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
292
|
+
return url
|
318
293
|
end
|
319
294
|
|
320
295
|
|
data/version.txt
CHANGED