wmap 2.4.4
- checksums.yaml +7 -0
- data/CHANGELOG.md +141 -0
- data/LICENSE.txt +15 -0
- data/README.rdoc +98 -0
- data/TODO +13 -0
- data/bin/deprime +21 -0
- data/bin/distrust +38 -0
- data/bin/googleBot +23 -0
- data/bin/prime +21 -0
- data/bin/refresh +26 -0
- data/bin/run_tests +16 -0
- data/bin/spiderBot +26 -0
- data/bin/trust +38 -0
- data/bin/updateAll +57 -0
- data/bin/wadd +25 -0
- data/bin/wadds +26 -0
- data/bin/wcheck +28 -0
- data/bin/wdel +25 -0
- data/bin/wdump +21 -0
- data/bin/wmap +151 -0
- data/bin/wscan +32 -0
- data/data/cidrs +2 -0
- data/data/deactivated_sites +1 -0
- data/data/domains +2 -0
- data/data/hosts +1 -0
- data/data/prime_hosts +1 -0
- data/data/sites +2 -0
- data/data/sub_domains +2 -0
- data/demos/bruter.rb +27 -0
- data/demos/dns_brutes.rb +28 -0
- data/demos/filter_cidr.rb +18 -0
- data/demos/filter_crawls.rb +5 -0
- data/demos/filter_domain.rb +25 -0
- data/demos/filter_geoip.rb +26 -0
- data/demos/filter_known_services.rb +59 -0
- data/demos/filter_netinfo.rb +23 -0
- data/demos/filter_prime.rb +25 -0
- data/demos/filter_profiler.rb +3 -0
- data/demos/filter_redirection.rb +19 -0
- data/demos/filter_site.rb +40 -0
- data/demos/filter_siteip.rb +31 -0
- data/demos/filter_status.rb +17 -0
- data/demos/filter_timestamp.rb +23 -0
- data/demos/filter_url.rb +19 -0
- data/demos/new_fnd.rb +66 -0
- data/demos/nmap_parser.pl +138 -0
- data/demos/site_format.rb +18 -0
- data/demos/whois_domain.rb +78 -0
- data/dicts/GeoIP.dat +0 -0
- data/dicts/GeoIPASNum.dat +0 -0
- data/dicts/GeoLiteCity.dat +0 -0
- data/dicts/ccsld.txt +2646 -0
- data/dicts/cctld.txt +243 -0
- data/dicts/gtld.txt +25 -0
- data/dicts/hostnames-dict.big +1402 -0
- data/dicts/hostnames-dict.txt +101 -0
- data/lib/wmap/cidr_tracker.rb +327 -0
- data/lib/wmap/dns_bruter.rb +308 -0
- data/lib/wmap/domain_tracker/sub_domain.rb +142 -0
- data/lib/wmap/domain_tracker.rb +342 -0
- data/lib/wmap/geoip_tracker.rb +72 -0
- data/lib/wmap/google_search_scraper.rb +177 -0
- data/lib/wmap/host_tracker/primary_host.rb +130 -0
- data/lib/wmap/host_tracker.rb +550 -0
- data/lib/wmap/network_profiler.rb +144 -0
- data/lib/wmap/port_scanner.rb +208 -0
- data/lib/wmap/site_tracker/deactivated_site.rb +85 -0
- data/lib/wmap/site_tracker.rb +937 -0
- data/lib/wmap/url_checker.rb +314 -0
- data/lib/wmap/url_crawler.rb +381 -0
- data/lib/wmap/utils/domain_root.rb +184 -0
- data/lib/wmap/utils/logger.rb +53 -0
- data/lib/wmap/utils/url_magic.rb +343 -0
- data/lib/wmap/utils/utils.rb +333 -0
- data/lib/wmap/whois.rb +76 -0
- data/lib/wmap.rb +227 -0
- data/logs/wmap.log +17 -0
- data/ruby_whois_patches/base_cocca2.rb +149 -0
- data/ruby_whois_patches/kero.yachay.pe.rb +120 -0
- data/ruby_whois_patches/whois.PublicDomainRegistry.com.rb +124 -0
- data/ruby_whois_patches/whois.above.com.rb +61 -0
- data/ruby_whois_patches/whois.adamsnames.tc.rb +107 -0
- data/ruby_whois_patches/whois.aeda.net.ae.rb +105 -0
- data/ruby_whois_patches/whois.ai.rb +112 -0
- data/ruby_whois_patches/whois.arnes.si.rb +121 -0
- data/ruby_whois_patches/whois.ascio.com.rb +91 -0
- data/ruby_whois_patches/whois.cnnic.cn.rb +123 -0
- data/ruby_whois_patches/whois.corporatedomains.com.rb +67 -0
- data/ruby_whois_patches/whois.crsnic.net.rb +108 -0
- data/ruby_whois_patches/whois.denic.de.rb +174 -0
- data/ruby_whois_patches/whois.dk-hostmaster.dk.rb +120 -0
- data/ruby_whois_patches/whois.dns.be.rb +134 -0
- data/ruby_whois_patches/whois.dns.lu.rb +129 -0
- data/ruby_whois_patches/whois.dns.pl.rb +150 -0
- data/ruby_whois_patches/whois.dns.pt.rb +119 -0
- data/ruby_whois_patches/whois.domain.kg.rb +126 -0
- data/ruby_whois_patches/whois.domainregistry.my.rb +123 -0
- data/ruby_whois_patches/whois.domreg.lt.rb +110 -0
- data/ruby_whois_patches/whois.dot.tk.rb +140 -0
- data/ruby_whois_patches/whois.hkirc.hk.rb +121 -0
- data/ruby_whois_patches/whois.isnic.is.rb +130 -0
- data/ruby_whois_patches/whois.je.rb +119 -0
- data/ruby_whois_patches/whois.jprs.jp.rb +137 -0
- data/ruby_whois_patches/whois.kenic.or.ke.rb +140 -0
- data/ruby_whois_patches/whois.markmonitor.com.rb +118 -0
- data/ruby_whois_patches/whois.melbourneit.com.rb +58 -0
- data/ruby_whois_patches/whois.nic.as.rb +96 -0
- data/ruby_whois_patches/whois.nic.at.rb +109 -0
- data/ruby_whois_patches/whois.nic.ch.rb +141 -0
- data/ruby_whois_patches/whois.nic.cl.rb +117 -0
- data/ruby_whois_patches/whois.nic.ec.rb +157 -0
- data/ruby_whois_patches/whois.nic.im.rb +120 -0
- data/ruby_whois_patches/whois.nic.it.rb +170 -0
- data/ruby_whois_patches/whois.nic.lv.rb +116 -0
- data/ruby_whois_patches/whois.nic.ly.rb +127 -0
- data/ruby_whois_patches/whois.nic.mu.rb +27 -0
- data/ruby_whois_patches/whois.nic.mx.rb +123 -0
- data/ruby_whois_patches/whois.nic.net.sa.rb +111 -0
- data/ruby_whois_patches/whois.nic.or.kr.rb +101 -0
- data/ruby_whois_patches/whois.nic.tel.rb +129 -0
- data/ruby_whois_patches/whois.nic.tr.rb +133 -0
- data/ruby_whois_patches/whois.nic.us.rb +129 -0
- data/ruby_whois_patches/whois.nic.ve.rb +135 -0
- data/ruby_whois_patches/whois.norid.no.rb +127 -0
- data/ruby_whois_patches/whois.pandi.or.id.rb +118 -0
- data/ruby_whois_patches/whois.psi-usa.info.rb +63 -0
- data/ruby_whois_patches/whois.registro.br.rb +109 -0
- data/ruby_whois_patches/whois.registrygate.com.rb +55 -0
- data/ruby_whois_patches/whois.rrpproxy.net.rb +61 -0
- data/ruby_whois_patches/whois.sgnic.sg.rb +130 -0
- data/ruby_whois_patches/whois.srs.net.nz.rb +166 -0
- data/ruby_whois_patches/whois.tucows.com.rb +70 -0
- data/ruby_whois_patches/whois.twnic.net.tw.rb +133 -0
- data/settings/discovery_ports +24 -0
- data/settings/google_keywords.txt +9 -0
- data/settings/google_locator.txt +23 -0
- data/test/domain_tracker_test.rb +31 -0
- data/test/utils_test.rb +168 -0
- data/version.txt +13 -0
- data/wmap.gemspec +49 -0
- metadata +202 -0
data/lib/wmap/url_checker.rb
@@ -0,0 +1,314 @@
#--
# Wmap
#
# A pure Ruby library for the Internet web application discovery and tracking.
#
# Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
#++
require "net/http"
require 'httpclient'
require "openssl"
require "uri"
require "digest/md5"
require "parallel"

# A quick checker class to identify / finger-print a URL / site
class Wmap::UrlChecker
  include Wmap::Utils
  attr_accessor :http_timeout, :max_parallel, :verbose, :data_dir

  def initialize (params = {})
    # Set default instance variables
    @verbose=params.fetch(:verbose, false)
    @data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../data/')
    @http_timeout=params.fetch(:http_timeout, 5000)
    @max_parallel=params.fetch(:max_parallel, 40)
    @ssl_version=nil
    @url_code={}
    @url_redirection={}
    @url_finger_print={}
    @url_server={}
  end

  # Main worker method to perform various checks on the URL / site
  def url_worker (url)
    puts "Checking out an unknown URL: #{url}" if @verbose
    begin
      url=url.strip.downcase
      raise "Invalid URL format: #{url}" unless is_url?(url)
      timestamp=Time.now
      host=url_2_host(url)
      ip=host_2_ip(host)
      port=url_2_port(url)
      code=10000
      if @url_code.key?(url)
        code=@url_code[url]
      else
        code=response_code(url)
      end
      if @url_redirection.key?(url)
        loc=@url_redirection[url]
      else
        loc=redirect_location(url)
      end
      if @url_finger_print.key?(url)
        fp=@url_finger_print[url]
      else
        fp=response_body_md5(url)
      end
      if @url_server.key?(url)
        server=@url_server[url]
      else
        server=get_server_header(url)
      end
      # save the data
      checker=Hash.new
      checker['ip']=ip
      checker['port']=port
      checker['url']=url
      checker['code']=code
      checker['redirection']=loc
      checker['md5']=fp
      checker['server']=server
      checker['timestamp']=timestamp
      if Wmap::CidrTracker.new(:data_dir=>@data_dir).ip_trusted?(ip)
        checker['status']="int_hosted"
      else
        checker['status']="ext_hosted"
      end
      return checker
    rescue OpenSSL::SSL::SSLError => es # handler to temporally hold the openssl bug in bay: SSL_set_session: unable to find ssl method
      checker=Hash.new
      checker['ip']=ip
      checker['port']=port
      checker['url']=url
      checker['code']=20000
      checker['server']="Unknown SSL error: #{es}"
      checker['md']=nil
      checker['redirection']=nil
      checker['timestamp']=timestamp
      return checker
    rescue Exception => ee
      puts "Exception on method #{__method__} for #{url}: #{ee}" # if @verbose
      return nil
    end
  end
  alias_method :check, :url_worker

  # Parallel scanner - by utilizing fork manager 'parallel' to spawn numbers of child processes on multiple urls simultaneously
  def url_workers (targets,num=@max_parallel)
    begin
      results=Array.new
      targets -= ["", nil]
      if targets.size > 0
        puts "Start the url checker on the targets:\n #{targets}"
        Parallel.map(targets, :in_processes => num) { |target|
          url_worker(target)
        }.each do |process|
          if process.nil?
            next
          elsif process.empty?
            #do nothing
          else
            results << process
          end
        end
      end
      return results
    rescue Exception => ee
      puts "Exception on method #{__method__}: #{ee}" if @verbose
      return nil
    end
  end
  alias_method :checks, :url_workers

  # Test the URL and return the response code
  def response_code (url)
    puts "Check the http response code on the url: #{url}" if @verbose
    response_code = 10000 # All unknown url connection exceptions go here
    begin
      raise "Invalid url: #{url}" unless is_url?(url)
      url=url.strip.downcase
      timeo = @http_timeout/1000.0
      uri = URI.parse(url)
      http = Net::HTTP.new(uri.host, uri.port)
      http.open_timeout = timeo
      http.read_timeout = timeo
      if (url =~ /https\:/i)
        http.use_ssl = true
        #http.ssl_version = :SSLv3
        # Bypass the remote web server cert validation test
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end
      request = Net::HTTP::Get.new(uri.request_uri)
      response = http.request(request)
      puts "Server response the following: #{response}" if @verbose
      response_code = response.code.to_i
      #response.finish if response.started?()
      @url_code[url]=response_code
      puts "Response code on #{url}: #{response_code}" if @verbose
      return response_code
    rescue Exception => ee
      puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
      case ee
      # rescue "Connection reset by peer" error type
      when Errno::ECONNRESET
        response_code=104
      when Errno::ECONNABORTED,Errno::ETIMEDOUT
        #response_code=10000
      when Timeout::Error # Quick fix
        if (url =~ /https\:/i) # try again for ssl timeout session, in case of default :TLSv1 failure
          http.ssl_version = :SSLv3
          response = http.request(request)
          response_code = response.code.to_i
          unless response_code.nil?
            @ssl_version = http.ssl_version
          end
        end
      else
        #response_code=10000
      end
      @url_code[url]=response_code
      return response_code
    end
  end
  alias_method :query, :response_code

  # Test the URL / site and return the redirection location (3xx response code only)
  def redirect_location (url)
    puts "Test the redirection location for the url: #{url}" if @verbose
    location=""
    begin
      raise "Invalid url: #{url}" unless is_url?(url)
      url=url.strip.downcase
      timeo = @http_timeout/1000.0
      uri = URI.parse(url)
      code = response_code (url)
      if code >= 300 && code < 400
        http = Net::HTTP.new(uri.host, uri.port)
        http.open_timeout = timeo
        http.read_timeout = timeo
        if (url =~ /https\:/i)
          http.use_ssl = true
          # Bypass the remote web server cert validation test
          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
          http.ssl_version = @ssl_version
        end
        request = Net::HTTP::Get.new(uri.request_uri)
        response = http.request(request)
        case response
        when Net::HTTPRedirection then
          location = response['location']
        end
      end
      @url_redirection[url]=location
      return location
    rescue Exception => ee
      puts "Exception on method redirect_location for URL #{url}: #{ee}" if @verbose
      @url_redirection[url]=location
      return location
    end
  end
  alias_method :location, :redirect_location

  # Test the URL / site and return the web server type from the HTTP header "server" field
  def get_server_header (url)
    puts "Retrieve the server header field from the url: #{url}" if @verbose
    server=String.new
    begin
      raise "Invalid url: #{url}" unless is_url?(url)
      url=url.strip.downcase
      timeo = @http_timeout/1000.0
      uri = URI.parse(url)
      code = response_code (url)
      http = Net::HTTP.new(uri.host, uri.port)
      http.open_timeout = timeo
      http.read_timeout = timeo
      if (url =~ /https\:/i)
        http.use_ssl = true
        # Bypass the remote web server cert validation test
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
        http.ssl_version = @ssl_version
      end
      request = Net::HTTP::Get.new(uri.request_uri)
      response = http.request(request)
      server=response["server"]
      server=server.gsub(/\,/,' ')
      return server
    rescue Exception => ee
      puts "Exception on method get_server_header for URL #{url}: #{ee}" if @verbose
      @url_server[url]=server
      return server
    end
  end

  # Use MD5 algorithm to fingerprint the URL / site response payload (web page content)
  def response_body_md5(url)
    puts "MD5 finger print page body content: #{url}" if @verbose
    begin
      raise "Invalid url: #{url}" unless is_url?(url)
      url=url.strip.downcase
      timeo = @http_timeout/1000.0
      uri = URI.parse(url)
      fp=""
      http = Net::HTTP.new(uri.host, uri.port)
      http.open_timeout = timeo
      http.read_timeout = timeo
      if (url =~ /https\:/i)
        http.use_ssl = true
        # Bypass the remote web server cert validation test
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
        http.ssl_version = @ssl_version
      end
      request = Net::HTTP::Get.new(uri.request_uri)
      response = http.request(request)
      response_body = response.body.to_s
      fp=Digest::MD5.hexdigest(response_body) unless response_body.nil?
      @url_finger_print[url] = fp
      return fp
    rescue Exception => ee
      puts "Exception on method #{__method__}: #{ee}" if @verbose
    end
  end
  alias_method :md5, :response_body_md5

  # Retrieve the remote web server certification, open it and return the cert content as a string
  def get_certificate (url)
    puts "Retrieve the remote web server SSL certificate in clear text: #{url}" if @verbose
    begin
      url=url.strip
      raise "Invalid URL string: #{url}" unless is_ssl?(url)
      client = HTTPClient.new
      client.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
      response = client.get(url)
      cert = response.peer_cert
      cer = OpenSSL::X509::Certificate.new(cert)
      return cer.to_text
    rescue Exception => ee
      puts "Exception on method #{__method__} from #{url}: #{ee}"
    end
    return nil
  end
  alias_method :get_cert, :get_certificate

  # Retrieve the X509 cert in the clear text from the remote web server, extract and return the common name field within the cert
  def get_cert_cn (url)
    puts "Extract the common name field from a X509 cert: #{cert}" if @verbose
    begin
      cert=get_certificate(url)
      subject, cn = ""
      if cert =~ /\n(.+)Subject\:(.+)\n/i
        subject=$2
      end
      if subject =~/CN\=(.+)/i
        cn=$1
      end
      return cn
    rescue Exception => ee
      puts "Error on method #{__method__} from #{cert}: #{ee}" if @verbose
    end
    return nil
  end
  alias_method :get_cn, :get_cert_cn

end
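
For context, a minimal usage sketch of the checker class above. This is not part of the gem diff: the target URL is a placeholder, and it assumes the gem and its dependencies are installed and loadable via require 'wmap'.

# Minimal sketch (hypothetical target URL) exercising the public methods defined above.
require 'wmap'

checker = Wmap::UrlChecker.new(:http_timeout => 5000, :verbose => false)
record  = checker.check("https://www.example.com/")   # alias of url_worker; returns a Hash of findings or nil
puts record.inspect unless record.nil?
puts checker.query("https://www.example.com/")        # alias of response_code; HTTP status as an integer
puts checker.get_cn("https://www.example.com/")       # common name field from the site's X509 certificate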
data/lib/wmap/url_crawler.rb
@@ -0,0 +1,381 @@
#--
# Wmap
#
# A pure Ruby library for Internet web application discovery and tracking.
#
# Copyright (c) 2012-2015 Yang Li
#++
require "net/http"
require "uri"
require "open-uri"
require "open_uri_redirections"
require "nokogiri"
require "parallel"


# Web site crawler class
class Wmap::UrlCrawler
  include Wmap::Utils

  attr_accessor :http_timeout, :crawl_page_limit, :crawl_depth, :max_parallel, :verbose, :data_dir
  attr_reader :discovered_urls_by_crawler, :visited_urls_by_crawler, :crawl_start, :crawl_done
  # Global variable used to store the combined result of all the forked child processes. Note that class variable
  # would not be able to pass the result due the limitation of IO Pipe communication mechanism used by 'parallel' fork manager
  # $discovered_urls=Hash.new

  # set hard stop limit of http time-out to 8 seconds, in order to avoid severe performance penalty for certain 'weird' site(s)
  Max_http_timeout=8000
  # set hard stop limit of crawler time-out to 1200 seconds or 20 minutes
  Crawl_timeout=1200000

  # Crawler instance default variables
  def initialize (params = {})
    @verbose=params.fetch(:verbose, false)
    @data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../logs/')
    @http_timeout=params.fetch(:http_timeout, 5000)
    @crawl_depth=params.fetch(:crawl_depth, 4)
    @crawl_page_limit=params.fetch(:crawl_page_limit, 1000)
    @max_parallel=params.fetch(:max_parallel, 40)
    # Discovered data store
    @discovered_urls_by_crawler=Hash.new
    @visited_urls_by_crawler=Hash.new
    @crawl_start=Hash.new
    @crawl_done=Hash.new
    Dir.mkdir(@data_dir) unless Dir.exist?(@data_dir)
    @log_file=@data_dir + "crawler.log"
  end

  # Pre-crawl profiler, to be used for network profiling to maximum the crawler performance.
  def pre_crawl(url)
    puts "Perform network profiling works on the web server before the web crawling: #{url}" if @verbose
    begin
      host=url_2_host(url)
      # Use the following formula to 'guess' the right http time-out threshold for the scanner
      nwk_to=Wmap::NetworkProfiler.new.profile(host).to_i
      if (1500 + Wmap::NetworkProfiler.new.profile(host)*2).to_i > Max_http_timeout
        @http_timeout = Max_http_timeout
      else
        @http_timeout = 1500 + nwk_to*2
      end
      puts "Done with the pre-scan works: reset @http_timeout to: #{@http_timeout} ms" if @verbose
    rescue Exception => ee
      puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
      @http_timeout = Max_http_timeout
    end
  end

  # A web crawler to crawl a known website and search for html links within the same root domain. For example,
  # by crawling 'http://www.yahoo.com/' it could discover 'http://login.yahoo.com/'
  def crawl(url)
    puts "Start web crawling on #{url}"
    #begin
    result=Array.new
    url=url.chomp.strip
    result.push(url_2_site(url))
    raise "Error! Invalid url format: #{urls}" unless is_url?(url)
    # Add logic to profile the web server before crawling; this is used to optimize the crawling speed
    pre_crawl(url)
    status = Timeout::timeout(Crawl_timeout/1000) {
      result+=crawl_worker(url).keys
    }
    puts "Web crawling time-out on #{url}: #{status}" if @verbose
    return result
    #rescue => ee
    #puts "Exception on method #{__method__} for URL #{url}: #{ee}"
    #return result
    #end
  end
  alias_method :query, :crawl

  # The worker instance of crawler who perform the labour work
  def crawl_worker(url0)
    puts "Please be aware that it may take a while to crawl #{url0}, depending on the site's responsiveness and the amount of contents."
    #begin
    # Input URL sanity check first
    if is_url?(url0)
      host=url_2_host(url0)
      ip=host_2_ip(host).to_s
      raise "Invalid IP address: #{url0}" if ip.nil?
      port=url_2_port(url0).to_s
      raise "Invalid port number: #{url0}" if port.nil?
    else
      raise "Invalid URL: #{url0}. Please check it out with your browser again."
    end
    log_info=Hash.new
    log_info[1]="Start working on #{url0}"
    url_stores=Hash.new
    url_stores[url0]=true unless url_stores.key?(url0)
    @discovered_urls_by_crawler[url0]=true unless @discovered_urls_by_crawler.key?(url0)
    @crawl_start[url0]=true unless @crawl_start.key?(url0)
    # $discovered_urls[url0]=true unless $discovered_urls.key?(url0)
    @crawl_depth.times do
      url_stores.keys.each do |url|
        # 10/01/2013 add logic to avoid unnecessary crawling within the same child instance
        next if @visited_urls_by_crawler.key?(url)
        url_object = open_url(url)
        next if url_object == nil
        url = update_url_if_redirected(url, url_object)
        url_body = read_url(url)
        # Protection code - to avoid parsing failure on the empty or nil object
        next if url_body.nil? or url_body.empty?
        url_stores[url]=true unless url_stores.key?(url)
        @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
        # $discovered_urls[url]=true unless $discovered_urls.key?(url)
        doc = parse_html(url_body)
        next if doc == nil
        if url_stores.size >= @crawl_page_limit
          #@visited_urls_by_crawler.merge!(url_stores)
          @discovered_urls_by_crawler.merge!(url_stores)
          # $discovered_urls.merge!(url_stores)
          puts "Finish web crawling the url: #{url0}"
          return url_stores
        end
        page_urls = find_urls_on_page(doc, url)
        page_urls.uniq!
        page_urls.map do |y|
          y=normalize_url(y)
          url_stores[y]=true unless url_stores.key?(y)
          @discovered_urls_by_crawler[y]=true unless @discovered_urls_by_crawler.key?(y)
          # $discovered_urls[y]=true unless $discovered_urls.key?(y)
        end
      end
    end
    puts "Finish web crawling on: #{url0}"
    log_info[2]="Finish working on: #{url0}"
    wlog(log_info, "UrlCrawler", @log_file)
    @crawl_done[url0]=true unless @crawl_done.key?(url0)
    return url_stores
    #rescue => ee
    #puts "Exception on method #{__method__} for URL #{url0}: #{ee}" if @verbose
    #log_info[3]="Exception on #{url0}"
    #wlog(log_info,"UrlCrawler",@log_file)
    #return url_stores
    #end
  end

  # Fast crawling by utilizing fork manager parallel to spawn numbers of child processes at the same time
  # each child process will continuously work on the target pool until all the works are done
  def crawl_workers (targets,num=@max_parallel)
    begin
      raise "Input error - expecting targets in an array format: #{targets}" unless targets.kind_of? Array
      puts "Sanitize the URL seeds to eliminate the unnecessary duplication(s) ..." if @verbose
      #puts "This could be awhile depending on the list size. Please be patient ..."
      # 09/30/2013 Add additional logic to eliminate the duplicate target site(s) before the crawlers are invoked.
      targets -= ["", nil]
      uniq_sites=Hash.new
      targets.dup.map do |target|
        if is_url?(target)
          host=url_2_host(target)
          ip=host_2_ip(host).to_s
          next if ip.nil?
          port=url_2_port(target).to_s
          next if port.nil?
          site_key=ip+":"+port
          unless uniq_sites.key?(site_key)
            uniq_sites[site_key]=target
          end
        end
      end
      puts "Sanitization done! " if @verbose
      puts "Start the parallel engine on the normalized crawling list:\n #{targets} "
      puts "Maximum number of web crawling sessions allowed: #{num}" #if @verbose
      raise "Error: target list is empty!" if targets.size < 1
      Parallel.map(uniq_sites.values, :in_processes => num) { |target|
        puts "Working on #{target} ..." if @verbose
        crawl(target)
      }.dup.each do |process|
        puts "process.inspect: #{process}" if @verbose
        urls=process
        urls-=["",nil] unless urls.nil?
        if urls.nil?
          next
        elsif urls.empty?
          next
          #do nothing
        else
          urls.map do |url|
            url.strip!
            @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
            #$discovered_urls[url]=true unless $discovered_urls.key?(url)
          end
        end
      end
      #return sites
      return @discovered_urls_by_crawler.keys
    rescue Exception => ee
      puts "Exception on method #{__method__}: #{ee}" if @verbose
      return nil
    end
  end
  alias_method :crawls, :crawl_workers

  # Fast crawling method - build the target pool from the input file
  def crawl_workers_on_file (file)
    puts "Web crawl the list of targets from file: #{file}"
    begin
      targets=file_2_list(file)
      sites=crawl_workers(targets,num=@max_parallel)
      return sites
    rescue => ee
      puts "Exception on method #{__method__}: #{ee}" if @verbose
      return nil
    end
  end
  alias_method :query_file, :crawl_workers_on_file
  alias_method :crawl_file, :crawl_workers_on_file

  # Wrapper for the OpenURI open method - create an open_uri object and return the reference upon success
  def open_url(url)
    puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
    #url_object = nil
    begin
      if url =~ /http\:/i
        # patch for allow the 'un-safe' URL redirection i.e. https://www.example.com -> http://www.example.com
        url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000)
        #url_object = open(url)
      elsif url =~ /https\:/i
        url_object = open(url,:ssl_verify_mode => 0, :allow_redirections =>:safe, :read_timeout=>Max_http_timeout/1000)
        #url_object = open(url,:ssl_verify_mode => 0)
      else
        raise "Invalid URL format - please specify the protocol prefix http(s) in the URL: #{url}"
      end
      return url_object
    rescue => ee
      puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
      return nil
    end
  end

  # Wrapper to use OpenURI method 'read' to return url body contents
  def read_url(url)
    puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
    begin
      url_object=open_url(url)
      @visited_urls_by_crawler[url]=true unless @visited_urls_by_crawler.key?(url)
      body=url_object.read
      return body
    rescue => ee
      puts "Exception on method #{__method__}: #{ee}" if @verbose
      return nil
    end
  end

  # Return the destination url in case of url re-direct
  def update_url_if_redirected(url, url_object)
    #puts "Comparing the original URL with the return object base_uri. Return the one where the true content is found. " if @verbose
    begin
      if url != url_object.base_uri.to_s
        return url_object.base_uri.to_s
      end
      return url
    rescue => ee
      puts "Exception on method #{__method__}: #{ee}" if @verbose
      return nil
    end
  end

  # Wrapper for the Nokogiri DOM parser
  def parse_html(html_body)
    #puts "Parsing the html content: #{html_body}. Return DOM " if @verbose
    begin
      doc = Nokogiri::HTML(html_body)
      #puts "Successfully crawling the url: #{url_object.base_uri.to_s}" if @verbose
      #puts "doc: #{doc}" if @verbose
      return doc
    rescue => ee
      puts "Exception on method #{__method__}: #{ee}" if @verbose
      return nil
    end
  end

  # Search 'current_url' and return found URLs under the same domain
  def find_urls_on_page(doc, current_url)
    #puts "Search and return URLs within the doc: #{doc}" if @verbose
    begin
      urls_list = []
      # case 1 - search embedded HTML tag <a href='url'> for the url elements
      links=doc.css('a')
      links.map do |x|
        #puts "x: #{x}"
        new_url = x.attribute('href').to_s
        unless new_url == nil
          if new_url.match("http")
            #if urls_on_same_domain?(new_url,current_url)
            urls_list.push(new_url)
            #end
          else
            new_url = make_absolute(current_url, new_url)
            urls_list.push(new_url)
          end
        end
      end
      # case 2 - search client side redirect - <meta http-equiv="refresh" content="5;URL='http://example.com/'">
      elements=doc.css("meta[http-equiv]")
      unless elements.size == 0
        link=elements.attr("content").value.split(/url\=/i)[1]
        unless link.nil?
          new_url = make_absolute(current_url, link)
          urls_list.push(new_url) unless new_url.nil?
        end
      end
      #puts "Found URLs under page #{current_url}:\n#{urls_list}" if @verbose
      return urls_list.uniq-["",nil]
    rescue => ee
      puts "Exception on method #{__method__}: #{ee}" if @verbose
      return nil
    end
  end

  # Method to print out discovery URL result
  def print_discovered_urls_by_crawler
    puts "Print discovered url by the crawler. " if @verbose
    begin
      puts "\nSummary Report of Discovered URLs from the Crawler:"
      @discovered_urls_by_crawler.keys.each do |url|
        puts url
      end
      puts "Total: #{@discovered_urls_by_crawler.keys.size}"
      puts "End of the summary"
    rescue => ee
      puts "Exception on method #{__method__}: #{ee}" if @verbose
      return nil
    end
  end
  alias_method :print, :print_discovered_urls_by_crawler

  # Method to save URL discovery result
  def save_discovered_urls (file)
    puts "Save discovered urls by the crawler to file: #{file} "
    begin
      list_2_file(@discovered_urls_by_crawler.keys, file)
      puts "Done!"
    rescue => ee
      puts "Exception on method #{__method__}: #{ee}" if @verbose
      return nil
    end
  end
  alias_method :save, :save_discovered_urls

  # Method to retrieve discovery site result
  def get_discovered_sites_by_crawler
    puts "Print summary report of discovered sites. " if @verbose
    begin
      puts "\nSummary Report of Discovered Sites from the Crawler:"
      sites = Hash.new
      @discovered_urls_by_crawler.keys.each do |url|
        site=url_2_site(url)
        sites[site]=true unless sites.key?(site)
      end
      sites.keys.map { |site| puts site }
      puts "Total: #{sites.size}"
      puts "End of the summary"
      return sites.keys
    rescue => ee
      puts "Exception on method #{__method__}: #{ee}" if @verbose
      return nil
    end
  end
  alias_method :get_sites, :get_discovered_sites_by_crawler

  private :open_url, :read_url, :update_url_if_redirected, :parse_html, :find_urls_on_page
end