wmap 2.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +141 -0
- data/LICENSE.txt +15 -0
- data/README.rdoc +98 -0
- data/TODO +13 -0
- data/bin/deprime +21 -0
- data/bin/distrust +38 -0
- data/bin/googleBot +23 -0
- data/bin/prime +21 -0
- data/bin/refresh +26 -0
- data/bin/run_tests +16 -0
- data/bin/spiderBot +26 -0
- data/bin/trust +38 -0
- data/bin/updateAll +57 -0
- data/bin/wadd +25 -0
- data/bin/wadds +26 -0
- data/bin/wcheck +28 -0
- data/bin/wdel +25 -0
- data/bin/wdump +21 -0
- data/bin/wmap +151 -0
- data/bin/wscan +32 -0
- data/data/cidrs +2 -0
- data/data/deactivated_sites +1 -0
- data/data/domains +2 -0
- data/data/hosts +1 -0
- data/data/prime_hosts +1 -0
- data/data/sites +2 -0
- data/data/sub_domains +2 -0
- data/demos/bruter.rb +27 -0
- data/demos/dns_brutes.rb +28 -0
- data/demos/filter_cidr.rb +18 -0
- data/demos/filter_crawls.rb +5 -0
- data/demos/filter_domain.rb +25 -0
- data/demos/filter_geoip.rb +26 -0
- data/demos/filter_known_services.rb +59 -0
- data/demos/filter_netinfo.rb +23 -0
- data/demos/filter_prime.rb +25 -0
- data/demos/filter_profiler.rb +3 -0
- data/demos/filter_redirection.rb +19 -0
- data/demos/filter_site.rb +40 -0
- data/demos/filter_siteip.rb +31 -0
- data/demos/filter_status.rb +17 -0
- data/demos/filter_timestamp.rb +23 -0
- data/demos/filter_url.rb +19 -0
- data/demos/new_fnd.rb +66 -0
- data/demos/nmap_parser.pl +138 -0
- data/demos/site_format.rb +18 -0
- data/demos/whois_domain.rb +78 -0
- data/dicts/GeoIP.dat +0 -0
- data/dicts/GeoIPASNum.dat +0 -0
- data/dicts/GeoLiteCity.dat +0 -0
- data/dicts/ccsld.txt +2646 -0
- data/dicts/cctld.txt +243 -0
- data/dicts/gtld.txt +25 -0
- data/dicts/hostnames-dict.big +1402 -0
- data/dicts/hostnames-dict.txt +101 -0
- data/lib/wmap/cidr_tracker.rb +327 -0
- data/lib/wmap/dns_bruter.rb +308 -0
- data/lib/wmap/domain_tracker/sub_domain.rb +142 -0
- data/lib/wmap/domain_tracker.rb +342 -0
- data/lib/wmap/geoip_tracker.rb +72 -0
- data/lib/wmap/google_search_scraper.rb +177 -0
- data/lib/wmap/host_tracker/primary_host.rb +130 -0
- data/lib/wmap/host_tracker.rb +550 -0
- data/lib/wmap/network_profiler.rb +144 -0
- data/lib/wmap/port_scanner.rb +208 -0
- data/lib/wmap/site_tracker/deactivated_site.rb +85 -0
- data/lib/wmap/site_tracker.rb +937 -0
- data/lib/wmap/url_checker.rb +314 -0
- data/lib/wmap/url_crawler.rb +381 -0
- data/lib/wmap/utils/domain_root.rb +184 -0
- data/lib/wmap/utils/logger.rb +53 -0
- data/lib/wmap/utils/url_magic.rb +343 -0
- data/lib/wmap/utils/utils.rb +333 -0
- data/lib/wmap/whois.rb +76 -0
- data/lib/wmap.rb +227 -0
- data/logs/wmap.log +17 -0
- data/ruby_whois_patches/base_cocca2.rb +149 -0
- data/ruby_whois_patches/kero.yachay.pe.rb +120 -0
- data/ruby_whois_patches/whois.PublicDomainRegistry.com.rb +124 -0
- data/ruby_whois_patches/whois.above.com.rb +61 -0
- data/ruby_whois_patches/whois.adamsnames.tc.rb +107 -0
- data/ruby_whois_patches/whois.aeda.net.ae.rb +105 -0
- data/ruby_whois_patches/whois.ai.rb +112 -0
- data/ruby_whois_patches/whois.arnes.si.rb +121 -0
- data/ruby_whois_patches/whois.ascio.com.rb +91 -0
- data/ruby_whois_patches/whois.cnnic.cn.rb +123 -0
- data/ruby_whois_patches/whois.corporatedomains.com.rb +67 -0
- data/ruby_whois_patches/whois.crsnic.net.rb +108 -0
- data/ruby_whois_patches/whois.denic.de.rb +174 -0
- data/ruby_whois_patches/whois.dk-hostmaster.dk.rb +120 -0
- data/ruby_whois_patches/whois.dns.be.rb +134 -0
- data/ruby_whois_patches/whois.dns.lu.rb +129 -0
- data/ruby_whois_patches/whois.dns.pl.rb +150 -0
- data/ruby_whois_patches/whois.dns.pt.rb +119 -0
- data/ruby_whois_patches/whois.domain.kg.rb +126 -0
- data/ruby_whois_patches/whois.domainregistry.my.rb +123 -0
- data/ruby_whois_patches/whois.domreg.lt.rb +110 -0
- data/ruby_whois_patches/whois.dot.tk.rb +140 -0
- data/ruby_whois_patches/whois.hkirc.hk.rb +121 -0
- data/ruby_whois_patches/whois.isnic.is.rb +130 -0
- data/ruby_whois_patches/whois.je.rb +119 -0
- data/ruby_whois_patches/whois.jprs.jp.rb +137 -0
- data/ruby_whois_patches/whois.kenic.or.ke.rb +140 -0
- data/ruby_whois_patches/whois.markmonitor.com.rb +118 -0
- data/ruby_whois_patches/whois.melbourneit.com.rb +58 -0
- data/ruby_whois_patches/whois.nic.as.rb +96 -0
- data/ruby_whois_patches/whois.nic.at.rb +109 -0
- data/ruby_whois_patches/whois.nic.ch.rb +141 -0
- data/ruby_whois_patches/whois.nic.cl.rb +117 -0
- data/ruby_whois_patches/whois.nic.ec.rb +157 -0
- data/ruby_whois_patches/whois.nic.im.rb +120 -0
- data/ruby_whois_patches/whois.nic.it.rb +170 -0
- data/ruby_whois_patches/whois.nic.lv.rb +116 -0
- data/ruby_whois_patches/whois.nic.ly.rb +127 -0
- data/ruby_whois_patches/whois.nic.mu.rb +27 -0
- data/ruby_whois_patches/whois.nic.mx.rb +123 -0
- data/ruby_whois_patches/whois.nic.net.sa.rb +111 -0
- data/ruby_whois_patches/whois.nic.or.kr.rb +101 -0
- data/ruby_whois_patches/whois.nic.tel.rb +129 -0
- data/ruby_whois_patches/whois.nic.tr.rb +133 -0
- data/ruby_whois_patches/whois.nic.us.rb +129 -0
- data/ruby_whois_patches/whois.nic.ve.rb +135 -0
- data/ruby_whois_patches/whois.norid.no.rb +127 -0
- data/ruby_whois_patches/whois.pandi.or.id.rb +118 -0
- data/ruby_whois_patches/whois.psi-usa.info.rb +63 -0
- data/ruby_whois_patches/whois.registro.br.rb +109 -0
- data/ruby_whois_patches/whois.registrygate.com.rb +55 -0
- data/ruby_whois_patches/whois.rrpproxy.net.rb +61 -0
- data/ruby_whois_patches/whois.sgnic.sg.rb +130 -0
- data/ruby_whois_patches/whois.srs.net.nz.rb +166 -0
- data/ruby_whois_patches/whois.tucows.com.rb +70 -0
- data/ruby_whois_patches/whois.twnic.net.tw.rb +133 -0
- data/settings/discovery_ports +24 -0
- data/settings/google_keywords.txt +9 -0
- data/settings/google_locator.txt +23 -0
- data/test/domain_tracker_test.rb +31 -0
- data/test/utils_test.rb +168 -0
- data/version.txt +13 -0
- data/wmap.gemspec +49 -0
- metadata +202 -0
@@ -0,0 +1,314 @@
+#--
+# Wmap
+#
+# A pure Ruby library for the Internet web application discovery and tracking.
+#
+# Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
+#++
+require "net/http"
+require 'httpclient'
+require "openssl"
+require "uri"
+require "digest/md5"
+require "parallel"
+
+# A quick checker class to identify / finger-print a URL / site
+class Wmap::UrlChecker
+  include Wmap::Utils
+  attr_accessor :http_timeout, :max_parallel, :verbose, :data_dir
+
+  def initialize (params = {})
+    # Set default instance variables
+    @verbose=params.fetch(:verbose, false)
+    @data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../data/')
+    @http_timeout=params.fetch(:http_timeout, 5000)
+    @max_parallel=params.fetch(:max_parallel, 40)
+    @ssl_version=nil
+    @url_code={}
+    @url_redirection={}
+    @url_finger_print={}
+    @url_server={}
+  end
+
+  # Main worker method to perform various checks on the URL / site
+  def url_worker (url)
+    puts "Checking out an unknown URL: #{url}" if @verbose
+    begin
+      url=url.strip.downcase
+      raise "Invalid URL format: #{url}" unless is_url?(url)
+      timestamp=Time.now
+      host=url_2_host(url)
+      ip=host_2_ip(host)
+      port=url_2_port(url)
+      code=10000
+      if @url_code.key?(url)
+        code=@url_code[url]
+      else
+        code=response_code(url)
+      end
+      if @url_redirection.key?(url)
+        loc=@url_redirection[url]
+      else
+        loc=redirect_location(url)
+      end
+      if @url_finger_print.key?(url)
+        fp=@url_finger_print[url]
+      else
+        fp=response_body_md5(url)
+      end
+      if @url_server.key?(url)
+        server=@url_server[url]
+      else
+        server=get_server_header(url)
+      end
+      # save the data
+      checker=Hash.new
+      checker['ip']=ip
+      checker['port']=port
+      checker['url']=url
+      checker['code']=code
+      checker['redirection']=loc
+      checker['md5']=fp
+      checker['server']=server
+      checker['timestamp']=timestamp
+      if Wmap::CidrTracker.new(:data_dir=>@data_dir).ip_trusted?(ip)
+        checker['status']="int_hosted"
+      else
+        checker['status']="ext_hosted"
+      end
+      return checker
+    rescue OpenSSL::SSL::SSLError => es # handler to temporally hold the openssl bug in bay: SSL_set_session: unable to find ssl method
+      checker=Hash.new
+      checker['ip']=ip
+      checker['port']=port
+      checker['url']=url
+      checker['code']=20000
+      checker['server']="Unknown SSL error: #{es}"
+      checker['md']=nil
+      checker['redirection']=nil
+      checker['timestamp']=timestamp
+      return checker
+    rescue Exception => ee
+      puts "Exception on method #{__method__} for #{url}: #{ee}" # if @verbose
+      return nil
+    end
+  end
+  alias_method :check, :url_worker
+
+  # Parallel scanner - by utilizing fork manager 'parallel' to spawn numbers of child processes on multiple urls simultaneously
+  def url_workers (targets,num=@max_parallel)
+    begin
+      results=Array.new
+      targets -= ["", nil]
+      if targets.size > 0
+        puts "Start the url checker on the targets:\n #{targets}"
+        Parallel.map(targets, :in_processes => num) { |target|
+          url_worker(target)
+        }.each do |process|
+          if process.nil?
+            next
+          elsif process.empty?
+            #do nothing
+          else
+            results << process
+          end
+        end
+      end
+      return results
+    rescue Exception => ee
+      puts "Exception on method #{__method__}: #{ee}" if @verbose
+      return nil
+    end
+  end
+  alias_method :checks, :url_workers
+
+  # Test the URL and return the response code
+  def response_code (url)
+    puts "Check the http response code on the url: #{url}" if @verbose
+    response_code = 10000 # All unknown url connection exceptions go here
+    begin
+      raise "Invalid url: #{url}" unless is_url?(url)
+      url=url.strip.downcase
+      timeo = @http_timeout/1000.0
+      uri = URI.parse(url)
+      http = Net::HTTP.new(uri.host, uri.port)
+      http.open_timeout = timeo
+      http.read_timeout = timeo
+      if (url =~ /https\:/i)
+        http.use_ssl = true
+        #http.ssl_version = :SSLv3
+        # Bypass the remote web server cert validation test
+        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+      end
+      request = Net::HTTP::Get.new(uri.request_uri)
+      response = http.request(request)
+      puts "Server response the following: #{response}" if @verbose
+      response_code = response.code.to_i
+      #response.finish if response.started?()
+      @url_code[url]=response_code
+      puts "Response code on #{url}: #{response_code}" if @verbose
+      return response_code
+    rescue Exception => ee
+      puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
+      case ee
+      # rescue "Connection reset by peer" error type
+      when Errno::ECONNRESET
+        response_code=104
+      when Errno::ECONNABORTED,Errno::ETIMEDOUT
+        #response_code=10000
+      when Timeout::Error # Quick fix
+        if (url =~ /https\:/i) # try again for ssl timeout session, in case of default :TLSv1 failure
+          http.ssl_version = :SSLv3
+          response = http.request(request)
+          response_code = response.code.to_i
+          unless response_code.nil?
+            @ssl_version = http.ssl_version
+          end
+        end
+      else
+        #response_code=10000
+      end
+      @url_code[url]=response_code
+      return response_code
+    end
+  end
+  alias_method :query, :response_code
+
+  # Test the URL / site and return the redirection location (3xx response code only)
+  def redirect_location (url)
+    puts "Test the redirection location for the url: #{url}" if @verbose
+    location=""
+    begin
+      raise "Invalid url: #{url}" unless is_url?(url)
+      url=url.strip.downcase
+      timeo = @http_timeout/1000.0
+      uri = URI.parse(url)
+      code = response_code (url)
+      if code >= 300 && code < 400
+        http = Net::HTTP.new(uri.host, uri.port)
+        http.open_timeout = timeo
+        http.read_timeout = timeo
+        if (url =~ /https\:/i)
+          http.use_ssl = true
+          # Bypass the remote web server cert validation test
+          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+          http.ssl_version = @ssl_version
+        end
+        request = Net::HTTP::Get.new(uri.request_uri)
+        response = http.request(request)
+        case response
+        when Net::HTTPRedirection then
+          location = response['location']
+        end
+      end
+      @url_redirection[url]=location
+      return location
+    rescue Exception => ee
+      puts "Exception on method redirect_location for URL #{url}: #{ee}" if @verbose
+      @url_redirection[url]=location
+      return location
+    end
+  end
+  alias_method :location, :redirect_location
+
+  # Test the URL / site and return the web server type from the HTTP header "server" field
+  def get_server_header (url)
+    puts "Retrieve the server header field from the url: #{url}" if @verbose
+    server=String.new
+    begin
+      raise "Invalid url: #{url}" unless is_url?(url)
+      url=url.strip.downcase
+      timeo = @http_timeout/1000.0
+      uri = URI.parse(url)
+      code = response_code (url)
+      http = Net::HTTP.new(uri.host, uri.port)
+      http.open_timeout = timeo
+      http.read_timeout = timeo
+      if (url =~ /https\:/i)
+        http.use_ssl = true
+        # Bypass the remote web server cert validation test
+        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+        http.ssl_version = @ssl_version
+      end
+      request = Net::HTTP::Get.new(uri.request_uri)
+      response = http.request(request)
+      server=response["server"]
+      server=server.gsub(/\,/,' ')
+      return server
+    rescue Exception => ee
+      puts "Exception on method get_server_header for URL #{url}: #{ee}" if @verbose
+      @url_server[url]=server
+      return server
+    end
+  end
+
+  # Use MD5 algorithm to fingerprint the URL / site response payload (web page content)
+  def response_body_md5(url)
+    puts "MD5 finger print page body content: #{url}" if @verbose
+    begin
+      raise "Invalid url: #{url}" unless is_url?(url)
+      url=url.strip.downcase
+      timeo = @http_timeout/1000.0
+      uri = URI.parse(url)
+      fp=""
+      http = Net::HTTP.new(uri.host, uri.port)
+      http.open_timeout = timeo
+      http.read_timeout = timeo
+      if (url =~ /https\:/i)
+        http.use_ssl = true
+        # Bypass the remote web server cert validation test
+        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+        http.ssl_version = @ssl_version
+      end
+      request = Net::HTTP::Get.new(uri.request_uri)
+      response = http.request(request)
+      response_body = response.body.to_s
+      fp=Digest::MD5.hexdigest(response_body) unless response_body.nil?
+      @url_finger_print[url] = fp
+      return fp
+    rescue Exception => ee
+      puts "Exception on method #{__method__}: #{ee}" if @verbose
+    end
+  end
+  alias_method :md5, :response_body_md5
+
+  # Retrieve the remote web server certification, open it and return the cert content as a string
+  def get_certificate (url)
+    puts "Retrieve the remote web server SSL certificate in clear text: #{url}" if @verbose
+    begin
+      url=url.strip
+      raise "Invalid URL string: #{url}" unless is_ssl?(url)
+      client = HTTPClient.new
+      client.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
+      response = client.get(url)
+      cert = response.peer_cert
+      cer = OpenSSL::X509::Certificate.new(cert)
+      return cer.to_text
+    rescue Exception => ee
+      puts "Exception on method #{__method__} from #{url}: #{ee}"
+    end
+    return nil
+  end
+  alias_method :get_cert, :get_certificate
+
+  # Retrieve the X509 cert in the clear text from the remote web server, extract and return the common name field within the cert
+  def get_cert_cn (url)
+    puts "Extract the common name field from a X509 cert: #{cert}" if @verbose
+    begin
+      cert=get_certificate(url)
+      subject, cn = ""
+      if cert =~ /\n(.+)Subject\:(.+)\n/i
+        subject=$2
+      end
+      if subject =~/CN\=(.+)/i
+        cn=$1
+      end
+      return cn
+    rescue Exception => ee
+      puts "Error on method #{__method__} from #{cert}: #{ee}" if @verbose
+    end
+    return nil
+  end
+  alias_method :get_cn, :get_cert_cn
+
+end
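For reference, a minimal usage sketch of the Wmap::UrlChecker class added above (data/lib/wmap/url_checker.rb in the file list). This is an illustration only, not part of the package diff: it assumes the wmap 2.4.4 gem is installed and loaded with `require 'wmap'`, and the target URL is a placeholder. The method names (`check`, `get_cert_cn`) and the returned hash keys come from the code above.

```ruby
# Minimal sketch, assuming the wmap 2.4.4 gem is installed; the URL below
# is a placeholder for a site you are authorized to probe.
require 'wmap'

checker = Wmap::UrlChecker.new(:http_timeout => 5000, :verbose => false)

# url_worker / check returns a hash describing the URL, or nil on error
record = checker.check("https://www.example.com/")
unless record.nil?
  puts "Response code: #{record['code']}"
  puts "Redirection:   #{record['redirection']}"
  puts "Body MD5:      #{record['md5']}"
  puts "Server header: #{record['server']}"
  puts "Hosting:       #{record['status']}"   # "int_hosted" or "ext_hosted"
end

# Individual probes are also exposed, e.g. the cert common name of an SSL site
puts checker.get_cert_cn("https://www.example.com/")
```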
@@ -0,0 +1,381 @@
+#--
+# Wmap
+#
+# A pure Ruby library for Internet web application discovery and tracking.
+#
+# Copyright (c) 2012-2015 Yang Li
+#++
+require "net/http"
+require "uri"
+require "open-uri"
+require "open_uri_redirections"
+require "nokogiri"
+require "parallel"
+
+
+# Web site crawler class
+class Wmap::UrlCrawler
+  include Wmap::Utils
+
+  attr_accessor :http_timeout, :crawl_page_limit, :crawl_depth, :max_parallel, :verbose, :data_dir
+  attr_reader :discovered_urls_by_crawler, :visited_urls_by_crawler, :crawl_start, :crawl_done
+  # Global variable used to store the combined result of all the forked child processes. Note that class variable
+  # would not be able to pass the result due the limitation of IO Pipe communication mechanism used by 'parallel' fork manager
+  # $discovered_urls=Hash.new
+
+  # set hard stop limit of http time-out to 8 seconds, in order to avoid severe performance penalty for certain 'weird' site(s)
+  Max_http_timeout=8000
+  # set hard stop limit of crawler time-out to 1200 seconds or 20 minutes
+  Crawl_timeout=1200000
+
+  # Crawler instance default variables
+  def initialize (params = {})
+    @verbose=params.fetch(:verbose, false)
+    @data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../logs/')
+    @http_timeout=params.fetch(:http_timeout, 5000)
+    @crawl_depth=params.fetch(:crawl_depth, 4)
+    @crawl_page_limit=params.fetch(:crawl_page_limit, 1000)
+    @max_parallel=params.fetch(:max_parallel, 40)
+    # Discovered data store
+    @discovered_urls_by_crawler=Hash.new
+    @visited_urls_by_crawler=Hash.new
+    @crawl_start=Hash.new
+    @crawl_done=Hash.new
+    Dir.mkdir(@data_dir) unless Dir.exist?(@data_dir)
+    @log_file=@data_dir + "crawler.log"
+  end
+
+  # Pre-crawl profiler, to be used for network profiling to maximum the crawler performance.
+  def pre_crawl(url)
+    puts "Perform network profiling works on the web server before the web crawling: #{url}" if @verbose
+    begin
+      host=url_2_host(url)
+      # Use the following formula to 'guess' the right http time-out threshold for the scanner
+      nwk_to=Wmap::NetworkProfiler.new.profile(host).to_i
+      if (1500 + Wmap::NetworkProfiler.new.profile(host)*2).to_i > Max_http_timeout
+        @http_timeout = Max_http_timeout
+      else
+        @http_timeout = 1500 + nwk_to*2
+      end
+      puts "Done with the pre-scan works: reset @http_timeout to: #{@http_timeout} ms" if @verbose
+    rescue Exception => ee
+      puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
+      @http_timeout = Max_http_timeout
+    end
+  end
+
+  # A web crawler to crawl a known website and search for html links within the same root domain. For example,
+  # by crawling 'http://www.yahoo.com/' it could discover 'http://login.yahoo.com/'
+  def crawl(url)
+    puts "Start web crawling on #{url}"
+    #begin
+    result=Array.new
+    url=url.chomp.strip
+    result.push(url_2_site(url))
+    raise "Error! Invalid url format: #{urls}" unless is_url?(url)
+    # Add logic to profile the web server before crawling; this is used to optimize the crawling speed
+    pre_crawl(url)
+    status = Timeout::timeout(Crawl_timeout/1000) {
+      result+=crawl_worker(url).keys
+    }
+    puts "Web crawling time-out on #{url}: #{status}" if @verbose
+    return result
+    #rescue => ee
+    #puts "Exception on method #{__method__} for URL #{url}: #{ee}"
+    #return result
+    #end
+  end
+  alias_method :query, :crawl
+
+  # The worker instance of crawler who perform the labour work
+  def crawl_worker(url0)
+    puts "Please be aware that it may take a while to crawl #{url0}, depending on the site's responsiveness and the amount of contents."
+    #begin
+    # Input URL sanity check first
+    if is_url?(url0)
+      host=url_2_host(url0)
+      ip=host_2_ip(host).to_s
+      raise "Invalid IP address: #{url0}" if ip.nil?
+      port=url_2_port(url0).to_s
+      raise "Invalid port number: #{url0}" if port.nil?
+    else
+      raise "Invalid URL: #{url0}. Please check it out with your browser again."
+    end
+    log_info=Hash.new
+    log_info[1]="Start working on #{url0}"
+    url_stores=Hash.new
+    url_stores[url0]=true unless url_stores.key?(url0)
+    @discovered_urls_by_crawler[url0]=true unless @discovered_urls_by_crawler.key?(url0)
+    @crawl_start[url0]=true unless @crawl_start.key?(url0)
+    # $discovered_urls[url0]=true unless $discovered_urls.key?(url0)
+    @crawl_depth.times do
+      url_stores.keys.each do |url|
+        # 10/01/2013 add logic to avoid unnecessary crawling within the same child instance
+        next if @visited_urls_by_crawler.key?(url)
+        url_object = open_url(url)
+        next if url_object == nil
+        url = update_url_if_redirected(url, url_object)
+        url_body = read_url(url)
+        # Protection code - to avoid parsing failure on the empty or nil object
+        next if url_body.nil? or url_body.empty?
+        url_stores[url]=true unless url_stores.key?(url)
+        @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
+        # $discovered_urls[url]=true unless $discovered_urls.key?(url)
+        doc = parse_html(url_body)
+        next if doc == nil
+        if url_stores.size >= @crawl_page_limit
+          #@visited_urls_by_crawler.merge!(url_stores)
+          @discovered_urls_by_crawler.merge!(url_stores)
+          # $discovered_urls.merge!(url_stores)
+          puts "Finish web crawling the url: #{url0}"
+          return url_stores
+        end
+        page_urls = find_urls_on_page(doc, url)
+        page_urls.uniq!
+        page_urls.map do |y|
+          y=normalize_url(y)
+          url_stores[y]=true unless url_stores.key?(y)
+          @discovered_urls_by_crawler[y]=true unless @discovered_urls_by_crawler.key?(y)
+          # $discovered_urls[y]=true unless $discovered_urls.key?(y)
+        end
+      end
+    end
+    puts "Finish web crawling on: #{url0}"
+    log_info[2]="Finish working on: #{url0}"
+    wlog(log_info, "UrlCrawler", @log_file)
+    @crawl_done[url0]=true unless @crawl_done.key?(url0)
+    return url_stores
+    #rescue => ee
+    #puts "Exception on method #{__method__} for URL #{url0}: #{ee}" if @verbose
+    #log_info[3]="Exception on #{url0}"
+    #wlog(log_info,"UrlCrawler",@log_file)
+    #return url_stores
+    #end
+  end
+
+  # Fast crawling by utilizing fork manager parallel to spawn numbers of child processes at the same time
+  # each child process will continuously work on the target pool until all the works are done
+  def crawl_workers (targets,num=@max_parallel)
+    begin
+      raise "Input error - expecting targets in an array format: #{targets}" unless targets.kind_of? Array
+      puts "Sanitize the URL seeds to eliminate the unnecessary duplication(s) ..." if @verbose
+      #puts "This could be awhile depending on the list size. Please be patient ..."
+      # 09/30/2013 Add additional logic to eliminate the duplicate target site(s) before the crawlers are invoked.
+      targets -= ["", nil]
+      uniq_sites=Hash.new
+      targets.dup.map do |target|
+        if is_url?(target)
+          host=url_2_host(target)
+          ip=host_2_ip(host).to_s
+          next if ip.nil?
+          port=url_2_port(target).to_s
+          next if port.nil?
+          site_key=ip+":"+port
+          unless uniq_sites.key?(site_key)
+            uniq_sites[site_key]=target
+          end
+        end
+      end
+      puts "Sanitization done! " if @verbose
+      puts "Start the parallel engine on the normalized crawling list:\n #{targets} "
+      puts "Maximum number of web crawling sessions allowed: #{num}" #if @verbose
+      raise "Error: target list is empty!" if targets.size < 1
+      Parallel.map(uniq_sites.values, :in_processes => num) { |target|
+        puts "Working on #{target} ..." if @verbose
+        crawl(target)
+      }.dup.each do |process|
+        puts "process.inspect: #{process}" if @verbose
+        urls=process
+        urls-=["",nil] unless urls.nil?
+        if urls.nil?
+          next
+        elsif urls.empty?
+          next
+          #do nothing
+        else
+          urls.map do |url|
+            url.strip!
+            @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
+            #$discovered_urls[url]=true unless $discovered_urls.key?(url)
+          end
+        end
+      end
+      #return sites
+      return @discovered_urls_by_crawler.keys
+    rescue Exception => ee
+      puts "Exception on method #{__method__}: #{ee}" if @verbose
+      return nil
+    end
+  end
+  alias_method :crawls, :crawl_workers
+
+  # Fast crawling method - build the target pool from the input file
+  def crawl_workers_on_file (file)
+    puts "Web crawl the list of targets from file: #{file}"
+    begin
+      targets=file_2_list(file)
+      sites=crawl_workers(targets,num=@max_parallel)
+      return sites
+    rescue => ee
+      puts "Exception on method #{__method__}: #{ee}" if @verbose
+      return nil
+    end
+  end
+  alias_method :query_file, :crawl_workers_on_file
+  alias_method :crawl_file, :crawl_workers_on_file
+
+  # Wrapper for the OpenURI open method - create an open_uri object and return the reference upon success
+  def open_url(url)
+    puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
+    #url_object = nil
+    begin
+      if url =~ /http\:/i
+        # patch for allow the 'un-safe' URL redirection i.e. https://www.example.com -> http://www.example.com
+        url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000)
+        #url_object = open(url)
+      elsif url =~ /https\:/i
+        url_object = open(url,:ssl_verify_mode => 0, :allow_redirections =>:safe, :read_timeout=>Max_http_timeout/1000)
+        #url_object = open(url,:ssl_verify_mode => 0)
+      else
+        raise "Invalid URL format - please specify the protocol prefix http(s) in the URL: #{url}"
+      end
+      return url_object
+    rescue => ee
+      puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
+      return nil
+    end
+  end
+
+  # Wrapper to use OpenURI method 'read' to return url body contents
+  def read_url(url)
+    puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
+    begin
+      url_object=open_url(url)
+      @visited_urls_by_crawler[url]=true unless @visited_urls_by_crawler.key?(url)
+      body=url_object.read
+      return body
+    rescue => ee
+      puts "Exception on method #{__method__}: #{ee}" if @verbose
+      return nil
+    end
+  end
+
+  # Return the destination url in case of url re-direct
+  def update_url_if_redirected(url, url_object)
+    #puts "Comparing the original URL with the return object base_uri. Return the one where the true content is found. " if @verbose
+    begin
+      if url != url_object.base_uri.to_s
+        return url_object.base_uri.to_s
+      end
+      return url
+    rescue => ee
+      puts "Exception on method #{__method__}: #{ee}" if @verbose
+      return nil
+    end
+  end
+
+  # Wrapper for the Nokogiri DOM parser
+  def parse_html(html_body)
+    #puts "Parsing the html content: #{html_body}. Return DOM " if @verbose
+    begin
+      doc = Nokogiri::HTML(html_body)
+      #puts "Successfully crawling the url: #{url_object.base_uri.to_s}" if @verbose
+      #puts "doc: #{doc}" if @verbose
+      return doc
+    rescue => ee
+      puts "Exception on method #{__method__}: #{ee}" if @verbose
+      return nil
+    end
+  end
+
+  # Search 'current_url' and return found URLs under the same domain
+  def find_urls_on_page(doc, current_url)
+    #puts "Search and return URLs within the doc: #{doc}" if @verbose
+    begin
+      urls_list = []
+      # case 1 - search embedded HTML tag <a href='url'> for the url elements
+      links=doc.css('a')
+      links.map do |x|
+        #puts "x: #{x}"
+        new_url = x.attribute('href').to_s
+        unless new_url == nil
+          if new_url.match("http")
+            #if urls_on_same_domain?(new_url,current_url)
+            urls_list.push(new_url)
+            #end
+          else
+            new_url = make_absolute(current_url, new_url)
+            urls_list.push(new_url)
+          end
+        end
+      end
+      # case 2 - search client side redirect - <meta http-equiv="refresh" content="5;URL='http://example.com/'">
+      elements=doc.css("meta[http-equiv]")
+      unless elements.size == 0
+        link=elements.attr("content").value.split(/url\=/i)[1]
+        unless link.nil?
+          new_url = make_absolute(current_url, link)
+          urls_list.push(new_url) unless new_url.nil?
+        end
+      end
+      #puts "Found URLs under page #{current_url}:\n#{urls_list}" if @verbose
+      return urls_list.uniq-["",nil]
+    rescue => ee
+      puts "Exception on method #{__method__}: #{ee}" if @verbose
+      return nil
+    end
+  end
+
+  # Method to print out discovery URL result
+  def print_discovered_urls_by_crawler
+    puts "Print discovered url by the crawler. " if @verbose
+    begin
+      puts "\nSummary Report of Discovered URLs from the Crawler:"
+      @discovered_urls_by_crawler.keys.each do |url|
+        puts url
+      end
+      puts "Total: #{@discovered_urls_by_crawler.keys.size}"
+      puts "End of the summary"
+    rescue => ee
+      puts "Exception on method #{__method__}: #{ee}" if @verbose
+      return nil
+    end
+  end
+  alias_method :print, :print_discovered_urls_by_crawler
+
+  # Method to save URL discovery result
+  def save_discovered_urls (file)
+    puts "Save discovered urls by the crawler to file: #{file} "
+    begin
+      list_2_file(@discovered_urls_by_crawler.keys, file)
+      puts "Done!"
+    rescue => ee
+      puts "Exception on method #{__method__}: #{ee}" if @verbose
+      return nil
+    end
+  end
+  alias_method :save, :save_discovered_urls
+
+  # Method to retrieve discovery site result
+  def get_discovered_sites_by_crawler
+    puts "Print summary report of discovered sites. " if @verbose
+    begin
+      puts "\nSummary Report of Discovered Sites from the Crawler:"
+      sites = Hash.new
+      @discovered_urls_by_crawler.keys.each do |url|
+        site=url_2_site(url)
+        sites[site]=true unless sites.key?(site)
+      end
+      sites.keys.map { |site| puts site }
+      puts "Total: #{sites.size}"
+      puts "End of the summary"
+      return sites.keys
+    rescue => ee
+      puts "Exception on method #{__method__}: #{ee}" if @verbose
+      return nil
+    end
+  end
+  alias_method :get_sites, :get_discovered_sites_by_crawler
+
+  private :open_url, :read_url, :update_url_if_redirected, :parse_html, :find_urls_on_page
+end
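Likewise, a minimal usage sketch of the Wmap::UrlCrawler class added above (data/lib/wmap/url_crawler.rb). This is an illustration only, not part of the package diff: the gem is assumed installed, and the seed URLs are placeholders for sites you are authorized to crawl. The method names (`crawl`, `crawls`, `print_discovered_urls_by_crawler`, `save_discovered_urls`, `get_sites`) are the ones defined in the code above.

```ruby
# Minimal sketch, assuming the wmap 2.4.4 gem is installed; the seed URLs
# below are placeholders.
require 'wmap'

crawler = Wmap::UrlCrawler.new(:crawl_depth => 3, :max_parallel => 10)

# Crawl a single seed URL; returns an array of discovered URLs
urls = crawler.crawl("http://www.example.com/")
puts urls

# Or crawl a list of seed sites in parallel child processes
crawler.crawls(["http://www.example.com/", "https://login.example.com/"])
crawler.print_discovered_urls_by_crawler            # summary report to stdout
crawler.save_discovered_urls("discovered_urls.txt") # persist the URL list
sites = crawler.get_sites                           # unique sites derived from the URLs
```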