wmap 2.4.4

Files changed (141)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +141 -0
  3. data/LICENSE.txt +15 -0
  4. data/README.rdoc +98 -0
  5. data/TODO +13 -0
  6. data/bin/deprime +21 -0
  7. data/bin/distrust +38 -0
  8. data/bin/googleBot +23 -0
  9. data/bin/prime +21 -0
  10. data/bin/refresh +26 -0
  11. data/bin/run_tests +16 -0
  12. data/bin/spiderBot +26 -0
  13. data/bin/trust +38 -0
  14. data/bin/updateAll +57 -0
  15. data/bin/wadd +25 -0
  16. data/bin/wadds +26 -0
  17. data/bin/wcheck +28 -0
  18. data/bin/wdel +25 -0
  19. data/bin/wdump +21 -0
  20. data/bin/wmap +151 -0
  21. data/bin/wscan +32 -0
  22. data/data/cidrs +2 -0
  23. data/data/deactivated_sites +1 -0
  24. data/data/domains +2 -0
  25. data/data/hosts +1 -0
  26. data/data/prime_hosts +1 -0
  27. data/data/sites +2 -0
  28. data/data/sub_domains +2 -0
  29. data/demos/bruter.rb +27 -0
  30. data/demos/dns_brutes.rb +28 -0
  31. data/demos/filter_cidr.rb +18 -0
  32. data/demos/filter_crawls.rb +5 -0
  33. data/demos/filter_domain.rb +25 -0
  34. data/demos/filter_geoip.rb +26 -0
  35. data/demos/filter_known_services.rb +59 -0
  36. data/demos/filter_netinfo.rb +23 -0
  37. data/demos/filter_prime.rb +25 -0
  38. data/demos/filter_profiler.rb +3 -0
  39. data/demos/filter_redirection.rb +19 -0
  40. data/demos/filter_site.rb +40 -0
  41. data/demos/filter_siteip.rb +31 -0
  42. data/demos/filter_status.rb +17 -0
  43. data/demos/filter_timestamp.rb +23 -0
  44. data/demos/filter_url.rb +19 -0
  45. data/demos/new_fnd.rb +66 -0
  46. data/demos/nmap_parser.pl +138 -0
  47. data/demos/site_format.rb +18 -0
  48. data/demos/whois_domain.rb +78 -0
  49. data/dicts/GeoIP.dat +0 -0
  50. data/dicts/GeoIPASNum.dat +0 -0
  51. data/dicts/GeoLiteCity.dat +0 -0
  52. data/dicts/ccsld.txt +2646 -0
  53. data/dicts/cctld.txt +243 -0
  54. data/dicts/gtld.txt +25 -0
  55. data/dicts/hostnames-dict.big +1402 -0
  56. data/dicts/hostnames-dict.txt +101 -0
  57. data/lib/wmap/cidr_tracker.rb +327 -0
  58. data/lib/wmap/dns_bruter.rb +308 -0
  59. data/lib/wmap/domain_tracker/sub_domain.rb +142 -0
  60. data/lib/wmap/domain_tracker.rb +342 -0
  61. data/lib/wmap/geoip_tracker.rb +72 -0
  62. data/lib/wmap/google_search_scraper.rb +177 -0
  63. data/lib/wmap/host_tracker/primary_host.rb +130 -0
  64. data/lib/wmap/host_tracker.rb +550 -0
  65. data/lib/wmap/network_profiler.rb +144 -0
  66. data/lib/wmap/port_scanner.rb +208 -0
  67. data/lib/wmap/site_tracker/deactivated_site.rb +85 -0
  68. data/lib/wmap/site_tracker.rb +937 -0
  69. data/lib/wmap/url_checker.rb +314 -0
  70. data/lib/wmap/url_crawler.rb +381 -0
  71. data/lib/wmap/utils/domain_root.rb +184 -0
  72. data/lib/wmap/utils/logger.rb +53 -0
  73. data/lib/wmap/utils/url_magic.rb +343 -0
  74. data/lib/wmap/utils/utils.rb +333 -0
  75. data/lib/wmap/whois.rb +76 -0
  76. data/lib/wmap.rb +227 -0
  77. data/logs/wmap.log +17 -0
  78. data/ruby_whois_patches/base_cocca2.rb +149 -0
  79. data/ruby_whois_patches/kero.yachay.pe.rb +120 -0
  80. data/ruby_whois_patches/whois.PublicDomainRegistry.com.rb +124 -0
  81. data/ruby_whois_patches/whois.above.com.rb +61 -0
  82. data/ruby_whois_patches/whois.adamsnames.tc.rb +107 -0
  83. data/ruby_whois_patches/whois.aeda.net.ae.rb +105 -0
  84. data/ruby_whois_patches/whois.ai.rb +112 -0
  85. data/ruby_whois_patches/whois.arnes.si.rb +121 -0
  86. data/ruby_whois_patches/whois.ascio.com.rb +91 -0
  87. data/ruby_whois_patches/whois.cnnic.cn.rb +123 -0
  88. data/ruby_whois_patches/whois.corporatedomains.com.rb +67 -0
  89. data/ruby_whois_patches/whois.crsnic.net.rb +108 -0
  90. data/ruby_whois_patches/whois.denic.de.rb +174 -0
  91. data/ruby_whois_patches/whois.dk-hostmaster.dk.rb +120 -0
  92. data/ruby_whois_patches/whois.dns.be.rb +134 -0
  93. data/ruby_whois_patches/whois.dns.lu.rb +129 -0
  94. data/ruby_whois_patches/whois.dns.pl.rb +150 -0
  95. data/ruby_whois_patches/whois.dns.pt.rb +119 -0
  96. data/ruby_whois_patches/whois.domain.kg.rb +126 -0
  97. data/ruby_whois_patches/whois.domainregistry.my.rb +123 -0
  98. data/ruby_whois_patches/whois.domreg.lt.rb +110 -0
  99. data/ruby_whois_patches/whois.dot.tk.rb +140 -0
  100. data/ruby_whois_patches/whois.hkirc.hk.rb +121 -0
  101. data/ruby_whois_patches/whois.isnic.is.rb +130 -0
  102. data/ruby_whois_patches/whois.je.rb +119 -0
  103. data/ruby_whois_patches/whois.jprs.jp.rb +137 -0
  104. data/ruby_whois_patches/whois.kenic.or.ke.rb +140 -0
  105. data/ruby_whois_patches/whois.markmonitor.com.rb +118 -0
  106. data/ruby_whois_patches/whois.melbourneit.com.rb +58 -0
  107. data/ruby_whois_patches/whois.nic.as.rb +96 -0
  108. data/ruby_whois_patches/whois.nic.at.rb +109 -0
  109. data/ruby_whois_patches/whois.nic.ch.rb +141 -0
  110. data/ruby_whois_patches/whois.nic.cl.rb +117 -0
  111. data/ruby_whois_patches/whois.nic.ec.rb +157 -0
  112. data/ruby_whois_patches/whois.nic.im.rb +120 -0
  113. data/ruby_whois_patches/whois.nic.it.rb +170 -0
  114. data/ruby_whois_patches/whois.nic.lv.rb +116 -0
  115. data/ruby_whois_patches/whois.nic.ly.rb +127 -0
  116. data/ruby_whois_patches/whois.nic.mu.rb +27 -0
  117. data/ruby_whois_patches/whois.nic.mx.rb +123 -0
  118. data/ruby_whois_patches/whois.nic.net.sa.rb +111 -0
  119. data/ruby_whois_patches/whois.nic.or.kr.rb +101 -0
  120. data/ruby_whois_patches/whois.nic.tel.rb +129 -0
  121. data/ruby_whois_patches/whois.nic.tr.rb +133 -0
  122. data/ruby_whois_patches/whois.nic.us.rb +129 -0
  123. data/ruby_whois_patches/whois.nic.ve.rb +135 -0
  124. data/ruby_whois_patches/whois.norid.no.rb +127 -0
  125. data/ruby_whois_patches/whois.pandi.or.id.rb +118 -0
  126. data/ruby_whois_patches/whois.psi-usa.info.rb +63 -0
  127. data/ruby_whois_patches/whois.registro.br.rb +109 -0
  128. data/ruby_whois_patches/whois.registrygate.com.rb +55 -0
  129. data/ruby_whois_patches/whois.rrpproxy.net.rb +61 -0
  130. data/ruby_whois_patches/whois.sgnic.sg.rb +130 -0
  131. data/ruby_whois_patches/whois.srs.net.nz.rb +166 -0
  132. data/ruby_whois_patches/whois.tucows.com.rb +70 -0
  133. data/ruby_whois_patches/whois.twnic.net.tw.rb +133 -0
  134. data/settings/discovery_ports +24 -0
  135. data/settings/google_keywords.txt +9 -0
  136. data/settings/google_locator.txt +23 -0
  137. data/test/domain_tracker_test.rb +31 -0
  138. data/test/utils_test.rb +168 -0
  139. data/version.txt +13 -0
  140. data/wmap.gemspec +49 -0
  141. metadata +202 -0
data/lib/wmap/url_checker.rb
@@ -0,0 +1,314 @@
+ #--
+ # Wmap
+ #
+ # A pure Ruby library for Internet web application discovery and tracking.
+ #
+ # Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
+ #++
+ require "net/http"
+ require 'httpclient'
+ require "openssl"
+ require "uri"
+ require "digest/md5"
+ require "parallel"
+
+ # A quick checker class to identify / finger-print a URL / site
+ class Wmap::UrlChecker
+   include Wmap::Utils
+   attr_accessor :http_timeout, :max_parallel, :verbose, :data_dir
+
+   def initialize (params = {})
+     # Set default instance variables
+     @verbose=params.fetch(:verbose, false)
+     @data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../data/')
+     @http_timeout=params.fetch(:http_timeout, 5000)
+     @max_parallel=params.fetch(:max_parallel, 40)
+     @ssl_version=nil
+     @url_code={}
+     @url_redirection={}
+     @url_finger_print={}
+     @url_server={}
+   end
+
+   # Main worker method to perform various checks on the URL / site
+   def url_worker (url)
+     puts "Checking out an unknown URL: #{url}" if @verbose
+     begin
+       url=url.strip.downcase
+       raise "Invalid URL format: #{url}" unless is_url?(url)
+       timestamp=Time.now
+       host=url_2_host(url)
+       ip=host_2_ip(host)
+       port=url_2_port(url)
+       code=10000
+       if @url_code.key?(url)
+         code=@url_code[url]
+       else
+         code=response_code(url)
+       end
+       if @url_redirection.key?(url)
+         loc=@url_redirection[url]
+       else
+         loc=redirect_location(url)
+       end
+       if @url_finger_print.key?(url)
+         fp=@url_finger_print[url]
+       else
+         fp=response_body_md5(url)
+       end
+       if @url_server.key?(url)
+         server=@url_server[url]
+       else
+         server=get_server_header(url)
+       end
+       # save the data
+       checker=Hash.new
+       checker['ip']=ip
+       checker['port']=port
+       checker['url']=url
+       checker['code']=code
+       checker['redirection']=loc
+       checker['md5']=fp
+       checker['server']=server
+       checker['timestamp']=timestamp
+       if Wmap::CidrTracker.new(:data_dir=>@data_dir).ip_trusted?(ip)
+         checker['status']="int_hosted"
+       else
+         checker['status']="ext_hosted"
+       end
+       return checker
+     rescue OpenSSL::SSL::SSLError => es # handler to temporarily keep the OpenSSL bug at bay: SSL_set_session: unable to find ssl method
+       checker=Hash.new
+       checker['ip']=ip
+       checker['port']=port
+       checker['url']=url
+       checker['code']=20000
+       checker['server']="Unknown SSL error: #{es}"
+       checker['md5']=nil
+       checker['redirection']=nil
+       checker['timestamp']=timestamp
+       return checker
+     rescue Exception => ee
+       puts "Exception on method #{__method__} for #{url}: #{ee}" # if @verbose
+       return nil
+     end
+   end
+   alias_method :check, :url_worker
+
+   # Parallel scanner - uses the fork manager 'parallel' to spawn a number of child processes that check multiple URLs simultaneously
+   def url_workers (targets,num=@max_parallel)
+     begin
+       results=Array.new
+       targets -= ["", nil]
+       if targets.size > 0
+         puts "Start the url checker on the targets:\n #{targets}"
+         Parallel.map(targets, :in_processes => num) { |target|
+           url_worker(target)
+         }.each do |process|
+           if process.nil?
+             next
+           elsif process.empty?
+             # do nothing
+           else
+             results << process
+           end
+         end
+       end
+       return results
+     rescue Exception => ee
+       puts "Exception on method #{__method__}: #{ee}" if @verbose
+       return nil
+     end
+   end
+   alias_method :checks, :url_workers
+
+   # Test the URL and return the response code
+   def response_code (url)
+     puts "Check the http response code on the url: #{url}" if @verbose
+     response_code = 10000 # All unknown url connection exceptions go here
+     begin
+       raise "Invalid url: #{url}" unless is_url?(url)
+       url=url.strip.downcase
+       timeo = @http_timeout/1000.0
+       uri = URI.parse(url)
+       http = Net::HTTP.new(uri.host, uri.port)
+       http.open_timeout = timeo
+       http.read_timeout = timeo
+       if (url =~ /https\:/i)
+         http.use_ssl = true
+         #http.ssl_version = :SSLv3
+         # Bypass the remote web server cert validation test
+         http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+       end
+       request = Net::HTTP::Get.new(uri.request_uri)
+       response = http.request(request)
+       puts "Server responded with: #{response}" if @verbose
+       response_code = response.code.to_i
+       #response.finish if response.started?()
+       @url_code[url]=response_code
+       puts "Response code on #{url}: #{response_code}" if @verbose
+       return response_code
+     rescue Exception => ee
+       puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
+       case ee
+       # rescue "Connection reset by peer" error type
+       when Errno::ECONNRESET
+         response_code=104
+       when Errno::ECONNABORTED,Errno::ETIMEDOUT
+         #response_code=10000
+       when Timeout::Error # Quick fix
+         if (url =~ /https\:/i) # try again for ssl timeout session, in case of default :TLSv1 failure
+           http.ssl_version = :SSLv3
+           response = http.request(request)
+           response_code = response.code.to_i
+           unless response_code.nil?
+             @ssl_version = http.ssl_version
+           end
+         end
+       else
+         #response_code=10000
+       end
+       @url_code[url]=response_code
+       return response_code
+     end
+   end
+   alias_method :query, :response_code
+
+   # Test the URL / site and return the redirection location (3xx response code only)
+   def redirect_location (url)
+     puts "Test the redirection location for the url: #{url}" if @verbose
+     location=""
+     begin
+       raise "Invalid url: #{url}" unless is_url?(url)
+       url=url.strip.downcase
+       timeo = @http_timeout/1000.0
+       uri = URI.parse(url)
+       code = response_code(url)
+       if code >= 300 && code < 400
+         http = Net::HTTP.new(uri.host, uri.port)
+         http.open_timeout = timeo
+         http.read_timeout = timeo
+         if (url =~ /https\:/i)
+           http.use_ssl = true
+           # Bypass the remote web server cert validation test
+           http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+           http.ssl_version = @ssl_version
+         end
+         request = Net::HTTP::Get.new(uri.request_uri)
+         response = http.request(request)
+         case response
+         when Net::HTTPRedirection then
+           location = response['location']
+         end
+       end
+       @url_redirection[url]=location
+       return location
+     rescue Exception => ee
+       puts "Exception on method redirect_location for URL #{url}: #{ee}" if @verbose
+       @url_redirection[url]=location
+       return location
+     end
+   end
+   alias_method :location, :redirect_location
+
+   # Test the URL / site and return the web server type from the HTTP header "server" field
+   def get_server_header (url)
+     puts "Retrieve the server header field from the url: #{url}" if @verbose
+     server=String.new
+     begin
+       raise "Invalid url: #{url}" unless is_url?(url)
+       url=url.strip.downcase
+       timeo = @http_timeout/1000.0
+       uri = URI.parse(url)
+       code = response_code(url)
+       http = Net::HTTP.new(uri.host, uri.port)
+       http.open_timeout = timeo
+       http.read_timeout = timeo
+       if (url =~ /https\:/i)
+         http.use_ssl = true
+         # Bypass the remote web server cert validation test
+         http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+         http.ssl_version = @ssl_version
+       end
+       request = Net::HTTP::Get.new(uri.request_uri)
+       response = http.request(request)
+       server=response["server"]
+       server=server.gsub(/\,/,' ')
+       @url_server[url]=server
+       return server
+     rescue Exception => ee
+       puts "Exception on method get_server_header for URL #{url}: #{ee}" if @verbose
+       @url_server[url]=server
+       return server
+     end
+   end
+
+   # Use MD5 algorithm to fingerprint the URL / site response payload (web page content)
+   def response_body_md5(url)
+     puts "MD5 finger print page body content: #{url}" if @verbose
+     begin
+       raise "Invalid url: #{url}" unless is_url?(url)
+       url=url.strip.downcase
+       timeo = @http_timeout/1000.0
+       uri = URI.parse(url)
+       fp=""
+       http = Net::HTTP.new(uri.host, uri.port)
+       http.open_timeout = timeo
+       http.read_timeout = timeo
+       if (url =~ /https\:/i)
+         http.use_ssl = true
+         # Bypass the remote web server cert validation test
+         http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+         http.ssl_version = @ssl_version
+       end
+       request = Net::HTTP::Get.new(uri.request_uri)
+       response = http.request(request)
+       response_body = response.body.to_s
+       fp=Digest::MD5.hexdigest(response_body) unless response_body.nil?
+       @url_finger_print[url] = fp
+       return fp
+     rescue Exception => ee
+       puts "Exception on method #{__method__}: #{ee}" if @verbose
+     end
+   end
+   alias_method :md5, :response_body_md5
+
+   # Retrieve the remote web server certificate, open it and return the cert content as a string
+   def get_certificate (url)
+     puts "Retrieve the remote web server SSL certificate in clear text: #{url}" if @verbose
+     begin
+       url=url.strip
+       raise "Invalid URL string: #{url}" unless is_ssl?(url)
+       client = HTTPClient.new
+       client.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
+       response = client.get(url)
+       cert = response.peer_cert
+       cer = OpenSSL::X509::Certificate.new(cert)
+       return cer.to_text
+     rescue Exception => ee
+       puts "Exception on method #{__method__} from #{url}: #{ee}"
+     end
+     return nil
+   end
+   alias_method :get_cert, :get_certificate
+
+   # Retrieve the X509 cert in clear text from the remote web server, then extract and return the common name field within the cert
+   def get_cert_cn (url)
+     puts "Extract the common name field from the X509 cert at: #{url}" if @verbose
+     begin
+       cert=get_certificate(url)
+       subject = cn = ""
+       if cert =~ /\n(.+)Subject\:(.+)\n/i
+         subject=$2
+       end
+       if subject =~/CN\=(.+)/i
+         cn=$1
+       end
+       return cn
+     rescue Exception => ee
+       puts "Error on method #{__method__} for #{url}: #{ee}" if @verbose
+     end
+     return nil
+   end
+   alias_method :get_cn, :get_cert_cn
+
+ end
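
For orientation, here is a minimal usage sketch of the Wmap::UrlChecker class added above. The target URLs are hypothetical placeholders, and it assumes the gem and its dependencies are installed and loaded via require 'wmap':

    require 'wmap'

    # Instantiate a checker with a 3-second HTTP time-out (the :http_timeout unit is milliseconds)
    checker = Wmap::UrlChecker.new(:http_timeout => 3000, :verbose => false)

    # 'check' (alias of 'url_worker') returns a hash with keys such as
    # 'ip', 'port', 'code', 'redirection', 'md5', 'server', 'status' and 'timestamp'
    record = checker.check("https://www.example.com/")
    puts record.inspect unless record.nil?

    # 'checks' (alias of 'url_workers') fans the same check out over a list of URLs in parallel
    records = checker.checks(["https://www.example.com/", "http://www.example.org/"])
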
data/lib/wmap/url_crawler.rb
@@ -0,0 +1,381 @@
+ #--
+ # Wmap
+ #
+ # A pure Ruby library for Internet web application discovery and tracking.
+ #
+ # Copyright (c) 2012-2015 Yang Li
+ #++
+ require "net/http"
+ require "uri"
+ require "open-uri"
+ require "open_uri_redirections"
+ require "nokogiri"
+ require "parallel"
+
+
+ # Web site crawler class
+ class Wmap::UrlCrawler
+   include Wmap::Utils
+
+   attr_accessor :http_timeout, :crawl_page_limit, :crawl_depth, :max_parallel, :verbose, :data_dir
+   attr_reader :discovered_urls_by_crawler, :visited_urls_by_crawler, :crawl_start, :crawl_done
+   # Global variable used to store the combined result of all the forked child processes. Note that a class variable
+   # would not be able to pass the result back, due to the limitation of the IO pipe communication mechanism used by the 'parallel' fork manager
+   # $discovered_urls=Hash.new
+
+   # Hard stop limit for the http time-out: 8 seconds (8000 ms), in order to avoid a severe performance penalty on certain 'weird' site(s)
+   Max_http_timeout=8000
+   # Hard stop limit for the crawler time-out: 1,200,000 ms (1200 seconds, or 20 minutes)
+   Crawl_timeout=1200000
+
+   # Crawler instance default variables
+   def initialize (params = {})
+     @verbose=params.fetch(:verbose, false)
+     @data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../logs/')
+     @http_timeout=params.fetch(:http_timeout, 5000)
+     @crawl_depth=params.fetch(:crawl_depth, 4)
+     @crawl_page_limit=params.fetch(:crawl_page_limit, 1000)
+     @max_parallel=params.fetch(:max_parallel, 40)
+     # Discovered data store
+     @discovered_urls_by_crawler=Hash.new
+     @visited_urls_by_crawler=Hash.new
+     @crawl_start=Hash.new
+     @crawl_done=Hash.new
+     Dir.mkdir(@data_dir) unless Dir.exist?(@data_dir)
+     @log_file=@data_dir + "crawler.log"
+   end
+
+   # Pre-crawl profiler, used to profile the network in order to maximize the crawler performance.
+   def pre_crawl(url)
+     puts "Perform network profiling on the web server before the web crawling: #{url}" if @verbose
+     begin
+       host=url_2_host(url)
+       # Use the following formula to 'guess' the right http time-out threshold for the scanner
+       nwk_to=Wmap::NetworkProfiler.new.profile(host).to_i
+       if (1500 + nwk_to*2) > Max_http_timeout
+         @http_timeout = Max_http_timeout
+       else
+         @http_timeout = 1500 + nwk_to*2
+       end
+       puts "Done with the pre-scan works: reset @http_timeout to: #{@http_timeout} ms" if @verbose
+     rescue Exception => ee
+       puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
+       @http_timeout = Max_http_timeout
+     end
+   end
+
+   # A web crawler to crawl a known website and search for html links within the same root domain. For example,
+   # by crawling 'http://www.yahoo.com/' it could discover 'http://login.yahoo.com/'
+   def crawl(url)
+     puts "Start web crawling on #{url}"
+     #begin
+     result=Array.new
+     url=url.chomp.strip
+     result.push(url_2_site(url))
+     raise "Error! Invalid url format: #{url}" unless is_url?(url)
+     # Add logic to profile the web server before crawling; this is used to optimize the crawling speed
+     pre_crawl(url)
+     status = Timeout::timeout(Crawl_timeout/1000) {
+       result+=crawl_worker(url).keys
+     }
+     puts "Web crawling completed on #{url}: #{status}" if @verbose
+     return result
+     #rescue => ee
+     #  puts "Exception on method #{__method__} for URL #{url}: #{ee}"
+     #  return result
+     #end
+   end
+   alias_method :query, :crawl
+
+   # The crawler worker instance that performs the labor
+   def crawl_worker(url0)
+     puts "Please be aware that it may take a while to crawl #{url0}, depending on the site's responsiveness and the amount of content."
+     #begin
+     # Input URL sanity check first
+     if is_url?(url0)
+       host=url_2_host(url0)
+       ip=host_2_ip(host).to_s
+       raise "Invalid IP address: #{url0}" if ip.nil?
+       port=url_2_port(url0).to_s
+       raise "Invalid port number: #{url0}" if port.nil?
+     else
+       raise "Invalid URL: #{url0}. Please check it out with your browser again."
+     end
+     log_info=Hash.new
+     log_info[1]="Start working on #{url0}"
+     url_stores=Hash.new
+     url_stores[url0]=true unless url_stores.key?(url0)
+     @discovered_urls_by_crawler[url0]=true unless @discovered_urls_by_crawler.key?(url0)
+     @crawl_start[url0]=true unless @crawl_start.key?(url0)
+     # $discovered_urls[url0]=true unless $discovered_urls.key?(url0)
+     @crawl_depth.times do
+       url_stores.keys.each do |url|
+         # 10/01/2013 add logic to avoid unnecessary crawling within the same child instance
+         next if @visited_urls_by_crawler.key?(url)
+         url_object = open_url(url)
+         next if url_object == nil
+         url = update_url_if_redirected(url, url_object)
+         url_body = read_url(url)
+         # Protection code - to avoid parsing failure on an empty or nil object
+         next if url_body.nil? or url_body.empty?
+         url_stores[url]=true unless url_stores.key?(url)
+         @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
+         # $discovered_urls[url]=true unless $discovered_urls.key?(url)
+         doc = parse_html(url_body)
+         next if doc == nil
+         if url_stores.size >= @crawl_page_limit
+           #@visited_urls_by_crawler.merge!(url_stores)
+           @discovered_urls_by_crawler.merge!(url_stores)
+           # $discovered_urls.merge!(url_stores)
+           puts "Finish web crawling the url: #{url0}"
+           return url_stores
+         end
+         page_urls = find_urls_on_page(doc, url)
+         page_urls.uniq!
+         page_urls.map do |y|
+           y=normalize_url(y)
+           url_stores[y]=true unless url_stores.key?(y)
+           @discovered_urls_by_crawler[y]=true unless @discovered_urls_by_crawler.key?(y)
+           # $discovered_urls[y]=true unless $discovered_urls.key?(y)
+         end
+       end
+     end
+     puts "Finish web crawling on: #{url0}"
+     log_info[2]="Finish working on: #{url0}"
+     wlog(log_info, "UrlCrawler", @log_file)
+     @crawl_done[url0]=true unless @crawl_done.key?(url0)
+     return url_stores
+     #rescue => ee
+     #  puts "Exception on method #{__method__} for URL #{url0}: #{ee}" if @verbose
+     #  log_info[3]="Exception on #{url0}"
+     #  wlog(log_info,"UrlCrawler",@log_file)
+     #  return url_stores
+     #end
+   end
+
+   # Fast crawling by utilizing the fork manager 'parallel' to spawn a number of child processes at the same time;
+   # each child process continuously works on the target pool until all the work is done
+   def crawl_workers (targets,num=@max_parallel)
+     begin
+       raise "Input error - expecting targets in an array format: #{targets}" unless targets.kind_of? Array
+       puts "Sanitize the URL seeds to eliminate the unnecessary duplication(s) ..." if @verbose
+       #puts "This could be awhile depending on the list size. Please be patient ..."
+       # 09/30/2013 Add additional logic to eliminate the duplicate target site(s) before the crawlers are invoked.
+       targets -= ["", nil]
+       uniq_sites=Hash.new
+       targets.dup.map do |target|
+         if is_url?(target)
+           host=url_2_host(target)
+           ip=host_2_ip(host).to_s
+           next if ip.nil?
+           port=url_2_port(target).to_s
+           next if port.nil?
+           site_key=ip+":"+port
+           unless uniq_sites.key?(site_key)
+             uniq_sites[site_key]=target
+           end
+         end
+       end
+       puts "Sanitization done! " if @verbose
+       puts "Start the parallel engine on the normalized crawling list:\n #{targets} "
+       puts "Maximum number of web crawling sessions allowed: #{num}" #if @verbose
+       raise "Error: target list is empty!" if targets.size < 1
+       Parallel.map(uniq_sites.values, :in_processes => num) { |target|
+         puts "Working on #{target} ..." if @verbose
+         crawl(target)
+       }.dup.each do |process|
+         puts "process.inspect: #{process}" if @verbose
+         urls=process
+         urls-=["",nil] unless urls.nil?
+         if urls.nil?
+           next
+         elsif urls.empty?
+           next
+           #do nothing
+         else
+           urls.map do |url|
+             url.strip!
+             @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
+             #$discovered_urls[url]=true unless $discovered_urls.key?(url)
+           end
+         end
+       end
+       #return sites
+       return @discovered_urls_by_crawler.keys
+     rescue Exception => ee
+       puts "Exception on method #{__method__}: #{ee}" if @verbose
+       return nil
+     end
+   end
+   alias_method :crawls, :crawl_workers
+
+   # Fast crawling method - build the target pool from the input file
+   def crawl_workers_on_file (file)
+     puts "Web crawl the list of targets from file: #{file}"
+     begin
+       targets=file_2_list(file)
+       sites=crawl_workers(targets,num=@max_parallel)
+       return sites
+     rescue => ee
+       puts "Exception on method #{__method__}: #{ee}" if @verbose
+       return nil
+     end
+   end
+   alias_method :query_file, :crawl_workers_on_file
+   alias_method :crawl_file, :crawl_workers_on_file
+
+   # Wrapper for the OpenURI open method - create an open_uri object and return the reference upon success
+   def open_url(url)
+     puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
+     #url_object = nil
+     begin
+       if url =~ /http\:/i
+         # patch to allow the 'un-safe' URL redirection, e.g. https://www.example.com -> http://www.example.com
+         url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000)
+         #url_object = open(url)
+       elsif url =~ /https\:/i
+         url_object = open(url,:ssl_verify_mode => 0, :allow_redirections =>:safe, :read_timeout=>Max_http_timeout/1000)
+         #url_object = open(url,:ssl_verify_mode => 0)
+       else
+         raise "Invalid URL format - please specify the protocol prefix http(s) in the URL: #{url}"
+       end
+       return url_object
+     rescue => ee
+       puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
+       return nil
+     end
+   end
+
+   # Wrapper around the OpenURI 'read' method - return the url body contents
+   def read_url(url)
+     puts "Read the body contents of url: #{url}" if @verbose
+     begin
+       url_object=open_url(url)
+       @visited_urls_by_crawler[url]=true unless @visited_urls_by_crawler.key?(url)
+       body=url_object.read
+       return body
+     rescue => ee
+       puts "Exception on method #{__method__}: #{ee}" if @verbose
+       return nil
+     end
+   end
+
+   # Return the destination url in case of a url redirect
+   def update_url_if_redirected(url, url_object)
+     #puts "Comparing the original URL with the return object base_uri. Return the one where the true content is found. " if @verbose
+     begin
+       if url != url_object.base_uri.to_s
+         return url_object.base_uri.to_s
+       end
+       return url
+     rescue => ee
+       puts "Exception on method #{__method__}: #{ee}" if @verbose
+       return nil
+     end
+   end
+
+   # Wrapper for the Nokogiri DOM parser
+   def parse_html(html_body)
+     #puts "Parsing the html content: #{html_body}. Return DOM " if @verbose
+     begin
+       doc = Nokogiri::HTML(html_body)
+       #puts "Successfully crawling the url: #{url_object.base_uri.to_s}" if @verbose
+       #puts "doc: #{doc}" if @verbose
+       return doc
+     rescue => ee
+       puts "Exception on method #{__method__}: #{ee}" if @verbose
+       return nil
+     end
+   end
+
+   # Search 'current_url' and return the URLs found on the page
+   def find_urls_on_page(doc, current_url)
+     #puts "Search and return URLs within the doc: #{doc}" if @verbose
+     begin
+       urls_list = []
+       # case 1 - search the embedded HTML tag <a href='url'> for the url elements
+       links=doc.css('a')
+       links.map do |x|
+         #puts "x: #{x}"
+         new_url = x.attribute('href').to_s
+         unless new_url == nil
+           if new_url.match("http")
+             #if urls_on_same_domain?(new_url,current_url)
+             urls_list.push(new_url)
+             #end
+           else
+             new_url = make_absolute(current_url, new_url)
+             urls_list.push(new_url)
+           end
+         end
+       end
+       # case 2 - search the client side redirect - <meta http-equiv="refresh" content="5;URL='http://example.com/'">
+       elements=doc.css("meta[http-equiv]")
+       unless elements.size == 0
+         link=elements.attr("content").value.split(/url\=/i)[1]
+         unless link.nil?
+           new_url = make_absolute(current_url, link)
+           urls_list.push(new_url) unless new_url.nil?
+         end
+       end
+       #puts "Found URLs under page #{current_url}:\n#{urls_list}" if @verbose
+       return urls_list.uniq-["",nil]
+     rescue => ee
+       puts "Exception on method #{__method__}: #{ee}" if @verbose
+       return nil
+     end
+   end
+
+   # Method to print out the URL discovery results
+   def print_discovered_urls_by_crawler
+     puts "Print the urls discovered by the crawler. " if @verbose
+     begin
+       puts "\nSummary Report of Discovered URLs from the Crawler:"
+       @discovered_urls_by_crawler.keys.each do |url|
+         puts url
+       end
+       puts "Total: #{@discovered_urls_by_crawler.keys.size}"
+       puts "End of the summary"
+     rescue => ee
+       puts "Exception on method #{__method__}: #{ee}" if @verbose
+       return nil
+     end
+   end
+   alias_method :print, :print_discovered_urls_by_crawler
+
+   # Method to save the URL discovery results
+   def save_discovered_urls (file)
+     puts "Save discovered urls by the crawler to file: #{file} "
+     begin
+       list_2_file(@discovered_urls_by_crawler.keys, file)
+       puts "Done!"
+     rescue => ee
+       puts "Exception on method #{__method__}: #{ee}" if @verbose
+       return nil
+     end
+   end
+   alias_method :save, :save_discovered_urls
+
+   # Method to retrieve the discovered site results
+   def get_discovered_sites_by_crawler
+     puts "Print summary report of discovered sites. " if @verbose
+     begin
+       puts "\nSummary Report of Discovered Sites from the Crawler:"
+       sites = Hash.new
+       @discovered_urls_by_crawler.keys.each do |url|
+         site=url_2_site(url)
+         sites[site]=true unless sites.key?(site)
+       end
+       sites.keys.map { |site| puts site }
+       puts "Total: #{sites.size}"
+       puts "End of the summary"
+       return sites.keys
+     rescue => ee
+       puts "Exception on method #{__method__}: #{ee}" if @verbose
+       return nil
+     end
+   end
+   alias_method :get_sites, :get_discovered_sites_by_crawler
+
+   private :open_url, :read_url, :update_url_if_redirected, :parse_html, :find_urls_on_page
+ end
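
Likewise, a minimal usage sketch of the Wmap::UrlCrawler class above. The seed URLs and the output path are hypothetical placeholders, and it assumes require 'wmap' makes the class and its Wmap::Utils helpers available:

    require 'wmap'

    # Instantiate a crawler with a shallow depth and a small page limit
    crawler = Wmap::UrlCrawler.new(:crawl_depth => 2, :crawl_page_limit => 100)

    # 'crawl' (alias 'query') returns an array of URLs discovered under the seed URL
    urls = crawler.crawl("https://www.example.com/")

    # 'crawls' (alias of 'crawl_workers') takes an array of seeds and forks child processes;
    # the combined results are also kept in crawler.discovered_urls_by_crawler
    crawler.crawls(["https://www.example.com/", "http://www.example.org/"])
    crawler.save_discovered_urls("/tmp/discovered_urls.txt")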