wmap 2.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +141 -0
  3. data/LICENSE.txt +15 -0
  4. data/README.rdoc +98 -0
  5. data/TODO +13 -0
  6. data/bin/deprime +21 -0
  7. data/bin/distrust +38 -0
  8. data/bin/googleBot +23 -0
  9. data/bin/prime +21 -0
  10. data/bin/refresh +26 -0
  11. data/bin/run_tests +16 -0
  12. data/bin/spiderBot +26 -0
  13. data/bin/trust +38 -0
  14. data/bin/updateAll +57 -0
  15. data/bin/wadd +25 -0
  16. data/bin/wadds +26 -0
  17. data/bin/wcheck +28 -0
  18. data/bin/wdel +25 -0
  19. data/bin/wdump +21 -0
  20. data/bin/wmap +151 -0
  21. data/bin/wscan +32 -0
  22. data/data/cidrs +2 -0
  23. data/data/deactivated_sites +1 -0
  24. data/data/domains +2 -0
  25. data/data/hosts +1 -0
  26. data/data/prime_hosts +1 -0
  27. data/data/sites +2 -0
  28. data/data/sub_domains +2 -0
  29. data/demos/bruter.rb +27 -0
  30. data/demos/dns_brutes.rb +28 -0
  31. data/demos/filter_cidr.rb +18 -0
  32. data/demos/filter_crawls.rb +5 -0
  33. data/demos/filter_domain.rb +25 -0
  34. data/demos/filter_geoip.rb +26 -0
  35. data/demos/filter_known_services.rb +59 -0
  36. data/demos/filter_netinfo.rb +23 -0
  37. data/demos/filter_prime.rb +25 -0
  38. data/demos/filter_profiler.rb +3 -0
  39. data/demos/filter_redirection.rb +19 -0
  40. data/demos/filter_site.rb +40 -0
  41. data/demos/filter_siteip.rb +31 -0
  42. data/demos/filter_status.rb +17 -0
  43. data/demos/filter_timestamp.rb +23 -0
  44. data/demos/filter_url.rb +19 -0
  45. data/demos/new_fnd.rb +66 -0
  46. data/demos/nmap_parser.pl +138 -0
  47. data/demos/site_format.rb +18 -0
  48. data/demos/whois_domain.rb +78 -0
  49. data/dicts/GeoIP.dat +0 -0
  50. data/dicts/GeoIPASNum.dat +0 -0
  51. data/dicts/GeoLiteCity.dat +0 -0
  52. data/dicts/ccsld.txt +2646 -0
  53. data/dicts/cctld.txt +243 -0
  54. data/dicts/gtld.txt +25 -0
  55. data/dicts/hostnames-dict.big +1402 -0
  56. data/dicts/hostnames-dict.txt +101 -0
  57. data/lib/wmap/cidr_tracker.rb +327 -0
  58. data/lib/wmap/dns_bruter.rb +308 -0
  59. data/lib/wmap/domain_tracker/sub_domain.rb +142 -0
  60. data/lib/wmap/domain_tracker.rb +342 -0
  61. data/lib/wmap/geoip_tracker.rb +72 -0
  62. data/lib/wmap/google_search_scraper.rb +177 -0
  63. data/lib/wmap/host_tracker/primary_host.rb +130 -0
  64. data/lib/wmap/host_tracker.rb +550 -0
  65. data/lib/wmap/network_profiler.rb +144 -0
  66. data/lib/wmap/port_scanner.rb +208 -0
  67. data/lib/wmap/site_tracker/deactivated_site.rb +85 -0
  68. data/lib/wmap/site_tracker.rb +937 -0
  69. data/lib/wmap/url_checker.rb +314 -0
  70. data/lib/wmap/url_crawler.rb +381 -0
  71. data/lib/wmap/utils/domain_root.rb +184 -0
  72. data/lib/wmap/utils/logger.rb +53 -0
  73. data/lib/wmap/utils/url_magic.rb +343 -0
  74. data/lib/wmap/utils/utils.rb +333 -0
  75. data/lib/wmap/whois.rb +76 -0
  76. data/lib/wmap.rb +227 -0
  77. data/logs/wmap.log +17 -0
  78. data/ruby_whois_patches/base_cocca2.rb +149 -0
  79. data/ruby_whois_patches/kero.yachay.pe.rb +120 -0
  80. data/ruby_whois_patches/whois.PublicDomainRegistry.com.rb +124 -0
  81. data/ruby_whois_patches/whois.above.com.rb +61 -0
  82. data/ruby_whois_patches/whois.adamsnames.tc.rb +107 -0
  83. data/ruby_whois_patches/whois.aeda.net.ae.rb +105 -0
  84. data/ruby_whois_patches/whois.ai.rb +112 -0
  85. data/ruby_whois_patches/whois.arnes.si.rb +121 -0
  86. data/ruby_whois_patches/whois.ascio.com.rb +91 -0
  87. data/ruby_whois_patches/whois.cnnic.cn.rb +123 -0
  88. data/ruby_whois_patches/whois.corporatedomains.com.rb +67 -0
  89. data/ruby_whois_patches/whois.crsnic.net.rb +108 -0
  90. data/ruby_whois_patches/whois.denic.de.rb +174 -0
  91. data/ruby_whois_patches/whois.dk-hostmaster.dk.rb +120 -0
  92. data/ruby_whois_patches/whois.dns.be.rb +134 -0
  93. data/ruby_whois_patches/whois.dns.lu.rb +129 -0
  94. data/ruby_whois_patches/whois.dns.pl.rb +150 -0
  95. data/ruby_whois_patches/whois.dns.pt.rb +119 -0
  96. data/ruby_whois_patches/whois.domain.kg.rb +126 -0
  97. data/ruby_whois_patches/whois.domainregistry.my.rb +123 -0
  98. data/ruby_whois_patches/whois.domreg.lt.rb +110 -0
  99. data/ruby_whois_patches/whois.dot.tk.rb +140 -0
  100. data/ruby_whois_patches/whois.hkirc.hk.rb +121 -0
  101. data/ruby_whois_patches/whois.isnic.is.rb +130 -0
  102. data/ruby_whois_patches/whois.je.rb +119 -0
  103. data/ruby_whois_patches/whois.jprs.jp.rb +137 -0
  104. data/ruby_whois_patches/whois.kenic.or.ke.rb +140 -0
  105. data/ruby_whois_patches/whois.markmonitor.com.rb +118 -0
  106. data/ruby_whois_patches/whois.melbourneit.com.rb +58 -0
  107. data/ruby_whois_patches/whois.nic.as.rb +96 -0
  108. data/ruby_whois_patches/whois.nic.at.rb +109 -0
  109. data/ruby_whois_patches/whois.nic.ch.rb +141 -0
  110. data/ruby_whois_patches/whois.nic.cl.rb +117 -0
  111. data/ruby_whois_patches/whois.nic.ec.rb +157 -0
  112. data/ruby_whois_patches/whois.nic.im.rb +120 -0
  113. data/ruby_whois_patches/whois.nic.it.rb +170 -0
  114. data/ruby_whois_patches/whois.nic.lv.rb +116 -0
  115. data/ruby_whois_patches/whois.nic.ly.rb +127 -0
  116. data/ruby_whois_patches/whois.nic.mu.rb +27 -0
  117. data/ruby_whois_patches/whois.nic.mx.rb +123 -0
  118. data/ruby_whois_patches/whois.nic.net.sa.rb +111 -0
  119. data/ruby_whois_patches/whois.nic.or.kr.rb +101 -0
  120. data/ruby_whois_patches/whois.nic.tel.rb +129 -0
  121. data/ruby_whois_patches/whois.nic.tr.rb +133 -0
  122. data/ruby_whois_patches/whois.nic.us.rb +129 -0
  123. data/ruby_whois_patches/whois.nic.ve.rb +135 -0
  124. data/ruby_whois_patches/whois.norid.no.rb +127 -0
  125. data/ruby_whois_patches/whois.pandi.or.id.rb +118 -0
  126. data/ruby_whois_patches/whois.psi-usa.info.rb +63 -0
  127. data/ruby_whois_patches/whois.registro.br.rb +109 -0
  128. data/ruby_whois_patches/whois.registrygate.com.rb +55 -0
  129. data/ruby_whois_patches/whois.rrpproxy.net.rb +61 -0
  130. data/ruby_whois_patches/whois.sgnic.sg.rb +130 -0
  131. data/ruby_whois_patches/whois.srs.net.nz.rb +166 -0
  132. data/ruby_whois_patches/whois.tucows.com.rb +70 -0
  133. data/ruby_whois_patches/whois.twnic.net.tw.rb +133 -0
  134. data/settings/discovery_ports +24 -0
  135. data/settings/google_keywords.txt +9 -0
  136. data/settings/google_locator.txt +23 -0
  137. data/test/domain_tracker_test.rb +31 -0
  138. data/test/utils_test.rb +168 -0
  139. data/version.txt +13 -0
  140. data/wmap.gemspec +49 -0
  141. metadata +202 -0
data/lib/wmap/url_checker.rb
@@ -0,0 +1,314 @@
+ #--
+ # Wmap
+ #
+ # A pure Ruby library for Internet web application discovery and tracking.
+ #
+ # Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
+ #++
+ require "net/http"
+ require 'httpclient'
+ require "openssl"
+ require "uri"
+ require "digest/md5"
+ require "parallel"
+
+ # A quick checker class to identify / finger-print a URL / site
+ class Wmap::UrlChecker
+ include Wmap::Utils
+ attr_accessor :http_timeout, :max_parallel, :verbose, :data_dir
+
+ def initialize (params = {})
+ # Set default instance variables
+ @verbose=params.fetch(:verbose, false)
+ @data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../data/')
+ @http_timeout=params.fetch(:http_timeout, 5000)
+ @max_parallel=params.fetch(:max_parallel, 40)
+ @ssl_version=nil
+ @url_code={}
+ @url_redirection={}
+ @url_finger_print={}
+ @url_server={}
+ end
+
+ # Main worker method to perform various checks on the URL / site
+ def url_worker (url)
+ puts "Checking out an unknown URL: #{url}" if @verbose
+ begin
+ url=url.strip.downcase
+ raise "Invalid URL format: #{url}" unless is_url?(url)
+ timestamp=Time.now
+ host=url_2_host(url)
+ ip=host_2_ip(host)
+ port=url_2_port(url)
+ code=10000
+ if @url_code.key?(url)
+ code=@url_code[url]
+ else
+ code=response_code(url)
+ end
+ if @url_redirection.key?(url)
+ loc=@url_redirection[url]
+ else
+ loc=redirect_location(url)
+ end
+ if @url_finger_print.key?(url)
+ fp=@url_finger_print[url]
+ else
+ fp=response_body_md5(url)
+ end
+ if @url_server.key?(url)
+ server=@url_server[url]
+ else
+ server=get_server_header(url)
+ end
+ # save the data
+ checker=Hash.new
+ checker['ip']=ip
+ checker['port']=port
+ checker['url']=url
+ checker['code']=code
+ checker['redirection']=loc
+ checker['md5']=fp
+ checker['server']=server
+ checker['timestamp']=timestamp
+ if Wmap::CidrTracker.new(:data_dir=>@data_dir).ip_trusted?(ip)
+ checker['status']="int_hosted"
+ else
+ checker['status']="ext_hosted"
+ end
+ return checker
+ rescue OpenSSL::SSL::SSLError => es # handler to temporarily keep the openssl bug at bay: SSL_set_session: unable to find ssl method
+ checker=Hash.new
+ checker['ip']=ip
+ checker['port']=port
+ checker['url']=url
+ checker['code']=20000
+ checker['server']="Unknown SSL error: #{es}"
+ checker['md5']=nil
+ checker['redirection']=nil
+ checker['timestamp']=timestamp
+ return checker
+ rescue Exception => ee
+ puts "Exception on method #{__method__} for #{url}: #{ee}" # if @verbose
+ return nil
+ end
+ end
+ alias_method :check, :url_worker
+
+ # Parallel scanner - utilizes the fork manager 'parallel' to spawn a number of child processes that check multiple urls simultaneously
+ def url_workers (targets,num=@max_parallel)
+ begin
+ results=Array.new
+ targets -= ["", nil]
+ if targets.size > 0
+ puts "Start the url checker on the targets:\n #{targets}"
+ Parallel.map(targets, :in_processes => num) { |target|
+ url_worker(target)
+ }.each do |process|
+ if process.nil?
+ next
+ elsif process.empty?
+ #do nothing
+ else
+ results << process
+ end
+ end
+ end
+ return results
+ rescue Exception => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
+ end
+ end
+ alias_method :checks, :url_workers
+
+ # Test the URL and return the response code
+ def response_code (url)
+ puts "Check the http response code on the url: #{url}" if @verbose
+ response_code = 10000 # All unknown url connection exceptions go here
+ begin
+ raise "Invalid url: #{url}" unless is_url?(url)
+ url=url.strip.downcase
+ timeo = @http_timeout/1000.0
+ uri = URI.parse(url)
+ http = Net::HTTP.new(uri.host, uri.port)
+ http.open_timeout = timeo
+ http.read_timeout = timeo
+ if (url =~ /https\:/i)
+ http.use_ssl = true
+ #http.ssl_version = :SSLv3
+ # Bypass the remote web server cert validation test
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+ end
+ request = Net::HTTP::Get.new(uri.request_uri)
+ response = http.request(request)
+ puts "Server response the following: #{response}" if @verbose
+ response_code = response.code.to_i
+ #response.finish if response.started?()
+ @url_code[url]=response_code
+ puts "Response code on #{url}: #{response_code}" if @verbose
+ return response_code
+ rescue Exception => ee
+ puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
+ case ee
+ # rescue "Connection reset by peer" error type
+ when Errno::ECONNRESET
+ response_code=104
+ when Errno::ECONNABORTED,Errno::ETIMEDOUT
+ #response_code=10000
+ when Timeout::Error # Quick fix
+ if (url =~ /https\:/i) # try again for ssl timeout session, in case of default :TLSv1 failure
+ http.ssl_version = :SSLv3
+ response = http.request(request)
+ response_code = response.code.to_i
+ unless response_code.nil?
+ @ssl_version = http.ssl_version
+ end
+ end
+ else
+ #response_code=10000
+ end
+ @url_code[url]=response_code
+ return response_code
+ end
+ end
+ alias_method :query, :response_code
+
+ # Test the URL / site and return the redirection location (3xx response code only)
+ def redirect_location (url)
+ puts "Test the redirection location for the url: #{url}" if @verbose
+ location=""
+ begin
+ raise "Invalid url: #{url}" unless is_url?(url)
+ url=url.strip.downcase
+ timeo = @http_timeout/1000.0
+ uri = URI.parse(url)
+ code = response_code (url)
+ if code >= 300 && code < 400
+ http = Net::HTTP.new(uri.host, uri.port)
+ http.open_timeout = timeo
+ http.read_timeout = timeo
+ if (url =~ /https\:/i)
+ http.use_ssl = true
+ # Bypass the remote web server cert validation test
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+ http.ssl_version = @ssl_version
+ end
+ request = Net::HTTP::Get.new(uri.request_uri)
+ response = http.request(request)
+ case response
+ when Net::HTTPRedirection then
+ location = response['location']
+ end
+ end
+ @url_redirection[url]=location
+ return location
+ rescue Exception => ee
+ puts "Exception on method redirect_location for URL #{url}: #{ee}" if @verbose
+ @url_redirection[url]=location
+ return location
+ end
+ end
+ alias_method :location, :redirect_location
+
+ # Test the URL / site and return the web server type from the HTTP header "server" field
+ def get_server_header (url)
+ puts "Retrieve the server header field from the url: #{url}" if @verbose
+ server=String.new
+ begin
+ raise "Invalid url: #{url}" unless is_url?(url)
+ url=url.strip.downcase
+ timeo = @http_timeout/1000.0
+ uri = URI.parse(url)
+ code = response_code (url)
+ http = Net::HTTP.new(uri.host, uri.port)
+ http.open_timeout = timeo
+ http.read_timeout = timeo
+ if (url =~ /https\:/i)
+ http.use_ssl = true
+ # Bypass the remote web server cert validation test
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+ http.ssl_version = @ssl_version
+ end
+ request = Net::HTTP::Get.new(uri.request_uri)
+ response = http.request(request)
+ server=response["server"]
+ server=server.gsub(/\,/,' ')
+ return server
+ rescue Exception => ee
+ puts "Exception on method get_server_header for URL #{url}: #{ee}" if @verbose
+ @url_server[url]=server
+ return server
+ end
+ end
+
+ # Use MD5 algorithm to fingerprint the URL / site response payload (web page content)
+ def response_body_md5(url)
+ puts "MD5 finger print page body content: #{url}" if @verbose
+ begin
+ raise "Invalid url: #{url}" unless is_url?(url)
+ url=url.strip.downcase
+ timeo = @http_timeout/1000.0
+ uri = URI.parse(url)
+ fp=""
+ http = Net::HTTP.new(uri.host, uri.port)
+ http.open_timeout = timeo
+ http.read_timeout = timeo
+ if (url =~ /https\:/i)
+ http.use_ssl = true
+ # Bypass the remote web server cert validation test
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+ http.ssl_version = @ssl_version
+ end
+ request = Net::HTTP::Get.new(uri.request_uri)
+ response = http.request(request)
+ response_body = response.body.to_s
+ fp=Digest::MD5.hexdigest(response_body) unless response_body.nil?
+ @url_finger_print[url] = fp
+ return fp
+ rescue Exception => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ end
+ end
+ alias_method :md5, :response_body_md5
+
+ # Retrieve the remote web server certificate, open it and return the cert content as a string
+ def get_certificate (url)
+ puts "Retrieve the remote web server SSL certificate in clear text: #{url}" if @verbose
+ begin
+ url=url.strip
+ raise "Invalid URL string: #{url}" unless is_ssl?(url)
+ client = HTTPClient.new
+ client.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
+ response = client.get(url)
+ cert = response.peer_cert
+ cer = OpenSSL::X509::Certificate.new(cert)
+ return cer.to_text
+ rescue Exception => ee
+ puts "Exception on method #{__method__} from #{url}: #{ee}"
+ end
+ return nil
+ end
+ alias_method :get_cert, :get_certificate
+
+ # Retrieve the X509 cert in clear text from the remote web server, extract and return the common name field within the cert
+ def get_cert_cn (url)
+ puts "Extract the common name field from the X509 cert of: #{url}" if @verbose
+ begin
+ cert=get_certificate(url)
+ subject, cn = ""
+ if cert =~ /\n(.+)Subject\:(.+)\n/i
+ subject=$2
+ end
+ if subject =~/CN\=(.+)/i
+ cn=$1
+ end
+ return cn
+ rescue Exception => ee
+ puts "Error on method #{__method__} from #{cert}: #{ee}" if @verbose
+ end
+ return nil
+ end
+ alias_method :get_cn, :get_cert_cn
+
+ end
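For orientation, below is a minimal usage sketch of the Wmap::UrlChecker class added in this file. It is not part of the package diff: the target URLs are hypothetical examples, and it assumes the gem is installed and loaded with require 'wmap'.

  require 'wmap'

  # Options mirror the initializer above (:verbose, :data_dir, :http_timeout, :max_parallel)
  checker = Wmap::UrlChecker.new(:http_timeout => 5000, :verbose => false)

  # Single check (alias of url_worker) - returns a hash with 'ip', 'port', 'url', 'code',
  # 'redirection', 'md5', 'server', 'status' and 'timestamp' keys, or nil on error
  record = checker.check("https://www.example.com/")
  puts record['code'] unless record.nil?

  # Parallel checks over a list (alias of url_workers) - returns an array of result hashes
  records = checker.checks(["https://www.example.com/", "http://www.example.com/"])

  # Pull the common name out of the site's X509 certificate (HTTPS sites only)
  puts checker.get_cn("https://www.example.com/")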
data/lib/wmap/url_crawler.rb
@@ -0,0 +1,381 @@
+ #--
+ # Wmap
+ #
+ # A pure Ruby library for Internet web application discovery and tracking.
+ #
+ # Copyright (c) 2012-2015 Yang Li
+ #++
+ require "net/http"
+ require "uri"
+ require "open-uri"
+ require "open_uri_redirections"
+ require "nokogiri"
+ require "parallel"
+
+
+ # Web site crawler class
+ class Wmap::UrlCrawler
+ include Wmap::Utils
+
+ attr_accessor :http_timeout, :crawl_page_limit, :crawl_depth, :max_parallel, :verbose, :data_dir
+ attr_reader :discovered_urls_by_crawler, :visited_urls_by_crawler, :crawl_start, :crawl_done
+ # Global variable used to store the combined result of all the forked child processes. Note that a class variable
+ # would not be able to pass the result, due to the limitation of the IO pipe communication mechanism used by the 'parallel' fork manager
+ # $discovered_urls=Hash.new
+
+ # set hard stop limit of http time-out to 8 seconds, in order to avoid severe performance penalty for certain 'weird' site(s)
+ Max_http_timeout=8000
+ # set hard stop limit of crawler time-out to 1200 seconds or 20 minutes
+ Crawl_timeout=1200000
+
+ # Crawler instance default variables
+ def initialize (params = {})
+ @verbose=params.fetch(:verbose, false)
+ @data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../logs/')
+ @http_timeout=params.fetch(:http_timeout, 5000)
+ @crawl_depth=params.fetch(:crawl_depth, 4)
+ @crawl_page_limit=params.fetch(:crawl_page_limit, 1000)
+ @max_parallel=params.fetch(:max_parallel, 40)
+ # Discovered data store
+ @discovered_urls_by_crawler=Hash.new
+ @visited_urls_by_crawler=Hash.new
+ @crawl_start=Hash.new
+ @crawl_done=Hash.new
+ Dir.mkdir(@data_dir) unless Dir.exist?(@data_dir)
+ @log_file=@data_dir + "crawler.log"
+ end
+
+ # Pre-crawl profiler, used for network profiling to maximize the crawler performance.
+ def pre_crawl(url)
+ puts "Perform network profiling works on the web server before the web crawling: #{url}" if @verbose
+ begin
+ host=url_2_host(url)
+ # Use the following formula to 'guess' the right http time-out threshold for the scanner
+ nwk_to=Wmap::NetworkProfiler.new.profile(host).to_i
+ if (1500 + Wmap::NetworkProfiler.new.profile(host)*2).to_i > Max_http_timeout
+ @http_timeout = Max_http_timeout
+ else
+ @http_timeout = 1500 + nwk_to*2
+ end
+ puts "Done with the pre-scan works: reset @http_timeout to: #{@http_timeout} ms" if @verbose
+ rescue Exception => ee
+ puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
+ @http_timeout = Max_http_timeout
+ end
+ end
+
+ # A web crawler to crawl a known website and search for html links within the same root domain. For example,
+ # by crawling 'http://www.yahoo.com/' it could discover 'http://login.yahoo.com/'
+ def crawl(url)
+ puts "Start web crawling on #{url}"
+ #begin
+ result=Array.new
+ url=url.chomp.strip
+ result.push(url_2_site(url))
+ raise "Error! Invalid url format: #{url}" unless is_url?(url)
+ # Add logic to profile the web server before crawling; this is used to optimize the crawling speed
+ pre_crawl(url)
+ status = Timeout::timeout(Crawl_timeout/1000) {
+ result+=crawl_worker(url).keys
+ }
+ puts "Web crawling time-out on #{url}: #{status}" if @verbose
+ return result
+ #rescue => ee
+ #puts "Exception on method #{__method__} for URL #{url}: #{ee}"
+ #return result
+ #end
+ end
+ alias_method :query, :crawl
+
+ # The worker instance of the crawler that performs the actual labour
+ def crawl_worker(url0)
+ puts "Please be aware that it may take a while to crawl #{url0}, depending on the site's responsiveness and the amount of contents."
+ #begin
+ # Input URL sanity check first
+ if is_url?(url0)
+ host=url_2_host(url0)
+ ip=host_2_ip(host).to_s
+ raise "Invalid IP address: #{url0}" if ip.nil?
+ port=url_2_port(url0).to_s
+ raise "Invalid port number: #{url0}" if port.nil?
+ else
+ raise "Invalid URL: #{url0}. Please check it out with your browser again."
+ end
+ log_info=Hash.new
+ log_info[1]="Start working on #{url0}"
+ url_stores=Hash.new
+ url_stores[url0]=true unless url_stores.key?(url0)
+ @discovered_urls_by_crawler[url0]=true unless @discovered_urls_by_crawler.key?(url0)
+ @crawl_start[url0]=true unless @crawl_start.key?(url0)
+ # $discovered_urls[url0]=true unless $discovered_urls.key?(url0)
+ @crawl_depth.times do
+ url_stores.keys.each do |url|
+ # 10/01/2013 add logic to avoid unnecessary crawling within the same child instance
+ next if @visited_urls_by_crawler.key?(url)
+ url_object = open_url(url)
+ next if url_object == nil
+ url = update_url_if_redirected(url, url_object)
+ url_body = read_url(url)
+ # Protection code - to avoid parsing failure on the empty or nil object
+ next if url_body.nil? or url_body.empty?
+ url_stores[url]=true unless url_stores.key?(url)
+ @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
+ # $discovered_urls[url]=true unless $discovered_urls.key?(url)
+ doc = parse_html(url_body)
+ next if doc == nil
+ if url_stores.size >= @crawl_page_limit
+ #@visited_urls_by_crawler.merge!(url_stores)
+ @discovered_urls_by_crawler.merge!(url_stores)
+ # $discovered_urls.merge!(url_stores)
+ puts "Finish web crawling the url: #{url0}"
+ return url_stores
+ end
+ page_urls = find_urls_on_page(doc, url)
+ page_urls.uniq!
+ page_urls.map do |y|
+ y=normalize_url(y)
+ url_stores[y]=true unless url_stores.key?(y)
+ @discovered_urls_by_crawler[y]=true unless @discovered_urls_by_crawler.key?(y)
+ # $discovered_urls[y]=true unless $discovered_urls.key?(y)
+ end
+ end
+ end
+ puts "Finish web crawling on: #{url0}"
+ log_info[2]="Finish working on: #{url0}"
+ wlog(log_info, "UrlCrawler", @log_file)
+ @crawl_done[url0]=true unless @crawl_done.key?(url0)
+ return url_stores
+ #rescue => ee
+ #puts "Exception on method #{__method__} for URL #{url0}: #{ee}" if @verbose
+ #log_info[3]="Exception on #{url0}"
+ #wlog(log_info,"UrlCrawler",@log_file)
+ #return url_stores
+ #end
+ end
+
+ # Fast crawling by utilizing the fork manager 'parallel' to spawn a number of child processes at the same time;
+ # each child process will continuously work on the target pool until all the work is done
+ def crawl_workers (targets,num=@max_parallel)
+ begin
+ raise "Input error - expecting targets in an array format: #{targets}" unless targets.kind_of? Array
+ puts "Sanitize the URL seeds to eliminate the unnecessary duplication(s) ..." if @verbose
+ #puts "This could be awhile depending on the list size. Please be patient ..."
+ # 09/30/2013 Add additional logic to eliminate the duplicate target site(s) before the crawlers are invoked.
+ targets -= ["", nil]
+ uniq_sites=Hash.new
+ targets.dup.map do |target|
+ if is_url?(target)
+ host=url_2_host(target)
+ ip=host_2_ip(host).to_s
+ next if ip.nil?
+ port=url_2_port(target).to_s
+ next if port.nil?
+ site_key=ip+":"+port
+ unless uniq_sites.key?(site_key)
+ uniq_sites[site_key]=target
+ end
+ end
+ end
+ puts "Sanitization done! " if @verbose
+ puts "Start the parallel engine on the normalized crawling list:\n #{targets} "
+ puts "Maximum number of web crawling sessions allowed: #{num}" #if @verbose
+ raise "Error: target list is empty!" if targets.size < 1
+ Parallel.map(uniq_sites.values, :in_processes => num) { |target|
+ puts "Working on #{target} ..." if @verbose
+ crawl(target)
+ }.dup.each do |process|
+ puts "process.inspect: #{process}" if @verbose
+ urls=process
+ urls-=["",nil] unless urls.nil?
+ if urls.nil?
+ next
+ elsif urls.empty?
+ next
+ #do nothing
+ else
+ urls.map do |url|
+ url.strip!
+ @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
+ #$discovered_urls[url]=true unless $discovered_urls.key?(url)
+ end
+ end
+ end
+ #return sites
+ return @discovered_urls_by_crawler.keys
+ rescue Exception => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
+ end
+ end
+ alias_method :crawls, :crawl_workers
+
+ # Fast crawling method - build the target pool from the input file
+ def crawl_workers_on_file (file)
+ puts "Web crawl the list of targets from file: #{file}"
+ begin
+ targets=file_2_list(file)
+ sites=crawl_workers(targets,num=@max_parallel)
+ return sites
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
+ end
+ end
+ alias_method :query_file, :crawl_workers_on_file
+ alias_method :crawl_file, :crawl_workers_on_file
+
+ # Wrapper for the OpenURI open method - create an open_uri object and return the reference upon success
+ def open_url(url)
+ puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
+ #url_object = nil
+ begin
+ if url =~ /http\:/i
+ # patch to allow the 'unsafe' URL redirection, i.e. https://www.example.com -> http://www.example.com
+ url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000)
+ #url_object = open(url)
+ elsif url =~ /https\:/i
+ url_object = open(url,:ssl_verify_mode => 0, :allow_redirections =>:safe, :read_timeout=>Max_http_timeout/1000)
+ #url_object = open(url,:ssl_verify_mode => 0)
+ else
+ raise "Invalid URL format - please specify the protocol prefix http(s) in the URL: #{url}"
+ end
+ return url_object
+ rescue => ee
+ puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
+ return nil
+ end
+ end
+
+ # Wrapper to use OpenURI method 'read' to return url body contents
+ def read_url(url)
+ puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
+ begin
+ url_object=open_url(url)
+ @visited_urls_by_crawler[url]=true unless @visited_urls_by_crawler.key?(url)
+ body=url_object.read
+ return body
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
+ end
+ end
+
+ # Return the destination url in case of url re-direct
+ def update_url_if_redirected(url, url_object)
+ #puts "Comparing the original URL with the return object base_uri. Return the one where the true content is found. " if @verbose
+ begin
+ if url != url_object.base_uri.to_s
+ return url_object.base_uri.to_s
+ end
+ return url
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
+ end
+ end
+
+ # Wrapper for the Nokogiri DOM parser
+ def parse_html(html_body)
+ #puts "Parsing the html content: #{html_body}. Return DOM " if @verbose
+ begin
+ doc = Nokogiri::HTML(html_body)
+ #puts "Successfully crawling the url: #{url_object.base_uri.to_s}" if @verbose
+ #puts "doc: #{doc}" if @verbose
+ return doc
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
+ end
+ end
+
+ # Search 'current_url' and return found URLs under the same domain
+ def find_urls_on_page(doc, current_url)
+ #puts "Search and return URLs within the doc: #{doc}" if @verbose
+ begin
+ urls_list = []
+ # case 1 - search embedded HTML tag <a href='url'> for the url elements
+ links=doc.css('a')
+ links.map do |x|
+ #puts "x: #{x}"
+ new_url = x.attribute('href').to_s
+ unless new_url == nil
+ if new_url.match("http")
+ #if urls_on_same_domain?(new_url,current_url)
+ urls_list.push(new_url)
+ #end
+ else
+ new_url = make_absolute(current_url, new_url)
+ urls_list.push(new_url)
+ end
+ end
+ end
+ # case 2 - search client side redirect - <meta http-equiv="refresh" content="5;URL='http://example.com/'">
+ elements=doc.css("meta[http-equiv]")
+ unless elements.size == 0
+ link=elements.attr("content").value.split(/url\=/i)[1]
+ unless link.nil?
+ new_url = make_absolute(current_url, link)
+ urls_list.push(new_url) unless new_url.nil?
+ end
+ end
+ #puts "Found URLs under page #{current_url}:\n#{urls_list}" if @verbose
+ return urls_list.uniq-["",nil]
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
+ end
+ end
+
+ # Method to print out discovery URL result
+ def print_discovered_urls_by_crawler
+ puts "Print discovered url by the crawler. " if @verbose
+ begin
+ puts "\nSummary Report of Discovered URLs from the Crawler:"
+ @discovered_urls_by_crawler.keys.each do |url|
+ puts url
+ end
+ puts "Total: #{@discovered_urls_by_crawler.keys.size}"
+ puts "End of the summary"
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
+ end
+ end
+ alias_method :print, :print_discovered_urls_by_crawler
+
+ # Method to save URL discovery result
+ def save_discovered_urls (file)
+ puts "Save discovered urls by the crawler to file: #{file} "
+ begin
+ list_2_file(@discovered_urls_by_crawler.keys, file)
+ puts "Done!"
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
+ end
+ end
+ alias_method :save, :save_discovered_urls
+
+ # Method to retrieve discovery site result
+ def get_discovered_sites_by_crawler
+ puts "Print summary report of discovered sites. " if @verbose
+ begin
+ puts "\nSummary Report of Discovered Sites from the Crawler:"
+ sites = Hash.new
+ @discovered_urls_by_crawler.keys.each do |url|
+ site=url_2_site(url)
+ sites[site]=true unless sites.key?(site)
+ end
+ sites.keys.map { |site| puts site }
+ puts "Total: #{sites.size}"
+ puts "End of the summary"
+ return sites.keys
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
+ end
+ end
+ alias_method :get_sites, :get_discovered_sites_by_crawler
+
+ private :open_url, :read_url, :update_url_if_redirected, :parse_html, :find_urls_on_page
+ end
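As with the checker above, here is a minimal usage sketch of the Wmap::UrlCrawler class added in this file. It is not part of the package diff: the seed URLs and the output path are hypothetical examples.

  require 'wmap'

  # Options mirror the initializer above (:crawl_depth, :crawl_page_limit, :max_parallel, :verbose, ...)
  crawler = Wmap::UrlCrawler.new(:crawl_depth => 2, :max_parallel => 10)

  # Crawl a single site (alias :query) - returns an array of URLs found under the same site
  urls = crawler.crawl("https://www.example.com/")

  # Crawl a list of seed sites in parallel child processes (alias :crawls)
  crawler.crawls(["https://www.example.com/", "http://www.example.com/"])

  # Inspect and persist what was discovered
  sites = crawler.get_sites
  crawler.save("/tmp/discovered_urls.txt")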