wmap 2.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +141 -0
  3. data/LICENSE.txt +15 -0
  4. data/README.rdoc +98 -0
  5. data/TODO +13 -0
  6. data/bin/deprime +21 -0
  7. data/bin/distrust +38 -0
  8. data/bin/googleBot +23 -0
  9. data/bin/prime +21 -0
  10. data/bin/refresh +26 -0
  11. data/bin/run_tests +16 -0
  12. data/bin/spiderBot +26 -0
  13. data/bin/trust +38 -0
  14. data/bin/updateAll +57 -0
  15. data/bin/wadd +25 -0
  16. data/bin/wadds +26 -0
  17. data/bin/wcheck +28 -0
  18. data/bin/wdel +25 -0
  19. data/bin/wdump +21 -0
  20. data/bin/wmap +151 -0
  21. data/bin/wscan +32 -0
  22. data/data/cidrs +2 -0
  23. data/data/deactivated_sites +1 -0
  24. data/data/domains +2 -0
  25. data/data/hosts +1 -0
  26. data/data/prime_hosts +1 -0
  27. data/data/sites +2 -0
  28. data/data/sub_domains +2 -0
  29. data/demos/bruter.rb +27 -0
  30. data/demos/dns_brutes.rb +28 -0
  31. data/demos/filter_cidr.rb +18 -0
  32. data/demos/filter_crawls.rb +5 -0
  33. data/demos/filter_domain.rb +25 -0
  34. data/demos/filter_geoip.rb +26 -0
  35. data/demos/filter_known_services.rb +59 -0
  36. data/demos/filter_netinfo.rb +23 -0
  37. data/demos/filter_prime.rb +25 -0
  38. data/demos/filter_profiler.rb +3 -0
  39. data/demos/filter_redirection.rb +19 -0
  40. data/demos/filter_site.rb +40 -0
  41. data/demos/filter_siteip.rb +31 -0
  42. data/demos/filter_status.rb +17 -0
  43. data/demos/filter_timestamp.rb +23 -0
  44. data/demos/filter_url.rb +19 -0
  45. data/demos/new_fnd.rb +66 -0
  46. data/demos/nmap_parser.pl +138 -0
  47. data/demos/site_format.rb +18 -0
  48. data/demos/whois_domain.rb +78 -0
  49. data/dicts/GeoIP.dat +0 -0
  50. data/dicts/GeoIPASNum.dat +0 -0
  51. data/dicts/GeoLiteCity.dat +0 -0
  52. data/dicts/ccsld.txt +2646 -0
  53. data/dicts/cctld.txt +243 -0
  54. data/dicts/gtld.txt +25 -0
  55. data/dicts/hostnames-dict.big +1402 -0
  56. data/dicts/hostnames-dict.txt +101 -0
  57. data/lib/wmap/cidr_tracker.rb +327 -0
  58. data/lib/wmap/dns_bruter.rb +308 -0
  59. data/lib/wmap/domain_tracker/sub_domain.rb +142 -0
  60. data/lib/wmap/domain_tracker.rb +342 -0
  61. data/lib/wmap/geoip_tracker.rb +72 -0
  62. data/lib/wmap/google_search_scraper.rb +177 -0
  63. data/lib/wmap/host_tracker/primary_host.rb +130 -0
  64. data/lib/wmap/host_tracker.rb +550 -0
  65. data/lib/wmap/network_profiler.rb +144 -0
  66. data/lib/wmap/port_scanner.rb +208 -0
  67. data/lib/wmap/site_tracker/deactivated_site.rb +85 -0
  68. data/lib/wmap/site_tracker.rb +937 -0
  69. data/lib/wmap/url_checker.rb +314 -0
  70. data/lib/wmap/url_crawler.rb +381 -0
  71. data/lib/wmap/utils/domain_root.rb +184 -0
  72. data/lib/wmap/utils/logger.rb +53 -0
  73. data/lib/wmap/utils/url_magic.rb +343 -0
  74. data/lib/wmap/utils/utils.rb +333 -0
  75. data/lib/wmap/whois.rb +76 -0
  76. data/lib/wmap.rb +227 -0
  77. data/logs/wmap.log +17 -0
  78. data/ruby_whois_patches/base_cocca2.rb +149 -0
  79. data/ruby_whois_patches/kero.yachay.pe.rb +120 -0
  80. data/ruby_whois_patches/whois.PublicDomainRegistry.com.rb +124 -0
  81. data/ruby_whois_patches/whois.above.com.rb +61 -0
  82. data/ruby_whois_patches/whois.adamsnames.tc.rb +107 -0
  83. data/ruby_whois_patches/whois.aeda.net.ae.rb +105 -0
  84. data/ruby_whois_patches/whois.ai.rb +112 -0
  85. data/ruby_whois_patches/whois.arnes.si.rb +121 -0
  86. data/ruby_whois_patches/whois.ascio.com.rb +91 -0
  87. data/ruby_whois_patches/whois.cnnic.cn.rb +123 -0
  88. data/ruby_whois_patches/whois.corporatedomains.com.rb +67 -0
  89. data/ruby_whois_patches/whois.crsnic.net.rb +108 -0
  90. data/ruby_whois_patches/whois.denic.de.rb +174 -0
  91. data/ruby_whois_patches/whois.dk-hostmaster.dk.rb +120 -0
  92. data/ruby_whois_patches/whois.dns.be.rb +134 -0
  93. data/ruby_whois_patches/whois.dns.lu.rb +129 -0
  94. data/ruby_whois_patches/whois.dns.pl.rb +150 -0
  95. data/ruby_whois_patches/whois.dns.pt.rb +119 -0
  96. data/ruby_whois_patches/whois.domain.kg.rb +126 -0
  97. data/ruby_whois_patches/whois.domainregistry.my.rb +123 -0
  98. data/ruby_whois_patches/whois.domreg.lt.rb +110 -0
  99. data/ruby_whois_patches/whois.dot.tk.rb +140 -0
  100. data/ruby_whois_patches/whois.hkirc.hk.rb +121 -0
  101. data/ruby_whois_patches/whois.isnic.is.rb +130 -0
  102. data/ruby_whois_patches/whois.je.rb +119 -0
  103. data/ruby_whois_patches/whois.jprs.jp.rb +137 -0
  104. data/ruby_whois_patches/whois.kenic.or.ke.rb +140 -0
  105. data/ruby_whois_patches/whois.markmonitor.com.rb +118 -0
  106. data/ruby_whois_patches/whois.melbourneit.com.rb +58 -0
  107. data/ruby_whois_patches/whois.nic.as.rb +96 -0
  108. data/ruby_whois_patches/whois.nic.at.rb +109 -0
  109. data/ruby_whois_patches/whois.nic.ch.rb +141 -0
  110. data/ruby_whois_patches/whois.nic.cl.rb +117 -0
  111. data/ruby_whois_patches/whois.nic.ec.rb +157 -0
  112. data/ruby_whois_patches/whois.nic.im.rb +120 -0
  113. data/ruby_whois_patches/whois.nic.it.rb +170 -0
  114. data/ruby_whois_patches/whois.nic.lv.rb +116 -0
  115. data/ruby_whois_patches/whois.nic.ly.rb +127 -0
  116. data/ruby_whois_patches/whois.nic.mu.rb +27 -0
  117. data/ruby_whois_patches/whois.nic.mx.rb +123 -0
  118. data/ruby_whois_patches/whois.nic.net.sa.rb +111 -0
  119. data/ruby_whois_patches/whois.nic.or.kr.rb +101 -0
  120. data/ruby_whois_patches/whois.nic.tel.rb +129 -0
  121. data/ruby_whois_patches/whois.nic.tr.rb +133 -0
  122. data/ruby_whois_patches/whois.nic.us.rb +129 -0
  123. data/ruby_whois_patches/whois.nic.ve.rb +135 -0
  124. data/ruby_whois_patches/whois.norid.no.rb +127 -0
  125. data/ruby_whois_patches/whois.pandi.or.id.rb +118 -0
  126. data/ruby_whois_patches/whois.psi-usa.info.rb +63 -0
  127. data/ruby_whois_patches/whois.registro.br.rb +109 -0
  128. data/ruby_whois_patches/whois.registrygate.com.rb +55 -0
  129. data/ruby_whois_patches/whois.rrpproxy.net.rb +61 -0
  130. data/ruby_whois_patches/whois.sgnic.sg.rb +130 -0
  131. data/ruby_whois_patches/whois.srs.net.nz.rb +166 -0
  132. data/ruby_whois_patches/whois.tucows.com.rb +70 -0
  133. data/ruby_whois_patches/whois.twnic.net.tw.rb +133 -0
  134. data/settings/discovery_ports +24 -0
  135. data/settings/google_keywords.txt +9 -0
  136. data/settings/google_locator.txt +23 -0
  137. data/test/domain_tracker_test.rb +31 -0
  138. data/test/utils_test.rb +168 -0
  139. data/version.txt +13 -0
  140. data/wmap.gemspec +49 -0
  141. metadata +202 -0
@@ -0,0 +1,937 @@
1
+ #--
2
+ # Wmap
3
+ #
4
+ # A pure Ruby library for the Internet web application discovery and tracking.
5
+ #
6
+ # Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
7
+ #++
8
+ require "parallel"
9
+ #require "singleton"
10
+ require "nokogiri"
11
+
12
+
13
+ # Main class to automatically track the site inventory
14
+ class Wmap::SiteTracker
15
+ include Wmap::Utils
16
+ #include Singleton
17
+
18
+ attr_accessor :sites_file, :max_parallel, :verbose, :data_dir
19
+ attr_reader :known_sites
20
+
21
+ # Set default instance variables
22
def initialize (params = {})
  # Build a tracker bound to one data directory and load its site store.
  #
  # params - Hash of options, all optional:
  #   :data_dir     - directory holding the store files; defaults to the
  #                   'data/' directory two levels above this source file.
  #   :sites_file   - path of the site store file; defaults to
  #                   "<data_dir>sites" (data_dir is concatenated as-is, so it
  #                   is expected to end with a path separator).
  #   :verbose      - enables extra progress output (default false).
  #   :max_parallel - default degree of parallelism for bulk operations
  #                   (default 30).
  @data_dir = params.fetch(:data_dir, File.dirname(__FILE__) + '/../../data/')
  @file_sites = @data_dir + 'sites'
  @file_stores = params.fetch(:sites_file, @file_sites)
  # The class declares `attr_accessor :sites_file`; keep the instance variable
  # behind that reader in sync with the file that is actually used, otherwise
  # the reader would always answer nil.
  @sites_file = @file_stores
  @verbose = params.fetch(:verbose, false)
  @max_parallel = params.fetch(:max_parallel, 30)
  # Make sure the store file exists before it is parsed into the hash table.
  File.write(@file_stores, "") unless File.exist?(@file_stores)
  @known_sites = load_site_stores_from_file(@file_stores)
end
33
+
34
+ # Setter to load the known hosts into an instance variable
35
def load_site_stores_from_file (file)
  # Parse a site store file into a Hash keyed by the lower-cased site URL.
  #
  # Each data line holds comma (or tab) separated fields in this order:
  # site, ip, port, status, server, response code, md5, redirection, timestamp.
  # Blank lines and lines starting with '#' are ignored.
  #
  # file - path of the store file to read.
  #
  # Returns the Hash of site => attribute Hash, or nil when the file cannot be
  # read or parsed (the error is printed, not raised).
  puts "Loading the site store data repository from file: #{file} " if @verbose
  begin
    known_sites = Hash.new
    # File.foreach closes the handle even if a line raises part-way through.
    File.foreach(file) do |line|
      line = line.chomp.strip
      next if line.empty?
      next if line =~ /^\s*#/
      entry = line.split(%r{\t+|\,})
      # A line made only of separators yields no site name; skip it rather
      # than aborting the whole load.
      next if entry[0].nil? || entry[0].empty?
      site = entry[0].downcase
      puts "Loading entry: #{site} - #{entry[1]} - #{entry[3]}" if @verbose
      record = (known_sites[site] ||= Hash.new)
      record['ip'] = entry[1]
      record['port'] = entry[2]
      record['status'] = entry[3]
      record['server'] = entry[4]
      record['code'] = entry[5].to_i
      record['md5'] = entry[6]
      record['redirection'] = entry[7]
      record['timestamp'] = entry[8]
    end
    puts "Successfully loading file: #{file}" if @verbose
    return known_sites
  rescue => ee
    puts "Exception on method #{__method__} for file #{file}: #{ee}"
    return nil
  end
end
73
+
74
+ # Save the current site store hash table into a file
75
def save_sites_to_file!(file_sites=@file_stores)
  # Write the in-memory site store to disk as comma separated records,
  # sorted by site, preceded by two '#' comment header lines.
  #
  # file_sites - destination path; defaults to the store file this tracker
  #              was loaded from.
  #
  # Returns nil. Errors are printed rather than raised.
  puts "Saving the current site store table from memory to file: #{file_sites}"
  begin
    timestamp = Time.now
    # Block form guarantees the handle is flushed and closed even if a record
    # raises while being written.
    File.open(file_sites, 'w') do |f|
      f.write "# Local site store created by class #{self.class} method #{__method__} at: #{timestamp}\n"
      f.write "# Website,Primary IP,Port,Hosting Status,Server,Response Code,MD5 Finger-print,Redirection,Timestamp\n"
      @known_sites.keys.sort.each do |key|
        rec = @known_sites[key]
        f.write "#{key},#{rec['ip']},#{rec['port']},#{rec['status']},#{rec['server']},#{rec['code']},#{rec['md5']},#{rec['redirection']},#{rec['timestamp']}\n"
      end
    end
    puts "site store table is successfully saved: #{file_sites}"
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}"
  end
end
91
+ alias_method :save!, :save_sites_to_file!
92
+
93
+ # Count numbers of entries in the site store table
94
def count
  # Report how many entries the in-memory site store currently holds.
  # Returns the entry count, or nil if the store cannot be measured (the
  # error is printed).
  puts "Counting number of entries in the site store table ..."
  begin
    total = @known_sites.size
    total
  rescue => err
    puts "Exception on method #{__method__}: #{err}"
  end
end
102
+
103
+ # Setter to add site entry to the cache one at a time
104
def add(site)
  # Validate one site, probe it, and add it to the in-memory site store.
  #
  # The site is only added when its host is trusted: an IP host must be
  # trusted by the CIDR tracker, a named host must have a domain root known
  # to the domain tracker. A site that was previously deactivated is removed
  # from the deactivated list, and the local hosts table is added to or
  # refreshed for the site's host name.
  #
  # site - site or URL string.
  #
  # Returns the Hash produced by Wmap::UrlChecker#check for the site, or nil
  # when the site is rejected or any step fails (the reason is printed).
  puts "Add entry to the site store: #{site}"
  begin
    # Preliminary sanity check
    site=site.strip.downcase unless site.nil?
    raise "Site is already exist. Skip #{site}" if site_known?(site)
    site=normalize_url(site) if is_url?(site)
    site=url_2_site(site) if is_url?(site)
    puts "Site in standard format: #{site}" if @verbose
    raise "Exception on method #{__method__}: invalid site format of #{site}. Expected format is: http://your_website_name/" unless is_site?(site)
    trusted=false
    host=url_2_host(site)
    ip=host_2_ip(host)
    # Additional logic to refresh deactivated site, 02/12/2014
    deact=Wmap::SiteTracker::DeactivatedSite.new(:data_dir=>@data_dir)
    # only trust either the domain or IP we know
    if is_ip?(host)
      trusted=Wmap::CidrTracker.new(:data_dir=>@data_dir).ip_trusted?(ip)
    else
      root=get_domain_root(host)
      if root.nil?
        raise "Invalid web site format. Please check your record again."
      else
        trusted=Wmap::DomainTracker.new(:data_dir=>@data_dir).domain_known?(root)
      end
    end
    # add record only if trusted
    unless trusted
      puts "Problem found: untrusted Internet domain or IP. Skip #{site}"
      return nil
    end
    # Check the site status before adding it
    checker=Wmap::UrlChecker.new(:data_dir=>@data_dir).check(site)
    raise "Site is currently down. Skip #{site}" if checker.nil?
    # Skip the http site if it's un-responsive; for the https we'll keep it because we're interested in analysing the SSL layer later
    unless is_https?(site)
      raise "Site is currently down. Skip #{site}" if checker['code']==10000
    end
    raise "Exception on add method - Fail to resolve the host-name: Host - #{host}, IP - #{ip}. Skip #{site}" unless is_ip?(ip)
    my_tracker = Wmap::HostTracker.new(:data_dir=>@data_dir)
    # Update the local host table when necessary
    if is_ip?(host)
      # Case #1: Trusted site contains IP
      if my_tracker.ip_known?(host)
        # Try local reverse DNS lookup first
        puts "Local hosts table lookup for IP: #{ip}" if @verbose
        host=my_tracker.local_ip_2_host(host)
        puts "Host found from the local hosts table for #{ip}: #{host}" if @verbose
        site.sub!(/\d+\.\d+\.\d+\.\d+/,host)
      else
        # Try reverse DNS lookup over Internet as secondary precaution
        puts "Reverse DNS lookup for IP: #{ip}" if @verbose
        host1=ip_2_host(host)
        puts "host1: #{host1}" if @verbose
        if is_fqdn?(host1)
          # NOTE(review): domain_known? is invoked on a HostTracker here while
          # the trust check above uses DomainTracker - confirm HostTracker
          # really provides it.
          if Wmap::HostTracker.new(:data_dir=>@data_dir).domain_known?(host1)
            # replace IP with host-name only if domain root is known
            puts "Host found from the Internet reverse DNS lookup for #{ip}: #{host1}" if @verbose
            host=host1
            site.sub!(/\d+\.\d+\.\d+\.\d+/,host)
          end
        end
      end
      # Adding site for Case #1; the IP may have been rewritten to a host name
      # that is already tracked, hence the second duplicate check.
      raise "Site already exist! Skip #{site}" if @known_sites.key?(site)
      puts "Adding site: #{site}" if @verbose
      @known_sites[site]=checker
      if deact.site_known?(site)
        deact.delete(site)
        deact.save!
      end
      puts "Site entry loaded: #{checker}"
      if is_fqdn?(host)
        # Update the hosts table for case #1 variance
        # - case that reverse DNS lookup successful
        puts "Update local hosts table for host: #{host}"
        if my_tracker.host_known?(host)
          old_ip=my_tracker.local_host_2_ip(host)
          if old_ip != ip
            # Was 'efresh' (typo), which raised NoMethodError and made add
            # return nil after the site had already been stored.
            my_tracker.refresh(host)
            my_tracker.save!
          else
            puts "Host resolve to the same IP #{ip} - no need to update the local host table." if @verbose
          end
        else
          my_tracker.add(host)
          my_tracker.save!
        end
      end
    else
      # Case #2: Trusted site contains valid FQDN
      puts "Ading site: #{site}" if @verbose
      @known_sites[site]=checker
      if deact.site_known?(site)
        deact.delete(site)
        deact.save!
      end
      puts "Site entry loaded: #{checker}"
      # Update the hosts table for case #2
      puts "Update local hosts table for host: #{host}"
      if my_tracker.host_known?(host)
        old_ip=my_tracker.local_host_2_ip(host)
        if old_ip != ip
          # Same 'efresh' typo fix as in case #1.
          my_tracker.refresh(host)
          my_tracker.save!
        end
      else
        my_tracker.add(host)
        my_tracker.save!
      end
    end
    return checker
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}"
    return nil
  end
end
233
+
234
+ # Setter to add site entry to the cache table in batch (from a file)
235
def file_add(file)
  # Add every site listed in a file to the site store.
  #
  # file - path of a file that file_2_list can turn into a list of sites.
  #
  # Returns whatever bulk_add returns for the list, an empty Hash when the
  # file yields no entries, or nil when the file is missing or an error
  # occurs (the error is printed).
  puts "Add entries to the local site store from file: #{file}"
  begin
    unless File.exist?(file)
      raise "File non-exist. Please check your file path and name again: #{file}"
    end
    entries = file_2_list(file)
    added = (entries.nil? || entries.empty?) ? Hash.new : bulk_add(entries)
    puts "Done loading file #{file}. "
    return added
  rescue => err
    puts "Exception on method #{__method__}: #{err}"
  end
end
248
+
249
+ # Setter to add site entry to the cache in batch (from a list)
250
def bulk_add(list,num=@max_parallel)
  # Add a list of sites to the site store, running add for each entry in
  # parallel worker processes.
  #
  # list - collection of site strings.
  # num  - number of worker processes handed to Parallel.map; defaults to
  #        the tracker's max_parallel setting.
  #
  # Returns a Hash of the newly added entries keyed by each result's 'url'
  # value; results that are nil or empty are left out. The new entries are
  # also merged into the in-memory store.
  puts "Add entries to the local site store from list:\n #{list}"
  results = Hash.new
  if list.size > 0
    puts "Start parallel adding on the sites:\n #{list}"
    outcomes = Parallel.map(list, :in_processes => num) { |target| add(target) }
    outcomes.each do |outcome|
      next if outcome.nil? || outcome.empty?
      results[outcome['url']] = outcome
    end
    @known_sites.merge!(results)
  else
    puts "Error: no entry is added. Please check your list and try again."
  end
  puts "Done adding site entries."
  puts(results.size > 0 ? "New entries added: #{results}" : "No new entry added. ")
  return results
end
283
+ alias_method :adds, :bulk_add
284
+
285
+ # Setter to remove entry from the site store one at a time
286
def delete(site)
  # Remove one site from the store, archiving its record in the
  # deactivated-site list first.
  #
  # site - site or URL string; it is stripped, lower-cased and reduced to
  #        its site form before the lookup.
  #
  # Returns the removed record, or nil when the site is not in the store or
  # an error occurs.
  puts "Remove entry from the site store: #{site} " if @verbose
  begin
    # Additional logic to deactivate the site properly, by moving it to the DeactivatedSite list, 02/07/2014
    archive = Wmap::SiteTracker::DeactivatedSite.new(:data_dir=>@data_dir)
    target = url_2_site(site.strip.downcase)
    unless @known_sites.key?(target)
      puts "Entry not fund. Skip #{target}"
      return nil
    end
    archive.add(target, @known_sites[target])
    archive.save!
    removed = @known_sites.delete(target)
    puts "Entry cleared: #{target}"
    return removed
  rescue => err
    puts "Exception on method #{__method__}: #{err}" if @verbose
    return nil
  end
end
311
+ alias_method :del, :delete
312
+
313
+ # Setter to delete site entry to the cache in batch (from a file)
314
def file_delete(file)
  # Delete every site listed in a file from the site store.
  #
  # file - path of a file that file_2_list can turn into a list of sites.
  #
  # Returns whatever bulk_delete returns for the list, an empty Array when
  # the file yields no entries, or nil when the file is missing or an error
  # occurs.
  begin
    puts "Delete entries to the local site store from file: #{file}" if @verbose
    raise "File non-exist. Please check your file path and name again: #{file}" unless File.exist?(file)
    sites = file_2_list(file)
    changes = Array.new
    changes = bulk_delete(sites) unless sites.nil? or sites.empty?
    # Return the list explicitly: the value of the `unless` modifier above is
    # nil when nothing was listed, which used to leak out as the result.
    return changes
  rescue => ee
    puts "Exception on method file_delete: #{ee} for file: #{file}" if @verbose
    return nil
  end
end
325
+ alias_method :file_del, :file_delete
326
+
327
+ # Setter to delete site entry to the cache in batch (from a list)
328
def bulk_delete(list)
  # Delete a list of sites from the store, one after another.
  #
  # list - collection of site or URL strings; each is reduced to its site
  #        form before being handed to delete.
  #
  # Returns an Array of the records that were actually removed, or nil when
  # the list is empty or an error occurs.
  puts "Delete entries to the local site store from list:\n #{list}" if @verbose
  begin
    removed = Array.new
    if list.size > 0
      list.each do |entry|
        gone = delete(url_2_site(entry))
        removed.push(gone) unless gone.nil?
      end
      puts "Done deleting sites from the list:\n #{list}"
      return removed
    else
      puts "Error: no entry is loaded. Please check your list and try again."
    end
  rescue => err
    puts "Exception on method #{__method__}: #{err}" if @verbose
  end
end
348
+ alias_method :dels, :bulk_delete
349
+
350
+ # Setter to refresh the entry in the site store one at a time
351
def refresh(site)
  # Re-probe one known site by deleting it and adding it again.
  #
  # site - site string; stripped and lower-cased before the lookup.
  #
  # Returns the result of add for a known site, or nil when the site is
  # blank, unknown, or an error occurs.
  puts "Refresh the local site store for site: #{site} "
  begin
    raise "Invalid site: #{site}" if site.nil? || site.empty?
    target = site.strip.downcase
    unless @known_sites.key?(target)
      puts "Error entry non exist: #{target}"
      return nil
    end
    delete(target)
    fresh = add(target)
    puts "Done refresh entry: #{target}"
    return fresh
  rescue => err
    puts "Exception on method #{__method__}: #{err}" if @verbose
    return nil
  end
end
370
+
371
+ # 'Refresh sites in the site store in batch (from a file)
372
def file_refresh(file)
  # Refresh every site listed in a file.
  #
  # file - path of a file that file_2_list can turn into a list of sites.
  #
  # Returns whatever bulk_refresh returns for the list, an empty Hash when
  # the file yields no entries, or nil when an error occurs.
  puts "Refresh entries in the site store from file: #{file}" if @verbose
  begin
    targets = file_2_list(file)
    return Hash.new if targets.nil? || targets.empty?
    return bulk_refresh(targets)
  rescue => err
    puts "Exception on method #{__method__}: #{err} for file: #{file}" if @verbose
  end
end
383
+
384
+ # 'Refresh unique sites in the site store only
385
def refresh_uniq_sites
  # Refresh only the sites reported by get_uniq_sites.
  #
  # Returns whatever bulk_refresh returns for those sites, an empty Hash when
  # there are none, or nil when an error occurs.
  puts "Refresh unique site entries in the site store. " if @verbose
  begin
    uniq = get_uniq_sites
    if uniq.size > 0
      return bulk_refresh(uniq)
    end
    puts "Error: no entry is refreshed. Please check your site store and try again."
    return Hash.new
  rescue => err
    puts "Exception on method #{__method__}: #{err}" if @verbose
  end
end
400
+
401
+ # 'Refresh sites in the site store in batch (from a list)
402
def bulk_refresh(list,num=@max_parallel)
  # Refresh a list of sites, running refresh for each entry in parallel
  # worker processes.
  #
  # list - collection of site strings.
  # num  - number of worker processes handed to Parallel.map; defaults to
  #        the tracker's max_parallel setting.
  #
  # Every listed site is dropped from the in-memory store and the fresh,
  # non-empty results are merged back in, keyed by each result's 'url' value.
  #
  # Returns the Hash of fresh entries, or nil when an error occurs.
  puts "Refresh entries in the site store from list:\n #{list}" if @verbose
  begin
    results = Hash.new
    if list.size > 0
      puts "Start parallel refreshing on the sites:\n #{list}"
      outcomes = Parallel.map(list, :in_processes => num) { |target| refresh(target) }
      outcomes.each do |outcome|
        next if outcome.nil? || outcome.empty?
        results[outcome['url']] = outcome
      end
      # Clean up old entries, by Y.L. 03/30/2015
      list.each { |stale| @known_sites.delete(stale) }
      # Add back fresh entries
      @known_sites.merge!(results)
      puts "Done refresh sites."
    else
      puts "Error: no entry is loaded. Please check your list and try again."
    end
    return results
  rescue => err
    puts "Exception on method #{__method__}: #{err}" if @verbose
  end
end
433
+ alias_method :refreshs, :bulk_refresh
434
+
435
+
436
+ # Refresh all site entries in the stores in one shot
437
def refresh_all
  # Refresh every site currently held in the store in one pass.
  #
  # Returns the Hash produced by bulk_refresh (also merged into the store),
  # or nil when an error occurs.
  puts "Refresh all the entries within the local site store ... "
  begin
    refreshed = bulk_refresh(@known_sites.keys)
    @known_sites.merge!(refreshed)
    puts "Done refresh all entries."
    return refreshed
  rescue => err
    puts "Exception on method #{__method__}: #{err}" if @verbose
  end
end
449
+
450
+ # Refresh all site entries in the stores that contains an IP instead of a hostname
451
def refresh_ip_sites
  # Refresh the sites whose URL carries an IP address, skipping those whose
  # stored response code is 10000 or 20000.
  #
  # Returns the Hash produced by bulk_refresh (also merged into the store),
  # or nil when an error occurs.
  puts "Refresh all entries that contain an IP address instead of a FQDN ... "
  begin
    skip_codes = [10000, 20000]
    live_sites = get_ip_sites.delete_if { |x| skip_codes.include?(@known_sites[x]['code']) }
    refreshed = bulk_refresh(live_sites)
    @known_sites.merge!(refreshed)
    puts "Done refresh IP sites."
    return refreshed
  rescue => err
    puts "Exception on method #{__method__}: #{err}" if @verbose
  end
end
465
+
466
+ # Quick validation if a site is already covered under the site store
467
def site_known?(site)
  # Tell whether a site is already present in the site store.
  #
  # site - site or URL string; stripped, lower-cased and reduced to its site
  #        form before the lookup.
  #
  # Returns true or false; false as well when the store is not loaded or the
  # lookup fails (the error is printed).
  begin
    raise "Web site store not loaded properly! " if @known_sites.nil?
    site = site.strip.downcase unless site.nil?
    site = url_2_site(site)
    return site.nil? ? false : @known_sites.key?(site)
  rescue => err
    puts "Error checking web site #{site} against the site store: #{err}"
  end
  return false
end
478
+ alias_method :is_known?, :site_known?
479
+
480
+ # Quick validation check on an IP is already part of the site store
481
def site_ip_known?(ip)
  # Tell whether any site in the store is recorded with the given IP.
  #
  # ip - IP address string; surrounding whitespace is ignored.
  #
  # Returns true when a stored record's 'ip' equals the address, false
  # otherwise (including when the input is not an IP or an error occurs).
  begin
    ip = ip.chomp.strip
    return false unless is_ip?(ip)
    return @known_sites.keys.any? { |site| @known_sites[site]['ip'] == ip }
  rescue => err
    puts "Exception on method #{__method__}: #{err}"
    return false
  end
end
499
+ alias_method :siteip_known?, :site_ip_known?
500
+
501
+ # Quick check of the stored information of a site within the store
502
def site_check(site)
  # Fetch the stored record of a site.
  #
  # site - site or URL string; stripped, lower-cased and reduced to its site
  #        form before the lookup.
  #
  # Returns the record Hash, or nil when the site is unknown, the store is
  # not loaded, or the lookup fails (the error is printed).
  begin
    raise "Web site store not loaded properly! " if @known_sites.nil?
    lookup = url_2_site(site.nil? ? site : site.strip.downcase)
    return lookup.nil? ? nil : @known_sites[lookup]
  rescue => err
    puts "Exception on method #{__method__}: #{err}"
    return nil
  end
end
513
+ alias_method :check, :site_check
514
+
515
+ # Retrieve external hosted sites into a list
516
def get_ext_sites
  # List the sites whose stored status is "ext_hosted".
  #
  # Returns a sorted Array of site keys, or nil when an error occurs.
  puts "getter to retrieve all the external hosted sites. " if @verbose
  begin
    return @known_sites.keys.select { |key| @known_sites[key]['status'] == "ext_hosted" }.sort
  # Rescue StandardError only: rescuing Exception would also swallow
  # Interrupt and SystemExit.
  rescue StandardError => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
532
+ alias_method :get_ext, :get_ext_sites
533
+
534
+ # Retrieve a list of internal hosted site URLs
535
def get_int_sites
  # List the sites whose stored status is "int_hosted".
  #
  # Returns a sorted Array of site keys, or nil when an error occurs.
  puts "getter to retrieve all the internal hosted sites." if @verbose
  begin
    return @known_sites.keys.select { |key| @known_sites[key]['status'] == "int_hosted" }.sort
  # Rescue StandardError only: rescuing Exception would also swallow
  # Interrupt and SystemExit.
  rescue StandardError => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
551
+ alias_method :get_int, :get_int_sites
552
+
553
+ # Retrieve a list of sites that contain an IP in the site URL
554
def get_ip_sites
  # List the sites whose URL host part is an IP address rather than a name.
  #
  # Returns a sorted Array of site keys, or nil when an error occurs.
  puts "Getter to retrieve sites contain an IP instead of a host-name ." if @verbose
  begin
    return @known_sites.keys.select { |key| is_ip?(url_2_host(key)) }.sort
  # Rescue StandardError only: rescuing Exception would also swallow
  # Interrupt and SystemExit.
  rescue StandardError => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
571
+
572
+ # Retrieve a list of unique sites within the known site store
573
def get_uniq_sites
  # List one site per unique "IP:PORT" pair.
  #
  # Sites with response code 10000 or 20000, sites without an MD5
  # finger-print, and sites whose host cannot be resolved are left out. When
  # several sites share an IP:PORT pair, the first one met in store order
  # wins.
  #
  # Returns an Array of site keys, or nil when an error occurs.
  puts "Getter to retrieve unique sites containing unique IP:PORT key identifier." if @verbose
  begin
    sites = Hash.new
    my_tracker = Wmap::HostTracker.new(:data_dir=>@data_dir)
    @known_sites.keys.each do |key|
      md5 = @known_sites[key]['md5']
      code = @known_sites[key]['code']
      # Cheap filters first, so no lookup is spent on entries that are
      # dropped anyway.
      # filtering out 'un-reachable' sites
      next if (code == 10000 or code == 20000)
      # filtering out 'empty' sites
      next if (md5.nil? or md5.empty?)
      port = url_2_port(key).to_s
      host = url_2_host(key)
      # Local hosts table first, then the resolver as fall-back. The receiver
      # was misspelled 'my_trakcer', which raised NameError and made this
      # method return nil for any non-empty store.
      ip = my_tracker.local_host_2_ip(host)
      ip = host_2_ip(host) if ip.nil?
      next if ip.nil?
      id = ip + ":" + port
      # filtering out duplicates by 'IP:PORT' key pair
      sites[id] = key unless sites.key?(id)
    end
    return sites.values
  # Rescue StandardError only: rescuing Exception would also swallow
  # Interrupt and SystemExit.
  rescue StandardError => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
619
+ alias_method :uniq_sites, :get_uniq_sites
620
+
621
+ # Retrieve a list of sites that contain an IP in the site URL
622
def get_ssl_sites
  # List the sites whose URL uses the https scheme.
  #
  # Returns a sorted Array of site keys, or nil when an error occurs.
  puts "getter to retrieve https sites from the site store." if @verbose
  begin
    # The match result used to be discarded and every key was pushed, so
    # plain http sites were returned too. Anchor on the scheme so that a host
    # name merely containing "https" does not qualify.
    return @known_sites.keys.select { |key| key =~ %r{\Ahttps://}i }.sort
  # Rescue StandardError only: rescuing Exception would also swallow
  # Interrupt and SystemExit.
  rescue StandardError => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
637
+
638
+ # Retrieve a list of redirection URLs from the site store
639
def get_redirection_urls
  # Collect the redirection URLs recorded for the sites in the store.
  #
  # Records without a redirection are skipped. That covers nil as well as the
  # empty string, which is what an empty field turns into once the store has
  # been reloaded from its file.
  #
  # Returns a sorted Array of redirection URLs (duplicates are kept), or nil
  # when an error occurs.
  puts "getter to retrieve all the redirection URLs from the site store." if @verbose
  begin
    urls = Array.new
    @known_sites.keys.each do |key|
      target = @known_sites[key]['redirection']
      next if target.nil? || target.empty?
      urls.push(target)
    end
    urls.sort!
    return urls
  # Rescue StandardError only: rescuing Exception would also swallow
  # Interrupt and SystemExit.
  rescue StandardError => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
655
+
656
+ # Retrieve redirection URL if available
657
def get_redirection_url (site)
  # Look up the redirection URL recorded for one site.
  #
  # site - site string; stripped and lower-cased before the lookup.
  #
  # Returns the stored 'redirection' value, or nil when the site is unknown
  # or an error occurs.
  puts "getter to retrieve the redirection URL from the site store." if @verbose
  begin
    site = site.strip.downcase
    return @known_sites[site]['redirection'] if @known_sites.key?(site)
    puts "Unknown site: #{site}" if @verbose
    return nil
  # Rescue StandardError only: rescuing Exception would also swallow
  # Interrupt and SystemExit.
  rescue StandardError => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
672
+
673
+ # Perform local host table reverse lookup for the IP sites, in hope that the hostname could now be resolved since the site was discovered
674
def resolve_ip_sites
  # Refresh the IP-based sites whose address now has a host name in the local
  # hosts table.
  #
  # Returns a sorted Array of the sites that were refreshed, or nil when an
  # error occurs.
  puts "Resolve sites that contain an IP address. Update the site cache table once a hostname is found in the local host table." if @verbose
  begin
    updates = Array.new
    sites = get_ip_sites
    my_tracker = Wmap::HostTracker.new(:data_dir=>@data_dir)
    sites.each do |site|
      puts "Work on resolve the IP site: #{site}" if @verbose
      ip = url_2_host(site)
      hostname = my_tracker.local_ip_2_host(ip)
      if hostname.nil?
        puts "Can't resolve #{ip} from the local host store. Skip #{site}" if @verbose
      else
        puts "Host-name found for IP #{ip}: #{hostname}" if @verbose
        updates.push(site)
        refresh(site)
      end
    end
    updates.sort!
    puts "The following sites are now refreshed: #{updates}" if @verbose
    return updates
  # Rescue StandardError only: rescuing Exception would also swallow
  # Interrupt and SystemExit.
  rescue StandardError => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
700
+
701
+ # Search potential matching sites from the site store by using simple regular expression. Note that any upper-case char in the search string will be automatically converted into lower case
702
def search (pattern)
  # Find the sites whose key matches a simple regular expression.
  #
  # pattern - regular expression source as a String. It is stripped and
  #           lower-cased before use, and matching is case-insensitive.
  #           Lower-casing also affects escape classes (e.g. "\S" becomes
  #           "\s").
  #
  # Returns an Array of matching site keys in store order, or nil when the
  # pattern is not a valid regular expression or another error occurs.
  puts "Search site store based on the regular expression: #{pattern}" if @verbose
  begin
    pattern = pattern.strip.downcase
    # Compile once instead of once per key.
    matcher = Regexp.new(pattern, Regexp::IGNORECASE)
    return @known_sites.keys.select { |key| key =~ matcher }
  # Rescue StandardError only: rescuing Exception would also swallow
  # Interrupt and SystemExit.
  rescue StandardError => ee
    puts "Exception on method search: #{ee}" if @verbose
    return nil
  end
end
718
+
719
+ # Print summary report on all sites that contain an IP in the site URL
720
def print_ip_sites
  # Print, one per line, every site reported by get_ip_sites, framed by a
  # title line and an end-of-report line. Returns nil.
  puts "Print sites contain an IP instead of a host-name."
  get_ip_sites.each do |entry|
    puts entry
  end
  puts "End of report. "
end
726
+
727
+ # Retrieve and print specific information of a site in the site store
728
def print_site(site)
  # Print the stored record of one site as a single comma separated line:
  # site, ip, port, status, server, response code, md5, redirection,
  # timestamp.
  #
  # site - site key; only surrounding whitespace is removed before the
  #        lookup.
  #
  # Returns nil. An unknown site is reported on standard output.
  puts "Site Information Report for: #{site}" if @verbose
  begin
    site = site.strip unless site.nil?
    raise "Unknown site: #{site}" unless @known_sites.key?(site)
    rec = @known_sites[site]
    puts "#{site},#{rec['ip']},#{rec['port']},#{rec['status']},#{rec['server']},#{rec['code']},#{rec['md5']},#{rec['redirection']},#{rec['timestamp']}"
  rescue => err
    puts "Exception on method #{__method__} for #{site}: #{err}"
  end
end
746
+ alias_method :print, :print_site
747
+
748
+
749
+ # Print summary report of all sites URL in the site store
750
def print_all_sites
  # Print every site key in the store in sorted order, one per line, between
  # a title line and a closing line. Returns nil.
  puts "\nSummary Report of the site store:"
  @known_sites.keys.sort.each { |entry| puts entry }

  puts "End of the summary"
end
760
+ alias_method :print_all, :print_all_sites
761
+
762
+ # Retrieve and save unique sites information for the quarterly scan into a plain local file
763
# Write a comma-separated report of the sites returned by
# get_prim_uniq_sites to a local file.
#
# The file receives a title line, a column header line, then one row per
# site: site, ip, port, server, status, code, md5, redirection, timestamp.
# nil entries in the site list are skipped. A site missing from @known_sites
# aborts the export.
#
# @param file [String] path of the file to (over)write
# @return [Boolean] true on success, false if any error occurred (the error
#   is printed to stdout, not raised)
def save_uniq_sites(file)
  puts "Save unique sites information into a flat file: #{file}\nThis may take a long while as it go through a lengthy self correction check process, please be patient ..."
  prime_sites = get_prim_uniq_sites
  puts "Primary Sites: #{prime_sites}" if @verbose
  # Block form guarantees the handle is closed even when a row raises mid-way.
  File.open(file, "w") do |f|
    f.write "Unique Sites Information Report\n"
    f.write "Site, IP, Port, Server, Hosting, Response Code, MD5, Redirect, Timestamps\n"
    prime_sites.each do |key|
      next if key.nil?
      site = key.strip
      raise "Unknown site: #{site}. You may need to add it into the site store first. Execute the following shell command before trying again: \n\twadd #{site}\n" unless @known_sites.key?(site)
      record = @known_sites[site]
      ip, port, server, status, res, fp, loc, timestamp =
        %w[ip port server status code md5 redirection timestamp].map { |field| record[field] }
      f.write "#{site},#{ip},#{port},#{server},#{status},#{res},#{fp},#{loc},#{timestamp}\n"
    end
  end
  puts "Done!"
  true # success
rescue => ee
  puts "Exception on method #{__method__}: #{ee}"
  false # fail
end
793
+ alias_method :dump, :save_uniq_sites
794
+
795
+ # Retrieve and save unique sites information for the quarterly scan into a XML file
796
# Write an XML report of the sites returned by get_prim_uniq_sites to a file.
#
# The document is built with Nokogiri::XML::Builder as
# root > websites > site, each site carrying name, ip, port, status, server,
# fingerprint (the 'md5' field), redirection, responsecode (the 'code' field)
# and timestamp elements. nil entries in the site list are skipped. A site
# missing from @known_sites aborts the export before anything is written.
#
# @param file [String] path of the file to (over)write
# @return [Boolean] true on success, false if any error occurred (the error
#   is printed to stdout, not raised)
def save_uniq_sites_xml(file)
  puts "Save unique sites information into XML file: #{file}\nThis may take a long while as it go through lengthy self correctness check, please be patient ..."
  prime_sites = get_prim_uniq_sites
  builder = Nokogiri::XML::Builder.new do |xml|
    xml.root {
      xml.websites {
        prime_sites.each do |key|
          next if key.nil?
          site = key.strip
          raise "Unknown site: #{site}. You may need to add it into the site store first. Execute the following shell command before trying again: \n\twmap #{site}\n" unless @known_sites.key?(site)
          record = @known_sites[site]
          xml.site {
            xml.name site
            xml.ip_ record['ip']
            xml.port_ record['port']
            xml.status_ record['status']
            xml.server_ record['server']
            xml.fingerprint_ record['md5']
            xml.redirection_ record['redirection']
            xml.responsecode_ record['code']
            xml.timestamp_ record['timestamp']
          }
        end
      }
    }
  end
  # Serialize once and reuse the result for both the verbose echo and the file.
  document = builder.to_xml
  puts document if @verbose
  # Block form closes the handle even if the write itself fails.
  File.open(file, 'w') { |f| f.write(document) }
  puts "Done!"
  true
rescue => ee
  puts "Exception on method #{__method__}: #{ee}"
  false
end
833
+ alias_method :dump_xml, :save_uniq_sites_xml
834
+
835
+ # Retrieve the unique sites from the local site store in the primary host format
836
# Return the unique sites (from get_uniq_sites) with each host name swapped
# for its primary host name where that can be done consistently.
#
# A site is passed through unchanged when:
#   * its host part is an IP address;
#   * the host tracker's alias count for the site IP is nil or 1;
#   * the IP is not a key of the primary host tracker's known_hosts, or the
#     alias count is not greater than 1;
#   * priming yields the same host;
#   * the primed site is stored with a different IP than the original site.
#
# NOTE(review): host_tracker.alias is read once per site here; this assumes
# it is a plain lookup table reader - confirm against Wmap::HostTracker.
#
# @return [Array<String>, nil] the primed site list, or nil when an error
#   occurred (the error is printed to stdout; callers in this class rely on
#   the nil to fail their own processing)
def get_prim_uniq_sites
  puts "Retrieve and prime unique sites in the site store. " if @verbose
  host_tracker = Wmap::HostTracker.new(:data_dir=>@data_dir)
  primary_host_tracker = Wmap::HostTracker::PrimaryHost.new(:data_dir=>@data_dir)
  # Step 1. Retrieve the unique site list first.
  # Step 2. Map every unique site to its primary-host form, one at a time.
  get_uniq_sites.map do |site|
    puts "Work on priming unique site: #{site}" if @verbose
    host = url_2_host(site)
    # IP-only site: keep as is, without consulting the site store record.
    next site if is_ip?(host)
    ip = @known_sites[site]['ip']
    alias_count = host_tracker.alias[ip]
    puts "Local hosts table entry count for #{ip}: #{alias_count}" if @verbose
    # Unique IP, or no alias entry at all (e.g. several IPs behind one A record).
    next site if alias_count == 1 || alias_count.nil?
    # Only an IP shared by several hosts and known to the primary host table is primed.
    next site unless primary_host_tracker.known_hosts.key?(ip) && alias_count > 1
    new_host = primary_host_tracker.prime(host)
    puts "Host: #{host}, New host:#{new_host}" if @verbose
    next site if host == new_host
    new_site = site.sub(host, new_host)
    raise "Site not found in the site tracking data repository: #{new_site}. You may need to add it into the site store first. Execute the following shell command before trying again: \n\twadd #{new_site}\n" unless @known_sites.key?(new_site)
    # Consistency check: accept the primed site only if it resolves to the same IP.
    # TBD - case of multiple IPs for A DNS record (the original site is kept for now).
    @known_sites[new_site]['ip'] == ip ? new_site : site
  end
rescue => ee
  puts "Exception on method #{__method__}: #{ee}"
  nil
end
891
+ alias_method :get_prime, :get_prim_uniq_sites
892
+
893
+ # Print summary report of the externally hosted site URLs in the site store
894
# Print a header line followed by every site returned by get_ext_sites,
# one per line.
#
# @return [nil]
def print_ext_sites
  puts "\nSummary Report of the External Hosted Site"
  get_ext_sites.each { |url| puts url }
  nil
end
902
+ alias_method :print_ext, :print_ext_sites
903
+
904
+ # Print summary report of internal hosted site URLs
905
# Print a header line followed by every site returned by get_int_sites,
# one per line.
#
# @return [nil]
def print_int_sites
  puts "\nSummary Report of the Internal Hosted Site"
  get_int_sites.each { |url| puts url }
  nil
end
913
+ alias_method :print_int, :print_int_sites
914
+
915
+ # Print summary report of the HTTPS (SSL) site URLs in the site store
916
# Print a header line followed by every site returned by get_ssl_sites,
# one per line.
#
# @return [nil]
def print_ssl_sites
  puts "\nSummary Report of the HTTPS Sites from the Site Store"
  get_ssl_sites.each { |url| puts url }
  nil
end
924
+
925
+ # Print summary report of unique sites in the site store
926
# Print a title line and a CSV column header, then call print_site for every
# site returned by get_uniq_sites.
#
# @return the collection returned by get_uniq_sites (the receiver of each)
def print_uniq_sites
  puts "Summary Report for the Unique sites:"
  puts "Website,Primary IP,Port,Hosting Status,Server,Response Code,Site MD5 Finger-print,Site Redirection,Timestamp"
  get_uniq_sites.each { |url| print_site(url) }
end
934
+
935
+ private
936
+
937
+ end