wmap 2.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (141) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +141 -0
  3. data/LICENSE.txt +15 -0
  4. data/README.rdoc +98 -0
  5. data/TODO +13 -0
  6. data/bin/deprime +21 -0
  7. data/bin/distrust +38 -0
  8. data/bin/googleBot +23 -0
  9. data/bin/prime +21 -0
  10. data/bin/refresh +26 -0
  11. data/bin/run_tests +16 -0
  12. data/bin/spiderBot +26 -0
  13. data/bin/trust +38 -0
  14. data/bin/updateAll +57 -0
  15. data/bin/wadd +25 -0
  16. data/bin/wadds +26 -0
  17. data/bin/wcheck +28 -0
  18. data/bin/wdel +25 -0
  19. data/bin/wdump +21 -0
  20. data/bin/wmap +151 -0
  21. data/bin/wscan +32 -0
  22. data/data/cidrs +2 -0
  23. data/data/deactivated_sites +1 -0
  24. data/data/domains +2 -0
  25. data/data/hosts +1 -0
  26. data/data/prime_hosts +1 -0
  27. data/data/sites +2 -0
  28. data/data/sub_domains +2 -0
  29. data/demos/bruter.rb +27 -0
  30. data/demos/dns_brutes.rb +28 -0
  31. data/demos/filter_cidr.rb +18 -0
  32. data/demos/filter_crawls.rb +5 -0
  33. data/demos/filter_domain.rb +25 -0
  34. data/demos/filter_geoip.rb +26 -0
  35. data/demos/filter_known_services.rb +59 -0
  36. data/demos/filter_netinfo.rb +23 -0
  37. data/demos/filter_prime.rb +25 -0
  38. data/demos/filter_profiler.rb +3 -0
  39. data/demos/filter_redirection.rb +19 -0
  40. data/demos/filter_site.rb +40 -0
  41. data/demos/filter_siteip.rb +31 -0
  42. data/demos/filter_status.rb +17 -0
  43. data/demos/filter_timestamp.rb +23 -0
  44. data/demos/filter_url.rb +19 -0
  45. data/demos/new_fnd.rb +66 -0
  46. data/demos/nmap_parser.pl +138 -0
  47. data/demos/site_format.rb +18 -0
  48. data/demos/whois_domain.rb +78 -0
  49. data/dicts/GeoIP.dat +0 -0
  50. data/dicts/GeoIPASNum.dat +0 -0
  51. data/dicts/GeoLiteCity.dat +0 -0
  52. data/dicts/ccsld.txt +2646 -0
  53. data/dicts/cctld.txt +243 -0
  54. data/dicts/gtld.txt +25 -0
  55. data/dicts/hostnames-dict.big +1402 -0
  56. data/dicts/hostnames-dict.txt +101 -0
  57. data/lib/wmap/cidr_tracker.rb +327 -0
  58. data/lib/wmap/dns_bruter.rb +308 -0
  59. data/lib/wmap/domain_tracker/sub_domain.rb +142 -0
  60. data/lib/wmap/domain_tracker.rb +342 -0
  61. data/lib/wmap/geoip_tracker.rb +72 -0
  62. data/lib/wmap/google_search_scraper.rb +177 -0
  63. data/lib/wmap/host_tracker/primary_host.rb +130 -0
  64. data/lib/wmap/host_tracker.rb +550 -0
  65. data/lib/wmap/network_profiler.rb +144 -0
  66. data/lib/wmap/port_scanner.rb +208 -0
  67. data/lib/wmap/site_tracker/deactivated_site.rb +85 -0
  68. data/lib/wmap/site_tracker.rb +937 -0
  69. data/lib/wmap/url_checker.rb +314 -0
  70. data/lib/wmap/url_crawler.rb +381 -0
  71. data/lib/wmap/utils/domain_root.rb +184 -0
  72. data/lib/wmap/utils/logger.rb +53 -0
  73. data/lib/wmap/utils/url_magic.rb +343 -0
  74. data/lib/wmap/utils/utils.rb +333 -0
  75. data/lib/wmap/whois.rb +76 -0
  76. data/lib/wmap.rb +227 -0
  77. data/logs/wmap.log +17 -0
  78. data/ruby_whois_patches/base_cocca2.rb +149 -0
  79. data/ruby_whois_patches/kero.yachay.pe.rb +120 -0
  80. data/ruby_whois_patches/whois.PublicDomainRegistry.com.rb +124 -0
  81. data/ruby_whois_patches/whois.above.com.rb +61 -0
  82. data/ruby_whois_patches/whois.adamsnames.tc.rb +107 -0
  83. data/ruby_whois_patches/whois.aeda.net.ae.rb +105 -0
  84. data/ruby_whois_patches/whois.ai.rb +112 -0
  85. data/ruby_whois_patches/whois.arnes.si.rb +121 -0
  86. data/ruby_whois_patches/whois.ascio.com.rb +91 -0
  87. data/ruby_whois_patches/whois.cnnic.cn.rb +123 -0
  88. data/ruby_whois_patches/whois.corporatedomains.com.rb +67 -0
  89. data/ruby_whois_patches/whois.crsnic.net.rb +108 -0
  90. data/ruby_whois_patches/whois.denic.de.rb +174 -0
  91. data/ruby_whois_patches/whois.dk-hostmaster.dk.rb +120 -0
  92. data/ruby_whois_patches/whois.dns.be.rb +134 -0
  93. data/ruby_whois_patches/whois.dns.lu.rb +129 -0
  94. data/ruby_whois_patches/whois.dns.pl.rb +150 -0
  95. data/ruby_whois_patches/whois.dns.pt.rb +119 -0
  96. data/ruby_whois_patches/whois.domain.kg.rb +126 -0
  97. data/ruby_whois_patches/whois.domainregistry.my.rb +123 -0
  98. data/ruby_whois_patches/whois.domreg.lt.rb +110 -0
  99. data/ruby_whois_patches/whois.dot.tk.rb +140 -0
  100. data/ruby_whois_patches/whois.hkirc.hk.rb +121 -0
  101. data/ruby_whois_patches/whois.isnic.is.rb +130 -0
  102. data/ruby_whois_patches/whois.je.rb +119 -0
  103. data/ruby_whois_patches/whois.jprs.jp.rb +137 -0
  104. data/ruby_whois_patches/whois.kenic.or.ke.rb +140 -0
  105. data/ruby_whois_patches/whois.markmonitor.com.rb +118 -0
  106. data/ruby_whois_patches/whois.melbourneit.com.rb +58 -0
  107. data/ruby_whois_patches/whois.nic.as.rb +96 -0
  108. data/ruby_whois_patches/whois.nic.at.rb +109 -0
  109. data/ruby_whois_patches/whois.nic.ch.rb +141 -0
  110. data/ruby_whois_patches/whois.nic.cl.rb +117 -0
  111. data/ruby_whois_patches/whois.nic.ec.rb +157 -0
  112. data/ruby_whois_patches/whois.nic.im.rb +120 -0
  113. data/ruby_whois_patches/whois.nic.it.rb +170 -0
  114. data/ruby_whois_patches/whois.nic.lv.rb +116 -0
  115. data/ruby_whois_patches/whois.nic.ly.rb +127 -0
  116. data/ruby_whois_patches/whois.nic.mu.rb +27 -0
  117. data/ruby_whois_patches/whois.nic.mx.rb +123 -0
  118. data/ruby_whois_patches/whois.nic.net.sa.rb +111 -0
  119. data/ruby_whois_patches/whois.nic.or.kr.rb +101 -0
  120. data/ruby_whois_patches/whois.nic.tel.rb +129 -0
  121. data/ruby_whois_patches/whois.nic.tr.rb +133 -0
  122. data/ruby_whois_patches/whois.nic.us.rb +129 -0
  123. data/ruby_whois_patches/whois.nic.ve.rb +135 -0
  124. data/ruby_whois_patches/whois.norid.no.rb +127 -0
  125. data/ruby_whois_patches/whois.pandi.or.id.rb +118 -0
  126. data/ruby_whois_patches/whois.psi-usa.info.rb +63 -0
  127. data/ruby_whois_patches/whois.registro.br.rb +109 -0
  128. data/ruby_whois_patches/whois.registrygate.com.rb +55 -0
  129. data/ruby_whois_patches/whois.rrpproxy.net.rb +61 -0
  130. data/ruby_whois_patches/whois.sgnic.sg.rb +130 -0
  131. data/ruby_whois_patches/whois.srs.net.nz.rb +166 -0
  132. data/ruby_whois_patches/whois.tucows.com.rb +70 -0
  133. data/ruby_whois_patches/whois.twnic.net.tw.rb +133 -0
  134. data/settings/discovery_ports +24 -0
  135. data/settings/google_keywords.txt +9 -0
  136. data/settings/google_locator.txt +23 -0
  137. data/test/domain_tracker_test.rb +31 -0
  138. data/test/utils_test.rb +168 -0
  139. data/version.txt +13 -0
  140. data/wmap.gemspec +49 -0
  141. metadata +202 -0
@@ -0,0 +1,937 @@
1
+ #--
2
+ # Wmap
3
+ #
4
+ # A pure Ruby library for the Internet web application discovery and tracking.
5
+ #
6
+ # Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
7
+ #++
8
+ require "parallel"
9
+ #require "singleton"
10
+ require "nokogiri"
11
+
12
+
13
+ # Main class to automatically track the site inventory
14
+ class Wmap::SiteTracker
15
+ include Wmap::Utils
16
+ #include Singleton
17
+
18
+ attr_accessor :sites_file, :max_parallel, :verbose, :data_dir
19
+ attr_reader :known_sites
20
+
21
+ # Set default instance variables
22
# Build a site tracker and load the persisted site store into memory.
#
# @param params [Hash] optional settings:
#   :data_dir     - directory holding the data files
#   :sites_file   - path of the site store file (defaults to "<data_dir>/sites")
#   :verbose      - print progress messages when true (default false)
#   :max_parallel - worker count handed to the bulk operations (default 30)
def initialize(params = {})
  @data_dir = params.fetch(:data_dir, File.dirname(__FILE__) + '/../../data/')
  # File.join tolerates a :data_dir supplied without a trailing slash
  @file_sites = File.join(@data_dir, 'sites')
  @file_stores = params.fetch(:sites_file, @file_sites)
  # Back the public sites_file reader, which previously always returned nil
  @sites_file = @file_stores
  @verbose = params.fetch(:verbose, false)
  @max_parallel = params.fetch(:max_parallel, 30)
  # Make sure the store file exists before the first load
  File.write(@file_stores, "") unless File.exist?(@file_stores)
  # Hash table to hold the site store
  @known_sites = load_site_stores_from_file(@file_stores)
end
33
+
34
+ # Setter to load the known hosts into an instance variable
35
# Parse a site store file into a hash keyed by the lower-cased site URL.
#
# Each data line holds tab- or comma-separated fields in this order:
# site, ip, port, status, server, response code, md5, redirection, timestamp.
# Blank lines and lines starting with '#' are ignored; a line without a site
# field is skipped instead of aborting the whole load.
#
# @param file [String] path of the site store file
# @return [Hash{String=>Hash}, nil] the parsed store, or nil when the file cannot be read
def load_site_stores_from_file(file)
  puts "Loading the site store data repository from file: #{file} " if @verbose
  known_sites = {}
  # Block form guarantees the handle is closed even when parsing raises
  File.open(file, 'r') do |f|
    f.each_line do |raw|
      line = raw.strip
      next if line.empty? || line.start_with?('#')
      entry = line.split(%r{\t+|\,})
      next if entry[0].nil? || entry[0].empty?
      site = entry[0].downcase
      puts "Loading entry: #{site} - #{entry[1]} - #{entry[3]}" if @verbose
      known_sites[site] = {
        'ip' => entry[1],
        'port' => entry[2],
        'status' => entry[3],
        'server' => entry[4],
        'code' => entry[5].to_i,
        'md5' => entry[6],
        'redirection' => entry[7],
        'timestamp' => entry[8]
      }
    end
  end
  puts "Successfully loading file: #{file}" if @verbose
  known_sites
rescue => ee
  puts "Exception on method #{__method__} for file #{file}: #{ee}"
  nil
end
73
+
74
+ # Save the current site store hash table into a file
75
# Write the in-memory site store to a CSV-style flat file, one site per line,
# sorted by site URL and preceded by two '#' header lines.
#
# @param file_sites [String] destination path (defaults to the loaded store file)
# @return [nil]
def save_sites_to_file!(file_sites=@file_stores)
  puts "Saving the current site store table from memory to file: #{file_sites}"
  timestamp = Time.now
  # Block form closes the handle even if a write fails part-way through
  File.open(file_sites, 'w') do |f|
    f.write "# Local site store created by class #{self.class} method #{__method__} at: #{timestamp}\n"
    f.write "# Website,Primary IP,Port,Hosting Status,Server,Response Code,MD5 Finger-print,Redirection,Timestamp\n"
    @known_sites.keys.sort.each do |key|
      record = @known_sites[key]
      f.write "#{key},#{record['ip']},#{record['port']},#{record['status']},#{record['server']},#{record['code']},#{record['md5']},#{record['redirection']},#{record['timestamp']}\n"
    end
  end
  puts "site store table is successfully saved: #{file_sites}"
rescue => ee
  puts "Exception on method #{__method__}: #{ee}"
end
91
+ alias_method :save!, :save_sites_to_file!
92
+
93
+ # Count numbers of entries in the site store table
94
# Report how many entries the in-memory site store holds.
#
# @return [Integer, nil] number of entries; nil when the size cannot be read
def count
  puts "Counting number of entries in the site store table ..."
  @known_sites.size
rescue => ee
  puts "Exception on method #{__method__}: #{ee}"
end
102
+
103
+ # Setter to add site entry to the cache one at a time
104
# Add a single site to the in-memory site store.
#
# The site is normalised, checked against the trusted CIDR / domain trackers,
# probed with Wmap::UrlChecker, and stored only when trusted and reachable.
# An IP-based site is rewritten to use a host name when one is found locally
# or by reverse DNS. The local host table is then brought in line.
#
# @param site [String] site URL, e.g. "http://your_website_name/"
# @return [Hash, nil] the UrlChecker result stored for the site, or nil when
#   the site is skipped (already known, untrusted, down, or any error)
def add(site)
  puts "Add entry to the site store: #{site}"
  # Preliminary sanity check
  site = site.strip.downcase unless site.nil?
  raise "Site is already exist. Skip #{site}" if site_known?(site)
  site = normalize_url(site) if is_url?(site)
  site = url_2_site(site) if is_url?(site)
  puts "Site in standard format: #{site}" if @verbose
  raise "Exception on method #{__method__}: invalid site format of #{site}. Expected format is: http://your_website_name/" unless is_site?(site)
  host = url_2_host(site)
  ip = host_2_ip(host)
  # add record only if trusted
  unless add_target_trusted?(host, ip)
    puts "Problem found: untrusted Internet domain or IP. Skip #{site}"
    return nil
  end
  # Check the site status before adding it
  checker = Wmap::UrlChecker.new(:data_dir=>@data_dir).check(site)
  raise "Site is currently down. Skip #{site}" if checker.nil?
  # An un-responsive http site is skipped; an https site is kept so the SSL layer can be analysed later
  raise "Site is currently down. Skip #{site}" if !is_https?(site) && checker['code']==10000
  raise "Exception on add method - Fail to resolve the host-name: Host - #{host}, IP - #{ip}. Skip #{site}" unless is_ip?(ip)
  my_tracker = Wmap::HostTracker.new(:data_dir=>@data_dir)
  started_as_ip = is_ip?(host)
  if started_as_ip
    # Case #1: trusted site contains an IP - swap in a host name when one is found
    resolved = add_lookup_hostname(my_tracker, host, ip)
    unless resolved.nil?
      host = resolved
      site = site.sub(/\d+\.\d+\.\d+\.\d+/, host)
    end
    raise "Site already exist! Skip #{site}" if @known_sites.key?(site)
    puts "Adding site: #{site}" if @verbose
  else
    # Case #2: trusted site contains a valid FQDN
    puts "Ading site: #{site}" if @verbose
  end
  add_store_entry(site, checker)
  # Case #1 touches the host table only once a host name is known; case #2 always does
  add_sync_host_table(my_tracker, host, ip) if !started_as_ip || is_fqdn?(host)
  checker
rescue => ee
  puts "Exception on method #{__method__}: #{ee}"
  nil
end

# Decide whether the target is trusted: by CIDR for an IP host, by domain root otherwise.
def add_target_trusted?(host, ip)
  if is_ip?(host)
    Wmap::CidrTracker.new(:data_dir=>@data_dir).ip_trusted?(ip)
  else
    root = get_domain_root(host)
    raise "Invalid web site format. Please check your record again." if root.nil?
    Wmap::DomainTracker.new(:data_dir=>@data_dir).domain_known?(root)
  end
end

# Find a host name for an IP host: local host table first, then reverse DNS. Returns nil when none is usable.
def add_lookup_hostname(tracker, host, ip)
  if tracker.ip_known?(host)
    puts "Local hosts table lookup for IP: #{ip}" if @verbose
    name = tracker.local_ip_2_host(host)
    puts "Host found from the local hosts table for #{ip}: #{name}" if @verbose
    return name
  end
  # Reverse DNS lookup over Internet as secondary precaution
  puts "Reverse DNS lookup for IP: #{ip}" if @verbose
  host1 = ip_2_host(host)
  puts "host1: #{host1}" if @verbose
  return nil unless is_fqdn?(host1)
  # replace IP with host-name only if domain root is known
  return nil unless tracker.domain_known?(host1)
  puts "Host found from the Internet reverse DNS lookup for #{ip}: #{host1}" if @verbose
  host1
end

# Record the checker result for the site and reactivate it if it sits on the deactivated list.
def add_store_entry(site, checker)
  @known_sites[site] = checker
  deact = Wmap::SiteTracker::DeactivatedSite.new(:data_dir=>@data_dir)
  if deact.site_known?(site)
    deact.delete(site)
    deact.save!
  end
  puts "Site entry loaded: #{checker}"
end

# Add the host to the local host table, or refresh it when its recorded IP differs.
def add_sync_host_table(tracker, host, ip)
  puts "Update local hosts table for host: #{host}"
  if tracker.host_known?(host)
    if tracker.local_host_2_ip(host) != ip
      # The original called the misspelled `efresh`, which raised NoMethodError
      tracker.refresh(host)
      tracker.save!
    else
      puts "Host resolve to the same IP #{ip} - no need to update the local host table." if @verbose
    end
  else
    tracker.add(host)
    tracker.save!
  end
end
private :add_target_trusted?, :add_lookup_hostname, :add_store_entry, :add_sync_host_table
233
+
234
+ # Setter to add site entry to the cache table in batch (from a file)
235
# Add every site listed in a file to the site store.
#
# @param file [String] path of a file that file_2_list can turn into a list of sites
# @return [Hash, nil] the entries added (empty when the file lists no sites);
#   nil when the file is missing or loading raises
def file_add(file)
  puts "Add entries to the local site store from file: #{file}"
  raise "File non-exist. Please check your file path and name again: #{file}" unless File.exist?(file)
  sites = file_2_list(file)
  changes = sites.nil? || sites.empty? ? {} : bulk_add(sites)
  puts "Done loading file #{file}. "
  changes
rescue => ee
  puts "Exception on method #{__method__}: #{ee}"
  nil
end
248
+
249
+ # Setter to add site entry to the cache in batch (from a list)
250
# Add a list of sites to the site store, running add() through Parallel.map.
#
# Results returned by the workers are keyed by their 'url' field and merged
# into the in-memory store.
#
# @param list [Array<String>, nil] sites to add
# @param num [Integer] value passed to Parallel.map as :in_processes
# @return [Hash] the newly added entries keyed by URL (empty when nothing was added)
def bulk_add(list,num=@max_parallel)
  puts "Add entries to the local site store from list:\n #{list}"
  results = {}
  # A nil list is treated like an empty one instead of raising NoMethodError
  if list.nil? || list.empty?
    puts "Error: no entry is added. Please check your list and try again."
  else
    puts "Start parallel adding on the sites:\n #{list}"
    outcomes = Parallel.map(list, :in_processes => num) { |target| add(target) }
    outcomes.each do |process|
      # Skip sites that were rejected (nil) or produced an empty result
      next if process.nil? || process.empty?
      results[process['url']] = process
    end
    @known_sites.merge!(results)
  end
  puts "Done adding site entries."
  if results.empty?
    puts "No new entry added. "
  else
    puts "New entries added: #{results}"
  end
  results
end
283
+ alias_method :adds, :bulk_add
284
+
285
+ # Setter to remove entry from the site store one at a time
286
# Remove a site from the site store and record it in the DeactivatedSite list.
#
# @param site [String] site URL; it is stripped, lower-cased and passed through url_2_site
# @return [Hash, nil] the removed entry, or nil when the site is unknown or an error occurs
def delete(site)
  puts "Remove entry from the site store: #{site} " if @verbose
  site = url_2_site(site.strip.downcase)
  unless @known_sites.key?(site)
    puts "Entry not fund. Skip #{site}"
    return nil
  end
  # Deactivate properly: move the record to the DeactivatedSite list first.
  # The deactivated store is only built once there is something to move.
  deact = Wmap::SiteTracker::DeactivatedSite.new(:data_dir=>@data_dir)
  deact.add(site, @known_sites[site])
  deact.save!
  removed = @known_sites.delete(site)
  puts "Entry cleared: #{site}"
  removed
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
311
+ alias_method :del, :delete
312
+
313
+ # Setter to delete site entry to the cache in batch (from a file)
314
# Delete every site listed in a file from the site store.
#
# @param file [String] path of a file that file_2_list can turn into a list of sites
# @return [Array, nil] the entries removed (empty when the file lists no sites);
#   nil when the file is missing or deletion raises
def file_delete(file)
  puts "Delete entries to the local site store from file: #{file}" if @verbose
  raise "File non-exist. Please check your file path and name again: #{file}" unless File.exist?(file)
  sites = file_2_list(file)
  # Always hand back a list; the original returned nil for an empty file
  sites.nil? || sites.empty? ? [] : bulk_delete(sites)
rescue => ee
  puts "Exception on method file_delete: #{ee} for file: #{file}" if @verbose
  nil
end
325
+ alias_method :file_del, :file_delete
326
+
327
+ # Setter to delete site entry to the cache in batch (from a list)
328
# Delete a list of sites from the site store, one at a time.
#
# @param list [Array<String>, nil] sites to delete
# @return [Array, nil] the removed entries (empty for a nil or empty list);
#   nil when an unexpected error occurs
def bulk_delete(list)
  puts "Delete entries to the local site store from list:\n #{list}" if @verbose
  changes = []
  if list.nil? || list.empty?
    puts "Error: no entry is loaded. Please check your list and try again."
    return changes
  end
  list.each do |x|
    removed = delete(url_2_site(x))
    changes.push(removed) unless removed.nil?
  end
  puts "Done deleting sites from the list:\n #{list}"
  changes
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
348
+ alias_method :dels, :bulk_delete
349
+
350
+ # Setter to refresh the entry in the site store one at a time
351
# Refresh one entry in the site store by deleting it and adding it back.
#
# The site is normalised with url_2_site before the lookup, in line with
# delete, site_known? and site_check.
#
# @param site [String] site URL
# @return [Hash, nil] the fresh entry returned by add, or nil when the site
#   is invalid, unknown, or could not be added back
def refresh(site)
  puts "Refresh the local site store for site: #{site} "
  raise "Invalid site: #{site}" if site.nil? || site.empty?
  site = url_2_site(site.strip.downcase)
  unless @known_sites.key?(site)
    puts "Error entry non exist: #{site}"
    return nil
  end
  delete(site)
  site_info = add(site)
  puts "Done refresh entry: #{site}"
  site_info
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
370
+
371
+ # Refresh sites in the site store in batch (from a file)
372
# Refresh every site listed in a file.
#
# @param file [String] path of a file that file_2_list can turn into a list of sites
# @return [Hash, nil] the refreshed entries (empty when the file lists no sites);
#   nil when the file is missing or refreshing raises
def file_refresh(file)
  puts "Refresh entries in the site store from file: #{file}" if @verbose
  # Same existence check as file_add / file_delete
  raise "File non-exist. Please check your file path and name again: #{file}" unless File.exist?(file)
  sites = file_2_list(file)
  sites.nil? || sites.empty? ? {} : bulk_refresh(sites)
rescue => ee
  puts "Exception on method #{__method__}: #{ee} for file: #{file}" if @verbose
  nil
end
383
+
384
+ # Refresh unique sites in the site store only
385
# Refresh only the unique sites (as reported by get_uniq_sites) in the site store.
#
# @return [Hash, nil] the refreshed entries (empty when there are no unique
#   sites); nil when refreshing raises
def refresh_uniq_sites
  puts "Refresh unique site entries in the site store. " if @verbose
  sites = get_uniq_sites
  # get_uniq_sites yields nil on failure; treat that like an empty list
  if sites.nil? || sites.empty?
    puts "Error: no entry is refreshed. Please check your site store and try again."
    return {}
  end
  bulk_refresh(sites)
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
400
+
401
+ # Refresh sites in the site store in batch (from a list)
402
# Refresh a list of sites, running refresh() through Parallel.map.
#
# After the workers return, every listed key is dropped from the in-memory
# store and the fresh results (keyed by their 'url' field) are merged back.
#
# @param list [Array<String>, nil] sites to refresh
# @param num [Integer] value passed to Parallel.map as :in_processes
# @return [Hash, nil] the refreshed entries keyed by URL (empty for a nil or
#   empty list); nil when an unexpected error occurs
def bulk_refresh(list,num=@max_parallel)
  puts "Refresh entries in the site store from list:\n #{list}" if @verbose
  results = {}
  if list.nil? || list.empty?
    puts "Error: no entry is loaded. Please check your list and try again."
    return results
  end
  puts "Start parallel refreshing on the sites:\n #{list}"
  outcomes = Parallel.map(list, :in_processes => num) { |target| refresh(target) }
  outcomes.each do |process|
    # Skip sites that could not be refreshed (nil) or produced an empty result
    next if process.nil? || process.empty?
    results[process['url']] = process
  end
  # Clean up old entries, by Y.L. 03/30/2015
  list.each { |x| @known_sites.delete(x) }
  # Add back fresh entries
  @known_sites.merge!(results)
  puts "Done refresh sites."
  results
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
433
+ alias_method :refreshs, :bulk_refresh
434
+
435
+
436
+ # Refresh all site entries in the stores in one shot
437
# Refresh every entry in the site store in one shot.
#
# @return [Hash, nil] the refreshed entries (empty when bulk_refresh reports
#   nothing); nil when an unexpected error occurs
def refresh_all
  puts "Refresh all the entries within the local site store ... "
  # bulk_refresh yields nil on failure; guard so merge! is never handed nil
  changes = bulk_refresh(@known_sites.keys) || {}
  @known_sites.merge!(changes)
  puts "Done refresh all entries."
  changes
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
449
+
450
+ # Refresh all site entries in the stores that contains an IP instead of a hostname
451
# Refresh the store entries whose URL contains an IP address instead of a host name.
# Entries whose stored response code is 10000 or 20000 are left out.
#
# @return [Hash, nil] the refreshed entries; nil when an unexpected error occurs
def refresh_ip_sites
  puts "Refresh all entries that contain an IP address instead of a FQDN ... "
  # get_ip_sites yields nil on failure; treat that like an empty list
  sites = get_ip_sites || []
  live_sites = sites.reject { |x| [10000, 20000].include?(@known_sites[x]['code']) }
  changes = bulk_refresh(live_sites) || {}
  @known_sites.merge!(changes)
  puts "Done refresh IP sites."
  changes
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
465
+
466
+ # Quick validation if a site is already covered under the site store
467
# Tell whether a site is already covered by the site store.
#
# @param site [String, nil] site URL; it is stripped, lower-cased and passed through url_2_site
# @return [Boolean] true when the normalised site is a key of the store;
#   false otherwise, including for nil input or on any error
def site_known?(site)
  raise "Web site store not loaded properly! " if @known_sites.nil?
  # A nil site can never be known; do not hand it to url_2_site
  return false if site.nil?
  site = url_2_site(site.strip.downcase)
  return false if site.nil?
  @known_sites.key?(site)
rescue => ee
  puts "Error checking web site #{site} against the site store: #{ee}"
  false
end
478
+ alias_method :is_known?, :site_known?
479
+
480
+ # Quick validation check on an IP is already part of the site store
481
# Tell whether any entry in the site store is recorded with the given IP.
#
# @param ip [String] IP address to look for
# @return [Boolean] true when some entry's 'ip' field equals the address;
#   false otherwise, including for a non-IP argument or on any error
def site_ip_known?(ip)
  ip = ip.chomp.strip
  return false unless is_ip?(ip)
  @known_sites.any? { |_site, record| record['ip'] == ip }
rescue => ee
  puts "Exception on method #{__method__}: #{ee}"
  false
end
499
+ alias_method :siteip_known?, :site_ip_known?
500
+
501
+ # Quick check of the stored information of a site within the store
502
# Look up the stored information of a site.
#
# @param site [String, nil] site URL; it is stripped, lower-cased and passed through url_2_site
# @return [Hash, nil] the stored entry, or nil when the site is nil, unknown, or an error occurs
def site_check(site)
  raise "Web site store not loaded properly! " if @known_sites.nil?
  # Nothing to look up for a nil site; do not hand it to url_2_site
  return nil if site.nil?
  site = url_2_site(site.strip.downcase)
  site.nil? ? nil : @known_sites[site]
rescue => ee
  puts "Exception on method #{__method__}: #{ee}"
  nil
end
513
+ alias_method :check, :site_check
514
+
515
+ # Retrieve external hosted sites into a list
516
# List the sites whose stored status is "ext_hosted".
#
# @return [Array<String>, nil] sorted site URLs; nil when an error occurs
def get_ext_sites
  puts "getter to retrieve all the external hosted sites. " if @verbose
  @known_sites.keys.select { |key| @known_sites[key]['status']=="ext_hosted" }.sort
rescue => ee
  # StandardError only, so Interrupt / SystemExit are no longer swallowed
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
532
+ alias_method :get_ext, :get_ext_sites
533
+
534
+ # Retrieve a list of internal hosted site URLs
535
# List the sites whose stored status is "int_hosted".
#
# @return [Array<String>, nil] sorted site URLs; nil when an error occurs
def get_int_sites
  puts "getter to retrieve all the internal hosted sites." if @verbose
  @known_sites.keys.select { |key| @known_sites[key]['status']=="int_hosted" }.sort
rescue => ee
  # StandardError only, so Interrupt / SystemExit are no longer swallowed
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
551
+ alias_method :get_int, :get_int_sites
552
+
553
+ # Retrieve a list of sites that contain an IP in the site URL
554
# List the sites whose URL host part is an IP address rather than a host name.
#
# @return [Array<String>, nil] sorted site URLs; nil when an error occurs
def get_ip_sites
  puts "Getter to retrieve sites contain an IP instead of a host-name ." if @verbose
  @known_sites.keys.select { |key| is_ip?(url_2_host(key)) }.sort
rescue => ee
  # StandardError only, so Interrupt / SystemExit are no longer swallowed
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
571
+
572
+ # Retrieve a list of unique sites within the known site store
573
# List one site per unique "IP:PORT" pair in the site store.
#
# Sites with response code 10000 or 20000, sites without an MD5 finger-print,
# and sites whose host cannot be resolved are left out. For each IP:PORT pair
# the first site met in store order wins.
#
# @return [Array<String>, nil] the unique site URLs; nil when an error occurs
def get_uniq_sites
  puts "Getter to retrieve unique sites containing unique IP:PORT key identifier." if @verbose
  my_tracker = Wmap::HostTracker.new(:data_dir=>@data_dir)
  sites = {}
  @known_sites.each do |key, record|
    code = record['code']
    md5 = record['md5']
    # filtering out 'un-reachable' sites
    next if code == 10000 || code == 20000
    # filtering out 'empty' sites
    next if md5.nil? || md5.empty?
    # Resolve only after the cheap filters: local host table first, then DNS.
    # (The original referenced the misspelled `my_trakcer`, so every call failed.)
    host = url_2_host(key)
    ip = my_tracker.local_host_2_ip(host)
    ip = host_2_ip(host) if ip.nil?
    next if ip.nil?
    id = "#{ip}:#{url_2_port(key)}"
    # filtering out duplicates by 'IP:PORT' key pair
    sites[id] = key unless sites.key?(id)
  end
  sites.values
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
619
+ alias_method :uniq_sites, :get_uniq_sites
620
+
621
+ # Retrieve a list of https (SSL) sites from the site store
622
# List the https sites in the site store.
#
# @return [Array<String>, nil] sorted site URLs whose scheme is https; nil when an error occurs
def get_ssl_sites
  puts "getter to retrieve https sites from the site store." if @verbose
  # The original evaluated the match and then pushed every key regardless
  @known_sites.keys.select { |key| key =~ /\Ahttps:/i }.sort
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
637
+
638
+ # Retrieve a list of redirection URLs from the site store
639
# Collect the redirection URLs recorded in the site store.
#
# Entries without a redirection are left out. That covers nil as well as the
# empty string, which is what an empty field becomes after a load from file.
#
# @return [Array<String>, nil] sorted redirection URLs; nil when an error occurs
def get_redirection_urls
  puts "getter to retrieve all the redirection URLs from the site store." if @verbose
  urls = @known_sites.values.map { |record| record['redirection'] }
  urls.reject { |url| url.nil? || url.empty? }.sort
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
655
+
656
+ # Retrieve redirection URL if available
657
# Look up the redirection URL recorded for a site.
#
# @param site [String] site URL; it is stripped and lower-cased before the lookup
# @return [String, nil] the stored redirection, or nil when the site is nil,
#   unknown, or an error occurs
def get_redirection_url(site)
  puts "getter to retrieve the redirection URL from the site store." if @verbose
  return nil if site.nil?
  site = site.strip.downcase
  unless @known_sites.key?(site)
    puts "Unknown site: #{site}" if @verbose
    return nil
  end
  @known_sites[site]['redirection']
rescue => ee
  # StandardError only, so Interrupt / SystemExit are no longer swallowed
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
672
+
673
+ # Perform local host table reverse lookup for the IP sites, in hope that the hostname could now be resolved since the site was discovered
674
# Retry host-name resolution for the IP-based sites using the local host table.
# Every site whose IP now maps to a host name is refreshed.
#
# @return [Array<String>, nil] sorted list of the sites that were refreshed;
#   nil when an error occurs
def resolve_ip_sites
  puts "Resolve sites that contain an IP address. Update the site cache table once a hostname is found in the local host table." if @verbose
  updates = []
  my_tracker = Wmap::HostTracker.new(:data_dir=>@data_dir)
  # get_ip_sites yields nil on failure; treat that like an empty list
  (get_ip_sites || []).each do |site|
    puts "Work on resolve the IP site: #{site}" if @verbose
    ip = url_2_host(site)
    hostname = my_tracker.local_ip_2_host(ip)
    if hostname.nil?
      puts "Can't resolve #{ip} from the local host store. Skip #{site}" if @verbose
    else
      puts "Host-name found for IP #{ip}: #{hostname}" if @verbose
      updates.push(site)
      refresh(site)
    end
  end
  updates.sort!
  puts "The following sites are now refreshed: #{updates}" if @verbose
  updates
rescue => ee
  # StandardError only, so Interrupt / SystemExit are no longer swallowed
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
700
+
701
+ # Search potential matching sites from the site store by using simple regular expression. Note that any upper-case char in the search string will be automatically converted into lower case
702
# Search the site store keys with a case-insensitive regular expression.
#
# The pattern is stripped and lower-cased before it is compiled.
# NOTE(review): lower-casing also rewrites upper-case regex escapes
# (e.g. \D becomes \d); kept because the lower-casing is the documented contract.
#
# @param pattern [String] regular expression source
# @return [Array<String>, nil] matching site URLs in store order; nil when the
#   pattern is nil or invalid
def search(pattern)
  puts "Search site store based on the regular expression: #{pattern}" if @verbose
  # Compile once instead of once per key
  matcher = /#{pattern.strip.downcase}/i
  @known_sites.keys.select { |key| key =~ matcher }
rescue => ee
  # StandardError only (RegexpError included), so Interrupt / SystemExit pass through
  puts "Exception on method search: #{ee}" if @verbose
  nil
end
718
+
719
+ # Print summary report on all sites that contain an IP in the site URL
720
# Print every site whose URL contains an IP address, one per line,
# between a heading line and an end-of-report line.
#
# @return [nil]
def print_ip_sites
  puts "Print sites contain an IP instead of a host-name."
  # get_ip_sites yields nil on failure; Array() turns that into an empty report
  Array(get_ip_sites).each { |x| puts x }
  puts "End of report. "
end
726
+
727
+ # Retrieve and print specific information of a site in the site store
728
# Print the stored record of one site as a single comma-separated line:
# site, ip, port, status, server, response code, md5, redirection, timestamp.
#
# @param site [String] site URL (leading and trailing whitespace is stripped)
# @return [nil]
def print_site(site)
  puts "Site Information Report for: #{site}" if @verbose
  site = site.strip unless site.nil?
  raise "Unknown site: #{site}" unless @known_sites.key?(site)
  record = @known_sites[site]
  columns = %w[ip port status server code md5 redirection timestamp].map { |field| record[field] }
  puts "#{site},#{columns.join(',')}"
rescue => ee
  puts "Exception on method #{__method__} for #{site}: #{ee}"
end
746
+ alias_method :print, :print_site
747
+
748
+
749
+ # Print summary report of all sites URL in the site store
750
# Print every site URL in the store, sorted, between a heading and a closing line.
#
# @return [nil]
def print_all_sites
  puts "\nSummary Report of the site store:"
  @known_sites.keys.sort.each { |url| puts url }
  puts "End of the summary"
end
760
+ alias_method :print_all, :print_all_sites
761
+
762
+ # Retrieve and save unique sites information for the quarterly scan into a plain local file
763
def save_uniq_sites(file)
  # Writes a comma-separated report of the primed unique sites to a flat file.
  #
  # file - path of the report file to create (overwritten if it exists).
  #
  # The site list comes from get_prim_uniq_sites; nil entries are skipped and
  # every remaining entry must be a key of @known_sites.
  #
  # Returns true on success, false when anything goes wrong (the reason is
  # printed to standard output).
  puts "Save unique sites information into a flat file: #{file}\nThis may take a long while as it go through a lengthy self correction check process, please be patient ..."
  begin
    prime_sites = get_prim_uniq_sites
    puts "Primary Sites: #{prime_sites}" if @verbose
    raise "Unable to retrieve the unique site list from the site store" if prime_sites.nil?
    # Build every row before touching the file, so that an unknown site
    # cannot leave a truncated or half-written report behind.
    rows = prime_sites.compact.map do |key|
      site = key.strip
      raise "Unknown site: #{site}. You may need to add it into the site store first. Execute the following shell command before trying again: \n\twadd #{site}\n" unless @known_sites.key?(site)
      record = @known_sites[site]
      "#{site},#{record['ip']},#{record['port']},#{record['server']},#{record['status']},#{record['code']},#{record['md5']},#{record['redirection']},#{record['timestamp']}\n"
    end
    # Block form guarantees the handle is closed even if a write fails.
    File.open(file, "w") do |f|
      f.write "Unique Sites Information Report\n"
      f.write "Site, IP, Port, Server, Hosting, Response Code, MD5, Redirect, Timestamps\n"
      rows.each { |row| f.write row }
    end
    puts "Done!"
    return true # success
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}"
    return false # fail
  end
end
793
+ alias_method :dump, :save_uniq_sites
794
+
795
+ # Retrieve and save unique sites information for the quarterly scan into a XML file
796
def save_uniq_sites_xml(file)
  # Writes the primed unique sites as an XML document built with
  # Nokogiri::XML::Builder: a <root><websites> wrapper holding one <site>
  # element per entry.
  #
  # file - path of the XML file to create (overwritten if it exists).
  #
  # The site list comes from get_prim_uniq_sites; nil entries are skipped and
  # every remaining entry must be a key of @known_sites.
  #
  # Returns true on success, false when anything goes wrong (the reason is
  # printed to standard output).
  puts "Save unique sites information into XML file: #{file}\nThis may take a long while as it go through lengthy self correctness check, please be patient ..."
  begin
    prime_sites = get_prim_uniq_sites
    raise "Unable to retrieve the unique site list from the site store" if prime_sites.nil?
    builder = Nokogiri::XML::Builder.new do |xml|
      xml.root {
        xml.websites {
          prime_sites.each do |key|
            next if key.nil?
            site = key.strip
            raise "Unknown site: #{site}. You may need to add it into the site store first. Execute the following shell command before trying again: \n\twmap #{site}\n" unless @known_sites.key?(site)
            record = @known_sites[site]
            xml.site {
              xml.name site
              # Trailing underscores are kept from the original builder calls.
              xml.ip_ record['ip']
              xml.port_ record['port']
              xml.status_ record['status']
              xml.server_ record['server']
              xml.fingerprint_ record['md5']
              xml.redirection_ record['redirection']
              xml.responsecode_ record['code']
              xml.timestamp_ record['timestamp']
            }
          end
        }
      }
    end
    # Serialize once and reuse the text for both the verbose echo and the file.
    document = builder.to_xml
    puts document if @verbose
    # Block form guarantees the handle is closed even if the write fails.
    File.open(file, 'w') { |f| f.write(document) }
    puts "Done!"
    return true
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}"
    return false
  end
end
833
+ alias_method :dump_xml, :save_uniq_sites_xml
834
+
835
+ # Retrieve the unique sites from the local site store in the primary host format
836
def get_prim_uniq_sites
  # Builds the list of unique sites, swapping a site for its primary-host
  # variant where the host trackers indicate that its IP is shared.
  #
  # Returns an Array of site URLs. When any StandardError occurs the message
  # is printed and the method returns nil (the result of puts).
  puts "Retrieve and prime unique sites in the site store. " if @verbose
  begin
    hosts = Wmap::HostTracker.new(:data_dir=>@data_dir)
    primaries = Wmap::HostTracker::PrimaryHost.new(:data_dir=>@data_dir)
    # Step 1. Fetch the unique site list.
    candidates = get_uniq_sites
    primed = []
    # Step 2. Walk the list and emit each site in its primary host form.
    candidates.each do |site|
      puts "Work on priming unique site: #{site}" if @verbose
      host = url_2_host(site)
      # Case 1: the site is addressed by IP - keep it untouched.
      if is_ip?(host)
        primed << site
        next
      end
      ip = @known_sites[site]['ip']
      puts "Local hosts table entry count for #{ip}: #{hosts.alias[ip]}" if @verbose
      # Case 2: the IP has exactly one alias. Case 3: the IP has no alias
      # entry at all (e.g. several IPs behind one DNS A record). Keep as is.
      if hosts.alias[ip] == 1 || hosts.alias[ip].nil?
        primed << site
        next
      end
      # Case 4: the IP is shared with other hosts - try to resolve the primary one.
      if primaries.known_hosts.key?(ip) && hosts.alias[ip] > 1
        primary = primaries.prime(host)
        puts "Host: #{host}, New host:#{primary}" if @verbose
        if host != primary
          replacement = site.sub(host, primary)
          raise "Site not found in the site tracking data repository: #{replacement}. You may need to add it into the site store first. Execute the following shell command before trying again: \n\twadd #{replacement}\n" unless @known_sites.key?(replacement)
          # Consistency check: only switch when both records agree on the IP.
          # TBD - a mismatch (multiple IPs for one DNS A record) is left alone.
          site = replacement if @known_sites[replacement]['ip'] == ip
        end
      end
      primed << site
    end
    return primed
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}"
  end
end
891
+ alias_method :get_prime, :get_prim_uniq_sites
892
+
893
+ # Print summary report of the externally hosted site URLs in the site store
894
def print_ext_sites
  # Prints a heading followed by each entry supplied by get_ext_sites,
  # one per line, to standard output. Always returns nil.
  puts "\nSummary Report of the External Hosted Site"
  get_ext_sites.each { |url| puts url }
  nil
end
902
+ alias_method :print_ext, :print_ext_sites
903
+
904
+ # Print summary report of internal hosted site URLs
905
def print_int_sites
  # Prints a heading followed by each entry supplied by get_int_sites,
  # one per line, to standard output. Always returns nil.
  puts "\nSummary Report of the Internal Hosted Site"
  get_int_sites.each { |url| puts url }
  nil
end
913
+ alias_method :print_int, :print_int_sites
914
+
915
+ # Print summary report of the HTTPS site URLs in the site store
916
def print_ssl_sites
  # Prints a heading followed by each entry supplied by get_ssl_sites,
  # one per line, to standard output. Always returns nil.
  puts "\nSummary Report of the HTTPS Sites from the Site Store"
  get_ssl_sites.each { |url| puts url }
  nil
end
924
+
925
+ # Print summary report of unique sites in the site store
926
def print_uniq_sites
  # Prints a title and a CSV column header, then delegates to print_site for
  # every entry supplied by get_uniq_sites. Returns that list of entries.
  puts "Summary Report for the Unique sites:"
  puts "Website,Primary IP,Port,Hosting Status,Server,Response Code,Site MD5 Finger-print,Site Redirection,Timestamp"
  get_uniq_sites.each { |site| print_site(site) }
end
934
+
935
+ private
936
+
937
+ end