wmap 2.4.4 → 2.4.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -64,9 +64,11 @@ class Wmap::CidrTracker
64
64
  @known_cidr_blks[key]['netname']=entry[2].nil? ? nil : entry[2].strip
65
65
  end
66
66
  f.close
67
- # Sort the blocks in order once for better performance
68
- @known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
69
- @known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
67
+ # Sort the blocks in order once for better performance. Update 10/29/2018 to support Netaddr 2.x syntax
68
+ #@known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
69
+ #@known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
70
+ @known_cidr_blks_asce_index=@known_cidr_blks.keys.sort
71
+ @known_cidr_blks_desc_index=@known_cidr_blks_asce_index.reverse
70
72
  rescue => ee
71
73
  puts "Exception on method #{__method__}: #{ee}" # if @verbose
72
74
  end
@@ -75,7 +77,7 @@ class Wmap::CidrTracker
75
77
  # 'setter' to add an entry to CIDR store @known_cidr_blks
76
78
  def add (cidr,ref=nil,netname=nil)
77
79
  puts "Load the entry into the CIDR store: #{cidr}"
78
- begin
80
+ #begin
79
81
  raise "Unknown CIDR format: #{cidr}" unless is_cidr?(cidr)
80
82
  # Obtain the 'ref' and 'netname' value automatically in case not passed as method parameters
81
83
  if ref.nil? or netname.nil?
@@ -96,11 +98,13 @@ class Wmap::CidrTracker
96
98
  puts "Entry loaded!"
97
99
  end
98
100
  # Re-sort the blocks in order for better performance
99
- @known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
100
- @known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
101
- rescue => ee
102
- puts "Exception on method #{__method__}: #{ee}" # if @verbose
103
- end
101
+ #@known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
102
+ #@known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
103
+ @known_cidr_blks_asce_index=@known_cidr_blks.keys.sort
104
+ @known_cidr_blks_desc_index=@known_cidr_blks_asce_index.reverse
105
+ #rescue => ee
106
+ # puts "Exception on method #{__method__}: #{ee}" # if @verbose
107
+ #end
104
108
  end
105
109
 
106
110
  # 'setter' to remove an entry from the CIDR store @known_cidr_blks
@@ -117,8 +121,10 @@ class Wmap::CidrTracker
117
121
  raise "Unknown CIDR entry: #{cidr}"
118
122
  end
119
123
  # Re-sort the blocks in order for better performance
120
- @known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
121
- @known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
124
+ #@known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
125
+ #@known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
126
+ @known_cidr_blks_asce_index=@known_cidr_blks.keys.sort
127
+ @known_cidr_blks_desc_index=@known_cidr_blks_asce_index.reverse
122
128
  rescue => ee
123
129
  puts "Exception on method #{__method__}: #{ee}" # if @verbose
124
130
  end
@@ -167,6 +173,7 @@ class Wmap::CidrTracker
167
173
  @known_cidr_blks_desc_index.each do |line|
168
174
  first_octet_blk = line.split('.').first.to_i
169
175
  next if first_octet_blk > first_octet_ip
176
+ puts "line: #{line}" if @verbose
170
177
  cidr4 = NetAddr::CIDR.create(line)
171
178
  known = cidr4.contains?(ip+'/32')
172
179
  break if known
@@ -276,7 +283,7 @@ class Wmap::CidrTracker
276
283
  # Save the current cidr hash table into a file
277
284
  def save_cidrs_to_file!(file_cidrs=@file_cidr_seeds)
278
285
  puts "Saving the current cidrs cache table from memory to file: #{file_cidrs} ..." if @verbose
279
- begin
286
+ #begin
280
287
  timestamp=Time.now
281
288
  f=File.open(file_cidrs, 'w')
282
289
  f.write "# Local cidrs file created by Wmap::CidrTracker.save method at: #{timestamp}\n"
@@ -288,9 +295,9 @@ class Wmap::CidrTracker
288
295
  end
289
296
  f.close
290
297
  puts "CIDR cache table is successfully saved: #{file_cidrs}"
291
- rescue => ee
292
- puts "Exception on method #{__method__}: #{ee}" if @verbose
293
- end
298
+ #rescue => ee
299
+ # puts "Exception on method #{__method__}: #{ee}" if @verbose
300
+ #end
294
301
  end
295
302
  alias_method :save!, :save_cidrs_to_file!
296
303
 
@@ -14,7 +14,7 @@ module Wmap
14
14
  # Class to differentiate the primary host-name from the potential aliases. This is needed in order to minimize the confusion on our final site inventory list, as it contains a large number of duplicates (aliases). More specifically, a filter could be built by using this class to track the primary url of a website.
15
15
  class PrimaryHost < Wmap::HostTracker
16
16
  include Wmap::Utils
17
- include Singleton
17
+ #include Singleton
18
18
 
19
19
  attr_accessor :hosts_file, :verbose, :data_dir
20
20
  attr_reader :known_hosts, :known_ips
@@ -42,7 +42,7 @@ class Wmap::HostTracker
42
42
  entry=line.chomp.split(%r{\t+|\s+|\,})
43
43
  key=entry[0].downcase
44
44
  value=entry[1]
45
- puts "Loading value pair: #{key} - #{value}" if @verbose
45
+ puts "Loading key value pair: #{key} - #{value}" if @verbose
46
46
  known_hosts[key] = Hash.new unless known_hosts.key?(key)
47
47
  known_hosts[key]= value
48
48
  # For reverse host lookup
@@ -103,7 +103,7 @@ class Wmap::HostTracker
103
103
  # Setter to add host entry to the cache once at a time
104
104
  def add(host)
105
105
  puts "Add entry to the local host repository: #{host}"
106
- begin
106
+ #begin
107
107
  host=host.strip.downcase unless host.nil?
108
108
  unless @known_hosts.key?(host)
109
109
  ip=host_2_ip(host)
@@ -137,9 +137,9 @@ class Wmap::HostTracker
137
137
  else
138
138
  puts "Host is already exist. Skip: #{host}"
139
139
  end
140
- rescue => ee
141
- puts "Exception on method #{__method__}: #{ee}" if @verbose
142
- end
140
+ #rescue => ee
141
+ # puts "Exception on method #{__method__}: #{ee}" if @verbose
142
+ #end
143
143
  end
144
144
 
145
145
  # Setter to add host entry to the local hosts in batch (from an array)
@@ -521,7 +521,7 @@ class Wmap::HostTracker
521
521
  entry=line.chomp.split(%r{\t+|\s+|\,})
522
522
  key=entry[0].downcase
523
523
  value=entry[1]
524
- puts "Loading value pair: #{key} - #{value}" if @verbose
524
+ puts "Loading key value pair: #{key} - #{value}" if @verbose
525
525
  host_store[key] = Hash.new unless known_hosts.key?(key)
526
526
  host_store[key]= value
527
527
  end
@@ -180,7 +180,7 @@ class Wmap::SiteTracker
180
180
  if my_tracker.host_known?(host)
181
181
  old_ip=my_tracker.local_host_2_ip(host)
182
182
  if old_ip != ip
183
- my_tracker.efresh(host)
183
+ my_tracker.refresh(host)
184
184
  my_tracker.save!
185
185
  else
186
186
  puts "Host resolve to the same IP #{ip} - no need to update the local host table." if @verbose
@@ -205,7 +205,7 @@ class Wmap::SiteTracker
205
205
  if my_tracker.host_known?(host)
206
206
  old_ip=my_tracker.local_host_2_ip(host)
207
207
  if old_ip != ip
208
- my_tracker.efresh(host)
208
+ my_tracker.refresh(host)
209
209
  my_tracker.save!
210
210
  else
211
211
  # Skip - no need to update the local hosts table
@@ -582,7 +582,7 @@ class Wmap::SiteTracker
582
582
  host=url_2_host(key)
583
583
  md5=@known_sites[key]['md5']
584
584
  code=@known_sites[key]['code']
585
- ip=my_trakcer.local_host_2_ip(host)
585
+ ip=my_tracker.local_host_2_ip(host)
586
586
  ip=host_2_ip(host) if ip.nil?
587
587
  # filtering out 'un-reachable' sites
588
588
  next if (code == 10000 or code == 20000)
@@ -835,7 +835,7 @@ class Wmap::SiteTracker
835
835
  # Retrieve the unique sites from the local site store in the primary host format
836
836
  def get_prim_uniq_sites
837
837
  puts "Retrieve and prime unique sites in the site store. " if @verbose
838
- begin
838
+ #begin
839
839
  host_tracker=Wmap::HostTracker.new(:data_dir=>@data_dir)
840
840
  primary_host_tracker=Wmap::HostTracker::PrimaryHost.new(:data_dir=>@data_dir)
841
841
  # Step 1. Retrieve the unique site list first
@@ -884,9 +884,9 @@ class Wmap::SiteTracker
884
884
  primary_host_tracker=nil
885
885
  host_tracker=nil
886
886
  return prim_uniq_sites
887
- rescue => ee
888
- puts "Exception on method #{__method__}: #{ee}"
889
- end
887
+ #rescue => ee
888
+ # puts "Exception on method #{__method__}: #{ee}"
889
+ #end
890
890
  end
891
891
  alias_method :get_prime, :get_prim_uniq_sites
892
892
 
@@ -31,7 +31,7 @@ class Wmap::UrlCrawler
31
31
  # Crawler instance default variables
32
32
  def initialize (params = {})
33
33
  @verbose=params.fetch(:verbose, false)
34
- @data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../logs/')
34
+ @data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../data/')
35
35
  @http_timeout=params.fetch(:http_timeout, 5000)
36
36
  @crawl_depth=params.fetch(:crawl_depth, 4)
37
37
  @crawl_page_limit=params.fetch(:crawl_page_limit, 1000)
@@ -224,11 +224,11 @@ class Wmap::UrlCrawler
224
224
  alias_method :query_file, :crawl_workers_on_file
225
225
  alias_method :crawl_file, :crawl_workers_on_file
226
226
 
227
- # Wrapper for the OpenURI open method - create an open_uri object and return the reference upon success
227
+ # Wrapper for the OpenURI open method - create an open_uri object and return the reference upon success
228
228
  def open_url(url)
229
- puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
230
229
  #url_object = nil
231
- begin
230
+ begin
231
+ puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
232
232
  if url =~ /http\:/i
233
233
  # patch for allow the 'un-safe' URL redirection i.e. https://www.example.com -> http://www.example.com
234
234
  url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000)
@@ -240,24 +240,24 @@ class Wmap::UrlCrawler
240
240
  raise "Invalid URL format - please specify the protocol prefix http(s) in the URL: #{url}"
241
241
  end
242
242
  return url_object
243
- rescue => ee
244
- puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
245
- return nil
246
- end
243
+ rescue => ee
244
+ puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
245
+ return nil
247
246
  end
247
+ end
248
248
 
249
249
  # Wrapper to use OpenURI method 'read' to return url body contents
250
250
  def read_url(url)
251
- puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
252
251
  begin
252
+ puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
253
253
  url_object=open_url(url)
254
254
  @visited_urls_by_crawler[url]=true unless @visited_urls_by_crawler.key?(url)
255
255
  body=url_object.read
256
256
  return body
257
- rescue => ee
258
- puts "Exception on method #{__method__}: #{ee}" if @verbose
259
- return nil
260
- end
257
+ rescue => ee
258
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
259
+ return nil
260
+ end
261
261
  end
262
262
 
263
263
  # Return the destination url in case of url re-direct
@@ -268,11 +268,11 @@ class Wmap::UrlCrawler
268
268
  return url_object.base_uri.to_s
269
269
  end
270
270
  return url
271
- rescue => ee
272
- puts "Exception on method #{__method__}: #{ee}" if @verbose
273
- return nil
274
- end
271
+ rescue => ee
272
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
273
+ return nil
275
274
  end
275
+ end
276
276
 
277
277
  # Wrapper for the Nokogiri DOM parser
278
278
  def parse_html(html_body)
@@ -8,25 +8,29 @@
8
8
 
9
9
 
10
10
  module Wmap
11
- module Utils
11
+ module Utils
12
12
  # Module to validate and retrieve the top or second level domain name from a host-name (FQDN).
13
- module DomainRoot
13
+ module DomainRoot
14
14
  extend self
15
15
  # Internet Domain Architecture Definitions
16
16
  File_ccsld=File.dirname(__FILE__)+'/../../../dicts/ccsld.txt'
17
17
  File_cctld=File.dirname(__FILE__)+'/../../../dicts/cctld.txt'
18
18
  File_gtld=File.dirname(__FILE__)+'/../../../dicts/gtld.txt'
19
-
19
+ File_tld=File.dirname(__FILE__)+'/../../../dicts/tlds.txt'
20
+
20
21
  # Main function to retrieve the registered domain ('domain root' from the 'registrant' perspective) from a hostname, for example, "www.telegraph.co.uk" -> "telegraph.co.uk"
21
22
  def get_domain_root (host)
22
23
  puts "Retrieve the root domain for host: #{host}" if @verbose
23
24
  begin
25
+ # Complete Top Level Domain List - loading once
26
+ @tlds=file_2_hash(File_tld) if @tlds.nil?
24
27
  # Generic Top Level Domain List - loading once
25
28
  @gtld=file_2_hash(File_gtld) if @gtld.nil?
26
29
  # Country code top-level domain list - loading once
27
30
  @cctld=file_2_hash(File_cctld) if @cctld.nil?
28
31
  # Country code second level domain - loading once
29
32
  @ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil?
33
+
30
34
  if host.strip.nil?
31
35
  puts "Error: empty record found. Please check your input and remove any empty line." if @verbose
32
36
  return nil
@@ -35,15 +39,15 @@ module Wmap
35
39
  end
36
40
  found_tld=false
37
41
  found_cctld=false
38
- # search the general top level domain list first
42
+ # search the top level domain list first
39
43
  root_domain=""
40
44
  dn=host.split(".")
41
- if @gtld.key?(dn.last)
42
- found=false
43
- if @cctld.key?(dn[dn.length-2])
44
- found=true
45
+ if @tlds.key?(dn.last)
46
+ cc_found=false
47
+ if @cctld.key?(dn[dn.length-2])
48
+ cc_found=true
45
49
  end
46
- if found
50
+ if cc_found
47
51
  root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
48
52
  else
49
53
  root_domain=dn[dn.length-2] + "." + dn.last
@@ -51,7 +55,7 @@ module Wmap
51
55
  found_tld=true
52
56
  end
53
57
  # search the country code top level domain list secondly
54
- if @cctld.key?(dn.last)
58
+ if @cctld.key?(dn.last)
55
59
  found=false
56
60
  # reverse search of general top level domain
57
61
  if @gtld.key?(dn[dn.length-2])
@@ -65,8 +69,8 @@ module Wmap
65
69
  break
66
70
  end
67
71
  end
68
- # 1/8/2015: additional logic to handle invalid ccsld string: reserved gtld string
69
- #unless found
72
+ # 1/8/2015: additional logic to handle invalid ccsld string: reserved gtld string
73
+ #unless found
70
74
  # if @gtld.key?(dn[dn.length-2])
71
75
  # puts "Invalid ccsld: #{dn[dn.length-2]} for host: #{host}"
72
76
  # return nil
@@ -77,17 +81,17 @@ module Wmap
77
81
  root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
78
82
  else
79
83
  root_domain=dn[dn.length-2] + "." + dn.last
80
- end
84
+ end
81
85
  found_cctld=true
82
86
  end
83
- unless (found_tld or found_cctld)
87
+ unless (found_tld or found_cctld)
84
88
  puts "#{host} - the top level domain is unknown. Please check out your record #{root_domain} " if @verbose
85
89
  return nil
86
90
  else
87
91
  puts "Domain root found: #{root_domain}" if @verbose
88
92
  return root_domain
89
93
  end
90
- rescue => ee
94
+ rescue => ee
91
95
  puts "Exception on method #{__method__}: #{ee}" if @verbose
92
96
  return nil
93
97
  end
@@ -96,7 +100,7 @@ module Wmap
96
100
  alias_method :root_domain, :get_domain_root
97
101
  alias_method :domain_root, :get_domain_root
98
102
  alias_method :host_2_domain, :get_domain_root
99
-
103
+
100
104
  # 'setter' to parse and load the known country code second level domain table from the file
101
105
  # data structure example: {"uk" =>["co","plc"],"za"=>["mil","nom","org"]}
102
106
  def load_ccsld_from_file (file_ccsld)
@@ -107,10 +111,10 @@ module Wmap
107
111
  f.each do |line|
108
112
  next unless line =~ /^\s+\.\w/
109
113
  line=line.chomp.strip.downcase
110
- entry=line.split(' ')[0].split('.')
114
+ entry=line.split(' ')[0].split('.')
111
115
  if entry.length > 2
112
116
  key=entry.last
113
- ccsld[key] = Array.new if not ccsld.key?(key)
117
+ ccsld[key] = Array.new if not ccsld.key?(key)
114
118
  val=entry[entry.length-2]
115
119
  #puts "Loading country code second level domain table with - Country code: #{key}, Second level domain: #{val}" if @verbose
116
120
  ccsld[key].push(val) unless key.nil?
@@ -119,7 +123,7 @@ module Wmap
119
123
  f.close
120
124
  # Sort the blocks once in descendant order once for better performance
121
125
  return ccsld
122
- rescue => ee
126
+ rescue => ee
123
127
  puts "Exception on method #{__method__}: #{ee}" if @verbose
124
128
  end
125
129
  end
@@ -158,15 +162,15 @@ module Wmap
158
162
  rescue Exception => ee
159
163
  puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
160
164
  return nil
161
- end
165
+ end
162
166
  end
163
167
  alias_method :get_subdomain, :get_sub_domain
164
-
168
+
165
169
  # Function to print instance variable - General top level domain list
166
170
  def print_gtld
167
171
  puts @gtld
168
172
  end
169
-
173
+
170
174
  # Function to print instance variable - Country code top-level domain list
171
175
  def print_cctld
172
176
  puts @cctld
@@ -176,9 +180,9 @@ module Wmap
176
180
  def print_ccsld
177
181
  puts @ccsld
178
182
  end
179
-
183
+
180
184
  private :load_ccsld_from_file
181
-
185
+
182
186
  end
183
187
  end
184
188
  end