wmap 2.4.4 → 2.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -64,9 +64,11 @@ class Wmap::CidrTracker
64
64
  @known_cidr_blks[key]['netname']=entry[2].nil? ? nil : entry[2].strip
65
65
  end
66
66
  f.close
67
- # Sort the blocks in order once for better performance
68
- @known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
69
- @known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
67
+ # Sort the blocks in order once for better performance. Update 10/29/2018 to support Netaddr 2.x syntax
68
+ #@known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
69
+ #@known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
70
+ @known_cidr_blks_asce_index=@known_cidr_blks.keys.sort
71
+ @known_cidr_blks_desc_index=@known_cidr_blks_asce_index.reverse
70
72
  rescue => ee
71
73
  puts "Exception on method #{__method__}: #{ee}" # if @verbose
72
74
  end
@@ -75,7 +77,7 @@ class Wmap::CidrTracker
75
77
  # 'setter' to add an entry to CIDR store @known_cidr_blks
76
78
  def add (cidr,ref=nil,netname=nil)
77
79
  puts "Load the entry into the CIDR store: #{cidr}"
78
- begin
80
+ #begin
79
81
  raise "Unknown CIDR format: #{cidr}" unless is_cidr?(cidr)
80
82
  # Obtain the 'ref' and 'netname' value automatically in case not passed as method parameters
81
83
  if ref.nil? or netname.nil?
@@ -96,11 +98,13 @@ class Wmap::CidrTracker
96
98
  puts "Entry loaded!"
97
99
  end
98
100
  # Re-sort the blocks in order for better performance
99
- @known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
100
- @known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
101
- rescue => ee
102
- puts "Exception on method #{__method__}: #{ee}" # if @verbose
103
- end
101
+ #@known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
102
+ #@known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
103
+ @known_cidr_blks_asce_index=@known_cidr_blks.keys.sort
104
+ @known_cidr_blks_desc_index=@known_cidr_blks_asce_index.reverse
105
+ #rescue => ee
106
+ # puts "Exception on method #{__method__}: #{ee}" # if @verbose
107
+ #end
104
108
  end
105
109
 
106
110
  # 'setter' to remove an entry from the CIDR store @known_cidr_blks
@@ -117,8 +121,10 @@ class Wmap::CidrTracker
117
121
  raise "Unknown CIDR entry: #{cidr}"
118
122
  end
119
123
  # Re-sort the blocks in order for better performance
120
- @known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
121
- @known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
124
+ #@known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
125
+ #@known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
126
+ @known_cidr_blks_asce_index=@known_cidr_blks.keys.sort
127
+ @known_cidr_blks_desc_index=@known_cidr_blks_asce_index.reverse
122
128
  rescue => ee
123
129
  puts "Exception on method #{__method__}: #{ee}" # if @verbose
124
130
  end
@@ -167,6 +173,7 @@ class Wmap::CidrTracker
167
173
  @known_cidr_blks_desc_index.each do |line|
168
174
  first_octet_blk = line.split('.').first.to_i
169
175
  next if first_octet_blk > first_octet_ip
176
+ puts "line: #{line}" if @verbose
170
177
  cidr4 = NetAddr::CIDR.create(line)
171
178
  known = cidr4.contains?(ip+'/32')
172
179
  break if known
@@ -276,7 +283,7 @@ class Wmap::CidrTracker
276
283
  # Save the current cidr hash table into a file
277
284
  def save_cidrs_to_file!(file_cidrs=@file_cidr_seeds)
278
285
  puts "Saving the current cidrs cache table from memory to file: #{file_cidrs} ..." if @verbose
279
- begin
286
+ #begin
280
287
  timestamp=Time.now
281
288
  f=File.open(file_cidrs, 'w')
282
289
  f.write "# Local cidrs file created by Wmap::CidrTracker.save method at: #{timestamp}\n"
@@ -288,9 +295,9 @@ class Wmap::CidrTracker
288
295
  end
289
296
  f.close
290
297
  puts "CIDR cache table is successfully saved: #{file_cidrs}"
291
- rescue => ee
292
- puts "Exception on method #{__method__}: #{ee}" if @verbose
293
- end
298
+ #rescue => ee
299
+ # puts "Exception on method #{__method__}: #{ee}" if @verbose
300
+ #end
294
301
  end
295
302
  alias_method :save!, :save_cidrs_to_file!
296
303
 
@@ -14,7 +14,7 @@ module Wmap
14
14
  # Class to differentiate the primary host-name from the potential aliases. This is needed in order to minimize the confusion on our final site inventory list, as it contains a large number of duplicates (aliases). More specifically, a filter could be built by using this class to track the primary url of a website.
15
15
  class PrimaryHost < Wmap::HostTracker
16
16
  include Wmap::Utils
17
- include Singleton
17
+ #include Singleton
18
18
 
19
19
  attr_accessor :hosts_file, :verbose, :data_dir
20
20
  attr_reader :known_hosts, :known_ips
@@ -42,7 +42,7 @@ class Wmap::HostTracker
42
42
  entry=line.chomp.split(%r{\t+|\s+|\,})
43
43
  key=entry[0].downcase
44
44
  value=entry[1]
45
- puts "Loading value pair: #{key} - #{value}" if @verbose
45
+ puts "Loading key value pair: #{key} - #{value}" if @verbose
46
46
  known_hosts[key] = Hash.new unless known_hosts.key?(key)
47
47
  known_hosts[key]= value
48
48
  # For reverse host lookup
@@ -103,7 +103,7 @@ class Wmap::HostTracker
103
103
  # Setter to add a host entry to the cache, one at a time
104
104
  def add(host)
105
105
  puts "Add entry to the local host repository: #{host}"
106
- begin
106
+ #begin
107
107
  host=host.strip.downcase unless host.nil?
108
108
  unless @known_hosts.key?(host)
109
109
  ip=host_2_ip(host)
@@ -137,9 +137,9 @@ class Wmap::HostTracker
137
137
  else
138
138
  puts "Host is already exist. Skip: #{host}"
139
139
  end
140
- rescue => ee
141
- puts "Exception on method #{__method__}: #{ee}" if @verbose
142
- end
140
+ #rescue => ee
141
+ # puts "Exception on method #{__method__}: #{ee}" if @verbose
142
+ #end
143
143
  end
144
144
 
145
145
  # Setter to add host entry to the local hosts in batch (from an array)
@@ -521,7 +521,7 @@ class Wmap::HostTracker
521
521
  entry=line.chomp.split(%r{\t+|\s+|\,})
522
522
  key=entry[0].downcase
523
523
  value=entry[1]
524
- puts "Loading value pair: #{key} - #{value}" if @verbose
524
+ puts "Loading key value pair: #{key} - #{value}" if @verbose
525
525
  host_store[key] = Hash.new unless known_hosts.key?(key)
526
526
  host_store[key]= value
527
527
  end
@@ -180,7 +180,7 @@ class Wmap::SiteTracker
180
180
  if my_tracker.host_known?(host)
181
181
  old_ip=my_tracker.local_host_2_ip(host)
182
182
  if old_ip != ip
183
- my_tracker.efresh(host)
183
+ my_tracker.refresh(host)
184
184
  my_tracker.save!
185
185
  else
186
186
  puts "Host resolve to the same IP #{ip} - no need to update the local host table." if @verbose
@@ -205,7 +205,7 @@ class Wmap::SiteTracker
205
205
  if my_tracker.host_known?(host)
206
206
  old_ip=my_tracker.local_host_2_ip(host)
207
207
  if old_ip != ip
208
- my_tracker.efresh(host)
208
+ my_tracker.refresh(host)
209
209
  my_tracker.save!
210
210
  else
211
211
  # Skip - no need to update the local hosts table
@@ -582,7 +582,7 @@ class Wmap::SiteTracker
582
582
  host=url_2_host(key)
583
583
  md5=@known_sites[key]['md5']
584
584
  code=@known_sites[key]['code']
585
- ip=my_trakcer.local_host_2_ip(host)
585
+ ip=my_tracker.local_host_2_ip(host)
586
586
  ip=host_2_ip(host) if ip.nil?
587
587
  # filtering out 'un-reachable' sites
588
588
  next if (code == 10000 or code == 20000)
@@ -835,7 +835,7 @@ class Wmap::SiteTracker
835
835
  # Retrieve the unique sites from the local site store in the primary host format
836
836
  def get_prim_uniq_sites
837
837
  puts "Retrieve and prime unique sites in the site store. " if @verbose
838
- begin
838
+ #begin
839
839
  host_tracker=Wmap::HostTracker.new(:data_dir=>@data_dir)
840
840
  primary_host_tracker=Wmap::HostTracker::PrimaryHost.new(:data_dir=>@data_dir)
841
841
  # Step 1. Retrieve the unique site list first
@@ -884,9 +884,9 @@ class Wmap::SiteTracker
884
884
  primary_host_tracker=nil
885
885
  host_tracker=nil
886
886
  return prim_uniq_sites
887
- rescue => ee
888
- puts "Exception on method #{__method__}: #{ee}"
889
- end
887
+ #rescue => ee
888
+ # puts "Exception on method #{__method__}: #{ee}"
889
+ #end
890
890
  end
891
891
  alias_method :get_prime, :get_prim_uniq_sites
892
892
 
@@ -31,7 +31,7 @@ class Wmap::UrlCrawler
31
31
  # Crawler instance default variables
32
32
  def initialize (params = {})
33
33
  @verbose=params.fetch(:verbose, false)
34
- @data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../logs/')
34
+ @data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../data/')
35
35
  @http_timeout=params.fetch(:http_timeout, 5000)
36
36
  @crawl_depth=params.fetch(:crawl_depth, 4)
37
37
  @crawl_page_limit=params.fetch(:crawl_page_limit, 1000)
@@ -224,11 +224,11 @@ class Wmap::UrlCrawler
224
224
  alias_method :query_file, :crawl_workers_on_file
225
225
  alias_method :crawl_file, :crawl_workers_on_file
226
226
 
227
- # Wrapper for the OpenURI open method - create an open_uri object and return the reference upon success
227
+ # Wrapper for the OpenURI open method - create an open_uri object and return the reference upon success
228
228
  def open_url(url)
229
- puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
230
229
  #url_object = nil
231
- begin
230
+ begin
231
+ puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
232
232
  if url =~ /http\:/i
233
233
  # patch for allow the 'un-safe' URL redirection i.e. https://www.example.com -> http://www.example.com
234
234
  url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000)
@@ -240,24 +240,24 @@ class Wmap::UrlCrawler
240
240
  raise "Invalid URL format - please specify the protocol prefix http(s) in the URL: #{url}"
241
241
  end
242
242
  return url_object
243
- rescue => ee
244
- puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
245
- return nil
246
- end
243
+ rescue => ee
244
+ puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
245
+ return nil
247
246
  end
247
+ end
248
248
 
249
249
  # Wrapper to use OpenURI method 'read' to return url body contents
250
250
  def read_url(url)
251
- puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
252
251
  begin
252
+ puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
253
253
  url_object=open_url(url)
254
254
  @visited_urls_by_crawler[url]=true unless @visited_urls_by_crawler.key?(url)
255
255
  body=url_object.read
256
256
  return body
257
- rescue => ee
258
- puts "Exception on method #{__method__}: #{ee}" if @verbose
259
- return nil
260
- end
257
+ rescue => ee
258
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
259
+ return nil
260
+ end
261
261
  end
262
262
 
263
263
  # Return the destination url in case of url re-direct
@@ -268,11 +268,11 @@ class Wmap::UrlCrawler
268
268
  return url_object.base_uri.to_s
269
269
  end
270
270
  return url
271
- rescue => ee
272
- puts "Exception on method #{__method__}: #{ee}" if @verbose
273
- return nil
274
- end
271
+ rescue => ee
272
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
273
+ return nil
275
274
  end
275
+ end
276
276
 
277
277
  # Wrapper for the Nokogiri DOM parser
278
278
  def parse_html(html_body)
@@ -8,25 +8,29 @@
8
8
 
9
9
 
10
10
  module Wmap
11
- module Utils
11
+ module Utils
12
12
  # Module to validate and retrieve the top or second level domain name from a host-name (FQDN).
13
- module DomainRoot
13
+ module DomainRoot
14
14
  extend self
15
15
  # Internet Domain Architecture Definitions
16
16
  File_ccsld=File.dirname(__FILE__)+'/../../../dicts/ccsld.txt'
17
17
  File_cctld=File.dirname(__FILE__)+'/../../../dicts/cctld.txt'
18
18
  File_gtld=File.dirname(__FILE__)+'/../../../dicts/gtld.txt'
19
-
19
+ File_tld=File.dirname(__FILE__)+'/../../../dicts/tlds.txt'
20
+
20
21
  # Main function to retrieve the registered domain ('domain root' from the 'registrant' perspective) from a hostname, for example, "www.telegraph.co.uk" -> "telegraph.co.uk"
21
22
  def get_domain_root (host)
22
23
  puts "Retrieve the root domain for host: #{host}" if @verbose
23
24
  begin
25
+ # Complete Top Level Domain List - loading once
26
+ @tlds=file_2_hash(File_tld) if @tlds.nil?
24
27
  # Generic Top Level Domain List - loading once
25
28
  @gtld=file_2_hash(File_gtld) if @gtld.nil?
26
29
  # Country code top-level domain list - loading once
27
30
  @cctld=file_2_hash(File_cctld) if @cctld.nil?
28
31
  # Country code second level domain - loading once
29
32
  @ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil?
33
+
30
34
  if host.strip.nil?
31
35
  puts "Error: empty record found. Please check your input and remove any empty line." if @verbose
32
36
  return nil
@@ -35,15 +39,15 @@ module Wmap
35
39
  end
36
40
  found_tld=false
37
41
  found_cctld=false
38
- # search the general top level domain list first
42
+ # search the top level domain list first
39
43
  root_domain=""
40
44
  dn=host.split(".")
41
- if @gtld.key?(dn.last)
42
- found=false
43
- if @cctld.key?(dn[dn.length-2])
44
- found=true
45
+ if @tlds.key?(dn.last)
46
+ cc_found=false
47
+ if @cctld.key?(dn[dn.length-2])
48
+ cc_found=true
45
49
  end
46
- if found
50
+ if cc_found
47
51
  root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
48
52
  else
49
53
  root_domain=dn[dn.length-2] + "." + dn.last
@@ -51,7 +55,7 @@ module Wmap
51
55
  found_tld=true
52
56
  end
53
57
  # search the country code top level domain list secondly
54
- if @cctld.key?(dn.last)
58
+ if @cctld.key?(dn.last)
55
59
  found=false
56
60
  # reverse search of general top level domain
57
61
  if @gtld.key?(dn[dn.length-2])
@@ -65,8 +69,8 @@ module Wmap
65
69
  break
66
70
  end
67
71
  end
68
- # 1/8/2015: additional logic to handle invalid ccsld string: reserved gtld string
69
- #unless found
72
+ # 1/8/2015: additional logic to handle invalid ccsld string: reserved gtld string
73
+ #unless found
70
74
  # if @gtld.key?(dn[dn.length-2])
71
75
  # puts "Invalid ccsld: #{dn[dn.length-2]} for host: #{host}"
72
76
  # return nil
@@ -77,17 +81,17 @@ module Wmap
77
81
  root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
78
82
  else
79
83
  root_domain=dn[dn.length-2] + "." + dn.last
80
- end
84
+ end
81
85
  found_cctld=true
82
86
  end
83
- unless (found_tld or found_cctld)
87
+ unless (found_tld or found_cctld)
84
88
  puts "#{host} - the top level domain is unknown. Please check out your record #{root_domain} " if @verbose
85
89
  return nil
86
90
  else
87
91
  puts "Domain root found: #{root_domain}" if @verbose
88
92
  return root_domain
89
93
  end
90
- rescue => ee
94
+ rescue => ee
91
95
  puts "Exception on method #{__method__}: #{ee}" if @verbose
92
96
  return nil
93
97
  end
@@ -96,7 +100,7 @@ module Wmap
96
100
  alias_method :root_domain, :get_domain_root
97
101
  alias_method :domain_root, :get_domain_root
98
102
  alias_method :host_2_domain, :get_domain_root
99
-
103
+
100
104
  # 'setter' to parse and load the known country code second level domain table from the file
101
105
  # data structure example: {"uk" =>["co","plc"],"za"=>["mil","nom","org"]}
102
106
  def load_ccsld_from_file (file_ccsld)
@@ -107,10 +111,10 @@ module Wmap
107
111
  f.each do |line|
108
112
  next unless line =~ /^\s+\.\w/
109
113
  line=line.chomp.strip.downcase
110
- entry=line.split(' ')[0].split('.')
114
+ entry=line.split(' ')[0].split('.')
111
115
  if entry.length > 2
112
116
  key=entry.last
113
- ccsld[key] = Array.new if not ccsld.key?(key)
117
+ ccsld[key] = Array.new if not ccsld.key?(key)
114
118
  val=entry[entry.length-2]
115
119
  #puts "Loading country code second level domain table with - Country code: #{key}, Second level domain: #{val}" if @verbose
116
120
  ccsld[key].push(val) unless key.nil?
@@ -119,7 +123,7 @@ module Wmap
119
123
  f.close
120
124
  # Sort the blocks once in descendant order once for better performance
121
125
  return ccsld
122
- rescue => ee
126
+ rescue => ee
123
127
  puts "Exception on method #{__method__}: #{ee}" if @verbose
124
128
  end
125
129
  end
@@ -158,15 +162,15 @@ module Wmap
158
162
  rescue Exception => ee
159
163
  puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
160
164
  return nil
161
- end
165
+ end
162
166
  end
163
167
  alias_method :get_subdomain, :get_sub_domain
164
-
168
+
165
169
  # Function to print instance variable - General top level domain list
166
170
  def print_gtld
167
171
  puts @gtld
168
172
  end
169
-
173
+
170
174
  # Function to print instance variable - Country code top-level domain list
171
175
  def print_cctld
172
176
  puts @cctld
@@ -176,9 +180,9 @@ module Wmap
176
180
  def print_ccsld
177
181
  puts @ccsld
178
182
  end
179
-
183
+
180
184
  private :load_ccsld_from_file
181
-
185
+
182
186
  end
183
187
  end
184
188
  end