wmap 2.4.4 → 2.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.rdoc +27 -20
- data/bin/RHPG +85 -0
- data/bin/trust +5 -5
- data/bin/trusts +38 -0
- data/bin/updateAll +5 -9
- data/bin/wadds +1 -1
- data/bin/wmaps +24 -0
- data/dicts/tlds.txt +1537 -0
- data/lib/wmap/cidr_tracker.rb +22 -15
- data/lib/wmap/host_tracker/primary_host.rb +1 -1
- data/lib/wmap/host_tracker.rb +6 -6
- data/lib/wmap/site_tracker.rb +7 -7
- data/lib/wmap/url_crawler.rb +17 -17
- data/lib/wmap/utils/domain_root.rb +28 -24
- data/lib/wmap/wp_tracker.rb +302 -0
- data/logs/wmap.log +1516 -17
- data/version.txt +4 -4
- data/wmap.gemspec +20 -5
- metadata +179 -14
- data/data/cidrs +0 -2
- data/data/deactivated_sites +0 -1
- data/data/domains +0 -2
- data/data/hosts +0 -1
- data/data/prime_hosts +0 -1
- data/data/sites +0 -2
- data/data/sub_domains +0 -2
- data/lib/wmap.rb +0 -227
data/lib/wmap/cidr_tracker.rb
CHANGED
@@ -64,9 +64,11 @@ class Wmap::CidrTracker
|
|
64
64
|
@known_cidr_blks[key]['netname']=entry[2].nil? ? nil : entry[2].strip
|
65
65
|
end
|
66
66
|
f.close
|
67
|
-
# Sort the blocks in order once for better performance
|
68
|
-
|
69
|
-
|
67
|
+
# Sort the blocks in order once for better performance. Update 10/29/2018 to support Netaddr 2.x syntax
|
68
|
+
#@known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
|
69
|
+
#@known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
|
70
|
+
@known_cidr_blks_asce_index=@known_cidr_blks.keys.sort
|
71
|
+
@known_cidr_blks_desc_index=@known_cidr_blks_asce_index.reverse
|
70
72
|
rescue => ee
|
71
73
|
puts "Exception on method #{__method__}: #{ee}" # if @verbose
|
72
74
|
end
|
@@ -75,7 +77,7 @@ class Wmap::CidrTracker
|
|
75
77
|
# 'setter' to add an entry to CIDR store @known_cidr_blks
|
76
78
|
def add (cidr,ref=nil,netname=nil)
|
77
79
|
puts "Load the entry into the CIDR store: #{cidr}"
|
78
|
-
begin
|
80
|
+
#begin
|
79
81
|
raise "Unknown CIDR format: #{cidr}" unless is_cidr?(cidr)
|
80
82
|
# Obtain the 'ref' and 'netname' value automatically in case not passed as method parameters
|
81
83
|
if ref.nil? or netname.nil?
|
@@ -96,11 +98,13 @@ class Wmap::CidrTracker
|
|
96
98
|
puts "Entry loaded!"
|
97
99
|
end
|
98
100
|
# Re-sort the blocks in order for better performance
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
101
|
+
#@known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
|
102
|
+
#@known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
|
103
|
+
@known_cidr_blks_asce_index=@known_cidr_blks.keys.sort
|
104
|
+
@known_cidr_blks_desc_index=@known_cidr_blks_asce_index.reverse
|
105
|
+
#rescue => ee
|
106
|
+
# puts "Exception on method #{__method__}: #{ee}" # if @verbose
|
107
|
+
#end
|
104
108
|
end
|
105
109
|
|
106
110
|
# 'setter' to remove an entry from the CIDR store @known_cidr_blks
|
@@ -117,8 +121,10 @@ class Wmap::CidrTracker
|
|
117
121
|
raise "Unknown CIDR entry: #{cidr}"
|
118
122
|
end
|
119
123
|
# Re-sort the blocks in order for better performance
|
120
|
-
|
121
|
-
|
124
|
+
#@known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
|
125
|
+
#@known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
|
126
|
+
@known_cidr_blks_asce_index=@known_cidr_blks.keys.sort
|
127
|
+
@known_cidr_blks_desc_index=@known_cidr_blks_asce_index.reverse
|
122
128
|
rescue => ee
|
123
129
|
puts "Exception on method #{__method__}: #{ee}" # if @verbose
|
124
130
|
end
|
@@ -167,6 +173,7 @@ class Wmap::CidrTracker
|
|
167
173
|
@known_cidr_blks_desc_index.each do |line|
|
168
174
|
first_octet_blk = line.split('.').first.to_i
|
169
175
|
next if first_octet_blk > first_octet_ip
|
176
|
+
puts "line: #{line}" if @verbose
|
170
177
|
cidr4 = NetAddr::CIDR.create(line)
|
171
178
|
known = cidr4.contains?(ip+'/32')
|
172
179
|
break if known
|
@@ -276,7 +283,7 @@ class Wmap::CidrTracker
|
|
276
283
|
# Save the current cidr hash table into a file
|
277
284
|
def save_cidrs_to_file!(file_cidrs=@file_cidr_seeds)
|
278
285
|
puts "Saving the current cidrs cache table from memory to file: #{file_cidrs} ..." if @verbose
|
279
|
-
begin
|
286
|
+
#begin
|
280
287
|
timestamp=Time.now
|
281
288
|
f=File.open(file_cidrs, 'w')
|
282
289
|
f.write "# Local cidrs file created by Wmap::CidrTracker.save method at: #{timestamp}\n"
|
@@ -288,9 +295,9 @@ class Wmap::CidrTracker
|
|
288
295
|
end
|
289
296
|
f.close
|
290
297
|
puts "CIDR cache table is successfully saved: #{file_cidrs}"
|
291
|
-
rescue => ee
|
292
|
-
|
293
|
-
end
|
298
|
+
#rescue => ee
|
299
|
+
# puts "Exception on method #{__method__}: #{ee}" if @verbose
|
300
|
+
#end
|
294
301
|
end
|
295
302
|
alias_method :save!, :save_cidrs_to_file!
|
296
303
|
|
@@ -14,7 +14,7 @@ module Wmap
|
|
14
14
|
# Class to differentiate the primary host-name from the potential aliases. This is needed in order to minimize the confusion on our final site inventory list, as it contains a large number of duplicates (aliases). More specifically, a filter could be built by using this class to track the primary url of a website.
|
15
15
|
class PrimaryHost < Wmap::HostTracker
|
16
16
|
include Wmap::Utils
|
17
|
-
include Singleton
|
17
|
+
#include Singleton
|
18
18
|
|
19
19
|
attr_accessor :hosts_file, :verbose, :data_dir
|
20
20
|
attr_reader :known_hosts, :known_ips
|
data/lib/wmap/host_tracker.rb
CHANGED
@@ -42,7 +42,7 @@ class Wmap::HostTracker
|
|
42
42
|
entry=line.chomp.split(%r{\t+|\s+|\,})
|
43
43
|
key=entry[0].downcase
|
44
44
|
value=entry[1]
|
45
|
-
puts "Loading value pair: #{key} - #{value}" if @verbose
|
45
|
+
puts "Loading key value pair: #{key} - #{value}" if @verbose
|
46
46
|
known_hosts[key] = Hash.new unless known_hosts.key?(key)
|
47
47
|
known_hosts[key]= value
|
48
48
|
# For reverse host lookup
|
@@ -103,7 +103,7 @@ class Wmap::HostTracker
|
|
103
103
|
# Setter to add a host entry to the cache, one at a time
|
104
104
|
def add(host)
|
105
105
|
puts "Add entry to the local host repository: #{host}"
|
106
|
-
begin
|
106
|
+
#begin
|
107
107
|
host=host.strip.downcase unless host.nil?
|
108
108
|
unless @known_hosts.key?(host)
|
109
109
|
ip=host_2_ip(host)
|
@@ -137,9 +137,9 @@ class Wmap::HostTracker
|
|
137
137
|
else
|
138
138
|
puts "Host is already exist. Skip: #{host}"
|
139
139
|
end
|
140
|
-
rescue => ee
|
141
|
-
|
142
|
-
end
|
140
|
+
#rescue => ee
|
141
|
+
# puts "Exception on method #{__method__}: #{ee}" if @verbose
|
142
|
+
#end
|
143
143
|
end
|
144
144
|
|
145
145
|
# Setter to add host entry to the local hosts in batch (from an array)
|
@@ -521,7 +521,7 @@ class Wmap::HostTracker
|
|
521
521
|
entry=line.chomp.split(%r{\t+|\s+|\,})
|
522
522
|
key=entry[0].downcase
|
523
523
|
value=entry[1]
|
524
|
-
puts "Loading value pair: #{key} - #{value}" if @verbose
|
524
|
+
puts "Loading key value pair: #{key} - #{value}" if @verbose
|
525
525
|
host_store[key] = Hash.new unless known_hosts.key?(key)
|
526
526
|
host_store[key]= value
|
527
527
|
end
|
data/lib/wmap/site_tracker.rb
CHANGED
@@ -180,7 +180,7 @@ class Wmap::SiteTracker
|
|
180
180
|
if my_tracker.host_known?(host)
|
181
181
|
old_ip=my_tracker.local_host_2_ip(host)
|
182
182
|
if old_ip != ip
|
183
|
-
my_tracker.
|
183
|
+
my_tracker.refresh(host)
|
184
184
|
my_tracker.save!
|
185
185
|
else
|
186
186
|
puts "Host resolve to the same IP #{ip} - no need to update the local host table." if @verbose
|
@@ -205,7 +205,7 @@ class Wmap::SiteTracker
|
|
205
205
|
if my_tracker.host_known?(host)
|
206
206
|
old_ip=my_tracker.local_host_2_ip(host)
|
207
207
|
if old_ip != ip
|
208
|
-
my_tracker.
|
208
|
+
my_tracker.refresh(host)
|
209
209
|
my_tracker.save!
|
210
210
|
else
|
211
211
|
# Skip - no need to update the local hosts table
|
@@ -582,7 +582,7 @@ class Wmap::SiteTracker
|
|
582
582
|
host=url_2_host(key)
|
583
583
|
md5=@known_sites[key]['md5']
|
584
584
|
code=@known_sites[key]['code']
|
585
|
-
ip=
|
585
|
+
ip=my_tracker.local_host_2_ip(host)
|
586
586
|
ip=host_2_ip(host) if ip.nil?
|
587
587
|
# filtering out 'un-reachable' sites
|
588
588
|
next if (code == 10000 or code == 20000)
|
@@ -835,7 +835,7 @@ class Wmap::SiteTracker
|
|
835
835
|
# Retrieve the unique sites from the local site store in the primary host format
|
836
836
|
def get_prim_uniq_sites
|
837
837
|
puts "Retrieve and prime unique sites in the site store. " if @verbose
|
838
|
-
begin
|
838
|
+
#begin
|
839
839
|
host_tracker=Wmap::HostTracker.new(:data_dir=>@data_dir)
|
840
840
|
primary_host_tracker=Wmap::HostTracker::PrimaryHost.new(:data_dir=>@data_dir)
|
841
841
|
# Step 1. Retrieve the unique site list first
|
@@ -884,9 +884,9 @@ class Wmap::SiteTracker
|
|
884
884
|
primary_host_tracker=nil
|
885
885
|
host_tracker=nil
|
886
886
|
return prim_uniq_sites
|
887
|
-
rescue => ee
|
888
|
-
|
889
|
-
end
|
887
|
+
#rescue => ee
|
888
|
+
# puts "Exception on method #{__method__}: #{ee}"
|
889
|
+
#end
|
890
890
|
end
|
891
891
|
alias_method :get_prime, :get_prim_uniq_sites
|
892
892
|
|
data/lib/wmap/url_crawler.rb
CHANGED
@@ -31,7 +31,7 @@ class Wmap::UrlCrawler
|
|
31
31
|
# Crawler instance default variables
|
32
32
|
def initialize (params = {})
|
33
33
|
@verbose=params.fetch(:verbose, false)
|
34
|
-
@data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../
|
34
|
+
@data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../data/')
|
35
35
|
@http_timeout=params.fetch(:http_timeout, 5000)
|
36
36
|
@crawl_depth=params.fetch(:crawl_depth, 4)
|
37
37
|
@crawl_page_limit=params.fetch(:crawl_page_limit, 1000)
|
@@ -224,11 +224,11 @@ class Wmap::UrlCrawler
|
|
224
224
|
alias_method :query_file, :crawl_workers_on_file
|
225
225
|
alias_method :crawl_file, :crawl_workers_on_file
|
226
226
|
|
227
|
-
|
227
|
+
# Wrapper for the OpenURI open method - create an open_uri object and return the reference upon success
|
228
228
|
def open_url(url)
|
229
|
-
puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
|
230
229
|
#url_object = nil
|
231
|
-
|
230
|
+
begin
|
231
|
+
puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
|
232
232
|
if url =~ /http\:/i
|
233
233
|
# patch to allow the 'un-safe' URL redirection i.e. https://www.example.com -> http://www.example.com
|
234
234
|
url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000)
|
@@ -240,24 +240,24 @@ class Wmap::UrlCrawler
|
|
240
240
|
raise "Invalid URL format - please specify the protocol prefix http(s) in the URL: #{url}"
|
241
241
|
end
|
242
242
|
return url_object
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
end
|
243
|
+
rescue => ee
|
244
|
+
puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
245
|
+
return nil
|
247
246
|
end
|
247
|
+
end
|
248
248
|
|
249
249
|
# Wrapper to use OpenURI method 'read' to return url body contents
|
250
250
|
def read_url(url)
|
251
|
-
puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
|
252
251
|
begin
|
252
|
+
puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
|
253
253
|
url_object=open_url(url)
|
254
254
|
@visited_urls_by_crawler[url]=true unless @visited_urls_by_crawler.key?(url)
|
255
255
|
body=url_object.read
|
256
256
|
return body
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
257
|
+
rescue => ee
|
258
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
259
|
+
return nil
|
260
|
+
end
|
261
261
|
end
|
262
262
|
|
263
263
|
# Return the destination url in case of url re-direct
|
@@ -268,11 +268,11 @@ class Wmap::UrlCrawler
|
|
268
268
|
return url_object.base_uri.to_s
|
269
269
|
end
|
270
270
|
return url
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
end
|
271
|
+
rescue => ee
|
272
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
273
|
+
return nil
|
275
274
|
end
|
275
|
+
end
|
276
276
|
|
277
277
|
# Wrapper for the Nokogiri DOM parser
|
278
278
|
def parse_html(html_body)
|
@@ -8,25 +8,29 @@
|
|
8
8
|
|
9
9
|
|
10
10
|
module Wmap
|
11
|
-
module Utils
|
11
|
+
module Utils
|
12
12
|
# Module to validate and retrieve the top or second level domain name from a host-name (FQDN).
|
13
|
-
module DomainRoot
|
13
|
+
module DomainRoot
|
14
14
|
extend self
|
15
15
|
# Internet Domain Architecture Definitions
|
16
16
|
File_ccsld=File.dirname(__FILE__)+'/../../../dicts/ccsld.txt'
|
17
17
|
File_cctld=File.dirname(__FILE__)+'/../../../dicts/cctld.txt'
|
18
18
|
File_gtld=File.dirname(__FILE__)+'/../../../dicts/gtld.txt'
|
19
|
-
|
19
|
+
File_tld=File.dirname(__FILE__)+'/../../../dicts/tlds.txt'
|
20
|
+
|
20
21
|
# Main function to retrieve the registered domain ('domain root' from the 'registrant' perspective) from a hostname, for example, "www.telegraph.co.uk" -> "telegraph.co.uk"
|
21
22
|
def get_domain_root (host)
|
22
23
|
puts "Retrieve the root domain for host: #{host}" if @verbose
|
23
24
|
begin
|
25
|
+
# Complete Top Level Domain List - loading once
|
26
|
+
@tlds=file_2_hash(File_tld) if @tlds.nil?
|
24
27
|
# Generic Top Level Domain List - loading once
|
25
28
|
@gtld=file_2_hash(File_gtld) if @gtld.nil?
|
26
29
|
# Country code top-level domain list - loading once
|
27
30
|
@cctld=file_2_hash(File_cctld) if @cctld.nil?
|
28
31
|
# Country code second level domain - loading once
|
29
32
|
@ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil?
|
33
|
+
|
30
34
|
if host.strip.nil?
|
31
35
|
puts "Error: empty record found. Please check your input and remove any empty line." if @verbose
|
32
36
|
return nil
|
@@ -35,15 +39,15 @@ module Wmap
|
|
35
39
|
end
|
36
40
|
found_tld=false
|
37
41
|
found_cctld=false
|
38
|
-
# search the
|
42
|
+
# search the top level domain list first
|
39
43
|
root_domain=""
|
40
44
|
dn=host.split(".")
|
41
|
-
if @
|
42
|
-
|
43
|
-
if @cctld.key?(dn[dn.length-2])
|
44
|
-
|
45
|
+
if @tlds.key?(dn.last)
|
46
|
+
cc_found=false
|
47
|
+
if @cctld.key?(dn[dn.length-2])
|
48
|
+
cc_found=true
|
45
49
|
end
|
46
|
-
if
|
50
|
+
if cc_found
|
47
51
|
root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
|
48
52
|
else
|
49
53
|
root_domain=dn[dn.length-2] + "." + dn.last
|
@@ -51,7 +55,7 @@ module Wmap
|
|
51
55
|
found_tld=true
|
52
56
|
end
|
53
57
|
# search the country code top level domain list secondly
|
54
|
-
if @cctld.key?(dn.last)
|
58
|
+
if @cctld.key?(dn.last)
|
55
59
|
found=false
|
56
60
|
# reverse search of general top level domain
|
57
61
|
if @gtld.key?(dn[dn.length-2])
|
@@ -65,8 +69,8 @@ module Wmap
|
|
65
69
|
break
|
66
70
|
end
|
67
71
|
end
|
68
|
-
# 1/8/2015: additional logic to handle invalid ccsld string: reserved gtld string
|
69
|
-
#unless found
|
72
|
+
# 1/8/2015: additional logic to handle invalid ccsld string: reserved gtld string
|
73
|
+
#unless found
|
70
74
|
# if @gtld.key?(dn[dn.length-2])
|
71
75
|
# puts "Invalid ccsld: #{dn[dn.length-2]} for host: #{host}"
|
72
76
|
# return nil
|
@@ -77,17 +81,17 @@ module Wmap
|
|
77
81
|
root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
|
78
82
|
else
|
79
83
|
root_domain=dn[dn.length-2] + "." + dn.last
|
80
|
-
end
|
84
|
+
end
|
81
85
|
found_cctld=true
|
82
86
|
end
|
83
|
-
unless (found_tld or found_cctld)
|
87
|
+
unless (found_tld or found_cctld)
|
84
88
|
puts "#{host} - the top level domain is unknown. Please check out your record #{root_domain} " if @verbose
|
85
89
|
return nil
|
86
90
|
else
|
87
91
|
puts "Domain root found: #{root_domain}" if @verbose
|
88
92
|
return root_domain
|
89
93
|
end
|
90
|
-
rescue => ee
|
94
|
+
rescue => ee
|
91
95
|
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
92
96
|
return nil
|
93
97
|
end
|
@@ -96,7 +100,7 @@ module Wmap
|
|
96
100
|
alias_method :root_domain, :get_domain_root
|
97
101
|
alias_method :domain_root, :get_domain_root
|
98
102
|
alias_method :host_2_domain, :get_domain_root
|
99
|
-
|
103
|
+
|
100
104
|
# 'setter' to parse and load the known country code second level domain table from the file
|
101
105
|
# data structure example: {"uk" =>["co","plc"],"za"=>["mil","nom","org"]}
|
102
106
|
def load_ccsld_from_file (file_ccsld)
|
@@ -107,10 +111,10 @@ module Wmap
|
|
107
111
|
f.each do |line|
|
108
112
|
next unless line =~ /^\s+\.\w/
|
109
113
|
line=line.chomp.strip.downcase
|
110
|
-
entry=line.split(' ')[0].split('.')
|
114
|
+
entry=line.split(' ')[0].split('.')
|
111
115
|
if entry.length > 2
|
112
116
|
key=entry.last
|
113
|
-
ccsld[key] = Array.new if not ccsld.key?(key)
|
117
|
+
ccsld[key] = Array.new if not ccsld.key?(key)
|
114
118
|
val=entry[entry.length-2]
|
115
119
|
#puts "Loading country code second level domain table with - Country code: #{key}, Second level domain: #{val}" if @verbose
|
116
120
|
ccsld[key].push(val) unless key.nil?
|
@@ -119,7 +123,7 @@ module Wmap
|
|
119
123
|
f.close
|
120
124
|
# Sort the blocks once in descending order for better performance
|
121
125
|
return ccsld
|
122
|
-
rescue => ee
|
126
|
+
rescue => ee
|
123
127
|
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
124
128
|
end
|
125
129
|
end
|
@@ -158,15 +162,15 @@ module Wmap
|
|
158
162
|
rescue Exception => ee
|
159
163
|
puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
|
160
164
|
return nil
|
161
|
-
end
|
165
|
+
end
|
162
166
|
end
|
163
167
|
alias_method :get_subdomain, :get_sub_domain
|
164
|
-
|
168
|
+
|
165
169
|
# Function to print instance variable - General top level domain list
|
166
170
|
def print_gtld
|
167
171
|
puts @gtld
|
168
172
|
end
|
169
|
-
|
173
|
+
|
170
174
|
# Function to print instance variable - Country code top-level domain list
|
171
175
|
def print_cctld
|
172
176
|
puts @cctld
|
@@ -176,9 +180,9 @@ module Wmap
|
|
176
180
|
def print_ccsld
|
177
181
|
puts @ccsld
|
178
182
|
end
|
179
|
-
|
183
|
+
|
180
184
|
private :load_ccsld_from_file
|
181
|
-
|
185
|
+
|
182
186
|
end
|
183
187
|
end
|
184
188
|
end
|