wmap 2.4.4 → 2.4.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/README.rdoc +27 -20
- data/bin/RHPG +85 -0
- data/bin/trust +5 -5
- data/bin/trusts +38 -0
- data/bin/updateAll +5 -9
- data/bin/wadds +1 -1
- data/bin/wmaps +24 -0
- data/dicts/tlds.txt +1537 -0
- data/lib/wmap/cidr_tracker.rb +22 -15
- data/lib/wmap/host_tracker/primary_host.rb +1 -1
- data/lib/wmap/host_tracker.rb +6 -6
- data/lib/wmap/site_tracker.rb +7 -7
- data/lib/wmap/url_crawler.rb +17 -17
- data/lib/wmap/utils/domain_root.rb +28 -24
- data/lib/wmap/wp_tracker.rb +302 -0
- data/logs/wmap.log +1516 -17
- data/version.txt +4 -4
- data/wmap.gemspec +20 -5
- metadata +179 -14
- data/data/cidrs +0 -2
- data/data/deactivated_sites +0 -1
- data/data/domains +0 -2
- data/data/hosts +0 -1
- data/data/prime_hosts +0 -1
- data/data/sites +0 -2
- data/data/sub_domains +0 -2
- data/lib/wmap.rb +0 -227
data/lib/wmap/cidr_tracker.rb
CHANGED
@@ -64,9 +64,11 @@ class Wmap::CidrTracker
|
|
64
64
|
@known_cidr_blks[key]['netname']=entry[2].nil? ? nil : entry[2].strip
|
65
65
|
end
|
66
66
|
f.close
|
67
|
-
# Sort the blocks in order once for better performance
|
68
|
-
|
69
|
-
|
67
|
+
# Sort the blocks in order once for better performance. Update 10/29/2018 to support Netaddr 2.x syntax
|
68
|
+
#@known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
|
69
|
+
#@known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
|
70
|
+
@known_cidr_blks_asce_index=@known_cidr_blks.keys.sort
|
71
|
+
@known_cidr_blks_desc_index=@known_cidr_blks_asce_index.reverse
|
70
72
|
rescue => ee
|
71
73
|
puts "Exception on method #{__method__}: #{ee}" # if @verbose
|
72
74
|
end
|
@@ -75,7 +77,7 @@ class Wmap::CidrTracker
|
|
75
77
|
# 'setter' to add an entry to CIDR store @known_cidr_blks
|
76
78
|
def add (cidr,ref=nil,netname=nil)
|
77
79
|
puts "Load the entry into the CIDR store: #{cidr}"
|
78
|
-
begin
|
80
|
+
#begin
|
79
81
|
raise "Unknown CIDR format: #{cidr}" unless is_cidr?(cidr)
|
80
82
|
# Obtain the 'ref' and 'netname' value automatically in case not passed as method parameters
|
81
83
|
if ref.nil? or netname.nil?
|
@@ -96,11 +98,13 @@ class Wmap::CidrTracker
|
|
96
98
|
puts "Entry loaded!"
|
97
99
|
end
|
98
100
|
# Re-sort the blocks in order for better performance
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
101
|
+
#@known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
|
102
|
+
#@known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
|
103
|
+
@known_cidr_blks_asce_index=@known_cidr_blks.keys.sort
|
104
|
+
@known_cidr_blks_desc_index=@known_cidr_blks_asce_index.reverse
|
105
|
+
#rescue => ee
|
106
|
+
# puts "Exception on method #{__method__}: #{ee}" # if @verbose
|
107
|
+
#end
|
104
108
|
end
|
105
109
|
|
106
110
|
# 'setter' to remove an entry from the CIDR store @known_cidr_blks
|
@@ -117,8 +121,10 @@ class Wmap::CidrTracker
|
|
117
121
|
raise "Unknown CIDR entry: #{cidr}"
|
118
122
|
end
|
119
123
|
# Re-sort the blocks in order for better performance
|
120
|
-
|
121
|
-
|
124
|
+
#@known_cidr_blks_desc_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>true)
|
125
|
+
#@known_cidr_blks_asce_index=NetAddr.sort(@known_cidr_blks.keys, :Desc=>false)
|
126
|
+
@known_cidr_blks_asce_index=@known_cidr_blks.keys.sort
|
127
|
+
@known_cidr_blks_desc_index=@known_cidr_blks_asce_index.reverse
|
122
128
|
rescue => ee
|
123
129
|
puts "Exception on method #{__method__}: #{ee}" # if @verbose
|
124
130
|
end
|
@@ -167,6 +173,7 @@ class Wmap::CidrTracker
|
|
167
173
|
@known_cidr_blks_desc_index.each do |line|
|
168
174
|
first_octet_blk = line.split('.').first.to_i
|
169
175
|
next if first_octet_blk > first_octet_ip
|
176
|
+
puts "line: #{line}" if @verbose
|
170
177
|
cidr4 = NetAddr::CIDR.create(line)
|
171
178
|
known = cidr4.contains?(ip+'/32')
|
172
179
|
break if known
|
@@ -276,7 +283,7 @@ class Wmap::CidrTracker
|
|
276
283
|
# Save the current cidr hash table into a file
|
277
284
|
def save_cidrs_to_file!(file_cidrs=@file_cidr_seeds)
|
278
285
|
puts "Saving the current cidrs cache table from memory to file: #{file_cidrs} ..." if @verbose
|
279
|
-
begin
|
286
|
+
#begin
|
280
287
|
timestamp=Time.now
|
281
288
|
f=File.open(file_cidrs, 'w')
|
282
289
|
f.write "# Local cidrs file created by Wmap::CidrTracker.save method at: #{timestamp}\n"
|
@@ -288,9 +295,9 @@ class Wmap::CidrTracker
|
|
288
295
|
end
|
289
296
|
f.close
|
290
297
|
puts "CIDR cache table is successfully saved: #{file_cidrs}"
|
291
|
-
rescue => ee
|
292
|
-
|
293
|
-
end
|
298
|
+
#rescue => ee
|
299
|
+
# puts "Exception on method #{__method__}: #{ee}" if @verbose
|
300
|
+
#end
|
294
301
|
end
|
295
302
|
alias_method :save!, :save_cidrs_to_file!
|
296
303
|
|
@@ -14,7 +14,7 @@ module Wmap
|
|
14
14
|
# Class to differentiate the primary host-name from the potential aliases. This is needed in order to minimize the confusion on our final site inventory list, as it contains a large number of duplicates (aliases). More specifically, a filter could be built by using this class to track the primary url of a website.
|
15
15
|
class PrimaryHost < Wmap::HostTracker
|
16
16
|
include Wmap::Utils
|
17
|
-
include Singleton
|
17
|
+
#include Singleton
|
18
18
|
|
19
19
|
attr_accessor :hosts_file, :verbose, :data_dir
|
20
20
|
attr_reader :known_hosts, :known_ips
|
data/lib/wmap/host_tracker.rb
CHANGED
@@ -42,7 +42,7 @@ class Wmap::HostTracker
|
|
42
42
|
entry=line.chomp.split(%r{\t+|\s+|\,})
|
43
43
|
key=entry[0].downcase
|
44
44
|
value=entry[1]
|
45
|
-
puts "Loading value pair: #{key} - #{value}" if @verbose
|
45
|
+
puts "Loading key value pair: #{key} - #{value}" if @verbose
|
46
46
|
known_hosts[key] = Hash.new unless known_hosts.key?(key)
|
47
47
|
known_hosts[key]= value
|
48
48
|
# For reverse host lookup
|
@@ -103,7 +103,7 @@ class Wmap::HostTracker
|
|
103
103
|
# Setter to add a host entry to the cache, one at a time
|
104
104
|
def add(host)
|
105
105
|
puts "Add entry to the local host repository: #{host}"
|
106
|
-
begin
|
106
|
+
#begin
|
107
107
|
host=host.strip.downcase unless host.nil?
|
108
108
|
unless @known_hosts.key?(host)
|
109
109
|
ip=host_2_ip(host)
|
@@ -137,9 +137,9 @@ class Wmap::HostTracker
|
|
137
137
|
else
|
138
138
|
puts "Host is already exist. Skip: #{host}"
|
139
139
|
end
|
140
|
-
rescue => ee
|
141
|
-
|
142
|
-
end
|
140
|
+
#rescue => ee
|
141
|
+
# puts "Exception on method #{__method__}: #{ee}" if @verbose
|
142
|
+
#end
|
143
143
|
end
|
144
144
|
|
145
145
|
# Setter to add host entry to the local hosts in batch (from an array)
|
@@ -521,7 +521,7 @@ class Wmap::HostTracker
|
|
521
521
|
entry=line.chomp.split(%r{\t+|\s+|\,})
|
522
522
|
key=entry[0].downcase
|
523
523
|
value=entry[1]
|
524
|
-
puts "Loading value pair: #{key} - #{value}" if @verbose
|
524
|
+
puts "Loading key value pair: #{key} - #{value}" if @verbose
|
525
525
|
host_store[key] = Hash.new unless known_hosts.key?(key)
|
526
526
|
host_store[key]= value
|
527
527
|
end
|
data/lib/wmap/site_tracker.rb
CHANGED
@@ -180,7 +180,7 @@ class Wmap::SiteTracker
|
|
180
180
|
if my_tracker.host_known?(host)
|
181
181
|
old_ip=my_tracker.local_host_2_ip(host)
|
182
182
|
if old_ip != ip
|
183
|
-
my_tracker.
|
183
|
+
my_tracker.refresh(host)
|
184
184
|
my_tracker.save!
|
185
185
|
else
|
186
186
|
puts "Host resolve to the same IP #{ip} - no need to update the local host table." if @verbose
|
@@ -205,7 +205,7 @@ class Wmap::SiteTracker
|
|
205
205
|
if my_tracker.host_known?(host)
|
206
206
|
old_ip=my_tracker.local_host_2_ip(host)
|
207
207
|
if old_ip != ip
|
208
|
-
my_tracker.
|
208
|
+
my_tracker.refresh(host)
|
209
209
|
my_tracker.save!
|
210
210
|
else
|
211
211
|
# Skip - no need to update the local hosts table
|
@@ -582,7 +582,7 @@ class Wmap::SiteTracker
|
|
582
582
|
host=url_2_host(key)
|
583
583
|
md5=@known_sites[key]['md5']
|
584
584
|
code=@known_sites[key]['code']
|
585
|
-
ip=
|
585
|
+
ip=my_tracker.local_host_2_ip(host)
|
586
586
|
ip=host_2_ip(host) if ip.nil?
|
587
587
|
# filtering out 'un-reachable' sites
|
588
588
|
next if (code == 10000 or code == 20000)
|
@@ -835,7 +835,7 @@ class Wmap::SiteTracker
|
|
835
835
|
# Retrieve the unique sites from the local site store in the primary host format
|
836
836
|
def get_prim_uniq_sites
|
837
837
|
puts "Retrieve and prime unique sites in the site store. " if @verbose
|
838
|
-
begin
|
838
|
+
#begin
|
839
839
|
host_tracker=Wmap::HostTracker.new(:data_dir=>@data_dir)
|
840
840
|
primary_host_tracker=Wmap::HostTracker::PrimaryHost.new(:data_dir=>@data_dir)
|
841
841
|
# Step 1. Retrieve the unique site list first
|
@@ -884,9 +884,9 @@ class Wmap::SiteTracker
|
|
884
884
|
primary_host_tracker=nil
|
885
885
|
host_tracker=nil
|
886
886
|
return prim_uniq_sites
|
887
|
-
rescue => ee
|
888
|
-
|
889
|
-
end
|
887
|
+
#rescue => ee
|
888
|
+
# puts "Exception on method #{__method__}: #{ee}"
|
889
|
+
#end
|
890
890
|
end
|
891
891
|
alias_method :get_prime, :get_prim_uniq_sites
|
892
892
|
|
data/lib/wmap/url_crawler.rb
CHANGED
@@ -31,7 +31,7 @@ class Wmap::UrlCrawler
|
|
31
31
|
# Crawler instance default variables
|
32
32
|
def initialize (params = {})
|
33
33
|
@verbose=params.fetch(:verbose, false)
|
34
|
-
@data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../
|
34
|
+
@data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../data/')
|
35
35
|
@http_timeout=params.fetch(:http_timeout, 5000)
|
36
36
|
@crawl_depth=params.fetch(:crawl_depth, 4)
|
37
37
|
@crawl_page_limit=params.fetch(:crawl_page_limit, 1000)
|
@@ -224,11 +224,11 @@ class Wmap::UrlCrawler
|
|
224
224
|
alias_method :query_file, :crawl_workers_on_file
|
225
225
|
alias_method :crawl_file, :crawl_workers_on_file
|
226
226
|
|
227
|
-
|
227
|
+
# Wrapper for the OpenURI open method - create an open_uri object and return the reference upon success
|
228
228
|
def open_url(url)
|
229
|
-
puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
|
230
229
|
#url_object = nil
|
231
|
-
|
230
|
+
begin
|
231
|
+
puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
|
232
232
|
if url =~ /http\:/i
|
233
233
|
# patch to allow the 'un-safe' URL redirection, i.e. https://www.example.com -> http://www.example.com
|
234
234
|
url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000)
|
@@ -240,24 +240,24 @@ class Wmap::UrlCrawler
|
|
240
240
|
raise "Invalid URL format - please specify the protocol prefix http(s) in the URL: #{url}"
|
241
241
|
end
|
242
242
|
return url_object
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
end
|
243
|
+
rescue => ee
|
244
|
+
puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
245
|
+
return nil
|
247
246
|
end
|
247
|
+
end
|
248
248
|
|
249
249
|
# Wrapper to use OpenURI method 'read' to return url body contents
|
250
250
|
def read_url(url)
|
251
|
-
puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
|
252
251
|
begin
|
252
|
+
puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
|
253
253
|
url_object=open_url(url)
|
254
254
|
@visited_urls_by_crawler[url]=true unless @visited_urls_by_crawler.key?(url)
|
255
255
|
body=url_object.read
|
256
256
|
return body
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
257
|
+
rescue => ee
|
258
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
259
|
+
return nil
|
260
|
+
end
|
261
261
|
end
|
262
262
|
|
263
263
|
# Return the destination url in case of url re-direct
|
@@ -268,11 +268,11 @@ class Wmap::UrlCrawler
|
|
268
268
|
return url_object.base_uri.to_s
|
269
269
|
end
|
270
270
|
return url
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
end
|
271
|
+
rescue => ee
|
272
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
273
|
+
return nil
|
275
274
|
end
|
275
|
+
end
|
276
276
|
|
277
277
|
# Wrapper for the Nokogiri DOM parser
|
278
278
|
def parse_html(html_body)
|
@@ -8,25 +8,29 @@
|
|
8
8
|
|
9
9
|
|
10
10
|
module Wmap
|
11
|
-
module Utils
|
11
|
+
module Utils
|
12
12
|
# Module to validate and retrieve the top or second level domain name from a host-name (FQDN).
|
13
|
-
module DomainRoot
|
13
|
+
module DomainRoot
|
14
14
|
extend self
|
15
15
|
# Internet Domain Architecture Definitions
|
16
16
|
File_ccsld=File.dirname(__FILE__)+'/../../../dicts/ccsld.txt'
|
17
17
|
File_cctld=File.dirname(__FILE__)+'/../../../dicts/cctld.txt'
|
18
18
|
File_gtld=File.dirname(__FILE__)+'/../../../dicts/gtld.txt'
|
19
|
-
|
19
|
+
File_tld=File.dirname(__FILE__)+'/../../../dicts/tlds.txt'
|
20
|
+
|
20
21
|
# Main function to retrieve the registered domain ('domain root' from the 'registrant' perspective) from a hostname, for example, "www.telegraph.co.uk" -> "telegraph.co.uk"
|
21
22
|
def get_domain_root (host)
|
22
23
|
puts "Retrieve the root domain for host: #{host}" if @verbose
|
23
24
|
begin
|
25
|
+
# Complete Top Level Domain List - loading once
|
26
|
+
@tlds=file_2_hash(File_tld) if @tlds.nil?
|
24
27
|
# Generic Top Level Domain List - loading once
|
25
28
|
@gtld=file_2_hash(File_gtld) if @gtld.nil?
|
26
29
|
# Country code top-level domain list - loading once
|
27
30
|
@cctld=file_2_hash(File_cctld) if @cctld.nil?
|
28
31
|
# Country code second level domain - loading once
|
29
32
|
@ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil?
|
33
|
+
|
30
34
|
if host.strip.nil?
|
31
35
|
puts "Error: empty record found. Please check your input and remove any empty line." if @verbose
|
32
36
|
return nil
|
@@ -35,15 +39,15 @@ module Wmap
|
|
35
39
|
end
|
36
40
|
found_tld=false
|
37
41
|
found_cctld=false
|
38
|
-
# search the
|
42
|
+
# search the top level domain list first
|
39
43
|
root_domain=""
|
40
44
|
dn=host.split(".")
|
41
|
-
if @
|
42
|
-
|
43
|
-
if @cctld.key?(dn[dn.length-2])
|
44
|
-
|
45
|
+
if @tlds.key?(dn.last)
|
46
|
+
cc_found=false
|
47
|
+
if @cctld.key?(dn[dn.length-2])
|
48
|
+
cc_found=true
|
45
49
|
end
|
46
|
-
if
|
50
|
+
if cc_found
|
47
51
|
root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
|
48
52
|
else
|
49
53
|
root_domain=dn[dn.length-2] + "." + dn.last
|
@@ -51,7 +55,7 @@ module Wmap
|
|
51
55
|
found_tld=true
|
52
56
|
end
|
53
57
|
# search the country code top level domain list secondly
|
54
|
-
if @cctld.key?(dn.last)
|
58
|
+
if @cctld.key?(dn.last)
|
55
59
|
found=false
|
56
60
|
# reverse search of general top level domain
|
57
61
|
if @gtld.key?(dn[dn.length-2])
|
@@ -65,8 +69,8 @@ module Wmap
|
|
65
69
|
break
|
66
70
|
end
|
67
71
|
end
|
68
|
-
# 1/8/2015: additional logic to handle invalid ccsld string: reserved gtld string
|
69
|
-
#unless found
|
72
|
+
# 1/8/2015: additional logic to handle invalid ccsld string: reserved gtld string
|
73
|
+
#unless found
|
70
74
|
# if @gtld.key?(dn[dn.length-2])
|
71
75
|
# puts "Invalid ccsld: #{dn[dn.length-2]} for host: #{host}"
|
72
76
|
# return nil
|
@@ -77,17 +81,17 @@ module Wmap
|
|
77
81
|
root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
|
78
82
|
else
|
79
83
|
root_domain=dn[dn.length-2] + "." + dn.last
|
80
|
-
end
|
84
|
+
end
|
81
85
|
found_cctld=true
|
82
86
|
end
|
83
|
-
unless (found_tld or found_cctld)
|
87
|
+
unless (found_tld or found_cctld)
|
84
88
|
puts "#{host} - the top level domain is unknown. Please check out your record #{root_domain} " if @verbose
|
85
89
|
return nil
|
86
90
|
else
|
87
91
|
puts "Domain root found: #{root_domain}" if @verbose
|
88
92
|
return root_domain
|
89
93
|
end
|
90
|
-
rescue => ee
|
94
|
+
rescue => ee
|
91
95
|
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
92
96
|
return nil
|
93
97
|
end
|
@@ -96,7 +100,7 @@ module Wmap
|
|
96
100
|
alias_method :root_domain, :get_domain_root
|
97
101
|
alias_method :domain_root, :get_domain_root
|
98
102
|
alias_method :host_2_domain, :get_domain_root
|
99
|
-
|
103
|
+
|
100
104
|
# 'setter' to parse and load the known country code second level domain table from the file
|
101
105
|
# data structure example: {"uk" =>["co","plc"],"za"=>["mil","nom","org"]}
|
102
106
|
def load_ccsld_from_file (file_ccsld)
|
@@ -107,10 +111,10 @@ module Wmap
|
|
107
111
|
f.each do |line|
|
108
112
|
next unless line =~ /^\s+\.\w/
|
109
113
|
line=line.chomp.strip.downcase
|
110
|
-
entry=line.split(' ')[0].split('.')
|
114
|
+
entry=line.split(' ')[0].split('.')
|
111
115
|
if entry.length > 2
|
112
116
|
key=entry.last
|
113
|
-
ccsld[key] = Array.new if not ccsld.key?(key)
|
117
|
+
ccsld[key] = Array.new if not ccsld.key?(key)
|
114
118
|
val=entry[entry.length-2]
|
115
119
|
#puts "Loading country code second level domain table with - Country code: #{key}, Second level domain: #{val}" if @verbose
|
116
120
|
ccsld[key].push(val) unless key.nil?
|
@@ -119,7 +123,7 @@ module Wmap
|
|
119
123
|
f.close
|
120
124
|
# Sort the blocks once in descending order for better performance
|
121
125
|
return ccsld
|
122
|
-
rescue => ee
|
126
|
+
rescue => ee
|
123
127
|
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
124
128
|
end
|
125
129
|
end
|
@@ -158,15 +162,15 @@ module Wmap
|
|
158
162
|
rescue Exception => ee
|
159
163
|
puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
|
160
164
|
return nil
|
161
|
-
end
|
165
|
+
end
|
162
166
|
end
|
163
167
|
alias_method :get_subdomain, :get_sub_domain
|
164
|
-
|
168
|
+
|
165
169
|
# Function to print instance variable - General top level domain list
|
166
170
|
def print_gtld
|
167
171
|
puts @gtld
|
168
172
|
end
|
169
|
-
|
173
|
+
|
170
174
|
# Function to print instance variable - Country code top-level domain list
|
171
175
|
def print_cctld
|
172
176
|
puts @cctld
|
@@ -176,9 +180,9 @@ module Wmap
|
|
176
180
|
def print_ccsld
|
177
181
|
puts @ccsld
|
178
182
|
end
|
179
|
-
|
183
|
+
|
180
184
|
private :load_ccsld_from_file
|
181
|
-
|
185
|
+
|
182
186
|
end
|
183
187
|
end
|
184
188
|
end
|