wmap 2.6.6 → 2.6.7
This diff shows the contents of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- checksums.yaml +4 -4
- data/bin/wmap +12 -15
- data/lib/wmap/host_tracker.rb +3 -3
- data/lib/wmap/site_tracker.rb +2 -2
- data/lib/wmap/url_crawler/adware_tag.rb +5 -7
- data/lib/wmap/url_crawler.rb +206 -228
- data/version.txt +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2e4f2a2dfe9b4b119331eefffc7b9b025d9953c2ce5f7255e4d2a08929a591c3
+  data.tar.gz: 3d018d69469cf4e4551b38397657341661fd95c3f59bebe8bb21405d4e107881
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7e7d27b4d4abfc34ab3df0933412b4c99e94af93f71251a2e7a0706b4782ad62a2541dbf7c5f391f47d5a3b9eae9feb45ce1690b2e43fddab4f52a14e7bb334b
+  data.tar.gz: 0117422a9eac9f1c7a66783a0a4ca870711aeaa95252a9df8fe4f9ce2f8f10fd4ba461b2146a681f3b867becb9bace05c61a3e11b284714732432c95e12dc983
data/bin/wmap
CHANGED
@@ -6,13 +6,8 @@
 require "wmap"
 require "optparse"
 
-# program helper
-def print_usage
-  abort "Program to perform website asset discovery and tracking. \nUsage: wmap -t <Target Host | URL | IP | CIDR | or a seed file with any of the above combo> -d <Optional Discovery Result Directory>"
-end
-
 # program command line options
-options = {:data_dir => nil, :target => nil}
+options = {:data_dir => nil, :target => nil, :verbose => false}
 parser = OptionParser.new do|opts|
   opts.banner = Wmap.banner
   opts.on('-d', '--data_dir data_dir', 'Web Mapper local cache data directory') do |data_dir|
@@ -21,9 +16,11 @@ parser = OptionParser.new do|opts|
   opts.on('-t', '--target target', 'Web Mapper target') do |target|
     options[:target] = target;
   end
+  opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
+    options[:verbose] = v;
+  end
   opts.on('-h', '--help', 'Displays Help') do
-
-    print_usage
+    puts opts
     exit 0
   end
 end
@@ -47,7 +44,7 @@ Dir.mkdir(Log_dir) unless Dir.exist?(Log_dir)
 Wmap.wlog("Execute the command: wmap -t #{options[:target]}","wmap",Log_dir.join("wmap.log").to_s)
 urls = Array.new
 # first step - construct the host list
-scanner = Wmap::PortScanner.new(:verbose=>
+scanner = Wmap::PortScanner.new(:verbose=>options[:verbose], :socket_timeout=>600) # default time-out of 600 milliseconds
 hosts=Array.new
 if File.exist?(options[:target])
   puts "Parsing the discovery seed file: \"#{options[:target]}\" "
@@ -65,18 +62,18 @@ if File.exist?(options[:target])
     cidrs.push(x) if scanner.is_cidr?(x)
   end
   puts "Parsing done. "
-  hosts+=Wmap::DnsBruter.new(:verbose=>
+  hosts+=Wmap::DnsBruter.new(:verbose=>options[:verbose]).dns_brute_workers(domains.uniq).values.flatten if domains.size > 0
   cidrs.map { |x| hosts+= scanner.cidr_2_ips(x) } if cidrs.size > 0
 elsif scanner.is_url?(options[:target])
   puts "Processing the URL: #{options[:target]}"
   urls.push(options[:target])
 elsif Wmap.domain_known?(options[:target]) or Wmap.sub_domain_known?(options[:target])
   puts "Processing the domain: #{options[:target]}"
-  hosts+=Wmap::DnsBruter.new(:verbose=>
+  hosts+=Wmap::DnsBruter.new(:verbose=>options[:verbose]).dns_brute_worker(options[:target]).values.flatten
 elsif scanner.is_fqdn?(options[:target])
   puts "Processing the host: #{options[:target]}"
   hosts.push(options[:target])
-  my_hosts=Wmap::DnsBruter.new(:verbose=>
+  my_hosts=Wmap::DnsBruter.new(:verbose=>options[:verbose]).dns_brute_worker(options[:target]).values.flatten if (options[:target].split('.')[0] =~ /\d+/)
   hosts+=my_hosts unless my_hosts.nil?
 elsif scanner.is_cidr?(options[:target])
   puts "Processing the network block: #{options[:target]}"
@@ -102,7 +99,7 @@ if options[:target] && options[:data_dir]
   crawler = Wmap::UrlCrawler.new(:data_dir => options[:data_dir])
 elsif options[:target]
   puts "Fire up the crawler."
-  crawler = Wmap::UrlCrawler.new(:verbose=>
+  crawler = Wmap::UrlCrawler.new(:verbose=>options[:verbose])
 else
   abort "Error firing up UrlCrawler instance!"
 end
@@ -168,14 +165,14 @@ end
 if options[:target] && options[:data_dir]
   puts "Invoke the HostTracker with optional directory setter."
   host_tracker = Wmap::HostTracker.instance
-  host_tracker.verbose=
+  host_tracker.verbose=options[:verbose]
   host_tracker.data_dir = options[:data_dir]
   host_tracker.hosts_file = host_tracker.data_dir + "/" + "hosts"
   host_tracker.load_known_hosts_from_file(host_tracker.hosts_file)
 elsif options[:target]
   puts puts "Invoke the HostTracker."
   host_tracker = Wmap::HostTracker.instance
-  host_tracker.verbose=
+  host_tracker.verbose=options[:verbose]
 else
   abort "Error firing up HostTracker instance!"
 end
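The new -v/--[no-]verbose switch defaults to false and is threaded through to the PortScanner, DnsBruter, UrlCrawler and HostTracker instances shown above, replacing the old print_usage helper with OptionParser's built-in help output. A hypothetical invocation, assuming the gem's wmap executable is on the PATH (the target, seed file and cache directory below are placeholders, not values from the gem):

    wmap -t www.example.com -v
    wmap -t seed_targets.txt -d ./wmap_cache --no-verbose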
data/lib/wmap/host_tracker.rb
CHANGED
@@ -57,9 +57,9 @@ class Wmap::HostTracker
     end
     f.close
     return @known_hosts
-
-
-
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}"
+    return known_hosts
   end
 
   # Save the current local hosts hash table into a (random) data repository file
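This change is one instance of a pattern repeated throughout 2.6.7: the blank padding left at the end of several methods is replaced with a method-level rescue that reports the exception and returns whatever was collected so far. A minimal illustrative sketch of that shape, where the method name and body are hypothetical and only the rescue layout mirrors the gem:

    # Hypothetical method illustrating the 2.6.7 rescue pattern
    def load_entries(file)
      entries = Hash.new
      File.readlines(file).each { |line| entries[line.strip] = true }
      return entries
    rescue => ee
      # Report the failure and fall back to whatever was loaded before it
      puts "Exception on method #{__method__}: #{ee}"
      return entries
    end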
data/lib/wmap/site_tracker.rb
CHANGED
@@ -282,8 +282,8 @@ class Wmap::SiteTracker
       puts "No new entry added. "
     end
     return results
-
-
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
   end
   alias_method :adds, :bulk_add
 
data/lib/wmap/url_crawler/adware_tag.rb
CHANGED
@@ -12,7 +12,7 @@ module Wmap
 
   # Class to identify and track adware within the site store
   include Wmap::Utils
-  attr_accessor :signature_file, :tag_file, :verbose, :data_dir
+  attr_accessor :signature_file, :tag_file, :verbose, :data_dir
   attr_reader :tag_signatures, :tag_store
 
 
@@ -26,7 +26,7 @@ module Wmap
     # Set default instance variables
     @signature_file=File.dirname(__FILE__) + '/../../../settings/' + 'tag_signatures'
     file=params.fetch(:signature_file, @signature_file)
-    @tag_signatures=
+    @tag_signatures=load_sig_from_file(file)
     @tag_file=params.fetch(:tag_file, @data_dir + 'tag_sites')
     File.write(@tag_file, "") unless File.exist?(@tag_file)
     # load the known tag store
@@ -34,9 +34,8 @@ module Wmap
     @landings = Hash.new # cache landing page to reduce redundant browsing
   end
 
-
   # load the known tag signatures into an instance variable
-  def
+  def load_sig_from_file (file, lc=true)
     puts "Loading data file: #{file}" if @verbose
     data_store=Hash.new
     f = File.open(file, 'r')
@@ -53,7 +52,6 @@ module Wmap
       else
        data_store[entry[0]]=entry[1].strip
       end
-
     end
     f.close
     return data_store
@@ -105,11 +103,11 @@ module Wmap
   end
   alias_method :save!, :save_to_file!
 
-  #
+  # Refresh adware tag store signatures
   def refresh (num=@max_parallel,use_cache=true)
     puts "Add entries to the local cache table from site tracker: " if @verbose
     results = Hash.new
-    tags =
+    tags = @tag_store.keys
     if tags.size > 0
       Parallel.map(tags, :in_processes => num) { |target|
         check_adware(target,use_cache)
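Note that refresh now derives its work list from @tag_store.keys and fans it out with Parallel.map from the parallel gem, the same mechanism crawl_workers uses in the next file. A small self-contained sketch of that fan-out pattern, where the target list and the block body are stand-ins rather than the gem's check_adware logic:

    require 'parallel'

    targets = ["https://a.example.com", "https://b.example.com"]
    # Each element is handed to a forked worker process; Parallel.map returns
    # the block results in the same order as the input list.
    results = Parallel.map(targets, :in_processes => 2) do |target|
      [target, target.length]   # placeholder for check_adware(target, use_cache)
    end
    results.each { |target, value| puts "#{target} => #{value}" }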
data/lib/wmap/url_crawler.rb
CHANGED
@@ -66,210 +66,196 @@ class Wmap::UrlCrawler
   # A web crawler to crawl a known website and search for html links within the same root domain. For example,
   # by crawling 'http://www.yahoo.com/' it could discover 'http://login.yahoo.com/'
   def crawl(url)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    return result
-    end
+    puts "Start web crawling on #{url}"
+    result=Array.new
+    url=url.chomp.strip
+    result.push(url_2_site(url))
+    raise "Error! Invalid url format: #{urls}" unless is_url?(url)
+    # Add logic to profile the web server before crawling; this is used to optimize the crawling speed
+    pre_crawl(url)
+    status = Timeout::timeout(Crawl_timeout/1000) {
+      result+=crawl_worker(url).keys
+    }
+    puts "Web crawling time-out on #{url}: #{status}" if @verbose
+    return result
+  rescue => ee
+    puts "Exception on method #{__method__} for URL #{url}: #{ee}"
+    return result
   end
   alias_method :query, :crawl
 
   # The worker instance of crawler who perform the labour work
   def crawl_worker(url0)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    @crawl_start[url0]=true unless @crawl_start.key?(url0)
+    puts "Please be aware that it may take a while to crawl #{url0}, depending on the site's responsiveness and the amount of contents."
+    # Input URL sanity check first
+    if is_url?(url0)
+      host=url_2_host(url0)
+      ip=host_2_ip(host).to_s
+      raise "Invalid IP address: #{url0}" if ip.nil?
+      port=url_2_port(url0).to_s
+      raise "Invalid port number: #{url0}" if port.nil?
+    else
+      raise "Invalid URL: #{url0}. Please check it out with your browser again."
+    end
+    log_info=Hash.new
+    log_info[1]="Start working on #{url0}"
+    url_stores=Hash.new
+    url_stores[url0]=true unless url_stores.key?(url0)
+    @discovered_urls_by_crawler[url0]=true unless @discovered_urls_by_crawler.key?(url0)
+    @crawl_start[url0]=true unless @crawl_start.key?(url0)
     # $discovered_urls[url0]=true unless $discovered_urls.key?(url0)
-
-
-
-
-
-
-
-
-
-
-
-
+    @crawl_depth.times do
+      url_stores.keys.each do |url|
+        # 10/01/2013 add logic to avoid unnecessary crawling within the same child instance
+        next if @visited_urls_by_crawler.key?(url)
+        url_object = open_url(url)
+        next if url_object == nil
+        url = update_url_if_redirected(url, url_object)
+        url_body = read_url(url)
+        # Protection code - to avoid parsing failure on the empty or nil object
+        next if url_body.nil? or url_body.empty?
+        url_stores[url]=true unless url_stores.key?(url)
+        @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
         # $discovered_urls[url]=true unless $discovered_urls.key?(url)
-
-
-
-
-
+        doc = Nokogiri::HTML(url_body)
+        next if doc == nil
+        if url_stores.size >= @crawl_page_limit
+          #@visited_urls_by_crawler.merge!(url_stores)
+          @discovered_urls_by_crawler.merge!(url_stores)
           # $discovered_urls.merge!(url_stores)
-
-
-
-
-
-
-
-
-
+          puts "Finish web crawling the url: #{url0}"
+          return url_stores
+        end
+        page_urls = find_urls_on_page(doc, url)
+        page_urls.uniq!
+        page_urls.map do |y|
+          y=normalize_url(y)
+          url_stores[y]=true unless url_stores.key?(y)
+          @discovered_urls_by_crawler[y]=true unless @discovered_urls_by_crawler.key?(y)
           # $discovered_urls[y]=true unless $discovered_urls.key?(y)
-        end
         end
       end
-    puts "Finish web crawling on: #{url0}"
-    log_info[2]="Finish working on: #{url0}"
-    wlog(log_info, "UrlCrawler", @log_file)
-    @crawl_done[url0]=true unless @crawl_done.key?(url0)
-    return url_stores
-    rescue => ee
-    puts "Exception on method #{__method__} for URL #{url0}: #{ee}" if @verbose
-    log_info[3]="Exception on #{url0}"
-    wlog(log_info,"UrlCrawler",@log_file)
-    return url_stores
     end
+    puts "Finish web crawling on: #{url0}"
+    log_info[2]="Finish working on: #{url0}"
+    wlog(log_info, "UrlCrawler", @log_file)
+    @crawl_done[url0]=true unless @crawl_done.key?(url0)
+    return url_stores
+  rescue => ee
+    puts "Exception on method #{__method__} for URL #{url0}: #{ee}" if @verbose
+    log_info[3]="Exception on #{url0}"
+    wlog(log_info,"UrlCrawler",@log_file)
+    return url_stores
   end
 
   # Fast crawling by utilizing fork manager parallel to spawn numbers of child processes at the same time
   # each child process will continuously work on the target pool until all the works are done
   def crawl_workers (targets,num=@max_parallel)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-          uniq_sites[site_key]=target
-        end
+    raise "Input error - expecting targets in an array format: #{targets}" unless targets.kind_of? Array
+    puts "Sanitize the URL seeds to eliminate the unnecessary duplication(s) ..." if @verbose
+    #puts "This could be awhile depending on the list size. Please be patient ..."
+    # 09/30/2013 Add additional logic to eliminate the duplicate target site(s) before the crawlers are invoked.
+    targets -= ["", nil]
+    uniq_sites=Hash.new
+    targets.dup.map do |target|
+      if is_url?(target)
+        host=url_2_host(target)
+        ip=host_2_ip(host).to_s
+        next if ip.nil?
+        port=url_2_port(target).to_s
+        next if port.nil?
+        site_key=ip+":"+port
+        unless uniq_sites.key?(site_key)
+          uniq_sites[site_key]=target
         end
       end
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    end
+    puts "Sanitization done! " if @verbose
+    puts "Start the parallel engine on the normalized crawling list:\n #{targets} "
+    puts "Maximum number of web crawling sessions allowed: #{num}" #if @verbose
+    raise "Error: target list is empty!" if targets.size < 1
+    Parallel.map(uniq_sites.values, :in_processes => num) { |target|
+      puts "Working on #{target} ..." if @verbose
+      crawl(target)
+    }.dup.each do |process|
+      puts "process.inspect: #{process}" if @verbose
+      urls=process
+      urls-=["",nil] unless urls.nil?
+      if urls.nil?
+        next
+      elsif urls.empty?
+        next
+        #do nothing
+      else
+        urls.map do |url|
+          url.strip!
+          @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
+          #$discovered_urls[url]=true unless $discovered_urls.key?(url)
         end
       end
-    #return sites
-    return @discovered_urls_by_crawler.keys
-    rescue Exception => ee
-    puts "Exception on method #{__method__}: #{ee}" if @verbose
-    return nil
     end
+    #return sites
+    return @discovered_urls_by_crawler.keys
+  rescue Exception => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
+    return nil
   end
   alias_method :crawls, :crawl_workers
 
   # Fast crawling method - build the target pool from the input file
   def crawl_workers_on_file (file)
-
-
-
-
-
-
-
-    return nil
-  end
+    puts "Web crawl the list of targets from file: #{file}"
+    targets=file_2_list(file)
+    sites=crawl_workers(targets,num=@max_parallel)
+    return sites
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
+    return nil
   end
   alias_method :query_file, :crawl_workers_on_file
   alias_method :crawl_file, :crawl_workers_on_file
 
   # Wrapper for the OpenURI open method - create an open_uri object and return the reference upon success
   def open_url(url)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    return nil
-  end
+    puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
+    if url =~ /http\:/i
+      # patch for allow the 'un-safe' URL redirection i.e. https://www.example.com -> http://www.example.com
+      url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000)
+      #url_object = open(url)
+    elsif url =~ /https\:/i
+      url_object = open(url,:ssl_verify_mode => 0, :allow_redirections =>:safe, :read_timeout=>Max_http_timeout/1000)
+      #url_object = open(url,:ssl_verify_mode => 0)
+    else
+      raise "Invalid URL format - please specify the protocol prefix http(s) in the URL: #{url}"
+    end
+    return url_object
+  rescue => ee
+    puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
+    return nil
   end
 
   # Wrapper to use OpenURI method 'read' to return url body contents
   def read_url(url)
-
-
-
-
-
-
-
-
-    return nil
-  end
+    puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
+    url_object=open_url(url)
+    @visited_urls_by_crawler[url]=true unless @visited_urls_by_crawler.key?(url)
+    body=url_object.read
+    return body
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
+    return nil
   end
 
   # Return the destination url in case of url re-direct
   def update_url_if_redirected(url, url_object)
-
-
-
-
-
-
-
-
-    return nil
-  end
+    #puts "Comparing the original URL with the return object base_uri. Return the one where the true content is found. " if @verbose
+    if url != url_object.base_uri.to_s
+      return url_object.base_uri.to_s
+    end
+    return url
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
+    return nil
   end
 
 =begin
@@ -290,90 +276,82 @@
 
   # Search 'current_url' and return found URLs under the same domain
   def find_urls_on_page(doc, current_url)
-
-
-
-
-
-
-
-
-
-        if new_url
-          #if urls_on_same_domain?(new_url,current_url)
-          urls_list.push(new_url)
-          #end
-        else
-          new_url = make_absolute(current_url, new_url)
+    puts "Search and return URLs within the doc: #{doc}" if @verbose
+    urls_list = []
+    # case 1 - search embedded HTML tag <a href='url'> for the url elements
+    links=doc.css('a')
+    links.map do |x|
+      #puts "x: #{x}"
+      new_url = x.attribute('href').to_s
+      unless new_url == nil
+        if new_url.match("http")
+          #if urls_on_same_domain?(new_url,current_url)
           urls_list.push(new_url)
-        end
+          #end
+        else
+          new_url = make_absolute(current_url, new_url)
+          urls_list.push(new_url)
         end
       end
-
-
-
-
-
-
-
-
+    end
+    # case 2 - search client side redirect - <meta http-equiv="refresh" content="5;URL='http://example.com/'">
+    elements=doc.css("meta[http-equiv]")
+    unless elements.size == 0
+      link=elements.attr("content").value.split(/url\=/i)[1]
+      unless link.nil?
+        new_url = make_absolute(current_url, link)
+        urls_list.push(new_url) unless new_url.nil?
       end
-    #puts "Found URLs under page #{current_url}:\n#{urls_list}" if @verbose
-    return urls_list.uniq-["",nil]
-    rescue => ee
-    puts "Exception on method #{__method__}: #{ee}" if @verbose
-    return nil
     end
+    #puts "Found URLs under page #{current_url}:\n#{urls_list}" if @verbose
+    return urls_list.uniq-["",nil]
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
+    return nil
   end
 
   # Method to print out discovery URL result
   def print_discovered_urls_by_crawler
-
-
-
-
-
-
-
-
-
-
-    return nil
-  end
+    puts "Print discovered url by the crawler. " if @verbose
+    puts "\nSummary Report of Discovered URLs from the Crawler:"
+    @discovered_urls_by_crawler.keys.each do |url|
+      puts url
+    end
+    puts "Total: #{@discovered_urls_by_crawler.keys.size}"
+    puts "End of the summary"
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
+    return nil
   end
   alias_method :print, :print_discovered_urls_by_crawler
 
   # Method to save URL discovery result
   def save_discovered_urls (file)
-
-
-
-
-
-
-    return nil
-  end
+    puts "Save discovered urls by the crawler to file: #{file} "
+    list_2_file(@discovered_urls_by_crawler.keys, file)
+    puts "Done!"
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
+    return nil
   end
   alias_method :save, :save_discovered_urls
 
   # Method to retrieve discovery site result
   def get_discovered_sites_by_crawler
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    return nil
-  end
+    puts "Print summary report of discovered sites. " if @verbose
+    puts "\nSummary Report of Discovered Sites from the Crawler:"
+    sites = Hash.new
+    @discovered_urls_by_crawler.keys.each do |url|
+      site=url_2_site(url)
+      sites[site]=true unless sites.key?(site)
+    end
+    sites.keys.map { |site| puts site }
+    puts "Total: #{sites.size}"
+    puts "End of the summary"
+    return sites.keys
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
+    return nil
   end
   alias_method :get_sites, :get_discovered_sites_by_crawler
 
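Taken together, the rewritten methods restore the bodies of crawl, crawl_workers, crawl_workers_on_file, open_url, read_url and the reporting helpers while keeping the public names and aliases (query, crawls, crawl_file, print, save, get_sites) unchanged. A minimal driving sketch based only on the method names visible in this diff, where the URL and output path are placeholders:

    require 'wmap'

    crawler = Wmap::UrlCrawler.new(:verbose => true)
    urls = crawler.crawl("https://www.example.com")          # single-site crawl
    puts urls
    crawler.crawl_workers(["https://www.example.com"], 2)    # parallel crawl of a target list
    crawler.print_discovered_urls_by_crawler                 # summary report of discovered URLs
    crawler.save_discovered_urls("discovered_urls.txt")      # persist the results to a file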
data/version.txt
CHANGED
@@ -3,8 +3,8 @@
 ###############################################################################
 package = wmap
 # wmap version 2.0 == web_discovery version 1.5.3
-version = 2.6.6
-date = 2019-11-
+version = 2.6.7
+date = 2019-11-19
 
 author = Sam (Yang) Li
 email = yang.li@owasp.org
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wmap
 version: !ruby/object:Gem::Version
-  version: 2.6.6
+  version: 2.6.7
 platform: ruby
 authors:
 - Sam (Yang) Li
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-11-
+date: 2019-11-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: dnsruby