wmap 2.6.6 → 2.6.7
- checksums.yaml +4 -4
- data/bin/wmap +12 -15
- data/lib/wmap/host_tracker.rb +3 -3
- data/lib/wmap/site_tracker.rb +2 -2
- data/lib/wmap/url_crawler/adware_tag.rb +5 -7
- data/lib/wmap/url_crawler.rb +206 -228
- data/version.txt +2 -2
- metadata +2 -2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2e4f2a2dfe9b4b119331eefffc7b9b025d9953c2ce5f7255e4d2a08929a591c3
+  data.tar.gz: 3d018d69469cf4e4551b38397657341661fd95c3f59bebe8bb21405d4e107881
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7e7d27b4d4abfc34ab3df0933412b4c99e94af93f71251a2e7a0706b4782ad62a2541dbf7c5f391f47d5a3b9eae9feb45ce1690b2e43fddab4f52a14e7bb334b
+  data.tar.gz: 0117422a9eac9f1c7a66783a0a4ca870711aeaa95252a9df8fe4f9ce2f8f10fd4ba461b2146a681f3b867becb9bace05c61a3e11b284714732432c95e12dc983
data/bin/wmap
CHANGED

@@ -6,13 +6,8 @@
 require "wmap"
 require "optparse"
 
-# program helper
-def print_usage
-  abort "Program to perform website asset discovery and tracking. \nUsage: wmap -t <Target Host | URL | IP | CIDR | or a seed file with any of the above combo> -d <Optional Discovery Result Directory>"
-end
-
 # program command line options
-options = {:data_dir => nil, :target => nil}
+options = {:data_dir => nil, :target => nil, :verbose => false}
 parser = OptionParser.new do|opts|
   opts.banner = Wmap.banner
   opts.on('-d', '--data_dir data_dir', 'Web Mapper local cache data directory') do |data_dir|
@@ -21,9 +16,11 @@ parser = OptionParser.new do|opts|
   opts.on('-t', '--target target', 'Web Mapper target') do |target|
     options[:target] = target;
   end
+  opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
+    options[:verbose] = v;
+  end
   opts.on('-h', '--help', 'Displays Help') do
-
-    print_usage
+    puts opts
     exit 0
   end
 end
@@ -47,7 +44,7 @@ Dir.mkdir(Log_dir) unless Dir.exist?(Log_dir)
 Wmap.wlog("Execute the command: wmap -t #{options[:target]}","wmap",Log_dir.join("wmap.log").to_s)
 urls = Array.new
 # first step - construct the host list
-scanner = Wmap::PortScanner.new(:verbose=>
+scanner = Wmap::PortScanner.new(:verbose=>options[:verbose], :socket_timeout=>600) # default time-out of 600 milliseconds
 hosts=Array.new
 if File.exist?(options[:target])
   puts "Parsing the discovery seed file: \"#{options[:target]}\" "
@@ -65,18 +62,18 @@ if File.exist?(options[:target])
     cidrs.push(x) if scanner.is_cidr?(x)
   end
   puts "Parsing done. "
-  hosts+=Wmap::DnsBruter.new(:verbose=>
+  hosts+=Wmap::DnsBruter.new(:verbose=>options[:verbose]).dns_brute_workers(domains.uniq).values.flatten if domains.size > 0
   cidrs.map { |x| hosts+= scanner.cidr_2_ips(x) } if cidrs.size > 0
 elsif scanner.is_url?(options[:target])
   puts "Processing the URL: #{options[:target]}"
   urls.push(options[:target])
 elsif Wmap.domain_known?(options[:target]) or Wmap.sub_domain_known?(options[:target])
   puts "Processing the domain: #{options[:target]}"
-  hosts+=Wmap::DnsBruter.new(:verbose=>
+  hosts+=Wmap::DnsBruter.new(:verbose=>options[:verbose]).dns_brute_worker(options[:target]).values.flatten
 elsif scanner.is_fqdn?(options[:target])
   puts "Processing the host: #{options[:target]}"
   hosts.push(options[:target])
-  my_hosts=Wmap::DnsBruter.new(:verbose=>
+  my_hosts=Wmap::DnsBruter.new(:verbose=>options[:verbose]).dns_brute_worker(options[:target]).values.flatten if (options[:target].split('.')[0] =~ /\d+/)
   hosts+=my_hosts unless my_hosts.nil?
 elsif scanner.is_cidr?(options[:target])
   puts "Processing the network block: #{options[:target]}"
@@ -102,7 +99,7 @@ if options[:target] && options[:data_dir]
   crawler = Wmap::UrlCrawler.new(:data_dir => options[:data_dir])
 elsif options[:target]
   puts "Fire up the crawler."
-  crawler = Wmap::UrlCrawler.new(:verbose=>
+  crawler = Wmap::UrlCrawler.new(:verbose=>options[:verbose])
 else
   abort "Error firing up UrlCrawler instance!"
 end
@@ -168,14 +165,14 @@ end
 if options[:target] && options[:data_dir]
   puts "Invoke the HostTracker with optional directory setter."
   host_tracker = Wmap::HostTracker.instance
-  host_tracker.verbose=
+  host_tracker.verbose=options[:verbose]
   host_tracker.data_dir = options[:data_dir]
   host_tracker.hosts_file = host_tracker.data_dir + "/" + "hosts"
   host_tracker.load_known_hosts_from_file(host_tracker.hosts_file)
 elsif options[:target]
   puts puts "Invoke the HostTracker."
   host_tracker = Wmap::HostTracker.instance
-  host_tracker.verbose=
+  host_tracker.verbose=options[:verbose]
 else
   abort "Error firing up HostTracker instance!"
 end
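With the hunks above, data/bin/wmap drops the hand-rolled print_usage helper in favor of OptionParser's generated help text (puts opts) and adds a -v/--[no-]verbose switch that defaults to false. The flag is threaded into every component the script constructs (Wmap::PortScanner, Wmap::DnsBruter, Wmap::UrlCrawler and Wmap::HostTracker), so a run such as wmap -t www.example.com -v (the hostname is a placeholder) enables verbose output across all of them, while omitting the flag leaves options[:verbose] at false.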
data/lib/wmap/host_tracker.rb
CHANGED

@@ -57,9 +57,9 @@ class Wmap::HostTracker
     end
     f.close
     return @known_hosts
-
-
-
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}"
+    return known_hosts
   end
 
   # Save the current local hosts hash table into a (random) data repository file
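The three added lines use Ruby's method-level rescue: a rescue clause written directly inside def ... end guards the entire method body without an explicit begin block, which is the error-handling style this release also applies to site_tracker.rb and url_crawler.rb below. A minimal, self-contained sketch of the idiom, using a hypothetical method that is not part of wmap:

  # Hypothetical example: the rescue clause catches any error raised in the body above it.
  def load_entries(file)
    entries = []
    File.foreach(file) { |line| entries << line.strip }
    entries
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}"
    []  # fall back to an empty result, mirroring the style used in this gem
  end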
data/lib/wmap/site_tracker.rb
CHANGED

@@ -282,8 +282,8 @@ class Wmap::SiteTracker
       puts "No new entry added. "
     end
     return results
-
-
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
   end
   alias_method :adds, :bulk_add
 
data/lib/wmap/url_crawler/adware_tag.rb
CHANGED

@@ -12,7 +12,7 @@ module Wmap
 
   # Class to identify and track adware within the site store
   include Wmap::Utils
-  attr_accessor :signature_file, :tag_file, :verbose, :data_dir
+  attr_accessor :signature_file, :tag_file, :verbose, :data_dir
   attr_reader :tag_signatures, :tag_store
 
 
@@ -26,7 +26,7 @@ module Wmap
     # Set default instance variables
     @signature_file=File.dirname(__FILE__) + '/../../../settings/' + 'tag_signatures'
     file=params.fetch(:signature_file, @signature_file)
-    @tag_signatures=
+    @tag_signatures=load_sig_from_file(file)
     @tag_file=params.fetch(:tag_file, @data_dir + 'tag_sites')
     File.write(@tag_file, "") unless File.exist?(@tag_file)
     # load the known tag store
@@ -34,9 +34,8 @@ module Wmap
     @landings = Hash.new # cache landing page to reduce redundant browsing
   end
 
-
   # load the known tag signatures into an instance variable
-  def
+  def load_sig_from_file (file, lc=true)
     puts "Loading data file: #{file}" if @verbose
     data_store=Hash.new
     f = File.open(file, 'r')
@@ -53,7 +52,6 @@ module Wmap
       else
         data_store[entry[0]]=entry[1].strip
       end
-
     end
     f.close
     return data_store
@@ -105,11 +103,11 @@ module Wmap
   end
   alias_method :save!, :save_to_file!
 
-  #
+  # Refresh adware tag store signatures
   def refresh (num=@max_parallel,use_cache=true)
     puts "Add entries to the local cache table from site tracker: " if @verbose
     results = Hash.new
-    tags =
+    tags = @tag_store.keys
     if tags.size > 0
       Parallel.map(tags, :in_processes => num) { |target|
         check_adware(target,use_cache)
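The restored refresh body above, like crawl_workers in the next file, fans its work out with the parallel gem's Parallel.map(..., :in_processes => num), one child process per target. A minimal sketch of that pattern, with placeholder targets and a trivial worker body standing in for check_adware or crawl:

  require 'parallel'

  targets = ["https://www.example.com", "https://login.example.com"]
  results = Parallel.map(targets, :in_processes => 2) do |target|
    # each target is handled in its own child process; results come back in order
    [target, target.length]
  end
  p results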
data/lib/wmap/url_crawler.rb
CHANGED

@@ -66,210 +66,196 @@ class Wmap::UrlCrawler
   # A web crawler to crawl a known website and search for html links within the same root domain. For example,
   # by crawling 'http://www.yahoo.com/' it could discover 'http://login.yahoo.com/'
   def crawl(url)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    return result
-  end
+    puts "Start web crawling on #{url}"
+    result=Array.new
+    url=url.chomp.strip
+    result.push(url_2_site(url))
+    raise "Error! Invalid url format: #{urls}" unless is_url?(url)
+    # Add logic to profile the web server before crawling; this is used to optimize the crawling speed
+    pre_crawl(url)
+    status = Timeout::timeout(Crawl_timeout/1000) {
+      result+=crawl_worker(url).keys
+    }
+    puts "Web crawling time-out on #{url}: #{status}" if @verbose
+    return result
+  rescue => ee
+    puts "Exception on method #{__method__} for URL #{url}: #{ee}"
+    return result
   end
   alias_method :query, :crawl
 
   # The worker instance of crawler who perform the labour work
   def crawl_worker(url0)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    @crawl_start[url0]=true unless @crawl_start.key?(url0)
+    puts "Please be aware that it may take a while to crawl #{url0}, depending on the site's responsiveness and the amount of contents."
+    # Input URL sanity check first
+    if is_url?(url0)
+      host=url_2_host(url0)
+      ip=host_2_ip(host).to_s
+      raise "Invalid IP address: #{url0}" if ip.nil?
+      port=url_2_port(url0).to_s
+      raise "Invalid port number: #{url0}" if port.nil?
+    else
+      raise "Invalid URL: #{url0}. Please check it out with your browser again."
+    end
+    log_info=Hash.new
+    log_info[1]="Start working on #{url0}"
+    url_stores=Hash.new
+    url_stores[url0]=true unless url_stores.key?(url0)
+    @discovered_urls_by_crawler[url0]=true unless @discovered_urls_by_crawler.key?(url0)
+    @crawl_start[url0]=true unless @crawl_start.key?(url0)
     # $discovered_urls[url0]=true unless $discovered_urls.key?(url0)
-
-
-
-
-
-
-
-
-
-
-
-
+    @crawl_depth.times do
+      url_stores.keys.each do |url|
+        # 10/01/2013 add logic to avoid unnecessary crawling within the same child instance
+        next if @visited_urls_by_crawler.key?(url)
+        url_object = open_url(url)
+        next if url_object == nil
+        url = update_url_if_redirected(url, url_object)
+        url_body = read_url(url)
+        # Protection code - to avoid parsing failure on the empty or nil object
+        next if url_body.nil? or url_body.empty?
+        url_stores[url]=true unless url_stores.key?(url)
+        @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
         # $discovered_urls[url]=true unless $discovered_urls.key?(url)
-
-
-
-
-
+        doc = Nokogiri::HTML(url_body)
+        next if doc == nil
+        if url_stores.size >= @crawl_page_limit
+          #@visited_urls_by_crawler.merge!(url_stores)
+          @discovered_urls_by_crawler.merge!(url_stores)
           # $discovered_urls.merge!(url_stores)
-
-
-
-
-
-
-
-
-
+          puts "Finish web crawling the url: #{url0}"
+          return url_stores
+        end
+        page_urls = find_urls_on_page(doc, url)
+        page_urls.uniq!
+        page_urls.map do |y|
+          y=normalize_url(y)
+          url_stores[y]=true unless url_stores.key?(y)
+          @discovered_urls_by_crawler[y]=true unless @discovered_urls_by_crawler.key?(y)
          # $discovered_urls[y]=true unless $discovered_urls.key?(y)
-        end
        end
      end
-    puts "Finish web crawling on: #{url0}"
-    log_info[2]="Finish working on: #{url0}"
-    wlog(log_info, "UrlCrawler", @log_file)
-    @crawl_done[url0]=true unless @crawl_done.key?(url0)
-    return url_stores
-  rescue => ee
-    puts "Exception on method #{__method__} for URL #{url0}: #{ee}" if @verbose
-    log_info[3]="Exception on #{url0}"
-    wlog(log_info,"UrlCrawler",@log_file)
-    return url_stores
    end
+    puts "Finish web crawling on: #{url0}"
+    log_info[2]="Finish working on: #{url0}"
+    wlog(log_info, "UrlCrawler", @log_file)
+    @crawl_done[url0]=true unless @crawl_done.key?(url0)
+    return url_stores
+  rescue => ee
+    puts "Exception on method #{__method__} for URL #{url0}: #{ee}" if @verbose
+    log_info[3]="Exception on #{url0}"
+    wlog(log_info,"UrlCrawler",@log_file)
+    return url_stores
  end
 
  # Fast crawling by utilizing fork manager parallel to spawn numbers of child processes at the same time
  # each child process will continuously work on the target pool until all the works are done
  def crawl_workers (targets,num=@max_parallel)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-          uniq_sites[site_key]=target
-        end
+    raise "Input error - expecting targets in an array format: #{targets}" unless targets.kind_of? Array
+    puts "Sanitize the URL seeds to eliminate the unnecessary duplication(s) ..." if @verbose
+    #puts "This could be awhile depending on the list size. Please be patient ..."
+    # 09/30/2013 Add additional logic to eliminate the duplicate target site(s) before the crawlers are invoked.
+    targets -= ["", nil]
+    uniq_sites=Hash.new
+    targets.dup.map do |target|
+      if is_url?(target)
+        host=url_2_host(target)
+        ip=host_2_ip(host).to_s
+        next if ip.nil?
+        port=url_2_port(target).to_s
+        next if port.nil?
+        site_key=ip+":"+port
+        unless uniq_sites.key?(site_key)
+          uniq_sites[site_key]=target
        end
      end
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    end
+    puts "Sanitization done! " if @verbose
+    puts "Start the parallel engine on the normalized crawling list:\n #{targets} "
+    puts "Maximum number of web crawling sessions allowed: #{num}" #if @verbose
+    raise "Error: target list is empty!" if targets.size < 1
+    Parallel.map(uniq_sites.values, :in_processes => num) { |target|
+      puts "Working on #{target} ..." if @verbose
+      crawl(target)
+    }.dup.each do |process|
+      puts "process.inspect: #{process}" if @verbose
+      urls=process
+      urls-=["",nil] unless urls.nil?
+      if urls.nil?
+        next
+      elsif urls.empty?
+        next
+        #do nothing
+      else
+        urls.map do |url|
+          url.strip!
+          @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
+          #$discovered_urls[url]=true unless $discovered_urls.key?(url)
        end
      end
-    #return sites
-    return @discovered_urls_by_crawler.keys
-  rescue Exception => ee
-    puts "Exception on method #{__method__}: #{ee}" if @verbose
-    return nil
    end
+    #return sites
+    return @discovered_urls_by_crawler.keys
+  rescue Exception => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
+    return nil
  end
  alias_method :crawls, :crawl_workers
 
  # Fast crawling method - build the target pool from the input file
  def crawl_workers_on_file (file)
-
-
-
-
-
-
-
-    return nil
-  end
+    puts "Web crawl the list of targets from file: #{file}"
+    targets=file_2_list(file)
+    sites=crawl_workers(targets,num=@max_parallel)
+    return sites
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
+    return nil
  end
  alias_method :query_file, :crawl_workers_on_file
 alias_method :crawl_file, :crawl_workers_on_file
 
 # Wrapper for the OpenURI open method - create an open_uri object and return the reference upon success
 def open_url(url)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    return nil
-  end
+    puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
+    if url =~ /http\:/i
+      # patch for allow the 'un-safe' URL redirection i.e. https://www.example.com -> http://www.example.com
+      url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000)
+      #url_object = open(url)
+    elsif url =~ /https\:/i
+      url_object = open(url,:ssl_verify_mode => 0, :allow_redirections =>:safe, :read_timeout=>Max_http_timeout/1000)
+      #url_object = open(url,:ssl_verify_mode => 0)
+    else
+      raise "Invalid URL format - please specify the protocol prefix http(s) in the URL: #{url}"
+    end
+    return url_object
+  rescue => ee
+    puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
+    return nil
  end
 
 # Wrapper to use OpenURI method 'read' to return url body contents
 def read_url(url)
-
-
-
-
-
-
-
-
-    return nil
-  end
+    puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
+    url_object=open_url(url)
+    @visited_urls_by_crawler[url]=true unless @visited_urls_by_crawler.key?(url)
+    body=url_object.read
+    return body
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
+    return nil
  end
 
 # Return the destination url in case of url re-direct
 def update_url_if_redirected(url, url_object)
-
-
-
-
-
-
-
-
-    return nil
-  end
+    #puts "Comparing the original URL with the return object base_uri. Return the one where the true content is found. " if @verbose
+    if url != url_object.base_uri.to_s
+      return url_object.base_uri.to_s
+    end
+    return url
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
+    return nil
  end
 
 =begin
@@ -290,90 +276,82 @@ class Wmap::UrlCrawler
 
  # Search 'current_url' and return found URLs under the same domain
 def find_urls_on_page(doc, current_url)
-
-
-
-
-
-
-
-
-
-        if new_url
-          #if urls_on_same_domain?(new_url,current_url)
-          urls_list.push(new_url)
-          #end
-        else
-          new_url = make_absolute(current_url, new_url)
+    puts "Search and return URLs within the doc: #{doc}" if @verbose
+    urls_list = []
+    # case 1 - search embedded HTML tag <a href='url'> for the url elements
+    links=doc.css('a')
+    links.map do |x|
+      #puts "x: #{x}"
+      new_url = x.attribute('href').to_s
+      unless new_url == nil
+        if new_url.match("http")
+          #if urls_on_same_domain?(new_url,current_url)
          urls_list.push(new_url)
-        end
+          #end
+        else
+          new_url = make_absolute(current_url, new_url)
+          urls_list.push(new_url)
        end
      end
-
-
-
-
-
-
-
-
+    end
+    # case 2 - search client side redirect - <meta http-equiv="refresh" content="5;URL='http://example.com/'">
+    elements=doc.css("meta[http-equiv]")
+    unless elements.size == 0
+      link=elements.attr("content").value.split(/url\=/i)[1]
+      unless link.nil?
+        new_url = make_absolute(current_url, link)
+        urls_list.push(new_url) unless new_url.nil?
      end
-    #puts "Found URLs under page #{current_url}:\n#{urls_list}" if @verbose
-    return urls_list.uniq-["",nil]
-  rescue => ee
-    puts "Exception on method #{__method__}: #{ee}" if @verbose
-    return nil
    end
+    #puts "Found URLs under page #{current_url}:\n#{urls_list}" if @verbose
+    return urls_list.uniq-["",nil]
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
+    return nil
  end
 
 # Method to print out discovery URL result
 def print_discovered_urls_by_crawler
-
-
-
-
-
-
-
-
-
-
-    return nil
-  end
+    puts "Print discovered url by the crawler. " if @verbose
+    puts "\nSummary Report of Discovered URLs from the Crawler:"
+    @discovered_urls_by_crawler.keys.each do |url|
+      puts url
+    end
+    puts "Total: #{@discovered_urls_by_crawler.keys.size}"
+    puts "End of the summary"
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
+    return nil
  end
 alias_method :print, :print_discovered_urls_by_crawler
 
 # Method to save URL discovery result
 def save_discovered_urls (file)
-
-
-
-
-
-
-    return nil
-  end
+    puts "Save discovered urls by the crawler to file: #{file} "
+    list_2_file(@discovered_urls_by_crawler.keys, file)
+    puts "Done!"
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
+    return nil
  end
 alias_method :save, :save_discovered_urls
 
 # Method to retrieve discovery site result
 def get_discovered_sites_by_crawler
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    return nil
-  end
+    puts "Print summary report of discovered sites. " if @verbose
+    puts "\nSummary Report of Discovered Sites from the Crawler:"
+    sites = Hash.new
+    @discovered_urls_by_crawler.keys.each do |url|
+      site=url_2_site(url)
+      sites[site]=true unless sites.key?(site)
+    end
+    sites.keys.map { |site| puts site }
+    puts "Total: #{sites.size}"
+    puts "End of the summary"
+    return sites.keys
+  rescue => ee
+    puts "Exception on method #{__method__}: #{ee}" if @verbose
+    return nil
  end
 alias_method :get_sites, :get_discovered_sites_by_crawler
 
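With the method bodies restored, the crawler's public surface is readable again in one place: crawl (alias query), crawl_workers (alias crawls), crawl_workers_on_file (aliases query_file and crawl_file), the OpenURI wrappers open_url and read_url, the redirect and link-extraction helpers, and the print/save/get_sites reporting methods. A hypothetical usage sketch based only on the methods and aliases visible in this diff; the target URL and output path are placeholders:

  require 'wmap'

  crawler = Wmap::UrlCrawler.new(:verbose => true)
  crawler.crawl("https://www.example.com")    # single-site crawl; alias of :query
  crawler.print                               # alias of print_discovered_urls_by_crawler
  crawler.save("/tmp/discovered_urls.txt")    # alias of save_discovered_urls
  sites = crawler.get_sites                   # alias of get_discovered_sites_by_crawler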
data/version.txt
CHANGED

@@ -3,8 +3,8 @@
 ###############################################################################
 package = wmap
 # wmap version 2.0 == web_discovery version 1.5.3
-version = 2.6.6
-date = 2019-11-
+version = 2.6.7
+date = 2019-11-19
 
 author = Sam (Yang) Li
 email = yang.li@owasp.org
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wmap
 version: !ruby/object:Gem::Version
-  version: 2.6.6
+  version: 2.6.7
 platform: ruby
 authors:
 - Sam (Yang) Li
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-11-
+date: 2019-11-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: dnsruby