wmap 2.6.6 → 2.6.7

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: c4f814d2c0a04e5aedf5314a05a6de6ea59ff1a9cc6a71296bfcf94c301f9e67
- data.tar.gz: 641c3dab030ff37bd0292f0ab2bfc2750c94d72051ddb46891ce490d917fb3b9
+ metadata.gz: 2e4f2a2dfe9b4b119331eefffc7b9b025d9953c2ce5f7255e4d2a08929a591c3
+ data.tar.gz: 3d018d69469cf4e4551b38397657341661fd95c3f59bebe8bb21405d4e107881
  SHA512:
- metadata.gz: 977683b1f6a166ce1da6036e465b88e71aa9445b20e9c80048a640f6bf1dacee26b3503123529b0e5241c862370b0bc03c108c94ccfdf2e63c4af809af28a5ea
- data.tar.gz: bac9ed1cd639750040f035b88001244751d7a1e627594c750d38962c4cb8625035cd2e2ec64d40b9464989b33d33a8cc70b702d7f5d737e0373d104b3ada443e
+ metadata.gz: 7e7d27b4d4abfc34ab3df0933412b4c99e94af93f71251a2e7a0706b4782ad62a2541dbf7c5f391f47d5a3b9eae9feb45ce1690b2e43fddab4f52a14e7bb334b
+ data.tar.gz: 0117422a9eac9f1c7a66783a0a4ca870711aeaa95252a9df8fe4f9ce2f8f10fd4ba461b2146a681f3b867becb9bace05c61a3e11b284714732432c95e12dc983
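
checksums.yaml inside a .gem archive records SHA256 and SHA512 digests of the archive's metadata.gz and data.tar.gz members, so both digest sets change whenever either member is rebuilt, as they are for this release. A minimal sketch of recomputing the SHA256 values locally, assuming the .gem file has already been unpacked (for example with tar -xf wmap-2.6.7.gem) into the current directory:

    require "digest"

    # Digest the raw, still-compressed members, which is what checksums.yaml records.
    %w[metadata.gz data.tar.gz].each do |member|
      puts "#{member}: #{Digest::SHA256.file(member).hexdigest}"
    end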
data/bin/wmap CHANGED
@@ -6,13 +6,8 @@
  require "wmap"
  require "optparse"
 
- # program helper
- def print_usage
- abort "Program to perform website asset discovery and tracking. \nUsage: wmap -t <Target Host | URL | IP | CIDR | or a seed file with any of the above combo> -d <Optional Discovery Result Directory>"
- end
-
  # program command line options
- options = {:data_dir => nil, :target => nil}
+ options = {:data_dir => nil, :target => nil, :verbose => false}
  parser = OptionParser.new do|opts|
  opts.banner = Wmap.banner
  opts.on('-d', '--data_dir data_dir', 'Web Mapper local cache data directory') do |data_dir|
@@ -21,9 +16,11 @@ parser = OptionParser.new do|opts|
  opts.on('-t', '--target target', 'Web Mapper target') do |target|
  options[:target] = target;
  end
+ opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
+ options[:verbose] = v;
+ end
  opts.on('-h', '--help', 'Displays Help') do
- print Wmap.banner,"\n"
- print_usage
+ puts opts
  exit 0
  end
  end
@@ -47,7 +44,7 @@ Dir.mkdir(Log_dir) unless Dir.exist?(Log_dir)
  Wmap.wlog("Execute the command: wmap -t #{options[:target]}","wmap",Log_dir.join("wmap.log").to_s)
  urls = Array.new
  # first step - construct the host list
- scanner = Wmap::PortScanner.new(:verbose=>false, :socket_timeout=>600) # default time-out of 600 milliseconds
+ scanner = Wmap::PortScanner.new(:verbose=>options[:verbose], :socket_timeout=>600) # default time-out of 600 milliseconds
  hosts=Array.new
  if File.exist?(options[:target])
  puts "Parsing the discovery seed file: \"#{options[:target]}\" "
@@ -65,18 +62,18 @@ if File.exist?(options[:target])
  cidrs.push(x) if scanner.is_cidr?(x)
  end
  puts "Parsing done. "
- hosts+=Wmap::DnsBruter.new(:verbose=>false).dns_brute_workers(domains.uniq).values.flatten if domains.size > 0
+ hosts+=Wmap::DnsBruter.new(:verbose=>options[:verbose]).dns_brute_workers(domains.uniq).values.flatten if domains.size > 0
  cidrs.map { |x| hosts+= scanner.cidr_2_ips(x) } if cidrs.size > 0
  elsif scanner.is_url?(options[:target])
  puts "Processing the URL: #{options[:target]}"
  urls.push(options[:target])
  elsif Wmap.domain_known?(options[:target]) or Wmap.sub_domain_known?(options[:target])
  puts "Processing the domain: #{options[:target]}"
- hosts+=Wmap::DnsBruter.new(:verbose=>false).dns_brute_worker(options[:target]).values.flatten
+ hosts+=Wmap::DnsBruter.new(:verbose=>options[:verbose]).dns_brute_worker(options[:target]).values.flatten
  elsif scanner.is_fqdn?(options[:target])
  puts "Processing the host: #{options[:target]}"
  hosts.push(options[:target])
- my_hosts=Wmap::DnsBruter.new(:verbose=>false).dns_brute_worker(options[:target]).values.flatten if (options[:target].split('.')[0] =~ /\d+/)
+ my_hosts=Wmap::DnsBruter.new(:verbose=>options[:verbose]).dns_brute_worker(options[:target]).values.flatten if (options[:target].split('.')[0] =~ /\d+/)
  hosts+=my_hosts unless my_hosts.nil?
  elsif scanner.is_cidr?(options[:target])
  puts "Processing the network block: #{options[:target]}"
@@ -102,7 +99,7 @@ if options[:target] && options[:data_dir]
  crawler = Wmap::UrlCrawler.new(:data_dir => options[:data_dir])
  elsif options[:target]
  puts "Fire up the crawler."
- crawler = Wmap::UrlCrawler.new(:verbose=>false)
+ crawler = Wmap::UrlCrawler.new(:verbose=>options[:verbose])
  else
  abort "Error firing up UrlCrawler instance!"
  end
@@ -168,14 +165,14 @@ end
  if options[:target] && options[:data_dir]
  puts "Invoke the HostTracker with optional directory setter."
  host_tracker = Wmap::HostTracker.instance
- host_tracker.verbose=false
+ host_tracker.verbose=options[:verbose]
  host_tracker.data_dir = options[:data_dir]
  host_tracker.hosts_file = host_tracker.data_dir + "/" + "hosts"
  host_tracker.load_known_hosts_from_file(host_tracker.hosts_file)
  elsif options[:target]
  puts puts "Invoke the HostTracker."
  host_tracker = Wmap::HostTracker.instance
- host_tracker.verbose=false
+ host_tracker.verbose=options[:verbose]
  else
  abort "Error firing up HostTracker instance!"
  end
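
Taken together, the data/bin/wmap changes drop the hand-rolled print_usage helper in favor of OptionParser's generated help text and add a -v / --[no-]verbose switch whose value is threaded into the PortScanner, DnsBruter, UrlCrawler and HostTracker calls that previously hard-coded :verbose=>false. A minimal, self-contained sketch of how such a switch behaves (illustrative only, not code from the gem):

    require "optparse"

    options = { :verbose => false }
    OptionParser.new do |opts|
      opts.banner = "Usage: wmap -t <target> [-d <data_dir>] [-v]"
      # --[no-]verbose yields true for -v/--verbose and false for --no-verbose
      opts.on("-v", "--[no-]verbose", "Run verbosely") { |v| options[:verbose] = v }
      # -h simply prints the parser's own help text, as in the change above
      opts.on("-h", "--help", "Displays Help") { puts opts; exit 0 }
    end.parse!(ARGV)

    puts "verbose mode: #{options[:verbose]}"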
@@ -57,9 +57,9 @@ class Wmap::HostTracker
  end
  f.close
  return @known_hosts
- #rescue => ee
- # puts "Exception on method #{__method__}: #{ee}"
- # return known_hosts
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}"
+ return known_hosts
  end
 
  # Save the current local hosts hash table into a (random) data repository file
@@ -282,8 +282,8 @@ class Wmap::SiteTracker
  puts "No new entry added. "
  end
  return results
- #rescue => ee
- #puts "Exception on method #{__method__}: #{ee}" if @verbose
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
  end
  alias_method :adds, :bulk_add
 
@@ -12,7 +12,7 @@ module Wmap
 
  # Class to identify and track adware within the site store
  include Wmap::Utils
- attr_accessor :signature_file, :tag_file, :verbose, :data_dir, :data_store
+ attr_accessor :signature_file, :tag_file, :verbose, :data_dir
  attr_reader :tag_signatures, :tag_store
 
 
@@ -26,7 +26,7 @@ module Wmap
  # Set default instance variables
  @signature_file=File.dirname(__FILE__) + '/../../../settings/' + 'tag_signatures'
  file=params.fetch(:signature_file, @signature_file)
- @tag_signatures=load_from_file(file)
+ @tag_signatures=load_sig_from_file(file)
  @tag_file=params.fetch(:tag_file, @data_dir + 'tag_sites')
  File.write(@tag_file, "") unless File.exist?(@tag_file)
  # load the known tag store
@@ -34,9 +34,8 @@ module Wmap
  @landings = Hash.new # cache landing page to reduce redundant browsing
  end
 
-
  # load the known tag signatures into an instance variable
- def load_from_file (file, lc=true)
+ def load_sig_from_file (file, lc=true)
  puts "Loading data file: #{file}" if @verbose
  data_store=Hash.new
  f = File.open(file, 'r')
@@ -53,7 +52,6 @@ module Wmap
  else
  data_store[entry[0]]=entry[1].strip
  end
-
  end
  f.close
  return data_store
@@ -105,11 +103,11 @@ module Wmap
  end
  alias_method :save!, :save_to_file!
 
- # add tag entries (from the sitetracker list)
+ # Refresh adware tag store signatures
  def refresh (num=@max_parallel,use_cache=true)
  puts "Add entries to the local cache table from site tracker: " if @verbose
  results = Hash.new
- tags = Wmap::SiteTracker.instance.known_sites.keys
+ tags = @tag_store.keys
  if tags.size > 0
  Parallel.map(tags, :in_processes => num) { |target|
  check_adware(target,use_cache)
@@ -66,210 +66,196 @@ class Wmap::UrlCrawler
  # A web crawler to crawl a known website and search for html links within the same root domain. For example,
  # by crawling 'http://www.yahoo.com/' it could discover 'http://login.yahoo.com/'
  def crawl(url)
- begin
- puts "Start web crawling on #{url}"
- result=Array.new
- url=url.chomp.strip
- result.push(url_2_site(url))
- raise "Error! Invalid url format: #{urls}" unless is_url?(url)
- # Add logic to profile the web server before crawling; this is used to optimize the crawling speed
- pre_crawl(url)
- status = Timeout::timeout(Crawl_timeout/1000) {
- result+=crawl_worker(url).keys
- }
- puts "Web crawling time-out on #{url}: #{status}" if @verbose
- return result
- rescue => ee
- puts "Exception on method #{__method__} for URL #{url}: #{ee}"
- return result
- end
+ puts "Start web crawling on #{url}"
+ result=Array.new
+ url=url.chomp.strip
+ result.push(url_2_site(url))
+ raise "Error! Invalid url format: #{urls}" unless is_url?(url)
+ # Add logic to profile the web server before crawling; this is used to optimize the crawling speed
+ pre_crawl(url)
+ status = Timeout::timeout(Crawl_timeout/1000) {
+ result+=crawl_worker(url).keys
+ }
+ puts "Web crawling time-out on #{url}: #{status}" if @verbose
+ return result
+ rescue => ee
+ puts "Exception on method #{__method__} for URL #{url}: #{ee}"
+ return result
  end
  alias_method :query, :crawl
 
  # The worker instance of crawler who perform the labour work
  def crawl_worker(url0)
- begin
- puts "Please be aware that it may take a while to crawl #{url0}, depending on the site's responsiveness and the amount of contents."
- # Input URL sanity check first
- if is_url?(url0)
- host=url_2_host(url0)
- ip=host_2_ip(host).to_s
- raise "Invalid IP address: #{url0}" if ip.nil?
- port=url_2_port(url0).to_s
- raise "Invalid port number: #{url0}" if port.nil?
- else
- raise "Invalid URL: #{url0}. Please check it out with your browser again."
- end
- log_info=Hash.new
- log_info[1]="Start working on #{url0}"
- url_stores=Hash.new
- url_stores[url0]=true unless url_stores.key?(url0)
- @discovered_urls_by_crawler[url0]=true unless @discovered_urls_by_crawler.key?(url0)
- @crawl_start[url0]=true unless @crawl_start.key?(url0)
+ puts "Please be aware that it may take a while to crawl #{url0}, depending on the site's responsiveness and the amount of contents."
+ # Input URL sanity check first
+ if is_url?(url0)
+ host=url_2_host(url0)
+ ip=host_2_ip(host).to_s
+ raise "Invalid IP address: #{url0}" if ip.nil?
+ port=url_2_port(url0).to_s
+ raise "Invalid port number: #{url0}" if port.nil?
+ else
+ raise "Invalid URL: #{url0}. Please check it out with your browser again."
+ end
+ log_info=Hash.new
+ log_info[1]="Start working on #{url0}"
+ url_stores=Hash.new
+ url_stores[url0]=true unless url_stores.key?(url0)
+ @discovered_urls_by_crawler[url0]=true unless @discovered_urls_by_crawler.key?(url0)
+ @crawl_start[url0]=true unless @crawl_start.key?(url0)
  # $discovered_urls[url0]=true unless $discovered_urls.key?(url0)
- @crawl_depth.times do
- url_stores.keys.each do |url|
- # 10/01/2013 add logic to avoid unnecessary crawling within the same child instance
- next if @visited_urls_by_crawler.key?(url)
- url_object = open_url(url)
- next if url_object == nil
- url = update_url_if_redirected(url, url_object)
- url_body = read_url(url)
- # Protection code - to avoid parsing failure on the empty or nil object
- next if url_body.nil? or url_body.empty?
- url_stores[url]=true unless url_stores.key?(url)
- @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
+ @crawl_depth.times do
+ url_stores.keys.each do |url|
+ # 10/01/2013 add logic to avoid unnecessary crawling within the same child instance
+ next if @visited_urls_by_crawler.key?(url)
+ url_object = open_url(url)
+ next if url_object == nil
+ url = update_url_if_redirected(url, url_object)
+ url_body = read_url(url)
+ # Protection code - to avoid parsing failure on the empty or nil object
+ next if url_body.nil? or url_body.empty?
+ url_stores[url]=true unless url_stores.key?(url)
+ @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
  # $discovered_urls[url]=true unless $discovered_urls.key?(url)
- doc = Nokogiri::HTML(url_body)
- next if doc == nil
- if url_stores.size >= @crawl_page_limit
- #@visited_urls_by_crawler.merge!(url_stores)
- @discovered_urls_by_crawler.merge!(url_stores)
+ doc = Nokogiri::HTML(url_body)
+ next if doc == nil
+ if url_stores.size >= @crawl_page_limit
+ #@visited_urls_by_crawler.merge!(url_stores)
+ @discovered_urls_by_crawler.merge!(url_stores)
  # $discovered_urls.merge!(url_stores)
- puts "Finish web crawling the url: #{url0}"
- return url_stores
- end
- page_urls = find_urls_on_page(doc, url)
- page_urls.uniq!
- page_urls.map do |y|
- y=normalize_url(y)
- url_stores[y]=true unless url_stores.key?(y)
- @discovered_urls_by_crawler[y]=true unless @discovered_urls_by_crawler.key?(y)
+ puts "Finish web crawling the url: #{url0}"
+ return url_stores
+ end
+ page_urls = find_urls_on_page(doc, url)
+ page_urls.uniq!
+ page_urls.map do |y|
+ y=normalize_url(y)
+ url_stores[y]=true unless url_stores.key?(y)
+ @discovered_urls_by_crawler[y]=true unless @discovered_urls_by_crawler.key?(y)
  # $discovered_urls[y]=true unless $discovered_urls.key?(y)
- end
  end
  end
- puts "Finish web crawling on: #{url0}"
- log_info[2]="Finish working on: #{url0}"
- wlog(log_info, "UrlCrawler", @log_file)
- @crawl_done[url0]=true unless @crawl_done.key?(url0)
- return url_stores
- rescue => ee
- puts "Exception on method #{__method__} for URL #{url0}: #{ee}" if @verbose
- log_info[3]="Exception on #{url0}"
- wlog(log_info,"UrlCrawler",@log_file)
- return url_stores
  end
+ puts "Finish web crawling on: #{url0}"
+ log_info[2]="Finish working on: #{url0}"
+ wlog(log_info, "UrlCrawler", @log_file)
+ @crawl_done[url0]=true unless @crawl_done.key?(url0)
+ return url_stores
+ rescue => ee
+ puts "Exception on method #{__method__} for URL #{url0}: #{ee}" if @verbose
+ log_info[3]="Exception on #{url0}"
+ wlog(log_info,"UrlCrawler",@log_file)
+ return url_stores
  end
 
  # Fast crawling by utilizing fork manager parallel to spawn numbers of child processes at the same time
  # each child process will continuously work on the target pool until all the works are done
  def crawl_workers (targets,num=@max_parallel)
- begin
- raise "Input error - expecting targets in an array format: #{targets}" unless targets.kind_of? Array
- puts "Sanitize the URL seeds to eliminate the unnecessary duplication(s) ..." if @verbose
- #puts "This could be awhile depending on the list size. Please be patient ..."
- # 09/30/2013 Add additional logic to eliminate the duplicate target site(s) before the crawlers are invoked.
- targets -= ["", nil]
- uniq_sites=Hash.new
- targets.dup.map do |target|
- if is_url?(target)
- host=url_2_host(target)
- ip=host_2_ip(host).to_s
- next if ip.nil?
- port=url_2_port(target).to_s
- next if port.nil?
- site_key=ip+":"+port
- unless uniq_sites.key?(site_key)
- uniq_sites[site_key]=target
- end
+ raise "Input error - expecting targets in an array format: #{targets}" unless targets.kind_of? Array
+ puts "Sanitize the URL seeds to eliminate the unnecessary duplication(s) ..." if @verbose
+ #puts "This could be awhile depending on the list size. Please be patient ..."
+ # 09/30/2013 Add additional logic to eliminate the duplicate target site(s) before the crawlers are invoked.
+ targets -= ["", nil]
+ uniq_sites=Hash.new
+ targets.dup.map do |target|
+ if is_url?(target)
+ host=url_2_host(target)
+ ip=host_2_ip(host).to_s
+ next if ip.nil?
+ port=url_2_port(target).to_s
+ next if port.nil?
+ site_key=ip+":"+port
+ unless uniq_sites.key?(site_key)
+ uniq_sites[site_key]=target
  end
  end
- puts "Sanitization done! " if @verbose
- puts "Start the parallel engine on the normalized crawling list:\n #{targets} "
- puts "Maximum number of web crawling sessions allowed: #{num}" #if @verbose
- raise "Error: target list is empty!" if targets.size < 1
- Parallel.map(uniq_sites.values, :in_processes => num) { |target|
- puts "Working on #{target} ..." if @verbose
- crawl(target)
- }.dup.each do |process|
- puts "process.inspect: #{process}" if @verbose
- urls=process
- urls-=["",nil] unless urls.nil?
- if urls.nil?
- next
- elsif urls.empty?
- next
- #do nothing
- else
- urls.map do |url|
- url.strip!
- @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
- #$discovered_urls[url]=true unless $discovered_urls.key?(url)
- end
+ end
+ puts "Sanitization done! " if @verbose
+ puts "Start the parallel engine on the normalized crawling list:\n #{targets} "
+ puts "Maximum number of web crawling sessions allowed: #{num}" #if @verbose
+ raise "Error: target list is empty!" if targets.size < 1
+ Parallel.map(uniq_sites.values, :in_processes => num) { |target|
+ puts "Working on #{target} ..." if @verbose
+ crawl(target)
+ }.dup.each do |process|
+ puts "process.inspect: #{process}" if @verbose
+ urls=process
+ urls-=["",nil] unless urls.nil?
+ if urls.nil?
+ next
+ elsif urls.empty?
+ next
+ #do nothing
+ else
+ urls.map do |url|
+ url.strip!
+ @discovered_urls_by_crawler[url]=true unless @discovered_urls_by_crawler.key?(url)
+ #$discovered_urls[url]=true unless $discovered_urls.key?(url)
  end
  end
- #return sites
- return @discovered_urls_by_crawler.keys
- rescue Exception => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
- return nil
  end
+ #return sites
+ return @discovered_urls_by_crawler.keys
+ rescue Exception => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
  end
  alias_method :crawls, :crawl_workers
 
  # Fast crawling method - build the target pool from the input file
  def crawl_workers_on_file (file)
- begin
- puts "Web crawl the list of targets from file: #{file}"
- targets=file_2_list(file)
- sites=crawl_workers(targets,num=@max_parallel)
- return sites
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
- return nil
- end
+ puts "Web crawl the list of targets from file: #{file}"
+ targets=file_2_list(file)
+ sites=crawl_workers(targets,num=@max_parallel)
+ return sites
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
  end
  alias_method :query_file, :crawl_workers_on_file
  alias_method :crawl_file, :crawl_workers_on_file
 
  # Wrapper for the OpenURI open method - create an open_uri object and return the reference upon success
  def open_url(url)
- begin
- puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
- if url =~ /http\:/i
- # patch for allow the 'un-safe' URL redirection i.e. https://www.example.com -> http://www.example.com
- url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000)
- #url_object = open(url)
- elsif url =~ /https\:/i
- url_object = open(url,:ssl_verify_mode => 0, :allow_redirections =>:safe, :read_timeout=>Max_http_timeout/1000)
- #url_object = open(url,:ssl_verify_mode => 0)
- else
- raise "Invalid URL format - please specify the protocol prefix http(s) in the URL: #{url}"
- end
- return url_object
- rescue => ee
- puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
- return nil
- end
+ puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
+ if url =~ /http\:/i
+ # patch for allow the 'un-safe' URL redirection i.e. https://www.example.com -> http://www.example.com
+ url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000)
+ #url_object = open(url)
+ elsif url =~ /https\:/i
+ url_object = open(url,:ssl_verify_mode => 0, :allow_redirections =>:safe, :read_timeout=>Max_http_timeout/1000)
+ #url_object = open(url,:ssl_verify_mode => 0)
+ else
+ raise "Invalid URL format - please specify the protocol prefix http(s) in the URL: #{url}"
+ end
+ return url_object
+ rescue => ee
+ puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
+ return nil
  end
 
  # Wrapper to use OpenURI method 'read' to return url body contents
  def read_url(url)
- begin
- puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
- url_object=open_url(url)
- @visited_urls_by_crawler[url]=true unless @visited_urls_by_crawler.key?(url)
- body=url_object.read
- return body
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
- return nil
- end
+ puts "Wrapper to return the OpenURI object for url: #{url}" if @verbose
+ url_object=open_url(url)
+ @visited_urls_by_crawler[url]=true unless @visited_urls_by_crawler.key?(url)
+ body=url_object.read
+ return body
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
  end
 
  # Return the destination url in case of url re-direct
  def update_url_if_redirected(url, url_object)
- begin
- #puts "Comparing the original URL with the return object base_uri. Return the one where the true content is found. " if @verbose
- if url != url_object.base_uri.to_s
- return url_object.base_uri.to_s
- end
- return url
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
- return nil
- end
+ #puts "Comparing the original URL with the return object base_uri. Return the one where the true content is found. " if @verbose
+ if url != url_object.base_uri.to_s
+ return url_object.base_uri.to_s
+ end
+ return url
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
  end
 
  =begin
@@ -290,90 +276,82 @@ class Wmap::UrlCrawler
 
  # Search 'current_url' and return found URLs under the same domain
  def find_urls_on_page(doc, current_url)
- begin
- puts "Search and return URLs within the doc: #{doc}" if @verbose
- urls_list = []
- # case 1 - search embedded HTML tag <a href='url'> for the url elements
- links=doc.css('a')
- links.map do |x|
- #puts "x: #{x}"
- new_url = x.attribute('href').to_s
- unless new_url == nil
- if new_url.match("http")
- #if urls_on_same_domain?(new_url,current_url)
- urls_list.push(new_url)
- #end
- else
- new_url = make_absolute(current_url, new_url)
+ puts "Search and return URLs within the doc: #{doc}" if @verbose
+ urls_list = []
+ # case 1 - search embedded HTML tag <a href='url'> for the url elements
+ links=doc.css('a')
+ links.map do |x|
+ #puts "x: #{x}"
+ new_url = x.attribute('href').to_s
+ unless new_url == nil
+ if new_url.match("http")
+ #if urls_on_same_domain?(new_url,current_url)
  urls_list.push(new_url)
- end
+ #end
+ else
+ new_url = make_absolute(current_url, new_url)
+ urls_list.push(new_url)
  end
  end
- # case 2 - search client side redirect - <meta http-equiv="refresh" content="5;URL='http://example.com/'">
- elements=doc.css("meta[http-equiv]")
- unless elements.size == 0
- link=elements.attr("content").value.split(/url\=/i)[1]
- unless link.nil?
- new_url = make_absolute(current_url, link)
- urls_list.push(new_url) unless new_url.nil?
- end
+ end
+ # case 2 - search client side redirect - <meta http-equiv="refresh" content="5;URL='http://example.com/'">
+ elements=doc.css("meta[http-equiv]")
+ unless elements.size == 0
+ link=elements.attr("content").value.split(/url\=/i)[1]
+ unless link.nil?
+ new_url = make_absolute(current_url, link)
+ urls_list.push(new_url) unless new_url.nil?
  end
- #puts "Found URLs under page #{current_url}:\n#{urls_list}" if @verbose
- return urls_list.uniq-["",nil]
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
- return nil
  end
+ #puts "Found URLs under page #{current_url}:\n#{urls_list}" if @verbose
+ return urls_list.uniq-["",nil]
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
  end
 
  # Method to print out discovery URL result
  def print_discovered_urls_by_crawler
- begin
- puts "Print discovered url by the crawler. " if @verbose
- puts "\nSummary Report of Discovered URLs from the Crawler:"
- @discovered_urls_by_crawler.keys.each do |url|
- puts url
- end
- puts "Total: #{@discovered_urls_by_crawler.keys.size}"
- puts "End of the summary"
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
- return nil
- end
+ puts "Print discovered url by the crawler. " if @verbose
+ puts "\nSummary Report of Discovered URLs from the Crawler:"
+ @discovered_urls_by_crawler.keys.each do |url|
+ puts url
+ end
+ puts "Total: #{@discovered_urls_by_crawler.keys.size}"
+ puts "End of the summary"
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
  end
  alias_method :print, :print_discovered_urls_by_crawler
 
  # Method to save URL discovery result
  def save_discovered_urls (file)
- begin
- puts "Save discovered urls by the crawler to file: #{file} "
- list_2_file(@discovered_urls_by_crawler.keys, file)
- puts "Done!"
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
- return nil
- end
+ puts "Save discovered urls by the crawler to file: #{file} "
+ list_2_file(@discovered_urls_by_crawler.keys, file)
+ puts "Done!"
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
  end
  alias_method :save, :save_discovered_urls
 
  # Method to retrieve discovery site result
  def get_discovered_sites_by_crawler
- begin
- puts "Print summary report of discovered sites. " if @verbose
- puts "\nSummary Report of Discovered Sites from the Crawler:"
- sites = Hash.new
- @discovered_urls_by_crawler.keys.each do |url|
- site=url_2_site(url)
- sites[site]=true unless sites.key?(site)
- end
- sites.keys.map { |site| puts site }
- puts "Total: #{sites.size}"
- puts "End of the summary"
- return sites.keys
- rescue => ee
- puts "Exception on method #{__method__}: #{ee}" if @verbose
- return nil
- end
+ puts "Print summary report of discovered sites. " if @verbose
+ puts "\nSummary Report of Discovered Sites from the Crawler:"
+ sites = Hash.new
+ @discovered_urls_by_crawler.keys.each do |url|
+ site=url_2_site(url)
+ sites[site]=true unless sites.key?(site)
+ end
+ sites.keys.map { |site| puts site }
+ puts "Total: #{sites.size}"
+ puts "End of the summary"
+ return sites.keys
+ rescue => ee
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
+ return nil
  end
  alias_method :get_sites, :get_discovered_sites_by_crawler
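
The two Wmap::UrlCrawler hunks above are largely a mechanical refactor: the explicit begin/end wrapper inside each method is removed and the rescue clause is attached directly to the method body, which Ruby treats identically while saving a level of indentation. A minimal sketch of the idiom, with illustrative names rather than code from the gem:

    # A method body acts as an implicit begin block, so a trailing rescue
    # covers the whole method without an explicit begin/end wrapper.
    def fetch_page(url)
      raise ArgumentError, "empty URL" if url.to_s.empty?
      "fetched #{url}"
    rescue => ee
      puts "Exception on method #{__method__}: #{ee}"
      nil
    end

    puts fetch_page("http://example.com")  # => fetched http://example.com
    puts fetch_page("").inspect            # exception rescued, prints nil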
 
data/version.txt CHANGED
@@ -3,8 +3,8 @@
  ###############################################################################
  package = wmap
  # wmap version 2.0 == web_discovery version 1.5.3
- version = 2.6.6
- date = 2019-11-12
+ version = 2.6.7
+ date = 2019-11-19
 
  author = Sam (Yang) Li
  email = yang.li@owasp.org
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: wmap
  version: !ruby/object:Gem::Version
- version: 2.6.6
+ version: 2.6.7
  platform: ruby
  authors:
  - Sam (Yang) Li
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2019-11-12 00:00:00.000000000 Z
+ date: 2019-11-19 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: dnsruby