wmap 2.7.9 → 2.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wmap +4 -0
- data/lib/wmap/url_crawler.rb +9 -21
- data/lib/wmap/utils/logger.rb +31 -34
- data/lib/wmap/utils/url_magic.rb +4 -2
- data/version.txt +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0430d7df339ee9c4f6099e1175b6cdef5ee0a453c2a5340e72e897e92a24ecfd
|
4
|
+
data.tar.gz: bb4aef610f1f71c992a25c41e0bcd1601039a224a250c251a4fc6b64bc2c362c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 371d69e9e72145befa5589a79ca7a7ac541c022881fc4a7ebf2c0ccedd3ab92962557487f7195f216427e1cc3f876efcbcae7cd00adf6ea4e55fec72ec94c5fe
|
7
|
+
data.tar.gz: f3e2245e00a680240b8c0b9f41a2074aff46f5b5ae7cd71ace31be71456a7ee0b909f95223b49dc846cd2f40bdd71cb26f815553943b2b1e78e9b2b567a8008c
|
data/bin/wmap
CHANGED
data/lib/wmap/url_crawler.rb
CHANGED
@@ -17,7 +17,8 @@ require "parallel"
|
|
17
17
|
class Wmap::UrlCrawler
|
18
18
|
include Wmap::Utils
|
19
19
|
|
20
|
-
attr_accessor :http_timeout, :crawl_page_limit, :crawl_depth, :max_parallel,
|
20
|
+
attr_accessor :http_timeout, :crawl_page_limit, :crawl_depth, :max_parallel, \
|
21
|
+
:verbose, :data_dir, :user_agent
|
21
22
|
attr_reader :discovered_urls_by_crawler, :visited_urls_by_crawler, :crawl_start, :crawl_done
|
22
23
|
# Global variable used to store the combined result of all the forked child processes. Note that class variable
|
23
24
|
# would not be able to pass the result due to the limitation of the IO pipe communication mechanism used by the 'parallel' fork manager
|
@@ -35,13 +36,16 @@ class Wmap::UrlCrawler
|
|
35
36
|
@crawl_depth=params.fetch(:crawl_depth, 4)
|
36
37
|
@crawl_page_limit=params.fetch(:crawl_page_limit, 1000)
|
37
38
|
@max_parallel=params.fetch(:max_parallel, 40)
|
39
|
+
@user_agent=params.fetch(:user_agent, "OWASP WMAP Spider")
|
38
40
|
# Discovered data store
|
39
41
|
@discovered_urls_by_crawler=Hash.new
|
40
42
|
@visited_urls_by_crawler=Hash.new
|
41
43
|
@crawl_start=Hash.new
|
42
44
|
@crawl_done=Hash.new
|
43
45
|
Dir.mkdir(@data_dir) unless Dir.exist?(@data_dir)
|
44
|
-
@
|
46
|
+
@log_dir=@data_dir + "/../logs/"
|
47
|
+
Dir.mkdir(@log_dir) unless Dir.exist?(@log_dir)
|
48
|
+
@log_file=@log_dir + "crawler.log"
|
45
49
|
end
|
46
50
|
|
47
51
|
# Pre-crawl profiler, to be used for network profiling to maximize the crawler performance.
|
@@ -216,14 +220,14 @@ class Wmap::UrlCrawler
|
|
216
220
|
alias_method :crawl_file, :crawl_workers_on_file
|
217
221
|
|
218
222
|
# Wrapper for the OpenURI open method - create an open_uri object and return the reference upon success
|
219
|
-
def open_url(url)
|
223
|
+
def open_url(url,user_agent=@user_agent)
|
220
224
|
puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
|
221
225
|
if url =~ /http\:/i
|
222
226
|
# patch for allow the 'un-safe' URL redirection i.e. https://www.example.com -> http://www.example.com
|
223
|
-
url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000)
|
227
|
+
url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000, "User-Agent"=>user_agent)
|
224
228
|
#url_object = open(url)
|
225
229
|
elsif url =~ /https\:/i
|
226
|
-
url_object = open(url
|
230
|
+
url_object = open(url, :ssl_verify_mode=>0, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000, "User-Agent"=>user_agent)
|
227
231
|
#url_object = open(url,:ssl_verify_mode => 0)
|
228
232
|
else
|
229
233
|
raise "Invalid URL format - please specify the protocol prefix http(s) in the URL: #{url}"
|
@@ -258,22 +262,6 @@ class Wmap::UrlCrawler
|
|
258
262
|
return nil
|
259
263
|
end
|
260
264
|
|
261
|
-
=begin
|
262
|
-
# Wrapper for the Nokogiri DOM parser
|
263
|
-
def parse_html(html_body)
|
264
|
-
begin
|
265
|
-
#puts "Parsing the html content: #{html_body}. Return DOM " if @verbose
|
266
|
-
doc = Nokogiri::HTML(html_body)
|
267
|
-
#puts "Successfully crawling the url: #{url_object.base_uri.to_s}" if @verbose
|
268
|
-
#puts "doc: #{doc}" if @verbose
|
269
|
-
return doc
|
270
|
-
rescue => ee
|
271
|
-
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
272
|
-
return nil
|
273
|
-
end
|
274
|
-
end
|
275
|
-
=end
|
276
|
-
|
277
265
|
# Search 'current_url' and return found URLs under the same domain
|
278
266
|
def find_urls_on_page(doc, current_url)
|
279
267
|
puts "Search and return URLs within the doc: #{doc}" if @verbose
|
data/lib/wmap/utils/logger.rb
CHANGED
@@ -8,46 +8,43 @@
|
|
8
8
|
|
9
9
|
|
10
10
|
module Wmap
|
11
|
-
module Utils
|
11
|
+
module Utils
|
12
12
|
# Module to log debugging and other messages
|
13
|
-
module Logger
|
13
|
+
module Logger
|
14
14
|
extend self
|
15
15
|
# Append information into the log file for the trouble-shooting purpose
|
16
16
|
def wlog (obj, agent, file)
|
17
17
|
puts "Writing #{obj} into log file: #{file}" if @verbose
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
obj.map { |x| @@f.write " #{x}\n" }
|
28
|
-
puts "The list is successfully saved into the log file: #{file} " if @verbose
|
29
|
-
end
|
30
|
-
when Hash
|
31
|
-
if obj.length >= 0
|
32
|
-
@@f.write "#{timestamp}: #{agent}: \n"
|
33
|
-
obj.each_value { |value| @@f.write " #{value}\n" }
|
34
|
-
puts "The hash is successfully saved into the log file: #{file} " if @verbose
|
35
|
-
end
|
36
|
-
when String
|
37
|
-
@@f.write "#{timestamp}: #{agent}: #{obj}\n"
|
38
|
-
puts "The string is successfully saved into the log file: #{file} " if @verbose
|
39
|
-
else
|
40
|
-
#do nothing
|
41
|
-
puts "Un-handled exception on: #{obj}" if @verbose
|
18
|
+
return false if obj.nil?
|
19
|
+
@@f=File.open(file,'a')
|
20
|
+
timestamp=Time.now
|
21
|
+
case obj
|
22
|
+
when Array
|
23
|
+
if obj.size >= 0
|
24
|
+
@@f.write "#{timestamp}: #{agent}: \n"
|
25
|
+
obj.map { |x| @@f.write " #{x}\n" }
|
26
|
+
puts "The list is successfully saved into the log file: #{file} " if @verbose
|
42
27
|
end
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
28
|
+
when Hash
|
29
|
+
if obj.length >= 0
|
30
|
+
@@f.write "#{timestamp}: #{agent}: \n"
|
31
|
+
obj.each_value { |value| @@f.write " #{value}\n" }
|
32
|
+
puts "The hash is successfully saved into the log file: #{file} " if @verbose
|
33
|
+
end
|
34
|
+
when String
|
35
|
+
@@f.write "#{timestamp}: #{agent}: #{obj}\n"
|
36
|
+
puts "The string is successfully saved into the log file: #{file} " if @verbose
|
37
|
+
else
|
38
|
+
#do nothing
|
39
|
+
puts "Un-handled exception on: #{obj}" if @verbose
|
40
|
+
end
|
41
|
+
@@f.close
|
42
|
+
return true
|
43
|
+
rescue => ee
|
44
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
45
|
+
return false
|
49
46
|
end
|
50
|
-
|
51
|
-
end
|
47
|
+
|
48
|
+
end
|
52
49
|
end
|
53
50
|
end
|
data/lib/wmap/utils/url_magic.rb
CHANGED
@@ -15,6 +15,7 @@ module Wmap
|
|
15
15
|
|
16
16
|
# set hard stop limit of http time-out to 15 seconds (15000 ms), in order to avoid severe performance penalty for certain 'weird' site(s)
|
17
17
|
Max_http_timeout=15000
|
18
|
+
User_agent = "OWASP WMAP Spider"
|
18
19
|
|
19
20
|
# Simple sanity check on a 'claimed' URL string.
|
20
21
|
def is_url?(url)
|
@@ -377,7 +378,8 @@ module Wmap
|
|
377
378
|
|
378
379
|
# Given a URL, open the page, then return the DOM text from a normal user perspective
|
379
380
|
def open_page(url)
|
380
|
-
args = {ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, allow_redirections: :safe,
|
381
|
+
args = {ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, allow_redirections: :safe, \
|
382
|
+
read_timeout: Max_http_timeout/1000, "User-Agent"=>User_agent}
|
381
383
|
doc = Nokogiri::HTML(open(url, args))
|
382
384
|
if doc.text.include?("Please enable JavaScript to view the page content")
|
383
385
|
puts "Invoke headless chrome through webdriver ..." if @verbose
|
@@ -385,7 +387,7 @@ module Wmap
|
|
385
387
|
#driver = Selenium::WebDriver.for :chrome
|
386
388
|
# http://watir.com/guides/chrome/
|
387
389
|
args = ['--ignore-certificate-errors', '--disable-popup-blocking', '--disable-translate', '--disk-cache-size 8192']
|
388
|
-
browser = Watir::Browser.new :chrome, headless: true,
|
390
|
+
browser = Watir::Browser.new :chrome, headless: true, switches: %w[--user-agent=OWASP\ WMAP\ Spider]
|
389
391
|
browser.goto(url)
|
390
392
|
sleep(2) # wait for the loading
|
391
393
|
doc = Nokogiri::HTML(browser.html)
|
data/version.txt
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
###############################################################################
|
4
4
|
package = wmap
|
5
5
|
# wmap version 2.0 == web_discovery version 1.5.3
|
6
|
-
version = 2.
|
7
|
-
date = 2020-
|
6
|
+
version = 2.8.1
|
7
|
+
date = 2020-05-08
|
8
8
|
|
9
9
|
author = Sam (Yang) Li
|
10
10
|
email = yang.li@owasp.org
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wmap
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.8.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam (Yang) Li
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-05-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: dnsruby
|