wmap 2.7.9 → 2.8.1
- checksums.yaml +4 -4
- data/bin/wmap +4 -0
- data/lib/wmap/url_crawler.rb +9 -21
- data/lib/wmap/utils/logger.rb +31 -34
- data/lib/wmap/utils/url_magic.rb +4 -2
- data/version.txt +2 -2
- metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0430d7df339ee9c4f6099e1175b6cdef5ee0a453c2a5340e72e897e92a24ecfd
+  data.tar.gz: bb4aef610f1f71c992a25c41e0bcd1601039a224a250c251a4fc6b64bc2c362c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 371d69e9e72145befa5589a79ca7a7ac541c022881fc4a7ebf2c0ccedd3ab92962557487f7195f216427e1cc3f876efcbcae7cd00adf6ea4e55fec72ec94c5fe
+  data.tar.gz: f3e2245e00a680240b8c0b9f41a2074aff46f5b5ae7cd71ace31be71456a7ee0b909f95223b49dc846cd2f40bdd71cb26f815553943b2b1e78e9b2b567a8008c
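
For reference, the SHA256 values above can be reproduced locally. A minimal sketch, assuming the release has been fetched and unpacked first (a .gem file is a plain tar archive containing metadata.gz and data.tar.gz); the file paths are illustrative:

    # gem fetch wmap -v 2.8.1 && tar -xf wmap-2.8.1.gem
    require 'digest'

    %w[metadata.gz data.tar.gz].each do |f|
      # Compare against the SHA256 entries in checksums.yaml above
      puts "#{f}: #{Digest::SHA256.file(f).hexdigest}"
    end
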
data/bin/wmap CHANGED
data/lib/wmap/url_crawler.rb CHANGED
@@ -17,7 +17,8 @@ require "parallel"
 class Wmap::UrlCrawler
   include Wmap::Utils
 
-  attr_accessor :http_timeout, :crawl_page_limit, :crawl_depth, :max_parallel,
+  attr_accessor :http_timeout, :crawl_page_limit, :crawl_depth, :max_parallel, \
+    :verbose, :data_dir, :user_agent
   attr_reader :discovered_urls_by_crawler, :visited_urls_by_crawler, :crawl_start, :crawl_done
   # Global variable used to store the combined result of all the forked child processes. Note that class variable
   # would not be able to pass the result due the limitation of IO Pipe communication mechanism used by 'parallel' fork manager
@@ -35,13 +36,16 @@ class Wmap::UrlCrawler
     @crawl_depth=params.fetch(:crawl_depth, 4)
     @crawl_page_limit=params.fetch(:crawl_page_limit, 1000)
     @max_parallel=params.fetch(:max_parallel, 40)
+    @user_agent=params.fetch(:user_agent, "OWASP WMAP Spider")
     # Discovered data store
     @discovered_urls_by_crawler=Hash.new
     @visited_urls_by_crawler=Hash.new
     @crawl_start=Hash.new
     @crawl_done=Hash.new
     Dir.mkdir(@data_dir) unless Dir.exist?(@data_dir)
-    @
+    @log_dir=@data_dir + "/../logs/"
+    Dir.mkdir(@log_dir) unless Dir.exist?(@log_dir)
+    @log_file=@log_dir + "crawler.log"
   end
 
   # Pre-crawl profiler, to be used for network profiling to maximum the crawler performance.
@@ -216,14 +220,14 @@ class Wmap::UrlCrawler
   alias_method :crawl_file, :crawl_workers_on_file
 
   # Wrapper for the OpenURI open method - create an open_uri object and return the reference upon success
-  def open_url(url)
+  def open_url(url,user_agent=@user_agent)
     puts "Open url #{url} by creating an open_uri object. Return the reference upon success." if @verbose
     if url =~ /http\:/i
       # patch for allow the 'un-safe' URL redirection i.e. https://www.example.com -> http://www.example.com
-      url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000)
+      url_object = open(url, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000, "User-Agent"=>user_agent)
       #url_object = open(url)
     elsif url =~ /https\:/i
-      url_object = open(url
+      url_object = open(url, :ssl_verify_mode=>0, :allow_redirections=>:safe, :read_timeout=>Max_http_timeout/1000, "User-Agent"=>user_agent)
       #url_object = open(url,:ssl_verify_mode => 0)
     else
       raise "Invalid URL format - please specify the protocol prefix http(s) in the URL: #{url}"
@@ -258,22 +262,6 @@ class Wmap::UrlCrawler
     return nil
   end
 
-=begin
-  # Wrapper for the Nokogiri DOM parser
-  def parse_html(html_body)
-    begin
-      #puts "Parsing the html content: #{html_body}. Return DOM " if @verbose
-      doc = Nokogiri::HTML(html_body)
-      #puts "Successfully crawling the url: #{url_object.base_uri.to_s}" if @verbose
-      #puts "doc: #{doc}" if @verbose
-      return doc
-    rescue => ee
-      puts "Exception on method #{__method__}: #{ee}" if @verbose
-      return nil
-    end
-  end
-=end
-
   # Search 'current_url' and return found URLs under the same domain
   def find_urls_on_page(doc, current_url)
     puts "Search and return URLs within the doc: #{doc}" if @verbose
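
The hunks above make the crawler's User-Agent configurable (defaulting to "OWASP WMAP Spider") and pass it into both the http and https open-uri calls, alongside a new per-crawler log file. A minimal usage sketch, assuming the gem is installed; the :user_agent option mirrors the params.fetch key shown above, and the target URL is illustrative:

    require 'wmap'

    # :user_agent overrides the "OWASP WMAP Spider" default added in this release
    crawler = Wmap::UrlCrawler.new(:user_agent => "MyScanner/1.0", :max_parallel => 10)
    page = crawler.open_url("https://www.example.com")  # request carries the custom User-Agent header
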
data/lib/wmap/utils/logger.rb CHANGED
@@ -8,46 +8,43 @@
 
 
 module Wmap
-module Utils
+  module Utils
     # Module to log debugging and other messages
-module Logger
+    module Logger
       extend self
       # Append information into the log file for the trouble-shooting purpose
       def wlog (obj, agent, file)
         puts "Writing #{obj} into log file: #{file}" if @verbose
-
-
-
-
-
-
-
-
-
-          obj.map { |x| @@f.write " #{x}\n" }
-          puts "The list is successfully saved into the log file: #{file} " if @verbose
-        end
-      when Hash
-        if obj.length >= 0
-          @@f.write "#{timestamp}: #{agent}: \n"
-          obj.each_value { |value| @@f.write " #{value}\n" }
-          puts "The hash is successfully saved into the log file: #{file} " if @verbose
-        end
-      when String
-        @@f.write "#{timestamp}: #{agent}: #{obj}\n"
-        puts "The string is successfully saved into the log file: #{file} " if @verbose
-      else
-        #do nothing
-        puts "Un-handled exception on: #{obj}" if @verbose
+        return false if obj.nil?
+        @@f=File.open(file,'a')
+        timestamp=Time.now
+        case obj
+        when Array
+          if obj.size >= 0
+            @@f.write "#{timestamp}: #{agent}: \n"
+            obj.map { |x| @@f.write " #{x}\n" }
+            puts "The list is successfully saved into the log file: #{file} " if @verbose
         end
-
-
-
-
-
-
+        when Hash
+          if obj.length >= 0
+            @@f.write "#{timestamp}: #{agent}: \n"
+            obj.each_value { |value| @@f.write " #{value}\n" }
+            puts "The hash is successfully saved into the log file: #{file} " if @verbose
+          end
+        when String
+          @@f.write "#{timestamp}: #{agent}: #{obj}\n"
+          puts "The string is successfully saved into the log file: #{file} " if @verbose
+        else
+          #do nothing
+          puts "Un-handled exception on: #{obj}" if @verbose
+        end
+        @@f.close
+        return true
+      rescue => ee
+        puts "Exception on method #{__method__}: #{ee}" if @verbose
+        return false
       end
-
-  end
+
+    end
 end
 end
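
The rewritten wlog now opens the log file itself, handles Array, Hash, and String payloads, and returns true or false instead of raising on bad input. A small usage sketch (the agent name and log path are illustrative; the module can be called directly because of extend self):

    require 'wmap'

    log = "/tmp/wmap_test.log"
    # Returns true on success, false when obj is nil or an exception is rescued
    Wmap::Utils::Logger.wlog("crawl started", "my_agent", log)
    Wmap::Utils::Logger.wlog(["http://a.example.com/", "http://b.example.com/"], "my_agent", log)
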
data/lib/wmap/utils/url_magic.rb CHANGED
@@ -15,6 +15,7 @@ module Wmap
 
   # set hard stop limit of http time-out to 8 seconds, in order to avoid severe performance penalty for certain 'weird' site(s)
   Max_http_timeout=15000
+  User_agent = "OWASP WMAP Spider"
 
   # Simple sanity check on a 'claimed' URL string.
   def is_url?(url)
@@ -377,7 +378,8 @@ module Wmap
 
   # Given an URL, open the page, then return the DOM text from a normal user perspective
   def open_page(url)
-    args = {ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, allow_redirections: :safe,
+    args = {ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, allow_redirections: :safe, \
+      read_timeout: Max_http_timeout/1000, "User-Agent"=>User_agent}
     doc = Nokogiri::HTML(open(url, args))
     if doc.text.include?("Please enable JavaScript to view the page content")
       puts "Invoke headless chrome through webdriver ..." if @verbose
@@ -385,7 +387,7 @@ module Wmap
     #driver = Selenium::WebDriver.for :chrome
     # http://watir.com/guides/chrome/
     args = ['--ignore-certificate-errors', '--disable-popup-blocking', '--disable-translate', '--disk-cache-size 8192']
-    browser = Watir::Browser.new :chrome, headless: true,
+    browser = Watir::Browser.new :chrome, headless: true, switches: %w[--user-agent=OWASP\ WMAP\ Spider]
     browser.goto(url)
     sleep(2) # wait for the loading
     doc = Nokogiri::HTML(browser.html)
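
open_page now identifies itself with the new User_agent constant on the open-uri request and, when the page demands JavaScript, falls back to headless Chrome started with a matching --user-agent switch. A usage sketch, assuming a working chromedriver setup and that open_page is reachable through the Wmap::Utils mixin as in Wmap::UrlCrawler above; the URL is illustrative:

    require 'wmap'

    crawler = Wmap::UrlCrawler.new
    doc = crawler.open_page("https://www.example.com")  # returns a Nokogiri::HTML document
    puts doc.css("a").map { |a| a["href"] }.compact.first(10)
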
data/version.txt CHANGED
@@ -3,8 +3,8 @@
 ###############################################################################
 package = wmap
 # wmap version 2.0 == web_discovery version 1.5.3
-version = 2.7.9
-date = 2020-
+version = 2.8.1
+date = 2020-05-08
 
 author = Sam (Yang) Li
 email = yang.li@owasp.org
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wmap
 version: !ruby/object:Gem::Version
-  version: 2.7.9
+  version: 2.8.1
 platform: ruby
 authors:
 - Sam (Yang) Li
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-
+date: 2020-05-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: dnsruby
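
To pick up the changes above, the dependency can simply be pinned to the new release; an illustrative Gemfile entry:

    # Gemfile
    gem 'wmap', '2.8.1'
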