wmap 2.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +141 -0
- data/LICENSE.txt +15 -0
- data/README.rdoc +98 -0
- data/TODO +13 -0
- data/bin/deprime +21 -0
- data/bin/distrust +38 -0
- data/bin/googleBot +23 -0
- data/bin/prime +21 -0
- data/bin/refresh +26 -0
- data/bin/run_tests +16 -0
- data/bin/spiderBot +26 -0
- data/bin/trust +38 -0
- data/bin/updateAll +57 -0
- data/bin/wadd +25 -0
- data/bin/wadds +26 -0
- data/bin/wcheck +28 -0
- data/bin/wdel +25 -0
- data/bin/wdump +21 -0
- data/bin/wmap +151 -0
- data/bin/wscan +32 -0
- data/data/cidrs +2 -0
- data/data/deactivated_sites +1 -0
- data/data/domains +2 -0
- data/data/hosts +1 -0
- data/data/prime_hosts +1 -0
- data/data/sites +2 -0
- data/data/sub_domains +2 -0
- data/demos/bruter.rb +27 -0
- data/demos/dns_brutes.rb +28 -0
- data/demos/filter_cidr.rb +18 -0
- data/demos/filter_crawls.rb +5 -0
- data/demos/filter_domain.rb +25 -0
- data/demos/filter_geoip.rb +26 -0
- data/demos/filter_known_services.rb +59 -0
- data/demos/filter_netinfo.rb +23 -0
- data/demos/filter_prime.rb +25 -0
- data/demos/filter_profiler.rb +3 -0
- data/demos/filter_redirection.rb +19 -0
- data/demos/filter_site.rb +40 -0
- data/demos/filter_siteip.rb +31 -0
- data/demos/filter_status.rb +17 -0
- data/demos/filter_timestamp.rb +23 -0
- data/demos/filter_url.rb +19 -0
- data/demos/new_fnd.rb +66 -0
- data/demos/nmap_parser.pl +138 -0
- data/demos/site_format.rb +18 -0
- data/demos/whois_domain.rb +78 -0
- data/dicts/GeoIP.dat +0 -0
- data/dicts/GeoIPASNum.dat +0 -0
- data/dicts/GeoLiteCity.dat +0 -0
- data/dicts/ccsld.txt +2646 -0
- data/dicts/cctld.txt +243 -0
- data/dicts/gtld.txt +25 -0
- data/dicts/hostnames-dict.big +1402 -0
- data/dicts/hostnames-dict.txt +101 -0
- data/lib/wmap/cidr_tracker.rb +327 -0
- data/lib/wmap/dns_bruter.rb +308 -0
- data/lib/wmap/domain_tracker/sub_domain.rb +142 -0
- data/lib/wmap/domain_tracker.rb +342 -0
- data/lib/wmap/geoip_tracker.rb +72 -0
- data/lib/wmap/google_search_scraper.rb +177 -0
- data/lib/wmap/host_tracker/primary_host.rb +130 -0
- data/lib/wmap/host_tracker.rb +550 -0
- data/lib/wmap/network_profiler.rb +144 -0
- data/lib/wmap/port_scanner.rb +208 -0
- data/lib/wmap/site_tracker/deactivated_site.rb +85 -0
- data/lib/wmap/site_tracker.rb +937 -0
- data/lib/wmap/url_checker.rb +314 -0
- data/lib/wmap/url_crawler.rb +381 -0
- data/lib/wmap/utils/domain_root.rb +184 -0
- data/lib/wmap/utils/logger.rb +53 -0
- data/lib/wmap/utils/url_magic.rb +343 -0
- data/lib/wmap/utils/utils.rb +333 -0
- data/lib/wmap/whois.rb +76 -0
- data/lib/wmap.rb +227 -0
- data/logs/wmap.log +17 -0
- data/ruby_whois_patches/base_cocca2.rb +149 -0
- data/ruby_whois_patches/kero.yachay.pe.rb +120 -0
- data/ruby_whois_patches/whois.PublicDomainRegistry.com.rb +124 -0
- data/ruby_whois_patches/whois.above.com.rb +61 -0
- data/ruby_whois_patches/whois.adamsnames.tc.rb +107 -0
- data/ruby_whois_patches/whois.aeda.net.ae.rb +105 -0
- data/ruby_whois_patches/whois.ai.rb +112 -0
- data/ruby_whois_patches/whois.arnes.si.rb +121 -0
- data/ruby_whois_patches/whois.ascio.com.rb +91 -0
- data/ruby_whois_patches/whois.cnnic.cn.rb +123 -0
- data/ruby_whois_patches/whois.corporatedomains.com.rb +67 -0
- data/ruby_whois_patches/whois.crsnic.net.rb +108 -0
- data/ruby_whois_patches/whois.denic.de.rb +174 -0
- data/ruby_whois_patches/whois.dk-hostmaster.dk.rb +120 -0
- data/ruby_whois_patches/whois.dns.be.rb +134 -0
- data/ruby_whois_patches/whois.dns.lu.rb +129 -0
- data/ruby_whois_patches/whois.dns.pl.rb +150 -0
- data/ruby_whois_patches/whois.dns.pt.rb +119 -0
- data/ruby_whois_patches/whois.domain.kg.rb +126 -0
- data/ruby_whois_patches/whois.domainregistry.my.rb +123 -0
- data/ruby_whois_patches/whois.domreg.lt.rb +110 -0
- data/ruby_whois_patches/whois.dot.tk.rb +140 -0
- data/ruby_whois_patches/whois.hkirc.hk.rb +121 -0
- data/ruby_whois_patches/whois.isnic.is.rb +130 -0
- data/ruby_whois_patches/whois.je.rb +119 -0
- data/ruby_whois_patches/whois.jprs.jp.rb +137 -0
- data/ruby_whois_patches/whois.kenic.or.ke.rb +140 -0
- data/ruby_whois_patches/whois.markmonitor.com.rb +118 -0
- data/ruby_whois_patches/whois.melbourneit.com.rb +58 -0
- data/ruby_whois_patches/whois.nic.as.rb +96 -0
- data/ruby_whois_patches/whois.nic.at.rb +109 -0
- data/ruby_whois_patches/whois.nic.ch.rb +141 -0
- data/ruby_whois_patches/whois.nic.cl.rb +117 -0
- data/ruby_whois_patches/whois.nic.ec.rb +157 -0
- data/ruby_whois_patches/whois.nic.im.rb +120 -0
- data/ruby_whois_patches/whois.nic.it.rb +170 -0
- data/ruby_whois_patches/whois.nic.lv.rb +116 -0
- data/ruby_whois_patches/whois.nic.ly.rb +127 -0
- data/ruby_whois_patches/whois.nic.mu.rb +27 -0
- data/ruby_whois_patches/whois.nic.mx.rb +123 -0
- data/ruby_whois_patches/whois.nic.net.sa.rb +111 -0
- data/ruby_whois_patches/whois.nic.or.kr.rb +101 -0
- data/ruby_whois_patches/whois.nic.tel.rb +129 -0
- data/ruby_whois_patches/whois.nic.tr.rb +133 -0
- data/ruby_whois_patches/whois.nic.us.rb +129 -0
- data/ruby_whois_patches/whois.nic.ve.rb +135 -0
- data/ruby_whois_patches/whois.norid.no.rb +127 -0
- data/ruby_whois_patches/whois.pandi.or.id.rb +118 -0
- data/ruby_whois_patches/whois.psi-usa.info.rb +63 -0
- data/ruby_whois_patches/whois.registro.br.rb +109 -0
- data/ruby_whois_patches/whois.registrygate.com.rb +55 -0
- data/ruby_whois_patches/whois.rrpproxy.net.rb +61 -0
- data/ruby_whois_patches/whois.sgnic.sg.rb +130 -0
- data/ruby_whois_patches/whois.srs.net.nz.rb +166 -0
- data/ruby_whois_patches/whois.tucows.com.rb +70 -0
- data/ruby_whois_patches/whois.twnic.net.tw.rb +133 -0
- data/settings/discovery_ports +24 -0
- data/settings/google_keywords.txt +9 -0
- data/settings/google_locator.txt +23 -0
- data/test/domain_tracker_test.rb +31 -0
- data/test/utils_test.rb +168 -0
- data/version.txt +13 -0
- data/wmap.gemspec +49 -0
- metadata +202 -0
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
#--
|
|
2
|
+
# Wmap
|
|
3
|
+
#
|
|
4
|
+
# A pure Ruby library for Internet web application discovery and tracking.
|
|
5
|
+
#
|
|
6
|
+
# Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
|
|
7
|
+
#++
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
module Wmap
|
|
11
|
+
module Utils
|
|
12
|
+
# Module to validate and retrieve the top or second level domain name from a host-name (FQDN).
|
|
13
|
+
module DomainRoot
|
|
14
|
+
extend self
|
|
15
|
+
# Internet Domain Architecture Definitions
|
|
16
|
+
File_ccsld=File.dirname(__FILE__)+'/../../../dicts/ccsld.txt'
|
|
17
|
+
File_cctld=File.dirname(__FILE__)+'/../../../dicts/cctld.txt'
|
|
18
|
+
File_gtld=File.dirname(__FILE__)+'/../../../dicts/gtld.txt'
|
|
19
|
+
|
|
20
|
+
# Main function to retrieve the registered domain ('domain root' from the 'registrant' perspective) from a hostname, for example, "www.telegraph.co.uk" -> "telegraph.co.uk"
|
|
21
|
+
# Resolve the registered ("root") domain of a host name, for example
# "www.telegraph.co.uk" -> "telegraph.co.uk".
#
# The gTLD, ccTLD and ccSLD lookup tables are loaded on first use and cached
# in @gtld, @cctld and @ccsld. A root domain is either the last two labels of
# the host, or the last three when the second-to-last label is itself a
# reserved suffix (a ccTLD in front of a gTLD, a gTLD in front of a ccTLD, or
# a known country-code second level domain).
#
# @param host [String] host name; case and surrounding whitespace are ignored
# @return [String, nil] the root domain, or nil when the input is empty, the
#   top level domain is unknown, or the host has too few labels to contain a
#   registered domain (e.g. "com" or "co.uk")
def get_domain_root(host)
  puts "Retrieve the root domain for host: #{host}" if @verbose
  begin
    # Generic Top Level Domain List - loading once
    @gtld = file_2_hash(File_gtld) if @gtld.nil?
    # Country code top-level domain list - loading once
    @cctld = file_2_hash(File_cctld) if @cctld.nil?
    # Country code second level domain - loading once
    @ccsld = load_ccsld_from_file(File_ccsld) if @ccsld.nil?

    host = host.to_s.downcase.strip
    if host.empty?
      puts "Error: empty record found. Please check your input and remove any empty line." if @verbose
      return nil
    end

    root_domain = ""
    labels = host.split(".")
    tld = labels.last
    # Only look at a second-to-last label that really exists; a negative index
    # would silently wrap around to the TLD itself.
    sld = labels.length > 1 ? labels[-2] : nil
    depth = nil

    # Generic TLD first: a country code in front of it ("example.uk.com")
    # makes the registered domain three labels deep.
    depth = (@cctld.key?(sld) ? 3 : 2) if @gtld.key?(tld)

    # Country code TLD second; when a suffix is on both lists this result wins.
    if @cctld.key?(tld)
      known_slds = @ccsld.nil? ? [] : @ccsld.fetch(tld, [])
      # Exact label comparison: a substring match would treat "c.uk" as "co.uk".
      reserved = @gtld.key?(sld) || known_slds.include?(sld)
      depth = reserved ? 3 : 2
    end

    if depth.nil?
      puts "#{host} - the top level domain is unknown. Please check out your record #{root_domain} " if @verbose
      return nil
    end

    candidate = labels.last(depth)
    if candidate.length < depth || candidate.any?(&:empty?)
      puts "#{host} - no registered domain can be derived from this record." if @verbose
      return nil
    end

    root_domain = candidate.join(".")
    puts "Domain root found: #{root_domain}" if @verbose
    return root_domain
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
|
95
|
+
alias_method :get_root_domain, :get_domain_root
|
|
96
|
+
alias_method :root_domain, :get_domain_root
|
|
97
|
+
alias_method :domain_root, :get_domain_root
|
|
98
|
+
alias_method :host_2_domain, :get_domain_root
|
|
99
|
+
|
|
100
|
+
# 'setter' to parse and load the known country code second level domain table from the file
|
|
101
|
+
# data structure example: {"uk" =>["co","plc"],"za"=>["mil","nom","org"]}
|
|
102
|
+
# Parse the country code second level domain (ccSLD) dictionary file into a
# lookup table keyed by country code, e.g. {"uk"=>["co","plc"],"za"=>["org"]}.
#
# Only lines that start with whitespace followed by ".<word char>" are read;
# the first whitespace-separated token of such a line (".co.uk") supplies the
# country code (last label) and the second level label (the one before it).
#
# @param file_ccsld [String] path of the dictionary file (read as ISO-8859-1,
#   transcoded to UTF-8)
# @return [Hash{String=>Array<String>}, nil] the table, or nil when the file
#   cannot be read
def load_ccsld_from_file(file_ccsld)
  begin
    ccsld = {}
    puts "Loading known country code second level domain list from file: #{file_ccsld}" if @verbose
    # Block form closes the handle even when parsing raises.
    File.open(file_ccsld, 'r:ISO-8859-1:UTF-8') do |f|
      f.each_line do |line|
        next unless line =~ /^\s+\.\w/
        entry = line.strip.downcase.split(' ')[0].split('.')
        # ".co.uk" splits into ["", "co", "uk"]; shorter entries carry no
        # second level label.
        next unless entry.length > 2
        (ccsld[entry.last] ||= []) << entry[-2]
      end
    end
    return ccsld
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
|
126
|
+
|
|
127
|
+
# Test a host string to see if it's a valid Internet root domain
|
|
128
|
+
# Tell whether the given string is itself a root domain, i.e. whether
# get_domain_root maps the trimmed, lower-cased input onto itself.
# Returns false when the lookup fails or raises.
def is_domain_root?(domain)
  puts "Validate the domain name is valid: #{domain}" if @verbose
  begin
    domain = domain.strip.downcase
    resolved_root = get_domain_root(domain)
    return domain == resolved_root
  rescue => ee
    puts "Exception on method #{__method__} for #{domain}: #{ee}" if @verbose
    return false
  end
end
|
|
138
|
+
alias_method :is_root_domain?, :is_domain_root?
|
|
139
|
+
alias_method :is_domain?, :is_domain_root?
|
|
140
|
+
alias_method :is_root?, :is_domain_root?
|
|
141
|
+
|
|
142
|
+
# Function to retrieve the sub-domain from a Fully Qualified Domain Name(FQDN), for example, "www.secure.telegraph.co.uk" -> "secure.telegraph.co.uk"
|
|
143
|
+
# Retrieve the sub-domain that sits directly above the root domain of a host,
# for example "www.secure.telegraph.co.uk" -> "secure.telegraph.co.uk".
#
# A sub-domain is only reported when the host has at least two labels in
# front of its root domain; "secure.telegraph.co.uk" itself yields nil.
#
# @param host [String] fully qualified host name
# @return [String, nil] the sub-domain, or nil when there is none, the root
#   domain cannot be determined, or the input is not a string
def get_sub_domain(host)
  puts "Retrieve sub-domain from host: #{host}" if @verbose
  begin
    host = host.strip.downcase
    domain = get_domain_root(host)
    # Unknown root domain: nothing to build a sub-domain on.
    return nil if domain.nil?
    host_labels = host.split(".")
    domain_labels = domain.split(".")
    extra = host_labels.length - domain_labels.length
    return nil if extra < 2
    # The label immediately in front of the root domain.
    subdomain = host_labels[extra - 1] + "." + domain
    puts "Sub domain found: #{subdomain}" if @verbose
    return subdomain
  rescue => ee
    # StandardError only, so Interrupt / SystemExit still propagate.
    puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
    return nil
  end
end
|
|
163
|
+
alias_method :get_subdomain, :get_sub_domain
|
|
164
|
+
|
|
165
|
+
# Function to print instance variable - General top level domain list
|
|
166
|
+
# Write the cached generic top level domain table (@gtld) to standard output.
def print_gtld
  $stdout.puts(@gtld)
end
|
|
169
|
+
|
|
170
|
+
# Function to print instance variable - Country code top-level domain list
|
|
171
|
+
# Write the cached country code top level domain table (@cctld) to standard output.
def print_cctld
  $stdout.puts(@cctld)
end
|
|
174
|
+
|
|
175
|
+
# Function to print instance variable - Country code second-level domain list
|
|
176
|
+
# Write the cached country code second level domain table (@ccsld) to standard output.
def print_ccsld
  $stdout.puts(@ccsld)
end
|
|
179
|
+
|
|
180
|
+
private :load_ccsld_from_file
|
|
181
|
+
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
#--
|
|
2
|
+
# Wmap
|
|
3
|
+
#
|
|
4
|
+
# A pure Ruby library for Internet web application discovery and tracking.
|
|
5
|
+
#
|
|
6
|
+
# Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
|
|
7
|
+
#++
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
module Wmap
|
|
11
|
+
module Utils
|
|
12
|
+
# Module to log debugging and other messages
|
|
13
|
+
module Logger
|
|
14
|
+
extend self
|
|
15
|
+
# Append information into the log file for the trouble-shooting purpose
|
|
16
|
+
# Append a time-stamped record to a log file for trouble-shooting purposes.
#
# Arrays are written as a header line followed by one line per element,
# hashes as a header line followed by one line per value, strings on a single
# line. Any other object is not written (the log file is still opened in
# append mode, which creates it when missing).
#
# @param obj [Array, Hash, String, Object] content to log
# @param agent [#to_s] label identifying who is logging
# @param file [String] path of the log file
# @return [Boolean] false when obj is nil or the file cannot be written,
#   true otherwise
def wlog(obj, agent, file)
  puts "Writing #{obj} into log file: #{file}" if @verbose
  begin
    return false if obj.nil?
    timestamp = Time.now
    # Local handle in block form: closed even when a write raises, and no
    # state shared between callers.
    File.open(file, 'a') do |log|
      case obj
      when Array
        log.write "#{timestamp}: #{agent}: \n"
        obj.each { |x| log.write " #{x}\n" }
        puts "The list is successfully saved into the log file: #{file} " if @verbose
      when Hash
        log.write "#{timestamp}: #{agent}: \n"
        obj.each_value { |value| log.write " #{value}\n" }
        puts "The hash is successfully saved into the log file: #{file} " if @verbose
      when String
        log.write "#{timestamp}: #{agent}: #{obj}\n"
        puts "The string is successfully saved into the log file: #{file} " if @verbose
      else
        # Unsupported type: nothing is written.
        puts "Un-handled exception on: #{obj}" if @verbose
      end
    end
    return true
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return false
  end
end
|
|
50
|
+
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
#--
|
|
2
|
+
# Wmap
|
|
3
|
+
#
|
|
4
|
+
# A pure Ruby library for Internet web application discovery and tracking.
|
|
5
|
+
#
|
|
6
|
+
# Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
|
|
7
|
+
#++
|
|
8
|
+
# require "uri"
|
|
9
|
+
|
|
10
|
+
module Wmap
|
|
11
|
+
module Utils
|
|
12
|
+
module UrlMagic
|
|
13
|
+
extend self
|
|
14
|
+
|
|
15
|
+
# Simple sanity check on a 'claimed' URL string.
|
|
16
|
+
# Sanity check on a claimed URL string: it must contain "http://" or
# "https://" followed by a host part (up to the first "/" or ":") that
# is_ip? or is_fqdn? accepts. Returns false on any failure.
def is_url?(url)
  puts "Validate the URL format is valid: #{url}" if @verbose
  begin
    unless url =~ /(http|https)\:\/\/((.)+)/i
      puts "Unknown URL format: #{url}" if @verbose
      return false
    end
    # Second capture group: everything after the scheme separator.
    authority = Regexp.last_match(2).split('/')[0]
    hostname = authority.split(':')[0]
    return true if is_ip?(hostname)
    return true if is_fqdn?(hostname)
    return false
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return false
  end
end
|
|
36
|
+
|
|
37
|
+
# Simple sanity check on a 'claimed' SSL enabled URL string
|
|
38
|
+
# Sanity check on a claimed SSL enabled URL string: the string must pass
# is_url? and its scheme must be "https".
#
# Only the scheme is inspected, i.e. the first "http"/"https" that is directly
# followed by "://". The text "https" appearing later in the path or query
# string does not make a plain http URL count as SSL.
#
# @param url [String] URL to test; surrounding whitespace is ignored
# @return [Boolean] true for a valid https URL, false otherwise (including
#   non-string input)
def is_ssl?(url)
  puts "Validate if SSL is enabled on: #{url}" if @verbose
  begin
    url = url.strip
    return false unless is_url?(url)
    scheme = url[/https?(?=:\/\/)/i]
    return !scheme.nil? && scheme.casecmp("https").zero?
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return false
  end
end
|
|
52
|
+
alias_method :is_https?, :is_ssl?
|
|
53
|
+
|
|
54
|
+
# Simple sanity check on a 'claimed' web site base string.
|
|
55
|
+
# Sanity check on a claimed web site base string: after trimming and
# lower-casing, the string must pass is_url? and be identical to the site
# that url_2_site derives from it. Returns nil when an exception is raised.
def is_site?(url)
  puts "Validate the website string format for: #{url}" if @verbose
  begin
    url = url.strip.downcase
    unless is_url?(url)
      puts "Unknown site format: #{url}" if @verbose
      return false
    end
    site_form = url_2_site(url)
    return url == site_form
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
|
74
|
+
|
|
75
|
+
# Check if URL is an absolute one
|
|
76
|
+
#def is_absolute?(url)
|
|
77
|
+
# puts "Validate if the url is absolute: #{url}" if @verbose
|
|
78
|
+
# begin
|
|
79
|
+
# url.strip!
|
|
80
|
+
# URI.absolute?(url)
|
|
81
|
+
# rescue => ee
|
|
82
|
+
# puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
|
83
|
+
# return false
|
|
84
|
+
# end
|
|
85
|
+
#end
|
|
86
|
+
|
|
87
|
+
# Check if URL is relative one
|
|
88
|
+
#def is_relative?(url)
|
|
89
|
+
# begin
|
|
90
|
+
# url.strip!
|
|
91
|
+
# !is_absolute?(url)
|
|
92
|
+
# rescue => ee
|
|
93
|
+
# puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
|
94
|
+
# return false
|
|
95
|
+
# end
|
|
96
|
+
#end
|
|
97
|
+
|
|
98
|
+
# Extract the web server host's Fully Qualified Domain Name (FQDN) from the url. For example: "https://login.yahoo.com/email/help" -> "login.yahoo.com"
|
|
99
|
+
# Extract the web server host name from a URL, for example
# "https://login.yahoo.com/email/help" -> "login.yahoo.com".
#
# The URL is trimmed and lower-cased, every "http://" / "https://" is removed,
# and the text up to the first "/" and then up to the first ":" is returned.
#
# @param url [String] URL to inspect
# @return [String, nil] the host part, or nil when it cannot be extracted
def url_2_host(url)
  begin
    stripped = url.strip.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
    authority = stripped.split('/').first
    if authority.nil?
      # Diagnostics are only printed in verbose mode, like everywhere else.
      puts "Error process url: #{stripped}" if @verbose
      return nil
    end
    return authority.split(':').first
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
|
115
|
+
|
|
116
|
+
# Extract web service port from the url. For example: "https://login.yahoo.com/email/help" -> 443
|
|
117
|
+
# Extract the web service port from a URL, for example
# "https://login.yahoo.com/email/help" -> 443.
#
# An explicit "host:port" wins; otherwise the port follows from the scheme
# (443 for https, 80 for anything else). Only the scheme itself decides about
# SSL, i.e. the first "http"/"https" directly followed by "://". The text
# "https" elsewhere in the URL is ignored.
#
# @param url [String] URL to inspect
# @return [Integer, nil] the port, or nil when the URL has no host part or is
#   not a string
def url_2_port(url)
  puts "Retrieve service port on URL: #{url}" if @verbose
  begin
    scheme = url[/https?(?=:\/\/)/i]
    ssl = !scheme.nil? && scheme.casecmp("https").zero?
    authority = url.downcase.gsub(/(http:\/\/|https:\/\/)/, "").split('/').first
    # Empty URL or one that starts with "/": there is no host:port pair.
    return nil if authority.nil?
    host_and_port = authority.split(':')
    if host_and_port.length == 2
      puts "The service port: #{host_and_port[1]}" if @verbose
      return host_and_port[1].to_i
    elsif ssl
      puts "The service port: 443" if @verbose
      return 443
    else
      puts "The service port: 80" if @verbose
      return 80
    end
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
|
139
|
+
|
|
140
|
+
# Extract the site base (scheme://host[:port]/) from a url: "https://login.yahoo.com:8443/email/help" -> "https://login.yahoo.com:8443/"
|
|
141
|
+
# Reduce a URL to its site base "scheme://host[:port]/".
#
# The URL is lower-cased and anything in front of the first "http" on a line
# is dropped. The port is taken from an explicit "host:port" pair, otherwise
# 443 for an https scheme and 80 for an http scheme. Port 443 always yields an
# "https:" scheme, and ports 80 and 443 are left out of the result.
# Returns nil when the URL cannot be converted.
def url_2_site(url)
  puts "Retrieve the web site base for url: #{url}" if @verbose
  begin
    url = url.downcase.sub(/^(.*?)http/i,'http')
    prot, remainder = url.split(%r{\/\/})
    # step 1, extract the host:port pair from the url
    host_port = remainder.split(%r{\/})[0]
    port = nil
    if host_port =~ /\:/
      pieces = host_port.split(%r{\:})
      host = pieces[0]
      port = pieces[1].to_i
    else
      host = host_port
      if prot =~ /https/i
        port = 443
      elsif prot =~ /http/i
        port = 80
      end
    end
    # uncommon base structures such as
    # "https://letmechoose.barclays.co.uk?source=btorganic/": cut the host at
    # the first "?" or "#"
    if !is_fqdn?(host) && host =~ /\?|\#/
      host = host.split(%r{\?|\#})[0]
    end
    # step 2, put the host:port pair back to the normal site format
    prot = "https:" if port == 443
    default_port = (port == 80 || port == 443)
    site = default_port ? prot+"//"+host+"/" : prot+"//"+host+":"+port.to_s+"/"
    raise "Problem encountered on method url_2_site: Unable to convert #{url}" unless site =~ /http/i
    return site
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
|
192
|
+
|
|
193
|
+
# Wrapper to return relative path component of the URL. i.e. http://www.yahoo.com/login.html => /login.html
|
|
194
|
+
# Return the part of a URL that follows the scheme and host[:port], i.e.
# "http://www.yahoo.com/login.html" => "/login.html".
#
# The caller's string is not modified. The host is matched without regard to
# case and an explicit port is dropped with it, so
# "http://WWW.Yahoo.com:80/Login.html" yields "/Login.html". Query string and
# fragment stay part of the result.
#
# @param url [String] URL to inspect; surrounding whitespace is ignored
# @return [String, nil] the path component ("" when the URL has none), or nil
#   when url_2_site cannot derive a site from the URL
def url_2_path(url)
  begin
    target = url.strip
    site = url_2_site(target)
    return nil if site.nil?
    # Scheme plus authority: everything up to the first "/", "?" or "#" that
    # follows "http(s)://".
    authority = target.match(%r{\A.*?https?://[^/?#]*}im)
    return authority.post_match unless authority.nil?
    # Fallback for strings the pattern does not recognise: remove the site
    # base literally.
    return target.sub(site.chop, '')
  rescue => ee
    puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
    return nil
  end
end
|
|
207
|
+
|
|
208
|
+
# Test if the two URLs are both under the same domain: http://login.yahoo.com, http://mail.yahoo.com => true
|
|
209
|
+
# Test if two URLs are under the same root domain:
# http://login.yahoo.com, http://mail.yahoo.com => true
#
# Each URL is reduced to its host with url_2_host and then to its root domain
# with get_domain_root. URLs whose root domain cannot be determined never
# match anything, not even each other.
#
# @param url1 [String] first URL
# @param url2 [String] second URL
# @return [Boolean, nil] true when both root domains are known and equal,
#   false otherwise, nil when an exception is raised
def urls_on_same_domain?(url1, url2)
  puts "Determine if two URLs under the same domain: #{url1}, #{url2}" if @verbose
  begin
    root1 = get_domain_root(url_2_host(url1))
    root2 = get_domain_root(url_2_host(url2))
    # nil == nil would otherwise report two unresolvable URLs as a match.
    return false if root1.nil? || root2.nil?
    return root1 == root2
  rescue => ee
    puts "Error searching the object content: #{ee}" if @verbose
    return nil
  end
end
|
|
220
|
+
|
|
221
|
+
# Input is host and open port, output is a URL for valid http response code or nil
|
|
222
|
+
# Simple http(s) service detection: build the candidate URL(s) for a host and
# open port and return the first one the URL checker gets a response for.
#
# Port 80 is probed over http only, port 443 over https only; any other port
# is probed over http first and then over https.
#
# @param host [String] host name or IP; surrounding whitespace is ignored
# @param port [Integer, String] open port, 80 by default
# @return [String, nil] the first responding URL, or nil when none responds
#   or an exception is raised
def host_2_url(host, port=80)
  puts "Perform simple http(s) service detection on host #{host}, port #{port}" if @verbose
  begin
    host = host.strip
    candidates =
      case port.to_i
      when 80
        ["http://" + host + "/"]
      when 443
        ["https://" + host + "/"]
      else
        ["http://" + host + ":" + port.to_s + "/", "https://" + host + ":" + port.to_s + "/"]
      end
    puts "Please ensure your internet connection is active before running this method: #{__method__}" if @verbose
    checker = Wmap::UrlChecker.new
    # 10000 is presumably the checker's "no response" code - TODO confirm
    # against Wmap::UrlChecker#response_code.
    url = candidates.find { |candidate| checker.response_code(candidate) != 10000 }
    if url.nil?
      puts "No http(s) service found on: #{host}:#{port}" if @verbose
      return nil
    end
    puts "Found URL: #{url}" if @verbose
    return url
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
|
251
|
+
|
|
252
|
+
# Convert a relative URL to an absolute one. For example, from URL base 'http://games.yahoo.com/' and file path '/game/the-magic-snowman-flash.html' => 'http://games.yahoo.com/game/the-magic-snowman-flash.html'
|
|
253
|
+
# Turn a relative URL into an absolute one. A relative URL that starts with
# "/" is resolved against the site base of the known URL
# (create_absolute_url_from_base); anything else is resolved against the
# known URL's context (create_absolute_url_from_context).
# Returns nil when an exception is raised.
def make_absolute(base, relative_url)
  puts "Determine and return the absolute URL:\n Base: #{base}, Relative: #{relative_url} " if @verbose
  begin
    rooted = relative_url =~ /^\//
    absolute_url = rooted ? create_absolute_url_from_base(base, relative_url) : create_absolute_url_from_context(base, relative_url)
    puts "Found absolute URL: #{absolute_url}" if @verbose
    return absolute_url
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
|
269
|
+
|
|
270
|
+
# Create / construct the absolute URL from a known URL and relative file path. For example, 'http://images.search.yahoo.com/images' + '/search/images?p=raiders' => 'http://images.search.yahoo.com/search/images?p=raiders'
|
|
271
|
+
# Build an absolute URL by appending a relative path to the site base of a
# known URL: the base comes from url_2_site, its trailing "/" is removed, and
# the relative path is appended as given. Returns nil when the site base
# cannot be derived or the pieces cannot be joined.
def create_absolute_url_from_base(potential_base, relative_url)
  begin
    site = url_2_site(potential_base)
    joined = site.strip.chop + relative_url
    puts "Found absolute URL: #{joined}" if @verbose
    return joined
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
|
282
|
+
|
|
283
|
+
# Construct the absolute URL by comparing a known URL and the relative file path
|
|
284
|
+
# Construct an absolute URL from a known URL and a relative file path.
#
# The known URL is interpreted as follows:
# * it ends with "/"                      -> the relative path is appended;
# * it is a bare site ("http://a.com")    -> "/" plus the relative path is appended;
# * its last segment contains no "."      -> the segment is treated as a
#   directory and "/" plus the relative path is appended;
# * otherwise (last segment is a file)    -> the file name is replaced by the
#   relative path.
#
# Neither argument is modified. One leading "/" of the relative path is
# ignored.
#
# @param potential_base [String] known absolute URL
# @param relative_url [String] relative path
# @return [String, nil] the absolute URL, or nil when the known URL contains
#   no "/" or the arguments are not strings
def create_absolute_url_from_context(potential_base, relative_url)
  puts "Determine the absolute URL from context:\n Known base: #{potential_base}, Relative path: #{relative_url}" if @verbose
  begin
    base = potential_base.strip
    # make relative URL naked by removing the beginning '/' (on a copy)
    relative = relative_url.strip.sub(/\A\//, '')
    last_slash = base.rindex('/')
    return nil if last_slash.nil?
    last_dot = base.rindex('.')
    absolute_url =
      if base.end_with?('/')
        base + relative
      elsif last_slash >= 2 && base[last_slash - 2, 2] == ':/'
        # The last slash belongs to "://": the base is a bare site.
        base + '/' + relative
      elsif last_dot.nil? || last_dot < last_slash
        # No dot in the last segment: treat it as a directory.
        base + '/' + relative
      else
        base[0, last_slash + 1] + relative
      end
    puts "Found absolute URL: #{absolute_url}" if @verbose
    return absolute_url
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
|
312
|
+
|
|
313
|
+
# Normalize the URL to a consistent manner in order to determine if a link has been visited or cached before
|
|
314
|
+
# See http://en.wikipedia.org/wiki/URL_normalization for more explanation
|
|
315
|
+
# Normalize a URL to a consistent form so that a link can be recognised as
# already visited or cached.
# See http://en.wikipedia.org/wiki/URL_normalization for more explanation.
#
# Steps:
# * the site base comes from url_2_site, which lower-cases scheme and host;
# * a trailing dot after the host name is removed
#   ('http://www.yahoo.com./' => 'http://www.yahoo.com/');
# * leading slashes of the path are collapsed into the base's single "/";
# * dot-segments are resolved: "." is dropped and ".." also removes the
#   preceding segment, e.g. 'http://www.example.com/../a/b/../c/./d.html'
#   => 'http://www.example.com/a/c/d.html'.
#
# Only the path is rewritten; the query string and fragment are appended
# untouched, and names that merely start with a dot (".well-known") are kept.
# The caller's string is not modified.
#
# @param url [String] URL to normalize; surrounding whitespace is ignored
# @return [String, nil] the normalized URL, or the trimmed input when it
#   cannot be normalized (the input itself when it is not a string)
def normalize_url(url)
  begin
    target = url.strip
    base = url_2_site(target).sub(/\.\/$/,'/')
    raw_path = url_2_path(target)
    # Separate the path from the query string / fragment.
    path_part, suffix = raw_path.match(/\A([^?#]*)(.*)\z/m).captures
    segments = path_part.sub(/\A\/+/, '').split('/', -1)
    kept = []
    segments.each_with_index do |segment, index|
      final = (index == segments.length - 1)
      case segment
      when '.'
        # A trailing "." stands for the directory itself: keep the slash.
        kept << '' if final
      when '..'
        kept.pop
        kept << '' if final
      else
        kept << segment
      end
    end
    return base + kept.join('/') + suffix
  rescue => ee
    puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
    return target.nil? ? url : target
  end
end
|
|
340
|
+
|
|
341
|
+
end
|
|
342
|
+
end
|
|
343
|
+
end
|