wmap 2.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +141 -0
- data/LICENSE.txt +15 -0
- data/README.rdoc +98 -0
- data/TODO +13 -0
- data/bin/deprime +21 -0
- data/bin/distrust +38 -0
- data/bin/googleBot +23 -0
- data/bin/prime +21 -0
- data/bin/refresh +26 -0
- data/bin/run_tests +16 -0
- data/bin/spiderBot +26 -0
- data/bin/trust +38 -0
- data/bin/updateAll +57 -0
- data/bin/wadd +25 -0
- data/bin/wadds +26 -0
- data/bin/wcheck +28 -0
- data/bin/wdel +25 -0
- data/bin/wdump +21 -0
- data/bin/wmap +151 -0
- data/bin/wscan +32 -0
- data/data/cidrs +2 -0
- data/data/deactivated_sites +1 -0
- data/data/domains +2 -0
- data/data/hosts +1 -0
- data/data/prime_hosts +1 -0
- data/data/sites +2 -0
- data/data/sub_domains +2 -0
- data/demos/bruter.rb +27 -0
- data/demos/dns_brutes.rb +28 -0
- data/demos/filter_cidr.rb +18 -0
- data/demos/filter_crawls.rb +5 -0
- data/demos/filter_domain.rb +25 -0
- data/demos/filter_geoip.rb +26 -0
- data/demos/filter_known_services.rb +59 -0
- data/demos/filter_netinfo.rb +23 -0
- data/demos/filter_prime.rb +25 -0
- data/demos/filter_profiler.rb +3 -0
- data/demos/filter_redirection.rb +19 -0
- data/demos/filter_site.rb +40 -0
- data/demos/filter_siteip.rb +31 -0
- data/demos/filter_status.rb +17 -0
- data/demos/filter_timestamp.rb +23 -0
- data/demos/filter_url.rb +19 -0
- data/demos/new_fnd.rb +66 -0
- data/demos/nmap_parser.pl +138 -0
- data/demos/site_format.rb +18 -0
- data/demos/whois_domain.rb +78 -0
- data/dicts/GeoIP.dat +0 -0
- data/dicts/GeoIPASNum.dat +0 -0
- data/dicts/GeoLiteCity.dat +0 -0
- data/dicts/ccsld.txt +2646 -0
- data/dicts/cctld.txt +243 -0
- data/dicts/gtld.txt +25 -0
- data/dicts/hostnames-dict.big +1402 -0
- data/dicts/hostnames-dict.txt +101 -0
- data/lib/wmap/cidr_tracker.rb +327 -0
- data/lib/wmap/dns_bruter.rb +308 -0
- data/lib/wmap/domain_tracker/sub_domain.rb +142 -0
- data/lib/wmap/domain_tracker.rb +342 -0
- data/lib/wmap/geoip_tracker.rb +72 -0
- data/lib/wmap/google_search_scraper.rb +177 -0
- data/lib/wmap/host_tracker/primary_host.rb +130 -0
- data/lib/wmap/host_tracker.rb +550 -0
- data/lib/wmap/network_profiler.rb +144 -0
- data/lib/wmap/port_scanner.rb +208 -0
- data/lib/wmap/site_tracker/deactivated_site.rb +85 -0
- data/lib/wmap/site_tracker.rb +937 -0
- data/lib/wmap/url_checker.rb +314 -0
- data/lib/wmap/url_crawler.rb +381 -0
- data/lib/wmap/utils/domain_root.rb +184 -0
- data/lib/wmap/utils/logger.rb +53 -0
- data/lib/wmap/utils/url_magic.rb +343 -0
- data/lib/wmap/utils/utils.rb +333 -0
- data/lib/wmap/whois.rb +76 -0
- data/lib/wmap.rb +227 -0
- data/logs/wmap.log +17 -0
- data/ruby_whois_patches/base_cocca2.rb +149 -0
- data/ruby_whois_patches/kero.yachay.pe.rb +120 -0
- data/ruby_whois_patches/whois.PublicDomainRegistry.com.rb +124 -0
- data/ruby_whois_patches/whois.above.com.rb +61 -0
- data/ruby_whois_patches/whois.adamsnames.tc.rb +107 -0
- data/ruby_whois_patches/whois.aeda.net.ae.rb +105 -0
- data/ruby_whois_patches/whois.ai.rb +112 -0
- data/ruby_whois_patches/whois.arnes.si.rb +121 -0
- data/ruby_whois_patches/whois.ascio.com.rb +91 -0
- data/ruby_whois_patches/whois.cnnic.cn.rb +123 -0
- data/ruby_whois_patches/whois.corporatedomains.com.rb +67 -0
- data/ruby_whois_patches/whois.crsnic.net.rb +108 -0
- data/ruby_whois_patches/whois.denic.de.rb +174 -0
- data/ruby_whois_patches/whois.dk-hostmaster.dk.rb +120 -0
- data/ruby_whois_patches/whois.dns.be.rb +134 -0
- data/ruby_whois_patches/whois.dns.lu.rb +129 -0
- data/ruby_whois_patches/whois.dns.pl.rb +150 -0
- data/ruby_whois_patches/whois.dns.pt.rb +119 -0
- data/ruby_whois_patches/whois.domain.kg.rb +126 -0
- data/ruby_whois_patches/whois.domainregistry.my.rb +123 -0
- data/ruby_whois_patches/whois.domreg.lt.rb +110 -0
- data/ruby_whois_patches/whois.dot.tk.rb +140 -0
- data/ruby_whois_patches/whois.hkirc.hk.rb +121 -0
- data/ruby_whois_patches/whois.isnic.is.rb +130 -0
- data/ruby_whois_patches/whois.je.rb +119 -0
- data/ruby_whois_patches/whois.jprs.jp.rb +137 -0
- data/ruby_whois_patches/whois.kenic.or.ke.rb +140 -0
- data/ruby_whois_patches/whois.markmonitor.com.rb +118 -0
- data/ruby_whois_patches/whois.melbourneit.com.rb +58 -0
- data/ruby_whois_patches/whois.nic.as.rb +96 -0
- data/ruby_whois_patches/whois.nic.at.rb +109 -0
- data/ruby_whois_patches/whois.nic.ch.rb +141 -0
- data/ruby_whois_patches/whois.nic.cl.rb +117 -0
- data/ruby_whois_patches/whois.nic.ec.rb +157 -0
- data/ruby_whois_patches/whois.nic.im.rb +120 -0
- data/ruby_whois_patches/whois.nic.it.rb +170 -0
- data/ruby_whois_patches/whois.nic.lv.rb +116 -0
- data/ruby_whois_patches/whois.nic.ly.rb +127 -0
- data/ruby_whois_patches/whois.nic.mu.rb +27 -0
- data/ruby_whois_patches/whois.nic.mx.rb +123 -0
- data/ruby_whois_patches/whois.nic.net.sa.rb +111 -0
- data/ruby_whois_patches/whois.nic.or.kr.rb +101 -0
- data/ruby_whois_patches/whois.nic.tel.rb +129 -0
- data/ruby_whois_patches/whois.nic.tr.rb +133 -0
- data/ruby_whois_patches/whois.nic.us.rb +129 -0
- data/ruby_whois_patches/whois.nic.ve.rb +135 -0
- data/ruby_whois_patches/whois.norid.no.rb +127 -0
- data/ruby_whois_patches/whois.pandi.or.id.rb +118 -0
- data/ruby_whois_patches/whois.psi-usa.info.rb +63 -0
- data/ruby_whois_patches/whois.registro.br.rb +109 -0
- data/ruby_whois_patches/whois.registrygate.com.rb +55 -0
- data/ruby_whois_patches/whois.rrpproxy.net.rb +61 -0
- data/ruby_whois_patches/whois.sgnic.sg.rb +130 -0
- data/ruby_whois_patches/whois.srs.net.nz.rb +166 -0
- data/ruby_whois_patches/whois.tucows.com.rb +70 -0
- data/ruby_whois_patches/whois.twnic.net.tw.rb +133 -0
- data/settings/discovery_ports +24 -0
- data/settings/google_keywords.txt +9 -0
- data/settings/google_locator.txt +23 -0
- data/test/domain_tracker_test.rb +31 -0
- data/test/utils_test.rb +168 -0
- data/version.txt +13 -0
- data/wmap.gemspec +49 -0
- metadata +202 -0
@@ -0,0 +1,184 @@
|
|
1
|
+
#--
|
2
|
+
# Wmap
|
3
|
+
#
|
4
|
+
# A pure Ruby library for Internet web application discovery and tracking.
|
5
|
+
#
|
6
|
+
# Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
|
7
|
+
#++
|
8
|
+
|
9
|
+
|
10
|
+
module Wmap
|
11
|
+
module Utils
|
12
|
+
# Module to validate and retrieve the top or second level domain name from a host-name (FQDN).
|
13
|
+
module DomainRoot
|
14
|
+
extend self
|
15
|
+
# Internet Domain Architecture Definitions
|
16
|
+
File_ccsld=File.dirname(__FILE__)+'/../../../dicts/ccsld.txt'
|
17
|
+
File_cctld=File.dirname(__FILE__)+'/../../../dicts/cctld.txt'
|
18
|
+
File_gtld=File.dirname(__FILE__)+'/../../../dicts/gtld.txt'
|
19
|
+
|
20
|
+
# Main function to retrieve the registered domain ('domain root' from the 'registrant' perspective) from a hostname, for example, "www.telegraph.co.uk" -> "telegraph.co.uk"
|
21
|
+
# Retrieve the registered domain ('domain root') for a host name, for example
# "www.telegraph.co.uk" -> "telegraph.co.uk".
#
# The lookup tables (@gtld, @cctld, @ccsld) are loaded on the first call and
# cached in instance variables for later calls.
#
# host - host name string; case and surrounding whitespace are ignored.
#
# Returns the root domain string, or nil when the input is nil/empty, the top
# level domain is unknown, the host has too few (or empty) labels to contain a
# registrable name, or an error is raised while processing.
def get_domain_root(host)
  puts "Retrieve the root domain for host: #{host}" if @verbose
  begin
    # Generic top level, country code top level and country code second level
    # domain lists - loading once
    @gtld = file_2_hash(File_gtld) if @gtld.nil?
    @cctld = file_2_hash(File_cctld) if @cctld.nil?
    @ccsld = load_ccsld_from_file(File_ccsld) if @ccsld.nil?
    if host.nil? || host.strip.empty?
      puts "Error: empty record found. Please check your input and remove any empty line." if @verbose
      return nil
    end
    host = host.downcase.strip
    dn = host.split(".")
    tld = dn.last
    sld = dn[-2]
    if @cctld.key?(tld)
      # Three labels are needed when the second level is a reserved gTLD string
      # (e.g. "example.com.cn") or a known ccSLD (e.g. "example.co.uk").
      # The ccSLD is compared as a whole label, not as a substring/regex.
      three_labels = @gtld.key?(sld) ||
                     (@ccsld.key?(tld) && @ccsld[tld].any? { |known| known.to_s.downcase == sld })
    elsif @gtld.key?(tld)
      # Reverse case: a country code used under a generic TLD (e.g. "example.uk.com")
      three_labels = @cctld.key?(sld)
    else
      puts "#{host} - the top level domain is unknown. Please check out your record  " if @verbose
      return nil
    end
    needed = three_labels ? 3 : 2
    root_labels = dn.last(needed)
    # A bare public suffix ("com", "co.uk") or an empty label cannot yield a root domain
    if root_labels.length < needed || root_labels.any? { |label| label.empty? }
      puts "#{host} - no registered domain found under the top level domain." if @verbose
      return nil
    end
    root_domain = root_labels.join(".")
    puts "Domain root found: #{root_domain}" if @verbose
    return root_domain
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
95
|
+
alias_method :get_root_domain, :get_domain_root
|
96
|
+
alias_method :root_domain, :get_domain_root
|
97
|
+
alias_method :domain_root, :get_domain_root
|
98
|
+
alias_method :host_2_domain, :get_domain_root
|
99
|
+
|
100
|
+
# 'setter' to parse and load the known country code second level domain table from the file
|
101
|
+
# data structure example: {"uk" =>["co","plc"],"za"=>["mil","nom","org"]}
|
102
|
+
# Parse the known country code second level domain table from a text file.
#
# Only lines that start with whitespace followed by a dotted entry (such as
# "  .co.uk ...") are used; the first whitespace-separated token of the line
# is split on '.' and must yield more than two parts.
#
# file_ccsld - path of the dictionary file (read as ISO-8859-1, transcoded to UTF-8).
#
# Returns a Hash mapping country code => Array of second level labels, e.g.
# {"uk" => ["co", "plc"], "za" => ["mil", "nom", "org"]}, or nil when the
# file cannot be read or parsed.
def load_ccsld_from_file(file_ccsld)
  begin
    ccsld = Hash.new
    puts "Loading known country code second level domain list from file: #{file_ccsld}" if @verbose
    # Block form guarantees the handle is closed even if parsing raises
    File.open(file_ccsld, 'r:ISO-8859-1:UTF-8') do |f|
      f.each do |line|
        next unless line =~ /^\s+\.\w/
        entry = line.chomp.strip.downcase.split(' ')[0].split('.')
        next unless entry.length > 2
        key = entry.last
        val = entry[entry.length - 2]
        (ccsld[key] ||= Array.new).push(val)
      end
    end
    return ccsld
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
126
|
+
|
127
|
+
# Test a host string to see if it's a valid Internet root domain
|
128
|
+
# Check whether a string is itself a registered root domain, i.e. whether
# get_domain_root maps the trimmed, lower-cased string onto itself.
#
# domain - candidate domain string.
#
# Returns true or false; false is also returned when an error is raised.
def is_domain_root?(domain)
  puts "Validate the domain name is valid: #{domain}" if @verbose
  begin
    domain = domain.strip.downcase
    resolved_root = get_domain_root(domain)
    return resolved_root == domain
  rescue => ee
    puts "Exception on method #{__method__} for #{domain}: #{ee}" if @verbose
    return false
  end
end
|
138
|
+
alias_method :is_root_domain?, :is_domain_root?
|
139
|
+
alias_method :is_domain?, :is_domain_root?
|
140
|
+
alias_method :is_root?, :is_domain_root?
|
141
|
+
|
142
|
+
# Function to retrieve the sub-domain from a Fully Qualified Domain Name(FQDN), for example, "www.secure.telegraph.co.uk" -> "secure.telegraph.co.uk"
|
143
|
+
# Retrieve the sub-domain from a Fully Qualified Domain Name (FQDN): the root
# domain prefixed with the single label directly in front of it, for example
# "www.secure.telegraph.co.uk" -> "secure.telegraph.co.uk".
#
# host - host name string; case and surrounding whitespace are ignored.
#
# Returns the sub-domain string, or nil when the root domain cannot be
# determined, when the host has fewer than two labels in front of the root
# domain, or when an error is raised.
def get_sub_domain(host)
  puts "Retrieve sub-domain from host: #{host}" if @verbose
  begin
    host = host.strip.downcase
    domain = get_domain_root(host)
    # No known root domain means there is nothing to anchor a sub-domain on
    return nil if domain.nil?
    record_h = host.split(".")
    record_d = domain.split(".")
    return nil unless (record_h.length - record_d.length) >= 2
    subdomain = record_h[record_h.length - record_d.length - 1] + "." + domain
    puts "Sub domain found: #{subdomain}" if @verbose
    return subdomain
  rescue => ee
    # StandardError only: interrupts and exit requests must not be swallowed
    puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
    return nil
  end
end
|
163
|
+
alias_method :get_subdomain, :get_sub_domain
|
164
|
+
|
165
|
+
# Function to print instance variable - General top level domain list
|
166
|
+
# Write the cached general top level domain table (@gtld) to standard output.
# Returns nil.
def print_gtld
  $stdout.puts(@gtld)
end
|
169
|
+
|
170
|
+
# Function to print instance variable - Country code top-level domain list
|
171
|
+
# Write the cached country code top-level domain table (@cctld) to standard output.
# Returns nil.
def print_cctld
  $stdout.puts(@cctld)
end
|
174
|
+
|
175
|
+
# Function to print instance variable - Country code second-level domain list
|
176
|
+
# Write the cached country code second-level domain table (@ccsld) to standard output.
# Returns nil.
def print_ccsld
  $stdout.puts(@ccsld)
end
|
179
|
+
|
180
|
+
private :load_ccsld_from_file
|
181
|
+
|
182
|
+
end
|
183
|
+
end
|
184
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
#--
|
2
|
+
# Wmap
|
3
|
+
#
|
4
|
+
# A pure Ruby library for Internet web application discovery and tracking.
|
5
|
+
#
|
6
|
+
# Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
|
7
|
+
#++
|
8
|
+
|
9
|
+
|
10
|
+
module Wmap
  module Utils
    # Module to log debugging and other messages
    module Logger
      extend self

      # Append information into the log file for the trouble-shooting purpose.
      #
      # obj   - what to log: an Array (one line per element), a Hash (one line
      #         per value) or a String (a single line). Any other type writes
      #         nothing.
      # agent - label written after the timestamp to identify the writer.
      # file  - path of the log file; it is opened in append mode on every call
      #         and closed again before returning.
      #
      # Returns false when obj is nil or an error is raised, true otherwise
      # (including for unsupported obj types, where nothing is written).
      def wlog(obj, agent, file)
        puts "Writing #{obj} into log file: #{file}" if @verbose
        begin
          return false if obj.nil?
          timestamp = Time.now
          # Block form closes the handle even when a write raises; no shared
          # class-level handle is kept between calls.
          File.open(file, 'a') do |log|
            case obj
            when Array
              log.write "#{timestamp}: #{agent}: \n"
              obj.each { |x| log.write " #{x}\n" }
              puts "The list is successfully saved into the log file: #{file} " if @verbose
            when Hash
              log.write "#{timestamp}: #{agent}: \n"
              obj.each_value { |value| log.write " #{value}\n" }
              puts "The hash is successfully saved into the log file: #{file} " if @verbose
            when String
              log.write "#{timestamp}: #{agent}: #{obj}\n"
              puts "The string is successfully saved into the log file: #{file} " if @verbose
            else
              # unsupported type: nothing is written
              puts "Un-handled exception on: #{obj}" if @verbose
            end
          end
          return true
        rescue => ee
          puts "Exception on method #{__method__}: #{ee}" if @verbose
          return false
        end
      end

    end
  end
end
|
@@ -0,0 +1,343 @@
|
|
1
|
+
#--
|
2
|
+
# Wmap
|
3
|
+
#
|
4
|
+
# A pure Ruby library for Internet web application discovery and tracking.
|
5
|
+
#
|
6
|
+
# Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
|
7
|
+
#++
|
8
|
+
# require "uri"
|
9
|
+
|
10
|
+
module Wmap
|
11
|
+
module Utils
|
12
|
+
module UrlMagic
|
13
|
+
extend self
|
14
|
+
|
15
|
+
# Simple sanity check on a 'claimed' URL string.
|
16
|
+
# Simple sanity check on a 'claimed' URL string: it must contain an
# "http://" or "https://" prefix followed by a host part that is_ip? or
# is_fqdn? accepts (any ":port" and path are ignored).
#
# Returns true or false; false is also returned when an error is raised.
def is_url?(url)
  puts "Validate the URL format is valid: #{url}" if @verbose
  begin
    unless url =~ /(http|https)\:\/\/((.)+)/i
      puts "Unknown URL format: #{url}" if @verbose
      return false
    end
    # everything after "scheme://", cut at the first '/' and then at the first ':'
    authority = Regexp.last_match(2).split('/')[0]
    hostname = authority.split(':')[0]
    return (is_ip?(hostname) || is_fqdn?(hostname)) ? true : false
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return false
  end
end
|
36
|
+
|
37
|
+
# Simple sanity check on a 'claimed' SSL enabled URL string
|
38
|
+
# Simple sanity check on a 'claimed' SSL enabled URL string: the string must
# pass is_url? and its scheme must be "https" (case-insensitive).
#
# Only the scheme at the start of the trimmed string is inspected, so an
# "http://" URL that merely contains the text "https" elsewhere (path, query)
# is not reported as SSL.
#
# Returns true or false; false is also returned when an error is raised.
def is_ssl?(url)
  puts "Validate if SSL is enabled on: #{url}" if @verbose
  begin
    url = url.strip
    return (is_url?(url) && url =~ /\Ahttps:\/\//i) ? true : false
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return false
  end
end
|
52
|
+
alias_method :is_https?, :is_ssl?
|
53
|
+
|
54
|
+
# Simple sanity check on a 'claimed' web site base string.
|
55
|
+
# Simple sanity check on a 'claimed' web site base string: the trimmed,
# lower-cased string must pass is_url? and be identical to what url_2_site
# returns for it.
#
# Returns true or false, or nil when an error is raised.
def is_site?(url)
  puts "Validate the website string format for: #{url}" if @verbose
  begin
    url = url.strip.downcase
    unless is_url?(url)
      puts "Unknown site format: #{url}" if @verbose
      return false
    end
    return (url == url_2_site(url)) ? true : false
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
74
|
+
|
75
|
+
# Check if URL is an absolute one
|
76
|
+
#def is_absolute?(url)
|
77
|
+
# puts "Validate if the url is absolute: #{url}" if @verbose
|
78
|
+
# begin
|
79
|
+
# url.strip!
|
80
|
+
# URI.absolute?(url)
|
81
|
+
# rescue => ee
|
82
|
+
# puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
83
|
+
# return false
|
84
|
+
# end
|
85
|
+
#end
|
86
|
+
|
87
|
+
# Check if URL is relative one
|
88
|
+
#def is_relative?(url)
|
89
|
+
# begin
|
90
|
+
# url.strip!
|
91
|
+
# !is_absolute?(url)
|
92
|
+
# rescue => ee
|
93
|
+
# puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
94
|
+
# return false
|
95
|
+
# end
|
96
|
+
#end
|
97
|
+
|
98
|
+
# Extract the web server host's Fully Qualified Domain Name (FQDN) from the url. For example: "https://login.yahoo.com/email/help" -> "login.yahoo.com"
|
99
|
+
# Extract the web server host name from a URL, for example
# "https://login.yahoo.com/email/help" -> "login.yahoo.com".
#
# The URL is trimmed and lower-cased, "http://" / "https://" markers are
# removed, and the host is the text before the first '/', ':', '?' or '#'.
# Cutting at '?' and '#' keeps a query or fragment that directly follows the
# host (no path slash) out of the result.
#
# Returns the host string, or nil when nothing is left to parse or an error
# is raised.
def url_2_host(url)
  begin
    url = url.strip.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
    authority = url.split('/')[0]
    if authority.nil?
      puts "Error process url: #{url}" if @verbose
      return nil
    end
    return authority.split(/[:?#]/)[0]
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
115
|
+
|
116
|
+
# Extract web service port from the url. For example: "https://login.yahoo.com/email/help" -> 443
|
117
|
+
# Extract the web service port from a URL, for example
# "https://login.yahoo.com/email/help" -> 443.
#
# An explicit "host:port" pair wins; otherwise the port is 443 when the URL
# starts with the "https://" scheme and 80 in every other case. Only the
# leading scheme is inspected, so "https" appearing later in the URL does not
# affect the result.
#
# Returns the port as an Integer, or nil when the URL cannot be parsed.
def url_2_port(url)
  puts "Retrieve service port on URL: #{url}" if @verbose
  begin
    url = url.strip
    ssl = (url =~ /\Ahttps:\/\//i)
    authority = url.downcase.gsub(/(http:\/\/|https:\/\/)/, "").split('/')[0]
    return nil if authority.nil?
    record = authority.split(':')
    if record.length == 2
      puts "The service port: #{record[1]}" if @verbose
      return record[1].to_i
    elsif ssl
      puts "The service port: 443" if @verbose
      return 443
    else
      puts "The service port: 80" if @verbose
      return 80
    end
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
139
|
+
|
140
|
+
# Extract the site base (scheme, host and any non-default port) from a url: "https://login.yahoo.com:8443/email/help" -> "https://login.yahoo.com:8443/"
|
141
|
+
# Reduce a URL to its site base "scheme://host[:port]/", for example
# "https://login.yahoo.com:8443/email/help" -> "https://login.yahoo.com:8443/".
#
# The URL is lower-cased and anything in front of the first "http" is dropped.
# Port 80 and 443 are left out of the result, and port 443 always yields the
# "https:" scheme.
#
# Returns the site string, or nil when the URL cannot be converted.
def url_2_site(url)
  puts "Retrieve the web site base for url: #{url}" if @verbose
  begin
    url = url.downcase.sub(/^(.*?)http/i,'http')
    prot, remainder = url.split(%r{\/\/})
    # step 1, extract the host:port pair from the url
    host_port = remainder.split(%r{\/})[0]
    host = host_port
    port = nil
    if host_port =~ /\:/
      pieces = host_port.split(%r{\:})
      host = pieces[0]
      port = pieces[1].to_i
    elsif prot =~ /https/i
      port = 443
    elsif prot =~ /http/i
      port = 80
    end
    # uncommon base structure: a query or fragment glued to the host, i.e.
    # "https://letmechoose.barclays.co.uk?source=btorganic/" => "https://letmechoose.barclays.co.uk"
    if !is_fqdn?(host) && (/\?|\#/ === host)
      host = host.split(%r{\?|\#})[0]
    end
    # step 2, put the host:port pair back to the normal site format
    prot = "https:" if port == 443
    site = if [80, 443].include?(port)
             prot + "//" + host + "/"
           else
             prot + "//" + host + ":" + port.to_s + "/"
           end
    # the raise is caught by the rescue below, so the caller receives nil
    raise "Problem encountered on method url_2_site: Unable to convert #{url}" unless site =~ /http/i
    return site
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
192
|
+
|
193
|
+
# Wrapper to return relative path component of the URL. i.e. http://www.yahoo.com/login.html => /login.html
|
194
|
+
# Return the part of the URL that follows the site base, i.e.
# "http://www.yahoo.com/login.html" => "/login.html".
#
# The argument is not modified (frozen strings are accepted). The scheme and
# the host[:port] part are removed case-insensitively, so mixed-case hosts and
# explicitly written default ports are handled as well. A query or fragment
# that directly follows the host is kept as part of the result.
#
# Returns the remaining string (possibly empty), or nil when url_2_site cannot
# make sense of the URL or an error is raised.
def url_2_path(url)
  begin
    target = url.strip
    # url_2_site acts as the validity check for the URL
    return nil if url_2_site(target).nil?
    return target.sub(%r{\A.*?https?://[^/?#]*}i, '')
  rescue => ee
    puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
    return nil
  end
end
|
207
|
+
|
208
|
+
# Test if the two URLs are both under the same domain: http://login.yahoo.com, http://mail.yahoo.com => true
|
209
|
+
# Test if the two URLs are both under the same registered domain:
# http://login.yahoo.com, http://mail.yahoo.com => true
#
# The host of each URL is taken with url_2_host and reduced with
# get_domain_root. When either root domain cannot be determined the URLs are
# NOT considered to be on the same domain (two unknowns do not match).
#
# Returns true or false, or nil when an error is raised.
def urls_on_same_domain?(url1, url2)
  puts "Determine if two URLs under the same domain: #{url1}, #{url2}" if @verbose
  begin
    root1 = get_domain_root(url_2_host(url1))
    root2 = get_domain_root(url_2_host(url2))
    return false if root1.nil? || root2.nil?
    return root1 == root2
  rescue => ee
    puts "Error searching the object content: #{ee}" if @verbose
    return nil
  end
end
|
220
|
+
|
221
|
+
# Input is host and open port, output is a URL for valid http response code or nil
|
222
|
+
# Simple http(s) service detection: build the candidate URL(s) for a host and
# an open port and return the first one that answers.
#
# Port 80 gives one "http://" candidate, port 443 one "https://" candidate;
# any other port is tried as http first and then as https. Only candidates
# that were actually built are probed.
#
# host - host name or IP string (surrounding whitespace is ignored).
# port - port number, default 80.
#
# Returns the first URL whose Wmap::UrlChecker#response_code differs from
# 10000 (presumably the checker's "no response" marker - confirm against
# UrlChecker), or nil when no candidate answers or an error is raised.
def host_2_url(host, port=80)
  puts "Perform simple http(s) service detection on host #{host}, port #{port}" if @verbose
  begin
    host = host.strip
    candidates = case port.to_i
                 when 80
                   ["http://" + host + "/"]
                 when 443
                   ["https://" + host + "/"]
                 else
                   ["http://" + host + ":" + port.to_s + "/", "https://" + host + ":" + port.to_s + "/"]
                 end
    puts "Please ensure your internet connection is active before running this method: #{__method__}" if @verbose
    checker = Wmap::UrlChecker.new
    found = candidates.find { |candidate| checker.response_code(candidate) != 10000 }
    if found
      puts "Found URL: #{found}" if @verbose
      return found
    else
      puts "No http(s) service found on: #{host}:#{port}" if @verbose
      return nil
    end
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
251
|
+
|
252
|
+
# Convert a relative URL to an absolute one. For example, from URL base 'http://games.yahoo.com/' and file path '/game/the-magic-snowman-flash.html' => 'http://games.yahoo.com/game/the-magic-snowman-flash.html'
|
253
|
+
# Convert a relative URL to an absolute one. A relative URL starting with '/'
# is resolved with create_absolute_url_from_base, anything else with
# create_absolute_url_from_context.
#
# Returns whatever the chosen helper returns, or nil when an error is raised.
def make_absolute(base, relative_url)
  puts "Determine and return the absolute URL:\n Base: #{base}, Relative: #{relative_url} " if @verbose
  begin
    rooted = relative_url =~ /^\//
    absolute_url = rooted ? create_absolute_url_from_base(base, relative_url) : create_absolute_url_from_context(base, relative_url)
    puts "Found absolute URL: #{absolute_url}" if @verbose
    return absolute_url
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
269
|
+
|
270
|
+
# Create / construct the absolute URL from a known URL and relative file path. For example, 'http://images.search.yahoo.com/images' + '/search/images?p=raiders' => 'http://images.search.yahoo.com/search/images?p=raiders'
|
271
|
+
# Construct the absolute URL from a known URL and a file path that starts at
# the site root: the site base of potential_base (url_2_site, without its
# trailing '/') followed by relative_url.
#
# Returns the absolute URL string, or nil when an error is raised (for
# instance when url_2_site cannot convert potential_base).
def create_absolute_url_from_base(potential_base, relative_url)
  begin
    site_root = url_2_site(potential_base).strip.chop
    joined = site_root + relative_url
    puts "Found absolute URL: #{joined}" if @verbose
    return joined
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
282
|
+
|
283
|
+
# Construct the absolute URL by comparing a known URL and the relative file path
|
284
|
+
# Construct the absolute URL by comparing a known URL and a relative file path.
#
# Neither argument is modified. Resolution rules, in order:
# * base ends with '/'                 -> base + relative
# * base is only "scheme://host"       -> base + '/' + relative
# * last path segment has no '.' in it -> treated as a directory:
#                                         base + '/' + relative
# * otherwise (last segment looks like a file, e.g. "index.html")
#                                      -> the last segment is replaced
#
# potential_base - known absolute URL.
# relative_url   - relative path; one leading '/' and surrounding whitespace
#                  are ignored.
#
# Returns the absolute URL string, or nil when the base contains no '/' or an
# error is raised.
def create_absolute_url_from_context(potential_base, relative_url)
  puts "Determine the absolute URL from context:\n Known base: #{potential_base}, Relative path: #{relative_url}" if @verbose
  begin
    # work on copies so the caller's strings stay untouched
    relative = relative_url.strip.sub(/\A\//, '')
    base = potential_base.strip
    if base =~ /\/\z/
      absolute_url = base + relative
    else
      last_index_of_slash = base.rindex('/')
      return nil if last_index_of_slash.nil?
      last_index_of_dot = base.rindex('.')
      if last_index_of_slash >= 2 && base[last_index_of_slash - 2, 2] == ':/'
        # the last slash belongs to "://", so the base has no path at all
        absolute_url = base + '/' + relative
      elsif last_index_of_dot.nil? || last_index_of_dot < last_index_of_slash
        # no dot after the last slash: the last segment is taken as a directory
        absolute_url = base + '/' + relative
      else
        absolute_url = base[0, last_index_of_slash + 1] + relative
      end
    end
    puts "Found absolute URL: #{absolute_url}" if @verbose
    return absolute_url
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
|
312
|
+
|
313
|
+
# Normalize the URL to a consistent manner in order to determine if a link has been visited or cached before
|
314
|
+
# See http://en.wikipedia.org/wiki/URL_normalization for more explanation
|
315
|
+
# Normalize the URL to a consistent form in order to determine if a link has
# been visited or cached before.
#
# Steps:
# * surrounding whitespace is ignored (the argument itself is not modified,
#   so frozen strings are accepted);
# * the site base comes from url_2_site, i.e.
#   'HTTP://www.Example.com/' => 'http://www.example.com/';
# * a trailing dot after the host name is removed, i.e.
#   'http://www.yahoo.com./' => 'http://www.yahoo.com/';
# * dot-segments in the path are resolved: "." is dropped and ".." removes
#   the preceding segment, i.e. 'http://www.example.com/../a/b/../c/./d.html'
#   => 'http://www.example.com/a/c/d.html';
# * a query string or fragment is appended unchanged.
#
# Returns the normalized URL string; when the URL cannot be processed the
# trimmed input (or the input itself if it could not be trimmed) is returned.
def normalize_url(url)
  cleaned = url
  begin
    cleaned = url.strip
    base = url_2_site(cleaned).sub(/\.\/$/,'/')
    raw_path = url_2_path(cleaned)
    # keep the query / fragment away from the dot-segment handling
    cut = raw_path.index(/[?#]/)
    path_part = cut ? raw_path[0...cut] : raw_path
    suffix = cut ? raw_path[cut..-1] : ''
    parts = path_part.split('/', -1)
    segments = []
    parts.each do |segment|
      case segment
      when '.'
        next
      when '..'
        segments.pop
      else
        segments << segment
      end
    end
    # a path ending in "." or ".." refers to a directory: keep the trailing '/'
    segments << '' if ['.', '..'].include?(parts.last)
    # the base already ends with '/', so leading empty segments are dropped
    segments.shift while segments.first == ''
    return base + segments.join('/') + suffix
  rescue => ee
    puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
    return cleaned
  end
end
|
340
|
+
|
341
|
+
end
|
342
|
+
end
|
343
|
+
end
|