wmap 2.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +141 -0
  3. data/LICENSE.txt +15 -0
  4. data/README.rdoc +98 -0
  5. data/TODO +13 -0
  6. data/bin/deprime +21 -0
  7. data/bin/distrust +38 -0
  8. data/bin/googleBot +23 -0
  9. data/bin/prime +21 -0
  10. data/bin/refresh +26 -0
  11. data/bin/run_tests +16 -0
  12. data/bin/spiderBot +26 -0
  13. data/bin/trust +38 -0
  14. data/bin/updateAll +57 -0
  15. data/bin/wadd +25 -0
  16. data/bin/wadds +26 -0
  17. data/bin/wcheck +28 -0
  18. data/bin/wdel +25 -0
  19. data/bin/wdump +21 -0
  20. data/bin/wmap +151 -0
  21. data/bin/wscan +32 -0
  22. data/data/cidrs +2 -0
  23. data/data/deactivated_sites +1 -0
  24. data/data/domains +2 -0
  25. data/data/hosts +1 -0
  26. data/data/prime_hosts +1 -0
  27. data/data/sites +2 -0
  28. data/data/sub_domains +2 -0
  29. data/demos/bruter.rb +27 -0
  30. data/demos/dns_brutes.rb +28 -0
  31. data/demos/filter_cidr.rb +18 -0
  32. data/demos/filter_crawls.rb +5 -0
  33. data/demos/filter_domain.rb +25 -0
  34. data/demos/filter_geoip.rb +26 -0
  35. data/demos/filter_known_services.rb +59 -0
  36. data/demos/filter_netinfo.rb +23 -0
  37. data/demos/filter_prime.rb +25 -0
  38. data/demos/filter_profiler.rb +3 -0
  39. data/demos/filter_redirection.rb +19 -0
  40. data/demos/filter_site.rb +40 -0
  41. data/demos/filter_siteip.rb +31 -0
  42. data/demos/filter_status.rb +17 -0
  43. data/demos/filter_timestamp.rb +23 -0
  44. data/demos/filter_url.rb +19 -0
  45. data/demos/new_fnd.rb +66 -0
  46. data/demos/nmap_parser.pl +138 -0
  47. data/demos/site_format.rb +18 -0
  48. data/demos/whois_domain.rb +78 -0
  49. data/dicts/GeoIP.dat +0 -0
  50. data/dicts/GeoIPASNum.dat +0 -0
  51. data/dicts/GeoLiteCity.dat +0 -0
  52. data/dicts/ccsld.txt +2646 -0
  53. data/dicts/cctld.txt +243 -0
  54. data/dicts/gtld.txt +25 -0
  55. data/dicts/hostnames-dict.big +1402 -0
  56. data/dicts/hostnames-dict.txt +101 -0
  57. data/lib/wmap/cidr_tracker.rb +327 -0
  58. data/lib/wmap/dns_bruter.rb +308 -0
  59. data/lib/wmap/domain_tracker/sub_domain.rb +142 -0
  60. data/lib/wmap/domain_tracker.rb +342 -0
  61. data/lib/wmap/geoip_tracker.rb +72 -0
  62. data/lib/wmap/google_search_scraper.rb +177 -0
  63. data/lib/wmap/host_tracker/primary_host.rb +130 -0
  64. data/lib/wmap/host_tracker.rb +550 -0
  65. data/lib/wmap/network_profiler.rb +144 -0
  66. data/lib/wmap/port_scanner.rb +208 -0
  67. data/lib/wmap/site_tracker/deactivated_site.rb +85 -0
  68. data/lib/wmap/site_tracker.rb +937 -0
  69. data/lib/wmap/url_checker.rb +314 -0
  70. data/lib/wmap/url_crawler.rb +381 -0
  71. data/lib/wmap/utils/domain_root.rb +184 -0
  72. data/lib/wmap/utils/logger.rb +53 -0
  73. data/lib/wmap/utils/url_magic.rb +343 -0
  74. data/lib/wmap/utils/utils.rb +333 -0
  75. data/lib/wmap/whois.rb +76 -0
  76. data/lib/wmap.rb +227 -0
  77. data/logs/wmap.log +17 -0
  78. data/ruby_whois_patches/base_cocca2.rb +149 -0
  79. data/ruby_whois_patches/kero.yachay.pe.rb +120 -0
  80. data/ruby_whois_patches/whois.PublicDomainRegistry.com.rb +124 -0
  81. data/ruby_whois_patches/whois.above.com.rb +61 -0
  82. data/ruby_whois_patches/whois.adamsnames.tc.rb +107 -0
  83. data/ruby_whois_patches/whois.aeda.net.ae.rb +105 -0
  84. data/ruby_whois_patches/whois.ai.rb +112 -0
  85. data/ruby_whois_patches/whois.arnes.si.rb +121 -0
  86. data/ruby_whois_patches/whois.ascio.com.rb +91 -0
  87. data/ruby_whois_patches/whois.cnnic.cn.rb +123 -0
  88. data/ruby_whois_patches/whois.corporatedomains.com.rb +67 -0
  89. data/ruby_whois_patches/whois.crsnic.net.rb +108 -0
  90. data/ruby_whois_patches/whois.denic.de.rb +174 -0
  91. data/ruby_whois_patches/whois.dk-hostmaster.dk.rb +120 -0
  92. data/ruby_whois_patches/whois.dns.be.rb +134 -0
  93. data/ruby_whois_patches/whois.dns.lu.rb +129 -0
  94. data/ruby_whois_patches/whois.dns.pl.rb +150 -0
  95. data/ruby_whois_patches/whois.dns.pt.rb +119 -0
  96. data/ruby_whois_patches/whois.domain.kg.rb +126 -0
  97. data/ruby_whois_patches/whois.domainregistry.my.rb +123 -0
  98. data/ruby_whois_patches/whois.domreg.lt.rb +110 -0
  99. data/ruby_whois_patches/whois.dot.tk.rb +140 -0
  100. data/ruby_whois_patches/whois.hkirc.hk.rb +121 -0
  101. data/ruby_whois_patches/whois.isnic.is.rb +130 -0
  102. data/ruby_whois_patches/whois.je.rb +119 -0
  103. data/ruby_whois_patches/whois.jprs.jp.rb +137 -0
  104. data/ruby_whois_patches/whois.kenic.or.ke.rb +140 -0
  105. data/ruby_whois_patches/whois.markmonitor.com.rb +118 -0
  106. data/ruby_whois_patches/whois.melbourneit.com.rb +58 -0
  107. data/ruby_whois_patches/whois.nic.as.rb +96 -0
  108. data/ruby_whois_patches/whois.nic.at.rb +109 -0
  109. data/ruby_whois_patches/whois.nic.ch.rb +141 -0
  110. data/ruby_whois_patches/whois.nic.cl.rb +117 -0
  111. data/ruby_whois_patches/whois.nic.ec.rb +157 -0
  112. data/ruby_whois_patches/whois.nic.im.rb +120 -0
  113. data/ruby_whois_patches/whois.nic.it.rb +170 -0
  114. data/ruby_whois_patches/whois.nic.lv.rb +116 -0
  115. data/ruby_whois_patches/whois.nic.ly.rb +127 -0
  116. data/ruby_whois_patches/whois.nic.mu.rb +27 -0
  117. data/ruby_whois_patches/whois.nic.mx.rb +123 -0
  118. data/ruby_whois_patches/whois.nic.net.sa.rb +111 -0
  119. data/ruby_whois_patches/whois.nic.or.kr.rb +101 -0
  120. data/ruby_whois_patches/whois.nic.tel.rb +129 -0
  121. data/ruby_whois_patches/whois.nic.tr.rb +133 -0
  122. data/ruby_whois_patches/whois.nic.us.rb +129 -0
  123. data/ruby_whois_patches/whois.nic.ve.rb +135 -0
  124. data/ruby_whois_patches/whois.norid.no.rb +127 -0
  125. data/ruby_whois_patches/whois.pandi.or.id.rb +118 -0
  126. data/ruby_whois_patches/whois.psi-usa.info.rb +63 -0
  127. data/ruby_whois_patches/whois.registro.br.rb +109 -0
  128. data/ruby_whois_patches/whois.registrygate.com.rb +55 -0
  129. data/ruby_whois_patches/whois.rrpproxy.net.rb +61 -0
  130. data/ruby_whois_patches/whois.sgnic.sg.rb +130 -0
  131. data/ruby_whois_patches/whois.srs.net.nz.rb +166 -0
  132. data/ruby_whois_patches/whois.tucows.com.rb +70 -0
  133. data/ruby_whois_patches/whois.twnic.net.tw.rb +133 -0
  134. data/settings/discovery_ports +24 -0
  135. data/settings/google_keywords.txt +9 -0
  136. data/settings/google_locator.txt +23 -0
  137. data/test/domain_tracker_test.rb +31 -0
  138. data/test/utils_test.rb +168 -0
  139. data/version.txt +13 -0
  140. data/wmap.gemspec +49 -0
  141. metadata +202 -0
@@ -0,0 +1,184 @@
1
+ #--
2
+ # Wmap
3
+ #
4
+ # A pure Ruby library for Internet web application discovery and tracking.
5
+ #
6
+ # Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
7
+ #++
8
+
9
+
10
module Wmap
  module Utils
    # Module to validate and retrieve the top or second level domain name from a host-name (FQDN).
    #
    # The lookup tables are cached in the instance variables @gtld, @cctld and
    # @ccsld of whatever object the methods are called on. The gTLD / ccTLD
    # tables are read through +file_2_hash+, which is not defined in this module
    # and must be provided by the object that mixes it in.
    module DomainRoot
      extend self

      # Internet Domain Architecture Definitions
      File_ccsld=File.dirname(__FILE__)+'/../../../dicts/ccsld.txt'
      File_cctld=File.dirname(__FILE__)+'/../../../dicts/cctld.txt'
      File_gtld=File.dirname(__FILE__)+'/../../../dicts/gtld.txt'

      # Main function to retrieve the registered domain ('domain root' from the 'registrant' perspective)
      # from a hostname, for example, "www.telegraph.co.uk" -> "telegraph.co.uk"
      #
      # host:: the host name (String); surrounding whitespace and letter case are ignored
      #
      # Returns the registered domain as a lower-case String, or nil when:
      # * the host is nil or blank,
      # * the right-most label is neither a known gTLD nor a known ccTLD,
      # * the host does not carry enough non-empty labels to name a registrant
      #   (e.g. "co.uk" or "com" on their own),
      # * anything raises while loading the tables or parsing the host.
      def get_domain_root(host)
        puts "Retrieve the root domain for host: #{host}" if @verbose
        begin
          # Generic Top Level Domain List - loading once
          @gtld=file_2_hash(File_gtld) if @gtld.nil?
          # Country code top-level domain list - loading once
          @cctld=file_2_hash(File_cctld) if @cctld.nil?
          # Country code second level domain - loading once
          @ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil?
          if host.nil? || host.strip.empty?
            puts "Error: empty record found. Please check your input and remove any empty line." if @verbose
            return nil
          end
          host=host.downcase.strip
          root_domain=""
          dn=host.split(".")
          tld=dn[-1]
          sld=dn[-2]   # nil for a single-label host; key?(nil) / include?(nil) are simply false
          if @cctld.key?(tld)
            # Country code TLD: the registrant label sits one level deeper when the
            # second level is a gTLD label ("com.au") or a listed ccSLD ("co.uk").
            # The ccSLD comparison is an exact match; a pattern match here would
            # let "c" hit "co" or "co" hit "com".
            compound=@gtld.key?(sld) || Array(@ccsld[tld]).include?(sld)
          elsif @gtld.key?(tld)
            # Generic TLD preceded by a country code label, e.g. "example.uk.com"
            compound=@cctld.key?(sld)
          else
            puts "#{host} - the top level domain is unknown. Please check out your record #{root_domain} " if @verbose
            return nil
          end
          wanted=compound ? 3 : 2
          parts=dn.last(wanted)
          # Not enough labels (or an empty label such as in ".com"): there is no
          # registrant label to report, so do not fabricate one.
          return nil if parts.length < wanted || parts.any?(&:empty?)
          root_domain=parts.join(".")
          puts "Domain root found: #{root_domain}" if @verbose
          return root_domain
        rescue => ee
          puts "Exception on method #{__method__}: #{ee}" if @verbose
          return nil
        end
      end
      alias_method :get_root_domain, :get_domain_root
      alias_method :root_domain, :get_domain_root
      alias_method :domain_root, :get_domain_root
      alias_method :host_2_domain, :get_domain_root

      # 'setter' to parse and load the known country code second level domain table from the file
      # data structure example: {"uk" =>["co","plc"],"za"=>["mil","nom","org"]}
      #
      # Only lines that start with whitespace followed by ".<word char>" are read;
      # the first whitespace-separated token of such a line is split on "." and
      # must yield more than two pieces (".co.uk" does, ".uk" does not).
      #
      # file_ccsld:: path of the dictionary file (read as ISO-8859-1, transcoded to UTF-8)
      #
      # Returns the Hash described above, or nil when the file cannot be read.
      def load_ccsld_from_file(file_ccsld)
        ccsld=Hash.new
        puts "Loading known country code second level domain list from file: #{file_ccsld}" if @verbose
        # Block form closes the handle even when a line fails to parse
        File.open(file_ccsld, 'r:ISO-8859-1:UTF-8') do |f|   # transcoded magic bit
          f.each do |line|
            next unless line =~ /^\s+\.\w/
            entry=line.chomp.strip.downcase.split(' ')[0].split('.')
            next unless entry.length > 2
            key=entry.last
            ccsld[key] = Array.new if not ccsld.key?(key)
            ccsld[key].push(entry[entry.length-2])
          end
        end
        return ccsld
      rescue => ee
        puts "Exception on method #{__method__}: #{ee}" if @verbose
        return nil
      end

      # Test a host string to see if it's a valid Internet root domain
      #
      # domain:: candidate domain name (String)
      #
      # Returns true when the trimmed, lower-cased name is exactly what
      # get_domain_root reports for it; false otherwise, including on any error.
      def is_domain_root?(domain)
        puts "Validate the domain name is valid: #{domain}" if @verbose
        begin
          domain=domain.strip.downcase
          return domain == get_domain_root(domain)
        rescue => ee
          puts "Exception on method #{__method__} for #{domain}: #{ee}" if @verbose
          return false
        end
      end
      alias_method :is_root_domain?, :is_domain_root?
      alias_method :is_domain?, :is_domain_root?
      alias_method :is_root?, :is_domain_root?

      # Function to retrieve the sub-domain from a Fully Qualified Domain Name(FQDN),
      # for example, "www.secure.telegraph.co.uk" -> "secure.telegraph.co.uk"
      #
      # host:: the host name (String)
      #
      # Returns the domain root prefixed with the one label directly in front of
      # it, but only when the host has at least two labels in front of the root.
      # Returns nil otherwise, when the root is unknown, or on any error.
      def get_sub_domain(host)
        puts "Retrieve sub-domain from host: #{host}" if @verbose
        begin
          host=host.strip.downcase
          domain=get_domain_root(host)
          return nil if domain.nil?
          record_h=host.split(".")
          record_d=domain.split(".")
          if (record_h.length - record_d.length) >= 2
            subdomain=record_h[record_h.length-record_d.length-1]+"."+domain
            puts "Sub domain found: #{subdomain}" if @verbose
            return subdomain
          else
            return nil
          end
        rescue => ee
          # StandardError only: signals and exit requests must not be swallowed here
          puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
          return nil
        end
      end
      alias_method :get_subdomain, :get_sub_domain

      # Function to print instance variable - General top level domain list
      def print_gtld
        puts @gtld
      end

      # Function to print instance variable - Country code top-level domain list
      def print_cctld
        puts @cctld
      end

      # Function to print instance variable - Country code second-level domain list
      def print_ccsld
        puts @ccsld
      end

      private :load_ccsld_from_file

    end
  end
end
@@ -0,0 +1,53 @@
1
+ #--
2
+ # Wmap
3
+ #
4
+ # A pure Ruby library for Internet web application discovery and tracking.
5
+ #
6
+ # Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
7
+ #++
8
+
9
+
10
module Wmap
  module Utils
    # Module to log debugging and other messages
    module Logger
      extend self

      # Append information into the log file for the trouble-shooting purpose
      #
      # obj::   what to log. A String is written on one line after the header;
      #         an Array writes a header line followed by one line per element;
      #         a Hash writes a header line followed by one line per value.
      #         Any other type writes nothing (the file is still opened in
      #         append mode, so it is created if missing).
      # agent:: label written after the timestamp in the header
      # file::  path of the log file, opened in append mode
      #
      # Returns false when obj is nil or when anything raises (for example the
      # file cannot be opened); true otherwise.
      def wlog(obj, agent, file)
        puts "Writing #{obj} into log file: #{file}" if @verbose
        begin
          return false if obj.nil?
          timestamp=Time.now
          # Block form: the handle is a local and is closed even if a write raises
          File.open(file,'a') do |f|
            case obj
            when Array
              f.write "#{timestamp}: #{agent}: \n"
              obj.each { |x| f.write " #{x}\n" }
              puts "The list is successfully saved into the log file: #{file} " if @verbose
            when Hash
              f.write "#{timestamp}: #{agent}: \n"
              obj.each_value { |value| f.write " #{value}\n" }
              puts "The hash is successfully saved into the log file: #{file} " if @verbose
            when String
              f.write "#{timestamp}: #{agent}: #{obj}\n"
              puts "The string is successfully saved into the log file: #{file} " if @verbose
            else
              #do nothing
              puts "Un-handled exception on: #{obj}" if @verbose
            end
          end
          return true
        rescue => ee
          puts "Exception on method #{__method__}: #{ee}" if @verbose
          return false
        end
      end

    end
  end
end
@@ -0,0 +1,343 @@
1
+ #--
2
+ # Wmap
3
+ #
4
+ # A pure Ruby library for Internet web application discovery and tracking.
5
+ #
6
+ # Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
7
+ #++
8
+ # require "uri"
9
+
10
module Wmap
  module Utils
    # URL parsing and construction helpers.
    #
    # Several methods call names that are not defined in this module and must be
    # supplied by the object that mixes it in: +is_ip?+ and +is_fqdn?+ (used by
    # is_url?), +get_domain_root+ (used by urls_on_same_domain?) and the
    # Wmap::UrlChecker class (used by host_2_url).
    module UrlMagic
      extend self

      # Simple sanity check on a 'claimed' URL string.
      #
      # url:: candidate URL
      #
      # Returns true when the text contains "http://" or "https://" followed by
      # a host part that is_ip? or is_fqdn? accepts; false otherwise or on error.
      def is_url?(url)
        puts "Validate the URL format is valid: #{url}" if @verbose
        begin
          if url =~ /(http|https)\:\/\/((.)+)/i
            host=Regexp.last_match(2).split('/')[0]
            host=host.split(':')[0]
            if is_ip?(host) || is_fqdn?(host)
              return true
            else
              return false
            end
          else
            puts "Unknown URL format: #{url}" if @verbose
            return false
          end
        rescue => ee
          puts "Exception on method #{__method__}: #{ee}" if @verbose
          return false
        end
      end

      # Simple sanity check on a 'claimed' SSL enabled URL string
      #
      # Returns true only when the URL passes is_url? and its own scheme is
      # https. The text "https" appearing later in the path or query does not count.
      def is_ssl?(url)
        puts "Validate if SSL is enabled on: #{url}" if @verbose
        begin
          url=url.strip
          if is_url?(url) && leading_scheme(url) == "https"
            return true
          else
            return false
          end
        rescue => ee
          puts "Exception on method #{__method__}: #{ee}" if @verbose
          return false
        end
      end
      alias_method :is_https?, :is_ssl?

      # Simple sanity check on a 'claimed' web site base string.
      #
      # Returns true when the trimmed, lower-cased URL passes is_url? and is
      # identical to what url_2_site produces for it; false otherwise or on error.
      def is_site?(url)
        puts "Validate the website string format for: #{url}" if @verbose
        begin
          url=url.strip.downcase
          if is_url?(url)
            if url == url_2_site(url)
              return true
            else
              return false
            end
          else
            puts "Unknown site format: #{url}" if @verbose
            return false
          end
        rescue => ee
          puts "Exception on method #{__method__}: #{ee}" if @verbose
          return false
        end
      end

      # Check if URL is an absolute one
      #def is_absolute?(url)
      #  puts "Validate if the url is absolute: #{url}" if @verbose
      #  begin
      #    url.strip!
      #    URI.absolute?(url)
      #  rescue => ee
      #    puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
      #    return false
      #  end
      #end

      # Check if URL is relative one
      #def is_relative?(url)
      #  begin
      #    url.strip!
      #    !is_absolute?(url)
      #  rescue => ee
      #    puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
      #    return false
      #  end
      #end

      # Extract the web server host's Fully Qualified Domain Name (FQDN) from the url.
      # For example: "https://login.yahoo.com/email/help" -> "login.yahoo.com"
      #
      # The first "http://" or "https://" is removed, then everything from the
      # first "/", "?" or "#" onwards and any ":port" suffix is dropped.
      #
      # Returns the lower-cased host String, or nil when no host is left or on error.
      def url_2_host(url)
        begin
          url = url.strip.downcase.sub(/https?:\/\//, "")
          authority = url.split(%r{[/?#]})[0]
          host = authority.nil? ? nil : authority.split(':')[0]
          if host.nil? || host.empty?
            puts "Error process url: #{url}" if @verbose
            return nil
          else
            return host
          end
        rescue => ee
          puts "Exception on method #{__method__}: #{ee}" if @verbose
          return nil
        end
      end

      # Extract web service port from the url. For example: "https://login.yahoo.com/email/help" -> 443
      #
      # Returns the explicit port as an Integer when the host part has the form
      # "host:port"; otherwise 443 when the URL's own scheme is https and 80 in
      # every other case. Returns nil on error.
      def url_2_port(url)
        puts "Retrieve service port on URL: #{url}" if @verbose
        begin
          # Look at the scheme only; "https" inside the path must not select 443
          ssl = (leading_scheme(url) == "https")
          url = url.strip.downcase.sub(/https?:\/\//, "")
          # Cut at the first "/", "?" or "#" so a ":" in the query is not read as a port
          record2 = url.split(%r{[/?#]})[0].split(':')
          if (record2.length == 2)
            puts "The service port: #{record2[1]}" if @verbose
            return record2[1].to_i
          elsif ssl
            puts "The service port: 443" if @verbose
            return 443
          else
            puts "The service port: 80" if @verbose
            return 80
          end
        rescue => ee
          puts "Exception on method #{__method__}: #{ee}" if @verbose
          return nil
        end
      end

      # Extract site in (host:port) format from a url:
      # "https://login.yahoo.com:8443/email/help" -> "https://login.yahoo.com:8443/"
      #
      # Rules applied:
      # * the url is trimmed and lower-cased, and any text in front of the first "http" is dropped;
      # * only the schemes "http:" and "https:" are accepted;
      # * the host part ends at the first "/", "?" or "#"
      #   ("https://letmechoose.barclays.co.uk?source=btorganic/" => "https://letmechoose.barclays.co.uk/");
      # * a missing or empty port means 80 for http and 443 for https;
      # * port 443 always yields an "https:" site;
      # * the port is left out of the result only when it is the default of the resulting scheme.
      #
      # Returns the site String (always ending in "/"), or nil when the url cannot be converted.
      def url_2_site(url)
        puts "Retrieve the web site base for url: #{url}" if @verbose
        begin
          cleaned = url.strip.downcase.sub(/^(.*?)http/i,'http')
          prot, remainder = cleaned.split('//', 2)
          default_port = case prot
                         when "https:" then 443
                         when "http:" then 80
                         end
          if default_port.nil? || remainder.nil?
            raise "Problem encountered on method url_2_site: Unable to convert #{url}"
          end
          # step 1, extract the host:port pair from the url
          host_port = remainder.split(%r{[/?#]})[0].to_s
          host, port_text = host_port.split(':', 2)
          if host.nil? || host.empty?
            raise "Problem encountered on method url_2_site: Unable to convert #{url}"
          end
          port = (port_text.nil? || port_text.empty?) ? default_port : port_text.to_i
          # step 2, put the host:port pair back to the normal site format
          prot = "https:" if port == 443
          implied_port = (prot == "https:") ? 443 : 80
          if port == implied_port
            return prot+"//"+host+"/"
          else
            # e.g. https on port 80 must keep its port, otherwise the site would silently move to 443
            return prot+"//"+host+":"+port.to_s+"/"
          end
        rescue => ee
          puts "Exception on method #{__method__}: #{ee}" if @verbose
          return nil
        end
      end

      # Wrapper to return relative path component of the URL. i.e. http://www.yahoo.com/login.html => /login.html
      #
      # Everything after the first "http(s)://host[:port]" is returned unchanged
      # (letter case, query and fragment included). The argument is not modified.
      #
      # Returns the remainder String (possibly empty), or nil when the text holds no http(s) site or on error.
      def url_2_path(url)
        #puts "Retrieve the relative path component of the url: #{url}" if @verbose
        begin
          found = %r{https?://[^/?#]+}i.match(url.strip)
          return nil if found.nil?
          path = found.post_match
          #puts "Path component found: #{path}" if @verbose
          return path
        rescue => ee
          puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
          return nil
        end
      end

      # Test if the two URLs are both under the same domain: http://login.yahoo.com, http://mail.yahoo.com => true
      #
      # Returns false when either domain root cannot be determined (two unknown
      # roots are not "the same domain") and on any error.
      def urls_on_same_domain?(url1, url2)
        puts "Determine if two URLs under the same domain: #{url1}, #{url2}" if @verbose
        begin
          root1=get_domain_root(url_2_host(url1))
          root2=get_domain_root(url_2_host(url2))
          return false if root1.nil? || root2.nil?
          return root1 == root2
        rescue => ee
          puts "Error searching the object content: #{ee}" if @verbose
          return false
        end
      end

      # Input is host and open port, output is a URL for valid http response code or nil
      #
      # host:: host name or address (String)
      # port:: service port, default 80. Port 80 is probed over http only, port
      #        443 over https only, any other port over http first and then https.
      #
      # A candidate counts as found when Wmap::UrlChecker#response_code returns
      # anything other than 10000 for it (presumably the checker's "no response"
      # marker; confirm against UrlChecker).
      #
      # Returns the first candidate URL found, or nil when none answers or on error.
      def host_2_url(host,port=80)
        puts "Perform simple http(s) service detection on host #{host}, port #{port}" if @verbose
        begin
          host=host.strip
          if port.to_i == 80
            candidates = ["http://" + host + "/"]
          elsif port.to_i == 443
            candidates = ["https://" + host + "/"]
          else
            candidates = ["http://" + host + ":" + port.to_s + "/", "https://" + host + ":" + port.to_s + "/"]
          end
          puts "Please ensure your internet connection is active before running this method: #{__method__}" if @verbose
          checker=Wmap::UrlChecker.new
          # Only probe URLs that were actually built; never hand nil to the checker
          candidates.each do |candidate|
            if checker.response_code(candidate) != 10000
              puts "Found URL: #{candidate}" if @verbose
              return candidate
            end
          end
          puts "No http(s) service found on: #{host}:#{port}" if @verbose
          return nil
        rescue => ee
          puts "Exception on method #{__method__}: #{ee}" if @verbose
          return nil
        end
      end

      # Convert a relative URL to an absolute one. For example, from URL base 'http://games.yahoo.com/'
      # and file path '/game/the-magic-snowman-flash.html' => 'http://games.yahoo.com/game/the-magic-snowman-flash.html'
      #
      # A relative URL that starts with "/" is resolved against the site root of
      # the base; anything else is resolved against the base's location.
      # Returns the absolute URL, or nil on error.
      def make_absolute(base, relative_url)
        puts "Determine and return the absolute URL:\n Base: #{base}, Relative: #{relative_url} " if @verbose
        begin
          absolute_url = nil
          if relative_url =~ /^\//
            absolute_url = create_absolute_url_from_base(base, relative_url)
          else
            absolute_url = create_absolute_url_from_context(base, relative_url)
          end
          puts "Found absolute URL: #{absolute_url}" if @verbose
          return absolute_url
        rescue => ee
          puts "Exception on method #{__method__}: #{ee}" if @verbose
          return nil
        end
      end

      # Create / construct the absolute URL from a known URL and relative file path. For example,
      # 'http://images.search.yahoo.com/images' + '/search/images?p=raiders' => 'http://images.search.yahoo.com/search/images?p=raiders'
      #
      # Returns nil when the site of potential_base cannot be determined.
      def create_absolute_url_from_base(potential_base, relative_url)
        begin
          #puts "Determine the absolute URL from potential base #{potential_base} and relative URL #{relative_url}" if @verbose
          # url_2_site always ends in "/"; drop it because relative_url supplies its own
          naked_base = url_2_site(potential_base).strip.chop
          puts "Found absolute URL: #{naked_base+relative_url}" if @verbose
          return naked_base + relative_url
        rescue => ee
          puts "Exception on method #{__method__}: #{ee}" if @verbose
          return nil
        end
      end

      # Construct the absolute URL by comparing a known URL and the relative file path
      #
      # * base ends with "/"                       => base + relative
      # * base is a bare site ("http://host")      => base + "/" + relative
      # * last path segment has a "." (file-like)  => that segment is replaced by relative
      # * last path segment has no "."             => treated as a directory: base + "/" + relative
      #   (NOTE: RFC 3986 would replace the last segment here too; the directory
      #   reading follows the dot / no-dot distinction this method has always made)
      #
      # A single leading "/" of relative_url is ignored. Neither argument is modified.
      # Returns the absolute URL, or nil on error (for example a base without any "/").
      def create_absolute_url_from_context(potential_base, relative_url)
        puts "Determine the absolute URL from context:\n Known base: #{potential_base}, Relative path: #{relative_url}" if @verbose
        begin
          absolute_url = nil
          # make relative URL naked by removing the beginning '/'
          naked_relative = relative_url.sub(/^\//,'').strip
          if potential_base =~ /\/$/
            absolute_url = potential_base+naked_relative
          else
            last_index_of_slash = potential_base.rindex('/')
            if potential_base[last_index_of_slash-2, 2] == ':/'
              # the last slash belongs to "://", so the base carries no path at all
              absolute_url = potential_base+"/"+naked_relative
            elsif potential_base[(last_index_of_slash+1)..-1].include?('.')
              absolute_url = potential_base[0, last_index_of_slash+1] + naked_relative
            else
              absolute_url = potential_base.strip+"/"+naked_relative
            end
          end
          puts "Found absolute URL: #{absolute_url}" if @verbose
          return absolute_url
        rescue => ee
          puts "Exception on method #{__method__}: #{ee}" if @verbose
          return nil
        end
      end

      # Normalize the URL to a consistent manner in order to determine if a link has been visited or cached before
      # See http://en.wikipedia.org/wiki/URL_normalization for more explanation
      #
      # Steps:
      # * scheme and host are lower-cased and default ports dropped (via url_2_site),
      #   i.e. 'HTTP://www.Example.com/' => 'http://www.example.com/'
      # * a trailing dot after the hostname is removed, i.e. 'http://www.yahoo.com./' => 'http://www.yahoo.com/'
      # * dot-segments are resolved in the path only,
      #   i.e. 'http://www.example.com/../a/b/../c/./d.html' => 'http://www.example.com/a/c/d.html'
      # * the letter case of the path and the query / fragment are kept as they are
      #
      # The argument is not modified. Returns the normalized URL; when the URL
      # cannot be processed the trimmed input (or the input itself) is returned.
      def normalize_url(url)
        cleaned = nil
        begin
          cleaned = url.strip
          # Normalize the base
          base=url_2_site(cleaned)
          rest=url_2_path(cleaned)
          raise "Unable to normalize #{url}" if base.nil? || rest.nil?
          base=base.sub(/\.(:\d+)?\/\z/) { "#{Regexp.last_match(1)}/" }
          # Keep the query / fragment out of the dot-segment handling
          cut=rest.index(/[?#]/)
          path=cut.nil? ? rest : rest[0, cut]
          tail=cut.nil? ? "" : rest[cut..-1]
          return base+collapse_dot_segments(path)+tail
        rescue => ee
          puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
          return cleaned || url
        end
      end

      # Scheme ("http" or "https", lower-cased) of the first "http(s)://" found in the text, or nil.
      def leading_scheme(url)
        found = /(https?):\/\//i.match(url.to_s)
        found.nil? ? nil : found[1].downcase
      end

      # Resolve "." and ".." segments of a path and drop its leading slashes, so
      # the result can be appended to a site string that already ends in "/".
      # A ".." with nothing left to remove is ignored; a trailing "." or ".."
      # leaves a trailing "/" behind.
      def collapse_dot_segments(path)
        kept = []
        segments = path.sub(/\A\/+/, '').split('/', -1)
        segments.each_with_index do |segment, index|
          final = (index == segments.length - 1)
          case segment
          when '.'
            kept << '' if final
          when '..'
            kept.pop
            kept << '' if final
          else
            kept << segment
          end
        end
        kept.join('/')
      end

      private :leading_scheme, :collapse_dot_segments

    end
  end
end