wmap 2.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (141) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +141 -0
  3. data/LICENSE.txt +15 -0
  4. data/README.rdoc +98 -0
  5. data/TODO +13 -0
  6. data/bin/deprime +21 -0
  7. data/bin/distrust +38 -0
  8. data/bin/googleBot +23 -0
  9. data/bin/prime +21 -0
  10. data/bin/refresh +26 -0
  11. data/bin/run_tests +16 -0
  12. data/bin/spiderBot +26 -0
  13. data/bin/trust +38 -0
  14. data/bin/updateAll +57 -0
  15. data/bin/wadd +25 -0
  16. data/bin/wadds +26 -0
  17. data/bin/wcheck +28 -0
  18. data/bin/wdel +25 -0
  19. data/bin/wdump +21 -0
  20. data/bin/wmap +151 -0
  21. data/bin/wscan +32 -0
  22. data/data/cidrs +2 -0
  23. data/data/deactivated_sites +1 -0
  24. data/data/domains +2 -0
  25. data/data/hosts +1 -0
  26. data/data/prime_hosts +1 -0
  27. data/data/sites +2 -0
  28. data/data/sub_domains +2 -0
  29. data/demos/bruter.rb +27 -0
  30. data/demos/dns_brutes.rb +28 -0
  31. data/demos/filter_cidr.rb +18 -0
  32. data/demos/filter_crawls.rb +5 -0
  33. data/demos/filter_domain.rb +25 -0
  34. data/demos/filter_geoip.rb +26 -0
  35. data/demos/filter_known_services.rb +59 -0
  36. data/demos/filter_netinfo.rb +23 -0
  37. data/demos/filter_prime.rb +25 -0
  38. data/demos/filter_profiler.rb +3 -0
  39. data/demos/filter_redirection.rb +19 -0
  40. data/demos/filter_site.rb +40 -0
  41. data/demos/filter_siteip.rb +31 -0
  42. data/demos/filter_status.rb +17 -0
  43. data/demos/filter_timestamp.rb +23 -0
  44. data/demos/filter_url.rb +19 -0
  45. data/demos/new_fnd.rb +66 -0
  46. data/demos/nmap_parser.pl +138 -0
  47. data/demos/site_format.rb +18 -0
  48. data/demos/whois_domain.rb +78 -0
  49. data/dicts/GeoIP.dat +0 -0
  50. data/dicts/GeoIPASNum.dat +0 -0
  51. data/dicts/GeoLiteCity.dat +0 -0
  52. data/dicts/ccsld.txt +2646 -0
  53. data/dicts/cctld.txt +243 -0
  54. data/dicts/gtld.txt +25 -0
  55. data/dicts/hostnames-dict.big +1402 -0
  56. data/dicts/hostnames-dict.txt +101 -0
  57. data/lib/wmap/cidr_tracker.rb +327 -0
  58. data/lib/wmap/dns_bruter.rb +308 -0
  59. data/lib/wmap/domain_tracker/sub_domain.rb +142 -0
  60. data/lib/wmap/domain_tracker.rb +342 -0
  61. data/lib/wmap/geoip_tracker.rb +72 -0
  62. data/lib/wmap/google_search_scraper.rb +177 -0
  63. data/lib/wmap/host_tracker/primary_host.rb +130 -0
  64. data/lib/wmap/host_tracker.rb +550 -0
  65. data/lib/wmap/network_profiler.rb +144 -0
  66. data/lib/wmap/port_scanner.rb +208 -0
  67. data/lib/wmap/site_tracker/deactivated_site.rb +85 -0
  68. data/lib/wmap/site_tracker.rb +937 -0
  69. data/lib/wmap/url_checker.rb +314 -0
  70. data/lib/wmap/url_crawler.rb +381 -0
  71. data/lib/wmap/utils/domain_root.rb +184 -0
  72. data/lib/wmap/utils/logger.rb +53 -0
  73. data/lib/wmap/utils/url_magic.rb +343 -0
  74. data/lib/wmap/utils/utils.rb +333 -0
  75. data/lib/wmap/whois.rb +76 -0
  76. data/lib/wmap.rb +227 -0
  77. data/logs/wmap.log +17 -0
  78. data/ruby_whois_patches/base_cocca2.rb +149 -0
  79. data/ruby_whois_patches/kero.yachay.pe.rb +120 -0
  80. data/ruby_whois_patches/whois.PublicDomainRegistry.com.rb +124 -0
  81. data/ruby_whois_patches/whois.above.com.rb +61 -0
  82. data/ruby_whois_patches/whois.adamsnames.tc.rb +107 -0
  83. data/ruby_whois_patches/whois.aeda.net.ae.rb +105 -0
  84. data/ruby_whois_patches/whois.ai.rb +112 -0
  85. data/ruby_whois_patches/whois.arnes.si.rb +121 -0
  86. data/ruby_whois_patches/whois.ascio.com.rb +91 -0
  87. data/ruby_whois_patches/whois.cnnic.cn.rb +123 -0
  88. data/ruby_whois_patches/whois.corporatedomains.com.rb +67 -0
  89. data/ruby_whois_patches/whois.crsnic.net.rb +108 -0
  90. data/ruby_whois_patches/whois.denic.de.rb +174 -0
  91. data/ruby_whois_patches/whois.dk-hostmaster.dk.rb +120 -0
  92. data/ruby_whois_patches/whois.dns.be.rb +134 -0
  93. data/ruby_whois_patches/whois.dns.lu.rb +129 -0
  94. data/ruby_whois_patches/whois.dns.pl.rb +150 -0
  95. data/ruby_whois_patches/whois.dns.pt.rb +119 -0
  96. data/ruby_whois_patches/whois.domain.kg.rb +126 -0
  97. data/ruby_whois_patches/whois.domainregistry.my.rb +123 -0
  98. data/ruby_whois_patches/whois.domreg.lt.rb +110 -0
  99. data/ruby_whois_patches/whois.dot.tk.rb +140 -0
  100. data/ruby_whois_patches/whois.hkirc.hk.rb +121 -0
  101. data/ruby_whois_patches/whois.isnic.is.rb +130 -0
  102. data/ruby_whois_patches/whois.je.rb +119 -0
  103. data/ruby_whois_patches/whois.jprs.jp.rb +137 -0
  104. data/ruby_whois_patches/whois.kenic.or.ke.rb +140 -0
  105. data/ruby_whois_patches/whois.markmonitor.com.rb +118 -0
  106. data/ruby_whois_patches/whois.melbourneit.com.rb +58 -0
  107. data/ruby_whois_patches/whois.nic.as.rb +96 -0
  108. data/ruby_whois_patches/whois.nic.at.rb +109 -0
  109. data/ruby_whois_patches/whois.nic.ch.rb +141 -0
  110. data/ruby_whois_patches/whois.nic.cl.rb +117 -0
  111. data/ruby_whois_patches/whois.nic.ec.rb +157 -0
  112. data/ruby_whois_patches/whois.nic.im.rb +120 -0
  113. data/ruby_whois_patches/whois.nic.it.rb +170 -0
  114. data/ruby_whois_patches/whois.nic.lv.rb +116 -0
  115. data/ruby_whois_patches/whois.nic.ly.rb +127 -0
  116. data/ruby_whois_patches/whois.nic.mu.rb +27 -0
  117. data/ruby_whois_patches/whois.nic.mx.rb +123 -0
  118. data/ruby_whois_patches/whois.nic.net.sa.rb +111 -0
  119. data/ruby_whois_patches/whois.nic.or.kr.rb +101 -0
  120. data/ruby_whois_patches/whois.nic.tel.rb +129 -0
  121. data/ruby_whois_patches/whois.nic.tr.rb +133 -0
  122. data/ruby_whois_patches/whois.nic.us.rb +129 -0
  123. data/ruby_whois_patches/whois.nic.ve.rb +135 -0
  124. data/ruby_whois_patches/whois.norid.no.rb +127 -0
  125. data/ruby_whois_patches/whois.pandi.or.id.rb +118 -0
  126. data/ruby_whois_patches/whois.psi-usa.info.rb +63 -0
  127. data/ruby_whois_patches/whois.registro.br.rb +109 -0
  128. data/ruby_whois_patches/whois.registrygate.com.rb +55 -0
  129. data/ruby_whois_patches/whois.rrpproxy.net.rb +61 -0
  130. data/ruby_whois_patches/whois.sgnic.sg.rb +130 -0
  131. data/ruby_whois_patches/whois.srs.net.nz.rb +166 -0
  132. data/ruby_whois_patches/whois.tucows.com.rb +70 -0
  133. data/ruby_whois_patches/whois.twnic.net.tw.rb +133 -0
  134. data/settings/discovery_ports +24 -0
  135. data/settings/google_keywords.txt +9 -0
  136. data/settings/google_locator.txt +23 -0
  137. data/test/domain_tracker_test.rb +31 -0
  138. data/test/utils_test.rb +168 -0
  139. data/version.txt +13 -0
  140. data/wmap.gemspec +49 -0
  141. metadata +202 -0
@@ -0,0 +1,184 @@
1
+ #--
2
+ # Wmap
3
+ #
4
+ # A pure Ruby library for Internet web application discovery and tracking.
5
+ #
6
+ # Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
7
+ #++
8
+
9
+
10
+ module Wmap
11
+ module Utils
12
+ # Module to validate and retrieve the top or second level domain name from a host-name (FQDN).
13
+ module DomainRoot
14
+ extend self
15
+ # Internet Domain Architecture Definitions
16
+ File_ccsld=File.dirname(__FILE__)+'/../../../dicts/ccsld.txt'
17
+ File_cctld=File.dirname(__FILE__)+'/../../../dicts/cctld.txt'
18
+ File_gtld=File.dirname(__FILE__)+'/../../../dicts/gtld.txt'
19
+
20
# Main function to retrieve the registered domain ('domain root' from the 'registrant'
# perspective) from a hostname, for example, "www.telegraph.co.uk" -> "telegraph.co.uk".
#
# The gTLD, ccTLD and ccSLD tables are loaded from the dictionary files on first use and
# cached in instance variables. The root is the recognised suffix (one or two labels)
# plus the single label in front of it.
#
# @param host [String] host name (FQDN); case and surrounding whitespace are ignored
# @return [String, nil] the domain root, or nil when the input is empty, has an empty
#   label, uses an unknown top level domain, consists of a suffix only (e.g. "co.uk"),
#   or when any error occurs while resolving it
def get_domain_root(host)
  puts "Retrieve the root domain for host: #{host}" if @verbose
  # Dictionaries are loaded once and memoized
  @gtld ||= file_2_hash(File_gtld)
  @cctld ||= file_2_hash(File_cctld)
  @ccsld ||= load_ccsld_from_file(File_ccsld)
  host = host.to_s.downcase.strip
  labels = host.split('.')
  if labels.empty? || labels.any?(&:empty?)
    puts "Error: empty record found. Please check your input and remove any empty line." if @verbose
    return nil
  end
  tld = labels.last
  sld = labels[-2]
  # Number of trailing labels forming the suffix; nil while the TLD is unrecognised
  suffix_size = nil
  # search the general top level domain list first, e.g. "example.uk.com"
  suffix_size = @cctld.key?(sld) ? 2 : 1 if @gtld.key?(tld)
  # search the country code top level domain list secondly
  if @cctld.key?(tld)
    # Exact label comparison: a substring/regex match would accept "o" for "co"
    two_level = @gtld.key?(sld) || Array(@ccsld[tld]).include?(sld)
    suffix_size = two_level ? 2 : 1
  end
  if suffix_size.nil?
    puts "#{host} - the top level domain is unknown. Please check out your record " if @verbose
    return nil
  end
  # A host made of the suffix alone has no registrant label. Negative array
  # indexes would silently wrap around here, so bail out explicitly.
  if labels.length <= suffix_size
    puts "#{host} - no registered domain label found in front of the domain suffix" if @verbose
    return nil
  end
  root_domain = labels.last(suffix_size + 1).join('.')
  puts "Domain root found: #{root_domain}" if @verbose
  root_domain
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
95
+ alias_method :get_root_domain, :get_domain_root
96
+ alias_method :root_domain, :get_domain_root
97
+ alias_method :domain_root, :get_domain_root
98
+ alias_method :host_2_domain, :get_domain_root
99
+
100
# 'setter' to parse and load the known country code second level domain table from the file
# data structure example: {"uk" =>["co","plc"],"za"=>["mil","nom","org"]}
#
# Only lines that start with whitespace followed by ".<word char>" are considered; the
# first whitespace-separated token of such a line is split on '.', and tokens with at
# least two labels (e.g. ".co.uk") contribute the second-to-last label under the last one.
#
# @param file_ccsld [String] path of the dictionary file (read as ISO-8859-1, transcoded to UTF-8)
# @return [Hash{String => Array<String>}, nil] the lookup table, or nil if the file cannot be read
def load_ccsld_from_file(file_ccsld)
  ccsld = {}
  puts "Loading known country code second level domain list from file: #{file_ccsld}" if @verbose
  # Block form guarantees the handle is closed even when parsing raises
  File.open(file_ccsld, 'r:ISO-8859-1:UTF-8') do |f| # transcoded magic bit
    f.each_line do |line|
      next unless line =~ /^\s+\.\w/
      entry = line.strip.downcase.split(' ')[0].split('.')
      # ".co.uk" splits into ["", "co", "uk"]; shorter entries carry no second level label
      next unless entry.length > 2
      (ccsld[entry.last] ||= []) << entry[-2]
    end
  end
  ccsld
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
126
+
127
# Test a host string to see if it's a valid Internet root domain, i.e. whether the
# trimmed, lower-cased input is equal to the domain root resolved for it.
#
# @param domain [String] candidate domain name
# @return [Boolean] true when the input is its own domain root; false otherwise or on error
def is_domain_root?(domain)
  puts "Validate the domain name is valid: #{domain}" if @verbose
  begin
    domain = domain.strip.downcase
    resolved_root = get_domain_root(domain)
    domain == resolved_root
  rescue => ee
    puts "Exception on method #{__method__} for #{domain}: #{ee}" if @verbose
    false
  end
end
138
+ alias_method :is_root_domain?, :is_domain_root?
139
+ alias_method :is_domain?, :is_domain_root?
140
+ alias_method :is_root?, :is_domain_root?
141
+
142
# Function to retrieve the sub-domain from a Fully Qualified Domain Name(FQDN), for example,
# "www.secure.telegraph.co.uk" -> "secure.telegraph.co.uk"
#
# A sub-domain is only reported when the host has at least two labels in front of its
# domain root; the label directly in front of the root names the sub-domain.
#
# @param host [String] host name (FQDN)
# @return [String, nil] the sub-domain, or nil when the host has no domain root, is not
#   deep enough to sit inside a sub-domain, or an error occurs
def get_sub_domain(host)
  puts "Retrieve sub-domain from host: #{host}" if @verbose
  host = host.to_s.strip.downcase
  domain = get_domain_root(host)
  return nil if domain.nil?
  record_h = host.split(".")
  record_d = domain.split(".")
  depth = record_h.length - record_d.length
  return nil unless depth >= 2
  subdomain = record_h[depth - 1] + "." + domain
  puts "Sub domain found: #{subdomain}" if @verbose
  subdomain
rescue => ee
  # StandardError only: Interrupt / SystemExit must not be swallowed here
  puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
  nil
end
163
+ alias_method :get_subdomain, :get_sub_domain
164
+
165
# Function to print instance variable - General top level domain list
# Writes the cached @gtld table (as loaded so far) to standard output.
def print_gtld
  $stdout.puts(@gtld)
end
169
+
170
# Function to print instance variable - Country code top-level domain list
# Writes the cached @cctld table (as loaded so far) to standard output.
def print_cctld
  $stdout.puts(@cctld)
end
174
+
175
# Function to print instance variable - Country code second-level domain list
# Writes the cached @ccsld table (as loaded so far) to standard output.
def print_ccsld
  $stdout.puts(@ccsld)
end
179
+
180
+ private :load_ccsld_from_file
181
+
182
+ end
183
+ end
184
+ end
@@ -0,0 +1,53 @@
1
+ #--
2
+ # Wmap
3
+ #
4
+ # A pure Ruby library for Internet web application discovery and tracking.
5
+ #
6
+ # Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
7
+ #++
8
+
9
+
10
+ module Wmap
11
+ module Utils
12
+ # Module to log debugging and other messages
13
+ module Logger
14
+ extend self
15
# Append information into the log file for the trouble-shooting purpose
#
# Arrays are written as a timestamped header followed by one line per element, hashes
# as a header followed by one line per value, strings as a single timestamped line.
# Objects of any other class are not written (the file is still opened in append mode).
#
# @param obj [Array, Hash, String, nil] the content to log
# @param agent [#to_s] label identifying who is logging
# @param file [String] path of the log file, opened in append mode
# @return [Boolean] false when obj is nil or the file cannot be written, true otherwise
def wlog(obj, agent, file)
  puts "Writing #{obj} into log file: #{file}" if @verbose
  return false if obj.nil?
  timestamp = Time.now
  # Block form closes the handle even if a write raises; no shared state is kept
  File.open(file, 'a') do |log|
    case obj
    when Array
      log.write "#{timestamp}: #{agent}: \n"
      obj.each { |x| log.write " #{x}\n" }
      puts "The list is successfully saved into the log file: #{file} " if @verbose
    when Hash
      log.write "#{timestamp}: #{agent}: \n"
      obj.each_value { |value| log.write " #{value}\n" }
      puts "The hash is successfully saved into the log file: #{file} " if @verbose
    when String
      log.write "#{timestamp}: #{agent}: #{obj}\n"
      puts "The string is successfully saved into the log file: #{file} " if @verbose
    else
      # nothing is written for other object types
      puts "Un-handled exception on: #{obj}" if @verbose
    end
  end
  true
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  false
end
50
+
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,343 @@
1
+ #--
2
+ # Wmap
3
+ #
4
+ # A pure Ruby library for Internet web application discovery and tracking.
5
+ #
6
+ # Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
7
+ #++
8
+ # require "uri"
9
+
10
+ module Wmap
11
+ module Utils
12
+ module UrlMagic
13
+ extend self
14
+
15
# Simple sanity check on a 'claimed' URL string: it must contain "http://" or "https://"
# followed by a host part that passes either the IP or the FQDN check.
#
# @param url [String] the string to inspect
# @return [Boolean] true when the host part looks like an IP or FQDN; false otherwise or on error
def is_url?(url)
  puts "Validate the URL format is valid: #{url}" if @verbose
  begin
    unless url =~ /(http|https)\:\/\/((.)+)/i
      puts "Unknown URL format: #{url}" if @verbose
      return false
    end
    # Second capture group: everything after "://"; keep the part before the first '/'
    authority = Regexp.last_match(2).split('/')[0]
    # Drop an optional ":port" suffix
    host = authority.split(':')[0]
    (is_ip?(host) || is_fqdn?(host)) ? true : false
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    false
  end
end
36
+
37
# Simple sanity check on a 'claimed' SSL enabled URL string
#
# The decision is based on the URL scheme only (the first "http://" or "https://" found
# in the string), so "https" appearing in a path or query does not count.
#
# @param url [String] the URL to inspect
# @return [Boolean] true when the string passes is_url? and its scheme is https;
#   false otherwise or on error
def is_ssl?(url)
  puts "Validate if SSL is enabled on: #{url}" if @verbose
  url = url.strip
  return false unless is_url?(url)
  scheme = url[%r{(https?)://}i, 1]
  scheme.to_s.casecmp('https').zero?
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  false
end
52
+ alias_method :is_https?, :is_ssl?
53
+
54
# Simple sanity check on a 'claimed' web site base string: the trimmed, lower-cased
# input must pass is_url? and be identical to the site base derived from it by url_2_site.
#
# @param url [String] the string to inspect
# @return [Boolean] true when the string is already in site base format; false otherwise,
#   including when an error occurs
def is_site?(url)
  puts "Validate the website string format for: #{url}" if @verbose
  url = url.strip.downcase
  unless is_url?(url)
    puts "Unknown site format: #{url}" if @verbose
    return false
  end
  url == url_2_site(url)
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  # Predicate: report failure as false, like the sibling is_url? / is_ssl? checks
  false
end
74
+
75
+ # Check if URL is an absolute one
76
+ #def is_absolute?(url)
77
+ # puts "Validate if the url is absolute: #{url}" if @verbose
78
+ # begin
79
+ # url.strip!
80
+ # URI.absolute?(url)
81
+ # rescue => ee
82
+ # puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
83
+ # return false
84
+ # end
85
+ #end
86
+
87
+ # Check if URL is relative one
88
+ #def is_relative?(url)
89
+ # begin
90
+ # url.strip!
91
+ # !is_absolute?(url)
92
+ # rescue => ee
93
+ # puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
94
+ # return false
95
+ # end
96
+ #end
97
+
98
# Extract the web server host's Fully Qualified Domain Name (FQDN) from the url.
# For example: "https://login.yahoo.com/email/help" -> "login.yahoo.com"
#
# The "http://" / "https://" markers are removed, the remainder is cut at the first
# '/', '?' or '#', and an optional ":port" suffix is dropped.
#
# @param url [String] the URL to inspect
# @return [String, nil] the lower-cased host part, or nil when none can be found or on error
def url_2_host(url)
  stripped = url.strip.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
  # Cut at query / fragment too, so "a.com?x=1" does not leak into the host
  authority = stripped.split(%r{[/?#]})[0]
  if authority.nil?
    puts "Error process url: #{url}" if @verbose
    return nil
  end
  authority.split(':')[0]
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
115
+
116
# Extract web service port from the url. For example: "https://login.yahoo.com/email/help" -> 443
#
# An explicit ":port" in the host part wins; otherwise the port follows the URL scheme
# (443 for https, 80 for anything else). Only the scheme marker and the host part are
# consulted, so "https" or ':' appearing in the path or query has no effect.
#
# @param url [String] the URL to inspect
# @return [Integer, nil] the service port, or nil when the URL has no host part or on error
def url_2_port(url)
  puts "Retrieve service port on URL: #{url}" if @verbose
  lowered = url.downcase
  # First scheme marker in the string decides whether SSL is assumed
  scheme = lowered[%r{(https?)://}, 1]
  authority = lowered.gsub(/(http:\/\/|https:\/\/)/, "").split(%r{[/?#]})[0]
  host_and_port = authority.split(':')
  port =
    if host_and_port.length == 2
      host_and_port[1].to_i
    elsif scheme == 'https'
      443
    else
      80
    end
  puts "The service port: #{port}" if @verbose
  port
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
139
+
140
# Extract site in (host:port) format from a url:
# "https://login.yahoo.com:8443/email/help" -> "https://login.yahoo.com:8443/"
#
# The result is lower-cased and always ends with '/'. The port is omitted when it is
# the default for the resulting scheme (80 for http, 443 for https). A URL on port 443
# is always reported with the https scheme. Anything in front of the first "http" and
# any query or fragment glued to the host is discarded.
#
# @param url [String] the URL to convert
# @return [String, nil] the site base, or nil when the string cannot be converted
def url_2_site(url)
  puts "Retrieve the web site base for url: #{url}" if @verbose
  normalized = url.strip.downcase.sub(/^(.*?)http/i, 'http')
  prot, remainder = normalized.split(%r{//}, 2)
  scheme = prot.to_s[/\Ahttps?(?=:\z)/]
  if scheme.nil? || remainder.nil?
    raise "Problem encountered on method url_2_site: Unable to convert #{url}"
  end
  # step 1, extract the host:port pair from the url. Query and fragment are cut first,
  # e.g. "https://letmechoose.barclays.co.uk?source=btorganic/" => "letmechoose.barclays.co.uk"
  authority = remainder.split(%r{[/?#]})[0].to_s
  host, port_text = authority.split(':', 2)
  if host.nil? || host.empty?
    raise "Problem encountered on method url_2_site: Unable to convert #{url}"
  end
  port =
    if port_text.nil? || port_text.empty?
      scheme == 'https' ? 443 : 80
    else
      port_text.to_i
    end
  # step 2, put the host:port pair back to the normal site format
  scheme = 'https' if port == 443
  default_port = scheme == 'https' ? 443 : 80
  # Keep an explicit non-default port (e.g. https on :80) so the endpoint is not changed
  port == default_port ? "#{scheme}://#{host}/" : "#{scheme}://#{host}:#{port}/"
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
192
+
193
# Return the relative path component of the URL (everything after the host[:port] part,
# including any query string or fragment). i.e. http://www.yahoo.com/login.html => /login.html
#
# The argument is not modified. The original letter case of the path is preserved.
#
# @param url [String] an http(s) URL
# @return [String, nil] the path component ("" when the URL has none), or nil when the
#   string contains no "http://" / "https://" marker or on error
def url_2_path(url)
  # Scheme plus host[:port]; the rest of the string is the path component
  located = url.strip.match(%r{https?://[^/?#]*}i)
  if located.nil?
    puts "Exception on method #{__method__} for #{url}: unknown URL format" if @verbose
    return nil
  end
  located.post_match
rescue => ee
  puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
  nil
end
207
+
208
# Test if the two URLs are both under the same domain: http://login.yahoo.com, http://mail.yahoo.com => true
#
# Both hosts must resolve to a domain root; two URLs whose roots cannot be determined
# are never considered to be on the same domain.
#
# @param url1 [String] first URL
# @param url2 [String] second URL
# @return [Boolean] true when both URLs resolve to the same domain root; false otherwise or on error
def urls_on_same_domain?(url1, url2)
  puts "Determine if two URLs under the same domain: #{url1}, #{url2}" if @verbose
  root1 = get_domain_root(url_2_host(url1))
  root2 = get_domain_root(url_2_host(url2))
  # nil == nil must not count as a match
  return false if root1.nil? || root2.nil?
  root1 == root2
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  false
end
220
+
221
# Input is host and open port, output is a URL for valid http response code or nil
#
# Port 80 is probed as http only, port 443 as https only; any other port is probed as
# http first and then as https. The first candidate for which the checker does not
# report 10000 is returned.
# NOTE(review): 10000 appears to be Wmap::UrlChecker's "no valid response" code - confirm
# against Wmap::UrlChecker#response_code.
#
# @param host [String] host name or IP
# @param port [Integer, String] the open port, 80 by default
# @return [String, nil] the detected URL, or nil when no http(s) service answers or on error
def host_2_url(host, port = 80)
  puts "Perform simple http(s) service detection on host #{host}, port #{port}" if @verbose
  host = host.strip
  candidates =
    case port.to_i
    when 80 then ["http://" + host + "/"]
    when 443 then ["https://" + host + "/"]
    else
      ["http://" + host + ":" + port.to_s + "/", "https://" + host + ":" + port.to_s + "/"]
    end
  puts "Please ensure your internet connection is active before running this method: #{__method__}" if @verbose
  checker = Wmap::UrlChecker.new
  # Only real candidates are probed; there is no second URL for ports 80 and 443
  found = candidates.find { |candidate| checker.response_code(candidate) != 10000 }
  if found
    puts "Found URL: #{found}" if @verbose
  else
    puts "No http(s) service found on: #{host}:#{port}" if @verbose
  end
  found
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
251
+
252
# Convert a relative URL to an absolute one. For example, from URL base 'http://games.yahoo.com/'
# and file path '/game/the-magic-snowman-flash.html' => 'http://games.yahoo.com/game/the-magic-snowman-flash.html'
#
# A link that already starts with "http://" or "https://" is returned unchanged (trimmed).
# A link starting with '/' is resolved against the site base, anything else against the
# location of the base URL.
#
# @param base [String] the known (absolute) URL
# @param relative_url [String] the link found on that page
# @return [String, nil] the absolute URL, or nil on error
def make_absolute(base, relative_url)
  puts "Determine and return the absolute URL:\n Base: #{base}, Relative: #{relative_url} " if @verbose
  absolute_url =
    if relative_url =~ %r{\A\s*https?://}i
      # already absolute: gluing it onto the base would produce a bogus URL
      relative_url.strip
    elsif relative_url =~ /^\//
      create_absolute_url_from_base(base, relative_url)
    else
      create_absolute_url_from_context(base, relative_url)
    end
  puts "Found absolute URL: #{absolute_url}" if @verbose
  absolute_url
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
269
+
270
# Create / construct the absolute URL from a known URL and relative file path. For example,
# 'http://images.search.yahoo.com/images' + '/search/images?p=raiders' => 'http://images.search.yahoo.com/search/images?p=raiders'
#
# @param potential_base [String] a known URL; only its site base (via url_2_site) is used
# @param relative_url [String] the path to append, expected to start with '/'
# @return [String, nil] the combined URL, or nil on error
def create_absolute_url_from_base(potential_base, relative_url)
  begin
    # Site base without its trailing '/', since the relative path brings its own
    site_root = url_2_site(potential_base).strip.chop
    puts "Found absolute URL: #{site_root+relative_url}" if @verbose
    site_root + relative_url
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    nil
  end
end
282
+
283
# Construct the absolute URL by comparing a known URL and the relative file path
#
# Resolution rules (neither argument is modified):
# * base ends with '/'                      => base + relative
# * base is a bare site ("http://host")     => base + '/' + relative
# * last base segment contains a '.' (file) => the segment is replaced by relative
# * otherwise (directory-like last segment) => base + '/' + relative
#
# @param potential_base [String] the known URL the link was found on
# @param relative_url [String] the relative link; one leading '/' is ignored
# @return [String, nil] the absolute URL, or nil when the base contains no '/' or on error
def create_absolute_url_from_context(potential_base, relative_url)
  puts "Determine the absolute URL from context:\n Known base: #{potential_base}, Relative path: #{relative_url}" if @verbose
  base = potential_base.strip
  # make relative URL naked by removing the beginning '/' (on a copy, not in place)
  relative = relative_url.strip.sub(%r{\A/}, '')
  last_index_of_slash = base.rindex('/')
  if last_index_of_slash.nil?
    puts "Exception on method #{__method__}: no '/' found in base #{potential_base}" if @verbose
    return nil
  end
  absolute_url =
    if base.end_with?('/')
      base + relative
    elsif base[0, last_index_of_slash + 1].end_with?('://')
      # the only slashes belong to the scheme separator: the base is a bare site
      base + '/' + relative
    elsif base[(last_index_of_slash + 1)..-1].include?('.')
      # last segment looks like a file name: swap it for the relative path
      base[0, last_index_of_slash + 1] + relative
    else
      # last segment looks like a directory: descend into it
      base + '/' + relative
    end
  puts "Found absolute URL: #{absolute_url}" if @verbose
  absolute_url
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
312
+
313
# Normalize the URL to a consistent manner in order to determine if a link has been visited or cached before
# See http://en.wikipedia.org/wiki/URL_normalization for more explanation
#
# Steps (the argument itself is not modified):
# * scheme and host are normalized through url_2_site, i.e. 'HTTP://www.Example.com/' => 'http://www.example.com/'
# * a trailing dot after the hostname is removed, i.e. 'http://www.yahoo.com./' => 'http://www.yahoo.com/'
# * dot-segments in the path are resolved, i.e.
#   'http://www.example.com/../a/b/../c/./d.html' => 'http://www.example.com/a/c/d.html'
# * query string and fragment are carried over untouched
#
# @param url [String] the URL to normalize
# @return [String] the normalized URL; the input is returned as-is when it cannot be processed
def normalize_url(url)
  candidate = url.strip
  # Normalize the base
  base = url_2_site(candidate).sub(/\.\/$/, '/')
  # Everything after scheme://host[:port]
  remainder = candidate.sub(%r{\A.*?https?://[^/?#]*}i, '')
  # Dot-segments are only meaningful in the path, never in the query or fragment
  cut = remainder.index(/[?#]/)
  path = cut ? remainder[0, cut] : remainder
  tail = cut ? remainder[cut..-1] : ''
  # The base already ends with '/', so leading empty segments are dropped
  segments = path.split('/', -1).drop_while(&:empty?)
  kept = []
  segments.each do |segment|
    case segment
    when '.'
      # current directory: contributes nothing
    when '..'
      kept.pop # parent directory: discard the previous segment, never climb above the root
    else
      kept << segment
    end
  end
  # A trailing "." or ".." refers to a directory, so keep the trailing '/'
  kept << '' if %w[. ..].include?(segments.last) && !kept.empty? && kept.last != ''
  base + kept.join('/') + tail
rescue => ee
  puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
  url
end
340
+
341
+ end
342
+ end
343
+ end