wmap 2.5.2 → 2.5.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -31,6 +31,7 @@ module Wmap
31
31
  File.write(file2, "") unless File.exist?(@tag_file)
32
32
  # load the known tag store
33
33
  @tag_store=load_tag_from_file(file2)
34
+ @landings = Hash.new # cache landing page to reduce redundant browsing
34
35
  end
35
36
 
36
37
 
@@ -62,7 +63,7 @@ module Wmap
62
63
  end
63
64
 
64
65
  # load the known tag store cache into an instance variable
65
- def load_tag_from_file (file, lc=true)
66
+ def load_tag_from_file (file, lc=false)
66
67
  puts "Loading tag data file: #{file}" if @verbose
67
68
  data_store=Hash.new
68
69
  f = File.open(file, 'r')
@@ -107,8 +108,8 @@ module Wmap
107
108
  # add tag entries (from the sitetracker list)
108
109
  def refresh (num=@max_parallel,use_cache=true)
109
110
  puts "Add entries to the local cache table from site tracker: " if @verbose
110
- results=Hash.new
111
- tags=Wmap::SiteTracker.instance.known_sites.keys
111
+ results = Hash.new
112
+ tags = Wmap::SiteTracker.instance.known_sites.keys
112
113
  if tags.size > 0
113
114
  Parallel.map(tags, :in_processes => num) { |target|
114
115
  check_adware(target,use_cache)
@@ -121,12 +122,12 @@ module Wmap
121
122
  end
122
123
  @tag_store.merge!(results)
123
124
  puts "Done loading entries."
124
- tags=nil
125
+ tags = nil
125
126
  return results
126
127
  else
127
128
  puts "Error: no entry is loaded. Please check your list and try again."
128
129
  end
129
- tags=nil
130
+ tags = nil
130
131
  return results
131
132
  rescue => ee
132
133
  puts "Exception on method #{__method__}: #{ee}" if @verbose
@@ -140,6 +141,10 @@ module Wmap
140
141
  puts "Site entry already exist. Skipping: #{site}" if @verbose
141
142
  else
142
143
  url = fast_landing(site)
144
+ if @landings.key?(url)
145
+ record[site] = @landings[url]
146
+ return record
147
+ end
143
148
  tags = find_tags(url)
144
149
  return record if tags.size==0
145
150
  tag_vers=tags.map do |tag|
@@ -149,7 +154,8 @@ module Wmap
149
154
  Base64.urlsafe_encode64(get_desc(url,tag))
150
155
  end
151
156
  if tags
152
- record[site]=[url, tags.join("|"), tag_vers.join("|"), tag_descs.join("|")]
157
+ record[site] = [url, tags.join("|"), tag_vers.join("|"), tag_descs.join("|")]
158
+ @landings[url] = [url, tags.join("|"), tag_vers.join("|"), tag_descs.join("|")]
153
159
  @tag_store.merge!(record)
154
160
  puts "Tag entry loaded: #{record}" if @verbose
155
161
  else
@@ -189,14 +195,13 @@ module Wmap
189
195
  def find_tags(url)
190
196
  puts "Search and return tags within the url payload: #{url}" if @verbose
191
197
  tag_list = []
192
- doc = Nokogiri::HTML(open(url))
198
+ doc = open_page(url)
193
199
  doc.text.each_line do |line|
194
200
  my_line = line.downcase
195
201
  @tag_signatures.keys.map do |tag|
196
202
  tag_list.push(tag) if my_line.include?(tag)
197
203
  end
198
204
  end
199
- doc = nil
200
205
  return tag_list
201
206
  rescue => ee
202
207
  puts "Exception on method #{__method__}: #{ee}" if @verbose
@@ -207,7 +212,7 @@ module Wmap
207
212
  def get_ver(url,tag)
208
213
  puts "Search and return tag version within the url payload: #{url}, #{tag}" if @verbose
209
214
  tag_ver=""
210
- doc = Nokogiri::HTML(open(url))
215
+ doc = open_page(url)
211
216
  case tag
212
217
  when "utag.js" # sample: ...,"code_release_version":"cb20190312032612",...
213
218
  doc.text.each_line do |line|
@@ -219,16 +224,33 @@ module Wmap
219
224
  break
220
225
  end
221
226
  end
222
- when "analytics.js" # sample: ga('create', 'UA-19175804-2', 'knopfdoubleday.com');
227
+ when "analytics.js" # sample #1: ga('create', 'UA-19175804-2', 'knopfdoubleday.com');
223
228
  doc.text.each_line do |line|
224
229
  my_line = line.downcase
225
- if my_line.include?("ga(") && my_line.include?("create")
230
+ if my_line.include?("ga") && my_line.include?("create") #sample #2: __gaTracker('create', 'UA-121313929-1', 'auto');
226
231
  puts "Extract tag version from line: #{my_line}" if @verbose
227
232
  m = my_line.match(/[\'|\"]create[\'|\"]\s*\,\s*[\'|\"](?<ver>\w+\-\d+\-\d+)[\'|\"]\s*\,/)
228
233
  tag_ver = m[:ver]
229
234
  break
230
235
  end
231
236
  end
237
+ when "ga.js"
238
+ doc.text.each_line do |line|
239
+ my_line = line.downcase
240
+ puts my_line if @verbose
241
+ if my_line.include?("push") && my_line.include?("_setaccount") # # sample #1: _gaq.push(['_setAccount', 'UA-13205363-65']);
242
+ m = my_line.match(/[\'|\"]\_setaccount[\'|\"]\s*\,\s*[\'|\"](?<ver>\w+\-\d+\-\d+)[\'|\"]/)
243
+ tag_ver = m[:ver]
244
+ break
245
+ end
246
+ if my_line.include?("_gettracker") # sample #2: var pageTracker = _gat._getTracker("UA-12487327-1");
247
+ puts "Extract tag version from line: #{my_line}" if @verbose
248
+ m = my_line.match(/\_gettracker\s*\(\s*[\'|\"](?<ver>\w+\-\d+\-\d+)[\'|\"]/)
249
+ tag_ver = m[:ver]
250
+ break
251
+ end
252
+
253
+ end
232
254
  when "all.js" # sample: appId : '749936668352954',
233
255
  doc.text.each_line do |line|
234
256
  my_line = line.downcase
@@ -241,11 +263,11 @@ module Wmap
241
263
  end
242
264
 
243
265
  else
244
- puts "Unknown Adware Tag: #{tag}"
266
+ puts "Don't know how to locate Adware Tag version: #{tag}"
245
267
  # do nothing
246
268
  end
247
269
  doc = nil
248
- return tag_ver
270
+ return tag_ver.upcase
249
271
  rescue => ee
250
272
  puts "Exception on method #{__method__}: #{ee}: #{url} : #{tag}" if @verbose
251
273
  return tag_ver
@@ -257,9 +279,9 @@ module Wmap
257
279
  recording=false
258
280
  tag_found=false
259
281
  tag_desc=""
260
- doc = Nokogiri::HTML(open(url))
282
+ doc = open_page(url)
261
283
  doc.search('script').map do |script|
262
- if script.text.include?(tag)
284
+ if script.text.include?(tag) && script.text.length < 65535
263
285
  return script.text
264
286
  end
265
287
  end
@@ -21,123 +21,148 @@ module Wmap
21
21
  # Main function to retrieve the registered domain ('domain root' from the 'registrant' perspective) from a hostname, for example, "www.telegraph.co.uk" -> "telegraph.co.uk"
22
22
  def get_domain_root (host)
23
23
  puts "Retrieve the root domain for host: #{host}" if @verbose
24
- begin
25
- # Comnplete Top Level Domain List - loading once
26
- @tlds=file_2_hash(File_tld) if @tlds.nil?
27
- # Generic Top Level Domain List - loading once
28
- @gtld=file_2_hash(File_gtld) if @gtld.nil?
29
- # Country code top-level domain list - loading once
30
- @cctld=file_2_hash(File_cctld) if @cctld.nil?
31
- # Country code second level domain - loading once
32
- @ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil?
33
-
34
- if host.strip.nil?
35
- puts "Error: empty record found. Please check your input and remove any empty line." if @verbose
36
- return nil
37
- else
38
- host=host.downcase.strip
39
- end
40
- found_tld=false
41
- found_cctld=false
42
- # search the top level domain list first
43
- root_domain=""
44
- dn=host.split(".")
45
- if @tlds.key?(dn.last)
46
- cc_found=false
47
- if @cctld.key?(dn[dn.length-2])
48
- cc_found=true
49
- end
50
- if cc_found
51
- root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
52
- else
53
- root_domain=dn[dn.length-2] + "." + dn.last
54
- end
55
- found_tld=true
56
- end
57
- # search the country code top level domain list secondly
58
- if @cctld.key?(dn.last)
59
- found=false
60
- # reverse search of general top level domain
61
- if @gtld.key?(dn[dn.length-2])
62
- found=true
63
- end
64
- # search country code second level domain list
65
- if @ccsld.key?(dn.last)
66
- @ccsld[dn.last].each do |v|
67
- if ( v =~ /#{dn[dn.length-2]}/i )
68
- found=true
69
- break
70
- end
71
- end
72
- # 1/8/2015: additional logic to handle invalid ccsld string: reserved gtld string
73
- #unless found
74
- # if @gtld.key?(dn[dn.length-2])
75
- # puts "Invalid ccsld: #{dn[dn.length-2]} for host: #{host}"
76
- # return nil
77
- # end
78
- #end
79
- end
80
- if found
81
- root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
82
- else
83
- root_domain=dn[dn.length-2] + "." + dn.last
84
- end
85
- found_cctld=true
86
- end
87
- unless (found_tld or found_cctld)
88
- puts "#{host} - the top level domain is unknown. Please check out your record #{root_domain} " if @verbose
89
- return nil
90
- else
91
- puts "Domain root found: #{root_domain}" if @verbose
92
- return root_domain
93
- end
94
- rescue => ee
95
- puts "Exception on method #{__method__}: #{ee}" if @verbose
24
+ if host.strip.nil?
25
+ puts "Error: empty record found. Please check your input and remove any empty line." if @verbose
96
26
  return nil
27
+ else
28
+ host=host.downcase.strip
97
29
  end
30
+ # First order - search country code second level domain list
31
+ root_domain = get_domain_root_by_ccsld(host)
32
+ if root_domain.nil?
33
+ # Second order - search the country code top level domain list
34
+ root_domain = get_domain_root_by_cctld(host)
35
+ if root_domain.nil?
36
+ # Third order - search top level domain list
37
+ root_domain = get_domain_root_by_tlds(host)
38
+ if root_domain.nil?
39
+ # do nothing - no further search
40
+ else
41
+ return root_domain
42
+ end
43
+ else
44
+ return root_domain
45
+ end
46
+ else
47
+ return root_domain
48
+ end
49
+ puts "#{host} - the top level domain is unknown. Please check out your record #{root_domain} " if @verbose
50
+ return nil
51
+ #rescue => ee
52
+ # puts "Exception on method #{__method__}: #{ee}" if @verbose
53
+ # return nil
98
54
  end
99
55
  alias_method :get_root_domain, :get_domain_root
100
56
  alias_method :root_domain, :get_domain_root
101
57
  alias_method :domain_root, :get_domain_root
102
58
  alias_method :host_2_domain, :get_domain_root
103
59
 
60
+ # get domain root by lookup Country Code Second Level Domain list
61
+ def get_domain_root_by_ccsld(host)
62
+ puts "First order search - domain root lookup by Country Code Second Level Domain list ..." if @verbose
63
+ root_domain = nil
64
+ dn = host.split(".")
65
+ # Country code second level domain - loading once
66
+ @ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil?
67
+ # search country code second level domain list
68
+ if @ccsld.key?(dn.last)
69
+ @ccsld[dn.last].each do |v|
70
+ if ( v =~ /#{dn[dn.length-2]}/i )
71
+ return dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
72
+ end
73
+ end
74
+ end
75
+ return root_domain
76
+ #rescue => ee
77
+ # puts "Exception on method #{__method__}: #{ee}" if @verbose
78
+ # return nil
79
+ end
80
+
81
+ # get domain root by lookup Country Code Top Level Domain list
82
+ def get_domain_root_by_cctld(host)
83
+ puts "Second order search - domain root lookup by Country Code Top Level Domain list ..." if @verbose
84
+ root_domain = nil
85
+ dn = host.split(".")
86
+ # Country code top-level domain list - loading once
87
+ @cctld=file_2_hash(File_cctld) if @cctld.nil?
88
+ # Generic Top Level Domain List - loading once
89
+ @gtld=file_2_hash(File_gtld) if @gtld.nil?
90
+ # Country code second level domain - loading once
91
+ @ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil?
92
+ # search the country code top level domain list
93
+ if @cctld.key?(dn.last)
94
+ # reverse search of general top level domain
95
+ if @gtld.key?(dn[dn.length-2])
96
+ root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
97
+ else
98
+ root_domain=dn[dn.length-2] + "." + dn.last
99
+ end
100
+ end
101
+ return root_domain
102
+ #rescue => ee
103
+ # puts "Exception on method #{__method__}: #{ee}" if @verbose
104
+ # return nil
105
+ end
106
+
107
+ # get domain root by lookup Top Level Domain list
108
+ def get_domain_root_by_tlds(host)
109
+ puts "Third order search - domain root lookup by Top Level Domain list ..." if @verbose
110
+ root_domain = nil
111
+ dn = host.split(".")
112
+ # Complete Top Level Domain List - loading once
113
+ @tlds=file_2_hash(File_tld) if @tlds.nil?
114
+ # Country code top-level domain list - loading once
115
+ @cctld=file_2_hash(File_cctld) if @cctld.nil?
116
+ cc_found=false
117
+ if @tlds.key?(dn.last)
118
+ if @cctld.key?(dn[dn.length-2])
119
+ cc_found=true
120
+ end
121
+ if cc_found
122
+ root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
123
+ else
124
+ root_domain=dn[dn.length-2] + "." + dn.last
125
+ end
126
+ end
127
+ return root_domain
128
+ #rescue => ee
129
+ # puts "Exception on method #{__method__}: #{ee}" if @verbose
130
+ # return nil
131
+ end
132
+
104
133
  # 'setter' to parse and load the known country code second level domain table from the file
105
134
  # data structure example: {"uk" =>["co","plc"],"za"=>["mil","nom","org"]}
106
135
  def load_ccsld_from_file (file_ccsld)
107
- begin
108
- ccsld=Hash.new
109
- puts "Loading known country code second level domain list from file: #{file_ccsld}" if @verbose
110
- f=File.open(file_ccsld, 'r:ISO-8859-1:UTF-8') # transcoded magic bit
111
- f.each do |line|
112
- next unless line =~ /^\s+\.\w/
113
- line=line.chomp.strip.downcase
114
- entry=line.split(' ')[0].split('.')
115
- if entry.length > 2
116
- key=entry.last
117
- ccsld[key] = Array.new if not ccsld.key?(key)
118
- val=entry[entry.length-2]
119
- #puts "Loading country code second level domain table with - Country code: #{key}, Second level domain: #{val}" if @verbose
120
- ccsld[key].push(val) unless key.nil?
121
- end
136
+ ccsld=Hash.new
137
+ puts "Loading known country code second level domain list from file: #{file_ccsld}" if @verbose
138
+ f=File.open(file_ccsld, 'r:ISO-8859-1:UTF-8') # transcoded magic bit
139
+ f.each do |line|
140
+ next unless line =~ /^\s+\.\w/
141
+ line=line.chomp.strip.downcase
142
+ entry=line.split(' ')[0].split('.')
143
+ if entry.length > 2
144
+ key=entry.last
145
+ ccsld[key] = Array.new if not ccsld.key?(key)
146
+ val=entry[entry.length-2]
147
+ #puts "Loading country code second level domain table with - Country code: #{key}, Second level domain: #{val}" if @verbose
148
+ ccsld[key].push(val) unless key.nil?
122
149
  end
123
- f.close
124
- # Sort the blocks once in descendant order once for better performance
125
- return ccsld
126
- rescue => ee
127
- puts "Exception on method #{__method__}: #{ee}" if @verbose
128
150
  end
151
+ f.close
152
+ # Sort the blocks once in descendant order once for better performance
153
+ return ccsld
154
+ rescue => ee
155
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
129
156
  end
130
157
 
131
158
  # Test a host string to see if it's a valid Internet root domain
132
159
  def is_domain_root? (domain)
133
- puts "Validate the domain name is valid: #{domain}" if @verbose
134
- begin
135
- domain=domain.strip.downcase
136
- return domain == get_domain_root(domain)
137
- rescue => ee
138
- puts "Exception on method #{__method__} for #{domain}: #{ee}" if @verbose
139
- return false
140
- end
160
+ puts "Validate the domain name is valid: #{domain}" if @verbose
161
+ domain=domain.strip.downcase
162
+ return domain == get_domain_root(domain)
163
+ rescue => ee
164
+ puts "Exception on method #{__method__} for #{domain}: #{ee}" if @verbose
165
+ return false
141
166
  end
142
167
  alias_method :is_root_domain?, :is_domain_root?
143
168
  alias_method :is_domain?, :is_domain_root?
@@ -146,39 +171,40 @@ module Wmap
146
171
  # Function to retrieve the sub-domain from a Fully Qualified Domain Name(FQDN), for example, "www.secure.telegraph.co.uk" -> "secure.telegraph.co.uk"
147
172
  def get_sub_domain (host)
148
173
  puts "Retrieve sub-domain from host: #{host}" if @verbose
149
- begin
150
- subdomain=String.new
151
- host=host.strip.downcase
152
- domain=get_domain_root(host)
153
- record_h=host.split(".")
154
- record_d=domain.split(".")
155
- if (record_h.length - record_d.length) >= 2
156
- subdomain=record_h[record_h.length-record_d.length-1]+"."+domain
157
- puts "Sub domain found: #{subdomain}" if @verbose
158
- return subdomain
159
- else
160
- return nil
161
- end
162
- rescue Exception => ee
163
- puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
174
+ subdomain=String.new
175
+ host=host.strip.downcase
176
+ domain=get_domain_root(host)
177
+ record_h=host.split(".")
178
+ record_d=domain.split(".")
179
+ if (record_h.length - record_d.length) >= 2
180
+ subdomain=record_h[record_h.length-record_d.length-1]+"."+domain
181
+ puts "Sub domain found: #{subdomain}" if @verbose
182
+ return subdomain
183
+ else
164
184
  return nil
165
185
  end
186
+ rescue Exception => ee
187
+ puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
188
+ return nil
166
189
  end
167
190
  alias_method :get_subdomain, :get_sub_domain
168
191
 
169
192
  # Function to print instance variable - General top level domain list
170
193
  def print_gtld
171
194
  puts @gtld
195
+ return @gtld
172
196
  end
173
197
 
174
198
  # Function to print instance variable - Country code top-level domain list
175
199
  def print_cctld
176
200
  puts @cctld
201
+ return @cctld
177
202
  end
178
203
 
179
204
  # Function to print instance variable - Country code second-level domain list
180
205
  def print_ccsld
181
206
  puts @ccsld
207
+ return @ccsld
182
208
  end
183
209
 
184
210
  private :load_ccsld_from_file
@@ -5,13 +5,17 @@
5
5
  #
6
6
  # Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
7
7
  #++
8
- # require "uri"
8
+ require "watir"
9
+ require "selenium-webdriver"
9
10
 
10
11
  module Wmap
11
- module Utils
12
- module UrlMagic
12
+ module Utils
13
+ module UrlMagic
13
14
  extend self
14
15
 
16
+ # Set the hard stop limit of the http time-out to 8 seconds (value below is in milliseconds), in order to avoid a severe performance penalty for certain 'weird' site(s)
17
+ Max_http_timeout=8000
18
+
15
19
  # Simple sanity check on a 'claimed' URL string.
16
20
  def is_url?(url)
17
21
  puts "Validate the URL format is valid: #{url}" if @verbose
@@ -33,7 +37,7 @@ module Wmap
33
37
  return false
34
38
  end
35
39
  end
36
-
40
+
37
41
  # Simple sanity check on a 'claimed' SSL enabled URL string
38
42
  def is_ssl?(url)
39
43
  puts "Validate if SSL is enabled on: #{url}" if @verbose
@@ -49,8 +53,8 @@ module Wmap
49
53
  return false
50
54
  end
51
55
  end
52
- alias_method :is_https?, :is_ssl?
53
-
56
+ alias_method :is_https?, :is_ssl?
57
+
54
58
  # Simple sanity check on a 'claimed' web site base string.
55
59
  def is_site?(url)
56
60
  puts "Validate the website string format for: #{url}" if @verbose
@@ -61,7 +65,7 @@ module Wmap
61
65
  return true
62
66
  else
63
67
  return false
64
- end
68
+ end
65
69
  else
66
70
  puts "Unknown site format: #{url}" if @verbose
67
71
  return false
@@ -71,40 +75,17 @@ module Wmap
71
75
  return nil
72
76
  end
73
77
  end
74
-
75
- # Check if URL is an absolute one
76
- #def is_absolute?(url)
77
- # puts "Validate if the url is absolute: #{url}" if @verbose
78
- # begin
79
- # url.strip!
80
- # URI.absolute?(url)
81
- # rescue => ee
82
- # puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
83
- # return false
84
- # end
85
- #end
86
-
87
- # Check if URL is relative one
88
- #def is_relative?(url)
89
- # begin
90
- # url.strip!
91
- # !is_absolute?(url)
92
- # rescue => ee
93
- # puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
94
- # return false
95
- # end
96
- #end
97
-
78
+
98
79
  # Extract the web server host's Fully Qualified Domain Name (FQDN) from the url. For example: "https://login.yahoo.com/email/help" -> "login.yahoo.com"
99
80
  def url_2_host (url)
100
81
  begin
101
82
  url = url.strip.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
102
83
  record1 = url.split('/')
103
84
  if record1[0].nil?
104
- puts "Error process url: #{url}"
85
+ puts "Error process url: #{url}"
105
86
  return nil
106
87
  else
107
- record2 = record1[0].split(':')
88
+ record2 = record1[0].split(':')
108
89
  return record2[0]
109
90
  end
110
91
  rescue => ee
@@ -120,8 +101,8 @@ module Wmap
120
101
  ssl = (url =~ /https/i)
121
102
  url = url.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
122
103
  record1 = url.split('/')
123
- record2 = record1[0].split(':')
124
- if (record2.length == 2)
104
+ record2 = record1[0].split(':')
105
+ if (record2.length == 2)
125
106
  puts "The service port: #{record2[1]}" if @verbose
126
107
  return record2[1].to_i
127
108
  elsif ssl
@@ -164,13 +145,13 @@ module Wmap
164
145
  unless is_fqdn?(host)
165
146
  case host
166
147
  # "https://letmechoose.barclays.co.uk?source=btorganic/" => "https://letmechoose.barclays.co.uk"
167
- when /\?|\#/
148
+ when /\?|\#/
168
149
  host=host.split(%r{\?|\#})[0]
169
150
  else
170
151
  #do nothing
171
152
  end
172
153
  end
173
- # step 2, put the host:port pair back to the normal site format
154
+ # step 2, put the host:port pair back to the normal site format
174
155
  prot="https:" if port==443
175
156
  if port==80 || port==443
176
157
  site=prot+"//"+host+"/"
@@ -180,7 +161,7 @@ module Wmap
180
161
  if site=~ /http/i
181
162
  #puts "Base found: #{site}" if @verbose
182
163
  return site
183
- else
164
+ else
184
165
  raise "Problem encountered on method url_2_site: Unable to convert #{url}"
185
166
  return nil
186
167
  end
@@ -202,28 +183,26 @@ module Wmap
202
183
  rescue => ee
203
184
  puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
204
185
  end
205
-
186
+
206
187
  end
207
-
188
+
208
189
  # Test if the two URLs are both under the same domain: http://login.yahoo.com, http://mail.yahoo.com => true
209
190
  def urls_on_same_domain?(url1, url2)
210
191
  puts "Determine if two URLs under the same domain: #{url1}, #{url2}" if @verbose
211
- begin
212
- host1=url_2_host(url1)
213
- host2=url_2_host(url2)
214
- return get_domain_root(host1) == get_domain_root(host2)
215
- rescue => ee
216
- puts "Error searching the object content: #{ee}" if @verbose
217
- return nil
218
- end
219
- end
192
+ host1=url_2_host(url1)
193
+ host2=url_2_host(url2)
194
+ return get_domain_root(host1) == get_domain_root(host2)
195
+ rescue => ee
196
+ puts "Error searching the object content: #{ee}" if @verbose
197
+ return nil
198
+ end
220
199
 
221
200
  # Input is host and open port, output is a URL for valid http response code or nil
222
201
  def host_2_url (host,port=80)
223
202
  puts "Perform simple http(s) service detection on host #{host}, port #{port}" if @verbose
224
203
  begin
225
204
  host=host.strip
226
- if port.to_i == 80
205
+ if port.to_i == 80
227
206
  url_1 = "http://" + host + "/"
228
207
  elsif port.to_i ==443
229
208
  url_1 = "https://" + host + "/"
@@ -232,7 +211,7 @@ module Wmap
232
211
  url_2 = "https://" + host + ":" + port.to_s + "/"
233
212
  end
234
213
  puts "Please ensure your internet connection is active before running this method: #{__method__}" if @verbose
235
- checker=Wmap::UrlChecker.new
214
+ checker=Wmap::UrlChecker.new
236
215
  if checker.response_code(url_1) != 10000
237
216
  puts "Found URL: #{url_1}" if @verbose
238
217
  return url_1
@@ -247,8 +226,8 @@ module Wmap
247
226
  puts "Exception on method #{__method__}: #{ee}" if @verbose
248
227
  return nil
249
228
  end
250
- end
251
-
229
+ end
230
+
252
231
  # Convert a relative URL to an absolute one. For example, from URL base 'http://games.yahoo.com/' and file path '/game/the-magic-snowman-flash.html' => 'http://games.yahoo.com/game/the-magic-snowman-flash.html'
253
232
  def make_absolute(base, relative_url)
254
233
  puts "Determine and return the absolute URL:\n Base: #{base}, Relative: #{relative_url} " if @verbose
@@ -266,12 +245,12 @@ module Wmap
266
245
  return nil
267
246
  end
268
247
  end
269
-
248
+
270
249
  # Create / construct the absolute URL from a known URL and relative file path. For example, 'http://images.search.yahoo.com/images' + '/search/images?p=raiders' => 'http://images.search.yahoo.com/search/images?p=raiders'
271
250
  def create_absolute_url_from_base(potential_base, relative_url)
272
251
  begin
273
252
  #puts "Determine the absolute URL from potential base #{potential_base} and relative URL #{relative_url}" if @verbose
274
- naked_base = url_2_site(potential_base).strip.chop
253
+ naked_base = url_2_site(potential_base).strip.chop
275
254
  puts "Found absolute URL: #{naked_base+relative_url}" if @verbose
276
255
  return naked_base + relative_url
277
256
  rescue => ee
@@ -309,19 +288,19 @@ module Wmap
309
288
  return nil
310
289
  end
311
290
  end
312
-
291
+
313
292
  # Normalize the URL to a consistent manner in order to determine if a link has been visited or cached before
314
293
  # See http://en.wikipedia.org/wiki/URL_normalization for more explanation
315
294
  def normalize_url(url)
316
295
  begin
317
296
  url.strip!
318
- # Converting the scheme and host to lower case in the process, i.e. 'HTTP://www.Example.com/' => 'http://www.example.com/'
297
+ # Converting the scheme and host to lower case in the process, i.e. 'HTTP://www.Example.com/' => 'http://www.example.com/'
319
298
  # Normalize the base
320
- base=url_2_site(url)
299
+ base=url_2_site(url)
321
300
  # Case#1, remove the trailing dot after the hostname, i.e, 'http://www.yahoo.com./' => 'http://www.yahoo.com/'
322
301
  base=base.sub(/\.\/$/,'/')
323
302
  # Normalize the relative path, case#1
324
- # retrieve the file path and remove the first '/' or '.',
303
+ # retrieve the file path and remove the first '/' or '.',
325
304
  # i.e. 'http://www.example.com/mypath' or 'http://www.example.com/./mypath' => 'mypath'
326
305
  path=url_2_path(url).sub(/^(\/|\.)*/,'')
327
306
  # Normalize the relative path, case#2
@@ -337,7 +316,136 @@ module Wmap
337
316
  return url
338
317
  end
339
318
  end
340
-
319
+
320
+
321
+ # Test the URL and return the response code
322
+ def response_code (url)
323
+ puts "Check the http response code on the url: #{url}" if @verbose
324
+ code = 10000 # All unknown url connection exceptions go here
325
+ raise "Invalid url: #{url}" unless is_url?(url)
326
+ url=url.strip.downcase
327
+ timeo = Max_http_timeout/1000.0
328
+ uri = URI.parse(url)
329
+ http = Net::HTTP.new(uri.host, uri.port)
330
+ http.open_timeout = timeo
331
+ http.read_timeout = timeo
332
+ if (url =~ /https\:/i)
333
+ http.use_ssl = true
334
+ #http.ssl_version = :SSLv3
335
+ # Bypass the remote web server cert validation test
336
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
337
+ end
338
+ request = Net::HTTP::Get.new(uri.request_uri)
339
+ response = http.request(request)
340
+ puts "Server response the following: #{response}" if @verbose
341
+ code = response.code.to_i
342
+ #response.finish if response.started?()
343
+ @url_code=Hash.new unless @url_code
344
+ @url_code[url]=code
345
+ puts "Response code on #{url}: #{code}" if @verbose
346
+ return code
347
+ rescue Exception => ee
348
+ puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
349
+ case ee
350
+ # rescue "Connection reset by peer" error type
351
+ when Errno::ECONNRESET
352
+ code=104
353
+ when Errno::ECONNABORTED,Errno::ETIMEDOUT
354
+ #code=10000
355
+ when Timeout::Error # Quick fix
356
+ if (url =~ /https\:/i) # try again for ssl timeout session, in case of default :TLSv1 failure
357
+ http.ssl_version = :SSLv3
358
+ response = http.request(request)
359
+ code = response.code.to_i
360
+ unless code.nil?
361
+ @ssl_version = http.ssl_version
362
+ end
363
+ end
364
+ else
365
+ #code=10000
366
+ end
367
+ @url_code=Hash.new unless @url_code
368
+ @url_code[url]=code
369
+ return code
370
+ end
371
+
372
+ # Given a URL, open the page, then return the parsed DOM document as seen from a normal user perspective
373
+ def open_page(url)
374
+ args = {ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, allow_redirections: :safe, read_timeout: Max_http_timeout/1000}
375
+ doc = Nokogiri::HTML(open(url, args))
376
+ if doc.text.include?("Please enable JavaScript to view the page content")
377
+ puts "Invoke headless chrome through webdriver ..." if @verbose
378
+ #Selenium::WebDriver::Chrome.path = "/usr/local/bin/chromedriver"
379
+ #driver = Selenium::WebDriver.for :chrome
380
+ # http://watir.com/guides/chrome/
381
+ args = ['--ignore-certificate-errors', '--disable-popup-blocking', '--disable-translate']
382
+ browser = Watir::Browser.new :chrome, headless: true, options: {args: args}
383
+ browser.goto(url)
384
+ sleep(2) # wait for the loading
385
+ doc = Nokogiri::HTML(browser.html)
386
+ browser.close
387
+ end
388
+ puts doc.text if @verbose
389
+ return doc
390
+ rescue => ee
391
+ puts "Exception on method #{__method__} for #{url}: #{ee}"
392
+ browser.close unless browser.nil?
393
+ return doc.text
394
+ end
395
+
396
+ # Test the URL / site and return the redirection location (3xx response code only)
397
+ def redirect_location (url)
398
+ puts "Test the redirection location for the url: #{url}" if @verbose
399
+ location=""
400
+ raise "Invalid url: #{url}" unless is_url?(url)
401
+ url=url.strip.downcase
402
+ timeo = Max_http_timeout/1000.0
403
+ uri = URI.parse(url)
404
+ code = response_code (url)
405
+ if code >= 300 && code < 400
406
+ http = Net::HTTP.new(uri.host, uri.port)
407
+ http.open_timeout = timeo
408
+ http.read_timeout = timeo
409
+ if (url =~ /https\:/i)
410
+ http.use_ssl = true
411
+ # Bypass the remote web server cert validation test
412
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
413
+ http.ssl_version = @ssl_version
414
+ end
415
+ request = Net::HTTP::Get.new(uri.request_uri)
416
+ response = http.request(request)
417
+ puts "Response: #{response}" if @verbose
418
+ case response
419
+ when Net::HTTPRedirection then
420
+ location = response['location']
421
+ end
422
+ end
423
+ return location
424
+ rescue Exception => ee
425
+ puts "Exception on method redirect_location for URL #{url}: #{ee}" if @verbose
426
+ return ""
427
+ end
428
+ alias_method :location, :redirect_location
429
+
430
+ # Test the URL / Site and return the landing url location (recursive; with the default depth of 5, at most 4 redirects are followed)
431
+ def landing_location (depth=5, url)
432
+ depth -= 1
433
+ return url if depth < 1
434
+ timeo = Max_http_timeout/1000.0
435
+ uri = URI.parse(url)
436
+ code = response_code (url)
437
+ if code >= 300 && code < 400
438
+ url = redirect_location (url)
439
+ url = landing_location(depth,url)
440
+ else
441
+ return url
442
+ end
443
+ return url
444
+ rescue Exception => ee
445
+ puts "Exception on method #{__method__} on URL #{url}: #{ee}" if @verbose
446
+ end
447
+
448
+
341
449
  end
342
450
  end
343
451
  end