wmap 2.5.2 → 2.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,6 +31,7 @@ module Wmap
31
31
  File.write(file2, "") unless File.exist?(@tag_file)
32
32
  # load the known tag store
33
33
  @tag_store=load_tag_from_file(file2)
34
+ @landings = Hash.new # cache landing page to reduce redundant browsing
34
35
  end
35
36
 
36
37
 
@@ -62,7 +63,7 @@ module Wmap
62
63
  end
63
64
 
64
65
  # load the known tag store cache into an instance variable
65
- def load_tag_from_file (file, lc=true)
66
+ def load_tag_from_file (file, lc=false)
66
67
  puts "Loading tag data file: #{file}" if @verbose
67
68
  data_store=Hash.new
68
69
  f = File.open(file, 'r')
@@ -107,8 +108,8 @@ module Wmap
107
108
  # add tag entries (from the sitetracker list)
108
109
  def refresh (num=@max_parallel,use_cache=true)
109
110
  puts "Add entries to the local cache table from site tracker: " if @verbose
110
- results=Hash.new
111
- tags=Wmap::SiteTracker.instance.known_sites.keys
111
+ results = Hash.new
112
+ tags = Wmap::SiteTracker.instance.known_sites.keys
112
113
  if tags.size > 0
113
114
  Parallel.map(tags, :in_processes => num) { |target|
114
115
  check_adware(target,use_cache)
@@ -121,12 +122,12 @@ module Wmap
121
122
  end
122
123
  @tag_store.merge!(results)
123
124
  puts "Done loading entries."
124
- tags=nil
125
+ tags = nil
125
126
  return results
126
127
  else
127
128
  puts "Error: no entry is loaded. Please check your list and try again."
128
129
  end
129
- tags=nil
130
+ tags = nil
130
131
  return results
131
132
  rescue => ee
132
133
  puts "Exception on method #{__method__}: #{ee}" if @verbose
@@ -140,6 +141,10 @@ module Wmap
140
141
  puts "Site entry already exist. Skipping: #{site}" if @verbose
141
142
  else
142
143
  url = fast_landing(site)
144
+ if @landings.key?(url)
145
+ record[site] = @landings[url]
146
+ return record
147
+ end
143
148
  tags = find_tags(url)
144
149
  return record if tags.size==0
145
150
  tag_vers=tags.map do |tag|
@@ -149,7 +154,8 @@ module Wmap
149
154
  Base64.urlsafe_encode64(get_desc(url,tag))
150
155
  end
151
156
  if tags
152
- record[site]=[url, tags.join("|"), tag_vers.join("|"), tag_descs.join("|")]
157
+ record[site] = [url, tags.join("|"), tag_vers.join("|"), tag_descs.join("|")]
158
+ @landings[url] = [url, tags.join("|"), tag_vers.join("|"), tag_descs.join("|")]
153
159
  @tag_store.merge!(record)
154
160
  puts "Tag entry loaded: #{record}" if @verbose
155
161
  else
@@ -189,14 +195,13 @@ module Wmap
189
195
  def find_tags(url)
190
196
  puts "Search and return tags within the url payload: #{url}" if @verbose
191
197
  tag_list = []
192
- doc = Nokogiri::HTML(open(url))
198
+ doc = open_page(url)
193
199
  doc.text.each_line do |line|
194
200
  my_line = line.downcase
195
201
  @tag_signatures.keys.map do |tag|
196
202
  tag_list.push(tag) if my_line.include?(tag)
197
203
  end
198
204
  end
199
- doc = nil
200
205
  return tag_list
201
206
  rescue => ee
202
207
  puts "Exception on method #{__method__}: #{ee}" if @verbose
@@ -207,7 +212,7 @@ module Wmap
207
212
  def get_ver(url,tag)
208
213
  puts "Search and return tag version within the url payload: #{url}, #{tag}" if @verbose
209
214
  tag_ver=""
210
- doc = Nokogiri::HTML(open(url))
215
+ doc = open_page(url)
211
216
  case tag
212
217
  when "utag.js" # sample: ...,"code_release_version":"cb20190312032612",...
213
218
  doc.text.each_line do |line|
@@ -219,16 +224,33 @@ module Wmap
219
224
  break
220
225
  end
221
226
  end
222
- when "analytics.js" # sample: ga('create', 'UA-19175804-2', 'knopfdoubleday.com');
227
+ when "analytics.js" # sample #1: ga('create', 'UA-19175804-2', 'knopfdoubleday.com');
223
228
  doc.text.each_line do |line|
224
229
  my_line = line.downcase
225
- if my_line.include?("ga(") && my_line.include?("create")
230
+ if my_line.include?("ga") && my_line.include?("create") #sample #2: __gaTracker('create', 'UA-121313929-1', 'auto');
226
231
  puts "Extract tag version from line: #{my_line}" if @verbose
227
232
  m = my_line.match(/[\'|\"]create[\'|\"]\s*\,\s*[\'|\"](?<ver>\w+\-\d+\-\d+)[\'|\"]\s*\,/)
228
233
  tag_ver = m[:ver]
229
234
  break
230
235
  end
231
236
  end
237
+ when "ga.js"
238
+ doc.text.each_line do |line|
239
+ my_line = line.downcase
240
+ puts my_line if @verbose
241
+ if my_line.include?("push") && my_line.include?("_setaccount") # # sample #1: _gaq.push(['_setAccount', 'UA-13205363-65']);
242
+ m = my_line.match(/[\'|\"]\_setaccount[\'|\"]\s*\,\s*[\'|\"](?<ver>\w+\-\d+\-\d+)[\'|\"]/)
243
+ tag_ver = m[:ver]
244
+ break
245
+ end
246
+ if my_line.include?("_gettracker") # sample #2: var pageTracker = _gat._getTracker("UA-12487327-1");
247
+ puts "Extract tag version from line: #{my_line}" if @verbose
248
+ m = my_line.match(/\_gettracker\s*\(\s*[\'|\"](?<ver>\w+\-\d+\-\d+)[\'|\"]/)
249
+ tag_ver = m[:ver]
250
+ break
251
+ end
252
+
253
+ end
232
254
  when "all.js" # sample: appId : '749936668352954',
233
255
  doc.text.each_line do |line|
234
256
  my_line = line.downcase
@@ -241,11 +263,11 @@ module Wmap
241
263
  end
242
264
 
243
265
  else
244
- puts "Unknown Adware Tag: #{tag}"
266
+ puts "Don't know how to locate Adware Tag version: #{tag}"
245
267
  # do nothing
246
268
  end
247
269
  doc = nil
248
- return tag_ver
270
+ return tag_ver.upcase
249
271
  rescue => ee
250
272
  puts "Exception on method #{__method__}: #{ee}: #{url} : #{tag}" if @verbose
251
273
  return tag_ver
@@ -257,9 +279,9 @@ module Wmap
257
279
  recording=false
258
280
  tag_found=false
259
281
  tag_desc=""
260
- doc = Nokogiri::HTML(open(url))
282
+ doc = open_page(url)
261
283
  doc.search('script').map do |script|
262
- if script.text.include?(tag)
284
+ if script.text.include?(tag) && script.text.length < 65535
263
285
  return script.text
264
286
  end
265
287
  end
@@ -21,123 +21,148 @@ module Wmap
21
21
  # Main function to retrieve the registered domain ('domain root' from the 'registrant' perspective) from a hostname, for example, "www.telegraph.co.uk" -> "telegraph.co.uk"
22
22
  def get_domain_root (host)
23
23
  puts "Retrieve the root domain for host: #{host}" if @verbose
24
- begin
25
- # Comnplete Top Level Domain List - loading once
26
- @tlds=file_2_hash(File_tld) if @tlds.nil?
27
- # Generic Top Level Domain List - loading once
28
- @gtld=file_2_hash(File_gtld) if @gtld.nil?
29
- # Country code top-level domain list - loading once
30
- @cctld=file_2_hash(File_cctld) if @cctld.nil?
31
- # Country code second level domain - loading once
32
- @ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil?
33
-
34
- if host.strip.nil?
35
- puts "Error: empty record found. Please check your input and remove any empty line." if @verbose
36
- return nil
37
- else
38
- host=host.downcase.strip
39
- end
40
- found_tld=false
41
- found_cctld=false
42
- # search the top level domain list first
43
- root_domain=""
44
- dn=host.split(".")
45
- if @tlds.key?(dn.last)
46
- cc_found=false
47
- if @cctld.key?(dn[dn.length-2])
48
- cc_found=true
49
- end
50
- if cc_found
51
- root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
52
- else
53
- root_domain=dn[dn.length-2] + "." + dn.last
54
- end
55
- found_tld=true
56
- end
57
- # search the country code top level domain list secondly
58
- if @cctld.key?(dn.last)
59
- found=false
60
- # reverse search of general top level domain
61
- if @gtld.key?(dn[dn.length-2])
62
- found=true
63
- end
64
- # search country code second level domain list
65
- if @ccsld.key?(dn.last)
66
- @ccsld[dn.last].each do |v|
67
- if ( v =~ /#{dn[dn.length-2]}/i )
68
- found=true
69
- break
70
- end
71
- end
72
- # 1/8/2015: additional logic to handle invalid ccsld string: reserved gtld string
73
- #unless found
74
- # if @gtld.key?(dn[dn.length-2])
75
- # puts "Invalid ccsld: #{dn[dn.length-2]} for host: #{host}"
76
- # return nil
77
- # end
78
- #end
79
- end
80
- if found
81
- root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
82
- else
83
- root_domain=dn[dn.length-2] + "." + dn.last
84
- end
85
- found_cctld=true
86
- end
87
- unless (found_tld or found_cctld)
88
- puts "#{host} - the top level domain is unknown. Please check out your record #{root_domain} " if @verbose
89
- return nil
90
- else
91
- puts "Domain root found: #{root_domain}" if @verbose
92
- return root_domain
93
- end
94
- rescue => ee
95
- puts "Exception on method #{__method__}: #{ee}" if @verbose
24
+ if host.strip.nil?
25
+ puts "Error: empty record found. Please check your input and remove any empty line." if @verbose
96
26
  return nil
27
+ else
28
+ host=host.downcase.strip
97
29
  end
30
+ # First order - search country code second level domain list
31
+ root_domain = get_domain_root_by_ccsld(host)
32
+ if root_domain.nil?
33
+ # Second order - search the country code top level domain list
34
+ root_domain = get_domain_root_by_cctld(host)
35
+ if root_domain.nil?
36
+ # Third order - search top level domain list
37
+ root_domain = get_domain_root_by_tlds(host)
38
+ if root_domain.nil?
39
+ # do nothing - no further search
40
+ else
41
+ return root_domain
42
+ end
43
+ else
44
+ return root_domain
45
+ end
46
+ else
47
+ return root_domain
48
+ end
49
+ puts "#{host} - the top level domain is unknown. Please check out your record #{root_domain} " if @verbose
50
+ return nil
51
+ #rescue => ee
52
+ # puts "Exception on method #{__method__}: #{ee}" if @verbose
53
+ # return nil
98
54
  end
99
55
  alias_method :get_root_domain, :get_domain_root
100
56
  alias_method :root_domain, :get_domain_root
101
57
  alias_method :domain_root, :get_domain_root
102
58
  alias_method :host_2_domain, :get_domain_root
103
59
 
60
# Get the domain root by looking the host up in the Country Code Second Level Domain (ccSLD) table,
# for example, "www.telegraph.co.uk" -> "telegraph.co.uk".
#
# The table (@ccsld) is loaded once from File_ccsld and maps a country code to its known
# second level labels, e.g. {"uk" => ["co", "plc"]}.
#
# @param host [String] host name to resolve
# @return [String, nil] the last three labels of the host when its last label is a key of @ccsld
#   and its second-to-last label is listed under that key; nil otherwise
def get_domain_root_by_ccsld(host)
  puts "First order search - domain root lookup by Country Code Second Level Domain list ..." if @verbose
  labels = host.split(".")
  # Country code second level domain - loading once
  @ccsld = load_ccsld_from_file(File_ccsld) if @ccsld.nil?
  # A ccSLD based root is made of three labels. With fewer labels a negative array index would
  # wrap around and fabricate a root such as "uk.co.uk" out of "co.uk".
  return nil if labels.length < 3
  country_code = labels[-1]
  second_level = labels[-2]
  return nil unless @ccsld.key?(country_code)
  # Compare whole labels, case-insensitively. A substring / regular expression match would
  # accept "c" for the entry "co" and would interpret regex meta characters found in the host.
  known = @ccsld[country_code].any? { |entry| entry.casecmp(second_level) == 0 }
  return known ? labels.last(3).join(".") : nil
end
80
+
81
# Get the domain root by looking the host up in the Country Code Top Level Domain (ccTLD) table,
# for example, "www.example.com.au" -> "example.com.au" and "www.example.au" -> "example.au".
#
# @param host [String] host name to resolve
# @return [String, nil] nil when the last label is not a key of @cctld; otherwise the last three
#   labels when the second-to-last label is a key of @gtld, else the last two labels. A host with
#   fewer labels than that is returned with the labels it has.
def get_domain_root_by_cctld(host)
  puts "Second order search - domain root lookup by Country Code Top Level Domain list ..." if @verbose
  labels = host.split(".")
  # Country code top-level domain list - loading once
  @cctld = file_2_hash(File_cctld) if @cctld.nil?
  # Generic Top Level Domain List - loading once
  @gtld = file_2_hash(File_gtld) if @gtld.nil?
  # Country code second level domain - loading once (not read by this method; kept so the
  # instance variable is populated exactly as before)
  @ccsld = load_ccsld_from_file(File_ccsld) if @ccsld.nil?
  # search the country code top level domain list
  return nil unless @cctld.key?(labels.last)
  # reverse search of general top level domain: a generic label in front of the country code
  # ("com" in "com.au") means the root carries one more label
  wanted = @gtld.key?(labels[-2]) ? 3 : 2
  # Array#last never wraps around, so "com.au" stays "com.au" instead of becoming "au.com.au"
  return labels.last(wanted).join(".")
end
106
+
107
# Get the domain root by looking the host up in the complete Top Level Domain (TLD) table,
# for example, "www.example.com" -> "example.com" and "www.example.uk.com" -> "example.uk.com".
#
# @param host [String] host name to resolve
# @return [String, nil] nil when the last label is not a key of @tlds; otherwise the last three
#   labels when the second-to-last label is a key of @cctld, else the last two labels. A host with
#   fewer labels than that is returned with the labels it has.
def get_domain_root_by_tlds(host)
  puts "Third order search - domain root lookup by Top Level Domain list ..." if @verbose
  labels = host.split(".")
  # Complete Top Level Domain List - loading once
  @tlds = file_2_hash(File_tld) if @tlds.nil?
  # Country code top-level domain list - loading once
  @cctld = file_2_hash(File_cctld) if @cctld.nil?
  return nil unless @tlds.key?(labels.last)
  # a country code in front of the top level domain ("uk" in "uk.com") means the root carries
  # one more label
  wanted = @cctld.key?(labels[-2]) ? 3 : 2
  # Array#last never wraps around and never yields nil, so "uk.com" stays "uk.com" instead of
  # becoming "com.uk.com", and a single label no longer raises NoMethodError
  return labels.last(wanted).join(".")
end
132
+
104
133
  # 'setter' to parse and load the known country code second level domain table from the file
105
134
  # data structure example: {"uk" =>["co","plc"],"za"=>["mil","nom","org"]}
106
135
  def load_ccsld_from_file (file_ccsld)
107
- begin
108
- ccsld=Hash.new
109
- puts "Loading known country code second level domain list from file: #{file_ccsld}" if @verbose
110
- f=File.open(file_ccsld, 'r:ISO-8859-1:UTF-8') # transcoded magic bit
111
- f.each do |line|
112
- next unless line =~ /^\s+\.\w/
113
- line=line.chomp.strip.downcase
114
- entry=line.split(' ')[0].split('.')
115
- if entry.length > 2
116
- key=entry.last
117
- ccsld[key] = Array.new if not ccsld.key?(key)
118
- val=entry[entry.length-2]
119
- #puts "Loading country code second level domain table with - Country code: #{key}, Second level domain: #{val}" if @verbose
120
- ccsld[key].push(val) unless key.nil?
121
- end
136
+ ccsld=Hash.new
137
+ puts "Loading known country code second level domain list from file: #{file_ccsld}" if @verbose
138
+ f=File.open(file_ccsld, 'r:ISO-8859-1:UTF-8') # transcoded magic bit
139
+ f.each do |line|
140
+ next unless line =~ /^\s+\.\w/
141
+ line=line.chomp.strip.downcase
142
+ entry=line.split(' ')[0].split('.')
143
+ if entry.length > 2
144
+ key=entry.last
145
+ ccsld[key] = Array.new if not ccsld.key?(key)
146
+ val=entry[entry.length-2]
147
+ #puts "Loading country code second level domain table with - Country code: #{key}, Second level domain: #{val}" if @verbose
148
+ ccsld[key].push(val) unless key.nil?
122
149
  end
123
- f.close
124
- # Sort the blocks once in descendant order once for better performance
125
- return ccsld
126
- rescue => ee
127
- puts "Exception on method #{__method__}: #{ee}" if @verbose
128
150
  end
151
+ f.close
152
+ # Sort the blocks once in descendant order once for better performance
153
+ return ccsld
154
+ rescue => ee
155
+ puts "Exception on method #{__method__}: #{ee}" if @verbose
129
156
  end
130
157
 
131
158
  # Test a host string to see if it's a valid Internet root domain
132
159
  def is_domain_root? (domain)
133
- puts "Validate the domain name is valid: #{domain}" if @verbose
134
- begin
135
- domain=domain.strip.downcase
136
- return domain == get_domain_root(domain)
137
- rescue => ee
138
- puts "Exception on method #{__method__} for #{domain}: #{ee}" if @verbose
139
- return false
140
- end
160
+ puts "Validate the domain name is valid: #{domain}" if @verbose
161
+ domain=domain.strip.downcase
162
+ return domain == get_domain_root(domain)
163
+ rescue => ee
164
+ puts "Exception on method #{__method__} for #{domain}: #{ee}" if @verbose
165
+ return false
141
166
  end
142
167
  alias_method :is_root_domain?, :is_domain_root?
143
168
  alias_method :is_domain?, :is_domain_root?
@@ -146,39 +171,40 @@ module Wmap
146
171
  # Function to retrieve the sub-domain from a Fully Qualified Domain Name(FQDN), for example, "www.secure.telegraph.co.uk" -> "secure.telegraph.co.uk"
147
172
  def get_sub_domain (host)
148
173
  puts "Retrieve sub-domain from host: #{host}" if @verbose
149
- begin
150
- subdomain=String.new
151
- host=host.strip.downcase
152
- domain=get_domain_root(host)
153
- record_h=host.split(".")
154
- record_d=domain.split(".")
155
- if (record_h.length - record_d.length) >= 2
156
- subdomain=record_h[record_h.length-record_d.length-1]+"."+domain
157
- puts "Sub domain found: #{subdomain}" if @verbose
158
- return subdomain
159
- else
160
- return nil
161
- end
162
- rescue Exception => ee
163
- puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
174
+ subdomain=String.new
175
+ host=host.strip.downcase
176
+ domain=get_domain_root(host)
177
+ record_h=host.split(".")
178
+ record_d=domain.split(".")
179
+ if (record_h.length - record_d.length) >= 2
180
+ subdomain=record_h[record_h.length-record_d.length-1]+"."+domain
181
+ puts "Sub domain found: #{subdomain}" if @verbose
182
+ return subdomain
183
+ else
164
184
  return nil
165
185
  end
186
+ rescue Exception => ee
187
+ puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
188
+ return nil
166
189
  end
167
190
  alias_method :get_subdomain, :get_sub_domain
168
191
 
169
192
  # Function to print instance variable - General top level domain list
170
193
  def print_gtld
171
194
  puts @gtld
195
+ return @gtld
172
196
  end
173
197
 
174
198
  # Function to print instance variable - Country code top-level domain list
175
199
  def print_cctld
176
200
  puts @cctld
201
+ return @cctld
177
202
  end
178
203
 
179
204
  # Function to print instance variable - Country code second-level domain list
180
205
  def print_ccsld
181
206
  puts @ccsld
207
+ return @ccsld
182
208
  end
183
209
 
184
210
  private :load_ccsld_from_file
@@ -5,13 +5,17 @@
5
5
  #
6
6
  # Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
7
7
  #++
8
- # require "uri"
8
+ require "watir"
9
+ require "selenium-webdriver"
9
10
 
10
11
  module Wmap
11
- module Utils
12
- module UrlMagic
12
+ module Utils
13
+ module UrlMagic
13
14
  extend self
14
15
 
16
+ # set hard stop limit of http time-out to 8 seconds, in order to avoid severe performance penalty for certain 'weird' site(s)
17
+ Max_http_timeout=8000
18
+
15
19
  # Simple sanity check on a 'claimed' URL string.
16
20
  def is_url?(url)
17
21
  puts "Validate the URL format is valid: #{url}" if @verbose
@@ -33,7 +37,7 @@ module Wmap
33
37
  return false
34
38
  end
35
39
  end
36
-
40
+
37
41
  # Simple sanity check on a 'claimed' SSL enabled URL string
38
42
  def is_ssl?(url)
39
43
  puts "Validate if SSL is enabled on: #{url}" if @verbose
@@ -49,8 +53,8 @@ module Wmap
49
53
  return false
50
54
  end
51
55
  end
52
- alias_method :is_https?, :is_ssl?
53
-
56
+ alias_method :is_https?, :is_ssl?
57
+
54
58
  # Simple sanity check on a 'claimed' web site base string.
55
59
  def is_site?(url)
56
60
  puts "Validate the website string format for: #{url}" if @verbose
@@ -61,7 +65,7 @@ module Wmap
61
65
  return true
62
66
  else
63
67
  return false
64
- end
68
+ end
65
69
  else
66
70
  puts "Unknown site format: #{url}" if @verbose
67
71
  return false
@@ -71,40 +75,17 @@ module Wmap
71
75
  return nil
72
76
  end
73
77
  end
74
-
75
- # Check if URL is an absolute one
76
- #def is_absolute?(url)
77
- # puts "Validate if the url is absolute: #{url}" if @verbose
78
- # begin
79
- # url.strip!
80
- # URI.absolute?(url)
81
- # rescue => ee
82
- # puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
83
- # return false
84
- # end
85
- #end
86
-
87
- # Check if URL is relative one
88
- #def is_relative?(url)
89
- # begin
90
- # url.strip!
91
- # !is_absolute?(url)
92
- # rescue => ee
93
- # puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
94
- # return false
95
- # end
96
- #end
97
-
78
+
98
79
  # Extract the web server host's Fully Qualified Domain Name (FQDN) from the url. For example: "https://login.yahoo.com/email/help" -> "login.yahoo.com"
99
80
  def url_2_host (url)
100
81
  begin
101
82
  url = url.strip.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
102
83
  record1 = url.split('/')
103
84
  if record1[0].nil?
104
- puts "Error process url: #{url}"
85
+ puts "Error process url: #{url}"
105
86
  return nil
106
87
  else
107
- record2 = record1[0].split(':')
88
+ record2 = record1[0].split(':')
108
89
  return record2[0]
109
90
  end
110
91
  rescue => ee
@@ -120,8 +101,8 @@ module Wmap
120
101
  ssl = (url =~ /https/i)
121
102
  url = url.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
122
103
  record1 = url.split('/')
123
- record2 = record1[0].split(':')
124
- if (record2.length == 2)
104
+ record2 = record1[0].split(':')
105
+ if (record2.length == 2)
125
106
  puts "The service port: #{record2[1]}" if @verbose
126
107
  return record2[1].to_i
127
108
  elsif ssl
@@ -164,13 +145,13 @@ module Wmap
164
145
  unless is_fqdn?(host)
165
146
  case host
166
147
  # "https://letmechoose.barclays.co.uk?source=btorganic/" => "https://letmechoose.barclays.co.uk"
167
- when /\?|\#/
148
+ when /\?|\#/
168
149
  host=host.split(%r{\?|\#})[0]
169
150
  else
170
151
  #do nothing
171
152
  end
172
153
  end
173
- # step 2, put the host:port pair back to the normal site format
154
+ # step 2, put the host:port pair back to the normal site format
174
155
  prot="https:" if port==443
175
156
  if port==80 || port==443
176
157
  site=prot+"//"+host+"/"
@@ -180,7 +161,7 @@ module Wmap
180
161
  if site=~ /http/i
181
162
  #puts "Base found: #{site}" if @verbose
182
163
  return site
183
- else
164
+ else
184
165
  raise "Problem encountered on method url_2_site: Unable to convert #{url}"
185
166
  return nil
186
167
  end
@@ -202,28 +183,26 @@ module Wmap
202
183
  rescue => ee
203
184
  puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
204
185
  end
205
-
186
+
206
187
  end
207
-
188
+
208
189
  # Test if the two URLs are both under the same domain: http://login.yahoo.com, http://mail.yahoo.com => true
209
190
  def urls_on_same_domain?(url1, url2)
210
191
  puts "Determine if two URLs under the same domain: #{url1}, #{url2}" if @verbose
211
- begin
212
- host1=url_2_host(url1)
213
- host2=url_2_host(url2)
214
- return get_domain_root(host1) == get_domain_root(host2)
215
- rescue => ee
216
- puts "Error searching the object content: #{ee}" if @verbose
217
- return nil
218
- end
219
- end
192
+ host1=url_2_host(url1)
193
+ host2=url_2_host(url2)
194
+ return get_domain_root(host1) == get_domain_root(host2)
195
+ rescue => ee
196
+ puts "Error searching the object content: #{ee}" if @verbose
197
+ return nil
198
+ end
220
199
 
221
200
  # Input is host and open port, output is a URL for valid http response code or nil
222
201
  def host_2_url (host,port=80)
223
202
  puts "Perform simple http(s) service detection on host #{host}, port #{port}" if @verbose
224
203
  begin
225
204
  host=host.strip
226
- if port.to_i == 80
205
+ if port.to_i == 80
227
206
  url_1 = "http://" + host + "/"
228
207
  elsif port.to_i ==443
229
208
  url_1 = "https://" + host + "/"
@@ -232,7 +211,7 @@ module Wmap
232
211
  url_2 = "https://" + host + ":" + port.to_s + "/"
233
212
  end
234
213
  puts "Please ensure your internet connection is active before running this method: #{__method__}" if @verbose
235
- checker=Wmap::UrlChecker.new
214
+ checker=Wmap::UrlChecker.new
236
215
  if checker.response_code(url_1) != 10000
237
216
  puts "Found URL: #{url_1}" if @verbose
238
217
  return url_1
@@ -247,8 +226,8 @@ module Wmap
247
226
  puts "Exception on method #{__method__}: #{ee}" if @verbose
248
227
  return nil
249
228
  end
250
- end
251
-
229
+ end
230
+
252
231
  # Convert a relative URL to an absolute one. For example, from URL base 'http://games.yahoo.com/' and file path '/game/the-magic-snowman-flash.html' => 'http://games.yahoo.com/game/the-magic-snowman-flash.html'
253
232
  def make_absolute(base, relative_url)
254
233
  puts "Determine and return the absolute URL:\n Base: #{base}, Relative: #{relative_url} " if @verbose
@@ -266,12 +245,12 @@ module Wmap
266
245
  return nil
267
246
  end
268
247
  end
269
-
248
+
270
249
  # Create / construct the absolute URL from a known URL and relative file path. For example, 'http://images.search.yahoo.com/images' + '/search/images?p=raiders' => 'http://images.search.yahoo.com/search/images?p=raiders'
271
250
  def create_absolute_url_from_base(potential_base, relative_url)
272
251
  begin
273
252
  #puts "Determine the absolute URL from potential base #{potential_base} and relative URL #{relative_url}" if @verbose
274
- naked_base = url_2_site(potential_base).strip.chop
253
+ naked_base = url_2_site(potential_base).strip.chop
275
254
  puts "Found absolute URL: #{naked_base+relative_url}" if @verbose
276
255
  return naked_base + relative_url
277
256
  rescue => ee
@@ -309,19 +288,19 @@ module Wmap
309
288
  return nil
310
289
  end
311
290
  end
312
-
291
+
313
292
  # Normalize the URL to a consistent manner in order to determine if a link has been visited or cached before
314
293
  # See http://en.wikipedia.org/wiki/URL_normalization for more explanation
315
294
  def normalize_url(url)
316
295
  begin
317
296
  url.strip!
318
- # Converting the scheme and host to lower case in the process, i.e. 'HTTP://www.Example.com/' => 'http://www.example.com/'
297
+ # Converting the scheme and host to lower case in the process, i.e. 'HTTP://www.Example.com/' => 'http://www.example.com/'
319
298
  # Normalize the base
320
- base=url_2_site(url)
299
+ base=url_2_site(url)
321
300
  # Case#1, remove the trailing dot after the hostname, i.e, 'http://www.yahoo.com./' => 'http://www.yahoo.com/'
322
301
  base=base.sub(/\.\/$/,'/')
323
302
  # Normalize the relative path, case#1
324
- # retrieve the file path and remove the first '/' or '.',
303
+ # retrieve the file path and remove the first '/' or '.',
325
304
  # i.e. 'http://www.example.com/mypath' or 'http://www.example.com/./mypath' => 'mypath'
326
305
  path=url_2_path(url).sub(/^(\/|\.)*/,'')
327
306
  # Normalize the relative path, case#2
@@ -337,7 +316,136 @@ module Wmap
337
316
  return url
338
317
  end
339
318
  end
340
-
319
+
320
+
321
# Test the URL and return the http response code.
#
# The result is also stored in the @url_code hash, keyed by the URL string that was tested.
# Both the open and the read time-out are Max_http_timeout milliseconds. For https URLs the
# remote certificate is deliberately not validated.
#
# @param url [String] the URL to test
# @return [Integer] the http status code; 104 on "connection reset by peer"; 10000 for an
#   invalid URL or any other connection failure
def response_code (url)
  puts "Check the http response code on the url: #{url}" if @verbose
  code = 10000 # All unknown url connection exceptions go here
  http = nil
  request = nil
  begin
    raise "Invalid url: #{url}" unless is_url?(url)
    url = url.strip.downcase
    timeo = Max_http_timeout/1000.0
    uri = URI.parse(url)
    http = Net::HTTP.new(uri.host, uri.port)
    http.open_timeout = timeo
    http.read_timeout = timeo
    if url =~ /https\:/i
      http.use_ssl = true
      # Bypass the remote web server cert validation test
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
    request = Net::HTTP::Get.new(uri.request_uri)
    response = http.request(request)
    puts "Server response the following: #{response}" if @verbose
    code = response.code.to_i
    puts "Response code on #{url}: #{code}" if @verbose
  rescue Errno::ECONNRESET => ee
    # "Connection reset by peer" error type
    puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
    code = 104
  rescue Timeout::Error => ee
    puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
    # Quick fix: try again for ssl timeout session, in case of default :TLSv1 failure
    if http && request && url =~ /https\:/i
      begin
        http.ssl_version = :SSLv3
        response = http.request(request)
        code = response.code.to_i
        @ssl_version = http.ssl_version
      rescue StandardError => retry_error
        # The retry is best effort: a second failure keeps the 'unknown' code instead of
        # escaping from the method.
        puts "Exception on method #{__method__} for #{url}: #{retry_error}" if @verbose
      end
    end
  rescue StandardError => ee
    # StandardError rather than Exception, so that interrupts and exit requests still propagate
    puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
  end
  @url_code = Hash.new unless @url_code
  @url_code[url] = code
  return code
end
371
+
372
# Given an URL, open the page, then return the parsed DOM from a normal user perspective.
#
# The page is first fetched with open() and parsed by Nokogiri. When the parsed text asks for
# JavaScript to be enabled, the page is loaded again in a headless chrome browser through Watir
# and the rendered html is parsed instead.
#
# @param url [String] the URL to open
# @return [Nokogiri::HTML::Document] the parsed page. On failure the document parsed so far is
#   returned, or an empty document when nothing could be fetched, so callers can always use
#   #text and #search on the result.
def open_page(url)
  browser = nil
  doc = nil
  # read_timeout is in seconds (integer division of the millisecond constant).
  # NOTE(review): :allow_redirections is not a stock open-uri option - presumably provided by an
  # extension loaded elsewhere; confirm.
  args = {ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, allow_redirections: :safe, read_timeout: Max_http_timeout/1000}
  doc = Nokogiri::HTML(open(url, args))
  if doc.text.include?("Please enable JavaScript to view the page content")
    puts "Invoke headless chrome through webdriver ..." if @verbose
    # http://watir.com/guides/chrome/
    args = ['--ignore-certificate-errors', '--disable-popup-blocking', '--disable-translate']
    browser = Watir::Browser.new :chrome, headless: true, options: {args: args}
    browser.goto(url)
    sleep(2) # wait for the loading
    doc = Nokogiri::HTML(browser.html)
  end
  puts doc.text if @verbose
  return doc
rescue => ee
  puts "Exception on method #{__method__} for #{url}: #{ee}"
  # Keep the return type stable: never a String, and never a second exception on a nil doc
  return doc.nil? ? Nokogiri::HTML("") : doc
ensure
  # Release the browser on every path; a failing close must not mask the result
  begin
    browser.close unless browser.nil?
  rescue StandardError => close_error
    puts "Exception on method #{__method__} while closing the browser: #{close_error}" if @verbose
  end
end
395
+
396
# Test the URL / site and return the redirection location (3xx response code only).
#
# The response code is checked first through response_code; the URL is requested a second time
# only when that code is a redirection, in order to read the 'location' response header.
#
# @param url [String] the URL to test
# @return [String] the value of the 'location' header, or "" when the URL is invalid, does not
#   redirect, carries no such header, or the request fails
def redirect_location (url)
  puts "Test the redirection location for the url: #{url}" if @verbose
  raise "Invalid url: #{url}" unless is_url?(url)
  url = url.strip.downcase
  uri = URI.parse(url)
  code = response_code(url)
  return "" unless code >= 300 && code < 400
  timeo = Max_http_timeout/1000.0
  http = Net::HTTP.new(uri.host, uri.port)
  http.open_timeout = timeo
  http.read_timeout = timeo
  if url =~ /https\:/i
    http.use_ssl = true
    # Bypass the remote web server cert validation test
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    # Reuse the ssl version recorded by response_code, if any
    http.ssl_version = @ssl_version
  end
  request = Net::HTTP::Get.new(uri.request_uri)
  response = http.request(request)
  puts "Response: #{response}" if @verbose
  return "" unless response.is_a?(Net::HTTPRedirection)
  # to_s: a redirection without a 'location' header yields "" rather than nil
  return response['location'].to_s
rescue StandardError => ee
  # StandardError rather than Exception, so that interrupts and exit requests still propagate
  puts "Exception on method redirect_location for URL #{url}: #{ee}" if @verbose
  return ""
end
428
+ alias_method :location, :redirect_location
429
+
430
# Test the URL / Site and return the landing url location.
#
# Redirections are followed recursively. The depth is decremented before each check, so the
# default depth of 5 follows at most 4 redirections.
#
# @param depth [Integer] remaining recursion budget (optional, first positional argument)
# @param url [String] the URL to start from
# @return [String] the last URL reached: the first one that does not answer with a 3xx code,
#   the URL at which the depth budget ran out, or the last good URL when a redirection target
#   is missing or the lookup fails
def landing_location (depth=5, url)
  depth -= 1
  return url if depth < 1
  code = response_code(url)
  return url unless code >= 300 && code < 400
  target = redirect_location(url).to_s
  # A redirection whose target could not be read: stay on the last good URL
  return url if target.empty?
  # The 'location' header may be relative; resolve it against the current URL
  target = URI.join(url, target).to_s
  return landing_location(depth, target)
rescue StandardError => ee
  puts "Exception on method #{__method__} on URL #{url}: #{ee}" if @verbose
  # Return the last known URL instead of the nil produced by puts
  return url
end
447
+
448
+
341
449
  end
342
450
  end
343
451
  end