wmap 2.5.2 → 2.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/RHPG +4 -4
- data/bin/trusts +1 -1
- data/dicts/ccsld.txt +63 -60
- data/lib/wmap/url_checker.rb +220 -238
- data/lib/wmap/url_crawler/adware_tag.rb +37 -15
- data/lib/wmap/utils/domain_root.rb +141 -115
- data/lib/wmap/utils/url_magic.rb +168 -60
- data/lib/wmap/wp_tracker.rb +135 -141
- data/logs/wmap.log +16 -1553
- data/version.txt +2 -2
- data/wmap.gemspec +3 -0
- metadata +30 -2
@@ -31,6 +31,7 @@ module Wmap
|
|
31
31
|
File.write(file2, "") unless File.exist?(@tag_file)
|
32
32
|
# load the known tag store
|
33
33
|
@tag_store=load_tag_from_file(file2)
|
34
|
+
@landings = Hash.new # cache landing page to reduce redundant browsing
|
34
35
|
end
|
35
36
|
|
36
37
|
|
@@ -62,7 +63,7 @@ module Wmap
|
|
62
63
|
end
|
63
64
|
|
64
65
|
# load the known tag store cache into an instance variable
|
65
|
-
def load_tag_from_file (file, lc=
|
66
|
+
def load_tag_from_file (file, lc=false)
|
66
67
|
puts "Loading tag data file: #{file}" if @verbose
|
67
68
|
data_store=Hash.new
|
68
69
|
f = File.open(file, 'r')
|
@@ -107,8 +108,8 @@ module Wmap
|
|
107
108
|
# add tag entries (from the sitetracker list)
|
108
109
|
def refresh (num=@max_parallel,use_cache=true)
|
109
110
|
puts "Add entries to the local cache table from site tracker: " if @verbose
|
110
|
-
results=Hash.new
|
111
|
-
tags=Wmap::SiteTracker.instance.known_sites.keys
|
111
|
+
results = Hash.new
|
112
|
+
tags = Wmap::SiteTracker.instance.known_sites.keys
|
112
113
|
if tags.size > 0
|
113
114
|
Parallel.map(tags, :in_processes => num) { |target|
|
114
115
|
check_adware(target,use_cache)
|
@@ -121,12 +122,12 @@ module Wmap
|
|
121
122
|
end
|
122
123
|
@tag_store.merge!(results)
|
123
124
|
puts "Done loading entries."
|
124
|
-
tags=nil
|
125
|
+
tags = nil
|
125
126
|
return results
|
126
127
|
else
|
127
128
|
puts "Error: no entry is loaded. Please check your list and try again."
|
128
129
|
end
|
129
|
-
tags=nil
|
130
|
+
tags = nil
|
130
131
|
return results
|
131
132
|
rescue => ee
|
132
133
|
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
@@ -140,6 +141,10 @@ module Wmap
|
|
140
141
|
puts "Site entry already exist. Skipping: #{site}" if @verbose
|
141
142
|
else
|
142
143
|
url = fast_landing(site)
|
144
|
+
if @landings.key?(url)
|
145
|
+
record[site] = @landings[url]
|
146
|
+
return record
|
147
|
+
end
|
143
148
|
tags = find_tags(url)
|
144
149
|
return record if tags.size==0
|
145
150
|
tag_vers=tags.map do |tag|
|
@@ -149,7 +154,8 @@ module Wmap
|
|
149
154
|
Base64.urlsafe_encode64(get_desc(url,tag))
|
150
155
|
end
|
151
156
|
if tags
|
152
|
-
record[site]=[url, tags.join("|"), tag_vers.join("|"), tag_descs.join("|")]
|
157
|
+
record[site] = [url, tags.join("|"), tag_vers.join("|"), tag_descs.join("|")]
|
158
|
+
@landings[url] = [url, tags.join("|"), tag_vers.join("|"), tag_descs.join("|")]
|
153
159
|
@tag_store.merge!(record)
|
154
160
|
puts "Tag entry loaded: #{record}" if @verbose
|
155
161
|
else
|
@@ -189,14 +195,13 @@ module Wmap
|
|
189
195
|
def find_tags(url)
|
190
196
|
puts "Search and return tags within the url payload: #{url}" if @verbose
|
191
197
|
tag_list = []
|
192
|
-
doc =
|
198
|
+
doc = open_page(url)
|
193
199
|
doc.text.each_line do |line|
|
194
200
|
my_line = line.downcase
|
195
201
|
@tag_signatures.keys.map do |tag|
|
196
202
|
tag_list.push(tag) if my_line.include?(tag)
|
197
203
|
end
|
198
204
|
end
|
199
|
-
doc = nil
|
200
205
|
return tag_list
|
201
206
|
rescue => ee
|
202
207
|
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
@@ -207,7 +212,7 @@ module Wmap
|
|
207
212
|
def get_ver(url,tag)
|
208
213
|
puts "Search and return tag version within the url payload: #{url}, #{tag}" if @verbose
|
209
214
|
tag_ver=""
|
210
|
-
doc =
|
215
|
+
doc = open_page(url)
|
211
216
|
case tag
|
212
217
|
when "utag.js" # sample: ...,"code_release_version":"cb20190312032612",...
|
213
218
|
doc.text.each_line do |line|
|
@@ -219,16 +224,33 @@ module Wmap
|
|
219
224
|
break
|
220
225
|
end
|
221
226
|
end
|
222
|
-
when "analytics.js"
|
227
|
+
when "analytics.js" # sample #1: ga('create', 'UA-19175804-2', 'knopfdoubleday.com');
|
223
228
|
doc.text.each_line do |line|
|
224
229
|
my_line = line.downcase
|
225
|
-
if my_line.include?("ga
|
230
|
+
if my_line.include?("ga") && my_line.include?("create") #sample #2: __gaTracker('create', 'UA-121313929-1', 'auto');
|
226
231
|
puts "Extract tag version from line: #{my_line}" if @verbose
|
227
232
|
m = my_line.match(/[\'|\"]create[\'|\"]\s*\,\s*[\'|\"](?<ver>\w+\-\d+\-\d+)[\'|\"]\s*\,/)
|
228
233
|
tag_ver = m[:ver]
|
229
234
|
break
|
230
235
|
end
|
231
236
|
end
|
237
|
+
when "ga.js"
|
238
|
+
doc.text.each_line do |line|
|
239
|
+
my_line = line.downcase
|
240
|
+
puts my_line if @verbose
|
241
|
+
if my_line.include?("push") && my_line.include?("_setaccount") # # sample #1: _gaq.push(['_setAccount', 'UA-13205363-65']);
|
242
|
+
m = my_line.match(/[\'|\"]\_setaccount[\'|\"]\s*\,\s*[\'|\"](?<ver>\w+\-\d+\-\d+)[\'|\"]/)
|
243
|
+
tag_ver = m[:ver]
|
244
|
+
break
|
245
|
+
end
|
246
|
+
if my_line.include?("_gettracker") # sample #2: var pageTracker = _gat._getTracker("UA-12487327-1");
|
247
|
+
puts "Extract tag version from line: #{my_line}" if @verbose
|
248
|
+
m = my_line.match(/\_gettracker\s*\(\s*[\'|\"](?<ver>\w+\-\d+\-\d+)[\'|\"]/)
|
249
|
+
tag_ver = m[:ver]
|
250
|
+
break
|
251
|
+
end
|
252
|
+
|
253
|
+
end
|
232
254
|
when "all.js" # sample: appId : '749936668352954',
|
233
255
|
doc.text.each_line do |line|
|
234
256
|
my_line = line.downcase
|
@@ -241,11 +263,11 @@ module Wmap
|
|
241
263
|
end
|
242
264
|
|
243
265
|
else
|
244
|
-
puts "
|
266
|
+
puts "Don't know how to locate Adware Tag version: #{tag}"
|
245
267
|
# do nothing
|
246
268
|
end
|
247
269
|
doc = nil
|
248
|
-
return tag_ver
|
270
|
+
return tag_ver.upcase
|
249
271
|
rescue => ee
|
250
272
|
puts "Exception on method #{__method__}: #{ee}: #{url} : #{tag}" if @verbose
|
251
273
|
return tag_ver
|
@@ -257,9 +279,9 @@ module Wmap
|
|
257
279
|
recording=false
|
258
280
|
tag_found=false
|
259
281
|
tag_desc=""
|
260
|
-
doc =
|
282
|
+
doc = open_page(url)
|
261
283
|
doc.search('script').map do |script|
|
262
|
-
if script.text.include?(tag)
|
284
|
+
if script.text.include?(tag) && script.text.length < 65535
|
263
285
|
return script.text
|
264
286
|
end
|
265
287
|
end
|
@@ -21,123 +21,148 @@ module Wmap
|
|
21
21
|
# Main function to retrieve the registered domain ('domain root' from the 'registrant' perspective) from a hostname, for example, "www.telegraph.co.uk" -> "telegraph.co.uk"
|
22
22
|
def get_domain_root (host)
|
23
23
|
puts "Retrieve the root domain for host: #{host}" if @verbose
|
24
|
-
|
25
|
-
|
26
|
-
@tlds=file_2_hash(File_tld) if @tlds.nil?
|
27
|
-
# Generic Top Level Domain List - loading once
|
28
|
-
@gtld=file_2_hash(File_gtld) if @gtld.nil?
|
29
|
-
# Country code top-level domain list - loading once
|
30
|
-
@cctld=file_2_hash(File_cctld) if @cctld.nil?
|
31
|
-
# Country code second level domain - loading once
|
32
|
-
@ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil?
|
33
|
-
|
34
|
-
if host.strip.nil?
|
35
|
-
puts "Error: empty record found. Please check your input and remove any empty line." if @verbose
|
36
|
-
return nil
|
37
|
-
else
|
38
|
-
host=host.downcase.strip
|
39
|
-
end
|
40
|
-
found_tld=false
|
41
|
-
found_cctld=false
|
42
|
-
# search the top level domain list first
|
43
|
-
root_domain=""
|
44
|
-
dn=host.split(".")
|
45
|
-
if @tlds.key?(dn.last)
|
46
|
-
cc_found=false
|
47
|
-
if @cctld.key?(dn[dn.length-2])
|
48
|
-
cc_found=true
|
49
|
-
end
|
50
|
-
if cc_found
|
51
|
-
root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
|
52
|
-
else
|
53
|
-
root_domain=dn[dn.length-2] + "." + dn.last
|
54
|
-
end
|
55
|
-
found_tld=true
|
56
|
-
end
|
57
|
-
# search the country code top level domain list secondly
|
58
|
-
if @cctld.key?(dn.last)
|
59
|
-
found=false
|
60
|
-
# reverse search of general top level domain
|
61
|
-
if @gtld.key?(dn[dn.length-2])
|
62
|
-
found=true
|
63
|
-
end
|
64
|
-
# search country code second level domain list
|
65
|
-
if @ccsld.key?(dn.last)
|
66
|
-
@ccsld[dn.last].each do |v|
|
67
|
-
if ( v =~ /#{dn[dn.length-2]}/i )
|
68
|
-
found=true
|
69
|
-
break
|
70
|
-
end
|
71
|
-
end
|
72
|
-
# 1/8/2015: additional logic to handle invalid ccsld string: reserved gtld string
|
73
|
-
#unless found
|
74
|
-
# if @gtld.key?(dn[dn.length-2])
|
75
|
-
# puts "Invalid ccsld: #{dn[dn.length-2]} for host: #{host}"
|
76
|
-
# return nil
|
77
|
-
# end
|
78
|
-
#end
|
79
|
-
end
|
80
|
-
if found
|
81
|
-
root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
|
82
|
-
else
|
83
|
-
root_domain=dn[dn.length-2] + "." + dn.last
|
84
|
-
end
|
85
|
-
found_cctld=true
|
86
|
-
end
|
87
|
-
unless (found_tld or found_cctld)
|
88
|
-
puts "#{host} - the top level domain is unknown. Please check out your record #{root_domain} " if @verbose
|
89
|
-
return nil
|
90
|
-
else
|
91
|
-
puts "Domain root found: #{root_domain}" if @verbose
|
92
|
-
return root_domain
|
93
|
-
end
|
94
|
-
rescue => ee
|
95
|
-
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
24
|
+
if host.strip.nil?
|
25
|
+
puts "Error: empty record found. Please check your input and remove any empty line." if @verbose
|
96
26
|
return nil
|
27
|
+
else
|
28
|
+
host=host.downcase.strip
|
97
29
|
end
|
30
|
+
# First order - search country code second level domain list
|
31
|
+
root_domain = get_domain_root_by_ccsld(host)
|
32
|
+
if root_domain.nil?
|
33
|
+
# Second order - search the country code top level domain list
|
34
|
+
root_domain = get_domain_root_by_cctld(host)
|
35
|
+
if root_domain.nil?
|
36
|
+
# Third order - search top level domain list
|
37
|
+
root_domain = get_domain_root_by_tlds(host)
|
38
|
+
if root_domain.nil?
|
39
|
+
# do nothing - no further search
|
40
|
+
else
|
41
|
+
return root_domain
|
42
|
+
end
|
43
|
+
else
|
44
|
+
return root_domain
|
45
|
+
end
|
46
|
+
else
|
47
|
+
return root_domain
|
48
|
+
end
|
49
|
+
puts "#{host} - the top level domain is unknown. Please check out your record #{root_domain} " if @verbose
|
50
|
+
return nil
|
51
|
+
#rescue => ee
|
52
|
+
# puts "Exception on method #{__method__}: #{ee}" if @verbose
|
53
|
+
# return nil
|
98
54
|
end
|
99
55
|
alias_method :get_root_domain, :get_domain_root
|
100
56
|
alias_method :root_domain, :get_domain_root
|
101
57
|
alias_method :domain_root, :get_domain_root
|
102
58
|
alias_method :host_2_domain, :get_domain_root
|
103
59
|
|
60
|
+
# get domain root by lookup Country Code Second Level Domain list
|
61
|
+
def get_domain_root_by_ccsld(host)
|
62
|
+
puts "First order search - domain root lookup by Country Code Second Level Domain list ..." if @verbose
|
63
|
+
root_domain = nil
|
64
|
+
dn = host.split(".")
|
65
|
+
# Country code second level domain - loading once
|
66
|
+
@ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil?
|
67
|
+
# search country code second level domain list
|
68
|
+
if @ccsld.key?(dn.last)
|
69
|
+
@ccsld[dn.last].each do |v|
|
70
|
+
if ( v =~ /#{dn[dn.length-2]}/i )
|
71
|
+
return dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
return root_domain
|
76
|
+
#rescue => ee
|
77
|
+
# puts "Exception on method #{__method__}: #{ee}" if @verbose
|
78
|
+
# return nil
|
79
|
+
end
|
80
|
+
|
81
|
+
# get domain root by lookup Country Code Top Level Domain list
|
82
|
+
def get_domain_root_by_cctld(host)
|
83
|
+
puts "Second order search - domain root lookup by Country Code Top Level Domain list ..." if @verbose
|
84
|
+
root_domain = nil
|
85
|
+
dn = host.split(".")
|
86
|
+
# Country code top-level domain list - loading once
|
87
|
+
@cctld=file_2_hash(File_cctld) if @cctld.nil?
|
88
|
+
# Generic Top Level Domain List - loading once
|
89
|
+
@gtld=file_2_hash(File_gtld) if @gtld.nil?
|
90
|
+
# Country code second level domain - loading once
|
91
|
+
@ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil?
|
92
|
+
# search the country code top level domain list
|
93
|
+
if @cctld.key?(dn.last)
|
94
|
+
# reverse search of general top level domain
|
95
|
+
if @gtld.key?(dn[dn.length-2])
|
96
|
+
root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
|
97
|
+
else
|
98
|
+
root_domain=dn[dn.length-2] + "." + dn.last
|
99
|
+
end
|
100
|
+
end
|
101
|
+
return root_domain
|
102
|
+
#rescue => ee
|
103
|
+
# puts "Exception on method #{__method__}: #{ee}" if @verbose
|
104
|
+
# return nil
|
105
|
+
end
|
106
|
+
|
107
|
+
# get domain root by lookup Top Level Domain list
|
108
|
+
def get_domain_root_by_tlds(host)
|
109
|
+
puts "Third order search - domain root lookup by Top Level Domain list ..." if @verbose
|
110
|
+
root_domain = nil
|
111
|
+
dn = host.split(".")
|
112
|
+
# Comnplete Top Level Domain List - loading once
|
113
|
+
@tlds=file_2_hash(File_tld) if @tlds.nil?
|
114
|
+
# Country code top-level domain list - loading once
|
115
|
+
@cctld=file_2_hash(File_cctld) if @cctld.nil?
|
116
|
+
cc_found=false
|
117
|
+
if @tlds.key?(dn.last)
|
118
|
+
if @cctld.key?(dn[dn.length-2])
|
119
|
+
cc_found=true
|
120
|
+
end
|
121
|
+
if cc_found
|
122
|
+
root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
|
123
|
+
else
|
124
|
+
root_domain=dn[dn.length-2] + "." + dn.last
|
125
|
+
end
|
126
|
+
end
|
127
|
+
return root_domain
|
128
|
+
#rescue => ee
|
129
|
+
# puts "Exception on method #{__method__}: #{ee}" if @verbose
|
130
|
+
# return nil
|
131
|
+
end
|
132
|
+
|
104
133
|
# 'setter' to parse and load the known country code second level domain table from the file
|
105
134
|
# data structure example: {"uk" =>["co","plc"],"za"=>["mil","nom","org"]}
|
106
135
|
def load_ccsld_from_file (file_ccsld)
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
ccsld[key].push(val) unless key.nil?
|
121
|
-
end
|
136
|
+
ccsld=Hash.new
|
137
|
+
puts "Loading known country code second level domain list from file: #{file_ccsld}" if @verbose
|
138
|
+
f=File.open(file_ccsld, 'r:ISO-8859-1:UTF-8') # transcoded magic bit
|
139
|
+
f.each do |line|
|
140
|
+
next unless line =~ /^\s+\.\w/
|
141
|
+
line=line.chomp.strip.downcase
|
142
|
+
entry=line.split(' ')[0].split('.')
|
143
|
+
if entry.length > 2
|
144
|
+
key=entry.last
|
145
|
+
ccsld[key] = Array.new if not ccsld.key?(key)
|
146
|
+
val=entry[entry.length-2]
|
147
|
+
#puts "Loading country code second level domain table with - Country code: #{key}, Second level domain: #{val}" if @verbose
|
148
|
+
ccsld[key].push(val) unless key.nil?
|
122
149
|
end
|
123
|
-
f.close
|
124
|
-
# Sort the blocks once in descendant order once for better performance
|
125
|
-
return ccsld
|
126
|
-
rescue => ee
|
127
|
-
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
128
150
|
end
|
151
|
+
f.close
|
152
|
+
# Sort the blocks once in descendant order once for better performance
|
153
|
+
return ccsld
|
154
|
+
rescue => ee
|
155
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
129
156
|
end
|
130
157
|
|
131
158
|
# Test a host string to see if it's a valid Internet root domain
|
132
159
|
def is_domain_root? (domain)
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
return false
|
140
|
-
end
|
160
|
+
puts "Validate the domain name is valid: #{domain}" if @verbose
|
161
|
+
domain=domain.strip.downcase
|
162
|
+
return domain == get_domain_root(domain)
|
163
|
+
rescue => ee
|
164
|
+
puts "Exception on method #{__method__} for #{domain}: #{ee}" if @verbose
|
165
|
+
return false
|
141
166
|
end
|
142
167
|
alias_method :is_root_domain?, :is_domain_root?
|
143
168
|
alias_method :is_domain?, :is_domain_root?
|
@@ -146,39 +171,40 @@ module Wmap
|
|
146
171
|
# Function to retrieve the sub-domain from a Fully Qualified Domain Name(FQDN), for example, "www.secure.telegraph.co.uk" -> "secure.telegraph.co.uk"
|
147
172
|
def get_sub_domain (host)
|
148
173
|
puts "Retrieve sub-domain from host: #{host}" if @verbose
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
else
|
160
|
-
return nil
|
161
|
-
end
|
162
|
-
rescue Exception => ee
|
163
|
-
puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
|
174
|
+
subdomain=String.new
|
175
|
+
host=host.strip.downcase
|
176
|
+
domain=get_domain_root(host)
|
177
|
+
record_h=host.split(".")
|
178
|
+
record_d=domain.split(".")
|
179
|
+
if (record_h.length - record_d.length) >= 2
|
180
|
+
subdomain=record_h[record_h.length-record_d.length-1]+"."+domain
|
181
|
+
puts "Sub domain found: #{subdomain}" if @verbose
|
182
|
+
return subdomain
|
183
|
+
else
|
164
184
|
return nil
|
165
185
|
end
|
186
|
+
rescue Exception => ee
|
187
|
+
puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
|
188
|
+
return nil
|
166
189
|
end
|
167
190
|
alias_method :get_subdomain, :get_sub_domain
|
168
191
|
|
169
192
|
# Function to print instance variable - General top level domain list
|
170
193
|
def print_gtld
|
171
194
|
puts @gtld
|
195
|
+
return @gtld
|
172
196
|
end
|
173
197
|
|
174
198
|
# Function to print instance variable - Country code top-level domain list
|
175
199
|
def print_cctld
|
176
200
|
puts @cctld
|
201
|
+
return @cctld
|
177
202
|
end
|
178
203
|
|
179
204
|
# Function to print instance variable - Country code second-level domain list
|
180
205
|
def print_ccsld
|
181
206
|
puts @ccsld
|
207
|
+
return @ccsld
|
182
208
|
end
|
183
209
|
|
184
210
|
private :load_ccsld_from_file
|
data/lib/wmap/utils/url_magic.rb
CHANGED
@@ -5,13 +5,17 @@
|
|
5
5
|
#
|
6
6
|
# Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
|
7
7
|
#++
|
8
|
-
|
8
|
+
require "watir"
|
9
|
+
require "selenium-webdriver"
|
9
10
|
|
10
11
|
module Wmap
|
11
|
-
module Utils
|
12
|
-
module UrlMagic
|
12
|
+
module Utils
|
13
|
+
module UrlMagic
|
13
14
|
extend self
|
14
15
|
|
16
|
+
# set hard stop limit of http time-out to 8 seconds, in order to avoid severe performance penalty for certain 'weird' site(s)
|
17
|
+
Max_http_timeout=8000
|
18
|
+
|
15
19
|
# Simple sanity check on a 'claimed' URL string.
|
16
20
|
def is_url?(url)
|
17
21
|
puts "Validate the URL format is valid: #{url}" if @verbose
|
@@ -33,7 +37,7 @@ module Wmap
|
|
33
37
|
return false
|
34
38
|
end
|
35
39
|
end
|
36
|
-
|
40
|
+
|
37
41
|
# Simple sanity check on a 'claimed' SSL enabled URL string
|
38
42
|
def is_ssl?(url)
|
39
43
|
puts "Validate if SSL is enabled on: #{url}" if @verbose
|
@@ -49,8 +53,8 @@ module Wmap
|
|
49
53
|
return false
|
50
54
|
end
|
51
55
|
end
|
52
|
-
alias_method :is_https?, :is_ssl?
|
53
|
-
|
56
|
+
alias_method :is_https?, :is_ssl?
|
57
|
+
|
54
58
|
# Simple sanity check on a 'claimed' web site base string.
|
55
59
|
def is_site?(url)
|
56
60
|
puts "Validate the website string format for: #{url}" if @verbose
|
@@ -61,7 +65,7 @@ module Wmap
|
|
61
65
|
return true
|
62
66
|
else
|
63
67
|
return false
|
64
|
-
end
|
68
|
+
end
|
65
69
|
else
|
66
70
|
puts "Unknown site format: #{url}" if @verbose
|
67
71
|
return false
|
@@ -71,40 +75,17 @@ module Wmap
|
|
71
75
|
return nil
|
72
76
|
end
|
73
77
|
end
|
74
|
-
|
75
|
-
# Check if URL is an absolute one
|
76
|
-
#def is_absolute?(url)
|
77
|
-
# puts "Validate if the url is absolute: #{url}" if @verbose
|
78
|
-
# begin
|
79
|
-
# url.strip!
|
80
|
-
# URI.absolute?(url)
|
81
|
-
# rescue => ee
|
82
|
-
# puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
83
|
-
# return false
|
84
|
-
# end
|
85
|
-
#end
|
86
|
-
|
87
|
-
# Check if URL is relative one
|
88
|
-
#def is_relative?(url)
|
89
|
-
# begin
|
90
|
-
# url.strip!
|
91
|
-
# !is_absolute?(url)
|
92
|
-
# rescue => ee
|
93
|
-
# puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
94
|
-
# return false
|
95
|
-
# end
|
96
|
-
#end
|
97
|
-
|
78
|
+
|
98
79
|
# Extract the web server host's Fully Qualified Domain Name (FQDN) from the url. For example: "https://login.yahoo.com/email/help" -> "login.yahoo.com"
|
99
80
|
def url_2_host (url)
|
100
81
|
begin
|
101
82
|
url = url.strip.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
|
102
83
|
record1 = url.split('/')
|
103
84
|
if record1[0].nil?
|
104
|
-
puts "Error process url: #{url}"
|
85
|
+
puts "Error process url: #{url}"
|
105
86
|
return nil
|
106
87
|
else
|
107
|
-
record2 = record1[0].split(':')
|
88
|
+
record2 = record1[0].split(':')
|
108
89
|
return record2[0]
|
109
90
|
end
|
110
91
|
rescue => ee
|
@@ -120,8 +101,8 @@ module Wmap
|
|
120
101
|
ssl = (url =~ /https/i)
|
121
102
|
url = url.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
|
122
103
|
record1 = url.split('/')
|
123
|
-
record2 = record1[0].split(':')
|
124
|
-
if (record2.length == 2)
|
104
|
+
record2 = record1[0].split(':')
|
105
|
+
if (record2.length == 2)
|
125
106
|
puts "The service port: #{record2[1]}" if @verbose
|
126
107
|
return record2[1].to_i
|
127
108
|
elsif ssl
|
@@ -164,13 +145,13 @@ module Wmap
|
|
164
145
|
unless is_fqdn?(host)
|
165
146
|
case host
|
166
147
|
# "https://letmechoose.barclays.co.uk?source=btorganic/" => "https://letmechoose.barclays.co.uk"
|
167
|
-
when /\?|\#/
|
148
|
+
when /\?|\#/
|
168
149
|
host=host.split(%r{\?|\#})[0]
|
169
150
|
else
|
170
151
|
#do nothing
|
171
152
|
end
|
172
153
|
end
|
173
|
-
# step 2, put the host:port pair back to the normal site format
|
154
|
+
# step 2, put the host:port pair back to the normal site format
|
174
155
|
prot="https:" if port==443
|
175
156
|
if port==80 || port==443
|
176
157
|
site=prot+"//"+host+"/"
|
@@ -180,7 +161,7 @@ module Wmap
|
|
180
161
|
if site=~ /http/i
|
181
162
|
#puts "Base found: #{site}" if @verbose
|
182
163
|
return site
|
183
|
-
else
|
164
|
+
else
|
184
165
|
raise "Problem encountered on method url_2_site: Unable to convert #{url}"
|
185
166
|
return nil
|
186
167
|
end
|
@@ -202,28 +183,26 @@ module Wmap
|
|
202
183
|
rescue => ee
|
203
184
|
puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
204
185
|
end
|
205
|
-
|
186
|
+
|
206
187
|
end
|
207
|
-
|
188
|
+
|
208
189
|
# Test if the two URLs are both under the same domain: http://login.yahoo.com, http://mail.yahoo.com => true
|
209
190
|
def urls_on_same_domain?(url1, url2)
|
210
191
|
puts "Determine if two URLs under the same domain: #{url1}, #{url2}" if @verbose
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
end
|
219
|
-
end
|
192
|
+
host1=url_2_host(url1)
|
193
|
+
host2=url_2_host(url2)
|
194
|
+
return get_domain_root(host1) == get_domain_root(host2)
|
195
|
+
rescue => ee
|
196
|
+
puts "Error searching the object content: #{ee}" if @verbose
|
197
|
+
return nil
|
198
|
+
end
|
220
199
|
|
221
200
|
# Input is host and open port, output is a URL for valid http response code or nil
|
222
201
|
def host_2_url (host,port=80)
|
223
202
|
puts "Perform simple http(s) service detection on host #{host}, port #{port}" if @verbose
|
224
203
|
begin
|
225
204
|
host=host.strip
|
226
|
-
if port.to_i == 80
|
205
|
+
if port.to_i == 80
|
227
206
|
url_1 = "http://" + host + "/"
|
228
207
|
elsif port.to_i ==443
|
229
208
|
url_1 = "https://" + host + "/"
|
@@ -232,7 +211,7 @@ module Wmap
|
|
232
211
|
url_2 = "https://" + host + ":" + port.to_s + "/"
|
233
212
|
end
|
234
213
|
puts "Please ensure your internet connection is active before running this method: #{__method__}" if @verbose
|
235
|
-
checker=Wmap::UrlChecker.new
|
214
|
+
checker=Wmap::UrlChecker.new
|
236
215
|
if checker.response_code(url_1) != 10000
|
237
216
|
puts "Found URL: #{url_1}" if @verbose
|
238
217
|
return url_1
|
@@ -247,8 +226,8 @@ module Wmap
|
|
247
226
|
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
248
227
|
return nil
|
249
228
|
end
|
250
|
-
end
|
251
|
-
|
229
|
+
end
|
230
|
+
|
252
231
|
# Convert a relative URL to an absolute one. For example, from URL base 'http://games.yahoo.com/' and file path '/game/the-magic-snowman-flash.html' => 'http://games.yahoo.com/game/the-magic-snowman-flash.html'
|
253
232
|
def make_absolute(base, relative_url)
|
254
233
|
puts "Determine and return the absolute URL:\n Base: #{base}, Relative: #{relative_url} " if @verbose
|
@@ -266,12 +245,12 @@ module Wmap
|
|
266
245
|
return nil
|
267
246
|
end
|
268
247
|
end
|
269
|
-
|
248
|
+
|
270
249
|
# Create / construct the absolute URL from a known URL and relative file path. For example, 'http://images.search.yahoo.com/images' + '/search/images?p=raiders' => 'http://images.search.yahoo.com/search/images?p=raiders'
|
271
250
|
def create_absolute_url_from_base(potential_base, relative_url)
|
272
251
|
begin
|
273
252
|
#puts "Determine the absolute URL from potential base #{potential_base} and relative URL #{relative_url}" if @verbose
|
274
|
-
naked_base = url_2_site(potential_base).strip.chop
|
253
|
+
naked_base = url_2_site(potential_base).strip.chop
|
275
254
|
puts "Found absolute URL: #{naked_base+relative_url}" if @verbose
|
276
255
|
return naked_base + relative_url
|
277
256
|
rescue => ee
|
@@ -309,19 +288,19 @@ module Wmap
|
|
309
288
|
return nil
|
310
289
|
end
|
311
290
|
end
|
312
|
-
|
291
|
+
|
313
292
|
# Normalize the URL to a consistent manner in order to determine if a link has been visited or cached before
|
314
293
|
# See http://en.wikipedia.org/wiki/URL_normalization for more explanation
|
315
294
|
def normalize_url(url)
|
316
295
|
begin
|
317
296
|
url.strip!
|
318
|
-
# Converting the scheme and host to lower case in the process, i.e. 'HTTP://www.Example.com/' => 'http://www.example.com/'
|
297
|
+
# Converting the scheme and host to lower case in the process, i.e. 'HTTP://www.Example.com/' => 'http://www.example.com/'
|
319
298
|
# Normalize the base
|
320
|
-
base=url_2_site(url)
|
299
|
+
base=url_2_site(url)
|
321
300
|
# Case#1, remove the trailing dot after the hostname, i.e, 'http://www.yahoo.com./' => 'http://www.yahoo.com/'
|
322
301
|
base=base.sub(/\.\/$/,'/')
|
323
302
|
# Normalize the relative path, case#1
|
324
|
-
# retrieve the file path and remove the first '/' or '.',
|
303
|
+
# retrieve the file path and remove the first '/' or '.',
|
325
304
|
# i.e. 'http://www.example.com/mypath' or 'http://www.example.com/./mypath' => 'mypath'
|
326
305
|
path=url_2_path(url).sub(/^(\/|\.)*/,'')
|
327
306
|
# Normalize the relative path, case#2
|
@@ -337,7 +316,136 @@ module Wmap
|
|
337
316
|
return url
|
338
317
|
end
|
339
318
|
end
|
340
|
-
|
319
|
+
|
320
|
+
|
321
|
+
# Test the URL and return the response code
|
322
|
+
def response_code (url)
|
323
|
+
puts "Check the http response code on the url: #{url}" if @verbose
|
324
|
+
code = 10000 # All unknown url connection exceptions go here
|
325
|
+
raise "Invalid url: #{url}" unless is_url?(url)
|
326
|
+
url=url.strip.downcase
|
327
|
+
timeo = Max_http_timeout/1000.0
|
328
|
+
uri = URI.parse(url)
|
329
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
330
|
+
http.open_timeout = timeo
|
331
|
+
http.read_timeout = timeo
|
332
|
+
if (url =~ /https\:/i)
|
333
|
+
http.use_ssl = true
|
334
|
+
#http.ssl_version = :SSLv3
|
335
|
+
# Bypass the remote web server cert validation test
|
336
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
337
|
+
end
|
338
|
+
request = Net::HTTP::Get.new(uri.request_uri)
|
339
|
+
response = http.request(request)
|
340
|
+
puts "Server response the following: #{response}" if @verbose
|
341
|
+
code = response.code.to_i
|
342
|
+
#response.finish if response.started?()
|
343
|
+
@url_code=Hash.new unless @url_code
|
344
|
+
@url_code[url]=code
|
345
|
+
puts "Response code on #{url}: #{code}" if @verbose
|
346
|
+
return code
|
347
|
+
rescue Exception => ee
|
348
|
+
puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
349
|
+
case ee
|
350
|
+
# rescue "Connection reset by peer" error type
|
351
|
+
when Errno::ECONNRESET
|
352
|
+
code=104
|
353
|
+
when Errno::ECONNABORTED,Errno::ETIMEDOUT
|
354
|
+
#code=10000
|
355
|
+
when Timeout::Error # Quick fix
|
356
|
+
if (url =~ /https\:/i) # try again for ssl timeout session, in case of default :TLSv1 failure
|
357
|
+
http.ssl_version = :SSLv3
|
358
|
+
response = http.request(request)
|
359
|
+
code = response.code.to_i
|
360
|
+
unless code.nil?
|
361
|
+
@ssl_version = http.ssl_version
|
362
|
+
end
|
363
|
+
end
|
364
|
+
else
|
365
|
+
#code=10000
|
366
|
+
end
|
367
|
+
@url_code=Hash.new unless @url_code
|
368
|
+
@url_code[url]=code
|
369
|
+
return code
|
370
|
+
end
|
371
|
+
|
372
|
+
# Given an URL, open the page, then return the DOM text from a normal user perspective
|
373
|
+
def open_page(url)
|
374
|
+
args = {ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, allow_redirections: :safe, read_timeout: Max_http_timeout/1000}
|
375
|
+
doc = Nokogiri::HTML(open(url, args))
|
376
|
+
if doc.text.include?("Please enable JavaScript to view the page content")
|
377
|
+
puts "Invoke headless chrome through webdriver ..." if @verbose
|
378
|
+
#Selenium::WebDriver::Chrome.path = "/usr/local/bin/chromedriver"
|
379
|
+
#driver = Selenium::WebDriver.for :chrome
|
380
|
+
# http://watir.com/guides/chrome/
|
381
|
+
args = ['--ignore-certificate-errors', '--disable-popup-blocking', '--disable-translate']
|
382
|
+
browser = Watir::Browser.new :chrome, headless: true, options: {args: args}
|
383
|
+
browser.goto(url)
|
384
|
+
sleep(2) # wait for the loading
|
385
|
+
doc = Nokogiri::HTML(browser.html)
|
386
|
+
browser.close
|
387
|
+
end
|
388
|
+
puts doc.text if @verbose
|
389
|
+
return doc
|
390
|
+
rescue => ee
|
391
|
+
puts "Exception on method #{__method__} for #{url}: #{ee}"
|
392
|
+
browser.close unless browser.nil?
|
393
|
+
return doc.text
|
394
|
+
end
|
395
|
+
|
396
|
+
# Test the URL / site and return the redirection location (3xx response code only)
|
397
|
+
def redirect_location (url)
|
398
|
+
puts "Test the redirection location for the url: #{url}" if @verbose
|
399
|
+
location=""
|
400
|
+
raise "Invalid url: #{url}" unless is_url?(url)
|
401
|
+
url=url.strip.downcase
|
402
|
+
timeo = Max_http_timeout/1000.0
|
403
|
+
uri = URI.parse(url)
|
404
|
+
code = response_code (url)
|
405
|
+
if code >= 300 && code < 400
|
406
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
407
|
+
http.open_timeout = timeo
|
408
|
+
http.read_timeout = timeo
|
409
|
+
if (url =~ /https\:/i)
|
410
|
+
http.use_ssl = true
|
411
|
+
# Bypass the remote web server cert validation test
|
412
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
413
|
+
http.ssl_version = @ssl_version
|
414
|
+
end
|
415
|
+
request = Net::HTTP::Get.new(uri.request_uri)
|
416
|
+
response = http.request(request)
|
417
|
+
puts "Response: #{response}" if @verbose
|
418
|
+
case response
|
419
|
+
when Net::HTTPRedirection then
|
420
|
+
location = response['location']
|
421
|
+
end
|
422
|
+
end
|
423
|
+
return location
|
424
|
+
rescue Exception => ee
|
425
|
+
puts "Exception on method redirect_location for URL #{url}: #{ee}" if @verbose
|
426
|
+
return ""
|
427
|
+
end
|
428
|
+
alias_method :location, :redirect_location
|
429
|
+
|
430
|
+
# Test the URL / Site and return the landing url location (recursive with the depth = 4 )
|
431
|
+
def landing_location (depth=5, url)
|
432
|
+
depth -= 1
|
433
|
+
return url if depth < 1
|
434
|
+
timeo = Max_http_timeout/1000.0
|
435
|
+
uri = URI.parse(url)
|
436
|
+
code = response_code (url)
|
437
|
+
if code >= 300 && code < 400
|
438
|
+
url = redirect_location (url)
|
439
|
+
url = landing_location(depth,url)
|
440
|
+
else
|
441
|
+
return url
|
442
|
+
end
|
443
|
+
return url
|
444
|
+
rescue Exception => ee
|
445
|
+
puts "Exception on method #{__method__} on URL #{url}: #{ee}" if @verbose
|
446
|
+
end
|
447
|
+
|
448
|
+
|
341
449
|
end
|
342
450
|
end
|
343
451
|
end
|