wmap 2.5.2 → 2.5.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/RHPG +4 -4
- data/bin/trusts +1 -1
- data/dicts/ccsld.txt +63 -60
- data/lib/wmap/url_checker.rb +220 -238
- data/lib/wmap/url_crawler/adware_tag.rb +37 -15
- data/lib/wmap/utils/domain_root.rb +141 -115
- data/lib/wmap/utils/url_magic.rb +168 -60
- data/lib/wmap/wp_tracker.rb +135 -141
- data/logs/wmap.log +16 -1553
- data/version.txt +2 -2
- data/wmap.gemspec +3 -0
- metadata +30 -2
@@ -31,6 +31,7 @@ module Wmap
|
|
31
31
|
File.write(file2, "") unless File.exist?(@tag_file)
|
32
32
|
# load the known tag store
|
33
33
|
@tag_store=load_tag_from_file(file2)
|
34
|
+
@landings = Hash.new # cache landing page to reduce redundant browsing
|
34
35
|
end
|
35
36
|
|
36
37
|
|
@@ -62,7 +63,7 @@ module Wmap
|
|
62
63
|
end
|
63
64
|
|
64
65
|
# load the known tag store cache into an instance variable
|
65
|
-
def load_tag_from_file (file, lc=
|
66
|
+
def load_tag_from_file (file, lc=false)
|
66
67
|
puts "Loading tag data file: #{file}" if @verbose
|
67
68
|
data_store=Hash.new
|
68
69
|
f = File.open(file, 'r')
|
@@ -107,8 +108,8 @@ module Wmap
|
|
107
108
|
# add tag entries (from the sitetracker list)
|
108
109
|
def refresh (num=@max_parallel,use_cache=true)
|
109
110
|
puts "Add entries to the local cache table from site tracker: " if @verbose
|
110
|
-
results=Hash.new
|
111
|
-
tags=Wmap::SiteTracker.instance.known_sites.keys
|
111
|
+
results = Hash.new
|
112
|
+
tags = Wmap::SiteTracker.instance.known_sites.keys
|
112
113
|
if tags.size > 0
|
113
114
|
Parallel.map(tags, :in_processes => num) { |target|
|
114
115
|
check_adware(target,use_cache)
|
@@ -121,12 +122,12 @@ module Wmap
|
|
121
122
|
end
|
122
123
|
@tag_store.merge!(results)
|
123
124
|
puts "Done loading entries."
|
124
|
-
tags=nil
|
125
|
+
tags = nil
|
125
126
|
return results
|
126
127
|
else
|
127
128
|
puts "Error: no entry is loaded. Please check your list and try again."
|
128
129
|
end
|
129
|
-
tags=nil
|
130
|
+
tags = nil
|
130
131
|
return results
|
131
132
|
rescue => ee
|
132
133
|
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
@@ -140,6 +141,10 @@ module Wmap
|
|
140
141
|
puts "Site entry already exist. Skipping: #{site}" if @verbose
|
141
142
|
else
|
142
143
|
url = fast_landing(site)
|
144
|
+
if @landings.key?(url)
|
145
|
+
record[site] = @landings[url]
|
146
|
+
return record
|
147
|
+
end
|
143
148
|
tags = find_tags(url)
|
144
149
|
return record if tags.size==0
|
145
150
|
tag_vers=tags.map do |tag|
|
@@ -149,7 +154,8 @@ module Wmap
|
|
149
154
|
Base64.urlsafe_encode64(get_desc(url,tag))
|
150
155
|
end
|
151
156
|
if tags
|
152
|
-
record[site]=[url, tags.join("|"), tag_vers.join("|"), tag_descs.join("|")]
|
157
|
+
record[site] = [url, tags.join("|"), tag_vers.join("|"), tag_descs.join("|")]
|
158
|
+
@landings[url] = [url, tags.join("|"), tag_vers.join("|"), tag_descs.join("|")]
|
153
159
|
@tag_store.merge!(record)
|
154
160
|
puts "Tag entry loaded: #{record}" if @verbose
|
155
161
|
else
|
@@ -189,14 +195,13 @@ module Wmap
|
|
189
195
|
def find_tags(url)
|
190
196
|
puts "Search and return tags within the url payload: #{url}" if @verbose
|
191
197
|
tag_list = []
|
192
|
-
doc =
|
198
|
+
doc = open_page(url)
|
193
199
|
doc.text.each_line do |line|
|
194
200
|
my_line = line.downcase
|
195
201
|
@tag_signatures.keys.map do |tag|
|
196
202
|
tag_list.push(tag) if my_line.include?(tag)
|
197
203
|
end
|
198
204
|
end
|
199
|
-
doc = nil
|
200
205
|
return tag_list
|
201
206
|
rescue => ee
|
202
207
|
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
@@ -207,7 +212,7 @@ module Wmap
|
|
207
212
|
def get_ver(url,tag)
|
208
213
|
puts "Search and return tag version within the url payload: #{url}, #{tag}" if @verbose
|
209
214
|
tag_ver=""
|
210
|
-
doc =
|
215
|
+
doc = open_page(url)
|
211
216
|
case tag
|
212
217
|
when "utag.js" # sample: ...,"code_release_version":"cb20190312032612",...
|
213
218
|
doc.text.each_line do |line|
|
@@ -219,16 +224,33 @@ module Wmap
|
|
219
224
|
break
|
220
225
|
end
|
221
226
|
end
|
222
|
-
when "analytics.js"
|
227
|
+
when "analytics.js" # sample #1: ga('create', 'UA-19175804-2', 'knopfdoubleday.com');
|
223
228
|
doc.text.each_line do |line|
|
224
229
|
my_line = line.downcase
|
225
|
-
if my_line.include?("ga
|
230
|
+
if my_line.include?("ga") && my_line.include?("create") #sample #2: __gaTracker('create', 'UA-121313929-1', 'auto');
|
226
231
|
puts "Extract tag version from line: #{my_line}" if @verbose
|
227
232
|
m = my_line.match(/[\'|\"]create[\'|\"]\s*\,\s*[\'|\"](?<ver>\w+\-\d+\-\d+)[\'|\"]\s*\,/)
|
228
233
|
tag_ver = m[:ver]
|
229
234
|
break
|
230
235
|
end
|
231
236
|
end
|
237
|
+
when "ga.js"
|
238
|
+
doc.text.each_line do |line|
|
239
|
+
my_line = line.downcase
|
240
|
+
puts my_line if @verbose
|
241
|
+
if my_line.include?("push") && my_line.include?("_setaccount") # # sample #1: _gaq.push(['_setAccount', 'UA-13205363-65']);
|
242
|
+
m = my_line.match(/[\'|\"]\_setaccount[\'|\"]\s*\,\s*[\'|\"](?<ver>\w+\-\d+\-\d+)[\'|\"]/)
|
243
|
+
tag_ver = m[:ver]
|
244
|
+
break
|
245
|
+
end
|
246
|
+
if my_line.include?("_gettracker") # sample #2: var pageTracker = _gat._getTracker("UA-12487327-1");
|
247
|
+
puts "Extract tag version from line: #{my_line}" if @verbose
|
248
|
+
m = my_line.match(/\_gettracker\s*\(\s*[\'|\"](?<ver>\w+\-\d+\-\d+)[\'|\"]/)
|
249
|
+
tag_ver = m[:ver]
|
250
|
+
break
|
251
|
+
end
|
252
|
+
|
253
|
+
end
|
232
254
|
when "all.js" # sample: appId : '749936668352954',
|
233
255
|
doc.text.each_line do |line|
|
234
256
|
my_line = line.downcase
|
@@ -241,11 +263,11 @@ module Wmap
|
|
241
263
|
end
|
242
264
|
|
243
265
|
else
|
244
|
-
puts "
|
266
|
+
puts "Don't know how to locate Adware Tag version: #{tag}"
|
245
267
|
# do nothing
|
246
268
|
end
|
247
269
|
doc = nil
|
248
|
-
return tag_ver
|
270
|
+
return tag_ver.upcase
|
249
271
|
rescue => ee
|
250
272
|
puts "Exception on method #{__method__}: #{ee}: #{url} : #{tag}" if @verbose
|
251
273
|
return tag_ver
|
@@ -257,9 +279,9 @@ module Wmap
|
|
257
279
|
recording=false
|
258
280
|
tag_found=false
|
259
281
|
tag_desc=""
|
260
|
-
doc =
|
282
|
+
doc = open_page(url)
|
261
283
|
doc.search('script').map do |script|
|
262
|
-
if script.text.include?(tag)
|
284
|
+
if script.text.include?(tag) && script.text.length < 65535
|
263
285
|
return script.text
|
264
286
|
end
|
265
287
|
end
|
@@ -21,123 +21,148 @@ module Wmap
|
|
21
21
|
# Main function to retrieve the registered domain ('domain root' from the 'registrant' perspective) from a hostname, for example, "www.telegraph.co.uk" -> "telegraph.co.uk"
|
22
22
|
def get_domain_root (host)
|
23
23
|
puts "Retrieve the root domain for host: #{host}" if @verbose
|
24
|
-
|
25
|
-
|
26
|
-
@tlds=file_2_hash(File_tld) if @tlds.nil?
|
27
|
-
# Generic Top Level Domain List - loading once
|
28
|
-
@gtld=file_2_hash(File_gtld) if @gtld.nil?
|
29
|
-
# Country code top-level domain list - loading once
|
30
|
-
@cctld=file_2_hash(File_cctld) if @cctld.nil?
|
31
|
-
# Country code second level domain - loading once
|
32
|
-
@ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil?
|
33
|
-
|
34
|
-
if host.strip.nil?
|
35
|
-
puts "Error: empty record found. Please check your input and remove any empty line." if @verbose
|
36
|
-
return nil
|
37
|
-
else
|
38
|
-
host=host.downcase.strip
|
39
|
-
end
|
40
|
-
found_tld=false
|
41
|
-
found_cctld=false
|
42
|
-
# search the top level domain list first
|
43
|
-
root_domain=""
|
44
|
-
dn=host.split(".")
|
45
|
-
if @tlds.key?(dn.last)
|
46
|
-
cc_found=false
|
47
|
-
if @cctld.key?(dn[dn.length-2])
|
48
|
-
cc_found=true
|
49
|
-
end
|
50
|
-
if cc_found
|
51
|
-
root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
|
52
|
-
else
|
53
|
-
root_domain=dn[dn.length-2] + "." + dn.last
|
54
|
-
end
|
55
|
-
found_tld=true
|
56
|
-
end
|
57
|
-
# search the country code top level domain list secondly
|
58
|
-
if @cctld.key?(dn.last)
|
59
|
-
found=false
|
60
|
-
# reverse search of general top level domain
|
61
|
-
if @gtld.key?(dn[dn.length-2])
|
62
|
-
found=true
|
63
|
-
end
|
64
|
-
# search country code second level domain list
|
65
|
-
if @ccsld.key?(dn.last)
|
66
|
-
@ccsld[dn.last].each do |v|
|
67
|
-
if ( v =~ /#{dn[dn.length-2]}/i )
|
68
|
-
found=true
|
69
|
-
break
|
70
|
-
end
|
71
|
-
end
|
72
|
-
# 1/8/2015: additional logic to handle invalid ccsld string: reserved gtld string
|
73
|
-
#unless found
|
74
|
-
# if @gtld.key?(dn[dn.length-2])
|
75
|
-
# puts "Invalid ccsld: #{dn[dn.length-2]} for host: #{host}"
|
76
|
-
# return nil
|
77
|
-
# end
|
78
|
-
#end
|
79
|
-
end
|
80
|
-
if found
|
81
|
-
root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
|
82
|
-
else
|
83
|
-
root_domain=dn[dn.length-2] + "." + dn.last
|
84
|
-
end
|
85
|
-
found_cctld=true
|
86
|
-
end
|
87
|
-
unless (found_tld or found_cctld)
|
88
|
-
puts "#{host} - the top level domain is unknown. Please check out your record #{root_domain} " if @verbose
|
89
|
-
return nil
|
90
|
-
else
|
91
|
-
puts "Domain root found: #{root_domain}" if @verbose
|
92
|
-
return root_domain
|
93
|
-
end
|
94
|
-
rescue => ee
|
95
|
-
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
24
|
+
if host.strip.nil?
|
25
|
+
puts "Error: empty record found. Please check your input and remove any empty line." if @verbose
|
96
26
|
return nil
|
27
|
+
else
|
28
|
+
host=host.downcase.strip
|
97
29
|
end
|
30
|
+
# First order - search country code second level domain list
|
31
|
+
root_domain = get_domain_root_by_ccsld(host)
|
32
|
+
if root_domain.nil?
|
33
|
+
# Second order - search the country code top level domain list
|
34
|
+
root_domain = get_domain_root_by_cctld(host)
|
35
|
+
if root_domain.nil?
|
36
|
+
# Third order - search top level domain list
|
37
|
+
root_domain = get_domain_root_by_tlds(host)
|
38
|
+
if root_domain.nil?
|
39
|
+
# do nothing - no further search
|
40
|
+
else
|
41
|
+
return root_domain
|
42
|
+
end
|
43
|
+
else
|
44
|
+
return root_domain
|
45
|
+
end
|
46
|
+
else
|
47
|
+
return root_domain
|
48
|
+
end
|
49
|
+
puts "#{host} - the top level domain is unknown. Please check out your record #{root_domain} " if @verbose
|
50
|
+
return nil
|
51
|
+
#rescue => ee
|
52
|
+
# puts "Exception on method #{__method__}: #{ee}" if @verbose
|
53
|
+
# return nil
|
98
54
|
end
|
99
55
|
alias_method :get_root_domain, :get_domain_root
|
100
56
|
alias_method :root_domain, :get_domain_root
|
101
57
|
alias_method :domain_root, :get_domain_root
|
102
58
|
alias_method :host_2_domain, :get_domain_root
|
103
59
|
|
60
|
+
# get domain root by lookup Country Code Second Level Domain list
|
61
|
+
def get_domain_root_by_ccsld(host)
|
62
|
+
puts "First order search - domain root lookup by Country Code Second Level Domain list ..." if @verbose
|
63
|
+
root_domain = nil
|
64
|
+
dn = host.split(".")
|
65
|
+
# Country code second level domain - loading once
|
66
|
+
@ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil?
|
67
|
+
# search country code second level domain list
|
68
|
+
if @ccsld.key?(dn.last)
|
69
|
+
@ccsld[dn.last].each do |v|
|
70
|
+
if ( v =~ /#{dn[dn.length-2]}/i )
|
71
|
+
return dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
return root_domain
|
76
|
+
#rescue => ee
|
77
|
+
# puts "Exception on method #{__method__}: #{ee}" if @verbose
|
78
|
+
# return nil
|
79
|
+
end
|
80
|
+
|
81
|
+
# get domain root by lookup Country Code Top Level Domain list
|
82
|
+
def get_domain_root_by_cctld(host)
|
83
|
+
puts "Second order search - domain root lookup by Country Code Top Level Domain list ..." if @verbose
|
84
|
+
root_domain = nil
|
85
|
+
dn = host.split(".")
|
86
|
+
# Country code top-level domain list - loading once
|
87
|
+
@cctld=file_2_hash(File_cctld) if @cctld.nil?
|
88
|
+
# Generic Top Level Domain List - loading once
|
89
|
+
@gtld=file_2_hash(File_gtld) if @gtld.nil?
|
90
|
+
# Country code second level domain - loading once
|
91
|
+
@ccsld=load_ccsld_from_file(File_ccsld) if @ccsld.nil?
|
92
|
+
# search the country code top level domain list
|
93
|
+
if @cctld.key?(dn.last)
|
94
|
+
# reverse search of general top level domain
|
95
|
+
if @gtld.key?(dn[dn.length-2])
|
96
|
+
root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
|
97
|
+
else
|
98
|
+
root_domain=dn[dn.length-2] + "." + dn.last
|
99
|
+
end
|
100
|
+
end
|
101
|
+
return root_domain
|
102
|
+
#rescue => ee
|
103
|
+
# puts "Exception on method #{__method__}: #{ee}" if @verbose
|
104
|
+
# return nil
|
105
|
+
end
|
106
|
+
|
107
|
+
# get domain root by lookup Top Level Domain list
|
108
|
+
def get_domain_root_by_tlds(host)
|
109
|
+
puts "Third order search - domain root lookup by Top Level Domain list ..." if @verbose
|
110
|
+
root_domain = nil
|
111
|
+
dn = host.split(".")
|
112
|
+
# Comnplete Top Level Domain List - loading once
|
113
|
+
@tlds=file_2_hash(File_tld) if @tlds.nil?
|
114
|
+
# Country code top-level domain list - loading once
|
115
|
+
@cctld=file_2_hash(File_cctld) if @cctld.nil?
|
116
|
+
cc_found=false
|
117
|
+
if @tlds.key?(dn.last)
|
118
|
+
if @cctld.key?(dn[dn.length-2])
|
119
|
+
cc_found=true
|
120
|
+
end
|
121
|
+
if cc_found
|
122
|
+
root_domain=dn[dn.length-3] + "." + dn[dn.length-2] + "." + dn.last
|
123
|
+
else
|
124
|
+
root_domain=dn[dn.length-2] + "." + dn.last
|
125
|
+
end
|
126
|
+
end
|
127
|
+
return root_domain
|
128
|
+
#rescue => ee
|
129
|
+
# puts "Exception on method #{__method__}: #{ee}" if @verbose
|
130
|
+
# return nil
|
131
|
+
end
|
132
|
+
|
104
133
|
# 'setter' to parse and load the known country code second level domain table from the file
|
105
134
|
# data structure example: {"uk" =>["co","plc"],"za"=>["mil","nom","org"]}
|
106
135
|
def load_ccsld_from_file (file_ccsld)
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
ccsld[key].push(val) unless key.nil?
|
121
|
-
end
|
136
|
+
ccsld=Hash.new
|
137
|
+
puts "Loading known country code second level domain list from file: #{file_ccsld}" if @verbose
|
138
|
+
f=File.open(file_ccsld, 'r:ISO-8859-1:UTF-8') # transcoded magic bit
|
139
|
+
f.each do |line|
|
140
|
+
next unless line =~ /^\s+\.\w/
|
141
|
+
line=line.chomp.strip.downcase
|
142
|
+
entry=line.split(' ')[0].split('.')
|
143
|
+
if entry.length > 2
|
144
|
+
key=entry.last
|
145
|
+
ccsld[key] = Array.new if not ccsld.key?(key)
|
146
|
+
val=entry[entry.length-2]
|
147
|
+
#puts "Loading country code second level domain table with - Country code: #{key}, Second level domain: #{val}" if @verbose
|
148
|
+
ccsld[key].push(val) unless key.nil?
|
122
149
|
end
|
123
|
-
f.close
|
124
|
-
# Sort the blocks once in descendant order once for better performance
|
125
|
-
return ccsld
|
126
|
-
rescue => ee
|
127
|
-
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
128
150
|
end
|
151
|
+
f.close
|
152
|
+
# Sort the blocks once in descendant order once for better performance
|
153
|
+
return ccsld
|
154
|
+
rescue => ee
|
155
|
+
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
129
156
|
end
|
130
157
|
|
131
158
|
# Test a host string to see if it's a valid Internet root domain
|
132
159
|
def is_domain_root? (domain)
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
return false
|
140
|
-
end
|
160
|
+
puts "Validate the domain name is valid: #{domain}" if @verbose
|
161
|
+
domain=domain.strip.downcase
|
162
|
+
return domain == get_domain_root(domain)
|
163
|
+
rescue => ee
|
164
|
+
puts "Exception on method #{__method__} for #{domain}: #{ee}" if @verbose
|
165
|
+
return false
|
141
166
|
end
|
142
167
|
alias_method :is_root_domain?, :is_domain_root?
|
143
168
|
alias_method :is_domain?, :is_domain_root?
|
@@ -146,39 +171,40 @@ module Wmap
|
|
146
171
|
# Function to retrieve the sub-domain from a Fully Qualified Domain Name(FQDN), for example, "www.secure.telegraph.co.uk" -> "secure.telegraph.co.uk"
|
147
172
|
def get_sub_domain (host)
|
148
173
|
puts "Retrieve sub-domain from host: #{host}" if @verbose
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
else
|
160
|
-
return nil
|
161
|
-
end
|
162
|
-
rescue Exception => ee
|
163
|
-
puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
|
174
|
+
subdomain=String.new
|
175
|
+
host=host.strip.downcase
|
176
|
+
domain=get_domain_root(host)
|
177
|
+
record_h=host.split(".")
|
178
|
+
record_d=domain.split(".")
|
179
|
+
if (record_h.length - record_d.length) >= 2
|
180
|
+
subdomain=record_h[record_h.length-record_d.length-1]+"."+domain
|
181
|
+
puts "Sub domain found: #{subdomain}" if @verbose
|
182
|
+
return subdomain
|
183
|
+
else
|
164
184
|
return nil
|
165
185
|
end
|
186
|
+
rescue Exception => ee
|
187
|
+
puts "Exception on method #{__method__} for #{host}: #{ee}" if @verbose
|
188
|
+
return nil
|
166
189
|
end
|
167
190
|
alias_method :get_subdomain, :get_sub_domain
|
168
191
|
|
169
192
|
# Function to print instance variable - General top level domain list
|
170
193
|
def print_gtld
|
171
194
|
puts @gtld
|
195
|
+
return @gtld
|
172
196
|
end
|
173
197
|
|
174
198
|
# Function to print instance variable - Country code top-level domain list
|
175
199
|
def print_cctld
|
176
200
|
puts @cctld
|
201
|
+
return @cctld
|
177
202
|
end
|
178
203
|
|
179
204
|
# Function to print instance variable - Country code second-level domain list
|
180
205
|
def print_ccsld
|
181
206
|
puts @ccsld
|
207
|
+
return @ccsld
|
182
208
|
end
|
183
209
|
|
184
210
|
private :load_ccsld_from_file
|
data/lib/wmap/utils/url_magic.rb
CHANGED
@@ -5,13 +5,17 @@
|
|
5
5
|
#
|
6
6
|
# Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
|
7
7
|
#++
|
8
|
-
|
8
|
+
require "watir"
|
9
|
+
require "selenium-webdriver"
|
9
10
|
|
10
11
|
module Wmap
|
11
|
-
module Utils
|
12
|
-
module UrlMagic
|
12
|
+
module Utils
|
13
|
+
module UrlMagic
|
13
14
|
extend self
|
14
15
|
|
16
|
+
# set hard stop limit of http time-out to 8 seconds, in order to avoid severe performance penalty for certain 'weird' site(s)
|
17
|
+
Max_http_timeout=8000
|
18
|
+
|
15
19
|
# Simple sanity check on a 'claimed' URL string.
|
16
20
|
def is_url?(url)
|
17
21
|
puts "Validate the URL format is valid: #{url}" if @verbose
|
@@ -33,7 +37,7 @@ module Wmap
|
|
33
37
|
return false
|
34
38
|
end
|
35
39
|
end
|
36
|
-
|
40
|
+
|
37
41
|
# Simple sanity check on a 'claimed' SSL enabled URL string
|
38
42
|
def is_ssl?(url)
|
39
43
|
puts "Validate if SSL is enabled on: #{url}" if @verbose
|
@@ -49,8 +53,8 @@ module Wmap
|
|
49
53
|
return false
|
50
54
|
end
|
51
55
|
end
|
52
|
-
alias_method :is_https?, :is_ssl?
|
53
|
-
|
56
|
+
alias_method :is_https?, :is_ssl?
|
57
|
+
|
54
58
|
# Simple sanity check on a 'claimed' web site base string.
|
55
59
|
def is_site?(url)
|
56
60
|
puts "Validate the website string format for: #{url}" if @verbose
|
@@ -61,7 +65,7 @@ module Wmap
|
|
61
65
|
return true
|
62
66
|
else
|
63
67
|
return false
|
64
|
-
end
|
68
|
+
end
|
65
69
|
else
|
66
70
|
puts "Unknown site format: #{url}" if @verbose
|
67
71
|
return false
|
@@ -71,40 +75,17 @@ module Wmap
|
|
71
75
|
return nil
|
72
76
|
end
|
73
77
|
end
|
74
|
-
|
75
|
-
# Check if URL is an absolute one
|
76
|
-
#def is_absolute?(url)
|
77
|
-
# puts "Validate if the url is absolute: #{url}" if @verbose
|
78
|
-
# begin
|
79
|
-
# url.strip!
|
80
|
-
# URI.absolute?(url)
|
81
|
-
# rescue => ee
|
82
|
-
# puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
83
|
-
# return false
|
84
|
-
# end
|
85
|
-
#end
|
86
|
-
|
87
|
-
# Check if URL is relative one
|
88
|
-
#def is_relative?(url)
|
89
|
-
# begin
|
90
|
-
# url.strip!
|
91
|
-
# !is_absolute?(url)
|
92
|
-
# rescue => ee
|
93
|
-
# puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
94
|
-
# return false
|
95
|
-
# end
|
96
|
-
#end
|
97
|
-
|
78
|
+
|
98
79
|
# Extract the web server host's Fully Qualified Domain Name (FQDN) from the url. For example: "https://login.yahoo.com/email/help" -> "login.yahoo.com"
|
99
80
|
def url_2_host (url)
|
100
81
|
begin
|
101
82
|
url = url.strip.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
|
102
83
|
record1 = url.split('/')
|
103
84
|
if record1[0].nil?
|
104
|
-
puts "Error process url: #{url}"
|
85
|
+
puts "Error process url: #{url}"
|
105
86
|
return nil
|
106
87
|
else
|
107
|
-
record2 = record1[0].split(':')
|
88
|
+
record2 = record1[0].split(':')
|
108
89
|
return record2[0]
|
109
90
|
end
|
110
91
|
rescue => ee
|
@@ -120,8 +101,8 @@ module Wmap
|
|
120
101
|
ssl = (url =~ /https/i)
|
121
102
|
url = url.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
|
122
103
|
record1 = url.split('/')
|
123
|
-
record2 = record1[0].split(':')
|
124
|
-
if (record2.length == 2)
|
104
|
+
record2 = record1[0].split(':')
|
105
|
+
if (record2.length == 2)
|
125
106
|
puts "The service port: #{record2[1]}" if @verbose
|
126
107
|
return record2[1].to_i
|
127
108
|
elsif ssl
|
@@ -164,13 +145,13 @@ module Wmap
|
|
164
145
|
unless is_fqdn?(host)
|
165
146
|
case host
|
166
147
|
# "https://letmechoose.barclays.co.uk?source=btorganic/" => "https://letmechoose.barclays.co.uk"
|
167
|
-
when /\?|\#/
|
148
|
+
when /\?|\#/
|
168
149
|
host=host.split(%r{\?|\#})[0]
|
169
150
|
else
|
170
151
|
#do nothing
|
171
152
|
end
|
172
153
|
end
|
173
|
-
# step 2, put the host:port pair back to the normal site format
|
154
|
+
# step 2, put the host:port pair back to the normal site format
|
174
155
|
prot="https:" if port==443
|
175
156
|
if port==80 || port==443
|
176
157
|
site=prot+"//"+host+"/"
|
@@ -180,7 +161,7 @@ module Wmap
|
|
180
161
|
if site=~ /http/i
|
181
162
|
#puts "Base found: #{site}" if @verbose
|
182
163
|
return site
|
183
|
-
else
|
164
|
+
else
|
184
165
|
raise "Problem encountered on method url_2_site: Unable to convert #{url}"
|
185
166
|
return nil
|
186
167
|
end
|
@@ -202,28 +183,26 @@ module Wmap
|
|
202
183
|
rescue => ee
|
203
184
|
puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
204
185
|
end
|
205
|
-
|
186
|
+
|
206
187
|
end
|
207
|
-
|
188
|
+
|
208
189
|
# Test if the two URLs are both under the same domain: http://login.yahoo.com, http://mail.yahoo.com => true
|
209
190
|
def urls_on_same_domain?(url1, url2)
|
210
191
|
puts "Determine if two URLs under the same domain: #{url1}, #{url2}" if @verbose
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
end
|
219
|
-
end
|
192
|
+
host1=url_2_host(url1)
|
193
|
+
host2=url_2_host(url2)
|
194
|
+
return get_domain_root(host1) == get_domain_root(host2)
|
195
|
+
rescue => ee
|
196
|
+
puts "Error searching the object content: #{ee}" if @verbose
|
197
|
+
return nil
|
198
|
+
end
|
220
199
|
|
221
200
|
# Input is host and open port, output is a URL for valid http response code or nil
|
222
201
|
def host_2_url (host,port=80)
|
223
202
|
puts "Perform simple http(s) service detection on host #{host}, port #{port}" if @verbose
|
224
203
|
begin
|
225
204
|
host=host.strip
|
226
|
-
if port.to_i == 80
|
205
|
+
if port.to_i == 80
|
227
206
|
url_1 = "http://" + host + "/"
|
228
207
|
elsif port.to_i ==443
|
229
208
|
url_1 = "https://" + host + "/"
|
@@ -232,7 +211,7 @@ module Wmap
|
|
232
211
|
url_2 = "https://" + host + ":" + port.to_s + "/"
|
233
212
|
end
|
234
213
|
puts "Please ensure your internet connection is active before running this method: #{__method__}" if @verbose
|
235
|
-
checker=Wmap::UrlChecker.new
|
214
|
+
checker=Wmap::UrlChecker.new
|
236
215
|
if checker.response_code(url_1) != 10000
|
237
216
|
puts "Found URL: #{url_1}" if @verbose
|
238
217
|
return url_1
|
@@ -247,8 +226,8 @@ module Wmap
|
|
247
226
|
puts "Exception on method #{__method__}: #{ee}" if @verbose
|
248
227
|
return nil
|
249
228
|
end
|
250
|
-
end
|
251
|
-
|
229
|
+
end
|
230
|
+
|
252
231
|
# Convert a relative URL to an absolute one. For example, from URL base 'http://games.yahoo.com/' and file path '/game/the-magic-snowman-flash.html' => 'http://games.yahoo.com/game/the-magic-snowman-flash.html'
|
253
232
|
def make_absolute(base, relative_url)
|
254
233
|
puts "Determine and return the absolute URL:\n Base: #{base}, Relative: #{relative_url} " if @verbose
|
@@ -266,12 +245,12 @@ module Wmap
|
|
266
245
|
return nil
|
267
246
|
end
|
268
247
|
end
|
269
|
-
|
248
|
+
|
270
249
|
# Create / construct the absolute URL from a known URL and relative file path. For example, 'http://images.search.yahoo.com/images' + '/search/images?p=raiders' => 'http://images.search.yahoo.com/search/images?p=raiders'
|
271
250
|
def create_absolute_url_from_base(potential_base, relative_url)
|
272
251
|
begin
|
273
252
|
#puts "Determine the absolute URL from potential base #{potential_base} and relative URL #{relative_url}" if @verbose
|
274
|
-
naked_base = url_2_site(potential_base).strip.chop
|
253
|
+
naked_base = url_2_site(potential_base).strip.chop
|
275
254
|
puts "Found absolute URL: #{naked_base+relative_url}" if @verbose
|
276
255
|
return naked_base + relative_url
|
277
256
|
rescue => ee
|
@@ -309,19 +288,19 @@ module Wmap
|
|
309
288
|
return nil
|
310
289
|
end
|
311
290
|
end
|
312
|
-
|
291
|
+
|
313
292
|
# Normalize the URL to a consistent manner in order to determine if a link has been visited or cached before
|
314
293
|
# See http://en.wikipedia.org/wiki/URL_normalization for more explanation
|
315
294
|
def normalize_url(url)
|
316
295
|
begin
|
317
296
|
url.strip!
|
318
|
-
# Converting the scheme and host to lower case in the process, i.e. 'HTTP://www.Example.com/' => 'http://www.example.com/'
|
297
|
+
# Converting the scheme and host to lower case in the process, i.e. 'HTTP://www.Example.com/' => 'http://www.example.com/'
|
319
298
|
# Normalize the base
|
320
|
-
base=url_2_site(url)
|
299
|
+
base=url_2_site(url)
|
321
300
|
# Case#1, remove the trailing dot after the hostname, i.e, 'http://www.yahoo.com./' => 'http://www.yahoo.com/'
|
322
301
|
base=base.sub(/\.\/$/,'/')
|
323
302
|
# Normalize the relative path, case#1
|
324
|
-
# retrieve the file path and remove the first '/' or '.',
|
303
|
+
# retrieve the file path and remove the first '/' or '.',
|
325
304
|
# i.e. 'http://www.example.com/mypath' or 'http://www.example.com/./mypath' => 'mypath'
|
326
305
|
path=url_2_path(url).sub(/^(\/|\.)*/,'')
|
327
306
|
# Normalize the relative path, case#2
|
@@ -337,7 +316,136 @@ module Wmap
|
|
337
316
|
return url
|
338
317
|
end
|
339
318
|
end
|
340
|
-
|
319
|
+
|
320
|
+
|
321
|
+
# Test the URL and return the response code
|
322
|
+
def response_code (url)
|
323
|
+
puts "Check the http response code on the url: #{url}" if @verbose
|
324
|
+
code = 10000 # All unknown url connection exceptions go here
|
325
|
+
raise "Invalid url: #{url}" unless is_url?(url)
|
326
|
+
url=url.strip.downcase
|
327
|
+
timeo = Max_http_timeout/1000.0
|
328
|
+
uri = URI.parse(url)
|
329
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
330
|
+
http.open_timeout = timeo
|
331
|
+
http.read_timeout = timeo
|
332
|
+
if (url =~ /https\:/i)
|
333
|
+
http.use_ssl = true
|
334
|
+
#http.ssl_version = :SSLv3
|
335
|
+
# Bypass the remote web server cert validation test
|
336
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
337
|
+
end
|
338
|
+
request = Net::HTTP::Get.new(uri.request_uri)
|
339
|
+
response = http.request(request)
|
340
|
+
puts "Server response the following: #{response}" if @verbose
|
341
|
+
code = response.code.to_i
|
342
|
+
#response.finish if response.started?()
|
343
|
+
@url_code=Hash.new unless @url_code
|
344
|
+
@url_code[url]=code
|
345
|
+
puts "Response code on #{url}: #{code}" if @verbose
|
346
|
+
return code
|
347
|
+
rescue Exception => ee
|
348
|
+
puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
|
349
|
+
case ee
|
350
|
+
# rescue "Connection reset by peer" error type
|
351
|
+
when Errno::ECONNRESET
|
352
|
+
code=104
|
353
|
+
when Errno::ECONNABORTED,Errno::ETIMEDOUT
|
354
|
+
#code=10000
|
355
|
+
when Timeout::Error # Quick fix
|
356
|
+
if (url =~ /https\:/i) # try again for ssl timeout session, in case of default :TLSv1 failure
|
357
|
+
http.ssl_version = :SSLv3
|
358
|
+
response = http.request(request)
|
359
|
+
code = response.code.to_i
|
360
|
+
unless code.nil?
|
361
|
+
@ssl_version = http.ssl_version
|
362
|
+
end
|
363
|
+
end
|
364
|
+
else
|
365
|
+
#code=10000
|
366
|
+
end
|
367
|
+
@url_code=Hash.new unless @url_code
|
368
|
+
@url_code[url]=code
|
369
|
+
return code
|
370
|
+
end
|
371
|
+
|
372
|
+
# Given an URL, open the page, then return the DOM text from a normal user perspective
|
373
|
+
def open_page(url)
|
374
|
+
args = {ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, allow_redirections: :safe, read_timeout: Max_http_timeout/1000}
|
375
|
+
doc = Nokogiri::HTML(open(url, args))
|
376
|
+
if doc.text.include?("Please enable JavaScript to view the page content")
|
377
|
+
puts "Invoke headless chrome through webdriver ..." if @verbose
|
378
|
+
#Selenium::WebDriver::Chrome.path = "/usr/local/bin/chromedriver"
|
379
|
+
#driver = Selenium::WebDriver.for :chrome
|
380
|
+
# http://watir.com/guides/chrome/
|
381
|
+
args = ['--ignore-certificate-errors', '--disable-popup-blocking', '--disable-translate']
|
382
|
+
browser = Watir::Browser.new :chrome, headless: true, options: {args: args}
|
383
|
+
browser.goto(url)
|
384
|
+
sleep(2) # wait for the loading
|
385
|
+
doc = Nokogiri::HTML(browser.html)
|
386
|
+
browser.close
|
387
|
+
end
|
388
|
+
puts doc.text if @verbose
|
389
|
+
return doc
|
390
|
+
rescue => ee
|
391
|
+
puts "Exception on method #{__method__} for #{url}: #{ee}"
|
392
|
+
browser.close unless browser.nil?
|
393
|
+
return doc.text
|
394
|
+
end
|
395
|
+
|
396
|
+
# Test the URL / site and return the redirection location (3xx response code only)
|
397
|
+
def redirect_location (url)
|
398
|
+
puts "Test the redirection location for the url: #{url}" if @verbose
|
399
|
+
location=""
|
400
|
+
raise "Invalid url: #{url}" unless is_url?(url)
|
401
|
+
url=url.strip.downcase
|
402
|
+
timeo = Max_http_timeout/1000.0
|
403
|
+
uri = URI.parse(url)
|
404
|
+
code = response_code (url)
|
405
|
+
if code >= 300 && code < 400
|
406
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
407
|
+
http.open_timeout = timeo
|
408
|
+
http.read_timeout = timeo
|
409
|
+
if (url =~ /https\:/i)
|
410
|
+
http.use_ssl = true
|
411
|
+
# Bypass the remote web server cert validation test
|
412
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
413
|
+
http.ssl_version = @ssl_version
|
414
|
+
end
|
415
|
+
request = Net::HTTP::Get.new(uri.request_uri)
|
416
|
+
response = http.request(request)
|
417
|
+
puts "Response: #{response}" if @verbose
|
418
|
+
case response
|
419
|
+
when Net::HTTPRedirection then
|
420
|
+
location = response['location']
|
421
|
+
end
|
422
|
+
end
|
423
|
+
return location
|
424
|
+
rescue Exception => ee
|
425
|
+
puts "Exception on method redirect_location for URL #{url}: #{ee}" if @verbose
|
426
|
+
return ""
|
427
|
+
end
|
428
|
+
alias_method :location, :redirect_location
|
429
|
+
|
430
|
+
# Test the URL / Site and return the landing url location (recursive with the depth = 4 )
|
431
|
+
def landing_location (depth=5, url)
|
432
|
+
depth -= 1
|
433
|
+
return url if depth < 1
|
434
|
+
timeo = Max_http_timeout/1000.0
|
435
|
+
uri = URI.parse(url)
|
436
|
+
code = response_code (url)
|
437
|
+
if code >= 300 && code < 400
|
438
|
+
url = redirect_location (url)
|
439
|
+
url = landing_location(depth,url)
|
440
|
+
else
|
441
|
+
return url
|
442
|
+
end
|
443
|
+
return url
|
444
|
+
rescue Exception => ee
|
445
|
+
puts "Exception on method #{__method__} on URL #{url}: #{ee}" if @verbose
|
446
|
+
end
|
447
|
+
|
448
|
+
|
341
449
|
end
|
342
450
|
end
|
343
451
|
end
|