wmap 2.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (141) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +141 -0
  3. data/LICENSE.txt +15 -0
  4. data/README.rdoc +98 -0
  5. data/TODO +13 -0
  6. data/bin/deprime +21 -0
  7. data/bin/distrust +38 -0
  8. data/bin/googleBot +23 -0
  9. data/bin/prime +21 -0
  10. data/bin/refresh +26 -0
  11. data/bin/run_tests +16 -0
  12. data/bin/spiderBot +26 -0
  13. data/bin/trust +38 -0
  14. data/bin/updateAll +57 -0
  15. data/bin/wadd +25 -0
  16. data/bin/wadds +26 -0
  17. data/bin/wcheck +28 -0
  18. data/bin/wdel +25 -0
  19. data/bin/wdump +21 -0
  20. data/bin/wmap +151 -0
  21. data/bin/wscan +32 -0
  22. data/data/cidrs +2 -0
  23. data/data/deactivated_sites +1 -0
  24. data/data/domains +2 -0
  25. data/data/hosts +1 -0
  26. data/data/prime_hosts +1 -0
  27. data/data/sites +2 -0
  28. data/data/sub_domains +2 -0
  29. data/demos/bruter.rb +27 -0
  30. data/demos/dns_brutes.rb +28 -0
  31. data/demos/filter_cidr.rb +18 -0
  32. data/demos/filter_crawls.rb +5 -0
  33. data/demos/filter_domain.rb +25 -0
  34. data/demos/filter_geoip.rb +26 -0
  35. data/demos/filter_known_services.rb +59 -0
  36. data/demos/filter_netinfo.rb +23 -0
  37. data/demos/filter_prime.rb +25 -0
  38. data/demos/filter_profiler.rb +3 -0
  39. data/demos/filter_redirection.rb +19 -0
  40. data/demos/filter_site.rb +40 -0
  41. data/demos/filter_siteip.rb +31 -0
  42. data/demos/filter_status.rb +17 -0
  43. data/demos/filter_timestamp.rb +23 -0
  44. data/demos/filter_url.rb +19 -0
  45. data/demos/new_fnd.rb +66 -0
  46. data/demos/nmap_parser.pl +138 -0
  47. data/demos/site_format.rb +18 -0
  48. data/demos/whois_domain.rb +78 -0
  49. data/dicts/GeoIP.dat +0 -0
  50. data/dicts/GeoIPASNum.dat +0 -0
  51. data/dicts/GeoLiteCity.dat +0 -0
  52. data/dicts/ccsld.txt +2646 -0
  53. data/dicts/cctld.txt +243 -0
  54. data/dicts/gtld.txt +25 -0
  55. data/dicts/hostnames-dict.big +1402 -0
  56. data/dicts/hostnames-dict.txt +101 -0
  57. data/lib/wmap/cidr_tracker.rb +327 -0
  58. data/lib/wmap/dns_bruter.rb +308 -0
  59. data/lib/wmap/domain_tracker/sub_domain.rb +142 -0
  60. data/lib/wmap/domain_tracker.rb +342 -0
  61. data/lib/wmap/geoip_tracker.rb +72 -0
  62. data/lib/wmap/google_search_scraper.rb +177 -0
  63. data/lib/wmap/host_tracker/primary_host.rb +130 -0
  64. data/lib/wmap/host_tracker.rb +550 -0
  65. data/lib/wmap/network_profiler.rb +144 -0
  66. data/lib/wmap/port_scanner.rb +208 -0
  67. data/lib/wmap/site_tracker/deactivated_site.rb +85 -0
  68. data/lib/wmap/site_tracker.rb +937 -0
  69. data/lib/wmap/url_checker.rb +314 -0
  70. data/lib/wmap/url_crawler.rb +381 -0
  71. data/lib/wmap/utils/domain_root.rb +184 -0
  72. data/lib/wmap/utils/logger.rb +53 -0
  73. data/lib/wmap/utils/url_magic.rb +343 -0
  74. data/lib/wmap/utils/utils.rb +333 -0
  75. data/lib/wmap/whois.rb +76 -0
  76. data/lib/wmap.rb +227 -0
  77. data/logs/wmap.log +17 -0
  78. data/ruby_whois_patches/base_cocca2.rb +149 -0
  79. data/ruby_whois_patches/kero.yachay.pe.rb +120 -0
  80. data/ruby_whois_patches/whois.PublicDomainRegistry.com.rb +124 -0
  81. data/ruby_whois_patches/whois.above.com.rb +61 -0
  82. data/ruby_whois_patches/whois.adamsnames.tc.rb +107 -0
  83. data/ruby_whois_patches/whois.aeda.net.ae.rb +105 -0
  84. data/ruby_whois_patches/whois.ai.rb +112 -0
  85. data/ruby_whois_patches/whois.arnes.si.rb +121 -0
  86. data/ruby_whois_patches/whois.ascio.com.rb +91 -0
  87. data/ruby_whois_patches/whois.cnnic.cn.rb +123 -0
  88. data/ruby_whois_patches/whois.corporatedomains.com.rb +67 -0
  89. data/ruby_whois_patches/whois.crsnic.net.rb +108 -0
  90. data/ruby_whois_patches/whois.denic.de.rb +174 -0
  91. data/ruby_whois_patches/whois.dk-hostmaster.dk.rb +120 -0
  92. data/ruby_whois_patches/whois.dns.be.rb +134 -0
  93. data/ruby_whois_patches/whois.dns.lu.rb +129 -0
  94. data/ruby_whois_patches/whois.dns.pl.rb +150 -0
  95. data/ruby_whois_patches/whois.dns.pt.rb +119 -0
  96. data/ruby_whois_patches/whois.domain.kg.rb +126 -0
  97. data/ruby_whois_patches/whois.domainregistry.my.rb +123 -0
  98. data/ruby_whois_patches/whois.domreg.lt.rb +110 -0
  99. data/ruby_whois_patches/whois.dot.tk.rb +140 -0
  100. data/ruby_whois_patches/whois.hkirc.hk.rb +121 -0
  101. data/ruby_whois_patches/whois.isnic.is.rb +130 -0
  102. data/ruby_whois_patches/whois.je.rb +119 -0
  103. data/ruby_whois_patches/whois.jprs.jp.rb +137 -0
  104. data/ruby_whois_patches/whois.kenic.or.ke.rb +140 -0
  105. data/ruby_whois_patches/whois.markmonitor.com.rb +118 -0
  106. data/ruby_whois_patches/whois.melbourneit.com.rb +58 -0
  107. data/ruby_whois_patches/whois.nic.as.rb +96 -0
  108. data/ruby_whois_patches/whois.nic.at.rb +109 -0
  109. data/ruby_whois_patches/whois.nic.ch.rb +141 -0
  110. data/ruby_whois_patches/whois.nic.cl.rb +117 -0
  111. data/ruby_whois_patches/whois.nic.ec.rb +157 -0
  112. data/ruby_whois_patches/whois.nic.im.rb +120 -0
  113. data/ruby_whois_patches/whois.nic.it.rb +170 -0
  114. data/ruby_whois_patches/whois.nic.lv.rb +116 -0
  115. data/ruby_whois_patches/whois.nic.ly.rb +127 -0
  116. data/ruby_whois_patches/whois.nic.mu.rb +27 -0
  117. data/ruby_whois_patches/whois.nic.mx.rb +123 -0
  118. data/ruby_whois_patches/whois.nic.net.sa.rb +111 -0
  119. data/ruby_whois_patches/whois.nic.or.kr.rb +101 -0
  120. data/ruby_whois_patches/whois.nic.tel.rb +129 -0
  121. data/ruby_whois_patches/whois.nic.tr.rb +133 -0
  122. data/ruby_whois_patches/whois.nic.us.rb +129 -0
  123. data/ruby_whois_patches/whois.nic.ve.rb +135 -0
  124. data/ruby_whois_patches/whois.norid.no.rb +127 -0
  125. data/ruby_whois_patches/whois.pandi.or.id.rb +118 -0
  126. data/ruby_whois_patches/whois.psi-usa.info.rb +63 -0
  127. data/ruby_whois_patches/whois.registro.br.rb +109 -0
  128. data/ruby_whois_patches/whois.registrygate.com.rb +55 -0
  129. data/ruby_whois_patches/whois.rrpproxy.net.rb +61 -0
  130. data/ruby_whois_patches/whois.sgnic.sg.rb +130 -0
  131. data/ruby_whois_patches/whois.srs.net.nz.rb +166 -0
  132. data/ruby_whois_patches/whois.tucows.com.rb +70 -0
  133. data/ruby_whois_patches/whois.twnic.net.tw.rb +133 -0
  134. data/settings/discovery_ports +24 -0
  135. data/settings/google_keywords.txt +9 -0
  136. data/settings/google_locator.txt +23 -0
  137. data/test/domain_tracker_test.rb +31 -0
  138. data/test/utils_test.rb +168 -0
  139. data/version.txt +13 -0
  140. data/wmap.gemspec +49 -0
  141. metadata +202 -0
@@ -0,0 +1,342 @@
1
+ #--
2
+ # Wmap
3
+ #
4
+ # A pure Ruby library for the Internet web application discovery and tracking.
5
+ #
6
+ # Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
7
+ #++
8
+ require "parallel"
9
+ #require "singleton"
10
+
11
+
12
+ # Class to track the known (trusted) Internet domains
13
+ class Wmap::DomainTracker
14
+ include Wmap::Utils
15
+ #include Singleton
16
+
17
+
18
+ attr_accessor :verbose, :max_parallel, :domains_file, :file_domains, :data_dir
19
+ attr_reader :known_internet_domains
20
+
21
+ # Set default instance variables
22
# Set up the tracker and load the domain cache table from the local file.
#
# @param params [Hash] optional settings
#   :verbose      - print progress messages (default: false)
#   :data_dir     - directory holding the data files (default: ../../data/ relative to this file)
#   :domains_file - path of the domains file (default: 'domains' under :data_dir)
#   :max_parallel - default process count used by bulk_add (default: 40)
def initialize (params = {})
  @verbose = params.fetch(:verbose, false)
  @data_dir = params.fetch(:data_dir, File.dirname(__FILE__) + '/../../data/')
  # File.join also works when :data_dir is given without a trailing separator
  @file_domains = params.fetch(:domains_file, File.join(@data_dir, 'domains'))
  @max_parallel = params.fetch(:max_parallel, 40)
  # Create an empty domains file on first use so that the load below can open it
  File.write(@file_domains, "") unless File.exist?(@file_domains)
  # Hash table to hold the trusted domains (nil when the load fails)
  @known_internet_domains = load_domains_from_file(@file_domains)
end
33
+
34
+ # 'setter' to load the known Internet domains into an instance variable
35
# Read a domains file and build the domain hash table from it.
#
# Each data line has the form "domain, yes|no"; blank lines and lines starting
# with '#' are skipped. The first occurrence of a domain wins.
#
# @param file [String] path of the domains file (default: @file_domains)
# @param lc [Boolean] downcase every line before parsing when exactly true
# @return [Hash{String=>Boolean}, nil] domain => flag taken from the second field
#   (true when it contains "yes"), or nil when the file cannot be read
def load_domains_from_file (file=@file_domains, lc=true)
  puts "Loading trusted domain file: #{file}" if @verbose
  begin
    known_internet_domains = {}
    # Block form guarantees the handle is closed even if parsing raises
    File.open(file, 'r') do |f_domains|
      f_domains.each_line do |line|
        puts "Processing line: #{line}" if @verbose
        line = line.strip
        next if line.empty? || line.start_with?('#')
        line = line.downcase if lc == true
        name, flag = line.split(',')
        name = name.to_s.strip
        # Skip malformed lines with no domain field, and keep the first entry on duplicates
        next if name.empty? || known_internet_domains.key?(name)
        known_internet_domains[name] = (flag.to_s =~ /yes/i) ? true : false
      end
    end
    return known_internet_domains
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
66
+
67
+ # Save the current domain hash table into a file
68
# Write the domain hash table to a file, one "domain, yes|no" line per entry,
# sorted by domain name and preceded by two '#' header lines.
#
# @param file_domains [String] destination path (default: @file_domains)
# @param domains [Hash] table to save (default: @known_internet_domains)
# @return [nil] failures are reported (when verbose) rather than raised
def save_domains_to_file!(file_domains=@file_domains, domains=@known_internet_domains)
  puts "Saving the current domains cache table from memory to file: #{file_domains} ..." if @verbose
  begin
    timestamp = Time.now
    # Block form closes the handle even when a write fails part-way
    File.open(file_domains, 'w') do |f|
      f.write "# Local domains file created by class #{self.class} method #{__method__} at: #{timestamp}\n"
      f.write "# domain name, free zone transfer detected?\n"
      domains.keys.sort.each do |key|
        f.write "#{key}, #{domains[key] ? 'yes' : 'no'}\n"
      end
    end
    puts "Domain cache table is successfully saved: #{file_domains}"
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
  end
end
88
+ alias_method :save!, :save_domains_to_file!
89
+
90
+ # Count numbers of entries in the domain cache table
91
# Count the entries in the domain cache table.
#
# The previous loop iterated the Hash with a single block parameter, so each
# element was a [key, value] Array; matching an Array against a Regexp raises
# NoMethodError on Ruby 3.2+, which made this method return nil. The size of
# the table is what the method is meant to report.
#
# @return [Integer, nil] number of entries, or nil when the table is not loaded
def count
  puts "Counting number of entries in the domain cache table ..."
  begin
    cnt = @known_internet_domains.size
    puts "Current number of entries: #{cnt}"
    return cnt
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
  end
end
106
+ alias_method :size, :count
107
+
108
+ # 'setter' to add domain entry to the cache one at a time
109
# Add a single domain to the cache table.
#
# The host is stripped and downcased first. A host equal to its domain root is
# stored under the root; otherwise, when get_subdomain returns a value that
# differs from the host, that sub-domain is stored. The stored flag is the
# result of zone_transferable? for the stored name.
#
# @param host [String] domain or host name
# @return [Hash, nil] the { name => flag } record that was merged into the
#   table, or nil when nothing was added
def add(host)
  puts "Add entry to the local domains cache table: #{host}" if @verbose
  host = host.strip.downcase
  if @known_internet_domains.key?(host)
    puts "Domain is already exist. Skipping: #{host}"
    return nil
  end
  root = get_domain_root(host)
  sub = get_subdomain(host)
  # Decide which name (if any) gets recorded for this host
  target = nil
  if host == root
    target = root
  elsif sub.nil?
    return nil
  elsif host != sub
    target = sub
  else
    puts "Problem add domain #{host} - please use legal root domain or sub domain only."
    return nil
  end
  record = { target => (zone_transferable?(target) ? true : false) }
  puts "Entry loaded: #{record}"
  @known_internet_domains.merge!(record)
  return record
end
151
+
152
+ # 'setter' to add domain entry to the cache in batch (from a file)
153
# Add every domain listed in a file to the cache table.
#
# The file is read with file_2_list and the resulting list is handed to bulk_add.
#
# @param file [String] path of the file holding the domains
# @return [Object, nil] whatever bulk_add returns, or nil when the file is
#   missing or an error occurs
def file_add(file)
  begin
    puts "Add entries to the local domains cache table from file: #{file}" if @verbose
    unless File.exist?(file)
      raise "File non-exist. Please check your file path and name again: #{file}"
    end
    bulk_add(file_2_list(file))
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
  end
end
164
+
165
+ # 'setter' to add domain entry to the cache in batch (from a list)
166
# Add a list of domains to the cache table, running add for each entry through
# Parallel.map with :in_processes => num, then merging the returned records.
#
# NOTE(review): the records are merged into @known_internet_domains again here,
# presumably because add runs in worker processes — confirm.
#
# @param list [Array<String>] domains to add
# @param num [Integer] number of worker processes (default: @max_parallel)
# @return [Hash, nil] the merged records (empty for an empty list), or nil on error
def bulk_add(list, num=@max_parallel)
  puts "Add entries to the local domains cache table from list: #{list}" if @verbose
  begin
    results = {}
    if list.size > 0
      outcomes = Parallel.map(list, :in_processes => num) { |target| add(target) }
      outcomes.each do |outcome|
        # add returns nil when nothing was stored
        next if outcome.nil? || outcome.empty?
        results.merge!(outcome)
      end
      @known_internet_domains.merge!(results)
      puts "Done loading entries."
    else
      puts "Error: no entry is loaded. Please check your list and try again."
    end
    return results
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
  end
end
194
+ alias_method :adds, :bulk_add
195
+
196
+ # 'setter' to remove entry from the cache one at a time
197
# Remove a single domain from the cache table.
#
# @param domain [String] domain name; stripped and downcased before the lookup
# @return [String, nil] the normalized domain when it was removed, nil when it
#   was not in the table or an error occurred
def delete(domain)
  puts "Remove entry from the domains cache table: #{domain} " if @verbose
  begin
    domain = domain.strip.downcase
    unless @known_internet_domains.key?(domain)
      puts "Entry not fund. Skipping: #{domain}"
      return nil
    end
    @known_internet_domains.delete(domain)
    puts "Entry cleared: #{domain}"
    return domain
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
  end
end
212
+
213
+ # 'setter' to delete domain entries from the cache in batch (from a list)
214
# Remove a list of domains from the cache table by calling delete on each one.
#
# @param list [Array<String>] domains to remove
# @return [Array<String>, nil] the domains that delete reported as removed, or
#   nil for an empty list or on error
def bulk_delete(list)
  puts "Delete entries to the local domains cache table from list: #{list}" if @verbose
  begin
    if list.size > 0
      # delete returns nil for entries it did not remove; keep only the hits
      changes = list.map { |entry| delete(entry) }.compact
      puts "Done deleting domains from list: #{list}"
      return changes
    end
    puts "Exception on method bulk_delete: no entry is loaded. Please check your list and try again."
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
  end
end
233
+ alias_method :dels, :bulk_delete
234
+
235
+ # 'setter' to delete domain entries from the cache in batch (from a file)
236
# Remove every domain listed in a file from the cache table.
#
# The file is read with file_2_list and the resulting list is handed to bulk_delete.
#
# @param file [String] path of the file holding the domains
# @return [Object, nil] whatever bulk_delete returns, or nil when the file is
#   missing or an error occurs
def file_delete(file)
  begin
    puts "Delete entries to the local domains cache table from file: #{file}" if @verbose
    unless File.exist?(file)
      raise "File non-exist. Please check your file path and name again: #{file}"
    end
    bulk_delete(file_2_list(file))
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
  end
end
246
+
247
+ # 'setter' to remove all entries from the store
248
# Remove every entry from the cache table by calling delete on each key.
#
# @return [Array, nil] the values returned by delete, one per key, or nil on error
def delete_all
  puts "Delete all entries in the domain store! " if @verbose
  begin
    # Snapshot the keys first: delete mutates the table while we walk the list
    domains = @known_internet_domains.keys
    domains.map { |domain| delete(domain) }
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
  end
end
258
+
259
+ # Refresh the domain entry one at a time
260
# Refresh one known domain by deleting it and adding it again.
#
# Aborts the program when the domain table is nil.
#
# @param domain [String] domain name; stripped and downcased when not nil
# @return [String, nil] the normalized domain on success, nil when the domain
#   is unknown or an error occurs
def refresh(domain)
  begin
    abort "Trusted Internet domain file not loaded properly! " if @known_internet_domains.nil?
    domain = domain.strip.downcase unless domain.nil?
    unless domain_known?(domain)
      puts "Unknown domain: #{domain}"
      return nil
    end
    delete(domain)
    add(domain)
    return domain
  rescue => ee
    puts "Exception on method #{__method__} for #{domain}: #{ee}" if @verbose
    return nil
  end
end
277
+
278
+ # Simple method to check if a domain is already within the domain cache table
279
# Check whether a domain is present in the cache table of the current object.
#
# The table consulted depends on the class name of the receiver:
# @known_internet_domains for "Wmap::DomainTracker" and
# @known_internet_sub_domains for "Wmap::DomainTracker::SubDomain".
#
# @param domain [String, nil] domain name; stripped and downcased when not nil
# @return [Boolean, nil] true/false from the lookup, nil for any other
#   receiver class, false when the lookup raises
def domain_known?(domain)
  begin
    domain = domain.strip.downcase unless domain.nil?
    klass = self.class.name
    return @known_internet_domains.key?(domain) if klass == "Wmap::DomainTracker"
    return @known_internet_sub_domains.key?(domain) if klass == "Wmap::DomainTracker::SubDomain"
    return nil
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
  end
  return false
end
296
+ alias_method :is_known?, :domain_known?
297
+ alias_method :is_domain_known?, :domain_known?
298
+
299
+ # Dump out the list of known domains
300
# Return the list of known domains.
#
# Only StandardError is rescued: rescuing Exception would also swallow
# interrupts and exit requests.
#
# @return [Array<String>, nil] the keys of the domain table, or nil when the
#   table is not loaded
def get_domains
  puts "Retrieve a list of known domain ..." if @verbose
  begin
    return @known_internet_domains.keys
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
309
+ alias_method :dump_domains, :get_domains
310
+ alias_method :dump, :get_domains
311
+
312
+ # Search potential matching domains from the domain store by using simple regular expression. Note that any upper-case char in the search string will be automatically converted into lower case
313
# Search the domain store with a simple regular expression.
#
# The pattern is stripped and downcased, compiled once as a case-insensitive
# Regexp, and matched against every known domain. Only StandardError is
# rescued, so interrupts and exit requests are no longer swallowed.
#
# @param pattern [String] regular expression source
# @return [Array<String>, nil] matching domains in table order, or nil on
#   error (for example an invalid regular expression or a nil pattern)
def search (pattern)
  puts "Search domain store for the regular expression: #{pattern}" if @verbose
  begin
    matcher = /#{pattern.strip.downcase}/i
    return @known_internet_domains.keys.select { |key| key =~ matcher }
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    return nil
  end
end
329
+ alias_method :find, :search
330
+
331
+ # Print summary report on all known / trust domains in the domain cache table
332
# Print a summary report of all known domains, sorted by name, one per line,
# between a header line and a footer line.
#
# @return [nil]
def print_known_domains
  puts "\nSummary of known Internet Domains:"
  sorted = @known_internet_domains.keys.sort
  sorted.each { |domain| puts domain }
  puts "End of the summary"
end
339
+ alias_method :print, :print_known_domains
340
+
341
+ private :load_domains_from_file
342
+ end
@@ -0,0 +1,72 @@
1
+ #--
2
+ # Wmap
3
+ #
4
+ # A pure Ruby library for Internet web application discovery and tracking.
5
+ #
6
+ # Copyright (c) 2012-2015 Yang Li <yang.li@owasp.org>
7
+ #++
8
+ require "geoip"
9
+
10
+
11
+ # Wrapper class of the 'GeoIP' library - http://geoip.rubyforge.org/
12
+ # For detail explanation of Geographic information of an IP address (GeoIP) and its data repository, please refer to the vendor MaxMind (http://www.maxmind.com)
13
+ class Wmap::GeoIPTracker
14
+ include Wmap::Utils
15
+
16
+ attr_accessor :db, :verbose
17
+
18
+ # This product includes GeoLite data created by MaxMind, available from
19
+ # <a href="http://www.maxmind.com">http://www.maxmind.com</a>.
20
+ Db_city=File.dirname(__FILE__)+"/../../dicts/GeoLiteCity.dat"
21
+ Db_asn=File.dirname(__FILE__)+"/../../dicts/GeoIPASNum.dat"
22
+ Db_country=File.dirname(__FILE__)+"/../../dicts/GeoIP.dat"
23
+
24
+ # Set default instance variables
25
# Set the default instance variables.
#
# @param params [Hash] optional settings
#   :verbose - print progress messages (default: false)
#   :db      - value stored in @db (default: Db_city)
def initialize (params = {})
  verbose = params.fetch(:verbose, false)
  db = params.fetch(:db, Db_city)
  @verbose = verbose
  @db = db
end
29
+
30
+ # Wrapper for the Ruby GeoIP City class - return data structure below on successful lookup
31
+ # Struct.new(:request, :ip, :country_code2, :country_code3, :country_name, :continent_code, :region_name, :city_name, :postal_code, :latitude, :longitude, :dma_code, :area_code, :timezone)
32
# Perform a GeoIP city lookup through GeoIP.new(Db_city).city.
#
# Only StandardError is rescued: rescuing Exception would also swallow
# interrupts and exit requests.
#
# @param object [String] IP address or host name; stripped before use and
#   accepted only when is_ip? or is_fqdn? approves it
# @return [Object, nil] the result of the GeoIP city call, or nil when the
#   input is rejected or the lookup raises
def city(object)
  puts "Perform GeoIP city lookup on: #{object}" if @verbose
  begin
    object = object.strip
    unless is_ip?(object) || is_fqdn?(object)
      raise "Unknown object format - only valid hostname or IP is accepted: #{object}"
    end
    GeoIP.new(Db_city).city(object)
  rescue StandardError
    puts "Exception on method city: #{object}" if @verbose
    return nil
  end
end
43
+ alias_method :query, :city
44
+
45
+ # Wrapper for the Ruby GeoIP Country class - return data structure below on successful lookup
46
+ # Struct.new(:request, :ip, :country_code, :country_code2, :country_code3, :country_name, :continent_code)
47
# Perform a GeoIP country lookup through GeoIP.new(Db_country).country.
#
# Only StandardError is rescued: rescuing Exception would also swallow
# interrupts and exit requests.
#
# @param object [String] IP address or host name; stripped before use and
#   accepted only when is_ip? or is_fqdn? approves it
# @return [Object, nil] the result of the GeoIP country call, or nil when the
#   input is rejected or the lookup raises
def country(object)
  puts "Perform GeoIP country lookup on: #{object}" if @verbose
  begin
    object = object.strip
    unless is_ip?(object) || is_fqdn?(object)
      raise "Unknown object format - only valid hostname or IP is accepted: #{object}"
    end
    GeoIP.new(Db_country).country(object)
  rescue StandardError
    puts "Exception on method country: #{object}" if @verbose
    return nil
  end
end
58
+
59
+ # Wrapper for the Ruby GeoIP ASN class - return data structure below on successful lookup
60
+ # Struct.new(:number, :asn)
61
# Perform a GeoIP ASN lookup through GeoIP.new(Db_asn).asn.
#
# Only StandardError is rescued: rescuing Exception would also swallow
# interrupts and exit requests.
#
# @param object [String] IP address or host name; stripped before use and
#   accepted only when is_ip? or is_fqdn? approves it
# @return [Object, nil] the result of the GeoIP asn call, or nil when the
#   input is rejected or the lookup raises
def asn(object)
  puts "Perform GeoIP ASN lookup on: #{object}" if @verbose
  begin
    object = object.strip
    unless is_ip?(object) || is_fqdn?(object)
      raise "Unknown object format - only valid hostname or IP is accepted: #{object}"
    end
    GeoIP.new(Db_asn).asn(object)
  rescue StandardError
    puts "Exception on method asn: #{object}" if @verbose
    return nil
  end
end
72
+ end
@@ -0,0 +1,177 @@
1
+ #--
2
+ # Wmap
3
+ #
4
+ # A pure Ruby library for Internet web application discovery and tracking.
5
+ #
6
+ # Copyright (c) 2012-2015 Yang Li
7
+ #++
8
+ require 'open-uri'
9
+ require 'nokogiri'
10
+
11
+
12
+ # We build our own Google search class by querying Google search engine from its web interface, by simulating
13
+ # an anonymous web surfer.
14
+ # Note: we don't use the native Google API due to its pricing structure - We don't have budget for
15
+ # this project, and we cannot use the free tier because it is limited to 100 queries per day. See https://github.com/google/google-api-ruby-client for details.
16
+ class Wmap::GoogleSearchScraper
17
+ include Wmap::Utils
18
+
19
+ attr_accessor :verbose, :http_timeout, :keyword_list
20
+ attr_reader :discovered_urls_from_scraper, :discovered_sites_from_scraper
21
+
22
+ # Google search engine web interface locators
23
+ File_locator = File.dirname(__FILE__)+'/../../settings/google_locator.txt'
24
+ # Google search key words
25
+ File_keywords = File.dirname(__FILE__)+'/../../settings/google_keywords.txt'
26
+
27
+
28
+ # Scraper default variables
29
# Set the scraper defaults and create the empty discovery stores.
#
# @param params [Hash] optional settings
#   :verbose      - print progress messages (default: false)
#   :http_timeout - value stored in @http_timeout (default: 5000)
def initialize (params = {})
  verbose = params.fetch(:verbose, false)
  timeout = params.fetch(:http_timeout, 5000)
  @verbose = verbose
  @http_timeout = timeout
  # Discovered data stores, filled by extract_links
  @discovered_urls_from_scraper = {}
  @discovered_sites_from_scraper = {}
end
36
+
37
+ # Main worker method to simulate extensive google keyword searches on over 100+ countries and regions. The search will extract known web services related to the keyword by the Google Inc.
38
# Run one keyword against every Google locator listed in File_locator and
# collect the links extracted from each result page.
#
# A locator whose search or link extraction fails (nil result) is skipped
# instead of aborting the whole run. Empty and nil links are removed before
# sorting. Only StandardError is rescued.
#
# @param keyword [String] search keyword; stripped before use
# @return [Array<String>, nil] unique, sorted links, or nil on error
def google_worker (keyword)
  begin
    puts "Start the Google worker for: #{keyword}" if @verbose
    keyword = keyword.strip
    links = file_2_list(File_locator).flat_map do |locator|
      doc = google_search(locator, keyword)
      # Array() turns a nil from extract_links into an empty list
      doc.nil? ? [] : Array(extract_links(doc))
    end
    return (links - ["", nil]).uniq.sort
  rescue => ee
    puts "Exception on the method google_worker for #{keyword}: #{ee}" if @verbose
    return nil
  end
end
54
+ alias_method :worker, :google_worker
55
+ alias_method :search, :google_worker
56
+
57
+ # Main method to collect intelligence from Google's vast data warehouse. It works by hitting the Google engines with the keyword list. This exhaustive method sweeps through the Google engines in over 100 countries and regions one by one, in order to collect all related web service links known to Google, Inc. across the global Internet.
58
# Run google_worker for every keyword and merge the links it returns.
#
# google_worker returns nil when a keyword fails. That nil used to be added to
# the running list, which raised and discarded the links of every other
# keyword; a failed keyword now contributes nothing. Only StandardError is
# rescued.
#
# @param keyword_list [Array<String>] keywords to search
#   (default: the entries of File_keywords read through file_2_list)
# @return [Array<String>, nil] unique, sorted links, or nil on error
def google_workers(keyword_list=file_2_list(File_keywords))
  begin
    puts "Start the Google worker for: #{keyword_list}" if @verbose
    links = keyword_list.flat_map { |keyword| Array(google_worker(keyword)) }
    return links.uniq.sort
  rescue => ee
    puts "Exception on the method google_workers for #{keyword_list}: #{ee}" if @verbose
    return nil
  end
end
71
+ alias_method :workers, :google_workers
72
+
73
+ # Perform a Google web interface keyword search, return as a Nokogiri::HTML:Document object for the search result page
74
# Perform a keyword search against one Google web interface and parse the
# result page with Nokogiri::HTML.
#
# The keyword is escaped with URI.encode_www_form_component: URI::encode was
# removed in Ruby 3.0 and left characters such as '&' unescaped. The page is
# fetched with URI.open when available, because Kernel#open no longer fetches
# URLs on Ruby 3.0+; the block form closes the handle after parsing.
# Only StandardError is rescued.
#
# @param locator [String] base URL of the Google engine; "search?q=" is appended
# @param keyword [String] search keyword
# @return [Object, nil] the value returned by Nokogiri::HTML, or nil on error
def google_search (locator,keyword)
  begin
    puts "Perform the keyword search on the Google web engine for: #{keyword}" if @verbose
    link_search = locator + "search?q=" + URI.encode_www_form_component(keyword)
    doc = if URI.respond_to?(:open)
      URI.open(link_search) { |page| Nokogiri::HTML(page) }
    else
      # Older Rubies: open-uri patches Kernel#open to fetch URLs
      open(link_search) { |page| Nokogiri::HTML(page) }
    end
    return doc
  rescue => ee
    puts "Exception on method google_search at Google engine location #{link_search} for the keyword #{keyword} : #{ee}" if @verbose
  end
end
84
+
85
+ # Search for nodes by css, and extract the hyper links
86
# Extract result links from a search result document.
#
# Every 'a' node whose href contains "/url?" is kept; the "/url?q=" prefix is
# removed from the href and the remainder is recorded as a discovered URL,
# with url_2_site of it recorded as a discovered site. Only StandardError is
# rescued, so interrupts and exit requests are no longer swallowed.
#
# @param doc [Object] document responding to css('a')
# @return [Array<String>, nil] the extracted links in document order, or nil on error
def extract_links (doc)
  begin
    puts "Extract the meaningful links from the DOC." if @verbose
    links = []
    doc.css('a').each do |link|
      ref = link.attribute('href').to_s
      next unless ref =~ /\/url\?/
      my_key = ref.sub(/\/url\?q\=/, '')
      my_site = url_2_site(my_key)
      links.push(my_key)
      @discovered_urls_from_scraper[my_key] = true
      @discovered_sites_from_scraper[my_site] = true
    end
    return links
  rescue => ee
    puts "Exception on method extract_links: #{ee}" if @verbose
    return nil
  end
end
106
+
107
+ # Method to print out discovery URL result
108
# Print the discovered URLs, one per line in store order, followed by a total
# count and a footer line.
#
# @return [nil]
def print_discovered_urls_from_scraper
  puts "Print discovered urls by the scraper. " if @verbose
  begin
    puts "\nSummary Report of Discovered URLs from the Scraper:"
    urls = @discovered_urls_from_scraper.keys
    urls.each { |url| puts url }
    puts "Total: #{urls.size} url(s)"
    puts "End of the summary"
  rescue => ee
    puts "Error on method print_discovered_urls_from_scraper: #{ee}" if @verbose
  end
end
121
+
122
+ # Method to print out discovery Sites result
123
# Print the discovered sites, one per line in store order, followed by a total
# count and a footer line.
#
# @return [nil]
def print_discovered_sites_from_scraper
  puts "Print discovered sites by the scraper. " if @verbose
  begin
    puts "\nSummary Report of Discovered Sites from the Scraper:"
    sites = @discovered_sites_from_scraper.keys
    sites.each { |site| puts site }
    puts "Total: #{sites.size} site(s)"
    puts "End of the summary"
  rescue => ee
    puts "Error on method print_discovered_sites_from_scraper: #{ee}" if @verbose
  end
end
136
+
137
+ # 'getter' for the discovered sites from the Google search
138
# Return the discovered sites as a sorted list.
#
# @return [Array<String>, nil] sorted keys of the site store, or nil on error
def get_discovered_sites_from_scraper
  puts "Getter for the discovered sites by the scraper. " if @verbose
  begin
    sites = @discovered_sites_from_scraper.keys
    return sites.sort
  rescue => ee
    puts "Error on method get_discovered_sites_from_scraper: #{ee}" if @verbose
  end
end
146
+ alias_method :print, :get_discovered_sites_from_scraper
147
+
148
+ # 'getter' for the discovered urls from the Google search
149
# Return the discovered URLs as a sorted list.
#
# @return [Array<String>, nil] sorted keys of the URL store, or nil on error
def get_discovered_urls_from_scraper
  puts "Getter for the discovered urls by the scraper. " if @verbose
  begin
    urls = @discovered_urls_from_scraper.keys
    return urls.sort
  rescue => ee
    puts "Error on method get_discovered_urls_from_scraper: #{ee}" if @verbose
  end
end
157
+
158
+ # Save the discovered sites into a local file
159
# Save the discovered sites into a local file: one '#' header line followed by
# the sites, sorted, one per line.
#
# The file is written through the block form of File.open so the handle is
# closed even when a write fails.
#
# @param file [String] destination path
# @return [nil] failures are reported (when verbose) rather than raised
def save_discovered_sites_from_scraper (file)
  puts "Save the discovery result(sites) into a local file: #{file}" if @verbose
  begin
    timestamp = Time.now
    File.open(file, 'w') do |f|
      f.puts "# Discovery result written by Wmap::GoogleSearchScraper.save_discovered_sites_from_scraper method at #{timestamp}\n"
      @discovered_sites_from_scraper.keys.sort.each { |site| f.puts "#{site}\n" }
    end
    raise "Unknown problem saving the result to file: #{file}" unless File.exist?(file)
    puts "Done saving the discovery result into the local file: #{file}"
  rescue => ee
    puts "Error on method save_discovered_sites_from_scraper: #{ee}" if @verbose
  end
end
173
+ alias_method :save, :save_discovered_sites_from_scraper
174
+
175
+ private
176
+
177
+ end