site-inspector 1.0.2 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +8 -0
  3. data/.rubocop.yml +42 -0
  4. data/.rubocop_todo.yml +139 -0
  5. data/.ruby-version +1 -0
  6. data/.travis.yml +9 -0
  7. data/Gemfile +7 -0
  8. data/Guardfile +10 -0
  9. data/README.md +189 -0
  10. data/Rakefile +10 -0
  11. data/bin/site-inspector +50 -22
  12. data/lib/cliver/dependency_ext.rb +24 -0
  13. data/lib/site-inspector.rb +62 -615
  14. data/lib/site-inspector/cache.rb +10 -51
  15. data/lib/site-inspector/checks/accessibility.rb +135 -0
  16. data/lib/site-inspector/checks/check.rb +54 -0
  17. data/lib/site-inspector/checks/content.rb +85 -0
  18. data/lib/site-inspector/checks/cookies.rb +45 -0
  19. data/lib/site-inspector/checks/dns.rb +138 -0
  20. data/lib/site-inspector/checks/headers.rb +68 -0
  21. data/lib/site-inspector/checks/hsts.rb +81 -0
  22. data/lib/site-inspector/checks/https.rb +40 -0
  23. data/lib/site-inspector/checks/sniffer.rb +67 -0
  24. data/lib/site-inspector/checks/wappalyzer.rb +62 -0
  25. data/lib/site-inspector/checks/whois.rb +36 -0
  26. data/lib/site-inspector/disk_cache.rb +42 -0
  27. data/lib/site-inspector/domain.rb +271 -0
  28. data/lib/site-inspector/endpoint.rb +217 -0
  29. data/lib/site-inspector/rails_cache.rb +13 -0
  30. data/lib/site-inspector/version.rb +5 -0
  31. data/package-lock.json +505 -0
  32. data/package.json +23 -0
  33. data/script/bootstrap +2 -0
  34. data/script/cibuild +11 -0
  35. data/script/console +3 -0
  36. data/script/pa11y-version +10 -0
  37. data/script/release +38 -0
  38. data/site-inspector.gemspec +42 -0
  39. data/spec/checks/site_inspector_endpoint_accessibility_spec.rb +84 -0
  40. data/spec/checks/site_inspector_endpoint_check_spec.rb +42 -0
  41. data/spec/checks/site_inspector_endpoint_content_spec.rb +117 -0
  42. data/spec/checks/site_inspector_endpoint_cookies_spec.rb +73 -0
  43. data/spec/checks/site_inspector_endpoint_dns_spec.rb +184 -0
  44. data/spec/checks/site_inspector_endpoint_headers_spec.rb +65 -0
  45. data/spec/checks/site_inspector_endpoint_hsts_spec.rb +92 -0
  46. data/spec/checks/site_inspector_endpoint_https_spec.rb +49 -0
  47. data/spec/checks/site_inspector_endpoint_sniffer_spec.rb +150 -0
  48. data/spec/checks/site_inspector_endpoint_wappalyzer_spec.rb +34 -0
  49. data/spec/checks/site_inspector_endpoint_whois_spec.rb +26 -0
  50. data/spec/fixtures/wappalyzer.json +125 -0
  51. data/spec/site_inspector_cache_spec.rb +15 -0
  52. data/spec/site_inspector_disk_cache_spec.rb +39 -0
  53. data/spec/site_inspector_domain_spec.rb +271 -0
  54. data/spec/site_inspector_endpoint_spec.rb +252 -0
  55. data/spec/site_inspector_spec.rb +48 -0
  56. data/spec/spec_helper.rb +19 -0
  57. metadata +204 -63
  58. data/lib/site-inspector/compliance.rb +0 -19
  59. data/lib/site-inspector/dns.rb +0 -92
  60. data/lib/site-inspector/headers.rb +0 -59
  61. data/lib/site-inspector/sniffer.rb +0 -26
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
class SiteInspector
  class Endpoint
    # Check exposing the endpoint's response headers, with every header name
    # normalized to lowercase, plus helpers for a few security-related headers.
    #
    # NOTE(review): `response` is not defined in this class; presumably it is
    # supplied by the `Check` superclass — confirm.
    class Headers < Check
      # TODO: kill this
      # @return [Boolean] whether a Strict-Transport-Security header is present
      def strict_transport_security?
        !!strict_transport_security
      end

      # @return [Boolean] whether a Content-Security-Policy header is present
      def content_security_policy?
        !!content_security_policy
      end

      # @return [Boolean] whether an X-Frame-Options header is present
      def click_jacking_protection?
        !!click_jacking_protection
      end

      # The methods below return the raw header value, or nil when absent.

      # TODO: kill this
      def strict_transport_security
        headers['strict-transport-security']
      end

      def content_security_policy
        headers['content-security-policy']
      end

      def click_jacking_protection
        headers['x-frame-options']
      end

      def server
        headers['server']
      end

      def xss_protection
        headers['x-xss-protection']
      end

      # More specific than a presence check: true only when the header value
      # is exactly "1; mode=block".
      def xss_protection?
        xss_protection == '1; mode=block'
      end

      # Returns a hash of header name => value pairs with every name
      # lowercased, or an empty hash when there is no response or no headers.
      # The result is memoized.
      def all
        @all ||= response&.headers ? response.headers.transform_keys(&:downcase) : {}
      end
      alias headers all

      # Looks up a single header. Header names are case-insensitive and the
      # stored keys are lowercased, so the requested name is normalized the
      # same way before the lookup (`headers['Server']` finds "server").
      #
      # @param header [String, Symbol] the header name, in any letter case
      # @return the header value, or nil when the header is absent
      def [](header)
        headers[header.to_s.downcase]
      end

      # Summary of the security-related headers. Missing headers are reported
      # as false, except `server`, which is reported as nil.
      def to_h
        {
          strict_transport_security: strict_transport_security || false,
          content_security_policy: content_security_policy || false,
          click_jacking_protection: click_jacking_protection || false,
          server: server,
          xss_protection: xss_protection || false
        }
      end
    end
  end
end
@@ -0,0 +1,81 @@
1
+ # frozen_string_literal: true
2
+
3
class SiteInspector
  class Endpoint
    # Utility parser for HSTS headers.
    # RFC: http://tools.ietf.org/html/rfc6797
    class Hsts < Check
      # A header is considered valid when it is present and none of its
      # directive names or values contain whitespace or quote characters
      # (surrounding double quotes on a value are stripped before this check).
      def valid?
        return false unless header

        pairs.none? { |key, value| "#{key}#{value}" =~ /[\s'"]/ }
      end

      # The max-age directive as an Integer; 0 when the directive is missing
      # or not numeric (nil.to_i and non-numeric String#to_i both yield 0).
      def max_age
        pairs[:"max-age"].to_i
      end

      def include_subdomains?
        pairs.key?(:includesubdomains)
      end

      def preload?
        pairs.key?(:preload)
      end

      # HSTS is enabled when max-age is greater than zero. `max_age` always
      # returns an Integer, so no nil guard is needed here.
      def enabled?
        max_age.positive?
      end

      # Google's minimum max-age for automatic preloading
      # (10_886_400 seconds = 126 days = 18 weeks).
      def preload_ready?
        include_subdomains? && preload? && max_age >= 10_886_400
      end

      def to_h
        {
          valid: valid?,
          max_age: max_age,
          include_subdomains: include_subdomains?,
          preload: preload?,
          enabled: enabled?,
          preload_ready: preload_ready?
        }
      end

      private

      def headers
        endpoint.headers
      end

      # Raw Strict-Transport-Security header value, or nil when absent.
      def header
        @header ||= headers['strict-transport-security']
      end

      # The header split on ";" (with surrounding whitespace swallowed).
      def directives
        @directives ||= header ? header.split(/\s*;\s*/) : []
      end

      # Parses the directives into a hash of downcased Symbol name => String
      # value (nil for valueless directives such as "preload").
      def pairs
        @pairs ||= directives.each_with_object({}) do |directive, parsed|
          key, value = directive.downcase.split('=')

          # An empty directive (e.g. the gap in "max-age=1;;preload" or a
          # leading ";") splits to nothing. The RFC 6797 grammar permits
          # empty directives, so skip them instead of calling nil.to_sym.
          next if key.nil?

          if /".*"/.match?(value)
            value = value.sub(/^"/, '')
            value = value.sub(/"$/, '')
          end

          parsed[key.to_sym] = value
        end
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
class SiteInspector
  class Endpoint
    # Check describing the HTTPS status of an endpoint, based on the request
    # scheme and the response's `return_code`.
    #
    # NOTE(review): `request` and `response` are not defined in this class;
    # presumably they come from the `Check` superclass — confirm. The return
    # code symbols look like libcurl result codes surfaced by Typhoeus —
    # confirm.
    class Https < Check
      # @return [Boolean] whether the request was made over https
      def scheme?
        scheme == 'https'
      end

      # Truthy when the endpoint is https and the response completed with
      # return code :ok. Falsy (false or nil) otherwise.
      def valid?
        scheme? && response && response.return_code == :ok
      end

      # Truthy when the endpoint is https and the return code is :ssl_cacert.
      def bad_chain?
        scheme? && response && response.return_code == :ssl_cacert
      end

      # Truthy when the endpoint is https and the return code is
      # :peer_failed_verification.
      def bad_name?
        scheme? && response && response.return_code == :peer_failed_verification
      end

      def inspect
        "#<SiteInspector::Endpoint::Https valid=#{valid?}>"
      end

      # @return [Hash] `valid` and the raw `return_code`. The return code is
      #   nil when there is no response, matching the nil guards used by the
      #   predicates above instead of raising NoMethodError.
      def to_h
        {
          valid: valid?,
          return_code: response&.return_code
        }
      end

      private

      # Scheme of the request's base URL (memoized).
      def scheme
        @scheme ||= request.base_url.scheme
      end
    end
  end
end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
class SiteInspector
  class Endpoint
    # Check that guesses which technologies power an endpoint, combining the
    # Sniffles gem's content sniffing with a few cookie/header heuristics.
    class Sniffer < Check
      # Frameworks reported by `framework` that count as open source.
      # The first five come from Sniffles; the last three are detected by the
      # heuristics in this class.
      OPEN_SOURCE_FRAMEWORKS = %i[
        drupal
        joomla
        movabletype
        phpbb
        wordpress
        php
        expression_engine
        cowboy
      ].freeze

      # Returns the detected framework as a Symbol, or nil when nothing
      # matches. A CMS found by Sniffles wins; otherwise cookies and the
      # Server header are consulted in a fixed order.
      def framework
        detected = sniff(:cms)
        return detected unless detected.nil?

        if endpoint.cookies.any? { |cookie| cookie.keys.first =~ /^exp_/ }
          :expression_engine
        elsif endpoint.cookies['PHPSESSID']
          :php
        elsif endpoint.cookies['CFID'] && endpoint.cookies['CFTOKEN']
          :coldfusion
        elsif endpoint.headers.server.to_s.casecmp('cowboy').zero?
          :cowboy
        end
      end

      # @return [Boolean] whether the detected framework is in
      #   OPEN_SOURCE_FRAMEWORKS
      def open_source?
        OPEN_SOURCE_FRAMEWORKS.include?(framework)
      end

      def analytics
        sniff(:analytics)
      end

      def javascript
        sniff(:javascript)
      end

      def advertising
        sniff(:advertising)
      end

      def to_h
        {
          framework: framework,
          analytics: analytics,
          javascript: javascript,
          advertising: advertising
        }
      end

      private

      # Runs Sniffles against the endpoint body for the given category and
      # returns the name of the first entry flagged as found. Any
      # StandardError raised along the way yields nil.
      def sniff(type)
        require 'sniffles' # loaded lazily, only when sniffing is requested
        matches = Sniffles.sniff(endpoint.content.body, type).select do |_name, meta|
          meta[:found]
        end
        matches.keys.first
      rescue StandardError
        nil
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
class SiteInspector
  class Endpoint
    # Check that asks the Wappalyzer lookup API which technologies an
    # endpoint uses. It is a no-op (empty hash) unless the
    # WAPPALYZER_API_KEY environment variable is set.
    class Wappalyzer < Check
      ENDPOINT = 'https://api.wappalyzer.com/lookup/v2/'

      # Groups the reported technology names by the name of their first
      # category; technologies without a category are filed under "Other".
      #
      # @return [Hash{String => Array<String>}] category name => technology
      #   names, or an empty hash when no data is available
      def to_h
        return {} unless data['technologies']

        @to_h ||= data['technologies'].each_with_object({}) do |technology, grouped|
          # Array() tolerates a missing (nil) categories list.
          category = Array(technology['categories']).first
          label = category ? category['name'] : 'Other'
          (grouped[label] ||= []) << technology['name']
        end
      end

      private

      # Typhoeus GET request against the lookup API, carrying the API key in
      # the x-api-key header on top of the library-wide default options.
      def request
        @request ||= begin
          options = SiteInspector.typhoeus_defaults
          headers = options[:headers].merge({ "x-api-key": api_key })
          options = options.merge(method: :get, headers: headers)
          Typhoeus::Request.new(url, options)
        end
      end

      # Performs the lookup (once) and returns the first result object.
      # Always returns a Hash: an unsuccessful response, an empty result
      # list, a non-object payload, or an unparsable body all yield {} so
      # that callers can safely index into the result.
      def data
        return {} unless api_key && api_key != ''

        @data ||= begin
          SiteInspector.hydra.queue(request)
          SiteInspector.hydra.run

          response = request.response
          result = JSON.parse(response.body).first if response.success?
          result.is_a?(Hash) ? result : {}
        rescue JSON::ParserError
          {}
        end
      end

      # Lookup URL with the endpoint's URI passed as the `urls` query value.
      def url
        url = Addressable::URI.parse(ENDPOINT)
        url.query_values = { urls: endpoint.uri }
        url
      end

      def api_key
        @api_key ||= ENV['WAPPALYZER_API_KEY']
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
class SiteInspector
  class Endpoint
    # Check that performs WHOIS lookups for the endpoint's host and for the
    # IP address that host resolves to.
    #
    # NOTE(review): `host` is not defined in this class; presumably it is
    # provided by the `Check` superclass — confirm.
    class Whois < Check
      # WHOIS record for the host (memoized).
      def domain
        @domain ||= whois.lookup(host)
      end

      # WHOIS record for the host's resolved IP address (memoized).
      def ip
        @ip ||= whois.lookup(ip_address)
      end

      # @return [Hash] the domain and IP records, each flattened into a hash
      #   of field name => value
      def to_h
        { domain: record_to_h(domain), ip: record_to_h(ip) }
      end

      private

      # Extracts "key: value" lines from the record's text content. When a
      # key appears more than once, the last occurrence wins.
      def record_to_h(record)
        fields = record.content.scan(/^\s*(.*?):\s*(.*?)\r?\n/)
        fields.to_h
      end

      # Address the host resolves to, via Resolv (memoized).
      def ip_address
        @ip_address ||= Resolv.getaddress(host)
      end

      # Shared WHOIS client; the leading :: avoids resolving to this class.
      def whois
        @whois ||= ::Whois::Client.new
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
class SiteInspector
  # Response cache that stores one Marshal-serialized file per request in a
  # directory, plus an in-memory hash of the responses written through this
  # instance.
  class DiskCache
    # @param dir [String, nil] cache directory; falls back to ENV['CACHE']
    # @param replace [Object, nil] when truthy, existing cache files are
    #   deleted on read instead of being returned; falls back to
    #   ENV['CACHE_REPLACE']
    def initialize(dir = nil, replace = nil)
      @dir = dir || ENV['CACHE']
      @replace = replace || ENV['CACHE_REPLACE']
      @memory = {}
    end

    # Fetches the cached response for a request.
    #
    # @param request [#cache_key] the request to look up
    # @return the cached response, or nil when there is no cache file, when
    #   replace mode is on, or when the file could not be deserialized (in the
    #   last two cases the file is deleted)
    def get(request)
      file = path(request)
      return unless File.exist?(file)
      return @memory[request] if @memory[request]

      if @replace
        FileUtils.rm(file)
        return nil
      end

      begin
        # Marshal data is binary, so read it in binary mode.
        # NOTE(review): Marshal.load can instantiate arbitrary objects; the
        # cache directory must only contain files written by this class.
        Marshal.load(File.binread(file))
      rescue ArgumentError, TypeError
        # Truncated data raises ArgumentError; a file that is not Marshal
        # data at all raises TypeError. Either way, discard the bad entry.
        FileUtils.rm(file)
        nil
      end
    end

    # Stores a response on disk and in memory.
    #
    # @param request [#cache_key] the request the response belongs to
    # @param response [Object] any Marshal-serializable response
    # @return the response that was stored
    def set(request, response)
      File.binwrite(path(request), Marshal.dump(response))
      @memory[request] = response
    end

    private

    # The `request` is a Typhoeus::Request, which provides a
    # unique `cache_key` string for exactly this sort of thing.
    def path(request)
      File.join(@dir, request.cache_key)
    end
  end
end
@@ -0,0 +1,271 @@
1
+ # frozen_string_literal: true
2
+
3
class SiteInspector
  # A domain under inspection, modelled as its four root endpoints
  # (https/http, each with and without the "www." prefix) plus domain-wide
  # conclusions drawn from how those endpoints behave.
  class Domain
    attr_reader :host

    # @param host [String] a hostname or URL; the scheme, leading slashes
    #   and a leading "www." are stripped and the result is lowercased
    def initialize(host)
      host = host.downcase
      host = host.sub(/^https?:/, '')
      host = host.sub(%r{^/+}, '')
      host = host.sub(/^www\./, '')
      uri = Addressable::URI.parse "//#{host}"
      @host = uri.host
    end

    # The four endpoints, in a fixed order that `to_h` relies on:
    # https root, https www, http root, http www.
    def endpoints
      @endpoints ||= [
        Endpoint.new("https://#{host}", domain: self),
        Endpoint.new("https://www.#{host}", domain: self),
        Endpoint.new("http://#{host}", domain: self),
        Endpoint.new("http://www.#{host}", domain: self)
      ]
    end

    # The endpoint whose scheme and www-ness match the domain's canonical
    # scheme and www-ness (memoized).
    def canonical_endpoint
      @canonical_endpoint ||= begin
        prefetch
        # Work these out once rather than once per endpoint inside `find`.
        wants_https = canonically_https?
        wants_www = canonically_www?
        endpoints.find { |e| e.https? == wants_https && e.www? == wants_www }
      end
    end

    def government?
      require 'gman' # loaded lazily, only when this check is requested
      Gman.valid? host
    end

    # Does *any* endpoint return a 200 or 300 response code?
    def up?
      endpoints.any?(&:up?)
    end

    # Does *any* endpoint respond to HTTP?
    # TODO: needs to allow an invalid chain.
    def responds?
      endpoints.any?(&:responds?)
    end

    # TODO: These weren't present before, and may not be useful.
    # Can you connect to www?
    def www?
      endpoints.any? { |e| e.www? && e.up? }
    end

    # Can you connect without www?
    def root?
      endpoints.any? { |e| e.root? && e.up? }
    end

    # HTTPS is "supported" (different than "canonical" or "enforced") if:
    #
    # * Either of the HTTPS endpoints is listening, and doesn't have
    #   an invalid hostname.
    #
    # TODO: needs to allow an invalid chain.
    def https?
      endpoints.any? { |e| e.https? && e.up? && e.https.valid? }
    end

    # HTTPS is enforced if one of the HTTPS endpoints is "up",
    # and if both *HTTP* endpoints are either:
    #
    # * down, or
    # * redirect immediately to HTTPS.
    #
    # This is different than whether a domain is "canonically" HTTPS.
    #
    # * an HTTP redirect can go to HTTPS on another domain, as long
    #   as it's immediate.
    # * a domain with an invalid cert can still be enforcing HTTPS.
    #
    # TODO: need to ensure the redirect *immediately* goes to HTTPS.
    # TODO: don't need to require that the HTTPS cert is valid for this purpose.
    def enforces_https?
      return false unless https?

      endpoints.select(&:http?).all? { |e| !e.up? || e.redirect&.https? }
    end

    # we can say that a canonical HTTPS site "defaults" to HTTPS,
    # even if it doesn't *strictly* enforce it (e.g. having a www
    # subdomain first to go HTTP root before HTTPS root).
    #
    # TODO: not implemented.
    def defaults_https?
      raise 'Not implemented. Halp?'
    end

    # HTTPS is "downgraded" if both:
    #
    # * HTTPS is supported, and
    # * The 'canonical' endpoint gets an immediate internal redirect to HTTP.
    #
    # TODO: the redirect must be internal.
    def downgrades_https?
      return false unless https?

      canonical_endpoint.redirect? && canonical_endpoint.redirect.http?
    end

    # A domain is "canonically" at www if:
    #  * at least one of its www endpoints responds
    #  * both root endpoints are either down ~~or redirect *somewhere*~~, or
    #  * at least one root endpoint redirect should immediately go to
    #    an *internal* www endpoint
    # This is meant to affirm situations like:
    #   http:// -> https:// -> https://www
    #   https:// -> http:// -> https://www
    # and meant to avoid affirming situations like:
    #   http:// -> http://non-www,
    #   http://www -> http://non-www
    # or like:
    #   https:// -> 200, http:// -> http://www
    def canonically_www?
      # Does any endpoint respond?
      return false unless up?

      # Does at least one www endpoint respond?
      return false unless www?

      # Are both root endpoints down?
      return true if endpoints.select(&:root?).all? { |e| !e.up? }

      # Does either root endpoint redirect to a www endpoint?
      endpoints.select(&:root?).any? { |e| e.redirect&.www? }
    end

    # A domain is "canonically" at https if:
    #  * at least one of its https endpoints is live and
    #    doesn't have an invalid hostname
    #  * both http endpoints are either down or redirect *somewhere*
    #  * at least one http endpoint redirects immediately to
    #    an *internal* https endpoint
    # This is meant to affirm situations like:
    #   http:// -> http://www -> https://
    #   https:// -> http:// -> https://www
    # and meant to avoid affirming situations like:
    #   http:// -> http://non-www
    #   http://www -> http://non-www
    # or:
    #   http:// -> 200, http://www -> https://www
    #
    # It allows a site to be canonically HTTPS if the cert has
    # a valid hostname but invalid chain issues.
    def canonically_https?
      # Does any endpoint respond?
      return false unless up?

      # At least one of its https endpoints is live and doesn't have an invalid hostname
      return false unless https?

      # Both http endpoints are down
      return true if endpoints.select(&:http?).all? { |e| !e.up? }

      # at least one http endpoint redirects immediately to https
      endpoints.select(&:http?).any? { |e| e.redirect&.https? }
    end

    # A domain redirects if
    # 1. At least one endpoint is an external redirect, and
    # 2. All endpoints are either down or an external redirect
    def redirect?
      return false unless redirect

      endpoints.all? { |e| !e.up? || e.external_redirect? }
    end

    # The first endpoint to respond with a redirect
    def redirect
      endpoints.find(&:external_redirect?)
    end

    # HSTS on the canonical domain?
    def hsts?
      canonical_endpoint.hsts&.enabled?
    end

    # Does the https root endpoint's HSTS policy include subdomains?
    def hsts_subdomains?
      endpoints.find { |e| e.root? && e.https? }.hsts.include_subdomains?
    end

    # Is the https root endpoint's HSTS policy ready for preloading?
    def hsts_preload_ready?
      return false unless hsts_subdomains?

      endpoints.find { |e| e.root? && e.https? }.hsts.preload_ready?
    end

    def to_s
      host
    end

    def inspect
      "#<SiteInspector::Domain host=\"#{host}\">"
    end

    # We know most API calls to the domain model are going to require
    # that the roots of all four endpoints are called. Rather than process
    # them in serial, let's grab them in parallel and cache the results to
    # speed up later calls.
    def prefetch
      endpoints.each do |endpoint|
        request = Typhoeus::Request.new(endpoint.uri, SiteInspector.typhoeus_defaults)
        SiteInspector.hydra.queue(request)
      end
      SiteInspector.hydra.run
    end

    # Converts the domain to a hash
    #
    # By default, it only returns domain-wide information and
    # information about the canonical endpoint
    #
    # It will also pass options along to each endpoint's to_h method
    #
    # options:
    #   :all - return information about all endpoints (the key may be given
    #          as the symbol :all or the string 'all')
    #
    # Returns a complete hash of the domain's information
    def to_h(options = {})
      prefetch

      hash = {
        host: host,
        up: up?,
        responds: responds?,
        www: www?,
        root: root?,
        https: https?,
        enforces_https: enforces_https?,
        downgrades_https: downgrades_https?,
        canonically_www: canonically_www?,
        canonically_https: canonically_https?,
        redirect: redirect?,
        hsts: hsts?,
        hsts_subdomains: hsts_subdomains?,
        hsts_preload_ready: hsts_preload_ready?,
        canonical_endpoint: canonical_endpoint.to_h(options)
      }

      # Accept both key styles: the option is documented as :all, while
      # existing callers pass the string 'all'.
      if options['all'] || options[:all]
        https_root, https_www, http_root, http_www = endpoints
        hash[:endpoints] = {
          https: {
            root: https_root.to_h(options),
            www: https_www.to_h(options)
          },
          http: {
            root: http_root.to_h(options),
            www: http_www.to_h(options)
          }
        }
      end

      hash
    end

    def to_json(*_args)
      to_h.to_json
    end
  end
end