site-inspector 1.0.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +7 -0
  3. data/.ruby-version +1 -0
  4. data/.travis.yml +6 -0
  5. data/Gemfile +3 -0
  6. data/Guardfile +8 -0
  7. data/README.md +175 -0
  8. data/Rakefile +8 -0
  9. data/bin/site-inspector +48 -21
  10. data/lib/site-inspector.rb +38 -613
  11. data/lib/site-inspector/cache.rb +9 -52
  12. data/lib/site-inspector/checks/check.rb +41 -0
  13. data/lib/site-inspector/checks/content.rb +67 -0
  14. data/lib/site-inspector/checks/dns.rb +129 -0
  15. data/lib/site-inspector/checks/headers.rb +83 -0
  16. data/lib/site-inspector/checks/hsts.rb +78 -0
  17. data/lib/site-inspector/checks/https.rb +40 -0
  18. data/lib/site-inspector/checks/sniffer.rb +42 -0
  19. data/lib/site-inspector/disk_cache.rb +38 -0
  20. data/lib/site-inspector/domain.rb +248 -0
  21. data/lib/site-inspector/endpoint.rb +200 -0
  22. data/lib/site-inspector/rails_cache.rb +11 -0
  23. data/lib/site-inspector/version.rb +3 -0
  24. data/script/bootstrap +1 -0
  25. data/script/cibuild +7 -0
  26. data/script/console +1 -0
  27. data/script/release +38 -0
  28. data/site-inspector.gemspec +33 -0
  29. data/spec/checks/site_inspector_endpoint_check_spec.rb +34 -0
  30. data/spec/checks/site_inspector_endpoint_content_spec.rb +89 -0
  31. data/spec/checks/site_inspector_endpoint_dns_spec.rb +167 -0
  32. data/spec/checks/site_inspector_endpoint_headers_spec.rb +74 -0
  33. data/spec/checks/site_inspector_endpoint_hsts_spec.rb +91 -0
  34. data/spec/checks/site_inspector_endpoint_https_spec.rb +48 -0
  35. data/spec/checks/site_inspector_endpoint_sniffer_spec.rb +52 -0
  36. data/spec/site_inspector_cache_spec.rb +13 -0
  37. data/spec/site_inspector_disc_cache_spec.rb +31 -0
  38. data/spec/site_inspector_domain_spec.rb +252 -0
  39. data/spec/site_inspector_endpoint_spec.rb +224 -0
  40. data/spec/site_inspector_spec.rb +46 -0
  41. data/spec/spec_helper.rb +17 -0
  42. metadata +75 -57
  43. data/lib/site-inspector/compliance.rb +0 -19
  44. data/lib/site-inspector/dns.rb +0 -92
  45. data/lib/site-inspector/headers.rb +0 -59
  46. data/lib/site-inspector/sniffer.rb +0 -26
@@ -0,0 +1,40 @@
1
class SiteInspector
  class Endpoint
    # Check describing the HTTPS status of an endpoint, based on the scheme of
    # the request URL and the `return_code` reported on the response.
    #
    # NOTE(review): `request` and `response` are not defined here; they are
    # presumably supplied by the Check superclass -- confirm against check.rb.
    class Https < Check

      # Is the endpoint being requested over the https scheme?
      def scheme?
        scheme == "https"
      end

      # Truthy when the endpoint is https and the response's return code is :ok.
      def valid?
        scheme? && response && response.return_code == :ok
      end

      # Truthy when the endpoint is https and the return code is :ssl_cacert
      # (presumably a certificate chain problem -- confirm against the HTTP
      # client's return codes).
      def bad_chain?
        scheme? && response && response.return_code == :ssl_cacert
      end

      # Truthy when the endpoint is https and the return code is
      # :peer_failed_verification (presumably a hostname mismatch -- confirm).
      def bad_name?
        scheme? && response && response.return_code == :peer_failed_verification
      end

      def inspect
        "#<SiteInspector::Endpoint::Https valid=#{valid?}>"
      end

      # Returns a hash with the validity flag and the raw return code.
      #
      # The return code is nil when there is no response, matching the nil
      # guards used by the predicates above instead of raising NoMethodError.
      def to_h
        {
          valid: valid?,
          return_code: (response.return_code if response),
        }
      end

      private

      # Scheme of the request's base URL, memoized.
      def scheme
        @scheme ||= request.base_url.scheme
      end

    end
  end
end
@@ -0,0 +1,42 @@
1
class SiteInspector
  class Endpoint
    # Check that runs the endpoint's body through Sniffles for each of four
    # categories and reports only the entries flagged as found.
    class Sniffer < Check

      def cms
        sniff(:cms)
      end

      def analytics
        sniff(:analytics)
      end

      def javascript
        sniff(:javascript)
      end

      def advertising
        sniff(:advertising)
      end

      # Returns a hash with one entry per sniffed category.
      def to_h
        {
          cms: cms,
          analytics: analytics,
          javascript: javascript,
          advertising: advertising
        }
      end

      private

      # Sniffs the endpoint body for the given category.
      #
      # Keeps only entries whose metadata has :found == true, then strips the
      # :found key from each kept entry. Any StandardError (including a failed
      # require) yields nil -- this is a best-effort lookup.
      def sniff(type)
        require 'sniffles' # loaded lazily, only when a sniff is requested
        body = endpoint.content.body
        found = Sniffles.sniff(body, type).select do |_name, meta|
          meta[:found] == true
        end
        found.each { |_name, meta| meta.delete(:found) } if found
        found
      rescue
        nil
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
class SiteInspector
  # File-backed cache: each entry is stored as a Marshal dump in a file named
  # after the request key inside the cache directory. Values written through
  # this instance are also kept in an in-memory hash.
  class DiskCache
    # dir     - cache directory; falls back to ENV['CACHE']
    # replace - when truthy, existing files are deleted on read instead of
    #           being returned; falls back to ENV['CACHE_REPLACE']
    def initialize(dir = nil, replace = nil)
      @dir     = dir || ENV['CACHE']
      @replace = replace || ENV['CACHE_REPLACE']
      @memory  = {}
    end

    # Looks up the cached value for request.
    #
    # Returns nil when no cache file exists, when replace mode is on (the file
    # is deleted), or when the file cannot be unmarshalled (the file is
    # deleted so it gets regenerated).
    #
    # SECURITY: Marshal.load can instantiate arbitrary objects. Only point this
    # cache at a directory whose contents you trust.
    def get(request)
      file = path(request)
      return unless File.exist?(file)
      return @memory[request] if @memory[request]

      if @replace
        File.delete(file)
        return nil
      end

      begin
        # Marshal output is binary, so read it as raw bytes.
        Marshal.load(File.binread(file))
      rescue ArgumentError, TypeError
        # ArgumentError: truncated data; TypeError: bad header / wrong format.
        File.delete(file)
        nil
      end
    end

    # Writes response to disk (binary mode) and remembers it in memory.
    #
    # Returns the response.
    def set(request, response)
      File.binwrite(path(request), Marshal.dump(response))
      @memory[request] = response
    end

    private

    # Absolute-or-relative file path for the given request key.
    def path(request)
      File.join(@dir, request)
    end
  end
end
@@ -0,0 +1,248 @@
1
class SiteInspector
  # A domain, evaluated through its four endpoints
  # (https/http crossed with root/www).
  class Domain

    attr_reader :host

    # host - a hostname or URL string. It is lower-cased and a leading
    #        "http:"/"https:", leading slashes and a leading "www." are
    #        stripped before the host part is extracted.
    def initialize(host)
      host = host.downcase
      host = host.sub(/\Ahttps?:/, "")
      host = host.sub(%r{\A/+}, "")
      host = host.sub(/\Awww\./, "")
      uri = Addressable::URI.parse "//#{host}"
      @host = uri.host
    end

    # The four endpoints, always in this order:
    # https root, https www, http root, http www.
    def endpoints
      @endpoints ||= [
        Endpoint.new("https://#{host}"),
        Endpoint.new("https://www.#{host}"),
        Endpoint.new("http://#{host}"),
        Endpoint.new("http://www.#{host}")
      ]
    end

    # The endpoint whose scheme and www-ness match the canonical answers.
    def canonical_endpoint
      @canonical_endpoint ||= endpoints.find do |e|
        e.https? == canonically_https? && e.www? == canonically_www?
      end
    end

    def government?
      require 'gman' # loaded lazily, only when this question is asked
      Gman.valid? host
    end

    # Is *any* endpoint up?
    def up?
      endpoints.any? { |e| e.up? }
    end

    # Is any www endpoint up?
    def www?
      endpoints.any? { |e| e.www? && e.up? }
    end

    # Can you connect without www?
    def root?
      endpoints.any? { |e| e.root? && e.up? }
    end

    # HTTPS is "supported" (different than "canonical" or "enforced") if
    # either of the HTTPS endpoints is up and its https check is valid.
    def https?
      endpoints.any? { |e| e.https? && e.up? && e.https.valid? }
    end

    # HTTPS is enforced if HTTPS is supported and both *HTTP* endpoints
    # are either:
    #
    # * down, or
    # * redirect immediately to HTTPS.
    #
    # This is different than whether a domain is "canonically" HTTPS:
    # an HTTP redirect can go to HTTPS on another domain, as long
    # as it's immediate.
    def enforces_https?
      return false unless https?
      http_endpoints.all? { |e| e.down? || (e.redirect && e.redirect.https?) }
    end

    # we can say that a canonical HTTPS site "defaults" to HTTPS,
    # even if it doesn't *strictly* enforce it (e.g. having a www
    # subdomain first to go HTTP root before HTTPS root).
    #
    # Not implemented yet; always raises.
    def defaults_https?
      raise "Not implemented. Halp?"
    end

    # HTTPS is "downgraded" if both:
    #
    # * HTTPS is supported, and
    # * The 'canonical' endpoint gets an immediate redirect to HTTP.
    def downgrades_https?
      return false unless https?
      canonical_endpoint.redirect && canonical_endpoint.redirect.http?
    end

    # A domain is "canonically" at www if:
    # * at least one of its www endpoints responds, and
    # * both root endpoints are down, or
    # * at least one root endpoint redirects immediately to a www endpoint
    #
    # This is meant to affirm situations like:
    #   http:// -> https:// -> https://www
    #   https:// -> http:// -> https://www
    # and meant to avoid affirming situations like:
    #   http:// -> http://non-www,
    #   http://www -> http://non-www
    # or like:
    #   https:// -> 200, http:// -> http://www
    def canonically_www?
      # Does any endpoint respond?
      return false unless up?

      # Does at least one www endpoint respond?
      return false unless www?

      # Are both root endpoints down?
      return true if root_endpoints.all? { |e| e.down? }

      # Does either root endpoint redirect to a www endpoint?
      root_endpoints.any? { |e| e.redirect && e.redirect.www? }
    end

    # A domain is "canonically" at https if:
    # * HTTPS is supported (see #https?), and
    # * both http endpoints are down, or
    # * at least one http endpoint redirects immediately to https
    #
    # This is meant to affirm situations like:
    #   http:// -> http://www -> https://
    #   https:// -> http:// -> https://www
    # and meant to avoid affirming situations like:
    #   http:// -> http://non-www
    #   http://www -> http://non-www
    # or:
    #   http:// -> 200, http://www -> https://www
    def canonically_https?
      # Does any endpoint respond?
      return false unless up?

      # Is HTTPS supported at all?
      return false unless https?

      # Both http endpoints are down
      return true if http_endpoints.all? { |e| e.down? }

      # at least one http endpoint redirects immediately to https
      http_endpoints.any? { |e| e.redirect && e.redirect.https? }
    end

    # A domain redirects if
    # 1. At least one endpoint is an external redirect, and
    # 2. All endpoints are either down or an external redirect
    def redirect?
      return false unless redirect
      endpoints.all? { |e| e.down? || e.external_redirect? }
    end

    # The first endpoint that is an external redirect (nil if none)
    def redirect
      endpoints.find { |e| e.external_redirect? }
    end

    # HSTS on the canonical endpoint?
    def hsts?
      canonical_endpoint.hsts && canonical_endpoint.hsts.enabled?
    end

    # Does the https root endpoint's HSTS policy include subdomains?
    def hsts_subdomains?
      https_root_endpoint.hsts.include_subdomains?
    end

    def hsts_preload_ready?
      return false unless hsts_subdomains?
      https_root_endpoint.hsts.preload_ready?
    end

    def to_s
      host
    end

    def inspect
      "#<SiteInspector::Domain host=\"#{host}\">"
    end

    # We know most API calls to the domain model are going to require
    # that the root of all four endpoints are called. Rather than process
    # them in serial, let's queue them all on the shared hydra and run them
    # together (the intent being that later calls hit cached responses).
    def prefetch
      endpoints.each do |endpoint|
        request = Typhoeus::Request.new(endpoint.uri, SiteInspector.typhoeus_defaults)
        SiteInspector.hydra.queue(request)
      end
      SiteInspector.hydra.run
    end

    # Converts the domain to a hash
    #
    # By default, it only returns domain-wide information and
    # information about the canonical endpoint
    #
    # It will also pass options along to each endpoint's to_h method
    #
    # options:
    #   :all - return information about all endpoints
    #          (the string key "all" is accepted as well)
    #
    # Returns a complete hash of the domain's information
    def to_h(options={})
      prefetch

      hash = {
        host: host,
        up: up?,
        www: www?,
        root: root?,
        https: https?,
        enforces_https: enforces_https?,
        downgrades_https: downgrades_https?,
        canonically_www: canonically_www?,
        canonically_https: canonically_https?,
        redirect: redirect?,
        hsts: hsts?,
        hsts_subdomains: hsts_subdomains?,
        hsts_preload_ready: hsts_preload_ready?,
        # NOTE(review): key is misspelled ("canoncial"); kept as-is because
        # consumers of this hash may already depend on it.
        canoncial_endpoint: canonical_endpoint.to_h(options)
      }

      if options[:all] || options["all"]
        # Relies on the fixed ordering documented on #endpoints.
        https_root, https_www, http_root, http_www = endpoints
        hash[:endpoints] = {
          https: {
            root: https_root.to_h(options),
            www: https_www.to_h(options)
          },
          http: {
            root: http_root.to_h(options),
            www: http_www.to_h(options)
          }
        }
      end

      hash
    end

    # Serializes #to_h as JSON.
    #
    # Accepts (and forwards) the optional generator arguments that the JSON
    # library passes when this object is nested inside another structure,
    # e.g. JSON.generate([domain]).
    def to_json(*args)
      to_h.to_json(*args)
    end

    private

    # The two plain-http endpoints.
    def http_endpoints
      endpoints.select { |e| e.http? }
    end

    # The two non-www endpoints.
    def root_endpoints
      endpoints.select { |e| e.root? }
    end

    # The https endpoint without www.
    def https_root_endpoint
      endpoints.find { |e| e.root? && e.https? }
    end
  end
end
@@ -0,0 +1,200 @@
1
class SiteInspector
  # Every domain has four possible "endpoints" to evaluate
  #
  # For example, if you had `example.com` you'd have:
  #   1. `http://example.com`
  #   2. `http://www.example.com`
  #   3. `https://example.com`
  #   4. `https://www.example.com`
  #
  # Because each of the four endpoints could potentially respond differently
  # we must evaluate all four to make certain determinations
  class Endpoint
    attr_accessor :host, :uri

    # Initiate a new Endpoint object
    #
    # host - (string) the endpoint to query (e.g., `https://example.com`)
    #
    # Note that #host is stored *without* a leading "www."; #uri keeps it.
    def initialize(host)
      @uri = Addressable::URI.parse(host.downcase)
      @host = uri.host.sub(/\Awww\./, "")
      @checks = {}
    end

    def www?
      !!(uri.host =~ /\Awww\./)
    end

    def root?
      !www?
    end

    # Delegates to the Https check (resolved through method_missing).
    def https?
      https.scheme?
    end

    def http?
      !https?
    end

    def scheme
      @uri.scheme
    end

    # Performs a request against this endpoint through the shared hydra.
    #
    # options - Typhoeus request options, merged over the SiteInspector
    #           defaults. A :path option, if given, is joined onto the
    #           endpoint URI and is not forwarded to Typhoeus.
    #
    # The caller's options hash is left untouched.
    #
    # Returns the Typhoeus response
    def request(options = {})
      options = options.dup
      path = options.delete(:path)
      target = path ? URI.join(uri, path) : uri
      req = Typhoeus::Request.new(target, SiteInspector.typhoeus_defaults.merge(options))
      hydra.queue(req)
      hydra.run
      req.response
    end

    # Makes a request of the endpoint with the default options
    #
    # Returns the (memoized) Typhoeus::Response object
    def response
      @response ||= request
    end

    # Does the server return any response? (including 50x)
    def response?
      response.code != 0 && !timed_out?
    end

    # The response code as a string, or nil when there is no response
    def response_code
      response.response_code.to_s if response
    end

    def timed_out?
      response && response.timed_out?
    end

    # Does the endpoint return a 2xx or 3xx response code?
    #
    # Always returns true or false; a missing response counts as down.
    def up?
      code = response_code
      !code.nil? && code.start_with?("2", "3")
    end

    def down?
      !up?
    end

    # If the endpoint is a redirect, what's the first endpoint we're
    # redirected to?
    #
    # Returns nil when the response is not a 3xx, when it carries no location
    # header (e.g. a 304), or when the redirect stays on the same host and
    # scheme.
    def redirect
      return unless response && response_code.start_with?("3")

      @redirect ||= begin
        target = Addressable::URI.parse(headers["location"])

        # 3xx without a location header: nothing to follow
        return if target.nil?

        # This is a relative redirect, but we still need the absolute URI
        if target.relative?
          target.path = "/#{target.path}" unless target.path[0] == "/"
          target.host = host
          target.scheme = scheme
        end

        # This was a redirect to a subpath or back to itself, which we don't care about
        return if target.host == host && target.scheme == scheme

        # Init a new endpoint representing the redirect
        Endpoint.new(target.to_s)
      end
    end

    # Does this endpoint return a redirect?
    def redirect?
      !!redirect
    end

    # What's the effective URL of a request to this endpoint?
    #
    # Returns self when there is no redirect, otherwise an Endpoint for the
    # URL reached after following redirects.
    def resolves_to
      return self unless redirect?
      @resolves_to ||= begin
        followed = request(:followlocation => true)

        # Workaround for Webmock not playing nicely with Typhoeus redirects
        url = if followed.mock?
          followed.headers["Location"] || followed.request.url
        else
          followed.effective_url
        end

        Endpoint.new(url)
      end
    end

    # Does the endpoint ultimately resolve to a different host
    # (ignoring a leading "www.")?
    def external_redirect?
      host != resolves_to.host
    end

    def to_s
      uri.to_s
    end

    def inspect
      "#<SiteInspector::Endpoint uri=\"#{uri.to_s}\">"
    end

    # Returns information about the endpoint
    #
    # By default, all checks are run. If one or more check names are passed
    # as keys in the options hash, only those checks will be run.
    #
    # options:
    #   a hash keyed by check name. NOTE(review): only key presence is
    #   consulted, so `{ dns: false }` still runs the dns check -- confirm
    #   whether that is intended.
    #
    # Returns the hash representing the endpoint and its checks
    def to_h(options={})
      hash = {
        uri: uri.to_s,
        host: host,
        www: www?,
        https: https?,
        scheme: scheme,
        up: up?,
        timed_out: timed_out?,
        redirect: redirect?,
        external_redirect: external_redirect?,
      }

      # Either they've specifically asked for a check, or we throw everything at them
      checks = SiteInspector::Endpoint.checks.select { |c| options.key?(c.name) }
      checks = SiteInspector::Endpoint.checks if checks.empty?

      checks.each do |check|
        hash[check.name] = self.send(check.name).to_h
      end

      hash
    end

    # Every loaded class that descends from Check
    def self.checks
      ObjectSpace.each_object(Class).select { |klass| klass < Check }
    end

    # Exposes each check as a method named after the check (e.g. `https`),
    # instantiating it on first use and caching the instance.
    def method_missing(method_sym, *arguments, &block)
      if check = SiteInspector::Endpoint.checks.find { |c| c.name == method_sym }
        @checks[method_sym] ||= check.new(self)
      else
        super
      end
    end

    # Keeps respond_to? (and Object#method) in agreement with method_missing.
    def respond_to_missing?(method_sym, include_private = false)
      SiteInspector::Endpoint.checks.any? { |c| c.name == method_sym } || super
    end

    private

    def hydra
      SiteInspector.hydra
    end
  end
end