site-inspector 1.0.2 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +7 -0
  3. data/.ruby-version +1 -0
  4. data/.travis.yml +6 -0
  5. data/Gemfile +3 -0
  6. data/Guardfile +8 -0
  7. data/README.md +175 -0
  8. data/Rakefile +8 -0
  9. data/bin/site-inspector +48 -21
  10. data/lib/site-inspector.rb +38 -613
  11. data/lib/site-inspector/cache.rb +9 -52
  12. data/lib/site-inspector/checks/check.rb +41 -0
  13. data/lib/site-inspector/checks/content.rb +67 -0
  14. data/lib/site-inspector/checks/dns.rb +129 -0
  15. data/lib/site-inspector/checks/headers.rb +83 -0
  16. data/lib/site-inspector/checks/hsts.rb +78 -0
  17. data/lib/site-inspector/checks/https.rb +40 -0
  18. data/lib/site-inspector/checks/sniffer.rb +42 -0
  19. data/lib/site-inspector/disk_cache.rb +38 -0
  20. data/lib/site-inspector/domain.rb +248 -0
  21. data/lib/site-inspector/endpoint.rb +200 -0
  22. data/lib/site-inspector/rails_cache.rb +11 -0
  23. data/lib/site-inspector/version.rb +3 -0
  24. data/script/bootstrap +1 -0
  25. data/script/cibuild +7 -0
  26. data/script/console +1 -0
  27. data/script/release +38 -0
  28. data/site-inspector.gemspec +33 -0
  29. data/spec/checks/site_inspector_endpoint_check_spec.rb +34 -0
  30. data/spec/checks/site_inspector_endpoint_content_spec.rb +89 -0
  31. data/spec/checks/site_inspector_endpoint_dns_spec.rb +167 -0
  32. data/spec/checks/site_inspector_endpoint_headers_spec.rb +74 -0
  33. data/spec/checks/site_inspector_endpoint_hsts_spec.rb +91 -0
  34. data/spec/checks/site_inspector_endpoint_https_spec.rb +48 -0
  35. data/spec/checks/site_inspector_endpoint_sniffer_spec.rb +52 -0
  36. data/spec/site_inspector_cache_spec.rb +13 -0
  37. data/spec/site_inspector_disc_cache_spec.rb +31 -0
  38. data/spec/site_inspector_domain_spec.rb +252 -0
  39. data/spec/site_inspector_endpoint_spec.rb +224 -0
  40. data/spec/site_inspector_spec.rb +46 -0
  41. data/spec/spec_helper.rb +17 -0
  42. metadata +75 -57
  43. data/lib/site-inspector/compliance.rb +0 -19
  44. data/lib/site-inspector/dns.rb +0 -92
  45. data/lib/site-inspector/headers.rb +0 -59
  46. data/lib/site-inspector/sniffer.rb +0 -26
@@ -0,0 +1,40 @@
1
class SiteInspector
  class Endpoint
    # Check describing the HTTPS status of an endpoint, derived from the
    # request scheme and the return code of the endpoint's response.
    #
    # NOTE(review): `request` and `response` are not defined in this file;
    # presumably they are provided by the Check superclass -- confirm.
    class Https < Check

      # Is this endpoint being requested over the "https" scheme?
      def scheme?
        scheme == "https"
      end

      # Truthy when the endpoint is HTTPS and the response's return code is :ok
      def valid?
        scheme? && response && response.return_code == :ok
      end

      # Truthy when the endpoint is HTTPS and the response's return code
      # is :ssl_cacert
      def bad_chain?
        scheme? && response && response.return_code == :ssl_cacert
      end

      # Truthy when the endpoint is HTTPS and the response's return code
      # is :peer_failed_verification
      def bad_name?
        scheme? && response && response.return_code == :peer_failed_verification
      end

      def inspect
        "#<SiteInspector::Endpoint::Https valid=#{valid?}>"
      end

      # Returns a hash with the validity flag and the raw return code.
      #
      # The return code is nil when there is no response, mirroring the
      # `response &&` guard used by the predicates above (previously this
      # raised NoMethodError on a missing response).
      def to_h
        {
          valid: valid?,
          return_code: (response.return_code if response),
        }
      end

      private

      # The scheme of the request's base URL, memoized after the first call
      def scheme
        @scheme ||= request.base_url.scheme
      end

    end
  end
end
@@ -0,0 +1,42 @@
1
class SiteInspector
  class Endpoint
    # Check that runs the endpoint's content body through Sniffles and
    # reports what was detected, one reader per sniff category.
    class Sniffer < Check

      def cms
        sniff(:cms)
      end

      def analytics
        sniff(:analytics)
      end

      def javascript
        sniff(:javascript)
      end

      def advertising
        sniff(:advertising)
      end

      # Returns a hash with one entry per sniff category, in the order
      # cms, analytics, javascript, advertising
      def to_h
        summary = {}
        summary[:cms]         = cms
        summary[:analytics]   = analytics
        summary[:javascript]  = javascript
        summary[:advertising] = advertising
        summary
      end

      private

      # Sniffs the endpoint's content body for the given category.
      #
      # Only entries whose :found flag is exactly true are kept, and the
      # :found flag itself is removed from each kept entry.
      #
      # Returns the filtered results, or nil if a StandardError is raised
      # anywhere along the way.
      def sniff(type)
        # Loaded on first use rather than at file load time
        require 'sniffles'
        detected = Sniffles.sniff(endpoint.content.body, type).select do |_name, meta|
          meta[:found] == true
        end
        if detected
          detected.each { |_name, meta| meta.delete(:found) }
        end
        detected
      rescue
        nil
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
class SiteInspector
  # File-backed cache that stores each response as a Marshal dump in a file
  # named after the request key, plus a per-instance in-memory copy of
  # everything written through #set.
  class DiskCache
    # dir     - directory holding the cache files (defaults to ENV['CACHE'])
    # replace - when truthy, entries found on disk are deleted instead of
    #           returned (defaults to ENV['CACHE_REPLACE'])
    def initialize(dir = nil, replace = nil)
      @dir     = dir || ENV['CACHE']
      @replace = replace || ENV['CACHE_REPLACE']
      @memory  = {}
    end

    # Looks up the cached response for the given request key.
    #
    # Returns nil when no cache file exists, when running in replace mode
    # (the stale file is deleted), or when the file cannot be unmarshalled
    # (the corrupt file is deleted). Entries written via #set on this
    # instance are served from memory, even in replace mode.
    def get(request)
      file = path(request)
      return unless File.exist?(file)
      return @memory[request] if @memory[request]

      if @replace
        File.delete(file)
        return nil
      end

      begin
        # SECURITY: Marshal.load can instantiate arbitrary objects; the cache
        # directory must only ever contain files written by #set.
        Marshal.load(File.binread(file))
      rescue ArgumentError, TypeError
        # ArgumentError: truncated/empty dump or unknown class;
        # TypeError: not a Marshal dump at all / incompatible format version.
        File.delete(file)
        nil
      end
    end

    # Writes the response to disk and remembers it in memory.
    #
    # Returns the response.
    def set(request, response)
      # Marshal output is binary, so write it without newline/encoding translation
      File.binwrite(path(request), Marshal.dump(response))
      @memory[request] = response
    end

    private

    # Path of the cache file for the given request key
    def path(request)
      File.join(@dir, request)
    end
  end
end
@@ -0,0 +1,248 @@
1
class SiteInspector
  # Domain-wide view over the four endpoints (http/https x root/www) of a
  # single host. Most predicates are answered by combining the results of
  # the individual Endpoint objects.
  class Domain

    attr_reader :host

    # host - (string) a bare host or a URL. It is downcased, and a leading
    #        "http:"/"https:" scheme, leading slashes and a leading "www."
    #        are stripped before the host part is extracted.
    def initialize(host)
      host = host.downcase
      host = host.sub(/\Ahttps?\:/, "")
      host = host.sub(/\A\/+/, "")
      host = host.sub(/\Awww\./, "")
      uri = Addressable::URI.parse "//#{host}"
      @host = uri.host
    end

    # The four endpoints, always in this order:
    # https root, https www, http root, http www.
    # (#to_h relies on this order when the :all option is given.)
    def endpoints
      @endpoints ||= [
        Endpoint.new("https://#{host}"),
        Endpoint.new("https://www.#{host}"),
        Endpoint.new("http://#{host}"),
        Endpoint.new("http://www.#{host}")
      ]
    end

    # The first endpoint whose scheme and www-ness match the domain's
    # canonical scheme and canonical www-ness
    def canonical_endpoint
      @canonical_endpoint ||= endpoints.find do |e|
        e.https? == canonically_https? && e.www? == canonically_www?
      end
    end

    def government?
      # Loaded on first use rather than at file load time
      require 'gman'
      Gman.valid? host
    end

    # Is *any* endpoint up?
    def up?
      endpoints.any? { |e| e.up? }
    end

    # Is any www endpoint up?
    def www?
      endpoints.any? { |e| e.www? && e.up? }
    end

    # Can you connect without www?
    def root?
      endpoints.any? { |e| e.root? && e.up? }
    end

    # HTTPS is "supported" (different than "canonical" or "enforced") if:
    #
    # * Either of the HTTPS endpoints is listening, and doesn't have
    #   an invalid hostname.
    def https?
      endpoints.any? { |e| e.https? && e.up? && e.https.valid? }
    end

    # HTTPS is enforced if one of the HTTPS endpoints is "live",
    # and if both *HTTP* endpoints are either:
    #
    #  * down, or
    #  * redirect immediately to HTTPS.
    #
    # This is different than whether a domain is "canonically" HTTPS.
    #
    # * an HTTP redirect can go to HTTPS on another domain, as long
    #   as it's immediate.
    # * a domain with an invalid cert can still be enforcing HTTPS.
    def enforces_https?
      return false unless https?
      endpoints.select { |e| e.http? }.all? { |e| e.down? || (e.redirect && e.redirect.https?) }
    end

    # we can say that a canonical HTTPS site "defaults" to HTTPS,
    # even if it doesn't *strictly* enforce it (e.g. having a www
    # subdomain first to go HTTP root before HTTPS root).
    #
    # Not implemented yet: always raises a RuntimeError.
    def defaults_https?
      raise "Not implemented. Halp?"
    end

    # HTTPS is "downgraded" if both:
    #
    # * HTTPS is supported, and
    # * The 'canonical' endpoint gets an immediate internal redirect to HTTP.
    def downgrades_https?
      return false unless https?
      canonical_endpoint.redirect && canonical_endpoint.redirect.http?
    end

    # A domain is "canonically" at www if:
    #  * at least one of its www endpoints responds
    #  * both root endpoints are either down ~~or redirect *somewhere*~~, or
    #  * at least one root endpoint redirect should immediately go to
    #    an *internal* www endpoint
    # This is meant to affirm situations like:
    #   http:// -> https:// -> https://www
    #   https:// -> http:// -> https://www
    # and meant to avoid affirming situations like:
    #   http:// -> http://non-www,
    #   http://www -> http://non-www
    # or like:
    #   https:// -> 200, http:// -> http://www
    def canonically_www?
      # Does any endpoint respond?
      return false unless up?

      # Does at least one www endpoint respond?
      return false unless www?

      # Are both root endpoints down?
      return true if endpoints.select { |e| e.root? }.all? { |e| e.down? }

      # Does either root endpoint redirect to a www endpoint?
      endpoints.select { |e| e.root? }.any? { |e| e.redirect && e.redirect.www? }
    end

    # A domain is "canonically" at https if:
    #  * at least one of its https endpoints is live and
    #    doesn't have an invalid hostname
    #  * both http endpoints are either down or redirect *somewhere*
    #  * at least one http endpoint redirects immediately to
    #    an *internal* https endpoint
    # This is meant to affirm situations like:
    #   http:// -> http://www -> https://
    #   https:// -> http:// -> https://www
    # and meant to avoid affirming situations like:
    #   http:// -> http://non-www
    #   http://www -> http://non-www
    # or:
    #   http:// -> 200, http://www -> https://www
    #
    # It allows a site to be canonically HTTPS if the cert has
    # a valid hostname but invalid chain issues.
    def canonically_https?
      # Does any endpoint respond?
      return false unless up?

      # At least one of its https endpoints is live and doesn't have an invalid hostname
      return false unless https?

      # Both http endpoints are down
      return true if endpoints.select { |e| e.http? }.all? { |e| e.down? }

      # at least one http endpoint redirects immediately to https
      endpoints.select { |e| e.http? }.any? { |e| e.redirect && e.redirect.https? }
    end

    # A domain redirects if
    # 1. At least one endpoint is an external redirect, and
    # 2. All endpoints are either down or an external redirect
    def redirect?
      return false unless redirect
      endpoints.all? { |e| e.down? || e.external_redirect? }
    end

    # The first endpoint that is an external redirect, or nil if none is
    def redirect
      endpoints.find { |e| e.external_redirect? }
    end

    # HSTS on the canonical domain?
    def hsts?
      canonical_endpoint.hsts && canonical_endpoint.hsts.enabled?
    end

    # Does the https root endpoint's HSTS policy include subdomains?
    def hsts_subdomains?
      endpoints.find { |e| e.root? && e.https? }.hsts.include_subdomains?
    end

    # Is the https root endpoint's HSTS policy preload-ready?
    # Always false unless the policy includes subdomains.
    def hsts_preload_ready?
      return false unless hsts_subdomains?
      endpoints.find { |e| e.root? && e.https? }.hsts.preload_ready?
    end

    def to_s
      host
    end

    def inspect
      "#<SiteInspector::Domain host=\"#{host}\">"
    end

    # We know most API calls to the domain model are going to require
    # that the root of all four endpoints is called. Rather than process them
    # in serial, let's grab them in parallel and cache the results to speed
    # up later calls.
    #
    # NOTE(review): the responses are discarded here, so the speed-up relies
    # on a cache configured elsewhere (not visible in this file) -- confirm.
    def prefetch
      endpoints.each do |endpoint|
        request = Typhoeus::Request.new(endpoint.uri, SiteInspector.typhoeus_defaults)
        SiteInspector.hydra.queue(request)
      end
      SiteInspector.hydra.run
    end

    # Converts the domain to a hash
    #
    # By default, it only returns domain-wide information and
    # information about the canonical endpoint
    #
    # It will also pass options along to each endpoint's to_h method
    #
    # options:
    #   :all - return information about all endpoints
    #          (accepted as either the symbol :all or the string "all")
    #
    # Returns a complete hash of the domain's information
    def to_h(options={})
      prefetch

      hash = {
        host: host,
        up: up?,
        www: www?,
        root: root?,
        https: https?,
        enforces_https: enforces_https?,
        downgrades_https: downgrades_https?,
        canonically_www: canonically_www?,
        canonically_https: canonically_https?,
        redirect: redirect?,
        hsts: hsts?,
        hsts_subdomains: hsts_subdomains?,
        hsts_preload_ready: hsts_preload_ready?
      }

      canonical = canonical_endpoint.to_h(options)
      hash[:canonical_endpoint] = canonical
      # Legacy, misspelled key kept so existing consumers keep working
      hash[:canoncial_endpoint] = canonical

      if options["all"] || options[:all]
        hash.merge!({
          endpoints: {
            https: {
              root: endpoints[0].to_h(options),
              www: endpoints[1].to_h(options)
            },
            http: {
              root: endpoints[2].to_h(options),
              www: endpoints[3].to_h(options)
            }
          }
        })
      end

      hash
    end

    # Serializes #to_h as JSON.
    #
    # Accepts (and forwards) the optional generator arguments so that a
    # Domain nested inside another structure, e.g. `[domain].to_json`,
    # can be serialized too.
    def to_json(*args)
      to_h.to_json(*args)
    end
  end
end
@@ -0,0 +1,200 @@
1
class SiteInspector
  # Every domain has four possible "endpoints" to evaluate
  #
  # For example, if you had `example.com` you'd have:
  #   1. `http://example.com`
  #   2. `http://www.example.com`
  #   3. `https://example.com`
  #   4. `https://www.example.com`
  #
  # Because each of the four endpoints could potentially respond differently,
  # we must evaluate all four to make certain determinations
  class Endpoint
    attr_accessor :host, :uri

    # Initiate a new Endpoint object
    #
    # host - (string) the endpoint to query (e.g., `https://example.com`)
    #
    # Note that #host is stored with any leading "www." removed, while #uri
    # keeps the full (downcased) URI.
    def initialize(host)
      @uri = Addressable::URI.parse(host.downcase)
      @host = uri.host.sub(/^www\./, "")
      @checks = {}
    end

    def www?
      !!(uri.host =~ /^www\./)
    end

    def root?
      !www?
    end

    def https?
      https.scheme?
    end

    def http?
      !https?
    end

    def scheme
      @uri.scheme
    end

    # Performs a request against the endpoint and runs it to completion.
    #
    # options - Typhoeus request options, merged over
    #           SiteInspector.typhoeus_defaults. The extra key :path, when
    #           given, is joined onto the endpoint's URI and is not passed
    #           on to Typhoeus.
    #
    # The caller's options hash is left untouched.
    #
    # Returns the response of the request
    def request(options = {})
      options = options.dup
      path    = options.delete(:path)
      target  = path ? URI.join(uri, path) : uri
      request = Typhoeus::Request.new(target, SiteInspector.typhoeus_defaults.merge(options))
      hydra.queue(request)
      hydra.run
      request.response
    end

    # Makes a request of the given host with the default options
    #
    # Returns the (memoized) response object
    def response
      @response ||= request
    end

    # Does the server return any response? (including 50x)
    def response?
      !!response && response.code != 0 && !timed_out?
    end

    # The response code as a string, or nil when there is no response
    def response_code
      response.response_code.to_s if response
    end

    def timed_out?
      response && response.timed_out?
    end

    # Does the endpoint return a 2xx or 3xx response code?
    #
    # Returns false (rather than raising) when there is no response.
    def up?
      code = response_code
      !!code && (code.start_with?("2") || code.start_with?("3"))
    end

    def down?
      !up?
    end

    # If the domain is a redirect, what's the first endpoint we're redirected to?
    #
    # Returns nil when the response is not a 3xx, when it carries no location
    # header, or when the target has the same host (ignoring a leading "www."
    # on this endpoint) and scheme as this endpoint.
    def redirect
      return unless response && response_code.start_with?("3")

      @redirect ||= begin
        location = headers["location"]

        # A 3xx without a location header gives us nothing to follow
        return unless location

        redirect = Addressable::URI.parse(location)

        # This is a relative redirect, but we still need the absolute URI
        if redirect.relative?
          redirect.path = "/#{redirect.path}" unless redirect.path[0] == "/"
          redirect.host = host
          redirect.scheme = scheme
        end

        # This was a redirect to a subpath or back to itself, which we don't care about
        return if redirect.host == host && redirect.scheme == scheme

        # Init a new endpoint representing the redirect
        Endpoint.new(redirect.to_s)
      end
    end

    # Does this endpoint return a redirect?
    def redirect?
      !!redirect
    end

    # What's the effective URL of a request to this domain?
    #
    # Returns self when the endpoint does not redirect, otherwise a new
    # Endpoint for the URL reached after following redirects
    def resolves_to
      return self unless redirect?
      @resolves_to ||= begin
        response = request(:followlocation => true)

        # Workaround for Webmock not playing nicely with Typhoeus redirects
        if response.mock?
          if response.headers["Location"]
            url = response.headers["Location"]
          else
            url = response.request.url
          end
        else
          url = response.effective_url
        end

        Endpoint.new(url)
      end
    end

    # Does the endpoint ultimately resolve to a different host?
    def external_redirect?
      host != resolves_to.host
    end

    def to_s
      uri.to_s
    end

    def inspect
      "#<SiteInspector::Endpoint uri=\"#{uri.to_s}\">"
    end

    # Returns information about the endpoint
    #
    # By default, all checks are run. If one or more check names are passed
    # in the options hash, only those checks will be run.
    #
    # options:
    #   a hash keyed by check name; a check is selected when its name is
    #   present as a key (the associated value is not inspected)
    #
    # Returns the hash representing the endpoint and its checks
    def to_h(options={})
      hash = {
        uri: uri.to_s,
        host: host,
        www: www?,
        https: https?,
        scheme: scheme,
        up: up?,
        timed_out: timed_out?,
        redirect: redirect?,
        external_redirect: external_redirect?,
      }

      # Either they've specifically asked for a check, or we throw everything at them
      checks = SiteInspector::Endpoint.checks.select { |c| options.key?(c.name) }
      checks = SiteInspector::Endpoint.checks if checks.empty?

      checks.each do |check|
        hash[check.name] = self.send(check.name).to_h
      end

      hash
    end

    # All currently loaded subclasses of Check
    def self.checks
      ObjectSpace.each_object(Class).select { |klass| klass < Check }
    end

    # Exposes every check as a reader named after the check (e.g. `endpoint.https`),
    # instantiating it on first use and memoizing it per endpoint.
    def method_missing(method_sym, *arguments, &block)
      check = SiteInspector::Endpoint.checks.find { |c| c.name == method_sym }
      return super unless check

      @checks[method_sym] ||= check.new(self)
    end

    # Keeps respond_to? in sync with method_missing: an endpoint responds
    # to the name of every loaded check.
    def respond_to_missing?(method_sym, include_private = false)
      SiteInspector::Endpoint.checks.any? { |c| c.name == method_sym } || super
    end

    private

    def hydra
      SiteInspector.hydra
    end
  end
end