site-inspector 1.0.2 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +7 -0
- data/.ruby-version +1 -0
- data/.travis.yml +6 -0
- data/Gemfile +3 -0
- data/Guardfile +8 -0
- data/README.md +175 -0
- data/Rakefile +8 -0
- data/bin/site-inspector +48 -21
- data/lib/site-inspector.rb +38 -613
- data/lib/site-inspector/cache.rb +9 -52
- data/lib/site-inspector/checks/check.rb +41 -0
- data/lib/site-inspector/checks/content.rb +67 -0
- data/lib/site-inspector/checks/dns.rb +129 -0
- data/lib/site-inspector/checks/headers.rb +83 -0
- data/lib/site-inspector/checks/hsts.rb +78 -0
- data/lib/site-inspector/checks/https.rb +40 -0
- data/lib/site-inspector/checks/sniffer.rb +42 -0
- data/lib/site-inspector/disk_cache.rb +38 -0
- data/lib/site-inspector/domain.rb +248 -0
- data/lib/site-inspector/endpoint.rb +200 -0
- data/lib/site-inspector/rails_cache.rb +11 -0
- data/lib/site-inspector/version.rb +3 -0
- data/script/bootstrap +1 -0
- data/script/cibuild +7 -0
- data/script/console +1 -0
- data/script/release +38 -0
- data/site-inspector.gemspec +33 -0
- data/spec/checks/site_inspector_endpoint_check_spec.rb +34 -0
- data/spec/checks/site_inspector_endpoint_content_spec.rb +89 -0
- data/spec/checks/site_inspector_endpoint_dns_spec.rb +167 -0
- data/spec/checks/site_inspector_endpoint_headers_spec.rb +74 -0
- data/spec/checks/site_inspector_endpoint_hsts_spec.rb +91 -0
- data/spec/checks/site_inspector_endpoint_https_spec.rb +48 -0
- data/spec/checks/site_inspector_endpoint_sniffer_spec.rb +52 -0
- data/spec/site_inspector_cache_spec.rb +13 -0
- data/spec/site_inspector_disc_cache_spec.rb +31 -0
- data/spec/site_inspector_domain_spec.rb +252 -0
- data/spec/site_inspector_endpoint_spec.rb +224 -0
- data/spec/site_inspector_spec.rb +46 -0
- data/spec/spec_helper.rb +17 -0
- metadata +75 -57
- data/lib/site-inspector/compliance.rb +0 -19
- data/lib/site-inspector/dns.rb +0 -92
- data/lib/site-inspector/headers.rb +0 -59
- data/lib/site-inspector/sniffer.rb +0 -26
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5eab4885077637bf3251776f685d29b3cde53a3f
|
4
|
+
data.tar.gz: de588b40792c8fb27be03badd85df8c7b74c3d6b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c23e39615a372abb0ce5958d1ed7abdbfb7d98c436b1cdfd21ec9072471f0eed3c0365641d3211945212309205b6501cb59b2f7b158017ef082f71b72c738d02
|
7
|
+
data.tar.gz: 0dc5713aa8301858af5e0c8941aa12c05140837e33151d650475aed15d39794d950ea90c1152799ec6ad04b9d9739f0cd14a634abc69ae9ff48145a17f0da7fd
|
data/.gitignore
ADDED
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.1.4
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Guardfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,175 @@
|
|
1
|
+
# Site Inspector
|
2
|
+
|
3
|
+
A Ruby Gem to sniff information about a domain's technology and capabilities.
|
4
|
+
|
5
|
+
[![Gem Version](https://badge.fury.io/rb/site-inspector.svg)](http://badge.fury.io/rb/site-inspector) [![Build Status](https://travis-ci.org/benbalter/site-inspector-ruby.svg)](https://travis-ci.org/benbalter/site-inspector-ruby)
|
6
|
+
|
7
|
+
## Demo
|
8
|
+
|
9
|
+
[site-inspector.herokuapp.com](https://site-inspector.herokuapp.com) ([source](https://github.com/benbalter/site-inspector-demo))
|
10
|
+
|
11
|
+
## Concepts
|
12
|
+
|
13
|
+
Site Inspector involves three primary concepts:
|
14
|
+
|
15
|
+
* **Domain** - A domain has a host defined by its TLD + SLD. A domain might be `example.com`. Domains have certain domain-wide properties, like whether the domain supports non-www requests, or if it enforces HTTPS.
|
16
|
+
|
17
|
+
* **Endpoint** - Each domain has four endpoints based on whether you make your request with HTTPS or not, and whether you prefix the host with `www.` or not. So the domain `example.com` may have endpoints at `https://example.com`, `https://www.example.com`, `http://example.com`, and `http://www.example.com`. There may theoretically be a different server responding to each endpoint, so endpoints have certain endpoint-specific properties, like whether it responds or not, or whether it redirects. Each domain has one canonical (primary) endpoint.
|
18
|
+
|
19
|
+
* **Checks** - A check is a set of tests performed on an endpoint. A check might look at what headers are returned, what CMS is used, or whether there is a valid HTTPS certificate. There are some built in checks, listed below, or you can define your own. While they're endpoint specific, checks often filter up and inform some of the domain-wide logic (such as if the domain supports HTTPS).
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
### Ruby
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
domain = SiteInspector.inspect "whitehouse.gov"
|
27
|
+
domain.https?
|
28
|
+
# => true
|
29
|
+
domain.www?
|
30
|
+
# => true
|
31
|
+
domain.canonical_endpoint.to_s
|
32
|
+
# => "https://www.whitehouse.gov"
|
33
|
+
domain.canonical_endpoint.sniffer.cms
|
34
|
+
# => { :drupal => {}}
|
35
|
+
```
|
36
|
+
|
37
|
+
### Command line usage
|
38
|
+
|
39
|
+
```
|
40
|
+
site-inspector inspect -- inspects a domain
|
41
|
+
|
42
|
+
Usage:
|
43
|
+
|
44
|
+
site-inspector inspect <domain> [options]
|
45
|
+
|
46
|
+
Options:
|
47
|
+
-j, --json JSON encode the output
|
48
|
+
-a, --all return results for all endpoints (defaults to only the canonical endpoint)
|
49
|
+
--sniffer return results for the sniffer check (defaults to all checks unless one or more checks are specified)
|
50
|
+
--https return results for the https check (defaults to all checks unless one or more checks are specified)
|
51
|
+
--hsts return results for the hsts check (defaults to all checks unless one or more checks are specified)
|
52
|
+
--headers return results for the headers check (defaults to all checks unless one or more checks are specified)
|
53
|
+
--dns return results for the dns check (defaults to all checks unless one or more checks are specified)
|
54
|
+
--content return results for the content check (defaults to all checks unless one or more checks are specified)
|
55
|
+
-h, --help Show this message
|
56
|
+
-v, --version Print the name and version
|
57
|
+
-t, --trace Show the full backtrace when an error occurs
|
58
|
+
```
|
59
|
+
|
60
|
+
## What's checked
|
61
|
+
|
62
|
+
### Domain
|
63
|
+
|
64
|
+
* `canonical_endpoint` - The domain's primary endpoint
|
65
|
+
* `government` - whether the domain is a government domain
|
66
|
+
* `up` - whether any endpoint responds
|
67
|
+
* `www` - whether either `www` endpoint responds
|
68
|
+
* `root` - whether you can access the domain without `www.`
|
69
|
+
* `https` - whether HTTPS is supported
|
70
|
+
* `enforces_https` - whether non-https endpoints are either down or redirect to https
|
71
|
+
* `downgrades_https` - whether the canonical endpoint redirects to an http endpoint
|
72
|
+
* `canonically_www` - whether non-www requests are redirected to www (or all non-www endpoints are down)
|
73
|
+
* `canonically_https` - whether non-https requests are redirected to https (or all http endpoints are down)
|
74
|
+
* `redirect` - whether the domain redirects to an external domain
|
75
|
+
* `hsts` - does the canonical endpoint have HSTS enabled
|
76
|
+
* `hsts_subdomains` - are subdomains included in the HSTS list?
|
77
|
+
* `hsts_preload_ready` - can this domain be added to the HSTS preload list?
|
78
|
+
|
79
|
+
### Endpoint
|
80
|
+
|
81
|
+
* `up` - whether the endpoint responds or not
|
82
|
+
* `timed_out` - whether the endpoint times out
|
83
|
+
* `redirect` - whether the endpoint redirects
|
84
|
+
* `external_redirect` - whether the endpoint redirects to another domain
|
85
|
+
|
86
|
+
### Checks
|
87
|
+
|
88
|
+
Each endpoint also returns the following checks:
|
89
|
+
|
90
|
+
#### Content
|
91
|
+
|
92
|
+
* `doctype` - The HTML doctype returned
|
93
|
+
* `sitemap_xml` - Whether the endpoint has a sitemap
|
94
|
+
* `robots_txt` - whether the endpoint has a `robots.txt` file
|
95
|
+
|
96
|
+
#### DNS
|
97
|
+
|
98
|
+
* `dnssec` - is DNSSEC supported
|
99
|
+
* `ipv6` - is IPV6 supported
|
100
|
+
* `cdn` - the endpoint's CDN, if any
|
101
|
+
* `cloud_provider` - the endpoint's cloud provider, if any
|
102
|
+
* `google_apps` - whether the domain is using google apps
|
103
|
+
* `hostname` - the server hostname
|
104
|
+
* `ip` - the server IP
|
105
|
+
|
106
|
+
#### Headers
|
107
|
+
|
108
|
+
* `cookies` - does the domain use cookies
|
109
|
+
* `strict_transport_security` - whether STS is enabled
|
110
|
+
* `content_security_policy` - the endpoint's CSP
|
111
|
+
* `click_jacking_protection` - whether an `x-frame-options` header is sent
|
112
|
+
* `xss_protection` - whether an `x-xss-protection` header is sent
|
113
|
+
* `server` - the server header
|
114
|
+
* `secure_cookies` - whether the cookies are secure, or not
|
115
|
+
|
116
|
+
#### HSTS
|
117
|
+
|
118
|
+
* `valid` - whether the HSTS header is valid
|
119
|
+
* `max_age` - the HSTS max age
|
120
|
+
* `include_subdomains` - whether subdomains are included
|
121
|
+
* `preload` - whether it's preloaded
|
122
|
+
* `enabled` - whether HSTS is enabled
|
123
|
+
* `preload_ready` - whether HSTS could be preloaded
|
124
|
+
|
125
|
+
#### HTTPS
|
126
|
+
|
127
|
+
* `valid` - if the HTTPS response is valid
|
128
|
+
* `return_code` - the HTTPS error, if any
|
129
|
+
|
130
|
+
#### Sniffer
|
131
|
+
|
132
|
+
* `cms` - the CMS used, if any
|
133
|
+
* `analytics` - the analytics providers used, if any
|
134
|
+
* `javascript` - the javascript libraries used, if any
|
135
|
+
* `advertising` - the advertising providers used, if any
|
136
|
+
|
137
|
+
## Adding your own check
|
138
|
+
|
139
|
+
[Checks](https://github.com/benbalter/site-inspector-ruby/tree/master/lib/site-inspector/checks) are special classes that are children of [`SiteInspector::Endpoint::Check`](https://github.com/benbalter/site-inspector-ruby/blob/master/lib/site-inspector/checks/check.rb). You can implement your own check like this:
|
140
|
+
|
141
|
+
```ruby
|
142
|
+
class SiteInspector
|
143
|
+
class Endpoint
|
144
|
+
class Mention
|
145
|
+
def mentions_ben?
|
146
|
+
endpoint.content.body =~ /ben/i
|
147
|
+
end
|
148
|
+
end
|
149
|
+
end
|
150
|
+
end
|
151
|
+
```
|
152
|
+
|
153
|
+
Checks can call the `endpoint` object, which contains the request, response, and other checks. Custom checks are automatically exposed as endpoint methods.
|
154
|
+
|
155
|
+
## Contributing
|
156
|
+
|
157
|
+
### Bootstrapping locally
|
158
|
+
|
159
|
+
1. Clone down the repo
|
160
|
+
2. `script/bootstrap`
|
161
|
+
|
162
|
+
### Running tests
|
163
|
+
|
164
|
+
`script/cibuild`
|
165
|
+
|
166
|
+
### Development console
|
167
|
+
|
168
|
+
`script/console`
|
169
|
+
|
170
|
+
### How to contribute
|
171
|
+
|
172
|
+
1. Fork the project
|
173
|
+
2. Create a new, descriptively named feature branch
|
174
|
+
3. Make your changes
|
175
|
+
4. Submit a pull request
|
data/Rakefile
ADDED
data/bin/site-inspector
CHANGED
@@ -1,29 +1,56 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
require "mercenary"
|
4
|
+
require "oj"
|
5
|
+
require 'yaml'
|
6
|
+
require 'colorator'
|
3
7
|
require_relative "../lib/site-inspector"
|
4
8
|
|
5
|
-
|
9
|
+
def stringify_keys_deep!(h)
|
10
|
+
h.keys.each do |k|
|
11
|
+
ks = k.respond_to?(:to_s) ? k.to_s : k
|
12
|
+
h[ks] = h.delete k # Preserve order even when k == ks
|
13
|
+
stringify_keys_deep! h[ks] if h[ks].kind_of? Hash
|
14
|
+
end
|
15
|
+
end
|
6
16
|
|
7
|
-
|
8
|
-
|
17
|
+
Mercenary.program(:"site-inspector") do |p|
|
18
|
+
p.version SiteInspector::VERSION
|
19
|
+
p.description "Returns information about a domain's technology and capabilities"
|
20
|
+
p.syntax "site-inspector <command> <domain> [options]"
|
9
21
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
22
|
+
p.command(:inspect) do |c|
|
23
|
+
c.syntax "inspect <domain> [options]"
|
24
|
+
c.description "inspects a domain"
|
25
|
+
c.option 'json', '-j', '--json', 'JSON encode the output'
|
26
|
+
c.option 'all', '-a', '--all', 'return results for all endpoints (defaults to only the canonical endpoint)'
|
14
27
|
|
15
|
-
|
16
|
-
#
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
details = site.http
|
22
|
-
|
23
|
-
# Normal mode: autodetect canonical domain, sweep every attribute.
|
24
|
-
else
|
25
|
-
site = SiteInspector.new(domain)
|
26
|
-
details = site.to_hash
|
27
|
-
end
|
28
|
+
SiteInspector::Endpoint.checks.each do |check|
|
29
|
+
c.option check.name, "--#{check.name}", "return results for the #{check.name} check (defaults to all checks unless one or more checks are specified)"
|
30
|
+
end
|
31
|
+
|
32
|
+
c.action do |args, options|
|
33
|
+
next c.logger.fatal "Must specify a domain" if args.length != 1
|
28
34
|
|
29
|
-
|
35
|
+
# Build our domain hash as requested
|
36
|
+
domain = SiteInspector.inspect(args[0])
|
37
|
+
hash = domain.to_h(options)
|
38
|
+
json = Oj.dump(hash, indent: 2, mode: :compat)
|
39
|
+
|
40
|
+
# Dump the JSON and run
|
41
|
+
next puts json if options["json"]
|
42
|
+
|
43
|
+
# This is a dirty, dirty hack, but it's a simple way to stringify keys recursively
|
44
|
+
# And format the output in a human-readable way
|
45
|
+
yaml = YAML.dump Oj.load(json)
|
46
|
+
|
47
|
+
# Colorize bools
|
48
|
+
yaml.gsub! /\: (true|ok)$/, ": " + "true".green
|
49
|
+
yaml.gsub! /\: false$/, ": " + "false".red
|
50
|
+
|
51
|
+
puts yaml
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
p.default_command(:inspect)
|
56
|
+
end
|
data/lib/site-inspector.rb
CHANGED
@@ -1,636 +1,61 @@
|
|
1
|
-
|
2
|
-
# needed for HTTP analysis
|
3
1
|
require 'open-uri'
|
4
|
-
require
|
2
|
+
require 'addressable/uri'
|
5
3
|
require 'public_suffix'
|
6
4
|
require 'typhoeus'
|
7
5
|
|
8
6
|
require_relative 'site-inspector/cache'
|
9
|
-
require_relative 'site-inspector/
|
10
|
-
require_relative 'site-inspector/
|
11
|
-
require_relative 'site-inspector/
|
12
|
-
require_relative 'site-inspector/
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
7
|
+
require_relative 'site-inspector/disk_cache'
|
8
|
+
require_relative 'site-inspector/rails_cache'
|
9
|
+
require_relative 'site-inspector/domain'
|
10
|
+
require_relative 'site-inspector/checks/check'
|
11
|
+
require_relative 'site-inspector/checks/content'
|
12
|
+
require_relative 'site-inspector/checks/dns'
|
13
|
+
require_relative 'site-inspector/checks/headers'
|
14
|
+
require_relative 'site-inspector/checks/hsts'
|
15
|
+
require_relative 'site-inspector/checks/https'
|
16
|
+
require_relative 'site-inspector/checks/sniffer'
|
17
|
+
require_relative 'site-inspector/endpoint'
|
18
|
+
require_relative 'site-inspector/version'
|
20
19
|
|
21
20
|
class SiteInspector
|
21
|
+
class << self
|
22
22
|
|
23
|
-
|
24
|
-
require 'yaml'
|
25
|
-
YAML.load_file File.expand_path "./data/#{name}.yml", File.dirname(__FILE__)
|
26
|
-
end
|
27
|
-
|
28
|
-
# Utility parser for HSTS headers.
|
29
|
-
# RFC: http://tools.ietf.org/html/rfc6797
|
30
|
-
def self.hsts_parse(header)
|
31
|
-
# no hsts for you
|
32
|
-
nothing = {
|
33
|
-
max_age: nil,
|
34
|
-
include_subdomains: false,
|
35
|
-
preload: false,
|
36
|
-
enabled: false,
|
37
|
-
preload_ready: false
|
38
|
-
}
|
39
|
-
|
40
|
-
return nothing unless header and header.is_a?(String)
|
41
|
-
|
42
|
-
directives = header.split(/\s*;\s*/)
|
43
|
-
|
44
|
-
pairs = []
|
45
|
-
directives.each do |directive|
|
46
|
-
name, value = directive.downcase.split("=")
|
47
|
-
|
48
|
-
if value and value.start_with?("\"") and value.end_with?("\"")
|
49
|
-
value = value.sub(/^\"/, '')
|
50
|
-
value = value.sub(/\"$/, '')
|
51
|
-
end
|
52
|
-
|
53
|
-
pairs.push([name, value])
|
54
|
-
end
|
55
|
-
|
56
|
-
# reject invalid directives
|
57
|
-
fatal = pairs.any? do |name, value|
|
58
|
-
# TODO: more comprehensive rejection of characters
|
59
|
-
invalid_chars = /[\s\'\"]/
|
60
|
-
(name =~ invalid_chars) or (value =~ invalid_chars)
|
61
|
-
end
|
62
|
-
|
63
|
-
# good DAY, sir
|
64
|
-
return nothing if fatal
|
65
|
-
|
66
|
-
max_age_directive = pairs.find {|n, v| n == "max-age"}
|
67
|
-
max_age = max_age_directive ? max_age_directive[1].to_i : nil
|
68
|
-
include_subdomains = !!pairs.find {|n, v| n == "includesubdomains"}
|
69
|
-
preload = !!pairs.find {|n, v| n == "preload"}
|
70
|
-
|
71
|
-
enabled = !!(max_age and (max_age > 0))
|
72
|
-
|
73
|
-
# Google's minimum max-age for automatic preloading
|
74
|
-
eighteen_weeks = !!(max_age and (max_age >= 10886400))
|
75
|
-
preload_ready = !!(eighteen_weeks and include_subdomains and preload)
|
76
|
-
|
77
|
-
{
|
78
|
-
max_age: max_age,
|
79
|
-
include_subdomains: include_subdomains,
|
80
|
-
preload: preload,
|
81
|
-
enabled: enabled,
|
82
|
-
preload_ready: preload_ready
|
83
|
-
}
|
84
|
-
end
|
85
|
-
|
86
|
-
# makes no network requests
|
87
|
-
def initialize(domain, options = {})
|
88
|
-
domain = domain.downcase
|
89
|
-
domain = domain.sub /^https?\:/, ""
|
90
|
-
domain = domain.sub /^\/+/, ""
|
91
|
-
domain = domain.sub /^www\./, ""
|
92
|
-
@uri = Addressable::URI.parse "//#{domain}"
|
93
|
-
@domain = PublicSuffix.parse @uri.host
|
94
|
-
@timeout = options[:timeout] || 10
|
95
|
-
end
|
96
|
-
|
97
|
-
def inspect
|
98
|
-
"<SiteInspector domain=\"#{domain}\">"
|
99
|
-
end
|
100
|
-
|
101
|
-
def uri(ssl=enforce_https?,www=www?)
|
102
|
-
uri = @uri.clone
|
103
|
-
uri.host = www ? "www.#{uri.host}" : uri.host
|
104
|
-
uri.scheme = ssl ? "https" : "http"
|
105
|
-
uri
|
106
|
-
end
|
107
|
-
|
108
|
-
def domain
|
109
|
-
www? ? PublicSuffix.parse("www.#{@uri.host}") : @domain
|
110
|
-
end
|
111
|
-
|
112
|
-
def request(ssl=false, www=false, followlocation=true, ssl_verifypeer=true, ssl_verifyhost=true)
|
113
|
-
to_get = uri(ssl, www)
|
23
|
+
attr_writer :timeout, :cache
|
114
24
|
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
def response
|
122
|
-
@response ||= begin
|
123
|
-
if response = request(false, false) and response.success?
|
124
|
-
@non_www = true
|
125
|
-
response
|
126
|
-
elsif response = request(false, true) and response.success?
|
127
|
-
@non_www = false
|
128
|
-
response
|
129
|
-
else
|
130
|
-
false
|
131
|
-
end
|
132
|
-
end
|
133
|
-
end
|
134
|
-
|
135
|
-
def timed_out?
|
136
|
-
response && response.timed_out?
|
137
|
-
end
|
138
|
-
|
139
|
-
def doc
|
140
|
-
require 'nokogiri'
|
141
|
-
@doc ||= Nokogiri::HTML response.body if response
|
142
|
-
end
|
143
|
-
|
144
|
-
def body
|
145
|
-
doc.to_s.force_encoding("UTF-8").encode("UTF-8", :invalid => :replace, :replace => "")
|
146
|
-
end
|
147
|
-
|
148
|
-
def government?
|
149
|
-
require 'gman'
|
150
|
-
Gman.valid? domain.to_s
|
151
|
-
end
|
152
|
-
|
153
|
-
def https?
|
154
|
-
@https ||= request(true, www?).success?
|
155
|
-
end
|
156
|
-
alias_method :ssl?, :https?
|
157
|
-
|
158
|
-
def enforce_https?
|
159
|
-
return false unless https?
|
160
|
-
@enforce_https ||= begin
|
161
|
-
response = request(false, www?)
|
162
|
-
if response.effective_url
|
163
|
-
Addressable::URI.parse(response.effective_url).scheme == "https"
|
25
|
+
def cache
|
26
|
+
@cache ||= if ENV['CACHE']
|
27
|
+
SiteInspector::DiskCache.new
|
28
|
+
elsif Object.const_defined?('Rails')
|
29
|
+
SiteInspector::RailsCache.new
|
164
30
|
else
|
165
|
-
|
31
|
+
SiteInspector::Cache.new
|
166
32
|
end
|
167
33
|
end
|
168
|
-
end
|
169
|
-
|
170
|
-
def www?
|
171
|
-
response && response.effective_url && !!response.effective_url.match(/^https?:\/\/www\./)
|
172
|
-
end
|
173
|
-
|
174
|
-
def non_www?
|
175
|
-
response && @non_www
|
176
|
-
end
|
177
34
|
|
178
|
-
|
179
|
-
|
180
|
-
end
|
181
|
-
|
182
|
-
def redirect
|
183
|
-
@redirect ||= begin
|
184
|
-
if location = request(https?, www?, false).headers["location"]
|
185
|
-
redirect_domain = SiteInspector.new(location).domain
|
186
|
-
redirect_domain.to_s if redirect_domain.to_s != domain.to_s
|
187
|
-
end
|
188
|
-
rescue
|
189
|
-
nil
|
35
|
+
def timeout
|
36
|
+
@timeout || 10
|
190
37
|
end
|
191
|
-
end
|
192
|
-
|
193
|
-
def http
|
194
|
-
details = {
|
195
|
-
endpoints: endpoints
|
196
|
-
}
|
197
|
-
|
198
|
-
# convenient shorthand for the extensive statements to come
|
199
|
-
combos = details[:endpoints]
|
200
|
-
|
201
|
-
# A domain is "canonically" at www if:
|
202
|
-
# * at least one of its www endpoints responds
|
203
|
-
# * both root endpoints are either down or redirect *somewhere*
|
204
|
-
# * either both root endpoints are down, *or* at least one
|
205
|
-
# root endpoint redirect should immediately go to
|
206
|
-
# an *internal* www endpoint
|
207
|
-
# This is meant to affirm situations like:
|
208
|
-
# http:// -> https:// -> https://www
|
209
|
-
# https:// -> http:// -> https://www
|
210
|
-
# and meant to avoid affirming situations like:
|
211
|
-
# http:// -> http://non-www,
|
212
|
-
# http://www -> http://non-www
|
213
|
-
# or like:
|
214
|
-
# https:// -> 200, http:// -> http://www
|
215
38
|
|
216
|
-
|
217
|
-
(
|
218
|
-
combos[:https][:www][:up] or
|
219
|
-
combos[:http][:www][:up]
|
220
|
-
) and (
|
221
|
-
(
|
222
|
-
combos[:https][:root][:redirect] or
|
223
|
-
!combos[:https][:root][:up] or
|
224
|
-
combos[:https][:root][:https_bad_name] or
|
225
|
-
!combos[:https][:root][:status].to_s.start_with?("2")
|
226
|
-
) and (
|
227
|
-
combos[:http][:root][:redirect] or
|
228
|
-
!combos[:http][:root][:up] or
|
229
|
-
!combos[:http][:root][:status].to_s.start_with?("2")
|
230
|
-
)
|
231
|
-
) and (
|
232
|
-
(
|
233
|
-
(
|
234
|
-
!combos[:https][:root][:up] or
|
235
|
-
combos[:https][:root][:https_bad_name] or
|
236
|
-
!combos[:https][:root][:status].to_s.start_with?("2")
|
237
|
-
) and
|
238
|
-
(
|
239
|
-
!combos[:http][:root][:up] or
|
240
|
-
!combos[:http][:root][:status].to_s.start_with?("2")
|
241
|
-
)
|
242
|
-
) or
|
243
|
-
(
|
244
|
-
combos[:https][:root][:redirect_immediately_to_www] and
|
245
|
-
!combos[:https][:root][:redirect_immediately_external]
|
246
|
-
) or
|
247
|
-
(
|
248
|
-
combos[:http][:root][:redirect_immediately_to_www] and
|
249
|
-
!combos[:http][:root][:redirect_immediately_external]
|
250
|
-
)
|
251
|
-
)
|
252
|
-
)
|
253
|
-
|
254
|
-
# A domain is "canonically" at https if:
|
255
|
-
# * at least one of its https endpoints is live and
|
256
|
-
# doesn't have an invalid hostname
|
257
|
-
# * both http endpoints are either down or redirect *somewhere*
|
258
|
-
# * at least one http endpoint redirects immediately to
|
259
|
-
# an *internal* https endpoint
|
260
|
-
# This is meant to affirm situations like:
|
261
|
-
# http:// -> http://www -> https://
|
262
|
-
# https:// -> http:// -> https://www
|
263
|
-
# and meant to avoid affirming situations like:
|
264
|
-
# http:// -> http://non-www
|
265
|
-
# http://www -> http://non-www
|
266
|
-
# or:
|
267
|
-
# http:// -> 200, http://www -> https://www
|
268
|
-
#
|
269
|
-
# It allows a site to be canonically HTTPS if the cert has
|
270
|
-
# a valid hostname but invalid chain issues.
|
271
|
-
|
272
|
-
https = !!(
|
273
|
-
(
|
274
|
-
(
|
275
|
-
combos[:https][:root][:up] and
|
276
|
-
!combos[:https][:root][:https_bad_name]
|
277
|
-
) or
|
278
|
-
(
|
279
|
-
combos[:https][:www][:up] and
|
280
|
-
!combos[:https][:www][:https_bad_name]
|
281
|
-
)
|
282
|
-
) and (
|
283
|
-
(
|
284
|
-
combos[:http][:root][:redirect] or
|
285
|
-
!combos[:http][:root][:up] or
|
286
|
-
!combos[:http][:root][:status].to_s.start_with?("2")
|
287
|
-
) and (
|
288
|
-
combos[:http][:www][:redirect] or
|
289
|
-
!combos[:http][:www][:up] or
|
290
|
-
!combos[:http][:www][:status].to_s.start_with?("2")
|
291
|
-
)
|
292
|
-
) and (
|
293
|
-
(
|
294
|
-
combos[:http][:root][:redirect_immediately_to_https] and
|
295
|
-
!combos[:http][:root][:redirect_immediately_external]
|
296
|
-
) or (
|
297
|
-
combos[:http][:www][:redirect_immediately_to_https] and
|
298
|
-
!combos[:http][:www][:redirect_immediately_external]
|
299
|
-
)
|
300
|
-
)
|
301
|
-
)
|
302
|
-
|
303
|
-
details[:canonical_endpoint] = www ? :www : :root
|
304
|
-
details[:canonical_protocol] = https ? :https : :http
|
305
|
-
details[:canonical] = uri(https, www).to_s
|
306
|
-
|
307
|
-
# If any endpoint is up, the domain is up.
|
308
|
-
details[:up] = !!(
|
309
|
-
combos[:https][:www][:up] or
|
310
|
-
combos[:https][:root][:up] or
|
311
|
-
combos[:http][:www][:up] or
|
312
|
-
combos[:http][:root][:up]
|
313
|
-
)
|
314
|
-
|
315
|
-
# A domain's root is broken if neither protocol can connect.
|
316
|
-
details[:broken_root] = !!(
|
317
|
-
!combos[:https][:root][:up] and
|
318
|
-
!combos[:http][:root][:up]
|
319
|
-
)
|
320
|
-
|
321
|
-
# A domain's www is broken if neither protocol can connect.
|
322
|
-
details[:broken_www] = !!(
|
323
|
-
!combos[:https][:www][:up] and
|
324
|
-
!combos[:http][:www][:up]
|
325
|
-
)
|
326
|
-
|
327
|
-
# HTTPS is "supported" (different than "canonical" or "enforced") if:
|
328
|
-
#
|
329
|
-
# * Either of the HTTPS endpoints is listening, and doesn't have
|
330
|
-
# an invalid hostname.
|
331
|
-
details[:support_https] = !!(
|
332
|
-
(
|
333
|
-
(combos[:https][:root][:status] != 0) and
|
334
|
-
!combos[:https][:root][:https_bad_name]
|
335
|
-
) or (
|
336
|
-
(combos[:https][:www][:status] != 0) and
|
337
|
-
!combos[:https][:www][:https_bad_name]
|
338
|
-
)
|
339
|
-
)
|
340
|
-
|
341
|
-
# we can say that a canonical HTTPS site "defaults" to HTTPS,
|
342
|
-
# even if it doesn't *strictly* enforce it (e.g. having a www
|
343
|
-
# subdomain first to go HTTP root before HTTPS root).
|
344
|
-
details[:default_https] = https
|
345
|
-
|
346
|
-
# HTTPS is "downgraded" if both:
|
347
|
-
#
|
348
|
-
# * HTTPS is supported, and
|
349
|
-
# * The 'canonical' endpoint gets an immediate internal redirect to HTTP.
|
350
|
-
|
351
|
-
details[:downgrade_https] = !!(
|
352
|
-
details[:support_https] and
|
353
|
-
(
|
354
|
-
combos[:https][details[:canonical_endpoint]][:redirect] and
|
355
|
-
!combos[:https][details[:canonical_endpoint]][:redirect_immediately_external] and
|
356
|
-
!combos[:https][details[:canonical_endpoint]][:redirect_immediately_to_https]
|
357
|
-
)
|
358
|
-
)
|
359
|
-
|
360
|
-
# HTTPS is enforced if one of the HTTPS endpoints is "live",
|
361
|
-
# and if both *HTTP* endpoints are either:
|
362
|
-
#
|
363
|
-
# * down, or
|
364
|
-
# * redirect immediately to HTTPS.
|
365
|
-
#
|
366
|
-
# This is different than whether a domain is "canonically" HTTPS.
|
367
|
-
#
|
368
|
-
# * an HTTP redirect can go to HTTPS on another domain, as long
|
369
|
-
# as it's immediate.
|
370
|
-
# * a domain with an invalid cert can still be enforcing HTTPS.
|
371
|
-
details[:enforce_https] = !!(
|
372
|
-
(
|
373
|
-
!combos[:http][:www][:up] or
|
374
|
-
(combos[:http][:www][:redirect_immediately_to_https])
|
375
|
-
) and
|
376
|
-
(
|
377
|
-
!combos[:http][:root][:up] or
|
378
|
-
(combos[:http][:root][:redirect_immediately_to_https])
|
379
|
-
) and
|
380
|
-
(
|
381
|
-
combos[:https][:www][:up] or
|
382
|
-
combos[:https][:root][:up]
|
383
|
-
)
|
384
|
-
)
|
385
|
-
|
386
|
-
# The domain is a redirect if at least one endpoint is up,
|
387
|
-
# and each one is *either* an external redirect or down entirely.
|
388
|
-
details[:redirect] = !!(
|
389
|
-
details[:up] and
|
390
|
-
(
|
391
|
-
combos[:http][:www][:redirect_external] or
|
392
|
-
!combos[:http][:www][:up] or
|
393
|
-
combos[:http][:www][:status] >= 400
|
394
|
-
) and
|
395
|
-
(
|
396
|
-
combos[:http][:root][:redirect_external] or
|
397
|
-
!combos[:http][:root][:up] or
|
398
|
-
combos[:http][:root][:status] >= 400
|
399
|
-
) and
|
400
|
-
(
|
401
|
-
combos[:https][:www][:redirect_external] or
|
402
|
-
!combos[:https][:www][:up] or
|
403
|
-
combos[:https][:www][:https_bad_name] or
|
404
|
-
combos[:https][:www][:status] >= 400
|
405
|
-
) and
|
406
|
-
(
|
407
|
-
combos[:https][:root][:redirect_external] or
|
408
|
-
!combos[:https][:root][:up] or
|
409
|
-
combos[:https][:root][:https_bad_name] or
|
410
|
-
combos[:https][:root][:status] >= 400
|
411
|
-
)
|
412
|
-
)
|
413
|
-
|
414
|
-
# OK, we've said a domain is a "redirect" domain.
|
415
|
-
# What does the domain redirect to?
|
416
|
-
if details[:redirect]
|
417
|
-
canon = combos[details[:canonical_protocol]][details[:canonical_endpoint]]
|
418
|
-
details[:redirect_to] = canon[:redirect_to]
|
419
|
-
else
|
420
|
-
details[:redirect_to] = nil
|
39
|
+
def inspect(domain)
|
40
|
+
Domain.new(domain)
|
421
41
|
end
|
422
42
|
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
)
|
432
|
-
|
433
|
-
# HSTS preload-ready for the entire domain?
|
434
|
-
#
|
435
|
-
# Re-checks :hsts_entire_domain in case the :preload_ready
|
436
|
-
# flag ever changes its definition to not require include_subdomains.
|
437
|
-
|
438
|
-
details[:hsts_entire_domain_preload] = !!(
|
439
|
-
details[:hsts_entire_domain] and
|
440
|
-
combos[:https][:root][:hsts_details][:preload_ready]
|
441
|
-
)
|
442
|
-
|
443
|
-
details
|
444
|
-
end
|
445
|
-
|
446
|
-
def endpoints
|
447
|
-
https_www = http_endpoint(true, true)
|
448
|
-
http_www = http_endpoint(false, true)
|
449
|
-
https_root = http_endpoint(true, false)
|
450
|
-
http_root = http_endpoint(false, false)
|
451
|
-
|
452
|
-
{
|
453
|
-
https: {
|
454
|
-
www: https_www,
|
455
|
-
root: https_root
|
456
|
-
},
|
457
|
-
http: {
|
458
|
-
www: http_www,
|
459
|
-
root: http_root
|
43
|
+
def typhoeus_defaults
|
44
|
+
{
|
45
|
+
:followlocation => false,
|
46
|
+
:timeout => SiteInspector.timeout,
|
47
|
+
:accept_encoding => "gzip",
|
48
|
+
:headers => {
|
49
|
+
"User-Agent" => "Mozilla/5.0 (compatible; SiteInspector/#{SiteInspector::VERSION}; +https://github.com/benbalter/site-inspector-ruby)"
|
50
|
+
}
|
460
51
|
}
|
461
|
-
}
|
462
|
-
end
|
463
|
-
|
464
|
-
# State of affairs at a particular endpoint.
|
465
|
-
def http_endpoint(ssl, www)
|
466
|
-
details = {}
|
467
|
-
|
468
|
-
# Don't follow redirects for first ping.
|
469
|
-
response = request(ssl, www, false)
|
470
|
-
|
471
|
-
|
472
|
-
# For HTTPS: examine the full range of possibilities.
|
473
|
-
if ssl
|
474
|
-
if response.return_code == :ok
|
475
|
-
details[:https_valid] = true
|
476
|
-
details[:https_bad_chain] = false
|
477
|
-
details[:https_bad_name] = false
|
478
|
-
|
479
|
-
# Bad certificate chain.
|
480
|
-
elsif response.return_code == :ssl_cacert
|
481
|
-
details[:https_valid] = false
|
482
|
-
details[:https_bad_chain] = true
|
483
|
-
response = request(ssl, www, false, false, true)
|
484
|
-
# Bad everything.
|
485
|
-
if response.return_code == :peer_failed_verification
|
486
|
-
details[:https_bad_name] = true
|
487
|
-
response = request(ssl, www, false, false, false)
|
488
|
-
end
|
489
|
-
|
490
|
-
# Bad hostname.
|
491
|
-
elsif response.return_code == :peer_failed_verification
|
492
|
-
details[:https_valid] = false
|
493
|
-
details[:https_bad_name] = true
|
494
|
-
response = request(ssl, www, false, true, false)
|
495
|
-
# Bad everything.
|
496
|
-
if response.return_code == :ssl_cacert
|
497
|
-
details[:https_bad_chain] = true
|
498
|
-
response = request(ssl, www, false, false, false)
|
499
|
-
end
|
500
|
-
|
501
|
-
# not sure what else would happen
|
502
|
-
elsif response.response_code != 0
|
503
|
-
details[:https_valid] = false
|
504
|
-
details[:https_unknown_issue] = response.return_code
|
505
|
-
end
|
506
52
|
end
|
507
53
|
|
508
|
-
|
509
|
-
|
510
|
-
details[:up] = (response.response_code != 0)
|
511
|
-
return details if !details[:up]
|
512
|
-
|
513
|
-
headers = Hash[response.headers.map{ |k,v| [k.downcase,v] }]
|
514
|
-
details[:headers] = headers
|
515
|
-
|
516
|
-
|
517
|
-
# HSTS only takes effect when delivered over valid HTTPS.
|
518
|
-
hsts = SiteInspector.hsts_parse(headers["strict-transport-security"])
|
519
|
-
|
520
|
-
details[:hsts] = !!(
|
521
|
-
ssl and
|
522
|
-
details[:https_valid] and
|
523
|
-
hsts[:enabled]
|
524
|
-
)
|
525
|
-
|
526
|
-
details[:hsts_header] = headers["strict-transport-security"]
|
527
|
-
details[:hsts_details] = hsts
|
528
|
-
|
529
|
-
|
530
|
-
# If it's a redirect, go find the ultimate response starting from this combo.
|
531
|
-
redirect_code = response.response_code.to_s.start_with?("3")
|
532
|
-
location_header = headers["location"]
|
533
|
-
if redirect_code and location_header
|
534
|
-
location_header = location_header.downcase
|
535
|
-
details[:redirect] = true
|
536
|
-
|
537
|
-
ultimate_response = request(ssl, www, true, !details[:https_bad_chain], !details[:https_bad_name])
|
538
|
-
uri_original = URI(ultimate_response.request.url)
|
539
|
-
|
540
|
-
# treat relative Location headers as having the original hostname
|
541
|
-
if location_header.start_with?("http:") or location_header.start_with?("https:")
|
542
|
-
uri_immediate = URI(URI.escape(location_header))
|
543
|
-
else
|
544
|
-
uri_immediate = URI.join(uri_original, URI.escape(location_header))
|
545
|
-
end
|
546
|
-
|
547
|
-
uri_eventual = URI(ultimate_response.effective_url.downcase)
|
548
|
-
|
549
|
-
# compare base domain names
|
550
|
-
base_original = PublicSuffix.parse(uri_original.hostname).domain
|
551
|
-
|
552
|
-
# if the redirects aren't to valid hostnames (e.g. IP addresses)
|
553
|
-
# then fine just compare them directly, they're not going to be
|
554
|
-
# identical anyway.
|
555
|
-
base_immediate = begin
|
556
|
-
PublicSuffix.parse(uri_immediate.hostname).domain
|
557
|
-
rescue PublicSuffix::DomainInvalid
|
558
|
-
uri_immediate.to_s
|
559
|
-
end
|
560
|
-
|
561
|
-
base_eventual = begin
|
562
|
-
PublicSuffix.parse(uri_eventual.hostname).domain
|
563
|
-
rescue PublicSuffix::DomainInvalid
|
564
|
-
uri_eventual.to_s
|
565
|
-
end
|
566
|
-
|
567
|
-
details[:redirect_immediately_to] = uri_immediate.to_s
|
568
|
-
details[:redirect_immediately_to_www] = !!uri_immediate.to_s.match(/^https?:\/\/www\./)
|
569
|
-
details[:redirect_immediately_to_https] = uri_immediate.to_s.start_with?("https://")
|
570
|
-
details[:redirect_immediately_external] = (base_original != base_immediate)
|
571
|
-
|
572
|
-
details[:redirect_to] = uri_eventual.to_s
|
573
|
-
details[:redirect_external] = (base_original != base_eventual)
|
574
|
-
|
575
|
-
# otherwise, mark all the redirect fields as false/null
|
576
|
-
else
|
577
|
-
details[:redirect] = false
|
578
|
-
details[:redirect_immediately_to] = nil
|
579
|
-
details[:redirect_immediately_to_www] = false
|
580
|
-
details[:redirect_immediately_to_https] = false
|
581
|
-
details[:redirect_immediately_external] = false
|
582
|
-
|
583
|
-
details[:redirect_to] = nil
|
584
|
-
details[:redirect_external] = false
|
585
|
-
end
|
586
|
-
|
587
|
-
details
|
588
|
-
end
|
589
|
-
|
590
|
-
def to_hash(http_only=false)
|
591
|
-
if http_only
|
592
|
-
{
|
593
|
-
:domain => domain.to_s,
|
594
|
-
:uri => uri.to_s,
|
595
|
-
:live => !!response,
|
596
|
-
:ssl => https?,
|
597
|
-
:enforce_https => enforce_https?,
|
598
|
-
:non_www => non_www?,
|
599
|
-
:redirect => redirect,
|
600
|
-
:headers => headers
|
601
|
-
}
|
602
|
-
else
|
603
|
-
{
|
604
|
-
:domain => domain.to_s,
|
605
|
-
:uri => uri.to_s,
|
606
|
-
:government => government?,
|
607
|
-
:live => !!response,
|
608
|
-
:ssl => https?,
|
609
|
-
:enforce_https => enforce_https?,
|
610
|
-
:non_www => non_www?,
|
611
|
-
:redirect => redirect,
|
612
|
-
:ip => ip,
|
613
|
-
:hostname => hostname.to_s,
|
614
|
-
:ipv6 => ipv6?,
|
615
|
-
:dnssec => dnssec?,
|
616
|
-
:cdn => cdn,
|
617
|
-
:google_apps => google_apps?,
|
618
|
-
:cloud_provider => cloud_provider,
|
619
|
-
:server => server,
|
620
|
-
:cms => cms,
|
621
|
-
:analytics => analytics,
|
622
|
-
:javascript => javascript,
|
623
|
-
:advertising => advertising,
|
624
|
-
:slash_data => slash_data?,
|
625
|
-
:slash_developer => slash_developer?,
|
626
|
-
:data_dot_json => data_dot_json?,
|
627
|
-
:click_jacking_protection => click_jacking_protection?,
|
628
|
-
:content_security_policy => content_security_policy?,
|
629
|
-
:xss_protection => xss_protection?,
|
630
|
-
:secure_cookies => secure_cookies?,
|
631
|
-
:strict_transport_security => strict_transport_security?,
|
632
|
-
:headers => headers
|
633
|
-
}
|
54
|
+
def hydra
|
55
|
+
@hydra ||= Typhoeus::Hydra.new(max_concurrency: 4)
|
634
56
|
end
|
635
57
|
end
|
636
58
|
end
|
59
|
+
|
60
|
+
Typhoeus::Config.memoize = true
|
61
|
+
Typhoeus::Config.cache = SiteInspector.cache
|