site_health 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +3 -0
- data/.rubocop.yml +7 -0
- data/.ruby-style-guide.yml +263 -0
- data/.travis.yml +3 -2
- data/CHANGELOG.md +10 -0
- data/Gemfile +4 -2
- data/LICENSE.txt +1 -1
- data/README.md +165 -21
- data/Rakefile +5 -3
- data/bin/console +4 -10
- data/bin/setup +0 -2
- data/exe/site_health +75 -0
- data/lib/site_health.rb +89 -113
- data/lib/site_health/check_data.rb +35 -0
- data/lib/site_health/checkers/checker.rb +152 -0
- data/lib/site_health/checkers/facebook_share_link.rb +125 -0
- data/lib/site_health/checkers/google_page_speed.rb +55 -0
- data/lib/site_health/checkers/html_proofer.rb +67 -0
- data/lib/site_health/checkers/json_syntax.rb +28 -0
- data/lib/site_health/checkers/missing_description.rb +50 -0
- data/lib/site_health/checkers/missing_title.rb +41 -0
- data/lib/site_health/checkers/page_not_found.rb +30 -0
- data/lib/site_health/checkers/redirect.rb +16 -0
- data/lib/site_health/checkers/w3c_css.rb +37 -0
- data/lib/site_health/checkers/w3c_html.rb +37 -0
- data/lib/site_health/checkers/xml.rb +27 -0
- data/lib/site_health/configuration/configuration.rb +84 -0
- data/lib/site_health/configuration/html_proofer_configuration.rb +88 -0
- data/lib/site_health/configuration/w3c_validators_configuration.rb +23 -0
- data/lib/site_health/event_emitter.rb +70 -0
- data/lib/site_health/issue.rb +125 -0
- data/lib/site_health/issues.rb +43 -0
- data/lib/site_health/issues_report.rb +52 -0
- data/lib/site_health/key_struct.rb +6 -3
- data/lib/site_health/link.rb +32 -0
- data/lib/site_health/null_logger.rb +14 -0
- data/lib/site_health/nurse.rb +167 -0
- data/lib/site_health/summarizers/page_size_summarizer.rb +77 -0
- data/lib/site_health/timer.rb +47 -0
- data/lib/site_health/url_map.rb +41 -0
- data/lib/site_health/version.rb +10 -1
- data/lib/site_health/{journals/w3c_journal.rb → w3c_journal_builder.rb} +5 -1
- data/site_health.gemspec +28 -17
- metadata +144 -21
- data/lib/site_health/checkers/css_page.rb +0 -36
- data/lib/site_health/checkers/html_page.rb +0 -41
- data/lib/site_health/checkers/xml_page.rb +0 -21
- data/lib/site_health/journals/css_journal.rb +0 -12
- data/lib/site_health/journals/html_journal.rb +0 -16
- data/lib/site_health/journals/xml_journal.rb +0 -8
data/Rakefile
CHANGED
data/bin/console
CHANGED
@@ -1,14 +1,8 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
|
-
require
|
4
|
-
require
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'site_health'
|
5
6
|
|
6
|
-
|
7
|
-
# with your gem easier. You can also use a different console, if you like.
|
8
|
-
|
9
|
-
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
-
# require "pry"
|
11
|
-
# Pry.start
|
12
|
-
|
13
|
-
require "irb"
|
7
|
+
require 'irb'
|
14
8
|
IRB.start(__FILE__)
|
data/bin/setup
CHANGED
data/exe/site_health
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
#!/usr/bin/env ruby
# frozen_string_literal: true

require 'optparse'

# for dev purposes
require 'bundler/setup' if ENV['SITE_HEALTH_GEM_DEV']
require 'site_health'

# Runs SiteHealth.check against +url+ and wraps the resulting issues in a
# SiteHealth::IssuesReport.
#
# @param url [String] the site to check
# @param fields [Array<String>, nil] issue fields for the report; when nil the
#   report's own fields are left untouched
# @param progress [Boolean] when truthy, a CSV-like header and one line per
#   issue are written to stdout while the check runs
# @return [SiteHealth::IssuesReport]
def site_report(url, fields, progress)
  puts 'severity,title,url' if progress

  nurse = SiteHealth.check(url) do |n|
    n.clerk do |clerk|
      clerk.every_issue do |issue|
        next unless progress

        puts [issue.severity, issue.title, issue.url].join(',')
      end
    end
  end

  SiteHealth::IssuesReport.new(nurse.issues) { |report| report.fields = fields if fields }
end

# Command line options, collected into a plain Hash keyed by option name.
options = {}
cli = OptionParser.new do |parser|
  parser.banner = 'Usage: site_health --help'
  parser.default_argv = ARGV

  parser.on('--url=val0', String, '') { |value| options[:url] = value }

  parser.on(
    '--fields=priority,title,url',
    Array,
    'Issue fields to include - by default all fields are included'
  ) { |value| options[:fields] = value }

  parser.on('--output=result.csv', String, 'Output format, .csv or .json') do |value|
    options[:output] = value
  end

  parser.on('--[no-]progress', '') { |value| options[:progress] = value }

  parser.on('-h', '--help', 'How to use') do
    puts parser
    exit
  end
end
cli.parse!

# Aborts by raising +error_klass+ with +message+ plus a hint about --help.
#
# @param error_klass [Class] the exception class to raise
# @param message [String] what went wrong
# @raise [error_klass] always
def die(error_klass, message)
  raise(error_klass, "#{message} - run `site_health --help`")
end

# Builds the report for +url+ and writes it, serialized with +format_method+,
# to the +output+ path.
#
# @param output [String] path of the file to write
# @param url [String] the site to check
# @param fields [Array<String>, nil] see #site_report
# @param progress [Boolean] see #site_report
# @param format_method [Symbol] report method producing the file content
# @return [Integer] whatever File.write returns
def perform(output, url, fields, progress, format_method)
  report = site_report(url, fields, progress)
  File.write(output, report.public_send(format_method))
end

url = options.fetch(:url) { die(ArgumentError, '--url is required') }
output = options.fetch(:output) { die(ArgumentError, '--output is required') }
progress = options.fetch(:progress, true)
fields = options.fetch(:fields, nil)

# The output file extension decides which serializer is used.
extension = File.extname(output)
format_method = { '.csv' => :to_csv, '.json' => :to_json }[extension]

if format_method
  perform(output, url, fields, progress, format_method)
elsif extension == ''
  die(ArgumentError, "missing file extension from '#{output}'")
else
  die(ArgumentError, "unknown file extension '#{extension}'")
end
|
data/lib/site_health.rb
CHANGED
@@ -1,136 +1,112 @@
|
|
1
|
-
|
2
|
-
require 'w3c_validators'
|
3
|
-
require "site_health/version"
|
1
|
+
# frozen_string_literal: true
|
4
2
|
|
5
|
-
require
|
3
|
+
require 'logger'
|
4
|
+
require 'spidr'
|
6
5
|
|
7
|
-
require 'site_health/
|
8
|
-
require 'site_health/
|
9
|
-
require 'site_health/journals/xml_journal'
|
10
|
-
require 'site_health/journals/w3c_journal'
|
6
|
+
require 'site_health/version'
|
7
|
+
require 'site_health/configuration/configuration'
|
11
8
|
|
12
|
-
require
|
13
|
-
require
|
14
|
-
require
|
9
|
+
require 'site_health/key_struct'
|
10
|
+
require 'site_health/url_map'
|
11
|
+
require 'site_health/link'
|
15
12
|
|
13
|
+
require 'site_health/checkers/checker'
|
14
|
+
require 'site_health/nurse'
|
15
|
+
require 'site_health/issues_report'
|
16
|
+
|
17
|
+
# Top-level module/namespace
|
16
18
|
module SiteHealth
|
17
|
-
def self.
|
18
|
-
|
19
|
+
def self.require_optional_dependency(path, gem_name: nil)
|
20
|
+
gem_name ||= path
|
21
|
+
require path
|
22
|
+
rescue LoadError => e
|
23
|
+
message_parts = [
|
24
|
+
e.message,
|
25
|
+
"unable to require file from '#{gem_name}' gem",
|
26
|
+
'please install it',
|
27
|
+
]
|
28
|
+
raise(LoadError, message_parts.join(' -- '))
|
19
29
|
end
|
20
30
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
31
|
+
# @param [Checker] klass that inherits from Checker
|
32
|
+
# @return [see SiteHealth#registered_checkers]
|
33
|
+
def self.register_checker(klass)
|
34
|
+
registered_checkers[klass.name.to_sym] = klass
|
35
|
+
registered_checkers
|
36
|
+
end
|
25
37
|
|
26
|
-
|
38
|
+
# @return [Hash] all registered checkers
|
39
|
+
def self.registered_checkers
|
40
|
+
@checkers ||= {}
|
41
|
+
end
|
27
42
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
43
|
+
# @param [Symbol, String] name of the checker to be loaded
|
44
|
+
# @return [Checker] loaded class that should inherit from Checker
|
45
|
+
def self.load_checker(name)
|
46
|
+
name_key = name.to_sym
|
47
|
+
registered_checkers.fetch(name_key) do
|
48
|
+
require "site_health/checkers/#{name}"
|
49
|
+
registered_checkers[name_key]
|
33
50
|
end
|
51
|
+
end
|
34
52
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
)
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
def initialize(site:)
|
49
|
-
@site = site
|
53
|
+
# @param [String] site to be checked
|
54
|
+
# @param config [SiteHealth::Configuration] the configuration to use
|
55
|
+
# @yieldparam [SiteHealth::Nurse] nurse (a.k.a agent)
|
56
|
+
# @return [Hash] journal data
|
57
|
+
# @see Nurse#journal
|
58
|
+
def self.check(site, config: SiteHealth.config)
|
59
|
+
nurse = Nurse.new(config: config)
|
60
|
+
yield(nurse) if block_given?
|
61
|
+
|
62
|
+
Spidr.site(site) do |spider|
|
63
|
+
spider.every_failed_url { |url| nurse.check_failed_url(url) }
|
64
|
+
spider.every_page { |page| nurse.check_page(page) }
|
50
65
|
end
|
51
66
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
missing_html_title = []
|
56
|
-
http_error_urls = []
|
57
|
-
html_error_urls = []
|
58
|
-
html_warning_urls = []
|
59
|
-
xml_error_urls = []
|
60
|
-
css_error_urls = []
|
61
|
-
css_warning_urls = []
|
62
|
-
|
63
|
-
spider = Spidr.site(site) do |spider|
|
64
|
-
spider.every_link do |origin, destination|
|
65
|
-
url_map[destination] << origin
|
66
|
-
end
|
67
|
-
|
68
|
-
spider.every_page do |page|
|
69
|
-
code_journal = HTTPCodeJournal.new(url: page.url, code: page.code)
|
70
|
-
http_error_urls << code_journal if code_journal.error?
|
71
|
-
|
72
|
-
if page.css?
|
73
|
-
result = Checkers::CSSPage.check(page)
|
74
|
-
xml_error_urls << result if result.errors?
|
75
|
-
end
|
76
|
-
|
77
|
-
if page.xml?
|
78
|
-
result = Checkers::XMLPage.check(page)
|
79
|
-
xml_error_urls << result if result.errors?
|
80
|
-
end
|
81
|
-
|
82
|
-
if page.html?
|
83
|
-
result = Checkers::HTMLPage.check(page)
|
84
|
-
missing_html_title << result if result.missing_title?
|
85
|
-
html_error_urls << result if result.errors?
|
86
|
-
end
|
87
|
-
end
|
88
|
-
end
|
67
|
+
nurse.punch_out!
|
68
|
+
end
|
89
69
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
html_warning_urls: html_warning_urls,
|
99
|
-
xml_error_urls: xml_error_urls,
|
100
|
-
css_error_urls: css_error_urls,
|
101
|
-
css_warning_urls: css_warning_urls
|
102
|
-
)
|
103
|
-
end
|
70
|
+
# @param [Array<String>, String] urls to be checked
|
71
|
+
# @param config [SiteHealth::Configuration] the configuration to use
|
72
|
+
# @yieldparam [SiteHealth::Nurse] nurse (a.k.a agent)
|
73
|
+
# @return [Hash] journal data
|
74
|
+
# @see Nurse#journal
|
75
|
+
def self.check_urls(urls, config: SiteHealth.config)
|
76
|
+
nurse = Nurse.new(config: config)
|
77
|
+
yield(nurse) if block_given?
|
104
78
|
|
105
|
-
|
106
|
-
css_checker = Checkers::CSSPage.new(page)
|
107
|
-
result = css_checker.check
|
108
|
-
return unless result.errors?
|
79
|
+
agent = Spidr::Agent.new
|
109
80
|
|
110
|
-
|
111
|
-
|
81
|
+
Array(urls).each do |url|
|
82
|
+
page = agent.get_page(url)
|
112
83
|
|
113
|
-
|
114
|
-
|
115
|
-
|
84
|
+
if page.nil?
|
85
|
+
nurse.check_failed_url(url)
|
86
|
+
next
|
116
87
|
end
|
117
|
-
end
|
118
88
|
|
119
|
-
|
120
|
-
def broken_links(spider, url_map)
|
121
|
-
# FIXME: spider#failures only returns timeout errors etc and not HTTP error status codes..
|
122
|
-
# so we need to have 2 types of "failed" URLs
|
123
|
-
spider.failures.map do |failed_url|
|
124
|
-
BrokenLinkJournal.new(url: failed_url, exists_on: url_map[failed_url])
|
125
|
-
end
|
89
|
+
nurse.check_page(page)
|
126
90
|
end
|
127
91
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
92
|
+
nurse.punch_out!
|
93
|
+
end
|
94
|
+
|
95
|
+
# @see Configuration#logger
|
96
|
+
def self.logger
|
97
|
+
config.logger
|
98
|
+
end
|
99
|
+
|
100
|
+
# @return [Configuration] the current configuration
|
101
|
+
# @yieldparam [Configuration] the current configuration
|
102
|
+
def self.configure
|
103
|
+
@configuration ||= Configuration.new
|
104
|
+
yield(@configuration) if block_given?
|
105
|
+
@configuration
|
106
|
+
end
|
107
|
+
|
108
|
+
# @return [Configuration] the current configuration
|
109
|
+
def self.config
|
110
|
+
configure
|
135
111
|
end
|
136
112
|
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SiteHealth
  # Small key/value store for the data a checker gathers. Enumerable is
  # implemented on top of the wrapped Hash, so iteration yields key/value pairs.
  class CheckData
    include Enumerable

    def initialize
      @data = {}
    end

    # @param key [Object] the key to look up
    # @return [Object, nil] the stored value, nil when the key is absent
    def [](key)
      @data[key]
    end

    # Adds data
    # @param [Hash] hash the hash to be added, existing keys are overwritten
    # @return [Hash] the current data
    def add(hash)
      @data.merge!(hash)
    end

    # @return [TrueClass, FalseClass] true if there is no data
    def empty?
      @data.empty?
    end

    # Iterates over every key/value pair of the stored data.
    # @return [Hash, Enumerator] an Enumerator when no block is given
    def each(&block)
      @data.each(&block)
    end

    # @return [Hash] a shallow copy of the stored data
    def to_h
      # Hash#to_h returns the receiver itself for a plain Hash; hand out a copy
      # so that callers cannot modify the internal store by accident.
      @data.dup
    end
  end
end
|
@@ -0,0 +1,152 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'site_health/check_data'
|
4
|
+
require 'site_health/issues'
|
5
|
+
require 'site_health/issue'
|
6
|
+
|
7
|
+
module SiteHealth
|
8
|
+
# Parent class for all checkers (all checkers must inheirit from this class)
|
9
|
+
class Checker
|
10
|
+
# All possible page types that can be checked
|
11
|
+
CHECKABLE_TYPES = %i[
|
12
|
+
plain_text
|
13
|
+
directory
|
14
|
+
xsl
|
15
|
+
rss
|
16
|
+
atom
|
17
|
+
ms_word
|
18
|
+
pdf
|
19
|
+
zip
|
20
|
+
javascript
|
21
|
+
json
|
22
|
+
css
|
23
|
+
xml
|
24
|
+
html
|
25
|
+
].freeze
|
26
|
+
|
27
|
+
def self.name(name = '__get_value__')
|
28
|
+
if name == '__get_value__'
|
29
|
+
return @name if @name
|
30
|
+
|
31
|
+
@name = (super() || SecureRandom.hex).downcase.gsub(/sitehealth::/, '')
|
32
|
+
return @name
|
33
|
+
end
|
34
|
+
|
35
|
+
@name = name.to_s
|
36
|
+
end
|
37
|
+
|
38
|
+
def self.types(types = '__get_value__')
|
39
|
+
if types == '__get_value__'
|
40
|
+
@types ||= CHECKABLE_TYPES
|
41
|
+
return @types
|
42
|
+
end
|
43
|
+
|
44
|
+
@types = Array(types).map(&:to_sym)
|
45
|
+
end
|
46
|
+
|
47
|
+
# @param [Hash] types
|
48
|
+
# the issues data - optional, if not present it will return the current data
|
49
|
+
# @return [Hash] the issues types data
|
50
|
+
def self.issue_types(types = :__get_value__)
|
51
|
+
if types == :__get_value__
|
52
|
+
return @issue_types ||= {}
|
53
|
+
end
|
54
|
+
|
55
|
+
default = types.fetch(:_default, {})
|
56
|
+
@issue_types = types.map do |key, data|
|
57
|
+
issue_data = { code: key }.merge!(default).merge!(data)
|
58
|
+
[key, issue_data]
|
59
|
+
end.to_h
|
60
|
+
end
|
61
|
+
|
62
|
+
attr_reader :page, :config, :logger, :issues, :data
|
63
|
+
|
64
|
+
# @param [Spidr::Page] page the crawled page
|
65
|
+
# @param config [SiteHealth::Configuration]
|
66
|
+
def initialize(page, config: SiteHealth.config)
|
67
|
+
@page = page
|
68
|
+
@config = config
|
69
|
+
@logger = config.logger
|
70
|
+
@issues = Issues.new(name)
|
71
|
+
@data = CheckData.new
|
72
|
+
end
|
73
|
+
|
74
|
+
# Run the checker
|
75
|
+
# @yieldparam [Checker] yields self
|
76
|
+
# @return [CheckerResult] returns self
|
77
|
+
def call
|
78
|
+
timer = Timer.measure { check }
|
79
|
+
add_data(
|
80
|
+
started_at: timer.started_at,
|
81
|
+
finished_at: timer.finished_at,
|
82
|
+
runtime_in_seconds: timer.diff.to_f
|
83
|
+
)
|
84
|
+
yield(self) if block_given?
|
85
|
+
self
|
86
|
+
end
|
87
|
+
|
88
|
+
# @return [String] the page URL
|
89
|
+
def url
|
90
|
+
page.url
|
91
|
+
end
|
92
|
+
|
93
|
+
# @return [String] the name of the checker
|
94
|
+
def name
|
95
|
+
self.class.name
|
96
|
+
end
|
97
|
+
|
98
|
+
# @return [Array<Symbol>] list of page types the checker will run on
|
99
|
+
def types
|
100
|
+
self.class.types
|
101
|
+
end
|
102
|
+
|
103
|
+
# @return [Hash] issue types data
|
104
|
+
def issue_types
|
105
|
+
self.class.issue_types
|
106
|
+
end
|
107
|
+
|
108
|
+
# @return [Boolean] determines whether the checker should run
|
109
|
+
def should_check?
|
110
|
+
types.any? { |type| page.public_send("#{type}?") }
|
111
|
+
end
|
112
|
+
|
113
|
+
# Adds an issue
|
114
|
+
# @return [Array<Issue>] the current list of issues
|
115
|
+
# @see Issue#initialize for supported arguments
|
116
|
+
def add_issue(**args)
|
117
|
+
issues << Issue.new({ name: name, url: page.url }.merge!(**args))
|
118
|
+
end
|
119
|
+
|
120
|
+
def add_issue_type(type, **args)
|
121
|
+
data = issue_types.fetch(type) do
|
122
|
+
raise(ArgumentError, "unknown issue type #{type}, known types are: #{issue_types.keys.join(', ')}") # rubocop:disable Metrics/LineLength
|
123
|
+
end
|
124
|
+
|
125
|
+
add_issue(data.merge(**args))
|
126
|
+
end
|
127
|
+
|
128
|
+
# Adds data
|
129
|
+
# @param [Hash] the hash to be added
|
130
|
+
# @return [Hash] the current data
|
131
|
+
def add_data(hash)
|
132
|
+
data.add(hash)
|
133
|
+
end
|
134
|
+
|
135
|
+
# @return [Hash] hash representation of the object
|
136
|
+
def to_h
|
137
|
+
{
|
138
|
+
name: name.to_sym,
|
139
|
+
data: data.to_h,
|
140
|
+
issues: issues.map(&:to_h),
|
141
|
+
}
|
142
|
+
end
|
143
|
+
|
144
|
+
protected
|
145
|
+
|
146
|
+
# Abstract method that subclasses must implement
|
147
|
+
# @raise [NotImplementedError] subclasses must implement
|
148
|
+
def check
|
149
|
+
raise(NotImplementedError, 'please implement!')
|
150
|
+
end
|
151
|
+
end
|
152
|
+
end
|