site_health 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +3 -0
  3. data/.rubocop.yml +7 -0
  4. data/.ruby-style-guide.yml +263 -0
  5. data/.travis.yml +3 -2
  6. data/CHANGELOG.md +10 -0
  7. data/Gemfile +4 -2
  8. data/LICENSE.txt +1 -1
  9. data/README.md +165 -21
  10. data/Rakefile +5 -3
  11. data/bin/console +4 -10
  12. data/bin/setup +0 -2
  13. data/exe/site_health +75 -0
  14. data/lib/site_health.rb +89 -113
  15. data/lib/site_health/check_data.rb +35 -0
  16. data/lib/site_health/checkers/checker.rb +152 -0
  17. data/lib/site_health/checkers/facebook_share_link.rb +125 -0
  18. data/lib/site_health/checkers/google_page_speed.rb +55 -0
  19. data/lib/site_health/checkers/html_proofer.rb +67 -0
  20. data/lib/site_health/checkers/json_syntax.rb +28 -0
  21. data/lib/site_health/checkers/missing_description.rb +50 -0
  22. data/lib/site_health/checkers/missing_title.rb +41 -0
  23. data/lib/site_health/checkers/page_not_found.rb +30 -0
  24. data/lib/site_health/checkers/redirect.rb +16 -0
  25. data/lib/site_health/checkers/w3c_css.rb +37 -0
  26. data/lib/site_health/checkers/w3c_html.rb +37 -0
  27. data/lib/site_health/checkers/xml.rb +27 -0
  28. data/lib/site_health/configuration/configuration.rb +84 -0
  29. data/lib/site_health/configuration/html_proofer_configuration.rb +88 -0
  30. data/lib/site_health/configuration/w3c_validators_configuration.rb +23 -0
  31. data/lib/site_health/event_emitter.rb +70 -0
  32. data/lib/site_health/issue.rb +125 -0
  33. data/lib/site_health/issues.rb +43 -0
  34. data/lib/site_health/issues_report.rb +52 -0
  35. data/lib/site_health/key_struct.rb +6 -3
  36. data/lib/site_health/link.rb +32 -0
  37. data/lib/site_health/null_logger.rb +14 -0
  38. data/lib/site_health/nurse.rb +167 -0
  39. data/lib/site_health/summarizers/page_size_summarizer.rb +77 -0
  40. data/lib/site_health/timer.rb +47 -0
  41. data/lib/site_health/url_map.rb +41 -0
  42. data/lib/site_health/version.rb +10 -1
  43. data/lib/site_health/{journals/w3c_journal.rb → w3c_journal_builder.rb} +5 -1
  44. data/site_health.gemspec +28 -17
  45. metadata +144 -21
  46. data/lib/site_health/checkers/css_page.rb +0 -36
  47. data/lib/site_health/checkers/html_page.rb +0 -41
  48. data/lib/site_health/checkers/xml_page.rb +0 -21
  49. data/lib/site_health/journals/css_journal.rb +0 -12
  50. data/lib/site_health/journals/html_journal.rb +0 -16
  51. data/lib/site_health/journals/xml_journal.rb +0 -8
data/Rakefile CHANGED
@@ -1,6 +1,8 @@
1
- require "bundler/gem_tasks"
2
- require "rspec/core/rake_task"
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
3
5
 
4
6
  RSpec::Core::RakeTask.new(:spec)
5
7
 
6
- task :default => :spec
8
+ task default: :spec
@@ -1,14 +1,8 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
- require "bundler/setup"
4
- require "site_health"
4
+ require 'bundler/setup'
5
+ require 'site_health'
5
6
 
6
- # You can add fixtures and/or initialization code here to make experimenting
7
- # with your gem easier. You can also use a different console, if you like.
8
-
9
- # (If you use this, don't forget to add pry to your Gemfile!)
10
- # require "pry"
11
- # Pry.start
12
-
13
- require "irb"
7
+ require 'irb'
14
8
  IRB.start(__FILE__)
data/bin/setup CHANGED
@@ -4,5 +4,3 @@ IFS=$'\n\t'
4
4
  set -vx
5
5
 
6
6
  bundle install
7
-
8
- # Do any other automated setup that you need to do here
@@ -0,0 +1,75 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'optparse'
5
+
6
+ # for dev purposes
7
+ require 'bundler/setup' if ENV['SITE_HEALTH_GEM_DEV']
8
+ require 'site_health'
9
+
10
# Checks +url+ with SiteHealth and wraps the resulting issues in a report.
#
# When +progress+ is truthy a +severity,title,url+ header is printed first,
# followed by one comma-joined line each time the clerk's every_issue
# callback fires.
#
# @param url [String] the site to check
# @param fields [Array<String>, nil] issue fields to set on the report;
#   the report's fields are left untouched when nil
# @param progress [Boolean] whether to print issues to stdout as they are found
# @return [SiteHealth::IssuesReport] report built from the checked issues
def site_report(url, fields, progress)
  puts 'severity,title,url' if progress

  nurse = SiteHealth.check(url) do |agent|
    agent.clerk do |clerk|
      # The callback is always registered; it simply stays silent
      # when progress output is turned off.
      clerk.every_issue do |issue|
        next unless progress

        row = [issue.severity, issue.title, issue.url]
        puts row.join(',')
      end
    end
  end

  SiteHealth::IssuesReport.new(nurse.issues) do |report|
    report.fields = fields if fields
  end
end
23
+
24
# Command-line flags collected into a plain Hash keyed by option name.
options = {}

cli = OptionParser.new do |opts|
  opts.banner = 'Usage: site_health --help'
  opts.default_argv = ARGV

  opts.on('--url=val0', String, '') { |value| options[:url] = value }

  opts.on(
    '--fields=priority,title,url',
    Array,
    'Issue fields to include - by default all fields are included'
  ) { |value| options[:fields] = value }

  opts.on('--output=result.csv', String, 'Output format, .csv or .json') do |value|
    options[:output] = value
  end

  opts.on('--[no-]progress', '') { |value| options[:progress] = value }

  # Print the usage text and stop before any checking starts.
  opts.on('-h', '--help', 'How to use') do
    puts opts
    exit
  end
end

# Destructively parses default_argv (ARGV), filling +options+.
cli.parse!
50
+
51
# Aborts by raising +error_klass+, pointing the user at the help flag.
#
# @param error_klass [Class] the exception class to raise
# @param message [String] what went wrong
# @raise [error_klass] always
def die(error_klass, message)
  hint = 'run `site_health --help`'
  raise error_klass, "#{message} - #{hint}"
end
54
+
55
# Builds the site report and writes its serialized form to +output+.
#
# @param output [String] path of the file to write
# @param url [String] the site to check
# @param fields [Array<String>, nil] issue fields to include in the report
# @param progress [Boolean] whether to print issues while checking
# @param format_method [Symbol] report method that produces the file content
#   (the script calls this with :to_csv or :to_json)
# @return [Integer] the value returned by File.write
def perform(output, url, fields, progress, format_method)
  report = site_report(url, fields, progress)
  content = report.public_send(format_method)
  File.write(output, content)
end
61
+
62
# --url and --output are mandatory; die raises with a help hint when missing.
url = options.fetch(:url) { die(ArgumentError, '--url is required') }
output = options.fetch(:output) { die(ArgumentError, '--output is required') }
progress = options.fetch(:progress, true)
fields = options[:fields]

# The output file extension selects the report serializer.
extension = File.extname(output)
format_method = { '.csv' => :to_csv, '.json' => :to_json }[extension]

if format_method
  perform(output, url, fields, progress, format_method)
elsif extension.empty?
  die(ArgumentError, "missing file extension from '#{output}'")
else
  die(ArgumentError, "unknown file extension '#{extension}'")
end
@@ -1,136 +1,112 @@
1
- require "spidr"
2
- require 'w3c_validators'
3
- require "site_health/version"
1
+ # frozen_string_literal: true
4
2
 
5
- require "site_health/key_struct"
3
+ require 'logger'
4
+ require 'spidr'
6
5
 
7
- require 'site_health/journals/css_journal'
8
- require 'site_health/journals/html_journal'
9
- require 'site_health/journals/xml_journal'
10
- require 'site_health/journals/w3c_journal'
6
+ require 'site_health/version'
7
+ require 'site_health/configuration/configuration'
11
8
 
12
- require "site_health/checkers/css_page"
13
- require "site_health/checkers/html_page"
14
- require "site_health/checkers/xml_page"
9
+ require 'site_health/key_struct'
10
+ require 'site_health/url_map'
11
+ require 'site_health/link'
15
12
 
13
+ require 'site_health/checkers/checker'
14
+ require 'site_health/nurse'
15
+ require 'site_health/issues_report'
16
+
17
+ # Top-level module/namespace
16
18
  module SiteHealth
17
- def self.check(site)
18
- Check.call(site: site)
19
+ def self.require_optional_dependency(path, gem_name: nil)
20
+ gem_name ||= path
21
+ require path
22
+ rescue LoadError => e
23
+ message_parts = [
24
+ e.message,
25
+ "unable to require file from '#{gem_name}' gem",
26
+ 'please install it',
27
+ ]
28
+ raise(LoadError, message_parts.join(' -- '))
19
29
  end
20
30
 
21
- class Check
22
- def self.call(**args)
23
- new(**args).call
24
- end
31
+ # @param [Checker] klass that inherits from Checker
32
+ # @return [see SiteHealth#registered_checkers]
33
+ def self.register_checker(klass)
34
+ registered_checkers[klass.name.to_sym] = klass
35
+ registered_checkers
36
+ end
25
37
 
26
- BrokenLinkJournal = KeyStruct.new(:url, :exists_on)
38
+ # @return [Hash] all registered checkers
39
+ def self.registered_checkers
40
+ @checkers ||= {}
41
+ end
27
42
 
28
- HTTPCodeJournal = KeyStruct.new(:url, :code)
29
- class HTTPCodeJournal
30
- def error?
31
- code >= 400
32
- end
43
+ # @param [Symbol, String] name of the checker to be loaded
44
+ # @return [Checker] loaded class, which should inherit from Checker
45
+ def self.load_checker(name)
46
+ name_key = name.to_sym
47
+ registered_checkers.fetch(name_key) do
48
+ require "site_health/checkers/#{name}"
49
+ registered_checkers[name_key]
33
50
  end
51
+ end
34
52
 
35
- ChecksJournal = KeyStruct.new(
36
- :missing_html_title,
37
- :broken_urls,
38
- :http_error_urls,
39
- :html_error_urls,
40
- :html_warning_urls,
41
- :xml_error_urls,
42
- :css_error_urls,
43
- :css_warning_urls
44
- )
45
-
46
- attr_reader :site
47
-
48
- def initialize(site:)
49
- @site = site
53
+ # @param [String] site to be checked
54
+ # @param config [SiteHealth::Configuration] the configuration to use
55
+ # @yieldparam [SiteHealth::Nurse] nurse (a.k.a agent)
56
+ # @return [Hash] journal data
57
+ # @see Nurse#journal
58
+ def self.check(site, config: SiteHealth.config)
59
+ nurse = Nurse.new(config: config)
60
+ yield(nurse) if block_given?
61
+
62
+ Spidr.site(site) do |spider|
63
+ spider.every_failed_url { |url| nurse.check_failed_url(url) }
64
+ spider.every_page { |page| nurse.check_page(page) }
50
65
  end
51
66
 
52
- def call
53
- url_map = Hash.new { |hash, key| hash[key] = [] }
54
-
55
- missing_html_title = []
56
- http_error_urls = []
57
- html_error_urls = []
58
- html_warning_urls = []
59
- xml_error_urls = []
60
- css_error_urls = []
61
- css_warning_urls = []
62
-
63
- spider = Spidr.site(site) do |spider|
64
- spider.every_link do |origin, destination|
65
- url_map[destination] << origin
66
- end
67
-
68
- spider.every_page do |page|
69
- code_journal = HTTPCodeJournal.new(url: page.url, code: page.code)
70
- http_error_urls << code_journal if code_journal.error?
71
-
72
- if page.css?
73
- result = Checkers::CSSPage.check(page)
74
- xml_error_urls << result if result.errors?
75
- end
76
-
77
- if page.xml?
78
- result = Checkers::XMLPage.check(page)
79
- xml_error_urls << result if result.errors?
80
- end
81
-
82
- if page.html?
83
- result = Checkers::HTMLPage.check(page)
84
- missing_html_title << result if result.missing_title?
85
- html_error_urls << result if result.errors?
86
- end
87
- end
88
- end
67
+ nurse.punch_out!
68
+ end
89
69
 
90
- http_error_urls = map_http_error_urls(http_error_urls, url_map)
91
- broken_urls = broken_links(spider, url_map) + http_error_urls
92
-
93
- ChecksJournal.new(
94
- missing_html_title: missing_html_title,
95
- broken_urls: broken_urls,
96
- http_error_urls: http_error_urls,
97
- html_error_urls: html_error_urls,
98
- html_warning_urls: html_warning_urls,
99
- xml_error_urls: xml_error_urls,
100
- css_error_urls: css_error_urls,
101
- css_warning_urls: css_warning_urls
102
- )
103
- end
70
+ # @param [Array<String>, String] urls to be checked
71
+ # @param config [SiteHealth::Configuration] the configuration to use
72
+ # @yieldparam [SiteHealth::Nurse] nurse (a.k.a agent)
73
+ # @return [Hash] journal data
74
+ # @see Nurse#journal
75
+ def self.check_urls(urls, config: SiteHealth.config)
76
+ nurse = Nurse.new(config: config)
77
+ yield(nurse) if block_given?
104
78
 
105
- def validate_css_page(page, errors)
106
- css_checker = Checkers::CSSPage.new(page)
107
- result = css_checker.check
108
- return unless result.errors?
79
+ agent = Spidr::Agent.new
109
80
 
110
- result
111
- end
81
+ Array(urls).each do |url|
82
+ page = agent.get_page(url)
112
83
 
113
- def map_http_error_urls(urls, url_map)
114
- urls.map do |failed_url|
115
- BrokenLinkJournal.new(url: failed_url, exists_on: url_map[failed_url])
84
+ if page.nil?
85
+ nurse.check_failed_url(url)
86
+ next
116
87
  end
117
- end
118
88
 
119
- # Finds all pages which have broken links:
120
- def broken_links(spider, url_map)
121
- # FIXME: spider#failures only returns timeout errors etc and not HTTP error status codes..
122
- # so we need to have 2 types of "failed" URLs
123
- spider.failures.map do |failed_url|
124
- BrokenLinkJournal.new(url: failed_url, exists_on: url_map[failed_url])
125
- end
89
+ nurse.check_page(page)
126
90
  end
127
91
 
128
- # @return [W3CValidators::Results]
129
- # @raise [W3CValidators::ValidatorUnavailable] the service is offline or returns 400 Bad Request
130
- # @see https://github.com/w3c-validators/w3c_validators/issues/39 we really want to use #validate_text instead of #validate_uri but due to the linked issue thats not possible
131
- def validate_html(html_url)
132
- validator = W3CValidators::NuValidator.new
133
- validator.validate_uri(html_url)
134
- end
92
+ nurse.punch_out!
93
+ end
94
+
95
+ # @see Configuration#logger
96
+ def self.logger
97
+ config.logger
98
+ end
99
+
100
+ # @return [Configuration] the current configuration
101
+ # @yieldparam [Configuration] the current configuration
102
+ def self.configure
103
+ @configuration ||= Configuration.new
104
+ yield(@configuration) if block_given?
105
+ @configuration
106
+ end
107
+
108
+ # @return [Configuration] the current configuration
109
+ def self.config
110
+ configure
135
111
  end
136
112
  end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
module SiteHealth
  # Enumerable key/value store backed by a Hash.
  class CheckData
    include Enumerable

    def initialize
      @data = {}
    end

    # @param key [Object] the key to look up
    # @return [Object, nil] the stored value, nil when the key is absent
    def [](key)
      @data[key]
    end

    # Adds data, overwriting the values of keys that are already present
    # @param hash [Hash] the hash to be added
    # @return [Hash] the current data
    def add(hash)
      @data.merge!(hash)
    end

    # @return [TrueClass, FalseClass] true if there is no data
    def empty?
      @data.empty?
    end

    # Iterates over the stored key/value pairs (backs the Enumerable methods)
    # @yieldparam key [Object] the data key
    # @yieldparam value [Object] the data value
    def each(&block)
      @data.each(&block)
    end

    # @return [Hash] a shallow copy of the data, so that mutating the returned
    #   hash cannot alter what is stored here
    def to_h
      @data.dup
    end
  end
end
@@ -0,0 +1,152 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'site_health/check_data'
4
+ require 'site_health/issues'
5
+ require 'site_health/issue'
6
+
7
require 'securerandom'

module SiteHealth
  # Parent class for all checkers (all checkers must inherit from this class).
  #
  # Subclasses implement the protected #check method and report what they
  # find through #add_issue, #add_issue_type and #add_data.
  #
  # NOTE(review): Timer (used by #call) is not required from this file -
  # presumably it is loaded elsewhere; confirm.
  class Checker
    # All possible page types that can be checked
    CHECKABLE_TYPES = %i[
      plain_text
      directory
      xsl
      rss
      atom
      ms_word
      pdf
      zip
      javascript
      json
      css
      xml
      html
    ].freeze

    # Gets or sets the checker name.
    #
    # When read for the first time the name is derived from the class name:
    # downcased, with every "sitehealth::" occurrence removed. Anonymous
    # classes (no class name) get a random hex string instead.
    #
    # @param name [#to_s] the name to set - omit to read the current name
    # @return [String] the checker name
    def self.name(name = '__get_value__')
      return @name = name.to_s unless name == '__get_value__'

      @name ||= (super() || SecureRandom.hex).downcase.gsub(/sitehealth::/, '')
    end

    # Gets or sets the page types the checker runs on.
    #
    # @param types [Array<#to_sym>, #to_sym]
    #   the types to set - omit to read the current types
    # @return [Array<Symbol>] the page types, CHECKABLE_TYPES unless set
    def self.types(types = '__get_value__')
      return @types ||= CHECKABLE_TYPES if types == '__get_value__'

      @types = Array(types).map(&:to_sym)
    end

    # Gets or sets the issue types.
    #
    # Every entry is stored as +{ code: key }+ merged with the +:_default+
    # entry (if any) and then with the entry's own data, so the entry's own
    # values win. The +:_default+ entry itself is kept in the result as well.
    #
    # @param [Hash] types
    #   the issues data - optional, if not present it will return the current data
    # @return [Hash] the issues types data
    def self.issue_types(types = :__get_value__)
      return @issue_types ||= {} if types == :__get_value__

      default = types.fetch(:_default, {})
      @issue_types = types.map do |key, data|
        [key, { code: key }.merge(default).merge(data)]
      end.to_h
    end

    attr_reader :page, :config, :logger, :issues, :data

    # @param [Spidr::Page] page the crawled page
    # @param config [SiteHealth::Configuration]
    def initialize(page, config: SiteHealth.config)
      @page = page
      @config = config
      @logger = config.logger
      @issues = Issues.new(name)
      @data = CheckData.new
    end

    # Run the checker and record when it started, when it finished and
    # how long it ran as check data
    # @yieldparam [Checker] yields self
    # @return [Checker] returns self
    def call
      timer = Timer.measure { check }
      add_data(
        started_at: timer.started_at,
        finished_at: timer.finished_at,
        runtime_in_seconds: timer.diff.to_f
      )
      yield(self) if block_given?
      self
    end

    # @return [String] the page URL
    def url
      page.url
    end

    # @return [String] the name of the checker
    def name
      self.class.name
    end

    # @return [Array<Symbol>] list of page types the checker will run on
    def types
      self.class.types
    end

    # @return [Hash] issue types data
    def issue_types
      self.class.issue_types
    end

    # The page is asked +<type>?+ for each of the checker's types.
    # @return [Boolean] determines whether the checker should run
    def should_check?
      types.any? { |type| page.public_send("#{type}?") }
    end

    # Adds an issue, +name+ and +url+ default to the checker name and page URL
    # @return [Array<Issue>] the current list of issues
    # @see Issue#initialize for supported arguments
    def add_issue(**args)
      attributes = { name: name, url: page.url }.merge(args)
      # Splat explicitly: since Ruby 3.0 a positional Hash is no longer
      # converted to keyword arguments implicitly.
      issues << Issue.new(**attributes)
    end

    # Adds an issue based on one of the registered issue types
    # @param type [Symbol] key of a registered issue type
    # @param args [Hash] issue arguments that override the issue type data
    # @return [Array<Issue>] the current list of issues
    # @raise [ArgumentError] if the issue type is unknown
    # @see Checker.issue_types
    def add_issue_type(type, **args)
      data = issue_types.fetch(type) do
        known_types = issue_types.keys.join(', ')
        raise(ArgumentError, "unknown issue type #{type}, known types are: #{known_types}")
      end

      # #add_issue only accepts keywords, hence the explicit splat.
      add_issue(**data.merge(args))
    end

    # Adds data
    # @param hash [Hash] the hash to be added
    # @return [Hash] the current data
    def add_data(hash)
      data.add(hash)
    end

    # @return [Hash] hash representation of the object
    def to_h
      {
        name: name.to_sym,
        data: data.to_h,
        issues: issues.map(&:to_h),
      }
    end

    protected

    # Abstract method that subclasses must implement
    # @raise [NotImplementedError] subclasses must implement
    def check
      raise(NotImplementedError, 'please implement!')
    end
  end
end
+ end