site_health 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 251746b898058b968e3e9f5a50406a9e6d4ab4d3
4
+ data.tar.gz: 6889e72c9f4d2b925381d9f41ef28a5acf0e84d6
5
+ SHA512:
6
+ metadata.gz: c06a8679ef7fc7ebb6f9b926b8730119201ff4ad07439fae7b5e38fb15499ec51c11cca6664be80cb1d4dbde3a63d65c6d2861fcd0e567f9041908b39fa46b02
7
+ data.tar.gz: af62e5459f0882e659da7a2e9e7b47b12aaf8d87d1835acf621dbdc3234999002f5e25ade7ed64da93ec9c95d2231aa0671adddb22ad7896447b2bc2a91be8c9
@@ -0,0 +1,15 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ Gemfile.lock
11
+
12
+ # rspec failure tracking
13
+ .rspec_status
14
+
15
+ .byebug_history
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.4.1
5
+ before_install: gem install bundler -v 1.16.0.pre.2
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in site_health.gemspec
6
+ gemspec
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2017 Jacob Burenstam
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,64 @@
1
+ # SiteHealth
2
+
3
+ :warning: Project is still experimental, API will change (a lot) without notice.
4
+
5
+ Crawl a site and check various health indicators, such as:
6
+
7
+ - HTTP error status
8
+ - Invalid HTML/CSS/XML
9
+ - Missing HTML page title
10
+ - Broken links
11
+
12
+ ## Installation
13
+
14
+ Add this line to your application's Gemfile:
15
+
16
+ ```ruby
17
+ gem 'site_health'
18
+ ```
19
+
20
+ And then execute:
21
+
22
+ $ bundle
23
+
24
+ Or install it yourself as:
25
+
26
+ $ gem install site_health
27
+
28
+ ## Usage
29
+
30
+ ```ruby
31
+ journal = SiteHealth.check('https://example.com')
32
+
33
+ # HTML
34
+ journal.missing_html_title # List of URLs that are missing the HTML title
35
+ journal.html_error_urls # List of URLs with HTML errors in them
36
+
37
+ # CSS
38
+ journal.css_error_urls # List of URLs with CSS errors in them
39
+
40
+ # XML
41
+ journal.xml_error_urls # List of URLs with XML errors in them
42
+
43
+ # Broken URLs
44
+ broken = journal.broken_urls.first
45
+ broken.url # The URL that failed
46
+ broken.exists_on # Array of URLs where the broken URL was present
47
+
48
+ # HTTP
49
+ journal.http_error_urls # All URLs with HTTP status code >= 400
50
+ ```
51
+
52
+ ## Development
53
+
54
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
55
+
56
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
57
+
58
+ ## Contributing
59
+
60
+ Bug reports and pull requests are welcome on GitHub at https://github.com/buren/site_health.
61
+
62
+ ## License
63
+
64
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "site_health"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,136 @@
1
+ require "spidr"
2
+ require 'w3c_validators'
3
+ require "site_health/version"
4
+
5
+ require "site_health/key_struct"
6
+
7
+ require 'site_health/journals/css_journal'
8
+ require 'site_health/journals/html_journal'
9
+ require 'site_health/journals/xml_journal'
10
+ require 'site_health/journals/w3c_journal'
11
+
12
+ require "site_health/checkers/css_page"
13
+ require "site_health/checkers/html_page"
14
+ require "site_health/checkers/xml_page"
15
+
16
module SiteHealth
  # Crawls +site+ and runs every available health check against it.
  #
  # @param site the root URL of the site to crawl (handed to +Spidr.site+ as-is)
  # @return [SiteHealth::Check::ChecksJournal] the collected results
  def self.check(site)
    Check.call(site: site)
  end

  # Crawls a site with Spidr and collects HTTP, HTML, CSS, XML and broken-link
  # problems into a single ChecksJournal.
  class Check
    # Builds a checker from +args+ and runs it.
    #
    # @return [ChecksJournal]
    def self.call(**args)
      new(**args).call
    end

    # A URL that failed, together with the URLs of the pages linking to it.
    BrokenLinkJournal = KeyStruct.new(:url, :exists_on)

    # A crawled URL and the HTTP status code it answered with.
    HTTPCodeJournal = KeyStruct.new(:url, :code)
    class HTTPCodeJournal
      # @return [Boolean] true when the status code is 400 or above
      def error?
        code >= 400
      end
    end

    # The complete result of one crawl.
    ChecksJournal = KeyStruct.new(
      :missing_html_title,
      :broken_urls,
      :http_error_urls,
      :html_error_urls,
      :html_warning_urls,
      :xml_error_urls,
      :css_error_urls,
      :css_warning_urls
    )

    attr_reader :site

    # @param site the root URL of the site to crawl
    def initialize(site:)
      @site = site
    end

    # Crawls the site and checks every page that is visited.
    #
    # @return [ChecksJournal]
    def call
      # destination URL => list of URLs of the pages that link to it
      url_map = Hash.new { |hash, key| hash[key] = [] }

      missing_html_title = []
      http_error_urls = []
      html_error_urls = []
      html_warning_urls = []
      xml_error_urls = []
      css_error_urls = []
      css_warning_urls = []

      # The block parameter is named +agent+ so it does not shadow +spider+.
      spider = Spidr.site(site) do |agent|
        agent.every_link do |origin, destination|
          url_map[destination] << origin
        end

        agent.every_page do |page|
          code_journal = HTTPCodeJournal.new(url: page.url, code: page.code)
          http_error_urls << code_journal if code_journal.error?

          if page.css?
            result = Checkers::CSSPage.check(page)
            # CSS problems belong in the CSS buckets (they used to be filed as XML errors).
            css_error_urls << result if result.errors?
            css_warning_urls << result if result.warnings?
          end

          if page.xml?
            result = Checkers::XMLPage.check(page)
            xml_error_urls << result if result.errors?
          end

          if page.html?
            result = Checkers::HTMLPage.check(page)
            missing_html_title << result if result.missing_title?
            html_error_urls << result if result.errors?
            html_warning_urls << result if result.warnings?
          end
        end
      end

      http_error_urls = map_http_error_urls(http_error_urls, url_map)
      # Broken URLs are the crawler failures plus every URL that answered with an HTTP error.
      broken_urls = broken_links(spider, url_map) + http_error_urls

      ChecksJournal.new(
        missing_html_title: missing_html_title,
        broken_urls: broken_urls,
        http_error_urls: http_error_urls,
        html_error_urls: html_error_urls,
        html_warning_urls: html_warning_urls,
        xml_error_urls: xml_error_urls,
        css_error_urls: css_error_urls,
        css_warning_urls: css_warning_urls
      )
    end

    # Validates a single CSS page.
    #
    # @param page the crawled page to validate
    # @param _errors unused; kept so existing callers keep working
    # @return [CSSJournal, nil] the journal when the page has errors, otherwise nil
    def validate_css_page(page, _errors)
      result = Checkers::CSSPage.check(page)
      result if result.errors?
    end

    # Turns HTTP error journals into broken-link journals.
    #
    # @param urls [Array<HTTPCodeJournal>] pages that answered with an error status
    # @param url_map [Hash] destination URL => URLs of the pages linking to it
    # @return [Array<BrokenLinkJournal>]
    def map_http_error_urls(urls, url_map)
      urls.map do |code_journal|
        # Look up by the URL itself: url_map is keyed by URL, not by journal object.
        failed_url = code_journal.url
        BrokenLinkJournal.new(url: failed_url, exists_on: url_map[failed_url])
      end
    end

    # Finds all pages which have broken links:
    #
    # @param spider the crawler returned by +Spidr.site+
    # @param url_map [Hash] destination URL => URLs of the pages linking to it
    # @return [Array<BrokenLinkJournal>]
    def broken_links(spider, url_map)
      # FIXME: spider#failures only returns timeout errors etc and not HTTP error status codes..
      #        so we need to have 2 types of "failed" URLs
      spider.failures.map do |failed_url|
        BrokenLinkJournal.new(url: failed_url, exists_on: url_map[failed_url])
      end
    end

    # @return [W3CValidators::Results]
    # @raise [W3CValidators::ValidatorUnavailable] the service is offline or returns 400 Bad Request
    # @see https://github.com/w3c-validators/w3c_validators/issues/39 we really want to use
    #   #validate_text instead of #validate_uri but due to the linked issue that's not possible
    def validate_html(html_url)
      validator = W3CValidators::NuValidator.new
      validator.validate_uri(html_url)
    end
  end
end
@@ -0,0 +1,36 @@
1
module SiteHealth
  module Checkers
    # Validates a crawled stylesheet with the W3C CSS validator.
    class CSSPage
      # Shortcut for +new(page).check+.
      def self.check(page)
        new(page).check
      end

      attr_reader :page, :url

      # @param page [Spidr::Page] the crawled page
      def initialize(page)
        @page, @url = page, page.url
      end

      # Runs the validator and wraps its errors and warnings in a journal.
      #
      # @return [CSSJournal]
      def check
        validation = check_content
        to_journal = W3CJournalBuilder.method(:build)

        CSSJournal.new(
          url: url,
          page: page,
          errors: validation.errors.map(&to_journal),
          warnings: validation.warnings.map(&to_journal)
        )
      end

      # Validates the page by URL.
      #
      # @return [W3CValidators::Results]
      # @raise [W3CValidators::ValidatorUnavailable] the service is offline or returns 400 Bad Request
      # @see https://github.com/w3c-validators/w3c_validators/issues/39 we really want to use
      #   #validate_text instead of #validate_uri but due to the linked issue that's not possible
      def check_content
        W3CValidators::CSSValidator.new.validate_uri(url)
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
module SiteHealth
  module Checkers
    # Validates a crawled HTML page with the W3C Nu validator and checks its title.
    class HTMLPage
      # Shortcut for +new(page).check+.
      def self.check(page)
        new(page).check
      end

      attr_reader :page, :url

      # @param page [Spidr::Page] the crawled page
      def initialize(page)
        @page, @url = page, page.url
      end

      # Runs the validator and wraps the findings, plus the title check, in a journal.
      #
      # @return [HTMLJournal]
      def check
        validation = check_content
        to_journal = W3CJournalBuilder.method(:build)

        HTMLJournal.new(
          url: url,
          page: page,
          missing_title: missing_title?,
          errors: validation.errors.map(&to_journal),
          warnings: validation.warnings.map(&to_journal)
        )
      end

      # @return [Boolean] true when the page title is nil, empty or only whitespace
      def missing_title?
        title_text = page.title.to_s
        title_text.strip.empty?
      end

      # Validates the page by URL.
      #
      # @return [W3CValidators::Results]
      # @raise [W3CValidators::ValidatorUnavailable] the service is offline or returns 400 Bad Request
      # @see https://github.com/w3c-validators/w3c_validators/issues/39 we really want to use
      #   #validate_text instead of #validate_uri but due to the linked issue that's not possible
      def check_content
        W3CValidators::NuValidator.new.validate_uri(url)
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module SiteHealth
  module Checkers
    # Reports the parse errors recorded on a crawled XML page's document.
    class XMLPage
      # Shortcut for +new(page).check+.
      def self.check(page)
        new(page).check
      end

      attr_reader :page, :url

      # @param page [Spidr::Page] the crawled page
      def initialize(page)
        @page, @url = page, page.url
      end

      # Collects the errors exposed by the page's parsed document.
      #
      # @return [XMLJournal]
      def check
        parse_errors = page.doc.errors
        XMLJournal.new(url: url, page: page, errors: parse_errors)
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
module SiteHealth
  # Result of validating one CSS page: the page, its URL and the validator findings.
  CSSJournal = KeyStruct.new(:page, :url, :errors, :warnings) do
    # @return [Boolean] true when at least one error was recorded
    def errors?
      errors.any?
    end

    # @return [Boolean] true when at least one warning was recorded
    def warnings?
      warnings.any?
    end
  end
end
@@ -0,0 +1,16 @@
1
module SiteHealth
  # Result of validating one HTML page: the page, its URL, the validator
  # findings and whether the page lacks a title.
  HTMLJournal = KeyStruct.new(:page, :url, :errors, :warnings, :missing_title) do
    # @return the stored +missing_title+ flag
    def missing_title?
      missing_title
    end

    # @return [Boolean] true when at least one error was recorded
    def errors?
      errors.any?
    end

    # @return [Boolean] true when at least one warning was recorded
    def warnings?
      warnings.any?
    end
  end
end
@@ -0,0 +1,43 @@
1
module SiteHealth
  # Plain-data copy of a single W3C validator message.
  W3CJournal = KeyStruct.new(
    *%i[
      message value source type explanation parent line context element
      error? warning? col message_id message_count skipped_string
    ]
  )

  # Converts validator messages into W3CJournal structs.
  module W3CJournalBuilder
    class << self
      # Copies every field of +result+ into a new journal; the message is
      # stripped of surrounding whitespace and defaults to an empty string.
      #
      # @param result [W3CValidators::Result]
      # @return [W3CJournal]
      def build(result)
        W3CJournal.new(
          message: (result.message || '').strip,
          value: result.value,
          source: result.source,
          type: result.type,
          explanation: result.explanation,
          parent: result.parent,
          line: result.line,
          context: result.context,
          element: result.element,
          error?: result.is_error?,
          warning?: result.is_warning?,
          col: result.col,
          message_id: result.message_id,
          message_count: result.message_count,
          skipped_string: result.skippedstring
        )
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module SiteHealth
  # Result of checking one XML page: the page, its URL and its parse errors.
  XMLJournal = KeyStruct.new(:page, :url, :errors) do
    # @return [Boolean] true when at least one error was recorded
    def errors?
      errors.any?
    end
  end
end
@@ -0,0 +1,13 @@
1
module SiteHealth
  # A Struct whose instances are built from keyword arguments only.
  # Members that are not passed stay nil; unknown keys are rejected.
  class KeyStruct < Struct
    # @param keyword_args [Hash{Symbol => Object}] member name => value
    # @raise [ArgumentError] when a key is not a member of the struct
    def initialize(**keyword_args)
      keyword_args.each_pair do |member, value|
        raise ArgumentError, "Unknown key struct member: #{member}" unless members.include?(member)

        self[member] = value
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module SiteHealth
  # Current gem version; read by the gemspec.
  VERSION = '0.1.0'
end
@@ -0,0 +1,31 @@
1
+
2
# Make lib/ loadable so the version constant can be read while building the gem.
lib = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require "site_health/version"

Gem::Specification.new do |spec|
  spec.name          = "site_health"
  spec.version       = SiteHealth::VERSION
  spec.authors       = ["Jacob Burenstam"]
  spec.email         = ["burenstam@gmail.com"]

  spec.summary       = %q{Crawl a site and check various health indicators.}
  spec.description   = %q{Crawl a site and check various health indicators, such as: HTTP 4XX, 5XX status and valid HTML.}
  spec.homepage      = "https://github.com/buren/site_health"
  spec.license       = "MIT"

  # Package every file tracked by git except tests, specs and features.
  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/})
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_dependency "spidr", "~> 0.6"
  spec.add_dependency "w3c_validators", "~> 1.3"

  spec.add_development_dependency "bundler", "~> 1.16.a"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.0"
  spec.add_development_dependency "byebug"
end
metadata ADDED
@@ -0,0 +1,149 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: site_health
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jacob Burenstam
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2017-10-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: spidr
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: w3c_validators
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.3'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 1.16.a
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 1.16.a
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '10.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '10.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: byebug
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: 'Crawl a site and check various health indicators, such as: HTTP 4XX,
98
+ 5XX status and valid HTML.'
99
+ email:
100
+ - burenstam@gmail.com
101
+ executables: []
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - ".gitignore"
106
+ - ".rspec"
107
+ - ".travis.yml"
108
+ - Gemfile
109
+ - LICENSE.txt
110
+ - README.md
111
+ - Rakefile
112
+ - bin/console
113
+ - bin/setup
114
+ - lib/site_health.rb
115
+ - lib/site_health/checkers/css_page.rb
116
+ - lib/site_health/checkers/html_page.rb
117
+ - lib/site_health/checkers/xml_page.rb
118
+ - lib/site_health/journals/css_journal.rb
119
+ - lib/site_health/journals/html_journal.rb
120
+ - lib/site_health/journals/w3c_journal.rb
121
+ - lib/site_health/journals/xml_journal.rb
122
+ - lib/site_health/key_struct.rb
123
+ - lib/site_health/version.rb
124
+ - site_health.gemspec
125
+ homepage: https://github.com/buren/site_health
126
+ licenses:
127
+ - MIT
128
+ metadata: {}
129
+ post_install_message:
130
+ rdoc_options: []
131
+ require_paths:
132
+ - lib
133
+ required_ruby_version: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ version: '0'
138
+ required_rubygems_version: !ruby/object:Gem::Requirement
139
+ requirements:
140
+ - - ">="
141
+ - !ruby/object:Gem::Version
142
+ version: '0'
143
+ requirements: []
144
+ rubyforge_project:
145
+ rubygems_version: 2.6.13
146
+ signing_key:
147
+ specification_version: 4
148
+ summary: Crawl a site and check various helth indicators.
149
+ test_files: []