RubyGems - site_health - Versions diffs - 0.1.0 - Mend

site_health 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

checksums.yaml +7 -0
data/.gitignore +15 -0
data/.rspec +3 -0
data/.travis.yml +5 -0
data/Gemfile +6 -0
data/LICENSE.txt +21 -0
data/README.md +64 -0
data/Rakefile +6 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/lib/site_health.rb +136 -0
data/lib/site_health/checkers/css_page.rb +36 -0
data/lib/site_health/checkers/html_page.rb +41 -0
data/lib/site_health/checkers/xml_page.rb +21 -0
data/lib/site_health/journals/css_journal.rb +12 -0
data/lib/site_health/journals/html_journal.rb +16 -0
data/lib/site_health/journals/w3c_journal.rb +43 -0
data/lib/site_health/journals/xml_journal.rb +8 -0
data/lib/site_health/key_struct.rb +13 -0
data/lib/site_health/version.rb +3 -0
data/site_health.gemspec +31 -0
metadata +149 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 251746b898058b968e3e9f5a50406a9e6d4ab4d3
+  data.tar.gz: 6889e72c9f4d2b925381d9f41ef28a5acf0e84d6
+SHA512:
+  metadata.gz: c06a8679ef7fc7ebb6f9b926b8730119201ff4ad07439fae7b5e38fb15499ec51c11cca6664be80cb1d4dbde3a63d65c6d2861fcd0e567f9041908b39fa46b02
+  data.tar.gz: af62e5459f0882e659da7a2e9e7b47b12aaf8d87d1835acf621dbdc3234999002f5e25ade7ed64da93ec9c95d2231aa0671adddb22ad7896447b2bc2a91be8c9

data/.gitignore ADDED

@@ -0,0 +1,15 @@
+/.bundle/
+/.yardoc
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
+Gemfile.lock
+# rspec failure tracking
+.rspec_status
+.byebug_history

data/.rspec ADDED

@@ -0,0 +1,3 @@
+--format documentation
+--color
+--require spec_helper

data/.travis.yml ADDED

@@ -0,0 +1,5 @@
+sudo: false
+language: ruby
+rvm:
+  - 2.4.1
+before_install: gem install bundler -v 1.16.0.pre.2

data/Gemfile ADDED

@@ -0,0 +1,6 @@
+source "https://rubygems.org"
+git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
+# Specify your gem's dependencies in site_health.gemspec
+gemspec

data/LICENSE.txt ADDED

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+Copyright (c) 2017 Jacob Burenstam
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

data/README.md ADDED

@@ -0,0 +1,64 @@
+# SiteHealth
+:warning: Project is still experimental, API will change (a lot) without notice.
+Crawl a site and check various health indicators, such as:
+- HTTP error status
+- Invalid HTML/CSS/XML
+- Missing HTML page title
+- Broken links
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'site_health'
+```
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install site_health
+## Usage
+```ruby
+journal = SiteHealth.check('https://example.com')
+# HTML
+journal.missing_html_title # List of URLs that are missing the HTML title
+journal.html_error_urls # List of URLs with HTML errors in them
+# CSS
+journal.css_error_urls # List of URLs with CSS errors in them
+# XML
+journal.xml_error_urls # List of URLs with XML errors in them
+# Broken URLs
+broken = journal.broken_urls.first
+broken.url # The URL that failed
+broken.exists_on # Array of URLs where the broken URL was present
+# HTTP
+journal.http_error_urls # All URLs with HTTP status code >= 400
+```
+## Development
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+## Contributing
+Bug reports and pull requests are welcome on GitHub at https://github.com/buren/site_health.
+## License
+The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).

data/Rakefile ADDED

@@ -0,0 +1,6 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/bin/console ADDED

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+# Development console: sets up Bundler, loads the gem, then starts an IRB session.
+require "bundler/setup"
+require "site_health"
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+# (If you use this, don't forget to add pry to your Gemfile!)
+# require "pry"
+# Pry.start
+require "irb"
+IRB.start(__FILE__)

data/bin/setup ADDED

@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+set -vx
+bundle install
+# Do any other automated setup that you need to do here

data/lib/site_health.rb ADDED

@@ -0,0 +1,136 @@
+require "spidr"
+require 'w3c_validators'
+require "site_health/version"
+require "site_health/key_struct"
+require 'site_health/journals/css_journal'
+require 'site_health/journals/html_journal'
+require 'site_health/journals/xml_journal'
+require 'site_health/journals/w3c_journal'
+require "site_health/checkers/css_page"
+require "site_health/checkers/html_page"
+require "site_health/checkers/xml_page"
+module SiteHealth
+  def self.check(site)
+    Check.call(site: site)
+  end
+  class Check
+    def self.call(**args)
+      new(**args).call
+    end
+    BrokenLinkJournal = KeyStruct.new(:url, :exists_on)
+    HTTPCodeJournal = KeyStruct.new(:url, :code)
+    class HTTPCodeJournal
+      def error?
+        code >= 400
+      end
+    end
+    ChecksJournal = KeyStruct.new(
+      :missing_html_title,
+      :broken_urls,
+      :http_error_urls,
+      :html_error_urls,
+      :html_warning_urls,
+      :xml_error_urls,
+      :css_error_urls,
+      :css_warning_urls
+    )
+    attr_reader :site
+    def initialize(site:)
+      @site = site
+    end
+    def call
+      url_map = Hash.new { |hash, key| hash[key] = [] }
+      missing_html_title = []
+      http_error_urls = []
+      html_error_urls = []
+      html_warning_urls = []
+      xml_error_urls = []
+      css_error_urls = []
+      css_warning_urls = []
+      spider = Spidr.site(site) do |spider|
+        spider.every_link do |origin, destination|
+          url_map[destination] << origin
+        end
+        spider.every_page do |page|
+          code_journal = HTTPCodeJournal.new(url: page.url, code: page.code)
+          http_error_urls << code_journal if code_journal.error?
+          if page.css?
+            result = Checkers::CSSPage.check(page)
+            xml_error_urls << result if result.errors?
+          end
+          if page.xml?
+            result = Checkers::XMLPage.check(page)
+            xml_error_urls << result if result.errors?
+          end
+          if page.html?
+            result = Checkers::HTMLPage.check(page)
+            missing_html_title << result if result.missing_title?
+            html_error_urls << result if result.errors?
+          end
+        end
+      end
+      http_error_urls = map_http_error_urls(http_error_urls, url_map)
+      broken_urls = broken_links(spider, url_map) + http_error_urls
+      ChecksJournal.new(
+        missing_html_title: missing_html_title,
+        broken_urls: broken_urls,
+        http_error_urls: http_error_urls,
+        html_error_urls: html_error_urls,
+        html_warning_urls: html_warning_urls,
+        xml_error_urls: xml_error_urls,
+        css_error_urls: css_error_urls,
+        css_warning_urls: css_warning_urls
+      )
+    end
+    def validate_css_page(page, errors)
+      css_checker = Checkers::CSSPage.new(page)
+      result = css_checker.check
+      return unless result.errors?
+      result
+    end
+    def map_http_error_urls(urls, url_map)
+      urls.map do |failed_url|
+        BrokenLinkJournal.new(url: failed_url, exists_on: url_map[failed_url])
+      end
+    end
+    # Finds all pages which have broken links:
+    def broken_links(spider, url_map)
+      # FIXME: spider#failures only returns timeout errors etc and not HTTP error status codes..
+      #        so we need to have 2 types of "failed" URLs
+      spider.failures.map do |failed_url|
+        BrokenLinkJournal.new(url: failed_url, exists_on: url_map[failed_url])
+      end
+    end
+    # @return [W3CValidators::Results]
+    # @raise [W3CValidators::ValidatorUnavailable] the service is offline or returns 400 Bad Request
+    # @see https://github.com/w3c-validators/w3c_validators/issues/39 we really want to use #validate_text instead of #validate_uri but due to the linked issue thats not possible
+    def validate_html(html_url)
+      validator = W3CValidators::NuValidator.new
+      validator.validate_uri(html_url)
+    end
+  end
+end

data/lib/site_health/checkers/css_page.rb ADDED

@@ -0,0 +1,36 @@
+module SiteHealth
+  module Checkers
+    class CSSPage
+      def self.check(page)
+        new(page).check
+      end
+      attr_reader :page, :url
+      # @param [Spidr::Page] the crawled page
+      def initialize(page)
+        @page = page
+        @url = page.url
+      end
+      def check
+        result = check_content
+        CSSJournal.new(
+          url: url,
+          page: page,
+          errors: result.errors.map { |e| W3CJournalBuilder.build(e) },
+          warnings: result.warnings.map { |e| W3CJournalBuilder.build(e) }
+        )
+      end
+      # @return [W3CValidators::Results]
+      # @raise [W3CValidators::ValidatorUnavailable] the service is offline or returns 400 Bad Request
+      # @see https://github.com/w3c-validators/w3c_validators/issues/39 we really want to use #validate_text instead of #validate_uri but due to the linked issue thats not possible
+      def check_content
+        validator = W3CValidators::CSSValidator.new
+        validator.validate_uri(url)
+      end
+    end
+  end
+end

data/lib/site_health/checkers/html_page.rb ADDED

@@ -0,0 +1,41 @@
+module SiteHealth
+  module Checkers
+    class HTMLPage
+      def self.check(page)
+        new(page).check
+      end
+      attr_reader :page, :url
+      # @param [Spidr::Page] the crawled page
+      def initialize(page)
+        @page = page
+        @url = page.url
+      end
+      def check
+        result = check_content
+        HTMLJournal.new(
+          url: url,
+          page: page,
+          missing_title: missing_title?,
+          errors: result.errors.map { |e| W3CJournalBuilder.build(e) },
+          warnings: result.warnings.map { |e| W3CJournalBuilder.build(e) }
+        )
+      end
+      def missing_title?
+        page.title.to_s.strip.empty?
+      end
+      # @return [W3CValidators::Results]
+      # @raise [W3CValidators::ValidatorUnavailable] the service is offline or returns 400 Bad Request
+      # @see https://github.com/w3c-validators/w3c_validators/issues/39 we really want to use #validate_text instead of #validate_uri but due to the linked issue thats not possible
+      def check_content
+        validator = W3CValidators::NuValidator.new
+        validator.validate_uri(url)
+      end
+    end
+  end
+end

data/lib/site_health/checkers/xml_page.rb ADDED

@@ -0,0 +1,21 @@
+module SiteHealth
+  module Checkers
+    class XMLPage
+      def self.check(page)
+        new(page).check
+      end
+      attr_reader :page, :url
+      # @param [Spidr::Page] the crawled page
+      def initialize(page)
+        @page = page
+        @url = page.url
+      end
+      def check
+        XMLJournal.new(url: url, page: page, errors: page.doc.errors)
+      end
+    end
+  end
+end

data/lib/site_health/journals/css_journal.rb ADDED

@@ -0,0 +1,12 @@
+module SiteHealth
+  CSSJournal = KeyStruct.new(:page, :url, :errors, :warnings)
+  class CSSJournal
+    def errors?
+      errors.any?
+    end
+    def warnings?
+      warnings.any?
+    end
+  end
+end

data/lib/site_health/journals/html_journal.rb ADDED

@@ -0,0 +1,16 @@
+module SiteHealth
+  HTMLJournal = KeyStruct.new(:page, :url, :errors, :warnings, :missing_title)
+  class HTMLJournal
+    def missing_title?
+      missing_title
+    end
+    def errors?
+      errors.any?
+    end
+    def warnings?
+      warnings.any?
+    end
+  end
+end

data/lib/site_health/journals/w3c_journal.rb ADDED

@@ -0,0 +1,43 @@
+module SiteHealth
+  W3CJournal = KeyStruct.new(
+    :message,
+    :value,
+    :source,
+    :type,
+    :explanation,
+    :parent,
+    :line,
+    :context,
+    :element,
+    :error?,
+    :warning?,
+    :col,
+    :message_id,
+    :message_count,
+    :skipped_string
+  )
+  module W3CJournalBuilder
+    # @param [W3CValidators::Result]
+    # @return [W3CJournal]
+    def self.build(result)
+      W3CJournal.new(
+        message: (result.message || '').strip,
+        value: result.value,
+        source: result.source,
+        type: result.type,
+        explanation: result.explanation,
+        parent: result.parent,
+        line: result.line,
+        context: result.context,
+        element: result.element,
+        error?: result.is_error?,
+        warning?: result.is_warning?,
+        col: result.col,
+        message_id: result.message_id,
+        message_count: result.message_count,
+        skipped_string: result.skippedstring
+      )
+    end
+  end
+end

data/lib/site_health/journals/xml_journal.rb ADDED

@@ -0,0 +1,8 @@
+module SiteHealth
+  XMLJournal = KeyStruct.new(:page, :url, :errors)
+  class XMLJournal
+    def errors?
+      errors.any?
+    end
+  end
+end

data/lib/site_health/key_struct.rb ADDED

@@ -0,0 +1,13 @@
+module SiteHealth
+  class KeyStruct < Struct
+    def initialize(**keyword_args)
+      keyword_args.each do |key, value|
+        if members.include?(key)
+          self[key] = value
+        else
+          raise ArgumentError, "Unknown key struct member: #{key}"
+        end
+      end
+    end
+  end
+end

data/lib/site_health/version.rb ADDED

@@ -0,0 +1,3 @@
+module SiteHealth
+  VERSION = "0.1.0"
+end

data/site_health.gemspec ADDED

@@ -0,0 +1,31 @@
+lib = File.expand_path("../lib", __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require "site_health/version"
+Gem::Specification.new do |spec|
+  spec.name          = "site_health"
+  spec.version       = SiteHealth::VERSION
+  spec.authors       = ["Jacob Burenstam"]
+  spec.email         = ["burenstam@gmail.com"]
+  spec.summary       = %q{Crawl a site and check various helth indicators.}
+  spec.description   = %q{Crawl a site and check various health indicators, such as: HTTP 4XX, 5XX status and valid HTML.}
+  spec.homepage      = "https://github.com/buren/site_health"
+  spec.license       = "MIT"
+  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
+    f.match(%r{^(test|spec|features)/})
+  end
+  spec.bindir        = "exe"
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+  spec.add_dependency "spidr", "~> 0.6"
+  spec.add_dependency "w3c_validators", "~> 1.3"
+  spec.add_development_dependency "bundler", "~> 1.16.a"
+  spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_development_dependency "rspec", "~> 3.0"
+  spec.add_development_dependency "byebug"
+end

metadata ADDED

@@ -0,0 +1,149 @@
+--- !ruby/object:Gem::Specification
+name: site_health
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Jacob Burenstam
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2017-10-24 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: spidr
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.6'
+- !ruby/object:Gem::Dependency
+  name: w3c_validators
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.16.a
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 1.16.a
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+- !ruby/object:Gem::Dependency
+  name: byebug
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: 'Crawl a site and check various health indicators, such as: HTTP 4XX,
+  5XX status and valid HTML.'
+email:
+- burenstam@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".rspec"
+- ".travis.yml"
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/console
+- bin/setup
+- lib/site_health.rb
+- lib/site_health/checkers/css_page.rb
+- lib/site_health/checkers/html_page.rb
+- lib/site_health/checkers/xml_page.rb
+- lib/site_health/journals/css_journal.rb
+- lib/site_health/journals/html_journal.rb
+- lib/site_health/journals/w3c_journal.rb
+- lib/site_health/journals/xml_journal.rb
+- lib/site_health/key_struct.rb
+- lib/site_health/version.rb
+- site_health.gemspec
+homepage: https://github.com/buren/site_health
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.6.13
+signing_key:
+specification_version: 4
+summary: Crawl a site and check various health indicators.
+test_files: []