RubyGems - httpspell - Versions diffs - 1.0.0 → 1.1.0 - Mend

httpspell 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 0fe73f8f1ff3740d6e3ae3af685d3554879a2819fc9d7803c994a21dd3694d91
-  data.tar.gz: 8354f5c3bdc325a073310aa534a6171164d3dfbe7a1c4154f77737f20108eb91
+  metadata.gz: aeed14621889176b3b295937ab5ae5d75371862b0e9107fc11ab0be65cebc082
+  data.tar.gz: 10a13a180d6b032b0e71623ae82239325c658bad2a4a4f74424636adedc22531
 SHA512:
-  metadata.gz: '073693d2520238d10012e4c02057c4966ab8af80f1c9db868e5ae2a4b95e4ae59a7d0989c162f62649aa0d2194290da0bca1ac5e1186f8ff3569cca581d571ae'
-  data.tar.gz: 8eb778ffa3bcc1f56e8362d160117f695ec5f3ca146592219f4ef43a160ea28b96c67c5a6edeba52fbdc6dc3413b9f4967243fc60431a81861506d2c46435b7b
+  metadata.gz: 9517f20cdfa7da8f013eb01e456c5a67fbccd49df25bfcfa437aca1e20ccd570623c465e8a953fb588d83f95c783caa1fccce19dab0ff416caf05c5a4c75a58a
+  data.tar.gz: 54d38926757fcd968a461cfff777343e08a69f8a3fe687fd00694486a1ac4539d4547f6faaa34c9a997b7643a134bf27e346d9ddf0c3fe7d11c38d5d6ca56e45

data/.rspec ADDED

@@ -0,0 +1,5 @@
+--color
+--format documentation
+--tty
+--order random
+--require 'spec_helper'

data/.travis.yml ADDED

@@ -0,0 +1,6 @@
+language: ruby
+rvm:
+- 2.5.1
+before_install:
+- sudo apt-get -qq update
+- sudo apt-get install -y pandoc hunspell hunspell-en-us

data/Gemfile.lock CHANGED

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    httpspell (1.0.0)
+    httpspell (1.1.0)
       addressable
       nokogiri
@@ -10,12 +10,41 @@ GEM
   specs:
     addressable (2.5.2)
       public_suffix (>= 2.0.2, < 4.0)
+    aruba (0.14.5)
+      childprocess (>= 0.6.3, < 0.10.0)
+      contracts (~> 0.9)
+      cucumber (>= 1.3.19)
+      ffi (~> 1.9.10)
+      rspec-expectations (>= 2.99)
+      thor (~> 0.19)
     ast (2.4.0)
+    backports (3.11.3)
+    builder (3.2.3)
     byebug (10.0.2)
+    childprocess (0.9.0)
+      ffi (~> 1.0, >= 1.0.11)
     coderay (1.1.2)
+    contracts (0.16.0)
+    cucumber (3.1.0)
+      builder (>= 2.1.2)
+      cucumber-core (~> 3.1.0)
+      cucumber-expressions (~> 5.0.4)
+      cucumber-wire (~> 0.0.1)
+      diff-lcs (~> 1.3)
+      gherkin (~> 5.0)
+      multi_json (>= 1.7.5, < 2.0)
+      multi_test (>= 0.1.2)
+    cucumber-core (3.1.0)
+      backports (>= 3.8.0)
+      cucumber-tag_expressions (~> 1.1.0)
+      gherkin (>= 5.0.0)
+    cucumber-expressions (5.0.18)
+    cucumber-tag_expressions (1.1.1)
+    cucumber-wire (0.0.1)
     diff-lcs (1.3)
     ffi (1.9.23)
     formatador (0.2.5)
+    gherkin (5.1.0)
     guard (2.14.2)
       formatador (>= 0.2.4)
       listen (>= 2.7, < 4.0)
@@ -41,6 +70,8 @@ GEM
     lumberjack (1.0.13)
     method_source (0.9.0)
     mini_portile2 (2.3.0)
+    multi_json (1.13.1)
+    multi_test (0.1.2)
     nenv (0.3.0)
     nokogiri (1.8.2)
       mini_portile2 (~> 2.3.0)
@@ -93,6 +124,7 @@ PLATFORMS
   ruby
 DEPENDENCIES
+  aruba
   bundler
   guard
   guard-bundler

data/README.markdown CHANGED

@@ -1,16 +1,43 @@
 # `httpspell`
+[![Build Status](https://travis-ci.org/suhlig/httpspell.svg?branch=master)](https://travis-ci.org/suhlig/httpspell)
 This is a spellchecker that recursively fetches HTML pages, converts them to plain text (using [pandoc](http://pandoc.org/)), and spellchecks them with [hunspell](https://hunspell.github.io/). Unknown words will be printed to `stdout`, which makes the tool a good candidate for CI pipelines where you might want to take action when a spelling error is found on a web page.
 Words that are not in the dictionary for the given language (inferred from the `lang` attribute of the HTML document's root element) can be added to a personal dictionary, which will mark the word as correctly spelled.
+# Usage
+* The following command will retrieve the HTML document at https://example.com, spellcheck it, and not print anything because there are no errors:
+  ```bash
+  $ httpspell https://example.com
+  ```
+  The exit code is `0`.
+* The following command will spellcheck the README of this project as rendered by GitHub, and print a list of unknown words. Note that we set the language to `en_US` because GitHub declares 'en' as document language, but the installed dictionaries usually refer to a specific language variant like `en_US`:
+  ```bash
+  $ httpspell https://github.com/suhlig/httpspell/blob/master/README.markdown --language en_US
+  suhlig
+  Permalink
+  httpspell
+  sloc
+  pandoc
+  hunspell
+  ...
+  ```
+  The exit code is `1`.
 # What is *not* checked
-* When spidering a site, `httpspell` will skip all responses with a `content-type` header other than `text/html`.
+* When spidering a site, `httpspell` will skip all responses with a `content-type` header other than `text/html` (unless pointing it to a file, in which case it accepts anything).
 * Before converting, `httpspell` removes the following nodes from the HTML DOM as they are not a good target for spellchecking:
   - `code`
   - `pre`
-  - Elements with `spellcheck='false'` (this is how HTML5 allows tagging elements as a target for spellchecking)
+  - Elements with `spellcheck='false'` (this is how HTML5 allows tagging elements as being a target for spellchecking or not)
 # Misc

data/Rakefile CHANGED

@@ -10,9 +10,12 @@ task default: ['spec:all']
 namespace :spec do
   desc 'Run all specs'
-  task all: ['rubocop:auto_correct', :unit]
+  task all: %i[rubocop:auto_correct unit system]
-  RSpec::Core::RakeTask.new(:unit) do |t|
-    t.pattern = 'spec/unit/**/*_spec.rb'
+  %w[unit system].each do |type|
+    desc "Run #{type} tests"
+    RSpec::Core::RakeTask.new(type) do |t|
+      t.pattern = "spec/#{type}/**/*_spec.rb"
+    end
   end
 end

data/exe/httpspell CHANGED

@@ -4,8 +4,13 @@
 require 'optparse'
 require 'httpspell/spider'
 require 'httpspell/spellchecker'
+require 'httpspell/version'
 personal_dictionary_path = nil
+force_language = nil
+tracing = nil
+verbose = nil
+limit = nil
 begin
   OptionParser.new do |parser|
@@ -13,11 +18,28 @@ begin
       Spellchecks a website via HTTP.
     BANNER
+    parser.version = HttpSpell::VERSION
     parser.on('-p', '--personal-dictionary=FILE', 'path to the personal dictionary file') do |p|
       personal_dictionary_path = p
     end
+    parser.on('-l', '--language=LANGUAGE', 'override LANGUAGE of content') do |l|
+      force_language = l
+    end
+    parser.on('-L', '--limit=EXPRESSION', 'limit recursive retrieval to URLs matching a regular EXPRESSION') do |l|
+      limit = Regexp.new(l)
+    end
+    parser.on('-t', '--trace', 'enable error tracing') do
+      tracing = true
+    end
+    parser.on('-V', '--verbose', "explain what's happening") do
+      verbose = true
+    end
     # TODO: --recursive, defaults to false
     # TODO wget has some additional options for recursive behavior that should be reviewed
   end.parse!
@@ -32,20 +54,32 @@ if ARGV.size != 1
 end
 spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path)
+has_unknown_words = false
-HttpSpell::Spider.new(ARGV.first).start do |url, doc|
-  lang = doc.root['lang'] || 'de-DE'
+begin
+  HttpSpell::Spider.new(ARGV.first, limit: limit, tracing: tracing).start do |url, doc|
+    lang = force_language || doc.root['lang'] || ENV['LANGUAGE']
-  # Remove sections that are not to be spellchecked
-  doc.css('pre').each(&:unlink)
-  doc.css('code').each(&:unlink)
-  doc.css('[spellcheck=false]').each(&:unlink)
+    # Remove sections that are not to be spellchecked
+    doc.css('pre').each(&:unlink)
+    doc.css('code').each(&:unlink)
+    doc.css('[spellcheck=false]').each(&:unlink)
-  # TODO: Find sections with a lang attribute and handle them separately
-  unknown_words = spell_checker.check(doc.to_s, lang)
+    # TODO: Find sections with a lang attribute and handle them separately
+    unknown_words = spell_checker.check(doc.to_s, lang)
-  unless unknown_words.empty?
-    warn "#{unknown_words.size} unknown words at #{url}:"
-    puts unknown_words
+    if unknown_words.empty?
+      warn "No unknown words (language is #{lang}) at #{url}." if verbose
+    else
+      warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
+      puts unknown_words
+      has_unknown_words = true
+    end
   end
+rescue StandardError
+  warn $ERROR_INFO.message
+  warn $ERROR_INFO.backtrace if tracing
+  exit 2
 end
+exit 1 if has_unknown_words

data/httpspell.gemspec CHANGED

@@ -27,6 +27,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'addressable'
   spec.add_dependency 'nokogiri'
+  spec.add_development_dependency 'aruba'
   spec.add_development_dependency 'bundler'
   spec.add_development_dependency 'guard'
   spec.add_development_dependency 'guard-bundler'

data/lib/httpspell/spider.rb CHANGED

@@ -5,34 +5,29 @@ require 'addressable/uri'
 require 'English'
 module HttpSpell
-  # rubocop:disable Metrics/AbcSize
-  # rubocop:disable Metrics/MethodLength
   class Spider
     attr_reader :todo, :done
-    def initialize(starting_point, base_url = starting_point)
+    def initialize(starting_point, limit: nil, tracing: false)
       @todo = []
       @done = []
       todo << Addressable::URI.parse(starting_point)
-      @base_url = Addressable::URI.parse(base_url)
+      @limit = limit || /^#{starting_point}/
+      @tracing = tracing
     end
     def start
       while todo.any?
         url = todo.pop
-        begin
-          extracted = links(url) do |u, d|
-            yield u, d if block_given?
-          rescue
-            warn "Callback error for #{url}: #{$ERROR_INFO}"
-          end
-          done.append(url)
-          todo.concat(extracted - done - todo)
-        rescue StandardError
-          warn "Could not fetch #{url}: #{$ERROR_INFO}"
+        extracted = links(url) do |u, d|
+          yield u, d if block_given?
+        rescue
+          warn "Callback error for #{url}: #{$ERROR_INFO}"
+          warn $ERROR_INFO.backtrace if @tracing
         end
+        done.append(url)
+        todo.concat(extracted - done - todo)
       end
     end
@@ -40,23 +35,29 @@ module HttpSpell
     def links(uri)
       # We are using open-uri, which follows redirects and also provides the content-type.
-      response = URI(uri).read
-      return [] unless response.content_type == 'text/html'
+      response = open(uri).read
+      if response.respond_to?(:content_type)
+        return [] unless response.content_type == 'text/html'
+      end
       doc = Nokogiri::HTML(response)
       links = doc.css('a[href]').map do |e|
         link = Addressable::URI.parse(e['href'])
         link = uri.join(link) if link.relative?
-        next unless link.to_s.start_with?(@base_url.to_s)
+        next unless @limit.match?(link.to_s)
+        # TODO Ignore same page links (some anchor)
         link
       rescue StandardError
-        warn $ERROR_INFO
+        warn $ERROR_INFO.message
+        warn $ERROR_INFO.backtrace if @tracing
       end.compact
       yield uri, doc if block_given?
+      warn "Adding #{links.size} links from #{uri}" if @tracing
       links
     end
   end
-  # rubocop:enable Metrics/AbcSize
-  # rubocop:enable Metrics/MethodLength
 end

data/lib/httpspell/version.rb CHANGED

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module HttpSpell
-  VERSION = '1.0.0'
+  VERSION = '1.1.0'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: httpspell
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.1.0
 platform: ruby
 authors:
 - Steffen Uhlig
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-05-30 00:00:00.000000000 Z
+date: 2018-06-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: addressable
@@ -38,6 +38,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: aruba
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -176,7 +190,9 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
+- ".rspec"
 - ".rubocop.yml"
+- ".travis.yml"
 - Gemfile
 - Gemfile.lock
 - README.markdown