RubyGems - httpspell - Versions diffs - 1.0.0 → 1.1.0 - Mend

httpspell 1.0.0 → 1.1.0

Files changed (11) hide show

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 0fe73f8f1ff3740d6e3ae3af685d3554879a2819fc9d7803c994a21dd3694d91
-  data.tar.gz: 8354f5c3bdc325a073310aa534a6171164d3dfbe7a1c4154f77737f20108eb91
+  metadata.gz: aeed14621889176b3b295937ab5ae5d75371862b0e9107fc11ab0be65cebc082
+  data.tar.gz: 10a13a180d6b032b0e71623ae82239325c658bad2a4a4f74424636adedc22531
 SHA512:
-  metadata.gz: '073693d2520238d10012e4c02057c4966ab8af80f1c9db868e5ae2a4b95e4ae59a7d0989c162f62649aa0d2194290da0bca1ac5e1186f8ff3569cca581d571ae'
-  data.tar.gz: 8eb778ffa3bcc1f56e8362d160117f695ec5f3ca146592219f4ef43a160ea28b96c67c5a6edeba52fbdc6dc3413b9f4967243fc60431a81861506d2c46435b7b
+  metadata.gz: 9517f20cdfa7da8f013eb01e456c5a67fbccd49df25bfcfa437aca1e20ccd570623c465e8a953fb588d83f95c783caa1fccce19dab0ff416caf05c5a4c75a58a
+  data.tar.gz: 54d38926757fcd968a461cfff777343e08a69f8a3fe687fd00694486a1ac4539d4547f6faaa34c9a997b7643a134bf27e346d9ddf0c3fe7d11c38d5d6ca56e45

data/.rspec ADDED

@@ -0,0 +1,5 @@
+--color
+--format documentation
+--tty
+--order random
+--require 'spec_helper'

data/.travis.yml ADDED

@@ -0,0 +1,6 @@
+language: ruby
+rvm:
+- 2.5.1
+before_install:
+- sudo apt-get -qq update
+- sudo apt-get install -y pandoc hunspell hunspell-en-us

data/Gemfile.lock CHANGED

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    httpspell (1.0.0)
+    httpspell (1.1.0)
       addressable
       nokogiri
@@ -10,12 +10,41 @@ GEM
   specs:
     addressable (2.5.2)
       public_suffix (>= 2.0.2, < 4.0)
+    aruba (0.14.5)
+      childprocess (>= 0.6.3, < 0.10.0)
+      contracts (~> 0.9)
+      cucumber (>= 1.3.19)
+      ffi (~> 1.9.10)
+      rspec-expectations (>= 2.99)
+      thor (~> 0.19)
     ast (2.4.0)
+    backports (3.11.3)
+    builder (3.2.3)
     byebug (10.0.2)
+    childprocess (0.9.0)
+      ffi (~> 1.0, >= 1.0.11)
     coderay (1.1.2)
+    contracts (0.16.0)
+    cucumber (3.1.0)
+      builder (>= 2.1.2)
+      cucumber-core (~> 3.1.0)
+      cucumber-expressions (~> 5.0.4)
+      cucumber-wire (~> 0.0.1)
+      diff-lcs (~> 1.3)
+      gherkin (~> 5.0)
+      multi_json (>= 1.7.5, < 2.0)
+      multi_test (>= 0.1.2)
+    cucumber-core (3.1.0)
+      backports (>= 3.8.0)
+      cucumber-tag_expressions (~> 1.1.0)
+      gherkin (>= 5.0.0)
+    cucumber-expressions (5.0.18)
+    cucumber-tag_expressions (1.1.1)
+    cucumber-wire (0.0.1)
     diff-lcs (1.3)
     ffi (1.9.23)
     formatador (0.2.5)
+    gherkin (5.1.0)
     guard (2.14.2)
       formatador (>= 0.2.4)
       listen (>= 2.7, < 4.0)
@@ -41,6 +70,8 @@ GEM
     lumberjack (1.0.13)
     method_source (0.9.0)
     mini_portile2 (2.3.0)
+    multi_json (1.13.1)
+    multi_test (0.1.2)
     nenv (0.3.0)
     nokogiri (1.8.2)
       mini_portile2 (~> 2.3.0)
@@ -93,6 +124,7 @@ PLATFORMS
   ruby
 DEPENDENCIES
+  aruba
   bundler
   guard
   guard-bundler

data/README.markdown CHANGED

@@ -1,16 +1,43 @@
 # `httpspell`
+[![Build Status](https://travis-ci.org/suhlig/httpspell.svg?branch=master)](https://travis-ci.org/suhlig/httpspell)
 This is a spellchecker that recursively fetches HTML pages, converts them to plain text (using [pandoc](http://pandoc.org/)), and spellchecks them with [hunspell](https://hunspell.github.io/). Unknown words will be printed to `stdout`, which makes the tool a good candidate for CI pipelines where you might want to take action when a spelling error is found on a web page.
 Words that are not in the dictionary for the given language (inferred from the `lang` attribute of the HTML document's root element) can be added to a personal dictionary, which will mark the word as correctly spelled.
+# Usage
+* The following command will retrieve the HTML document at https://example.com, spellcheck it, and not print anything because there are no errors:
+  ```bash
+  $ httpspell https://example.com
+  ```
+  The exit code is `0`.
+* The following command will spellcheck the README of this project as rendered by GitHub, and print a list of unknown words. Note that we set the language to `en_US` because GitHub declares 'en' as document language, but the installed dictionaries usually refer to a specific language variant like `en_US`:
+  ```bash
+  $ httpspell https://github.com/suhlig/httpspell/blob/master/README.markdown --language en_US
+  suhlig
+  Permalink
+  httpspell
+  sloc
+  pandoc
+  hunspell
+  ...
+  ```
+  The exit code is `1`.
 # What is *not* checked
-* When spidering a site, `httpspell` will skip all responses with a `content-type` header other than `text/html`.
+* When spidering a site, `httpspell` will skip all responses with a `content-type` header other than `text/html` (unless pointing it to a file, in which case it accepts anything).
 * Before converting, `httpspell` removes the following nodes from the HTML DOM as they are not a good target for spellchecking:
   - `code`
   - `pre`
-  - Elements with `spellcheck='false'` (this is how HTML5 allows tagging elements as a target for spellchecking)
+  - Elements with `spellcheck='false'` (this is how HTML5 allows tagging elements as being a target for spellchecking or not)
 # Misc

data/Rakefile CHANGED

@@ -10,9 +10,12 @@ task default: ['spec:all']
 namespace :spec do
   desc 'Run all specs'
-  task all: ['rubocop:auto_correct', :unit]
+  task all: %i[rubocop:auto_correct unit system]
-  RSpec::Core::RakeTask.new(:unit) do |t|
-    t.pattern = 'spec/unit/**/*_spec.rb'
+  %w[unit system].each do |type|
+    desc "Run #{type} tests"
+    RSpec::Core::RakeTask.new(type) do |t|
+      t.pattern = "spec/#{type}/**/*_spec.rb"
+    end
   end
 end

data/exe/httpspell CHANGED

@@ -4,8 +4,13 @@
 require 'optparse'
 require 'httpspell/spider'
 require 'httpspell/spellchecker'
+require 'httpspell/version'
 personal_dictionary_path = nil
+force_language = nil
+tracing = nil
+verbose = nil
+limit = nil
 begin
   OptionParser.new do |parser|
@@ -13,11 +18,28 @@ begin
       Spellchecks a website via HTTP.
     BANNER
+    parser.version = HttpSpell::VERSION
     parser.on('-p', '--personal-dictionary=FILE', 'path to the personal dictionary file') do |p|
       personal_dictionary_path = p
     end
+    parser.on('-l', '--language=LANGUAGE', 'override LANGUAGE of content') do |l|
+      force_language = l
+    end
+    parser.on('-L', '--limit=EXPRESSION', 'limit recursive retrieval to URLs matching a regular EXPRESSION') do |l|
+      limit = Regexp.new(l)
+    end
+    parser.on('-t', '--trace', 'enable error tracing') do
+      tracing = true
+    end
+    parser.on('-V', '--verbose', "explain what's happening") do
+      verbose = true
+    end
     # TODO: --recursive, defaults to false
     # TODO wget has some additional options for recursive behavior that should be reviewed
   end.parse!
@@ -32,20 +54,32 @@ if ARGV.size != 1
 end
 spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path)
+has_unknown_words = false
-HttpSpell::Spider.new(ARGV.first).start do |url, doc|
-  lang = doc.root['lang'] || 'de-DE'
+begin
+  HttpSpell::Spider.new(ARGV.first, limit: limit, tracing: tracing).start do |url, doc|
+    lang = force_language || doc.root['lang'] || ENV['LANGUAGE']
-  # Remove sections that are not to be spellchecked
-  doc.css('pre').each(&:unlink)
-  doc.css('code').each(&:unlink)
-  doc.css('[spellcheck=false]').each(&:unlink)
+    # Remove sections that are not to be spellchecked
+    doc.css('pre').each(&:unlink)
+    doc.css('code').each(&:unlink)
+    doc.css('[spellcheck=false]').each(&:unlink)
-  # TODO: Find sections with a lang attribute and handle them separately
-  unknown_words = spell_checker.check(doc.to_s, lang)
+    # TODO: Find sections with a lang attribute and handle them separately
+    unknown_words = spell_checker.check(doc.to_s, lang)
-  unless unknown_words.empty?
-    warn "#{unknown_words.size} unknown words at #{url}:"
-    puts unknown_words
+    if unknown_words.empty?
+      warn "No unknown words (language is #{lang}) at #{url}." if verbose
+    else
+      warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
+      puts unknown_words
+      has_unknown_words = true
+    end
   end
+rescue StandardError
+  warn $ERROR_INFO.message
+  warn $ERROR_INFO.backtrace if tracing
+  exit 2
 end
+exit 1 if has_unknown_words

data/httpspell.gemspec CHANGED

@@ -27,6 +27,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'addressable'
   spec.add_dependency 'nokogiri'
+  spec.add_development_dependency 'aruba'
   spec.add_development_dependency 'bundler'
   spec.add_development_dependency 'guard'
   spec.add_development_dependency 'guard-bundler'

data/lib/httpspell/spider.rb CHANGED

@@ -5,34 +5,29 @@ require 'addressable/uri'
 require 'English'
 module HttpSpell
-  # rubocop:disable Metrics/AbcSize
-  # rubocop:disable Metrics/MethodLength
   class Spider
     attr_reader :todo, :done
-    def initialize(starting_point, base_url = starting_point)
+    def initialize(starting_point, limit: nil, tracing: false)
       @todo = []
       @done = []
       todo << Addressable::URI.parse(starting_point)
-      @base_url = Addressable::URI.parse(base_url)
+      @limit = limit || /^#{starting_point}/
+      @tracing = tracing
     end
     def start
       while todo.any?
         url = todo.pop
-        begin
-          extracted = links(url) do |u, d|
-            yield u, d if block_given?
-          rescue
-            warn "Callback error for #{url}: #{$ERROR_INFO}"
-          end
-          done.append(url)
-          todo.concat(extracted - done - todo)
-        rescue StandardError
-          warn "Could not fetch #{url}: #{$ERROR_INFO}"
+        extracted = links(url) do |u, d|
+          yield u, d if block_given?
+        rescue
+          warn "Callback error for #{url}: #{$ERROR_INFO}"
+          warn $ERROR_INFO.backtrace if @tracing
         end
+        done.append(url)
+        todo.concat(extracted - done - todo)
       end
     end
@@ -40,23 +35,29 @@ module HttpSpell
     def links(uri)
       # We are using open-uri, which follows redirects and also provides the content-type.
-      response = URI(uri).read
-      return [] unless response.content_type == 'text/html'
+      response = open(uri).read
+      if response.respond_to?(:content_type)
+        return [] unless response.content_type == 'text/html'
+      end
       doc = Nokogiri::HTML(response)
       links = doc.css('a[href]').map do |e|
         link = Addressable::URI.parse(e['href'])
         link = uri.join(link) if link.relative?
-        next unless link.to_s.start_with?(@base_url.to_s)
+        next unless @limit.match?(link.to_s)
+        # TODO Ignore same page links (some anchor)
         link
       rescue StandardError
-        warn $ERROR_INFO
+        warn $ERROR_INFO.message
+        warn $ERROR_INFO.backtrace if @tracing
       end.compact
       yield uri, doc if block_given?
+      warn "Adding #{links.size} links from #{uri}" if @tracing
       links
     end
   end
-  # rubocop:enable Metrics/AbcSize
-  # rubocop:enable Metrics/MethodLength
 end

data/lib/httpspell/version.rb CHANGED

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module HttpSpell
-  VERSION = '1.0.0'
+  VERSION = '1.1.0'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: httpspell
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.1.0
 platform: ruby
 authors:
 - Steffen Uhlig
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-05-30 00:00:00.000000000 Z
+date: 2018-06-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: addressable
@@ -38,6 +38,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: aruba
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -176,7 +190,9 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
+- ".rspec"
 - ".rubocop.yml"
+- ".travis.yml"
 - Gemfile
 - Gemfile.lock
 - README.markdown