httpspell 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/exe/httpspell +28 -27
- data/lib/httpspell/spellchecker.rb +13 -2
- data/lib/httpspell/spider.rb +36 -12
- data/lib/httpspell/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 01e176d60bc87e62dae15531feabcfa31ac27a1d4d74b4f1faf3ee161579fc35
+  data.tar.gz: 36b6d13d79bc37531054f59859708db1ee9dd594f668ed8f4cf6bcd18cba66e2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 91d76a7d20f95562b8012ce1a76aee35008def4bac42d537104450db8cfc4974bdd480c24d7ab8388164f89bb3cf486465f39958b67fd5c792196620eb06261b
+  data.tar.gz: d9c6faca03e9d992fbf7d07792b88106807977092ae4591a5f4c316e05d86ad44a5bf6a5b637be37e9d8c5b0e3ea78d9c118810ce2b2e257c996459f92012490
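For reference, the new values can be reproduced from the published gem. A minimal sketch using Ruby's standard `digest` library; the `gem fetch`/`tar` commands and file names in the comment are illustrative, not part of the gem:

```ruby
# Recompute the checksums recorded above from a downloaded gem:
#   gem fetch httpspell --version 1.2.0
#   tar -xf httpspell-1.2.0.gem        # unpacks metadata.gz and data.tar.gz
require 'digest'

%w[metadata.gz data.tar.gz].each do |name|
  puts "#{name}:"
  puts "  SHA256: #{Digest::SHA256.file(name).hexdigest}"
  puts "  SHA512: #{Digest::SHA512.file(name).hexdigest}"
end
```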
data/Gemfile.lock
CHANGED
data/exe/httpspell
CHANGED
@@ -10,7 +10,8 @@ personal_dictionary_path = nil
 force_language = nil
 tracing = nil
 verbose = nil
-
+whitelist = nil
+blacklist = []
 
 begin
   OptionParser.new do |parser|
@@ -28,8 +29,9 @@ begin
       force_language = l
     end
 
-    parser.on('-
-
+    parser.on('-w', '--whitelist=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
+      whitelist ||= []
+      whitelist << Regexp.new(w)
     end
 
     parser.on('-t', '--trace', 'enable error tracing') do
@@ -40,6 +42,10 @@ begin
      verbose = true
     end
 
+    parser.on('-b', '--blacklist=EXPRESSION', 'blacklist (ignore) URLs matching the given regular EXPRESSION') do |b|
+      blacklist << Regexp.new(b)
+    end
+
     # TODO: --recursive, defaults to false
     # TODO wget has some additional options for recursive behavior that should be reviewed
   end.parse!
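Both new options are repeatable and compile their argument into a `Regexp` immediately; `-w` also initializes the whitelist lazily, so passing it at least once overrides the spider's default whitelist (see spider.rb below). A standalone sketch of the same OptionParser pattern, with illustrative sample arguments:

```ruby
require 'optparse'

whitelist = nil
blacklist = []

parser = OptionParser.new do |p|
  p.on('-w', '--whitelist=EXPRESSION', 'allow only matching URLs') do |w|
    whitelist ||= []              # first -w replaces the spider's default whitelist
    whitelist << Regexp.new(w)
  end

  p.on('-b', '--blacklist=EXPRESSION', 'ignore matching URLs') do |b|
    blacklist << Regexp.new(b)
  end
end

parser.parse!(['-w', '^https://example\.org/', '-b', '\.pdf$', '-b', '/archive/'])

p whitelist   # one compiled Regexp
p blacklist   # two compiled Regexps
```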
@@ -53,33 +59,28 @@ if ARGV.size != 1
   exit 1
 end
 
-spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path)
+spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path, tracing: tracing)
 has_unknown_words = false
 
-
-
-
-
-
-
-
-
-
-
-
-
-  if
-
-
-
-
-    has_unknown_words = true
-  end
+spider_success = HttpSpell::Spider.new(ARGV.first, whitelist: whitelist, blacklist: blacklist, tracing: tracing).start do |url, doc|
+  lang = force_language || doc.root['lang'] || ENV['LANGUAGE']
+
+  # Remove sections that are not to be spellchecked
+  doc.css('pre').each(&:unlink)
+  doc.css('code').each(&:unlink)
+  doc.css('[spellcheck=false]').each(&:unlink)
+
+  # TODO: Find sections with a lang attribute and handle them separately
+  unknown_words = spell_checker.check(doc.to_s, lang)
+
+  if unknown_words.empty?
+    warn "No unknown words (language is #{lang}) at #{url}." if verbose
+  else
+    warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
+    puts unknown_words
+    has_unknown_words = true
   end
-rescue StandardError
-  warn $ERROR_INFO.message
-  warn $ERROR_INFO.backtrace if tracing
-  exit 2
 end
 
+exit 2 unless spider_success
 exit 1 if has_unknown_words
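Condensed, the executable now delegates crawling to `Spider#start` and only aggregates results, exiting with 2 on crawl failures and 1 when unknown words were found. The sketch below drives the same flow as a library; the require paths, URL, and patterns are assumptions for illustration:

```ruby
require 'httpspell/spellchecker'
require 'httpspell/spider'

spell_checker = HttpSpell::SpellChecker.new(nil, tracing: true)
has_unknown_words = false

# Spider#start yields each fetched page (URL plus parsed Nokogiri document)
# and returns false if any page could not be processed.
spider = HttpSpell::Spider.new('https://example.org/',
                               whitelist: [%r{^https://example\.org/}],
                               blacklist: [/\.pdf$/],
                               tracing: true)

spider_success = spider.start do |url, doc|
  doc.css('pre, code, [spellcheck=false]').each(&:unlink)
  unknown_words = spell_checker.check(doc.to_s, doc.root['lang'] || ENV['LANGUAGE'])
  unless unknown_words.empty?
    warn "#{unknown_words.size} unknown words at #{url}"
    has_unknown_words = true
  end
end

# Same exit-code convention as the executable: 2 = crawl failure, 1 = findings.
exit 2 unless spider_success
exit 1 if has_unknown_words
```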
data/lib/httpspell/spellchecker.rb
CHANGED
@@ -1,11 +1,22 @@
 module HttpSpell
   class SpellChecker
-    def initialize(personal_dictionary_path = nil)
+    def initialize(personal_dictionary_path = nil, tracing: false)
       @personal_dictionary_arg = "-p #{personal_dictionary_path}" if personal_dictionary_path
+      @tracing = tracing
     end
 
     def check(doc, lang)
-
+      commands = [
+        'pandoc --from html --to plain',
+        "hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
+      ]
+
+      if @tracing
+        warn "Piping the HTML document into the following chain of commands:"
+        warn commands
+      end
+
+      Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
         stdin.puts(doc)
         stdin.close
         stdout.read.split.uniq
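`SpellChecker#check` now builds the command chain up front so that tracing can print it before the document is piped through pandoc and hunspell. The same `Open3.pipeline_rw` pattern in isolation; the dictionary name is hard-coded to `en_US` here instead of the gem's `translate(lang)` lookup, and pandoc plus hunspell (with that dictionary) must be installed:

```ruby
require 'open3'

# HTML -> plain text (pandoc), then list unknown words (hunspell -l).
commands = [
  'pandoc --from html --to plain',
  'hunspell -d en_US -i UTF-8 -l',
]

html = '<p>Ths sentense contains missspellings.</p>'

unknown_words = Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
  stdin.puts(html)
  stdin.close
  stdout.read.split.uniq   # the block's value is returned by pipeline_rw
end

puts unknown_words
```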
data/lib/httpspell/spider.rb
CHANGED
@@ -8,27 +8,39 @@ module HttpSpell
   class Spider
     attr_reader :todo, :done
 
-    def initialize(starting_point,
+    def initialize(starting_point, whitelist: nil, blacklist: [], tracing: false)
       @todo = []
       @done = []
       todo << Addressable::URI.parse(starting_point)
-      @
+      @whitelist = whitelist || [/^#{starting_point}/]
+      @blacklist = blacklist
       @tracing = tracing
     end
 
     def start
+      success = true
+
       while todo.any?
         url = todo.pop
-
-
-
-
+
+        begin
+          extracted = links(url) do |u, d|
+            yield u, d if block_given?
+          rescue
+            warn "Callback error for #{url}: #{$ERROR_INFO}"
+            warn $ERROR_INFO.backtrace if @tracing
+          end
+
+          done.append(url)
+          todo.concat(extracted - done - todo)
+        rescue StandardError
+          warn "Skipping #{url} because of #{$ERROR_INFO.message}"
           warn $ERROR_INFO.backtrace if @tracing
+          success = false
        end
-
-        done.append(url)
-        todo.concat(extracted - done - todo)
       end
+
+      return success
     end
 
     private
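The rewritten `start` keeps crawling even when individual pages fail: per-URL errors are reported and only flip the return value to `false`, and callback errors are caught by the block-level `rescue` inside the `links(url)` block (valid since Ruby 2.6). The `todo`/`done` bookkeeping itself is a plain worklist; a minimal sketch of that traversal with the same dedup arithmetic, using a toy in-memory graph instead of HTTP:

```ruby
# Worklist traversal with the same dedup step as Spider#start:
# enqueue a discovered item only if it is neither done nor already queued.
graph = {
  'a' => %w[b c],
  'b' => %w[a c],
  'c' => %w[a],
}

todo = ['a']
done = []

while todo.any?
  url = todo.pop
  extracted = graph.fetch(url, [])
  done.append(url)
  todo.concat(extracted - done - todo)
end

p done   # each node appears exactly once
```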
@@ -37,8 +49,9 @@ module HttpSpell
       # We are using open-uri, which follows redirects and also provides the content-type.
       response = open(uri).read
 
-      if response.respond_to?(:content_type)
-
+      if response.respond_to?(:content_type) && response.content_type != 'text/html'
+        warn "Skipping #{uri} because it is not HTML" if @tracing
+        return []
       end
 
       doc = Nokogiri::HTML(response)
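The guard now returns early for any response whose `content_type` is not `text/html`, so only HTML is handed to Nokogiri. The same check in isolation; the URL is illustrative and `URI.open` is used here, whereas the gem calls `open(uri)`:

```ruby
require 'open-uri'
require 'nokogiri'

# Follow redirects via open-uri and parse only text/html responses.
response = URI.open('https://example.org/')

if response.respond_to?(:content_type) && response.content_type != 'text/html'
  warn "Skipping: not HTML (#{response.content_type})"
else
  doc = Nokogiri::HTML(response.read)
  puts doc.css('a[href]').size
end
```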
@@ -46,7 +59,18 @@ module HttpSpell
       links = doc.css('a[href]').map do |e|
         link = Addressable::URI.parse(e['href'])
         link = uri.join(link) if link.relative?
-
+
+        if @whitelist.none? { |re| re.match?(link.to_s) }
+          warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @tracing
+          next
+        end
+
+        if @blacklist.any? { |re| re.match?(link.to_s) }
+          # TODO Print _which_ entry of the blacklist matches
+          warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @tracing
+          next
+        end
+
         # TODO Ignore same page links (some anchor)
         link
       rescue StandardError
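Each extracted link must match at least one whitelist pattern and no blacklist pattern; when no `-w` is given, the whitelist defaults to `[/^#{starting_point}/]`, confining the crawl to URLs under the starting point. The same filtering logic on plain strings, with illustrative patterns and URLs:

```ruby
whitelist = [%r{^https://example\.org/}]
blacklist = [/\.pdf$/, %r{/archive/}]

links = %w[
  https://example.org/docs/intro.html
  https://example.org/files/manual.pdf
  https://example.org/archive/2015.html
  https://elsewhere.test/page.html
]

kept = links.select do |link|
  next false if whitelist.none? { |re| re.match?(link) }   # not whitelisted
  next false if blacklist.any?  { |re| re.match?(link) }   # explicitly ignored
  true
end

p kept   # => ["https://example.org/docs/intro.html"]
```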
data/lib/httpspell/version.rb
CHANGED