RubyGems - httpspell - Versions diffs - 1.1.0 → 1.2.0 - Mend

httpspell 1.1.0 → 1.2.0

Files changed (7)

checksums.yaml +4 -4
data/Gemfile.lock +1 -1
data/exe/httpspell +28 -27
data/lib/httpspell/spellchecker.rb +13 -2
data/lib/httpspell/spider.rb +36 -12
data/lib/httpspell/version.rb +1 -1
metadata +1 -1

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: aeed14621889176b3b295937ab5ae5d75371862b0e9107fc11ab0be65cebc082
-  data.tar.gz: 10a13a180d6b032b0e71623ae82239325c658bad2a4a4f74424636adedc22531
+  metadata.gz: 01e176d60bc87e62dae15531feabcfa31ac27a1d4d74b4f1faf3ee161579fc35
+  data.tar.gz: 36b6d13d79bc37531054f59859708db1ee9dd594f668ed8f4cf6bcd18cba66e2
 SHA512:
-  metadata.gz: 9517f20cdfa7da8f013eb01e456c5a67fbccd49df25bfcfa437aca1e20ccd570623c465e8a953fb588d83f95c783caa1fccce19dab0ff416caf05c5a4c75a58a
-  data.tar.gz: 54d38926757fcd968a461cfff777343e08a69f8a3fe687fd00694486a1ac4539d4547f6faaa34c9a997b7643a134bf27e346d9ddf0c3fe7d11c38d5d6ca56e45
+  metadata.gz: 91d76a7d20f95562b8012ce1a76aee35008def4bac42d537104450db8cfc4974bdd480c24d7ab8388164f89bb3cf486465f39958b67fd5c792196620eb06261b
+  data.tar.gz: d9c6faca03e9d992fbf7d07792b88106807977092ae4591a5f4c316e05d86ad44a5bf6a5b637be37e9d8c5b0e3ea78d9c118810ce2b2e257c996459f92012490

data/Gemfile.lock CHANGED

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    httpspell (1.1.0)
+    httpspell (1.2.0)
       addressable
       nokogiri

data/exe/httpspell CHANGED

@@ -10,7 +10,8 @@ personal_dictionary_path = nil
 force_language = nil
 tracing = nil
 verbose = nil
-limit = nil
+whitelist = nil
+blacklist = []
 begin
   OptionParser.new do |parser|
@@ -28,8 +29,9 @@ begin
       force_language = l
     end
-    parser.on('-L', '--limit=EXPRESSION', 'limit recursive retrieval to URLs matching a regular EXPRESSION') do |l|
-      limit = Regexp.new(l)
+    parser.on('-w', '--whitelist=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
+      whitelist ||= []
+      whitelist << Regexp.new(w)
     end
     parser.on('-t', '--trace', 'enable error tracing') do
@@ -40,6 +42,10 @@ begin
       verbose = true
     end
+    parser.on('-b', '--blacklist=EXPRESSION', 'blacklist (ignore) URLs matching the given regular EXPRESSION') do |b|
+      blacklist << Regexp.new(b)
+    end
     # TODO: --recursive, defaults to false
     # TODO wget has some additional options for recursive behavior that should be reviewed
   end.parse!
@@ -53,33 +59,28 @@ if ARGV.size != 1
   exit 1
 end
-spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path)
+spell_checker = HttpSpell::SpellChecker.new(personal_dictionary_path, tracing: tracing)
 has_unknown_words = false
-begin
-  HttpSpell::Spider.new(ARGV.first, limit: limit, tracing: tracing).start do |url, doc|
-    lang = force_language || doc.root['lang'] || ENV['LANGUAGE']
-    # Remove sections that are not to be spellchecked
-    doc.css('pre').each(&:unlink)
-    doc.css('code').each(&:unlink)
-    doc.css('[spellcheck=false]').each(&:unlink)
-    # TODO: Find sections with a lang attribute and handle them separately
-    unknown_words = spell_checker.check(doc.to_s, lang)
-    if unknown_words.empty?
-      warn "No unknown words (language is #{lang}) at #{url}." if verbose
-    else
-      warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
-      puts unknown_words
-      has_unknown_words = true
-    end
+spider_success = HttpSpell::Spider.new(ARGV.first, whitelist: whitelist, blacklist: blacklist, tracing: tracing).start do |url, doc|
+  lang = force_language || doc.root['lang'] || ENV['LANGUAGE']
+  # Remove sections that are not to be spellchecked
+  doc.css('pre').each(&:unlink)
+  doc.css('code').each(&:unlink)
+  doc.css('[spellcheck=false]').each(&:unlink)
+  # TODO: Find sections with a lang attribute and handle them separately
+  unknown_words = spell_checker.check(doc.to_s, lang)
+  if unknown_words.empty?
+    warn "No unknown words (language is #{lang}) at #{url}." if verbose
+  else
+    warn "#{unknown_words.size} unknown words (language is #{lang}) at #{url}:" if verbose
+    puts unknown_words
+    has_unknown_words = true
   end
-rescue StandardError
-  warn $ERROR_INFO.message
-  warn $ERROR_INFO.backtrace if tracing
-  exit 2
 end
+exit 2 unless spider_success
 exit 1 if has_unknown_words

data/lib/httpspell/spellchecker.rb CHANGED

@@ -1,11 +1,22 @@
 module HttpSpell
   class SpellChecker
-    def initialize(personal_dictionary_path = nil)
+    def initialize(personal_dictionary_path = nil, tracing: false)
       @personal_dictionary_arg = "-p #{personal_dictionary_path}" if personal_dictionary_path
+      @tracing = tracing
     end
     def check(doc, lang)
-      Open3.pipeline_rw('pandoc --from html --to plain', "hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l") do |stdin, stdout, _wait_thrs|
+      commands = [
+        'pandoc --from html --to plain',
+        "hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
+      ]
+      if @tracing
+        warn "Piping the HTML document into the following chain of commands:"
+        warn commands
+      end
+      Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
         stdin.puts(doc)
         stdin.close
         stdout.read.split.uniq

data/lib/httpspell/spider.rb CHANGED

@@ -8,27 +8,39 @@ module HttpSpell
   class Spider
     attr_reader :todo, :done
-    def initialize(starting_point, limit: nil, tracing: false)
+    def initialize(starting_point, whitelist: nil, blacklist: [], tracing: false)
       @todo = []
       @done = []
       todo << Addressable::URI.parse(starting_point)
-      @limit = limit || /^#{starting_point}/
+      @whitelist = whitelist || [/^#{starting_point}/]
+      @blacklist = blacklist
       @tracing = tracing
     end
     def start
+      success = true
       while todo.any?
         url = todo.pop
-        extracted = links(url) do |u, d|
-          yield u, d if block_given?
-        rescue
-          warn "Callback error for #{url}: #{$ERROR_INFO}"
+        begin
+          extracted = links(url) do |u, d|
+            yield u, d if block_given?
+          rescue
+            warn "Callback error for #{url}: #{$ERROR_INFO}"
+            warn $ERROR_INFO.backtrace if @tracing
+          end
+          done.append(url)
+          todo.concat(extracted - done - todo)
+        rescue StandardError
+          warn "Skipping #{url} because of #{$ERROR_INFO.message}"
           warn $ERROR_INFO.backtrace if @tracing
+          success = false
         end
-        done.append(url)
-        todo.concat(extracted - done - todo)
       end
+      return success
     end
     private
@@ -37,8 +49,9 @@ module HttpSpell
       # We are using open-uri, which follows redirects and also provides the content-type.
       response = open(uri).read
-      if response.respond_to?(:content_type)
-        return [] unless response.content_type == 'text/html'
+      if response.respond_to?(:content_type) && response.content_type != 'text/html'
+        warn "Skipping #{uri} because it is not HTML" if @tracing
+        return []
       end
       doc = Nokogiri::HTML(response)
@@ -46,7 +59,18 @@ module HttpSpell
       links = doc.css('a[href]').map do |e|
         link = Addressable::URI.parse(e['href'])
         link = uri.join(link) if link.relative?
-        next unless @limit.match?(link.to_s)
+        if @whitelist.none? { |re| re.match?(link.to_s) }
+          warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @tracing
+          next
+        end
+        if @blacklist.any? { |re| re.match?(link.to_s) }
+          # TODO Print _which_ entry of the blacklist matches
+          warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @tracing
+          next
+        end
         # TODO Ignore same page links (some anchor)
         link
       rescue StandardError

data/lib/httpspell/version.rb CHANGED

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module HttpSpell
-  VERSION = '1.1.0'
+  VERSION = '1.2.0'
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: httpspell
 version: !ruby/object:Gem::Version
-  version: 1.1.0
+  version: 1.2.0
 platform: ruby
 authors:
 - Steffen Uhlig