RubyGems - httpspell - Versions diffs - 1.4.1 → 1.5.1 - Mend

httpspell 1.4.1 → 1.5.1

Files changed (10)

checksums.yaml +4 -4
data/Gemfile.lock +3 -7
data/README.markdown +1 -1
data/TODO.markdown +1 -1
data/exe/httpspell +44 -24
data/httpspell.gemspec +0 -1
data/lib/http_spell/spellchecker.rb +0 -5
data/lib/http_spell/spider.rb +23 -21
data/lib/http_spell/version.rb +1 -1
metadata +2 -16

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: dc09324c003c7b14e08fa255b7a31c0a9aeb143df033da9aea300619a47268ba
-  data.tar.gz: 6890352a3cef38e243e2506398d58736c8179c2e0443a2b6ff341165e724dba0
+  metadata.gz: 509242695286e955675a85e15957752f1ac19eba7a5ffda317f6e45fd41c6c01
+  data.tar.gz: 4537ecafb9c882a23024c00246b0c1a07359d5180b2ee052d68a25ea23a64f6f
 SHA512:
-  metadata.gz: 826bb8e875b2f1584dd5c052ab9777e616e1da0d6844263589b027c3eabfb07955155e0c43b8b1b8dc253d720eba952e80330c38035fff53fc1943420dea7454
-  data.tar.gz: 7a4e3c9aaa586d4fbdc41971424cd5f064793ff18cba8d8606a452b3cee36070af44aa2f78ab307c71a613404cc1e490af1f56eca11068675183625f5360790e
+  metadata.gz: ddf6cb8856cf025e21956c49efe2d94c35204c273a086f60b6ae5e61c7bd56ec9fddda5ec8890f78c0ff106b03baba6ced6bfcf733f1e93622721ebf0b966a08
+  data.tar.gz: c217f2635966096b1ab86df7c52d6dd76145359d761b3cccc6829fde018760407e7207b7d6b89e3f709b8a966f82de93964656e64ad32474e4e8e3ad7cea43f8

data/Gemfile.lock CHANGED Viewed

@@ -1,15 +1,12 @@
 PATH
   remote: .
   specs:
-    httpspell (1.4.1)
-      addressable
+    httpspell (1.5.1)
       nokogiri
 GEM
   remote: https://rubygems.org/
   specs:
-    addressable (2.8.6)
-      public_suffix (>= 2.0.2, < 6.0)
     aruba (2.2.0)
       bundler (>= 1.17, < 3.0)
       contracts (>= 0.16.0, < 0.18.0)
@@ -91,7 +88,7 @@ GEM
       nenv (~> 0.1)
       shellany (~> 0.0)
     parallel (1.24.0)
-    parser (3.3.1.0)
+    parser (3.3.2.0)
       ast (~> 2.4.1)
       racc
     pry (0.14.2)
@@ -100,7 +97,6 @@ GEM
     pry-byebug (3.10.1)
       byebug (~> 11.0)
       pry (>= 0.13, < 0.15)
-    public_suffix (5.0.5)
     racc (1.8.0)
     rack (3.0.11)
     rackup (0.2.3)
@@ -127,7 +123,7 @@ GEM
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.13.0)
     rspec-support (3.13.1)
-    rubocop (1.64.0)
+    rubocop (1.64.1)
       json (~> 2.3)
       language_server-protocol (>= 3.17.0)
       parallel (~> 1.10)

data/README.markdown CHANGED Viewed

@@ -39,7 +39,7 @@ Words that are not in the dictionary for the given language (inferred from the `
 # Misc
-If you produce content with kramdown (e.g. using Jekyll), setting `spellcheck='false'` for an element is a simple as adding this line *after* the element (e.g. heading):
+If you produce content with kramdown (e.g. using Jekyll), an [Inline Attribute List](https://kramdown.gettalong.org/syntax.html#inline-attribute-lists) can be used to set `spellcheck='false'` for an element by adding this line *after* the element (e.g. heading):
 ```
 {: spellcheck="false"}

data/TODO.markdown CHANGED Viewed

@@ -1,4 +1,4 @@
 * Bail out if lang cannot be inferred and is not given on cmdline
 * exe/httpspell:    # TODO: --recursive, defaults to false
 * exe/httpspell:    # TODO wget has some additional options for recursive behavior that should be reviewed
-* lib/httpspell/spider.rb:          # TODO Print _which_ entry of the blacklist matches
+* lib/httpspell/spider.rb:          # TODO Print _which_ entry of the exclude list matches

data/exe/httpspell CHANGED Viewed

@@ -7,13 +7,15 @@ require 'http_spell/spellchecker'
 require 'http_spell/version'
 personal_dictionary_path = nil
+ignore_file_path = nil
 force_language = nil
 tracing = nil
 verbose = nil
-whitelist = nil
-blacklist = []
+included = nil
+excluded = []
 begin
+  # rubocop:disable Metrics/BlockLength
   OptionParser.new do |parser|
     parser.banner.prepend <<~BANNER
       Spellchecks a website via HTTP.
@@ -25,13 +27,17 @@ begin
       personal_dictionary_path = p
     end
+    parser.on('-I', '--ignore=FILE', 'path to a file containing spelling errors to ignore') do |i|
+      ignore_file_path = i
+    end
     parser.on('-l', '--language=LANGUAGE', 'override LANGUAGE of content') do |l|
       force_language = l
     end
-    parser.on('-w', '--whitelist=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
-      whitelist ||= []
-      whitelist << Regexp.new(w)
+    parser.on('-i', '--include=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
+      included ||= []
+      included << Regexp.new(w)
     end
     parser.on('-t', '--trace', 'enable error tracing') do
@@ -42,15 +48,16 @@ begin
       verbose = true
     end
-    parser.on('-b', '--blacklist=EXPRESSION', 'blacklist (ignore) URLs matching the given regular EXPRESSION') do |b|
-      blacklist << Regexp.new(b)
+    parser.on('-e', '--exclude=EXPRESSION', 'exclude URLs matching the given regular EXPRESSION') do |b|
+      excluded << Regexp.new(b)
     end
     # TODO: --recursive, defaults to false
     # TODO wget has some additional options for recursive behavior that should be reviewed
   end.parse!
+  # rubocop:enable Metrics/BlockLength
 rescue StandardError
-  warn "Error - #{$ERROR_INFO}"
+  warn "Error: #{$ERROR_INFO}"
   exit 1
 end
@@ -59,38 +66,51 @@ if ARGV.size != 1
   exit 1
 end
-def check(doc, lang, personal_dictionary_path, verbose)
-  unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc, lang)
+# rubocop:disable Metrics/ParameterLists
+def check(url, doc, lang, personal_dictionary_path, ignore_file_path, verbose)
+  has_unknown_words = false
+  # Handle elements with a different lang attribute separately
+  doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
+    has_unknown_words |= check("#{url} => #{element.name} with", element, element['lang'], personal_dictionary_path, ignore_file_path, verbose)
+    element.unlink
+  end
+  unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc.to_s, lang)
+  if ignore_file_path && unknown_words.any?
+    ignore_words = File.read(ignore_file_path).lines.map(&:chomp)
+    ignored_words = unknown_words.intersection(ignore_words)
+    if ignored_words.any?
+      warn "#{url} (lang=#{lang}): Ignoring the following spelling errors because they are in the ignore list: #{ignored_words}" if verbose
+      unknown_words -= ignore_words
+    end
+  end
   if unknown_words.empty?
-    warn 'No unknown words.' if verbose
+    warn "#{url} (lang=#{lang}): No unknown words" if verbose
+    has_unknown_words # no unknown words in doc, but maybe in elements with a different language
   else
-    warn "#{unknown_words.size} unknown words:" if verbose
+    warn "#{url} (lang=#{lang}): #{unknown_words.size} unknown words:" if verbose
     puts unknown_words
-    true
+    true # regardless of what elements with a different language had, at least doc has unknown words
   end
 end
+# rubocop:enable Metrics/ParameterLists
 has_unknown_words = false
-spider_success = HttpSpell::Spider.new(ARGV.first, whitelist:, blacklist:, verbose:, tracing:).start do |url, doc|
+spider_success = HttpSpell::Spider.new(ARGV.first, included:, excluded:, verbose:, tracing:).start do |url, doc|
   lang = force_language || doc.root['lang'] || ENV.fetch('LANGUAGE', nil)
-  warn "Checking #{url} as #{lang}" if verbose
   # Remove elements that are not to be spellchecked
   doc.css('pre').each(&:unlink)
   doc.css('code').each(&:unlink)
+  doc.css('iframe').each(&:unlink)
   doc.css('[spellcheck=false]').each(&:unlink)
-  # Handle elements with a different lang attribute separately
-  doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
-    warn "Handling #{element.name} with lang #{element['lang']}:" if verbose
-    has_unknown_words |= check(element.to_s, element['lang'], personal_dictionary_path, verbose)
-    element.unlink
-  end
-  # Everything else
-  has_unknown_words |= check(doc.to_s, lang, personal_dictionary_path, verbose)
+  has_unknown_words |= check("#{url} => document with", doc, lang, personal_dictionary_path, ignore_file_path, verbose)
 end
 exit 2 unless spider_success

data/httpspell.gemspec CHANGED Viewed

@@ -23,7 +23,6 @@ Gem::Specification.new do |spec|
   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ['lib']
-  spec.add_dependency 'addressable'
   spec.add_dependency 'nokogiri'
   spec.metadata['rubygems_mfa_required'] = 'true'
 end

data/lib/http_spell/spellchecker.rb CHANGED Viewed

@@ -13,11 +13,6 @@ module HttpSpell
         "hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
       ]
-      if @verbose
-        warn 'Piping the HTML document into the following chain of commands:'
-        warn commands
-      end
       Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
         stdin.puts(doc)
         stdin.close

data/lib/http_spell/spider.rb CHANGED Viewed

@@ -1,21 +1,21 @@
 # frozen_string_literal: true
 require 'nokogiri'
+require 'uri'
 require 'open-uri'
 require 'open3'
-require 'addressable/uri'
 require 'English'
 module HttpSpell
   class Spider
     attr_reader :todo, :done
-    def initialize(starting_point, whitelist: nil, blacklist: [], verbose: false, tracing: false)
+    def initialize(starting_point, included: nil, excluded: [], verbose: false, tracing: false)
       @todo = []
       @done = []
-      todo << Addressable::URI.parse(starting_point)
-      @whitelist = whitelist || [/^#{starting_point}/]
-      @blacklist = blacklist
+      todo << URI(starting_point)
+      @included = included || [/^#{starting_point}/]
+      @excluded = excluded
       @verbose = verbose
       @tracing = tracing
     end
@@ -35,7 +35,12 @@ module HttpSpell
           end
           done.append(url)
-          todo.concat(extracted - done - todo).uniq!
+          new_links = (extracted - done - todo).uniq
+          if new_links.any?
+            warn "Adding #{new_links.size} new links found at #{url}" if @verbose
+            todo.concat(extracted - done - todo).uniq!
+          end
         rescue StandardError
           warn "Skipping #{url} because of #{$ERROR_INFO.message}"
           warn $ERROR_INFO.backtrace if @tracing
@@ -52,46 +57,43 @@ module HttpSpell
       response = http_get(uri)
       if response.respond_to?(:content_type) && response.content_type != 'text/html'
-        warn "Skipping #{uri} because it is not HTML" if @verbose
+        warn "Skipping #{response.base_uri} because it is not HTML" if @verbose
         return []
       end
       doc = Nokogiri::HTML(response)
       links = doc.css('a[href]').map do |e|
-        link = Addressable::URI.parse(e['href'])
-        link = uri.join(link) if link.relative?
+        next if e['href'].start_with?('#') # Ignore fragment on the same page; we always check the whole page
+        link = URI.join(response.base_uri, e['href'])
+        link.fragment = nil # Ignore fragment in links to other pages, too
-        if @whitelist.none? { |re| re.match?(link.to_s) }
-          warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @verbose
+        if @included.none? { |re| re.match?(link.to_s) }
+          warn "Skipping #{link} because it is not on the included #{@included}" if @verbose
           next
         end
-        if @blacklist.any? { |re| re.match?(link.to_s) }
-          # TODO: Print _which_ entry of the blacklist matches
-          warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @verbose
+        if @excluded.any? { |re| re.match?(link.to_s) }
+          # TODO: Print _which_ entry of the excluded matches
+          warn "Skipping #{link} because it is on the excluded #{@excluded}" if @verbose
           next
         end
-        # Ignore fragment; we always check the whole page
-        link.fragment = nil
         link
       rescue StandardError
-        warn $ERROR_INFO.message
+        warn "Error: #{$ERROR_INFO}"
         warn $ERROR_INFO.backtrace if @tracing
       end.compact
-      yield uri, doc if block_given?
+      yield response.base_uri, doc if block_given?
-      warn "Adding #{links.size} links from #{uri}" if @verbose
       links
     end
     # https://twin.github.io/improving-open-uri/
     def http_get(uri)
       tries = 10
       begin
         URI.parse(uri).open(redirect: false)
       rescue OpenURI::HTTPRedirect => e

data/lib/http_spell/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module HttpSpell
-  VERSION = '1.4.1'
+  VERSION = '1.5.1'
 end

metadata CHANGED Viewed

@@ -1,29 +1,15 @@
 --- !ruby/object:Gem::Specification
 name: httpspell
 version: !ruby/object:Gem::Version
-  version: 1.4.1
+  version: 1.5.1
 platform: ruby
 authors:
 - Steffen Uhlig
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-05-30 00:00:00.000000000 Z
+date: 2024-06-01 00:00:00.000000000 Z
 dependencies:
-- !ruby/object:Gem::Dependency
-  name: addressable
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0'
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement