RubyGems - httpspell - Versions diffs - 1.4.1 → 1.5.1 - Mend

httpspell 1.4.1 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml +4 -4
data/Gemfile.lock +3 -7
data/README.markdown +1 -1
data/TODO.markdown +1 -1
data/exe/httpspell +44 -24
data/httpspell.gemspec +0 -1
data/lib/http_spell/spellchecker.rb +0 -5
data/lib/http_spell/spider.rb +23 -21
data/lib/http_spell/version.rb +1 -1
metadata +2 -16

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: dc09324c003c7b14e08fa255b7a31c0a9aeb143df033da9aea300619a47268ba
-  data.tar.gz: 6890352a3cef38e243e2506398d58736c8179c2e0443a2b6ff341165e724dba0
+  metadata.gz: 509242695286e955675a85e15957752f1ac19eba7a5ffda317f6e45fd41c6c01
+  data.tar.gz: 4537ecafb9c882a23024c00246b0c1a07359d5180b2ee052d68a25ea23a64f6f
 SHA512:
-  metadata.gz: 826bb8e875b2f1584dd5c052ab9777e616e1da0d6844263589b027c3eabfb07955155e0c43b8b1b8dc253d720eba952e80330c38035fff53fc1943420dea7454
-  data.tar.gz: 7a4e3c9aaa586d4fbdc41971424cd5f064793ff18cba8d8606a452b3cee36070af44aa2f78ab307c71a613404cc1e490af1f56eca11068675183625f5360790e
+  metadata.gz: ddf6cb8856cf025e21956c49efe2d94c35204c273a086f60b6ae5e61c7bd56ec9fddda5ec8890f78c0ff106b03baba6ced6bfcf733f1e93622721ebf0b966a08
+  data.tar.gz: c217f2635966096b1ab86df7c52d6dd76145359d761b3cccc6829fde018760407e7207b7d6b89e3f709b8a966f82de93964656e64ad32474e4e8e3ad7cea43f8

data/Gemfile.lock CHANGED Viewed

@@ -1,15 +1,12 @@
 PATH
   remote: .
   specs:
-    httpspell (1.4.1)
-      addressable
+    httpspell (1.5.1)
       nokogiri
 GEM
   remote: https://rubygems.org/
   specs:
-    addressable (2.8.6)
-      public_suffix (>= 2.0.2, < 6.0)
     aruba (2.2.0)
       bundler (>= 1.17, < 3.0)
       contracts (>= 0.16.0, < 0.18.0)
@@ -91,7 +88,7 @@ GEM
       nenv (~> 0.1)
       shellany (~> 0.0)
     parallel (1.24.0)
-    parser (3.3.1.0)
+    parser (3.3.2.0)
       ast (~> 2.4.1)
       racc
     pry (0.14.2)
@@ -100,7 +97,6 @@ GEM
     pry-byebug (3.10.1)
       byebug (~> 11.0)
       pry (>= 0.13, < 0.15)
-    public_suffix (5.0.5)
     racc (1.8.0)
     rack (3.0.11)
     rackup (0.2.3)
@@ -127,7 +123,7 @@ GEM
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.13.0)
     rspec-support (3.13.1)
-    rubocop (1.64.0)
+    rubocop (1.64.1)
       json (~> 2.3)
       language_server-protocol (>= 3.17.0)
       parallel (~> 1.10)

data/README.markdown CHANGED Viewed

@@ -39,7 +39,7 @@ Words that are not in the dictionary for the given language (inferred from the `
 # Misc
-If you produce content with kramdown (e.g. using Jekyll), setting `spellcheck='false'` for an element is a simple as adding this line *after* the element (e.g. heading):
+If you produce content with kramdown (e.g. using Jekyll), an [Inline Attribute List](https://kramdown.gettalong.org/syntax.html#inline-attribute-lists) can be used to set `spellcheck='false'` for an element by adding this line *after* the element (e.g. heading):
 ```
 {: spellcheck="false"}

data/TODO.markdown CHANGED Viewed

@@ -1,4 +1,4 @@
 * Bail out if lang cannot be inferred and is not given on cmdline
 * exe/httpspell:    # TODO: --recursive, defaults to false
 * exe/httpspell:    # TODO wget has some additional options for recursive behavior that should be reviewed
-* lib/httpspell/spider.rb:          # TODO Print _which_ entry of the blacklist matches
+* lib/httpspell/spider.rb:          # TODO Print _which_ entry of the exclude list matches

data/exe/httpspell CHANGED Viewed

@@ -7,13 +7,15 @@ require 'http_spell/spellchecker'
 require 'http_spell/version'
 personal_dictionary_path = nil
+ignore_file_path = nil
 force_language = nil
 tracing = nil
 verbose = nil
-whitelist = nil
-blacklist = []
+included = nil
+excluded = []
 begin
+  # rubocop:disable Metrics/BlockLength
   OptionParser.new do |parser|
     parser.banner.prepend <<~BANNER
       Spellchecks a website via HTTP.
@@ -25,13 +27,17 @@ begin
       personal_dictionary_path = p
     end
+    parser.on('-I', '--ignore=FILE', 'path to a file containing spelling errors to ignore') do |i|
+      ignore_file_path = i
+    end
     parser.on('-l', '--language=LANGUAGE', 'override LANGUAGE of content') do |l|
       force_language = l
     end
-    parser.on('-w', '--whitelist=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
-      whitelist ||= []
-      whitelist << Regexp.new(w)
+    parser.on('-i', '--include=EXPRESSION', 'when recursively retrieving URLs, allow only those matching the given regular EXPRESSION') do |w|
+      included ||= []
+      included << Regexp.new(w)
     end
     parser.on('-t', '--trace', 'enable error tracing') do
@@ -42,15 +48,16 @@ begin
       verbose = true
     end
-    parser.on('-b', '--blacklist=EXPRESSION', 'blacklist (ignore) URLs matching the given regular EXPRESSION') do |b|
-      blacklist << Regexp.new(b)
+    parser.on('-e', '--exclude=EXPRESSION', 'exclude URLs matching the given regular EXPRESSION') do |b|
+      excluded << Regexp.new(b)
     end
     # TODO: --recursive, defaults to false
     # TODO wget has some additional options for recursive behavior that should be reviewed
   end.parse!
+  # rubocop:enable Metrics/BlockLength
 rescue StandardError
-  warn "Error - #{$ERROR_INFO}"
+  warn "Error: #{$ERROR_INFO}"
   exit 1
 end
@@ -59,38 +66,51 @@ if ARGV.size != 1
   exit 1
 end
-def check(doc, lang, personal_dictionary_path, verbose)
-  unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc, lang)
+# rubocop:disable Metrics/ParameterLists
+def check(url, doc, lang, personal_dictionary_path, ignore_file_path, verbose)
+  has_unknown_words = false
+  # Handle elements with a different lang attribute separately
+  doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
+    has_unknown_words |= check("#{url} => #{element.name} with", element, element['lang'], personal_dictionary_path, ignore_file_path, verbose)
+    element.unlink
+  end
+  unknown_words = HttpSpell::SpellChecker.new(personal_dictionary_path, verbose:).check(doc.to_s, lang)
+  if ignore_file_path && unknown_words.any?
+    ignore_words = File.read(ignore_file_path).lines.map(&:chomp)
+    ignored_words = unknown_words.intersection(ignore_words)
+    if ignored_words.any?
+      warn "#{url} (lang=#{lang}): Ignoring the following spelling errors because they are in the ignore list: #{ignored_words}" if verbose
+      unknown_words -= ignore_words
+    end
+  end
   if unknown_words.empty?
-    warn 'No unknown words.' if verbose
+    warn "#{url} (lang=#{lang}): No unknown words" if verbose
+    has_unknown_words # no unknown words in doc, but maybe in elements with a different language
   else
-    warn "#{unknown_words.size} unknown words:" if verbose
+    warn "#{url} (lang=#{lang}): #{unknown_words.size} unknown words:" if verbose
     puts unknown_words
-    true
+    true # regardless of what elements with a different language had, at least doc has unknown words
   end
 end
+# rubocop:enable Metrics/ParameterLists
 has_unknown_words = false
-spider_success = HttpSpell::Spider.new(ARGV.first, whitelist:, blacklist:, verbose:, tracing:).start do |url, doc|
+spider_success = HttpSpell::Spider.new(ARGV.first, included:, excluded:, verbose:, tracing:).start do |url, doc|
   lang = force_language || doc.root['lang'] || ENV.fetch('LANGUAGE', nil)
-  warn "Checking #{url} as #{lang}" if verbose
   # Remove elements that are not to be spellchecked
   doc.css('pre').each(&:unlink)
   doc.css('code').each(&:unlink)
+  doc.css('iframe').each(&:unlink)
   doc.css('[spellcheck=false]').each(&:unlink)
-  # Handle elements with a different lang attribute separately
-  doc.css(%([lang]:not([lang="#{lang}"]))).each do |element|
-    warn "Handling #{element.name} with lang #{element['lang']}:" if verbose
-    has_unknown_words |= check(element.to_s, element['lang'], personal_dictionary_path, verbose)
-    element.unlink
-  end
-  # Everything else
-  has_unknown_words |= check(doc.to_s, lang, personal_dictionary_path, verbose)
+  has_unknown_words |= check("#{url} => document with", doc, lang, personal_dictionary_path, ignore_file_path, verbose)
 end
 exit 2 unless spider_success

data/httpspell.gemspec CHANGED Viewed

@@ -23,7 +23,6 @@ Gem::Specification.new do |spec|
   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ['lib']
-  spec.add_dependency 'addressable'
   spec.add_dependency 'nokogiri'
   spec.metadata['rubygems_mfa_required'] = 'true'
 end

data/lib/http_spell/spellchecker.rb CHANGED Viewed

@@ -13,11 +13,6 @@ module HttpSpell
         "hunspell -d #{translate(lang)} #{@personal_dictionary_arg} -i UTF-8 -l",
       ]
-      if @verbose
-        warn 'Piping the HTML document into the following chain of commands:'
-        warn commands
-      end
       Open3.pipeline_rw(*commands) do |stdin, stdout, _wait_thrs|
         stdin.puts(doc)
         stdin.close

data/lib/http_spell/spider.rb CHANGED Viewed

@@ -1,21 +1,21 @@
 # frozen_string_literal: true
 require 'nokogiri'
+require 'uri'
 require 'open-uri'
 require 'open3'
-require 'addressable/uri'
 require 'English'
 module HttpSpell
   class Spider
     attr_reader :todo, :done
-    def initialize(starting_point, whitelist: nil, blacklist: [], verbose: false, tracing: false)
+    def initialize(starting_point, included: nil, excluded: [], verbose: false, tracing: false)
       @todo = []
       @done = []
-      todo << Addressable::URI.parse(starting_point)
-      @whitelist = whitelist || [/^#{starting_point}/]
-      @blacklist = blacklist
+      todo << URI(starting_point)
+      @included = included || [/^#{starting_point}/]
+      @excluded = excluded
       @verbose = verbose
       @tracing = tracing
     end
@@ -35,7 +35,12 @@ module HttpSpell
           end
           done.append(url)
-          todo.concat(extracted - done - todo).uniq!
+          new_links = (extracted - done - todo).uniq
+          if new_links.any?
+            warn "Adding #{new_links.size} new links found at #{url}" if @verbose
+            todo.concat(extracted - done - todo).uniq!
+          end
         rescue StandardError
           warn "Skipping #{url} because of #{$ERROR_INFO.message}"
           warn $ERROR_INFO.backtrace if @tracing
@@ -52,46 +57,43 @@ module HttpSpell
       response = http_get(uri)
       if response.respond_to?(:content_type) && response.content_type != 'text/html'
-        warn "Skipping #{uri} because it is not HTML" if @verbose
+        warn "Skipping #{response.base_uri} because it is not HTML" if @verbose
         return []
       end
       doc = Nokogiri::HTML(response)
       links = doc.css('a[href]').map do |e|
-        link = Addressable::URI.parse(e['href'])
-        link = uri.join(link) if link.relative?
+        next if e['href'].start_with?('#') # Ignore fragment on the same page; we always check the whole page
+        link = URI.join(response.base_uri, e['href'])
+        link.fragment = nil # Ignore fragment in links to other pages, too
-        if @whitelist.none? { |re| re.match?(link.to_s) }
-          warn "Skipping #{link} because it is not on the whitelist #{@whitelist}" if @verbose
+        if @included.none? { |re| re.match?(link.to_s) }
+          warn "Skipping #{link} because it is not on the included #{@included}" if @verbose
           next
         end
-        if @blacklist.any? { |re| re.match?(link.to_s) }
-          # TODO: Print _which_ entry of the blacklist matches
-          warn "Skipping #{link} because it is on the blacklist #{@blacklist}" if @verbose
+        if @excluded.any? { |re| re.match?(link.to_s) }
+          # TODO: Print _which_ entry of the excluded matches
+          warn "Skipping #{link} because it is on the excluded #{@excluded}" if @verbose
           next
         end
-        # Ignore fragment; we always check the whole page
-        link.fragment = nil
         link
       rescue StandardError
-        warn $ERROR_INFO.message
+        warn "Error: #{$ERROR_INFO}"
         warn $ERROR_INFO.backtrace if @tracing
       end.compact
-      yield uri, doc if block_given?
+      yield response.base_uri, doc if block_given?
-      warn "Adding #{links.size} links from #{uri}" if @verbose
       links
     end
     # https://twin.github.io/improving-open-uri/
     def http_get(uri)
       tries = 10
       begin
         URI.parse(uri).open(redirect: false)
       rescue OpenURI::HTTPRedirect => e

data/lib/http_spell/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module HttpSpell
-  VERSION = '1.4.1'
+  VERSION = '1.5.1'
 end

metadata CHANGED Viewed

@@ -1,29 +1,15 @@
 --- !ruby/object:Gem::Specification
 name: httpspell
 version: !ruby/object:Gem::Version
-  version: 1.4.1
+  version: 1.5.1
 platform: ruby
 authors:
 - Steffen Uhlig
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-05-30 00:00:00.000000000 Z
+date: 2024-06-01 00:00:00.000000000 Z
 dependencies:
-- !ruby/object:Gem::Dependency
-  name: addressable
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0'
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement