RubyGems - spellr - Versions diffs - 0.5.0 → 0.5.1 - Mend

spellr 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/Gemfile.lock +14 -14
data/lib/.spellr.yml +2 -0
data/lib/spellr/backports.rb +16 -6
data/lib/spellr/base_reporter.rb +54 -0
data/lib/spellr/check.rb +54 -20
data/lib/spellr/cli.rb +13 -6
data/lib/spellr/column_location.rb +1 -1
data/lib/spellr/config.rb +6 -45
data/lib/spellr/config_loader.rb +10 -6
data/lib/spellr/file.rb +15 -2
data/lib/spellr/file_list.rb +21 -17
data/lib/spellr/interactive.rb +51 -116
data/lib/spellr/interactive_add.rb +64 -0
data/lib/spellr/interactive_replacement.rb +69 -0
data/lib/spellr/key_tuner/naive_bayes.rb +49 -91
data/lib/spellr/key_tuner/possible_key.rb +36 -32
data/lib/spellr/key_tuner/stats.rb +26 -7
data/lib/spellr/language.rb +28 -44
data/lib/spellr/line_location.rb +13 -7
data/lib/spellr/line_tokenizer.rb +35 -134
data/lib/spellr/output.rb +62 -0
data/lib/spellr/output_stubbed.rb +58 -0
data/lib/spellr/quiet_reporter.rb +13 -0
data/lib/spellr/reporter.rb +9 -13
data/lib/spellr/token.rb +14 -16
data/lib/spellr/token_regexps.rb +103 -0
data/lib/spellr/tokenizer.rb +35 -14
data/lib/spellr/version.rb +1 -1
data/lib/spellr/wordlist.rb +29 -25
data/lib/spellr/wordlist_reporter.rb +16 -8
data/lib/spellr.rb +1 -0
data/wordlists/ruby.txt +1046 -13
metadata +9 -2

data/lib/spellr/token_regexps.rb ADDED Viewed

@@ -0,0 +1,103 @@
+# frozen_string_literal: true
+require_relative '../spellr'
+module Spellr
+  module TokenRegexps
+    #### WORDS ####
+    # [Word], [Word]Word [Word]'s [Wordn't]
+    TITLE_CASE_RE = /[[:upper:]][[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
+    # [WORD] [WORD]Word [WORDN'T] [WORD]'S [WORD]'s [WORD]s
+    UPPER_CASE_RE = /[[:upper:]]+(?:['’][[:upper:]]+(?<!['’][Ss]))*(?:(?![[:lower:]])|(?=s(?![[:lower:]])))/.freeze # rubocop:disable Metrics/LineLength
+    # [word] [word]'s [wordn't]
+    LOWER_CASE_RE = /[[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
+    # for characters in [:alpha:] that aren't in [:lower:] or [:upper:] e.g. Arabic
+    OTHER_CASE_RE = /(?:[[:alpha:]](?<![[:lower:][:upper:]]))+/.freeze
+    TERM_RE = Regexp.union(TITLE_CASE_RE, UPPER_CASE_RE, LOWER_CASE_RE, OTHER_CASE_RE)
+    #### NON WORDS ####
+    NOT_EVEN_NON_WORDS_RE = %r{[^[:alpha:]/%#0-9\\]+}.freeze
+    LEFTOVER_NON_WORD_BITS_RE = %r{[/%#\\]|\d+}.freeze # e.g. a / not starting //a-url.com
+    HEX_RE = /(?:#(?:\h{6}|\h{3})|0x\h+)(?![[:alpha:]])/.freeze
+    SHELL_COLOR_ESCAPE_RE = /\\(?:e|0?33)\[\d+(;\d+)*m/.freeze
+    PUNYCODE_RE = /xn--[a-v0-9\-]+(?:[[:alpha:]])/.freeze
+    # TODO: hex escapes e.g. \xAA.
+    # TODO: language aware escapes
+    BACKSLASH_ESCAPE_RE = /\\[a-zA-Z]/.freeze
+    REPEATED_SINGLE_LETTERS_RE = /(?:([[:alpha:]])\1+)(?![[:alpha:]])/.freeze # e.g. xxxxxxxx
+    URL_ENCODED_ENTITIES_RE = /%[0-8A-F]{2}/.freeze
+    # There's got to be a better way of writing this
+    SEQUENTIAL_LETTERS_RE = /a(?:b(?:c(?:d(?:e(?:f(?:g(?:h(?:i(?:j(?:k(?:l(?:m(?:n(?:o(?:p(?:q(?:r(?:s(?:t(?:u(?:v(?:w(?:x(?:yz?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?(?![[:alpha:]])/i.freeze # rubocop:disable Metrics/LineLength
+    # I didn't want to do this myself
+    # BUT i need something to heuristically match on, and it's difficult
+    URL_SCHEME = %r{(?://|https?://|s?ftp://|mailto:)}.freeze
+    URL_USERINFO = /[[:alnum:]]+(?::[[:alnum:]]+)?@/.freeze
+    URL_IP_ADDRESS = /\d{1,3}(?:\.\d{1,3}){3}/.freeze
+    # literal \ so that i can match on domains in regexps. no-one cares but me.
+    URL_HOSTNAME = /(?:[[:alnum:]\-\\]+(?:\.[[:alnum:]\-\\]+)+|localhost|#{URL_IP_ADDRESS})/.freeze
+    URL_PORT = /:\d+/.freeze
+    URL_PATH = %r{/(?:[[:alnum:]=@!$&\-/._\\]|%\h{2})+}.freeze
+    URL_QUERY = %r{\?(?:[[:alnum:]=!$\-/.\\]|%\h{2})+(?:&(?:[[:alnum:]=!$\-/.\\]|%\h{2})+)*}.freeze
+    URL_FRAGMENT = %r{#(?:[[:alnum:]=!$&\-/.\\]|%\h{2})+}.freeze
+    # URL can be any valid hostname, it must have either a scheme, userinfo, or path
+    # it may have those and any of the others and a port, or a query or a fragment.
+    URL_REST = /#{URL_QUERY}?#{URL_FRAGMENT}?/.freeze
+    URL_RE = Regexp.union(
+      /#{URL_SCHEME}#{URL_USERINFO}?#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?#{URL_REST}/,
+      /#{URL_USERINFO}#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?#{URL_REST}/,
+      /#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}#{URL_REST}/
+    ).freeze
+    KEY_SENDGRID_RE = /SG\.[\w\-]{22}\.[\w\-]{43}/.freeze
+    KEY_HYPERWALLET_RE = /prg-\h{8}-\h{4}-\h{4}-\h{4}-\h{12}/.freeze
+    KEY_GTM_RE = /GTM-[A-Z0-9]{7}/.freeze
+    KEY_SHA1 = %r{sha1-[A-Za-z0-9=+/]{28}}.freeze
+    KEY_SHA512 = %r{sha512-[A-Za-z0-9=;+/]{88}}.freeze
+    KEY_DATA_URL = %r{data:[a-z/;0-9\-]+;base64,[A-Za-z0-9+/]+=*(?![[:alnum:]])}.freeze
+    KEY_PATTERNS_RE = Regexp.union(
+      KEY_SENDGRID_RE, KEY_HYPERWALLET_RE, KEY_GTM_RE, KEY_SHA1, KEY_SHA512, KEY_DATA_URL
+    )
+    SKIPS = Regexp.union(
+      NOT_EVEN_NON_WORDS_RE,
+      SHELL_COLOR_ESCAPE_RE,
+      BACKSLASH_ESCAPE_RE,
+      URL_ENCODED_ENTITIES_RE,
+      HEX_RE,
+      URL_RE, # 2%
+      KEY_PATTERNS_RE
+    ).freeze
+    AFTER_KEY_SKIPS = Regexp.union(
+      LEFTOVER_NON_WORD_BITS_RE,
+      REPEATED_SINGLE_LETTERS_RE,
+      SEQUENTIAL_LETTERS_RE
+    )
+    # this is in a method because the minimum word length stuff was throwing it off
+    # TODO: move to config maybe?
+    def min_alpha_re
+      @min_alpha_re ||= Regexp.union(
+        /[A-Z][a-z]{#{Spellr.config.word_minimum_length - 1}}/,
+        /[a-z]{#{Spellr.config.word_minimum_length}}/,
+        /[A-Z]{#{Spellr.config.word_minimum_length}}/
+      ).freeze
+    end
+    ALPHA_SEP_RE = %r{[A-Za-z][A-Za-z\-_/+]*}.freeze
+    NUM_SEP_RE = %r{\d[\d\-_/+]*}.freeze
+    THREE_CHUNK_RE = Regexp.union(
+      /\A#{ALPHA_SEP_RE}#{NUM_SEP_RE}#{ALPHA_SEP_RE}/,
+      /\A#{NUM_SEP_RE}#{ALPHA_SEP_RE}#{NUM_SEP_RE}/
+    ).freeze
+    POSSIBLE_KEY_RE = %r{#{THREE_CHUNK_RE}[A-Za-z0-9+/\-_]*=*(?![[:alnum:]])}.freeze
+    SPELLR_DISABLE_RE = /spellr:disable/.freeze
+    SPELLR_ENABLE_RE = /spellr:enable/.freeze
+  end
+end

data/lib/spellr/tokenizer.rb CHANGED Viewed

@@ -14,13 +14,12 @@ module Spellr
     attr_accessor :disabled
     alias_method :disabled?, :disabled
-    def initialize(file, start_at: nil, skip_uri: true, skip_key: true)
-      # $stderr.puts start_at if start_at
+    def initialize(file, start_at: nil, skip_key: true)
       @start_at = start_at || ColumnLocation.new(line_location: LineLocation.new(file))
       @file = file.is_a?(StringIO) || file.is_a?(IO) ? file : ::File.new(file)
       @file.pos = @start_at.line_location.byte_offset
-      @line_tokenizer = LineTokenizer.new(skip_uri: skip_uri, skip_key: skip_key)
+      @line_tokenizer = LineTokenizer.new(skip_key: skip_key)
     end
     def terms
@@ -32,37 +31,59 @@ module Spellr
     end
     def each_term(&block)
-      each_line_with_offset do |line, line_number|
-        prepare_tokenizer_for_line(line, line_number).each_term(&block)
+      file.each_line do |line|
+        prepare_tokenizer_for_line(line).each_term(&block)
       end
     end
-    def each_token(&block) # rubocop:disable Metrics/AbcSize
+    def each_token(skip_term_proc: nil) # rubocop:disable Metrics/MethodLength
+      each_line_with_stats do |line, line_number, char_offset, byte_offset|
+        prepare_tokenizer_for_line(line).each_token(skip_term_proc: skip_term_proc) do |token|
+          token.line = prepare_line(line, line_number, char_offset, byte_offset)
+          yield token
+        end
+      end
+    end
+    def prepare_line(line, line_number, char_offset, byte_offset)
+      line_location = LineLocation.new(
+        file, line_number, char_offset: char_offset, byte_offset: byte_offset
+      )
+      column_location = ColumnLocation.new(line_location: line_location)
+      Token.new(line, location: column_location)
+    end
+    def each_line_with_stats # rubocop:disable Metrics/MethodLength
       char_offset = @start_at.line_location.char_offset
       byte_offset = @start_at.line_location.byte_offset
-      each_line_with_offset do |line, line_number|
-        line_location = LineLocation.new(file, line_number, byte_offset: byte_offset, char_offset: char_offset)
+      file.each_line.with_index(@start_at.line_location.line_number) do |line, line_number|
+        yield line, line_number, char_offset, byte_offset
         char_offset += line.length
         byte_offset += line.bytesize
-        line = Token.new(line, location: ColumnLocation.new(line_location: line_location))
-        prepare_tokenizer_for_line(line, line_number).each_token(&block)
       end
     end
     def normalized_terms
-      enum_for(:each_term).map(&:normalize).uniq.sort
+      enum_for(:each_term).map(&:spellr_normalize).uniq.sort
     end
     private
     attr_reader :line_tokenizer
-    def each_line_with_offset(&block)
-      file.each_line.with_index(@start_at.line_number, &block)
+    def each_line_token
+      line_location = @start_at.line_location
+      file.each_line do |line|
+        yield Token.new(line, location: ColumnLocation.new(line_location: line_location))
+        line_location = line_location.advance(line)
+      end
     end
-    def prepare_tokenizer_for_line(line, _line_number)
+    def prepare_tokenizer_for_line(line)
       line_tokenizer.string = line
       line_tokenizer.pos = 0
       line_tokenizer

data/lib/spellr/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Spellr
-  VERSION = '0.5.0'
+  VERSION = '0.5.1'
 end

data/lib/spellr/wordlist.rb CHANGED Viewed

@@ -27,17 +27,17 @@ module Spellr
     end
     # significantly faster than default Enumerable#include?
-    # requires terms to be sorted
+    # requires terms to have been sorted
     def include?(term)
-      include_cache[term.normalize]
+      include_cache[term.spellr_normalize]
     end
-    def include_cache
-      @include_cache ||= Hash.new do |cache, term|
-        cache[term] = to_a.bsearch do |value|
-          term <=> value
-        end
-      end
+    def <<(term)
+      term = term.spellr_normalize
+      touch
+      include_cache[term] = true
+      insert_sorted(term)
+      @path.write(to_a.join) # we don't need to clear the cache
     end
     def to_a
@@ -46,7 +46,7 @@ module Spellr
     def clean(file = @path)
       require_relative 'tokenizer'
-      write(Spellr::Tokenizer.new(file, skip_uri: false, skip_key: false).normalized_terms.join)
+      write(Spellr::Tokenizer.new(file, skip_key: false).normalized_terms.join)
     end
     def write(content)
@@ -61,37 +61,41 @@ module Spellr
       @path.read
     end
-    def clear_cache
-      @to_a = nil
-      @include = nil
-    end
     def exist?
       return @exist if defined?(@exist)
       @exist = @path.exist?
     end
-    def add(term)
-      touch
-      term = term.normalize
-      include_cache[term] = true
-      to_a << term
-      to_a.sort!
-      write(@to_a.join)
-      Spellr.config.clear_cache if to_a.length == 1
-    end
     def touch
       return if exist?
       @path.dirname.mkpath
       @path.write('')
-      remove_instance_variable(:@exist)
+      clear_cache
     end
     private
+    def insert_sorted(term)
+      insert_at = to_a.bsearch_index { |value| value >= term }
+      insert_at ? to_a.insert(insert_at, term) : to_a.push(term)
+    end
+    def include_cache
+      @include_cache ||= Hash.new do |cache, term|
+        cache[term] = to_a.bsearch do |value|
+          term <=> value
+        end
+      end
+    end
+    def clear_cache
+      @to_a = nil
+      @include = nil
+      remove_instance_variable(:@exist) if defined?(@exist)
+    end
     def raise_unless_exists?
       return if exist?

data/lib/spellr/wordlist_reporter.rb CHANGED Viewed

@@ -1,21 +1,29 @@
 # frozen_string_literal: true
 require 'set'
+require_relative 'base_reporter'
 module Spellr
-  class WordlistReporter
-    attr_reader :words
-    def initialize
-      @words = Set.new
+  class WordlistReporter < Spellr::BaseReporter
+    def parallel?
+      true
     end
-    def finish(_checked)
-      puts words.sort.join
+    def finish
+      output.puts words.sort.join
     end
     def call(token)
-      words << token.normalize
+      words << token.spellr_normalize
+    end
+    private
+    def words
+      @words ||= begin
+        output.counts[:words] = Set.new unless output.counts.key?(:words)
+        output.counts[:words]
+      end
     end
   end
 end

data/lib/spellr.rb CHANGED Viewed

@@ -5,6 +5,7 @@ require_relative 'spellr/config'
 module Spellr
   class Error < StandardError; end
   class Wordlist
     class NotFound < Spellr::Error; end
   end