RubyGems - spellr - Versions diffs - 0.3.2 → 0.4.0 - Mend

spellr 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +11 -0
data/Gemfile.lock +13 -10
data/bin/fetch_wordlist/english +3 -1
data/lib/.spellr.yml +12 -9
data/lib/spellr/backports.rb +19 -0
data/lib/spellr/check.rb +15 -12
data/lib/spellr/config.rb +46 -11
data/lib/spellr/file.rb +7 -8
data/lib/spellr/file_list.rb +15 -10
data/lib/spellr/interactive.rb +8 -3
data/lib/spellr/key_tuner/data.yml +1242 -0
data/lib/spellr/key_tuner/naive_bayes.rb +162 -0
data/lib/spellr/key_tuner/possible_key.rb +170 -0
data/lib/spellr/key_tuner/stats.rb +33 -0
data/lib/spellr/language.rb +20 -5
data/lib/spellr/line_tokenizer.rb +115 -84
data/lib/spellr/string_format.rb +8 -3
data/lib/spellr/token.rb +14 -10
data/lib/spellr/tokenizer.rb +1 -2
data/lib/spellr/version.rb +1 -1
data/lib/spellr/wordlist.rb +6 -5
data/lib/spellr.rb +5 -14
data/spellr.gemspec +3 -2
metadata +24 -5

data/lib/spellr/key_tuner/naive_bayes.rb ADDED Viewed

@@ -0,0 +1,162 @@
+# frozen_string_literal: true
+require_relative 'possible_key'
+require_relative 'stats'
+require 'yaml'
+# this is lifted in whole from this article. i don't understand the maths and i don't want to
+# https://www.sitepoint.com/machine-learning-ruby-naive-bayes-theorem/
# Gaussian naive Bayes classifier over the numeric features of PossibleKey.
#
# Class names have the form "<key|not_key>_<character_set>". The model
# (per class and per feature: mean, variance, standard deviation) is read from
# data.yml next to this file when that file exists; otherwise it is derived
# lazily from the PossibleKey training samples.
class NaiveBayes # rubocop:disable Metrics/ClassLength
  include Stats

  YAML_PATH = File.join(__dir__, 'data.yml')

  # Loads the precomputed model when data.yml is present.
  def initialize
    load_from_yaml if File.exist?(YAML_PATH)
  end

  # Feature rows of every training sample, grouped by class name.
  #
  # @return [Hash{String => Array<Hash>}] class name => list of feature hashes
  def training_data
    @training_data ||= begin
      PossibleKey.load
      PossibleKey.keys.each_with_object({}) do |key, data|
        key_class = key.key? ? 'key' : 'not_key'
        (data["#{key_class}_#{key.character_set}"] ||= []) << key.features
      end
    end
  end

  # Populates the model from YAML_PATH.
  def load_from_yaml
    data = safe_load_yaml(::File.read(YAML_PATH))

    @feature_set = data[:feature_set]
    @num_classes = data[:num_classes]
    @classes = data[:classes]
    @features = data[:features]
  end

  # Writes the current model (computing it from the training data if needed)
  # to YAML_PATH.
  def save_to_yaml
    File.write(YAML_PATH, {
      feature_set: feature_set,
      num_classes: num_classes,
      classes: classes,
      features: features
    }.to_yaml)
  end

  # @return [Integer] number of classes in the model
  def num_classes
    @num_classes ||= training_data&.length
  end

  # @return [Array<String>] class names
  def classes
    @classes ||= training_data&.keys
  end

  # @return [Array<Symbol>] feature names, taken from the first training row
  def features
    @features ||= training_data.first.last.first.keys
  end

  # Statistical summary of every feature within every class.
  #
  # @return [Hash] class name => feature => { standard_deviation:, mean:, variance: }
  def feature_set
    @feature_set ||= classes.each_with_object({}) do |class_name, set|
      rows = training_data[class_name]

      set[class_name] = features.each_with_object({}) do |feature, summary|
        values = rows.map { |row| row[feature] }

        summary[feature] = {
          standard_deviation: standard_deviation(values),
          mean: mean(values),
          variance: variance(values)
        }
      end
    end
  end

  # Gaussian probability density of +value+ for one feature within one class.
  #
  # @param feature [Symbol] name of the feature under consideration
  # @param value [Numeric] observed value of that feature
  # @param class_name [String] class under consideration
  # @return [Float]
  def feature_probability(feature, value, class_name)
    summary = feature_set[class_name][feature]
    fs_mean = summary[:mean]
    fs_var = summary[:variance]

    # With a standard deviation of 0 the density is undefined (division by
    # zero), so treat the feature as an exact-match test instead.
    return (fs_mean == value ? 1.0 : 0.0) if summary[:standard_deviation] == 0

    exponent = -((value - fs_mean)**2) / (2 * fs_var)
    (1.0 / Math.sqrt(2 * Math::PI * fs_var)) * (Math::E**exponent)
  end

  # Product of the feature probabilities of all given features for a class.
  #
  # @param features [Hash{Symbol => Numeric}] feature name => observed value
  # @param class_name [String]
  # @return [Float]
  def feature_multiplication(features, class_name)
    features.reduce(1.0) do |result, (key, value)|
      result * feature_probability(key, value, class_name)
    end
  end

  # Prints a table with the class probabilities and the individual feature
  # probabilities of +string+ for every class. Needs the terminal-table gem,
  # which is only required here.
  #
  # @param string [String]
  # @return [nil]
  def debug(string) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
    require 'terminal-table'

    key_features = PossibleKey.new(string).features
    table = Terminal::Table.new do |t|
      t << ['classes'] + classes
      t << :separator
      t << ['probabilities'] + classes.map { |c| class_probability(key_features, c) }
      key_features.each do |key, value|
        t << [key] + classes.map { |c| feature_probability(key, value, c).round(4) }
      end
    end
    puts table

    nil
  end

  # Naive Bayesian probability of a set of features belonging to a class.
  # Every class gets the same prior (1 / num_classes); classes whose name
  # starts with "key_" are additionally scaled by
  # 10**Spellr.config.key_heuristic_weight.
  #
  # @param features [Hash{Symbol => Numeric}]
  # @param class_name [String]
  # @return [Float]
  def class_probability(features, class_name)
    class_fraction = 1.0 / num_classes
    feature_bayes = feature_multiplication(features, class_name)
    feature_bayes *= (10**Spellr.config.key_heuristic_weight) if class_name.start_with?('key_')
    feature_bayes * class_fraction
  end

  # This is the method we should be calling!
  # Given a set of feature values, it decides which class to categorize
  # them under.
  #
  # @param features [Hash{Symbol => Numeric}]
  # @return [String] name of the most probable class
  def classify(features)
    classes.max_by do |class_name|
      class_probability(features, class_name)
    end
  end

  # @param string [String]
  # @return [Boolean] whether +string+ is classified into one of the key classes
  def key?(string)
    key_cache[string]
  end

  # Memoizes classification results per string.
  def key_cache
    @key_cache ||= Hash.new do |cache, string|
      cache[string] = classify(PossibleKey.new(string).features).start_with?('key')
    end
  end

  private

  # Parses the model YAML, allowing Symbol in addition to the default safe types.
  # Newer Psych takes the permitted classes as a keyword (and Psych 4 rejects
  # the positional form); older Psych only knows the positional form.
  def safe_load_yaml(yaml)
    if YAML.method(:safe_load).parameters.include?(%i[key permitted_classes])
      YAML.safe_load(yaml, permitted_classes: [Symbol])
    else
      YAML.safe_load(yaml, [Symbol])
    end
  end
end

data/lib/spellr/key_tuner/possible_key.rb ADDED Viewed

@@ -0,0 +1,170 @@
+# frozen_string_literal: true
+require 'pathname'
+require_relative 'stats'
# A candidate string plus the numeric features the key classifier works on.
# Samples read from the training files also carry the known answer (key or
# not a key).
class PossibleKey # rubocop:disable Metrics/ClassLength
  include Stats

  SIGNIFICANT_LETTERS = %i[+ - _ / A z Z q Q X x].freeze

  # Zero count for every character a supported string may contain.
  LETTER_COUNT_HASH = (('A'..'Z').to_a + ('a'..'z').to_a + ('0'..'9').to_a + %w[+ _ / - =])
    .map { |letter| [letter.to_sym, 0] }.to_h.freeze

  VOWELS = %i[a e i o u A E I O U].freeze
  CONSONANTS = %i[
    b c d f g h j k l m n p q r s t v w x y z
    B C D F G H J K L M N P Q R S T V W X Y Z
  ].freeze

  class << self
    # @return [Array<PossibleKey>, nil] samples read by the last call to load
    attr_reader :keys

    # Reads the training samples from data/false_positives.txt (not keys) and
    # data/keys.txt (keys), both relative to this file.
    #
    # @return [Array<PossibleKey>]
    def load
      @keys = load_samples('false_positives.txt', false) + load_samples('keys.txt', true)
    end

    private

    # One PossibleKey per non-empty line of the given data file.
    def load_samples(filename, key)
      samples = []
      Pathname.new(__dir__).join('data', filename).each_line do |line|
        line = line.chomp
        samples << new(line, key) unless line.empty?
      end
      samples
    end
  end

  attr_reader :string

  # @param string [String] the candidate
  # @param key [Boolean, nil] known classification, nil when unknown
  def initialize(string, key = nil)
    @string = string
    @key = key
  end

  # Feature vector of the string. The order of the entries is kept stable
  # because the classifier multiplies per-feature probabilities in this order.
  #
  # @return [Hash{Symbol => Numeric}]
  # @raise [RuntimeError] when the string is not in a supported character set
  def features # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
    # Validate first: letter_count cannot count characters outside
    # LETTER_COUNT_HASH, and this raises the descriptive error instead.
    character_set

    {
      **significant_letter_frequency_difference,
      equal: letter_count[:'='],
      length: length,
      hex: character_set == :hex ? 1 : 0,
      lower36: character_set == :lower36 ? 1 : 0,
      upper36: character_set == :upper36 ? 1 : 0,
      base64: character_set == :base64 ? 1 : 0,
      mean_title_chunk_size: mean(title_chunks, &:length),
      variance_title_chunk_size: variance(title_chunks, &:length),
      max_title_chunk_size: max(title_chunks, &:length),
      mean_lower_chunk_size: mean(lower_chunks, &:length),
      variance_lower_chunk_size: variance(lower_chunks, &:length),
      mean_upper_chunk_size: mean(upper_chunks, &:length),
      variance_upper_chunk_size: variance(upper_chunks, &:length),
      mean_alpha_chunk_size: mean(alpha_chunks, &:length),
      variance_alpha_chunk_size: variance(alpha_chunks, &:length),
      mean_alnum_chunk_size: mean(alnum_chunks, &:length),
      variance_alnum_chunk_size: variance(alnum_chunks, &:length),
      mean_digit_chunk_size: mean(digit_chunks, &:length),
      variance_digit_chunk_size: variance(digit_chunks, &:length),
      vowel_consonant_ratio: vowel_consonant_ratio,
      alpha_chunks: alpha_chunks.length,
      alnum_chunks: alnum_chunks.length,
      digit_chunks: digit_chunks.length,
      title_chunks: title_chunks.length,
      mean_letter_frequency_difference: mean(letter_frequency_difference.values),
      # NOTE(review): despite its name this entry holds the maximum, not the
      # variance. Left unchanged because data.yml was presumably trained with
      # it - confirm before changing.
      variance_letter_frequency_difference: max(letter_frequency_difference.values)
    }
  end

  # @return [Boolean, nil] the classification given to the constructor
  def key?
    @key
  end

  def length
    string.length
  end

  # letter_frequency_difference restricted to SIGNIFICANT_LETTERS, in the
  # order of SIGNIFICANT_LETTERS (the same result Hash#slice gives, without
  # needing Ruby 2.5).
  #
  # @return [Hash{Symbol => Float}]
  def significant_letter_frequency_difference
    difference = letter_frequency_difference

    SIGNIFICANT_LETTERS.each_with_object({}) do |letter, significant|
      significant[letter] = difference[letter] if difference.key?(letter)
    end
  end

  # @return [Symbol] :hex, :lower36, :upper36 or :base64
  # @raise [RuntimeError] when the string fits none of them
  def character_set
    # \A and \z anchor the whole string; ^ and $ would accept any single
    # matching line of a multi-line string.
    @character_set ||= case string
    when /\A[a-fA-F0-9\-]+\z/ then :hex
    when /\A[a-z0-9]+\z/ then :lower36
    when /\A[A-Z0-9]+\z/ then :upper36
    when %r{\A[A-Za-z0-9\-_+/]+={0,2}\z} then :base64
    else
      raise "#{string.inspect} is an unrecognised character set"
    end
  end

  # @return [Integer] alphabet size of the character set
  def character_set_total
    case character_set
    when :hex then 16
    when :lower36 then 36
    when :upper36 then 36
    when :base64 then 64
    end
  end

  # NOTE(review): this evaluates to length / alphabet size, i.e. an expected
  # count per letter, yet letter_frequency_difference subtracts it from
  # relative frequencies. Left unchanged because data.yml was presumably
  # trained with it - confirm before changing.
  def ideal_letter_frequency
    1.0 / character_set_total * length
  end

  # @return [Hash{Symbol => Integer}] occurrences of every letter of
  #   LETTER_COUNT_HASH in the string
  def letter_count
    @letter_count ||= string.each_char.with_object(LETTER_COUNT_HASH.dup) do |letter, counts|
      counts[letter.to_sym] += 1
    end
  end

  # @return [Hash{Symbol => Float}] letter_count divided by the string length
  def letter_frequency
    @letter_frequency ||= letter_count.each_with_object({}) do |(letter, count), frequency|
      frequency[letter] = count.to_f / string.length
    end
  end

  # @return [Hash{Symbol => Float}] absolute distance of every letter
  #   frequency from ideal_letter_frequency
  def letter_frequency_difference
    @letter_frequency_difference ||= begin
      ideal = ideal_letter_frequency

      letter_frequency.each_with_object({}) do |(letter, frequency), difference|
        difference[letter] = (frequency - ideal).abs
      end
    end
  end

  # NOTE(review): both operands are Integers, so this is integer division and
  # the "ratio" is truncated. Left unchanged because data.yml was presumably
  # trained with it - confirm before changing.
  def vowel_consonant_ratio
    vowels = letter_count.fetch_values(*VOWELS).sum
    consonants = letter_count.fetch_values(*CONSONANTS).sum

    vowels / (consonants.nonzero? || 1)
  end

  # Runs of digits.
  def digit_chunks
    @digit_chunks ||= string.scan(/\d+/)
  end

  # An uppercase letter followed by one or more lowercase letters.
  def title_chunks
    @title_chunks ||= string.scan(/[A-Z][a-z]+/)
  end

  # Runs of lowercase letters.
  def lower_chunks
    @lower_chunks ||= string.scan(/[a-z]+/)
  end

  # Runs of uppercase letters.
  def upper_chunks
    @upper_chunks ||= string.scan(/[A-Z]+/)
  end

  # Runs of letters.
  def alpha_chunks
    @alpha_chunks ||= string.scan(/[A-Za-z]+/)
  end

  # Runs of letters and digits.
  def alnum_chunks
    @alnum_chunks ||= string.scan(/[A-Za-z0-9]+/)
  end
end

data/lib/spellr/key_tuner/stats.rb ADDED Viewed

@@ -0,0 +1,33 @@
+# frozen_string_literal: true
# Small descriptive-statistics helpers, meant to be mixed in with `include`.
#
# Every method takes a collection and an optional block that maps each
# element to the number to use (e.g. `&:length`). For an empty collection
# every method that returns early yields the Integer 0.
module Stats
  # Arithmetic mean.
  #
  # @param values [Array]
  # @return [Float, Integer] 0 when values is empty
  def mean(values, &block)
    return 0 if values.empty?

    values.sum(&block).to_f / values.length
  end

  # Smallest (mapped) value.
  #
  # @param values [Array]
  # @return [Object, Integer] 0 when values is empty
  def min(values, &block)
    return 0 if values.empty?

    (block ? values.map(&block) : values).min
  end

  # Largest (mapped) value.
  #
  # @param values [Array]
  # @return [Object, Integer] 0 when values is empty
  def max(values, &block)
    return 0 if values.empty?

    (block ? values.map(&block) : values).max
  end

  # Population variance (divides by the number of values).
  #
  # @param values [Array]
  # @return [Float, Integer] 0 when values is empty
  def variance(values, &block)
    return 0 if values.empty?

    samples = block ? values.map(&block) : values
    # computed once; recomputing it per sample made this quadratic
    average = mean(samples)

    samples.sum { |sample| (average - sample)**2 }.to_f / samples.length
  end

  # Population standard deviation: the square root of variance.
  #
  # @param values [Array]
  # @return [Float]
  def standard_deviation(values, &block)
    Math.sqrt(variance(values, &block))
  end
end

data/lib/spellr/language.rb CHANGED Viewed

@@ -7,26 +7,39 @@ module Spellr
     attr_reader :name
     attr_reader :key
-    def initialize(name, # rubocop:disable Metrics/ParameterLists
+    def initialize(name, # rubocop:disable Metrics/ParameterLists, Metrics/MethodLength
       key: name[0],
       generate: nil,
       only: [],
+      includes: [],
       description: '',
       hashbangs: [])
+      unless only.empty?
+        warn <<~WARNING
+          \e[33mSpellr: `only:` language yaml key with a list of fnmatch rules is deprecated.
+          Please use `includes:` instead, which uses gitignore-inspired rules.
+          see github.com/robotdana/fast_ignore#using-an-includes-list for details\e[0m
+        WARNING
+      end
       @name = name
       @key = key
       @description = description
       @generate = generate
-      @only = only
+      @includes = only + includes
       @hashbangs = hashbangs
     end
     def matches?(file)
-      return true if @only.empty?
+      return true if @includes.empty?
+      return true if fast_ignore.allowed?(file.to_s)
       file = Spellr::File.wrap(file)
-      return true if @only.any? { |o| file.fnmatch?(o) }
-      return true if file.hashbang && @hashbangs.any? { |h| file.hashbang.include?(h) }
+      return true if !@hashbangs.empty? && file.hashbang && @hashbangs.any? { |h| file.hashbang.include?(h) }
+    end
+    def fast_ignore
+      @fast_ignore ||= FastIgnore.new(include_rules: @includes, gitignore: false)
     end
     def wordlists
@@ -41,6 +54,8 @@ module Spellr
       require 'shellwords'
       warn "Generating wordlist for #{name}"
+      generated_project_wordlist.touch
       Spellr::CLI.new(generate.shellsplit)
       default_wordlists

data/lib/spellr/line_tokenizer.rb CHANGED Viewed

@@ -4,6 +4,7 @@ require 'strscan'
 require_relative '../spellr'
 require_relative 'column_location'
 require_relative 'token'
+require_relative 'key_tuner/naive_bayes'
 module Spellr
   class LineTokenizer < StringScanner # rubocop:disable Metrics/ClassLength
@@ -63,85 +64,135 @@ module Spellr
     end
     def next_term
-      return if eos?
-      (skip_nonwords_and_flags && next_term) || scan_term || next_term
+      if skip_nonwords_and_flags
+        nil
+      else
+        scan_term
+      end
     end
+    # [Word], [Word]Word [Word]'s [Wordn't]
+    TITLE_CASE_RE = /[[:upper:]][[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
+    # [WORD] [WORD]Word [WORDN'T] [WORD]'S [WORD]'s [WORD]s
+    UPPER_CASE_RE = /[[:upper:]]+(?:['’][[:upper:]]+(?<!['’][Ss]))*(?:(?![[:lower:]])|(?=s(?![[:lower:]])))/.freeze
+    # [word] [word]'s [wordn't]
+    LOWER_CASE_RE = /[[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
+    # for characters in [:alpha:] that aren't in [:lower:] or [:upper:] e.g. Arabic
+    OTHER_CASE_RE = /(?:[[:alpha:]](?<![[:lower:][:upper:]]))+/.freeze
+    TERM_RE = Regexp.union(TITLE_CASE_RE, UPPER_CASE_RE, LOWER_CASE_RE, OTHER_CASE_RE)
     def scan_term
-      term = title_case || lower_case || upper_case || other_case
+      term = scan(TERM_RE)
       return term if term && term.length >= Spellr.config.word_minimum_length
     end
     NOT_EVEN_NON_WORDS_RE = %r{[^[:alpha:]/%#0-9\\]+}.freeze # everything not covered by more specific skips/scans
-    LEFTOVER_NON_WORD_BITS_RE = %r{[/%#0-9\\]}.freeze # e.g. a / not starting //a-url.com
+    LEFTOVER_NON_WORD_BITS_RE = %r{[/%#\\]|\d+}.freeze # e.g. a / not starting //a-url.com
     HEX_RE = /(?:#(?:\h{6}|\h{3})|0x\h+)(?![[:alpha:]])/.freeze
-    SHELL_COLOR_ESCAPE_RE = /\\(e|033)\[\d+(;\d+)*m/.freeze
+    SHELL_COLOR_ESCAPE_RE = /\\(?:e|0?33)\[\d+(;\d+)*m/.freeze
+    PUNYCODE_RE = /xn--[a-v0-9\-]+(?:[[:alpha:]])/.freeze
     BACKSLASH_ESCAPE_RE = /\\[a-zA-Z]/.freeze # TODO: hex escapes e.g. \xAA. TODO: language aware escapes
     REPEATED_SINGLE_LETTERS_RE = /(?:([[:alpha:]])\1+)(?![[:alpha:]])/.freeze # e.g. xxxxxxxx (it's not a word)
-    # https://developer.mozilla.org/en-US/docs/Glossary/percent-encoding
-    # Only the necessary percent encoding that actually ends in letters
-    # URL_ENCODED_ENTITIES_RE = /%(3A|2F|3F|5B|5D|%2A|%2B|%2C|%3B|%3D)/i.freeze
     URL_ENCODED_ENTITIES_RE = /%[0-8A-F]{2}/.freeze
     # There's got to be a better way of writing this
-    SEQUENTIAL_LETTERS_RE = /a(b(c(d(e(f(g(h(i(j(k(l(m(n(o(p(q(r(s(t(u(v(w(x(y(z)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?(?![[:alpha:]])/i.freeze # rubocop:disable Metrics/LineLength
-    def skip_nonwords # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
-      skip(NOT_EVEN_NON_WORDS_RE) ||
-        skip(SHELL_COLOR_ESCAPE_RE) ||
-        skip(BACKSLASH_ESCAPE_RE) ||
-        skip(URL_ENCODED_ENTITIES_RE) ||
-        skip(HEX_RE) ||
-        skip_key_heuristically ||
-        skip_uri_heuristically ||
-        skip(LEFTOVER_NON_WORD_BITS_RE) ||
-        skip(REPEATED_SINGLE_LETTERS_RE) ||
-        skip(SEQUENTIAL_LETTERS_RE)
-    end
+    SEQUENTIAL_LETTERS_RE = /a(?:b(?:c(?:d(?:e(?:f(?:g(?:h(?:i(?:j(?:k(?:l(?:m(?:n(?:o(?:p(?:q(?:r(?:s(?:t(?:u(?:v(?:w(?:x(?:yz?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?(?![[:alpha:]])/i.freeze # rubocop:disable Metrics/LineLength
     # I didn't want to do this myself. BUT i need something to heuristically match on, and it's difficult
-    URL_RE = %r{
-      (//|https?://|s?ftp://|mailto:)? # 0 scheme
-      ([[:alnum:]]+(?::[[:alnum:]]+)?@)? # 1 userinfo
-      (?:(?:[[:alnum:]-]+(?:\\?\.[[:alnum:]-]+)+|localhost|\d{1,3}(?:.\d{1,3}){3})) # 2 hostname
-      (?::\d+)? # 3 port
-      (/(?:[[:alnum:]=!$&\-/._\\]|%\h{2})+)? # 4 path
-      (?:\?(?:[[:alnum:]=!$\-/.\\]|%\h{2})+(?:&(?:[[:alnum:]=!$\-/.\\]|%\h{2})+)*)? # 5 query
-      (?:\#(?:[[:alnum:]=!$&\-/.\\]|%\h{2})+)? # 6 fragment
-    }x.freeze
-    def skip_uri_heuristically
-      return unless skip_uri?
-      return unless scan(URL_RE)
-      heuristic_failed = if RUBY_VERSION >= '2.5'
-        captures.all?(&:empty?)
+    URL_SCHEME = '(//|https?://|s?ftp://|mailto:)'
+    URL_USERINFO = '([[:alnum:]]+(?::[[:alnum:]]+)?@)'
+    URL_HOSTNAME = '((?:[[:alnum:]-]+(?:\\\\?\\.[[:alnum:]-]+)+|localhost|\\d{1,3}(?:\\.\\d{1,3}){3}))'
+    URL_PORT = '(:\\d+)'
+    URL_PATH = '(/(?:[[:alnum:]=@!$&\\-/._\\\\]|%\h{2})+)'
+    URL_QUERY = '(\\?(?:[[:alnum:]=!$\\-/.\\\\]|%\\h{2})+(?:&(?:[[:alnum:]=!$\\-/.\\\\]|%\\h{2})+)*)'
+    URL_FRAGMENT = '(\\#(?:[[:alnum:]=!$&\\-/.\\\\]|%\\h{2})+)'
+    URL_RE = /
+      (?:
+        #{URL_SCHEME}#{URL_USERINFO}?#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?
+        |
+        #{URL_SCHEME}?#{URL_USERINFO}#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?
+        |
+        #{URL_SCHEME}?#{URL_USERINFO}?#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}
+      )
+      #{URL_QUERY}?#{URL_FRAGMENT}?
+    /x.freeze
+    KNOWN_KEY_PATTERNS_RE = %r{(
+      SG\.[\w\-]{22}\.[\w\-]{43} | # sendgrid
+      prg-\h{8}-\h{4}-\h{4}-\h{4}-\h{12} | # hyperwallet
+      GTM-[A-Z0-9]{7} | # google tag manager
+      sha1-[A-Za-z0-9=+/]{28} |
+      sha512-[A-Za-z0-9=+/]{88} |
+      data:[a-z/;0-9\-]+;base64,[A-Za-z0-9+/]+=*(?![[:alnum:]])
+    )}x.freeze
+    SKIPS = Regexp.union(
+      NOT_EVEN_NON_WORDS_RE,
+      SHELL_COLOR_ESCAPE_RE,
+      BACKSLASH_ESCAPE_RE,
+      URL_ENCODED_ENTITIES_RE,
+      HEX_RE,
+      URL_RE, # 2%
+      KNOWN_KEY_PATTERNS_RE
+    ).freeze
+    AFTER_KEY_SKIPS = Regexp.union(
+      LEFTOVER_NON_WORD_BITS_RE,
+      REPEATED_SINGLE_LETTERS_RE,
+      SEQUENTIAL_LETTERS_RE
+    )
+    def skip_nonwords
+      skip(SKIPS) ||
+        skip_key_heuristically || # 5%
+        skip(AFTER_KEY_SKIPS)
+    end
+    KEY_RE = %r{[A-Za-z0-9]([A-Za-z0-9+/\-_]*)=*(?![[:alnum:]])}.freeze
+    N = NaiveBayes.new
+    def skip_key_heuristically # rubocop:disable Metrics/MethodLength
+      return unless scan(KEY_RE)
+      # I've come across some large base64 strings; by this point they're definitely base64.
+      return true if matched.length > 200
+      if key_roughly?(matched)
+        if N.key?(matched)
+          true
+        else
+          unscan
+          false
+        end
       else
-        # unfortunately i have to match this regex again because stringscanner doesn't give me matchdata
-        matched.match(URL_RE).captures.compact.all?(&:empty?)
+        unscan
+        false
       end
-      unscan && false if heuristic_failed
     end
-    # url unsafe base64 or url safe base64
-    # TODO: character distribution heuristic
-    KEY_FULL_RE = %r{([A-Za-z\d+/]|[A-Za-z\d\-_])+[=.]*}.freeze
-    KEY_RE = %r{
-      (?:
-        [A-Za-z\-_+/=]+|
-        [\d\-_+/=]+
-      )
-    }x.freeze
-    def skip_key_heuristically
-      return unless skip_key?
-      return unless match?(KEY_FULL_RE)
-      # can't use regular captures because repeated capture groups don't
-      matches = matched.scan(KEY_RE)
-      return unless matches.length >= 3 # number chosen arbitrarily
-      skip(KEY_FULL_RE)
+    # this is in a method because the minimum word length stuff was throwing it off
+    # TODO: move to config maybe?
+    def min_alpha_re
+      /(?:
+        [A-Z][a-z]{#{Spellr.config.word_minimum_length - 1}}
+        |
+        [a-z]{#{Spellr.config.word_minimum_length}}
+        |
+        [A-Z]{#{Spellr.config.word_minimum_length}}
+      )/x.freeze
+    end
+    ALPHA_SEP_RE = '[A-Za-z][A-Za-z\\-_/+]*'
+    NUM_SEP_RE = '\\d[\\d\\-_/+]*'
+    THREE_CHUNK_RE = /^(?:
+      #{ALPHA_SEP_RE}#{NUM_SEP_RE}#{ALPHA_SEP_RE}
+      |
+      #{NUM_SEP_RE}#{ALPHA_SEP_RE}#{NUM_SEP_RE}
+    )/x.freeze
+    def key_roughly?(matched)
+      return unless matched.length >= Spellr.config.key_minimum_length
+      return unless matched.match?(THREE_CHUNK_RE)
+      return unless matched.match?(min_alpha_re) # or there's no point
+      true
     end
     # jump to character-aware position
@@ -149,37 +200,17 @@ module Spellr
       skip(/.{#{new_charpos - charpos}}/m)
     end
-    # [Word], [Word]Word [Word]'s [Wordn't]
-    TITLE_CASE_RE = /[[:upper:]][[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
-    def title_case
-      scan(TITLE_CASE_RE)
-    end
-    # [word] [word]'s [wordn't]
-    LOWER_CASE_RE = /[[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
-    def lower_case
-      scan(LOWER_CASE_RE)
-    end
-    # [WORD] [WORD]Word [WORDN'T] [WORD]'S [WORD]'s [WORD]s
-    UPPER_CASE_RE = /[[:upper:]]+(?:['’][[:upper:]]+(?<!['’][Ss]))*((?![[:lower:]])|(?=s(?![[:lower:]])))/.freeze
-    def upper_case
-      scan(UPPER_CASE_RE)
-    end
-    # for characters in [:alpha:] that aren't in [:lower:] or [:upper:] e.g. Arabic
-    OTHER_CASE_RE = /[[:alpha:]]+/.freeze
-    def other_case
-      scan(OTHER_CASE_RE)
-    end
     SPELLR_DISABLE_RE = /spellr:disable/.freeze
     def skip_and_track_disable
+      return if disabled?
       skip(SPELLR_DISABLE_RE) && self.disabled = true
     end
     SPELLR_ENABLE_RE = /spellr:enable/.freeze
     def skip_and_track_enable
+      return unless disabled?
       skip(SPELLR_ENABLE_RE) && self.disabled = false
     end
   end

data/lib/spellr/string_format.rb CHANGED Viewed

@@ -10,10 +10,9 @@ module Spellr
       "#{count} #{word}#{'s' if count != 1}"
     end
+    # TODO: make it work without color
     def color_enabled?
-      return $stdout.tty? if Spellr.config.color.nil?
-      Spellr.config.color
+      true
     end
     def aqua(text)
@@ -39,5 +38,11 @@ module Spellr
       "\e[1;31m#{text}#{normal}"
     end
+    def green(text)
+      return text unless Spellr::StringFormat.color_enabled?
+      "\e[1;32m#{text}#{normal}"
+    end
   end
 end