spellr 0.3.2 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,162 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'possible_key'
4
+ require_relative 'stats'
5
+ require 'yaml'
6
+ # this is lifted in whole from this article. i don't understand the maths and i don't want to
7
+ # https://www.sitepoint.com/machine-learning-ruby-naive-bayes-theorem/
8
+
9
# Gaussian naive Bayes classifier used to decide whether a string looks like
# a key (API token, hash, ...) rather than a word.
#
# Classes are named "<key|not_key>_<character set>". For every class and
# feature the classifier keeps the mean, variance and standard deviation of
# the training values; these are either loaded from data.yml (when it exists
# next to this file) or computed lazily from the PossibleKey training data.
#
# The approach follows
# https://www.sitepoint.com/machine-learning-ruby-naive-bayes-theorem/
class NaiveBayes # rubocop:disable Metrics/ClassLength
  include Stats

  YAML_PATH = File.join(__dir__, 'data.yml')

  # Uses the precomputed statistics in data.yml when the file is present;
  # otherwise everything is derived from the training data on first use.
  def initialize
    load_from_yaml if File.exist?(YAML_PATH)
  end

  # Feature rows of every labelled example, grouped by class name.
  #
  # @return [Hash{String => Array<Hash>}]
  def training_data
    @training_data ||= begin
      PossibleKey.load
      PossibleKey.keys.each_with_object({}) do |key, data|
        key_class = key.key? ? 'key' : 'not_key'
        (data["#{key_class}_#{key.character_set}"] ||= []) << key.features
      end
    end
  end

  # Restores the state written by #save_to_yaml.
  def load_from_yaml
    data = safe_load_yaml(::File.read(YAML_PATH))

    @feature_set = data[:feature_set]
    @num_classes = data[:num_classes]
    @classes = data[:classes]
    @features = data[:features]
  end

  # Writes the computed statistics to data.yml so later runs can skip training.
  def save_to_yaml
    File.write(YAML_PATH, {
      feature_set: feature_set,
      num_classes: num_classes,
      classes: classes,
      features: features
    }.to_yaml)
  end

  # @return [Integer] how many classes there are
  def num_classes
    @num_classes ||= training_data.length
  end

  # @return [Array<String>] the class names
  def classes
    @classes ||= training_data.keys
  end

  # @return [Array<Symbol>] the feature names, taken from the first training row
  def features
    @features ||= training_data.first.last.first.keys
  end

  # Per class, per feature statistics of the training values.
  #
  # @return [Hash] class name => feature => { standard_deviation:, mean:, variance: }
  def feature_set # rubocop:disable Metrics/MethodLength
    @feature_set ||= classes.each_with_object({}) do |class_name, stats_by_class|
      rows = training_data[class_name]

      stats_by_class[class_name] = features.each_with_object({}) do |feature, stats|
        values = rows.map { |row| row[feature] }
        # the standard deviation is the square root of the variance,
        # so the variance only has to be computed once
        spread = variance(values)

        stats[feature] = {
          standard_deviation: Math.sqrt(spread),
          mean: mean(values),
          variance: spread
        }
      end
    end
  end

  # Given a class, determines the probability of a certain value occurring
  # for a given feature, using the gaussian density of the training values.
  #
  # @param feature [Symbol] name of the feature in consideration
  # @param value [Numeric] the value of the feature we want the probability of
  # @param class_name [String] name of the class in consideration
  # @return [Float]
  def feature_probability(feature, value, class_name) # rubocop:disable Metrics/AbcSize
    stats = feature_set[class_name][feature]
    fs_mean = stats[:mean]
    fs_var = stats[:variance]

    # edge case: with a 0 standard deviation every training value was the
    # same, so the only possible value is the mean itself
    if stats[:standard_deviation].zero?
      return fs_mean == value ? 1.0 : 0.0
    end

    # gaussian probability
    exponent = -((value - fs_mean)**2) / (2 * fs_var)
    (1.0 / Math.sqrt(2 * Math::PI * fs_var)) * (Math::E**exponent)
  end

  # Multiplies together the feature probabilities for all of the
  # features in a class for the given values.
  #
  # @param features [Hash{Symbol => Numeric}] feature name => value
  # @param class_name [String]
  # @return [Float]
  def feature_multiplication(features, class_name)
    features.reduce(1.0) do |result, (key, value)|
      result * feature_probability(key, value, class_name)
    end
  end

  # Prints a table with the class probabilities and the per-feature
  # probabilities of the given string. Returns nil.
  def debug(string) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
    # only loaded here, when debugging
    require 'terminal-table'

    features = PossibleKey.new(string).features

    table = Terminal::Table.new do |t|
      t << ['classes'] + classes
      t << :separator
      t << ['probabilities'] + classes.map { |c| class_probability(features, c) }
      features.each do |key, value|
        t << [key] + classes.map { |c| feature_probability(key, value, c).round(4) }
      end
    end
    puts table

    nil
  end

  # This is where we compute the final naive Bayesian probability
  # for a given set of features being a part of a given class.
  # Classes whose name starts with "key_" are scaled by
  # 10**Spellr.config.key_heuristic_weight.
  def class_probability(features, class_name)
    class_fraction = 1.0 / num_classes
    feature_bayes = feature_multiplication(features, class_name)
    feature_bayes *= (10**Spellr.config.key_heuristic_weight) if class_name.start_with?('key_')
    feature_bayes * class_fraction
  end

  # This is the method we should be calling!
  # Given a set of feature values, it decides
  # what class to categorize them under.
  #
  # @return [String] the most probable class name
  def classify(features)
    classes.max_by do |class_name|
      class_probability(features, class_name)
    end
  end

  # @return [Boolean] whether the string is classified as a key (memoized per string)
  def key?(string)
    key_cache[string]
  end

  # Hash that classifies a string the first time it is looked up.
  def key_cache
    @key_cache ||= Hash.new do |cache, string|
      cache[string] = classify(PossibleKey.new(string).features).start_with?('key')
    end
  end

  private

  # Parses yaml allowing Symbols, on both old and new Psych versions.
  # Newer Psych only accepts the permitted classes as a keyword argument,
  # while older versions only know the positional form.
  def safe_load_yaml(yaml)
    if YAML.method(:safe_load).parameters.include?(%i[key permitted_classes])
      YAML.safe_load(yaml, permitted_classes: [Symbol])
    else
      YAML.safe_load(yaml, [Symbol])
    end
  end
end
@@ -0,0 +1,170 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+ require_relative 'stats'
5
+
6
# A string that may or may not be a key (API token, hash, ...), together
# with the numeric features the classifier derives from it.
class PossibleKey # rubocop:disable Metrics/ClassLength
  include Stats

  class << self
    # The labelled examples read by the last call to .load
    attr_reader :keys

    # Reads the labelled examples stored next to this file:
    # data/false_positives.txt (not keys) and data/keys.txt (keys).
    def load
      @keys = []

      load_examples('false_positives.txt', false)
      load_examples('keys.txt', true)
    end

    private

    # Appends one PossibleKey per non-blank line of data/<filename>.
    def load_examples(filename, key)
      Pathname.new(__dir__).join('data', filename).each_line do |line|
        example = line.chomp
        next if example.empty?

        keys << PossibleKey.new(example, key)
      end
    end
  end

  attr_reader :string

  # @param string [String] the candidate text
  # @param key [Boolean, nil] the known label for training examples, nil when unknown
  def initialize(string, key = nil)
    @string = string
    @key = key
  end

  # The numeric features of the string, keyed by feature name.
  #
  # @return [Hash{Symbol => Numeric}]
  # @raise [RuntimeError] when the string is not in a recognised character set
  def features # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
    {
      **significant_letter_frequency_difference,
      equal: letter_count[:'='],
      length: length,
      hex: character_set == :hex ? 1 : 0,
      lower36: character_set == :lower36 ? 1 : 0,
      upper36: character_set == :upper36 ? 1 : 0,
      base64: character_set == :base64 ? 1 : 0,
      mean_title_chunk_size: mean(title_chunks, &:length),
      variance_title_chunk_size: variance(title_chunks, &:length),
      max_title_chunk_size: max(title_chunks, &:length),
      mean_lower_chunk_size: mean(lower_chunks, &:length),
      variance_lower_chunk_size: variance(lower_chunks, &:length),
      mean_upper_chunk_size: mean(upper_chunks, &:length),
      variance_upper_chunk_size: variance(upper_chunks, &:length),
      mean_alpha_chunk_size: mean(alpha_chunks, &:length),
      variance_alpha_chunk_size: variance(alpha_chunks, &:length),
      mean_alnum_chunk_size: mean(alnum_chunks, &:length),
      variance_alnum_chunk_size: variance(alnum_chunks, &:length),
      mean_digit_chunk_size: mean(digit_chunks, &:length),
      variance_digit_chunk_size: variance(digit_chunks, &:length),
      vowel_consonant_ratio: vowel_consonant_ratio,
      alpha_chunks: alpha_chunks.length,
      alnum_chunks: alnum_chunks.length,
      digit_chunks: digit_chunks.length,
      title_chunks: title_chunks.length,
      mean_letter_frequency_difference: mean(letter_frequency_difference.values),
      # NOTE(review): despite the name this holds the max, not the variance.
      # Left as is because stored statistics were presumably generated with
      # it; changing it would need the model regenerated - confirm.
      variance_letter_frequency_difference: max(letter_frequency_difference.values)
    }
  end

  # The label given at construction (nil when unknown).
  def key?
    @key
  end

  def length
    string.length
  end

  SIGNIFICANT_LETTERS = %i[+ - _ / A z Z q Q X x].freeze

  # The letter frequency differences of just the SIGNIFICANT_LETTERS,
  # in SIGNIFICANT_LETTERS order (the same result as Hash#slice, but
  # available on every ruby version).
  def significant_letter_frequency_difference
    differences = letter_frequency_difference

    SIGNIFICANT_LETTERS.each_with_object({}) do |letter, picked|
      picked[letter] = differences[letter] if differences.key?(letter)
    end
  end

  # The narrowest character set the whole string fits in.
  # \A and \z anchor the entire string; ^ and $ would accept a string
  # as soon as any single line of it matched.
  #
  # @return [Symbol] :hex, :lower36, :upper36 or :base64
  # @raise [RuntimeError] when the string fits none of them
  def character_set
    @character_set ||= case string
    when /\A[a-fA-F0-9\-]+\z/ then :hex
    when /\A[a-z0-9]+\z/ then :lower36
    when /\A[A-Z0-9]+\z/ then :upper36
    when %r{\A[A-Za-z0-9\-_+/]+={0,2}\z} then :base64
    else
      raise "#{string.inspect} is an unrecognised character set"
    end
  end

  # How many distinct characters the character set has.
  def character_set_total
    case character_set
    when :hex then 16
    when :lower36 then 36
    when :upper36 then 36
    when :base64 then 64
    end
  end

  # NOTE(review): this is length / character_set_total, i.e. an expected
  # count per character, while letter_frequency is a fraction of the length.
  # Left as is because it feeds the features the model uses - confirm intent.
  def ideal_letter_frequency
    1.0 / character_set_total * length
  end

  # Zero count for every character that can be counted. Frozen so it can
  # only be used through a copy.
  LETTER_COUNT_HASH = (('A'..'Z').to_a + ('a'..'z').to_a + ('0'..'9').to_a + %w[+ _ / - =])
    .map { |k| [k.to_sym, 0] }.to_h.freeze

  # How often each character of LETTER_COUNT_HASH occurs in the string.
  def letter_count
    @letter_count ||= string.each_char.with_object(LETTER_COUNT_HASH.dup) do |letter, counts|
      counts[letter.to_sym] += 1
    end
  end

  # Each character count divided by the string length.
  def letter_frequency
    @letter_frequency ||= letter_count.transform_values { |count| count.to_f / string.length }
  end

  # Absolute distance of each letter frequency from ideal_letter_frequency.
  def letter_frequency_difference
    @letter_frequency_difference ||= begin
      ideal = ideal_letter_frequency
      letter_frequency.transform_values { |frequency| (frequency - ideal).abs }
    end
  end

  VOWELS = %i[a e i o u A E I O U].freeze
  CONSONANTS = %i[b c d f g h j k l m n p q r s t v w x y z B C D F G H J K L M N P Q R S T V W X Y Z].freeze

  # Vowel count divided by consonant count (by 1 when there are no consonants).
  # NOTE(review): both counts are Integers so this is an integer division
  # and the result is floored. Left as is because it feeds the features the
  # model uses - confirm before changing.
  def vowel_consonant_ratio
    vowels = letter_count.fetch_values(*VOWELS).sum
    consonants = letter_count.fetch_values(*CONSONANTS).sum
    vowels / (consonants.nonzero? || 1)
  end

  # Runs of digits.
  def digit_chunks
    @digit_chunks ||= string.scan(/\d+/)
  end

  # An uppercase letter followed by lowercase letters.
  def title_chunks
    @title_chunks ||= string.scan(/[A-Z][a-z]+/)
  end

  # Runs of lowercase letters.
  def lower_chunks
    @lower_chunks ||= string.scan(/[a-z]+/)
  end

  # Runs of uppercase letters.
  def upper_chunks
    @upper_chunks ||= string.scan(/[A-Z]+/)
  end

  # Runs of letters.
  def alpha_chunks
    @alpha_chunks ||= string.scan(/[A-Za-z]+/)
  end

  # Runs of letters and digits.
  def alnum_chunks
    @alnum_chunks ||= string.scan(/[A-Za-z0-9]+/)
  end
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
# Small statistics helpers.
#
# Every method takes a collection and an optional block. When a block is
# given, each element is passed through it before being measured,
# e.g. `mean(words, &:length)`. An empty collection always gives 0.
module Stats
  # Arithmetic mean of the (mapped) values, as a Float.
  def mean(values, &block)
    return 0 if values.empty?

    values.sum(&block).to_f / values.length
  end

  # Smallest (mapped) value.
  def min(values, &block)
    return 0 if values.empty?

    block ||= :itself.to_proc
    block.call(values.min_by(&block))
  end

  # Largest (mapped) value.
  def max(values, &block)
    return 0 if values.empty?

    block ||= :itself.to_proc
    block.call(values.max_by(&block))
  end

  # Population variance of the (mapped) values: the mean of the squared
  # distances from the mean.
  def variance(values, &block)
    return 0 if values.empty?

    samples = block ? values.map(&block) : values
    # computed once up front; recomputing it for every sample made this quadratic
    average = mean(samples)

    samples.sum { |sample| (average - sample)**2 }.to_f / samples.length
  end

  # Square root of the variance.
  def standard_deviation(values, &block)
    Math.sqrt(variance(values, &block))
  end
end
@@ -7,26 +7,39 @@ module Spellr
7
7
  attr_reader :name
8
8
  attr_reader :key
9
9
 
10
- def initialize(name, # rubocop:disable Metrics/ParameterLists
10
+ def initialize(name, # rubocop:disable Metrics/ParameterLists, Metrics/MethodLength
11
11
  key: name[0],
12
12
  generate: nil,
13
13
  only: [],
14
+ includes: [],
14
15
  description: '',
15
16
  hashbangs: [])
17
+ unless only.empty?
18
+ warn <<~WARNING
19
+ \e[33mSpellr: `only:` language yaml key with a list of fnmatch rules is deprecated.
20
+ Please use `includes:` instead, which uses gitignore-inspired rules.
21
+ see github.com/robotdana/fast_ignore#using-an-includes-list for details\e[0m
22
+ WARNING
23
+ end
16
24
  @name = name
17
25
  @key = key
18
26
  @description = description
19
27
  @generate = generate
20
- @only = only
28
+ @includes = only + includes
21
29
  @hashbangs = hashbangs
22
30
  end
23
31
 
24
32
  def matches?(file)
25
- return true if @only.empty?
33
+ return true if @includes.empty?
34
+
35
+ return true if fast_ignore.allowed?(file.to_s)
26
36
 
27
37
  file = Spellr::File.wrap(file)
28
- return true if @only.any? { |o| file.fnmatch?(o) }
29
- return true if file.hashbang && @hashbangs.any? { |h| file.hashbang.include?(h) }
38
+ return true if !@hashbangs.empty? && file.hashbang && @hashbangs.any? { |h| file.hashbang.include?(h) }
39
+ end
40
+
41
+ def fast_ignore
42
+ @fast_ignore ||= FastIgnore.new(include_rules: @includes, gitignore: false)
30
43
  end
31
44
 
32
45
  def wordlists
@@ -41,6 +54,8 @@ module Spellr
41
54
  require 'shellwords'
42
55
  warn "Generating wordlist for #{name}"
43
56
 
57
+ generated_project_wordlist.touch
58
+
44
59
  Spellr::CLI.new(generate.shellsplit)
45
60
 
46
61
  default_wordlists
@@ -4,6 +4,7 @@ require 'strscan'
4
4
  require_relative '../spellr'
5
5
  require_relative 'column_location'
6
6
  require_relative 'token'
7
+ require_relative 'key_tuner/naive_bayes'
7
8
 
8
9
  module Spellr
9
10
  class LineTokenizer < StringScanner # rubocop:disable Metrics/ClassLength
@@ -63,85 +64,135 @@ module Spellr
63
64
  end
64
65
 
65
66
  def next_term
66
- return if eos?
67
-
68
- (skip_nonwords_and_flags && next_term) || scan_term || next_term
67
+ if skip_nonwords_and_flags
68
+ nil
69
+ else
70
+ scan_term
71
+ end
69
72
  end
70
73
 
74
+ # [Word], [Word]Word [Word]'s [Wordn't]
75
+ TITLE_CASE_RE = /[[:upper:]][[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
76
+ # [WORD] [WORD]Word [WORDN'T] [WORD]'S [WORD]'s [WORD]s
77
+ UPPER_CASE_RE = /[[:upper:]]+(?:['’][[:upper:]]+(?<!['’][Ss]))*(?:(?![[:lower:]])|(?=s(?![[:lower:]])))/.freeze
78
+ # [word] [word]'s [wordn't]
79
+ LOWER_CASE_RE = /[[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
80
+ # for characters in [:alpha:] that aren't in [:lower:] or [:upper:] e.g. Arabic
81
+ OTHER_CASE_RE = /(?:[[:alpha:]](?<![[:lower:][:upper:]]))+/.freeze
82
+
83
+ TERM_RE = Regexp.union(TITLE_CASE_RE, UPPER_CASE_RE, LOWER_CASE_RE, OTHER_CASE_RE)
84
+
71
85
  def scan_term
72
- term = title_case || lower_case || upper_case || other_case
86
+ term = scan(TERM_RE)
73
87
 
74
88
  return term if term && term.length >= Spellr.config.word_minimum_length
75
89
  end
76
90
 
77
91
  NOT_EVEN_NON_WORDS_RE = %r{[^[:alpha:]/%#0-9\\]+}.freeze # everything not covered by more specific skips/scans
78
- LEFTOVER_NON_WORD_BITS_RE = %r{[/%#0-9\\]}.freeze # e.g. a / not starting //a-url.com
92
+ LEFTOVER_NON_WORD_BITS_RE = %r{[/%#\\]|\d+}.freeze # e.g. a / not starting //a-url.com
79
93
  HEX_RE = /(?:#(?:\h{6}|\h{3})|0x\h+)(?![[:alpha:]])/.freeze
80
- SHELL_COLOR_ESCAPE_RE = /\\(e|033)\[\d+(;\d+)*m/.freeze
94
+ SHELL_COLOR_ESCAPE_RE = /\\(?:e|0?33)\[\d+(;\d+)*m/.freeze
95
+ PUNYCODE_RE = /xn--[a-v0-9\-]+(?:[[:alpha:]])/.freeze
81
96
  BACKSLASH_ESCAPE_RE = /\\[a-zA-Z]/.freeze # TODO: hex escapes e.g. \xAA. TODO: language aware escapes
82
97
  REPEATED_SINGLE_LETTERS_RE = /(?:([[:alpha:]])\1+)(?![[:alpha:]])/.freeze # e.g. xxxxxxxx (it's not a word)
83
- # https://developer.mozilla.org/en-US/docs/Glossary/percent-encoding
84
- # Only the necessary percent encoding that actually ends in letters
85
- # URL_ENCODED_ENTITIES_RE = /%(3A|2F|3F|5B|5D|%2A|%2B|%2C|%3B|%3D)/i.freeze
86
98
  URL_ENCODED_ENTITIES_RE = /%[0-8A-F]{2}/.freeze
87
99
  # There's got to be a better way of writing this
88
- SEQUENTIAL_LETTERS_RE = /a(b(c(d(e(f(g(h(i(j(k(l(m(n(o(p(q(r(s(t(u(v(w(x(y(z)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?(?![[:alpha:]])/i.freeze # rubocop:disable Metrics/LineLength
89
-
90
- def skip_nonwords # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
91
- skip(NOT_EVEN_NON_WORDS_RE) ||
92
- skip(SHELL_COLOR_ESCAPE_RE) ||
93
- skip(BACKSLASH_ESCAPE_RE) ||
94
- skip(URL_ENCODED_ENTITIES_RE) ||
95
- skip(HEX_RE) ||
96
- skip_key_heuristically ||
97
- skip_uri_heuristically ||
98
- skip(LEFTOVER_NON_WORD_BITS_RE) ||
99
- skip(REPEATED_SINGLE_LETTERS_RE) ||
100
- skip(SEQUENTIAL_LETTERS_RE)
101
- end
100
+ SEQUENTIAL_LETTERS_RE = /a(?:b(?:c(?:d(?:e(?:f(?:g(?:h(?:i(?:j(?:k(?:l(?:m(?:n(?:o(?:p(?:q(?:r(?:s(?:t(?:u(?:v(?:w(?:x(?:yz?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?(?![[:alpha:]])/i.freeze # rubocop:disable Metrics/LineLength
102
101
 
103
102
  # I didn't want to do this myself. BUT i need something to heuristically match on, and it's difficult
104
- URL_RE = %r{
105
- (//|https?://|s?ftp://|mailto:)? # 0 scheme
106
- ([[:alnum:]]+(?::[[:alnum:]]+)?@)? # 1 userinfo
107
- (?:(?:[[:alnum:]-]+(?:\\?\.[[:alnum:]-]+)+|localhost|\d{1,3}(?:.\d{1,3}){3})) # 2 hostname
108
- (?::\d+)? # 3 port
109
- (/(?:[[:alnum:]=!$&\-/._\\]|%\h{2})+)? # 4 path
110
- (?:\?(?:[[:alnum:]=!$\-/.\\]|%\h{2})+(?:&(?:[[:alnum:]=!$\-/.\\]|%\h{2})+)*)? # 5 query
111
- (?:\#(?:[[:alnum:]=!$&\-/.\\]|%\h{2})+)? # 6 fragment
112
- }x.freeze
113
- def skip_uri_heuristically
114
- return unless skip_uri?
115
- return unless scan(URL_RE)
116
-
117
- heuristic_failed = if RUBY_VERSION >= '2.5'
118
- captures.all?(&:empty?)
103
+ URL_SCHEME = '(//|https?://|s?ftp://|mailto:)'
104
+ URL_USERINFO = '([[:alnum:]]+(?::[[:alnum:]]+)?@)'
105
+ URL_HOSTNAME = '((?:[[:alnum:]-]+(?:\\\\?\\.[[:alnum:]-]+)+|localhost|\\d{1,3}(?:\\.\\d{1,3}){3}))'
106
+ URL_PORT = '(:\\d+)'
107
+ URL_PATH = '(/(?:[[:alnum:]=@!$&\\-/._\\\\]|%\h{2})+)'
108
+ URL_QUERY = '(\\?(?:[[:alnum:]=!$\\-/.\\\\]|%\\h{2})+(?:&(?:[[:alnum:]=!$\\-/.\\\\]|%\\h{2})+)*)'
109
+ URL_FRAGMENT = '(\\#(?:[[:alnum:]=!$&\\-/.\\\\]|%\\h{2})+)'
110
+ URL_RE = /
111
+ (?:
112
+ #{URL_SCHEME}#{URL_USERINFO}?#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?
113
+ |
114
+ #{URL_SCHEME}?#{URL_USERINFO}#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?
115
+ |
116
+ #{URL_SCHEME}?#{URL_USERINFO}?#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}
117
+ )
118
+ #{URL_QUERY}?#{URL_FRAGMENT}?
119
+ /x.freeze
120
+
121
+ KNOWN_KEY_PATTERNS_RE = %r{(
122
+ SG\.[\w\-]{22}\.[\w\-]{43} | # sendgrid
123
+ prg-\h{8}-\h{4}-\h{4}-\h{4}-\h{12} | # hyperwallet
124
+ GTM-[A-Z0-9]{7} | # google tag manager
125
+ sha1-[A-Za-z0-9=+/]{28} |
126
+ sha512-[A-Za-z0-9=+/]{88} |
127
+ data:[a-z/;0-9\-]+;base64,[A-Za-z0-9+/]+=*(?![[:alnum:]])
128
+ )}x.freeze
129
+
130
+ SKIPS = Regexp.union(
131
+ NOT_EVEN_NON_WORDS_RE,
132
+ SHELL_COLOR_ESCAPE_RE,
133
+ BACKSLASH_ESCAPE_RE,
134
+ URL_ENCODED_ENTITIES_RE,
135
+ HEX_RE,
136
+ URL_RE, # 2%
137
+ KNOWN_KEY_PATTERNS_RE
138
+ ).freeze
139
+
140
+ AFTER_KEY_SKIPS = Regexp.union(
141
+ LEFTOVER_NON_WORD_BITS_RE,
142
+ REPEATED_SINGLE_LETTERS_RE,
143
+ SEQUENTIAL_LETTERS_RE
144
+ )
145
+
146
+ def skip_nonwords
147
+ skip(SKIPS) ||
148
+ skip_key_heuristically || # 5%
149
+ skip(AFTER_KEY_SKIPS)
150
+ end
151
+
152
+ KEY_RE = %r{[A-Za-z0-9]([A-Za-z0-9+/\-_]*)=*(?![[:alnum:]])}.freeze
153
+ N = NaiveBayes.new
154
+ def skip_key_heuristically # rubocop:disable Metrics/MethodLength
155
+ return unless scan(KEY_RE)
156
+ # I've come across some large base64 strings by this point they're definitely base64.
157
+ return true if matched.length > 200
158
+
159
+ if key_roughly?(matched)
160
+ if N.key?(matched)
161
+ true
162
+ else
163
+ unscan
164
+ false
165
+ end
119
166
  else
120
- # unfortunately i have to match this regex again because stringscanner doesn't give me matchdata
121
- matched.match(URL_RE).captures.compact.all?(&:empty?)
167
+ unscan
168
+ false
122
169
  end
123
-
124
- unscan && false if heuristic_failed
125
170
  end
126
171
 
127
- # url unsafe base64 or url safe base64
128
- # TODO: character distribution heuristic
129
- KEY_FULL_RE = %r{([A-Za-z\d+/]|[A-Za-z\d\-_])+[=.]*}.freeze
130
- KEY_RE = %r{
131
- (?:
132
- [A-Za-z\-_+/=]+|
133
- [\d\-_+/=]+
134
- )
135
- }x.freeze
136
- def skip_key_heuristically
137
- return unless skip_key?
138
- return unless match?(KEY_FULL_RE)
139
-
140
- # can't use regular captures because repeated capture groups don't
141
- matches = matched.scan(KEY_RE)
142
- return unless matches.length >= 3 # number chosen arbitrarily
143
-
144
- skip(KEY_FULL_RE)
172
+ # this is in a method because the minimum word length stuff was throwing it off
173
+ # TODO: move to config maybe?
174
+ def min_alpha_re
175
+ /(?:
176
+ [A-Z][a-z]{#{Spellr.config.word_minimum_length - 1}}
177
+ |
178
+ [a-z]{#{Spellr.config.word_minimum_length}}
179
+ |
180
+ [A-Z]{#{Spellr.config.word_minimum_length}}
181
+ )/x.freeze
182
+ end
183
+ ALPHA_SEP_RE = '[A-Za-z][A-Za-z\\-_/+]*'
184
+ NUM_SEP_RE = '\\d[\\d\\-_/+]*'
185
+ THREE_CHUNK_RE = /^(?:
186
+ #{ALPHA_SEP_RE}#{NUM_SEP_RE}#{ALPHA_SEP_RE}
187
+ |
188
+ #{NUM_SEP_RE}#{ALPHA_SEP_RE}#{NUM_SEP_RE}
189
+ )/x.freeze
190
+ def key_roughly?(matched)
191
+ return unless matched.length >= Spellr.config.key_minimum_length
192
+ return unless matched.match?(THREE_CHUNK_RE)
193
+ return unless matched.match?(min_alpha_re) # or there's no point
194
+
195
+ true
145
196
  end
146
197
 
147
198
  # jump to character-aware position
@@ -149,37 +200,17 @@ module Spellr
149
200
  skip(/.{#{new_charpos - charpos}}/m)
150
201
  end
151
202
 
152
- # [Word], [Word]Word [Word]'s [Wordn't]
153
- TITLE_CASE_RE = /[[:upper:]][[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
154
- def title_case
155
- scan(TITLE_CASE_RE)
156
- end
157
-
158
- # [word] [word]'s [wordn't]
159
- LOWER_CASE_RE = /[[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
160
- def lower_case
161
- scan(LOWER_CASE_RE)
162
- end
163
-
164
- # [WORD] [WORD]Word [WORDN'T] [WORD]'S [WORD]'s [WORD]s
165
- UPPER_CASE_RE = /[[:upper:]]+(?:['’][[:upper:]]+(?<!['’][Ss]))*((?![[:lower:]])|(?=s(?![[:lower:]])))/.freeze
166
- def upper_case
167
- scan(UPPER_CASE_RE)
168
- end
169
-
170
- # for characters in [:alpha:] that aren't in [:lower:] or [:upper:] e.g. Arabic
171
- OTHER_CASE_RE = /[[:alpha:]]+/.freeze
172
- def other_case
173
- scan(OTHER_CASE_RE)
174
- end
175
-
176
203
  SPELLR_DISABLE_RE = /spellr:disable/.freeze
177
204
  def skip_and_track_disable
205
+ return if disabled?
206
+
178
207
  skip(SPELLR_DISABLE_RE) && self.disabled = true
179
208
  end
180
209
 
181
210
  SPELLR_ENABLE_RE = /spellr:enable/.freeze
182
211
  def skip_and_track_enable
212
+ return unless disabled?
213
+
183
214
  skip(SPELLR_ENABLE_RE) && self.disabled = false
184
215
  end
185
216
  end
@@ -10,10 +10,9 @@ module Spellr
10
10
  "#{count} #{word}#{'s' if count != 1}"
11
11
  end
12
12
 
13
+ # TODO: make it work without color
13
14
  def color_enabled?
14
- return $stdout.tty? if Spellr.config.color.nil?
15
-
16
- Spellr.config.color
15
+ true
17
16
  end
18
17
 
19
18
  def aqua(text)
@@ -39,5 +38,11 @@ module Spellr
39
38
 
40
39
  "\e[1;31m#{text}#{normal}"
41
40
  end
41
+
42
+ def green(text)
43
+ return text unless Spellr::StringFormat.color_enabled?
44
+
45
+ "\e[1;32m#{text}#{normal}"
46
+ end
42
47
  end
43
48
  end