spellr 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/Gemfile.lock +13 -10
- data/bin/fetch_wordlist/english +3 -1
- data/lib/.spellr.yml +12 -9
- data/lib/spellr/backports.rb +19 -0
- data/lib/spellr/check.rb +15 -12
- data/lib/spellr/config.rb +46 -11
- data/lib/spellr/file.rb +7 -8
- data/lib/spellr/file_list.rb +15 -10
- data/lib/spellr/interactive.rb +8 -3
- data/lib/spellr/key_tuner/data.yml +1242 -0
- data/lib/spellr/key_tuner/naive_bayes.rb +162 -0
- data/lib/spellr/key_tuner/possible_key.rb +170 -0
- data/lib/spellr/key_tuner/stats.rb +33 -0
- data/lib/spellr/language.rb +20 -5
- data/lib/spellr/line_tokenizer.rb +115 -84
- data/lib/spellr/string_format.rb +8 -3
- data/lib/spellr/token.rb +14 -10
- data/lib/spellr/tokenizer.rb +1 -2
- data/lib/spellr/version.rb +1 -1
- data/lib/spellr/wordlist.rb +6 -5
- data/lib/spellr.rb +5 -14
- data/spellr.gemspec +3 -2
- metadata +24 -5
@@ -0,0 +1,162 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'possible_key'
|
4
|
+
require_relative 'stats'
|
5
|
+
require 'yaml'
|
6
|
+
# This is lifted wholesale from the article below. I don't understand the maths and I don't want to.
|
7
|
+
# https://www.sitepoint.com/machine-learning-ruby-naive-bayes-theorem/
|
8
|
+
|
9
|
+
# Gaussian naive Bayes classifier used to decide whether a string looks like
# a key, based on the numeric features computed by PossibleKey.
#
# The model (feature names, class names and the per-class mean / variance /
# standard deviation of every feature) is read from data.yml next to this
# file when that file exists. Otherwise each part is derived lazily from the
# training examples loaded by PossibleKey.load.
class NaiveBayes # rubocop:disable Metrics/ClassLength
  include Stats

  YAML_PATH = File.join(__dir__, 'data.yml')

  # Loads the pre-computed model when data.yml is present.
  def initialize
    load_from_yaml if File.exist?(YAML_PATH)
  end

  # Training examples grouped by class.
  #
  # Returns a Hash whose keys are "<key|not_key>_<character_set>" strings and
  # whose values are Arrays of feature hashes (see PossibleKey#features).
  # Calling this re-reads the training files through PossibleKey.load.
  def training_data
    @training_data ||= begin
      PossibleKey.load
      PossibleKey.keys.each.with_object({}) do |key, data|
        key_class = key.key? ? 'key' : 'not_key'
        character_set = key.character_set
        key_key = "#{key_class}_#{character_set}"
        data[key_key] ||= []
        data[key_key] << key.features
      end
    end
  end

  # Populates the model from data.yml (the file written by #save_to_yaml).
  def load_from_yaml
    data = safe_load_yaml(::File.read(YAML_PATH))

    @feature_set = data[:feature_set]
    @num_classes = data[:num_classes]
    @classes = data[:classes]
    @features = data[:features]
  end

  # Writes the current model to data.yml, computing it from the training
  # data first if it has not been loaded or computed yet.
  def save_to_yaml
    File.write(YAML_PATH, {
      feature_set: feature_set,
      num_classes: num_classes,
      classes: classes,
      features: features
    }.to_yaml)
  end

  # Number of classes the model distinguishes between.
  def num_classes
    @num_classes ||= training_data&.length
  end

  # Names of the classes, e.g. "key_hex" or "not_key_base64".
  def classes
    @classes ||= training_data&.keys
  end

  # Names of the features, taken from the first training example.
  def features
    @features ||= training_data.first.last.first.keys
  end

  # Per-class, per-feature statistics:
  #   { class_name => { feature => { standard_deviation:, mean:, variance: } } }
  def feature_set # rubocop:disable Metrics/MethodLength
    @feature_set ||= classes.each.with_object({}) do |class_name, feature_set|
      feature_set[class_name] = {}

      features.each do |feature|
        values = training_data[class_name].map do |row|
          row[feature]
        end

        feature_set[class_name][feature] = {
          standard_deviation: standard_deviation(values),
          mean: mean(values),
          variance: variance(values)
        }
      end
    end
  end

  # given a class, this method determines the probability
  # of a certain value occurring for a given feature
  # feature: name of the feature in consideration (a key of PossibleKey#features)
  # value: the value of the feature for which we are finding the probability
  # class_name: name of the class in consideration
  def feature_probability(feature, value, class_name) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
    # get the feature value set
    fs = feature_set[class_name][feature]

    # statistical properties of the feature set
    fs_std = fs[:standard_deviation]
    fs_mean = fs[:mean]
    fs_var = fs[:variance]

    # deal with the edge case of a 0 standard deviation: every training value
    # was identical, so only that exact value is considered possible
    if fs_std == 0
      return fs_mean == value ? 1.0 : 0.0
    end

    # calculate the gaussian probability
    pi = Math::PI
    e = Math::E

    exp = -((value - fs_mean)**2) / (2 * fs_var)
    (1.0 / Math.sqrt(2 * pi * fs_var)) * (e**exp)
  end

  # multiply together the feature probabilities for all of the
  # features in a class for given values
  def feature_multiplication(features, class_name)
    features.reduce(1.0) do |result, (key, value)|
      result * feature_probability(key, value, class_name)
    end
  end

  # Prints a table of the class probabilities and the per-feature
  # probabilities for +string+. Needs the terminal-table gem, which is only
  # required when this method is called. Always returns nil.
  def debug(string) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
    require 'terminal-table'

    features = PossibleKey.new(string).features

    table = Terminal::Table.new do |t|
      t << ['classes'] + classes
      t << :separator
      t << ['probabilities'] + classes.map { |c| class_probability(features, c) }
      features.each do |key, value|
        t << [key] + classes.map { |c| feature_probability(key, value, c).round(4) }
      end
    end
    puts table

    nil
  end

  # this is where we compute the final naive Bayesian probability
  # for a given set of features being a part of a given class.
  # Classes whose name starts with "key_" are scaled by
  # 10**Spellr.config.key_heuristic_weight.
  def class_probability(features, class_name)
    class_fraction = 1.0 / num_classes
    feature_bayes = feature_multiplication(features, class_name)
    feature_bayes *= (10**Spellr.config.key_heuristic_weight) if class_name.start_with?('key_')
    feature_bayes * class_fraction
  end

  # This is the method we should be calling!
  # Given a set of feature values, it decides
  # what class to categorize them under
  def classify(features)
    classes.max_by do |class_name|
      class_probability(features, class_name)
    end
  end

  # true when +string+ is classified into one of the "key..." classes.
  # Results are memoized per string.
  def key?(string)
    key_cache[string]
  end

  # Memo of string => Boolean classification results, filled on first lookup.
  def key_cache
    @key_cache ||= Hash.new do |cache, string|
      cache[string] = classify(PossibleKey.new(string).features).start_with?('key')
    end
  end

  private

  # Parses +source+ with YAML.safe_load, additionally allowing Symbols.
  #
  # Newer Psych versions take the allowed classes as the `permitted_classes:`
  # keyword and Psych 4 no longer accepts them positionally, while older
  # versions only accept the positional form, so pick whichever the installed
  # version supports.
  def safe_load_yaml(source)
    if YAML.method(:safe_load).parameters.include?(%i{key permitted_classes})
      YAML.safe_load(source, permitted_classes: [Symbol])
    else
      YAML.safe_load(source, [Symbol])
    end
  end
end
|
@@ -0,0 +1,170 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'pathname'
|
4
|
+
require_relative 'stats'
|
5
|
+
|
6
|
+
# A string that may or may not be a key, together with the numeric features
# extracted from it for the naive Bayes classifier.
class PossibleKey # rubocop:disable Metrics/ClassLength
  include Stats

  class << self
    # Labelled training examples; nil until PossibleKey.load has been called.
    attr_reader :keys
  end

  # Reads the labelled training strings that live next to this file:
  # data/false_positives.txt (labelled as not keys) and data/keys.txt
  # (labelled as keys). Blank lines are ignored. The result replaces any
  # previously loaded PossibleKey.keys.
  def self.load # rubocop:disable Metrics/AbcSize
    @keys = []

    Pathname.new(__dir__).join('data', 'false_positives.txt').each_line do |line|
      next if line.chomp.empty?

      keys << PossibleKey.new(line.chomp, false)
    end

    Pathname.new(__dir__).join('data', 'keys.txt').each_line do |line|
      next if line.chomp.empty?

      keys << PossibleKey.new(line.chomp, true)
    end
  end

  attr_reader :string

  # string: the candidate text
  # key: true / false for labelled training data, nil when unknown
  def initialize(string, key = nil)
    @string = string
    @key = key
  end

  # The feature vector handed to the classifier, as a Hash of name => number.
  #
  # NOTE(review): :variance_letter_frequency_difference is computed with
  # `max`, not `variance`. Left untouched because the model shipped in
  # data.yml was presumably trained with this definition — confirm before
  # changing.
  def features # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
    {
      **significant_letter_frequency_difference,
      equal: letter_count[:'='],
      length: length,
      hex: character_set == :hex ? 1 : 0,
      lower36: character_set == :lower36 ? 1 : 0,
      upper36: character_set == :upper36 ? 1 : 0,
      base64: character_set == :base64 ? 1 : 0,
      mean_title_chunk_size: mean(title_chunks, &:length),
      variance_title_chunk_size: variance(title_chunks, &:length),
      max_title_chunk_size: max(title_chunks, &:length),
      mean_lower_chunk_size: mean(lower_chunks, &:length),
      variance_lower_chunk_size: variance(lower_chunks, &:length),
      mean_upper_chunk_size: mean(upper_chunks, &:length),
      variance_upper_chunk_size: variance(upper_chunks, &:length),
      mean_alpha_chunk_size: mean(alpha_chunks, &:length),
      variance_alpha_chunk_size: variance(alpha_chunks, &:length),
      mean_alnum_chunk_size: mean(alnum_chunks, &:length),
      variance_alnum_chunk_size: variance(alnum_chunks, &:length),
      mean_digit_chunk_size: mean(digit_chunks, &:length),
      variance_digit_chunk_size: variance(digit_chunks, &:length),
      vowel_consonant_ratio: vowel_consonant_ratio,
      alpha_chunks: alpha_chunks.length,
      alnum_chunks: alnum_chunks.length,
      digit_chunks: digit_chunks.length,
      title_chunks: title_chunks.length,
      mean_letter_frequency_difference: mean(letter_frequency_difference.values),
      variance_letter_frequency_difference: max(letter_frequency_difference.values)
    }
  end

  # The label passed to the constructor (true, false or nil).
  def key?
    @key
  end

  def length
    string.length
  end

  SIGNIFICANT_LETTERS = %i{+ - _ / A z Z q Q X x}.freeze

  # letter_frequency_difference restricted to SIGNIFICANT_LETTERS, in the
  # order the letters are listed there.
  #
  # Hash#slice is detected rather than inferred from RUBY_VERSION (a string
  # comparison). The fallback for rubies without it walks SIGNIFICANT_LETTERS
  # so it yields the same keys in the same order as Hash#slice does.
  if Hash.method_defined?(:slice)
    def significant_letter_frequency_difference
      letter_frequency_difference.slice(*SIGNIFICANT_LETTERS)
    end
  else
    def significant_letter_frequency_difference
      differences = letter_frequency_difference
      SIGNIFICANT_LETTERS.each_with_object({}) do |letter, hash|
        hash[letter] = differences[letter] if differences.key?(letter)
      end
    end
  end

  # The first of :hex, :lower36, :upper36, :base64 whose pattern matches the
  # string (so an all-lowercase hex string is :hex, not :lower36).
  # Raises RuntimeError when none of them match.
  def character_set
    @character_set ||= case string
    when /^[a-fA-F0-9\-]+$/ then :hex
    when /^[a-z0-9]+$/ then :lower36
    when /^[A-Z0-9]+$/ then :upper36
    when %r{^[A-Za-z0-9\-_+/]+={0,2}$} then :base64
    else
      raise "#{string.inspect} is an unrecognised character set"
    end
  end

  # Number of distinct symbols in the detected character set.
  def character_set_total
    case character_set
    when :hex then 16
    when :lower36 then 36
    when :upper36 then 36
    when :base64 then 64
    end
  end

  # String length divided by the size of the character set.
  def ideal_letter_frequency
    1.0 / character_set_total * length
  end

  # Template with a zero count for every character that letter_count tracks.
  LETTER_COUNT_HASH = (('A'..'Z').to_a + ('a'..'z').to_a + ('0'..'9').to_a + %w{+ _ / - =})
    .map { |k| [k.to_sym, 0] }.to_h

  # Occurrences of each tracked character in the string, keyed by symbol.
  # A character missing from LETTER_COUNT_HASH has no zero entry to add to,
  # so it raises NoMethodError.
  def letter_count
    @letter_count ||= begin
      string.chars.each.with_object(LETTER_COUNT_HASH.dup) do |letter, hash|
        hash[letter.to_sym] += 1
      end
    end
  end

  # letter_count with every count divided by the string length.
  def letter_frequency
    @letter_frequency ||= begin
      l = letter_count.dup
      l.each { |k, v| l[k] = v.to_f / string.length }
      l
    end
  end

  # Absolute difference between each letter_frequency value and
  # ideal_letter_frequency.
  def letter_frequency_difference
    @letter_frequency_difference ||= begin
      l = letter_frequency.dup
      l.each { |k, v| l[k] = (v - ideal_letter_frequency).abs }
      l
    end
  end

  VOWELS = %i{a e i o u A E I O U}.freeze
  CONSONANTS = %i{b c d f g h j k l m n p q r s t v w x y z B C D F G H J K L M N P Q R S T V W X Y Z}.freeze

  # Vowel count divided by consonant count (divided by 1 when there are no
  # consonants).
  #
  # NOTE(review): both operands are Integers, so this is integer division and
  # the ratio is truncated. Left untouched because the model shipped in
  # data.yml was presumably trained with this definition — confirm before
  # changing.
  def vowel_consonant_ratio
    vowels = letter_count.fetch_values(*VOWELS).sum
    consonants = letter_count.fetch_values(*CONSONANTS).sum
    vowels / (consonants.nonzero? || 1)
  end

  # Runs of consecutive digits.
  def digit_chunks
    @digit_chunks ||= string.scan(/\d+/)
  end

  # Runs made of one uppercase letter followed by one or more lowercase letters.
  def title_chunks
    @title_chunks ||= string.scan(/[A-Z][a-z]+/)
  end

  # Runs of consecutive lowercase letters.
  def lower_chunks
    @lower_chunks ||= string.scan(/[a-z]+/)
  end

  # Runs of consecutive uppercase letters.
  def upper_chunks
    @upper_chunks ||= string.scan(/[A-Z]+/)
  end

  # Runs of consecutive ASCII letters of either case.
  def alpha_chunks
    @alpha_chunks ||= string.scan(/[A-Za-z]+/)
  end

  # Runs of consecutive ASCII letters and digits.
  def alnum_chunks
    @alnum_chunks ||= string.scan(/[A-Za-z0-9]+/)
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Small descriptive-statistics helpers, written to be mixed into a class.
#
# Every method takes a collection and an optional block. When a block is
# given it maps each element to the number that is measured (for example
# `mean(chunks, &:length)`); without one the elements themselves are used.
# mean, min, max and variance return Integer 0 for an empty collection, and
# standard_deviation returns 0.0.
module Stats
  # Arithmetic mean of the (mapped) values, as a Float.
  def mean(values, &block)
    return 0 if values.empty?

    values.sum(&block).to_f / values.length
  end

  # Smallest (mapped) value.
  def min(values, &block)
    return 0 if values.empty?

    block ||= :itself.to_proc
    block.call(values.min_by(&block))
  end

  # Largest (mapped) value.
  def max(values, &block)
    return 0 if values.empty?

    block ||= :itself.to_proc
    block.call(values.max_by(&block))
  end

  # Population variance (divides by N, not N - 1) of the (mapped) values.
  def variance(values, &block)
    return 0 if values.empty?

    # The mean does not depend on the sample, so compute it once up front
    # rather than once per element (which made this quadratic).
    average = mean(values, &block)
    values.sum { |sample| (average - (block ? block.call(sample) : sample))**2 }.to_f / values.length
  end

  # Population standard deviation: the square root of #variance.
  def standard_deviation(values, &block)
    Math.sqrt(variance(values, &block))
  end
end
|
data/lib/spellr/language.rb
CHANGED
@@ -7,26 +7,39 @@ module Spellr
|
|
7
7
|
attr_reader :name
|
8
8
|
attr_reader :key
|
9
9
|
|
10
|
-
def initialize(name, # rubocop:disable Metrics/ParameterLists
|
10
|
+
def initialize(name, # rubocop:disable Metrics/ParameterLists, Metrics/MethodLength
|
11
11
|
key: name[0],
|
12
12
|
generate: nil,
|
13
13
|
only: [],
|
14
|
+
includes: [],
|
14
15
|
description: '',
|
15
16
|
hashbangs: [])
|
17
|
+
unless only.empty?
|
18
|
+
warn <<~WARNING
|
19
|
+
\e[33mSpellr: `only:` language yaml key with a list of fnmatch rules is deprecated.
|
20
|
+
Please use `includes:` instead, which uses gitignore-inspired rules.
|
21
|
+
see github.com/robotdana/fast_ignore#using-an-includes-list for details\e[0m
|
22
|
+
WARNING
|
23
|
+
end
|
16
24
|
@name = name
|
17
25
|
@key = key
|
18
26
|
@description = description
|
19
27
|
@generate = generate
|
20
|
-
@
|
28
|
+
@includes = only + includes
|
21
29
|
@hashbangs = hashbangs
|
22
30
|
end
|
23
31
|
|
24
32
|
def matches?(file)
|
25
|
-
return true if @
|
33
|
+
return true if @includes.empty?
|
34
|
+
|
35
|
+
return true if fast_ignore.allowed?(file.to_s)
|
26
36
|
|
27
37
|
file = Spellr::File.wrap(file)
|
28
|
-
return true if @
|
29
|
-
|
38
|
+
return true if !@hashbangs.empty? && file.hashbang && @hashbangs.any? { |h| file.hashbang.include?(h) }
|
39
|
+
end
|
40
|
+
|
41
|
+
def fast_ignore
|
42
|
+
@fast_ignore ||= FastIgnore.new(include_rules: @includes, gitignore: false)
|
30
43
|
end
|
31
44
|
|
32
45
|
def wordlists
|
@@ -41,6 +54,8 @@ module Spellr
|
|
41
54
|
require 'shellwords'
|
42
55
|
warn "Generating wordlist for #{name}"
|
43
56
|
|
57
|
+
generated_project_wordlist.touch
|
58
|
+
|
44
59
|
Spellr::CLI.new(generate.shellsplit)
|
45
60
|
|
46
61
|
default_wordlists
|
@@ -4,6 +4,7 @@ require 'strscan'
|
|
4
4
|
require_relative '../spellr'
|
5
5
|
require_relative 'column_location'
|
6
6
|
require_relative 'token'
|
7
|
+
require_relative 'key_tuner/naive_bayes'
|
7
8
|
|
8
9
|
module Spellr
|
9
10
|
class LineTokenizer < StringScanner # rubocop:disable Metrics/ClassLength
|
@@ -63,85 +64,135 @@ module Spellr
|
|
63
64
|
end
|
64
65
|
|
65
66
|
def next_term
|
66
|
-
|
67
|
-
|
68
|
-
|
67
|
+
if skip_nonwords_and_flags
|
68
|
+
nil
|
69
|
+
else
|
70
|
+
scan_term
|
71
|
+
end
|
69
72
|
end
|
70
73
|
|
74
|
+
# [Word], [Word]Word [Word]'s [Wordn't]
|
75
|
+
TITLE_CASE_RE = /[[:upper:]][[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
|
76
|
+
# [WORD] [WORD]Word [WORDN'T] [WORD]'S [WORD]'s [WORD]s
|
77
|
+
UPPER_CASE_RE = /[[:upper:]]+(?:['’][[:upper:]]+(?<!['’][Ss]))*(?:(?![[:lower:]])|(?=s(?![[:lower:]])))/.freeze
|
78
|
+
# [word] [word]'s [wordn't]
|
79
|
+
LOWER_CASE_RE = /[[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
|
80
|
+
# for characters in [:alpha:] that aren't in [:lower:] or [:upper:] e.g. Arabic
|
81
|
+
OTHER_CASE_RE = /(?:[[:alpha:]](?<![[:lower:][:upper:]]))+/.freeze
|
82
|
+
|
83
|
+
TERM_RE = Regexp.union(TITLE_CASE_RE, UPPER_CASE_RE, LOWER_CASE_RE, OTHER_CASE_RE)
|
84
|
+
|
71
85
|
def scan_term
|
72
|
-
term =
|
86
|
+
term = scan(TERM_RE)
|
73
87
|
|
74
88
|
return term if term && term.length >= Spellr.config.word_minimum_length
|
75
89
|
end
|
76
90
|
|
77
91
|
NOT_EVEN_NON_WORDS_RE = %r{[^[:alpha:]/%#0-9\\]+}.freeze # everything not covered by more specific skips/scans
|
78
|
-
LEFTOVER_NON_WORD_BITS_RE = %r{[
|
92
|
+
LEFTOVER_NON_WORD_BITS_RE = %r{[/%#\\]|\d+}.freeze # e.g. a / not starting //a-url.com
|
79
93
|
HEX_RE = /(?:#(?:\h{6}|\h{3})|0x\h+)(?![[:alpha:]])/.freeze
|
80
|
-
SHELL_COLOR_ESCAPE_RE = /\\(e|
|
94
|
+
SHELL_COLOR_ESCAPE_RE = /\\(?:e|0?33)\[\d+(;\d+)*m/.freeze
|
95
|
+
PUNYCODE_RE = /xn--[a-v0-9\-]+(?:[[:alpha:]])/.freeze
|
81
96
|
BACKSLASH_ESCAPE_RE = /\\[a-zA-Z]/.freeze # TODO: hex escapes e.g. \xAA. TODO: language aware escapes
|
82
97
|
REPEATED_SINGLE_LETTERS_RE = /(?:([[:alpha:]])\1+)(?![[:alpha:]])/.freeze # e.g. xxxxxxxx (it's not a word)
|
83
|
-
# https://developer.mozilla.org/en-US/docs/Glossary/percent-encoding
|
84
|
-
# Only the necessary percent encoding that actually ends in letters
|
85
|
-
# URL_ENCODED_ENTITIES_RE = /%(3A|2F|3F|5B|5D|%2A|%2B|%2C|%3B|%3D)/i.freeze
|
86
98
|
URL_ENCODED_ENTITIES_RE = /%[0-8A-F]{2}/.freeze
|
87
99
|
# There's got to be a better way of writing this
|
88
|
-
SEQUENTIAL_LETTERS_RE = /a(b(c(d(e(f(g(h(i(j(k(l(m(n(o(p(q(r(s(t(u(v(w(x(
|
89
|
-
|
90
|
-
def skip_nonwords # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
91
|
-
skip(NOT_EVEN_NON_WORDS_RE) ||
|
92
|
-
skip(SHELL_COLOR_ESCAPE_RE) ||
|
93
|
-
skip(BACKSLASH_ESCAPE_RE) ||
|
94
|
-
skip(URL_ENCODED_ENTITIES_RE) ||
|
95
|
-
skip(HEX_RE) ||
|
96
|
-
skip_key_heuristically ||
|
97
|
-
skip_uri_heuristically ||
|
98
|
-
skip(LEFTOVER_NON_WORD_BITS_RE) ||
|
99
|
-
skip(REPEATED_SINGLE_LETTERS_RE) ||
|
100
|
-
skip(SEQUENTIAL_LETTERS_RE)
|
101
|
-
end
|
100
|
+
SEQUENTIAL_LETTERS_RE = /a(?:b(?:c(?:d(?:e(?:f(?:g(?:h(?:i(?:j(?:k(?:l(?:m(?:n(?:o(?:p(?:q(?:r(?:s(?:t(?:u(?:v(?:w(?:x(?:yz?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?(?![[:alpha:]])/i.freeze # rubocop:disable Metrics/LineLength
|
102
101
|
|
103
102
|
# I didn't want to do this myself. BUT i need something to heuristically match on, and it's difficult
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
103
|
+
URL_SCHEME = '(//|https?://|s?ftp://|mailto:)'
|
104
|
+
URL_USERINFO = '([[:alnum:]]+(?::[[:alnum:]]+)?@)'
|
105
|
+
URL_HOSTNAME = '((?:[[:alnum:]-]+(?:\\\\?\\.[[:alnum:]-]+)+|localhost|\\d{1,3}(?:\\.\\d{1,3}){3}))'
|
106
|
+
URL_PORT = '(:\\d+)'
|
107
|
+
URL_PATH = '(/(?:[[:alnum:]=@!$&\\-/._\\\\]|%\h{2})+)'
|
108
|
+
URL_QUERY = '(\\?(?:[[:alnum:]=!$\\-/.\\\\]|%\\h{2})+(?:&(?:[[:alnum:]=!$\\-/.\\\\]|%\\h{2})+)*)'
|
109
|
+
URL_FRAGMENT = '(\\#(?:[[:alnum:]=!$&\\-/.\\\\]|%\\h{2})+)'
|
110
|
+
URL_RE = /
|
111
|
+
(?:
|
112
|
+
#{URL_SCHEME}#{URL_USERINFO}?#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?
|
113
|
+
|
|
114
|
+
#{URL_SCHEME}?#{URL_USERINFO}#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?
|
115
|
+
|
|
116
|
+
#{URL_SCHEME}?#{URL_USERINFO}?#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}
|
117
|
+
)
|
118
|
+
#{URL_QUERY}?#{URL_FRAGMENT}?
|
119
|
+
/x.freeze
|
120
|
+
|
121
|
+
KNOWN_KEY_PATTERNS_RE = %r{(
|
122
|
+
SG\.[\w\-]{22}\.[\w\-]{43} | # sendgrid
|
123
|
+
prg-\h{8}-\h{4}-\h{4}-\h{4}-\h{12} | # hyperwallet
|
124
|
+
GTM-[A-Z0-9]{7} | # google tag manager
|
125
|
+
sha1-[A-Za-z0-9=+/]{28} |
|
126
|
+
sha512-[A-Za-z0-9=+/]{88} |
|
127
|
+
data:[a-z/;0-9\-]+;base64,[A-Za-z0-9+/]+=*(?![[:alnum:]])
|
128
|
+
)}x.freeze
|
129
|
+
|
130
|
+
SKIPS = Regexp.union(
|
131
|
+
NOT_EVEN_NON_WORDS_RE,
|
132
|
+
SHELL_COLOR_ESCAPE_RE,
|
133
|
+
BACKSLASH_ESCAPE_RE,
|
134
|
+
URL_ENCODED_ENTITIES_RE,
|
135
|
+
HEX_RE,
|
136
|
+
URL_RE, # 2%
|
137
|
+
KNOWN_KEY_PATTERNS_RE
|
138
|
+
).freeze
|
139
|
+
|
140
|
+
AFTER_KEY_SKIPS = Regexp.union(
|
141
|
+
LEFTOVER_NON_WORD_BITS_RE,
|
142
|
+
REPEATED_SINGLE_LETTERS_RE,
|
143
|
+
SEQUENTIAL_LETTERS_RE
|
144
|
+
)
|
145
|
+
|
146
|
+
def skip_nonwords
|
147
|
+
skip(SKIPS) ||
|
148
|
+
skip_key_heuristically || # 5%
|
149
|
+
skip(AFTER_KEY_SKIPS)
|
150
|
+
end
|
151
|
+
|
152
|
+
KEY_RE = %r{[A-Za-z0-9]([A-Za-z0-9+/\-_]*)=*(?![[:alnum:]])}.freeze
|
153
|
+
N = NaiveBayes.new
|
154
|
+
def skip_key_heuristically # rubocop:disable Metrics/MethodLength
|
155
|
+
return unless scan(KEY_RE)
|
156
|
+
# I've come across some large base64 strings; by this point they're definitely base64.
|
157
|
+
return true if matched.length > 200
|
158
|
+
|
159
|
+
if key_roughly?(matched)
|
160
|
+
if N.key?(matched)
|
161
|
+
true
|
162
|
+
else
|
163
|
+
unscan
|
164
|
+
false
|
165
|
+
end
|
119
166
|
else
|
120
|
-
|
121
|
-
|
167
|
+
unscan
|
168
|
+
false
|
122
169
|
end
|
123
|
-
|
124
|
-
unscan && false if heuristic_failed
|
125
170
|
end
|
126
171
|
|
127
|
-
#
|
128
|
-
# TODO:
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
[
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
172
|
+
# this is in a method because the minimum word length stuff was throwing it off
|
173
|
+
# TODO: move to config maybe?
|
174
|
+
def min_alpha_re
|
175
|
+
/(?:
|
176
|
+
[A-Z][a-z]{#{Spellr.config.word_minimum_length - 1}}
|
177
|
+
|
|
178
|
+
[a-z]{#{Spellr.config.word_minimum_length}}
|
179
|
+
|
|
180
|
+
[A-Z]{#{Spellr.config.word_minimum_length}}
|
181
|
+
)/x.freeze
|
182
|
+
end
|
183
|
+
ALPHA_SEP_RE = '[A-Za-z][A-Za-z\\-_/+]*'
|
184
|
+
NUM_SEP_RE = '\\d[\\d\\-_/+]*'
|
185
|
+
THREE_CHUNK_RE = /^(?:
|
186
|
+
#{ALPHA_SEP_RE}#{NUM_SEP_RE}#{ALPHA_SEP_RE}
|
187
|
+
|
|
188
|
+
#{NUM_SEP_RE}#{ALPHA_SEP_RE}#{NUM_SEP_RE}
|
189
|
+
)/x.freeze
|
190
|
+
def key_roughly?(matched)
|
191
|
+
return unless matched.length >= Spellr.config.key_minimum_length
|
192
|
+
return unless matched.match?(THREE_CHUNK_RE)
|
193
|
+
return unless matched.match?(min_alpha_re) # or there's no point
|
194
|
+
|
195
|
+
true
|
145
196
|
end
|
146
197
|
|
147
198
|
# jump to character-aware position
|
@@ -149,37 +200,17 @@ module Spellr
|
|
149
200
|
skip(/.{#{new_charpos - charpos}}/m)
|
150
201
|
end
|
151
202
|
|
152
|
-
# [Word], [Word]Word [Word]'s [Wordn't]
|
153
|
-
TITLE_CASE_RE = /[[:upper:]][[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
|
154
|
-
def title_case
|
155
|
-
scan(TITLE_CASE_RE)
|
156
|
-
end
|
157
|
-
|
158
|
-
# [word] [word]'s [wordn't]
|
159
|
-
LOWER_CASE_RE = /[[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
|
160
|
-
def lower_case
|
161
|
-
scan(LOWER_CASE_RE)
|
162
|
-
end
|
163
|
-
|
164
|
-
# [WORD] [WORD]Word [WORDN'T] [WORD]'S [WORD]'s [WORD]s
|
165
|
-
UPPER_CASE_RE = /[[:upper:]]+(?:['’][[:upper:]]+(?<!['’][Ss]))*((?![[:lower:]])|(?=s(?![[:lower:]])))/.freeze
|
166
|
-
def upper_case
|
167
|
-
scan(UPPER_CASE_RE)
|
168
|
-
end
|
169
|
-
|
170
|
-
# for characters in [:alpha:] that aren't in [:lower:] or [:upper:] e.g. Arabic
|
171
|
-
OTHER_CASE_RE = /[[:alpha:]]+/.freeze
|
172
|
-
def other_case
|
173
|
-
scan(OTHER_CASE_RE)
|
174
|
-
end
|
175
|
-
|
176
203
|
SPELLR_DISABLE_RE = /spellr:disable/.freeze
|
177
204
|
def skip_and_track_disable
|
205
|
+
return if disabled?
|
206
|
+
|
178
207
|
skip(SPELLR_DISABLE_RE) && self.disabled = true
|
179
208
|
end
|
180
209
|
|
181
210
|
SPELLR_ENABLE_RE = /spellr:enable/.freeze
|
182
211
|
def skip_and_track_enable
|
212
|
+
return unless disabled?
|
213
|
+
|
183
214
|
skip(SPELLR_ENABLE_RE) && self.disabled = false
|
184
215
|
end
|
185
216
|
end
|
data/lib/spellr/string_format.rb
CHANGED
@@ -10,10 +10,9 @@ module Spellr
|
|
10
10
|
"#{count} #{word}#{'s' if count != 1}"
|
11
11
|
end
|
12
12
|
|
13
|
+
# TODO: make it work without color
|
13
14
|
def color_enabled?
|
14
|
-
|
15
|
-
|
16
|
-
Spellr.config.color
|
15
|
+
true
|
17
16
|
end
|
18
17
|
|
19
18
|
def aqua(text)
|
@@ -39,5 +38,11 @@ module Spellr
|
|
39
38
|
|
40
39
|
"\e[1;31m#{text}#{normal}"
|
41
40
|
end
|
41
|
+
|
42
|
+
def green(text)
|
43
|
+
return text unless Spellr::StringFormat.color_enabled?
|
44
|
+
|
45
|
+
"\e[1;32m#{text}#{normal}"
|
46
|
+
end
|
42
47
|
end
|
43
48
|
end
|