spellr 0.3.2 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/Gemfile.lock +13 -10
- data/bin/fetch_wordlist/english +3 -1
- data/lib/.spellr.yml +12 -9
- data/lib/spellr/backports.rb +19 -0
- data/lib/spellr/check.rb +15 -12
- data/lib/spellr/config.rb +46 -11
- data/lib/spellr/file.rb +7 -8
- data/lib/spellr/file_list.rb +15 -10
- data/lib/spellr/interactive.rb +8 -3
- data/lib/spellr/key_tuner/data.yml +1242 -0
- data/lib/spellr/key_tuner/naive_bayes.rb +162 -0
- data/lib/spellr/key_tuner/possible_key.rb +170 -0
- data/lib/spellr/key_tuner/stats.rb +33 -0
- data/lib/spellr/language.rb +20 -5
- data/lib/spellr/line_tokenizer.rb +115 -84
- data/lib/spellr/string_format.rb +8 -3
- data/lib/spellr/token.rb +14 -10
- data/lib/spellr/tokenizer.rb +1 -2
- data/lib/spellr/version.rb +1 -1
- data/lib/spellr/wordlist.rb +6 -5
- data/lib/spellr.rb +5 -14
- data/spellr.gemspec +3 -2
- metadata +24 -5
@@ -0,0 +1,162 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'possible_key'
|
4
|
+
require_relative 'stats'
|
5
|
+
require 'yaml'
|
6
|
+
# this is lifted in whole from this article. i don't understand the maths and i don't want to
|
7
|
+
# https://www.sitepoint.com/machine-learning-ruby-naive-bayes-theorem/
|
8
|
+
|
9
|
+
# Gaussian naive Bayes classifier used to decide whether a string looks like a
# key (classes are named "key_<character set>" / "not_key_<character set>").
#
# The model statistics are read from the data.yml beside this file when that
# file exists; otherwise they are derived lazily from PossibleKey's training
# data the first time they are needed.
class NaiveBayes # rubocop:disable Metrics/ClassLength
  include Stats

  YAML_PATH = File.join(__dir__, 'data.yml')

  # Loads the precomputed model from YAML_PATH when it is present.
  def initialize
    load_from_yaml if ::File.exist?(YAML_PATH)
  end

  # Training rows grouped by class name.
  #
  # @return [Hash{String => Array<Hash>}] "<key|not_key>_<character_set>" mapped
  #   to the feature hashes of every PossibleKey in that class
  def training_data
    @training_data ||= begin
      PossibleKey.load
      PossibleKey.keys.each_with_object({}) do |key, data|
        key_class = key.key? ? 'key' : 'not_key'
        (data["#{key_class}_#{key.character_set}"] ||= []) << key.features
      end
    end
  end

  # Replaces the model state with the contents of YAML_PATH.
  def load_from_yaml
    data = safe_load_yaml(::File.read(YAML_PATH))

    @feature_set = data[:feature_set]
    @num_classes = data[:num_classes]
    @classes = data[:classes]
    @features = data[:features]
  end

  # Writes the current model state to YAML_PATH so later instances can skip training.
  def save_to_yaml
    ::File.write(YAML_PATH, {
      feature_set: feature_set,
      num_classes: num_classes,
      classes: classes,
      features: features
    }.to_yaml)
  end

  # @return [Integer] how many classes the model distinguishes
  def num_classes
    @num_classes ||= training_data.length
  end

  # @return [Array<String>] the class names
  def classes
    @classes ||= training_data.keys
  end

  # @return [Array<Symbol>] feature names, taken from the first training row
  def features
    @features ||= training_data.first.last.first.keys
  end

  # Per-class, per-feature summary statistics of the training data.
  #
  # @return [Hash] class name => feature => { standard_deviation:, mean:, variance: }
  def feature_set # rubocop:disable Metrics/MethodLength
    @feature_set ||= classes.each_with_object({}) do |class_name, set|
      set[class_name] = features.each_with_object({}) do |feature, stats|
        values = training_data[class_name].map { |row| row[feature] }

        stats[feature] = {
          standard_deviation: standard_deviation(values),
          mean: mean(values),
          variance: variance(values)
        }
      end
    end
  end

  # Gaussian probability density of `value` for one feature within one class.
  #
  # @param feature [Symbol] name of the feature under consideration
  # @param value [Numeric] the observed value of that feature
  # @param class_name [String] the class under consideration
  # @return [Float]
  def feature_probability(feature, value, class_name)
    stats = feature_set[class_name][feature]
    fs_mean = stats[:mean]
    fs_var = stats[:variance]

    # With zero spread the gaussian is undefined (division by zero), so the
    # value either matches the single observed value or it does not.
    return fs_mean == value ? 1.0 : 0.0 if stats[:standard_deviation].zero?

    exponent = -((value - fs_mean)**2) / (2 * fs_var)
    (1.0 / Math.sqrt(2 * Math::PI * fs_var)) * (Math::E**exponent)
  end

  # Product of the feature probabilities of every given feature for one class.
  #
  # @param features [Hash{Symbol => Numeric}] feature name => observed value
  # @param class_name [String]
  # @return [Float]
  def feature_multiplication(features, class_name)
    features.reduce(1.0) do |result, (key, value)|
      result * feature_probability(key, value, class_name)
    end
  end

  # Prints a table of the per-class and per-feature probabilities for `string`.
  # Development aid; needs the terminal-table gem, which is only loaded here.
  #
  # @return [nil]
  def debug(string) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
    require 'terminal-table'

    features = PossibleKey.new(string).features

    table = Terminal::Table.new do |t|
      t << ['classes'] + classes
      t << :separator
      t << ['probabilities'] + classes.map { |c| class_probability(features, c) }
      features.each do |key, value|
        t << [key] + classes.map { |c| feature_probability(key, value, c).round(4) }
      end
    end
    puts table

    nil
  end

  # Naive Bayesian score of a set of feature values belonging to a class.
  # Every class gets the same prior (1 / num_classes); "key_*" classes are
  # additionally scaled by 10 ** Spellr.config.key_heuristic_weight.
  #
  # @return [Float]
  def class_probability(features, class_name)
    class_fraction = 1.0 / num_classes
    feature_bayes = feature_multiplication(features, class_name)
    feature_bayes *= (10**Spellr.config.key_heuristic_weight) if class_name.start_with?('key_')
    feature_bayes * class_fraction
  end

  # This is the method we should be calling!
  # Given a set of feature values, it returns the name of the class with the
  # highest class_probability.
  #
  # @return [String]
  def classify(features)
    classes.max_by { |class_name| class_probability(features, class_name) }
  end

  # @return [Boolean] whether `string` is classified into one of the "key" classes
  def key?(string)
    key_cache[string]
  end

  # Memoizes classification per string; classification is computed on first lookup.
  def key_cache
    @key_cache ||= Hash.new do |cache, string|
      cache[string] = classify(PossibleKey.new(string).features).start_with?('key')
    end
  end

  private

  # Psych >= 3.1 takes the allowed classes as the `permitted_classes:` keyword,
  # and Psych 4 (bundled with Ruby 3.1+) no longer accepts them positionally.
  # Older Psych only knows the positional form, so use whichever this runtime
  # supports.
  def safe_load_yaml(source)
    if YAML.method(:safe_load).parameters.include?(%i[key permitted_classes])
      YAML.safe_load(source, permitted_classes: [Symbol])
    else
      YAML.safe_load(source, [Symbol])
    end
  end
end
|
@@ -0,0 +1,170 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'pathname'
|
4
|
+
require_relative 'stats'
|
5
|
+
|
6
|
+
# A candidate string together with the numeric features the naive Bayes
# classifier uses to decide whether it is a key.
class PossibleKey # rubocop:disable Metrics/ClassLength
  include Stats

  # Training files under data/, mapped to whether their lines are keys.
  # Order matters: it is the order the entries are appended to `keys`.
  TRAINING_FILES = { 'false_positives.txt' => false, 'keys.txt' => true }.freeze

  SIGNIFICANT_LETTERS = %i{+ - _ / A z Z q Q X x}.freeze

  # Zeroed tally for every character a recognised character set may contain.
  LETTER_COUNT_HASH = (('A'..'Z').to_a + ('a'..'z').to_a + ('0'..'9').to_a + %w{+ _ / - =})
    .map { |k| [k.to_sym, 0] }.to_h.freeze

  VOWELS = %i{a e i o u A E I O U}.freeze
  CONSONANTS = %i{b c d f g h j k l m n p q r s t v w x y z B C D F G H J K L M N P Q R S T V W X Y Z}.freeze

  class << self
    attr_reader :keys
  end

  # (Re)builds the training set from the files listed in TRAINING_FILES,
  # skipping blank lines.
  #
  # @return [Array<PossibleKey>] the loaded training entries
  def self.load
    @keys = []

    TRAINING_FILES.each do |filename, is_key|
      Pathname.new(__dir__).join('data', filename).each_line do |line|
        candidate = line.chomp
        next if candidate.empty?

        @keys << new(candidate, is_key)
      end
    end

    @keys
  end

  attr_reader :string

  # @param string [String] the candidate text
  # @param key [Boolean, nil] the known classification for training entries,
  #   nil when the string is yet to be classified
  def initialize(string, key = nil)
    @string = string
    @key = key
  end

  # Feature name => numeric value, as consumed by NaiveBayes.
  #
  # @return [Hash{Symbol => Numeric}]
  def features # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
    {
      **significant_letter_frequency_difference,
      equal: letter_count[:'='],
      length: length,
      hex: character_set == :hex ? 1 : 0,
      lower36: character_set == :lower36 ? 1 : 0,
      upper36: character_set == :upper36 ? 1 : 0,
      base64: character_set == :base64 ? 1 : 0,
      mean_title_chunk_size: mean(title_chunks, &:length),
      variance_title_chunk_size: variance(title_chunks, &:length),
      max_title_chunk_size: max(title_chunks, &:length),
      mean_lower_chunk_size: mean(lower_chunks, &:length),
      variance_lower_chunk_size: variance(lower_chunks, &:length),
      mean_upper_chunk_size: mean(upper_chunks, &:length),
      variance_upper_chunk_size: variance(upper_chunks, &:length),
      mean_alpha_chunk_size: mean(alpha_chunks, &:length),
      variance_alpha_chunk_size: variance(alpha_chunks, &:length),
      mean_alnum_chunk_size: mean(alnum_chunks, &:length),
      variance_alnum_chunk_size: variance(alnum_chunks, &:length),
      mean_digit_chunk_size: mean(digit_chunks, &:length),
      variance_digit_chunk_size: variance(digit_chunks, &:length),
      vowel_consonant_ratio: vowel_consonant_ratio,
      alpha_chunks: alpha_chunks.length,
      alnum_chunks: alnum_chunks.length,
      digit_chunks: digit_chunks.length,
      title_chunks: title_chunks.length,
      mean_letter_frequency_difference: mean(letter_frequency_difference.values),
      # NOTE(review): despite its name this feature is computed with `max`, not
      # `variance`. Left as is because the saved model was presumably trained
      # with it; changing it means regenerating data.yml.
      variance_letter_frequency_difference: max(letter_frequency_difference.values)
    }
  end

  # @return [Boolean, nil] the classification passed to the constructor
  def key?
    @key
  end

  def length
    string.length
  end

  # The letter_frequency_difference entries for SIGNIFICANT_LETTERS only, in
  # SIGNIFICANT_LETTERS order (the same result as Hash#slice, but available on
  # every Ruby version).
  #
  # @return [Hash{Symbol => Float}]
  def significant_letter_frequency_difference
    differences = letter_frequency_difference

    SIGNIFICANT_LETTERS.each_with_object({}) do |letter, significant|
      significant[letter] = differences[letter] if differences.key?(letter)
    end
  end

  # The narrowest character set the whole string fits in.
  #
  # @return [Symbol] :hex, :lower36, :upper36 or :base64
  # @raise [RuntimeError] when the string fits none of them
  def character_set
    # \A and \z anchor to the whole string; ^ and $ would accept any single
    # matching line of a multi-line string.
    @character_set ||= case string
                       when /\A[a-fA-F0-9\-]+\z/ then :hex
                       when /\A[a-z0-9]+\z/ then :lower36
                       when /\A[A-Z0-9]+\z/ then :upper36
                       when %r{\A[A-Za-z0-9\-_+/]+={0,2}\z} then :base64
                       else
                         raise "#{string.inspect} is an unrecognised character set"
                       end
  end

  # @return [Integer] the alphabet size used for the detected character set
  def character_set_total
    case character_set
    when :hex then 16
    when :lower36 then 36
    when :upper36 then 36
    when :base64 then 64
    end
  end

  # The value each letter is compared against in letter_frequency_difference:
  # the string length divided by the alphabet size.
  def ideal_letter_frequency
    1.0 / character_set_total * length
  end

  # @return [Hash{Symbol => Integer}] occurrences of every character in LETTER_COUNT_HASH
  def letter_count
    @letter_count ||= string.each_char.with_object(LETTER_COUNT_HASH.dup) do |letter, tally|
      tally[letter.to_sym] += 1
    end
  end

  # @return [Hash{Symbol => Float}] letter_count divided by the string length
  def letter_frequency
    @letter_frequency ||= letter_count.each_with_object({}) do |(letter, count), frequency|
      frequency[letter] = count.to_f / string.length
    end
  end

  # @return [Hash{Symbol => Float}] absolute distance of each letter's frequency
  #   from ideal_letter_frequency
  def letter_frequency_difference
    @letter_frequency_difference ||= begin
      ideal = ideal_letter_frequency

      letter_frequency.each_with_object({}) do |(letter, frequency), difference|
        difference[letter] = (frequency - ideal).abs
      end
    end
  end

  # NOTE(review): both operands are Integers, so this is integer division and
  # the result is truncated (0 whenever consonants outnumber vowels). Left as is
  # because the saved model was presumably trained with these values; changing
  # it means regenerating data.yml.
  def vowel_consonant_ratio
    vowels = letter_count.fetch_values(*VOWELS).sum
    consonants = letter_count.fetch_values(*CONSONANTS).sum
    vowels / (consonants.nonzero? || 1)
  end

  # Runs of digits.
  def digit_chunks
    @digit_chunks ||= string.scan(/\d+/)
  end

  # Runs of one uppercase letter followed by lowercase letters.
  def title_chunks
    @title_chunks ||= string.scan(/[A-Z][a-z]+/)
  end

  # Runs of lowercase letters.
  def lower_chunks
    @lower_chunks ||= string.scan(/[a-z]+/)
  end

  # Runs of uppercase letters.
  def upper_chunks
    @upper_chunks ||= string.scan(/[A-Z]+/)
  end

  # Runs of letters of either case.
  def alpha_chunks
    @alpha_chunks ||= string.scan(/[A-Za-z]+/)
  end

  # Runs of letters and digits.
  def alnum_chunks
    @alnum_chunks ||= string.scan(/[A-Za-z0-9]+/)
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Small descriptive-statistics helpers, meant to be mixed in.
#
# Every method takes a collection and an optional block that maps each element
# to the number to measure (e.g. `&:length`); without a block the elements
# themselves are used. Every method except standard_deviation returns the
# Integer 0 for an empty collection; standard_deviation returns 0.0.
module Stats
  # @return [Float] arithmetic mean (0 when `values` is empty)
  def mean(values, &block)
    return 0 if values.empty?

    values.sum(&block).to_f / values.length
  end

  # @return [Object] the smallest measured value (0 when `values` is empty)
  def min(values, &block)
    return 0 if values.empty?

    block ||= :itself.to_proc
    block.call(values.min_by(&block))
  end

  # @return [Object] the largest measured value (0 when `values` is empty)
  def max(values, &block)
    return 0 if values.empty?

    block ||= :itself.to_proc
    block.call(values.max_by(&block))
  end

  # @return [Float] population variance (0 when `values` is empty)
  def variance(values, &block)
    return 0 if values.empty?

    # The mean does not change between samples, so compute it once rather than
    # once per sample.
    average = mean(values, &block)
    squared_deviations = values.sum do |sample|
      (average - (block ? block.call(sample) : sample))**2
    end

    squared_deviations.to_f / values.length
  end

  # @return [Float] population standard deviation
  def standard_deviation(values, &block)
    Math.sqrt(variance(values, &block))
  end
end
|
data/lib/spellr/language.rb
CHANGED
@@ -7,26 +7,39 @@ module Spellr
|
|
7
7
|
attr_reader :name
|
8
8
|
attr_reader :key
|
9
9
|
|
10
|
-
def initialize(name, # rubocop:disable Metrics/ParameterLists
|
10
|
+
def initialize(name, # rubocop:disable Metrics/ParameterLists, Metrics/MethodLength
|
11
11
|
key: name[0],
|
12
12
|
generate: nil,
|
13
13
|
only: [],
|
14
|
+
includes: [],
|
14
15
|
description: '',
|
15
16
|
hashbangs: [])
|
17
|
+
unless only.empty?
|
18
|
+
warn <<~WARNING
|
19
|
+
\e[33mSpellr: `only:` language yaml key with a list of fnmatch rules is deprecated.
|
20
|
+
Please use `includes:` instead, which uses gitignore-inspired rules.
|
21
|
+
see github.com/robotdana/fast_ignore#using-an-includes-list for details\e[0m
|
22
|
+
WARNING
|
23
|
+
end
|
16
24
|
@name = name
|
17
25
|
@key = key
|
18
26
|
@description = description
|
19
27
|
@generate = generate
|
20
|
-
@
|
28
|
+
@includes = only + includes
|
21
29
|
@hashbangs = hashbangs
|
22
30
|
end
|
23
31
|
|
24
32
|
def matches?(file)
|
25
|
-
return true if @
|
33
|
+
return true if @includes.empty?
|
34
|
+
|
35
|
+
return true if fast_ignore.allowed?(file.to_s)
|
26
36
|
|
27
37
|
file = Spellr::File.wrap(file)
|
28
|
-
return true if @
|
29
|
-
|
38
|
+
return true if !@hashbangs.empty? && file.hashbang && @hashbangs.any? { |h| file.hashbang.include?(h) }
|
39
|
+
end
|
40
|
+
|
41
|
+
def fast_ignore
|
42
|
+
@fast_ignore ||= FastIgnore.new(include_rules: @includes, gitignore: false)
|
30
43
|
end
|
31
44
|
|
32
45
|
def wordlists
|
@@ -41,6 +54,8 @@ module Spellr
|
|
41
54
|
require 'shellwords'
|
42
55
|
warn "Generating wordlist for #{name}"
|
43
56
|
|
57
|
+
generated_project_wordlist.touch
|
58
|
+
|
44
59
|
Spellr::CLI.new(generate.shellsplit)
|
45
60
|
|
46
61
|
default_wordlists
|
@@ -4,6 +4,7 @@ require 'strscan'
|
|
4
4
|
require_relative '../spellr'
|
5
5
|
require_relative 'column_location'
|
6
6
|
require_relative 'token'
|
7
|
+
require_relative 'key_tuner/naive_bayes'
|
7
8
|
|
8
9
|
module Spellr
|
9
10
|
class LineTokenizer < StringScanner # rubocop:disable Metrics/ClassLength
|
@@ -63,85 +64,135 @@ module Spellr
|
|
63
64
|
end
|
64
65
|
|
65
66
|
def next_term
|
66
|
-
|
67
|
-
|
68
|
-
|
67
|
+
if skip_nonwords_and_flags
|
68
|
+
nil
|
69
|
+
else
|
70
|
+
scan_term
|
71
|
+
end
|
69
72
|
end
|
70
73
|
|
74
|
+
# [Word], [Word]Word [Word]'s [Wordn't]
|
75
|
+
TITLE_CASE_RE = /[[:upper:]][[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
|
76
|
+
# [WORD] [WORD]Word [WORDN'T] [WORD]'S [WORD]'s [WORD]s
|
77
|
+
UPPER_CASE_RE = /[[:upper:]]+(?:['’][[:upper:]]+(?<!['’][Ss]))*(?:(?![[:lower:]])|(?=s(?![[:lower:]])))/.freeze
|
78
|
+
# [word] [word]'s [wordn't]
|
79
|
+
LOWER_CASE_RE = /[[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
|
80
|
+
# for characters in [:alpha:] that aren't in [:lower:] or [:upper:] e.g. Arabic
|
81
|
+
OTHER_CASE_RE = /(?:[[:alpha:]](?<![[:lower:][:upper:]]))+/.freeze
|
82
|
+
|
83
|
+
TERM_RE = Regexp.union(TITLE_CASE_RE, UPPER_CASE_RE, LOWER_CASE_RE, OTHER_CASE_RE)
|
84
|
+
|
71
85
|
def scan_term
|
72
|
-
term =
|
86
|
+
term = scan(TERM_RE)
|
73
87
|
|
74
88
|
return term if term && term.length >= Spellr.config.word_minimum_length
|
75
89
|
end
|
76
90
|
|
77
91
|
NOT_EVEN_NON_WORDS_RE = %r{[^[:alpha:]/%#0-9\\]+}.freeze # everything not covered by more specific skips/scans
|
78
|
-
LEFTOVER_NON_WORD_BITS_RE = %r{[
|
92
|
+
LEFTOVER_NON_WORD_BITS_RE = %r{[/%#\\]|\d+}.freeze # e.g. a / not starting //a-url.com
|
79
93
|
HEX_RE = /(?:#(?:\h{6}|\h{3})|0x\h+)(?![[:alpha:]])/.freeze
|
80
|
-
SHELL_COLOR_ESCAPE_RE = /\\(e|
|
94
|
+
SHELL_COLOR_ESCAPE_RE = /\\(?:e|0?33)\[\d+(;\d+)*m/.freeze
|
95
|
+
PUNYCODE_RE = /xn--[a-v0-9\-]+(?:[[:alpha:]])/.freeze
|
81
96
|
BACKSLASH_ESCAPE_RE = /\\[a-zA-Z]/.freeze # TODO: hex escapes e.g. \xAA. TODO: language aware escapes
|
82
97
|
REPEATED_SINGLE_LETTERS_RE = /(?:([[:alpha:]])\1+)(?![[:alpha:]])/.freeze # e.g. xxxxxxxx (it's not a word)
|
83
|
-
# https://developer.mozilla.org/en-US/docs/Glossary/percent-encoding
|
84
|
-
# Only the necessary percent encoding that actually ends in letters
|
85
|
-
# URL_ENCODED_ENTITIES_RE = /%(3A|2F|3F|5B|5D|%2A|%2B|%2C|%3B|%3D)/i.freeze
|
86
98
|
URL_ENCODED_ENTITIES_RE = /%[0-8A-F]{2}/.freeze
|
87
99
|
# There's got to be a better way of writing this
|
88
|
-
SEQUENTIAL_LETTERS_RE = /a(b(c(d(e(f(g(h(i(j(k(l(m(n(o(p(q(r(s(t(u(v(w(x(
|
89
|
-
|
90
|
-
def skip_nonwords # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
91
|
-
skip(NOT_EVEN_NON_WORDS_RE) ||
|
92
|
-
skip(SHELL_COLOR_ESCAPE_RE) ||
|
93
|
-
skip(BACKSLASH_ESCAPE_RE) ||
|
94
|
-
skip(URL_ENCODED_ENTITIES_RE) ||
|
95
|
-
skip(HEX_RE) ||
|
96
|
-
skip_key_heuristically ||
|
97
|
-
skip_uri_heuristically ||
|
98
|
-
skip(LEFTOVER_NON_WORD_BITS_RE) ||
|
99
|
-
skip(REPEATED_SINGLE_LETTERS_RE) ||
|
100
|
-
skip(SEQUENTIAL_LETTERS_RE)
|
101
|
-
end
|
100
|
+
SEQUENTIAL_LETTERS_RE = /a(?:b(?:c(?:d(?:e(?:f(?:g(?:h(?:i(?:j(?:k(?:l(?:m(?:n(?:o(?:p(?:q(?:r(?:s(?:t(?:u(?:v(?:w(?:x(?:yz?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?(?![[:alpha:]])/i.freeze # rubocop:disable Metrics/LineLength
|
102
101
|
|
103
102
|
# I didn't want to do this myself. BUT i need something to heuristically match on, and it's difficult
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
103
|
+
URL_SCHEME = '(//|https?://|s?ftp://|mailto:)'
|
104
|
+
URL_USERINFO = '([[:alnum:]]+(?::[[:alnum:]]+)?@)'
|
105
|
+
URL_HOSTNAME = '((?:[[:alnum:]-]+(?:\\\\?\\.[[:alnum:]-]+)+|localhost|\\d{1,3}(?:\\.\\d{1,3}){3}))'
|
106
|
+
URL_PORT = '(:\\d+)'
|
107
|
+
URL_PATH = '(/(?:[[:alnum:]=@!$&\\-/._\\\\]|%\h{2})+)'
|
108
|
+
URL_QUERY = '(\\?(?:[[:alnum:]=!$\\-/.\\\\]|%\\h{2})+(?:&(?:[[:alnum:]=!$\\-/.\\\\]|%\\h{2})+)*)'
|
109
|
+
URL_FRAGMENT = '(\\#(?:[[:alnum:]=!$&\\-/.\\\\]|%\\h{2})+)'
|
110
|
+
URL_RE = /
|
111
|
+
(?:
|
112
|
+
#{URL_SCHEME}#{URL_USERINFO}?#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?
|
113
|
+
|
|
114
|
+
#{URL_SCHEME}?#{URL_USERINFO}#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?
|
115
|
+
|
|
116
|
+
#{URL_SCHEME}?#{URL_USERINFO}?#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}
|
117
|
+
)
|
118
|
+
#{URL_QUERY}?#{URL_FRAGMENT}?
|
119
|
+
/x.freeze
|
120
|
+
|
121
|
+
KNOWN_KEY_PATTERNS_RE = %r{(
|
122
|
+
SG\.[\w\-]{22}\.[\w\-]{43} | # sendgrid
|
123
|
+
prg-\h{8}-\h{4}-\h{4}-\h{4}-\h{12} | # hyperwallet
|
124
|
+
GTM-[A-Z0-9]{7} | # google tag manager
|
125
|
+
sha1-[A-Za-z0-9=+/]{28} |
|
126
|
+
sha512-[A-Za-z0-9=+/]{88} |
|
127
|
+
data:[a-z/;0-9\-]+;base64,[A-Za-z0-9+/]+=*(?![[:alnum:]])
|
128
|
+
)}x.freeze
|
129
|
+
|
130
|
+
SKIPS = Regexp.union(
|
131
|
+
NOT_EVEN_NON_WORDS_RE,
|
132
|
+
SHELL_COLOR_ESCAPE_RE,
|
133
|
+
BACKSLASH_ESCAPE_RE,
|
134
|
+
URL_ENCODED_ENTITIES_RE,
|
135
|
+
HEX_RE,
|
136
|
+
URL_RE, # 2%
|
137
|
+
KNOWN_KEY_PATTERNS_RE
|
138
|
+
).freeze
|
139
|
+
|
140
|
+
AFTER_KEY_SKIPS = Regexp.union(
|
141
|
+
LEFTOVER_NON_WORD_BITS_RE,
|
142
|
+
REPEATED_SINGLE_LETTERS_RE,
|
143
|
+
SEQUENTIAL_LETTERS_RE
|
144
|
+
)
|
145
|
+
|
146
|
+
def skip_nonwords
|
147
|
+
skip(SKIPS) ||
|
148
|
+
skip_key_heuristically || # 5%
|
149
|
+
skip(AFTER_KEY_SKIPS)
|
150
|
+
end
|
151
|
+
|
152
|
+
KEY_RE = %r{[A-Za-z0-9]([A-Za-z0-9+/\-_]*)=*(?![[:alnum:]])}.freeze
|
153
|
+
N = NaiveBayes.new
|
154
|
+
def skip_key_heuristically # rubocop:disable Metrics/MethodLength
|
155
|
+
return unless scan(KEY_RE)
|
156
|
+
# I've come across some very large base64 strings; anything this long is definitely base64 by this point.
|
157
|
+
return true if matched.length > 200
|
158
|
+
|
159
|
+
if key_roughly?(matched)
|
160
|
+
if N.key?(matched)
|
161
|
+
true
|
162
|
+
else
|
163
|
+
unscan
|
164
|
+
false
|
165
|
+
end
|
119
166
|
else
|
120
|
-
|
121
|
-
|
167
|
+
unscan
|
168
|
+
false
|
122
169
|
end
|
123
|
-
|
124
|
-
unscan && false if heuristic_failed
|
125
170
|
end
|
126
171
|
|
127
|
-
#
|
128
|
-
# TODO:
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
[
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
172
|
+
# this is in a method because the minimum word length stuff was throwing it off
|
173
|
+
# TODO: move to config maybe?
|
174
|
+
def min_alpha_re
|
175
|
+
/(?:
|
176
|
+
[A-Z][a-z]{#{Spellr.config.word_minimum_length - 1}}
|
177
|
+
|
|
178
|
+
[a-z]{#{Spellr.config.word_minimum_length}}
|
179
|
+
|
|
180
|
+
[A-Z]{#{Spellr.config.word_minimum_length}}
|
181
|
+
)/x.freeze
|
182
|
+
end
|
183
|
+
ALPHA_SEP_RE = '[A-Za-z][A-Za-z\\-_/+]*'
|
184
|
+
NUM_SEP_RE = '\\d[\\d\\-_/+]*'
|
185
|
+
THREE_CHUNK_RE = /^(?:
|
186
|
+
#{ALPHA_SEP_RE}#{NUM_SEP_RE}#{ALPHA_SEP_RE}
|
187
|
+
|
|
188
|
+
#{NUM_SEP_RE}#{ALPHA_SEP_RE}#{NUM_SEP_RE}
|
189
|
+
)/x.freeze
|
190
|
+
def key_roughly?(matched)
|
191
|
+
return unless matched.length >= Spellr.config.key_minimum_length
|
192
|
+
return unless matched.match?(THREE_CHUNK_RE)
|
193
|
+
return unless matched.match?(min_alpha_re) # or there's no point
|
194
|
+
|
195
|
+
true
|
145
196
|
end
|
146
197
|
|
147
198
|
# jump to character-aware position
|
@@ -149,37 +200,17 @@ module Spellr
|
|
149
200
|
skip(/.{#{new_charpos - charpos}}/m)
|
150
201
|
end
|
151
202
|
|
152
|
-
# [Word], [Word]Word [Word]'s [Wordn't]
|
153
|
-
TITLE_CASE_RE = /[[:upper:]][[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
|
154
|
-
def title_case
|
155
|
-
scan(TITLE_CASE_RE)
|
156
|
-
end
|
157
|
-
|
158
|
-
# [word] [word]'s [wordn't]
|
159
|
-
LOWER_CASE_RE = /[[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
|
160
|
-
def lower_case
|
161
|
-
scan(LOWER_CASE_RE)
|
162
|
-
end
|
163
|
-
|
164
|
-
# [WORD] [WORD]Word [WORDN'T] [WORD]'S [WORD]'s [WORD]s
|
165
|
-
UPPER_CASE_RE = /[[:upper:]]+(?:['’][[:upper:]]+(?<!['’][Ss]))*((?![[:lower:]])|(?=s(?![[:lower:]])))/.freeze
|
166
|
-
def upper_case
|
167
|
-
scan(UPPER_CASE_RE)
|
168
|
-
end
|
169
|
-
|
170
|
-
# for characters in [:alpha:] that aren't in [:lower:] or [:upper:] e.g. Arabic
|
171
|
-
OTHER_CASE_RE = /[[:alpha:]]+/.freeze
|
172
|
-
def other_case
|
173
|
-
scan(OTHER_CASE_RE)
|
174
|
-
end
|
175
|
-
|
176
203
|
SPELLR_DISABLE_RE = /spellr:disable/.freeze
|
177
204
|
def skip_and_track_disable
|
205
|
+
return if disabled?
|
206
|
+
|
178
207
|
skip(SPELLR_DISABLE_RE) && self.disabled = true
|
179
208
|
end
|
180
209
|
|
181
210
|
SPELLR_ENABLE_RE = /spellr:enable/.freeze
|
182
211
|
def skip_and_track_enable
|
212
|
+
return unless disabled?
|
213
|
+
|
183
214
|
skip(SPELLR_ENABLE_RE) && self.disabled = false
|
184
215
|
end
|
185
216
|
end
|
data/lib/spellr/string_format.rb
CHANGED
@@ -10,10 +10,9 @@ module Spellr
|
|
10
10
|
"#{count} #{word}#{'s' if count != 1}"
|
11
11
|
end
|
12
12
|
|
13
|
+
# TODO: make it work without color
|
13
14
|
def color_enabled?
|
14
|
-
|
15
|
-
|
16
|
-
Spellr.config.color
|
15
|
+
true
|
17
16
|
end
|
18
17
|
|
19
18
|
def aqua(text)
|
@@ -39,5 +38,11 @@ module Spellr
|
|
39
38
|
|
40
39
|
"\e[1;31m#{text}#{normal}"
|
41
40
|
end
|
41
|
+
|
42
|
+
def green(text)
|
43
|
+
return text unless Spellr::StringFormat.color_enabled?
|
44
|
+
|
45
|
+
"\e[1;32m#{text}#{normal}"
|
46
|
+
end
|
42
47
|
end
|
43
48
|
end
|