spellr 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Gemfile.lock +14 -14
- data/lib/.spellr.yml +2 -0
- data/lib/spellr/backports.rb +16 -6
- data/lib/spellr/base_reporter.rb +54 -0
- data/lib/spellr/check.rb +54 -20
- data/lib/spellr/cli.rb +13 -6
- data/lib/spellr/column_location.rb +1 -1
- data/lib/spellr/config.rb +6 -45
- data/lib/spellr/config_loader.rb +10 -6
- data/lib/spellr/file.rb +15 -2
- data/lib/spellr/file_list.rb +21 -17
- data/lib/spellr/interactive.rb +51 -116
- data/lib/spellr/interactive_add.rb +64 -0
- data/lib/spellr/interactive_replacement.rb +69 -0
- data/lib/spellr/key_tuner/naive_bayes.rb +49 -91
- data/lib/spellr/key_tuner/possible_key.rb +36 -32
- data/lib/spellr/key_tuner/stats.rb +26 -7
- data/lib/spellr/language.rb +28 -44
- data/lib/spellr/line_location.rb +13 -7
- data/lib/spellr/line_tokenizer.rb +35 -134
- data/lib/spellr/output.rb +62 -0
- data/lib/spellr/output_stubbed.rb +58 -0
- data/lib/spellr/quiet_reporter.rb +13 -0
- data/lib/spellr/reporter.rb +9 -13
- data/lib/spellr/token.rb +14 -16
- data/lib/spellr/token_regexps.rb +103 -0
- data/lib/spellr/tokenizer.rb +35 -14
- data/lib/spellr/version.rb +1 -1
- data/lib/spellr/wordlist.rb +29 -25
- data/lib/spellr/wordlist_reporter.rb +16 -8
- data/lib/spellr.rb +1 -0
- data/wordlists/ruby.txt +1046 -13
- metadata +9 -2
@@ -0,0 +1,103 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../spellr'
|
4
|
+
|
5
|
+
module Spellr
  # Regular expressions used to split text into spell-checkable terms and to
  # recognise spans that are skipped instead (urls, hex values, escapes,
  # well-known key formats, ...).
  module TokenRegexps
    #### WORDS ####

    # [Word], [Word]Word [Word]'s [Wordn't]
    TITLE_CASE_RE = /[[:upper:]][[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
    # [WORD] [WORD]Word [WORDN'T] [WORD]'S [WORD]'s [WORD]s
    UPPER_CASE_RE = /[[:upper:]]+(?:['’][[:upper:]]+(?<!['’][Ss]))*(?:(?![[:lower:]])|(?=s(?![[:lower:]])))/.freeze # rubocop:disable Metrics/LineLength
    # [word] [word]'s [wordn't]
    LOWER_CASE_RE = /[[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
    # for characters in [:alpha:] that aren't in [:lower:] or [:upper:] e.g. Arabic
    OTHER_CASE_RE = /(?:[[:alpha:]](?<![[:lower:][:upper:]]))+/.freeze

    # Any single term, whatever its casing.
    TERM_RE = Regexp.union(TITLE_CASE_RE, UPPER_CASE_RE, LOWER_CASE_RE, OTHER_CASE_RE).freeze

    #### NON WORDS ####

    NOT_EVEN_NON_WORDS_RE = %r{[^[:alpha:]/%#0-9\\]+}.freeze
    LEFTOVER_NON_WORD_BITS_RE = %r{[/%#\\]|\d+}.freeze # e.g. a / not starting //a-url.com
    HEX_RE = /(?:#(?:\h{6}|\h{3})|0x\h+)(?![[:alpha:]])/.freeze
    SHELL_COLOR_ESCAPE_RE = /\\(?:e|0?33)\[\d+(;\d+)*m/.freeze
    # NOTE(review): the trailing group *requires* a letter after the label; a
    # negative lookahead like the one HEX_RE ends with may have been intended.
    # Left unchanged here - confirm before altering.
    PUNYCODE_RE = /xn--[a-v0-9\-]+(?:[[:alpha:]])/.freeze
    # TODO: hex escapes e.g. \xAA.
    # TODO: language aware escapes
    BACKSLASH_ESCAPE_RE = /\\[a-zA-Z]/.freeze
    REPEATED_SINGLE_LETTERS_RE = /(?:([[:alpha:]])\1+)(?![[:alpha:]])/.freeze # e.g. xxxxxxxx
    # A percent sign followed by two upper-case hex digits, e.g. %20 or %9F.
    # The digit range is 0-9: with 0-8 any escape containing a 9 was missed.
    URL_ENCODED_ENTITIES_RE = /%[0-9A-F]{2}/.freeze
    # There's got to be a better way of writing this
    SEQUENTIAL_LETTERS_RE = /a(?:b(?:c(?:d(?:e(?:f(?:g(?:h(?:i(?:j(?:k(?:l(?:m(?:n(?:o(?:p(?:q(?:r(?:s(?:t(?:u(?:v(?:w(?:x(?:yz?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?(?![[:alpha:]])/i.freeze # rubocop:disable Metrics/LineLength

    # I didn't want to do this myself
    # BUT i need something to heuristically match on, and it's difficult
    URL_SCHEME = %r{(?://|https?://|s?ftp://|mailto:)}.freeze
    URL_USERINFO = /[[:alnum:]]+(?::[[:alnum:]]+)?@/.freeze
    URL_IP_ADDRESS = /\d{1,3}(?:\.\d{1,3}){3}/.freeze
    # literal \ so that i can match on domains in regexps. no-one cares but me.
    URL_HOSTNAME = /(?:[[:alnum:]\-\\]+(?:\.[[:alnum:]\-\\]+)+|localhost|#{URL_IP_ADDRESS})/.freeze
    URL_PORT = /:\d+/.freeze
    URL_PATH = %r{/(?:[[:alnum:]=@!$&\-/._\\]|%\h{2})+}.freeze
    URL_QUERY = %r{\?(?:[[:alnum:]=!$\-/.\\]|%\h{2})+(?:&(?:[[:alnum:]=!$\-/.\\]|%\h{2})+)*}.freeze
    URL_FRAGMENT = %r{#(?:[[:alnum:]=!$&\-/.\\]|%\h{2})+}.freeze

    # URL can be any valid hostname, it must have either a scheme, userinfo, or path
    # it may have those and any of the others and a port, or a query or a fragment.
    URL_REST = /#{URL_QUERY}?#{URL_FRAGMENT}?/.freeze
    URL_RE = Regexp.union(
      /#{URL_SCHEME}#{URL_USERINFO}?#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?#{URL_REST}/,
      /#{URL_USERINFO}#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?#{URL_REST}/,
      /#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}#{URL_REST}/
    ).freeze

    KEY_SENDGRID_RE = /SG\.[\w\-]{22}\.[\w\-]{43}/.freeze
    KEY_HYPERWALLET_RE = /prg-\h{8}-\h{4}-\h{4}-\h{4}-\h{12}/.freeze
    KEY_GTM_RE = /GTM-[A-Z0-9]{7}/.freeze
    KEY_SHA1 = %r{sha1-[A-Za-z0-9=+/]{28}}.freeze
    KEY_SHA512 = %r{sha512-[A-Za-z0-9=;+/]{88}}.freeze
    KEY_DATA_URL = %r{data:[a-z/;0-9\-]+;base64,[A-Za-z0-9+/]+=*(?![[:alnum:]])}.freeze

    # Any of the fixed-format keys above.
    KEY_PATTERNS_RE = Regexp.union(
      KEY_SENDGRID_RE, KEY_HYPERWALLET_RE, KEY_GTM_RE, KEY_SHA1, KEY_SHA512, KEY_DATA_URL
    ).freeze

    SKIPS = Regexp.union(
      NOT_EVEN_NON_WORDS_RE,
      SHELL_COLOR_ESCAPE_RE,
      BACKSLASH_ESCAPE_RE,
      URL_ENCODED_ENTITIES_RE,
      HEX_RE,
      URL_RE, # 2%
      KEY_PATTERNS_RE
    ).freeze

    AFTER_KEY_SKIPS = Regexp.union(
      LEFTOVER_NON_WORD_BITS_RE,
      REPEATED_SINGLE_LETTERS_RE,
      SEQUENTIAL_LETTERS_RE
    ).freeze

    # this is in a method because the minimum word length stuff was throwing it off
    # TODO: move to config maybe?
    #
    # Returns a memoized regexp matching a run of ASCII letters of the
    # configured minimum word length (Title-case, lower-case or upper-case).
    def min_alpha_re
      @min_alpha_re ||= Regexp.union(
        /[A-Z][a-z]{#{Spellr.config.word_minimum_length - 1}}/,
        /[a-z]{#{Spellr.config.word_minimum_length}}/,
        /[A-Z]{#{Spellr.config.word_minimum_length}}/
      ).freeze
    end
    ALPHA_SEP_RE = %r{[A-Za-z][A-Za-z\-_/+]*}.freeze
    NUM_SEP_RE = %r{\d[\d\-_/+]*}.freeze
    THREE_CHUNK_RE = Regexp.union(
      /\A#{ALPHA_SEP_RE}#{NUM_SEP_RE}#{ALPHA_SEP_RE}/,
      /\A#{NUM_SEP_RE}#{ALPHA_SEP_RE}#{NUM_SEP_RE}/
    ).freeze
    POSSIBLE_KEY_RE = %r{#{THREE_CHUNK_RE}[A-Za-z0-9+/\-_]*=*(?![[:alnum:]])}.freeze

    SPELLR_DISABLE_RE = /spellr:disable/.freeze
    SPELLR_ENABLE_RE = /spellr:enable/.freeze
  end
end
|
data/lib/spellr/tokenizer.rb
CHANGED
@@ -14,13 +14,12 @@ module Spellr
|
|
14
14
|
attr_accessor :disabled
|
15
15
|
alias_method :disabled?, :disabled
|
16
16
|
|
17
|
-
def initialize(file, start_at: nil,
|
18
|
-
# $stderr.puts start_at if start_at
|
17
|
+
# file     - a StringIO or IO is used as-is; anything else is handed to ::File.new
# start_at - where to begin; defaults to a ColumnLocation wrapping LineLocation.new(file)
# skip_key - forwarded to LineTokenizer.new
def initialize(file, start_at: nil, skip_key: true)
  @start_at = start_at || ColumnLocation.new(line_location: LineLocation.new(file))
  @file =
    case file
    when StringIO, IO then file
    else ::File.new(file)
    end
  # position the stream at the byte offset of the starting line
  @file.pos = @start_at.line_location.byte_offset

  @line_tokenizer = LineTokenizer.new(skip_key: skip_key)
end
|
25
24
|
|
26
25
|
def terms
|
@@ -32,37 +31,59 @@ module Spellr
|
|
32
31
|
end
|
33
32
|
|
34
33
|
# Feeds each line of the file to the line tokenizer and passes every term it
# produces on to the given block.
def each_term(&block)
  file.each_line { |line| prepare_tokenizer_for_line(line).each_term(&block) }
end
|
39
38
|
|
40
|
-
def each_token(
|
39
|
+
# Yields every token the line tokenizer produces, line by line, after giving
# the token a `line` built from the current line and its offsets.
#
# skip_term_proc - passed straight through to the line tokenizer's each_token
def each_token(skip_term_proc: nil) # rubocop:disable Metrics/MethodLength
  each_line_with_stats do |text, number, chars, bytes|
    tokenizer = prepare_tokenizer_for_line(text)

    tokenizer.each_token(skip_term_proc: skip_term_proc) do |token|
      token.line = prepare_line(text, number, chars, bytes)

      yield token
    end
  end
end
|
48
|
+
|
49
|
+
# Wraps a raw line in a Token whose location is a ColumnLocation built on a
# LineLocation for the given line number and offsets.
def prepare_line(line, line_number, char_offset, byte_offset)
  offsets = { char_offset: char_offset, byte_offset: byte_offset }
  start_of_line = LineLocation.new(file, line_number, **offsets)

  Token.new(line, location: ColumnLocation.new(line_location: start_of_line))
end
|
56
|
+
|
57
|
+
# Yields each line together with its line number and the char/byte offsets at
# which the line begins. Numbering and offsets start from @start_at.
def each_line_with_stats # rubocop:disable Metrics/MethodLength
  origin = @start_at.line_location
  char_offset = origin.char_offset
  byte_offset = origin.byte_offset

  file.each_line.with_index(origin.line_number) do |line, line_number|
    yield line, line_number, char_offset, byte_offset

    # advance past the line just yielded
    char_offset += line.length
    byte_offset += line.bytesize
  end
end
|
52
68
|
|
53
69
|
# Every term from each_term, normalized via spellr_normalize, de-duplicated
# and sorted.
def normalized_terms
  normalized = to_enum(:each_term).map(&:spellr_normalize)
  normalized.uniq.sort
end
|
56
72
|
|
57
73
|
private
|
58
74
|
|
59
75
|
attr_reader :line_tokenizer
|
60
76
|
|
61
|
-
def
|
62
|
-
|
77
|
+
# Yields one Token per line of the file, located at the start of that line.
# The location begins at @start_at and is advanced past each line in turn.
def each_line_token
  current = @start_at.line_location

  file.each_line do |line|
    column = ColumnLocation.new(line_location: current)
    yield Token.new(line, location: column)
    current = current.advance(line)
  end
end
|
64
85
|
|
65
|
-
def prepare_tokenizer_for_line(line
|
86
|
+
def prepare_tokenizer_for_line(line)
|
66
87
|
line_tokenizer.string = line
|
67
88
|
line_tokenizer.pos = 0
|
68
89
|
line_tokenizer
|
data/lib/spellr/version.rb
CHANGED
data/lib/spellr/wordlist.rb
CHANGED
@@ -27,17 +27,17 @@ module Spellr
|
|
27
27
|
end
|
28
28
|
|
29
29
|
# significantly faster than default Enumerable#include?
|
30
|
-
# requires terms to
|
30
|
+
# requires terms to have been sorted
|
31
31
|
# Looks the normalized form of +term+ up in the memoized include cache.
def include?(term)
  normalized = term.spellr_normalize
  include_cache[normalized]
end
|
34
34
|
|
35
|
-
def
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
35
|
+
# Adds +term+ (normalized) to the wordlist: makes sure the file exists, marks
# the term as included, inserts it at its sorted position and rewrites the file.
def <<(term)
  normalized = term.spellr_normalize
  touch
  include_cache[normalized] = true
  insert_sorted(normalized)
  # we don't need to clear the cache
  @path.write(to_a.join)
end
|
42
42
|
|
43
43
|
def to_a
|
@@ -46,7 +46,7 @@ module Spellr
|
|
46
46
|
|
47
47
|
# Rewrites the wordlist with the normalized terms tokenized from +file+
# (defaults to the wordlist's own path). Key skipping is disabled.
def clean(file = @path)
  require_relative 'tokenizer' # loaded on first use
  tokenizer = Spellr::Tokenizer.new(file, skip_key: false)
  write(tokenizer.normalized_terms.join)
end
|
51
51
|
|
52
52
|
def write(content)
|
@@ -61,37 +61,41 @@ module Spellr
|
|
61
61
|
@path.read
|
62
62
|
end
|
63
63
|
|
64
|
-
def clear_cache
|
65
|
-
@to_a = nil
|
66
|
-
@include = nil
|
67
|
-
end
|
68
|
-
|
69
64
|
# Whether the wordlist path exists. The answer is memoized in @exist;
# `defined?` is used so that a memoized `false` is also reused.
def exist?
  @exist = @path.exist? unless defined?(@exist)
  @exist
end
|
74
69
|
|
75
|
-
def add(term)
|
76
|
-
touch
|
77
|
-
term = term.normalize
|
78
|
-
include_cache[term] = true
|
79
|
-
to_a << term
|
80
|
-
to_a.sort!
|
81
|
-
write(@to_a.join)
|
82
|
-
Spellr.config.clear_cache if to_a.length == 1
|
83
|
-
end
|
84
|
-
|
85
70
|
# Creates an empty wordlist file (and its parent directories) when it does
# not exist yet, then clears the cached state. Does nothing otherwise.
def touch
  return if exist?

  directory = @path.dirname
  directory.mkpath
  @path.write('')
  clear_cache
end
|
92
77
|
|
93
78
|
private
|
94
79
|
|
80
|
+
# Inserts +term+ into to_a before the first entry that is >= term, or at the
# end when there is no such entry.
def insert_sorted(term)
  terms = to_a
  position = terms.bsearch_index { |existing| existing >= term } || terms.length
  terms.insert(position, term)
end
|
84
|
+
|
85
|
+
# Memoized hash of term => lookup result. A missing key is resolved with a
# binary search over to_a and the result (the matching entry or nil) is stored.
def include_cache
  return @include_cache if @include_cache

  @include_cache = Hash.new do |cache, term|
    cache[term] = to_a.bsearch { |value| term <=> value }
  end
end
|
92
|
+
|
93
|
+
# Drops all memoized state so it is recomputed on next use: the term array,
# the include-lookup cache and the memoized existence check.
def clear_cache
  @to_a = nil
  # include_cache memoizes into @include_cache; resetting @include left it stale
  @include_cache = nil
  # exist? tests `defined?(@exist)`, so the variable must be removed, not nil-ed
  remove_instance_variable(:@exist) if defined?(@exist)
end
|
98
|
+
|
95
99
|
def raise_unless_exists?
|
96
100
|
return if exist?
|
97
101
|
|
@@ -1,21 +1,29 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'set'
|
4
|
+
require_relative 'base_reporter'
|
4
5
|
|
5
6
|
module Spellr
  # Reporter that collects the normalized form of every token it is given and
  # prints them, sorted, when finished.
  class WordlistReporter < Spellr::BaseReporter
    def parallel?
      true
    end

    # Prints the collected words, sorted and joined.
    def finish
      output.puts words.sort.join
    end

    # Records the normalized form of +token+.
    def call(token)
      words << token.spellr_normalize
    end

    private

    # The word set lives in output.counts[:words]; it is created on first use
    # when that key is absent.
    def words
      @words ||= begin
        counts = output.counts
        counts[:words] = Set.new unless counts.key?(:words)
        counts[:words]
      end
    end
  end
end
|