spellr 0.5.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Gemfile.lock +14 -14
- data/lib/.spellr.yml +2 -0
- data/lib/spellr/backports.rb +16 -6
- data/lib/spellr/base_reporter.rb +54 -0
- data/lib/spellr/check.rb +54 -20
- data/lib/spellr/cli.rb +13 -6
- data/lib/spellr/column_location.rb +1 -1
- data/lib/spellr/config.rb +6 -45
- data/lib/spellr/config_loader.rb +10 -6
- data/lib/spellr/file.rb +15 -2
- data/lib/spellr/file_list.rb +21 -17
- data/lib/spellr/interactive.rb +51 -116
- data/lib/spellr/interactive_add.rb +64 -0
- data/lib/spellr/interactive_replacement.rb +69 -0
- data/lib/spellr/key_tuner/naive_bayes.rb +49 -91
- data/lib/spellr/key_tuner/possible_key.rb +36 -32
- data/lib/spellr/key_tuner/stats.rb +26 -7
- data/lib/spellr/language.rb +28 -44
- data/lib/spellr/line_location.rb +13 -7
- data/lib/spellr/line_tokenizer.rb +35 -134
- data/lib/spellr/output.rb +62 -0
- data/lib/spellr/output_stubbed.rb +58 -0
- data/lib/spellr/quiet_reporter.rb +13 -0
- data/lib/spellr/reporter.rb +9 -13
- data/lib/spellr/token.rb +14 -16
- data/lib/spellr/token_regexps.rb +103 -0
- data/lib/spellr/tokenizer.rb +35 -14
- data/lib/spellr/version.rb +1 -1
- data/lib/spellr/wordlist.rb +29 -25
- data/lib/spellr/wordlist_reporter.rb +16 -8
- data/lib/spellr.rb +1 -0
- data/wordlists/ruby.txt +1046 -13
- metadata +9 -2
@@ -0,0 +1,103 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../spellr'
|
4
|
+
|
5
|
+
module Spellr
|
6
|
+
module TokenRegexps
|
7
|
+
#### WORDS ####
|
8
|
+
|
9
|
+
# [Word], [Word]Word [Word]'s [Wordn't]
|
10
|
+
TITLE_CASE_RE = /[[:upper:]][[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
|
11
|
+
# [WORD] [WORD]Word [WORDN'T] [WORD]'S [WORD]'s [WORD]s
|
12
|
+
UPPER_CASE_RE = /[[:upper:]]+(?:['’][[:upper:]]+(?<!['’][Ss]))*(?:(?![[:lower:]])|(?=s(?![[:lower:]])))/.freeze # rubocop:disable Metrics/LineLength
|
13
|
+
# [word] [word]'s [wordn't]
|
14
|
+
LOWER_CASE_RE = /[[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
|
15
|
+
# for characters in [:alpha:] that aren't in [:lower:] or [:upper:] e.g. Arabic
|
16
|
+
OTHER_CASE_RE = /(?:[[:alpha:]](?<![[:lower:][:upper:]]))+/.freeze
|
17
|
+
|
18
|
+
# Any single word: title-case, upper-case, lower-case, or caseless alphabetic.
# Frozen like the other unions in this module (URL_RE, SKIPS).
TERM_RE = Regexp.union(TITLE_CASE_RE, UPPER_CASE_RE, LOWER_CASE_RE, OTHER_CASE_RE).freeze
|
19
|
+
|
20
|
+
#### NON WORDS ####
|
21
|
+
|
22
|
+
NOT_EVEN_NON_WORDS_RE = %r{[^[:alpha:]/%#0-9\\]+}.freeze
|
23
|
+
LEFTOVER_NON_WORD_BITS_RE = %r{[/%#\\]|\d+}.freeze # e.g. a / not starting //a-url.com
|
24
|
+
HEX_RE = /(?:#(?:\h{6}|\h{3})|0x\h+)(?![[:alpha:]])/.freeze
|
25
|
+
SHELL_COLOR_ESCAPE_RE = /\\(?:e|0?33)\[\d+(;\d+)*m/.freeze
|
26
|
+
# NOTE(review): the trailing (?:[[:alpha:]]) consumes one required letter; a
# (?![[:alpha:]]) lookahead as used by HEX_RE may have been intended — confirm.
PUNYCODE_RE = /xn--[a-v0-9\-]+(?:[[:alpha:]])/.freeze
|
27
|
+
# TODO: hex escapes e.g. \xAA.
|
28
|
+
# TODO: language aware escapes
|
29
|
+
BACKSLASH_ESCAPE_RE = /\\[a-zA-Z]/.freeze
|
30
|
+
REPEATED_SINGLE_LETTERS_RE = /(?:([[:alpha:]])\1+)(?![[:alpha:]])/.freeze # e.g. xxxxxxxx
|
31
|
+
URL_ENCODED_ENTITIES_RE = /%[0-9A-F]{2}/.freeze # e.g. %2F or %9A (upper-case hex digits only)
|
32
|
+
# There's got to be a better way of writing this
|
33
|
+
SEQUENTIAL_LETTERS_RE = /a(?:b(?:c(?:d(?:e(?:f(?:g(?:h(?:i(?:j(?:k(?:l(?:m(?:n(?:o(?:p(?:q(?:r(?:s(?:t(?:u(?:v(?:w(?:x(?:yz?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?(?![[:alpha:]])/i.freeze # rubocop:disable Metrics/LineLength
|
34
|
+
|
35
|
+
# I didn't want to do this myself
|
36
|
+
# BUT i need something to heuristically match on, and it's difficult
|
37
|
+
URL_SCHEME = %r{(?://|https?://|s?ftp://|mailto:)}.freeze
|
38
|
+
URL_USERINFO = /[[:alnum:]]+(?::[[:alnum:]]+)?@/.freeze
|
39
|
+
URL_IP_ADDRESS = /\d{1,3}(?:\.\d{1,3}){3}/.freeze
|
40
|
+
# literal \ so that i can match on domains in regexps. no-one cares but me.
|
41
|
+
URL_HOSTNAME = /(?:[[:alnum:]\-\\]+(?:\.[[:alnum:]\-\\]+)+|localhost|#{URL_IP_ADDRESS})/.freeze
|
42
|
+
URL_PORT = /:\d+/.freeze
|
43
|
+
URL_PATH = %r{/(?:[[:alnum:]=@!$&\-/._\\]|%\h{2})+}.freeze
|
44
|
+
URL_QUERY = %r{\?(?:[[:alnum:]=!$\-/.\\]|%\h{2})+(?:&(?:[[:alnum:]=!$\-/.\\]|%\h{2})+)*}.freeze
|
45
|
+
URL_FRAGMENT = %r{#(?:[[:alnum:]=!$&\-/.\\]|%\h{2})+}.freeze
|
46
|
+
|
47
|
+
# URL can be any valid hostname, it must have either a scheme, userinfo, or path
|
48
|
+
# it may have those and any of the others and a port, or a query or a fragment.
|
49
|
+
URL_REST = /#{URL_QUERY}?#{URL_FRAGMENT}?/.freeze
|
50
|
+
URL_RE = Regexp.union(
|
51
|
+
/#{URL_SCHEME}#{URL_USERINFO}?#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?#{URL_REST}/,
|
52
|
+
/#{URL_USERINFO}#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?#{URL_REST}/,
|
53
|
+
/#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}#{URL_REST}/
|
54
|
+
).freeze
|
55
|
+
|
56
|
+
KEY_SENDGRID_RE = /SG\.[\w\-]{22}\.[\w\-]{43}/.freeze
|
57
|
+
KEY_HYPERWALLET_RE = /prg-\h{8}-\h{4}-\h{4}-\h{4}-\h{12}/.freeze
|
58
|
+
KEY_GTM_RE = /GTM-[A-Z0-9]{7}/.freeze
|
59
|
+
KEY_SHA1 = %r{sha1-[A-Za-z0-9=+/]{28}}.freeze
|
60
|
+
KEY_SHA512 = %r{sha512-[A-Za-z0-9=;+/]{88}}.freeze
|
61
|
+
KEY_DATA_URL = %r{data:[a-z/;0-9\-]+;base64,[A-Za-z0-9+/]+=*(?![[:alnum:]])}.freeze
|
62
|
+
|
63
|
+
KEY_PATTERNS_RE = Regexp.union(
|
64
|
+
KEY_SENDGRID_RE, KEY_HYPERWALLET_RE, KEY_GTM_RE, KEY_SHA1, KEY_SHA512, KEY_DATA_URL
|
65
|
+
)
|
66
|
+
|
67
|
+
SKIPS = Regexp.union(
|
68
|
+
NOT_EVEN_NON_WORDS_RE,
|
69
|
+
SHELL_COLOR_ESCAPE_RE,
|
70
|
+
BACKSLASH_ESCAPE_RE,
|
71
|
+
URL_ENCODED_ENTITIES_RE,
|
72
|
+
HEX_RE,
|
73
|
+
URL_RE, # 2%
|
74
|
+
KEY_PATTERNS_RE
|
75
|
+
).freeze
|
76
|
+
|
77
|
+
AFTER_KEY_SKIPS = Regexp.union(
|
78
|
+
LEFTOVER_NON_WORD_BITS_RE,
|
79
|
+
REPEATED_SINGLE_LETTERS_RE,
|
80
|
+
SEQUENTIAL_LETTERS_RE
|
81
|
+
)
|
82
|
+
|
83
|
+
# this is in a method because the minimum word length stuff was throwing it off
|
84
|
+
# TODO: move to config maybe?
|
85
|
+
def min_alpha_re
|
86
|
+
@min_alpha_re ||= Regexp.union(
|
87
|
+
/[A-Z][a-z]{#{Spellr.config.word_minimum_length - 1}}/,
|
88
|
+
/[a-z]{#{Spellr.config.word_minimum_length}}/,
|
89
|
+
/[A-Z]{#{Spellr.config.word_minimum_length}}/
|
90
|
+
).freeze
|
91
|
+
end
|
92
|
+
ALPHA_SEP_RE = %r{[A-Za-z][A-Za-z\-_/+]*}.freeze
|
93
|
+
NUM_SEP_RE = %r{\d[\d\-_/+]*}.freeze
|
94
|
+
THREE_CHUNK_RE = Regexp.union(
|
95
|
+
/\A#{ALPHA_SEP_RE}#{NUM_SEP_RE}#{ALPHA_SEP_RE}/,
|
96
|
+
/\A#{NUM_SEP_RE}#{ALPHA_SEP_RE}#{NUM_SEP_RE}/
|
97
|
+
).freeze
|
98
|
+
POSSIBLE_KEY_RE = %r{#{THREE_CHUNK_RE}[A-Za-z0-9+/\-_]*=*(?![[:alnum:]])}.freeze
|
99
|
+
|
100
|
+
SPELLR_DISABLE_RE = /spellr:disable/.freeze
|
101
|
+
SPELLR_ENABLE_RE = /spellr:enable/.freeze
|
102
|
+
end
|
103
|
+
end
|
data/lib/spellr/tokenizer.rb
CHANGED
@@ -14,13 +14,12 @@ module Spellr
|
|
14
14
|
attr_accessor :disabled
|
15
15
|
alias_method :disabled?, :disabled
|
16
16
|
|
17
|
-
def initialize(file, start_at: nil,
|
18
|
-
# $stderr.puts start_at if start_at
|
17
|
+
def initialize(file, start_at: nil, skip_key: true)
|
19
18
|
@start_at = start_at || ColumnLocation.new(line_location: LineLocation.new(file))
|
20
19
|
@file = file.is_a?(StringIO) || file.is_a?(IO) ? file : ::File.new(file)
|
21
20
|
@file.pos = @start_at.line_location.byte_offset
|
22
21
|
|
23
|
-
@line_tokenizer = LineTokenizer.new(
|
22
|
+
@line_tokenizer = LineTokenizer.new(skip_key: skip_key)
|
24
23
|
end
|
25
24
|
|
26
25
|
def terms
|
@@ -32,37 +31,59 @@ module Spellr
|
|
32
31
|
end
|
33
32
|
|
34
33
|
def each_term(&block)
|
35
|
-
|
36
|
-
prepare_tokenizer_for_line(line
|
34
|
+
file.each_line do |line|
|
35
|
+
prepare_tokenizer_for_line(line).each_term(&block)
|
37
36
|
end
|
38
37
|
end
|
39
38
|
|
40
|
-
def each_token(
|
39
|
+
def each_token(skip_term_proc: nil) # rubocop:disable Metrics/MethodLength
|
40
|
+
each_line_with_stats do |line, line_number, char_offset, byte_offset|
|
41
|
+
prepare_tokenizer_for_line(line).each_token(skip_term_proc: skip_term_proc) do |token|
|
42
|
+
token.line = prepare_line(line, line_number, char_offset, byte_offset)
|
43
|
+
|
44
|
+
yield token
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
def prepare_line(line, line_number, char_offset, byte_offset)
|
50
|
+
line_location = LineLocation.new(
|
51
|
+
file, line_number, char_offset: char_offset, byte_offset: byte_offset
|
52
|
+
)
|
53
|
+
column_location = ColumnLocation.new(line_location: line_location)
|
54
|
+
Token.new(line, location: column_location)
|
55
|
+
end
|
56
|
+
|
57
|
+
def each_line_with_stats # rubocop:disable Metrics/MethodLength
|
41
58
|
char_offset = @start_at.line_location.char_offset
|
42
59
|
byte_offset = @start_at.line_location.byte_offset
|
43
60
|
|
44
|
-
|
45
|
-
|
61
|
+
file.each_line.with_index(@start_at.line_location.line_number) do |line, line_number|
|
62
|
+
yield line, line_number, char_offset, byte_offset
|
63
|
+
|
46
64
|
char_offset += line.length
|
47
65
|
byte_offset += line.bytesize
|
48
|
-
line = Token.new(line, location: ColumnLocation.new(line_location: line_location))
|
49
|
-
prepare_tokenizer_for_line(line, line_number).each_token(&block)
|
50
66
|
end
|
51
67
|
end
|
52
68
|
|
53
69
|
def normalized_terms
|
54
|
-
enum_for(:each_term).map(&:
|
70
|
+
enum_for(:each_term).map(&:spellr_normalize).uniq.sort
|
55
71
|
end
|
56
72
|
|
57
73
|
private
|
58
74
|
|
59
75
|
attr_reader :line_tokenizer
|
60
76
|
|
61
|
-
def
|
62
|
-
|
77
|
+
def each_line_token
|
78
|
+
line_location = @start_at.line_location
|
79
|
+
|
80
|
+
file.each_line do |line|
|
81
|
+
yield Token.new(line, location: ColumnLocation.new(line_location: line_location))
|
82
|
+
line_location = line_location.advance(line)
|
83
|
+
end
|
63
84
|
end
|
64
85
|
|
65
|
-
def prepare_tokenizer_for_line(line
|
86
|
+
def prepare_tokenizer_for_line(line)
|
66
87
|
line_tokenizer.string = line
|
67
88
|
line_tokenizer.pos = 0
|
68
89
|
line_tokenizer
|
data/lib/spellr/version.rb
CHANGED
data/lib/spellr/wordlist.rb
CHANGED
@@ -27,17 +27,17 @@ module Spellr
|
|
27
27
|
end
|
28
28
|
|
29
29
|
# significantly faster than default Enumerable#include?
|
30
|
-
# requires terms to
|
30
|
+
# requires terms to have been sorted
|
31
31
|
def include?(term)
|
32
|
-
include_cache[term.
|
32
|
+
include_cache[term.spellr_normalize]
|
33
33
|
end
|
34
34
|
|
35
|
-
def
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
35
|
+
def <<(term)
|
36
|
+
term = term.spellr_normalize
|
37
|
+
touch
|
38
|
+
include_cache[term] = true
|
39
|
+
insert_sorted(term)
|
40
|
+
@path.write(to_a.join) # we don't need to clear the cache
|
41
41
|
end
|
42
42
|
|
43
43
|
def to_a
|
@@ -46,7 +46,7 @@ module Spellr
|
|
46
46
|
|
47
47
|
def clean(file = @path)
|
48
48
|
require_relative 'tokenizer'
|
49
|
-
write(Spellr::Tokenizer.new(file,
|
49
|
+
write(Spellr::Tokenizer.new(file, skip_key: false).normalized_terms.join)
|
50
50
|
end
|
51
51
|
|
52
52
|
def write(content)
|
@@ -61,37 +61,41 @@ module Spellr
|
|
61
61
|
@path.read
|
62
62
|
end
|
63
63
|
|
64
|
-
def clear_cache
|
65
|
-
@to_a = nil
|
66
|
-
@include = nil
|
67
|
-
end
|
68
|
-
|
69
64
|
def exist?
|
70
65
|
return @exist if defined?(@exist)
|
71
66
|
|
72
67
|
@exist = @path.exist?
|
73
68
|
end
|
74
69
|
|
75
|
-
def add(term)
|
76
|
-
touch
|
77
|
-
term = term.normalize
|
78
|
-
include_cache[term] = true
|
79
|
-
to_a << term
|
80
|
-
to_a.sort!
|
81
|
-
write(@to_a.join)
|
82
|
-
Spellr.config.clear_cache if to_a.length == 1
|
83
|
-
end
|
84
|
-
|
85
70
|
def touch
|
86
71
|
return if exist?
|
87
72
|
|
88
73
|
@path.dirname.mkpath
|
89
74
|
@path.write('')
|
90
|
-
|
75
|
+
clear_cache
|
91
76
|
end
|
92
77
|
|
93
78
|
private
|
94
79
|
|
80
|
+
def insert_sorted(term)
|
81
|
+
insert_at = to_a.bsearch_index { |value| value >= term }
|
82
|
+
insert_at ? to_a.insert(insert_at, term) : to_a.push(term)
|
83
|
+
end
|
84
|
+
|
85
|
+
def include_cache
|
86
|
+
@include_cache ||= Hash.new do |cache, term|
|
87
|
+
cache[term] = to_a.bsearch do |value|
|
88
|
+
term <=> value
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
# Resets the memoized state held in instance variables.
# Called by #touch right after the wordlist file has been created.
def clear_cache
  @to_a = nil
  # include_cache memoizes into @include_cache, so that is the ivar to reset
  @include_cache = nil
  # exist? memoizes via defined?(@exist), so the ivar is removed rather than set to nil
  remove_instance_variable(:@exist) if defined?(@exist)
end
|
98
|
+
|
95
99
|
def raise_unless_exists?
|
96
100
|
return if exist?
|
97
101
|
|
@@ -1,21 +1,29 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'set'
|
4
|
+
require_relative 'base_reporter'
|
4
5
|
|
5
6
|
module Spellr
|
6
|
-
class WordlistReporter
|
7
|
-
|
8
|
-
|
9
|
-
def initialize
|
10
|
-
@words = Set.new
|
7
|
+
class WordlistReporter < Spellr::BaseReporter
|
8
|
+
def parallel?
|
9
|
+
true
|
11
10
|
end
|
12
11
|
|
13
|
-
def finish
|
14
|
-
puts words.sort.join
|
12
|
+
def finish
|
13
|
+
output.puts words.sort.join
|
15
14
|
end
|
16
15
|
|
17
16
|
def call(token)
|
18
|
-
words << token.
|
17
|
+
words << token.spellr_normalize
|
18
|
+
end
|
19
|
+
|
20
|
+
private
|
21
|
+
|
22
|
+
def words
|
23
|
+
@words ||= begin
|
24
|
+
output.counts[:words] = Set.new unless output.counts.key?(:words)
|
25
|
+
output.counts[:words]
|
26
|
+
end
|
19
27
|
end
|
20
28
|
end
|
21
29
|
end
|