spellr 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../spellr'
4
+
5
module Spellr
  # Regular expressions describing word-like terms, and the non-word spans
  # (URLs, hex values, escapes, API keys, ...) that are collected into the
  # SKIPS / AFTER_KEY_SKIPS unions below.
  module TokenRegexps
    #### WORDS ####

    # [Word], [Word]Word [Word]'s [Wordn't]
    TITLE_CASE_RE = /[[:upper:]][[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
    # [WORD] [WORD]Word [WORDN'T] [WORD]'S [WORD]'s [WORD]s
    UPPER_CASE_RE = /[[:upper:]]+(?:['’][[:upper:]]+(?<!['’][Ss]))*(?:(?![[:lower:]])|(?=s(?![[:lower:]])))/.freeze # rubocop:disable Metrics/LineLength
    # [word] [word]'s [wordn't]
    LOWER_CASE_RE = /[[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
    # for characters in [:alpha:] that aren't in [:lower:] or [:upper:] e.g. Arabic
    OTHER_CASE_RE = /(?:[[:alpha:]](?<![[:lower:][:upper:]]))+/.freeze

    # Any single term: the four case-specific patterns tried in this order.
    TERM_RE = Regexp.union(TITLE_CASE_RE, UPPER_CASE_RE, LOWER_CASE_RE, OTHER_CASE_RE).freeze

    #### NON WORDS ####

    # Runs of characters that are neither letters, digits, nor one of / % # \
    NOT_EVEN_NON_WORDS_RE = %r{[^[:alpha:]/%#0-9\\]+}.freeze
    LEFTOVER_NON_WORD_BITS_RE = %r{[/%#\\]|\d+}.freeze # e.g. a / not starting //a-url.com
    # #rgb / #rrggbb colours and 0x-prefixed hex numbers, not followed by a letter
    HEX_RE = /(?:#(?:\h{6}|\h{3})|0x\h+)(?![[:alpha:]])/.freeze
    # Literal shell colour escapes such as \e[1;31m or \033[0m
    SHELL_COLOR_ESCAPE_RE = /\\(?:e|0?33)\[\d+(;\d+)*m/.freeze
    # NOTE(review): the trailing group consumes one letter; a negative lookahead
    # `(?![[:alpha:]])`, as used by the neighbouring patterns, may have been
    # intended - confirm before changing.
    PUNYCODE_RE = /xn--[a-v0-9\-]+(?:[[:alpha:]])/.freeze
    # TODO: hex escapes e.g. \xAA.
    # TODO: language aware escapes
    BACKSLASH_ESCAPE_RE = /\\[a-zA-Z]/.freeze
    REPEATED_SINGLE_LETTERS_RE = /(?:([[:alpha:]])\1+)(?![[:alpha:]])/.freeze # e.g. xxxxxxxx
    # Upper-case percent-encoded bytes, e.g. %2F or %9A.
    # The digit range is 0-9: the previous `0-8` silently excluded every
    # entity containing a 9 (such as %29 or %39).
    URL_ENCODED_ENTITIES_RE = /%[0-9A-F]{2}/.freeze
    # There's got to be a better way of writing this
    SEQUENTIAL_LETTERS_RE = /a(?:b(?:c(?:d(?:e(?:f(?:g(?:h(?:i(?:j(?:k(?:l(?:m(?:n(?:o(?:p(?:q(?:r(?:s(?:t(?:u(?:v(?:w(?:x(?:yz?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?(?![[:alpha:]])/i.freeze # rubocop:disable Metrics/LineLength

    # I didn't want to do this myself
    # BUT i need something to heuristically match on, and it's difficult
    URL_SCHEME = %r{(?://|https?://|s?ftp://|mailto:)}.freeze
    URL_USERINFO = /[[:alnum:]]+(?::[[:alnum:]]+)?@/.freeze
    URL_IP_ADDRESS = /\d{1,3}(?:\.\d{1,3}){3}/.freeze
    # literal \ so that i can match on domains in regexps. no-one cares but me.
    URL_HOSTNAME = /(?:[[:alnum:]\-\\]+(?:\.[[:alnum:]\-\\]+)+|localhost|#{URL_IP_ADDRESS})/.freeze
    URL_PORT = /:\d+/.freeze
    URL_PATH = %r{/(?:[[:alnum:]=@!$&\-/._\\]|%\h{2})+}.freeze
    URL_QUERY = %r{\?(?:[[:alnum:]=!$\-/.\\]|%\h{2})+(?:&(?:[[:alnum:]=!$\-/.\\]|%\h{2})+)*}.freeze
    URL_FRAGMENT = %r{#(?:[[:alnum:]=!$&\-/.\\]|%\h{2})+}.freeze

    # URL can be any valid hostname, it must have either a scheme, userinfo, or path
    # it may have those and any of the others and a port, or a query or a fragment.
    URL_REST = /#{URL_QUERY}?#{URL_FRAGMENT}?/.freeze
    URL_RE = Regexp.union(
      /#{URL_SCHEME}#{URL_USERINFO}?#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?#{URL_REST}/,
      /#{URL_USERINFO}#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?#{URL_REST}/,
      /#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}#{URL_REST}/
    ).freeze

    # Fixed-shape patterns for well-known key / digest formats.
    KEY_SENDGRID_RE = /SG\.[\w\-]{22}\.[\w\-]{43}/.freeze
    KEY_HYPERWALLET_RE = /prg-\h{8}-\h{4}-\h{4}-\h{4}-\h{12}/.freeze
    KEY_GTM_RE = /GTM-[A-Z0-9]{7}/.freeze
    KEY_SHA1 = %r{sha1-[A-Za-z0-9=+/]{28}}.freeze
    KEY_SHA512 = %r{sha512-[A-Za-z0-9=;+/]{88}}.freeze
    KEY_DATA_URL = %r{data:[a-z/;0-9\-]+;base64,[A-Za-z0-9+/]+=*(?![[:alnum:]])}.freeze

    KEY_PATTERNS_RE = Regexp.union(
      KEY_SENDGRID_RE, KEY_HYPERWALLET_RE, KEY_GTM_RE, KEY_SHA1, KEY_SHA512, KEY_DATA_URL
    ).freeze

    # Union of the non-word patterns above; alternatives are tried in the
    # order listed.
    SKIPS = Regexp.union(
      NOT_EVEN_NON_WORDS_RE,
      SHELL_COLOR_ESCAPE_RE,
      BACKSLASH_ESCAPE_RE,
      URL_ENCODED_ENTITIES_RE,
      HEX_RE,
      URL_RE, # 2%
      KEY_PATTERNS_RE
    ).freeze

    AFTER_KEY_SKIPS = Regexp.union(
      LEFTOVER_NON_WORD_BITS_RE,
      REPEATED_SINGLE_LETTERS_RE,
      SEQUENTIAL_LETTERS_RE
    ).freeze

    # this is in a method because the minimum word length stuff was throwing it off
    # TODO: move to config maybe?
    #
    # Built lazily from Spellr.config.word_minimum_length on first call and
    # memoised on the including object, so later config changes are not seen.
    def min_alpha_re
      @min_alpha_re ||= Regexp.union(
        /[A-Z][a-z]{#{Spellr.config.word_minimum_length - 1}}/,
        /[a-z]{#{Spellr.config.word_minimum_length}}/,
        /[A-Z]{#{Spellr.config.word_minimum_length}}/
      ).freeze
    end

    # Building blocks for POSSIBLE_KEY_RE: a letter-led chunk and a digit-led
    # chunk, each allowing - _ / + separators after the first character.
    ALPHA_SEP_RE = %r{[A-Za-z][A-Za-z\-_/+]*}.freeze
    NUM_SEP_RE = %r{\d[\d\-_/+]*}.freeze
    # At least three alternating alpha/num chunks, anchored at string start.
    THREE_CHUNK_RE = Regexp.union(
      /\A#{ALPHA_SEP_RE}#{NUM_SEP_RE}#{ALPHA_SEP_RE}/,
      /\A#{NUM_SEP_RE}#{ALPHA_SEP_RE}#{NUM_SEP_RE}/
    ).freeze
    POSSIBLE_KEY_RE = %r{#{THREE_CHUNK_RE}[A-Za-z0-9+/\-_]*=*(?![[:alnum:]])}.freeze

    # Inline markers for switching checking off and on.
    SPELLR_DISABLE_RE = /spellr:disable/.freeze
    SPELLR_ENABLE_RE = /spellr:enable/.freeze
  end
end
@@ -14,13 +14,12 @@ module Spellr
14
14
  attr_accessor :disabled
15
15
  alias_method :disabled?, :disabled
16
16
 
17
- def initialize(file, start_at: nil, skip_uri: true, skip_key: true)
18
- # $stderr.puts start_at if start_at
17
+ def initialize(file, start_at: nil, skip_key: true)
19
18
  @start_at = start_at || ColumnLocation.new(line_location: LineLocation.new(file))
20
19
  @file = file.is_a?(StringIO) || file.is_a?(IO) ? file : ::File.new(file)
21
20
  @file.pos = @start_at.line_location.byte_offset
22
21
 
23
- @line_tokenizer = LineTokenizer.new(skip_uri: skip_uri, skip_key: skip_key)
22
+ @line_tokenizer = LineTokenizer.new(skip_key: skip_key)
24
23
  end
25
24
 
26
25
  def terms
@@ -32,37 +31,59 @@ module Spellr
32
31
  end
33
32
 
34
33
  def each_term(&block)
35
- each_line_with_offset do |line, line_number|
36
- prepare_tokenizer_for_line(line, line_number).each_term(&block)
34
+ file.each_line do |line|
35
+ prepare_tokenizer_for_line(line).each_term(&block)
37
36
  end
38
37
  end
39
38
 
40
- def each_token(&block) # rubocop:disable Metrics/AbcSize
39
# Yields every token in the file, one line at a time.
#
# skip_term_proc is forwarded untouched to the line tokenizer's each_token.
# Each yielded token has its `line` set to a Token wrapping the full source
# line and that line's location (see prepare_line).
def each_token(skip_term_proc: nil) # rubocop:disable Metrics/MethodLength
  each_line_with_stats do |text, number, chars_before, bytes_before|
    tokenizer = prepare_tokenizer_for_line(text)

    tokenizer.each_token(skip_term_proc: skip_term_proc) do |found|
      # built per token, only once a token has actually been produced
      found.line = prepare_line(text, number, chars_before, bytes_before)

      yield found
    end
  end
end
48
+
49
# Wraps a raw line in a Token whose location is a ColumnLocation built on
# a LineLocation for the given line number and char/byte offsets.
def prepare_line(line, line_number, char_offset, byte_offset)
  offsets = { char_offset: char_offset, byte_offset: byte_offset }
  start_of_line = LineLocation.new(file, line_number, **offsets)

  Token.new(line, location: ColumnLocation.new(line_location: start_of_line))
end
56
+
57
+ def each_line_with_stats # rubocop:disable Metrics/MethodLength
41
58
  char_offset = @start_at.line_location.char_offset
42
59
  byte_offset = @start_at.line_location.byte_offset
43
60
 
44
- each_line_with_offset do |line, line_number|
45
- line_location = LineLocation.new(file, line_number, byte_offset: byte_offset, char_offset: char_offset)
61
+ file.each_line.with_index(@start_at.line_location.line_number) do |line, line_number|
62
+ yield line, line_number, char_offset, byte_offset
63
+
46
64
  char_offset += line.length
47
65
  byte_offset += line.bytesize
48
- line = Token.new(line, location: ColumnLocation.new(line_location: line_location))
49
- prepare_tokenizer_for_line(line, line_number).each_token(&block)
50
66
  end
51
67
  end
52
68
 
53
69
  def normalized_terms
54
- enum_for(:each_term).map(&:normalize).uniq.sort
70
+ enum_for(:each_term).map(&:spellr_normalize).uniq.sort
55
71
  end
56
72
 
57
73
  private
58
74
 
59
75
  attr_reader :line_tokenizer
60
76
 
61
- def each_line_with_offset(&block)
62
- file.each_line.with_index(@start_at.line_number, &block)
77
+ def each_line_token
78
+ line_location = @start_at.line_location
79
+
80
+ file.each_line do |line|
81
+ yield Token.new(line, location: ColumnLocation.new(line_location: line_location))
82
+ line_location = line_location.advance(line)
83
+ end
63
84
  end
64
85
 
65
- def prepare_tokenizer_for_line(line, _line_number)
86
+ def prepare_tokenizer_for_line(line)
66
87
  line_tokenizer.string = line
67
88
  line_tokenizer.pos = 0
68
89
  line_tokenizer
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spellr
4
- VERSION = '0.5.0'
4
+ VERSION = '0.5.1'
5
5
  end
@@ -27,17 +27,17 @@ module Spellr
27
27
  end
28
28
 
29
29
  # significantly faster than default Enumerable#include?
30
- # requires terms to be sorted
30
+ # requires terms to have been sorted
31
31
  def include?(term)
32
- include_cache[term.normalize]
32
+ include_cache[term.spellr_normalize]
33
33
  end
34
34
 
35
- def include_cache
36
- @include_cache ||= Hash.new do |cache, term|
37
- cache[term] = to_a.bsearch do |value|
38
- term <=> value
39
- end
40
- end
35
# Adds a term to the wordlist: normalizes it, makes sure the file exists,
# records it in the lookup cache, inserts it in sorted position and writes
# the whole list back to @path.
def <<(term)
  normalized = term.spellr_normalize

  # touch must run before the cache/list are updated below
  touch
  include_cache[normalized] = true
  insert_sorted(normalized)

  @path.write(to_a.join) # we don't need to clear the cache
end
42
42
 
43
43
  def to_a
@@ -46,7 +46,7 @@ module Spellr
46
46
 
47
47
  def clean(file = @path)
48
48
  require_relative 'tokenizer'
49
- write(Spellr::Tokenizer.new(file, skip_uri: false, skip_key: false).normalized_terms.join)
49
+ write(Spellr::Tokenizer.new(file, skip_key: false).normalized_terms.join)
50
50
  end
51
51
 
52
52
  def write(content)
@@ -61,37 +61,41 @@ module Spellr
61
61
  @path.read
62
62
  end
63
63
 
64
- def clear_cache
65
- @to_a = nil
66
- @include = nil
67
- end
68
-
69
64
  def exist?
70
65
  return @exist if defined?(@exist)
71
66
 
72
67
  @exist = @path.exist?
73
68
  end
74
69
 
75
- def add(term)
76
- touch
77
- term = term.normalize
78
- include_cache[term] = true
79
- to_a << term
80
- to_a.sort!
81
- write(@to_a.join)
82
- Spellr.config.clear_cache if to_a.length == 1
83
- end
84
-
85
70
  def touch
86
71
  return if exist?
87
72
 
88
73
  @path.dirname.mkpath
89
74
  @path.write('')
90
- remove_instance_variable(:@exist)
75
+ clear_cache
91
76
  end
92
77
 
93
78
  private
94
79
 
80
# Inserts term into the (already sorted) to_a array, keeping it sorted.
# Binary-searches for the first element >= term; when there is none the
# term goes at the end. Returns the array.
def insert_sorted(term)
  list = to_a
  position = list.bsearch_index { |existing| existing >= term }

  return list.push(term) if position.nil?

  list.insert(position, term)
end
84
+
85
# Memoised Hash whose default block binary-searches to_a for the requested
# term (find-any mode via <=>) and stores the result - the matching entry,
# or nil when absent - so each term is only searched for once.
def include_cache
  @include_cache ||= Hash.new do |memo, wanted|
    memo[wanted] = to_a.bsearch { |candidate| wanted <=> candidate }
  end
end
92
+
93
# Forgets every memoised value so it is recomputed on next access:
# the term array (@to_a), the lookup memo (@include_cache) and the
# cached existence flag (@exist).
def clear_cache
  @to_a = nil
  # The lookup memo is stored in @include_cache (see #include_cache);
  # resetting `@include` left stale lookup results in place.
  @include_cache = nil
  # exist? uses defined?(@exist), so the ivar must be removed, not nil-ed
  remove_instance_variable(:@exist) if defined?(@exist)
end
98
+
95
99
  def raise_unless_exists?
96
100
  return if exist?
97
101
 
@@ -1,21 +1,29 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'set'
4
+ require_relative 'base_reporter'
4
5
 
5
6
  module Spellr
6
- class WordlistReporter
7
- attr_reader :words
8
-
9
- def initialize
10
- @words = Set.new
7
+ class WordlistReporter < Spellr::BaseReporter
8
+ def parallel?
9
+ true
11
10
  end
12
11
 
13
- def finish(_checked)
14
- puts words.sort.join
12
+ def finish
13
+ output.puts words.sort.join
15
14
  end
16
15
 
17
16
  def call(token)
18
- words << token.normalize
17
+ words << token.spellr_normalize
18
+ end
19
+
20
+ private
21
+
22
# The Set of collected words. It is stored under output.counts[:words]
# (created there on first use if the key is missing) and memoised in @words.
def words
  return @words if @words

  counts = output.counts
  counts[:words] = Set.new unless counts.key?(:words)
  @words = counts[:words]
end
28
  end
21
29
  end
data/lib/spellr.rb CHANGED
@@ -5,6 +5,7 @@ require_relative 'spellr/config'
5
5
 
6
6
  module Spellr
7
7
  class Error < StandardError; end
8
+
8
9
  class Wordlist
9
10
  class NotFound < Spellr::Error; end
10
11
  end