spellr 0.5.0 → 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../spellr'
4
+
5
+ module Spellr
6
+ module TokenRegexps
7
+ #### WORDS ####
8
+
9
+ # [Word], [Word]Word [Word]'s [Wordn't]
10
+ TITLE_CASE_RE = /[[:upper:]][[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
11
+ # [WORD] [WORD]Word [WORDN'T] [WORD]'S [WORD]'s [WORD]s
12
+ UPPER_CASE_RE = /[[:upper:]]+(?:['’][[:upper:]]+(?<!['’][Ss]))*(?:(?![[:lower:]])|(?=s(?![[:lower:]])))/.freeze # rubocop:disable Metrics/LineLength
13
+ # [word] [word]'s [wordn't]
14
+ LOWER_CASE_RE = /[[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
15
+ # for characters in [:alpha:] that aren't in [:lower:] or [:upper:] e.g. Arabic
16
+ OTHER_CASE_RE = /(?:[[:alpha:]](?<![[:lower:][:upper:]]))+/.freeze
17
+
18
+ TERM_RE = Regexp.union(TITLE_CASE_RE, UPPER_CASE_RE, LOWER_CASE_RE, OTHER_CASE_RE)
19
+
20
+ #### NON WORDS ####
21
+
22
+ NOT_EVEN_NON_WORDS_RE = %r{[^[:alpha:]/%#0-9\\]+}.freeze
23
+ LEFTOVER_NON_WORD_BITS_RE = %r{[/%#\\]|\d+}.freeze # e.g. a / not starting //a-url.com
24
+ HEX_RE = /(?:#(?:\h{6}|\h{3})|0x\h+)(?![[:alpha:]])/.freeze
25
+ SHELL_COLOR_ESCAPE_RE = /\\(?:e|0?33)\[\d+(;\d+)*m/.freeze
26
+ PUNYCODE_RE = /xn--[a-v0-9\-]+(?:[[:alpha:]])/.freeze
27
+ # TODO: hex escapes e.g. \xAA.
28
+ # TODO: language aware escapes
29
+ BACKSLASH_ESCAPE_RE = /\\[a-zA-Z]/.freeze
30
+ REPEATED_SINGLE_LETTERS_RE = /(?:([[:alpha:]])\1+)(?![[:alpha:]])/.freeze # e.g. xxxxxxxx
31
+ URL_ENCODED_ENTITIES_RE = /%[0-8A-F]{2}/.freeze
32
+ # There's got to be a better way of writing this
33
+ SEQUENTIAL_LETTERS_RE = /a(?:b(?:c(?:d(?:e(?:f(?:g(?:h(?:i(?:j(?:k(?:l(?:m(?:n(?:o(?:p(?:q(?:r(?:s(?:t(?:u(?:v(?:w(?:x(?:yz?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?(?![[:alpha:]])/i.freeze # rubocop:disable Metrics/LineLength
34
+
35
+ # I didn't want to do this myself
36
+ # BUT I need something to heuristically match on, and it's difficult
37
+ URL_SCHEME = %r{(?://|https?://|s?ftp://|mailto:)}.freeze
38
+ URL_USERINFO = /[[:alnum:]]+(?::[[:alnum:]]+)?@/.freeze
39
+ URL_IP_ADDRESS = /\d{1,3}(?:\.\d{1,3}){3}/.freeze
40
+ # literal \ so that I can match on domains in regexps. no one cares but me.
41
+ URL_HOSTNAME = /(?:[[:alnum:]\-\\]+(?:\.[[:alnum:]\-\\]+)+|localhost|#{URL_IP_ADDRESS})/.freeze
42
+ URL_PORT = /:\d+/.freeze
43
+ URL_PATH = %r{/(?:[[:alnum:]=@!$&\-/._\\]|%\h{2})+}.freeze
44
+ URL_QUERY = %r{\?(?:[[:alnum:]=!$\-/.\\]|%\h{2})+(?:&(?:[[:alnum:]=!$\-/.\\]|%\h{2})+)*}.freeze
45
+ URL_FRAGMENT = %r{#(?:[[:alnum:]=!$&\-/.\\]|%\h{2})+}.freeze
46
+
47
+ # URL can be any valid hostname, it must have either a scheme, userinfo, or path
48
+ # it may have those and any of the others and a port, or a query or a fragment.
49
+ URL_REST = /#{URL_QUERY}?#{URL_FRAGMENT}?/.freeze
50
+ URL_RE = Regexp.union(
51
+ /#{URL_SCHEME}#{URL_USERINFO}?#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?#{URL_REST}/,
52
+ /#{URL_USERINFO}#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?#{URL_REST}/,
53
+ /#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}#{URL_REST}/
54
+ ).freeze
55
+
56
+ KEY_SENDGRID_RE = /SG\.[\w\-]{22}\.[\w\-]{43}/.freeze
57
+ KEY_HYPERWALLET_RE = /prg-\h{8}-\h{4}-\h{4}-\h{4}-\h{12}/.freeze
58
+ KEY_GTM_RE = /GTM-[A-Z0-9]{7}/.freeze
59
+ KEY_SHA1 = %r{sha1-[A-Za-z0-9=+/]{28}}.freeze
60
+ KEY_SHA512 = %r{sha512-[A-Za-z0-9=;+/]{88}}.freeze
61
+ KEY_DATA_URL = %r{data:[a-z/;0-9\-]+;base64,[A-Za-z0-9+/]+=*(?![[:alnum:]])}.freeze
62
+
63
+ KEY_PATTERNS_RE = Regexp.union(
64
+ KEY_SENDGRID_RE, KEY_HYPERWALLET_RE, KEY_GTM_RE, KEY_SHA1, KEY_SHA512, KEY_DATA_URL
65
+ )
66
+
67
+ SKIPS = Regexp.union(
68
+ NOT_EVEN_NON_WORDS_RE,
69
+ SHELL_COLOR_ESCAPE_RE,
70
+ BACKSLASH_ESCAPE_RE,
71
+ URL_ENCODED_ENTITIES_RE,
72
+ HEX_RE,
73
+ URL_RE, # 2%
74
+ KEY_PATTERNS_RE
75
+ ).freeze
76
+
77
+ AFTER_KEY_SKIPS = Regexp.union(
78
+ LEFTOVER_NON_WORD_BITS_RE,
79
+ REPEATED_SINGLE_LETTERS_RE,
80
+ SEQUENTIAL_LETTERS_RE
81
+ )
82
+
83
+ # this is in a method because the minimum word length stuff was throwing it off
84
+ # TODO: move to config maybe?
85
+ def min_alpha_re
86
+ @min_alpha_re ||= Regexp.union(
87
+ /[A-Z][a-z]{#{Spellr.config.word_minimum_length - 1}}/,
88
+ /[a-z]{#{Spellr.config.word_minimum_length}}/,
89
+ /[A-Z]{#{Spellr.config.word_minimum_length}}/
90
+ ).freeze
91
+ end
92
+ ALPHA_SEP_RE = %r{[A-Za-z][A-Za-z\-_/+]*}.freeze
93
+ NUM_SEP_RE = %r{\d[\d\-_/+]*}.freeze
94
+ THREE_CHUNK_RE = Regexp.union(
95
+ /\A#{ALPHA_SEP_RE}#{NUM_SEP_RE}#{ALPHA_SEP_RE}/,
96
+ /\A#{NUM_SEP_RE}#{ALPHA_SEP_RE}#{NUM_SEP_RE}/
97
+ ).freeze
98
+ POSSIBLE_KEY_RE = %r{#{THREE_CHUNK_RE}[A-Za-z0-9+/\-_]*=*(?![[:alnum:]])}.freeze
99
+
100
+ SPELLR_DISABLE_RE = /spellr:disable/.freeze
101
+ SPELLR_ENABLE_RE = /spellr:enable/.freeze
102
+ end
103
+ end
@@ -14,13 +14,12 @@ module Spellr
14
14
  attr_accessor :disabled
15
15
  alias_method :disabled?, :disabled
16
16
 
17
- def initialize(file, start_at: nil, skip_uri: true, skip_key: true)
18
- # $stderr.puts start_at if start_at
17
+ def initialize(file, start_at: nil, skip_key: true)
19
18
  @start_at = start_at || ColumnLocation.new(line_location: LineLocation.new(file))
20
19
  @file = file.is_a?(StringIO) || file.is_a?(IO) ? file : ::File.new(file)
21
20
  @file.pos = @start_at.line_location.byte_offset
22
21
 
23
- @line_tokenizer = LineTokenizer.new(skip_uri: skip_uri, skip_key: skip_key)
22
+ @line_tokenizer = LineTokenizer.new(skip_key: skip_key)
24
23
  end
25
24
 
26
25
  def terms
@@ -32,37 +31,59 @@ module Spellr
32
31
  end
33
32
 
34
33
  def each_term(&block)
35
- each_line_with_offset do |line, line_number|
36
- prepare_tokenizer_for_line(line, line_number).each_term(&block)
34
+ file.each_line do |line|
35
+ prepare_tokenizer_for_line(line).each_term(&block)
37
36
  end
38
37
  end
39
38
 
40
- def each_token(&block) # rubocop:disable Metrics/AbcSize
39
+ def each_token(skip_term_proc: nil) # rubocop:disable Metrics/MethodLength
40
+ each_line_with_stats do |line, line_number, char_offset, byte_offset|
41
+ prepare_tokenizer_for_line(line).each_token(skip_term_proc: skip_term_proc) do |token|
42
+ token.line = prepare_line(line, line_number, char_offset, byte_offset)
43
+
44
+ yield token
45
+ end
46
+ end
47
+ end
48
+
49
+ def prepare_line(line, line_number, char_offset, byte_offset)
50
+ line_location = LineLocation.new(
51
+ file, line_number, char_offset: char_offset, byte_offset: byte_offset
52
+ )
53
+ column_location = ColumnLocation.new(line_location: line_location)
54
+ Token.new(line, location: column_location)
55
+ end
56
+
57
+ def each_line_with_stats # rubocop:disable Metrics/MethodLength
41
58
  char_offset = @start_at.line_location.char_offset
42
59
  byte_offset = @start_at.line_location.byte_offset
43
60
 
44
- each_line_with_offset do |line, line_number|
45
- line_location = LineLocation.new(file, line_number, byte_offset: byte_offset, char_offset: char_offset)
61
+ file.each_line.with_index(@start_at.line_location.line_number) do |line, line_number|
62
+ yield line, line_number, char_offset, byte_offset
63
+
46
64
  char_offset += line.length
47
65
  byte_offset += line.bytesize
48
- line = Token.new(line, location: ColumnLocation.new(line_location: line_location))
49
- prepare_tokenizer_for_line(line, line_number).each_token(&block)
50
66
  end
51
67
  end
52
68
 
53
69
  def normalized_terms
54
- enum_for(:each_term).map(&:normalize).uniq.sort
70
+ enum_for(:each_term).map(&:spellr_normalize).uniq.sort
55
71
  end
56
72
 
57
73
  private
58
74
 
59
75
  attr_reader :line_tokenizer
60
76
 
61
- def each_line_with_offset(&block)
62
- file.each_line.with_index(@start_at.line_number, &block)
77
+ def each_line_token
78
+ line_location = @start_at.line_location
79
+
80
+ file.each_line do |line|
81
+ yield Token.new(line, location: ColumnLocation.new(line_location: line_location))
82
+ line_location = line_location.advance(line)
83
+ end
63
84
  end
64
85
 
65
- def prepare_tokenizer_for_line(line, _line_number)
86
+ def prepare_tokenizer_for_line(line)
66
87
  line_tokenizer.string = line
67
88
  line_tokenizer.pos = 0
68
89
  line_tokenizer
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spellr
4
- VERSION = '0.5.0'
4
+ VERSION = '0.5.1'
5
5
  end
@@ -27,17 +27,17 @@ module Spellr
27
27
  end
28
28
 
29
29
  # significantly faster than default Enumerable#include?
30
- # requires terms to be sorted
30
+ # requires terms to have been sorted
31
31
  def include?(term)
32
- include_cache[term.normalize]
32
+ include_cache[term.spellr_normalize]
33
33
  end
34
34
 
35
- def include_cache
36
- @include_cache ||= Hash.new do |cache, term|
37
- cache[term] = to_a.bsearch do |value|
38
- term <=> value
39
- end
40
- end
35
+ def <<(term)
36
+ term = term.spellr_normalize
37
+ touch
38
+ include_cache[term] = true
39
+ insert_sorted(term)
40
+ @path.write(to_a.join) # we don't need to clear the cache
41
41
  end
42
42
 
43
43
  def to_a
@@ -46,7 +46,7 @@ module Spellr
46
46
 
47
47
  def clean(file = @path)
48
48
  require_relative 'tokenizer'
49
- write(Spellr::Tokenizer.new(file, skip_uri: false, skip_key: false).normalized_terms.join)
49
+ write(Spellr::Tokenizer.new(file, skip_key: false).normalized_terms.join)
50
50
  end
51
51
 
52
52
  def write(content)
@@ -61,37 +61,41 @@ module Spellr
61
61
  @path.read
62
62
  end
63
63
 
64
- def clear_cache
65
- @to_a = nil
66
- @include = nil
67
- end
68
-
69
64
  def exist?
70
65
  return @exist if defined?(@exist)
71
66
 
72
67
  @exist = @path.exist?
73
68
  end
74
69
 
75
- def add(term)
76
- touch
77
- term = term.normalize
78
- include_cache[term] = true
79
- to_a << term
80
- to_a.sort!
81
- write(@to_a.join)
82
- Spellr.config.clear_cache if to_a.length == 1
83
- end
84
-
85
70
  def touch
86
71
  return if exist?
87
72
 
88
73
  @path.dirname.mkpath
89
74
  @path.write('')
90
- remove_instance_variable(:@exist)
75
+ clear_cache
91
76
  end
92
77
 
93
78
  private
94
79
 
80
+ def insert_sorted(term)
81
+ insert_at = to_a.bsearch_index { |value| value >= term }
82
+ insert_at ? to_a.insert(insert_at, term) : to_a.push(term)
83
+ end
84
+
85
+ def include_cache
86
+ @include_cache ||= Hash.new do |cache, term|
87
+ cache[term] = to_a.bsearch do |value|
88
+ term <=> value
89
+ end
90
+ end
91
+ end
92
+
93
+ def clear_cache
94
+ @to_a = nil
95
+ @include = nil
96
+ remove_instance_variable(:@exist) if defined?(@exist)
97
+ end
98
+
95
99
  def raise_unless_exists?
96
100
  return if exist?
97
101
 
@@ -1,21 +1,29 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'set'
4
+ require_relative 'base_reporter'
4
5
 
5
6
  module Spellr
6
- class WordlistReporter
7
- attr_reader :words
8
-
9
- def initialize
10
- @words = Set.new
7
+ class WordlistReporter < Spellr::BaseReporter
8
+ def parallel?
9
+ true
11
10
  end
12
11
 
13
- def finish(_checked)
14
- puts words.sort.join
12
+ def finish
13
+ output.puts words.sort.join
15
14
  end
16
15
 
17
16
  def call(token)
18
- words << token.normalize
17
+ words << token.spellr_normalize
18
+ end
19
+
20
+ private
21
+
22
+ def words
23
+ @words ||= begin
24
+ output.counts[:words] = Set.new unless output.counts.key?(:words)
25
+ output.counts[:words]
26
+ end
19
27
  end
20
28
  end
21
29
  end
data/lib/spellr.rb CHANGED
@@ -5,6 +5,7 @@ require_relative 'spellr/config'
5
5
 
6
6
  module Spellr
7
7
  class Error < StandardError; end
8
+
8
9
  class Wordlist
9
10
  class NotFound < Spellr::Error; end
10
11
  end