wordlist 0.1.1 → 1.0.1

Files changed (152)
  1. checksums.yaml +7 -0
  2. data/.github/workflows/ruby.yml +28 -0
  3. data/.gitignore +6 -3
  4. data/ChangeLog.md +55 -1
  5. data/Gemfile +15 -0
  6. data/LICENSE.txt +1 -3
  7. data/README.md +301 -60
  8. data/Rakefile +7 -32
  9. data/benchmarks.rb +115 -0
  10. data/bin/wordlist +4 -7
  11. data/data/stop_words/ar.txt +104 -0
  12. data/data/stop_words/bg.txt +259 -0
  13. data/data/stop_words/bn.txt +363 -0
  14. data/data/stop_words/ca.txt +126 -0
  15. data/data/stop_words/cs.txt +138 -0
  16. data/data/stop_words/da.txt +101 -0
  17. data/data/stop_words/de.txt +129 -0
  18. data/data/stop_words/el.txt +79 -0
  19. data/data/stop_words/en.txt +175 -0
  20. data/data/stop_words/es.txt +178 -0
  21. data/data/stop_words/eu.txt +98 -0
  22. data/data/stop_words/fa.txt +332 -0
  23. data/data/stop_words/fi.txt +747 -0
  24. data/data/stop_words/fr.txt +116 -0
  25. data/data/stop_words/ga.txt +109 -0
  26. data/data/stop_words/gl.txt +160 -0
  27. data/data/stop_words/he.txt +499 -0
  28. data/data/stop_words/hi.txt +97 -0
  29. data/data/stop_words/hr.txt +179 -0
  30. data/data/stop_words/hu.txt +35 -0
  31. data/data/stop_words/hy.txt +45 -0
  32. data/data/stop_words/id.txt +357 -0
  33. data/data/stop_words/it.txt +134 -0
  34. data/data/stop_words/ja.txt +44 -0
  35. data/data/stop_words/ko.txt +677 -0
  36. data/data/stop_words/ku.txt +63 -0
  37. data/data/stop_words/lt.txt +507 -0
  38. data/data/stop_words/lv.txt +163 -0
  39. data/data/stop_words/mr.txt +99 -0
  40. data/data/stop_words/nl.txt +48 -0
  41. data/data/stop_words/no.txt +172 -0
  42. data/data/stop_words/pl.txt +138 -0
  43. data/data/stop_words/pt.txt +147 -0
  44. data/data/stop_words/ro.txt +281 -0
  45. data/data/stop_words/ru.txt +421 -0
  46. data/data/stop_words/sk.txt +173 -0
  47. data/data/stop_words/sv.txt +386 -0
  48. data/data/stop_words/th.txt +115 -0
  49. data/data/stop_words/tr.txt +114 -0
  50. data/data/stop_words/uk.txt +28 -0
  51. data/data/stop_words/ur.txt +513 -0
  52. data/data/stop_words/zh.txt +125 -0
  53. data/gemspec.yml +13 -12
  54. data/lib/wordlist/abstract_wordlist.rb +25 -0
  55. data/lib/wordlist/builder.rb +172 -138
  56. data/lib/wordlist/cli.rb +459 -0
  57. data/lib/wordlist/compression/reader.rb +72 -0
  58. data/lib/wordlist/compression/writer.rb +80 -0
  59. data/lib/wordlist/exceptions.rb +31 -0
  60. data/lib/wordlist/file.rb +177 -0
  61. data/lib/wordlist/format.rb +39 -0
  62. data/lib/wordlist/lexer/lang.rb +34 -0
  63. data/lib/wordlist/lexer/stop_words.rb +69 -0
  64. data/lib/wordlist/lexer.rb +221 -0
  65. data/lib/wordlist/list_methods.rb +462 -0
  66. data/lib/wordlist/modifiers/capitalize.rb +46 -0
  67. data/lib/wordlist/modifiers/downcase.rb +46 -0
  68. data/lib/wordlist/modifiers/gsub.rb +52 -0
  69. data/lib/wordlist/modifiers/modifier.rb +44 -0
  70. data/lib/wordlist/modifiers/mutate.rb +134 -0
  71. data/lib/wordlist/modifiers/mutate_case.rb +26 -0
  72. data/lib/wordlist/modifiers/sub.rb +98 -0
  73. data/lib/wordlist/modifiers/tr.rb +72 -0
  74. data/lib/wordlist/modifiers/upcase.rb +46 -0
  75. data/lib/wordlist/modifiers.rb +9 -0
  76. data/lib/wordlist/operators/binary_operator.rb +39 -0
  77. data/lib/wordlist/operators/concat.rb +48 -0
  78. data/lib/wordlist/operators/intersect.rb +56 -0
  79. data/lib/wordlist/operators/operator.rb +29 -0
  80. data/lib/wordlist/operators/power.rb +73 -0
  81. data/lib/wordlist/operators/product.rb +51 -0
  82. data/lib/wordlist/operators/subtract.rb +55 -0
  83. data/lib/wordlist/operators/unary_operator.rb +30 -0
  84. data/lib/wordlist/operators/union.rb +62 -0
  85. data/lib/wordlist/operators/unique.rb +53 -0
  86. data/lib/wordlist/operators.rb +8 -0
  87. data/lib/wordlist/unique_filter.rb +41 -61
  88. data/lib/wordlist/version.rb +4 -2
  89. data/lib/wordlist/words.rb +72 -0
  90. data/lib/wordlist.rb +104 -2
  91. data/spec/abstract_list_spec.rb +18 -0
  92. data/spec/builder_spec.rb +220 -76
  93. data/spec/cli_spec.rb +802 -0
  94. data/spec/compression/reader_spec.rb +137 -0
  95. data/spec/compression/writer_spec.rb +194 -0
  96. data/spec/file_spec.rb +269 -0
  97. data/spec/fixtures/wordlist.txt +15 -0
  98. data/spec/fixtures/wordlist.txt.bz2 +0 -0
  99. data/spec/fixtures/wordlist.txt.gz +0 -0
  100. data/spec/fixtures/wordlist.txt.xz +0 -0
  101. data/spec/fixtures/wordlist_with_ambiguous_format +3 -0
  102. data/spec/fixtures/wordlist_with_comments.txt +19 -0
  103. data/spec/fixtures/wordlist_with_empty_lines.txt +19 -0
  104. data/spec/format_spec.rb +50 -0
  105. data/spec/helpers/text.rb +3 -3
  106. data/spec/helpers/wordlist.rb +2 -2
  107. data/spec/lexer/lang_spec.rb +70 -0
  108. data/spec/lexer/stop_words_spec.rb +77 -0
  109. data/spec/lexer_spec.rb +718 -0
  110. data/spec/list_methods_spec.rb +181 -0
  111. data/spec/modifiers/capitalize_spec.rb +27 -0
  112. data/spec/modifiers/downcase_spec.rb +27 -0
  113. data/spec/modifiers/gsub_spec.rb +59 -0
  114. data/spec/modifiers/modifier_spec.rb +20 -0
  115. data/spec/modifiers/mutate_case_spec.rb +46 -0
  116. data/spec/modifiers/mutate_spec.rb +39 -0
  117. data/spec/modifiers/sub_spec.rb +98 -0
  118. data/spec/modifiers/tr_spec.rb +46 -0
  119. data/spec/modifiers/upcase_spec.rb +27 -0
  120. data/spec/operators/binary_operator_spec.rb +19 -0
  121. data/spec/operators/concat_spec.rb +26 -0
  122. data/spec/operators/intersect_spec.rb +37 -0
  123. data/spec/operators/operator_spec.rb +16 -0
  124. data/spec/operators/power_spec.rb +57 -0
  125. data/spec/operators/product_spec.rb +39 -0
  126. data/spec/operators/subtract_spec.rb +37 -0
  127. data/spec/operators/unary_operator_spec.rb +14 -0
  128. data/spec/operators/union_spec.rb +37 -0
  129. data/spec/operators/unique_spec.rb +25 -0
  130. data/spec/spec_helper.rb +2 -1
  131. data/spec/unique_filter_spec.rb +108 -18
  132. data/spec/wordlist_spec.rb +55 -3
  133. data/spec/words_spec.rb +41 -0
  134. data/wordlist.gemspec +1 -0
  135. metadata +164 -126
  136. data/lib/wordlist/builders/website.rb +0 -216
  137. data/lib/wordlist/builders.rb +0 -1
  138. data/lib/wordlist/flat_file.rb +0 -47
  139. data/lib/wordlist/list.rb +0 -162
  140. data/lib/wordlist/mutator.rb +0 -113
  141. data/lib/wordlist/parsers.rb +0 -74
  142. data/lib/wordlist/runners/list.rb +0 -116
  143. data/lib/wordlist/runners/runner.rb +0 -67
  144. data/lib/wordlist/runners.rb +0 -2
  145. data/scripts/benchmark +0 -59
  146. data/scripts/text/comedy_of_errors.txt +0 -4011
  147. data/spec/classes/parser_class.rb +0 -7
  148. data/spec/classes/test_list.rb +0 -9
  149. data/spec/flat_file_spec.rb +0 -25
  150. data/spec/list_spec.rb +0 -58
  151. data/spec/mutator_spec.rb +0 -43
  152. data/spec/parsers_spec.rb +0 -118
data/lib/wordlist/file.rb
@@ -0,0 +1,177 @@
+ # frozen_string_literal: true
+ require 'wordlist/abstract_wordlist'
+ require 'wordlist/exceptions'
+ require 'wordlist/format'
+ require 'wordlist/compression/reader'
+
+ module Wordlist
+   #
+   # Represents a `.txt` file wordlist.
+   #
+   #     wordlist = Wordlist::File.new("rockyou.txt")
+   #     wordlist.each do |word|
+   #       puts word
+   #     end
+   #
+   # @api public
+   #
+   # @since 1.0.0
+   #
+   class File < AbstractWordlist
+
+     # The path to the `.txt` file
+     attr_reader :path
+
+     # The format of the wordlist file.
+     #
+     # @return [:txt, :gzip, :bzip2, :xz]
+     attr_reader :format
+
+     #
+     # Opens a wordlist file.
+     #
+     # @param [String] path
+     #   The path to the wordlist file to read from.
+     #
+     # @param [:txt, :gzip, :bzip2, :xz, nil] format
+     #   The format of the wordlist. If not given, the format will be inferred
+     #   from the file extension.
+     #
+     # @raise [WordlistNotFound]
+     #   The given path does not exist.
+     #
+     # @raise [UnknownFormat]
+     #   The format could not be inferred from the file extension.
+     #
+     # @api public
+     #
+     def initialize(path, format: Format.infer(path))
+       @path = ::File.expand_path(path)
+       @format = format
+
+       unless ::File.file?(@path)
+         raise(WordlistNotFound,"wordlist file does not exist: #{@path.inspect}")
+       end
+
+       unless Format::FORMATS.include?(@format)
+         raise(UnknownFormat,"unknown format given: #{@format.inspect}")
+       end
+     end
+
+     #
+     # Opens a wordlist file.
+     #
+     # @param [String] path
+     #   The path to the wordlist file to read from.
+     #
+     # @yield [wordlist]
+     #   If a block is given, it will be passed the opened wordlist.
+     #
+     # @yieldparam [File] wordlist
+     #   The newly opened wordlist.
+     #
+     # @return [File]
+     #   The newly opened wordlist.
+     #
+     # @see #initialize
+     #
+     # @api public
+     #
+     def self.open(path,**kwargs)
+       wordlist = new(path,**kwargs)
+       yield wordlist if block_given?
+       return wordlist
+     end
+
+     #
+     # Opens and reads the wordlist file.
+     #
+     # @param [String] path
+     #   The path to the wordlist file to read from.
+     #
+     # @yield [word]
+     #   The given block will be passed every word from the wordlist.
+     #
+     # @yieldparam [String] word
+     #   A word from the wordlist.
+     #
+     # @return [Enumerator]
+     #   If no block is given, an Enumerator object will be returned.
+     #
+     def self.read(path,**kwargs,&block)
+       open(path,**kwargs).each(&block)
+     end
+
+     #
+     # Enumerates through each line in the `.txt` file wordlist.
+     #
+     # @yield [line]
+     #   The given block will be passed each line from the `.txt` file.
+     #
+     # @yieldparam [String] line
+     #   A newline terminated line from the file.
+     #
+     # @return [Enumerator]
+     #   If no block is given, an Enumerator object will be returned.
+     #
+     # @api semipublic
+     #
+     def each_line(&block)
+       return enum_for(__method__) unless block
+
+       open { |io| io.each_line(&block) }
+     end
+
+     #
+     # Enumerates through every word in the `.txt` file.
+     #
+     # @yield [word]
+     #   The given block will be passed every word from the wordlist.
+     #
+     # @yieldparam [String] word
+     #   A word from the wordlist.
+     #
+     # @return [Enumerator]
+     #   If no block is given, an Enumerator object will be returned.
+     #
+     # @note
+     #   Empty lines and lines beginning with `#` characters will be ignored.
+     #
+     # @example
+     #   wordlist.each do |word|
+     #     puts word
+     #   end
+     #
+     # @api public
+     #
+     def each
+       return enum_for(__method__) unless block_given?
+
+       each_line do |line|
+         line.chomp!
+
+         unless (line.empty? || line.start_with?('#'))
+           yield line
+         end
+       end
+     end
+
+     private
+
+     #
+     # Opens the wordlist for reading.
+     #
+     # @yield [io]
+     #
+     # @yieldparam [IO] io
+     #
+     def open(&block)
+       if @format == :txt
+         ::File.open(@path,&block)
+       else
+         Compression::Reader.open(@path, format: @format, &block)
+       end
+     end
+
+   end
+ end
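
For orientation, a minimal usage sketch of the Wordlist::File API added above; the wordlist paths are placeholders, not files shipped with the gem:

    require 'wordlist/file'

    # Enumerate every word in a plain-text wordlist (blank lines and
    # `#` comment lines are skipped); the path is a placeholder.
    Wordlist::File.read("passwords.txt") do |word|
      puts word
    end

    # Compressed wordlists are detected from the file extension and
    # streamed through Compression::Reader.
    wordlist = Wordlist::File.new("passwords.txt.gz")
    wordlist.each { |word| puts word }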
data/lib/wordlist/format.rb
@@ -0,0 +1,39 @@
+ # frozen_string_literal: true
+ require 'wordlist/exceptions'
+
+ module Wordlist
+   #
+   # Handles wordlist format detection.
+   #
+   # @since 1.0.0
+   #
+   module Format
+     # Mapping of file extensions to formats
+     FILE_FORMATS = {
+       '.txt' => :txt,
+       '.gz' => :gzip,
+       '.bz2' => :bzip2,
+       '.xz' => :xz
+     }
+
+     # Valid formats.
+     FORMATS = FILE_FORMATS.values
+
+     #
+     # Infers the format from the given file name.
+     #
+     # @param [String] path
+     #   The path to the file.
+     #
+     # @return [:txt, :gzip, :bzip2, :xz]
+     #
+     # @raise [UnknownFormat]
+     #   The format could not be inferred from the file path.
+     #
+     def self.infer(path)
+       FILE_FORMATS.fetch(::File.extname(path)) do
+         raise(UnknownFormat,"could not infer the format of file: #{path.inspect}")
+       end
+     end
+   end
+ end
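
A quick sketch of how Format.infer resolves extensions via the FILE_FORMATS table above (the file names are illustrative):

    require 'wordlist/format'

    Wordlist::Format.infer("rockyou.txt")      # => :txt
    Wordlist::Format.infer("rockyou.txt.gz")   # => :gzip
    Wordlist::Format.infer("rockyou.txt.bz2")  # => :bzip2
    Wordlist::Format.infer("rockyou.txt.xz")   # => :xz
    Wordlist::Format.infer("rockyou")          # raises Wordlist::UnknownFormat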
data/lib/wordlist/lexer/lang.rb
@@ -0,0 +1,34 @@
+ # frozen_string_literal: true
+
+ module Wordlist
+   class Lexer
+     #
+     # Detects the system's default language.
+     #
+     # @api semipublic
+     #
+     # @since 1.0.0
+     #
+     module Lang
+       #
+       # The default language.
+       #
+       # @return [Symbol]
+       #
+       def self.default
+         if (lang = ENV['LANG'])
+           lang, encoding = lang.split('.',2)
+           lang, country = lang.split('_',2)
+
+           unless lang == 'C'
+             lang.to_sym
+           else
+             :en
+           end
+         else
+           :en
+         end
+       end
+     end
+   end
+ end
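
An illustrative sketch of the locale handling above; the LANG values are examples only:

    require 'wordlist/lexer/lang'

    ENV['LANG'] = 'de_DE.UTF-8'      # example locale string
    Wordlist::Lexer::Lang.default    # => :de  (encoding and country parts are stripped)

    ENV['LANG'] = 'C'
    Wordlist::Lexer::Lang.default    # => :en  (the "C" locale falls back to :en, as does an unset LANG)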
data/lib/wordlist/lexer/stop_words.rb
@@ -0,0 +1,69 @@
+ # frozen_string_literal: true
+ require 'wordlist/exceptions'
+
+ module Wordlist
+   class Lexer
+     #
+     # Stop words for various languages.
+     #
+     # @api semipublic
+     #
+     # @since 1.0.0
+     #
+     module StopWords
+       # The directory containing the stop words `.txt` files.
+       DIRECTORY = ::File.expand_path(::File.join(__dir__,'..','..','..','data','stop_words'))
+
+       #
+       # The path to the stop words `.txt` file.
+       #
+       # @param [Symbol] lang
+       #   The language to load.
+       #
+       # @return [String]
+       #
+       def self.path_for(lang)
+         ::File.join(DIRECTORY,"#{lang}.txt")
+       end
+
+       #
+       # Reads the stop words.
+       #
+       # @param [Symbol] lang
+       #   The language to load.
+       #
+       # @return [Array<String>]
+       #
+       # @raise [UnsupportedLanguage]
+       #
+       def self.read(lang)
+         path = path_for(lang)
+
+         unless ::File.file?(path)
+           raise(UnsupportedLanguage,"unsupported language: #{lang}")
+         end
+
+         lines = ::File.readlines(path)
+         lines.each(&:chomp!)
+         lines
+       end
+
+       @stop_words = {}
+       @mutex = Mutex.new
+
+       #
+       # Lazy loads the stop words for the given language.
+       #
+       # @param [Symbol] lang
+       #   The language to load.
+       #
+       # @return [Array<String>]
+       #
+       def self.[](lang)
+         @mutex.synchronize do
+           @stop_words[lang] ||= read(lang)
+         end
+       end
+     end
+   end
+ end
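
A brief sketch of the lazy, memoized lookup above; the unsupported language code is hypothetical:

    require 'wordlist/lexer/stop_words'

    # First access reads data/stop_words/en.txt; later lookups reuse the
    # memoized Array (the Mutex guards concurrent loads).
    english = Wordlist::Lexer::StopWords[:en]   # => Array of English stop words

    # Languages without a data file raise Wordlist::UnsupportedLanguage.
    Wordlist::Lexer::StopWords[:zz]             # hypothetical unsupported code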
data/lib/wordlist/lexer.rb
@@ -0,0 +1,221 @@
+ # frozen_string_literal: true
+ require 'wordlist/lexer/lang'
+ require 'wordlist/lexer/stop_words'
+
+ require 'strscan'
+
+ module Wordlist
+   #
+   # Parses arbitrary text and scans each word from it.
+   #
+   # @api semipublic
+   #
+   # @since 1.0.0
+   #
+   class Lexer
+
+     # Regexp to match acronyms.
+     ACRONYM = /[[:alpha:]](?:\.[[:alpha:]])+\./
+
+     # Default set of punctuation characters allowed within words
+     SPECIAL_CHARS = %w[_ - ']
+
+     # @return [Symbol]
+     attr_reader :lang
+
+     # @return [Array<String>]
+     attr_reader :stop_words
+
+     # @return [Array<String, Regexp>]
+     attr_reader :ignore_words
+
+     # @return [Array<String>]
+     attr_reader :special_chars
+
+     #
+     # Initializes the lexer.
+     #
+     # @param [Symbol] lang
+     #   The language to use. Defaults to {Lang.default}.
+     #
+     # @param [Array<String>] stop_words
+     #   The explicit stop-words to ignore. If not given, default stop words
+     #   will be loaded based on `lang` or {Lang.default}.
+     #
+     # @param [Array<String, Regexp>] ignore_words
+     #   Optional list of words to ignore. Can contain Strings or Regexps.
+     #
+     # @param [Boolean] digits
+     #   Controls whether parsed words may contain digits or not.
+     #
+     # @param [Array<String>] special_chars
+     #   The additional special characters allowed within words.
+     #
+     # @param [Boolean] numbers
+     #   Controls whether whole numbers will be parsed as words.
+     #
+     # @param [Boolean] acronyms
+     #   Controls whether acronyms will be parsed as words.
+     #
+     # @param [Boolean] normalize_case
+     #   Controls whether to convert all words to lowercase.
+     #
+     # @param [Boolean] normalize_apostrophes
+     #   Controls whether apostrophes will be removed from the end of words.
+     #
+     # @param [Boolean] normalize_acronyms
+     #   Controls whether acronyms will have `.` characters removed.
+     #
+     # @raise [ArgumentError]
+     #   The `ignore_words` keyword contained a value other than a String or
+     #   Regexp.
+     #
+     def initialize(lang: Lang.default,
+                    stop_words: StopWords[lang],
+                    ignore_words: [],
+                    digits: true,
+                    special_chars: SPECIAL_CHARS,
+                    numbers: false,
+                    acronyms: true,
+                    normalize_case: false,
+                    normalize_apostrophes: false,
+                    normalize_acronyms: false)
+       @lang = lang
+       @stop_words = stop_words
+       @ignore_words = ignore_words
+       @special_chars = special_chars
+
+       @digits = digits
+       @numbers = numbers
+       @acronyms = acronyms
+
+       @normalize_acronyms = normalize_acronyms
+       @normalize_apostrophes = normalize_apostrophes
+       @normalize_case = normalize_case
+
+       escaped_chars = Regexp.escape(@special_chars.join)
+
+       @word = if @digits
+                 # allows numeric characters
+                 /\p{L}(?:[\p{L}\p{Nd}#{escaped_chars}]*[\p{L}\p{Nd}])?/
+               else
+                 # only allows alpha characters
+                 /\p{L}(?:[\p{L}#{escaped_chars}]*\p{L})?/
+               end
+
+       skip_words = Regexp.union(
+         (@stop_words + @ignore_words).map { |pattern|
+           case pattern
+           when Regexp then pattern
+           when String then /#{Regexp.escape(pattern)}/i
+           else
+             raise(ArgumentError,"ignore_words: must contain only Strings or Regexps")
+           end
+         }
+       )
+
+       if @numbers
+         # allows lexing whole numbers
+         @skip_word = /(?:#{skip_words}[[:punct:]]*(?:[[:space:]]+|$))+/i
+         @word = /#{@word}|\d+/
+         @not_a_word = /[^\p{L}\d]+/
+       else
+         # skips whole numbers
+         @skip_word = /(?:(?:#{skip_words}|\d+)[[:punct:]]*(?:[[:space:]]+|$))+/i
+         @not_a_word = /[^\p{L}]+/
+       end
+     end
+
+     #
+     # Determines whether parsed words may contain digits or not.
+     #
+     # @return [Boolean]
+     #
+     def digits?
+       @digits
+     end
+
+     #
+     # Determines whether numbers will be parsed or ignored.
+     #
+     # @return [Boolean]
+     #
+     def numbers?
+       @numbers
+     end
+
+     #
+     # Determines whether acronyms will be parsed or ignored.
+     #
+     # @return [Boolean]
+     #
+     def acronyms?
+       @acronyms
+     end
+
+     #
+     # Determines whether `.` characters will be removed from acronyms.
+     #
+     # @return [Boolean]
+     #
+     def normalize_acronyms?
+       @normalize_acronyms
+     end
+
+     #
+     # Determines whether apostrophes will be stripped from words.
+     #
+     # @return [Boolean]
+     #
+     def normalize_apostrophes?
+       @normalize_apostrophes
+     end
+
+     #
+     # Determines whether all words will be converted to lowercase.
+     #
+     # @return [Boolean]
+     #
+     def normalize_case?
+       @normalize_case
+     end
+
+     #
+     # Enumerates over each word in the text.
+     #
+     # @yield [word]
+     #   The given block will be passed each word from the text.
+     #
+     # @yieldparam [String] word
+     #   A parsed word from the text.
+     #
+     # @return [Array<String>]
+     #   If no block is given, an Array of the parsed words will be returned
+     #   instead.
+     #
+     def parse(text,&block)
+       return enum_for(__method__,text).to_a unless block_given?
+
+       scanner = StringScanner.new(text)
+
+       until scanner.eos?
+         scanner.skip(@not_a_word)
+         scanner.skip(@skip_word)
+
+         if (acronym = scanner.scan(ACRONYM))
+           if @acronyms
+             acronym.tr!('.','') if @normalize_acronyms
+
+             yield acronym
+           end
+         elsif (word = scanner.scan(@word))
+           word.downcase! if @normalize_case
+           word.chomp!("'s") if (@normalize_apostrophes && word.end_with?("'s"))
+
+           yield word
+         end
+       end
+     end
+
+   end
+ end
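
Finally, an illustrative sketch of Lexer#parse; the sample text and the explicit stop_words list are made up for the example (by default the stop words for the detected language would be loaded instead):

    require 'wordlist/lexer'

    lexer = Wordlist::Lexer.new(stop_words: %w[the and], normalize_case: true)

    lexer.parse("The U.S.A. and the dog's bone")
    # => ["U.S.A.", "dog's", "bone"]
    # (acronyms are kept intact; normalize_case only downcases regular words)

    lexer.parse("Visit the 3 sites") { |word| puts word }
    # prints "visit" and "sites"; whole numbers are skipped since numbers: defaults to false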