wordlist 0.1.1 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (152) hide show
  1. checksums.yaml +7 -0
  2. data/.github/workflows/ruby.yml +28 -0
  3. data/.gitignore +6 -3
  4. data/ChangeLog.md +55 -1
  5. data/Gemfile +15 -0
  6. data/LICENSE.txt +1 -3
  7. data/README.md +301 -60
  8. data/Rakefile +7 -32
  9. data/benchmarks.rb +115 -0
  10. data/bin/wordlist +4 -7
  11. data/data/stop_words/ar.txt +104 -0
  12. data/data/stop_words/bg.txt +259 -0
  13. data/data/stop_words/bn.txt +363 -0
  14. data/data/stop_words/ca.txt +126 -0
  15. data/data/stop_words/cs.txt +138 -0
  16. data/data/stop_words/da.txt +101 -0
  17. data/data/stop_words/de.txt +129 -0
  18. data/data/stop_words/el.txt +79 -0
  19. data/data/stop_words/en.txt +175 -0
  20. data/data/stop_words/es.txt +178 -0
  21. data/data/stop_words/eu.txt +98 -0
  22. data/data/stop_words/fa.txt +332 -0
  23. data/data/stop_words/fi.txt +747 -0
  24. data/data/stop_words/fr.txt +116 -0
  25. data/data/stop_words/ga.txt +109 -0
  26. data/data/stop_words/gl.txt +160 -0
  27. data/data/stop_words/he.txt +499 -0
  28. data/data/stop_words/hi.txt +97 -0
  29. data/data/stop_words/hr.txt +179 -0
  30. data/data/stop_words/hu.txt +35 -0
  31. data/data/stop_words/hy.txt +45 -0
  32. data/data/stop_words/id.txt +357 -0
  33. data/data/stop_words/it.txt +134 -0
  34. data/data/stop_words/ja.txt +44 -0
  35. data/data/stop_words/ko.txt +677 -0
  36. data/data/stop_words/ku.txt +63 -0
  37. data/data/stop_words/lt.txt +507 -0
  38. data/data/stop_words/lv.txt +163 -0
  39. data/data/stop_words/mr.txt +99 -0
  40. data/data/stop_words/nl.txt +48 -0
  41. data/data/stop_words/no.txt +172 -0
  42. data/data/stop_words/pl.txt +138 -0
  43. data/data/stop_words/pt.txt +147 -0
  44. data/data/stop_words/ro.txt +281 -0
  45. data/data/stop_words/ru.txt +421 -0
  46. data/data/stop_words/sk.txt +173 -0
  47. data/data/stop_words/sv.txt +386 -0
  48. data/data/stop_words/th.txt +115 -0
  49. data/data/stop_words/tr.txt +114 -0
  50. data/data/stop_words/uk.txt +28 -0
  51. data/data/stop_words/ur.txt +513 -0
  52. data/data/stop_words/zh.txt +125 -0
  53. data/gemspec.yml +13 -12
  54. data/lib/wordlist/abstract_wordlist.rb +25 -0
  55. data/lib/wordlist/builder.rb +172 -138
  56. data/lib/wordlist/cli.rb +459 -0
  57. data/lib/wordlist/compression/reader.rb +72 -0
  58. data/lib/wordlist/compression/writer.rb +80 -0
  59. data/lib/wordlist/exceptions.rb +31 -0
  60. data/lib/wordlist/file.rb +177 -0
  61. data/lib/wordlist/format.rb +39 -0
  62. data/lib/wordlist/lexer/lang.rb +34 -0
  63. data/lib/wordlist/lexer/stop_words.rb +69 -0
  64. data/lib/wordlist/lexer.rb +221 -0
  65. data/lib/wordlist/list_methods.rb +462 -0
  66. data/lib/wordlist/modifiers/capitalize.rb +46 -0
  67. data/lib/wordlist/modifiers/downcase.rb +46 -0
  68. data/lib/wordlist/modifiers/gsub.rb +52 -0
  69. data/lib/wordlist/modifiers/modifier.rb +44 -0
  70. data/lib/wordlist/modifiers/mutate.rb +134 -0
  71. data/lib/wordlist/modifiers/mutate_case.rb +26 -0
  72. data/lib/wordlist/modifiers/sub.rb +98 -0
  73. data/lib/wordlist/modifiers/tr.rb +72 -0
  74. data/lib/wordlist/modifiers/upcase.rb +46 -0
  75. data/lib/wordlist/modifiers.rb +9 -0
  76. data/lib/wordlist/operators/binary_operator.rb +39 -0
  77. data/lib/wordlist/operators/concat.rb +48 -0
  78. data/lib/wordlist/operators/intersect.rb +56 -0
  79. data/lib/wordlist/operators/operator.rb +29 -0
  80. data/lib/wordlist/operators/power.rb +73 -0
  81. data/lib/wordlist/operators/product.rb +51 -0
  82. data/lib/wordlist/operators/subtract.rb +55 -0
  83. data/lib/wordlist/operators/unary_operator.rb +30 -0
  84. data/lib/wordlist/operators/union.rb +62 -0
  85. data/lib/wordlist/operators/unique.rb +53 -0
  86. data/lib/wordlist/operators.rb +8 -0
  87. data/lib/wordlist/unique_filter.rb +41 -61
  88. data/lib/wordlist/version.rb +4 -2
  89. data/lib/wordlist/words.rb +72 -0
  90. data/lib/wordlist.rb +104 -2
  91. data/spec/abstract_list_spec.rb +18 -0
  92. data/spec/builder_spec.rb +220 -76
  93. data/spec/cli_spec.rb +802 -0
  94. data/spec/compression/reader_spec.rb +137 -0
  95. data/spec/compression/writer_spec.rb +194 -0
  96. data/spec/file_spec.rb +269 -0
  97. data/spec/fixtures/wordlist.txt +15 -0
  98. data/spec/fixtures/wordlist.txt.bz2 +0 -0
  99. data/spec/fixtures/wordlist.txt.gz +0 -0
  100. data/spec/fixtures/wordlist.txt.xz +0 -0
  101. data/spec/fixtures/wordlist_with_ambiguous_format +3 -0
  102. data/spec/fixtures/wordlist_with_comments.txt +19 -0
  103. data/spec/fixtures/wordlist_with_empty_lines.txt +19 -0
  104. data/spec/format_spec.rb +50 -0
  105. data/spec/helpers/text.rb +3 -3
  106. data/spec/helpers/wordlist.rb +2 -2
  107. data/spec/lexer/lang_spec.rb +70 -0
  108. data/spec/lexer/stop_words_spec.rb +77 -0
  109. data/spec/lexer_spec.rb +718 -0
  110. data/spec/list_methods_spec.rb +181 -0
  111. data/spec/modifiers/capitalize_spec.rb +27 -0
  112. data/spec/modifiers/downcase_spec.rb +27 -0
  113. data/spec/modifiers/gsub_spec.rb +59 -0
  114. data/spec/modifiers/modifier_spec.rb +20 -0
  115. data/spec/modifiers/mutate_case_spec.rb +46 -0
  116. data/spec/modifiers/mutate_spec.rb +39 -0
  117. data/spec/modifiers/sub_spec.rb +98 -0
  118. data/spec/modifiers/tr_spec.rb +46 -0
  119. data/spec/modifiers/upcase_spec.rb +27 -0
  120. data/spec/operators/binary_operator_spec.rb +19 -0
  121. data/spec/operators/concat_spec.rb +26 -0
  122. data/spec/operators/intersect_spec.rb +37 -0
  123. data/spec/operators/operator_spec.rb +16 -0
  124. data/spec/operators/power_spec.rb +57 -0
  125. data/spec/operators/product_spec.rb +39 -0
  126. data/spec/operators/subtract_spec.rb +37 -0
  127. data/spec/operators/unary_operator_spec.rb +14 -0
  128. data/spec/operators/union_spec.rb +37 -0
  129. data/spec/operators/unique_spec.rb +25 -0
  130. data/spec/spec_helper.rb +2 -1
  131. data/spec/unique_filter_spec.rb +108 -18
  132. data/spec/wordlist_spec.rb +55 -3
  133. data/spec/words_spec.rb +41 -0
  134. data/wordlist.gemspec +1 -0
  135. metadata +164 -126
  136. data/lib/wordlist/builders/website.rb +0 -216
  137. data/lib/wordlist/builders.rb +0 -1
  138. data/lib/wordlist/flat_file.rb +0 -47
  139. data/lib/wordlist/list.rb +0 -162
  140. data/lib/wordlist/mutator.rb +0 -113
  141. data/lib/wordlist/parsers.rb +0 -74
  142. data/lib/wordlist/runners/list.rb +0 -116
  143. data/lib/wordlist/runners/runner.rb +0 -67
  144. data/lib/wordlist/runners.rb +0 -2
  145. data/scripts/benchmark +0 -59
  146. data/scripts/text/comedy_of_errors.txt +0 -4011
  147. data/spec/classes/parser_class.rb +0 -7
  148. data/spec/classes/test_list.rb +0 -9
  149. data/spec/flat_file_spec.rb +0 -25
  150. data/spec/list_spec.rb +0 -58
  151. data/spec/mutator_spec.rb +0 -43
  152. data/spec/parsers_spec.rb +0 -118
@@ -0,0 +1,718 @@
1
+ require 'spec_helper'
2
+ require 'wordlist/lexer'
3
+
4
+ describe Wordlist::Lexer do
5
+ let(:text) { "foo bar baz qux" }
6
+
7
+ describe "#initialize" do
8
+ let(:default_lang) { Wordlist::Lexer::Lang.default }
9
+
10
+ it "must default #lang to Lang.default" do
11
+ expect(subject.lang).to eq(default_lang)
12
+ end
13
+
14
+ it "must set #stop_words to the stop words for the default language" do
15
+ expect(subject.stop_words).to eq(Wordlist::Lexer::StopWords[default_lang])
16
+ end
17
+
18
+ it "must default #ignore_words to []" do
19
+ expect(subject.ignore_words).to eq([])
20
+ end
21
+
22
+ it "must default #digits? to true" do
23
+ expect(subject.digits?).to be(true)
24
+ end
25
+
26
+ it "must default #special_chars to SPECIAL_CHARS" do
27
+ expect(subject.special_chars).to eq(described_class::SPECIAL_CHARS)
28
+ end
29
+
30
+ it "must default #numbers? to false" do
31
+ expect(subject.numbers?).to be(false)
32
+ end
33
+
34
+ it "must default #acronyms? to true" do
35
+ expect(subject.acronyms?).to be(true)
36
+ end
37
+
38
+ it "must default #normalize_case? to false" do
39
+ expect(subject.normalize_case?).to be(false)
40
+ end
41
+
42
+ it "must default #normalize_apostrophes? to false" do
43
+ expect(subject.normalize_apostrophes?).to be(false)
44
+ end
45
+
46
+ it "must default #normalize_acronyms? to false" do
47
+ expect(subject.normalize_acronyms?).to be(false)
48
+ end
49
+
50
+ context "when the lang: keyword is given" do
51
+ let(:lang) { :es }
52
+
53
+ subject { described_class.new(lang: lang) }
54
+
55
+ it "must set #lang" do
56
+ expect(subject.lang).to eq(lang)
57
+ end
58
+
59
+ it "must set #stop_words to the stop words for that language" do
60
+ expect(subject.stop_words).to eq(Wordlist::Lexer::StopWords[lang])
61
+ end
62
+ end
63
+
64
+ context "when ignore_words: keyword argument is given" do
65
+ let(:ignore_words) { %w[foo bar] }
66
+
67
+ subject { described_class.new(ignore_words: ignore_words) }
68
+
69
+ it "must set #ignore_words" do
70
+ expect(subject.ignore_words).to eq(ignore_words)
71
+ end
72
+
73
+ context "and it contains an object other than a String or Regexp" do
74
+ it do
75
+ expect {
76
+ described_class.new(ignore_words: [Object.new])
77
+ }.to raise_error(ArgumentError,"ignore_words: must contain only Strings or Regexps")
78
+ end
79
+ end
80
+ end
81
+
82
+ context "when digits: false is given" do
83
+ subject { described_class.new(digits: false) }
84
+
85
+ it "must set #digits? to false" do
86
+ expect(subject.digits?).to be(false)
87
+ end
88
+ end
89
+
90
+ context "when special_chars: keyword is given" do
91
+ let(:special_chars) { %w[_ -] }
92
+
93
+ subject { described_class.new(special_chars: special_chars) }
94
+
95
+ it "must set #special_chars" do
96
+ expect(subject.special_chars).to eq(special_chars)
97
+ end
98
+ end
99
+
100
+ context "when numbers: true is given" do
101
+ subject { described_class.new(numbers: true) }
102
+
103
+ it "must set #numbers? to true" do
104
+ expect(subject.numbers?).to be(true)
105
+ end
106
+ end
107
+
108
+ context "when acronyms: true is given" do
109
+ subject { described_class.new(acronyms: true) }
110
+
111
+ it "must set #acronyms? to true" do
112
+ expect(subject.acronyms?).to be(true)
113
+ end
114
+ end
115
+
116
+ context "when normalize_case: true is given" do
117
+ subject { described_class.new(normalize_case: true) }
118
+
119
+ it "must set #normalize_case? to true" do
120
+ expect(subject.normalize_case?).to be(true)
121
+ end
122
+ end
123
+
124
+ context "when normalize_apostrophes: true is given" do
125
+ subject { described_class.new(normalize_apostrophes: true) }
126
+
127
+ it "must set #normalize_apostrophes? to true" do
128
+ expect(subject.normalize_apostrophes?).to be(true)
129
+ end
130
+ end
131
+
132
+ context "when normalize_acronyms: true is given" do
133
+ subject { described_class.new(normalize_acronyms: true) }
134
+
135
+ it "must set #normalize_acronyms? to true" do
136
+ expect(subject.normalize_acronyms?).to be(true)
137
+ end
138
+ end
139
+ end
140
+
141
+ describe "#parse" do
142
+ let(:expected_words) { %w[foo bar baz qux] }
143
+ let(:text) { expected_words.join(' ') }
144
+
145
+ context "when a block is given" do
146
+ it "must yield each scanned word from the text" do
147
+ expect { |b|
148
+ subject.parse(text,&b)
149
+ }.to yield_successive_args(*expected_words)
150
+ end
151
+
152
+ context "and when the text is empty" do
153
+ let(:text) { '' }
154
+
155
+ it "must not yield any words" do
156
+ expect { |b|
157
+ subject.parse(text,&b)
158
+ }.to_not yield_control
159
+ end
160
+ end
161
+
162
+ context "and when the text is only whitespace" do
163
+ let(:text) { " \t\r\n" }
164
+
165
+ it "must not yield any words" do
166
+ expect { |b|
167
+ subject.parse(text,&b)
168
+ }.to_not yield_control
169
+ end
170
+ end
171
+
172
+ context "and when the text contains newlines" do
173
+ let(:text) { expected_words.join("\n") }
174
+
175
+ it "must parse each line" do
176
+ expect { |b|
177
+ subject.parse(text,&b)
178
+ }.to yield_successive_args(*expected_words)
179
+ end
180
+ end
181
+
182
+ context "and when the text is only unicode whitespace" do
183
+ let(:text) { "\u{a0}" }
184
+
185
+ it "must not yield any words" do
186
+ expect { |b|
187
+ subject.parse(text,&b)
188
+ }.to_not yield_control
189
+ end
190
+ end
191
+
192
+ context "and when the text is only unicode symbols" do
193
+ let(:text) { "©" }
194
+
195
+ it "must not yield any words" do
196
+ expect { |b|
197
+ subject.parse(text,&b)
198
+ }.to_not yield_control
199
+ end
200
+ end
201
+
202
+ context "and when the words are separated by unicode whitespace" do
203
+ let(:text) { expected_words.join("\u{a0}") }
204
+
205
+ it "must skip the unicode whitespace and yield the words" do
206
+ expect { |b|
207
+ subject.parse(text,&b)
208
+ }.to yield_successive_args(*expected_words)
209
+ end
210
+ end
211
+
212
+ context "and when the text contains unicode symbols" do
213
+ let(:text) { "© #{super()}" }
214
+
215
+ it "must skip any unicode symbols and yield the words" do
216
+ expect { |b|
217
+ subject.parse(text,&b)
218
+ }.to yield_successive_args(*expected_words)
219
+ end
220
+ end
221
+
222
+ context "when the words contain unicode letters" do
223
+ let(:expected_words) { %w[Hervé Schäfer Ålesund] }
224
+
225
+ it "must parse words containing unicode letters" do
226
+ expect { |b|
227
+ subject.parse(text,&b)
228
+ }.to yield_successive_args(*expected_words)
229
+ end
230
+ end
231
+
232
+ context "when the words contain uppercase letters" do
233
+ let(:expected_words) { %w[foo Bar baZ QUX] }
234
+
235
+ it "must parse words containing uppercase letters" do
236
+ expect { |b|
237
+ subject.parse(text,&b)
238
+ }.to yield_successive_args(*expected_words)
239
+ end
240
+
241
+ context "but when initialized with normalize_case: true" do
242
+ let(:expected_words) { %w[foo bar baz qux] }
243
+ let(:text) { "foo Bar baZ QUX" }
244
+
245
+ subject { described_class.new(normalize_case: true) }
246
+
247
+ it "must convert all words to lowercase" do
248
+ expect { |b|
249
+ subject.parse(text,&b)
250
+ }.to yield_successive_args(*expected_words)
251
+ end
252
+ end
253
+ end
254
+
255
+ context "and when the text contains single letters" do
256
+ let(:letters) { %w[x y z] }
257
+ let(:expected_words) { super() + letters }
258
+
259
+ it "must parse single letter words" do
260
+ expect { |b|
261
+ subject.parse(text,&b)
262
+ }.to yield_successive_args(*expected_words)
263
+ end
264
+
265
+ context "when the text also contains single letter stop words" do
266
+ let(:letters) { %w[a b c i j k] }
267
+ let(:stop_words) { %w[a i] }
268
+ let(:expected_words) { super() - stop_words }
269
+ let(:text) { "#{super()} #{stop_words.join(' ')}" }
270
+
271
+ it "must parse single letter words" do
272
+ expect { |b|
273
+ subject.parse(text,&b)
274
+ }.to yield_successive_args(*expected_words)
275
+ end
276
+ end
277
+ end
278
+
279
+ context "and when the text contains punctuation" do
280
+ let(:text) { expected_words.join(", ") + '.' }
281
+
282
+ it "must ignore all punctuation" do
283
+ expect { |b|
284
+ subject.parse(text,&b)
285
+ }.to yield_successive_args(*expected_words)
286
+ end
287
+
288
+ context "and the words start with a '\\'' character" do
289
+ let(:expected_words) { %w[foo bar baz] }
290
+ let(:text) { "foo 'bar baz" }
291
+
292
+ it "must skip the leading '\\'' character" do
293
+ expect { |b|
294
+ subject.parse(text,&b)
295
+ }.to yield_successive_args(*expected_words)
296
+ end
297
+ end
298
+
299
+ context "and the words contain '\\'' characters" do
300
+ let(:expected_words) { super() + %w[O'Brian] }
301
+
302
+ it "must parse the words containing a '\\''" do
303
+ expect { |b|
304
+ subject.parse(text,&b)
305
+ }.to yield_successive_args(*expected_words)
306
+ end
307
+
308
+ context "and when initialized with normalize_apostrophes: true" do
309
+ let(:text) { "foo bar's baz" }
310
+ let(:expected_words) { %w[foo bar baz] }
311
+
312
+ subject { described_class.new(normalize_apostrophes: true) }
313
+
314
+ it "must remove any trailing \"'s\" from the words" do
315
+ expect { |b|
316
+ subject.parse(text,&b)
317
+ }.to yield_successive_args(*expected_words)
318
+ end
319
+ end
320
+ end
321
+
322
+ context "and the words end with a '\\'' character" do
323
+ let(:expected_words) { %w[foo bar baz] }
324
+ let(:text) { "foo bar' baz" }
325
+
326
+ it "must skip the trailing '\\'' character" do
327
+ expect { |b|
328
+ subject.parse(text,&b)
329
+ }.to yield_successive_args(*expected_words)
330
+ end
331
+ end
332
+
333
+ context "and the words start with a '-' character" do
334
+ let(:expected_words) { %w[foo bar baz] }
335
+ let(:text) { "foo -bar baz" }
336
+
337
+ it "must skip the leading '-' character" do
338
+ expect { |b|
339
+ subject.parse(text,&b)
340
+ }.to yield_successive_args(*expected_words)
341
+ end
342
+ end
343
+
344
+ context "and the words contain '-' characters" do
345
+ let(:expected_words) { %w[foo-bar baz-qux] }
346
+
347
+ it "must parse words containing a '-'" do
348
+ expect { |b|
349
+ subject.parse(text,&b)
350
+ }.to yield_successive_args(*expected_words)
351
+ end
352
+
353
+ context "and when initialized with special_chars: keyword argument" do
354
+ context "and it does not include '-'" do
355
+ let(:text) { "foo bar-baz qux" }
356
+ let(:expected_words) { %w[foo bar baz qux] }
357
+
358
+ subject { described_class.new(special_chars: ['_']) }
359
+
360
+ it "must split hyphenated words into multiple words" do
361
+ expect { |b|
362
+ subject.parse(text,&b)
363
+ }.to yield_successive_args(*expected_words)
364
+ end
365
+ end
366
+ end
367
+ end
368
+
369
+ context "and the words end with a '-' character" do
370
+ let(:expected_words) { %w[foo bar baz] }
371
+ let(:text) { "foo bar- baz" }
372
+
373
+ it "must skip the trailing '-' character" do
374
+ expect { |b|
375
+ subject.parse(text,&b)
376
+ }.to yield_successive_args(*expected_words)
377
+ end
378
+ end
379
+
380
+ context "and the words start with a '_' character" do
381
+ let(:expected_words) { %w[foo bar baz] }
382
+ let(:text) { "foo _bar baz" }
383
+
384
+ it "must skip the leading '_' character" do
385
+ expect { |b|
386
+ subject.parse(text,&b)
387
+ }.to yield_successive_args(*expected_words)
388
+ end
389
+ end
390
+
391
+ context "and the words contain '_' characters" do
392
+ let(:expected_words) { %w[foo_bar baz_qux] }
393
+
394
+ it "must treat the words containing a '_' as a single word" do
395
+ expect { |b|
396
+ subject.parse(text,&b)
397
+ }.to yield_successive_args(*expected_words)
398
+ end
399
+
400
+ context "and when initialized with special_chars: keyword argument" do
401
+ context "and it does not include '_'" do
402
+ let(:text) { "foo bar_baz qux" }
403
+ let(:expected_words) { %w[foo bar baz qux] }
404
+
405
+ subject { described_class.new(special_chars: ['-']) }
406
+
407
+ it "must split words containing '_' into multiple words" do
408
+ expect { |b|
409
+ subject.parse(text,&b)
410
+ }.to yield_successive_args(*expected_words)
411
+ end
412
+ end
413
+ end
414
+ end
415
+
416
+ context "and the words end with a '_' character" do
417
+ let(:expected_words) { %w[foo bar baz] }
418
+ let(:text) { "foo bar_ baz" }
419
+
420
+ it "must skip the trailing '_' character" do
421
+ expect { |b|
422
+ subject.parse(text,&b)
423
+ }.to yield_successive_args(*expected_words)
424
+ end
425
+ end
426
+
427
+ context "and the words start with a '.' character" do
428
+ let(:expected_words) { %w[foo bar baz] }
429
+ let(:text) { "foo .bar baz" }
430
+
431
+ it "must skip the leading '.' character" do
432
+ expect { |b|
433
+ subject.parse(text,&b)
434
+ }.to yield_successive_args(*expected_words)
435
+ end
436
+ end
437
+
438
+ context "and the words contain '.' characters" do
439
+ let(:text) { "foo.bar baz.qux" }
440
+ let(:expected_words) { %w[foo bar baz qux] }
441
+
442
+ it "must split words containing '.' into multiple words" do
443
+ expect { |b|
444
+ subject.parse(text,&b)
445
+ }.to yield_successive_args(*expected_words)
446
+ end
447
+
448
+ context "and when initialized with special_chars: keyword argument" do
449
+ context "and it does include '.'" do
450
+ let(:expected_words) { %w[foo.bar baz.qux] }
451
+
452
+ subject { described_class.new(special_chars: ['.']) }
453
+
454
+ it "must treat words containing a '.' as a single word" do
455
+ expect { |b|
456
+ subject.parse(text,&b)
457
+ }.to yield_successive_args(*expected_words)
458
+ end
459
+ end
460
+ end
461
+ end
462
+
463
+ context "and the words end with a '.' character" do
464
+ let(:expected_words) { %w[foo bar baz] }
465
+ let(:text) { "foo bar. baz" }
466
+
467
+ it "must skip the trailing '.' character" do
468
+ expect { |b|
469
+ subject.parse(text,&b)
470
+ }.to yield_successive_args(*expected_words)
471
+ end
472
+
473
+ context "but the word is an acronym" do
474
+ let(:expected_words) { %w[foo B.A.R. baz] }
475
+ let(:text) { "foo B.A.R. baz" }
476
+
477
+ it "must parse whole acronyms" do
478
+ expect { |b|
479
+ subject.parse(text,&b)
480
+ }.to yield_successive_args(*expected_words)
481
+ end
482
+
483
+ context "but was initialized with acronyms: false" do
484
+ let(:expected_words) { %w[foo baz] }
485
+
486
+ subject { described_class.new(acronyms: false) }
487
+
488
+ it "must skip the whole acronyms" do
489
+ expect { |b|
490
+ subject.parse(text,&b)
491
+ }.to yield_successive_args(*expected_words)
492
+ end
493
+ end
494
+
495
+ context "and when initialized with normalize_acronyms: true" do
496
+ let(:expected_words) { %w[foo BAR baz] }
497
+ let(:text) { "foo B.A.R. baz" }
498
+
499
+ subject { described_class.new(normalize_acronyms: true) }
500
+
501
+ it "must remove the '.' characters from acronyms" do
502
+ expect { |b|
503
+ subject.parse(text,&b)
504
+ }.to yield_successive_args(*expected_words)
505
+ end
506
+ end
507
+ end
508
+ end
509
+ end
510
+
511
+ context "and when the text contains numbers" do
512
+ let(:text) { expected_words.join(" 1234 ") }
513
+
514
+ it "must ignore whole numbers" do
515
+ expect { |b|
516
+ subject.parse(text,&b)
517
+ }.to yield_successive_args(*expected_words)
518
+ end
519
+
520
+ context "when initialized with numbers: true" do
521
+ let(:expected_words) { %w[foo 1234 bar 000 baz 0123] }
522
+ let(:text) { expected_words.join(' ') }
523
+
524
+ subject { described_class.new(numbers: true) }
525
+
526
+ it "must parse whole numbers" do
527
+ expect { |b|
528
+ subject.parse(text,&b)
529
+ }.to yield_successive_args(*expected_words)
530
+ end
531
+ end
532
+
533
+ context "but the text also contains words that start with digits" do
534
+ let(:text) { expected_words.map { |word| "123#{word}" }.join(' ') }
535
+
536
+ it "must ignore the leading digits of words" do
537
+ expect { |b|
538
+ subject.parse(text,&b)
539
+ }.to yield_successive_args(*expected_words)
540
+ end
541
+ end
542
+
543
+ context "but the text also contains words that contain digits" do
544
+ let(:expected_words) { %w[foo bar1baz qux] }
545
+
546
+ it "must not ignore the digits within words" do
547
+ expect { |b|
548
+ subject.parse(text,&b)
549
+ }.to yield_successive_args(*expected_words)
550
+ end
551
+
552
+ context "but when initialized with digits: false" do
553
+ let(:expected_words) { %w[foo bar baz qux] }
554
+ let(:text) { "foo bar2baz qux" }
555
+
556
+ subject { described_class.new(digits: false) }
557
+
558
+ it "must ignore the digits within words" do
559
+ expect { |b|
560
+ subject.parse(text,&b)
561
+ }.to yield_successive_args(*expected_words)
562
+ end
563
+ end
564
+ end
565
+
566
+ context "but the text also contains words that end in digits" do
567
+ let(:expected_words) { super().map { |word| "#{word}123" } }
568
+
569
+ it "must not ignore the digits within words" do
570
+ expect { |b|
571
+ subject.parse(text,&b)
572
+ }.to yield_successive_args(*expected_words)
573
+ end
574
+
575
+ context "but when initialized with digits: false" do
576
+ let(:expected_words) { %w[foo bar baz qux] }
577
+ let(:text) { "foo bar2 baz qux4" }
578
+
579
+ subject { described_class.new(digits: false) }
580
+
581
+ it "must ignore the trailing digits of words" do
582
+ expect { |b|
583
+ subject.parse(text,&b)
584
+ }.to yield_successive_args(*expected_words)
585
+ end
586
+ end
587
+ end
588
+ end
589
+
590
+ context "and when the text contains new-lines" do
591
+ let(:text) { expected_words.join($/) }
592
+
593
+ it "must ignore new-line characters" do
594
+ expect { |b|
595
+ subject.parse(text,&b)
596
+ }.to yield_successive_args(*expected_words)
597
+ end
598
+ end
599
+
600
+ context "and when the text contains stop-words" do
601
+ let(:stop_words) { %w[a the is be] }
602
+ let(:text) { expected_words.zip(stop_words).flatten.join(' ') }
603
+
604
+ it "must ignore the stop-words and parse the non-stop-words" do
605
+ expect { |b|
606
+ subject.parse(text,&b)
607
+ }.to yield_successive_args(*expected_words)
608
+ end
609
+
610
+ context "and when the stop words are capitalized" do
611
+ let(:stop_words) { super().map(&:capitalize) }
612
+
613
+ it "must ignore the capitalized stop-words" do
614
+ expect { |b|
615
+ subject.parse(text,&b)
616
+ }.to yield_successive_args(*expected_words)
617
+ end
618
+ end
619
+
620
+ context "and when the stop words are uppercase" do
621
+ let(:stop_words) { super().map(&:upcase) }
622
+
623
+ it "must ignore the uppercase stop-words" do
624
+ expect { |b|
625
+ subject.parse(text,&b)
626
+ }.to yield_successive_args(*expected_words)
627
+ end
628
+ end
629
+
630
+ context "and when the text ends with a stop word" do
631
+ let(:text) { "#{super()} is" }
632
+
633
+ it "must ignore the last stop word" do
634
+ expect { |b|
635
+ subject.parse(text,&b)
636
+ }.to yield_successive_args(*expected_words)
637
+ end
638
+ end
639
+
640
+ context "and when a stop word is followed by other letters" do
641
+ let(:stop_word) { "be" }
642
+ let(:expected_words) { super() + ["#{stop_word}tter"] }
643
+
644
+ it "must not ignore stop words followed by other letters" do
645
+ expect { |b|
646
+ subject.parse(text,&b)
647
+ }.to yield_successive_args(*expected_words)
648
+ end
649
+ end
650
+
651
+ context "and when a stop word is followed by digits" do
652
+ let(:stop_word) { "a" }
653
+ let(:expected_words) { super() + ["#{stop_word}1234"] }
654
+
655
+ it "must not ignore stop words followed by digits" do
656
+ expect { |b|
657
+ subject.parse(text,&b)
658
+ }.to yield_successive_args(*expected_words)
659
+ end
660
+ end
661
+
662
+ context "and when a stop word is followed by punctuation" do
663
+ let(:stop_words) { %w[is. be, the?] }
664
+
665
+ it "must ignore stop words followed by punctuation" do
666
+ expect { |b|
667
+ subject.parse(text,&b)
668
+ }.to yield_successive_args(*expected_words)
669
+ end
670
+ end
671
+
672
+ context "and when the text contains multiple successive stop-words" do
673
+ let(:text) { (stop_words + expected_words).join(' ') }
674
+
675
+ it "must ignore multiple successive stop-words" do
676
+ expect { |b|
677
+ subject.parse(text,&b)
678
+ }.to yield_successive_args(*expected_words)
679
+ end
680
+ end
681
+ end
682
+
683
+ context "and when #ignore_words contains a String" do
684
+ let(:ignore_words) { %w[foo baz] }
685
+ let(:expected_words) { %w[bar qux] }
686
+ let(:text) { "foo bar baz qux" }
687
+
688
+ subject { described_class.new(ignore_words: ignore_words) }
689
+
690
+ it "must filter out words matching that String" do
691
+ expect { |b|
692
+ subject.parse(text,&b)
693
+ }.to yield_successive_args(*expected_words)
694
+ end
695
+ end
696
+
697
+ context "and when #ignore_words contains a Regexp" do
698
+ let(:ignore_words) { [/ba[a-z]/] }
699
+ let(:expected_words) { %w[foo qux] }
700
+ let(:text) { "foo bar baz qux" }
701
+
702
+ subject { described_class.new(ignore_words: ignore_words) }
703
+
704
+ it "must filter out words matching that Regexp" do
705
+ expect { |b|
706
+ subject.parse(text,&b)
707
+ }.to yield_successive_args(*expected_words)
708
+ end
709
+ end
710
+ end
711
+
712
+ context "when no block is given" do
713
+ it "must return an Array of the parsed words" do
714
+ expect(subject.parse(text)).to eq(expected_words)
715
+ end
716
+ end
717
+ end
718
+ end