wordlist 0.1.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (148) hide show
  1. checksums.yaml +7 -0
  2. data/.github/workflows/ruby.yml +27 -0
  3. data/.gitignore +6 -3
  4. data/ChangeLog.md +45 -1
  5. data/Gemfile +13 -0
  6. data/LICENSE.txt +1 -3
  7. data/README.md +266 -61
  8. data/Rakefile +7 -32
  9. data/benchmarks.rb +115 -0
  10. data/bin/wordlist +4 -7
  11. data/data/stop_words/ar.txt +104 -0
  12. data/data/stop_words/bg.txt +259 -0
  13. data/data/stop_words/bn.txt +363 -0
  14. data/data/stop_words/ca.txt +126 -0
  15. data/data/stop_words/cs.txt +138 -0
  16. data/data/stop_words/da.txt +101 -0
  17. data/data/stop_words/de.txt +129 -0
  18. data/data/stop_words/el.txt +79 -0
  19. data/data/stop_words/en.txt +175 -0
  20. data/data/stop_words/es.txt +178 -0
  21. data/data/stop_words/eu.txt +98 -0
  22. data/data/stop_words/fa.txt +332 -0
  23. data/data/stop_words/fi.txt +747 -0
  24. data/data/stop_words/fr.txt +116 -0
  25. data/data/stop_words/ga.txt +109 -0
  26. data/data/stop_words/gl.txt +160 -0
  27. data/data/stop_words/he.txt +499 -0
  28. data/data/stop_words/hi.txt +97 -0
  29. data/data/stop_words/hr.txt +179 -0
  30. data/data/stop_words/hu.txt +35 -0
  31. data/data/stop_words/hy.txt +45 -0
  32. data/data/stop_words/id.txt +357 -0
  33. data/data/stop_words/it.txt +134 -0
  34. data/data/stop_words/ja.txt +44 -0
  35. data/data/stop_words/ko.txt +677 -0
  36. data/data/stop_words/ku.txt +63 -0
  37. data/data/stop_words/lt.txt +507 -0
  38. data/data/stop_words/lv.txt +163 -0
  39. data/data/stop_words/mr.txt +99 -0
  40. data/data/stop_words/nl.txt +48 -0
  41. data/data/stop_words/no.txt +172 -0
  42. data/data/stop_words/pl.txt +138 -0
  43. data/data/stop_words/pt.txt +147 -0
  44. data/data/stop_words/ro.txt +281 -0
  45. data/data/stop_words/ru.txt +421 -0
  46. data/data/stop_words/sk.txt +173 -0
  47. data/data/stop_words/sv.txt +386 -0
  48. data/data/stop_words/th.txt +115 -0
  49. data/data/stop_words/tr.txt +114 -0
  50. data/data/stop_words/uk.txt +28 -0
  51. data/data/stop_words/ur.txt +513 -0
  52. data/data/stop_words/zh.txt +125 -0
  53. data/gemspec.yml +4 -10
  54. data/lib/wordlist/abstract_wordlist.rb +24 -0
  55. data/lib/wordlist/builder.rb +170 -138
  56. data/lib/wordlist/cli.rb +458 -0
  57. data/lib/wordlist/compression/reader.rb +72 -0
  58. data/lib/wordlist/compression/writer.rb +80 -0
  59. data/lib/wordlist/exceptions.rb +31 -0
  60. data/lib/wordlist/file.rb +176 -0
  61. data/lib/wordlist/format.rb +38 -0
  62. data/lib/wordlist/lexer/lang.rb +32 -0
  63. data/lib/wordlist/lexer/stop_words.rb +68 -0
  64. data/lib/wordlist/lexer.rb +218 -0
  65. data/lib/wordlist/list_methods.rb +462 -0
  66. data/lib/wordlist/modifiers/capitalize.rb +45 -0
  67. data/lib/wordlist/modifiers/downcase.rb +45 -0
  68. data/lib/wordlist/modifiers/gsub.rb +51 -0
  69. data/lib/wordlist/modifiers/modifier.rb +44 -0
  70. data/lib/wordlist/modifiers/mutate.rb +133 -0
  71. data/lib/wordlist/modifiers/mutate_case.rb +25 -0
  72. data/lib/wordlist/modifiers/sub.rb +97 -0
  73. data/lib/wordlist/modifiers/tr.rb +71 -0
  74. data/lib/wordlist/modifiers/upcase.rb +45 -0
  75. data/lib/wordlist/modifiers.rb +8 -0
  76. data/lib/wordlist/operators/binary_operator.rb +38 -0
  77. data/lib/wordlist/operators/concat.rb +47 -0
  78. data/lib/wordlist/operators/intersect.rb +55 -0
  79. data/lib/wordlist/operators/operator.rb +29 -0
  80. data/lib/wordlist/operators/power.rb +72 -0
  81. data/lib/wordlist/operators/product.rb +50 -0
  82. data/lib/wordlist/operators/subtract.rb +54 -0
  83. data/lib/wordlist/operators/unary_operator.rb +29 -0
  84. data/lib/wordlist/operators/union.rb +61 -0
  85. data/lib/wordlist/operators/unique.rb +52 -0
  86. data/lib/wordlist/operators.rb +7 -0
  87. data/lib/wordlist/unique_filter.rb +40 -61
  88. data/lib/wordlist/version.rb +1 -1
  89. data/lib/wordlist/words.rb +71 -0
  90. data/lib/wordlist.rb +103 -2
  91. data/spec/abstract_list_spec.rb +18 -0
  92. data/spec/builder_spec.rb +220 -76
  93. data/spec/cli_spec.rb +801 -0
  94. data/spec/compression/reader_spec.rb +137 -0
  95. data/spec/compression/writer_spec.rb +194 -0
  96. data/spec/file_spec.rb +258 -0
  97. data/spec/fixtures/wordlist.txt +15 -0
  98. data/spec/fixtures/wordlist.txt.bz2 +0 -0
  99. data/spec/fixtures/wordlist.txt.gz +0 -0
  100. data/spec/fixtures/wordlist.txt.xz +0 -0
  101. data/spec/fixtures/wordlist_with_ambiguous_format +3 -0
  102. data/spec/fixtures/wordlist_with_comments.txt +19 -0
  103. data/spec/fixtures/wordlist_with_empty_lines.txt +19 -0
  104. data/spec/format_spec.rb +50 -0
  105. data/spec/helpers/text.rb +3 -3
  106. data/spec/helpers/wordlist.rb +2 -2
  107. data/spec/lexer/lang_spec.rb +70 -0
  108. data/spec/lexer/stop_words_spec.rb +77 -0
  109. data/spec/lexer_spec.rb +652 -0
  110. data/spec/list_methods_spec.rb +181 -0
  111. data/spec/modifiers/capitalize_spec.rb +27 -0
  112. data/spec/modifiers/downcase_spec.rb +27 -0
  113. data/spec/modifiers/gsub_spec.rb +59 -0
  114. data/spec/modifiers/modifier_spec.rb +20 -0
  115. data/spec/modifiers/mutate_case_spec.rb +46 -0
  116. data/spec/modifiers/mutate_spec.rb +39 -0
  117. data/spec/modifiers/sub_spec.rb +98 -0
  118. data/spec/modifiers/tr_spec.rb +46 -0
  119. data/spec/modifiers/upcase_spec.rb +27 -0
  120. data/spec/operators/binary_operator_spec.rb +19 -0
  121. data/spec/operators/concat_spec.rb +26 -0
  122. data/spec/operators/intersect_spec.rb +37 -0
  123. data/spec/operators/operator_spec.rb +16 -0
  124. data/spec/operators/power_spec.rb +57 -0
  125. data/spec/operators/product_spec.rb +39 -0
  126. data/spec/operators/subtract_spec.rb +37 -0
  127. data/spec/operators/union_spec.rb +37 -0
  128. data/spec/operators/unique_spec.rb +25 -0
  129. data/spec/spec_helper.rb +2 -1
  130. data/spec/unique_filter_spec.rb +108 -18
  131. data/spec/wordlist_spec.rb +55 -3
  132. data/spec/words_spec.rb +41 -0
  133. metadata +183 -120
  134. data/lib/wordlist/builders/website.rb +0 -216
  135. data/lib/wordlist/builders.rb +0 -1
  136. data/lib/wordlist/flat_file.rb +0 -47
  137. data/lib/wordlist/list.rb +0 -162
  138. data/lib/wordlist/mutator.rb +0 -113
  139. data/lib/wordlist/parsers.rb +0 -74
  140. data/lib/wordlist/runners/list.rb +0 -116
  141. data/lib/wordlist/runners/runner.rb +0 -67
  142. data/lib/wordlist/runners.rb +0 -2
  143. data/scripts/benchmark +0 -59
  144. data/scripts/text/comedy_of_errors.txt +0 -4011
  145. data/spec/flat_file_spec.rb +0 -25
  146. data/spec/list_spec.rb +0 -58
  147. data/spec/mutator_spec.rb +0 -43
  148. data/spec/parsers_spec.rb +0 -118
@@ -0,0 +1,652 @@
1
+ require 'spec_helper'
2
+ require 'wordlist/lexer'
3
+
4
+ describe Wordlist::Lexer do
5
+ let(:text) { "foo bar baz qux" }
6
+
7
+ it do
8
+ expect(described_class).to include(Enumerable)
9
+ end
10
+
11
+ describe "#initialize" do
12
+ let(:default_lang) { Wordlist::Lexer::Lang.default }
13
+
14
+ it "must default #lang to Lang.default" do
15
+ expect(subject.lang).to eq(default_lang)
16
+ end
17
+
18
+ it "must set #stop_words to the stop words for the default language" do
19
+ expect(subject.stop_words).to eq(Wordlist::Lexer::StopWords[default_lang])
20
+ end
21
+
22
+ it "must default #ignore_words to []" do
23
+ expect(subject.ignore_words).to eq([])
24
+ end
25
+
26
+ it "must default #digits? to true" do
27
+ expect(subject.digits?).to be(true)
28
+ end
29
+
30
+ it "must default #special_chars to SPECIAL_CHARS" do
31
+ expect(subject.special_chars).to eq(described_class::SPECIAL_CHARS)
32
+ end
33
+
34
+ it "must default #numbers? to false" do
35
+ expect(subject.numbers?).to be(false)
36
+ end
37
+
38
+ it "must default #acronyms? to true" do
39
+ expect(subject.acronyms?).to be(true)
40
+ end
41
+
42
+ it "must default #normalize_case? to false" do
43
+ expect(subject.normalize_case?).to be(false)
44
+ end
45
+
46
+ it "must default #normalize_apostrophes? to false" do
47
+ expect(subject.normalize_apostrophes?).to be(false)
48
+ end
49
+
50
+ it "must default #normalize_acronyms? to false" do
51
+ expect(subject.normalize_acronyms?).to be(false)
52
+ end
53
+
54
+ context "when the lang: keyword is given" do
55
+ let(:lang) { :es }
56
+
57
+ subject { described_class.new(lang: lang) }
58
+
59
+ it "must set #lang" do
60
+ expect(subject.lang).to eq(lang)
61
+ end
62
+
63
+ it "must set #stop_words to the stop words for that language" do
64
+ expect(subject.stop_words).to eq(Wordlist::Lexer::StopWords[lang])
65
+ end
66
+ end
67
+
68
+ context "when ignore_words: keyword argument is given" do
69
+ let(:ignore_words) { %w[foo bar] }
70
+
71
+ subject { described_class.new(ignore_words: ignore_words) }
72
+
73
+ it "must set #ignore_words" do
74
+ expect(subject.ignore_words).to eq(ignore_words)
75
+ end
76
+
77
+ context "and it contains an object other than a String or Regexp" do
78
+ it do
79
+ expect {
80
+ described_class.new(ignore_words: [Object.new])
81
+ }.to raise_error(ArgumentError,"ignore_words: must contain only Strings or Regexps")
82
+ end
83
+ end
84
+ end
85
+
86
+ context "when digits: false is given" do
87
+ subject { described_class.new(digits: false) }
88
+
89
+ it "must set #digits? to false" do
90
+ expect(subject.digits?).to be(false)
91
+ end
92
+ end
93
+
94
+ context "when special_chars: keyword is given" do
95
+ let(:special_chars) { %w[_ -] }
96
+
97
+ subject { described_class.new(special_chars: special_chars) }
98
+
99
+ it "must set #special_chars" do
100
+ expect(subject.special_chars).to eq(special_chars)
101
+ end
102
+ end
103
+
104
+ context "when numbers: true is given" do
105
+ subject { described_class.new(numbers: true) }
106
+
107
+ it "must set #numbers? to true" do
108
+ expect(subject.numbers?).to be(true)
109
+ end
110
+ end
111
+
112
+ context "when acronyms: true is given" do
113
+ subject { described_class.new(acronyms: true) }
114
+
115
+ it "must set #acronyms? to true" do
116
+ expect(subject.acronyms?).to be(true)
117
+ end
118
+ end
119
+
120
+ context "when normalize_case: true is given" do
121
+ subject { described_class.new(normalize_case: true) }
122
+
123
+ it "must set #normalize_case? to true" do
124
+ expect(subject.normalize_case?).to be(true)
125
+ end
126
+ end
127
+
128
+ context "when normalize_apostrophes: true is given" do
129
+ subject { described_class.new(normalize_apostrophes: true) }
130
+
131
+ it "must set #normalize_apostrophes? to true" do
132
+ expect(subject.normalize_apostrophes?).to be(true)
133
+ end
134
+ end
135
+
136
+ context "when normalize_acronyms: true is given" do
137
+ subject { described_class.new(normalize_acronyms: true) }
138
+
139
+ it "must set #normalize_acronyms? to true" do
140
+ expect(subject.normalize_acronyms?).to be(true)
141
+ end
142
+ end
143
+ end
144
+
145
+ describe "#parse" do
146
+ let(:expected_words) { %w[foo bar baz qux] }
147
+ let(:text) { expected_words.join(' ') }
148
+
149
+ context "when a block is given" do
150
+ it "must yield each scanned word from the text" do
151
+ expect { |b|
152
+ subject.parse(text,&b)
153
+ }.to yield_successive_args(*expected_words)
154
+ end
155
+
156
+ context "when the words contain uppercase letters" do
157
+ let(:expected_words) { %w[foo Bar baZ QUX] }
158
+
159
+ it "must parse words containing uppercase letters" do
160
+ expect { |b|
161
+ subject.parse(text,&b)
162
+ }.to yield_successive_args(*expected_words)
163
+ end
164
+
165
+ context "but when initialized with normalize_case: true" do
166
+ let(:expected_words) { %w[foo bar baz qux] }
167
+ let(:text) { "foo Bar baZ QUX" }
168
+
169
+ subject { described_class.new(normalize_case: true) }
170
+
171
+ it "must convert all words to lowercase" do
172
+ expect { |b|
173
+ subject.parse(text,&b)
174
+ }.to yield_successive_args(*expected_words)
175
+ end
176
+ end
177
+ end
178
+
179
+ context "and when the text contains single letters" do
180
+ let(:letters) { %w[x y z] }
181
+ let(:expected_words) { super() + letters }
182
+
183
+ it "must parse single letter words" do
184
+ expect { |b|
185
+ subject.parse(text,&b)
186
+ }.to yield_successive_args(*expected_words)
187
+ end
188
+
189
+ context "when the text also contains single letter stop words" do
190
+ let(:letters) { %w[a b c i j k] }
191
+ let(:stop_words) { %w[a i] }
192
+ let(:expected_words) { super() - stop_words }
193
+ let(:text) { "#{super()} #{stop_words.join(' ')}" }
194
+
195
+ it "must ignore the single letter stop words" do
196
+ expect { |b|
197
+ subject.parse(text,&b)
198
+ }.to yield_successive_args(*expected_words)
199
+ end
200
+ end
201
+ end
202
+
203
+ context "and when the text contains newlines" do
204
+ let(:text) { expected_words.join("\n") }
205
+
206
+ it "must parse each line" do
207
+ expect { |b|
208
+ subject.parse(text,&b)
209
+ }.to yield_successive_args(*expected_words)
210
+ end
211
+ end
212
+
213
+ context "and when the text contains punctuation" do
214
+ let(:text) { expected_words.join(", ") + '.' }
215
+
216
+ it "must ignore all punctuation" do
217
+ expect { |b|
218
+ subject.parse(text,&b)
219
+ }.to yield_successive_args(*expected_words)
220
+ end
221
+
222
+ context "and the words start with a '\\'' character" do
223
+ let(:expected_words) { %w[foo bar baz] }
224
+ let(:text) { "foo 'bar baz" }
225
+
226
+ it "must skip the leading '\\'' character" do
227
+ expect { |b|
228
+ subject.parse(text,&b)
229
+ }.to yield_successive_args(*expected_words)
230
+ end
231
+ end
232
+
233
+ context "and the words contain '\\'' characters" do
234
+ let(:expected_words) { super() + %w[O'Brian] }
235
+
236
+ it "must parse the words containing a '\\''" do
237
+ expect { |b|
238
+ subject.parse(text,&b)
239
+ }.to yield_successive_args(*expected_words)
240
+ end
241
+
242
+ context "and when initialized with normalize_apostrophes: true" do
243
+ let(:text) { "foo bar's baz" }
244
+ let(:expected_words) { %w[foo bar baz] }
245
+
246
+ subject { described_class.new(normalize_apostrophes: true) }
247
+
248
+ it "must remove any trailing \"'s\" from the words" do
249
+ expect { |b|
250
+ subject.parse(text,&b)
251
+ }.to yield_successive_args(*expected_words)
252
+ end
253
+ end
254
+ end
255
+
256
+ context "and the words end with a '\\'' character" do
257
+ let(:expected_words) { %w[foo bar baz] }
258
+ let(:text) { "foo bar' baz" }
259
+
260
+ it "must skip the trailing '\\'' character" do
261
+ expect { |b|
262
+ subject.parse(text,&b)
263
+ }.to yield_successive_args(*expected_words)
264
+ end
265
+ end
266
+
267
+ context "and the words start with a '-' character" do
268
+ let(:expected_words) { %w[foo bar baz] }
269
+ let(:text) { "foo -bar baz" }
270
+
271
+ it "must skip the leading '-' character" do
272
+ expect { |b|
273
+ subject.parse(text,&b)
274
+ }.to yield_successive_args(*expected_words)
275
+ end
276
+ end
277
+
278
+ context "and the words contain '-' characters" do
279
+ let(:expected_words) { %w[foo-bar baz-qux] }
280
+
281
+ it "must parse words containing a '-'" do
282
+ expect { |b|
283
+ subject.parse(text,&b)
284
+ }.to yield_successive_args(*expected_words)
285
+ end
286
+
287
+ context "and when initialized with the special_chars: keyword argument" do
288
+ context "and it does not include '-'" do
289
+ let(:text) { "foo bar-baz qux" }
290
+ let(:expected_words) { %w[foo bar baz qux] }
291
+
292
+ subject { described_class.new(special_chars: ['_']) }
293
+
294
+ it "must split hyphenated words into multiple words" do
295
+ expect { |b|
296
+ subject.parse(text,&b)
297
+ }.to yield_successive_args(*expected_words)
298
+ end
299
+ end
300
+ end
301
+ end
302
+
303
+ context "and the words end with a '-' character" do
304
+ let(:expected_words) { %w[foo bar baz] }
305
+ let(:text) { "foo bar- baz" }
306
+
307
+ it "must skip the trailing '-' character" do
308
+ expect { |b|
309
+ subject.parse(text,&b)
310
+ }.to yield_successive_args(*expected_words)
311
+ end
312
+ end
313
+
314
+ context "and the words start with a '_' character" do
315
+ let(:expected_words) { %w[foo bar baz] }
316
+ let(:text) { "foo _bar baz" }
317
+
318
+ it "must skip the leading '_' character" do
319
+ expect { |b|
320
+ subject.parse(text,&b)
321
+ }.to yield_successive_args(*expected_words)
322
+ end
323
+ end
324
+
325
+ context "and the words contain '_' characters" do
326
+ let(:expected_words) { %w[foo_bar baz_qux] }
327
+
328
+ it "must treat the words containing a '_' as a single word" do
329
+ expect { |b|
330
+ subject.parse(text,&b)
331
+ }.to yield_successive_args(*expected_words)
332
+ end
333
+
334
+ context "and when initialized with the special_chars: keyword argument" do
335
+ context "and it does not include '_'" do
336
+ let(:text) { "foo bar_baz qux" }
337
+ let(:expected_words) { %w[foo bar baz qux] }
338
+
339
+ subject { described_class.new(special_chars: ['-']) }
340
+
341
+ it "must split underscored words into multiple words" do
342
+ expect { |b|
343
+ subject.parse(text,&b)
344
+ }.to yield_successive_args(*expected_words)
345
+ end
346
+ end
347
+ end
348
+ end
349
+
350
+ context "and the words end with a '_' character" do
351
+ let(:expected_words) { %w[foo bar baz] }
352
+ let(:text) { "foo bar_ baz" }
353
+
354
+ it "must skip the trailing '_' character" do
355
+ expect { |b|
356
+ subject.parse(text,&b)
357
+ }.to yield_successive_args(*expected_words)
358
+ end
359
+ end
360
+
361
+ context "and the words start with a '.' character" do
362
+ let(:expected_words) { %w[foo bar baz] }
363
+ let(:text) { "foo .bar baz" }
364
+
365
+ it "must skip the leading '.' character" do
366
+ expect { |b|
367
+ subject.parse(text,&b)
368
+ }.to yield_successive_args(*expected_words)
369
+ end
370
+ end
371
+
372
+ context "and the words contain '.' characters" do
373
+ let(:text) { "foo.bar baz.qux" }
374
+ let(:expected_words) { %w[foo bar baz qux] }
375
+
376
+ it "must split words containing '.' into multiple words" do
377
+ expect { |b|
378
+ subject.parse(text,&b)
379
+ }.to yield_successive_args(*expected_words)
380
+ end
381
+
382
+ context "and when initialized with the special_chars: keyword argument" do
383
+ context "and it does include '.'" do
384
+ let(:expected_words) { %w[foo.bar baz.qux] }
385
+
386
+ subject { described_class.new(special_chars: ['.']) }
387
+
388
+ it "must treat words containing a '.' as a single word" do
389
+ expect { |b|
390
+ subject.parse(text,&b)
391
+ }.to yield_successive_args(*expected_words)
392
+ end
393
+ end
394
+ end
395
+ end
396
+
397
+ context "and the words end with a '.' character" do
398
+ let(:expected_words) { %w[foo bar baz] }
399
+ let(:text) { "foo bar. baz" }
400
+
401
+ it "must skip the trailing '.' character" do
402
+ expect { |b|
403
+ subject.parse(text,&b)
404
+ }.to yield_successive_args(*expected_words)
405
+ end
406
+
407
+ context "but the word is an acronym" do
408
+ let(:expected_words) { %w[foo B.A.R. baz] }
409
+ let(:text) { "foo B.A.R. baz" }
410
+
411
+ it "must parse whole acronyms" do
412
+ expect { |b|
413
+ subject.parse(text,&b)
414
+ }.to yield_successive_args(*expected_words)
415
+ end
416
+
417
+ context "but was initialized with acronyms: false" do
418
+ let(:expected_words) { %w[foo baz] }
419
+
420
+ subject { described_class.new(acronyms: false) }
421
+
422
+ it "must skip the whole acronyms" do
423
+ expect { |b|
424
+ subject.parse(text,&b)
425
+ }.to yield_successive_args(*expected_words)
426
+ end
427
+ end
428
+
429
+ context "and when initialized with normalize_acronyms: true" do
430
+ let(:expected_words) { %w[foo BAR baz] }
431
+ let(:text) { "foo B.A.R. baz" }
432
+
433
+ subject { described_class.new(normalize_acronyms: true) }
434
+
435
+ it "must remove the '.' characters from acronyms" do
436
+ expect { |b|
437
+ subject.parse(text,&b)
438
+ }.to yield_successive_args(*expected_words)
439
+ end
440
+ end
441
+ end
442
+ end
443
+ end
444
+
445
+ context "and when the text contains numbers" do
446
+ let(:text) { expected_words.join(" 1234 ") }
447
+
448
+ it "must ignore whole numbers" do
449
+ expect { |b|
450
+ subject.parse(text,&b)
451
+ }.to yield_successive_args(*expected_words)
452
+ end
453
+
454
+ context "when initialized with numbers: true" do
455
+ let(:expected_words) { %w[foo 1234 bar 000 baz 0123] }
456
+ let(:text) { expected_words.join(' ') }
457
+
458
+ subject { described_class.new(numbers: true) }
459
+
460
+ it "must parse whole numbers" do
461
+ expect { |b|
462
+ subject.parse(text,&b)
463
+ }.to yield_successive_args(*expected_words)
464
+ end
465
+ end
466
+
467
+ context "but the text also contains words that start with digits" do
468
+ let(:text) { expected_words.map { |word| "123#{word}" }.join(' ') }
469
+
470
+ it "must ignore the leading digits of words" do
471
+ expect { |b|
472
+ subject.parse(text,&b)
473
+ }.to yield_successive_args(*expected_words)
474
+ end
475
+ end
476
+
477
+ context "but the text also contains words that contain digits" do
478
+ let(:expected_words) { %w[foo bar1baz qux] }
479
+
480
+ it "must not ignore the digits within words" do
481
+ expect { |b|
482
+ subject.parse(text,&b)
483
+ }.to yield_successive_args(*expected_words)
484
+ end
485
+
486
+ context "but when initialized with digits: false" do
487
+ let(:expected_words) { %w[foo bar baz qux] }
488
+ let(:text) { "foo bar2baz qux" }
489
+
490
+ subject { described_class.new(digits: false) }
491
+
492
+ it "must ignore the digits within words" do
493
+ expect { |b|
494
+ subject.parse(text,&b)
495
+ }.to yield_successive_args(*expected_words)
496
+ end
497
+ end
498
+ end
499
+
500
+ context "but the text also contains words that end in digits" do
501
+ let(:expected_words) { super().map { |word| "#{word}123" } }
502
+
503
+ it "must not ignore the digits within words" do
504
+ expect { |b|
505
+ subject.parse(text,&b)
506
+ }.to yield_successive_args(*expected_words)
507
+ end
508
+
509
+ context "but when initialized with digits: false" do
510
+ let(:expected_words) { %w[foo bar baz qux] }
511
+ let(:text) { "foo bar2 baz qux4" }
512
+
513
+ subject { described_class.new(digits: false) }
514
+
515
+ it "must ignore the trailing digits of words" do
516
+ expect { |b|
517
+ subject.parse(text,&b)
518
+ }.to yield_successive_args(*expected_words)
519
+ end
520
+ end
521
+ end
522
+ end
523
+
524
+ context "and when the text contains new-lines" do
525
+ let(:text) { expected_words.join($/) }
526
+
527
+ it "must ignore new-line characters" do
528
+ expect { |b|
529
+ subject.parse(text,&b)
530
+ }.to yield_successive_args(*expected_words)
531
+ end
532
+ end
533
+
534
+ context "and when the text contains stop-words" do
535
+ let(:stop_words) { %w[a the is be] }
536
+ let(:text) { expected_words.zip(stop_words).flatten.join(' ') }
537
+
538
+ it "must ignore the stop-words and parse the non-stop-words" do
539
+ expect { |b|
540
+ subject.parse(text,&b)
541
+ }.to yield_successive_args(*expected_words)
542
+ end
543
+
544
+ context "and when the stop words are capitalized" do
545
+ let(:stop_words) { super().map(&:capitalize) }
546
+
547
+ it "must ignore the capitalized stop-words" do
548
+ expect { |b|
549
+ subject.parse(text,&b)
550
+ }.to yield_successive_args(*expected_words)
551
+ end
552
+ end
553
+
554
+ context "and when the stop words are uppercase" do
555
+ let(:stop_words) { super().map(&:upcase) }
556
+
557
+ it "must ignore the uppercase stop-words" do
558
+ expect { |b|
559
+ subject.parse(text,&b)
560
+ }.to yield_successive_args(*expected_words)
561
+ end
562
+ end
563
+
564
+ context "and when the text ends with a stop word" do
565
+ let(:text) { "#{super()} is" }
566
+
567
+ it "must ignore the last stop word" do
568
+ expect { |b|
569
+ subject.parse(text,&b)
570
+ }.to yield_successive_args(*expected_words)
571
+ end
572
+ end
573
+
574
+ context "and when a stop word is followed by other letters" do
575
+ let(:stop_word) { "be" }
576
+ let(:expected_words) { super() + ["#{stop_word}tter"] }
577
+
578
+ it "must not ignore stop words followed by other letters" do
579
+ expect { |b|
580
+ subject.parse(text,&b)
581
+ }.to yield_successive_args(*expected_words)
582
+ end
583
+ end
584
+
585
+ context "and when a stop word is followed by digits" do
586
+ let(:stop_word) { "a" }
587
+ let(:expected_words) { super() + ["#{stop_word}1234"] }
588
+
589
+ it "must not ignore stop words followed by digits" do
590
+ expect { |b|
591
+ subject.parse(text,&b)
592
+ }.to yield_successive_args(*expected_words)
593
+ end
594
+ end
595
+
596
+ context "and when a stop word is followed by punctuation" do
597
+ let(:stop_words) { %w[is. be, the?] }
598
+
599
+ it "must still ignore stop words followed by punctuation" do
600
+ expect { |b|
601
+ subject.parse(text,&b)
602
+ }.to yield_successive_args(*expected_words)
603
+ end
604
+ end
605
+
606
+ context "and when the text contains multiple successive stop-words" do
607
+ let(:text) { (stop_words + expected_words).join(' ') }
608
+
609
+ it "must ignore multiple successive stop-words" do
610
+ expect { |b|
611
+ subject.parse(text,&b)
612
+ }.to yield_successive_args(*expected_words)
613
+ end
614
+ end
615
+ end
616
+
617
+ context "and when #ignore_words contains a String" do
618
+ let(:ignore_words) { %w[foo baz] }
619
+ let(:expected_words) { %w[bar qux] }
620
+ let(:text) { "foo bar baz qux" }
621
+
622
+ subject { described_class.new(ignore_words: ignore_words) }
623
+
624
+ it "must filter out words matching that String" do
625
+ expect { |b|
626
+ subject.parse(text,&b)
627
+ }.to yield_successive_args(*expected_words)
628
+ end
629
+ end
630
+
631
+ context "and when #ignore_words contains a Regexp" do
632
+ let(:ignore_words) { [/ba[a-z]/] }
633
+ let(:expected_words) { %w[foo qux] }
634
+ let(:text) { "foo bar baz qux" }
635
+
636
+ subject { described_class.new(ignore_words: ignore_words) }
637
+
638
+ it "must filter out words matching that Regexp" do
639
+ expect { |b|
640
+ subject.parse(text,&b)
641
+ }.to yield_successive_args(*expected_words)
642
+ end
643
+ end
644
+ end
645
+
646
+ context "when no block is given" do
647
+ it "must return an Array of the parsed words" do
648
+ expect(subject.parse(text)).to eq(expected_words)
649
+ end
650
+ end
651
+ end
652
+ end