pragmatic_tokenizer 1.4.0 → 1.5.0

Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +184 -0
  3. data/.rubocop_todo.yml +66 -0
  4. data/README.md +0 -7
  5. data/Rakefile +1 -1
  6. data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
  7. data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
  8. data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
  9. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
  10. data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
  11. data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
  12. data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
  13. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
  15. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  16. data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
  17. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
  19. data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
  20. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  21. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  22. data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
  23. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  25. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  26. data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
  27. data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
  28. data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
  29. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  30. data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
  31. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  32. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  33. data/lib/pragmatic_tokenizer/languages.rb +28 -28
  34. data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
  35. data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
  36. data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
  37. data/lib/pragmatic_tokenizer/version.rb +1 -1
  38. data/pragmatic_tokenizer.gemspec +1 -0
  39. data/spec/languages/bulgarian_spec.rb +17 -13
  40. data/spec/languages/deutsch_spec.rb +110 -86
  41. data/spec/languages/english_spec.rb +465 -342
  42. data/spec/languages/french_spec.rb +3 -2
  43. data/spec/performance_spec.rb +7 -7
  44. data/spec/pragmatic_tokenizer_spec.rb +8 -8
  45. metadata +18 -2
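
The full diff shown below is for data/spec/languages/english_spec.rb (file 41 in the list). Most of the churn is mechanical: RuboCop configuration was added (files 2 and 3), multi-line PragmaticTokenizer::Tokenizer.new calls were re-wrapped so the text argument sits on its own line, and plain word arrays were rewritten as %w() literals. As a minimal sketch of the call pattern these specs exercise (input string, options, and expected tokens copied verbatim from the spec below):

    require 'pragmatic_tokenizer'

    # Call layout used throughout the 1.5.0 specs: the text is the first
    # positional argument and the options follow as a hash.
    pt = PragmaticTokenizer::Tokenizer.new(
      'His name is Mr. Smith.',
      language:    'en',    # selects abbreviation list, stop words, contractions
      punctuation: 'none'   # drop punctuation tokens from the output
    )
    pt.tokenize
    # => ["his", "name", "is", "mr.", "smith"]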
data/spec/languages/english_spec.rb
@@ -3,7 +3,6 @@ require 'spec_helper'
  describe PragmaticTokenizer do
  context 'Language: English (en)' do
  context '#tokenize (example strings)' do
-
  context 'no options selected' do
  it 'tokenizes a string #001' do
  text = "Hello world."
@@ -171,7 +170,7 @@ describe PragmaticTokenizer do
  text = "U.S.A. U.S.A. U.S.A."
  pt = PragmaticTokenizer::Tokenizer.new(text)
  expect(pt.tokenize).to eq(
- ["u.s.a.", "u.s.a.", "u.s.a."]
+ ["u.s.a.", "u.s.a.", "u.s.a."]
  )
  end
  end
@@ -186,8 +185,9 @@ describe PragmaticTokenizer do
  it 'fails to recognize an English abbreviation if the user supplies an abbreviations array without it' do
  text = "Mr. Smith, hello world."
  abbreviations = ['mrs']
- pt = PragmaticTokenizer::Tokenizer.new(text,
- abbreviations: abbreviations
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ abbreviations: abbreviations
  )
  expect(pt.tokenize).to eq(["mr", ".", "smith", ",", "hello", "world", "."])
  end
@@ -195,8 +195,9 @@ describe PragmaticTokenizer do
  it 'recognizes a user-supplied abbreviation' do
  text = "thisisnotanormalabbreviation. hello world."
  abbreviations = ['thisisnotanormalabbreviation']
- pt = PragmaticTokenizer::Tokenizer.new(text,
- abbreviations: abbreviations
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ abbreviations: abbreviations
  )
  expect(pt.tokenize).to eq(["thisisnotanormalabbreviation.", "hello", "world", "."])
  end
@@ -204,16 +205,18 @@ describe PragmaticTokenizer do
  it 'handles an empty user-supplied abbreviation array' do
  text = "thisisnotanormalabbreviation. hello world."
  abbreviations = []
- pt = PragmaticTokenizer::Tokenizer.new(text,
- abbreviations: abbreviations
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ abbreviations: abbreviations
  )
  expect(pt.tokenize).to eq(["thisisnotanormalabbreviation", ".", "hello", "world", "."])
  end

  it 'handles abrreviations across multiple languages' do
  text = "Mr. Smith how are ü. today."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- filter_languages: [:en, :de]
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ filter_languages: [:en, :de]
  )
  expect(pt.tokenize).to eq(["mr.", "smith", "how", "are", "ü.", "today", "."])
  end
@@ -221,9 +224,10 @@ describe PragmaticTokenizer do
  it 'handles abrreviations across multiple languages and user-supplied abbreviations' do
  text = "Adj. Smith how are ü. today. thisisnotanormalabbreviation. is it?"
  abbreviations = ['thisisnotanormalabbreviation']
- pt = PragmaticTokenizer::Tokenizer.new(text,
- filter_languages: [:en, :de],
- abbreviations: abbreviations
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ filter_languages: [:en, :de],
+ abbreviations: abbreviations
  )
  expect(pt.tokenize).to eq(["adj.", "smith", "how", "are", "ü.", "today", ".", "thisisnotanormalabbreviation.", "is", "it", "?"])
  end
@@ -240,9 +244,10 @@ describe PragmaticTokenizer do
  it 'expands user-supplied contractions' do
  text = "Hello supa'soo guy."
  contractions = { "supa'soo" => "super smooth" }
- pt = PragmaticTokenizer::Tokenizer.new(text,
- contractions: contractions,
- expand_contractions: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ contractions: contractions,
+ expand_contractions: true
  )
  expect(pt.tokenize).to eq(["hello", "super", "smooth", "guy", "."])
  end
@@ -250,29 +255,32 @@ describe PragmaticTokenizer do
  it 'does not expands user-supplied contractions' do
  text = "Hello supa'soo guy."
  contractions = { "supa'soo" => "super smooth" }
- pt = PragmaticTokenizer::Tokenizer.new(text,
- contractions: contractions,
- expand_contractions: false
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ contractions: contractions,
+ expand_contractions: false
  )
- expect(pt.tokenize).to eq( ["hello", "supa'soo", "guy", "."])
+ expect(pt.tokenize).to eq(["hello", "supa'soo", "guy", "."])
  end

  it 'expands user-supplied contractions and language contractions' do
  text = "Hello supa'soo guy. auf's wasn't it?"
  contractions = { "supa'soo" => "super smooth" }
- pt = PragmaticTokenizer::Tokenizer.new(text,
- contractions: contractions,
- expand_contractions: true,
- filter_languages: [:en, :de]
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ contractions: contractions,
+ expand_contractions: true,
+ filter_languages: [:en, :de]
  )
  expect(pt.tokenize).to eq(["hello", "super", "smooth", "guy", ".", "auf", "das", "was", "not", "it", "?"])
  end

  it 'expands language contractions' do
  text = "Hello supa'soo guy. auf's wasn't it?"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- expand_contractions: true,
- filter_languages: [:en, :de]
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ expand_contractions: true,
+ filter_languages: [:en, :de]
  )
  expect(pt.tokenize).to eq(["hello", "supa'soo", "guy", ".", "auf", "das", "was", "not", "it", "?"])
  end
@@ -280,8 +288,9 @@ describe PragmaticTokenizer do
  it 'tokenizes a string #001' do
  # https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en
  text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
- pt = PragmaticTokenizer::Tokenizer.new(text,
- expand_contractions: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ expand_contractions: true
  )
  expect(pt.tokenize).to eq(['"', 'i', 'said', ',', "'", 'what', 'are', 'you', '?', 'crazy', '?', "'", '"', 'said', 'sandowsky', '.', '"', 'i', 'cannot', 'afford', 'to', 'do', 'that', '.', '"'])
  end
@@ -289,24 +298,27 @@ describe PragmaticTokenizer do
  it 'tokenizes a string #002' do
  # http://nlp.stanford.edu/software/tokenizer.shtml
  text = "\"Oh, no,\" she's saying, \"our $400 blender can't handle something this hard!\""
- pt = PragmaticTokenizer::Tokenizer.new(text,
- expand_contractions: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ expand_contractions: true
  )
  expect(pt.tokenize).to eq(['"', 'oh', ',', 'no', ',', '"', 'she', 'is', 'saying', ',', '"', 'our', '$400', 'blender', 'cannot', 'handle', 'something', 'this', 'hard', '!', '"'])
  end

  it 'tokenizes a string #003' do
  text = "Look for his/her account."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- expand_contractions: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ expand_contractions: true
  )
  expect(pt.tokenize).to eq(["look", "for", "his", "her", "account", "."])
  end

  it 'tokenizes a string #004' do
  text = "I like apples and/or oranges."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- expand_contractions: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ expand_contractions: true
  )
  expect(pt.tokenize).to eq(["i", "like", "apples", "and", "or", "oranges", "."])
  end
@@ -315,8 +327,9 @@ describe PragmaticTokenizer do
  context 'option (emojis)' do
  it 'removes emoji' do
  text = "Return the emoji 👿😍😱🐔🌚. 🌚"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- remove_emoji: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ remove_emoji: true
  )
  expect(pt.tokenize).to eq(["return", "the", "emoji", "."])
  end
@@ -329,24 +342,27 @@ describe PragmaticTokenizer do

  it 'removes snowflakes 1' do
  text = "hello❄️❄️❄️"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- remove_emoji: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ remove_emoji: true
  )
  expect(pt.tokenize).to eq(["hello"])
  end

  it 'removes snowflakes 2' do
  text = "hello\u2744\uFE0E\u2744\uFE0E\u2744\uFE0E"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- remove_emoji: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ remove_emoji: true
  )
  expect(pt.tokenize).to eq(["hello"])
  end

  it 'removes snowflakes 3' do
  text = "hello\u2744\u2744\u2744"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- remove_emoji: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ remove_emoji: true
  )
  expect(pt.tokenize).to eq(["hello"])
  end
@@ -355,24 +371,27 @@ describe PragmaticTokenizer do
  context 'option (hashtags)' do
  it 'tokenizes a string #001' do
  text = "This is a #hashtag yay!"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- hashtags: :remove
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ hashtags: :remove
  )
  expect(pt.tokenize).to eq(["this", "is", "a", "yay", "!"])
  end

  it 'tokenizes a string #002' do
  text = "This is a #hashtag yay!"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- hashtags: :keep_and_clean
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ hashtags: :keep_and_clean
  )
  expect(pt.tokenize).to eq(["this", "is", "a", "hashtag", "yay", "!"])
  end

  it 'tokenizes a string #003' do
  text = "This is a #hashtag yay!"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- hashtags: :keep_original
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ hashtags: :keep_original
  )
  expect(pt.tokenize).to eq(["this", "is", "a", "#hashtag", "yay", "!"])
  end
@@ -381,24 +400,27 @@ describe PragmaticTokenizer do
  context 'option (mentions)' do
  it 'tokenizes a string #001' do
  text = "This is a @mention @mention2 yay!"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- mentions: :remove
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ mentions: :remove
  )
  expect(pt.tokenize).to eq(["this", "is", "a", "yay", "!"])
  end

  it 'tokenizes a string #002' do
  text = "This is a @mention @mention2 yay!"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- mentions: :keep_and_clean
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ mentions: :keep_and_clean
  )
  expect(pt.tokenize).to eq(["this", "is", "a", "mention", "mention2", "yay", "!"])
  end

  it 'tokenizes a string #003' do
  text = "This is a @mention @mention2 yay!"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- mentions: :keep_original
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ mentions: :keep_original
  )
  expect(pt.tokenize).to eq(["this", "is", "a", "@mention", "@mention2", "yay", "!"])
  end
@@ -407,8 +429,9 @@ describe PragmaticTokenizer do
  context 'option (email addresses)' do
  it 'tokenizes a string #001' do
  text = "Here are some emails jon@hotmail.com ben123@gmail.com."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- remove_emails: :true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ remove_emails: :true
  )
  expect(pt.tokenize).to eq(["here", "are", "some", "emails", "."])
  end
@@ -421,8 +444,9 @@ describe PragmaticTokenizer do

  it 'knows what is not an email address' do
  text = "the great cook.@someone something else@whoever"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- remove_emails: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ remove_emails: true
  )
  expect(pt.tokenize).to eq(["the", "great", "cook", ".", "@someone", "something", "else@whoever"])
  end
@@ -431,8 +455,9 @@ describe PragmaticTokenizer do
  context 'option (urls)' do
  it 'tokenizes a string #001' do
  text = "Here are some domains and urls google.com https://www.google.com www.google.com."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- remove_urls: :true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ remove_urls: :true
  )
  expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "www.google.com", "."])
  end
@@ -447,8 +472,9 @@ describe PragmaticTokenizer do
  context 'option (domains)' do
  it 'tokenizes a string #001' do
  text = "Here are some domains and urls google.com https://www.google.com www.google.com."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- remove_domains: :true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ remove_domains: :true
  )
  expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "https://www.google.com", "."])
  end
@@ -462,24 +488,27 @@ describe PragmaticTokenizer do
  it 'knows what is not a domain 1' do
  skip "NOT IMPLEMENTED"
  text = "this is a sentence.and no domain."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- remove_domains: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ remove_domains: true
  )
  expect(pt.tokenize).to eq(["this", "is", "a", "sentence", ".", "and", "no", "domain", "."])
  end

  it 'knows what is not a domain 2' do
  text = "former president g.w.bush was..."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- remove_domains: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ remove_domains: true
  )
  expect(pt.tokenize).to eq(["former", "president", "g.w.bush", "was", "..."])
  end

  it 'knows what is not a domain 3' do
  text = "2.something-times"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- remove_domains: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ remove_domains: true
  )
  expect(pt.tokenize).to eq(["2.something-times"])
  end
@@ -488,16 +517,18 @@ describe PragmaticTokenizer do
  context 'option (long_word_split)' do
  it 'tokenizes a string #001' do
  text = "Some main-categories of the mathematics-test have sub-examples that most 14-year olds can't answer, therefor the implementation-instruction made in the 1990-years needs to be revised."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- long_word_split: 10
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ long_word_split: 10
  )
  expect(pt.tokenize).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14-year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990-years", "needs", "to", "be", "revised", "."])
  end

  it 'tokenizes a string #002' do
  text = "Some main-categories of the mathematics-test have sub-examples that most 14-year olds can't answer, therefor the implementation-instruction made in the 1990-years needs to be revised."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- long_word_split: 4
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ long_word_split: 4
  )
  expect(pt.tokenize).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
  end
@@ -506,136 +537,153 @@ describe PragmaticTokenizer do
  context 'option (clean)' do
  it 'tokenizes a string #001' do
  text = "Hello ---------------."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
  expect(pt.tokenize).to eq(["hello", "."])
  end

  it 'tokenizes a string #002' do
  text = "Hello ____________________ ."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
  expect(pt.tokenize).to eq(["hello", "."])
  end

  it 'tokenizes a string #003' do
  text = "© ABC Company 1994"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
- expect(pt.tokenize).to eq(["abc", "company", "1994"])
+ expect(pt.tokenize).to eq(%w(abc company 1994))
  end

  it 'tokenizes a string #004' do
  text = "This sentence has a long string of dots ......................."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
- expect(pt.tokenize).to eq(["this", "sentence", "has", "a", "long", "string", "of", "dots"])
+ expect(pt.tokenize).to eq(%w(this sentence has a long string of dots))
  end

  it 'tokenizes a string #005' do
  text = "cnn.com mentions this *funny* #hashtag used by @obama http://cnn.com/something"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
  expect(pt.tokenize).to eq(["cnn.com", "mentions", "this", "funny", "#hashtag", "used", "by", "@obama", "http://cnn.com/something"])
  end

  it 'does not remove a valid hashtag' do
  text = "This #sentence has a long string of dots ......................."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
  expect(pt.tokenize).to eq(["this", "#sentence", "has", "a", "long", "string", "of", "dots"])
  end

  it 'does not remove a valid mention' do
  text = "This @sentence has a long string of dots ......................."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
  expect(pt.tokenize).to eq(["this", "@sentence", "has", "a", "long", "string", "of", "dots"])
  end

  it 'cleans words with symbols 1' do
  text = "something.com:article title !!wow look!!1"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
  expect(pt.tokenize).to eq(["something.com", "article", "title", "wow", "look"])
  end

  it 'cleans words with symbols 2' do
  text = "something.com:article title !!wow look!!1!1!11!"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
  expect(pt.tokenize).to eq(["something.com", "article", "title", "wow", "look"])
  end

  it 'identifies prefixed symbols' do
  text = "look:the sky is blue"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
- expect(pt.tokenize).to eq(["look", "the", "sky", "is", "blue"])
+ expect(pt.tokenize).to eq(%w(look the sky is blue))
  end

  it 'keeps numbers at the end of mentions and hashtags' do
  text = "#le1101 #artistQ21 @someone12 @someoneelse1 and @somebody1980"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
  expect(pt.tokenize).to eq(["#le1101", "#artistq21", "@someone12", "@someoneelse1", "and", "@somebody1980"])
  end

  it 'cleans a prefixed weird hyphen' do
  text = [104, 105, 103, 104, 32, 173, 116, 101, 109, 112, 101, 114, 97, 116, 117, 114, 101, 32, 97, 110, 100, 32, 173, 119, 105, 110, 100].pack("U*")
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
- expect(pt.tokenize).to eq(["high", "temperature", "and", "wind"])
+ expect(pt.tokenize).to eq(%w(high temperature and wind))
  end

  it 'cleans (r) and (c) and (tm)' do
  text = "the oscar® night ©companyname is a trademark™"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
- expect(pt.tokenize).to eq(["the", "oscar", "night", "companyname", "is", "a", "trademark"])
+ expect(pt.tokenize).to eq(%w(the oscar night companyname is a trademark))
  end

  it 'cleans letters in boxes 1' do
  text = "making🇦🇹postcards"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
- expect(pt.tokenize).to eq(["making", "postcards"])
+ expect(pt.tokenize).to eq(%w(making postcards))
  end

  it 'removes colons' do
  text = "At 19:30 o'clock: Mad Max: Fury Road"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
  expect(pt.tokenize).to eq(["at", "19:30", "o'clock", "mad", "max", "fury", "road"])
  end

  it 'removes a hyphen prefix 3' do
  text = "women's clothes and –shoes needed"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
  expect(pt.tokenize).to eq(["women's", "clothes", "and", "shoes", "needed"])
  end

  it 'does not remove tokens with ampersands' do
  text = "you&me"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true
  )
  expect(pt.tokenize).to eq(["you", "&", "me"])
  end
@@ -645,8 +693,9 @@ describe PragmaticTokenizer do
  it 'tokenizes a string #001' do
  # http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
  text = "I.B.M. cat's can't"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- classic_filter: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ classic_filter: true
  )
  expect(pt.tokenize).to eq(["ibm", "cat", "can't"])
  end
@@ -654,34 +703,38 @@ describe PragmaticTokenizer do
  it 'tokenizes a string #002' do
  # http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
  text = "St.Veit, which usually would be written St. Veit was not visited by B.Obama reported CNN.com"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- classic_filter: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ classic_filter: true
  )
  expect(pt.tokenize).to eq(["st.veit", ",", "which", "usually", "would", "be", "written", "st", "veit", "was", "not", "visited", "by", "b.obama", "reported", "cnn.com"])
  end

  it 'optimizes the classic filter' do
  text = "therés something"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- classic_filter: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ classic_filter: true
  )
- expect(pt.tokenize).to eq(["there", "something"])
+ expect(pt.tokenize).to eq(%w(there something))
  end

  it 'optimizes the classic filter' do
  text = [116, 104, 101, 114, 101, 32, 769, 115, 32, 115, 111, 109, 101, 116, 104, 105, 110, 103].pack("U*")
- pt = PragmaticTokenizer::Tokenizer.new(text,
- classic_filter: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ classic_filter: true
  )
- expect(pt.tokenize).to eq(["there", "something"])
+ expect(pt.tokenize).to eq(%w(there something))
  end
  end

  context 'option (language)' do
  it 'tokenizes a string #001' do
  text = "Hello Ms. Piggy, this is John. We are selling a new fridge for $5,000. That is a 20% discount over the Nev. retailers. It is a 'MUST BUY', so don't hesistate."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'en'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'en'
  )
  expect(pt.tokenize).to eq(["hello", "ms.", "piggy", ",", "this", "is", "john", ".", "we", "are", "selling", "a", "new", "fridge", "for", "$5,000", ".", "that", "is", "a", "20%", "discount", "over", "the", "nev.", "retailers", ".", "it", "is", "a", "'", "must", "buy", "'", ",", "so", "don't", "hesistate", "."])
  end
@@ -697,8 +750,9 @@ describe PragmaticTokenizer do
  to the consequences for research and innovation or the public interest.\'
  Says Ms. Raines, \'[The judgement] confirms our concern that the absence of
  patent lawyers on the court could prove troublesome.\'"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'en'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'en'
  )
  expect(pt.tokenize).to eq(['lisa', 'raines', ',', 'a', 'lawyer', 'and', 'director', 'of', 'government', 'relations', 'for', 'the', 'industrial', 'biotechnical', 'association', ',', 'contends', 'that', 'a', 'judge', 'well-versed', 'in', 'patent', 'law', 'and', 'the', 'concerns', 'of', 'research-based', 'industries', 'would', 'have', 'ruled', 'otherwise', '.', 'and', 'judge', 'newman', ',', 'a', 'former', 'patent', 'lawyer', ',', 'wrote', 'in', 'her', 'dissent', 'when', 'the', 'court', 'denied', 'a', 'motion', 'for', 'a', 'rehearing', 'of', 'the', 'case', 'by', 'the', 'full', 'court', ',', "\'", 'the', "panel's", 'judicial', 'legislation', 'has', 'affected', 'an', 'important', 'high-technological', 'industry', ',', 'without', 'regard', 'to', 'the', 'consequences', 'for', 'research', 'and', 'innovation', 'or', 'the', 'public', 'interest', '.', '\'', 'says', 'ms.', 'raines', ',', '\'', '[', 'the', 'judgement', ']', 'confirms', 'our', 'concern', 'that', 'the', 'absence', 'of', 'patent', 'lawyers', 'on', 'the', 'court', 'could', 'prove', 'troublesome', '.', "\'"])
  end
@@ -707,56 +761,63 @@ describe PragmaticTokenizer do
  context 'option (numbers)' do
  it 'tokenizes a string #001' do
  text = "Hello, that will be $5 dollars. You can pay at 5:00, after it is 500."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- numbers: :all
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ numbers: :all
  )
  expect(pt.tokenize).to eq(["hello", ",", "that", "will", "be", "$5", "dollars", ".", "you", "can", "pay", "at", "5:00", ",", "after", "it", "is", "500", "."])
  end

  it 'tokenizes a string #002' do
  text = "Hello, that will be $5 dollars. You can pay at 5:00, after it is 500."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- numbers: :none
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ numbers: :none
  )
  expect(pt.tokenize).to eq(["hello", ",", "that", "will", "be", "dollars", ".", "you", "can", "pay", "at", ",", "after", "it", "is", "."])
  end

  it 'tokenizes a string #003' do
  text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- numbers: :semi
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ numbers: :semi
  )
  expect(pt.tokenize).to eq(["2pac", "u2", "50cent", "blink-182", "$500", "zero7", "m83", "b-52s"])
  end

  it 'tokenizes a string #004' do
  text = "2pac U2 50cent blink-182 zero7 M83 B-52s 500 Hello"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- numbers: :only
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ numbers: :only
  )
  expect(pt.tokenize).to eq(["2pac", "u2", "50cent", "blink-182", "zero7", "m83", "b-52s", "500"])
  end

  it 'tokenizes a string #005' do
  text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- numbers: :none
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ numbers: :none
  )
  expect(pt.tokenize).to eq([])
  end

  it 'tokenizes a string #005' do
  text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500 number iv VI"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- numbers: :none
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ numbers: :none
  )
  expect(pt.tokenize).to eq(["number"])
  end

  it 'tokenizes a string #006' do
  text = "Remove III Roman Numerals and IX. with a period."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- numbers: :none
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ numbers: :none
  )
  expect(pt.tokenize).to eq(["remove", "roman", "numerals", "and", ".", "with", "a", "period", "."])
  end
@@ -765,8 +826,9 @@ describe PragmaticTokenizer do
  context 'option (minimum_length)' do
  it 'tokenizes a string #001' do
  text = "Let's test the minimum length of fiver."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- minimum_length: 5
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ minimum_length: 5
  )
  expect(pt.tokenize).to eq(["let's", "minimum", "length", "fiver"])
  end
@@ -775,241 +837,271 @@ describe PragmaticTokenizer do
  context 'option (punctuation)' do
  it 'tokenizes a string #001' do
  text = "kath. / evang"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["kath", "evang"])
+ expect(pt.tokenize).to eq(%w(kath evang))
  end

  it 'tokenizes a string #002' do
  text = "derStandard.at › Sport"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["derstandard.at", "sport"])
  end

  it 'tokenizes a string #003' do
  text = "hello ^^"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["hello"])
  end

  it 'tokenizes a string #004' do
  text = "This hyphen – is not...or is it? ... It's a - dash... And a horizontal ellipsis…"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["this", "hyphen", "is", "not", "or", "is", "it", "it's", "a", "dash", "and", "a", "horizontal", "ellipsis"])
  end

  it 'tokenizes a string #005' do
  text = "A sentence. One with two dots.. And with three... Or horizontal ellipsis… which are three dots too."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["a", "sentence", "one", "with", "two", "dots", "and", "with", "three", "or", "horizontal", "ellipsis", "which", "are", "three", "dots", "too"])
+ expect(pt.tokenize).to eq(%w(a sentence one with two dots and with three or horizontal ellipsis which are three dots too))
  end

  it 'tokenizes a string #006' do
  text = "+++ BREAKING +++ something happened; is it interesting?"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["breaking", "something", "happened", "is", "it", "interesting"])
+ expect(pt.tokenize).to eq(%w(breaking something happened is it interesting))
  end

  it 'tokenizes a string #007' do
  text = "Some *interesting stuff* is __happening here__"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["some", "*interesting", "stuff*", "is", "__happening", "here__"])
  end

  it 'tokenizes a string #008' do
  text = "Hello; what is your: name @username **delete**"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["hello", "what", "is", "your", "name", "@username", "**delete**"])
  end

  it 'tokenizes a string #009' do
  text = "hello ;-) yes"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: :none
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: :none
  )
- expect(pt.tokenize).to eq(["hello", "yes"])
+ expect(pt.tokenize).to eq(%w(hello yes))
  end

  it 'tokenizes a string #010' do
  text = "hello ;)"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["hello"])
  end

  it 'tokenizes a string #011' do
  text = "Hello ____________________ ."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: :none
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: :none
  )
  expect(pt.tokenize).to eq(["hello"])
  end

  it 'handles non-domain words with a dot 1' do
  text = "They were being helped.This is solidarity."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
- )
- expect(pt.tokenize).to eq(["they", "were", "being", "helped", "this", "is", "solidarity"])
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
+ )
+ expect(pt.tokenize).to eq(%w(they were being helped this is solidarity))
  end

  it 'handles non-domain words with a dot 2' do
  text = "picture was taken in sept.2015"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["picture", "was", "taken", "in", "sept.", "2015"])
  end

  it 'handles non-domain words with a dot 3' do
  text = "They were being helped.This is solidarity. See the breaking news stories about X on cnn.com/europe and english.alarabiya.net, here’s a screenshot: https://t.co/s83k28f29d31s83"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["they", "were", "being", "helped", "this", "is", "solidarity", "see", "the", "breaking", "news", "stories", "about", "x", "on", "cnn.com", "europe", "and", "english.alarabiya.net", "here’s", "a", "screenshot", "https://t.co/s83k28f29d31s83"])
  end

  it 'handles numbers with symbols 1' do
  text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals"])
  end

  it 'handles numbers with symbols 2' do
  text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals"])
  end

  it 'handles apostrophes and quotes' do
  text = "“Data Visualization: How to Tell Stories with Data — Jeff Korhan” by @AINewsletter"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["data", "visualization", "how", "to", "tell", "stories", "with", "data", "jeff", "korhan", "by", "@ainewsletter"])
  end

  it 'handles mentions' do
  text = ".@someone I disagree"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["@someone", "i", "disagree"])
  end

  it 'handles old school emoticons 2' do
  text = "oooh! <3"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["oooh", "<3"])
  end

  it 'handles old school emoticons 3' do
  text = "@someone &lt;33"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["@someone", "<33"])
  end

  it 'handles words with a symbol prefix 1' do
  text = "Yes! /cc @someone"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["yes", "cc", "@someone"])
  end

  it 'handles words with a emoji suffix' do
  text = "Let's meet there.😝 ok?"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["let's", "meet", "there", "😝", "ok"])
  end

  it 'handles words with a symbol prefix 2' do
  text = "blah blah |photo by @someone"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["blah", "blah", "photo", "by", "@someone"])
  end

  it 'handles pseudo-contractions' do
  text = "I suggest to buy stocks that are low value+have momentum"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["i", "suggest", "to", "buy", "stocks", "that", "are", "low", "value", "have", "momentum"])
+ expect(pt.tokenize).to eq(%w(i suggest to buy stocks that are low value have momentum))
  end

  it 'handles apostrophes and quotes 1' do
  text = "Watch the video of @amandapalmer's song “Killing Type” here"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["watch", "the", "video", "of", "@amandapalmer's", "song", "killing", "type", "here"])
  end

- it 'handles apostrophes and quotes 2' do
+ it 'handles apostrophes and quotes 2' do
  text = "Watch the video of @amandapalmer`s song “Killing Type” here"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["watch", "the", "video", "of", "@amandapalmer`s", "song", "killing", "type", "here"])
  end

  it 'handles numbers suffixed with a symbol' do
  text = "4 Things Marketers Must Do Better in 2016: blah"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["4", "things", "marketers", "must", "do", "better", "in", "2016", "blah"])
+ expect(pt.tokenize).to eq(%w(4 things marketers must do better in 2016 blah))
  end

  it 'handles words with a emoticon suffix' do
  skip "NOT IMPLEMENTED"
  text = "look, a dog with shoes☺ !!"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["look", "a", "dog", "with", "shoes", "☺"])
  end

  it 'handles emoji 1' do
  text = "How bad!😝"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["how", "bad", "😝"])
  end

  it 'handles emoji 2' do
  text = "😝How bad!"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["😝", "how", "bad"])
  end
@@ -1017,16 +1109,18 @@ describe PragmaticTokenizer do
  it 'identifies old school emoticons' do
  skip "NOT IMPLEMENTED"
  text = 'looking forward to the new kodak super8 camera \o/'
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["looking", "forward", "to", "the", "new", "kodak", "super8", "camera", '\o/'])
  end

  it 'splits at hashtags' do
  text = "some sentence#RT ... i like u2.#bono"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: :none
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: :none
  )
  expect(pt.tokenize).to eq(["some", "sentence", "#rt", "i", "like", "u2", "#bono"])
  end
@@ -1035,41 +1129,45 @@ describe PragmaticTokenizer do
  context 'option (remove_stop_words)' do
  it 'removes stop words' do
  text = 'This is a short sentence with explanations and stop words.'
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'en',
- remove_stop_words: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'en',
+ remove_stop_words: true
  )
  expect(pt.tokenize).to eq(["short", "sentence", "explanations", "."])
  end

  it 'removes user-supplied stop words' do
  text = 'This is a short sentence with explanations and stop words.'
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'en',
- remove_stop_words: true,
- stop_words: ["and", "a"]
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'en',
+ remove_stop_words: true,
+ stop_words: %w(and a)
  )
  expect(pt.tokenize).to eq(["this", "is", "short", "sentence", "with", "explanations", "stop", "words", "."])
  end

  it 'removes user-supplied stop words and default stop words' do
  text = 'This is a short sentence with explanations and stop words.'
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'en',
- remove_stop_words: true,
- stop_words: ["sentence"],
- filter_languages: [:en]
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'en',
+ remove_stop_words: true,
+ stop_words: ["sentence"],
+ filter_languages: [:en]
  )
  expect(pt.tokenize).to eq(["short", "explanations", "."])
  end

  it 'removes user-supplied stop words and default stop words across multiple languages' do
  text = 'This is a short sentence with explanations and stop words. And achte German words.'
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'en',
- remove_stop_words: true,
- stop_words: ["sentence"],
- filter_languages: [:en, :de]
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'en',
+ remove_stop_words: true,
+ stop_words: ["sentence"],
+ filter_languages: [:en, :de]
  )
  expect(pt.tokenize).to eq(["short", "explanations", ".", "german", "."])
  end
@@ -1078,27 +1176,30 @@ describe PragmaticTokenizer do
  context 'multiple options selected' do
  it 'tokenizes a string #001' do
  text = 'His name is Mr. Smith.'
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'en',
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'en',
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(['his', 'name', 'is', 'mr.', 'smith'])
  end

  it 'tokenizes a string #002' do
  text = "Hello Ms. Piggy, this is John. We are selling a new fridge for $5,000. That is a 20% discount over the Nev. retailers. It is a 'MUST BUY', so don't hesistate."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'en',
- punctuation: 'only'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'en',
+ punctuation: 'only'
  )
  expect(pt.tokenize).to eq([",", ".", ".", ".", "'", "'", ",", "."])
  end

  it 'tokenizes a string #003' do
  text = "Hello the a it experiment one fine."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'en',
- remove_stop_words: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'en',
+ remove_stop_words: true
  )
  expect(pt.tokenize).to eq(["experiment", "fine", "."])
  end
@@ -1106,214 +1207,235 @@ describe PragmaticTokenizer do
  it 'tokenizes a string #004' do
  # https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en
  text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
- pt = PragmaticTokenizer::Tokenizer.new(text,
- expand_contractions: true,
- remove_stop_words: true,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ expand_contractions: true,
+ remove_stop_words: true,
+ punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["crazy", "sandowsky", "afford"])
+ expect(pt.tokenize).to eq(%w(crazy sandowsky afford))
  end

  it 'tokenizes a string #005' do
  text = "Hello world with a stop word experiment."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'en',
- clean: true,
- numbers: :none,
- minimum_length: 3,
- expand_contractions: true,
- remove_stop_words: true,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'en',
+ clean: true,
+ numbers: :none,
+ minimum_length: 3,
+ expand_contractions: true,
+ remove_stop_words: true,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["experiment"])
  end

  it 'tokenizes a string #006' do
  text = "Hello; what is your: name @username **delete**"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["hello", "what", "is", "your", "name", "@username", "delete"])
  end

  it 'tokenizes a string #007' do
  text = 'His name is Mr. Smith.'
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'en',
- punctuation: 'none',
- downcase: false
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'en',
+ punctuation: 'none',
+ downcase: false
  )
  expect(pt.tokenize).to eq(['His', 'name', 'is', 'Mr.', 'Smith'])
  end

  it 'tokenizes a string #008' do
  text = "Can't go tonight. Didn't finish."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- downcase: false,
- expand_contractions: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ downcase: false,
+ expand_contractions: true
  )
  expect(pt.tokenize).to eq(["Cannot", "go", "tonight", ".", "Did", "not", "finish", "."])
  end

  it 'tokenizes a string #009' do
  text = "Some *interesting stuff* is __happening here__"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ clean: true
  )
- expect(pt.tokenize).to eq(["some", "interesting", "stuff", "is", "happening", "here"])
+ expect(pt.tokenize).to eq(%w(some interesting stuff is happening here))
  end

  it 'also allows symbols for options' do
  text = 'His name is Mr. Smith.'
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: :en,
- punctuation: :none
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: :en,
+ punctuation: :none
  )
  expect(pt.tokenize).to eq(['his', 'name', 'is', 'mr.', 'smith'])
  end

  it 'handles long strings 1' do
  text = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'en',
- clean: true,
- minimum_length: 3,
- expand_contractions: true,
- remove_stop_words: true,
- numbers: :none,
- punctuation: :none
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'en',
+ clean: true,
+ minimum_length: 3,
+ expand_contractions: true,
+ remove_stop_words: true,
+ numbers: :none,
+ punctuation: :none
  )
  expect(pt.tokenize).to eq(["jonas", "jonas", "smith", "jonas", "smith", "turn", "jane", "party", "closed", "deal", "pitt", "briggs", "noon", "jane", "closed", "deal", "pitt", "briggs", "closed", "yesterday", "mt.", "fuji", "st.", "michael's", "church", "st.", "light", "jfk", "jr.", "book", "visited", "u.s.a.", "year", "live", "e.u.", "live", "u.s.", "work", "u.s.", "government", "virginia", "lived", "u.s.", "years", "bag", "bag", "teaches", "science", "worked", "years", "engineer", "local", "university", "email", "jane.doe@example.com", "email", "site", "check", "turned", "great", "turned", "great", "turned", "great", "held", "book", "long", "time", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "list", "item", "sentence", "cut", "middle", "pdf", "cold", "night", "city", "features", "contact", "manager", "events", "activities", "treasure", "works", "yahoo", "accounting", "department", "good", "team", "albert", "jones", "yesterday", "thoreau", "argues", "simplifying", "one’s", "life", "laws", "universe", "complex", "bohr", "analogy", "parallel", "stairways", "smith", "left", "sentence", "omission", "ellipsis", "marks", "preceded", "space", "sentence", "period", "sentence", "meant", "left", "store", "habit", "weakened", "combining", "self-interpreting", "compounds", "practice", "abandoned"])
  end

  it 'handles long strings 2' do
  text = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 10
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'en',
- clean: true,
- minimum_length: 3,
- expand_contractions: true,
- remove_stop_words: true,
- numbers: :none,
- punctuation: :none
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'en',
+ clean: true,
+ minimum_length: 3,
+ expand_contractions: true,
+ remove_stop_words: true,
+ numbers: :none,
+ punctuation: :none
  )
  expect(pt.tokenize).to eq(["jonas", "jonas", "smith", "jonas", "smith", "turn", "jane", "party", "closed", "deal", "pitt", "briggs", "noon", "jane", "closed", "deal", "pitt", "briggs", "closed", "yesterday", "mt.", "fuji", "st.", "michael's", "church", "st.", "light", "jfk", "jr.", "book", "visited", "u.s.a.", "year", "live", "e.u.", "live", "u.s.", "work", "u.s.", "government", "virginia", "lived", "u.s.", "years", "bag", "bag", "teaches", "science", "worked", "years", "engineer", "local", "university", "email", "jane.doe@example.com", "email", "site", "check", "turned", "great", "turned", "great", "turned", "great", "held", "book", "long", "time", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "list", "item", "sentence", "cut", "middle", "pdf", "cold", "night", "city", "features", "contact", "manager", "events", "activities", "treasure", "works", "yahoo", "accounting", "department", "good", "team", "albert", "jones", "yesterday", "thoreau", "argues", "simplifying", "one’s", "life", "laws", "universe", "complex", "bohr", "analogy", "parallel", "stairways", "smith", "left", "sentence", "omission", "ellipsis", "marks", "preceded", "space", "sentence", "period", "sentence", "meant", "left", "store", "habit", "weakened", "combining", "self-interpreting", "compounds", "practice", "abandoned"] * 10)
  end

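The two long-string specs above combine most of the tokenizer's options; a condensed sketch of the same option set applied to a single sentence taken from that input (expected tokens per the long-string assertion above):

    require 'pragmatic_tokenizer'

    pt = PragmaticTokenizer::Tokenizer.new(
      "I can't see Mt. Fuji from here.",
      language:            'en',
      clean:               true,
      minimum_length:      3,
      expand_contractions: true,
      remove_stop_words:   true,
      numbers:             :none,
      punctuation:         :none
    )
    pt.tokenize
    # => ["mt.", "fuji"] (the contraction is expanded, then stop words,
    #    short tokens, numbers, and punctuation are removed)
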
  it 'handles markdown' do
  text = "This is _bold_ and this is *italic*"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ clean: true
  )
- expect(pt.tokenize).to eq(["this", "is", "bold", "and", "this", "is", "italic"])
+ expect(pt.tokenize).to eq(%w(this is bold and this is italic))
  end

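A standalone sketch of the markdown spec above, showing that punctuation: 'none' plus clean: true strips emphasis markers:

    require 'pragmatic_tokenizer'

    pt = PragmaticTokenizer::Tokenizer.new(
      "This is _bold_ and this is *italic*",
      punctuation: 'none',
      clean:       true
    )
    pt.tokenize
    # => ["this", "is", "bold", "and", "this", "is", "italic"]
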
  it 'handles single quotes' do
  text = "Recognised as one of the ‘good’ games."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'en',
- clean: true,
- numbers: :none,
- minimum_length: 3,
- expand_contractions: true,
- remove_stop_words: true,
- punctuation: :none,
- downcase: true)
- expect(pt.tokenize).to eq(["recognised", "good", "games"])
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'en',
+ clean: true,
+ numbers: :none,
+ minimum_length: 3,
+ expand_contractions: true,
+ remove_stop_words: true,
+ punctuation: :none,
+ downcase: true)
+ expect(pt.tokenize).to eq(%w(recognised good games))
  end

  it 'removes control characters' do
  text = "\u0000 \u001F \u007FHello test."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'en',
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'en',
+ clean: true
  )
  expect(pt.tokenize).to eq(["hello", "test", "."])
  end

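The same behavior outside RSpec (a sketch; output per the assertion above). Note that clean: true strips the control characters while the sentence-final period survives, since punctuation removal is not requested here:

    require 'pragmatic_tokenizer'

    pt = PragmaticTokenizer::Tokenizer.new("\u0000 \u001F \u007FHello test.", language: 'en', clean: true)
    pt.tokenize
    # => ["hello", "test", "."]
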
  it 'splits too long words with hypens' do
  text = "hi-hat and old-school but not really-important-long-word"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- long_word_split: 12
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ long_word_split: 12
  )
  expect(pt.tokenize).to eq(["hi-hat", "and", "old-school", "but", "not", "really", "important", "long", "word"])
  end

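Reading the spec above: long_word_split: 12 splits hyphenated tokens longer than 12 characters at their hyphens, while shorter compounds such as 'hi-hat' and 'old-school' stay whole. A sketch:

    require 'pragmatic_tokenizer'

    pt = PragmaticTokenizer::Tokenizer.new(
      "hi-hat and old-school but not really-important-long-word",
      punctuation:     'none',
      long_word_split: 12
    )
    pt.tokenize
    # => ["hi-hat", "and", "old-school", "but", "not", "really", "important", "long", "word"]
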
  it 'handles hashtags 2' do
  text = "This is the #upper-#limit"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- hashtags: :keep_and_clean
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ hashtags: :keep_and_clean
  )
- expect(pt.tokenize).to eq(["this", "is", "the", "upper", "limit"])
+ expect(pt.tokenize).to eq(%w(this is the upper limit))
  end

  it 'handles hashtags 3' do
  text = "The #2016-fun has just begun."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- hashtags: :keep_and_clean
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ hashtags: :keep_and_clean
  )
- expect(pt.tokenize).to eq(["the", "2016", "fun", "has", "just", "begun"])
+ expect(pt.tokenize).to eq(%w(the 2016 fun has just begun))
  end

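Per the two hashtag specs above, hashtags: :keep_and_clean keeps the hashtag's content but drops the # sign and splits on internal hyphens. A sketch:

    require 'pragmatic_tokenizer'

    pt = PragmaticTokenizer::Tokenizer.new(
      "The #2016-fun has just begun.",
      punctuation: 'none',
      hashtags:    :keep_and_clean
    )
    pt.tokenize
    # => ["the", "2016", "fun", "has", "just", "begun"]
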
  it 'does not clean mentions' do
  text = "@_someone_ because @someone and @_someone was taken"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- mentions: :keep_original,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ mentions: :keep_original,
+ clean: true
  )
  expect(pt.tokenize).to eq(["@_someone_", "because", "@someone", "and", "@_someone", "was", "taken"])
  end

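Per the spec above, mentions: :keep_original preserves @-handles verbatim, underscores included, even with clean: true enabled. A sketch:

    require 'pragmatic_tokenizer'

    pt = PragmaticTokenizer::Tokenizer.new(
      "@_someone_ because @someone and @_someone was taken",
      mentions: :keep_original,
      clean:    true
    )
    pt.tokenize
    # => ["@_someone_", "because", "@someone", "and", "@_someone", "was", "taken"]
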
  it 'removes double single quotes' do
  text = "Strong statement in ''The Day The Earth Caught Fire'' (1961)"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: :none,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: :none,
+ clean: true
  )
- expect(pt.tokenize).to eq(["strong", "statement", "in", "the", "day", "the", "earth", "caught", "fire", "1961"])
+ expect(pt.tokenize).to eq(%w(strong statement in the day the earth caught fire 1961))
  end

  it 'removes a hyphen prefix 1' do
  text = "Geopol.-Strategy"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: :none,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: :none,
+ clean: true
  )
- expect(pt.tokenize).to eq(["geopol", "strategy"])
+ expect(pt.tokenize).to eq(%w(geopol strategy))
  end

  it 'removes a hyphen prefix 2' do
  text = "The language we use creates the reality we experience.-Michael Hyatt #quote"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: :none,
- clean: true
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: :none,
+ clean: true
  )
  expect(pt.tokenize).to eq(["the", "language", "we", "use", "creates", "the", "reality", "we", "experience", "michael", "hyatt", "#quote"])
  end

  it 'does not remove tokens with ampersands' do
  text = "you&amp;me"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true,
- punctuation: :none
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true,
+ punctuation: :none
  )
- expect(pt.tokenize).to eq(["you", "me"])
+ expect(pt.tokenize).to eq(%w(you me))
  end

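Per the spec above, a token containing the HTML entity &amp; is not dropped wholesale; with clean: true the entity acts as a separator between the surrounding words. A sketch:

    require 'pragmatic_tokenizer'

    pt = PragmaticTokenizer::Tokenizer.new("you&amp;me", clean: true, punctuation: :none)
    pt.tokenize
    # => ["you", "me"]
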
  it 'cleans percent signs not related to numbers' do
  text = "TudoW%1 provides company users a way to offer each other, and guests, and interpreters%6 free assistance. To date, there have been %2 questions asked."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- clean: true,
- numbers: :none,
- punctuation: :none
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ clean: true,
+ numbers: :none,
+ punctuation: :none
  )
- expect(pt.tokenize).to eq(["tudow", "provides", "company", "users", "a", "way", "to", "offer", "each", "other", "and", "guests", "and", "interpreters", "free", "assistance", "to", "date", "there", "have", "been", "questions", "asked"])
+ expect(pt.tokenize).to eq(%w(tudow provides company users a way to offer each other and guests and interpreters free assistance to date there have been questions asked))
  end
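
A sketch of the percent-sign cleanup above: clean: true and numbers: :none together strip the %N markers attached to words and drop the free-standing %2, so no numeric debris survives:

    require 'pragmatic_tokenizer'

    text = "TudoW%1 provides company users a way to offer each other, and guests, and interpreters%6 free assistance. To date, there have been %2 questions asked."
    pt = PragmaticTokenizer::Tokenizer.new(text, clean: true, numbers: :none, punctuation: :none)
    pt.tokenize.first(4)
    # => ["tudow", "provides", "company", "users"] (full list per the assertion above)
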
  end
  end
@@ -1473,8 +1595,9 @@ describe PragmaticTokenizer do
  it 'handles empty tokens' do
  text = "!!!!! https://t.co/xxxx"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none'
  )
  expect(pt.tokenize).to eq(["https://t.co/xxxx"])
  end
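
A sketch of the empty-token guarantee above: with punctuation: 'none', a pure-punctuation run such as "!!!!!" yields no empty placeholder tokens, leaving only the URL:

    require 'pragmatic_tokenizer'

    pt = PragmaticTokenizer::Tokenizer.new("!!!!! https://t.co/xxxx", punctuation: 'none')
    pt.tokenize
    # => ["https://t.co/xxxx"]
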
@@ -1532,4 +1655,4 @@ describe PragmaticTokenizer do
  end
  end
  end
- end
+ end