pragmatic_tokenizer 1.4.0 → 1.5.0
- checksums.yaml +4 -4
- data/.rubocop.yml +184 -0
- data/.rubocop_todo.yml +66 -0
- data/README.md +0 -7
- data/Rakefile +1 -1
- data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
- data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
- data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages.rb +28 -28
- data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
- data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
- data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +1 -0
- data/spec/languages/bulgarian_spec.rb +17 -13
- data/spec/languages/deutsch_spec.rb +110 -86
- data/spec/languages/english_spec.rb +465 -342
- data/spec/languages/french_spec.rb +3 -2
- data/spec/performance_spec.rb +7 -7
- data/spec/pragmatic_tokenizer_spec.rb +8 -8
- metadata +18 -2
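The spec diff below exercises the tokenizer through keyword options passed to the constructor. As a minimal sketch of that call style (the option names, values, and sample sentence are taken from the spec diff below; the surrounding script is illustrative only):

    require 'pragmatic_tokenizer'

    text = "Hello Ms. Piggy, this is John. We are selling a new fridge for $5,000."

    # Each option is a keyword argument given after the text, one per line,
    # matching the layout the updated specs use.
    pt = PragmaticTokenizer::Tokenizer.new(
        text,
        language:            'en',
        expand_contractions: true,
        remove_stop_words:   true,
        punctuation:         :none
    )

    puts pt.tokenize.inspect   # => an array of lowercased token strings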
@@ -3,7 +3,6 @@ require 'spec_helper'
|
|
3
3
|
describe PragmaticTokenizer do
|
4
4
|
context 'Language: English (en)' do
|
5
5
|
context '#tokenize (example strings)' do
|
6
|
-
|
7
6
|
context 'no options selected' do
|
8
7
|
it 'tokenizes a string #001' do
|
9
8
|
text = "Hello world."
|
@@ -171,7 +170,7 @@ describe PragmaticTokenizer do
|
|
171
170
|
text = "U.S.A. U.S.A. U.S.A."
|
172
171
|
pt = PragmaticTokenizer::Tokenizer.new(text)
|
173
172
|
expect(pt.tokenize).to eq(
|
174
|
-
|
173
|
+
["u.s.a.", "u.s.a.", "u.s.a."]
|
175
174
|
)
|
176
175
|
end
|
177
176
|
end
|
@@ -186,8 +185,9 @@ describe PragmaticTokenizer do
|
|
186
185
|
it 'fails to recognize an English abbreviation if the user supplies an abbreviations array without it' do
|
187
186
|
text = "Mr. Smith, hello world."
|
188
187
|
abbreviations = ['mrs']
|
189
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
190
|
-
|
188
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
189
|
+
text,
|
190
|
+
abbreviations: abbreviations
|
191
191
|
)
|
192
192
|
expect(pt.tokenize).to eq(["mr", ".", "smith", ",", "hello", "world", "."])
|
193
193
|
end
|
@@ -195,8 +195,9 @@ describe PragmaticTokenizer do
|
|
195
195
|
it 'recognizes a user-supplied abbreviation' do
|
196
196
|
text = "thisisnotanormalabbreviation. hello world."
|
197
197
|
abbreviations = ['thisisnotanormalabbreviation']
|
198
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
199
|
-
|
198
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
199
|
+
text,
|
200
|
+
abbreviations: abbreviations
|
200
201
|
)
|
201
202
|
expect(pt.tokenize).to eq(["thisisnotanormalabbreviation.", "hello", "world", "."])
|
202
203
|
end
|
@@ -204,16 +205,18 @@ describe PragmaticTokenizer do
|
|
204
205
|
it 'handles an empty user-supplied abbreviation array' do
|
205
206
|
text = "thisisnotanormalabbreviation. hello world."
|
206
207
|
abbreviations = []
|
207
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
208
|
-
|
208
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
209
|
+
text,
|
210
|
+
abbreviations: abbreviations
|
209
211
|
)
|
210
212
|
expect(pt.tokenize).to eq(["thisisnotanormalabbreviation", ".", "hello", "world", "."])
|
211
213
|
end
|
212
214
|
|
213
215
|
it 'handles abrreviations across multiple languages' do
|
214
216
|
text = "Mr. Smith how are ü. today."
|
215
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
216
|
-
|
217
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
218
|
+
text,
|
219
|
+
filter_languages: [:en, :de]
|
217
220
|
)
|
218
221
|
expect(pt.tokenize).to eq(["mr.", "smith", "how", "are", "ü.", "today", "."])
|
219
222
|
end
|
@@ -221,9 +224,10 @@ describe PragmaticTokenizer do
|
|
221
224
|
it 'handles abrreviations across multiple languages and user-supplied abbreviations' do
|
222
225
|
text = "Adj. Smith how are ü. today. thisisnotanormalabbreviation. is it?"
|
223
226
|
abbreviations = ['thisisnotanormalabbreviation']
|
224
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
225
|
-
|
226
|
-
|
227
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
228
|
+
text,
|
229
|
+
filter_languages: [:en, :de],
|
230
|
+
abbreviations: abbreviations
|
227
231
|
)
|
228
232
|
expect(pt.tokenize).to eq(["adj.", "smith", "how", "are", "ü.", "today", ".", "thisisnotanormalabbreviation.", "is", "it", "?"])
|
229
233
|
end
|
@@ -240,9 +244,10 @@ describe PragmaticTokenizer do
|
|
240
244
|
it 'expands user-supplied contractions' do
|
241
245
|
text = "Hello supa'soo guy."
|
242
246
|
contractions = { "supa'soo" => "super smooth" }
|
243
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
244
|
-
|
245
|
-
|
247
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
248
|
+
text,
|
249
|
+
contractions: contractions,
|
250
|
+
expand_contractions: true
|
246
251
|
)
|
247
252
|
expect(pt.tokenize).to eq(["hello", "super", "smooth", "guy", "."])
|
248
253
|
end
|
@@ -250,29 +255,32 @@ describe PragmaticTokenizer do
|
|
250
255
|
it 'does not expands user-supplied contractions' do
|
251
256
|
text = "Hello supa'soo guy."
|
252
257
|
contractions = { "supa'soo" => "super smooth" }
|
253
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
254
|
-
|
255
|
-
|
258
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
259
|
+
text,
|
260
|
+
contractions: contractions,
|
261
|
+
expand_contractions: false
|
256
262
|
)
|
257
|
-
expect(pt.tokenize).to eq(
|
263
|
+
expect(pt.tokenize).to eq(["hello", "supa'soo", "guy", "."])
|
258
264
|
end
|
259
265
|
|
260
266
|
it 'expands user-supplied contractions and language contractions' do
|
261
267
|
text = "Hello supa'soo guy. auf's wasn't it?"
|
262
268
|
contractions = { "supa'soo" => "super smooth" }
|
263
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
264
|
-
|
265
|
-
|
266
|
-
|
269
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
270
|
+
text,
|
271
|
+
contractions: contractions,
|
272
|
+
expand_contractions: true,
|
273
|
+
filter_languages: [:en, :de]
|
267
274
|
)
|
268
275
|
expect(pt.tokenize).to eq(["hello", "super", "smooth", "guy", ".", "auf", "das", "was", "not", "it", "?"])
|
269
276
|
end
|
270
277
|
|
271
278
|
it 'expands language contractions' do
|
272
279
|
text = "Hello supa'soo guy. auf's wasn't it?"
|
273
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
274
|
-
|
275
|
-
|
280
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
281
|
+
text,
|
282
|
+
expand_contractions: true,
|
283
|
+
filter_languages: [:en, :de]
|
276
284
|
)
|
277
285
|
expect(pt.tokenize).to eq(["hello", "supa'soo", "guy", ".", "auf", "das", "was", "not", "it", "?"])
|
278
286
|
end
|
@@ -280,8 +288,9 @@ describe PragmaticTokenizer do
|
|
280
288
|
it 'tokenizes a string #001' do
|
281
289
|
# https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en
|
282
290
|
text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
|
283
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
284
|
-
|
291
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
292
|
+
text,
|
293
|
+
expand_contractions: true
|
285
294
|
)
|
286
295
|
expect(pt.tokenize).to eq(['"', 'i', 'said', ',', "'", 'what', 'are', 'you', '?', 'crazy', '?', "'", '"', 'said', 'sandowsky', '.', '"', 'i', 'cannot', 'afford', 'to', 'do', 'that', '.', '"'])
|
287
296
|
end
|
@@ -289,24 +298,27 @@ describe PragmaticTokenizer do
|
|
289
298
|
it 'tokenizes a string #002' do
|
290
299
|
# http://nlp.stanford.edu/software/tokenizer.shtml
|
291
300
|
text = "\"Oh, no,\" she's saying, \"our $400 blender can't handle something this hard!\""
|
292
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
293
|
-
|
301
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
302
|
+
text,
|
303
|
+
expand_contractions: true
|
294
304
|
)
|
295
305
|
expect(pt.tokenize).to eq(['"', 'oh', ',', 'no', ',', '"', 'she', 'is', 'saying', ',', '"', 'our', '$400', 'blender', 'cannot', 'handle', 'something', 'this', 'hard', '!', '"'])
|
296
306
|
end
|
297
307
|
|
298
308
|
it 'tokenizes a string #003' do
|
299
309
|
text = "Look for his/her account."
|
300
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
301
|
-
|
310
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
311
|
+
text,
|
312
|
+
expand_contractions: true
|
302
313
|
)
|
303
314
|
expect(pt.tokenize).to eq(["look", "for", "his", "her", "account", "."])
|
304
315
|
end
|
305
316
|
|
306
317
|
it 'tokenizes a string #004' do
|
307
318
|
text = "I like apples and/or oranges."
|
308
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
309
|
-
|
319
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
320
|
+
text,
|
321
|
+
expand_contractions: true
|
310
322
|
)
|
311
323
|
expect(pt.tokenize).to eq(["i", "like", "apples", "and", "or", "oranges", "."])
|
312
324
|
end
|
@@ -315,8 +327,9 @@ describe PragmaticTokenizer do
|
|
315
327
|
context 'option (emojis)' do
|
316
328
|
it 'removes emoji' do
|
317
329
|
text = "Return the emoji 👿😍😱🐔🌚. 🌚"
|
318
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
319
|
-
|
330
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
331
|
+
text,
|
332
|
+
remove_emoji: true
|
320
333
|
)
|
321
334
|
expect(pt.tokenize).to eq(["return", "the", "emoji", "."])
|
322
335
|
end
|
@@ -329,24 +342,27 @@ describe PragmaticTokenizer do
|
|
329
342
|
|
330
343
|
it 'removes snowflakes 1' do
|
331
344
|
text = "hello❄️❄️❄️"
|
332
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
333
|
-
|
345
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
346
|
+
text,
|
347
|
+
remove_emoji: true
|
334
348
|
)
|
335
349
|
expect(pt.tokenize).to eq(["hello"])
|
336
350
|
end
|
337
351
|
|
338
352
|
it 'removes snowflakes 2' do
|
339
353
|
text = "hello\u2744\uFE0E\u2744\uFE0E\u2744\uFE0E"
|
340
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
341
|
-
|
354
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
355
|
+
text,
|
356
|
+
remove_emoji: true
|
342
357
|
)
|
343
358
|
expect(pt.tokenize).to eq(["hello"])
|
344
359
|
end
|
345
360
|
|
346
361
|
it 'removes snowflakes 3' do
|
347
362
|
text = "hello\u2744\u2744\u2744"
|
348
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
349
|
-
|
363
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
364
|
+
text,
|
365
|
+
remove_emoji: true
|
350
366
|
)
|
351
367
|
expect(pt.tokenize).to eq(["hello"])
|
352
368
|
end
|
@@ -355,24 +371,27 @@ describe PragmaticTokenizer do
|
|
355
371
|
context 'option (hashtags)' do
|
356
372
|
it 'tokenizes a string #001' do
|
357
373
|
text = "This is a #hashtag yay!"
|
358
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
359
|
-
|
374
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
375
|
+
text,
|
376
|
+
hashtags: :remove
|
360
377
|
)
|
361
378
|
expect(pt.tokenize).to eq(["this", "is", "a", "yay", "!"])
|
362
379
|
end
|
363
380
|
|
364
381
|
it 'tokenizes a string #002' do
|
365
382
|
text = "This is a #hashtag yay!"
|
366
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
367
|
-
|
383
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
384
|
+
text,
|
385
|
+
hashtags: :keep_and_clean
|
368
386
|
)
|
369
387
|
expect(pt.tokenize).to eq(["this", "is", "a", "hashtag", "yay", "!"])
|
370
388
|
end
|
371
389
|
|
372
390
|
it 'tokenizes a string #003' do
|
373
391
|
text = "This is a #hashtag yay!"
|
374
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
375
|
-
|
392
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
393
|
+
text,
|
394
|
+
hashtags: :keep_original
|
376
395
|
)
|
377
396
|
expect(pt.tokenize).to eq(["this", "is", "a", "#hashtag", "yay", "!"])
|
378
397
|
end
|
@@ -381,24 +400,27 @@ describe PragmaticTokenizer do
|
|
381
400
|
context 'option (mentions)' do
|
382
401
|
it 'tokenizes a string #001' do
|
383
402
|
text = "This is a @mention @mention2 yay!"
|
384
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
385
|
-
|
403
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
404
|
+
text,
|
405
|
+
mentions: :remove
|
386
406
|
)
|
387
407
|
expect(pt.tokenize).to eq(["this", "is", "a", "yay", "!"])
|
388
408
|
end
|
389
409
|
|
390
410
|
it 'tokenizes a string #002' do
|
391
411
|
text = "This is a @mention @mention2 yay!"
|
392
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
393
|
-
|
412
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
413
|
+
text,
|
414
|
+
mentions: :keep_and_clean
|
394
415
|
)
|
395
416
|
expect(pt.tokenize).to eq(["this", "is", "a", "mention", "mention2", "yay", "!"])
|
396
417
|
end
|
397
418
|
|
398
419
|
it 'tokenizes a string #003' do
|
399
420
|
text = "This is a @mention @mention2 yay!"
|
400
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
401
|
-
|
421
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
422
|
+
text,
|
423
|
+
mentions: :keep_original
|
402
424
|
)
|
403
425
|
expect(pt.tokenize).to eq(["this", "is", "a", "@mention", "@mention2", "yay", "!"])
|
404
426
|
end
|
@@ -407,8 +429,9 @@ describe PragmaticTokenizer do
|
|
407
429
|
context 'option (email addresses)' do
|
408
430
|
it 'tokenizes a string #001' do
|
409
431
|
text = "Here are some emails jon@hotmail.com ben123@gmail.com."
|
410
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
411
|
-
|
432
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
433
|
+
text,
|
434
|
+
remove_emails: :true
|
412
435
|
)
|
413
436
|
expect(pt.tokenize).to eq(["here", "are", "some", "emails", "."])
|
414
437
|
end
|
@@ -421,8 +444,9 @@ describe PragmaticTokenizer do
|
|
421
444
|
|
422
445
|
it 'knows what is not an email address' do
|
423
446
|
text = "the great cook.@someone something else@whoever"
|
424
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
425
|
-
|
447
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
448
|
+
text,
|
449
|
+
remove_emails: true
|
426
450
|
)
|
427
451
|
expect(pt.tokenize).to eq(["the", "great", "cook", ".", "@someone", "something", "else@whoever"])
|
428
452
|
end
|
@@ -431,8 +455,9 @@ describe PragmaticTokenizer do
|
|
431
455
|
context 'option (urls)' do
|
432
456
|
it 'tokenizes a string #001' do
|
433
457
|
text = "Here are some domains and urls google.com https://www.google.com www.google.com."
|
434
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
435
|
-
|
458
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
459
|
+
text,
|
460
|
+
remove_urls: :true
|
436
461
|
)
|
437
462
|
expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "www.google.com", "."])
|
438
463
|
end
|
@@ -447,8 +472,9 @@ describe PragmaticTokenizer do
|
|
447
472
|
context 'option (domains)' do
|
448
473
|
it 'tokenizes a string #001' do
|
449
474
|
text = "Here are some domains and urls google.com https://www.google.com www.google.com."
|
450
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
451
|
-
|
475
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
476
|
+
text,
|
477
|
+
remove_domains: :true
|
452
478
|
)
|
453
479
|
expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "https://www.google.com", "."])
|
454
480
|
end
|
@@ -462,24 +488,27 @@ describe PragmaticTokenizer do
|
|
462
488
|
it 'knows what is not a domain 1' do
|
463
489
|
skip "NOT IMPLEMENTED"
|
464
490
|
text = "this is a sentence.and no domain."
|
465
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
466
|
-
|
491
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
492
|
+
text,
|
493
|
+
remove_domains: true
|
467
494
|
)
|
468
495
|
expect(pt.tokenize).to eq(["this", "is", "a", "sentence", ".", "and", "no", "domain", "."])
|
469
496
|
end
|
470
497
|
|
471
498
|
it 'knows what is not a domain 2' do
|
472
499
|
text = "former president g.w.bush was..."
|
473
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
474
|
-
|
500
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
501
|
+
text,
|
502
|
+
remove_domains: true
|
475
503
|
)
|
476
504
|
expect(pt.tokenize).to eq(["former", "president", "g.w.bush", "was", "..."])
|
477
505
|
end
|
478
506
|
|
479
507
|
it 'knows what is not a domain 3' do
|
480
508
|
text = "2.something-times"
|
481
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
482
|
-
|
509
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
510
|
+
text,
|
511
|
+
remove_domains: true
|
483
512
|
)
|
484
513
|
expect(pt.tokenize).to eq(["2.something-times"])
|
485
514
|
end
|
@@ -488,16 +517,18 @@ describe PragmaticTokenizer do
|
|
488
517
|
context 'option (long_word_split)' do
|
489
518
|
it 'tokenizes a string #001' do
|
490
519
|
text = "Some main-categories of the mathematics-test have sub-examples that most 14-year olds can't answer, therefor the implementation-instruction made in the 1990-years needs to be revised."
|
491
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
492
|
-
|
520
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
521
|
+
text,
|
522
|
+
long_word_split: 10
|
493
523
|
)
|
494
524
|
expect(pt.tokenize).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14-year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990-years", "needs", "to", "be", "revised", "."])
|
495
525
|
end
|
496
526
|
|
497
527
|
it 'tokenizes a string #002' do
|
498
528
|
text = "Some main-categories of the mathematics-test have sub-examples that most 14-year olds can't answer, therefor the implementation-instruction made in the 1990-years needs to be revised."
|
499
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
500
|
-
|
529
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
530
|
+
text,
|
531
|
+
long_word_split: 4
|
501
532
|
)
|
502
533
|
expect(pt.tokenize).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
|
503
534
|
end
|
@@ -506,136 +537,153 @@ describe PragmaticTokenizer do
|
|
506
537
|
context 'option (clean)' do
|
507
538
|
it 'tokenizes a string #001' do
|
508
539
|
text = "Hello ---------------."
|
509
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
510
|
-
|
540
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
541
|
+
text,
|
542
|
+
clean: true
|
511
543
|
)
|
512
544
|
expect(pt.tokenize).to eq(["hello", "."])
|
513
545
|
end
|
514
546
|
|
515
547
|
it 'tokenizes a string #002' do
|
516
548
|
text = "Hello ____________________ ."
|
517
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
518
|
-
|
549
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
550
|
+
text,
|
551
|
+
clean: true
|
519
552
|
)
|
520
553
|
expect(pt.tokenize).to eq(["hello", "."])
|
521
554
|
end
|
522
555
|
|
523
556
|
it 'tokenizes a string #003' do
|
524
557
|
text = "© ABC Company 1994"
|
525
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
526
|
-
|
558
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
559
|
+
text,
|
560
|
+
clean: true
|
527
561
|
)
|
528
|
-
expect(pt.tokenize).to eq(
|
562
|
+
expect(pt.tokenize).to eq(%w(abc company 1994))
|
529
563
|
end
|
530
564
|
|
531
565
|
it 'tokenizes a string #004' do
|
532
566
|
text = "This sentence has a long string of dots ......................."
|
533
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
534
|
-
|
567
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
568
|
+
text,
|
569
|
+
clean: true
|
535
570
|
)
|
536
|
-
expect(pt.tokenize).to eq(
|
571
|
+
expect(pt.tokenize).to eq(%w(this sentence has a long string of dots))
|
537
572
|
end
|
538
573
|
|
539
574
|
it 'tokenizes a string #005' do
|
540
575
|
text = "cnn.com mentions this *funny* #hashtag used by @obama http://cnn.com/something"
|
541
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
542
|
-
|
576
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
577
|
+
text,
|
578
|
+
clean: true
|
543
579
|
)
|
544
580
|
expect(pt.tokenize).to eq(["cnn.com", "mentions", "this", "funny", "#hashtag", "used", "by", "@obama", "http://cnn.com/something"])
|
545
581
|
end
|
546
582
|
|
547
583
|
it 'does not remove a valid hashtag' do
|
548
584
|
text = "This #sentence has a long string of dots ......................."
|
549
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
550
|
-
|
585
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
586
|
+
text,
|
587
|
+
clean: true
|
551
588
|
)
|
552
589
|
expect(pt.tokenize).to eq(["this", "#sentence", "has", "a", "long", "string", "of", "dots"])
|
553
590
|
end
|
554
591
|
|
555
592
|
it 'does not remove a valid mention' do
|
556
593
|
text = "This @sentence has a long string of dots ......................."
|
557
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
558
|
-
|
594
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
595
|
+
text,
|
596
|
+
clean: true
|
559
597
|
)
|
560
598
|
expect(pt.tokenize).to eq(["this", "@sentence", "has", "a", "long", "string", "of", "dots"])
|
561
599
|
end
|
562
600
|
|
563
601
|
it 'cleans words with symbols 1' do
|
564
602
|
text = "something.com:article title !!wow look!!1"
|
565
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
566
|
-
|
603
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
604
|
+
text,
|
605
|
+
clean: true
|
567
606
|
)
|
568
607
|
expect(pt.tokenize).to eq(["something.com", "article", "title", "wow", "look"])
|
569
608
|
end
|
570
609
|
|
571
610
|
it 'cleans words with symbols 2' do
|
572
611
|
text = "something.com:article title !!wow look!!1!1!11!"
|
573
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
574
|
-
|
612
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
613
|
+
text,
|
614
|
+
clean: true
|
575
615
|
)
|
576
616
|
expect(pt.tokenize).to eq(["something.com", "article", "title", "wow", "look"])
|
577
617
|
end
|
578
618
|
|
579
619
|
it 'identifies prefixed symbols' do
|
580
620
|
text = "look:the sky is blue"
|
581
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
582
|
-
|
621
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
622
|
+
text,
|
623
|
+
clean: true
|
583
624
|
)
|
584
|
-
expect(pt.tokenize).to eq(
|
625
|
+
expect(pt.tokenize).to eq(%w(look the sky is blue))
|
585
626
|
end
|
586
627
|
|
587
628
|
it 'keeps numbers at the end of mentions and hashtags' do
|
588
629
|
text = "#le1101 #artistQ21 @someone12 @someoneelse1 and @somebody1980"
|
589
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
590
|
-
|
630
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
631
|
+
text,
|
632
|
+
clean: true
|
591
633
|
)
|
592
634
|
expect(pt.tokenize).to eq(["#le1101", "#artistq21", "@someone12", "@someoneelse1", "and", "@somebody1980"])
|
593
635
|
end
|
594
636
|
|
595
637
|
it 'cleans a prefixed weird hyphen' do
|
596
638
|
text = [104, 105, 103, 104, 32, 173, 116, 101, 109, 112, 101, 114, 97, 116, 117, 114, 101, 32, 97, 110, 100, 32, 173, 119, 105, 110, 100].pack("U*")
|
597
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
598
|
-
|
639
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
640
|
+
text,
|
641
|
+
clean: true
|
599
642
|
)
|
600
|
-
expect(pt.tokenize).to eq(
|
643
|
+
expect(pt.tokenize).to eq(%w(high temperature and wind))
|
601
644
|
end
|
602
645
|
|
603
646
|
it 'cleans (r) and (c) and (tm)' do
|
604
647
|
text = "the oscar® night ©companyname is a trademark™"
|
605
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
606
|
-
|
648
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
649
|
+
text,
|
650
|
+
clean: true
|
607
651
|
)
|
608
|
-
expect(pt.tokenize).to eq(
|
652
|
+
expect(pt.tokenize).to eq(%w(the oscar night companyname is a trademark))
|
609
653
|
end
|
610
654
|
|
611
655
|
it 'cleans letters in boxes 1' do
|
612
656
|
text = "making🇦🇹postcards"
|
613
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
614
|
-
|
657
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
658
|
+
text,
|
659
|
+
clean: true
|
615
660
|
)
|
616
|
-
expect(pt.tokenize).to eq(
|
661
|
+
expect(pt.tokenize).to eq(%w(making postcards))
|
617
662
|
end
|
618
663
|
|
619
664
|
it 'removes colons' do
|
620
665
|
text = "At 19:30 o'clock: Mad Max: Fury Road"
|
621
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
622
|
-
|
666
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
667
|
+
text,
|
668
|
+
clean: true
|
623
669
|
)
|
624
670
|
expect(pt.tokenize).to eq(["at", "19:30", "o'clock", "mad", "max", "fury", "road"])
|
625
671
|
end
|
626
672
|
|
627
673
|
it 'removes a hyphen prefix 3' do
|
628
674
|
text = "women's clothes and –shoes needed"
|
629
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
630
|
-
|
675
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
676
|
+
text,
|
677
|
+
clean: true
|
631
678
|
)
|
632
679
|
expect(pt.tokenize).to eq(["women's", "clothes", "and", "shoes", "needed"])
|
633
680
|
end
|
634
681
|
|
635
682
|
it 'does not remove tokens with ampersands' do
|
636
683
|
text = "you&me"
|
637
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
638
|
-
|
684
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
685
|
+
text,
|
686
|
+
clean: true
|
639
687
|
)
|
640
688
|
expect(pt.tokenize).to eq(["you", "&", "me"])
|
641
689
|
end
|
@@ -645,8 +693,9 @@ describe PragmaticTokenizer do
|
|
645
693
|
it 'tokenizes a string #001' do
|
646
694
|
# http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
|
647
695
|
text = "I.B.M. cat's can't"
|
648
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
649
|
-
|
696
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
697
|
+
text,
|
698
|
+
classic_filter: true
|
650
699
|
)
|
651
700
|
expect(pt.tokenize).to eq(["ibm", "cat", "can't"])
|
652
701
|
end
|
@@ -654,34 +703,38 @@ describe PragmaticTokenizer do
|
|
654
703
|
it 'tokenizes a string #002' do
|
655
704
|
# http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
|
656
705
|
text = "St.Veit, which usually would be written St. Veit was not visited by B.Obama reported CNN.com"
|
657
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
658
|
-
|
706
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
707
|
+
text,
|
708
|
+
classic_filter: true
|
659
709
|
)
|
660
710
|
expect(pt.tokenize).to eq(["st.veit", ",", "which", "usually", "would", "be", "written", "st", "veit", "was", "not", "visited", "by", "b.obama", "reported", "cnn.com"])
|
661
711
|
end
|
662
712
|
|
663
713
|
it 'optimizes the classic filter' do
|
664
714
|
text = "therés something"
|
665
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
666
|
-
|
715
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
716
|
+
text,
|
717
|
+
classic_filter: true
|
667
718
|
)
|
668
|
-
expect(pt.tokenize).to eq(
|
719
|
+
expect(pt.tokenize).to eq(%w(there something))
|
669
720
|
end
|
670
721
|
|
671
722
|
it 'optimizes the classic filter' do
|
672
723
|
text = [116, 104, 101, 114, 101, 32, 769, 115, 32, 115, 111, 109, 101, 116, 104, 105, 110, 103].pack("U*")
|
673
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
674
|
-
|
724
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
725
|
+
text,
|
726
|
+
classic_filter: true
|
675
727
|
)
|
676
|
-
expect(pt.tokenize).to eq(
|
728
|
+
expect(pt.tokenize).to eq(%w(there something))
|
677
729
|
end
|
678
730
|
end
|
679
731
|
|
680
732
|
context 'option (language)' do
|
681
733
|
it 'tokenizes a string #001' do
|
682
734
|
text = "Hello Ms. Piggy, this is John. We are selling a new fridge for $5,000. That is a 20% discount over the Nev. retailers. It is a 'MUST BUY', so don't hesistate."
|
683
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
684
|
-
|
735
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
736
|
+
text,
|
737
|
+
language: 'en'
|
685
738
|
)
|
686
739
|
expect(pt.tokenize).to eq(["hello", "ms.", "piggy", ",", "this", "is", "john", ".", "we", "are", "selling", "a", "new", "fridge", "for", "$5,000", ".", "that", "is", "a", "20%", "discount", "over", "the", "nev.", "retailers", ".", "it", "is", "a", "'", "must", "buy", "'", ",", "so", "don't", "hesistate", "."])
|
687
740
|
end
|
@@ -697,8 +750,9 @@ describe PragmaticTokenizer do
|
|
697
750
|
to the consequences for research and innovation or the public interest.\'
|
698
751
|
Says Ms. Raines, \'[The judgement] confirms our concern that the absence of
|
699
752
|
patent lawyers on the court could prove troublesome.\'"
|
700
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
701
|
-
|
753
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
754
|
+
text,
|
755
|
+
language: 'en'
|
702
756
|
)
|
703
757
|
expect(pt.tokenize).to eq(['lisa', 'raines', ',', 'a', 'lawyer', 'and', 'director', 'of', 'government', 'relations', 'for', 'the', 'industrial', 'biotechnical', 'association', ',', 'contends', 'that', 'a', 'judge', 'well-versed', 'in', 'patent', 'law', 'and', 'the', 'concerns', 'of', 'research-based', 'industries', 'would', 'have', 'ruled', 'otherwise', '.', 'and', 'judge', 'newman', ',', 'a', 'former', 'patent', 'lawyer', ',', 'wrote', 'in', 'her', 'dissent', 'when', 'the', 'court', 'denied', 'a', 'motion', 'for', 'a', 'rehearing', 'of', 'the', 'case', 'by', 'the', 'full', 'court', ',', "\'", 'the', "panel's", 'judicial', 'legislation', 'has', 'affected', 'an', 'important', 'high-technological', 'industry', ',', 'without', 'regard', 'to', 'the', 'consequences', 'for', 'research', 'and', 'innovation', 'or', 'the', 'public', 'interest', '.', '\'', 'says', 'ms.', 'raines', ',', '\'', '[', 'the', 'judgement', ']', 'confirms', 'our', 'concern', 'that', 'the', 'absence', 'of', 'patent', 'lawyers', 'on', 'the', 'court', 'could', 'prove', 'troublesome', '.', "\'"])
|
704
758
|
end
|
@@ -707,56 +761,63 @@ describe PragmaticTokenizer do
|
|
707
761
|
context 'option (numbers)' do
|
708
762
|
it 'tokenizes a string #001' do
|
709
763
|
text = "Hello, that will be $5 dollars. You can pay at 5:00, after it is 500."
|
710
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
711
|
-
|
764
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
765
|
+
text,
|
766
|
+
numbers: :all
|
712
767
|
)
|
713
768
|
expect(pt.tokenize).to eq(["hello", ",", "that", "will", "be", "$5", "dollars", ".", "you", "can", "pay", "at", "5:00", ",", "after", "it", "is", "500", "."])
|
714
769
|
end
|
715
770
|
|
716
771
|
it 'tokenizes a string #002' do
|
717
772
|
text = "Hello, that will be $5 dollars. You can pay at 5:00, after it is 500."
|
718
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
719
|
-
|
773
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
774
|
+
text,
|
775
|
+
numbers: :none
|
720
776
|
)
|
721
777
|
expect(pt.tokenize).to eq(["hello", ",", "that", "will", "be", "dollars", ".", "you", "can", "pay", "at", ",", "after", "it", "is", "."])
|
722
778
|
end
|
723
779
|
|
724
780
|
it 'tokenizes a string #003' do
|
725
781
|
text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500"
|
726
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
727
|
-
|
782
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
783
|
+
text,
|
784
|
+
numbers: :semi
|
728
785
|
)
|
729
786
|
expect(pt.tokenize).to eq(["2pac", "u2", "50cent", "blink-182", "$500", "zero7", "m83", "b-52s"])
|
730
787
|
end
|
731
788
|
|
732
789
|
it 'tokenizes a string #004' do
|
733
790
|
text = "2pac U2 50cent blink-182 zero7 M83 B-52s 500 Hello"
|
734
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
735
|
-
|
791
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
792
|
+
text,
|
793
|
+
numbers: :only
|
736
794
|
)
|
737
795
|
expect(pt.tokenize).to eq(["2pac", "u2", "50cent", "blink-182", "zero7", "m83", "b-52s", "500"])
|
738
796
|
end
|
739
797
|
|
740
798
|
it 'tokenizes a string #005' do
|
741
799
|
text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500"
|
742
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
743
|
-
|
800
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
801
|
+
text,
|
802
|
+
numbers: :none
|
744
803
|
)
|
745
804
|
expect(pt.tokenize).to eq([])
|
746
805
|
end
|
747
806
|
|
748
807
|
it 'tokenizes a string #005' do
|
749
808
|
text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500 number iv VI"
|
750
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
751
|
-
|
809
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
810
|
+
text,
|
811
|
+
numbers: :none
|
752
812
|
)
|
753
813
|
expect(pt.tokenize).to eq(["number"])
|
754
814
|
end
|
755
815
|
|
756
816
|
it 'tokenizes a string #006' do
|
757
817
|
text = "Remove III Roman Numerals and IX. with a period."
|
758
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
759
|
-
|
818
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
819
|
+
text,
|
820
|
+
numbers: :none
|
760
821
|
)
|
761
822
|
expect(pt.tokenize).to eq(["remove", "roman", "numerals", "and", ".", "with", "a", "period", "."])
|
762
823
|
end
|
@@ -765,8 +826,9 @@ describe PragmaticTokenizer do
|
|
765
826
|
context 'option (minimum_length)' do
|
766
827
|
it 'tokenizes a string #001' do
|
767
828
|
text = "Let's test the minimum length of fiver."
|
768
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
769
|
-
|
829
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
830
|
+
text,
|
831
|
+
minimum_length: 5
|
770
832
|
)
|
771
833
|
expect(pt.tokenize).to eq(["let's", "minimum", "length", "fiver"])
|
772
834
|
end
|
@@ -775,241 +837,271 @@ describe PragmaticTokenizer do
|
|
775
837
|
context 'option (punctuation)' do
|
776
838
|
it 'tokenizes a string #001' do
|
777
839
|
text = "kath. / evang"
|
778
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
779
|
-
|
840
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
841
|
+
text,
|
842
|
+
punctuation: 'none'
|
780
843
|
)
|
781
|
-
expect(pt.tokenize).to eq(
|
844
|
+
expect(pt.tokenize).to eq(%w(kath evang))
|
782
845
|
end
|
783
846
|
|
784
847
|
it 'tokenizes a string #002' do
|
785
848
|
text = "derStandard.at › Sport"
|
786
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
787
|
-
|
849
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
850
|
+
text,
|
851
|
+
punctuation: 'none'
|
788
852
|
)
|
789
853
|
expect(pt.tokenize).to eq(["derstandard.at", "sport"])
|
790
854
|
end
|
791
855
|
|
792
856
|
it 'tokenizes a string #003' do
|
793
857
|
text = "hello ^^"
|
794
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
795
|
-
|
858
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
859
|
+
text,
|
860
|
+
punctuation: 'none'
|
796
861
|
)
|
797
862
|
expect(pt.tokenize).to eq(["hello"])
|
798
863
|
end
|
799
864
|
|
800
865
|
it 'tokenizes a string #004' do
|
801
866
|
text = "This hyphen – is not...or is it? ... It's a - dash... And a horizontal ellipsis…"
|
802
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
803
|
-
|
867
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
868
|
+
text,
|
869
|
+
punctuation: 'none'
|
804
870
|
)
|
805
871
|
expect(pt.tokenize).to eq(["this", "hyphen", "is", "not", "or", "is", "it", "it's", "a", "dash", "and", "a", "horizontal", "ellipsis"])
|
806
872
|
end
|
807
873
|
|
808
874
|
it 'tokenizes a string #005' do
|
809
875
|
text = "A sentence. One with two dots.. And with three... Or horizontal ellipsis… which are three dots too."
|
810
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
811
|
-
|
876
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
877
|
+
text,
|
878
|
+
punctuation: 'none'
|
812
879
|
)
|
813
|
-
expect(pt.tokenize).to eq(
|
880
|
+
expect(pt.tokenize).to eq(%w(a sentence one with two dots and with three or horizontal ellipsis which are three dots too))
|
814
881
|
end
|
815
882
|
|
816
883
|
it 'tokenizes a string #006' do
|
817
884
|
text = "+++ BREAKING +++ something happened; is it interesting?"
|
818
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
819
|
-
|
885
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
886
|
+
text,
|
887
|
+
punctuation: 'none'
|
820
888
|
)
|
821
|
-
expect(pt.tokenize).to eq(
|
889
|
+
expect(pt.tokenize).to eq(%w(breaking something happened is it interesting))
|
822
890
|
end
|
823
891
|
|
824
892
|
it 'tokenizes a string #007' do
|
825
893
|
text = "Some *interesting stuff* is __happening here__"
|
826
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
827
|
-
|
894
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
895
|
+
text,
|
896
|
+
punctuation: 'none'
|
828
897
|
)
|
829
898
|
expect(pt.tokenize).to eq(["some", "*interesting", "stuff*", "is", "__happening", "here__"])
|
830
899
|
end
|
831
900
|
|
832
901
|
it 'tokenizes a string #008' do
|
833
902
|
text = "Hello; what is your: name @username **delete**"
|
834
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
835
|
-
|
903
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
904
|
+
text,
|
905
|
+
punctuation: 'none'
|
836
906
|
)
|
837
907
|
expect(pt.tokenize).to eq(["hello", "what", "is", "your", "name", "@username", "**delete**"])
|
838
908
|
end
|
839
909
|
|
840
910
|
it 'tokenizes a string #009' do
|
841
911
|
text = "hello ;-) yes"
|
842
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
843
|
-
|
912
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
913
|
+
text,
|
914
|
+
punctuation: :none
|
844
915
|
)
|
845
|
-
expect(pt.tokenize).to eq(
|
916
|
+
expect(pt.tokenize).to eq(%w(hello yes))
|
846
917
|
end
|
847
918
|
|
848
919
|
it 'tokenizes a string #010' do
|
849
920
|
text = "hello ;)"
|
850
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
851
|
-
|
921
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
922
|
+
text,
|
923
|
+
punctuation: 'none'
|
852
924
|
)
|
853
925
|
expect(pt.tokenize).to eq(["hello"])
|
854
926
|
end
|
855
927
|
|
856
928
|
it 'tokenizes a string #011' do
|
857
929
|
text = "Hello ____________________ ."
|
858
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
859
|
-
|
930
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
931
|
+
text,
|
932
|
+
punctuation: :none
|
860
933
|
)
|
861
934
|
expect(pt.tokenize).to eq(["hello"])
|
862
935
|
end
|
863
936
|
|
864
937
|
it 'handles non-domain words with a dot 1' do
|
865
938
|
text = "They were being helped.This is solidarity."
|
866
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
867
|
-
|
868
|
-
|
869
|
-
|
939
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
940
|
+
text,
|
941
|
+
punctuation: 'none'
|
942
|
+
)
|
943
|
+
expect(pt.tokenize).to eq(%w(they were being helped this is solidarity))
|
870
944
|
end
|
871
945
|
|
872
946
|
it 'handles non-domain words with a dot 2' do
|
873
947
|
text = "picture was taken in sept.2015"
|
874
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
875
|
-
|
948
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
949
|
+
text,
|
950
|
+
punctuation: 'none'
|
876
951
|
)
|
877
952
|
expect(pt.tokenize).to eq(["picture", "was", "taken", "in", "sept.", "2015"])
|
878
953
|
end
|
879
954
|
|
880
955
|
it 'handles non-domain words with a dot 3' do
|
881
956
|
text = "They were being helped.This is solidarity. See the breaking news stories about X on cnn.com/europe and english.alarabiya.net, here’s a screenshot: https://t.co/s83k28f29d31s83"
|
882
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
883
|
-
|
957
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
958
|
+
text,
|
959
|
+
punctuation: 'none'
|
884
960
|
)
|
885
961
|
expect(pt.tokenize).to eq(["they", "were", "being", "helped", "this", "is", "solidarity", "see", "the", "breaking", "news", "stories", "about", "x", "on", "cnn.com", "europe", "and", "english.alarabiya.net", "here’s", "a", "screenshot", "https://t.co/s83k28f29d31s83"])
|
886
962
|
end
|
887
963
|
|
888
964
|
it 'handles numbers with symbols 1' do
|
889
965
|
text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
|
890
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
891
|
-
|
966
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
967
|
+
text,
|
968
|
+
punctuation: 'none'
|
892
969
|
)
|
893
970
|
expect(pt.tokenize).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals"])
|
894
971
|
end
|
895
972
|
|
896
973
|
it 'handles numbers with symbols 2' do
|
897
974
|
text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
|
898
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
899
|
-
|
975
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
976
|
+
text,
|
977
|
+
punctuation: 'none'
|
900
978
|
)
|
901
979
|
expect(pt.tokenize).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals"])
|
902
980
|
end
|
903
981
|
|
904
982
|
it 'handles apostrophes and quotes' do
|
905
983
|
text = "“Data Visualization: How to Tell Stories with Data — Jeff Korhan” by @AINewsletter"
|
906
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
907
|
-
|
984
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
985
|
+
text,
|
986
|
+
punctuation: 'none'
|
908
987
|
)
|
909
988
|
expect(pt.tokenize).to eq(["data", "visualization", "how", "to", "tell", "stories", "with", "data", "jeff", "korhan", "by", "@ainewsletter"])
|
910
989
|
end
|
911
990
|
|
912
991
|
it 'handles mentions' do
|
913
992
|
text = ".@someone I disagree"
|
914
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
915
|
-
|
993
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
994
|
+
text,
|
995
|
+
punctuation: 'none'
|
916
996
|
)
|
917
997
|
expect(pt.tokenize).to eq(["@someone", "i", "disagree"])
|
918
998
|
end
|
919
999
|
|
920
1000
|
it 'handles old school emoticons 2' do
|
921
1001
|
text = "oooh! <3"
|
922
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
923
|
-
|
1002
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1003
|
+
text,
|
1004
|
+
punctuation: 'none'
|
924
1005
|
)
|
925
1006
|
expect(pt.tokenize).to eq(["oooh", "<3"])
|
926
1007
|
end
|
927
1008
|
|
928
1009
|
it 'handles old school emoticons 3' do
|
929
1010
|
text = "@someone <33"
|
930
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
931
|
-
|
1011
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1012
|
+
text,
|
1013
|
+
punctuation: 'none'
|
932
1014
|
)
|
933
1015
|
expect(pt.tokenize).to eq(["@someone", "<33"])
|
934
1016
|
end
|
935
1017
|
|
936
1018
|
it 'handles words with a symbol prefix 1' do
|
937
1019
|
text = "Yes! /cc @someone"
|
938
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
939
|
-
|
1020
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1021
|
+
text,
|
1022
|
+
punctuation: 'none'
|
940
1023
|
)
|
941
1024
|
expect(pt.tokenize).to eq(["yes", "cc", "@someone"])
|
942
1025
|
end
|
943
1026
|
|
944
1027
|
it 'handles words with a emoji suffix' do
|
945
1028
|
text = "Let's meet there.😝 ok?"
|
946
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
947
|
-
|
1029
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1030
|
+
text,
|
1031
|
+
punctuation: 'none'
|
948
1032
|
)
|
949
1033
|
expect(pt.tokenize).to eq(["let's", "meet", "there", "😝", "ok"])
|
950
1034
|
end
|
951
1035
|
|
952
1036
|
it 'handles words with a symbol prefix 2' do
|
953
1037
|
text = "blah blah |photo by @someone"
|
954
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
955
|
-
|
1038
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1039
|
+
text,
|
1040
|
+
punctuation: 'none'
|
956
1041
|
)
|
957
1042
|
expect(pt.tokenize).to eq(["blah", "blah", "photo", "by", "@someone"])
|
958
1043
|
end
|
959
1044
|
|
960
1045
|
it 'handles pseudo-contractions' do
|
961
1046
|
text = "I suggest to buy stocks that are low value+have momentum"
|
962
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
963
|
-
|
1047
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1048
|
+
text,
|
1049
|
+
punctuation: 'none'
|
964
1050
|
)
|
965
|
-
expect(pt.tokenize).to eq(
|
1051
|
+
expect(pt.tokenize).to eq(%w(i suggest to buy stocks that are low value have momentum))
|
966
1052
|
end
|
967
1053
|
|
968
1054
|
it 'handles apostrophes and quotes 1' do
|
969
1055
|
text = "Watch the video of @amandapalmer's song “Killing Type” here"
|
970
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
971
|
-
|
1056
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1057
|
+
text,
|
1058
|
+
punctuation: 'none'
|
972
1059
|
)
|
973
1060
|
expect(pt.tokenize).to eq(["watch", "the", "video", "of", "@amandapalmer's", "song", "killing", "type", "here"])
|
974
1061
|
end
|
975
1062
|
|
976
|
-
|
1063
|
+
it 'handles apostrophes and quotes 2' do
|
977
1064
|
text = "Watch the video of @amandapalmer`s song “Killing Type” here"
|
978
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
979
|
-
|
1065
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1066
|
+
text,
|
1067
|
+
punctuation: 'none'
|
980
1068
|
)
|
981
1069
|
expect(pt.tokenize).to eq(["watch", "the", "video", "of", "@amandapalmer`s", "song", "killing", "type", "here"])
|
982
1070
|
end
|
983
1071
|
|
984
1072
|
it 'handles numbers suffixed with a symbol' do
|
985
1073
|
text = "4 Things Marketers Must Do Better in 2016: blah"
|
986
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
987
|
-
|
1074
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1075
|
+
text,
|
1076
|
+
punctuation: 'none'
|
988
1077
|
)
|
989
|
-
expect(pt.tokenize).to eq(
|
1078
|
+
expect(pt.tokenize).to eq(%w(4 things marketers must do better in 2016 blah))
|
990
1079
|
end
|
991
1080
|
|
992
1081
|
it 'handles words with a emoticon suffix' do
|
993
1082
|
skip "NOT IMPLEMENTED"
|
994
1083
|
text = "look, a dog with shoes☺ !!"
|
995
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
996
|
-
|
1084
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1085
|
+
text,
|
1086
|
+
punctuation: 'none'
|
997
1087
|
)
|
998
1088
|
expect(pt.tokenize).to eq(["look", "a", "dog", "with", "shoes", "☺"])
|
999
1089
|
end
|
1000
1090
|
|
1001
1091
|
it 'handles emoji 1' do
|
1002
1092
|
text = "How bad!😝"
|
1003
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1004
|
-
|
1093
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1094
|
+
text,
|
1095
|
+
punctuation: 'none'
|
1005
1096
|
)
|
1006
1097
|
expect(pt.tokenize).to eq(["how", "bad", "😝"])
|
1007
1098
|
end
|
1008
1099
|
|
1009
1100
|
it 'handles emoji 2' do
|
1010
1101
|
text = "😝How bad!"
|
1011
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1012
|
-
|
1102
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1103
|
+
text,
|
1104
|
+
punctuation: 'none'
|
1013
1105
|
)
|
1014
1106
|
expect(pt.tokenize).to eq(["😝", "how", "bad"])
|
1015
1107
|
end
|
@@ -1017,16 +1109,18 @@ describe PragmaticTokenizer do
|
|
1017
1109
|
it 'identifies old school emoticons' do
|
1018
1110
|
skip "NOT IMPLEMENTED"
|
1019
1111
|
text = 'looking forward to the new kodak super8 camera \o/'
|
1020
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1021
|
-
|
1112
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1113
|
+
text,
|
1114
|
+
punctuation: 'none'
|
1022
1115
|
)
|
1023
1116
|
expect(pt.tokenize).to eq(["looking", "forward", "to", "the", "new", "kodak", "super8", "camera", '\o/'])
|
1024
1117
|
end
|
1025
1118
|
|
1026
1119
|
it 'splits at hashtags' do
|
1027
1120
|
text = "some sentence#RT ... i like u2.#bono"
|
1028
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1029
|
-
|
1121
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1122
|
+
text,
|
1123
|
+
punctuation: :none
|
1030
1124
|
)
|
1031
1125
|
expect(pt.tokenize).to eq(["some", "sentence", "#rt", "i", "like", "u2", "#bono"])
|
1032
1126
|
end
|
@@ -1035,41 +1129,45 @@ describe PragmaticTokenizer do
|
|
1035
1129
|
context 'option (remove_stop_words)' do
|
1036
1130
|
it 'removes stop words' do
|
1037
1131
|
text = 'This is a short sentence with explanations and stop words.'
|
1038
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1039
|
-
|
1040
|
-
|
1132
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1133
|
+
text,
|
1134
|
+
language: 'en',
|
1135
|
+
remove_stop_words: true
|
1041
1136
|
)
|
1042
1137
|
expect(pt.tokenize).to eq(["short", "sentence", "explanations", "."])
|
1043
1138
|
end
|
1044
1139
|
|
1045
1140
|
it 'removes user-supplied stop words' do
|
1046
1141
|
text = 'This is a short sentence with explanations and stop words.'
|
1047
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1048
|
-
|
1049
|
-
|
1050
|
-
|
1142
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1143
|
+
text,
|
1144
|
+
language: 'en',
|
1145
|
+
remove_stop_words: true,
|
1146
|
+
stop_words: %w(and a)
|
1051
1147
|
)
|
1052
1148
|
expect(pt.tokenize).to eq(["this", "is", "short", "sentence", "with", "explanations", "stop", "words", "."])
|
1053
1149
|
end
|
1054
1150
|
|
1055
1151
|
it 'removes user-supplied stop words and default stop words' do
|
1056
1152
|
text = 'This is a short sentence with explanations and stop words.'
|
1057
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1058
|
-
|
1059
|
-
|
1060
|
-
|
1061
|
-
|
1153
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1154
|
+
text,
|
1155
|
+
language: 'en',
|
1156
|
+
remove_stop_words: true,
|
1157
|
+
stop_words: ["sentence"],
|
1158
|
+
filter_languages: [:en]
|
1062
1159
|
)
|
1063
1160
|
expect(pt.tokenize).to eq(["short", "explanations", "."])
|
1064
1161
|
end
|
1065
1162
|
|
1066
1163
|
it 'removes user-supplied stop words and default stop words across multiple languages' do
|
1067
1164
|
text = 'This is a short sentence with explanations and stop words. And achte German words.'
|
1068
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1069
|
-
|
1070
|
-
|
1071
|
-
|
1072
|
-
|
1165
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1166
|
+
text,
|
1167
|
+
language: 'en',
|
1168
|
+
remove_stop_words: true,
|
1169
|
+
stop_words: ["sentence"],
|
1170
|
+
filter_languages: [:en, :de]
|
1073
1171
|
)
|
1074
1172
|
expect(pt.tokenize).to eq(["short", "explanations", ".", "german", "."])
|
1075
1173
|
end
|
@@ -1078,27 +1176,30 @@ describe PragmaticTokenizer do
|
|
1078
1176
|
context 'multiple options selected' do
|
1079
1177
|
it 'tokenizes a string #001' do
|
1080
1178
|
text = 'His name is Mr. Smith.'
|
1081
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1082
|
-
|
1083
|
-
|
1179
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1180
|
+
text,
|
1181
|
+
language: 'en',
|
1182
|
+
punctuation: 'none'
|
1084
1183
|
)
|
1085
1184
|
expect(pt.tokenize).to eq(['his', 'name', 'is', 'mr.', 'smith'])
|
1086
1185
|
end
|
1087
1186
|
|
1088
1187
|
it 'tokenizes a string #002' do
|
1089
1188
|
text = "Hello Ms. Piggy, this is John. We are selling a new fridge for $5,000. That is a 20% discount over the Nev. retailers. It is a 'MUST BUY', so don't hesistate."
|
1090
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1091
|
-
|
1092
|
-
|
1189
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1190
|
+
text,
|
1191
|
+
language: 'en',
|
1192
|
+
punctuation: 'only'
|
1093
1193
|
)
|
1094
1194
|
expect(pt.tokenize).to eq([",", ".", ".", ".", "'", "'", ",", "."])
|
1095
1195
|
end
|
1096
1196
|
|
1097
1197
|
it 'tokenizes a string #003' do
|
1098
1198
|
text = "Hello the a it experiment one fine."
|
1099
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1100
|
-
|
1101
|
-
|
1199
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1200
|
+
text,
|
1201
|
+
language: 'en',
|
1202
|
+
remove_stop_words: true
|
1102
1203
|
)
|
1103
1204
|
expect(pt.tokenize).to eq(["experiment", "fine", "."])
|
1104
1205
|
end
|
@@ -1106,214 +1207,235 @@ describe PragmaticTokenizer do
|
|
1106
1207
|
it 'tokenizes a string #004' do
|
1107
1208
|
# https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en
|
1108
1209
|
text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
|
1109
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1110
|
-
|
1111
|
-
|
1112
|
-
|
1210
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1211
|
+
text,
|
1212
|
+
expand_contractions: true,
|
1213
|
+
remove_stop_words: true,
|
1214
|
+
punctuation: 'none'
|
1113
1215
|
)
|
1114
|
-
expect(pt.tokenize).to eq(
|
1216
|
+
expect(pt.tokenize).to eq(%w(crazy sandowsky afford))
|
1115
1217
|
end
|
1116
1218
|
|
1117
1219
|
it 'tokenizes a string #005' do
|
1118
1220
|
text = "Hello world with a stop word experiment."
|
1119
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1125
|
-
|
1126
|
-
|
1221
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1222
|
+
text,
|
1223
|
+
language: 'en',
|
1224
|
+
clean: true,
|
1225
|
+
numbers: :none,
|
1226
|
+
minimum_length: 3,
|
1227
|
+
expand_contractions: true,
|
1228
|
+
remove_stop_words: true,
|
1229
|
+
punctuation: 'none'
|
1127
1230
|
)
|
1128
1231
|
expect(pt.tokenize).to eq(["experiment"])
|
1129
1232
|
end
|
1130
1233
|
|
1131
1234
|
it 'tokenizes a string #006' do
|
1132
1235
|
text = "Hello; what is your: name @username **delete**"
|
1133
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1134
|
-
|
1135
|
-
|
1236
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1237
|
+
text,
|
1238
|
+
clean: true,
|
1239
|
+
punctuation: 'none'
|
1136
1240
|
)
|
1137
1241
|
expect(pt.tokenize).to eq(["hello", "what", "is", "your", "name", "@username", "delete"])
|
1138
1242
|
end
|
1139
1243
|
|
1140
1244
|
it 'tokenizes a string #007' do
|
1141
1245
|
text = 'His name is Mr. Smith.'
|
1142
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1143
|
-
|
1144
|
-
|
1145
|
-
|
1246
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1247
|
+
text,
|
1248
|
+
language: 'en',
|
1249
|
+
punctuation: 'none',
|
1250
|
+
downcase: false
|
1146
1251
|
)
|
1147
1252
|
expect(pt.tokenize).to eq(['His', 'name', 'is', 'Mr.', 'Smith'])
|
1148
1253
|
end
|
1149
1254
|
|
1150
1255
|
it 'tokenizes a string #008' do
|
1151
1256
|
text = "Can't go tonight. Didn't finish."
|
1152
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1153
|
-
|
1154
|
-
|
1257
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1258
|
+
text,
|
1259
|
+
downcase: false,
|
1260
|
+
expand_contractions: true
|
1155
1261
|
)
|
1156
1262
|
expect(pt.tokenize).to eq(["Cannot", "go", "tonight", ".", "Did", "not", "finish", "."])
|
1157
1263
|
end
|
1158
1264
|
|
1159
1265
|
it 'tokenizes a string #009' do
|
1160
1266
|
text = "Some *interesting stuff* is __happening here__"
|
1161
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1162
|
-
|
1163
|
-
|
1267
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1268
|
+
text,
|
1269
|
+
punctuation: 'none',
|
1270
|
+
clean: true
|
1164
1271
|
)
|
1165
|
-
expect(pt.tokenize).to eq(
|
1272
|
+
expect(pt.tokenize).to eq(%w(some interesting stuff is happening here))
|
1166
1273
|
end
|
1167
1274
|
|
1168
1275
|
it 'also allows symbols for options' do
|
1169
1276
|
text = 'His name is Mr. Smith.'
|
1170
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1171
|
-
|
1172
|
-
|
1277
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1278
|
+
text,
|
1279
|
+
language: :en,
|
1280
|
+
punctuation: :none
|
1173
1281
|
)
|
1174
1282
|
expect(pt.tokenize).to eq(['his', 'name', 'is', 'mr.', 'smith'])
|
1175
1283
|
end
|
1176
1284
|
|
        it 'handles long strings 1' do
          text = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ."
-         pt = PragmaticTokenizer::Tokenizer.new(
-
-
-
-
-
-
-
+         pt = PragmaticTokenizer::Tokenizer.new(
+             text,
+             language: 'en',
+             clean: true,
+             minimum_length: 3,
+             expand_contractions: true,
+             remove_stop_words: true,
+             numbers: :none,
+             punctuation: :none
          )
          expect(pt.tokenize).to eq(["jonas", "jonas", "smith", "jonas", "smith", "turn", "jane", "party", "closed", "deal", "pitt", "briggs", "noon", "jane", "closed", "deal", "pitt", "briggs", "closed", "yesterday", "mt.", "fuji", "st.", "michael's", "church", "st.", "light", "jfk", "jr.", "book", "visited", "u.s.a.", "year", "live", "e.u.", "live", "u.s.", "work", "u.s.", "government", "virginia", "lived", "u.s.", "years", "bag", "bag", "teaches", "science", "worked", "years", "engineer", "local", "university", "email", "jane.doe@example.com", "email", "site", "check", "turned", "great", "turned", "great", "turned", "great", "held", "book", "long", "time", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "list", "item", "sentence", "cut", "middle", "pdf", "cold", "night", "city", "features", "contact", "manager", "events", "activities", "treasure", "works", "yahoo", "accounting", "department", "good", "team", "albert", "jones", "yesterday", "thoreau", "argues", "simplifying", "one’s", "life", "laws", "universe", "complex", "bohr", "analogy", "parallel", "stairways", "smith", "left", "sentence", "omission", "ellipsis", "marks", "preceded", "space", "sentence", "period", "sentence", "meant", "left", "store", "habit", "weakened", "combining", "self-interpreting", "compounds", "practice", "abandoned"])
        end

        it 'handles long strings 2' do
          text = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 10
-         pt = PragmaticTokenizer::Tokenizer.new(
-
-
-
-
-
-
-
+         pt = PragmaticTokenizer::Tokenizer.new(
+             text,
+             language: 'en',
+             clean: true,
+             minimum_length: 3,
+             expand_contractions: true,
+             remove_stop_words: true,
+             numbers: :none,
+             punctuation: :none
          )
          expect(pt.tokenize).to eq(["jonas", "jonas", "smith", "jonas", "smith", "turn", "jane", "party", "closed", "deal", "pitt", "briggs", "noon", "jane", "closed", "deal", "pitt", "briggs", "closed", "yesterday", "mt.", "fuji", "st.", "michael's", "church", "st.", "light", "jfk", "jr.", "book", "visited", "u.s.a.", "year", "live", "e.u.", "live", "u.s.", "work", "u.s.", "government", "virginia", "lived", "u.s.", "years", "bag", "bag", "teaches", "science", "worked", "years", "engineer", "local", "university", "email", "jane.doe@example.com", "email", "site", "check", "turned", "great", "turned", "great", "turned", "great", "held", "book", "long", "time", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "list", "item", "sentence", "cut", "middle", "pdf", "cold", "night", "city", "features", "contact", "manager", "events", "activities", "treasure", "works", "yahoo", "accounting", "department", "good", "team", "albert", "jones", "yesterday", "thoreau", "argues", "simplifying", "one’s", "life", "laws", "universe", "complex", "bohr", "analogy", "parallel", "stairways", "smith", "left", "sentence", "omission", "ellipsis", "marks", "preceded", "space", "sentence", "period", "sentence", "meant", "left", "store", "habit", "weakened", "combining", "self-interpreting", "compounds", "practice", "abandoned"] * 10)
        end

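The option combination used by the two long-string examples above, shown as a plain call on a short excerpt; no output is asserted here, and the comments only summarize what the expectations above imply about each option:

    require 'pragmatic_tokenizer'

    # excerpt of the sample text used in the spec
    text = "She has $100.00 in her bag. I can't see Mt. Fuji from here."
    pt = PragmaticTokenizer::Tokenizer.new(
        text,
        language: 'en',
        clean: true,                # strip markup, control characters and similar noise
        minimum_length: 3,          # drop tokens shorter than three characters
        expand_contractions: true,  # expand forms such as "can't" before filtering
        remove_stop_words: true,    # remove English stop words
        numbers: :none,             # drop number tokens, so "100.00" does not survive
        punctuation: :none          # drop punctuation tokens
    )
    pt.tokenize
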
        it 'handles markdown' do
          text = "This is _bold_ and this is *italic*"
-         pt = PragmaticTokenizer::Tokenizer.new(
-
-
+         pt = PragmaticTokenizer::Tokenizer.new(
+             text,
+             punctuation: 'none',
+             clean: true
          )
-         expect(pt.tokenize).to eq(
+         expect(pt.tokenize).to eq(%w(this is bold and this is italic))
        end

        it 'handles single quotes' do
          text = "Recognised as one of the ‘good’ games."
-         pt = PragmaticTokenizer::Tokenizer.new(
-
-
-
-
-
-
-
-
-
+         pt = PragmaticTokenizer::Tokenizer.new(
+             text,
+             language: 'en',
+             clean: true,
+             numbers: :none,
+             minimum_length: 3,
+             expand_contractions: true,
+             remove_stop_words: true,
+             punctuation: :none,
+             downcase: true)
+         expect(pt.tokenize).to eq(%w(recognised good games))
        end

        it 'removes control characters' do
          text = "\u0000 \u001F \u007FHello test."
-         pt = PragmaticTokenizer::Tokenizer.new(
-
-
+         pt = PragmaticTokenizer::Tokenizer.new(
+             text,
+             language: 'en',
+             clean: true
          )
          expect(pt.tokenize).to eq(["hello", "test", "."])
        end

        it 'splits too long words with hypens' do
          text = "hi-hat and old-school but not really-important-long-word"
-         pt = PragmaticTokenizer::Tokenizer.new(
-
-
+         pt = PragmaticTokenizer::Tokenizer.new(
+             text,
+             punctuation: 'none',
+             long_word_split: 12
          )
          expect(pt.tokenize).to eq(["hi-hat", "and", "old-school", "but", "not", "really", "important", "long", "word"])
        end

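A standalone sketch of the long_word_split behaviour tested above: hyphenated tokens longer than the given length are split at their hyphens, while shorter ones such as "hi-hat" stay intact:

    require 'pragmatic_tokenizer'

    pt = PragmaticTokenizer::Tokenizer.new(
        "hi-hat and old-school but not really-important-long-word",
        punctuation: 'none',
        long_word_split: 12
    )
    pt.tokenize
    # => ["hi-hat", "and", "old-school", "but", "not", "really", "important", "long", "word"]
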
        it 'handles hashtags 2' do
          text = "This is the #upper-#limit"
-         pt = PragmaticTokenizer::Tokenizer.new(
-
-
+         pt = PragmaticTokenizer::Tokenizer.new(
+             text,
+             punctuation: 'none',
+             hashtags: :keep_and_clean
          )
-         expect(pt.tokenize).to eq(
+         expect(pt.tokenize).to eq(%w(this is the upper limit))
        end

        it 'handles hashtags 3' do
          text = "The #2016-fun has just begun."
-         pt = PragmaticTokenizer::Tokenizer.new(
-
-
+         pt = PragmaticTokenizer::Tokenizer.new(
+             text,
+             punctuation: 'none',
+             hashtags: :keep_and_clean
          )
-         expect(pt.tokenize).to eq(
+         expect(pt.tokenize).to eq(%w(the 2016 fun has just begun))
        end

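For reference, hashtags: :keep_and_clean outside RSpec, as exercised by the two hashtag examples above — the hashtag text is kept while the leading # and the internal hyphen are stripped:

    require 'pragmatic_tokenizer'

    pt = PragmaticTokenizer::Tokenizer.new(
        "The #2016-fun has just begun.",
        punctuation: 'none',
        hashtags: :keep_and_clean
    )
    pt.tokenize
    # => ["the", "2016", "fun", "has", "just", "begun"]
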
        it 'does not clean mentions' do
          text = "@_someone_ because @someone and @_someone was taken"
-         pt = PragmaticTokenizer::Tokenizer.new(
-
-
+         pt = PragmaticTokenizer::Tokenizer.new(
+             text,
+             mentions: :keep_original,
+             clean: true
          )
          expect(pt.tokenize).to eq(["@_someone_", "because", "@someone", "and", "@_someone", "was", "taken"])
        end

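A sketch of mentions: :keep_original as tested above — @-mentions pass through unchanged even though clean: true is set:

    require 'pragmatic_tokenizer'

    pt = PragmaticTokenizer::Tokenizer.new(
        "@_someone_ because @someone and @_someone was taken",
        mentions: :keep_original,
        clean: true
    )
    pt.tokenize
    # => ["@_someone_", "because", "@someone", "and", "@_someone", "was", "taken"]
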
        it 'removes double single quotes' do
          text = "Strong statement in ''The Day The Earth Caught Fire'' (1961)"
-         pt = PragmaticTokenizer::Tokenizer.new(
-
-
+         pt = PragmaticTokenizer::Tokenizer.new(
+             text,
+             punctuation: :none,
+             clean: true
          )
-         expect(pt.tokenize).to eq(
+         expect(pt.tokenize).to eq(%w(strong statement in the day the earth caught fire 1961))
        end

        it 'removes a hyphen prefix 1' do
          text = "Geopol.-Strategy"
-         pt = PragmaticTokenizer::Tokenizer.new(
-
-
+         pt = PragmaticTokenizer::Tokenizer.new(
+             text,
+             punctuation: :none,
+             clean: true
          )
-         expect(pt.tokenize).to eq(
+         expect(pt.tokenize).to eq(%w(geopol strategy))
        end

        it 'removes a hyphen prefix 2' do
          text = "The language we use creates the reality we experience.-Michael Hyatt #quote"
-         pt = PragmaticTokenizer::Tokenizer.new(
-
-
+         pt = PragmaticTokenizer::Tokenizer.new(
+             text,
+             punctuation: :none,
+             clean: true
          )
          expect(pt.tokenize).to eq(["the", "language", "we", "use", "creates", "the", "reality", "we", "experience", "michael", "hyatt", "#quote"])
        end

        it 'does not remove tokens with ampersands' do
          text = "you&me"
-         pt = PragmaticTokenizer::Tokenizer.new(
-
-
+         pt = PragmaticTokenizer::Tokenizer.new(
+             text,
+             clean: true,
+             punctuation: :none
          )
-         expect(pt.tokenize).to eq(
+         expect(pt.tokenize).to eq(%w(you me))
        end

        it 'cleans percent signs not related to numbers' do
          text = "TudoW%1 provides company users a way to offer each other, and guests, and interpreters%6 free assistance. To date, there have been %2 questions asked."
-         pt = PragmaticTokenizer::Tokenizer.new(
-
-
-
+         pt = PragmaticTokenizer::Tokenizer.new(
+             text,
+             clean: true,
+             numbers: :none,
+             punctuation: :none
          )
-         expect(pt.tokenize).to eq(
+         expect(pt.tokenize).to eq(%w(tudow provides company users a way to offer each other and guests and interpreters free assistance to date there have been questions asked))
        end
      end
    end
@@ -1473,8 +1595,9 @@ describe PragmaticTokenizer do

        it 'handles empty tokens' do
          text = "!!!!! https://t.co/xxxx"
-         pt = PragmaticTokenizer::Tokenizer.new(
-
+         pt = PragmaticTokenizer::Tokenizer.new(
+             text,
+             punctuation: 'none'
          )
          expect(pt.tokenize).to eq(["https://t.co/xxxx"])
        end
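The same call as a plain snippet — once punctuation is removed, a token consisting only of punctuation disappears entirely instead of being left behind as an empty string:

    require 'pragmatic_tokenizer'

    pt = PragmaticTokenizer::Tokenizer.new(
        "!!!!! https://t.co/xxxx",
        punctuation: 'none'
    )
    pt.tokenize
    # => ["https://t.co/xxxx"]
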
@@ -1532,4 +1655,4 @@ describe PragmaticTokenizer do
      end
    end
  end
-end
+end