pragmatic_tokenizer 1.3.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
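The new file below is the English spec suite added in 1.4.0. As orientation only (not part of the diff), the API these specs exercise follows one pattern: build a tokenizer around the input text with optional keyword arguments, then call #tokenize to get an array of string tokens. A minimal sketch, using only calls that appear in the specs themselves:

    require 'pragmatic_tokenizer'

    # Options are keyword arguments; #tokenize returns an array of strings.
    # Tokens are downcased by default; punctuation: :none drops the final ".".
    pt = PragmaticTokenizer::Tokenizer.new("Hello world.", punctuation: :none)
    pt.tokenize # => ["hello", "world"]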
@@ -0,0 +1,1535 @@
+require 'spec_helper'
+
+describe PragmaticTokenizer do
+  context 'Language: English (en)' do
+    context '#tokenize (example strings)' do
+
+      context 'no options selected' do
+        it 'tokenizes a string #001' do
+          text = "Hello world."
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["hello", "world", "."])
+        end
+
+        it 'tokenizes a string #002' do
+          text = "Hello Dr. Death."
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["hello", "dr.", "death", "."])
+        end
+
+        it 'tokenizes a string #003' do
+          text = "Hello ____________________ ."
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["hello", "____________________", "."])
+        end
+
+        it 'tokenizes a string #004' do
+          text = "It has a state-of-the-art design."
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["it", "has", "a", "state-of-the-art", "design", "."])
+        end
+
+        it 'tokenizes a string #005' do
+          text = "Jan. 2015 was 20% colder than now. But not in inter- and outer-space."
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["jan.", "2015", "was", "20%", "colder", "than", "now", ".", "but", "not", "in", "inter", "-", "and", "outer-space", "."])
+        end
+
+        it 'tokenizes a string #006' do
+          text = 'Go to http://www.example.com.'
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["go", "to", "http://www.example.com", "."])
+        end
+
+        it 'tokenizes a string #007' do
+          text = 'One of the lawyers from ‚Making a Murderer’ admitted a mistake'
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["one", "of", "the", "lawyers", "from", "‚", "making", "a", "murderer", "’", "admitted", "a", "mistake"])
+        end
+
+        it 'tokenizes a string #008' do
+          text = "One of the lawyers from 'Making a Murderer' admitted a mistake"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["one", "of", "the", "lawyers", "from", "'", "making", "a", "murderer", "'", "admitted", "a", "mistake"])
+        end
+
+        it 'tokenizes a string #009' do
+          text = "hello ;-) yes"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["hello", ";", "-", ")", "yes"])
+        end
+
+        it 'tokenizes a string #010' do
+          text = "hello ;)"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["hello", ";", ")"])
+        end
+
+        it 'tokenizes a string #011' do
+          text = "area <0.8 cm2"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["area", "<0.8", "cm2"])
+        end
+
+        it 'tokenizes a string #012' do
+          text = "area <0.8 cm2"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["area", "<0.8", "cm2"])
+        end
+
+        it 'tokenizes a string #013' do
+          text = "the “Star-Trek“-Inventor"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["the", "“", "star-trek", "“", "-", "inventor"])
+        end
+
+        it 'tokenizes a string #014' do
+          text = "#ab-cd"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["#ab-cd"])
+        end
+
+        it 'handles numbers with symbols 2' do
+          text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals", "!"])
+        end
+
+        it 'handles numbers with symbols 3' do
+          text = "Hello, that will be $5 dollars. You can pay at 5:00, after it is 500."
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["hello", ",", "that", "will", "be", "$5", "dollars", ".", "you", "can", "pay", "at", "5:00", ",", "after", "it", "is", "500", "."])
+        end
+
+        it 'splits at a comma' do
+          text = "16.1. day one,17.2. day two"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["16.1", ".", "day", "one", ",", "17.2", ".", "day", "two"])
+        end
+
+        it 'identifies single quotes' do
+          text = "Sean Penn Sat for Secret Interview With ‘El Chapo,’ Mexican Drug"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["sean", "penn", "sat", "for", "secret", "interview", "with", "‘", "el", "chapo", ",", "’", "mexican", "drug"])
+        end
+
+        it 'identifies prefixed symbols' do
+          text = "look:the sky is blue"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["look", ":", "the", "sky", "is", "blue"])
+        end
+
+        it 'identifies hashtags with numbers too' do
+          text = "this is a sentence.#yay this too.#withnumbers123"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["this", "is", "a", "sentence", ".", "#yay", "this", "too", ".", "#withnumbers123"])
+        end
+
+        it 'splits emojis' do
+          text = "🤔🙄"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["🤔", "🙄"])
+        end
+
+        it 'handles snowflakes 1' do
+          text = "❄️❄️❄️"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["❄️", "❄️", "❄️"])
+        end
+
+        it 'handles snowflakes 2' do
+          text = "\u2744\uFE0E\u2744\uFE0E\u2744\uFE0E"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["❄︎", "❄︎", "❄︎"])
+        end
+
+        it 'handles snowflakes 3' do
+          text = "\u2744\u2744\u2744"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["\u2744", "\u2744", "\u2744"])
+        end
+
+        it 'separates tokens' do
+          text = "football≠soccer"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["football", "≠", "soccer"])
+        end
+
+        it 'deals with missing whitespaces' do
+          text = "this is sentence one!this is sentence two.@someone"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["this", "is", "sentence", "one", "!", "this", "is", "sentence", "two", ".", "@someone"])
+        end
+
+        it 'handles weird apostrophes' do
+          text = [116, 104, 101, 114, 101, 32, 769, 115, 32, 115, 111, 109, 101, 116, 104, 105, 110, 103].pack("U*")
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["there`s", "something"])
+        end
+
+        it 'treats abbreviations always the same' do
+          text = "U.S.A. U.S.A. U.S.A."
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(
+            ["u.s.a.", "u.s.a.", "u.s.a."]
+          )
+        end
+      end
+
+      context 'user-supplied abbreviations' do
+        it 'tokenizes a regular string with an abbreviation' do
+          text = "Mr. Smith, hello world."
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["mr.", "smith", ",", "hello", "world", "."])
+        end
+
+        it 'fails to recognize an English abbreviation if the user supplies an abbreviations array without it' do
+          text = "Mr. Smith, hello world."
+          abbreviations = ['mrs']
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            abbreviations: abbreviations
+          )
+          expect(pt.tokenize).to eq(["mr", ".", "smith", ",", "hello", "world", "."])
+        end
+
+        it 'recognizes a user-supplied abbreviation' do
+          text = "thisisnotanormalabbreviation. hello world."
+          abbreviations = ['thisisnotanormalabbreviation']
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            abbreviations: abbreviations
+          )
+          expect(pt.tokenize).to eq(["thisisnotanormalabbreviation.", "hello", "world", "."])
+        end
+
+        it 'handles an empty user-supplied abbreviation array' do
+          text = "thisisnotanormalabbreviation. hello world."
+          abbreviations = []
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            abbreviations: abbreviations
+          )
+          expect(pt.tokenize).to eq(["thisisnotanormalabbreviation", ".", "hello", "world", "."])
+        end
+
+        it 'handles abbreviations across multiple languages' do
+          text = "Mr. Smith how are ü. today."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            filter_languages: [:en, :de]
+          )
+          expect(pt.tokenize).to eq(["mr.", "smith", "how", "are", "ü.", "today", "."])
+        end
+
+        it 'handles abbreviations across multiple languages and user-supplied abbreviations' do
+          text = "Adj. Smith how are ü. today. thisisnotanormalabbreviation. is it?"
+          abbreviations = ['thisisnotanormalabbreviation']
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            filter_languages: [:en, :de],
+            abbreviations: abbreviations
+          )
+          expect(pt.tokenize).to eq(["adj.", "smith", "how", "are", "ü.", "today", ".", "thisisnotanormalabbreviation.", "is", "it", "?"])
+        end
+      end
+
+      context 'option (expand_contractions)' do
+        it 'does not expand the contractions' do
+          # https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en
+          text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(['"', 'i', 'said', ',', "'", "what're", 'you', '?', 'crazy', '?', "'", '"', 'said', 'sandowsky', '.', '"', 'i', "can't", 'afford', 'to', 'do', 'that', '.', '"'])
+        end
+
+        it 'expands user-supplied contractions' do
+          text = "Hello supa'soo guy."
+          contractions = { "supa'soo" => "super smooth" }
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            contractions: contractions,
+            expand_contractions: true
+          )
+          expect(pt.tokenize).to eq(["hello", "super", "smooth", "guy", "."])
+        end
+
+        it 'does not expand user-supplied contractions' do
+          text = "Hello supa'soo guy."
+          contractions = { "supa'soo" => "super smooth" }
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            contractions: contractions,
+            expand_contractions: false
+          )
+          expect(pt.tokenize).to eq(["hello", "supa'soo", "guy", "."])
+        end
+
+        it 'expands user-supplied contractions and language contractions' do
+          text = "Hello supa'soo guy. auf's wasn't it?"
+          contractions = { "supa'soo" => "super smooth" }
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            contractions: contractions,
+            expand_contractions: true,
+            filter_languages: [:en, :de]
+          )
+          expect(pt.tokenize).to eq(["hello", "super", "smooth", "guy", ".", "auf", "das", "was", "not", "it", "?"])
+        end
+
+        it 'expands language contractions' do
+          text = "Hello supa'soo guy. auf's wasn't it?"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            expand_contractions: true,
+            filter_languages: [:en, :de]
+          )
+          expect(pt.tokenize).to eq(["hello", "supa'soo", "guy", ".", "auf", "das", "was", "not", "it", "?"])
+        end
+
+        it 'tokenizes a string #001' do
+          # https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en
+          text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            expand_contractions: true
+          )
+          expect(pt.tokenize).to eq(['"', 'i', 'said', ',', "'", 'what', 'are', 'you', '?', 'crazy', '?', "'", '"', 'said', 'sandowsky', '.', '"', 'i', 'cannot', 'afford', 'to', 'do', 'that', '.', '"'])
+        end
+
+        it 'tokenizes a string #002' do
+          # http://nlp.stanford.edu/software/tokenizer.shtml
+          text = "\"Oh, no,\" she's saying, \"our $400 blender can't handle something this hard!\""
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            expand_contractions: true
+          )
+          expect(pt.tokenize).to eq(['"', 'oh', ',', 'no', ',', '"', 'she', 'is', 'saying', ',', '"', 'our', '$400', 'blender', 'cannot', 'handle', 'something', 'this', 'hard', '!', '"'])
+        end
+
+        it 'tokenizes a string #003' do
+          text = "Look for his/her account."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            expand_contractions: true
+          )
+          expect(pt.tokenize).to eq(["look", "for", "his", "her", "account", "."])
+        end
+
+        it 'tokenizes a string #004' do
+          text = "I like apples and/or oranges."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            expand_contractions: true
+          )
+          expect(pt.tokenize).to eq(["i", "like", "apples", "and", "or", "oranges", "."])
+        end
+      end
+
+      context 'option (emojis)' do
+        it 'removes emoji' do
+          text = "Return the emoji 👿😍😱🐔🌚. 🌚"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            remove_emoji: true
+          )
+          expect(pt.tokenize).to eq(["return", "the", "emoji", "."])
+        end
+
+        it 'does not remove emoji' do
+          text = "Return the emoji 👿😍😱🐔🌚. 🌚"
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["return", "the", "emoji", "👿", "😍", "😱", "🐔", "🌚", ".", "🌚"])
+        end
+
+        it 'removes snowflakes 1' do
+          text = "hello❄️❄️❄️"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            remove_emoji: true
+          )
+          expect(pt.tokenize).to eq(["hello"])
+        end
+
+        it 'removes snowflakes 2' do
+          text = "hello\u2744\uFE0E\u2744\uFE0E\u2744\uFE0E"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            remove_emoji: true
+          )
+          expect(pt.tokenize).to eq(["hello"])
+        end
+
+        it 'removes snowflakes 3' do
+          text = "hello\u2744\u2744\u2744"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            remove_emoji: true
+          )
+          expect(pt.tokenize).to eq(["hello"])
+        end
+      end
+
+      context 'option (hashtags)' do
+        it 'tokenizes a string #001' do
+          text = "This is a #hashtag yay!"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            hashtags: :remove
+          )
+          expect(pt.tokenize).to eq(["this", "is", "a", "yay", "!"])
+        end
+
+        it 'tokenizes a string #002' do
+          text = "This is a #hashtag yay!"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            hashtags: :keep_and_clean
+          )
+          expect(pt.tokenize).to eq(["this", "is", "a", "hashtag", "yay", "!"])
+        end
+
+        it 'tokenizes a string #003' do
+          text = "This is a #hashtag yay!"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            hashtags: :keep_original
+          )
+          expect(pt.tokenize).to eq(["this", "is", "a", "#hashtag", "yay", "!"])
+        end
+      end
+
+      context 'option (mentions)' do
+        it 'tokenizes a string #001' do
+          text = "This is a @mention @mention2 yay!"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            mentions: :remove
+          )
+          expect(pt.tokenize).to eq(["this", "is", "a", "yay", "!"])
+        end
+
+        it 'tokenizes a string #002' do
+          text = "This is a @mention @mention2 yay!"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            mentions: :keep_and_clean
+          )
+          expect(pt.tokenize).to eq(["this", "is", "a", "mention", "mention2", "yay", "!"])
+        end
+
+        it 'tokenizes a string #003' do
+          text = "This is a @mention @mention2 yay!"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            mentions: :keep_original
+          )
+          expect(pt.tokenize).to eq(["this", "is", "a", "@mention", "@mention2", "yay", "!"])
+        end
+      end
+
+      context 'option (email addresses)' do
+        it 'tokenizes a string #001' do
+          text = "Here are some emails jon@hotmail.com ben123@gmail.com."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            remove_emails: true
+          )
+          expect(pt.tokenize).to eq(["here", "are", "some", "emails", "."])
+        end
+
+        it 'tokenizes a string #002' do
+          text = "Here are some emails jon@hotmail.com ben123@gmail.com."
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["here", "are", "some", "emails", "jon@hotmail.com", "ben123@gmail.com", "."])
+        end
+
+        it 'knows what is not an email address' do
+          text = "the great cook.@someone something else@whoever"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            remove_emails: true
+          )
+          expect(pt.tokenize).to eq(["the", "great", "cook", ".", "@someone", "something", "else@whoever"])
+        end
+      end
+
+      context 'option (urls)' do
+        it 'tokenizes a string #001' do
+          text = "Here are some domains and urls google.com https://www.google.com www.google.com."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            remove_urls: true
+          )
+          expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "www.google.com", "."])
+        end
+
+        it 'tokenizes a string #002' do
+          text = "Here are some domains and urls google.com https://www.google.com www.google.com."
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "https://www.google.com", "www.google.com", "."])
+        end
+      end
+
+      context 'option (domains)' do
+        it 'tokenizes a string #001' do
+          text = "Here are some domains and urls google.com https://www.google.com www.google.com."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            remove_domains: true
+          )
+          expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "https://www.google.com", "."])
+        end
+
+        it 'tokenizes a string #002' do
+          text = "Here are some domains and urls google.com https://www.google.com www.google.com."
+          pt = PragmaticTokenizer::Tokenizer.new(text)
+          expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "https://www.google.com", "www.google.com", "."])
+        end
+
+        it 'knows what is not a domain 1' do
+          skip "NOT IMPLEMENTED"
+          text = "this is a sentence.and no domain."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            remove_domains: true
+          )
+          expect(pt.tokenize).to eq(["this", "is", "a", "sentence", ".", "and", "no", "domain", "."])
+        end
+
+        it 'knows what is not a domain 2' do
+          text = "former president g.w.bush was..."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            remove_domains: true
+          )
+          expect(pt.tokenize).to eq(["former", "president", "g.w.bush", "was", "..."])
+        end
+
+        it 'knows what is not a domain 3' do
+          text = "2.something-times"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            remove_domains: true
+          )
+          expect(pt.tokenize).to eq(["2.something-times"])
+        end
+      end
+
+      context 'option (long_word_split)' do
+        it 'tokenizes a string #001' do
+          text = "Some main-categories of the mathematics-test have sub-examples that most 14-year olds can't answer, therefor the implementation-instruction made in the 1990-years needs to be revised."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            long_word_split: 10
+          )
+          expect(pt.tokenize).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14-year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990-years", "needs", "to", "be", "revised", "."])
+        end
+
+        it 'tokenizes a string #002' do
+          text = "Some main-categories of the mathematics-test have sub-examples that most 14-year olds can't answer, therefor the implementation-instruction made in the 1990-years needs to be revised."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            long_word_split: 4
+          )
+          expect(pt.tokenize).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
+        end
+      end
+
+      context 'option (clean)' do
+        it 'tokenizes a string #001' do
+          text = "Hello ---------------."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["hello", "."])
+        end
+
+        it 'tokenizes a string #002' do
+          text = "Hello ____________________ ."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["hello", "."])
+        end
+
+        it 'tokenizes a string #003' do
+          text = "© ABC Company 1994"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["abc", "company", "1994"])
+        end
+
+        it 'tokenizes a string #004' do
+          text = "This sentence has a long string of dots ......................."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["this", "sentence", "has", "a", "long", "string", "of", "dots"])
+        end
+
+        it 'tokenizes a string #005' do
+          text = "cnn.com mentions this *funny* #hashtag used by @obama http://cnn.com/something"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["cnn.com", "mentions", "this", "funny", "#hashtag", "used", "by", "@obama", "http://cnn.com/something"])
+        end
+
+        it 'does not remove a valid hashtag' do
+          text = "This #sentence has a long string of dots ......................."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["this", "#sentence", "has", "a", "long", "string", "of", "dots"])
+        end
+
+        it 'does not remove a valid mention' do
+          text = "This @sentence has a long string of dots ......................."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["this", "@sentence", "has", "a", "long", "string", "of", "dots"])
+        end
+
+        it 'cleans words with symbols 1' do
+          text = "something.com:article title !!wow look!!1"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["something.com", "article", "title", "wow", "look"])
+        end
+
+        it 'cleans words with symbols 2' do
+          text = "something.com:article title !!wow look!!1!1!11!"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["something.com", "article", "title", "wow", "look"])
+        end
+
+        it 'identifies prefixed symbols' do
+          text = "look:the sky is blue"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["look", "the", "sky", "is", "blue"])
+        end
+
+        it 'keeps numbers at the end of mentions and hashtags' do
+          text = "#le1101 #artistQ21 @someone12 @someoneelse1 and @somebody1980"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["#le1101", "#artistq21", "@someone12", "@someoneelse1", "and", "@somebody1980"])
+        end
+
+        it 'cleans a prefixed weird hyphen' do
+          text = [104, 105, 103, 104, 32, 173, 116, 101, 109, 112, 101, 114, 97, 116, 117, 114, 101, 32, 97, 110, 100, 32, 173, 119, 105, 110, 100].pack("U*")
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["high", "temperature", "and", "wind"])
+        end
+
+        it 'cleans (r) and (c) and (tm)' do
+          text = "the oscar® night ©companyname is a trademark™"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["the", "oscar", "night", "companyname", "is", "a", "trademark"])
+        end
+
+        it 'cleans letters in boxes 1' do
+          text = "making🇦🇹postcards"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["making", "postcards"])
+        end
+
+        it 'removes colons' do
+          text = "At 19:30 o'clock: Mad Max: Fury Road"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["at", "19:30", "o'clock", "mad", "max", "fury", "road"])
+        end
+
+        it 'removes a hyphen prefix 3' do
+          text = "women's clothes and –shoes needed"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["women's", "clothes", "and", "shoes", "needed"])
+        end
+
+        it 'does not remove tokens with ampersands' do
+          text = "you&amp;me"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["you", "&", "me"])
+        end
+      end
+
+      context 'option (classic_filter)' do
+        it 'tokenizes a string #001' do
+          # http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
+          text = "I.B.M. cat's can't"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            classic_filter: true
+          )
+          expect(pt.tokenize).to eq(["ibm", "cat", "can't"])
+        end
+
+        it 'tokenizes a string #002' do
+          # http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
+          text = "St.Veit, which usually would be written St. Veit was not visited by B.Obama reported CNN.com"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            classic_filter: true
+          )
+          expect(pt.tokenize).to eq(["st.veit", ",", "which", "usually", "would", "be", "written", "st", "veit", "was", "not", "visited", "by", "b.obama", "reported", "cnn.com"])
+        end
+
+        it 'optimizes the classic filter 1' do
+          text = "therés something"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            classic_filter: true
+          )
+          expect(pt.tokenize).to eq(["there", "something"])
+        end
+
+        it 'optimizes the classic filter 2' do
+          text = [116, 104, 101, 114, 101, 32, 769, 115, 32, 115, 111, 109, 101, 116, 104, 105, 110, 103].pack("U*")
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            classic_filter: true
+          )
+          expect(pt.tokenize).to eq(["there", "something"])
+        end
+      end
+
+      context 'option (language)' do
+        it 'tokenizes a string #001' do
+          text = "Hello Ms. Piggy, this is John. We are selling a new fridge for $5,000. That is a 20% discount over the Nev. retailers. It is a 'MUST BUY', so don't hesistate."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            language: 'en'
+          )
+          expect(pt.tokenize).to eq(["hello", "ms.", "piggy", ",", "this", "is", "john", ".", "we", "are", "selling", "a", "new", "fridge", "for", "$5,000", ".", "that", "is", "a", "20%", "discount", "over", "the", "nev.", "retailers", ".", "it", "is", "a", "'", "must", "buy", "'", ",", "so", "don't", "hesistate", "."])
+        end
+
+        it 'tokenizes a string #002' do
+          text = "Lisa Raines, a lawyer and director of government relations
+            for the Industrial Biotechnical Association, contends that a judge
+            well-versed in patent law and the concerns of research-based industries
+            would have ruled otherwise. And Judge Newman, a former patent lawyer,
+            wrote in her dissent when the court denied a motion for a rehearing of
+            the case by the full court, \'The panel's judicial legislation has
+            affected an important high-technological industry, without regard
+            to the consequences for research and innovation or the public interest.\'
+            Says Ms. Raines, \'[The judgement] confirms our concern that the absence of
+            patent lawyers on the court could prove troublesome.\'"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            language: 'en'
+          )
+          expect(pt.tokenize).to eq(['lisa', 'raines', ',', 'a', 'lawyer', 'and', 'director', 'of', 'government', 'relations', 'for', 'the', 'industrial', 'biotechnical', 'association', ',', 'contends', 'that', 'a', 'judge', 'well-versed', 'in', 'patent', 'law', 'and', 'the', 'concerns', 'of', 'research-based', 'industries', 'would', 'have', 'ruled', 'otherwise', '.', 'and', 'judge', 'newman', ',', 'a', 'former', 'patent', 'lawyer', ',', 'wrote', 'in', 'her', 'dissent', 'when', 'the', 'court', 'denied', 'a', 'motion', 'for', 'a', 'rehearing', 'of', 'the', 'case', 'by', 'the', 'full', 'court', ',', "\'", 'the', "panel's", 'judicial', 'legislation', 'has', 'affected', 'an', 'important', 'high-technological', 'industry', ',', 'without', 'regard', 'to', 'the', 'consequences', 'for', 'research', 'and', 'innovation', 'or', 'the', 'public', 'interest', '.', '\'', 'says', 'ms.', 'raines', ',', '\'', '[', 'the', 'judgement', ']', 'confirms', 'our', 'concern', 'that', 'the', 'absence', 'of', 'patent', 'lawyers', 'on', 'the', 'court', 'could', 'prove', 'troublesome', '.', "\'"])
+        end
+      end
+
+      context 'option (numbers)' do
+        it 'tokenizes a string #001' do
+          text = "Hello, that will be $5 dollars. You can pay at 5:00, after it is 500."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            numbers: :all
+          )
+          expect(pt.tokenize).to eq(["hello", ",", "that", "will", "be", "$5", "dollars", ".", "you", "can", "pay", "at", "5:00", ",", "after", "it", "is", "500", "."])
+        end
+
+        it 'tokenizes a string #002' do
+          text = "Hello, that will be $5 dollars. You can pay at 5:00, after it is 500."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            numbers: :none
+          )
+          expect(pt.tokenize).to eq(["hello", ",", "that", "will", "be", "dollars", ".", "you", "can", "pay", "at", ",", "after", "it", "is", "."])
+        end
+
+        it 'tokenizes a string #003' do
+          text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            numbers: :semi
+          )
+          expect(pt.tokenize).to eq(["2pac", "u2", "50cent", "blink-182", "$500", "zero7", "m83", "b-52s"])
+        end
+
+        it 'tokenizes a string #004' do
+          text = "2pac U2 50cent blink-182 zero7 M83 B-52s 500 Hello"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            numbers: :only
+          )
+          expect(pt.tokenize).to eq(["2pac", "u2", "50cent", "blink-182", "zero7", "m83", "b-52s", "500"])
+        end
+
+        it 'tokenizes a string #005' do
+          text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            numbers: :none
+          )
+          expect(pt.tokenize).to eq([])
+        end
+
+        it 'tokenizes a string #006' do
+          text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500 number iv VI"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            numbers: :none
+          )
+          expect(pt.tokenize).to eq(["number"])
+        end
+
+        it 'tokenizes a string #007' do
+          text = "Remove III Roman Numerals and IX. with a period."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            numbers: :none
+          )
+          expect(pt.tokenize).to eq(["remove", "roman", "numerals", "and", ".", "with", "a", "period", "."])
+        end
+      end
+
+      context 'option (minimum_length)' do
+        it 'tokenizes a string #001' do
+          text = "Let's test the minimum length of fiver."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            minimum_length: 5
+          )
+          expect(pt.tokenize).to eq(["let's", "minimum", "length", "fiver"])
+        end
+      end
+
+      context 'option (punctuation)' do
+        it 'tokenizes a string #001' do
+          text = "kath. / evang"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["kath", "evang"])
+        end
+
+        it 'tokenizes a string #002' do
+          text = "derStandard.at › Sport"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["derstandard.at", "sport"])
+        end
+
+        it 'tokenizes a string #003' do
+          text = "hello ^^"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["hello"])
+        end
+
+        it 'tokenizes a string #004' do
+          text = "This hyphen – is not...or is it? ... It's a - dash... And a horizontal ellipsis…"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["this", "hyphen", "is", "not", "or", "is", "it", "it's", "a", "dash", "and", "a", "horizontal", "ellipsis"])
+        end
+
+        it 'tokenizes a string #005' do
+          text = "A sentence. One with two dots.. And with three... Or horizontal ellipsis… which are three dots too."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["a", "sentence", "one", "with", "two", "dots", "and", "with", "three", "or", "horizontal", "ellipsis", "which", "are", "three", "dots", "too"])
+        end
+
+        it 'tokenizes a string #006' do
+          text = "+++ BREAKING +++ something happened; is it interesting?"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["breaking", "something", "happened", "is", "it", "interesting"])
+        end
+
+        it 'tokenizes a string #007' do
+          text = "Some *interesting stuff* is __happening here__"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["some", "*interesting", "stuff*", "is", "__happening", "here__"])
+        end
+
+        it 'tokenizes a string #008' do
+          text = "Hello; what is your: name @username **delete**"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["hello", "what", "is", "your", "name", "@username", "**delete**"])
+        end
+
+        it 'tokenizes a string #009' do
+          text = "hello ;-) yes"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: :none
+          )
+          expect(pt.tokenize).to eq(["hello", "yes"])
+        end
+
+        it 'tokenizes a string #010' do
+          text = "hello ;)"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["hello"])
+        end
+
+        it 'tokenizes a string #011' do
+          text = "Hello ____________________ ."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: :none
+          )
+          expect(pt.tokenize).to eq(["hello"])
+        end
+
+        it 'handles non-domain words with a dot 1' do
+          text = "They were being helped.This is solidarity."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["they", "were", "being", "helped", "this", "is", "solidarity"])
+        end
+
+        it 'handles non-domain words with a dot 2' do
+          text = "picture was taken in sept.2015"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["picture", "was", "taken", "in", "sept.", "2015"])
+        end
+
+        it 'handles non-domain words with a dot 3' do
+          text = "They were being helped.This is solidarity. See the breaking news stories about X on cnn.com/europe and english.alarabiya.net, here’s a screenshot: https://t.co/s83k28f29d31s83"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["they", "were", "being", "helped", "this", "is", "solidarity", "see", "the", "breaking", "news", "stories", "about", "x", "on", "cnn.com", "europe", "and", "english.alarabiya.net", "here’s", "a", "screenshot", "https://t.co/s83k28f29d31s83"])
+        end
+
+        it 'handles numbers with symbols 1' do
+          text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals"])
+        end
+
+        it 'handles numbers with symbols 2' do
+          text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals"])
+        end
+
+        it 'handles apostrophes and quotes' do
+          text = "“Data Visualization: How to Tell Stories with Data — Jeff Korhan” by @AINewsletter"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["data", "visualization", "how", "to", "tell", "stories", "with", "data", "jeff", "korhan", "by", "@ainewsletter"])
+        end
+
+        it 'handles mentions' do
+          text = ".@someone I disagree"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["@someone", "i", "disagree"])
+        end
+
+        it 'handles old school emoticons 2' do
+          text = "oooh! <3"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["oooh", "<3"])
+        end
+
+        it 'handles old school emoticons 3' do
+          text = "@someone &lt;33"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["@someone", "<33"])
+        end
+
+        it 'handles words with a symbol prefix 1' do
+          text = "Yes! /cc @someone"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["yes", "cc", "@someone"])
+        end
+
+        it 'handles words with an emoji suffix' do
+          text = "Let's meet there.😝 ok?"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["let's", "meet", "there", "😝", "ok"])
+        end
+
+        it 'handles words with a symbol prefix 2' do
+          text = "blah blah |photo by @someone"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["blah", "blah", "photo", "by", "@someone"])
+        end
+
+        it 'handles pseudo-contractions' do
+          text = "I suggest to buy stocks that are low value+have momentum"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["i", "suggest", "to", "buy", "stocks", "that", "are", "low", "value", "have", "momentum"])
+        end
+
+        it 'handles apostrophes and quotes 1' do
+          text = "Watch the video of @amandapalmer's song “Killing Type” here"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["watch", "the", "video", "of", "@amandapalmer's", "song", "killing", "type", "here"])
+        end
+
+        it 'handles apostrophes and quotes 2' do
+          text = "Watch the video of @amandapalmer`s song “Killing Type” here"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["watch", "the", "video", "of", "@amandapalmer`s", "song", "killing", "type", "here"])
+        end
+
+        it 'handles numbers suffixed with a symbol' do
+          text = "4 Things Marketers Must Do Better in 2016: blah"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["4", "things", "marketers", "must", "do", "better", "in", "2016", "blah"])
+        end
+
+        it 'handles words with an emoticon suffix' do
+          skip "NOT IMPLEMENTED"
+          text = "look, a dog with shoes☺ !!"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["look", "a", "dog", "with", "shoes", "☺"])
+        end
+
+        it 'handles emoji 1' do
+          text = "How bad!😝"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["how", "bad", "😝"])
+        end
+
+        it 'handles emoji 2' do
+          text = "😝How bad!"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["😝", "how", "bad"])
+        end
+
+        it 'identifies old school emoticons' do
+          skip "NOT IMPLEMENTED"
+          text = 'looking forward to the new kodak super8 camera \o/'
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["looking", "forward", "to", "the", "new", "kodak", "super8", "camera", '\o/'])
+        end
+
+        it 'splits at hashtags' do
+          text = "some sentence#RT ... i like u2.#bono"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: :none
+          )
+          expect(pt.tokenize).to eq(["some", "sentence", "#rt", "i", "like", "u2", "#bono"])
+        end
+      end
+
+      context 'option (remove_stop_words)' do
+        it 'removes stop words' do
+          text = 'This is a short sentence with explanations and stop words.'
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            language: 'en',
+            remove_stop_words: true
+          )
+          expect(pt.tokenize).to eq(["short", "sentence", "explanations", "."])
+        end
+
+        it 'removes user-supplied stop words' do
+          text = 'This is a short sentence with explanations and stop words.'
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            language: 'en',
+            remove_stop_words: true,
+            stop_words: ["and", "a"]
+          )
+          expect(pt.tokenize).to eq(["this", "is", "short", "sentence", "with", "explanations", "stop", "words", "."])
+        end
+
+        it 'removes user-supplied stop words and default stop words' do
+          text = 'This is a short sentence with explanations and stop words.'
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            language: 'en',
+            remove_stop_words: true,
+            stop_words: ["sentence"],
+            filter_languages: [:en]
+          )
+          expect(pt.tokenize).to eq(["short", "explanations", "."])
+        end
+
+        it 'removes user-supplied stop words and default stop words across multiple languages' do
+          text = 'This is a short sentence with explanations and stop words. And achte German words.'
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            language: 'en',
+            remove_stop_words: true,
+            stop_words: ["sentence"],
+            filter_languages: [:en, :de]
+          )
+          expect(pt.tokenize).to eq(["short", "explanations", ".", "german", "."])
+        end
+      end
+
+      context 'multiple options selected' do
+        it 'tokenizes a string #001' do
+          text = 'His name is Mr. Smith.'
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            language: 'en',
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(['his', 'name', 'is', 'mr.', 'smith'])
+        end
+
+        it 'tokenizes a string #002' do
+          text = "Hello Ms. Piggy, this is John. We are selling a new fridge for $5,000. That is a 20% discount over the Nev. retailers. It is a 'MUST BUY', so don't hesistate."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            language: 'en',
+            punctuation: 'only'
+          )
+          expect(pt.tokenize).to eq([",", ".", ".", ".", "'", "'", ",", "."])
+        end
+
+        it 'tokenizes a string #003' do
+          text = "Hello the a it experiment one fine."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            language: 'en',
+            remove_stop_words: true
+          )
+          expect(pt.tokenize).to eq(["experiment", "fine", "."])
+        end
+
+        it 'tokenizes a string #004' do
+          # https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en
+          text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            expand_contractions: true,
+            remove_stop_words: true,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["crazy", "sandowsky", "afford"])
+        end
+
+        it 'tokenizes a string #005' do
+          text = "Hello world with a stop word experiment."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            language: 'en',
+            clean: true,
+            numbers: :none,
+            minimum_length: 3,
+            expand_contractions: true,
+            remove_stop_words: true,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["experiment"])
+        end
+
+        it 'tokenizes a string #006' do
+          text = "Hello; what is your: name @username **delete**"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true,
+            punctuation: 'none'
+          )
+          expect(pt.tokenize).to eq(["hello", "what", "is", "your", "name", "@username", "delete"])
+        end
+
+        it 'tokenizes a string #007' do
+          text = 'His name is Mr. Smith.'
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            language: 'en',
+            punctuation: 'none',
+            downcase: false
+          )
+          expect(pt.tokenize).to eq(['His', 'name', 'is', 'Mr.', 'Smith'])
+        end
+
+        it 'tokenizes a string #008' do
+          text = "Can't go tonight. Didn't finish."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            downcase: false,
+            expand_contractions: true
+          )
+          expect(pt.tokenize).to eq(["Cannot", "go", "tonight", ".", "Did", "not", "finish", "."])
+        end
+
+        it 'tokenizes a string #009' do
+          text = "Some *interesting stuff* is __happening here__"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none',
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["some", "interesting", "stuff", "is", "happening", "here"])
+        end
+
+        it 'also allows symbols for options' do
+          text = 'His name is Mr. Smith.'
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            language: :en,
+            punctuation: :none
+          )
+          expect(pt.tokenize).to eq(['his', 'name', 'is', 'mr.', 'smith'])
+        end
+
+        it 'handles long strings 1' do
+          text = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            language: 'en',
+            clean: true,
+            minimum_length: 3,
+            expand_contractions: true,
+            remove_stop_words: true,
+            numbers: :none,
+            punctuation: :none
+          )
+          expect(pt.tokenize).to eq(["jonas", "jonas", "smith", "jonas", "smith", "turn", "jane", "party", "closed", "deal", "pitt", "briggs", "noon", "jane", "closed", "deal", "pitt", "briggs", "closed", "yesterday", "mt.", "fuji", "st.", "michael's", "church", "st.", "light", "jfk", "jr.", "book", "visited", "u.s.a.", "year", "live", "e.u.", "live", "u.s.", "work", "u.s.", "government", "virginia", "lived", "u.s.", "years", "bag", "bag", "teaches", "science", "worked", "years", "engineer", "local", "university", "email", "jane.doe@example.com", "email", "site", "check", "turned", "great", "turned", "great", "turned", "great", "held", "book", "long", "time", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "list", "item", "sentence", "cut", "middle", "pdf", "cold", "night", "city", "features", "contact", "manager", "events", "activities", "treasure", "works", "yahoo", "accounting", "department", "good", "team", "albert", "jones", "yesterday", "thoreau", "argues", "simplifying", "one’s", "life", "laws", "universe", "complex", "bohr", "analogy", "parallel", "stairways", "smith", "left", "sentence", "omission", "ellipsis", "marks", "preceded", "space", "sentence", "period", "sentence", "meant", "left", "store", "habit", "weakened", "combining", "self-interpreting", "compounds", "practice", "abandoned"])
+        end
+
+        it 'handles long strings 2' do
+          text = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 10
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            language: 'en',
+            clean: true,
+            minimum_length: 3,
+            expand_contractions: true,
+            remove_stop_words: true,
+            numbers: :none,
+            punctuation: :none
+          )
+          expect(pt.tokenize).to eq(["jonas", "jonas", "smith", "jonas", "smith", "turn", "jane", "party", "closed", "deal", "pitt", "briggs", "noon", "jane", "closed", "deal", "pitt", "briggs", "closed", "yesterday", "mt.", "fuji", "st.", "michael's", "church", "st.", "light", "jfk", "jr.", "book", "visited", "u.s.a.", "year", "live", "e.u.", "live", "u.s.", "work", "u.s.", "government", "virginia", "lived", "u.s.", "years", "bag", "bag", "teaches", "science", "worked", "years", "engineer", "local", "university", "email", "jane.doe@example.com", "email", "site", "check", "turned", "great", "turned", "great", "turned", "great", "held", "book", "long", "time", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "list", "item", "sentence", "cut", "middle", "pdf", "cold", "night", "city", "features", "contact", "manager", "events", "activities", "treasure", "works", "yahoo", "accounting", "department", "good", "team", "albert", "jones", "yesterday", "thoreau", "argues", "simplifying", "one’s", "life", "laws", "universe", "complex", "bohr", "analogy", "parallel", "stairways", "smith", "left", "sentence", "omission", "ellipsis", "marks", "preceded", "space", "sentence", "period", "sentence", "meant", "left", "store", "habit", "weakened", "combining", "self-interpreting", "compounds", "practice", "abandoned"] * 10)
+        end
+
+        it 'handles markdown' do
+          text = "This is _bold_ and this is *italic*"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none',
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["this", "is", "bold", "and", "this", "is", "italic"])
+        end
+
+        it 'handles single quotes' do
+          text = "Recognised as one of the ‘good’ games."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            language: 'en',
+            clean: true,
+            numbers: :none,
+            minimum_length: 3,
+            expand_contractions: true,
+            remove_stop_words: true,
+            punctuation: :none,
+            downcase: true
+          )
+          expect(pt.tokenize).to eq(["recognised", "good", "games"])
+        end
+
+        it 'removes control characters' do
+          text = "\u0000 \u001F \u007FHello test."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            language: 'en',
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["hello", "test", "."])
+        end
+
+        it 'splits too-long words with hyphens' do
+          text = "hi-hat and old-school but not really-important-long-word"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none',
+            long_word_split: 12
+          )
+          expect(pt.tokenize).to eq(["hi-hat", "and", "old-school", "but", "not", "really", "important", "long", "word"])
+        end
+
+        it 'handles hashtags 2' do
+          text = "This is the #upper-#limit"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none',
+            hashtags: :keep_and_clean
+          )
+          expect(pt.tokenize).to eq(["this", "is", "the", "upper", "limit"])
+        end
+
+        it 'handles hashtags 3' do
+          text = "The #2016-fun has just begun."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: 'none',
+            hashtags: :keep_and_clean
+          )
+          expect(pt.tokenize).to eq(["the", "2016", "fun", "has", "just", "begun"])
+        end
+
+        it 'does not clean mentions' do
+          text = "@_someone_ because @someone and @_someone was taken"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            mentions: :keep_original,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["@_someone_", "because", "@someone", "and", "@_someone", "was", "taken"])
+        end
+
+        it 'removes double single quotes' do
+          text = "Strong statement in ''The Day The Earth Caught Fire'' (1961)"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: :none,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["strong", "statement", "in", "the", "day", "the", "earth", "caught", "fire", "1961"])
+        end
+
+        it 'removes a hyphen prefix 1' do
+          text = "Geopol.-Strategy"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: :none,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["geopol", "strategy"])
+        end
+
+        it 'removes a hyphen prefix 2' do
+          text = "The language we use creates the reality we experience.-Michael Hyatt #quote"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            punctuation: :none,
+            clean: true
+          )
+          expect(pt.tokenize).to eq(["the", "language", "we", "use", "creates", "the", "reality", "we", "experience", "michael", "hyatt", "#quote"])
+        end
+
+        it 'does not remove tokens with ampersands' do
+          text = "you&amp;me"
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true,
+            punctuation: :none
+          )
+          expect(pt.tokenize).to eq(["you", "me"])
+        end
+
+        it 'cleans percent signs not related to numbers' do
+          text = "TudoW%1 provides company users a way to offer each other, and guests, and interpreters%6 free assistance. To date, there have been %2 questions asked."
+          pt = PragmaticTokenizer::Tokenizer.new(text,
+            clean: true,
+            numbers: :none,
+            punctuation: :none
+          )
+          expect(pt.tokenize).to eq(["tudow", "provides", "company", "users", "a", "way", "to", "offer", "each", "other", "and", "guests", "and", "interpreters", "free", "assistance", "to", "date", "there", "have", "been", "questions", "asked"])
+        end
+      end
1319
+ end
1320
+
1321
+ context 'ending punctuation' do
1322
+ it 'handles ending question marks' do
1323
+ text = 'What is your name?'
1324
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["what", "is", "your", "name", "?"])
1325
+ end
1326
+
1327
+ it 'handles exclamation points' do
1328
+ text = 'You are the best!'
1329
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["you", "are", "the", "best", "!"])
1330
+ end
1331
+
1332
+ it 'handles periods' do
1333
+ text = 'This was a productive day.'
1334
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["this", "was", "a", "productive", "day", "."])
1335
+ end
1336
+
1337
+ it 'handles quotation marks' do
1338
+ text = "\"He is not the one you are looking for.\""
1339
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["\"", "he", "is", "not", "the", "one", "you", "are", "looking", "for", ".", "\""])
1340
+ end
1341
+
1342
+ it 'handles single quotation marks' do
1343
+ text = "'He is not the one you are looking for.'"
1344
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["'", "he", "is", "not", "the", "one", "you", "are", "looking", "for", ".", "'"])
1345
+ end
1346
+
1347
+ it "handles single quotation marks ('twas)" do
1348
+ text = "'Twas the night before Christmas and 'twas cloudy."
1349
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["'twas", "the", "night", "before", "christmas", "and", "'twas", "cloudy", "."])
1350
+ end
1351
+
1352
+ it 'handles double quotes at the end of a sentence' do
1353
+ text = "She said, \"I love cake.\""
1354
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["she", "said", ",", "\"", "i", "love", "cake", ".", "\""])
1355
+ end
1356
+
1357
+ it 'handles double quotes at the beginning of a sentence' do
1358
+ text = "\"I love cake.\", she said to her friend."
1359
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["\"", "i", "love", "cake", ".", "\"", ",", "she", "said", "to", "her", "friend", "."])
1360
+ end
1361
+
1362
+ it 'handles double quotes in the middle of a sentence' do
1363
+ text = "She said, \"I love cake.\" to her friend."
1364
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["she", "said", ",", "\"", "i", "love", "cake", ".", "\"", "to", "her", "friend", "."])
1365
+ end
1366
+ end
1367
+
1368
+ context 'other punctuation' do
1369
+ it 'handles ellipses' do
1370
+ text = 'Today is the last day...'
1371
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['today', 'is', 'the', 'last', 'day', '...'])
1372
+ end
1373
+
1374
+ it 'handles special quotes' do
1375
+ text = "«That's right», he said."
1376
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["«", "that's", "right", "»", ",", "he", "said", "."])
1377
+ end
1378
+
1379
+ it 'handles upside down punctuation (¿)' do
1380
+ text = "¿Really?"
1381
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["¿", "really", "?"])
1382
+ end
1383
+
1384
+ it 'handles upside down punctuation (¡)' do
1385
+ text = "¡Really!"
1386
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["¡", "really", "!"])
1387
+ end
1388
+
1389
+ it 'handles colons' do
1390
+ text = "This was the news: 'Today is the day!'"
1391
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["this", "was", "the", "news", ":", "'", "today", "is", "the", "day", "!", "'"])
1392
+ end
1393
+
1394
+ it 'handles web addresses' do
1395
+ text = "Please visit the site - https://www.tm-town.com"
1396
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["please", "visit", "the", "site", "-", "https://www.tm-town.com"])
1397
+ end
1398
+
1399
+ it 'handles multiple colons and web addresses' do
1400
+ text = "Please visit the site: https://www.tm-town.com"
1401
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["please", "visit", "the", "site", ":", "https://www.tm-town.com"])
1402
+ end
1403
+
1404
+ it 'handles multiple dashes' do
1405
+ text = "John--here is your ticket."
1406
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["john", "-", "here", "is", "your", "ticket", "."])
1407
+ end
1408
+
1409
+ it 'handles brackets' do
1410
+ text = "This is an array: ['Hello']."
1411
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["this", "is", "an", "array", ":", "[", "'", "hello", "'", "]", "."])
1412
+ end
1413
+
1414
+ it 'handles double question marks' do
1415
+ text = "This is a question??"
1416
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["this", "is", "a", "question", "?", "?"])
1417
+ end
1418
+
1419
+ it 'handles multiple ending punctuation' do
1420
+ text = "This is a question?!?"
1421
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["this", "is", "a", "question", "?", "!", "?"])
1422
+ end
1423
+
1424
+ it 'handles contractions 1' do
1425
+ text = "How'd it go yesterday?"
1426
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["how'd", "it", "go", "yesterday", "?"])
1427
+ end
1428
+
1429
+ it 'handles contractions 2' do
1430
+ text = "You shouldn't worry."
1431
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["you", "shouldn't", "worry", "."])
1432
+ end
1433
+
1434
+ it 'handles contractions 3' do
1435
+ text = "We've gone too far. It'll be over when we're done."
1436
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["we've", "gone", "too", "far", ".", "it'll", "be", "over", "when", "we're", "done", "."])
1437
+ end
1438
+
1439
+ it 'handles numbers' do
1440
+ text = 'He paid $10,000,000 for the new house which is equivalent to ¥1,000,000,000.00.'
1441
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['he', 'paid', '$10,000,000', 'for', 'the', 'new', 'house', 'which', 'is', 'equivalent', 'to', '¥1,000,000,000.00', '.'])
1442
+ end
1443
+
1444
+ it 'follows the Chicago Manual of Style on punctuation' do
1445
+ text = 'An abbreviation that ends with a period must not be left hanging without it (in parentheses, e.g.), and a sentence containing a parenthesis must itself have terminal punctuation (are we almost done?).'
1446
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['an', 'abbreviation', 'that', 'ends', 'with', 'a', 'period', 'must', 'not', 'be', 'left', 'hanging', 'without', 'it', '(', 'in', 'parentheses', ',', 'e.g.', ')', ',', 'and', 'a', 'sentence', 'containing', 'a', 'parenthesis', 'must', 'itself', 'have', 'terminal', 'punctuation', '(', 'are', 'we', 'almost', 'done', '?', ')', '.'])
1447
+ end
1448
+
1449
+ it 'is case insensitive' do
1450
+ text = 'his name is mr. smith, king of the \'entire\' forest.'
1451
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['his', 'name', 'is', 'mr.', 'smith', ',', 'king', 'of', 'the', '\'', 'entire', '\'', 'forest', '.'])
1452
+ end
1453
+
1454
+ it 'handles web url addresses #1' do
1455
+ text = 'Check out http://www.google.com/?this_is_a_url/hello-world.html for more info.'
1456
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["check", "out", "http://www.google.com/?this_is_a_url/hello-world.html", "for", "more", "info", "."])
1457
+ end
1458
+
1459
+ it 'handles web url addresses #2' do
1460
+ text = 'Check out https://www.google.com/?this_is_a_url/hello-world.html for more info.'
1461
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["check", "out", "https://www.google.com/?this_is_a_url/hello-world.html", "for", "more", "info", "."])
1462
+ end
1463
+
1464
+ it 'handles web url addresses #3' do
1465
+ text = 'Check out www.google.com/?this_is_a_url/hello-world.html for more info.'
1466
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["check", "out", "www.google.com/?this_is_a_url/hello-world.html", "for", "more", "info", "."])
1467
+ end
1468
+
1469
+ it 'handles email addresses' do
1470
+ text = 'Please email example@example.com for more info.'
1471
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["please", "email", "example@example.com", "for", "more", "info", "."])
1472
+ end
1473
+
1474
+ it 'handles empty tokens' do
1475
+ text = "!!!!! https://t.co/xxxx"
1476
+ pt = PragmaticTokenizer::Tokenizer.new(text,
1477
+ punctuation: 'none'
1478
+ )
1479
+ expect(pt.tokenize).to eq(["https://t.co/xxxx"])
1480
+ end
1481
+ end
1482
+
1483
+ context 'abbreviations' do
1484
+ it 'handles military abbreviations' do
1485
+ text = 'His name is Col. Smith.'
1486
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["his", "name", "is", "col.", "smith", "."])
1487
+ end
1488
+
1489
+ it 'handles institution abbreviations' do
1490
+ text = 'She went to East Univ. to get her degree.'
1491
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["she", "went", "to", "east", "univ.", "to", "get", "her", "degree", "."])
1492
+ end
1493
+
1494
+ it 'handles company abbreviations' do
1495
+ text = 'He works at ABC Inc. on weekends.'
1496
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["he", "works", "at", "abc", "inc.", "on", "weekends", "."])
1497
+ end
1498
+
1499
+ it 'handles old state abbreviations' do
1500
+ text = 'He went to school in Mass. back in the day.'
1501
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["he", "went", "to", "school", "in", "mass.", "back", "in", "the", "day", "."])
1502
+ end
1503
+
1504
+ it 'handles month abbreviations' do
1505
+ text = 'It is cold in Jan. they say.'
1506
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["it", "is", "cold", "in", "jan.", "they", "say", "."])
1507
+ end
1508
+
1509
+ it 'handles miscellaneous abbreviations' do
1510
+ text = '1, 2, 3, etc. is the beat.'
1511
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['1', ',', '2', ',', '3', ',', 'etc.', 'is', 'the', 'beat', '.'])
1512
+ end
1513
+
1514
+ it 'handles one-letter abbreviations (e.g. Alfred E. Stone)' do
1515
+ text = 'Alfred E. Stone is a person.'
1516
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["alfred", "e.", "stone", "is", "a", "person", "."])
1517
+ end
1518
+
1519
+ it 'handles repeating letter-dot words (e.g. U.S.A. or J.C. Penney)' do
1520
+ text = 'The U.S.A. is a country.'
1521
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["the", "u.s.a.", "is", "a", "country", "."])
1522
+ end
1523
+
1524
+ it 'handles abbreviations that occur at the end of a sentence' do
1525
+ text = 'He works at ABC Inc.'
1526
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["he", "works", "at", "abc", "inc."])
1527
+ end
1528
+
1529
+ it 'handles punctuation after an abbreviation' do
1530
+ text = 'Exclamation point requires both marks (Q.E.D.!).'
1531
+ expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['exclamation', 'point', 'requires', 'both', 'marks', '(', 'q.e.d.', '!', ')', '.'])
1532
+ end
1533
+ end
1534
+ end
1535
+ end