pragmatic_tokenizer 1.3.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +8 -9
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +3 -3
- data/spec/languages/bulgarian_spec.rb +41 -0
- data/spec/languages/deutsch_spec.rb +229 -0
- data/spec/languages/english_spec.rb +1535 -0
- data/spec/languages/french_spec.rb +13 -0
- data/spec/performance_spec.rb +62 -0
- data/spec/pragmatic_tokenizer_spec.rb +41 -0
- data/spec/spec_helper.rb +2 -0
- metadata +17 -5
- data/bin/console +0 -14
- data/bin/setup +0 -7
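The largest addition is the new English spec suite shown below. As a quick orientation before the diff body, here is a minimal sketch of the API surface those specs exercise — the constructor options and the expected outputs are taken directly from the test code that follows, not from separate documentation:

    require 'pragmatic_tokenizer'

    # Defaults: tokens are downcased and punctuation is split off as its own token.
    PragmaticTokenizer::Tokenizer.new("Hello world.").tokenize
    # => ["hello", "world", "."]

    # Options are passed to the constructor; this combination appears verbatim
    # in the "multiple options selected" specs below.
    PragmaticTokenizer::Tokenizer.new(
      "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\"",
      expand_contractions: true,
      remove_stop_words:   true,
      punctuation:         'none'
    ).tokenize
    # => ["crazy", "sandowsky", "afford"]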
@@ -0,0 +1,1535 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe PragmaticTokenizer do
|
4
|
+
context 'Language: English (en)' do
|
5
|
+
context '#tokenize (example strings)' do
|
6
|
+
|
7
|
+
context 'no options selected' do
|
8
|
+
it 'tokenizes a string #001' do
|
9
|
+
text = "Hello world."
|
10
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
11
|
+
expect(pt.tokenize).to eq(["hello", "world", "."])
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'tokenizes a string #002' do
|
15
|
+
text = "Hello Dr. Death."
|
16
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
17
|
+
expect(pt.tokenize).to eq(["hello", "dr.", "death", "."])
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'tokenizes a string #003' do
|
21
|
+
text = "Hello ____________________ ."
|
22
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
23
|
+
expect(pt.tokenize).to eq(["hello", "____________________", "."])
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'tokenizes a string #004' do
|
27
|
+
text = "It has a state-of-the-art design."
|
28
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
29
|
+
expect(pt.tokenize).to eq(["it", "has", "a", "state-of-the-art", "design", "."])
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'tokenizes a string #005' do
|
33
|
+
text = "Jan. 2015 was 20% colder than now. But not in inter- and outer-space."
|
34
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
35
|
+
expect(pt.tokenize).to eq(["jan.", "2015", "was", "20%", "colder", "than", "now", ".", "but", "not", "in", "inter", "-", "and", "outer-space", "."])
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'tokenizes a string #006' do
|
39
|
+
text = 'Go to http://www.example.com.'
|
40
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
41
|
+
expect(pt.tokenize).to eq(["go", "to", "http://www.example.com", "."])
|
42
|
+
end
|
43
|
+
|
44
|
+
it 'tokenizes a string #007' do
|
45
|
+
text = 'One of the lawyers from ‚Making a Murderer’ admitted a mistake'
|
46
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
47
|
+
expect(pt.tokenize).to eq(["one", "of", "the", "lawyers", "from", "‚", "making", "a", "murderer", "’", "admitted", "a", "mistake"])
|
48
|
+
end
|
49
|
+
|
50
|
+
it 'tokenizes a string #008' do
|
51
|
+
text = "One of the lawyers from 'Making a Murderer' admitted a mistake"
|
52
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
53
|
+
expect(pt.tokenize).to eq(["one", "of", "the", "lawyers", "from", "'", "making", "a", "murderer", "'", "admitted", "a", "mistake"])
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'tokenizes a string #009' do
|
57
|
+
text = "hello ;-) yes"
|
58
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
59
|
+
expect(pt.tokenize).to eq(["hello", ";", "-", ")", "yes"])
|
60
|
+
end
|
61
|
+
|
62
|
+
it 'tokenizes a string #010' do
|
63
|
+
text = "hello ;)"
|
64
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
65
|
+
expect(pt.tokenize).to eq(["hello", ";", ")"])
|
66
|
+
end
|
67
|
+
|
68
|
+
it 'tokenizes a string #011' do
|
69
|
+
text = "area <0.8 cm2"
|
70
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
71
|
+
expect(pt.tokenize).to eq(["area", "<0.8", "cm2"])
|
72
|
+
end
|
73
|
+
|
74
|
+
it 'tokenizes a string #012' do
|
75
|
+
text = "area <0.8 cm2"
|
76
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
77
|
+
expect(pt.tokenize).to eq(["area", "<0.8", "cm2"])
|
78
|
+
end
|
79
|
+
|
80
|
+
it 'tokenizes a string #013' do
|
81
|
+
text = "the “Star-Trek“-Inventor"
|
82
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
83
|
+
expect(pt.tokenize).to eq(["the", "“", "star-trek", "“", "-", "inventor"])
|
84
|
+
end
|
85
|
+
|
86
|
+
it 'tokenizes a string #014' do
|
87
|
+
text = "#ab-cd"
|
88
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
89
|
+
expect(pt.tokenize).to eq(["#ab-cd"])
|
90
|
+
end
|
91
|
+
|
92
|
+
it 'handles numbers with symbols 2' do
|
93
|
+
text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
|
94
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
95
|
+
expect(pt.tokenize).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals", "!"])
|
96
|
+
end
|
97
|
+
|
98
|
+
it 'handles numbers with symbols 3' do
|
99
|
+
text = "Hello, that will be $5 dollars. You can pay at 5:00, after it is 500."
|
100
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
101
|
+
expect(pt.tokenize).to eq(["hello", ",", "that", "will", "be", "$5", "dollars", ".", "you", "can", "pay", "at", "5:00", ",", "after", "it", "is", "500", "."])
|
102
|
+
end
|
103
|
+
|
104
|
+
it 'splits at a comma' do
|
105
|
+
text = "16.1. day one,17.2. day two"
|
106
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
107
|
+
expect(pt.tokenize).to eq(["16.1", ".", "day", "one", ",", "17.2", ".", "day", "two"])
|
108
|
+
end
|
109
|
+
|
110
|
+
it 'identifies single quotes' do
|
111
|
+
text = "Sean Penn Sat for Secret Interview With ‘El Chapo,’ Mexican Drug"
|
112
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
113
|
+
expect(pt.tokenize).to eq(["sean", "penn", "sat", "for", "secret", "interview", "with", "‘", "el", "chapo", ",", "’", "mexican", "drug"])
|
114
|
+
end
|
115
|
+
|
116
|
+
it 'identifies prefixed symbols' do
|
117
|
+
text = "look:the sky is blue"
|
118
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
119
|
+
expect(pt.tokenize).to eq(["look", ":", "the", "sky", "is", "blue"])
|
120
|
+
end
|
121
|
+
|
122
|
+
it 'identifies hashtags with numbers too' do
|
123
|
+
text = "this is a sentence.#yay this too.#withnumbers123"
|
124
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
125
|
+
expect(pt.tokenize).to eq(["this", "is", "a", "sentence", ".", "#yay", "this", "too", ".", "#withnumbers123"])
|
126
|
+
end
|
127
|
+
|
128
|
+
it 'splits emojis' do
|
129
|
+
text = "🤔🙄"
|
130
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
131
|
+
expect(pt.tokenize).to eq(["🤔", "🙄"])
|
132
|
+
end
|
133
|
+
|
134
|
+
it 'handles snowflakes 1' do
|
135
|
+
text = "❄️❄️❄️"
|
136
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
137
|
+
expect(pt.tokenize).to eq(["❄️", "❄️", "❄️"])
|
138
|
+
end
|
139
|
+
|
140
|
+
it 'handles snowflakes 2' do
|
141
|
+
text = "\u2744\uFE0E\u2744\uFE0E\u2744\uFE0E"
|
142
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
143
|
+
expect(pt.tokenize).to eq(["❄︎", "❄︎", "❄︎"])
|
144
|
+
end
|
145
|
+
|
146
|
+
it 'handles snowflakes 3' do
|
147
|
+
text = "\u2744\u2744\u2744"
|
148
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
149
|
+
expect(pt.tokenize).to eq(["\u2744", "\u2744", "\u2744"])
|
150
|
+
end
|
151
|
+
|
152
|
+
it 'separates tokens' do
|
153
|
+
text = "football≠soccer"
|
154
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
155
|
+
expect(pt.tokenize).to eq(["football", "≠", "soccer"])
|
156
|
+
end
|
157
|
+
|
158
|
+
it 'deals with missing whitespaces' do
|
159
|
+
text = "this is sentence one!this is sentence two.@someone"
|
160
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
161
|
+
expect(pt.tokenize).to eq(["this", "is", "sentence", "one", "!", "this", "is", "sentence", "two", ".", "@someone"])
|
162
|
+
end
|
163
|
+
|
164
|
+
it 'handles weird apostrophes' do
|
165
|
+
text = [116, 104, 101, 114, 101, 32, 769, 115, 32, 115, 111, 109, 101, 116, 104, 105, 110, 103].pack("U*")
|
166
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
167
|
+
expect(pt.tokenize).to eq(["there`s", "something"])
|
168
|
+
end
|
169
|
+
|
170
|
+
it 'treats abbreviations always the same' do
|
171
|
+
text = "U.S.A. U.S.A. U.S.A."
|
172
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
173
|
+
expect(pt.tokenize).to eq(
|
174
|
+
["u.s.a.", "u.s.a.", "u.s.a."]
|
175
|
+
)
|
176
|
+
end
|
177
|
+
end
|
178
|
+
|
179
|
+
context 'user-supplied abbreviations' do
|
180
|
+
it 'tokenizes a regular string with an abbreviation' do
|
181
|
+
text = "Mr. Smith, hello world."
|
182
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
183
|
+
expect(pt.tokenize).to eq(["mr.", "smith", ",", "hello", "world", "."])
|
184
|
+
end
|
185
|
+
|
186
|
+
it 'fails to recognize an English abbreviation if the user supplies an abbreviations array without it' do
|
187
|
+
text = "Mr. Smith, hello world."
|
188
|
+
abbreviations = ['mrs']
|
189
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
190
|
+
abbreviations: abbreviations
|
191
|
+
)
|
192
|
+
expect(pt.tokenize).to eq(["mr", ".", "smith", ",", "hello", "world", "."])
|
193
|
+
end
|
194
|
+
|
195
|
+
it 'recognizes a user-supplied abbreviation' do
|
196
|
+
text = "thisisnotanormalabbreviation. hello world."
|
197
|
+
abbreviations = ['thisisnotanormalabbreviation']
|
198
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
199
|
+
abbreviations: abbreviations
|
200
|
+
)
|
201
|
+
expect(pt.tokenize).to eq(["thisisnotanormalabbreviation.", "hello", "world", "."])
|
202
|
+
end
|
203
|
+
|
204
|
+
it 'handles an empty user-supplied abbreviation array' do
|
205
|
+
text = "thisisnotanormalabbreviation. hello world."
|
206
|
+
abbreviations = []
|
207
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
208
|
+
abbreviations: abbreviations
|
209
|
+
)
|
210
|
+
expect(pt.tokenize).to eq(["thisisnotanormalabbreviation", ".", "hello", "world", "."])
|
211
|
+
end
|
212
|
+
|
213
|
+
it 'handles abrreviations across multiple languages' do
|
214
|
+
text = "Mr. Smith how are ü. today."
|
215
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
216
|
+
filter_languages: [:en, :de]
|
217
|
+
)
|
218
|
+
expect(pt.tokenize).to eq(["mr.", "smith", "how", "are", "ü.", "today", "."])
|
219
|
+
end
|
220
|
+
|
221
|
+
it 'handles abrreviations across multiple languages and user-supplied abbreviations' do
|
222
|
+
text = "Adj. Smith how are ü. today. thisisnotanormalabbreviation. is it?"
|
223
|
+
abbreviations = ['thisisnotanormalabbreviation']
|
224
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
225
|
+
filter_languages: [:en, :de],
|
226
|
+
abbreviations: abbreviations
|
227
|
+
)
|
228
|
+
expect(pt.tokenize).to eq(["adj.", "smith", "how", "are", "ü.", "today", ".", "thisisnotanormalabbreviation.", "is", "it", "?"])
|
229
|
+
end
|
230
|
+
end
|
231
|
+
|
232
|
+
context 'option (expand_contractions)' do
|
233
|
+
it 'does not expand the contractions' do
|
234
|
+
# https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en
|
235
|
+
text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
|
236
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
237
|
+
expect(pt.tokenize).to eq(['"', 'i', 'said', ',', "'", "what're", 'you', '?', 'crazy', '?', "'", '"', 'said', 'sandowsky', '.', '"', 'i', "can't", 'afford', 'to', 'do', 'that', '.', '"'])
|
238
|
+
end
|
239
|
+
|
240
|
+
it 'expands user-supplied contractions' do
|
241
|
+
text = "Hello supa'soo guy."
|
242
|
+
contractions = { "supa'soo" => "super smooth" }
|
243
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
244
|
+
contractions: contractions,
|
245
|
+
expand_contractions: true
|
246
|
+
)
|
247
|
+
expect(pt.tokenize).to eq(["hello", "super", "smooth", "guy", "."])
|
248
|
+
end
|
249
|
+
|
250
|
+
it 'does not expands user-supplied contractions' do
|
251
|
+
text = "Hello supa'soo guy."
|
252
|
+
contractions = { "supa'soo" => "super smooth" }
|
253
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
254
|
+
contractions: contractions,
|
255
|
+
expand_contractions: false
|
256
|
+
)
|
257
|
+
expect(pt.tokenize).to eq( ["hello", "supa'soo", "guy", "."])
|
258
|
+
end
|
259
|
+
|
260
|
+
it 'expands user-supplied contractions and language contractions' do
|
261
|
+
text = "Hello supa'soo guy. auf's wasn't it?"
|
262
|
+
contractions = { "supa'soo" => "super smooth" }
|
263
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
264
|
+
contractions: contractions,
|
265
|
+
expand_contractions: true,
|
266
|
+
filter_languages: [:en, :de]
|
267
|
+
)
|
268
|
+
expect(pt.tokenize).to eq(["hello", "super", "smooth", "guy", ".", "auf", "das", "was", "not", "it", "?"])
|
269
|
+
end
|
270
|
+
|
271
|
+
it 'expands language contractions' do
|
272
|
+
text = "Hello supa'soo guy. auf's wasn't it?"
|
273
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
274
|
+
expand_contractions: true,
|
275
|
+
filter_languages: [:en, :de]
|
276
|
+
)
|
277
|
+
expect(pt.tokenize).to eq(["hello", "supa'soo", "guy", ".", "auf", "das", "was", "not", "it", "?"])
|
278
|
+
end
|
279
|
+
|
280
|
+
it 'tokenizes a string #001' do
|
281
|
+
# https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en
|
282
|
+
text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
|
283
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
284
|
+
expand_contractions: true
|
285
|
+
)
|
286
|
+
expect(pt.tokenize).to eq(['"', 'i', 'said', ',', "'", 'what', 'are', 'you', '?', 'crazy', '?', "'", '"', 'said', 'sandowsky', '.', '"', 'i', 'cannot', 'afford', 'to', 'do', 'that', '.', '"'])
|
287
|
+
end
|
288
|
+
|
289
|
+
it 'tokenizes a string #002' do
|
290
|
+
# http://nlp.stanford.edu/software/tokenizer.shtml
|
291
|
+
text = "\"Oh, no,\" she's saying, \"our $400 blender can't handle something this hard!\""
|
292
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
293
|
+
expand_contractions: true
|
294
|
+
)
|
295
|
+
expect(pt.tokenize).to eq(['"', 'oh', ',', 'no', ',', '"', 'she', 'is', 'saying', ',', '"', 'our', '$400', 'blender', 'cannot', 'handle', 'something', 'this', 'hard', '!', '"'])
|
296
|
+
end
|
297
|
+
|
298
|
+
it 'tokenizes a string #003' do
|
299
|
+
text = "Look for his/her account."
|
300
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
301
|
+
expand_contractions: true
|
302
|
+
)
|
303
|
+
expect(pt.tokenize).to eq(["look", "for", "his", "her", "account", "."])
|
304
|
+
end
|
305
|
+
|
306
|
+
it 'tokenizes a string #004' do
|
307
|
+
text = "I like apples and/or oranges."
|
308
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
309
|
+
expand_contractions: true
|
310
|
+
)
|
311
|
+
expect(pt.tokenize).to eq(["i", "like", "apples", "and", "or", "oranges", "."])
|
312
|
+
end
|
313
|
+
end
|
314
|
+
|
315
|
+
context 'option (emojis)' do
|
316
|
+
it 'removes emoji' do
|
317
|
+
text = "Return the emoji 👿😍😱🐔🌚. 🌚"
|
318
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
319
|
+
remove_emoji: true
|
320
|
+
)
|
321
|
+
expect(pt.tokenize).to eq(["return", "the", "emoji", "."])
|
322
|
+
end
|
323
|
+
|
324
|
+
it 'does not remove emoji' do
|
325
|
+
text = "Return the emoji 👿😍😱🐔🌚. 🌚"
|
326
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
327
|
+
expect(pt.tokenize).to eq(["return", "the", "emoji", "👿", "😍", "😱", "🐔", "🌚", ".", "🌚"])
|
328
|
+
end
|
329
|
+
|
330
|
+
it 'removes snowflakes 1' do
|
331
|
+
text = "hello❄️❄️❄️"
|
332
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
333
|
+
remove_emoji: true
|
334
|
+
)
|
335
|
+
expect(pt.tokenize).to eq(["hello"])
|
336
|
+
end
|
337
|
+
|
338
|
+
it 'removes snowflakes 2' do
|
339
|
+
text = "hello\u2744\uFE0E\u2744\uFE0E\u2744\uFE0E"
|
340
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
341
|
+
remove_emoji: true
|
342
|
+
)
|
343
|
+
expect(pt.tokenize).to eq(["hello"])
|
344
|
+
end
|
345
|
+
|
346
|
+
it 'removes snowflakes 3' do
|
347
|
+
text = "hello\u2744\u2744\u2744"
|
348
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
349
|
+
remove_emoji: true
|
350
|
+
)
|
351
|
+
expect(pt.tokenize).to eq(["hello"])
|
352
|
+
end
|
353
|
+
end
|
354
|
+
|
355
|
+
context 'option (hashtags)' do
|
356
|
+
it 'tokenizes a string #001' do
|
357
|
+
text = "This is a #hashtag yay!"
|
358
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
359
|
+
hashtags: :remove
|
360
|
+
)
|
361
|
+
expect(pt.tokenize).to eq(["this", "is", "a", "yay", "!"])
|
362
|
+
end
|
363
|
+
|
364
|
+
it 'tokenizes a string #002' do
|
365
|
+
text = "This is a #hashtag yay!"
|
366
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
367
|
+
hashtags: :keep_and_clean
|
368
|
+
)
|
369
|
+
expect(pt.tokenize).to eq(["this", "is", "a", "hashtag", "yay", "!"])
|
370
|
+
end
|
371
|
+
|
372
|
+
it 'tokenizes a string #003' do
|
373
|
+
text = "This is a #hashtag yay!"
|
374
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
375
|
+
hashtags: :keep_original
|
376
|
+
)
|
377
|
+
expect(pt.tokenize).to eq(["this", "is", "a", "#hashtag", "yay", "!"])
|
378
|
+
end
|
379
|
+
end
|
380
|
+
|
381
|
+
context 'option (mentions)' do
|
382
|
+
it 'tokenizes a string #001' do
|
383
|
+
text = "This is a @mention @mention2 yay!"
|
384
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
385
|
+
mentions: :remove
|
386
|
+
)
|
387
|
+
expect(pt.tokenize).to eq(["this", "is", "a", "yay", "!"])
|
388
|
+
end
|
389
|
+
|
390
|
+
it 'tokenizes a string #002' do
|
391
|
+
text = "This is a @mention @mention2 yay!"
|
392
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
393
|
+
mentions: :keep_and_clean
|
394
|
+
)
|
395
|
+
expect(pt.tokenize).to eq(["this", "is", "a", "mention", "mention2", "yay", "!"])
|
396
|
+
end
|
397
|
+
|
398
|
+
it 'tokenizes a string #003' do
|
399
|
+
text = "This is a @mention @mention2 yay!"
|
400
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
401
|
+
mentions: :keep_original
|
402
|
+
)
|
403
|
+
expect(pt.tokenize).to eq(["this", "is", "a", "@mention", "@mention2", "yay", "!"])
|
404
|
+
end
|
405
|
+
end
|
406
|
+
|
407
|
+
context 'option (email addresses)' do
|
408
|
+
it 'tokenizes a string #001' do
|
409
|
+
text = "Here are some emails jon@hotmail.com ben123@gmail.com."
|
410
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
411
|
+
remove_emails: :true
|
412
|
+
)
|
413
|
+
expect(pt.tokenize).to eq(["here", "are", "some", "emails", "."])
|
414
|
+
end
|
415
|
+
|
416
|
+
it 'tokenizes a string #002' do
|
417
|
+
text = "Here are some emails jon@hotmail.com ben123@gmail.com."
|
418
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
419
|
+
expect(pt.tokenize).to eq(["here", "are", "some", "emails", "jon@hotmail.com", "ben123@gmail.com", "."])
|
420
|
+
end
|
421
|
+
|
422
|
+
it 'knows what is not an email address' do
|
423
|
+
text = "the great cook.@someone something else@whoever"
|
424
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
425
|
+
remove_emails: true
|
426
|
+
)
|
427
|
+
expect(pt.tokenize).to eq(["the", "great", "cook", ".", "@someone", "something", "else@whoever"])
|
428
|
+
end
|
429
|
+
end
|
430
|
+
|
431
|
+
context 'option (urls)' do
|
432
|
+
it 'tokenizes a string #001' do
|
433
|
+
text = "Here are some domains and urls google.com https://www.google.com www.google.com."
|
434
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
435
|
+
remove_urls: :true
|
436
|
+
)
|
437
|
+
expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "www.google.com", "."])
|
438
|
+
end
|
439
|
+
|
440
|
+
it 'tokenizes a string #002' do
|
441
|
+
text = "Here are some domains and urls google.com https://www.google.com www.google.com."
|
442
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
443
|
+
expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "https://www.google.com", "www.google.com", "."])
|
444
|
+
end
|
445
|
+
end
|
446
|
+
|
447
|
+
context 'option (domains)' do
|
448
|
+
it 'tokenizes a string #001' do
|
449
|
+
text = "Here are some domains and urls google.com https://www.google.com www.google.com."
|
450
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
451
|
+
remove_domains: :true
|
452
|
+
)
|
453
|
+
expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "https://www.google.com", "."])
|
454
|
+
end
|
455
|
+
|
456
|
+
it 'tokenizes a string #002' do
|
457
|
+
text = "Here are some domains and urls google.com https://www.google.com www.google.com."
|
458
|
+
pt = PragmaticTokenizer::Tokenizer.new(text)
|
459
|
+
expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "https://www.google.com", "www.google.com", "."])
|
460
|
+
end
|
461
|
+
|
462
|
+
it 'knows what is not a domain 1' do
|
463
|
+
skip "NOT IMPLEMENTED"
|
464
|
+
text = "this is a sentence.and no domain."
|
465
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
466
|
+
remove_domains: true
|
467
|
+
)
|
468
|
+
expect(pt.tokenize).to eq(["this", "is", "a", "sentence", ".", "and", "no", "domain", "."])
|
469
|
+
end
|
470
|
+
|
471
|
+
it 'knows what is not a domain 2' do
|
472
|
+
text = "former president g.w.bush was..."
|
473
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
474
|
+
remove_domains: true
|
475
|
+
)
|
476
|
+
expect(pt.tokenize).to eq(["former", "president", "g.w.bush", "was", "..."])
|
477
|
+
end
|
478
|
+
|
479
|
+
it 'knows what is not a domain 3' do
|
480
|
+
text = "2.something-times"
|
481
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
482
|
+
remove_domains: true
|
483
|
+
)
|
484
|
+
expect(pt.tokenize).to eq(["2.something-times"])
|
485
|
+
end
|
486
|
+
end
|
487
|
+
|
488
|
+
context 'option (long_word_split)' do
|
489
|
+
it 'tokenizes a string #001' do
|
490
|
+
text = "Some main-categories of the mathematics-test have sub-examples that most 14-year olds can't answer, therefor the implementation-instruction made in the 1990-years needs to be revised."
|
491
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
492
|
+
long_word_split: 10
|
493
|
+
)
|
494
|
+
expect(pt.tokenize).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14-year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990-years", "needs", "to", "be", "revised", "."])
|
495
|
+
end
|
496
|
+
|
497
|
+
it 'tokenizes a string #002' do
|
498
|
+
text = "Some main-categories of the mathematics-test have sub-examples that most 14-year olds can't answer, therefor the implementation-instruction made in the 1990-years needs to be revised."
|
499
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
500
|
+
long_word_split: 4
|
501
|
+
)
|
502
|
+
expect(pt.tokenize).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
|
503
|
+
end
|
504
|
+
end
|
505
|
+
|
506
|
+
context 'option (clean)' do
|
507
|
+
it 'tokenizes a string #001' do
|
508
|
+
text = "Hello ---------------."
|
509
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
510
|
+
clean: true
|
511
|
+
)
|
512
|
+
expect(pt.tokenize).to eq(["hello", "."])
|
513
|
+
end
|
514
|
+
|
515
|
+
it 'tokenizes a string #002' do
|
516
|
+
text = "Hello ____________________ ."
|
517
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
518
|
+
clean: true
|
519
|
+
)
|
520
|
+
expect(pt.tokenize).to eq(["hello", "."])
|
521
|
+
end
|
522
|
+
|
523
|
+
it 'tokenizes a string #003' do
|
524
|
+
text = "© ABC Company 1994"
|
525
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
526
|
+
clean: true
|
527
|
+
)
|
528
|
+
expect(pt.tokenize).to eq(["abc", "company", "1994"])
|
529
|
+
end
|
530
|
+
|
531
|
+
it 'tokenizes a string #004' do
|
532
|
+
text = "This sentence has a long string of dots ......................."
|
533
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
534
|
+
clean: true
|
535
|
+
)
|
536
|
+
expect(pt.tokenize).to eq(["this", "sentence", "has", "a", "long", "string", "of", "dots"])
|
537
|
+
end
|
538
|
+
|
539
|
+
it 'tokenizes a string #005' do
|
540
|
+
text = "cnn.com mentions this *funny* #hashtag used by @obama http://cnn.com/something"
|
541
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
542
|
+
clean: true
|
543
|
+
)
|
544
|
+
expect(pt.tokenize).to eq(["cnn.com", "mentions", "this", "funny", "#hashtag", "used", "by", "@obama", "http://cnn.com/something"])
|
545
|
+
end
|
546
|
+
|
547
|
+
it 'does not remove a valid hashtag' do
|
548
|
+
text = "This #sentence has a long string of dots ......................."
|
549
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
550
|
+
clean: true
|
551
|
+
)
|
552
|
+
expect(pt.tokenize).to eq(["this", "#sentence", "has", "a", "long", "string", "of", "dots"])
|
553
|
+
end
|
554
|
+
|
555
|
+
it 'does not remove a valid mention' do
|
556
|
+
text = "This @sentence has a long string of dots ......................."
|
557
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
558
|
+
clean: true
|
559
|
+
)
|
560
|
+
expect(pt.tokenize).to eq(["this", "@sentence", "has", "a", "long", "string", "of", "dots"])
|
561
|
+
end
|
562
|
+
|
563
|
+
it 'cleans words with symbols 1' do
|
564
|
+
text = "something.com:article title !!wow look!!1"
|
565
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
566
|
+
clean: true
|
567
|
+
)
|
568
|
+
expect(pt.tokenize).to eq(["something.com", "article", "title", "wow", "look"])
|
569
|
+
end
|
570
|
+
|
571
|
+
it 'cleans words with symbols 2' do
|
572
|
+
text = "something.com:article title !!wow look!!1!1!11!"
|
573
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
574
|
+
clean: true
|
575
|
+
)
|
576
|
+
expect(pt.tokenize).to eq(["something.com", "article", "title", "wow", "look"])
|
577
|
+
end
|
578
|
+
|
579
|
+
it 'identifies prefixed symbols' do
|
580
|
+
text = "look:the sky is blue"
|
581
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
582
|
+
clean: true
|
583
|
+
)
|
584
|
+
expect(pt.tokenize).to eq(["look", "the", "sky", "is", "blue"])
|
585
|
+
end
|
586
|
+
|
587
|
+
it 'keeps numbers at the end of mentions and hashtags' do
|
588
|
+
text = "#le1101 #artistQ21 @someone12 @someoneelse1 and @somebody1980"
|
589
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
590
|
+
clean: true
|
591
|
+
)
|
592
|
+
expect(pt.tokenize).to eq(["#le1101", "#artistq21", "@someone12", "@someoneelse1", "and", "@somebody1980"])
|
593
|
+
end
|
594
|
+
|
595
|
+
it 'cleans a prefixed weird hyphen' do
|
596
|
+
text = [104, 105, 103, 104, 32, 173, 116, 101, 109, 112, 101, 114, 97, 116, 117, 114, 101, 32, 97, 110, 100, 32, 173, 119, 105, 110, 100].pack("U*")
|
597
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
598
|
+
clean: true
|
599
|
+
)
|
600
|
+
expect(pt.tokenize).to eq(["high", "temperature", "and", "wind"])
|
601
|
+
end
|
602
|
+
|
603
|
+
it 'cleans (r) and (c) and (tm)' do
|
604
|
+
text = "the oscar® night ©companyname is a trademark™"
|
605
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
606
|
+
clean: true
|
607
|
+
)
|
608
|
+
expect(pt.tokenize).to eq(["the", "oscar", "night", "companyname", "is", "a", "trademark"])
|
609
|
+
end
|
610
|
+
|
611
|
+
it 'cleans letters in boxes 1' do
|
612
|
+
text = "making🇦🇹postcards"
|
613
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
614
|
+
clean: true
|
615
|
+
)
|
616
|
+
expect(pt.tokenize).to eq(["making", "postcards"])
|
617
|
+
end
|
618
|
+
|
619
|
+
it 'removes colons' do
|
620
|
+
text = "At 19:30 o'clock: Mad Max: Fury Road"
|
621
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
622
|
+
clean: true
|
623
|
+
)
|
624
|
+
expect(pt.tokenize).to eq(["at", "19:30", "o'clock", "mad", "max", "fury", "road"])
|
625
|
+
end
|
626
|
+
|
627
|
+
it 'removes a hyphen prefix 3' do
|
628
|
+
text = "women's clothes and –shoes needed"
|
629
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
630
|
+
clean: true
|
631
|
+
)
|
632
|
+
expect(pt.tokenize).to eq(["women's", "clothes", "and", "shoes", "needed"])
|
633
|
+
end
|
634
|
+
|
635
|
+
it 'does not remove tokens with ampersands' do
|
636
|
+
text = "you&me"
|
637
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
638
|
+
clean: true
|
639
|
+
)
|
640
|
+
expect(pt.tokenize).to eq(["you", "&", "me"])
|
641
|
+
end
|
642
|
+
end
|
643
|
+
|
644
|
+
context 'option (classic_filter)' do
|
645
|
+
it 'tokenizes a string #001' do
|
646
|
+
# http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
|
647
|
+
text = "I.B.M. cat's can't"
|
648
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
649
|
+
classic_filter: true
|
650
|
+
)
|
651
|
+
expect(pt.tokenize).to eq(["ibm", "cat", "can't"])
|
652
|
+
end
|
653
|
+
|
654
|
+
it 'tokenizes a string #002' do
|
655
|
+
# http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
|
656
|
+
text = "St.Veit, which usually would be written St. Veit was not visited by B.Obama reported CNN.com"
|
657
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
658
|
+
classic_filter: true
|
659
|
+
)
|
660
|
+
expect(pt.tokenize).to eq(["st.veit", ",", "which", "usually", "would", "be", "written", "st", "veit", "was", "not", "visited", "by", "b.obama", "reported", "cnn.com"])
|
661
|
+
end
|
662
|
+
|
663
|
+
it 'optimizes the classic filter' do
|
664
|
+
text = "therés something"
|
665
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
666
|
+
classic_filter: true
|
667
|
+
)
|
668
|
+
expect(pt.tokenize).to eq(["there", "something"])
|
669
|
+
end
|
670
|
+
|
671
|
+
it 'optimizes the classic filter' do
|
672
|
+
text = [116, 104, 101, 114, 101, 32, 769, 115, 32, 115, 111, 109, 101, 116, 104, 105, 110, 103].pack("U*")
|
673
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
674
|
+
classic_filter: true
|
675
|
+
)
|
676
|
+
expect(pt.tokenize).to eq(["there", "something"])
|
677
|
+
end
|
678
|
+
end
|
679
|
+
|
680
|
+
context 'option (language)' do
|
681
|
+
it 'tokenizes a string #001' do
|
682
|
+
text = "Hello Ms. Piggy, this is John. We are selling a new fridge for $5,000. That is a 20% discount over the Nev. retailers. It is a 'MUST BUY', so don't hesistate."
|
683
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
684
|
+
language: 'en'
|
685
|
+
)
|
686
|
+
expect(pt.tokenize).to eq(["hello", "ms.", "piggy", ",", "this", "is", "john", ".", "we", "are", "selling", "a", "new", "fridge", "for", "$5,000", ".", "that", "is", "a", "20%", "discount", "over", "the", "nev.", "retailers", ".", "it", "is", "a", "'", "must", "buy", "'", ",", "so", "don't", "hesistate", "."])
|
687
|
+
end
|
688
|
+
|
689
|
+
it 'tokenizes a string #002' do
|
690
|
+
text = "Lisa Raines, a lawyer and director of government relations
|
691
|
+
for the Industrial Biotechnical Association, contends that a judge
|
692
|
+
well-versed in patent law and the concerns of research-based industries
|
693
|
+
would have ruled otherwise. And Judge Newman, a former patent lawyer,
|
694
|
+
wrote in her dissent when the court denied a motion for a rehearing of
|
695
|
+
the case by the full court, \'The panel's judicial legislation has
|
696
|
+
affected an important high-technological industry, without regard
|
697
|
+
to the consequences for research and innovation or the public interest.\'
|
698
|
+
Says Ms. Raines, \'[The judgement] confirms our concern that the absence of
|
699
|
+
patent lawyers on the court could prove troublesome.\'"
|
700
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
701
|
+
language: 'en'
|
702
|
+
)
|
703
|
+
expect(pt.tokenize).to eq(['lisa', 'raines', ',', 'a', 'lawyer', 'and', 'director', 'of', 'government', 'relations', 'for', 'the', 'industrial', 'biotechnical', 'association', ',', 'contends', 'that', 'a', 'judge', 'well-versed', 'in', 'patent', 'law', 'and', 'the', 'concerns', 'of', 'research-based', 'industries', 'would', 'have', 'ruled', 'otherwise', '.', 'and', 'judge', 'newman', ',', 'a', 'former', 'patent', 'lawyer', ',', 'wrote', 'in', 'her', 'dissent', 'when', 'the', 'court', 'denied', 'a', 'motion', 'for', 'a', 'rehearing', 'of', 'the', 'case', 'by', 'the', 'full', 'court', ',', "\'", 'the', "panel's", 'judicial', 'legislation', 'has', 'affected', 'an', 'important', 'high-technological', 'industry', ',', 'without', 'regard', 'to', 'the', 'consequences', 'for', 'research', 'and', 'innovation', 'or', 'the', 'public', 'interest', '.', '\'', 'says', 'ms.', 'raines', ',', '\'', '[', 'the', 'judgement', ']', 'confirms', 'our', 'concern', 'that', 'the', 'absence', 'of', 'patent', 'lawyers', 'on', 'the', 'court', 'could', 'prove', 'troublesome', '.', "\'"])
|
704
|
+
end
|
705
|
+
end
|
706
|
+
|
707
|
+
context 'option (numbers)' do
|
708
|
+
it 'tokenizes a string #001' do
|
709
|
+
text = "Hello, that will be $5 dollars. You can pay at 5:00, after it is 500."
|
710
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
711
|
+
numbers: :all
|
712
|
+
)
|
713
|
+
expect(pt.tokenize).to eq(["hello", ",", "that", "will", "be", "$5", "dollars", ".", "you", "can", "pay", "at", "5:00", ",", "after", "it", "is", "500", "."])
|
714
|
+
end
|
715
|
+
|
716
|
+
it 'tokenizes a string #002' do
|
717
|
+
text = "Hello, that will be $5 dollars. You can pay at 5:00, after it is 500."
|
718
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
719
|
+
numbers: :none
|
720
|
+
)
|
721
|
+
expect(pt.tokenize).to eq(["hello", ",", "that", "will", "be", "dollars", ".", "you", "can", "pay", "at", ",", "after", "it", "is", "."])
|
722
|
+
end
|
723
|
+
|
724
|
+
it 'tokenizes a string #003' do
|
725
|
+
text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500"
|
726
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
727
|
+
numbers: :semi
|
728
|
+
)
|
729
|
+
expect(pt.tokenize).to eq(["2pac", "u2", "50cent", "blink-182", "$500", "zero7", "m83", "b-52s"])
|
730
|
+
end
|
731
|
+
|
732
|
+
it 'tokenizes a string #004' do
|
733
|
+
text = "2pac U2 50cent blink-182 zero7 M83 B-52s 500 Hello"
|
734
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
735
|
+
numbers: :only
|
736
|
+
)
|
737
|
+
expect(pt.tokenize).to eq(["2pac", "u2", "50cent", "blink-182", "zero7", "m83", "b-52s", "500"])
|
738
|
+
end
|
739
|
+
|
740
|
+
it 'tokenizes a string #005' do
|
741
|
+
text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500"
|
742
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
743
|
+
numbers: :none
|
744
|
+
)
|
745
|
+
expect(pt.tokenize).to eq([])
|
746
|
+
end
|
747
|
+
|
748
|
+
it 'tokenizes a string #005' do
|
749
|
+
text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500 number iv VI"
|
750
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
751
|
+
numbers: :none
|
752
|
+
)
|
753
|
+
expect(pt.tokenize).to eq(["number"])
|
754
|
+
end
|
755
|
+
|
756
|
+
it 'tokenizes a string #006' do
|
757
|
+
text = "Remove III Roman Numerals and IX. with a period."
|
758
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
759
|
+
numbers: :none
|
760
|
+
)
|
761
|
+
expect(pt.tokenize).to eq(["remove", "roman", "numerals", "and", ".", "with", "a", "period", "."])
|
762
|
+
end
|
763
|
+
end
|
764
|
+
|
765
|
+
context 'option (minimum_length)' do
|
766
|
+
it 'tokenizes a string #001' do
|
767
|
+
text = "Let's test the minimum length of fiver."
|
768
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
769
|
+
minimum_length: 5
|
770
|
+
)
|
771
|
+
expect(pt.tokenize).to eq(["let's", "minimum", "length", "fiver"])
|
772
|
+
end
|
773
|
+
end
|
774
|
+
|
775
|
+
context 'option (punctuation)' do
|
776
|
+
it 'tokenizes a string #001' do
|
777
|
+
text = "kath. / evang"
|
778
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
779
|
+
punctuation: 'none'
|
780
|
+
)
|
781
|
+
expect(pt.tokenize).to eq(["kath", "evang"])
|
782
|
+
end
|
783
|
+
|
784
|
+
it 'tokenizes a string #002' do
|
785
|
+
text = "derStandard.at › Sport"
|
786
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
787
|
+
punctuation: 'none'
|
788
|
+
)
|
789
|
+
expect(pt.tokenize).to eq(["derstandard.at", "sport"])
|
790
|
+
end
|
791
|
+
|
792
|
+
it 'tokenizes a string #003' do
|
793
|
+
text = "hello ^^"
|
794
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
795
|
+
punctuation: 'none'
|
796
|
+
)
|
797
|
+
expect(pt.tokenize).to eq(["hello"])
|
798
|
+
end
|
799
|
+
|
800
|
+
it 'tokenizes a string #004' do
|
801
|
+
text = "This hyphen – is not...or is it? ... It's a - dash... And a horizontal ellipsis…"
|
802
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
803
|
+
punctuation: 'none'
|
804
|
+
)
|
805
|
+
expect(pt.tokenize).to eq(["this", "hyphen", "is", "not", "or", "is", "it", "it's", "a", "dash", "and", "a", "horizontal", "ellipsis"])
|
806
|
+
end
|
807
|
+
|
808
|
+
it 'tokenizes a string #005' do
|
809
|
+
text = "A sentence. One with two dots.. And with three... Or horizontal ellipsis… which are three dots too."
|
810
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
811
|
+
punctuation: 'none'
|
812
|
+
)
|
813
|
+
expect(pt.tokenize).to eq(["a", "sentence", "one", "with", "two", "dots", "and", "with", "three", "or", "horizontal", "ellipsis", "which", "are", "three", "dots", "too"])
|
814
|
+
end
|
815
|
+
|
816
|
+
it 'tokenizes a string #006' do
|
817
|
+
text = "+++ BREAKING +++ something happened; is it interesting?"
|
818
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
819
|
+
punctuation: 'none'
|
820
|
+
)
|
821
|
+
expect(pt.tokenize).to eq(["breaking", "something", "happened", "is", "it", "interesting"])
|
822
|
+
end
|
823
|
+
|
824
|
+
it 'tokenizes a string #007' do
|
825
|
+
text = "Some *interesting stuff* is __happening here__"
|
826
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
827
|
+
punctuation: 'none'
|
828
|
+
)
|
829
|
+
expect(pt.tokenize).to eq(["some", "*interesting", "stuff*", "is", "__happening", "here__"])
|
830
|
+
end
|
831
|
+
|
832
|
+
it 'tokenizes a string #008' do
|
833
|
+
text = "Hello; what is your: name @username **delete**"
|
834
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
835
|
+
punctuation: 'none'
|
836
|
+
)
|
837
|
+
expect(pt.tokenize).to eq(["hello", "what", "is", "your", "name", "@username", "**delete**"])
|
838
|
+
end
|
839
|
+
|
840
|
+
it 'tokenizes a string #009' do
|
841
|
+
text = "hello ;-) yes"
|
842
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
843
|
+
punctuation: :none
|
844
|
+
)
|
845
|
+
expect(pt.tokenize).to eq(["hello", "yes"])
|
846
|
+
end
|
847
|
+
|
848
|
+
it 'tokenizes a string #010' do
|
849
|
+
text = "hello ;)"
|
850
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
851
|
+
punctuation: 'none'
|
852
|
+
)
|
853
|
+
expect(pt.tokenize).to eq(["hello"])
|
854
|
+
end
|
855
|
+
|
856
|
+
it 'tokenizes a string #011' do
|
857
|
+
text = "Hello ____________________ ."
|
858
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
859
|
+
punctuation: :none
|
860
|
+
)
|
861
|
+
expect(pt.tokenize).to eq(["hello"])
|
862
|
+
end
|
863
|
+
|
864
|
+
it 'handles non-domain words with a dot 1' do
|
865
|
+
text = "They were being helped.This is solidarity."
|
866
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
867
|
+
punctuation: 'none'
|
868
|
+
)
|
869
|
+
expect(pt.tokenize).to eq(["they", "were", "being", "helped", "this", "is", "solidarity"])
|
870
|
+
end
|
871
|
+
|
872
|
+
it 'handles non-domain words with a dot 2' do
|
873
|
+
text = "picture was taken in sept.2015"
|
874
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
875
|
+
punctuation: 'none'
|
876
|
+
)
|
877
|
+
expect(pt.tokenize).to eq(["picture", "was", "taken", "in", "sept.", "2015"])
|
878
|
+
end
|
879
|
+
|
880
|
+
it 'handles non-domain words with a dot 3' do
|
881
|
+
text = "They were being helped.This is solidarity. See the breaking news stories about X on cnn.com/europe and english.alarabiya.net, here’s a screenshot: https://t.co/s83k28f29d31s83"
|
882
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
883
|
+
punctuation: 'none'
|
884
|
+
)
|
885
|
+
expect(pt.tokenize).to eq(["they", "were", "being", "helped", "this", "is", "solidarity", "see", "the", "breaking", "news", "stories", "about", "x", "on", "cnn.com", "europe", "and", "english.alarabiya.net", "here’s", "a", "screenshot", "https://t.co/s83k28f29d31s83"])
|
886
|
+
end
|
887
|
+
|
888
|
+
it 'handles numbers with symbols 1' do
|
889
|
+
text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
|
890
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
891
|
+
punctuation: 'none'
|
892
|
+
)
|
893
|
+
expect(pt.tokenize).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals"])
|
894
|
+
end
|
895
|
+
|
896
|
+
it 'handles numbers with symbols 2' do
|
897
|
+
text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
|
898
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
899
|
+
punctuation: 'none'
|
900
|
+
)
|
901
|
+
expect(pt.tokenize).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals"])
|
902
|
+
end
|
903
|
+
|
904
|
+
it 'handles apostrophes and quotes' do
|
905
|
+
text = "“Data Visualization: How to Tell Stories with Data — Jeff Korhan” by @AINewsletter"
|
906
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
907
|
+
punctuation: 'none'
|
908
|
+
)
|
909
|
+
expect(pt.tokenize).to eq(["data", "visualization", "how", "to", "tell", "stories", "with", "data", "jeff", "korhan", "by", "@ainewsletter"])
|
910
|
+
end
|
911
|
+
|
912
|
+
it 'handles mentions' do
|
913
|
+
text = ".@someone I disagree"
|
914
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
915
|
+
punctuation: 'none'
|
916
|
+
)
|
917
|
+
expect(pt.tokenize).to eq(["@someone", "i", "disagree"])
|
918
|
+
end
|
919
|
+
|
920
|
+
it 'handles old school emoticons 2' do
|
921
|
+
text = "oooh! <3"
|
922
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
923
|
+
punctuation: 'none'
|
924
|
+
)
|
925
|
+
expect(pt.tokenize).to eq(["oooh", "<3"])
|
926
|
+
end
|
927
|
+
|
928
|
+
it 'handles old school emoticons 3' do
|
929
|
+
text = "@someone <33"
|
930
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
931
|
+
punctuation: 'none'
|
932
|
+
)
|
933
|
+
expect(pt.tokenize).to eq(["@someone", "<33"])
|
934
|
+
end
|
935
|
+
|
936
|
+
it 'handles words with a symbol prefix 1' do
|
937
|
+
text = "Yes! /cc @someone"
|
938
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
939
|
+
punctuation: 'none'
|
940
|
+
)
|
941
|
+
expect(pt.tokenize).to eq(["yes", "cc", "@someone"])
|
942
|
+
end
|
943
|
+
|
944
|
+
it 'handles words with a emoji suffix' do
|
945
|
+
text = "Let's meet there.😝 ok?"
|
946
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
947
|
+
punctuation: 'none'
|
948
|
+
)
|
949
|
+
expect(pt.tokenize).to eq(["let's", "meet", "there", "😝", "ok"])
|
950
|
+
end
|
951
|
+
|
952
|
+
it 'handles words with a symbol prefix 2' do
|
953
|
+
text = "blah blah |photo by @someone"
|
954
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
955
|
+
punctuation: 'none'
|
956
|
+
)
|
957
|
+
expect(pt.tokenize).to eq(["blah", "blah", "photo", "by", "@someone"])
|
958
|
+
end
|
959
|
+
|
960
|
+
it 'handles pseudo-contractions' do
|
961
|
+
text = "I suggest to buy stocks that are low value+have momentum"
|
962
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
963
|
+
punctuation: 'none'
|
964
|
+
)
|
965
|
+
expect(pt.tokenize).to eq(["i", "suggest", "to", "buy", "stocks", "that", "are", "low", "value", "have", "momentum"])
|
966
|
+
end
|
967
|
+
|
968
|
+
it 'handles apostrophes and quotes 1' do
|
969
|
+
text = "Watch the video of @amandapalmer's song “Killing Type” here"
|
970
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
971
|
+
punctuation: 'none'
|
972
|
+
)
|
973
|
+
expect(pt.tokenize).to eq(["watch", "the", "video", "of", "@amandapalmer's", "song", "killing", "type", "here"])
|
974
|
+
end
|
975
|
+
|
976
|
+
it 'handles apostrophes and quotes 2' do
|
977
|
+
text = "Watch the video of @amandapalmer`s song “Killing Type” here"
|
978
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
979
|
+
punctuation: 'none'
|
980
|
+
)
|
981
|
+
expect(pt.tokenize).to eq(["watch", "the", "video", "of", "@amandapalmer`s", "song", "killing", "type", "here"])
|
982
|
+
end
|
983
|
+
|
984
|
+
it 'handles numbers suffixed with a symbol' do
|
985
|
+
text = "4 Things Marketers Must Do Better in 2016: blah"
|
986
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
987
|
+
punctuation: 'none'
|
988
|
+
)
|
989
|
+
expect(pt.tokenize).to eq(["4", "things", "marketers", "must", "do", "better", "in", "2016", "blah"])
|
990
|
+
end
|
991
|
+
|
992
|
+
it 'handles words with a emoticon suffix' do
|
993
|
+
skip "NOT IMPLEMENTED"
|
994
|
+
text = "look, a dog with shoes☺ !!"
|
995
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
996
|
+
punctuation: 'none'
|
997
|
+
)
|
998
|
+
expect(pt.tokenize).to eq(["look", "a", "dog", "with", "shoes", "☺"])
|
999
|
+
end
|
1000
|
+
|
1001
|
+
it 'handles emoji 1' do
|
1002
|
+
text = "How bad!😝"
|
1003
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1004
|
+
punctuation: 'none'
|
1005
|
+
)
|
1006
|
+
expect(pt.tokenize).to eq(["how", "bad", "😝"])
|
1007
|
+
end
|
1008
|
+
|
1009
|
+
it 'handles emoji 2' do
|
1010
|
+
text = "😝How bad!"
|
1011
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1012
|
+
punctuation: 'none'
|
1013
|
+
)
|
1014
|
+
expect(pt.tokenize).to eq(["😝", "how", "bad"])
|
1015
|
+
end
|
1016
|
+
|
1017
|
+
it 'identifies old school emoticons' do
|
1018
|
+
skip "NOT IMPLEMENTED"
|
1019
|
+
text = 'looking forward to the new kodak super8 camera \o/'
|
1020
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1021
|
+
punctuation: 'none'
|
1022
|
+
)
|
1023
|
+
expect(pt.tokenize).to eq(["looking", "forward", "to", "the", "new", "kodak", "super8", "camera", '\o/'])
|
1024
|
+
end
|
1025
|
+
|
1026
|
+
it 'splits at hashtags' do
|
1027
|
+
text = "some sentence#RT ... i like u2.#bono"
|
1028
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1029
|
+
punctuation: :none
|
1030
|
+
)
|
1031
|
+
expect(pt.tokenize).to eq(["some", "sentence", "#rt", "i", "like", "u2", "#bono"])
|
1032
|
+
end
|
1033
|
+
end
|
1034
|
+
|
1035
|
+
context 'option (remove_stop_words)' do
|
1036
|
+
it 'removes stop words' do
|
1037
|
+
text = 'This is a short sentence with explanations and stop words.'
|
1038
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1039
|
+
language: 'en',
|
1040
|
+
remove_stop_words: true
|
1041
|
+
)
|
1042
|
+
expect(pt.tokenize).to eq(["short", "sentence", "explanations", "."])
|
1043
|
+
end
|
1044
|
+
|
1045
|
+
it 'removes user-supplied stop words' do
|
1046
|
+
text = 'This is a short sentence with explanations and stop words.'
|
1047
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1048
|
+
language: 'en',
|
1049
|
+
remove_stop_words: true,
|
1050
|
+
stop_words: ["and", "a"]
|
1051
|
+
)
|
1052
|
+
expect(pt.tokenize).to eq(["this", "is", "short", "sentence", "with", "explanations", "stop", "words", "."])
|
1053
|
+
end
|
1054
|
+
|
1055
|
+
it 'removes user-supplied stop words and default stop words' do
|
1056
|
+
text = 'This is a short sentence with explanations and stop words.'
|
1057
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1058
|
+
language: 'en',
|
1059
|
+
remove_stop_words: true,
|
1060
|
+
stop_words: ["sentence"],
|
1061
|
+
filter_languages: [:en]
|
1062
|
+
)
|
1063
|
+
expect(pt.tokenize).to eq(["short", "explanations", "."])
|
1064
|
+
end
|
1065
|
+
|
1066
|
+
it 'removes user-supplied stop words and default stop words across multiple languages' do
|
1067
|
+
text = 'This is a short sentence with explanations and stop words. And achte German words.'
|
1068
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1069
|
+
language: 'en',
|
1070
|
+
remove_stop_words: true,
|
1071
|
+
stop_words: ["sentence"],
|
1072
|
+
filter_languages: [:en, :de]
|
1073
|
+
)
|
1074
|
+
expect(pt.tokenize).to eq(["short", "explanations", ".", "german", "."])
|
1075
|
+
end
|
1076
|
+
end
|
1077
|
+
|
1078
|
+
context 'multiple options selected' do
|
1079
|
+
it 'tokenizes a string #001' do
|
1080
|
+
text = 'His name is Mr. Smith.'
|
1081
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1082
|
+
language: 'en',
|
1083
|
+
punctuation: 'none'
|
1084
|
+
)
|
1085
|
+
expect(pt.tokenize).to eq(['his', 'name', 'is', 'mr.', 'smith'])
|
1086
|
+
end
|
1087
|
+
|
1088
|
+
it 'tokenizes a string #002' do
|
1089
|
+
text = "Hello Ms. Piggy, this is John. We are selling a new fridge for $5,000. That is a 20% discount over the Nev. retailers. It is a 'MUST BUY', so don't hesistate."
|
1090
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1091
|
+
language: 'en',
|
1092
|
+
punctuation: 'only'
|
1093
|
+
)
|
1094
|
+
expect(pt.tokenize).to eq([",", ".", ".", ".", "'", "'", ",", "."])
|
1095
|
+
end
|
1096
|
+
|
1097
|
+
it 'tokenizes a string #003' do
|
1098
|
+
text = "Hello the a it experiment one fine."
|
1099
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1100
|
+
language: 'en',
|
1101
|
+
remove_stop_words: true
|
1102
|
+
)
|
1103
|
+
expect(pt.tokenize).to eq(["experiment", "fine", "."])
|
1104
|
+
end
|
1105
|
+
|
1106
|
+
it 'tokenizes a string #004' do
|
1107
|
+
# https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en
|
1108
|
+
text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
|
1109
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1110
|
+
expand_contractions: true,
|
1111
|
+
remove_stop_words: true,
|
1112
|
+
punctuation: 'none'
|
1113
|
+
)
|
1114
|
+
expect(pt.tokenize).to eq(["crazy", "sandowsky", "afford"])
|
1115
|
+
end
|
1116
|
+
|
1117
|
+
it 'tokenizes a string #005' do
|
1118
|
+
text = "Hello world with a stop word experiment."
|
1119
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1120
|
+
language: 'en',
|
1121
|
+
clean: true,
|
1122
|
+
numbers: :none,
|
1123
|
+
minimum_length: 3,
|
1124
|
+
expand_contractions: true,
|
1125
|
+
remove_stop_words: true,
|
1126
|
+
punctuation: 'none'
|
1127
|
+
)
|
1128
|
+
expect(pt.tokenize).to eq(["experiment"])
|
1129
|
+
end
|
1130
|
+
|
1131
|
+
it 'tokenizes a string #006' do
|
1132
|
+
text = "Hello; what is your: name @username **delete**"
|
1133
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1134
|
+
clean: true,
|
1135
|
+
punctuation: 'none'
|
1136
|
+
)
|
1137
|
+
expect(pt.tokenize).to eq(["hello", "what", "is", "your", "name", "@username", "delete"])
|
1138
|
+
end
|
1139
|
+
|
1140
|
+
it 'tokenizes a string #007' do
|
1141
|
+
text = 'His name is Mr. Smith.'
|
1142
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1143
|
+
language: 'en',
|
1144
|
+
punctuation: 'none',
|
1145
|
+
downcase: false
|
1146
|
+
)
|
1147
|
+
expect(pt.tokenize).to eq(['His', 'name', 'is', 'Mr.', 'Smith'])
|
1148
|
+
end
|
1149
|
+
|
1150
|
+
it 'tokenizes a string #008' do
|
1151
|
+
text = "Can't go tonight. Didn't finish."
|
1152
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1153
|
+
downcase: false,
|
1154
|
+
expand_contractions: true
|
1155
|
+
)
|
1156
|
+
expect(pt.tokenize).to eq(["Cannot", "go", "tonight", ".", "Did", "not", "finish", "."])
|
1157
|
+
end
|
1158
|
+
|
1159
|
+
it 'tokenizes a string #009' do
|
1160
|
+
text = "Some *interesting stuff* is __happening here__"
|
1161
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1162
|
+
punctuation: 'none',
|
1163
|
+
clean: true
|
1164
|
+
)
|
1165
|
+
expect(pt.tokenize).to eq(["some", "interesting", "stuff", "is", "happening", "here"])
|
1166
|
+
end
|
1167
|
+
|
1168
|
+
it 'also allows symbols for options' do
|
1169
|
+
text = 'His name is Mr. Smith.'
|
1170
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1171
|
+
language: :en,
|
1172
|
+
punctuation: :none
|
1173
|
+
)
|
1174
|
+
expect(pt.tokenize).to eq(['his', 'name', 'is', 'mr.', 'smith'])
|
1175
|
+
end
|
1176
|
+
|
1177
|
+
it 'handles long strings 1' do
|
1178
|
+
text = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ."
|
1179
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1180
|
+
language: 'en',
|
1181
|
+
clean: true,
|
1182
|
+
minimum_length: 3,
|
1183
|
+
expand_contractions: true,
|
1184
|
+
remove_stop_words: true,
|
1185
|
+
numbers: :none,
|
1186
|
+
punctuation: :none
|
1187
|
+
)
|
1188
|
+
expect(pt.tokenize).to eq(["jonas", "jonas", "smith", "jonas", "smith", "turn", "jane", "party", "closed", "deal", "pitt", "briggs", "noon", "jane", "closed", "deal", "pitt", "briggs", "closed", "yesterday", "mt.", "fuji", "st.", "michael's", "church", "st.", "light", "jfk", "jr.", "book", "visited", "u.s.a.", "year", "live", "e.u.", "live", "u.s.", "work", "u.s.", "government", "virginia", "lived", "u.s.", "years", "bag", "bag", "teaches", "science", "worked", "years", "engineer", "local", "university", "email", "jane.doe@example.com", "email", "site", "check", "turned", "great", "turned", "great", "turned", "great", "held", "book", "long", "time", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "list", "item", "sentence", "cut", "middle", "pdf", "cold", "night", "city", "features", "contact", "manager", "events", "activities", "treasure", "works", "yahoo", "accounting", "department", "good", "team", "albert", "jones", "yesterday", "thoreau", "argues", "simplifying", "one’s", "life", "laws", "universe", "complex", "bohr", "analogy", "parallel", "stairways", "smith", "left", "sentence", "omission", "ellipsis", "marks", "preceded", "space", "sentence", "period", "sentence", "meant", "left", "store", "habit", "weakened", "combining", "self-interpreting", "compounds", "practice", "abandoned"])
|
1189
|
+
end
|
1190
|
+
|
1191
|
+
it 'handles long strings 2' do
|
1192
|
+
text = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 10
|
1193
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1194
|
+
language: 'en',
|
1195
|
+
clean: true,
|
1196
|
+
minimum_length: 3,
|
1197
|
+
expand_contractions: true,
|
1198
|
+
remove_stop_words: true,
|
1199
|
+
numbers: :none,
|
1200
|
+
punctuation: :none
|
1201
|
+
)
|
1202
|
+
expect(pt.tokenize).to eq(["jonas", "jonas", "smith", "jonas", "smith", "turn", "jane", "party", "closed", "deal", "pitt", "briggs", "noon", "jane", "closed", "deal", "pitt", "briggs", "closed", "yesterday", "mt.", "fuji", "st.", "michael's", "church", "st.", "light", "jfk", "jr.", "book", "visited", "u.s.a.", "year", "live", "e.u.", "live", "u.s.", "work", "u.s.", "government", "virginia", "lived", "u.s.", "years", "bag", "bag", "teaches", "science", "worked", "years", "engineer", "local", "university", "email", "jane.doe@example.com", "email", "site", "check", "turned", "great", "turned", "great", "turned", "great", "held", "book", "long", "time", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "list", "item", "sentence", "cut", "middle", "pdf", "cold", "night", "city", "features", "contact", "manager", "events", "activities", "treasure", "works", "yahoo", "accounting", "department", "good", "team", "albert", "jones", "yesterday", "thoreau", "argues", "simplifying", "one’s", "life", "laws", "universe", "complex", "bohr", "analogy", "parallel", "stairways", "smith", "left", "sentence", "omission", "ellipsis", "marks", "preceded", "space", "sentence", "period", "sentence", "meant", "left", "store", "habit", "weakened", "combining", "self-interpreting", "compounds", "practice", "abandoned"] * 10)
|
1203
|
+
end
|
1204
|
+
|
1205
|
+
it 'handles markdown' do
|
1206
|
+
text = "This is _bold_ and this is *italic*"
|
1207
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1208
|
+
punctuation: 'none',
|
1209
|
+
clean: true
|
1210
|
+
)
|
1211
|
+
expect(pt.tokenize).to eq(["this", "is", "bold", "and", "this", "is", "italic"])
|
1212
|
+
end
|
1213
|
+
|
1214
|
+
it 'handles single quotes' do
|
1215
|
+
text = "Recognised as one of the ‘good’ games."
|
1216
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1217
|
+
language: 'en',
|
1218
|
+
clean: true,
|
1219
|
+
numbers: :none,
|
1220
|
+
minimum_length: 3,
|
1221
|
+
expand_contractions: true,
|
1222
|
+
remove_stop_words: true,
|
1223
|
+
punctuation: :none,
|
1224
|
+
downcase: true)
|
1225
|
+
expect(pt.tokenize).to eq(["recognised", "good", "games"])
|
1226
|
+
end
|
1227
|
+
|
1228
|
+
it 'removes control characters' do
|
1229
|
+
text = "\u0000 \u001F \u007FHello test."
|
1230
|
+
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1231
|
+
language: 'en',
|
1232
|
+
clean: true
|
1233
|
+
)
|
1234
|
+
expect(pt.tokenize).to eq(["hello", "test", "."])
|
1235
|
+
end
|
1236
|
+
|
1237
|
+
        it 'splits too long words with hyphens' do
          text = "hi-hat and old-school but not really-important-long-word"
          pt = PragmaticTokenizer::Tokenizer.new(text,
            punctuation: 'none',
            long_word_split: 12
          )
          expect(pt.tokenize).to eq(["hi-hat", "and", "old-school", "but", "not", "really", "important", "long", "word"])
        end

        it 'handles hashtags 2' do
          text = "This is the #upper-#limit"
          pt = PragmaticTokenizer::Tokenizer.new(text,
            punctuation: 'none',
            hashtags: :keep_and_clean
          )
          expect(pt.tokenize).to eq(["this", "is", "the", "upper", "limit"])
        end

        it 'handles hashtags 3' do
          text = "The #2016-fun has just begun."
          pt = PragmaticTokenizer::Tokenizer.new(text,
            punctuation: 'none',
            hashtags: :keep_and_clean
          )
          expect(pt.tokenize).to eq(["the", "2016", "fun", "has", "just", "begun"])
        end

        it 'does not clean mentions' do
          text = "@_someone_ because @someone and @_someone was taken"
          pt = PragmaticTokenizer::Tokenizer.new(text,
            mentions: :keep_original,
            clean: true
          )
          expect(pt.tokenize).to eq(["@_someone_", "because", "@someone", "and", "@_someone", "was", "taken"])
        end

        it 'removes double single quotes' do
          text = "Strong statement in ''The Day The Earth Caught Fire'' (1961)"
          pt = PragmaticTokenizer::Tokenizer.new(text,
            punctuation: :none,
            clean: true
          )
          expect(pt.tokenize).to eq(["strong", "statement", "in", "the", "day", "the", "earth", "caught", "fire", "1961"])
        end

        it 'removes a hyphen prefix 1' do
          text = "Geopol.-Strategy"
          pt = PragmaticTokenizer::Tokenizer.new(text,
            punctuation: :none,
            clean: true
          )
          expect(pt.tokenize).to eq(["geopol", "strategy"])
        end

        it 'removes a hyphen prefix 2' do
          text = "The language we use creates the reality we experience.-Michael Hyatt #quote"
          pt = PragmaticTokenizer::Tokenizer.new(text,
            punctuation: :none,
            clean: true
          )
          expect(pt.tokenize).to eq(["the", "language", "we", "use", "creates", "the", "reality", "we", "experience", "michael", "hyatt", "#quote"])
        end

        it 'does not remove tokens with ampersands' do
          text = "you&me"
          pt = PragmaticTokenizer::Tokenizer.new(text,
            clean: true,
            punctuation: :none
          )
          expect(pt.tokenize).to eq(["you", "me"])
        end

        it 'cleans percent signs not related to numbers' do
          text = "TudoW%1 provides company users a way to offer each other, and guests, and interpreters%6 free assistance. To date, there have been %2 questions asked."
          pt = PragmaticTokenizer::Tokenizer.new(text,
            clean: true,
            numbers: :none,
            punctuation: :none
          )
          expect(pt.tokenize).to eq(["tudow", "provides", "company", "users", "a", "way", "to", "offer", "each", "other", "and", "guests", "and", "interpreters", "free", "assistance", "to", "date", "there", "have", "been", "questions", "asked"])
        end
      end
    end

    context 'ending punctuation' do
      it 'handles ending question marks' do
        text = 'What is your name?'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["what", "is", "your", "name", "?"])
      end

      it 'handles exclamation points' do
        text = 'You are the best!'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["you", "are", "the", "best", "!"])
      end

      it 'handles periods' do
        text = 'This was a productive day.'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["this", "was", "a", "productive", "day", "."])
      end

      it 'handles quotation marks' do
        text = "\"He is not the one you are looking for.\""
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["\"", "he", "is", "not", "the", "one", "you", "are", "looking", "for", ".", "\""])
      end

      it 'handles single quotation marks' do
        text = "'He is not the one you are looking for.'"
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["'", "he", "is", "not", "the", "one", "you", "are", "looking", "for", ".", "'"])
      end

      it "handles single quotation marks ('twas)" do
        text = "'Twas the night before Christmas and 'twas cloudy."
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["'twas", "the", "night", "before", "christmas", "and", "'twas", "cloudy", "."])
      end

      it 'handles double quotes at the end of a sentence' do
        text = "She said, \"I love cake.\""
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["she", "said", ",", "\"", "i", "love", "cake", ".", "\""])
      end

      it 'handles double quotes at the beginning of a sentence' do
        text = "\"I love cake.\", she said to her friend."
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["\"", "i", "love", "cake", ".", "\"", ",", "she", "said", "to", "her", "friend", "."])
      end

      it 'handles double quotes in the middle of a sentence' do
        text = "She said, \"I love cake.\" to her friend."
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["she", "said", ",", "\"", "i", "love", "cake", ".", "\"", "to", "her", "friend", "."])
      end
    end

    context 'other punctuation' do
      it 'handles ellipses' do
        text = 'Today is the last day...'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['today', 'is', 'the', 'last', 'day', '...'])
      end

      it 'handles special quotes' do
        text = "«That's right», he said."
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["«", "that's", "right", "»", ",", "he", "said", "."])
      end

      it 'handles upside down punctuation (¿)' do
        text = "¿Really?"
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["¿", "really", "?"])
      end

      it 'handles upside down punctuation (¡)' do
        text = "¡Really!"
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["¡", "really", "!"])
      end

      it 'handles colons' do
        text = "This was the news: 'Today is the day!'"
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["this", "was", "the", "news", ":", "'", "today", "is", "the", "day", "!", "'"])
      end

      it 'handles web addresses' do
        text = "Please visit the site - https://www.tm-town.com"
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["please", "visit", "the", "site", "-", "https://www.tm-town.com"])
      end

      it 'handles multiple colons and web addresses' do
        text = "Please visit the site: https://www.tm-town.com"
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["please", "visit", "the", "site", ":", "https://www.tm-town.com"])
      end

      it 'handles multiple dashes' do
        text = "John--here is your ticket."
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["john", "-", "here", "is", "your", "ticket", "."])
      end

      it 'handles brackets' do
        text = "This is an array: ['Hello']."
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["this", "is", "an", "array", ":", "[", "'", "hello", "'", "]", "."])
      end

      it 'handles double question marks' do
        text = "This is a question??"
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["this", "is", "a", "question", "?", "?"])
      end

      it 'handles multiple ending punctuation' do
        text = "This is a question?!?"
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["this", "is", "a", "question", "?", "!", "?"])
      end

      it 'handles contractions 1' do
        text = "How'd it go yesterday?"
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["how'd", "it", "go", "yesterday", "?"])
      end

      it 'handles contractions 2' do
        text = "You shouldn't worry."
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["you", "shouldn't", "worry", "."])
      end

      it 'handles contractions 3' do
        text = "We've gone too far. It'll be over when we're done."
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["we've", "gone", "too", "far", ".", "it'll", "be", "over", "when", "we're", "done", "."])
      end

      it 'handles numbers' do
        text = 'He paid $10,000,000 for the new house which is equivalent to ¥1,000,000,000.00.'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['he', 'paid', '$10,000,000', 'for', 'the', 'new', 'house', 'which', 'is', 'equivalent', 'to', '¥1,000,000,000.00', '.'])
      end

      it 'follows the Chicago Manual of Style on punctuation' do
        text = 'An abbreviation that ends with a period must not be left hanging without it (in parentheses, e.g.), and a sentence containing a parenthesis must itself have terminal punctuation (are we almost done?).'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['an', 'abbreviation', 'that', 'ends', 'with', 'a', 'period', 'must', 'not', 'be', 'left', 'hanging', 'without', 'it', '(', 'in', 'parentheses', ',', 'e.g.', ')', ',', 'and', 'a', 'sentence', 'containing', 'a', 'parenthesis', 'must', 'itself', 'have', 'terminal', 'punctuation', '(', 'are', 'we', 'almost', 'done', '?', ')', '.'])
      end

      it 'is case insensitive' do
        text = 'his name is mr. smith, king of the \'entire\' forest.'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['his', 'name', 'is', 'mr.', 'smith', ',', 'king', 'of', 'the', '\'', 'entire', '\'', 'forest', '.'])
      end

      it 'handles web url addresses #1' do
        text = 'Check out http://www.google.com/?this_is_a_url/hello-world.html for more info.'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["check", "out", "http://www.google.com/?this_is_a_url/hello-world.html", "for", "more", "info", "."])
      end

      it 'handles web url addresses #2' do
        text = 'Check out https://www.google.com/?this_is_a_url/hello-world.html for more info.'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["check", "out", "https://www.google.com/?this_is_a_url/hello-world.html", "for", "more", "info", "."])
      end

      it 'handles web url addresses #3' do
        text = 'Check out www.google.com/?this_is_a_url/hello-world.html for more info.'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["check", "out", "www.google.com/?this_is_a_url/hello-world.html", "for", "more", "info", "."])
      end

      it 'handles email addresses' do
        text = 'Please email example@example.com for more info.'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["please", "email", "example@example.com", "for", "more", "info", "."])
      end

      it 'handles empty tokens' do
        text = "!!!!! https://t.co/xxxx"
        pt = PragmaticTokenizer::Tokenizer.new(text,
          punctuation: 'none'
        )
        expect(pt.tokenize).to eq(["https://t.co/xxxx"])
      end
    end

    context 'abbreviations' do
      it 'handles military abbreviations' do
        text = 'His name is Col. Smith.'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["his", "name", "is", "col.", "smith", "."])
      end

      it 'handles institution abbreviations' do
        text = 'She went to East Univ. to get her degree.'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["she", "went", "to", "east", "univ.", "to", "get", "her", "degree", "."])
      end

      it 'handles company abbreviations' do
        text = 'He works at ABC Inc. on weekends.'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["he", "works", "at", "abc", "inc.", "on", "weekends", "."])
      end

      it 'handles old state abbreviations' do
        text = 'He went to school in Mass. back in the day.'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["he", "went", "to", "school", "in", "mass.", "back", "in", "the", "day", "."])
      end

      it 'handles month abbreviations' do
        text = 'It is cold in Jan. they say.'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["it", "is", "cold", "in", "jan.", "they", "say", "."])
      end

      it 'handles miscellaneous abbreviations' do
        text = '1, 2, 3, etc. is the beat.'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['1', ',', '2', ',', '3', ',', 'etc.', 'is', 'the', 'beat', '.'])
      end

      it 'handles one letter abbreviations (e.g. Alfred E. Stone)' do
        text = 'Alfred E. Stone is a person.'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["alfred", "e.", "stone", "is", "a", "person", "."])
      end

      it 'handles repeating letter-dot words (e.g. U.S.A. or J.C. Penney)' do
        text = 'The U.S.A. is a country.'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["the", "u.s.a.", "is", "a", "country", "."])
      end

      it 'handles abbreviations that occur at the end of a sentence' do
        text = 'He works at ABC Inc.'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["he", "works", "at", "abc", "inc."])
      end

      it 'handles punctuation after an abbreviation' do
        text = 'Exclamation point requires both marks (Q.E.D.!).'
        expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['exclamation', 'point', 'requires', 'both', 'marks', '(', 'q.e.d.', '!', ')', '.'])
      end
    end
  end
end
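
For readers skimming this diff, the new English spec file doubles as documentation of the 1.4.0 tokenizer API: the text is passed to the constructor, options follow as a trailing hash, and tokenize returns an array of string tokens. The sketch below is assembled only from calls and option names exercised in the specs above; the commented outputs follow the expectations shown there rather than a fresh run.

require 'pragmatic_tokenizer'

text = "Hello Dr. Death! Please visit the site - https://www.tm-town.com"

# Default behaviour: tokens are downcased, punctuation becomes separate
# tokens, URLs stay intact, and known abbreviations keep their period.
PragmaticTokenizer::Tokenizer.new(text).tokenize
# => ["hello", "dr.", "death", "!", "please", "visit", "the", "site", "-", "https://www.tm-town.com"]

# The option combination the 'handles long strings' specs rely on:
pt = PragmaticTokenizer::Tokenizer.new(text,
  language: 'en',
  clean: true,
  minimum_length: 3,
  expand_contractions: true,
  remove_stop_words: true,
  numbers: :none,
  punctuation: :none
)
pt.tokenize
# stop words, tokens shorter than 3 characters, numbers and punctuation are all dropped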