tibetan-word-tokenizer 1.0.0

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
package/src/chunks.js ADDED
@@ -0,0 +1,516 @@
/**
 * Text chunking for Tibetan - splits text into syllables and other chunks
 * Ported from Botok's chunks.py and chunkframework.py
 */

import { CharMarkers as c, ChunkMarkers as u, NO_SHAD_CONS, VOWELS } from './constants.js';
import { getCharCategory, isTibetanCategory } from './char-categories.js';

/**
 * BoString - analyzes character types in a string
 */
export class BoString {
  /**
   * @param {string} text - Input text
   * @param {string[]} ignoreChars - Characters to treat as transparent
   */
  constructor(text, ignoreChars = []) {
    this.string = text;
    this.len = text.length;
    this.ignoreChars = new Set(ignoreChars);
    this.baseStructure = new Map();
    this._analyzeChars();
  }

  _analyzeChars() {
    for (let i = 0; i < this.len; i++) {
      const char = this.string[i];
      if (this.ignoreChars.has(char)) {
        this.baseStructure.set(i, c.TRANSPARENT);
      } else {
        this.baseStructure.set(i, getCharCategory(char));
      }
    }
  }

  /**
   * Get character category at index
   * @param {number} idx
   * @returns {number} Character marker
   */
  getCategory(idx) {
    return this.baseStructure.get(idx);
  }

  /**
   * Export categories for a slice
   * @param {number} start
   * @param {number} length
   * @returns {Map<number, number>}
   */
  exportGroups(start, length) {
    const result = new Map();
    for (let i = 0; i < length; i++) {
      result.set(i, this.baseStructure.get(start + i));
    }
    return result;
  }
}
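
// Usage sketch (illustrative; the exact marker values depend on
// getCharCategory in char-categories.js, which is outside this diff):
//   const bs = new BoString('བཀྲ་');
//   bs.getCategory(3);      // expected: c.TSEK for the tsek at index 3
//   bs.exportGroups(0, 3);  // Map of relative index -> character marker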

/**
 * ChunkFramework - base class for text chunking
 */
export class ChunkFramework {
  /**
   * @param {string} text - Input text
   * @param {string[]} ignoreChars - Characters to ignore
   */
  constructor(text, ignoreChars = []) {
    this.bs = new BoString(text, ignoreChars);
  }

  /**
   * Generic chunking method
   * @param {number} start - Start index
   * @param {number} end - End index
   * @param {Function} conditionFunc - Test function
   * @returns {Array} Chunks as [isMatch, start, length] tuples
   */
  chunk(start, end, conditionFunc) {
    const chunks = [];
    let chunkStart = start;
    let length = 0;
    let prevState = null;

    for (let i = start; i < end; i++) {
      const currentState = conditionFunc.call(this, i);

      if (prevState === null) {
        prevState = currentState;
      }

      if (currentState === prevState) {
        length++;
      } else {
        chunks.push([prevState, chunkStart, length]);
        prevState = currentState;
        chunkStart += length;
        length = 1;
      }
    }

    // Final chunk
    if (length > 0) {
      chunks.push([prevState, chunkStart, length]);
    }

    return chunks;
  }
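
  // Sketch of the output shape: chunking 'ab12' against a digit test would
  // yield [[false, 0, 2], [true, 2, 2]] - runs of equal test results encoded
  // as [isMatch, start, length] tuples.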

  /**
   * Chunk using a condition function with markers
   */
  chunkUsing(conditionFunc, start, end, yes, no) {
    if (start === null && end === null) {
      start = 0;
      end = this.bs.len;
    }

    const indices = this.chunk(start, end, conditionFunc);
    return indices.map(([isMatch, s, l]) => [isMatch ? yes : no, s, l]);
  }

  /**
   * Pipe chunking - re-chunk specific chunks
   */
  pipeChunk(chunks, chunkFunc, toChunkMarker, yes) {
    const result = [];

    for (const chunk of chunks) {
      if (chunk[0] === toChunkMarker) {
        const newChunks = chunkFunc.call(this, chunk[1], chunk[1] + chunk[2], yes);
        if (newChunks && newChunks.length > 0) {
          for (const newChunk of newChunks) {
            if (newChunk[0] !== yes) {
              result.push([chunk[0], newChunk[1], newChunk[2]]);
            } else {
              result.push(newChunk);
            }
          }
        } else {
          result.push(chunk);
        }
      } else {
        result.push(chunk);
      }
    }

    return result;
  }
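
  // Sketch: pipeChunk(chunks, this.chunkPunct, u.BO, u.PUNCT) re-chunks only
  // the BO chunks; sub-chunks matching the test become PUNCT while the rest
  // keep the parent marker, e.g. [[u.BO, 0, 10]] -> [[u.BO, 0, 7], [u.PUNCT, 7, 3]].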

  // Test methods
  _isBoUnicode(idx) {
    const cat = this.bs.getCategory(idx);
    return isTibetanCategory(cat);
  }

  _isPunct(idx) {
    const cat = this.bs.getCategory(idx);
    const prevCat = idx > 0 ? this.bs.getCategory(idx - 1) : null;

    // Special handling for punctuation after symbols/punct
    if (prevCat !== null) {
      const isPrevSpecial = (
        prevCat === c.SYMBOL ||
        prevCat === c.NUMERAL ||
        prevCat === c.OTHER ||
        prevCat === c.NORMAL_PUNCT ||
        prevCat === c.SPECIAL_PUNCT ||
        prevCat === c.TSEK ||
        prevCat === c.TRANSPARENT
      );
      const isCurrPunct = (
        cat === c.TSEK ||
        cat === c.TRANSPARENT ||
        cat === c.NORMAL_PUNCT
      );
      if (isPrevSpecial && isCurrPunct) {
        return true;
      }
    }

    return (
      cat === c.NORMAL_PUNCT ||
      cat === c.SPECIAL_PUNCT ||
      cat === c.TRANSPARENT
    );
  }

  _isSymbol(idx) {
    const cat = this.bs.getCategory(idx);
    return (
      cat === c.SYMBOL ||
      cat === c.TRANSPARENT ||
      cat === c.NFC
    );
  }

  _isNumeral(idx) {
    const cat = this.bs.getCategory(idx);
    return cat === c.NUMERAL || cat === c.TRANSPARENT;
  }

  _isSpace(idx) {
    return this.bs.getCategory(idx) === c.TRANSPARENT;
  }

  _isTsekOrLongSkrtVowel(idx) {
    const cat = this.bs.getCategory(idx);
    return cat === c.TSEK || cat === c.SKRT_LONG_VOW;
  }

  _isLatin(idx) {
    const cat = this.bs.getCategory(idx);
    return cat === c.LATIN || cat === c.TRANSPARENT;
  }

  _isCjk(idx) {
    const cat = this.bs.getCategory(idx);
    return cat === c.CJK || cat === c.TRANSPARENT;
  }

  // Chunking methods
  chunkBoChars(start = null, end = null, yes = u.BO, no = u.OTHER) {
    return this.chunkUsing(this._isBoUnicode, start, end, yes, no);
  }

  chunkPunct(start = null, end = null, yes = u.PUNCT, no = u.NON_PUNCT) {
    return this.chunkUsing(this._isPunct, start, end, yes, no);
  }

  chunkSymbol(start = null, end = null, yes = u.SYM, no = u.NON_SYM) {
    return this.chunkUsing(this._isSymbol, start, end, yes, no);
  }

  chunkNumber(start = null, end = null, yes = u.NUM, no = u.NON_NUM) {
    return this.chunkUsing(this._isNumeral, start, end, yes, no);
  }

  chunkSpaces(start = null, end = null, yes = u.SPACE, no = u.NON_SPACE) {
    return this.chunkUsing(this._isSpace, start, end, yes, no);
  }

  chunkLatin(start = null, end = null, yes = u.LATIN, no = u.OTHER) {
    return this.chunkUsing(this._isLatin, start, end, yes, no);
  }

  chunkCjk(start = null, end = null, yes = u.CJK, no = u.OTHER) {
    return this.chunkUsing(this._isCjk, start, end, yes, no);
  }

  /**
   * Syllabify - split Tibetan text into syllables
   */
  syllabify(start = null, end = null, yes = u.TEXT) {
    if (start === null && end === null) {
      start = 0;
      end = this.bs.len;
    }

    const indices = this.chunk(start, end, this._isTsekOrLongSkrtVowel);

    // Merge tsek with preceding syllable
    for (let i = 0; i < indices.length; i++) {
      if (indices[i][0] && i > 0 && !indices[i - 1][0]) {
        indices[i - 1] = [
          indices[i - 1][0],
          indices[i - 1][1],
          indices[i - 1][2] + indices[i][2]
        ];
        indices[i] = null; // Mark for removal
      }
    }

    // Filter out nulls and tsek-only chunks, return syllables
    return indices
      .filter(chunk => chunk !== null && !chunk[0])
      .map(([_, s, l]) => [yes, s, l]);
  }
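
  // Sketch (assuming '་' is categorized as c.TSEK): syllabify() on 'བདེ་ལེགས'
  // yields [[u.TEXT, 0, 4], [u.TEXT, 4, 4]] - each syllable keeps its trailing
  // tsek when one is present.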

  /**
   * Adjust syllables for special cases
   */
  adjustSyls(start = null, end = null, yes = u.TEXT) {
    if (start === null && end === null) {
      start = 0;
      end = this.bs.len;
    }

    const indices = this.chunk(start, end, this._isSpace);

    for (let i = 0; i < indices.length; i++) {
      if (indices[i] === null) {
        continue; // already merged into an earlier chunk
      }
      if (indices.length - 1 > i && i > 0 && indices[i][0]) {
        // Space chunk between text chunks; find the surviving preceding chunk
        // (earlier merges may have left nulls behind)
        let p = i - 1;
        while (p >= 0 && indices[p] === null) {
          p--;
        }
        const [, s, l] = indices[p];
        const text = this.bs.string.slice(s, s + l);

        // Check if preceding text ends with ཀ, ག, ཤ (optionally followed by a vowel sign)
        const lastChar = text[text.length - 1];
        const secondLastChar = text.length >= 2 ? text[text.length - 2] : null;

        const endsWithNoShadCons = NO_SHAD_CONS.includes(lastChar);
        const endsWithVowelAfterNoShadCons = (
          VOWELS.includes(lastChar) &&
          secondLastChar &&
          NO_SHAD_CONS.includes(secondLastChar)
        );

        if (endsWithNoShadCons || endsWithVowelAfterNoShadCons) {
          // Merge space with preceding
          indices[p] = [
            yes,
            indices[p][1],
            indices[p][2] + indices[i][2]
          ];
        } else {
          // Merge all three chunks
          indices[p] = [
            indices[p][0],
            indices[p][1],
            indices[p][2] + indices[i][2] + (indices[i + 1] ? indices[i + 1][2] : 0)
          ];
          if (indices[i + 1]) {
            indices[i + 1] = null;
          }
        }
        indices[i] = null;
      } else if (indices[i][0] === false) {
        indices[i] = [yes, indices[i][1], indices[i][2]];
      } else if ((i === 0 || i === indices.length - 1) && indices[i][0] === true) {
        indices[i] = [u.PUNCT, indices[i][1], indices[i][2]];
      }
    }

    // Filter out nulls
    const result = indices.filter(chunk => chunk !== null && chunk[0] !== true);
    return result.length > 1 ? result : [];
  }

  /**
   * Merge skippable punctuation (tsek and space) with adjacent chunks
   */
  mergeSkippablePunct(chunks) {
    const isSkippable = (idx) => {
      const cat = this.bs.getCategory(idx);
      return cat === c.TSEK || cat === c.TRANSPARENT;
    };

    let i = 0;
    while (i < chunks.length) {
      const current = chunks[i];

      // Check if entire chunk is skippable
      let allSkippable = true;
      for (let j = current[1]; j < current[1] + current[2]; j++) {
        if (!isSkippable(j)) {
          allSkippable = false;
          break;
        }
      }

      if (allSkippable) {
        if (i === 0 && chunks.length > 1) {
          // Merge with next
          chunks[1] = [chunks[1][0], current[1], chunks[1][2] + current[2]];
          chunks.splice(0, 1);
          continue;
        } else if (i > 0) {
          // Merge with previous
          chunks[i - 1] = [
            chunks[i - 1][0],
            chunks[i - 1][1],
            chunks[i - 1][2] + current[2]
          ];
          chunks.splice(i, 1);
          continue;
        }
      }
      i++;
    }

    return this.mergeSimilarChunks(chunks);
  }

  /**
   * Merge adjacent chunks with same marker (except TEXT)
   */
  mergeSimilarChunks(chunks) {
    let i = 1;
    while (i < chunks.length) {
      const prev = chunks[i - 1];
      const curr = chunks[i];

      if (
        prev[0] !== u.TEXT &&
        curr[0] !== u.TEXT &&
        prev[0] === curr[0]
      ) {
        chunks[i - 1] = [prev[0], prev[1], prev[2] + curr[2]];
        chunks.splice(i, 1);
      } else {
        i++;
      }
    }
    return chunks;
  }
}
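
// Sketch (assuming ' ' is categorized TRANSPARENT): adjustSyls on 'ཀ ཁ' keeps
// the space with the first syllable because ཀ may close a syllable without a
// shad; for other finals the space and both neighbouring chunks are merged
// into one chunk.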

/**
 * TokChunks - produces chunks suitable for tokenization
 */
export class TokChunks extends ChunkFramework {
  /**
   * @param {string} text - Input text
   * @param {string[]} ignoreChars - Characters to ignore
   * @param {boolean} spaceAsPunct - Treat space as punctuation
   */
  constructor(text, ignoreChars = [], spaceAsPunct = false) {
    super(text, ignoreChars);
    this.spaceAsPunct = spaceAsPunct;
    this.chunks = null;
  }

  /**
   * Make chunks for tokenization
   */
  makeChunks() {
    let chunks = this.chunkBoChars();

    if (this.spaceAsPunct) {
      chunks = this.pipeChunk(chunks, this.chunkSpaces, u.BO, u.PUNCT);
    }

    chunks = this.pipeChunk(chunks, this.chunkPunct, u.BO, u.PUNCT);
    chunks = this.pipeChunk(chunks, this.chunkSymbol, u.BO, u.SYM);
    chunks = this.pipeChunk(chunks, this.chunkNumber, u.BO, u.NUM);

    if (!this.spaceAsPunct) {
      chunks = this.mergeSkippablePunct(chunks);
    }

    chunks = this.pipeChunk(chunks, this.syllabify, u.BO, u.TEXT);
    chunks = this.pipeChunk(chunks, this.adjustSyls, u.TEXT, u.TEXT);
    chunks = this.pipeChunk(chunks, this.chunkCjk, u.OTHER, u.CJK);
    chunks = this.pipeChunk(chunks, this.chunkLatin, u.OTHER, u.LATIN);

    if (!this.spaceAsPunct) {
      chunks = this.mergeSkippablePunct(chunks);
    }

    return chunks;
  }
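
  // Sketch of the pipeline on mixed input such as 'བཀྲ་ཤིས། abc': the BO run
  // is split off first, PUNCT/SYM/NUM chunks are carved out of it, the
  // remaining Tibetan text is syllabified into TEXT chunks, and the trailing
  // 'abc' run should end up labelled LATIN.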

  /**
   * Prepare chunks for trie lookup
   * Returns array of [syllable_chars | null, chunk_info] tuples
   */
  serveSylsToTrie() {
    const chunks = this.makeChunks();
    this.chunks = [];

    for (const chunk of chunks) {
      if (chunk[0] === u.TEXT) {
        const sylChars = this._getTextChars(chunk[1], chunk[1] + chunk[2]);
        this.chunks.push([sylChars, chunk]);
      } else {
        this.chunks.push([null, chunk]);
      }
    }

    return this.chunks;
  }

  /**
   * Get syllables as strings
   */
  getSyls() {
    const chunks = this.makeChunks();
    const syls = [];

    for (const chunk of chunks) {
      if (chunk[0] === u.TEXT) {
        const charIdxs = this._getTextChars(chunk[1], chunk[1] + chunk[2]);
        const syl = charIdxs.map(i => this.bs.string[i]).join('');
        syls.push(syl);
      }
    }

    return syls;
  }
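
  // Sketch: getSyls() on 'བདེ་ལེགས།' should return ['བདེ', 'ལེགས'] - TEXT
  // chunks with tseks stripped by _getTextChars, and the shad chunk skipped
  // entirely.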

  /**
   * Get character indices for syllable text (excluding tsek and spaces)
   * @private
   */
  _getTextChars(start, end) {
    const chars = [];
    for (let i = start; i < end; i++) {
      if (this._isSylText(i)) {
        chars.push(i);
      }
    }
    return chars;
  }

  /**
   * Test if character is part of syllable text
   * @private
   */
  _isSylText(idx) {
    const cat = this.bs.getCategory(idx);
    // Everything except the tsek and transparent chars belongs to the
    // syllable. SKRT_LONG_VOW splits syllables in _isTsekOrLongSkrtVowel but,
    // like the visarga, remains part of the syllable text, and it already
    // passes this test.
    return cat !== c.TSEK && cat !== c.TRANSPARENT;
  }
}
package/src/constants.js ADDED
@@ -0,0 +1,102 @@
/**
 * Constants for Tibetan word tokenizer
 * Ported from Botok (Python) - https://github.com/Esukhia/botok
 */

// Special characters
export const TSEK = '་'; // Tibetan syllable separator
export const NAMCHE = 'ཿ'; // Visarga
export const SHAD = '།'; // Tibetan full stop
export const AA = 'འ'; // A-chung
export const HASH = '#';

// Characters that can end a syllable without a shad
export const NO_SHAD_CONS = ['ཀ', 'ག', 'ཤ'];
// Vowel signs checked after those consonants (see adjustSyls in chunks.js)
export const VOWELS = ['ི'];

// Dagdra particles (nominalizers)
export const DAGDRA = ['པ་', 'པོ་', 'བ་', 'བོ་'];

/**
 * Character markers - categories for each Tibetan Unicode character
 */
export const CharMarkers = {
  // Regular Tibetan
  CONS: 1,     // Consonant
  SUB_CONS: 2, // Subjoined consonant
  VOW: 3,      // Vowel
  TSEK: 4,     // Tsek (syllable separator)

  // Punctuation
  NORMAL_PUNCT: 5,
  SPECIAL_PUNCT: 6,

  // Others
  NUMERAL: 7,
  SYMBOL: 8,
  IN_SYL_MARK: 9,
  NON_BO_NON_SKRT: 10,

  // Sanskrit-specific
  SKRT_CONS: 11,
  SKRT_SUB_CONS: 12,
  SKRT_VOW: 13,
  SKRT_LONG_VOW: 14,

  // Other languages
  CJK: 15,
  LATIN: 16,

  // Misc
  OTHER: 17,
  TRANSPARENT: 18, // Spaces and ignorable characters
  NFC: 19,         // Characters needing NFC normalization
};

// Reverse lookup for char markers
export const charMarkerNames = Object.fromEntries(
  Object.entries(CharMarkers).map(([k, v]) => [v, k])
);
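
// Sketch: the reverse map is handy for debugging, e.g.
//   charMarkerNames[4]  === 'TSEK'
//   charMarkerNames[18] === 'TRANSPARENT'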

/**
 * Chunk markers - categories for text chunks
 */
export const ChunkMarkers = {
  // Languages
  BO: 100,
  LATIN: 101,
  CJK: 102,
  OTHER: 103,

  // Tibetan textual content
  TEXT: 104,

  // Tibetan non-textual content
  PUNCT: 105,
  NON_PUNCT: 106,
  SPACE: 107,
  NON_SPACE: 108,
  SYM: 109,
  NON_SYM: 110,
  NUM: 111,
  NON_NUM: 112,
};

// Reverse lookup for chunk markers
export const chunkMarkerNames = Object.fromEntries(
  Object.entries(ChunkMarkers).map(([k, v]) => [v, k])
);

/**
 * Word markers - categories for tokenized words
 */
export const WordMarkers = {
  WORD: 1000,
  NO_POS: 1001,
  NON_WORD: 1002,
};

// Reverse lookup for word markers
export const wordMarkerNames = Object.fromEntries(
  Object.entries(WordMarkers).map(([k, v]) => [v, k])
);
package/src/index.js ADDED
@@ -0,0 +1,68 @@
/**
 * tibetan-word-tokenizer
 *
 * A JavaScript port of Botok - Tibetan word tokenizer with Sanskrit detection
 * https://github.com/Esukhia/botok
 */

export * from './constants.js';
export * from './char-categories.js';
export * from './chunks.js';
export * from './trie.js';
export * from './tokenizer.js';
export * from './sanskrit.js';

// Re-export main classes as named exports for convenience
import { WordTokenizer, Token, Tokenize } from './tokenizer.js';
import { Trie, TrieNode } from './trie.js';
import { TokChunks, BoString, ChunkFramework } from './chunks.js';
import { isSanskritSyllable, hasSanskritSyllable, isSanskrit } from './sanskrit.js';

export {
  WordTokenizer,
  Token,
  Tokenize,
  Trie,
  TrieNode,
  TokChunks,
  BoString,
  ChunkFramework,
  isSanskritSyllable,
  hasSanskritSyllable,
  isSanskrit,
};

/**
 * Create a tokenizer, optionally loading a supplied dictionary
 * @param {Object} options - Configuration options
 * @param {Object} options.dictionary - Pre-loaded dictionary data
 * @returns {WordTokenizer}
 */
export function createTokenizer(options = {}) {
  const tokenizer = new WordTokenizer();

  if (options.dictionary) {
    tokenizer.loadDictionary(options.dictionary);
  }

  return tokenizer;
}
+ }
50
+
51
+ /**
52
+ * Quick tokenization without loading full dictionary
53
+ * Uses syllable boundaries only (less accurate but works without dictionary)
54
+ * @param {string} text - Input text
55
+ * @returns {string[]} Array of syllables/tokens
56
+ */
57
+ export function quickTokenize(text) {
58
+ const chunks = new TokChunks(text);
59
+ return chunks.getSyls();
60
+ }
61
+
62
+ export default {
63
+ WordTokenizer,
64
+ Trie,
65
+ TokChunks,
66
+ createTokenizer,
67
+ quickTokenize,
68
+ };