tibetan-word-tokenizer 1.0.0
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/data/dictionary.json +1 -0
- package/data/dictionary.pretty.json +123406 -0
- package/package.json +38 -0
- package/src/char-categories.js +349 -0
- package/src/chunks.js +516 -0
- package/src/constants.js +102 -0
- package/src/index.js +68 -0
- package/src/sanskrit.js +228 -0
- package/src/tokenizer.js +434 -0
- package/src/trie.js +263 -0
package/src/chunks.js
ADDED
@@ -0,0 +1,516 @@
/**
 * Text chunking for Tibetan - splits text into syllables and other chunks
 * Ported from Botok's chunks.py and chunkframework.py
 */

import { CharMarkers as c, ChunkMarkers as u, NO_SHAD_CONS, VOWELS } from './constants.js';
import { getCharCategory, isTibetanCategory } from './char-categories.js';

/**
 * BoString - analyzes character types in a string
 */
export class BoString {
  /**
   * @param {string} text - Input text
   * @param {string[]} ignoreChars - Characters to treat as transparent
   */
  constructor(text, ignoreChars = []) {
    this.string = text;
    this.len = text.length;
    this.ignoreChars = new Set(ignoreChars);
    this.baseStructure = new Map();
    this._analyzeChars();
  }

  _analyzeChars() {
    for (let i = 0; i < this.len; i++) {
      const char = this.string[i];
      if (this.ignoreChars.has(char)) {
        this.baseStructure.set(i, c.TRANSPARENT);
      } else {
        this.baseStructure.set(i, getCharCategory(char));
      }
    }
  }

  /**
   * Get character category at index
   * @param {number} idx
   * @returns {number} Character marker
   */
  getCategory(idx) {
    return this.baseStructure.get(idx);
  }

  /**
   * Export categories for a slice
   * @param {number} start
   * @param {number} length
   * @returns {Map<number, number>}
   */
  exportGroups(start, length) {
    const result = new Map();
    for (let i = 0; i < length; i++) {
      result.set(i, this.baseStructure.get(start + i));
    }
    return result;
  }
}
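
// Illustrative example (not part of the package source): BoString assigns a
// CharMarkers category to every code unit, so for the syllable བོད་ one would
// expect, assuming getCharCategory() classifies these code points as Botok does:
//   const bs = new BoString('བོད་');
//   bs.getCategory(0); // c.CONS ('བ')
//   bs.getCategory(1); // c.VOW  ('ོ')
//   bs.getCategory(3); // c.TSEK ('་')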

/**
 * ChunkFramework - base class for text chunking
 */
export class ChunkFramework {
  /**
   * @param {string} text - Input text
   * @param {string[]} ignoreChars - Characters to ignore
   */
  constructor(text, ignoreChars = []) {
    this.bs = new BoString(text, ignoreChars);
  }

  /**
   * Generic chunking method
   * @param {number} start - Start index
   * @param {number} end - End index
   * @param {Function} conditionFunc - Test function
   * @returns {Array} Chunks as [isMatch, start, length] tuples
   */
  chunk(start, end, conditionFunc) {
    const chunks = [];
    let chunkStart = start;
    let length = 0;
    let prevState = null;

    for (let i = start; i < end; i++) {
      const currentState = conditionFunc.call(this, i);

      if (prevState === null) {
        prevState = currentState;
      }

      if (currentState === prevState) {
        length++;
      } else {
        chunks.push([prevState, chunkStart, length]);
        prevState = currentState;
        chunkStart += length;
        length = 1;
      }
    }

    // Final chunk
    if (length > 0) {
      chunks.push([prevState, chunkStart, length]);
    }

    return chunks;
  }
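
  // Illustrative example (not part of the package source): chunk() groups
  // consecutive indices by the boolean result of conditionFunc. For a
  // 5-character string where only index 2 matches:
  //   this.chunk(0, 5, fn); // => [[false, 0, 2], [true, 2, 1], [false, 3, 2]]
  // i.e. [state, startIndex, length] tuples covering the whole range.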

  /**
   * Chunk using a condition function with markers
   */
  chunkUsing(conditionFunc, start, end, yes, no) {
    if (start === null && end === null) {
      start = 0;
      end = this.bs.len;
    }

    const indices = this.chunk(start, end, conditionFunc);
    return indices.map(([isMatch, s, l]) => [isMatch ? yes : no, s, l]);
  }

  /**
   * Pipe chunking - re-chunk specific chunks
   */
  pipeChunk(chunks, chunkFunc, toChunkMarker, yes) {
    const result = [];

    for (const chunk of chunks) {
      if (chunk[0] === toChunkMarker) {
        const newChunks = chunkFunc.call(this, chunk[1], chunk[1] + chunk[2], yes);
        if (newChunks && newChunks.length > 0) {
          for (const newChunk of newChunks) {
            if (newChunk[0] !== yes) {
              result.push([chunk[0], newChunk[1], newChunk[2]]);
            } else {
              result.push(newChunk);
            }
          }
        } else {
          result.push(chunk);
        }
      } else {
        result.push(chunk);
      }
    }

    return result;
  }
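
  // Illustrative example (not part of the package source): pipeChunk()
  // re-chunks only the chunks carrying toChunkMarker; sub-chunks that do not
  // receive the new marker inherit the parent's marker. For instance, if the
  // last 3 of 10 Tibetan characters are punctuation:
  //   this.pipeChunk([[u.BO, 0, 10]], this.chunkPunct, u.BO, u.PUNCT);
  //   // => [[u.BO, 0, 7], [u.PUNCT, 7, 3]]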

  // Test methods
  _isBoUnicode(idx) {
    const cat = this.bs.getCategory(idx);
    return isTibetanCategory(cat);
  }

  _isPunct(idx) {
    const cat = this.bs.getCategory(idx);
    const prevCat = idx > 0 ? this.bs.getCategory(idx - 1) : null;

    // Special handling for punctuation after symbols/punct
    if (prevCat !== null) {
      const isPrevSpecial = (
        prevCat === c.SYMBOL ||
        prevCat === c.NUMERAL ||
        prevCat === c.OTHER ||
        prevCat === c.NORMAL_PUNCT ||
        prevCat === c.SPECIAL_PUNCT ||
        prevCat === c.TSEK ||
        prevCat === c.TRANSPARENT
      );
      const isCurrPunct = (
        cat === c.TSEK ||
        cat === c.TRANSPARENT ||
        cat === c.NORMAL_PUNCT
      );
      if (isPrevSpecial && isCurrPunct) {
        return true;
      }
    }

    return (
      cat === c.NORMAL_PUNCT ||
      cat === c.SPECIAL_PUNCT ||
      cat === c.TRANSPARENT
    );
  }

  _isSymbol(idx) {
    const cat = this.bs.getCategory(idx);
    return (
      cat === c.SYMBOL ||
      cat === c.TRANSPARENT ||
      cat === c.NFC
    );
  }

  _isNumeral(idx) {
    const cat = this.bs.getCategory(idx);
    return cat === c.NUMERAL || cat === c.TRANSPARENT;
  }

  _isSpace(idx) {
    return this.bs.getCategory(idx) === c.TRANSPARENT;
  }

  _isTsekOrLongSkrtVowel(idx) {
    const cat = this.bs.getCategory(idx);
    return cat === c.TSEK || cat === c.SKRT_LONG_VOW;
  }

  _isLatin(idx) {
    const cat = this.bs.getCategory(idx);
    return cat === c.LATIN || cat === c.TRANSPARENT;
  }

  _isCjk(idx) {
    const cat = this.bs.getCategory(idx);
    return cat === c.CJK || cat === c.TRANSPARENT;
  }

  // Chunking methods
  chunkBoChars(start = null, end = null, yes = u.BO, no = u.OTHER) {
    return this.chunkUsing(this._isBoUnicode, start, end, yes, no);
  }

  chunkPunct(start = null, end = null, yes = u.PUNCT, no = u.NON_PUNCT) {
    return this.chunkUsing(this._isPunct, start, end, yes, no);
  }

  chunkSymbol(start = null, end = null, yes = u.SYM, no = u.NON_SYM) {
    return this.chunkUsing(this._isSymbol, start, end, yes, no);
  }

  chunkNumber(start = null, end = null, yes = u.NUM, no = u.NON_NUM) {
    return this.chunkUsing(this._isNumeral, start, end, yes, no);
  }

  chunkSpaces(start = null, end = null, yes = u.SPACE, no = u.NON_SPACE) {
    return this.chunkUsing(this._isSpace, start, end, yes, no);
  }

  chunkLatin(start = null, end = null, yes = u.LATIN, no = u.OTHER) {
    return this.chunkUsing(this._isLatin, start, end, yes, no);
  }

  chunkCjk(start = null, end = null, yes = u.CJK, no = u.OTHER) {
    return this.chunkUsing(this._isCjk, start, end, yes, no);
  }

  /**
   * Syllabify - split Tibetan text into syllables
   */
  syllabify(start = null, end = null, yes = u.TEXT) {
    if (start === null && end === null) {
      start = 0;
      end = this.bs.len;
    }

    const indices = this.chunk(start, end, this._isTsekOrLongSkrtVowel);

    // Merge tsek with preceding syllable
    for (let i = 0; i < indices.length; i++) {
      if (indices[i][0] && i > 0 && !indices[i - 1][0]) {
        indices[i - 1] = [
          indices[i - 1][0],
          indices[i - 1][1],
          indices[i - 1][2] + indices[i][2]
        ];
        indices[i] = null; // Mark for removal
      }
    }

    // Filter out nulls and tsek-only chunks, return syllables
    return indices
      .filter(chunk => chunk !== null && !chunk[0])
      .map(([_, s, l]) => [yes, s, l]);
  }
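
  // Illustrative example (not part of the package source): syllabify() splits
  // on tsek, then folds each tsek into the preceding syllable, so བཀྲ་ཤིས་
  // yields two TEXT chunks that keep their trailing tsek:
  //   new ChunkFramework('བཀྲ་ཤིས་').syllabify();
  //   // => [[u.TEXT, 0, 4], [u.TEXT, 4, 4]]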

  /**
   * Adjust syllables for special cases
   */
  adjustSyls(start = null, end = null, yes = u.TEXT) {
    if (start === null && end === null) {
      start = 0;
      end = this.bs.len;
    }

    const indices = this.chunk(start, end, this._isSpace);

    for (let i = 0; i < indices.length; i++) {
      // Skip chunks already merged away in an earlier iteration
      if (indices[i] === null) continue;

      if (indices.length - 1 > i && i > 0 && indices[i][0]) {
        // Space chunk between text chunks; look back past merged-away
        // entries to the nearest surviving preceding chunk
        let p = i - 1;
        while (p > 0 && indices[p] === null) p--;
        const [_, s, e] = indices[p];
        const text = this.bs.string.slice(s, s + e);

        // Check if preceding text ends with ཀ, ག, ཤ (optionally with vowel)
        const lastChar = text[text.length - 1];
        const secondLastChar = text.length >= 2 ? text[text.length - 2] : null;

        const endsWithNoShadCons = NO_SHAD_CONS.includes(lastChar);
        const endsWithVowelAfterNoShadCons = (
          VOWELS.includes(lastChar) &&
          secondLastChar &&
          NO_SHAD_CONS.includes(secondLastChar)
        );

        if (endsWithNoShadCons || endsWithVowelAfterNoShadCons) {
          // Merge space with preceding
          indices[p] = [
            yes,
            indices[p][1],
            indices[p][2] + indices[i][2]
          ];
        } else {
          // Merge all three chunks
          indices[p] = [
            indices[p][0],
            indices[p][1],
            indices[p][2] + indices[i][2] + (indices[i + 1] ? indices[i + 1][2] : 0)
          ];
          if (indices[i + 1]) {
            indices[i + 1] = null;
          }
        }
        indices[i] = null;
      } else if (indices[i][0] === false) {
        indices[i] = [yes, indices[i][1], indices[i][2]];
      } else if ((i === 0 || i === indices.length - 1) && indices[i][0] === true) {
        indices[i] = [u.PUNCT, indices[i][1], indices[i][2]];
      }
    }

    // Filter out nulls
    const result = indices.filter(chunk => chunk !== null && chunk[0] !== true);
    return result.length > 1 ? result : [];
  }

  /**
   * Merge skippable punctuation (tsek and space) with adjacent chunks
   */
  mergeSkippablePunct(chunks) {
    const isSkippable = (idx) => {
      const cat = this.bs.getCategory(idx);
      return cat === c.TSEK || cat === c.TRANSPARENT;
    };

    let i = 0;
    while (i < chunks.length) {
      const current = chunks[i];

      // Check if entire chunk is skippable
      let allSkippable = true;
      for (let j = current[1]; j < current[1] + current[2]; j++) {
        if (!isSkippable(j)) {
          allSkippable = false;
          break;
        }
      }

      if (allSkippable) {
        if (i === 0 && chunks.length > 1) {
          // Merge with next
          chunks[1] = [chunks[1][0], current[1], chunks[1][2] + current[2]];
          chunks.splice(0, 1);
          continue;
        } else if (i > 0) {
          // Merge with previous
          chunks[i - 1] = [
            chunks[i - 1][0],
            chunks[i - 1][1],
            chunks[i - 1][2] + current[2]
          ];
          chunks.splice(i, 1);
          continue;
        }
      }
      i++;
    }

    return this.mergeSimilarChunks(chunks);
  }

  /**
   * Merge adjacent chunks with same marker (except TEXT)
   */
  mergeSimilarChunks(chunks) {
    let i = 1;
    while (i < chunks.length) {
      const prev = chunks[i - 1];
      const curr = chunks[i];

      if (
        prev[0] !== u.TEXT &&
        curr[0] !== u.TEXT &&
        prev[0] === curr[0]
      ) {
        chunks[i - 1] = [prev[0], prev[1], prev[2] + curr[2]];
        chunks.splice(i, 1);
      } else {
        i++;
      }
    }
    return chunks;
  }
}
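
// Illustrative example (not part of the package source): the chunkers compose,
// classifying a string by script first. Assuming isTibetanCategory() covers
// Tibetan letters and punctuation:
//   new ChunkFramework('བོད།abc').chunkBoChars();
//   // => [[u.BO, 0, 4], [u.OTHER, 4, 3]]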

/**
 * TokChunks - produces chunks suitable for tokenization
 */
export class TokChunks extends ChunkFramework {
  /**
   * @param {string} text - Input text
   * @param {string[]} ignoreChars - Characters to ignore
   * @param {boolean} spaceAsPunct - Treat space as punctuation
   */
  constructor(text, ignoreChars = [], spaceAsPunct = false) {
    super(text, ignoreChars);
    this.spaceAsPunct = spaceAsPunct;
    this.chunks = null;
  }

  /**
   * Make chunks for tokenization
   */
  makeChunks() {
    let chunks = this.chunkBoChars();

    if (this.spaceAsPunct) {
      chunks = this.pipeChunk(chunks, this.chunkSpaces, u.BO, u.PUNCT);
    }

    chunks = this.pipeChunk(chunks, this.chunkPunct, u.BO, u.PUNCT);
    chunks = this.pipeChunk(chunks, this.chunkSymbol, u.BO, u.SYM);
    chunks = this.pipeChunk(chunks, this.chunkNumber, u.BO, u.NUM);

    if (!this.spaceAsPunct) {
      chunks = this.mergeSkippablePunct(chunks);
    }

    chunks = this.pipeChunk(chunks, this.syllabify, u.BO, u.TEXT);
    chunks = this.pipeChunk(chunks, this.adjustSyls, u.TEXT, u.TEXT);
    chunks = this.pipeChunk(chunks, this.chunkCjk, u.OTHER, u.CJK);
    chunks = this.pipeChunk(chunks, this.chunkLatin, u.OTHER, u.LATIN);

    if (!this.spaceAsPunct) {
      chunks = this.mergeSkippablePunct(chunks);
    }

    return chunks;
  }

  /**
   * Prepare chunks for trie lookup
   * Returns array of [syllable_chars | null, chunk_info] tuples
   */
  serveSylsToTrie() {
    const chunks = this.makeChunks();
    this.chunks = [];

    for (const chunk of chunks) {
      if (chunk[0] === u.TEXT) {
        const sylChars = this._getTextChars(chunk[1], chunk[1] + chunk[2]);
        this.chunks.push([sylChars, chunk]);
      } else {
        this.chunks.push([null, chunk]);
      }
    }

    return this.chunks;
  }

  /**
   * Get syllables as strings
   */
  getSyls() {
    const chunks = this.makeChunks();
    const syls = [];

    for (const chunk of chunks) {
      if (chunk[0] === u.TEXT) {
        const charIdxs = this._getTextChars(chunk[1], chunk[1] + chunk[2]);
        const syl = charIdxs.map(i => this.bs.string[i]).join('');
        syls.push(syl);
      }
    }

    return syls;
  }

  /**
   * Get character indices for syllable text (excluding tsek and spaces)
   * @private
   */
  _getTextChars(start, end) {
    const chars = [];
    for (let i = start; i < end; i++) {
      if (this._isSylText(i)) {
        chars.push(i);
      }
    }
    return chars;
  }

  /**
   * Test if character is part of syllable text
   * @private
   */
  _isSylText(idx) {
    const cat = this.bs.getCategory(idx);
    return (
      cat !== c.TSEK &&
      cat !== c.TRANSPARENT
    ) || cat === c.SKRT_LONG_VOW; // Visarga is part of syllable
  }
}
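
TokChunks drives the whole pre-tokenization pipeline: script chunking, punctuation/symbol/number splitting, syllabification, and the merge passes. A minimal usage sketch (illustrative; output shapes follow the methods above):

    import { TokChunks, ChunkMarkers as u } from 'tibetan-word-tokenizer';

    const tc = new TokChunks('བཀྲ་ཤིས་བདེ་ལེགས།');
    tc.getSyls();     // => ['བཀྲ', 'ཤིས', 'བདེ', 'ལེགས'] (tsek stripped by _getTextChars)
    tc.makeChunks();  // => [[u.TEXT, ...], ..., [u.PUNCT, ...]] covering the input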
package/src/constants.js
ADDED
@@ -0,0 +1,102 @@
/**
 * Constants for Tibetan word tokenizer
 * Ported from Botok (Python) - https://github.com/Esukhia/botok
 */

// Special characters
export const TSEK = '་'; // Tibetan syllable separator
export const NAMCHE = 'ཿ'; // Visarga
export const SHAD = '།'; // Tibetan full stop
export const AA = 'འ'; // A-chung
export const HASH = '#';

// Characters that can end a syllable without shad
export const NO_SHAD_CONS = ['ཀ', 'ག', 'ཤ'];
export const VOWELS = ['ི'];

// Dagdra particles (nominalizers)
export const DAGDRA = ['པ་', 'པོ་', 'བ་', 'བོ་'];

/**
 * Character markers - categories for each Tibetan Unicode character
 */
export const CharMarkers = {
  // Regular Tibetan
  CONS: 1, // Consonant
  SUB_CONS: 2, // Subjoined consonant
  VOW: 3, // Vowel
  TSEK: 4, // Tsek (syllable separator)

  // Punctuation
  NORMAL_PUNCT: 5,
  SPECIAL_PUNCT: 6,

  // Others
  NUMERAL: 7,
  SYMBOL: 8,
  IN_SYL_MARK: 9,
  NON_BO_NON_SKRT: 10,

  // Sanskrit-specific
  SKRT_CONS: 11,
  SKRT_SUB_CONS: 12,
  SKRT_VOW: 13,
  SKRT_LONG_VOW: 14,

  // Other languages
  CJK: 15,
  LATIN: 16,

  // Misc
  OTHER: 17,
  TRANSPARENT: 18, // Spaces and ignorable characters
  NFC: 19, // Characters needing NFC normalization
};

// Reverse lookup for char markers
export const charMarkerNames = Object.fromEntries(
  Object.entries(CharMarkers).map(([k, v]) => [v, k])
);

/**
 * Chunk markers - categories for text chunks
 */
export const ChunkMarkers = {
  // Languages
  BO: 100,
  LATIN: 101,
  CJK: 102,
  OTHER: 103,

  // Tibetan textual content
  TEXT: 104,

  // Tibetan non-textual content
  PUNCT: 105,
  NON_PUNCT: 106,
  SPACE: 107,
  NON_SPACE: 108,
  SYM: 109,
  NON_SYM: 110,
  NUM: 111,
  NON_NUM: 112,
};

// Reverse lookup for chunk markers
export const chunkMarkerNames = Object.fromEntries(
  Object.entries(ChunkMarkers).map(([k, v]) => [v, k])
);

/**
 * Word markers - categories for tokenized words
 */
export const WordMarkers = {
  WORD: 1000,
  NO_POS: 1001,
  NON_WORD: 1002,
};

// Reverse lookup for word markers
export const wordMarkerNames = Object.fromEntries(
  Object.entries(WordMarkers).map(([k, v]) => [v, k])
);
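
The reverse-lookup tables turn numeric markers back into readable names, which is handy when inspecting chunk output. For instance (illustrative):

    import { ChunkMarkers, chunkMarkerNames } from './constants.js';

    chunkMarkerNames[ChunkMarkers.TEXT]; // => 'TEXT'
    chunkMarkerNames[105];               // => 'PUNCT'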
package/src/index.js
ADDED
@@ -0,0 +1,68 @@
/**
 * tibetan-word-tokenizer
 *
 * A JavaScript port of Botok - Tibetan word tokenizer with Sanskrit detection
 * https://github.com/Esukhia/botok
 */

export * from './constants.js';
export * from './char-categories.js';
export * from './chunks.js';
export * from './trie.js';
export * from './tokenizer.js';
export * from './sanskrit.js';

// Re-export main classes as named exports for convenience
import { WordTokenizer, Token, Tokenize } from './tokenizer.js';
import { Trie, TrieNode } from './trie.js';
import { TokChunks, BoString, ChunkFramework } from './chunks.js';
import { isSanskritSyllable, hasSanskritSyllable, isSanskrit } from './sanskrit.js';

export {
  WordTokenizer,
  Token,
  Tokenize,
  Trie,
  TrieNode,
  TokChunks,
  BoString,
  ChunkFramework,
  isSanskritSyllable,
  hasSanskritSyllable,
  isSanskrit,
};

/**
 * Create a tokenizer, optionally loading a pre-built dictionary
 * @param {Object} options - Configuration options
 * @param {Object} options.dictionary - Pre-loaded dictionary data
 * @returns {WordTokenizer}
 */
export function createTokenizer(options = {}) {
  const tokenizer = new WordTokenizer();

  if (options.dictionary) {
    tokenizer.loadDictionary(options.dictionary);
  }

  return tokenizer;
}

/**
 * Quick tokenization without loading the full dictionary
 * Uses syllable boundaries only (less accurate but works without a dictionary)
 * @param {string} text - Input text
 * @returns {string[]} Array of syllables/tokens
 */
export function quickTokenize(text) {
  const chunks = new TokChunks(text);
  return chunks.getSyls();
}

export default {
  WordTokenizer,
  Trie,
  TokChunks,
  createTokenizer,
  quickTokenize,
};
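
The two entry points trade accuracy for setup cost: createTokenizer() wires a WordTokenizer to a pre-loaded dictionary, while quickTokenize() falls back to plain syllable boundaries. A usage sketch (illustrative; loading dictionary.json is left to the caller, and its shape is assumed to match what loadDictionary() expects):

    import { createTokenizer, quickTokenize } from 'tibetan-word-tokenizer';

    // Syllable-level split, no dictionary needed
    quickTokenize('བཀྲ་ཤིས་བདེ་ལེགས།'); // => ['བཀྲ', 'ཤིས', 'བདེ', 'ལེགས']

    // Dictionary-backed tokenizer; `dictionary` stands for the parsed
    // contents of package/data/dictionary.json
    const tokenizer = createTokenizer({ dictionary });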