tibetan-word-tokenizer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json ADDED
@@ -0,0 +1,38 @@
1
+ {
2
+ "name": "tibetan-word-tokenizer",
3
+ "version": "1.0.0",
4
+ "description": "A JavaScript port of Botok - Tibetan word tokenizer with Sanskrit detection",
5
+ "main": "src/index.js",
6
+ "type": "module",
7
+ "exports": {
8
+ ".": "./src/index.js",
9
+ "./trie": "./src/trie.js",
10
+ "./tokenizer": "./src/tokenizer.js",
11
+ "./chunks": "./src/chunks.js"
12
+ },
13
+ "scripts": {
14
+ "test": "node --test test/*.test.js",
15
+ "build-data": "node scripts/build-dictionary.js"
16
+ },
17
+ "keywords": [
18
+ "tibetan",
19
+ "tokenizer",
20
+ "nlp",
21
+ "word-segmentation",
22
+ "botok",
23
+ "sanskrit"
24
+ ],
25
+ "author": "",
26
+ "license": "Apache-2.0",
27
+ "repository": {
28
+ "type": "git",
29
+ "url": "https://github.com/your-username/tibetan-word-tokenizer"
30
+ },
31
+ "files": [
32
+ "src/",
33
+ "data/"
34
+ ],
35
+ "engines": {
36
+ "node": ">=18.0.0"
37
+ }
38
+ }
@@ -0,0 +1,349 @@
1
+ /**
2
+ * Character category mappings for Tibetan Unicode
3
+ * Ported from Botok's bo_uni_table.csv
4
+ */
5
+
6
+ import { CharMarkers as c } from './constants.js';
7
+
8
/**
 * Lookup table: single character in the Tibetan block -> CharMarkers category.
 *
 * Keys are one-character strings written as \uXXXX escapes; values are the
 * marker constants imported as `c`. Code points that have no key here
 * (for example U+0F6D-U+0F70, U+0F98, U+0FBD, U+0FCD and everything from
 * U+0FDB upward) are simply absent from the table.
 */
const tibetanCharCategories = {
  // U+0F00: syllable OM
  '\u0F00': c.NFC,

  // U+0F01-U+0F14: marks and punctuation (tsheg variants at U+0F0B/U+0F0C)
  '\u0F01': c.SPECIAL_PUNCT, '\u0F02': c.SPECIAL_PUNCT, '\u0F03': c.SPECIAL_PUNCT,
  '\u0F04': c.NORMAL_PUNCT, '\u0F05': c.NORMAL_PUNCT, '\u0F06': c.NORMAL_PUNCT,
  '\u0F07': c.SPECIAL_PUNCT,
  '\u0F08': c.NORMAL_PUNCT,
  '\u0F09': c.SPECIAL_PUNCT, '\u0F0A': c.SPECIAL_PUNCT,
  '\u0F0B': c.TSEK, '\u0F0C': c.TSEK,
  '\u0F0D': c.NORMAL_PUNCT, '\u0F0E': c.NORMAL_PUNCT, '\u0F0F': c.NORMAL_PUNCT,
  '\u0F10': c.NORMAL_PUNCT, '\u0F11': c.NORMAL_PUNCT,
  '\u0F12': c.SPECIAL_PUNCT,
  '\u0F13': c.SYMBOL,
  '\u0F14': c.NORMAL_PUNCT,

  // U+0F15-U+0F1F: symbols
  '\u0F15': c.SYMBOL, '\u0F16': c.SYMBOL, '\u0F17': c.SYMBOL, '\u0F18': c.SYMBOL,
  '\u0F19': c.SYMBOL, '\u0F1A': c.SYMBOL, '\u0F1B': c.SYMBOL, '\u0F1C': c.SYMBOL,
  '\u0F1D': c.SYMBOL, '\u0F1E': c.SYMBOL, '\u0F1F': c.SYMBOL,

  // U+0F20-U+0F29: digits
  '\u0F20': c.NUMERAL, '\u0F21': c.NUMERAL, '\u0F22': c.NUMERAL, '\u0F23': c.NUMERAL,
  '\u0F24': c.NUMERAL, '\u0F25': c.NUMERAL, '\u0F26': c.NUMERAL, '\u0F27': c.NUMERAL,
  '\u0F28': c.NUMERAL, '\u0F29': c.NUMERAL,

  // U+0F2A-U+0F33: half digits, categorised as symbols
  '\u0F2A': c.SYMBOL, '\u0F2B': c.SYMBOL, '\u0F2C': c.SYMBOL, '\u0F2D': c.SYMBOL,
  '\u0F2E': c.SYMBOL, '\u0F2F': c.SYMBOL, '\u0F30': c.SYMBOL, '\u0F31': c.SYMBOL,
  '\u0F32': c.SYMBOL, '\u0F33': c.SYMBOL,

  // U+0F34-U+0F39: punctuation, in-syllable marks and symbols
  '\u0F34': c.NORMAL_PUNCT,
  '\u0F35': c.IN_SYL_MARK,
  '\u0F36': c.SYMBOL,
  '\u0F37': c.IN_SYL_MARK, '\u0F38': c.IN_SYL_MARK,
  '\u0F39': c.SYMBOL,

  // U+0F3A-U+0F3F: brackets and related marks
  '\u0F3A': c.SPECIAL_PUNCT, '\u0F3B': c.SPECIAL_PUNCT,
  '\u0F3C': c.NORMAL_PUNCT, '\u0F3D': c.NORMAL_PUNCT,
  '\u0F3E': c.SPECIAL_PUNCT, '\u0F3F': c.SPECIAL_PUNCT,

  // U+0F40-U+0F6C: base consonants (precomposed letters are tagged NFC)
  '\u0F40': c.CONS, '\u0F41': c.CONS, '\u0F42': c.CONS,
  '\u0F43': c.NFC,
  '\u0F44': c.CONS, '\u0F45': c.CONS, '\u0F46': c.CONS, '\u0F47': c.CONS,
  '\u0F48': c.NFC,
  '\u0F49': c.CONS,
  '\u0F4A': c.SKRT_CONS, '\u0F4B': c.SKRT_CONS, '\u0F4C': c.SKRT_CONS,
  '\u0F4D': c.NFC,
  '\u0F4E': c.SKRT_CONS,
  '\u0F4F': c.CONS, '\u0F50': c.CONS, '\u0F51': c.CONS,
  '\u0F52': c.NFC,
  '\u0F53': c.CONS, '\u0F54': c.CONS, '\u0F55': c.CONS, '\u0F56': c.CONS,
  '\u0F57': c.NFC,
  '\u0F58': c.CONS, '\u0F59': c.CONS, '\u0F5A': c.CONS, '\u0F5B': c.CONS,
  '\u0F5C': c.NFC,
  '\u0F5D': c.CONS, '\u0F5E': c.CONS, '\u0F5F': c.CONS, '\u0F60': c.CONS,
  '\u0F61': c.CONS, '\u0F62': c.CONS, '\u0F63': c.CONS, '\u0F64': c.CONS,
  '\u0F65': c.SKRT_CONS,
  '\u0F66': c.CONS, '\u0F67': c.CONS, '\u0F68': c.CONS,
  '\u0F69': c.NFC,
  '\u0F6A': c.CONS,
  '\u0F6B': c.NON_BO_NON_SKRT, '\u0F6C': c.NON_BO_NON_SKRT,

  // U+0F71-U+0F87: vowel signs and related marks
  '\u0F71': c.SKRT_SUB_CONS,
  '\u0F72': c.VOW,
  '\u0F73': c.NFC,
  '\u0F74': c.VOW,
  '\u0F75': c.NFC, '\u0F76': c.NFC, '\u0F77': c.NFC, '\u0F78': c.NFC, '\u0F79': c.NFC,
  '\u0F7A': c.VOW,
  '\u0F7B': c.SKRT_VOW,
  '\u0F7C': c.VOW,
  '\u0F7D': c.SKRT_VOW,
  '\u0F7E': c.IN_SYL_MARK,
  '\u0F7F': c.SKRT_LONG_VOW,
  '\u0F80': c.SKRT_VOW,
  '\u0F81': c.NFC,
  '\u0F82': c.SKRT_VOW, '\u0F83': c.SKRT_VOW, '\u0F84': c.SKRT_VOW,
  '\u0F85': c.SKRT_CONS,
  '\u0F86': c.SKRT_VOW,
  '\u0F87': c.SYMBOL,

  // U+0F88-U+0F8F: categorised as symbols
  '\u0F88': c.SYMBOL, '\u0F89': c.SYMBOL, '\u0F8A': c.SYMBOL, '\u0F8B': c.SYMBOL,
  '\u0F8C': c.SYMBOL, '\u0F8D': c.SYMBOL, '\u0F8E': c.SYMBOL, '\u0F8F': c.SYMBOL,

  // U+0F90-U+0FBC: subjoined consonants (U+0F98 has no entry)
  '\u0F90': c.SUB_CONS,
  '\u0F91': c.SKRT_SUB_CONS,
  '\u0F92': c.SUB_CONS,
  '\u0F93': c.NFC,
  '\u0F94': c.SUB_CONS, '\u0F95': c.SUB_CONS,
  '\u0F96': c.SKRT_SUB_CONS,
  '\u0F97': c.SUB_CONS,
  '\u0F99': c.SUB_CONS,
  '\u0F9A': c.SKRT_SUB_CONS, '\u0F9B': c.SKRT_SUB_CONS, '\u0F9C': c.SKRT_SUB_CONS,
  '\u0F9D': c.NFC,
  '\u0F9E': c.SKRT_SUB_CONS,
  '\u0F9F': c.SUB_CONS,
  '\u0FA0': c.SKRT_SUB_CONS,
  '\u0FA1': c.SUB_CONS,
  '\u0FA2': c.NFC,
  '\u0FA3': c.SUB_CONS, '\u0FA4': c.SUB_CONS,
  '\u0FA5': c.SKRT_SUB_CONS,
  '\u0FA6': c.SUB_CONS,
  '\u0FA7': c.NFC,
  '\u0FA8': c.SUB_CONS, '\u0FA9': c.SUB_CONS,
  '\u0FAA': c.SKRT_SUB_CONS,
  '\u0FAB': c.SUB_CONS,
  '\u0FAC': c.NFC,
  '\u0FAD': c.SUB_CONS,
  '\u0FAE': c.SKRT_SUB_CONS, '\u0FAF': c.SKRT_SUB_CONS, '\u0FB0': c.SKRT_SUB_CONS,
  '\u0FB1': c.SUB_CONS, '\u0FB2': c.SUB_CONS, '\u0FB3': c.SUB_CONS,
  '\u0FB4': c.SKRT_SUB_CONS, '\u0FB5': c.SKRT_SUB_CONS, '\u0FB6': c.SKRT_SUB_CONS,
  '\u0FB7': c.SUB_CONS,
  '\u0FB8': c.SKRT_SUB_CONS,
  '\u0FB9': c.NFC,
  '\u0FBA': c.SKRT_SUB_CONS, '\u0FBB': c.SKRT_SUB_CONS, '\u0FBC': c.SKRT_SUB_CONS,

  // U+0FBE-U+0FDA: further symbols and punctuation (U+0FBD, U+0FCD have no entry)
  '\u0FBE': c.SYMBOL, '\u0FBF': c.SYMBOL,
  '\u0FC0': c.SYMBOL, '\u0FC1': c.SYMBOL, '\u0FC2': c.SYMBOL, '\u0FC3': c.SYMBOL,
  '\u0FC4': c.SYMBOL, '\u0FC5': c.SYMBOL, '\u0FC6': c.SYMBOL, '\u0FC7': c.SYMBOL,
  '\u0FC8': c.SYMBOL, '\u0FC9': c.SYMBOL, '\u0FCA': c.SYMBOL, '\u0FCB': c.SYMBOL,
  '\u0FCC': c.SYMBOL,
  '\u0FCE': c.SYMBOL, '\u0FCF': c.SYMBOL,
  '\u0FD0': c.SPECIAL_PUNCT, '\u0FD1': c.SPECIAL_PUNCT,
  '\u0FD2': c.SYMBOL,
  '\u0FD3': c.SPECIAL_PUNCT, '\u0FD4': c.SPECIAL_PUNCT,
  '\u0FD5': c.SYMBOL, '\u0FD6': c.SYMBOL, '\u0FD7': c.SYMBOL, '\u0FD8': c.SYMBOL,
  '\u0FD9': c.SPECIAL_PUNCT, '\u0FDA': c.SPECIAL_PUNCT,
};
251
+
252
/**
 * Set of whitespace-like and zero-width characters that are classified as
 * "transparent" rather than as Tibetan, Latin, CJK or other text.
 * Membership is tested with `transparentChars.has(char)`.
 */
const transparentChars = new Set([
  // ASCII space and the Mongolian vowel separator
  ' ', '\u180E',
  // U+2000-U+200A: en/em quads and the fixed-width spaces
  '\u2000', '\u2001', '\u2002', '\u2003', '\u2004', '\u2005',
  '\u2006', '\u2007', '\u2008', '\u2009', '\u200A',
  // Zero-width space, narrow no-break space, medium mathematical space
  '\u200B', '\u202F', '\u205F',
  // Ideographic space and zero-width no-break space (BOM)
  '\u3000', '\uFEFF',
  // Tab and newline
  '\t', '\n',
  // No-break space
  '\u00A0',
]);
279
+
280
/**
 * Classify a character into one of the CharMarkers categories.
 *
 * Checks are applied in this order, first match wins:
 *   1. membership in `transparentChars`            -> c.TRANSPARENT
 *   2. code point in U+0F00-U+0FFF                 -> table value, or c.OTHER
 *      when the table has no entry for it
 *   3. code point in one of the CJK ranges         -> c.CJK
 *   4. code point in one of the Latin ranges       -> c.LATIN
 *   5. anything else (including an empty string)   -> c.OTHER
 *
 * @param {string} char - Single character. May be a full supplementary-plane
 *   character (two UTF-16 code units); only its first code point is inspected.
 * @returns {number} Character marker constant
 */
export function getCharCategory(char) {
  // Transparent characters take priority; some of them (U+3000, U+00A0,
  // U+2000-U+200B, ...) would otherwise match the CJK or Latin ranges below.
  if (transparentChars.has(char)) {
    return c.TRANSPARENT;
  }

  // codePointAt, not charCodeAt: charCodeAt never exceeds 0xFFFF, which made
  // the U+20000-U+2FA1F branch unreachable and caused every astral character
  // (e.g. emoji) to be classified by its high surrogate, which lies inside
  // the 0x2E80-0xFAFF span. For an empty string this is undefined, and every
  // comparison below is false, so the result is c.OTHER.
  // NOTE: a lone surrogate code unit passed on its own still lands in the
  // 0x2E80-0xFAFF span and is reported as CJK, exactly as before.
  const code = char.codePointAt(0);

  // Tibetan block (U+0F00 to U+0FFF)
  if (code >= 0x0F00 && code <= 0x0FFF) {
    const category = tibetanCharCategories[char];
    // Code points without a table entry fall back to OTHER.
    return category !== undefined ? category : c.OTHER;
  }

  // CJK ranges
  const isCjk =
    (code >= 0x2E80 && code <= 0xFAFF) ||
    (code >= 0xFE30 && code <= 0xFE4F) ||
    (code >= 0x20000 && code <= 0x2FA1F);
  if (isCjk) {
    return c.CJK;
  }

  // Latin ranges:
  //   1. Basic Latin to Combining Diacritical Marks
  //   2. Latin Extended Additional to Currency Symbols
  const isLatin =
    (code >= 0x0020 && code <= 0x036F) ||
    (code >= 0x1E00 && code <= 0x20CF);
  if (isLatin) {
    return c.LATIN;
  }

  return c.OTHER;
}
323
+
324
/**
 * Report whether a category counts as Tibetan, i.e. it is none of
 * c.OTHER, c.LATIN or c.CJK. Every other value, c.TRANSPARENT included,
 * yields true.
 * @param {number} category - Character category
 * @returns {boolean}
 */
export function isTibetanCategory(category) {
  switch (category) {
    case c.OTHER:
    case c.LATIN:
    case c.CJK:
      return false;
    default:
      return true;
  }
}
336
+
337
/**
 * Report whether a category is one of the Sanskrit markers:
 * c.SKRT_CONS, c.SKRT_SUB_CONS, c.SKRT_VOW or c.SKRT_LONG_VOW.
 * @param {number} category - Character category
 * @returns {boolean}
 */
export function isSanskritCategory(category) {
  switch (category) {
    case c.SKRT_CONS:
    case c.SKRT_SUB_CONS:
    case c.SKRT_VOW:
    case c.SKRT_LONG_VOW:
      return true;
    default:
      return false;
  }
}