tibetan-word-tokenizer 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/data/dictionary.json +1 -0
- package/data/dictionary.pretty.json +123406 -0
- package/package.json +38 -0
- package/src/char-categories.js +349 -0
- package/src/chunks.js +516 -0
- package/src/constants.js +102 -0
- package/src/index.js +68 -0
- package/src/sanskrit.js +228 -0
- package/src/tokenizer.js +434 -0
- package/src/trie.js +263 -0
package/src/trie.js
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Trie data structure for Tibetan word lookup
|
|
3
|
+
* Ported from Botok's basictrie.py and trie.py
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
 * A single node of the trie.
 *
 * Every node remembers the key it was registered under (`label`), whether a
 * word ends on it (`leaf`), a metadata object (`data`) and its child nodes,
 * held in a Map keyed by the child's label.
 */
class TrieNode {
  /**
   * @param {string|null} label - Key this node is stored under (null when none is given)
   * @param {boolean} leaf - Whether a word ends at this node
   */
  constructor(label = null, leaf = false) {
    Object.assign(this, {
      label,
      leaf,
      // Metadata container; always starts out as `{ _: {} }`.
      data: { _: {} },
      children: new Map(),
    });
  }

  /**
   * Return the child stored under `key`, creating it first when absent.
   * An already existing child is returned untouched; `leaf` is only used
   * for a freshly created node.
   * @param {string} key - Label of the child
   * @param {boolean} leaf - Leaf flag given to a newly created child
   * @returns {TrieNode} The existing or newly created child
   */
  addChild(key, leaf = false) {
    const { children } = this;
    if (children.has(key)) {
      return children.get(key);
    }

    const created = new TrieNode(key, leaf);
    children.set(key, created);
    return created;
  }

  /**
   * @returns {boolean} True when this node has at least one child
   */
  canWalk() {
    const { size } = this.children;
    return size > 0;
  }

  /**
   * @returns {boolean} The node's `leaf` flag
   */
  isMatch() {
    const { leaf } = this;
    return leaf;
  }
}
|
|
32
|
+
|
|
33
|
+
/**
 * Trie for efficient word lookup.
 *
 * A word is stored as the path of its syllables starting at `head`. The node
 * reached by the last syllable has `leaf === true` and carries the word's
 * metadata in `data`. The root (`head`) never represents a word itself.
 */
export class Trie {
  constructor() {
    this.head = new TrieNode();
  }

  /**
   * Add a word to the trie, creating any missing nodes along its path.
   *
   * An empty syllable sequence is ignored: marking the root as a word would
   * inflate `getStats()` while `hasWord([])` still reports it as missing.
   *
   * @param {string[]} syllables - Array of syllables forming the word
   * @param {Object} data - Optional metadata, shallow-merged into the word node's `data`
   */
  add(syllables, data = null) {
    let node = this.head;

    // addChild returns the existing child when there is one.
    for (const syl of syllables) {
      node = node.addChild(syl);
    }

    // Still on the root: no syllable was consumed, so there is nothing to add.
    if (node === this.head) {
      return;
    }

    node.leaf = true;

    if (data) {
      Object.assign(node.data, data);
    }
  }

  /**
   * Walk the trie one syllable at a time.
   * @param {string} syllable - Current syllable
   * @param {TrieNode} currentNode - Current position in trie (null = start at the root)
   * @returns {TrieNode|null} Next node or null if no match
   */
  walk(syllable, currentNode = null) {
    const from = currentNode || this.head;
    return from.children.has(syllable) ? from.children.get(syllable) : null;
  }

  /**
   * Check if a word exists in the trie.
   *
   * When the path breaks off early, `data` is the metadata of the deepest
   * node that was reached, not an empty object.
   *
   * @param {string[]} syllables - Array of syllables
   * @returns {Object} { exists: boolean, data: Object }
   */
  hasWord(syllables) {
    if (!syllables || syllables.length === 0) {
      return { exists: false, data: {} };
    }

    let node = this.head;

    for (const syl of syllables) {
      if (!node.children.has(syl)) {
        return { exists: false, data: node.data };
      }
      node = node.children.get(syl);
    }

    return { exists: node.leaf, data: node.data };
  }

  /**
   * Add metadata (a "sense") to an existing word.
   *
   * The sense is appended to `data.senses` unless an equal one is already
   * stored (see `_isDifferentMeaning`).
   *
   * @param {string[]} syllables - Array of syllables
   * @param {Object} data - Metadata to add
   * @returns {boolean} True if the word exists (even when the sense was a
   *   duplicate and therefore not appended); false if the word is missing,
   *   inactive, or `data` is null/undefined
   */
  addData(syllables, data) {
    // A null/undefined sense carries no information and would otherwise be
    // pushed into `senses` or make Object.entries throw.
    if (data == null) {
      return false;
    }

    const node = this._findNode(syllables);
    if (node === null || !node.leaf) {
      return false;
    }

    if (!node.data.senses) {
      node.data.senses = [];
    }

    if (this._isDifferentMeaning(node.data.senses, data)) {
      node.data.senses.push(data);
    }

    return true;
  }

  /**
   * Follow the full syllable path from the root.
   * @private
   * @param {string[]} syllables - Array of syllables
   * @returns {TrieNode|null} The node at the end of the path, or null when
   *   the input is empty/missing or the path does not exist
   */
  _findNode(syllables) {
    if (!syllables || syllables.length === 0) {
      return null;
    }

    let node = this.head;

    for (const syl of syllables) {
      if (!node.children.has(syl)) {
        return null;
      }
      node = node.children.get(syl);
    }

    // An empty iterable without a `length` ends up here still on the root.
    return node === this.head ? null : node;
  }

  /**
   * Check if a meaning is different from existing ones.
   *
   * Only the keys present in `newMeaning` are compared (with `===`), so a
   * meaning whose entries all appear in a stored sense counts as the same.
   *
   * @private
   * @param {Object[]} senses - Senses already stored on the node
   * @param {Object} newMeaning - Candidate sense
   * @returns {boolean} True if no stored sense matches the candidate
   */
  _isDifferentMeaning(senses, newMeaning) {
    if (senses.length === 0) {
      return true;
    }

    const entries = Object.entries(newMeaning);
    const alreadyKnown = senses.some((existing) =>
      entries.every(([key, value]) => existing[key] === value)
    );

    return !alreadyKnown;
  }

  /**
   * Deactivate a word (make it not findable) or reactivate it.
   *
   * Only the `leaf` flag of the final node is changed; nodes and metadata
   * stay in place. With `reactivate = true` any existing path is marked as
   * a word, including one that was never added as a word.
   *
   * @param {string[]} syllables - Array of syllables
   * @param {boolean} reactivate - If true, reactivate instead
   * @returns {boolean} True if the path exists; false for a missing path or
   *   an empty/missing syllable list (the root is never toggled)
   */
  deactivate(syllables, reactivate = false) {
    const node = this._findNode(syllables);
    if (node === null) {
      return false;
    }

    node.leaf = reactivate;
    return true;
  }

  /**
   * Get statistics about the trie.
   * @returns {Object} { wordCount, nodeCount } - `nodeCount` includes the
   *   root; `wordCount` is the number of nodes flagged as leaf
   */
  getStats() {
    let wordCount = 0;
    let nodeCount = 0;

    // Explicit stack instead of recursion; visiting order does not matter.
    const pending = [this.head];
    while (pending.length > 0) {
      const node = pending.pop();
      nodeCount += 1;
      if (node.leaf) {
        wordCount += 1;
      }
      for (const child of node.children.values()) {
        pending.push(child);
      }
    }

    return { wordCount, nodeCount };
  }

  /**
   * Serialize trie to JSON.
   *
   * Each node becomes `{ leaf, data, children? }`; `children` is only
   * present on nodes that have any. `data` objects are referenced, not
   * copied.
   *
   * @returns {Object} Serialized trie
   */
  toJSON() {
    const serializeNode = (node) => {
      const obj = {
        leaf: node.leaf,
        data: node.data,
      };

      if (node.children.size > 0) {
        obj.children = {};
        for (const [key, child] of node.children) {
          obj.children[key] = serializeNode(child);
        }
      }

      return obj;
    };

    return serializeNode(this.head);
  }

  /**
   * Deserialize trie from JSON.
   *
   * A missing `leaf` is read as `false` so that `leaf` is always a boolean;
   * a missing `data` falls back to `{ _: {} }`. `data` objects present in
   * `json` are adopted by reference, not copied.
   *
   * @param {Object} json - Serialized trie
   * @returns {Trie}
   */
  static fromJSON(json) {
    const trie = new Trie();

    const restoreNode = (obj, node) => {
      node.leaf = Boolean(obj.leaf);
      node.data = obj.data || { _: {} };

      if (obj.children) {
        for (const [key, childObj] of Object.entries(obj.children)) {
          const child = new TrieNode(key);
          node.children.set(key, child);
          restoreNode(childObj, child);
        }
      }
    };

    restoreNode(json, trie.head);

    return trie;
  }
}
|
|
262
|
+
|
|
263
|
+
export { TrieNode };
|