tibetan-word-tokenizer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/trie.js ADDED
@@ -0,0 +1,263 @@
1
+ /**
2
+ * Trie data structure for Tibetan word lookup
3
+ * Ported from Botok's basictrie.py and trie.py
4
+ */
5
+
6
/**
 * One node of the trie.
 *
 * A node is stored in its parent's `children` map under `label`; the root
 * node is created without a label (`null`).
 */
class TrieNode {
  /**
   * @param {string|null} label - Key under which the node is registered in its parent
   * @param {boolean} leaf - Whether the path ending at this node is a complete word
   */
  constructor(label = null, leaf = false) {
    this.label = label;
    this.leaf = leaf;
    // Word metadata container; always starts with an empty `_` entry.
    this.data = { _: {} };
    // Child nodes keyed by their label.
    this.children = new Map();
  }

  /**
   * Return the child registered under `key`, creating it when missing.
   *
   * When the child already exists and `leaf` is true, the child is promoted
   * to a leaf so the caller always gets back what it asked for. An existing
   * leaf is never demoted by the default `leaf = false`.
   *
   * @param {string} key - Label of the child
   * @param {boolean} leaf - Mark the child as ending a word
   * @returns {TrieNode} The existing or newly created child
   */
  addChild(key, leaf = false) {
    let child = this.children.get(key);

    if (child === undefined) {
      child = new TrieNode(key, leaf);
      this.children.set(key, child);
    } else if (leaf) {
      child.leaf = true;
    }

    return child;
  }

  /**
   * @returns {boolean} True when at least one child exists below this node
   */
  canWalk() {
    return this.children.size > 0;
  }

  /**
   * @returns {boolean} The node's `leaf` flag
   */
  isMatch() {
    return this.leaf;
  }
}
32
+
33
/**
 * Trie for efficient word lookup.
 *
 * Words are stored as sequences of syllables: every syllable is one edge of
 * the trie, and a node whose `leaf` flag is set marks the end of a word.
 */
export class Trie {
  constructor() {
    this.head = new TrieNode();
  }

  /**
   * Follow `syllables` from the root as far as the trie allows.
   *
   * @private
   * @param {Iterable<string>} syllables - Syllables to follow
   * @returns {{ node: TrieNode, complete: boolean }} The deepest node reached
   *   and whether every syllable was consumed
   */
  _descend(syllables) {
    let node = this.head;

    for (const syl of syllables) {
      const next = node.children.get(syl);
      if (next === undefined) {
        return { node, complete: false };
      }
      node = next;
    }

    return { node, complete: true };
  }

  /**
   * Add a word to the trie.
   *
   * A missing or empty syllable list is ignored: marking the root as a word
   * would inflate `getStats().wordCount` and contradict `hasWord([])`, which
   * always reports that the empty word does not exist.
   *
   * @param {string[]} syllables - Array of syllables forming the word
   * @param {Object} data - Optional metadata, merged into the word's data
   */
  add(syllables, data = null) {
    if (!syllables || syllables.length === 0) {
      return;
    }

    let node = this.head;
    for (const syl of syllables) {
      // addChild returns the existing child or creates a new one.
      node = node.addChild(syl);
    }

    node.leaf = true;

    if (data) {
      Object.assign(node.data, data);
    }
  }

  /**
   * Walk the trie one syllable at a time.
   *
   * @param {string} syllable - Current syllable
   * @param {TrieNode} currentNode - Current position in trie (null = start)
   * @returns {TrieNode|null} Next node or null if no match
   */
  walk(syllable, currentNode = null) {
    const from = currentNode || this.head;
    const next = from.children.get(syllable);
    return next === undefined ? null : next;
  }

  /**
   * Check if a word exists in the trie.
   *
   * When the path breaks off before all syllables are consumed, `data` is
   * the data of the deepest node that was reached (the matched prefix), not
   * an empty object. Only a missing/empty input yields `data: {}`.
   *
   * @param {string[]} syllables - Array of syllables
   * @returns {Object} { exists: boolean, data: Object }
   */
  hasWord(syllables) {
    if (!syllables || syllables.length === 0) {
      return { exists: false, data: {} };
    }

    const { node, complete } = this._descend(syllables);

    return {
      exists: complete && node.leaf,
      data: node.data,
    };
  }

  /**
   * Add metadata (a "sense") to an existing word.
   *
   * The sense is appended to `data.senses` unless an equivalent sense is
   * already present (see `_isDifferentMeaning`).
   *
   * @param {string[]} syllables - Array of syllables
   * @param {Object} data - Metadata to add
   * @returns {boolean} True if the word exists (even when the sense was a
   *   duplicate and therefore not appended), false otherwise
   */
  addData(syllables, data) {
    if (!syllables || syllables.length === 0) {
      return false;
    }

    const { node, complete } = this._descend(syllables);
    if (!complete || !node.leaf) {
      return false;
    }

    // Lazily create the list of senses (meanings/POS).
    if (!node.data.senses) {
      node.data.senses = [];
    }

    if (this._isDifferentMeaning(node.data.senses, data)) {
      node.data.senses.push(data);
    }

    return true;
  }

  /**
   * Check if a meaning is different from existing ones.
   *
   * A stored sense counts as "the same" when every key of `newMeaning`
   * strictly equals (`===`) the stored value; keys that exist only on the
   * stored sense are not compared.
   *
   * @private
   * @param {Object[]} senses - Senses already stored on the word
   * @param {Object} newMeaning - Candidate sense
   * @returns {boolean} True when no stored sense matches the candidate
   */
  _isDifferentMeaning(senses, newMeaning) {
    const alreadyKnown = senses.some((existing) =>
      Object.entries(newMeaning).every(([key, value]) => existing[key] === value)
    );
    return !alreadyKnown;
  }

  /**
   * Deactivate a word (make it not findable), or reactivate it.
   *
   * The node and its subtree stay in place; only the `leaf` flag of the
   * final node changes. With `reactivate = true` the flag is set on whatever
   * node the syllables lead to, whether or not it was ever added as a word.
   * A missing or empty syllable list is rejected so the root is never
   * toggled.
   *
   * @param {string[]} syllables - Array of syllables
   * @param {boolean} reactivate - If true, reactivate instead
   * @returns {boolean} True if the syllable path exists
   */
  deactivate(syllables, reactivate = false) {
    if (!syllables || syllables.length === 0) {
      return false;
    }

    const { node, complete } = this._descend(syllables);
    if (!complete) {
      return false;
    }

    node.leaf = reactivate;
    return true;
  }

  /**
   * Get statistics about the trie.
   *
   * `nodeCount` includes the root node; `wordCount` counts nodes whose
   * `leaf` flag is set.
   *
   * @returns {Object} { wordCount, nodeCount }
   */
  getStats() {
    let wordCount = 0;
    let nodeCount = 0;

    // Explicit stack instead of recursion; visiting order does not matter.
    const pending = [this.head];
    while (pending.length > 0) {
      const node = pending.pop();
      nodeCount++;
      if (node.leaf) {
        wordCount++;
      }
      for (const child of node.children.values()) {
        pending.push(child);
      }
    }

    return { wordCount, nodeCount };
  }

  /**
   * Serialize trie to a JSON-compatible object.
   *
   * Each node becomes `{ leaf, data, children? }`; `data` is referenced, not
   * copied.
   *
   * @returns {Object} Serialized trie
   */
  toJSON() {
    const serializeNode = (node) => {
      const obj = {
        leaf: node.leaf,
        data: node.data,
      };

      if (node.children.size > 0) {
        obj.children = {};
        for (const [key, child] of node.children) {
          // defineProperty instead of `obj.children[key] = ...`: plain
          // assignment of a "__proto__" key would hit the prototype setter
          // and silently drop that subtree from the output.
          Object.defineProperty(obj.children, key, {
            value: serializeNode(child),
            enumerable: true,
            writable: true,
            configurable: true,
          });
        }
      }

      return obj;
    };

    return serializeNode(this.head);
  }

  /**
   * Deserialize trie from the object produced by `toJSON`.
   *
   * A missing `leaf` is read as `false` and a missing `data` as `{ _: {} }`.
   * `data` objects are referenced from the input, not copied.
   *
   * @param {Object} json - Serialized trie
   * @returns {Trie}
   * @throws {TypeError} When `json` is null or undefined
   */
  static fromJSON(json) {
    if (json == null) {
      throw new TypeError('Trie.fromJSON expects a serialized trie object');
    }

    const trie = new Trie();

    const restoreNode = (obj, node) => {
      // Coerce so that `leaf` is always a real boolean.
      node.leaf = Boolean(obj.leaf);
      node.data = obj.data || { _: {} };

      if (obj.children) {
        for (const [key, childObj] of Object.entries(obj.children)) {
          const childNode = new TrieNode(key);
          node.children.set(key, childNode);
          restoreNode(childObj, childNode);
        }
      }
    };

    restoreNode(json, trie.head);

    return trie;
  }
}
262
+
263
+ export { TrieNode };