kuromoji-ko 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1375 @@
1
+ // src/viterbi/ViterbiNode.ts
2
/**
 * One candidate word in the Viterbi lattice.
 *
 * Fields are stored under snake_case names (`start_pos`, `left_id`, ...).
 * `prev` starts as `null`, and `shortest_cost` starts at 0 for a "BOS" node
 * and at `Number.MAX_VALUE` for every other node type.
 */
var ViterbiNode = class {
  /**
   * @param {*} nodeName - Stored as `name`.
   * @param {number} nodeCost - Stored as `cost`.
   * @param {number} startPos - Stored as `start_pos`.
   * @param {number} length - Stored as `length`.
   * @param {string} type - Node type; "BOS" gets a zero starting cost.
   * @param {number} leftId - Stored as `left_id`.
   * @param {number} rightId - Stored as `right_id`.
   * @param {string} surfaceForm - Stored as `surface_form`.
   */
  constructor(nodeName, nodeCost, startPos, length, type, leftId, rightId, surfaceForm) {
    const initialCost = type === "BOS" ? 0 : Number.MAX_VALUE;
    Object.assign(this, {
      name: nodeName,
      cost: nodeCost,
      start_pos: startPos,
      length,
      left_id: leftId,
      right_id: rightId,
      prev: null,
      surface_form: surfaceForm,
      shortest_cost: initialCost,
      type,
    });
  }
};
16
+
17
+ // src/viterbi/ViterbiLattice.ts
18
/**
 * Word lattice: `nodesEndAt[i]` holds the nodes whose last position
 * (`start_pos + length - 1`) equals `i`. Index 0 holds the BOS node.
 */
var ViterbiLattice = class {
  constructor() {
    const bos = new ViterbiNode(-1, 0, 0, 0, "BOS", 0, 0, "");
    this.nodesEndAt = [[bos]];
    this.eosPos = 1;
  }

  /**
   * Add a node to the bucket for the position where it ends, and advance
   * `eosPos` if the node ends further right than anything seen so far.
   * @param {ViterbiNode} node
   */
  append(node) {
    const endIndex = node.start_pos + node.length - 1;
    if (endIndex > this.eosPos) {
      this.eosPos = endIndex;
    }
    const bucket = this.nodesEndAt[endIndex] ?? [];
    bucket.push(node);
    this.nodesEndAt[endIndex] = bucket;
  }

  /**
   * Close the lattice with an EOS node placed in a new bucket after the
   * current last bucket; `eosPos` is incremented first and used as its start.
   */
  appendEos() {
    const slot = this.nodesEndAt.length;
    this.eosPos += 1;
    const eos = new ViterbiNode(-1, 0, this.eosPos, 0, "EOS", 0, 0, "");
    this.nodesEndAt[slot] = [eos];
  }
};
48
+
49
+ // src/util/SurrogateAwareString.ts
50
/**
 * String wrapper whose indices count surrogate pairs as one character.
 *
 * `indexMapping[i]` is the UTF-16 offset in `str` where the i-th character
 * starts; `length` is the number of such characters.
 */
var SurrogateAwareString = class _SurrogateAwareString {
  /**
   * @param {string} str - The raw string to wrap.
   */
  constructor(str) {
    this.str = str;
    this.indexMapping = [];
    for (let pos = 0; pos < str.length; pos++) {
      this.indexMapping.push(pos);
      // A high surrogate consumes the following code unit as well.
      if (_SurrogateAwareString.isSurrogatePair(str.charAt(pos))) {
        pos++;
      }
    }
    this.length = this.indexMapping.length;
  }

  /**
   * Return the raw substring starting at the given character index.
   * @param {number} index - Surrogate-aware character index.
   * @returns {string} Remainder of the string, or "" when out of range.
   */
  slice(index) {
    if (this.indexMapping.length <= index) {
      return "";
    }
    return this.str.slice(this.indexMapping[index]);
  }

  /**
   * Return the character (one or two UTF-16 units) at the given index.
   * @param {number} index - Surrogate-aware character index.
   * @returns {string} The character, or "" when out of range.
   */
  charAt(index) {
    // Bound by the character count, not by the UTF-16 length: with a
    // surrogate pair present the two differ, and an unmapped index would
    // otherwise slice from `undefined` and return the entire string.
    if (this.indexMapping.length <= index) {
      return "";
    }
    const startOffset = this.indexMapping[index];
    const endOffset = this.indexMapping[index + 1];
    if (endOffset == null) {
      return this.str.slice(startOffset);
    }
    return this.str.slice(startOffset, endOffset);
  }

  /**
   * Return the code point at the given character index.
   * @param {number} index - Surrogate-aware character index.
   * @returns {number} Code point (pairs are combined), or NaN when out of range.
   */
  charCodeAt(index) {
    if (this.indexMapping.length <= index) {
      return NaN;
    }
    const offset = this.indexMapping[index];
    const upper = this.str.charCodeAt(offset);
    if (upper >= 55296 && upper <= 56319 && offset + 1 < this.str.length) {
      const lower = this.str.charCodeAt(offset + 1);
      if (lower >= 56320 && lower <= 57343) {
        return (upper - 55296) * 1024 + lower - 56320 + 65536;
      }
    }
    // Not a complete pair: report the single code unit as-is.
    return upper;
  }

  /**
   * @returns {string} The wrapped raw string.
   */
  toString() {
    return this.str;
  }

  /**
   * Report whether the first code unit of `ch` is a high (leading) surrogate.
   * @param {string} ch
   * @returns {boolean}
   */
  static isSurrogatePair(ch) {
    const utf16Code = ch.charCodeAt(0);
    return utf16Code >= 55296 && utf16Code <= 56319;
  }
};
103
+
104
+ // src/viterbi/ViterbiBuilder.ts
105
/**
 * Builds a ViterbiLattice for a sentence from the dictionary trie, the
 * token-info dictionary and the unknown-word dictionary.
 */
var ViterbiBuilder = class {
  /**
   * @param {object} dic - Object exposing `trie`, `tokenInfoDictionary`
   *   and `unknownDictionary`.
   */
  constructor(dic) {
    this.trie = dic.trie;
    this.tokenInfoDictionary = dic.tokenInfoDictionary;
    this.unknownDictionary = dic.unknownDictionary;
  }

  /**
   * Build word lattice from input text.
   *
   * Positions and node lengths are counted in surrogate-aware characters,
   * so a surrogate pair occupies a single lattice position.
   *
   * @param {string} sentenceStr - Text to analyse.
   * @returns {ViterbiLattice} Lattice terminated with an EOS node.
   */
  build(sentenceStr) {
    const lattice = new ViterbiLattice();
    const sentence = new SurrogateAwareString(sentenceStr);
    for (let pos = 0; pos < sentence.length; pos++) {
      const tail = sentence.slice(pos);
      // Treat a missing result like "no matches" instead of crashing on `.length`.
      const vocabulary = this.trie.commonPrefixSearch(tail) ?? [];

      // Known words: every dictionary entry that is a prefix of the tail.
      for (let n = 0; n < vocabulary.length; n++) {
        const trieId = vocabulary[n].v;
        const key = vocabulary[n].k;
        const tokenInfoIds = this.tokenInfoDictionary.targetMap[trieId];
        if (tokenInfoIds == null) continue;
        // Measure the key in the same unit as `pos` (characters, not UTF-16
        // units); otherwise a key containing a surrogate pair ends one slot
        // too far right and disconnects the nodes that follow it.
        const keyLength = new SurrogateAwareString(key).length;
        for (let i = 0; i < tokenInfoIds.length; i++) {
          const tokenInfoId = tokenInfoIds[i];
          const leftId = this.tokenInfoDictionary.dictionary.getShort(tokenInfoId);
          const rightId = this.tokenInfoDictionary.dictionary.getShort(tokenInfoId + 2);
          const wordCost = this.tokenInfoDictionary.dictionary.getShort(tokenInfoId + 4);
          lattice.append(
            new ViterbiNode(
              tokenInfoId,
              wordCost,
              pos + 1,
              keyLength,
              "KNOWN",
              leftId,
              rightId,
              key
            )
          );
        }
      }

      // Unknown words: used when nothing matched, or when the head
      // character's class is flagged as always-invoke.
      const surrogateAwareTail = new SurrogateAwareString(tail);
      const headChar = new SurrogateAwareString(surrogateAwareTail.charAt(0));
      const headCharClass = this.unknownDictionary.lookup(headChar.toString());
      if (vocabulary.length === 0 || headCharClass && headCharClass.is_always_invoke === 1) {
        let key = headChar;
        if (headCharClass && headCharClass.is_grouping === 1 && surrogateAwareTail.length > 1) {
          // Extend the unknown word over following characters of the same class.
          for (let k = 1; k < surrogateAwareTail.length; k++) {
            const nextChar = surrogateAwareTail.charAt(k);
            const nextCharClass = this.unknownDictionary.lookup(nextChar);
            if (!nextCharClass || headCharClass.class_name !== nextCharClass.class_name) {
              break;
            }
            key = new SurrogateAwareString(key.toString() + nextChar);
          }
        }
        if (headCharClass) {
          const unkIds = this.unknownDictionary.targetMap[headCharClass.class_id];
          if (unkIds) {
            for (let j = 0; j < unkIds.length; j++) {
              const unkId = unkIds[j];
              const leftId = this.unknownDictionary.dictionary.getShort(unkId);
              const rightId = this.unknownDictionary.dictionary.getShort(unkId + 2);
              const wordCost = this.unknownDictionary.dictionary.getShort(unkId + 4);
              lattice.append(
                new ViterbiNode(
                  unkId,
                  wordCost,
                  pos + 1,
                  key.length,
                  "UNKNOWN",
                  leftId,
                  rightId,
                  key.toString()
                )
              );
            }
          }
        }
      }
    }
    lattice.appendEos();
    return lattice;
  }
};
188
+
189
+ // src/viterbi/ViterbiSearcher.ts
190
/**
 * Finds the lowest-cost path through a lattice: a forward pass fills in
 * `shortest_cost` / `prev` on every reachable node, then a backward pass
 * follows the `prev` links from EOS.
 */
var ViterbiSearcher = class {
  /**
   * @param {object} connectionCosts - Object with `get(rightId, leftId)`
   *   returning the cost of joining two nodes.
   */
  constructor(connectionCosts) {
    this.connectionCosts = connectionCosts;
  }

  /**
   * Run the forward pass and then the back trace.
   * @param {object} lattice - Lattice with `nodesEndAt` and `eosPos`.
   * @returns {Array} Nodes on the best path, BOS and EOS excluded.
   */
  search(lattice) {
    return this.backward(this.forward(lattice));
  }

  /**
   * Forward pass: for every node, pick the cheapest predecessor among the
   * nodes ending right before it starts. Nodes whose predecessor bucket is
   * missing are left untouched.
   * @param {object} lattice
   * @returns {object} The same lattice, mutated in place.
   */
  forward(lattice) {
    for (let end = 1; end <= lattice.eosPos; end++) {
      const bucket = lattice.nodesEndAt[end];
      if (bucket == null) {
        continue;
      }
      for (let j = 0; j < bucket.length; j++) {
        const current = bucket[j];
        const predecessors = lattice.nodesEndAt[current.start_pos - 1];
        if (predecessors == null) {
          continue;
        }
        let best = null;
        let bestCost = Number.MAX_VALUE;
        for (let k = 0; k < predecessors.length; k++) {
          const candidate = predecessors[k];
          let edgeCost = 0;
          if (current.left_id == null || candidate.right_id == null) {
            console.log("Left or right is null");
          } else {
            edgeCost = this.connectionCosts.get(candidate.right_id, current.left_id);
          }
          const total = candidate.shortest_cost + edgeCost + current.cost;
          if (total < bestCost) {
            best = candidate;
            bestCost = total;
          }
        }
        current.prev = best;
        current.shortest_cost = bestCost;
      }
    }
    return lattice;
  }

  /**
   * Backward pass: walk `prev` links from the first node of the last
   * bucket back to BOS. Returns [] if the chain is broken anywhere.
   * @param {object} lattice
   * @returns {Array} Path nodes in sentence order.
   */
  backward(lattice) {
    const finalBucket = lattice.nodesEndAt[lattice.nodesEndAt.length - 1];
    if (!finalBucket || finalBucket.length === 0) {
      return [];
    }
    const reversedPath = [];
    let cursor = finalBucket[0].prev;
    for (;;) {
      if (cursor == null) {
        return [];
      }
      if (cursor.type === "BOS") {
        break;
      }
      reversedPath.push(cursor);
      cursor = cursor.prev;
    }
    return reversedPath.reverse();
  }
};
263
+
264
+ // src/KoreanToken.ts
265
/**
 * Part-of-speech tag -> human-readable Korean description.
 *
 * Used to turn a token's `pos` tag into a description; tags not listed
 * here are shown as the raw tag by the consumer of this table.
 * Besides the Sejong-style tags, the mecab-ko-dic specific tags
 * (NNBC, SSO, SSC, SC, SY) are included so they also get a description.
 */
var POS_TAGS = {
  // 체언 (Substantives)
  NNG: "\uC77C\uBC18 \uBA85\uC0AC", // General noun
  NNP: "\uACE0\uC720 \uBA85\uC0AC", // Proper noun
  NNB: "\uC758\uC874 \uBA85\uC0AC", // Dependent noun
  NNBC: "단위를 나타내는 명사", // Unit (counter) dependent noun — mecab-ko-dic
  NR: "\uC218\uC0AC", // Numeral
  NP: "\uB300\uBA85\uC0AC", // Pronoun
  // 용언 (Predicates)
  VV: "\uB3D9\uC0AC", // Verb
  VA: "\uD615\uC6A9\uC0AC", // Adjective
  VX: "\uBCF4\uC870 \uC6A9\uC5B8", // Auxiliary predicate
  VCP: "\uAE0D\uC815 \uC9C0\uC815\uC0AC", // Positive copula (이다)
  VCN: "\uBD80\uC815 \uC9C0\uC815\uC0AC", // Negative copula (아니다)
  // 관형사 (Determiners)
  MM: "\uAD00\uD615\uC0AC", // Determiner
  // 부사 (Adverbs)
  MAG: "\uC77C\uBC18 \uBD80\uC0AC", // General adverb
  MAJ: "\uC811\uC18D \uBD80\uC0AC", // Conjunctive adverb
  // 감탄사 (Interjections)
  IC: "\uAC10\uD0C4\uC0AC", // Interjection
  // 조사 (Particles)
  JKS: "\uC8FC\uACA9 \uC870\uC0AC", // Subject case particle
  JKC: "\uBCF4\uACA9 \uC870\uC0AC", // Complement case particle
  JKG: "\uAD00\uD615\uACA9 \uC870\uC0AC", // Adnominal case particle
  JKO: "\uBAA9\uC801\uACA9 \uC870\uC0AC", // Object case particle
  JKB: "\uBD80\uC0AC\uACA9 \uC870\uC0AC", // Adverbial case particle
  JKV: "\uD638\uACA9 \uC870\uC0AC", // Vocative case particle
  JKQ: "\uC778\uC6A9\uACA9 \uC870\uC0AC", // Quotative case particle
  JX: "\uBCF4\uC870\uC0AC", // Auxiliary particle
  JC: "\uC811\uC18D \uC870\uC0AC", // Conjunctive particle
  // 어미 (Endings)
  EP: "\uC120\uC5B4\uB9D0 \uC5B4\uBBF8", // Pre-final ending
  EF: "\uC885\uACB0 \uC5B4\uBBF8", // Final ending
  EC: "\uC5F0\uACB0 \uC5B4\uBBF8", // Connective ending
  ETN: "\uBA85\uC0AC\uD615 \uC804\uC131 \uC5B4\uBBF8", // Nominalizing ending
  ETM: "\uAD00\uD615\uD615 \uC804\uC131 \uC5B4\uBBF8", // Adnominalizing ending
  // 접사 (Affixes)
  XPN: "\uCCB4\uC5B8 \uC811\uB450\uC0AC", // Noun prefix
  XSN: "\uBA85\uC0AC \uD30C\uC0DD \uC811\uBBF8\uC0AC", // Noun-deriving suffix
  XSV: "\uB3D9\uC0AC \uD30C\uC0DD \uC811\uBBF8\uC0AC", // Verb-deriving suffix
  XSA: "\uD615\uC6A9\uC0AC \uD30C\uC0DD \uC811\uBBF8\uC0AC", // Adjective-deriving suffix
  XR: "\uC5B4\uADFC", // Root
  // 부호 (Symbols)
  SF: "\uB9C8\uCE68\uD45C, \uBB3C\uC74C\uD45C, \uB290\uB08C\uD45C", // Period, question, exclamation
  SE: "\uC904\uC784\uD45C", // Ellipsis
  SS: "\uB530\uC634\uD45C, \uAD04\uD638\uD45C", // Quotes, brackets
  SP: "\uC27C\uD45C, \uAC00\uC6B4\uB383\uC810, \uCF5C\uB860, \uBE57\uAE08", // Comma, interpunct, colon, slash
  SO: "\uBD99\uC784\uD45C", // Hyphen
  SW: "\uAE30\uD0C0 \uAE30\uD638", // Other symbols
  // 부호, mecab-ko-dic variants of the symbol tags above
  SSO: "여는 괄호", // Opening bracket
  SSC: "닫는 괄호", // Closing bracket
  SC: "구분자", // Separator (comma, interpunct, slash, colon)
  SY: "기타 기호", // Other symbols
  // 한글 외 (Non-Hangul)
  SL: "\uC678\uAD6D\uC5B4", // Foreign language
  SH: "\uD55C\uC790", // Chinese characters
  SN: "\uC22B\uC790", // Numbers
  // 분석 불능 (Unknown)
  NA: "\uBD84\uC11D\uBD88\uB2A5", // Unable to analyze
  NF: "\uBA85\uC0AC\uCD94\uC815\uBC94\uC8FC", // Presumed noun
  NV: "\uC6A9\uC5B8\uCD94\uC815\uBC94\uC8FC" // Presumed predicate
};
368
/**
 * A single analysed token together with its dictionary features.
 */
var KoreanToken = class _KoreanToken {
  /**
   * @param {object} [options] - Field values keyed by field name; any field
   *   that is null/undefined falls back to the default listed below.
   */
  constructor(options = {}) {
    const fieldDefaults = [
      ["word_id", 0],
      ["word_type", "KNOWN"],
      ["word_position", 1],
      ["surface_form", ""],
      ["pos", "*"],
      ["semantic_class", "*"],
      ["has_final_consonant", "*"],
      ["reading", "*"],
      ["type", "*"],
      ["first_pos", "*"],
      ["last_pos", "*"],
      ["expression", "*"],
    ];
    for (const [field, fallback] of fieldDefaults) {
      this[field] = options[field] ?? fallback;
    }
  }

  /** Description from POS_TAGS for `pos`, or `pos` itself when not listed. */
  get posDescription() {
    return POS_TAGS[this.pos] || this.pos;
  }

  /** True when `has_final_consonant` is "T" (the token ends in 받침). */
  get hasBatchim() {
    return this.has_final_consonant === "T";
  }

  /** True when `type` is "Compound". */
  get isCompound() {
    return this.type === "Compound";
  }

  /** True when `type` is "Inflect". */
  get isInflected() {
    return this.type === "Inflect";
  }

  /**
   * Split `expression` on "+" and take the first two "/"-separated fields
   * of each piece as `{ surface, pos }`. Returns [] when expression is "*".
   */
  get parts() {
    if (this.expression === "*") {
      return [];
    }
    const pieces = [];
    for (const chunk of this.expression.split("+")) {
      const fields = chunk.split("/");
      pieces.push({ surface: fields[0], pos: fields[1] });
    }
    return pieces;
  }

  /**
   * Build a token from a features array laid out as
   * [pos, semantic_class, has_final_consonant, reading, type, first_pos,
   * last_pos, expression]; missing entries become "*".
   */
  static fromFeatures(surface, features, wordId = 0, position = 1, wordType = "KNOWN") {
    const featureFields = [
      "pos",
      "semantic_class",
      "has_final_consonant",
      "reading",
      "type",
      "first_pos",
      "last_pos",
      "expression",
    ];
    const options = {
      word_id: wordId,
      word_type: wordType,
      word_position: position,
      surface_form: surface,
    };
    featureFields.forEach((field, slot) => {
      options[field] = features[slot] ?? "*";
    });
    return new _KoreanToken(options);
  }

  /**
   * Plain-object snapshot of every field plus `posDescription`.
   */
  toJSON() {
    const {
      word_id,
      word_type,
      word_position,
      surface_form,
      pos,
      semantic_class,
      has_final_consonant,
      reading,
      type,
      first_pos,
      last_pos,
      expression,
    } = this;
    return {
      word_id,
      word_type,
      word_position,
      surface_form,
      pos,
      posDescription: this.posDescription,
      semantic_class,
      has_final_consonant,
      reading,
      type,
      first_pos,
      last_pos,
      expression,
    };
  }
};
457
+
458
+ // src/KoreanFormatter.ts
459
/**
 * Turns dictionary feature arrays into KoreanToken instances.
 *
 * Feature layout: [surface, pos, semantic_class, has_final_consonant,
 * reading, type, first_pos, last_pos, expression].
 */
var KoreanFormatter = class {
  /**
   * Format a known word entry; the surface form is features[0] ("" when absent).
   */
  formatEntry(wordId, position, type, features) {
    const surface = features[0] ?? "";
    return this.formatUnknownEntry(wordId, position, type, features, surface);
  }

  /**
   * Format an unknown word entry; the surface form is supplied by the
   * caller and features[0] is ignored. Missing features become "*".
   */
  formatUnknownEntry(wordId, position, type, features, surfaceForm) {
    const feature = (slot) => features[slot] ?? "*";
    return new KoreanToken({
      word_id: wordId,
      word_type: type,
      word_position: position,
      surface_form: surfaceForm,
      pos: feature(1),
      semantic_class: feature(2),
      has_final_consonant: feature(3),
      reading: feature(4),
      type: feature(5),
      first_pos: feature(6),
      last_pos: feature(7),
      expression: feature(8),
    });
  }
};
499
+
500
+ // src/Tokenizer.ts
501
+ var PUNCTUATION = /[.?!。?!]/;
502
/**
 * Morphological tokenizer: splits text into sentences, builds a lattice
 * per sentence, searches the best path and formats each node as a token.
 */
var Tokenizer = class _Tokenizer {
  /**
   * @param {object} dic - Dictionary bundle exposing `tokenInfoDictionary`,
   *   `unknownDictionary` and `connectionCosts` (also passed to ViterbiBuilder).
   */
  constructor(dic) {
    this.tokenInfoDictionary = dic.tokenInfoDictionary;
    this.unknownDictionary = dic.unknownDictionary;
    this.viterbiBuilder = new ViterbiBuilder(dic);
    this.viterbiSearcher = new ViterbiSearcher(dic.connectionCosts);
    this.formatter = new KoreanFormatter();
  }

  /**
   * Cut the input after every character matched by PUNCTUATION.
   * Each piece keeps its terminating punctuation; a trailing piece with
   * no punctuation is kept too. An empty input yields [].
   */
  static splitByPunctuation(input) {
    const pieces = [];
    let rest = input;
    while (rest !== "") {
      const hit = rest.search(PUNCTUATION);
      if (hit < 0) {
        pieces.push(rest);
        break;
      }
      const cut = hit + 1;
      pieces.push(rest.substring(0, cut));
      rest = rest.substring(cut);
    }
    return pieces;
  }

  /**
   * Tokenize text into morphemes, sentence by sentence.
   */
  tokenize(text) {
    const tokens = [];
    for (const sentence of _Tokenizer.splitByPunctuation(text)) {
      this.tokenizeForSentence(sentence, tokens);
    }
    return tokens;
  }

  /**
   * Tokenize a single sentence and append the tokens to `tokens`.
   * Positions are offset by the `word_position` of the last token already
   * present in `tokens` (0 when it is empty).
   */
  tokenizeForSentence(sentence, tokens = []) {
    const path = this.viterbiSearcher.search(this.getLattice(sentence));
    const offset = tokens.length > 0 ? tokens[tokens.length - 1].word_position : 0;
    for (const node of path) {
      const position = offset + node.start_pos;
      let token;
      switch (node.type) {
        case "KNOWN": {
          const line = this.tokenInfoDictionary.getFeatures(node.name);
          const features = line ? line.split(",") : [];
          token = this.formatter.formatEntry(node.name, position, "KNOWN", features);
          break;
        }
        case "UNKNOWN": {
          const line = this.unknownDictionary.getFeatures(node.name);
          const features = line ? line.split(",") : [];
          token = this.formatter.formatUnknownEntry(
            node.name,
            position,
            "UNKNOWN",
            features,
            node.surface_form
          );
          break;
        }
        default:
          // Any other node type is formatted as a featureless known entry.
          token = this.formatter.formatEntry(node.name, position, "KNOWN", []);
      }
      tokens.push(token);
    }
    return tokens;
  }

  /**
   * Get just the surface forms as an array (wakachi-gaki).
   */
  wakati(text) {
    return this.tokenize(text).map(({ surface_form }) => surface_form);
  }

  /**
   * Get space-separated surface forms.
   */
  wakatiString(text) {
    return this.wakati(text).join(" ");
  }

  /**
   * Build word lattice for analysis.
   */
  getLattice(text) {
    return this.viterbiBuilder.build(text);
  }
};
603
+
604
+ // src/util/ByteBuffer.ts
605
/**
 * Encode a JavaScript string as UTF-8.
 *
 * @param {string} str - Text to encode.
 * @returns {Uint8Array} View over the encoded bytes (a subarray of a buffer
 *   sized for the worst case of 4 bytes per UTF-16 unit).
 * @throws {Error} "Malformed surrogate pair" when a high surrogate is not
 *   followed by a low surrogate.
 */
function stringToUtf8Bytes(str) {
  const out = new Uint8Array(str.length * 4);
  let written = 0;
  for (let read = 0; read < str.length; ) {
    let codePoint = str.charCodeAt(read++);
    if (codePoint >= 0xd800 && codePoint <= 0xdbff) {
      // High surrogate: combine with the following low surrogate.
      const low = str.charCodeAt(read++);
      if (!(low >= 0xdc00 && low <= 0xdfff)) {
        throw new Error("Malformed surrogate pair");
      }
      codePoint = (codePoint - 0xd800) * 0x400 + 0x10000 + (low - 0xdc00);
    }
    if (codePoint < 0x80) {
      out[written++] = codePoint;
    } else if (codePoint < 0x800) {
      out[written++] = (codePoint >>> 6) | 0xc0;
      out[written++] = (codePoint & 0x3f) | 0x80;
    } else if (codePoint < 0x10000) {
      out[written++] = (codePoint >>> 12) | 0xe0;
      out[written++] = ((codePoint >> 6) & 0x3f) | 0x80;
      out[written++] = (codePoint & 0x3f) | 0x80;
    } else if (codePoint < 0x200000) {
      out[written++] = (codePoint >>> 18) | 0xf0;
      out[written++] = ((codePoint >> 12) & 0x3f) | 0x80;
      out[written++] = ((codePoint >> 6) & 0x3f) | 0x80;
      out[written++] = (codePoint & 0x3f) | 0x80;
    }
  }
  return out.subarray(0, written);
}
641
/**
 * Decode a sequence of UTF-8 bytes into a JavaScript string.
 *
 * The lead byte selects a 1-, 2-, 3- or 4-byte sequence; code points at
 * or above 0x10000 are emitted as a surrogate pair. No validation of
 * continuation bytes is performed.
 *
 * @param {ArrayLike<number>} bytes - Indexable byte sequence with `length`.
 * @returns {string} The decoded text.
 */
function utf8BytesToString(bytes) {
  let text = "";
  for (let i = 0; i < bytes.length; ) {
    const lead = bytes[i++];
    let code;
    if (lead < 0x80) {
      code = lead;
    } else if (lead >> 5 === 0x06) {
      const c1 = bytes[i++];
      code = ((lead & 0x1f) << 6) | (c1 & 0x3f);
    } else if (lead >> 4 === 0x0e) {
      const c1 = bytes[i++];
      const c2 = bytes[i++];
      code = ((lead & 0x0f) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f);
    } else {
      const c1 = bytes[i++];
      const c2 = bytes[i++];
      const c3 = bytes[i++];
      code = ((lead & 0x07) << 18) | ((c1 & 0x3f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f);
    }
    if (code < 0x10000) {
      text += String.fromCharCode(code);
      continue;
    }
    // Split a supplementary code point into its surrogate pair.
    const offset = code - 0x10000;
    text += String.fromCharCode(0xd800 | (offset >> 10), 0xdc00 | (offset & 0x3ff));
  }
  return text;
}
673
/**
 * Growable little-endian byte buffer with a read/write cursor.
 *
 * `buffer` is the backing Uint8Array and `position` the cursor. Writes go
 * through `put`, which grows the backing array when the cursor reaches
 * the end. Reads past the end return 0 instead of throwing.
 */
var ByteBuffer = class {
  /**
   * @param {number|Uint8Array|ArrayBuffer|null} [arg] - Nothing for a 1 MiB
   *   buffer, a number for that many bytes, or existing storage to wrap.
   * @throws {Error} When `arg` is of any other type.
   */
  constructor(arg) {
    if (arg == null) {
      this.buffer = new Uint8Array(1024 * 1024);
    } else if (typeof arg === "number") {
      this.buffer = new Uint8Array(arg);
    } else if (arg instanceof Uint8Array) {
      this.buffer = arg;
    } else if (arg instanceof ArrayBuffer) {
      this.buffer = new Uint8Array(arg);
    } else {
      throw new Error("Invalid parameter type for ByteBuffer constructor");
    }
    this.position = 0;
  }

  /** @returns {number} Current capacity in bytes. */
  size() {
    return this.buffer.length;
  }

  /**
   * Double the capacity, keeping the existing bytes.
   * A zero-length buffer grows to one byte: doubling 0 would stay at 0 and
   * every later write would be silently dropped by the typed array.
   */
  reallocate() {
    const grown = new Uint8Array(Math.max(1, this.buffer.length * 2));
    grown.set(this.buffer);
    this.buffer = grown;
  }

  /**
   * Trim the buffer to the bytes written so far (0..position).
   * @returns {Uint8Array} The trimmed view, which also becomes `buffer`.
   */
  shrink() {
    this.buffer = this.buffer.subarray(0, this.position);
    return this.buffer;
  }

  /**
   * Write one byte at the cursor and advance it.
   * @param {number} b - Byte value (stored modulo 256 by the typed array).
   */
  put(b) {
    // Loop rather than grow once: the cursor may sit well past the end.
    while (this.buffer.length < this.position + 1) {
      this.reallocate();
    }
    this.buffer[this.position++] = b;
  }

  /**
   * Read one byte. Without an index the cursor is used and advanced.
   * @param {number} [index]
   * @returns {number} The byte, or 0 when out of range.
   */
  get(index) {
    if (index == null) {
      index = this.position;
      this.position += 1;
    }
    if (this.buffer.length < index + 1) {
      return 0;
    }
    return this.buffer[index];
  }

  /**
   * Write short to buffer (little endian).
   * @throws {Error} When `num` is greater than 65535.
   */
  putShort(num) {
    if (65535 < num) {
      throw new Error(`${num} is over short value`);
    }
    const lower = 255 & num;
    const upper = (65280 & num) >> 8;
    this.put(lower);
    this.put(upper);
  }

  /**
   * Read short from buffer (little endian), interpreted as signed 16-bit.
   * Without an index the cursor is used and advanced by 2.
   * @returns {number} The value, or 0 when out of range.
   */
  getShort(index) {
    if (index == null) {
      index = this.position;
      this.position += 2;
    }
    if (this.buffer.length < index + 2) {
      return 0;
    }
    const lower = this.buffer[index];
    const upper = this.buffer[index + 1];
    let value = (upper << 8) + lower;
    if (value & 32768) {
      // Sign bit set: convert from two's complement.
      value = -(value - 1 ^ 65535);
    }
    return value;
  }

  /**
   * Write integer to buffer (little endian, 4 bytes).
   * @throws {Error} When `num` is greater than 4294967295.
   */
  putInt(num) {
    if (4294967295 < num) {
      throw new Error(`${num} is over integer value`);
    }
    const b0 = 255 & num;
    const b1 = (65280 & num) >> 8;
    const b2 = (16711680 & num) >> 16;
    const b3 = (4278190080 & num) >> 24;
    this.put(b0);
    this.put(b1);
    this.put(b2);
    this.put(b3);
  }

  /**
   * Read integer from buffer (little endian), returned as unsigned 32-bit.
   * Without an index the cursor is used and advanced by 4.
   * @returns {number} The value, or 0 when out of range.
   */
  getInt(index) {
    if (index == null) {
      index = this.position;
      this.position += 4;
    }
    if (this.buffer.length < index + 4) {
      return 0;
    }
    const b0 = this.buffer[index];
    const b1 = this.buffer[index + 1];
    const b2 = this.buffer[index + 2];
    const b3 = this.buffer[index + 3];
    return (b3 << 24 >>> 0) + (b2 << 16) + (b1 << 8) + b0;
  }

  /** Read a 4-byte integer at the cursor and advance the cursor by 4. */
  readInt() {
    const pos = this.position;
    this.position += 4;
    return this.getInt(pos);
  }

  /** Write the UTF-8 bytes of `str` followed by a 0 terminator. */
  putString(str) {
    const bytes = stringToUtf8Bytes(str);
    for (let i = 0; i < bytes.length; i++) {
      this.put(bytes[i]);
    }
    this.put(0);
  }

  /**
   * Read a 0-terminated UTF-8 string starting at `index` (or the cursor).
   * The cursor is left just past the terminator, even when an explicit
   * index was given.
   */
  getString(index) {
    const buf = [];
    if (index == null) {
      index = this.position;
    }
    while (true) {
      if (this.buffer.length < index + 1) {
        break;
      }
      const ch = this.get(index++);
      if (ch === 0) {
        break;
      }
      buf.push(ch);
    }
    this.position = index;
    return utf8BytesToString(buf);
  }
};
807
+
808
+ // src/dict/TokenInfoDictionary.ts
809
/**
 * Token information store.
 *
 * `dictionary` holds fixed-size records (left id, right id, word cost as
 * shorts, then an int offset into `posBuffer`); `posBuffer` holds the
 * "surface,features" strings; `targetMap` maps a source id to the list of
 * record offsets registered for it.
 */
var TokenInfoDictionary = class {
  constructor() {
    const initialBytes = 10 * 1024 * 1024;
    this.dictionary = new ByteBuffer(initialBytes);
    this.targetMap = {};
    this.posBuffer = new ByteBuffer(initialBytes);
  }

  /**
   * Build dictionary from entries.
   * Entry format: [surface, left_id, right_id, word_cost, ...features].
   * Entries with fewer than 4 fields are skipped; entries with a
   * non-finite id or cost are logged and skipped.
   * @returns {object} Map of record offset -> surface form.
   */
  buildDictionary(entries) {
    const surfaceByTokenId = {};
    for (const entry of entries) {
      if (entry.length < 4) {
        continue;
      }
      const [surfaceForm, leftId, rightId, wordCost] = entry;
      const feature = entry.slice(4).join(",");
      const numeric = isFinite(leftId) && isFinite(rightId) && isFinite(wordCost);
      if (!numeric) {
        console.log(entry);
        continue;
      }
      const tokenInfoId = this.put(leftId, rightId, wordCost, surfaceForm, feature);
      surfaceByTokenId[tokenInfoId] = surfaceForm;
    }
    this.dictionary.shrink();
    this.posBuffer.shrink();
    return surfaceByTokenId;
  }

  /**
   * Append one record and its feature string.
   * @returns {number} Offset of the new record inside `dictionary`.
   */
  put(leftId, rightId, wordCost, surfaceForm, feature) {
    const recordOffset = this.dictionary.position;
    const featureOffset = this.posBuffer.position;
    for (const value of [leftId, rightId, wordCost]) {
      this.dictionary.putShort(value);
    }
    this.dictionary.putInt(featureOffset);
    this.posBuffer.putString(surfaceForm + "," + feature);
    return recordOffset;
  }

  /** Register `target` under `source`, creating the list on first use. */
  addMapping(source, target) {
    const existing = this.targetMap[source];
    if (existing == null) {
      this.targetMap[source] = [target];
      return;
    }
    existing.push(target);
    this.targetMap[source] = existing;
  }

  /**
   * Serialise `targetMap` as ints: key count, then for each key the key,
   * the number of values and the values themselves.
   */
  targetMapToBuffer() {
    const out = new ByteBuffer();
    out.putInt(Object.keys(this.targetMap).length);
    for (const key in this.targetMap) {
      const numericKey = parseInt(key, 10);
      const values = this.targetMap[numericKey];
      const count = values.length;
      out.putInt(parseInt(key, 10));
      out.putInt(count);
      for (const value of values) {
        out.putInt(value);
      }
    }
    return out.shrink();
  }

  // Load from tid.dat
  loadDictionary(arrayBuffer) {
    const bytes = arrayBuffer instanceof ArrayBuffer ? new Uint8Array(arrayBuffer) : arrayBuffer;
    this.dictionary = new ByteBuffer(bytes);
    return this;
  }

  // Load from tid_pos.dat
  loadPosVector(arrayBuffer) {
    const bytes = arrayBuffer instanceof ArrayBuffer ? new Uint8Array(arrayBuffer) : arrayBuffer;
    this.posBuffer = new ByteBuffer(bytes);
    return this;
  }

  // Load from tid_map.dat
  loadTargetMap(arrayBuffer) {
    const bytes = arrayBuffer instanceof ArrayBuffer ? new Uint8Array(arrayBuffer) : arrayBuffer;
    const reader = new ByteBuffer(bytes);
    reader.position = 0;
    this.targetMap = {};
    // The leading key count is read and discarded; the loop runs to the end.
    reader.readInt();
    while (!(reader.buffer.length < reader.position + 1)) {
      const key = reader.readInt();
      const count = reader.readInt();
      for (let i = 0; i < count; i++) {
        this.addMapping(key, reader.readInt());
      }
    }
    return this;
  }

  /**
   * Look up features in the dictionary.
   * @param {number|string} tokenInfoIdStr - Record offset (strings are parsed base 10).
   * @returns {string} The stored "surface,features" string, or "" for a NaN id.
   */
  getFeatures(tokenInfoIdStr) {
    let tokenInfoId = tokenInfoIdStr;
    if (typeof tokenInfoIdStr === "string") {
      tokenInfoId = parseInt(tokenInfoIdStr, 10);
    }
    if (isNaN(tokenInfoId)) {
      return "";
    }
    // The feature offset sits after the three shorts (6 bytes) of the record.
    const featureOffset = this.dictionary.getInt(tokenInfoId + 6);
    return this.posBuffer.getString(featureOffset);
  }
};
922
+
923
+ // src/dict/ConnectionCosts.ts
924
/**
 * Connection cost matrix stored in a flat Int16Array.
 *
 * The first two slots hold the forward and backward dimensions; the cost
 * for (forwardId, backwardId) lives at
 * `forwardId * backwardDimension + backwardId + 2`.
 */
var ConnectionCosts = class {
  /**
   * @param {number} forwardDimension
   * @param {number} backwardDimension
   */
  constructor(forwardDimension, backwardDimension) {
    this.forwardDimension = forwardDimension;
    this.backwardDimension = backwardDimension;
    const table = new Int16Array(forwardDimension * backwardDimension + 2);
    table[0] = forwardDimension;
    table[1] = backwardDimension;
    this.buffer = table;
  }

  /**
   * Store a cost.
   * @throws {Error} When the computed slot lies beyond the buffer.
   */
  put(forwardId, backwardId, cost) {
    const slot = forwardId * this.backwardDimension + backwardId + 2;
    if (slot + 1 > this.buffer.length) {
      throw new Error("ConnectionCosts buffer overflow");
    }
    this.buffer[slot] = cost;
  }

  /**
   * Read a cost.
   * @throws {Error} When the computed slot lies beyond the buffer.
   */
  get(forwardId, backwardId) {
    const slot = forwardId * this.backwardDimension + backwardId + 2;
    if (slot + 1 > this.buffer.length) {
      throw new Error("ConnectionCosts buffer overflow");
    }
    return this.buffer[slot];
  }

  /**
   * Adopt an existing table; the dimensions are taken from its first two slots.
   */
  loadConnectionCosts(connectionCostsBuffer) {
    const [forward, backward] = [connectionCostsBuffer[0], connectionCostsBuffer[1]];
    this.forwardDimension = forward;
    this.backwardDimension = backward;
    this.buffer = connectionCostsBuffer;
  }
};
952
+
953
+ // src/dict/CharacterClass.ts
954
/**
 * Record describing one character category.
 */
var CharacterClass = class {
  /**
   * @param {number} classId - Stored as `class_id`.
   * @param {string} className - Stored as `class_name`.
   * @param {*} isAlwaysInvoke - Stored as `is_always_invoke`.
   * @param {*} isGrouping - Stored as `is_grouping`.
   * @param {number} maxLength - Stored as `max_length`.
   */
  constructor(classId, className, isAlwaysInvoke, isGrouping, maxLength) {
    Object.assign(this, {
      class_id: classId,
      class_name: className,
      is_always_invoke: isAlwaysInvoke,
      is_grouping: isGrouping,
      max_length: maxLength,
    });
  }
};
963
+
964
+ // src/dict/InvokeDefinitionMap.ts
965
/**
 * Registry of character classes, addressable by class ID (`map`) and by
 * class name (`lookupTable`, name -> ID).
 */
var InvokeDefinitionMap = class _InvokeDefinitionMap {
  constructor() {
    this.map = [];
    this.lookupTable = {};
  }

  /**
   * Load InvokeDefinitionMap from buffer.
   *
   * Each record is read as: one byte (always-invoke), one byte (grouping),
   * a 4-byte int (max length) and a 0-terminated class name. Class IDs are
   * assigned in reading order.
   *
   * @param {Uint8Array|ArrayBuffer} invokeDefBuffer
   * @returns {InvokeDefinitionMap}
   */
  static load(invokeDefBuffer) {
    const invokeDef = new _InvokeDefinitionMap();
    const characterCategoryDefinition = [];
    const buffer = new ByteBuffer(invokeDefBuffer);
    while (buffer.position + 1 < buffer.size()) {
      const classId = characterCategoryDefinition.length;
      const isAlwaysInvoke = buffer.get();
      const isGrouping = buffer.get();
      const maxLength = buffer.getInt();
      const className = buffer.getString();
      characterCategoryDefinition.push(
        new CharacterClass(classId, className, isAlwaysInvoke, isGrouping, maxLength)
      );
    }
    invokeDef.init(characterCategoryDefinition);
    return invokeDef;
  }

  /**
   * Initialize with character category definitions; a null/undefined
   * argument is ignored. The array index becomes the class ID.
   * @param {Array|null} characterCategoryDefinition
   */
  init(characterCategoryDefinition) {
    if (characterCategoryDefinition == null) {
      return;
    }
    for (let i = 0; i < characterCategoryDefinition.length; i++) {
      const characterClass = characterCategoryDefinition[i];
      this.map[i] = characterClass;
      this.lookupTable[characterClass.class_name] = i;
    }
  }

  /**
   * Get class information by class ID.
   * @param {number} classId
   * @returns {CharacterClass|undefined}
   */
  getCharacterClass(classId) {
    return this.map[classId];
  }

  /**
   * Lookup class ID by class name.
   * @param {string} className
   * @returns {number|null} The class ID, or null when the name is not registered.
   */
  lookup(className) {
    // Only own entries count: `lookupTable` is a plain object, so names
    // such as "constructor" or "toString" would otherwise resolve to
    // inherited Object.prototype members instead of null.
    if (!Object.prototype.hasOwnProperty.call(this.lookupTable, className)) {
      return null;
    }
    const classId = this.lookupTable[className];
    if (classId == null) {
      return null;
    }
    return classId;
  }

  /**
   * Transform from map to binary buffer, using the record layout that
   * `load` reads back.
   * @returns {Uint8Array}
   */
  toBuffer() {
    const buffer = new ByteBuffer();
    for (let i = 0; i < this.map.length; i++) {
      const charClass = this.map[i];
      buffer.put(charClass.is_always_invoke ? 1 : 0);
      buffer.put(charClass.is_grouping ? 1 : 0);
      buffer.putInt(charClass.max_length);
      buffer.putString(charClass.class_name);
    }
    buffer.shrink();
    return buffer.buffer;
  }
};
1035
+
1036
+ // src/dict/CharacterDefinition.ts
1037
// Name of the fallback character class used when a character has no explicit mapping.
var DEFAULT_CATEGORY = "DEFAULT";
/**
 * Character-to-class tables used for unknown-word handling.
 *
 * - `characterCategoryMap[code]` holds the primary class ID of a UTF-16 code unit.
 * - `compatibleCategoryMap[code]` holds a bit set in which bit `n` marks class ID `n`
 *   as an additional (compatible) class.
 * - `invokeDefinitionMap` resolves class names and IDs to class objects.
 */
var CharacterDefinition = class _CharacterDefinition {
  constructor() {
    this.characterCategoryMap = new Uint8Array(65536);
    this.compatibleCategoryMap = new Uint32Array(65536);
    this.invokeDefinitionMap = null;
  }
  /**
   * Load CharacterDefinition from buffers.
   *
   * The two map buffers are stored as given (no copy); the invoke definition buffer
   * is decoded through `InvokeDefinitionMap.load`.
   *
   * @param catMapBuffer Primary class ID table.
   * @param compatCatMapBuffer Compatible class bit-set table.
   * @param invokeDefBuffer Serialized class definitions.
   * @returns {_CharacterDefinition}
   */
  static load(catMapBuffer, compatCatMapBuffer, invokeDefBuffer) {
    const charDef = new _CharacterDefinition();
    charDef.characterCategoryMap = catMapBuffer;
    charDef.compatibleCategoryMap = compatCatMapBuffer;
    charDef.invokeDefinitionMap = InvokeDefinitionMap.load(invokeDefBuffer);
    return charDef;
  }
  /**
   * Parse a code point written either with a hexadecimal `0x` prefix (e.g. "0xAC00")
   * or as a plain decimal number.
   *
   * `parseInt(text, 10)` stops at the "x" of a "0x" prefix and yields 0, so the radix
   * has to be chosen from the text itself.
   *
   * @param text Textual code point.
   * @returns {number} The parsed value, or NaN when it cannot be parsed.
   */
  static parseCodePoint(text) {
    const radix = /^\s*[+-]?0x/i.test(String(text)) ? 16 : 10;
    return Number.parseInt(text, radix);
  }
  /**
   * Build a character class from a parsed category definition.
   *
   * @param {number} classId ID to give the new class.
   * @param parsedCategoryDef Array-like whose entries 1..4 are the category name, the
   *   invoke flag ("0"/"1"), the grouping flag ("0"/"1") and the max length.
   * @returns The new `CharacterClass`, or `null` (after logging) when a field is invalid.
   */
  static parseCharCategory(classId, parsedCategoryDef) {
    const category = parsedCategoryDef[1];
    const invoke = Number.parseInt(parsedCategoryDef[2], 10);
    const grouping = Number.parseInt(parsedCategoryDef[3], 10);
    const maxLength = Number.parseInt(parsedCategoryDef[4], 10);
    if (!Number.isFinite(invoke) || (invoke !== 0 && invoke !== 1)) {
      console.log("char.def parse error. INVOKE is 0 or 1 in:" + invoke);
      return null;
    }
    if (!Number.isFinite(grouping) || (grouping !== 0 && grouping !== 1)) {
      console.log("char.def parse error. GROUP is 0 or 1 in:" + grouping);
      return null;
    }
    if (!Number.isFinite(maxLength) || maxLength < 0) {
      console.log("char.def parse error. LENGTH is 1 to n:" + maxLength);
      return null;
    }
    const isInvoke = invoke === 1;
    const isGrouping = grouping === 1;
    return new CharacterClass(classId, category, isInvoke, isGrouping, maxLength);
  }
  /**
   * Parse a single-code-point category mapping.
   *
   * @param parsedCategoryMapping Array whose entry 1 is the code point, entry 2 the
   *   default category and entries 3+ the compatible categories.
   * @returns {{start: number, default: string, compatible: string[]}} An out-of-range
   *   or unparsable code point is logged but still returned as parsed.
   */
  static parseCategoryMapping(parsedCategoryMapping) {
    const start = _CharacterDefinition.parseCodePoint(parsedCategoryMapping[1]);
    const defaultCategory = parsedCategoryMapping[2];
    const compatibleCategory = parsedCategoryMapping.length > 3 ? parsedCategoryMapping.slice(3) : [];
    if (!Number.isFinite(start) || start < 0 || start > 65535) {
      console.log("char.def parse error. CODE is invalid:" + start);
    }
    return { start, default: defaultCategory, compatible: compatibleCategory };
  }
  /**
   * Parse a code-point-range category mapping.
   *
   * @param parsedCategoryMapping Array whose entries 1 and 2 are the first and last
   *   code point, entry 3 the default category and entries 4+ the compatible categories.
   * @returns {{start: number, end: number, default: string, compatible: string[]}}
   *   Out-of-range or unparsable bounds are logged but still returned as parsed.
   */
  static parseRangeCategoryMapping(parsedCategoryMapping) {
    const start = _CharacterDefinition.parseCodePoint(parsedCategoryMapping[1]);
    const end = _CharacterDefinition.parseCodePoint(parsedCategoryMapping[2]);
    const defaultCategory = parsedCategoryMapping[3];
    const compatibleCategory = parsedCategoryMapping.length > 4 ? parsedCategoryMapping.slice(4) : [];
    if (!Number.isFinite(start) || start < 0 || start > 65535) {
      console.log("char.def parse error. CODE is invalid:" + start);
    }
    if (!Number.isFinite(end) || end < 0 || end > 65535) {
      console.log("char.def parse error. CODE is invalid:" + end);
    }
    return { start, end, default: defaultCategory, compatible: compatibleCategory };
  }
  /**
   * Initialize category mappings.
   *
   * For every mapping, each code point in `start..end` (or just `start` when `end` is
   * absent) receives the class ID of `mapping.default` and has the bits of all
   * resolvable `mapping.compatible` classes OR-ed into its compatible set. Afterwards
   * every code point that is still 0 and was not explicitly mapped receives the class
   * ID of DEFAULT.
   *
   * Does nothing when `invokeDefinitionMap` is not set.
   *
   * @param categoryMapping Array of `{start, end?, default, compatible}`, or null.
   */
  initCategoryMappings(categoryMapping) {
    const invokeMap = this.invokeDefinitionMap;
    if (invokeMap == null) {
      return;
    }
    const categoryMap = this.characterCategoryMap;
    // A stored 0 is ambiguous (unset vs. class ID 0), so explicit assignments are tracked.
    const assigned = new Uint8Array(categoryMap.length);
    if (categoryMapping != null) {
      for (let i = 0; i < categoryMapping.length; i++) {
        const mapping = categoryMapping[i];
        const end = mapping.end ?? mapping.start;
        // Both lookups depend only on the mapping, not on the code point.
        const classId = invokeMap.lookup(mapping.default);
        let compatibleBits = 0;
        for (let j = 0; j < mapping.compatible.length; j++) {
          const compatibleCategory = mapping.compatible[j];
          if (compatibleCategory == null) {
            continue;
          }
          const compatClassId = invokeMap.lookup(compatibleCategory);
          if (compatClassId == null) {
            continue;
          }
          compatibleBits |= 1 << compatClassId;
        }
        for (let codePoint = mapping.start; codePoint <= end; codePoint++) {
          if (classId != null) {
            categoryMap[codePoint] = classId;
            assigned[codePoint] = 1;
          }
          if (compatibleBits !== 0) {
            this.compatibleCategoryMap[codePoint] |= compatibleBits;
          }
        }
      }
    }
    const defaultId = invokeMap.lookup(DEFAULT_CATEGORY);
    if (defaultId == null) {
      return;
    }
    for (let codePoint = 0; codePoint < categoryMap.length; codePoint++) {
      if (categoryMap[codePoint] === 0 && assigned[codePoint] === 0) {
        // This table stores class IDs (see `lookup`), not bit masks.
        categoryMap[codePoint] = defaultId;
      }
    }
  }
  /**
   * Lookup compatible categories for a character (not included 1st category).
   *
   * Only the first UTF-16 code unit of `ch` is consulted.
   *
   * @param {string} ch Character to classify.
   * @returns {Array} Classes whose bit is set for the character, in ascending class ID
   *   order; empty when none are set or resolvable.
   */
  lookupCompatibleCategory(ch) {
    const classes = [];
    const code = ch.charCodeAt(0);
    let integer;
    if (code < this.compatibleCategoryMap.length) {
      integer = this.compatibleCategoryMap[code];
    }
    if (integer == null || integer === 0) {
      return classes;
    }
    for (let bit = 0; bit < 32; bit++) {
      if (((integer >>> bit) & 1) !== 1) {
        continue;
      }
      const characterClass = this.invokeDefinitionMap?.getCharacterClass(bit);
      if (characterClass != null) {
        classes.push(characterClass);
      }
    }
    return classes;
  }
  /**
   * Lookup category for a character.
   *
   * Surrogate pairs always resolve to the DEFAULT class; other characters use the
   * class ID stored for their first UTF-16 code unit, falling back to DEFAULT when no
   * ID is available.
   *
   * @param {string} ch Character to classify.
   * @returns The character class, or `undefined` when it cannot be resolved.
   */
  lookup(ch) {
    let classId = null;
    const code = ch.charCodeAt(0);
    if (SurrogateAwareString.isSurrogatePair(ch)) {
      classId = this.invokeDefinitionMap?.lookup(DEFAULT_CATEGORY) ?? null;
    } else if (code < this.characterCategoryMap.length) {
      classId = this.characterCategoryMap[code];
    }
    if (classId == null) {
      classId = this.invokeDefinitionMap?.lookup(DEFAULT_CATEGORY) ?? null;
    }
    if (classId == null) {
      return void 0;
    }
    return this.invokeDefinitionMap?.getCharacterClass(classId);
  }
};
1184
+
1185
+ // src/dict/UnknownDictionary.ts
1186
/**
 * Token info dictionary for unknown words, paired with the character definition
 * used to classify characters.
 */
var UnknownDictionary = class extends TokenInfoDictionary {
  constructor() {
    super();
    // Set through setCharacterDefinition() or loadUnknownDictionaries().
    this.characterDefinition = null;
  }
  /**
   * Attach a character definition.
   *
   * @param characterDefinition Definition to use for subsequent lookups.
   * @returns This dictionary, for chaining.
   */
  setCharacterDefinition(characterDefinition) {
    this.characterDefinition = characterDefinition;
    return this;
  }
  /**
   * Resolve the character class of `ch` through the attached definition.
   *
   * @returns The definition's result, or `undefined` when no definition is attached.
   */
  lookup(ch) {
    const definition = this.characterDefinition;
    if (definition == null) {
      return void 0;
    }
    return definition.lookup(ch);
  }
  /**
   * Resolve the compatible character classes of `ch` through the attached definition.
   *
   * @returns The definition's result, or an empty array when there is no definition
   *   or it yields null/undefined.
   */
  lookupCompatibleCategory(ch) {
    const definition = this.characterDefinition;
    const found = definition == null ? void 0 : definition.lookupCompatibleCategory(ch);
    return found == null ? [] : found;
  }
  /**
   * Load the unknown-word dictionary, its POS vector and target map, then build the
   * character definition from the three character-definition buffers.
   */
  loadUnknownDictionaries(unkBuffer, unkPosBuffer, unkMapBuffer, catMapBuffer, compatCatMapBuffer, invokeDefBuffer) {
    this.loadDictionary(unkBuffer);
    this.loadPosVector(unkPosBuffer);
    this.loadTargetMap(unkMapBuffer);
    const definition = CharacterDefinition.load(catMapBuffer, compatCatMapBuffer, invokeDefBuffer);
    this.characterDefinition = definition;
  }
};
1212
+
1213
+ // src/dict/DynamicDictionaries.ts
1214
/**
 * Bundle of the four dictionary components: trie, token info dictionary,
 * connection costs and unknown-word dictionary.
 */
var DynamicDictionaries = class {
  /**
   * Any argument left null/undefined is replaced by an empty default: a trie stub
   * whose `commonPrefixSearch` returns no matches, a new `TokenInfoDictionary`,
   * `new ConnectionCosts(0, 0)` and a new `UnknownDictionary`.
   */
  constructor(trie, tokenInfoDictionary, connectionCosts, unknownDictionary) {
    this.trie = trie != null ? trie : { commonPrefixSearch: () => [] };
    this.tokenInfoDictionary = tokenInfoDictionary != null ? tokenInfoDictionary : new TokenInfoDictionary();
    this.connectionCosts = connectionCosts != null ? connectionCosts : new ConnectionCosts(0, 0);
    this.unknownDictionary = unknownDictionary != null ? unknownDictionary : new UnknownDictionary();
  }
  /**
   * Replace the trie with one loaded from the base and check arrays
   * (base.dat & check.dat) via the `doublearray` module.
   *
   * @returns {Promise<this>}
   */
  async loadTrie(baseBuffer, checkBuffer) {
    const imported = await import("doublearray");
    // Use the default export when present, otherwise the module namespace itself.
    const doublearray = imported.default || imported;
    this.trie = doublearray.load(baseBuffer, checkBuffer);
    return this;
  }
  /**
   * Feed the token info dictionary, POS vector and target map buffers to the
   * token info dictionary.
   *
   * @returns This instance, for chaining.
   */
  loadTokenInfoDictionaries(tokenInfoBuffer, posBuffer, targetMapBuffer) {
    const target = this.tokenInfoDictionary;
    target.loadDictionary(tokenInfoBuffer);
    target.loadPosVector(posBuffer);
    target.loadTargetMap(targetMapBuffer);
    return this;
  }
  /**
   * Feed the connection cost buffer to the connection costs component.
   *
   * @returns This instance, for chaining.
   */
  loadConnectionCosts(ccBuffer) {
    const costs = this.connectionCosts;
    costs.loadConnectionCosts(ccBuffer);
    return this;
  }
  /**
   * Forward all six unknown-word buffers, unchanged and in order, to the
   * unknown-word dictionary.
   *
   * @returns This instance, for chaining.
   */
  loadUnknownDictionaries(unkBuffer, unkPosBuffer, unkMapBuffer, catMapBuffer, compatCatMapBuffer, invokeDefBuffer) {
    const unknown = this.unknownDictionary;
    unknown.loadUnknownDictionaries(unkBuffer, unkPosBuffer, unkMapBuffer, catMapBuffer, compatCatMapBuffer, invokeDefBuffer);
    return this;
  }
};
1252
+
1253
+ // src/loader/DictionaryLoader.ts
1254
/**
 * Loads the binary dictionary files found under a directory path or an http(s) URL
 * and assembles them into a `DynamicDictionaries` instance.
 */
var DictionaryLoader = class {
  /**
   * @param {string} dicPath Directory path or http(s) URL; a trailing "/" is added
   *   when missing.
   */
  constructor(dicPath) {
    this.dic = new DynamicDictionaries();
    this.dicPath = dicPath.endsWith("/") ? dicPath : dicPath + "/";
    // Everything that is not an http(s) URL is treated as a filesystem path.
    this.isLocalPath = !dicPath.startsWith("http://") && !dicPath.startsWith("https://");
  }
  /**
   * Load a file as ArrayBuffer, handling both compressed and uncompressed.
   *
   * Local paths are read with `fs/promises` when running under Node; everything
   * else goes through `fetch`. Content that starts with the gzip magic bytes
   * (0x1f 0x8b) is inflated with `pako` before being returned.
   *
   * @param {string} filename File name relative to `dicPath`.
   * @returns {Promise<ArrayBuffer>}
   * @throws {Error} When reading fails or the HTTP response is not OK.
   */
  async loadArrayBuffer(filename) {
    const path = this.dicPath + filename;
    let buffer;
    if (this.isLocalPath && typeof process !== "undefined" && process.versions?.node) {
      const fs = await import("fs/promises");
      const nodePath = await import("path");
      const resolvedPath = nodePath.resolve(path);
      const fileBuffer = await fs.readFile(resolvedPath);
      // Copy only this file's bytes out of the backing ArrayBuffer.
      buffer = fileBuffer.buffer.slice(
        fileBuffer.byteOffset,
        fileBuffer.byteOffset + fileBuffer.byteLength
      );
    } else {
      const response = await fetch(path);
      if (!response.ok) {
        throw new Error(`Failed to load ${path}: ${response.status} ${response.statusText}`);
      }
      buffer = await response.arrayBuffer();
    }
    const bytes = new Uint8Array(buffer);
    if (bytes[0] === 31 && bytes[1] === 139) {
      const pakoModule = await import("pako");
      // Accept both named-export and default-export module shapes.
      const pako = typeof pakoModule.inflate === "function" ? pakoModule : pakoModule.default;
      const decompressed = pako.inflate(bytes);
      // Return exactly the inflated bytes even if the view covers part of a larger buffer.
      if (decompressed.byteOffset === 0 && decompressed.byteLength === decompressed.buffer.byteLength) {
        return decompressed.buffer;
      }
      return decompressed.buffer.slice(
        decompressed.byteOffset,
        decompressed.byteOffset + decompressed.byteLength
      );
    }
    return buffer;
  }
  /**
   * Load all dictionary files.
   *
   * Each file is requested as `<name>.gz` first and as `<name>` when that fails.
   * All twelve files are fetched concurrently. When both attempts for a file fail,
   * the error of the uncompressed attempt is thrown, with the error of the `.gz`
   * attempt attached as its `cause` (unless a cause is already present).
   *
   * @returns {Promise<DynamicDictionaries>} The populated dictionaries (`this.dic`).
   */
  async load() {
    const loadWithFallback = async (name) => {
      try {
        return await this.loadArrayBuffer(`${name}.gz`);
      } catch (compressedError) {
        try {
          return await this.loadArrayBuffer(name);
        } catch (plainError) {
          if (plainError instanceof Error && plainError.cause === undefined) {
            // Keep the first failure available for diagnosis instead of dropping it.
            Object.defineProperty(plainError, "cause", {
              value: compressedError,
              writable: true,
              configurable: true,
              enumerable: false,
            });
          }
          throw plainError;
        }
      }
    };
    const fileNames = [
      // TRIE
      "base.dat",
      "check.dat",
      // Token info
      "tid.dat",
      "tid_pos.dat",
      "tid_map.dat",
      // Connection costs
      "cc.dat",
      // Unknown words
      "unk.dat",
      "unk_pos.dat",
      "unk_map.dat",
      "unk_char.dat",
      "unk_compat.dat",
      "unk_invoke.dat",
    ];
    const [
      baseBuffer,
      checkBuffer,
      tidBuffer,
      tidPosBuffer,
      tidMapBuffer,
      ccBuffer,
      unkBuffer,
      unkPosBuffer,
      unkMapBuffer,
      unkCharBuffer,
      unkCompatBuffer,
      unkInvokeBuffer,
    ] = await Promise.all(fileNames.map((name) => loadWithFallback(name)));
    await this.dic.loadTrie(new Int32Array(baseBuffer), new Int32Array(checkBuffer));
    this.dic.loadTokenInfoDictionaries(
      new Uint8Array(tidBuffer),
      new Uint8Array(tidPosBuffer),
      new Uint8Array(tidMapBuffer)
    );
    this.dic.loadConnectionCosts(new Int16Array(ccBuffer));
    this.dic.loadUnknownDictionaries(
      new Uint8Array(unkBuffer),
      new Uint8Array(unkPosBuffer),
      new Uint8Array(unkMapBuffer),
      new Uint8Array(unkCharBuffer),
      new Uint32Array(unkCompatBuffer),
      new Uint8Array(unkInvokeBuffer)
    );
    return this.dic;
  }
};
1347
+
1348
+ // src/TokenizerBuilder.ts
1349
/**
 * Holds the dictionary location and creates a `Tokenizer` from it on demand.
 */
var TokenizerBuilder = class {
  /**
   * @param {{dicPath?: string}} [options] `dicPath` is the dictionary location;
   *   "dict/" is used when it is null or undefined.
   */
  constructor(options = {}) {
    const { dicPath } = options;
    this.dicPath = dicPath ?? "dict/";
  }
  /**
   * Build and return the tokenizer (async).
   *
   * Loads every dictionary file through a `DictionaryLoader` for `dicPath`, then
   * wraps the loaded dictionaries in a `Tokenizer`.
   */
  async build() {
    const dictionaries = await new DictionaryLoader(this.dicPath).load();
    return new Tokenizer(dictionaries);
  }
};
1362
+
1363
+ // src/index.ts
1364
/**
 * Create a `TokenizerBuilder` configured with the given options.
 *
 * @param {object} [options] Passed to the `TokenizerBuilder` constructor unchanged.
 * @returns {TokenizerBuilder}
 */
function builder(options = {}) {
  const tokenizerBuilder = new TokenizerBuilder(options);
  return tokenizerBuilder;
}
1367
// Default export object: the same public members that are also exported by name.
var index_default = {
  builder,
  TokenizerBuilder,
  Tokenizer,
  KoreanToken,
  POS_TAGS
};
1368
+ export {
1369
+ KoreanToken,
1370
+ POS_TAGS,
1371
+ Tokenizer,
1372
+ TokenizerBuilder,
1373
+ builder,
1374
+ index_default as default
1375
+ };