kuromoji-ko 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +225 -0
- package/dict/base.dat.gz +0 -0
- package/dict/cc.dat.gz +0 -0
- package/dict/check.dat.gz +0 -0
- package/dict/tid.dat.gz +0 -0
- package/dict/tid_map.dat.gz +0 -0
- package/dict/tid_pos.dat.gz +0 -0
- package/dict/unk.dat.gz +0 -0
- package/dict/unk_char.dat.gz +0 -0
- package/dict/unk_compat.dat.gz +0 -0
- package/dict/unk_invoke.dat.gz +0 -0
- package/dict/unk_map.dat.gz +0 -0
- package/dict/unk_pos.dat.gz +0 -0
- package/dist/index.cjs +1416 -0
- package/dist/index.d.cts +352 -0
- package/dist/index.d.ts +352 -0
- package/dist/index.js +1375 -0
- package/package.json +63 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,1375 @@
|
|
|
1
|
+
// src/viterbi/ViterbiNode.ts
|
|
2
|
+
/**
 * One candidate word (or the BOS/EOS sentinel) stored in the Viterbi lattice.
 *
 * Fields written by the constructor:
 *  - name:          identifier passed by the caller (dictionary offset or -1)
 *  - cost:          the node's own word cost
 *  - start_pos:     start position supplied by the caller
 *  - length:        length supplied by the caller
 *  - left_id / right_id: connection ids
 *  - prev:          best predecessor, initially null
 *  - surface_form:  matched text
 *  - shortest_cost: 0 for a "BOS" node, Number.MAX_VALUE otherwise
 *  - type:          node type string as given ("BOS", "EOS", "KNOWN", "UNKNOWN", ...)
 */
var ViterbiNode = class {
  constructor(nodeName, nodeCost, startPos, length, type, leftId, rightId, surfaceForm) {
    // Only the sentence-start sentinel begins with a finite accumulated cost.
    const initialShortestCost = type === "BOS" ? 0 : Number.MAX_VALUE;
    Object.assign(this, {
      name: nodeName,
      cost: nodeCost,
      start_pos: startPos,
      length,
      left_id: leftId,
      right_id: rightId,
      prev: null,
      surface_form: surfaceForm,
      shortest_cost: initialShortestCost,
      type,
    });
  }
};
|
|
16
|
+
|
|
17
|
+
// src/viterbi/ViterbiLattice.ts
|
|
18
|
+
/**
 * Word lattice: `nodesEndAt[i]` lists the nodes whose last position is `i`.
 * Slot 0 is seeded with a BOS node and `eosPos` starts at 1.
 */
var ViterbiLattice = class {
  constructor() {
    const bosNode = new ViterbiNode(-1, 0, 0, 0, "BOS", 0, 0, "");
    this.nodesEndAt = [[bosNode]];
    this.eosPos = 1;
  }

  /**
   * Register a node in the bucket of the position where it ends
   * (start_pos + length - 1), raising `eosPos` when the node ends further right.
   */
  append(node) {
    const endIndex = node.start_pos + node.length - 1;
    if (endIndex > this.eosPos) {
      this.eosPos = endIndex;
    }
    const bucket = this.nodesEndAt[endIndex] ?? [];
    bucket.push(node);
    this.nodesEndAt[endIndex] = bucket;
  }

  /**
   * Close the lattice: advance `eosPos` by one and store a single EOS node
   * in a new slot at the end of `nodesEndAt`.
   */
  appendEos() {
    const slot = this.nodesEndAt.length;
    this.eosPos += 1;
    const eosNode = new ViterbiNode(-1, 0, this.eosPos, 0, "EOS", 0, 0, "");
    this.nodesEndAt[slot] = [eosNode];
  }
};
|
|
48
|
+
|
|
49
|
+
// src/util/SurrogateAwareString.ts
|
|
50
|
+
/**
 * String wrapper whose indices count a UTF-16 surrogate pair as one character.
 *
 * `indexMapping[i]` is the UTF-16 offset in `str` where the i-th character
 * starts; `length` is the number of such characters.
 */
var SurrogateAwareString = class _SurrogateAwareString {
  /**
   * @param {string} str - the raw string to wrap
   */
  constructor(str) {
    this.str = str;
    this.indexMapping = [];
    for (let pos = 0; pos < str.length; pos++) {
      this.indexMapping.push(pos);
      // A high surrogate consumes the following code unit as well.
      if (_SurrogateAwareString.isSurrogatePair(str.charAt(pos))) {
        pos++;
      }
    }
    this.length = this.indexMapping.length;
  }

  /**
   * @param {number} index - surrogate-aware start index
   * @returns {string} the raw substring from that character to the end,
   *   or "" when `index` is past the last character
   */
  slice(index) {
    if (this.indexMapping.length <= index) {
      return "";
    }
    return this.str.slice(this.indexMapping[index]);
  }

  /**
   * @param {number} index - surrogate-aware character index
   * @returns {string} the character at `index` (two code units for a
   *   surrogate pair), or "" when `index` is past the last character
   */
  charAt(index) {
    // Bounds must be checked against the character count, not the UTF-16
    // length: with surrogate pairs the latter is larger, and an unmapped
    // index would otherwise make slice() return the whole string.
    if (this.indexMapping.length <= index) {
      return "";
    }
    const start = this.indexMapping[index];
    const end = this.indexMapping[index + 1];
    return end == null ? this.str.slice(start) : this.str.slice(start, end);
  }

  /**
   * @param {number} index - surrogate-aware character index
   * @returns {number} the Unicode code point at `index` (a lone surrogate is
   *   returned as-is), or NaN when `index` is past the last character
   */
  charCodeAt(index) {
    if (this.indexMapping.length <= index) {
      return NaN;
    }
    // codePointAt combines a valid high/low pair and otherwise returns the
    // single code unit, which is the same result as decoding by hand.
    return this.str.codePointAt(this.indexMapping[index]);
  }

  /** @returns {string} the wrapped raw string */
  toString() {
    return this.str;
  }

  /**
   * @param {string} ch - string whose first code unit is inspected
   * @returns {boolean} true when that code unit is a high surrogate
   *   (0xD800-0xDBFF)
   */
  static isSurrogatePair(ch) {
    const utf16Code = ch.charCodeAt(0);
    return utf16Code >= 0xd800 && utf16Code <= 0xdbff;
  }
};
|
|
103
|
+
|
|
104
|
+
// src/viterbi/ViterbiBuilder.ts
|
|
105
|
+
/**
 * Builds a ViterbiLattice for a sentence from a prefix trie, the known-word
 * dictionary and the unknown-word dictionary taken from `dic`.
 */
var ViterbiBuilder = class {
  /**
   * @param dic - object exposing `trie`, `tokenInfoDictionary` and
   *   `unknownDictionary`
   */
  constructor(dic) {
    this.trie = dic.trie;
    this.tokenInfoDictionary = dic.tokenInfoDictionary;
    this.unknownDictionary = dic.unknownDictionary;
  }

  /**
   * Build the word lattice for `sentenceStr`.
   *
   * For every surrogate-aware position, one KNOWN node is appended per
   * dictionary entry whose key is a prefix of the remaining text. UNKNOWN
   * nodes are appended when no prefix matched or when the character class of
   * the first character has `is_always_invoke === 1`. An EOS node is appended
   * last.
   *
   * @param {string} sentenceStr - text to analyse
   * @returns {ViterbiLattice} the populated lattice
   */
  build(sentenceStr) {
    const lattice = new ViterbiLattice();
    const sentence = new SurrogateAwareString(sentenceStr);

    for (let pos = 0; pos < sentence.length; pos++) {
      const tail = sentence.slice(pos);
      const vocabulary = this.trie.commonPrefixSearch(tail);

      // Known words: every dictionary key that prefixes the remaining text.
      for (let n = 0; n < vocabulary.length; n++) {
        const trieId = vocabulary[n].v;
        const key = vocabulary[n].k;
        const tokenInfoIds = this.tokenInfoDictionary.targetMap[trieId];
        if (tokenInfoIds == null) continue;

        // `pos` counts surrogate pairs as one character, so the node length
        // must use the same unit; `key.length` would count UTF-16 code units
        // and misplace keys that contain a surrogate pair.
        const keyLength = new SurrogateAwareString(key).length;

        for (let i = 0; i < tokenInfoIds.length; i++) {
          const tokenInfoId = tokenInfoIds[i];
          const dictionary = this.tokenInfoDictionary.dictionary;
          const leftId = dictionary.getShort(tokenInfoId);
          const rightId = dictionary.getShort(tokenInfoId + 2);
          const wordCost = dictionary.getShort(tokenInfoId + 4);
          lattice.append(
            new ViterbiNode(tokenInfoId, wordCost, pos + 1, keyLength, "KNOWN", leftId, rightId, key)
          );
        }
      }

      // Unknown words, driven by the character class of the first character.
      const surrogateAwareTail = new SurrogateAwareString(tail);
      const headChar = surrogateAwareTail.charAt(0);
      const headCharClass = this.unknownDictionary.lookup(headChar);
      const invokeUnknown =
        vocabulary.length === 0 || (headCharClass && headCharClass.is_always_invoke === 1);
      // Without a character class there is nothing to look up in targetMap.
      if (!invokeUnknown || !headCharClass) continue;

      // With grouping enabled, extend the surface over the following
      // characters that belong to the same character class.
      let surface = headChar;
      if (headCharClass.is_grouping === 1) {
        for (let k = 1; k < surrogateAwareTail.length; k++) {
          const nextChar = surrogateAwareTail.charAt(k);
          const nextCharClass = this.unknownDictionary.lookup(nextChar);
          if (!nextCharClass || headCharClass.class_name !== nextCharClass.class_name) {
            break;
          }
          surface += nextChar;
        }
      }
      const surfaceLength = new SurrogateAwareString(surface).length;

      const unkIds = this.unknownDictionary.targetMap[headCharClass.class_id];
      if (!unkIds) continue;

      for (let j = 0; j < unkIds.length; j++) {
        const unkId = unkIds[j];
        const dictionary = this.unknownDictionary.dictionary;
        const leftId = dictionary.getShort(unkId);
        const rightId = dictionary.getShort(unkId + 2);
        const wordCost = dictionary.getShort(unkId + 4);
        lattice.append(
          new ViterbiNode(unkId, wordCost, pos + 1, surfaceLength, "UNKNOWN", leftId, rightId, surface)
        );
      }
    }

    lattice.appendEos();
    return lattice;
  }
};
|
|
188
|
+
|
|
189
|
+
// src/viterbi/ViterbiSearcher.ts
|
|
190
|
+
/**
 * Finds the minimum-cost path through a lattice.
 * `connectionCosts` must expose `get(rightId, leftId)`.
 */
var ViterbiSearcher = class {
  constructor(connectionCosts) {
    this.connectionCosts = connectionCosts;
  }

  /**
   * Run the forward pass, then trace the best path back from the last slot.
   * @returns {Array} nodes of the best path in sentence order (BOS and EOS
   *   excluded); empty when no complete path exists
   */
  search(lattice) {
    const scored = this.forward(lattice);
    return this.backward(scored);
  }

  /**
   * Fill in `shortest_cost` and `prev` for the nodes in slots 1..eosPos.
   * A node whose predecessor slot is missing is left untouched.
   * @returns the same lattice object
   */
  forward(lattice) {
    for (let endPos = 1; endPos <= lattice.eosPos; endPos++) {
      const candidates = lattice.nodesEndAt[endPos];
      if (candidates == null) continue;

      for (const node of candidates) {
        const predecessors = lattice.nodesEndAt[node.start_pos - 1];
        if (predecessors == null) continue;

        let bestCost = Number.MAX_VALUE;
        let bestPrev = null;
        for (const prevNode of predecessors) {
          const idsMissing = node.left_id == null || prevNode.right_id == null;
          let edgeCost = 0;
          if (idsMissing) {
            console.log("Left or right is null");
          } else {
            edgeCost = this.connectionCosts.get(prevNode.right_id, node.left_id);
          }
          const candidateCost = prevNode.shortest_cost + edgeCost + node.cost;
          if (candidateCost < bestCost) {
            bestPrev = prevNode;
            bestCost = candidateCost;
          }
        }
        node.prev = bestPrev;
        node.shortest_cost = bestCost;
      }
    }
    return lattice;
  }

  /**
   * Follow `prev` links from the first node of the last slot back to BOS.
   * @returns {Array} the visited nodes in forward order, or [] when the
   *   chain is broken before reaching a "BOS" node
   */
  backward(lattice) {
    const finalSlot = lattice.nodesEndAt[lattice.nodesEndAt.length - 1];
    if (!finalSlot || finalSlot.length === 0) {
      return [];
    }
    let cursor = finalSlot[0].prev;
    if (cursor == null) {
      return [];
    }
    const reversedPath = [];
    while (cursor.type !== "BOS") {
      reversedPath.push(cursor);
      cursor = cursor.prev;
      if (cursor == null) {
        return [];
      }
    }
    return reversedPath.reverse();
  }
};
|
|
263
|
+
|
|
264
|
+
// src/KoreanToken.ts
|
|
265
|
+
/**
 * Part-of-speech tag -> Korean description.
 * The trailing comment on each entry gives the English gloss.
 */
var POS_TAGS = {
  // --- Substantives ---
  NNG: "\uC77C\uBC18 \uBA85\uC0AC", // general noun
  NNP: "\uACE0\uC720 \uBA85\uC0AC", // proper noun
  NNB: "\uC758\uC874 \uBA85\uC0AC", // dependent noun
  NR: "\uC218\uC0AC", // numeral
  NP: "\uB300\uBA85\uC0AC", // pronoun

  // --- Predicates ---
  VV: "\uB3D9\uC0AC", // verb
  VA: "\uD615\uC6A9\uC0AC", // adjective
  VX: "\uBCF4\uC870 \uC6A9\uC5B8", // auxiliary predicate
  VCP: "\uAE0D\uC815 \uC9C0\uC815\uC0AC", // positive copula
  VCN: "\uBD80\uC815 \uC9C0\uC815\uC0AC", // negative copula

  // --- Determiners ---
  MM: "\uAD00\uD615\uC0AC", // determiner

  // --- Adverbs ---
  MAG: "\uC77C\uBC18 \uBD80\uC0AC", // general adverb
  MAJ: "\uC811\uC18D \uBD80\uC0AC", // conjunctive adverb

  // --- Interjections ---
  IC: "\uAC10\uD0C4\uC0AC", // interjection

  // --- Particles ---
  JKS: "\uC8FC\uACA9 \uC870\uC0AC", // subject case particle
  JKC: "\uBCF4\uACA9 \uC870\uC0AC", // complement case particle
  JKG: "\uAD00\uD615\uACA9 \uC870\uC0AC", // adnominal case particle
  JKO: "\uBAA9\uC801\uACA9 \uC870\uC0AC", // object case particle
  JKB: "\uBD80\uC0AC\uACA9 \uC870\uC0AC", // adverbial case particle
  JKV: "\uD638\uACA9 \uC870\uC0AC", // vocative case particle
  JKQ: "\uC778\uC6A9\uACA9 \uC870\uC0AC", // quotative case particle
  JX: "\uBCF4\uC870\uC0AC", // auxiliary particle
  JC: "\uC811\uC18D \uC870\uC0AC", // conjunctive particle

  // --- Endings ---
  EP: "\uC120\uC5B4\uB9D0 \uC5B4\uBBF8", // pre-final ending
  EF: "\uC885\uACB0 \uC5B4\uBBF8", // final ending
  EC: "\uC5F0\uACB0 \uC5B4\uBBF8", // connective ending
  ETN: "\uBA85\uC0AC\uD615 \uC804\uC131 \uC5B4\uBBF8", // nominalizing ending
  ETM: "\uAD00\uD615\uD615 \uC804\uC131 \uC5B4\uBBF8", // adnominalizing ending

  // --- Affixes ---
  XPN: "\uCCB4\uC5B8 \uC811\uB450\uC0AC", // noun prefix
  XSN: "\uBA85\uC0AC \uD30C\uC0DD \uC811\uBBF8\uC0AC", // noun-deriving suffix
  XSV: "\uB3D9\uC0AC \uD30C\uC0DD \uC811\uBBF8\uC0AC", // verb-deriving suffix
  XSA: "\uD615\uC6A9\uC0AC \uD30C\uC0DD \uC811\uBBF8\uC0AC", // adjective-deriving suffix
  XR: "\uC5B4\uADFC", // root

  // --- Symbols ---
  SF: "\uB9C8\uCE68\uD45C, \uBB3C\uC74C\uD45C, \uB290\uB08C\uD45C", // period, question mark, exclamation mark
  SE: "\uC904\uC784\uD45C", // ellipsis
  SS: "\uB530\uC634\uD45C, \uAD04\uD638\uD45C", // quotes, brackets
  SP: "\uC27C\uD45C, \uAC00\uC6B4\uB383\uC810, \uCF5C\uB860, \uBE57\uAE08", // comma, interpunct, colon, slash
  SO: "\uBD99\uC784\uD45C", // hyphen
  SW: "\uAE30\uD0C0 \uAE30\uD638", // other symbols

  // --- Non-Hangul ---
  SL: "\uC678\uAD6D\uC5B4", // foreign language
  SH: "\uD55C\uC790", // Chinese characters
  SN: "\uC22B\uC790", // numbers

  // --- Unknown ---
  NA: "\uBD84\uC11D\uBD88\uB2A5", // unable to analyze
  NF: "\uBA85\uC0AC\uCD94\uC815\uBC94\uC8FC", // presumed noun
  NV: "\uC6A9\uC5B8\uCD94\uC815\uBC94\uC8FC" // presumed predicate
};
|
|
368
|
+
/**
 * A single analysed morpheme with its dictionary features.
 * Every field missing from `options` falls back to a default: 0 for
 * `word_id`, "KNOWN" for `word_type`, 1 for `word_position`, "" for
 * `surface_form`, and "*" for all feature fields.
 */
var KoreanToken = class _KoreanToken {
  constructor(options = {}) {
    const {
      word_id,
      word_type,
      word_position,
      surface_form,
      pos,
      semantic_class,
      has_final_consonant,
      reading,
      type,
      first_pos,
      last_pos,
      expression,
    } = options;
    this.word_id = word_id ?? 0;
    this.word_type = word_type ?? "KNOWN";
    this.word_position = word_position ?? 1;
    this.surface_form = surface_form ?? "";
    this.pos = pos ?? "*";
    this.semantic_class = semantic_class ?? "*";
    this.has_final_consonant = has_final_consonant ?? "*";
    this.reading = reading ?? "*";
    this.type = type ?? "*";
    this.first_pos = first_pos ?? "*";
    this.last_pos = last_pos ?? "*";
    this.expression = expression ?? "*";
  }

  /** Description from POS_TAGS for `pos`, or `pos` itself when there is none. */
  get posDescription() {
    const description = POS_TAGS[this.pos];
    return description || this.pos;
  }

  /** True when `has_final_consonant` is "T". */
  get hasBatchim() {
    return this.has_final_consonant === "T";
  }

  /** True when `type` is "Compound". */
  get isCompound() {
    return this.type === "Compound";
  }

  /** True when `type` is "Inflect". */
  get isInflected() {
    return this.type === "Inflect";
  }

  /**
   * Split `expression` on "+" and each piece on "/" into
   * `{ surface, pos }` objects; [] when `expression` is "*".
   */
  get parts() {
    const { expression } = this;
    if (expression === "*") {
      return [];
    }
    const pieces = [];
    for (const segment of expression.split("+")) {
      const fields = segment.split("/");
      pieces.push({ surface: fields[0], pos: fields[1] });
    }
    return pieces;
  }

  /**
   * Build a token from a surface string and a features array laid out as
   * [pos, semantic_class, has_final_consonant, reading, type, first_pos,
   * last_pos, expression]; missing entries become "*".
   */
  static fromFeatures(surface, features, wordId = 0, position = 1, wordType = "KNOWN") {
    const featureFields = [
      "pos",
      "semantic_class",
      "has_final_consonant",
      "reading",
      "type",
      "first_pos",
      "last_pos",
      "expression",
    ];
    const options = {
      word_id: wordId,
      word_type: wordType,
      word_position: position,
      surface_form: surface,
    };
    featureFields.forEach((field, column) => {
      options[field] = features[column] ?? "*";
    });
    return new _KoreanToken(options);
  }

  /** Plain-object snapshot of all fields plus `posDescription`. */
  toJSON() {
    const {
      word_id,
      word_type,
      word_position,
      surface_form,
      pos,
      posDescription,
      semantic_class,
      has_final_consonant,
      reading,
      type,
      first_pos,
      last_pos,
      expression,
    } = this;
    return {
      word_id,
      word_type,
      word_position,
      surface_form,
      pos,
      posDescription,
      semantic_class,
      has_final_consonant,
      reading,
      type,
      first_pos,
      last_pos,
      expression,
    };
  }
};
|
|
457
|
+
|
|
458
|
+
// src/KoreanFormatter.ts
|
|
459
|
+
/**
 * Turns a split feature line into a KoreanToken.
 * Feature columns: [0] surface, [1] pos, [2] semantic class,
 * [3] final-consonant flag, [4] reading, [5] type, [6] first pos,
 * [7] last pos, [8] expression. Missing columns default to "*".
 */
var KoreanFormatter = class {
  /**
   * Token for a dictionary word; the surface form comes from column 0
   * ("" when absent).
   */
  formatEntry(wordId, position, type, features) {
    const surface = features[0] ?? "";
    const pos = features[1] ?? "*";
    const semanticClass = features[2] ?? "*";
    const finalConsonant = features[3] ?? "*";
    const reading = features[4] ?? "*";
    const entryType = features[5] ?? "*";
    const firstPos = features[6] ?? "*";
    const lastPos = features[7] ?? "*";
    const expression = features[8] ?? "*";
    return new KoreanToken({
      word_id: wordId,
      word_type: type,
      word_position: position,
      surface_form: surface,
      pos,
      semantic_class: semanticClass,
      has_final_consonant: finalConsonant,
      reading,
      type: entryType,
      first_pos: firstPos,
      last_pos: lastPos,
      expression,
    });
  }

  /**
   * Token for an unknown word; identical to formatEntry except that the
   * surface form is the caller-supplied `surfaceForm`, not column 0.
   */
  formatUnknownEntry(wordId, position, type, features, surfaceForm) {
    const pos = features[1] ?? "*";
    const semanticClass = features[2] ?? "*";
    const finalConsonant = features[3] ?? "*";
    const reading = features[4] ?? "*";
    const entryType = features[5] ?? "*";
    const firstPos = features[6] ?? "*";
    const lastPos = features[7] ?? "*";
    const expression = features[8] ?? "*";
    return new KoreanToken({
      word_id: wordId,
      word_type: type,
      word_position: position,
      surface_form: surfaceForm,
      pos,
      semantic_class: semanticClass,
      has_final_consonant: finalConsonant,
      reading,
      type: entryType,
      first_pos: firstPos,
      last_pos: lastPos,
      expression,
    });
  }
};
|
|
499
|
+
|
|
500
|
+
// src/Tokenizer.ts
|
|
501
|
+
// Characters that end a sentence for splitByPunctuation.
var PUNCTUATION = /[.?!。?!]/;

/**
 * Morphological tokenizer: splits text into sentences, builds a lattice for
 * each one, searches the best path and formats the nodes as tokens.
 */
var Tokenizer = class _Tokenizer {
  /**
   * @param dic - dictionary bundle exposing `tokenInfoDictionary`,
   *   `unknownDictionary` and `connectionCosts` (and whatever ViterbiBuilder
   *   reads from it)
   */
  constructor(dic) {
    this.tokenInfoDictionary = dic.tokenInfoDictionary;
    this.unknownDictionary = dic.unknownDictionary;
    this.viterbiBuilder = new ViterbiBuilder(dic);
    this.viterbiSearcher = new ViterbiSearcher(dic.connectionCosts);
    this.formatter = new KoreanFormatter();
  }

  /**
   * Split text after each sentence-ending punctuation character.
   * The punctuation stays attached to the sentence it ends; trailing text
   * without punctuation becomes the last element.
   * @param {string} input
   * @returns {string[]} sentences ([] for an empty string)
   */
  static splitByPunctuation(input) {
    const sentences = [];
    let tail = input;
    while (tail !== "") {
      const index = tail.search(PUNCTUATION);
      if (index < 0) {
        sentences.push(tail);
        break;
      }
      sentences.push(tail.substring(0, index + 1));
      tail = tail.substring(index + 1);
    }
    return sentences;
  }

  /**
   * Tokenize text into morphemes.
   * @param {string} text
   * @returns {Array} tokens of all sentences, in order
   */
  tokenize(text) {
    const tokens = [];
    for (const sentence of _Tokenizer.splitByPunctuation(text)) {
      this.tokenizeForSentence(sentence, tokens);
    }
    return tokens;
  }

  /**
   * Tokenize a single sentence and append its tokens to `tokens`.
   *
   * Token positions continue after the last token already in `tokens`, so
   * they stay relative to the whole text when called sentence by sentence.
   *
   * @param {string} sentence
   * @param {Array} [tokens=[]] - accumulator that is mutated and returned
   * @returns {Array} the accumulator
   */
  tokenizeForSentence(sentence, tokens = []) {
    const lattice = this.getLattice(sentence);
    const bestPath = this.viterbiSearcher.search(lattice);

    // Offset = position of the last character already consumed. The previous
    // token's start alone is only right for one-character tokens, so add its
    // length in code points; an empty surface is treated as one character to
    // never fall below the previous start.
    let lastPos = 0;
    if (tokens.length > 0) {
      const previous = tokens[tokens.length - 1];
      const previousLength = Array.from(String(previous.surface_form ?? "")).length;
      lastPos = previous.word_position + Math.max(previousLength, 1) - 1;
    }

    for (const node of bestPath) {
      const position = lastPos + node.start_pos;
      let token;
      if (node.type === "KNOWN") {
        const featuresLine = this.tokenInfoDictionary.getFeatures(node.name);
        const features = featuresLine ? featuresLine.split(",") : [];
        token = this.formatter.formatEntry(node.name, position, "KNOWN", features);
      } else if (node.type === "UNKNOWN") {
        const featuresLine = this.unknownDictionary.getFeatures(node.name);
        const features = featuresLine ? featuresLine.split(",") : [];
        token = this.formatter.formatUnknownEntry(
          node.name,
          position,
          "UNKNOWN",
          features,
          node.surface_form
        );
      } else {
        // Any other node type is formatted as a feature-less known entry.
        token = this.formatter.formatEntry(node.name, position, "KNOWN", []);
      }
      tokens.push(token);
    }
    return tokens;
  }

  /**
   * @param {string} text
   * @returns {string[]} the surface forms of all tokens
   */
  wakati(text) {
    return this.tokenize(text).map((token) => token.surface_form);
  }

  /**
   * @param {string} text
   * @returns {string} the surface forms joined with single spaces
   */
  wakatiString(text) {
    return this.wakati(text).join(" ");
  }

  /**
   * Build the word lattice for `text` without searching it.
   */
  getLattice(text) {
    return this.viterbiBuilder.build(text);
  }
};
|
|
603
|
+
|
|
604
|
+
// src/util/ByteBuffer.ts
|
|
605
|
+
/**
 * Encode a JavaScript string as UTF-8.
 *
 * @param {string} str
 * @returns {Uint8Array} view over the encoded bytes (a subarray of a buffer
 *   sized for the worst case of 4 bytes per UTF-16 code unit)
 * @throws {Error} "Malformed surrogate pair" when a high surrogate is not
 *   followed by a low surrogate
 */
function stringToUtf8Bytes(str) {
  const out = new Uint8Array(str.length * 4);
  let written = 0;
  for (let read = 0; read < str.length; ) {
    const unit = str.charCodeAt(read);
    read += 1;

    // Combine a high/low surrogate pair into one code point.
    let codePoint = unit;
    if (unit >= 0xd800 && unit <= 0xdbff) {
      const trail = str.charCodeAt(read);
      read += 1;
      // Written as a negation so that NaN (end of string) is rejected too.
      if (!(trail >= 0xdc00 && trail <= 0xdfff)) {
        throw new Error("Malformed surrogate pair");
      }
      codePoint = (unit - 0xd800) * 0x400 + 0x10000 + (trail - 0xdc00);
    }

    if (codePoint < 0x80) {
      out[written++] = codePoint;
    } else if (codePoint < 0x800) {
      out[written++] = (codePoint >>> 6) | 0xc0;
      out[written++] = (codePoint & 0x3f) | 0x80;
    } else if (codePoint < 0x10000) {
      out[written++] = (codePoint >>> 12) | 0xe0;
      out[written++] = ((codePoint >> 6) & 0x3f) | 0x80;
      out[written++] = (codePoint & 0x3f) | 0x80;
    } else if (codePoint < 0x200000) {
      out[written++] = (codePoint >>> 18) | 0xf0;
      out[written++] = ((codePoint >> 12) & 0x3f) | 0x80;
      out[written++] = ((codePoint >> 6) & 0x3f) | 0x80;
      out[written++] = (codePoint & 0x3f) | 0x80;
    }
  }
  return out.subarray(0, written);
}
|
|
641
|
+
/**
 * Decode UTF-8 bytes into a JavaScript string.
 *
 * The lead byte selects the sequence length: below 128 is one byte,
 * `110xxxxx` two bytes, `1110xxxx` three bytes, and anything else is read as
 * a four-byte sequence. No validation of continuation bytes is performed.
 *
 * @param {ArrayLike<number>} bytes
 * @returns {string}
 */
function utf8BytesToString(bytes) {
  const chunks = [];
  let cursor = 0;
  // Payload (low six bits) of the next byte; advances the cursor.
  const continuation = () => bytes[cursor++] & 0x3f;

  while (cursor < bytes.length) {
    const lead = bytes[cursor++];
    let codePoint;
    if (lead < 0x80) {
      codePoint = lead;
    } else if (lead >> 5 === 0b110) {
      const c1 = continuation();
      codePoint = ((lead & 0x1f) << 6) | c1;
    } else if (lead >> 4 === 0b1110) {
      const c1 = continuation();
      const c2 = continuation();
      codePoint = ((lead & 0x0f) << 12) | (c1 << 6) | c2;
    } else {
      const c1 = continuation();
      const c2 = continuation();
      const c3 = continuation();
      codePoint = ((lead & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
    }

    if (codePoint < 0x10000) {
      chunks.push(String.fromCharCode(codePoint));
    } else {
      // Re-split supplementary code points into a surrogate pair.
      const offset = codePoint - 0x10000;
      chunks.push(String.fromCharCode(0xd800 | (offset >> 10), 0xdc00 | (offset & 0x3ff)));
    }
  }
  return chunks.join("");
}
|
|
673
|
+
/**
 * Growable little-endian byte buffer with a read/write cursor (`position`).
 *
 * Read methods return 0 (or stop, for strings) instead of throwing when the
 * requested range lies beyond the end of the buffer.
 */
var ByteBuffer = class {
  /**
   * @param {number|Uint8Array|ArrayBuffer|null} [arg] - nothing/null for a
   *   1 MiB buffer, a number for that many bytes, a Uint8Array to use
   *   directly, or an ArrayBuffer to wrap
   * @throws {Error} for any other argument type
   */
  constructor(arg) {
    if (arg == null) {
      this.buffer = new Uint8Array(1024 * 1024);
    } else if (typeof arg === "number") {
      this.buffer = new Uint8Array(arg);
    } else if (arg instanceof Uint8Array) {
      this.buffer = arg;
    } else if (arg instanceof ArrayBuffer) {
      this.buffer = new Uint8Array(arg);
    } else {
      throw new Error("Invalid parameter type for ByteBuffer constructor");
    }
    this.position = 0;
  }

  /** @returns {number} current capacity in bytes */
  size() {
    return this.buffer.length;
  }

  /**
   * Double the capacity, keeping the existing contents.
   * An empty buffer grows to one byte, since doubling zero would not grow it.
   */
  reallocate() {
    const grown = new Uint8Array(Math.max(1, this.buffer.length * 2));
    grown.set(this.buffer);
    this.buffer = grown;
  }

  /**
   * Trim the buffer to the bytes before `position`.
   * @returns {Uint8Array} the trimmed view
   */
  shrink() {
    this.buffer = this.buffer.subarray(0, this.position);
    return this.buffer;
  }

  /**
   * Write one byte at `position` and advance it, growing as needed.
   * @param {number} b - byte value (stored modulo 256 by the typed array)
   */
  put(b) {
    // Loop rather than grow once: a single doubling is not enough for an
    // empty buffer or when `position` was moved past the end.
    while (this.buffer.length < this.position + 1) {
      this.reallocate();
    }
    this.buffer[this.position++] = b;
  }

  /**
   * Read one byte.
   * @param {number} [index] - absolute offset; when omitted, reads at
   *   `position` and advances it by 1
   * @returns {number} the byte, or 0 when out of range
   */
  get(index) {
    if (index == null) {
      index = this.position;
      this.position += 1;
    }
    if (this.buffer.length < index + 1) {
      return 0;
    }
    return this.buffer[index];
  }

  /**
   * Write a 16-bit value, low byte first.
   * @param {number} num
   * @throws {Error} when `num` is greater than 65535
   */
  putShort(num) {
    if (65535 < num) {
      throw new Error(`${num} is over short value`);
    }
    this.put(num & 0xff);
    this.put((num & 0xff00) >> 8);
  }

  /**
   * Read a signed 16-bit little-endian value.
   * @param {number} [index] - absolute offset; when omitted, reads at
   *   `position` and advances it by 2
   * @returns {number} value in [-32768, 32767], or 0 when out of range
   */
  getShort(index) {
    if (index == null) {
      index = this.position;
      this.position += 2;
    }
    if (this.buffer.length < index + 2) {
      return 0;
    }
    const lower = this.buffer[index];
    const upper = this.buffer[index + 1];
    let value = (upper << 8) + lower;
    if (value & 0x8000) {
      // Sign bit set: convert from two's complement.
      value = -((value - 1) ^ 0xffff);
    }
    return value;
  }

  /**
   * Write a 32-bit value, low byte first.
   * @param {number} num
   * @throws {Error} when `num` is greater than 4294967295
   */
  putInt(num) {
    if (4294967295 < num) {
      throw new Error(`${num} is over integer value`);
    }
    this.put(num & 0xff);
    this.put((num & 0xff00) >> 8);
    this.put((num & 0xff0000) >> 16);
    // Unsigned shift: the masked value has bit 31 set for large inputs.
    this.put((num & 0xff000000) >>> 24);
  }

  /**
   * Read an unsigned 32-bit little-endian value.
   * @param {number} [index] - absolute offset; when omitted, reads at
   *   `position` and advances it by 4
   * @returns {number} the value, or 0 when out of range
   */
  getInt(index) {
    if (index == null) {
      index = this.position;
      this.position += 4;
    }
    if (this.buffer.length < index + 4) {
      return 0;
    }
    const b0 = this.buffer[index];
    const b1 = this.buffer[index + 1];
    const b2 = this.buffer[index + 2];
    const b3 = this.buffer[index + 3];
    return ((b3 << 24) >>> 0) + (b2 << 16) + (b1 << 8) + b0;
  }

  /**
   * Read a 32-bit value at `position` and advance it by 4.
   * @returns {number}
   */
  readInt() {
    const pos = this.position;
    this.position += 4;
    return this.getInt(pos);
  }

  /**
   * Write `str` as UTF-8 followed by a terminating zero byte.
   * @param {string} str
   */
  putString(str) {
    const bytes = stringToUtf8Bytes(str);
    for (let i = 0; i < bytes.length; i++) {
      this.put(bytes[i]);
    }
    this.put(0);
  }

  /**
   * Read a zero-terminated UTF-8 string.
   * Stops at the first zero byte or at the end of the buffer and leaves
   * `position` just past the last byte consumed.
   * @param {number} [index] - absolute start offset; defaults to `position`
   * @returns {string}
   */
  getString(index) {
    const buf = [];
    if (index == null) {
      index = this.position;
    }
    while (this.buffer.length >= index + 1) {
      const ch = this.get(index++);
      if (ch === 0) {
        break;
      }
      buf.push(ch);
    }
    this.position = index;
    return utf8BytesToString(buf);
  }
};
|
|
807
|
+
|
|
808
|
+
// src/dict/TokenInfoDictionary.ts
|
|
809
|
+
var TokenInfoDictionary = class {
  /**
   * Create an empty dictionary: a record buffer, a feature-string buffer
   * (both constructed with a 10 MiB size argument) and an empty target map.
   */
  constructor() {
    const initialSize = 10 * 1024 * 1024;
    this.dictionary = new ByteBuffer(initialSize);
    this.targetMap = {};
    this.posBuffer = new ByteBuffer(initialSize);
  }

  /**
   * Build dictionary from entries.
   * Entry format: [surface, left_id, right_id, word_cost, ...features]
   *
   * Entries with fewer than four fields are skipped silently; entries whose
   * ids or cost are not finite are logged and skipped.
   *
   * @param entries list of entry arrays
   * @returns object mapping each stored token info id to its surface form
   */
  buildDictionary(entries) {
    const surfaceByTokenId = {};
    for (const entry of entries) {
      if (entry.length < 4) {
        continue;
      }
      const [surfaceForm, leftId, rightId, wordCost, ...features] = entry;
      // Global isFinite on purpose: it also accepts numeric strings.
      const allNumeric = isFinite(leftId) && isFinite(rightId) && isFinite(wordCost);
      if (!allNumeric) {
        console.log(entry);
        continue;
      }
      const tokenInfoId = this.put(leftId, rightId, wordCost, surfaceForm, features.join(","));
      surfaceByTokenId[tokenInfoId] = surfaceForm;
    }
    this.dictionary.shrink();
    this.posBuffer.shrink();
    return surfaceByTokenId;
  }

  /**
   * Append one token record: three shorts (left id, right id, word cost)
   * followed by an int holding the position of the feature string, which is
   * appended to the pos buffer as "surface,feature".
   *
   * @returns position of the record in the dictionary buffer (the token info id)
   */
  put(leftId, rightId, wordCost, surfaceForm, feature) {
    const { dictionary, posBuffer } = this;
    const tokenInfoId = dictionary.position;
    const posId = posBuffer.position;
    for (const shortField of [leftId, rightId, wordCost]) {
      dictionary.putShort(shortField);
    }
    dictionary.putInt(posId);
    posBuffer.putString(`${surfaceForm},${feature}`);
    return tokenInfoId;
  }

  /**
   * Register `target` under `source` in the target map, creating the list
   * on first use.
   */
  addMapping(source, target) {
    const known = this.targetMap[source];
    if (known == null) {
      this.targetMap[source] = [target];
      return;
    }
    known.push(target);
  }

  /**
   * Serialize the target map as ints: key count, then for each key the key,
   * the number of values and the values themselves.
   *
   * @returns whatever the buffer's shrink() returns
   */
  targetMapToBuffer() {
    const out = new ByteBuffer();
    out.putInt(Object.keys(this.targetMap).length);
    for (const key in this.targetMap) {
      const targets = this.targetMap[parseInt(key, 10)];
      const targetCount = targets.length;
      out.putInt(parseInt(key, 10));
      out.putInt(targetCount);
      for (const target of targets) {
        out.putInt(target);
      }
    }
    return out.shrink();
  }

  // Load from tid.dat
  loadDictionary(arrayBuffer) {
    const bytes = arrayBuffer instanceof ArrayBuffer ? new Uint8Array(arrayBuffer) : arrayBuffer;
    this.dictionary = new ByteBuffer(bytes);
    return this;
  }

  // Load from tid_pos.dat
  loadPosVector(arrayBuffer) {
    const bytes = arrayBuffer instanceof ArrayBuffer ? new Uint8Array(arrayBuffer) : arrayBuffer;
    this.posBuffer = new ByteBuffer(bytes);
    return this;
  }

  // Load from tid_map.dat
  loadTargetMap(arrayBuffer) {
    const bytes = arrayBuffer instanceof ArrayBuffer ? new Uint8Array(arrayBuffer) : arrayBuffer;
    const reader = new ByteBuffer(bytes);
    reader.position = 0;
    this.targetMap = {};
    // The leading key count is read and discarded; the loop runs to the end of the data.
    reader.readInt();
    while (reader.buffer.length >= reader.position + 1) {
      const key = reader.readInt();
      const valueCount = reader.readInt();
      for (let n = 0; n < valueCount; n++) {
        this.addMapping(key, reader.readInt());
      }
    }
    return this;
  }

  /**
   * Look up features in the dictionary.
   *
   * @param tokenInfoIdStr token info id, as a number or a decimal string
   * @returns the stored "surface,feature" string, or "" when the id is not a number
   */
  getFeatures(tokenInfoIdStr) {
    let tokenInfoId = tokenInfoIdStr;
    if (typeof tokenInfoIdStr === "string") {
      tokenInfoId = parseInt(tokenInfoIdStr, 10);
    }
    if (isNaN(tokenInfoId)) {
      return "";
    }
    // The feature-string position is the int written after the three shorts in put().
    const posId = this.dictionary.getInt(tokenInfoId + 6);
    return this.posBuffer.getString(posId);
  }
};
|
|
922
|
+
|
|
923
|
+
// src/dict/ConnectionCosts.ts
|
|
924
|
+
var ConnectionCosts = class {
  /**
   * Allocate a cost table of forwardDimension x backwardDimension 16-bit
   * cells, preceded by two header cells holding the dimensions.
   */
  constructor(forwardDimension, backwardDimension) {
    const table = new Int16Array(forwardDimension * backwardDimension + 2);
    table[0] = forwardDimension;
    table[1] = backwardDimension;
    this.forwardDimension = forwardDimension;
    this.backwardDimension = backwardDimension;
    this.buffer = table;
  }

  /**
   * Store the cost for a (forwardId, backwardId) pair.
   * Throws when the computed cell lies past the end of the table.
   */
  put(forwardId, backwardId, cost) {
    // Cells are laid out row by row after the two header cells.
    const cell = forwardId * this.backwardDimension + backwardId + 2;
    if (cell + 1 > this.buffer.length) {
      throw new Error("ConnectionCosts buffer overflow");
    }
    this.buffer[cell] = cost;
  }

  /**
   * Read the cost for a (forwardId, backwardId) pair.
   * Throws when the computed cell lies past the end of the table.
   */
  get(forwardId, backwardId) {
    const cell = forwardId * this.backwardDimension + backwardId + 2;
    if (cell + 1 > this.buffer.length) {
      throw new Error("ConnectionCosts buffer overflow");
    }
    return this.buffer[cell];
  }

  /**
   * Adopt an already-built table; its first two cells supply the dimensions.
   */
  loadConnectionCosts(connectionCostsBuffer) {
    const [forward, backward] = [connectionCostsBuffer[0], connectionCostsBuffer[1]];
    this.forwardDimension = forward;
    this.backwardDimension = backward;
    this.buffer = connectionCostsBuffer;
  }
};
|
|
952
|
+
|
|
953
|
+
// src/dict/CharacterClass.ts
|
|
954
|
+
var CharacterClass = class {
  /**
   * Plain record describing one character class.
   *
   * @param classId value stored as class_id
   * @param className value stored as class_name
   * @param isAlwaysInvoke value stored as is_always_invoke
   * @param isGrouping value stored as is_grouping
   * @param maxLength value stored as max_length
   */
  constructor(classId, className, isAlwaysInvoke, isGrouping, maxLength) {
    Object.assign(this, {
      class_id: classId,
      class_name: className,
      is_always_invoke: isAlwaysInvoke,
      is_grouping: isGrouping,
      max_length: maxLength
    });
  }
};
|
|
963
|
+
|
|
964
|
+
// src/dict/InvokeDefinitionMap.ts
|
|
965
|
+
var InvokeDefinitionMap = class _InvokeDefinitionMap {
  /**
   * Holds the character classes by ID (`map`) and a name -> ID table
   * (`lookupTable`). Both start empty.
   */
  constructor() {
    this.map = [];
    this.lookupTable = {};
  }

  /**
   * Load InvokeDefinitionMap from buffer.
   *
   * Records are read in the layout written by toBuffer(): one byte
   * (always-invoke flag), one byte (grouping flag), an int (max length) and
   * a string (class name). Class IDs are assigned in reading order.
   *
   * @param invokeDefBuffer data handed to the ByteBuffer constructor
   * @returns a new, initialized InvokeDefinitionMap
   */
  static load(invokeDefBuffer) {
    const invokeDef = new _InvokeDefinitionMap();
    const characterCategoryDefinition = [];
    const buffer = new ByteBuffer(invokeDefBuffer);
    while (buffer.position + 1 < buffer.size()) {
      const classId = characterCategoryDefinition.length;
      // The flags are kept exactly as read from the buffer (not converted to booleans).
      const isAlwaysInvoke = buffer.get();
      const isGrouping = buffer.get();
      const maxLength = buffer.getInt();
      const className = buffer.getString();
      characterCategoryDefinition.push(
        new CharacterClass(classId, className, isAlwaysInvoke, isGrouping, maxLength)
      );
    }
    invokeDef.init(characterCategoryDefinition);
    return invokeDef;
  }

  /**
   * Initialize with character category definitions.
   * The index of each definition becomes its class ID. A null/undefined
   * argument leaves the map untouched.
   *
   * @param characterCategoryDefinition array of objects with a `class_name` property
   */
  init(characterCategoryDefinition) {
    if (characterCategoryDefinition == null) {
      return;
    }
    for (let i = 0; i < characterCategoryDefinition.length; i++) {
      const characterClass = characterCategoryDefinition[i];
      this.map[i] = characterClass;
      this.lookupTable[characterClass.class_name] = i;
    }
  }

  /**
   * Get class information by class ID.
   *
   * @returns the stored class, or undefined when the ID is unknown
   */
  getCharacterClass(classId) {
    return this.map[classId];
  }

  /**
   * Lookup class ID by class name.
   *
   * Only names registered through init() are considered: the table is a
   * plain object, so names such as "toString" or "constructor" would
   * otherwise resolve to inherited Object.prototype members instead of null.
   *
   * @returns the class ID, or null when the name is not registered
   */
  lookup(className) {
    if (!Object.prototype.hasOwnProperty.call(this.lookupTable, className)) {
      return null;
    }
    return this.lookupTable[className] ?? null;
  }

  /**
   * Transform from map to binary buffer.
   * Writes, per class: always-invoke flag byte, grouping flag byte,
   * max length int and class name string.
   *
   * @returns the `buffer` property of the ByteBuffer after shrink()
   */
  toBuffer() {
    const buffer = new ByteBuffer();
    for (const charClass of this.map) {
      buffer.put(charClass.is_always_invoke ? 1 : 0);
      buffer.put(charClass.is_grouping ? 1 : 0);
      buffer.putInt(charClass.max_length);
      buffer.putString(charClass.class_name);
    }
    buffer.shrink();
    return buffer.buffer;
  }
};
|
|
1035
|
+
|
|
1036
|
+
// src/dict/CharacterDefinition.ts
|
|
1037
|
+
var DEFAULT_CATEGORY = "DEFAULT";
var CharacterDefinition = class _CharacterDefinition {
  /**
   * Holds, per UTF-16 code unit, the primary class ID
   * (`characterCategoryMap`) and a bitset of compatible class IDs
   * (`compatibleCategoryMap`), plus the class table (`invokeDefinitionMap`).
   */
  constructor() {
    this.characterCategoryMap = new Uint8Array(65536);
    this.compatibleCategoryMap = new Uint32Array(65536);
    this.invokeDefinitionMap = null;
  }

  /**
   * Load CharacterDefinition from buffers.
   *
   * @param catMapBuffer used as-is as the primary category map
   * @param compatCatMapBuffer used as-is as the compatible category bitset map
   * @param invokeDefBuffer passed to InvokeDefinitionMap.load
   * @returns the populated CharacterDefinition
   */
  static load(catMapBuffer, compatCatMapBuffer, invokeDefBuffer) {
    const charDef = new _CharacterDefinition();
    charDef.characterCategoryMap = catMapBuffer;
    charDef.compatibleCategoryMap = compatCatMapBuffer;
    charDef.invokeDefinitionMap = InvokeDefinitionMap.load(invokeDefBuffer);
    return charDef;
  }

  /**
   * Parse a code point token from char.def.
   *
   * MeCab-style char.def files write code points as 0x-prefixed hex
   * literals (e.g. "0x0020"). Parsing those with radix 10 stops at the "x"
   * and yields 0, so the radix is chosen from the prefix. Plain decimal
   * tokens are still parsed as decimal.
   *
   * @param token code point text
   * @returns the parsed integer, or NaN when the token is not a number
   */
  static parseCodePoint(token) {
    const literal = String(token).trim();
    const radix = /^[+-]?0x/i.test(literal) ? 16 : 10;
    return parseInt(literal, radix);
  }

  /**
   * Parse one category definition line of char.def.
   *
   * @param classId ID to give the resulting class
   * @param parsedCategoryDef array-like: [1] name, [2] invoke flag, [3] group flag, [4] length
   * @returns a CharacterClass, or null (after logging) when a field is invalid
   */
  static parseCharCategory(classId, parsedCategoryDef) {
    const category = parsedCategoryDef[1];
    const invoke = parseInt(parsedCategoryDef[2], 10);
    const grouping = parseInt(parsedCategoryDef[3], 10);
    const maxLength = parseInt(parsedCategoryDef[4], 10);
    if (!Number.isFinite(invoke) || (invoke !== 0 && invoke !== 1)) {
      console.log("char.def parse error. INVOKE is 0 or 1 in:" + invoke);
      return null;
    }
    if (!Number.isFinite(grouping) || (grouping !== 0 && grouping !== 1)) {
      console.log("char.def parse error. GROUP is 0 or 1 in:" + grouping);
      return null;
    }
    if (!Number.isFinite(maxLength) || maxLength < 0) {
      console.log("char.def parse error. LENGTH is 1 to n:" + maxLength);
      return null;
    }
    const isInvoke = invoke === 1;
    const isGrouping = grouping === 1;
    return new CharacterClass(classId, category, isInvoke, isGrouping, maxLength);
  }

  /**
   * Parse a single-code-point mapping line of char.def.
   * An invalid code point is logged but the mapping is still returned.
   *
   * @param parsedCategoryMapping array-like: [1] code point, [2] default category, [3..] compatible categories
   * @returns { start, default, compatible }
   */
  static parseCategoryMapping(parsedCategoryMapping) {
    const start = _CharacterDefinition.parseCodePoint(parsedCategoryMapping[1]);
    const defaultCategory = parsedCategoryMapping[2];
    const compatibleCategory = parsedCategoryMapping.length > 3 ? parsedCategoryMapping.slice(3) : [];
    if (!Number.isFinite(start) || start < 0 || start > 65535) {
      console.log("char.def parse error. CODE is invalid:" + start);
    }
    return { start, default: defaultCategory, compatible: compatibleCategory };
  }

  /**
   * Parse a code-point-range mapping line of char.def.
   * Invalid code points are logged but the mapping is still returned.
   *
   * @param parsedCategoryMapping array-like: [1] first code point, [2] last code point,
   *   [3] default category, [4..] compatible categories
   * @returns { start, end, default, compatible }
   */
  static parseRangeCategoryMapping(parsedCategoryMapping) {
    const start = _CharacterDefinition.parseCodePoint(parsedCategoryMapping[1]);
    const end = _CharacterDefinition.parseCodePoint(parsedCategoryMapping[2]);
    const defaultCategory = parsedCategoryMapping[3];
    const compatibleCategory = parsedCategoryMapping.length > 4 ? parsedCategoryMapping.slice(4) : [];
    if (!Number.isFinite(start) || start < 0 || start > 65535) {
      console.log("char.def parse error. CODE is invalid:" + start);
    }
    if (!Number.isFinite(end) || end < 0 || end > 65535) {
      console.log("char.def parse error. CODE is invalid:" + end);
    }
    return { start, end, default: defaultCategory, compatible: compatibleCategory };
  }

  /**
   * Initialize category mappings.
   *
   * For every mapping, each code point in [start, end] gets the class ID of
   * the default category and the bits of all resolvable compatible
   * categories. Afterwards, code points that received no explicit primary
   * category are given the class ID of DEFAULT_CATEGORY.
   *
   * Does nothing when no invoke definition map is set.
   *
   * @param categoryMapping array of mappings as returned by the parse*Mapping helpers (may be null)
   */
  initCategoryMappings(categoryMapping) {
    const invokeMap = this.invokeDefinitionMap;
    if (invokeMap == null) {
      return;
    }
    const categoryMap = this.characterCategoryMap;
    const compatibleMap = this.compatibleCategoryMap;
    // Remembers which code points were mapped explicitly, so that an explicit
    // mapping to class ID 0 is not mistaken for "unmapped" by the default fill.
    const explicitlyMapped = new Uint8Array(categoryMap.length);
    if (categoryMapping != null) {
      for (const mapping of categoryMapping) {
        const end = mapping.end ?? mapping.start;
        // Name lookups do not depend on the code point, so resolve them once per mapping.
        const classId = invokeMap.lookup(mapping.default);
        let compatibleBits = 0;
        for (const compatibleCategory of mapping.compatible) {
          if (compatibleCategory == null) {
            continue;
          }
          const compatClassId = invokeMap.lookup(compatibleCategory);
          if (compatClassId == null) {
            continue;
          }
          compatibleBits |= 1 << compatClassId;
        }
        for (let codePoint = mapping.start; codePoint <= end; codePoint++) {
          if (classId != null) {
            categoryMap[codePoint] = classId;
            explicitlyMapped[codePoint] = 1;
          }
          if (compatibleBits !== 0) {
            compatibleMap[codePoint] |= compatibleBits;
          }
        }
      }
    }
    const defaultId = invokeMap.lookup(DEFAULT_CATEGORY);
    if (defaultId == null || defaultId === 0) {
      // Without a DEFAULT class there is nothing to fill in; with ID 0 the
      // untouched zero cells already denote it.
      return;
    }
    for (let codePoint = 0; codePoint < categoryMap.length; codePoint++) {
      if (explicitlyMapped[codePoint] === 0 && categoryMap[codePoint] === 0) {
        // This map stores class IDs (lookup() feeds its values straight to
        // getCharacterClass), so store the ID itself, not a bitmask.
        categoryMap[codePoint] = defaultId;
      }
    }
  }

  /**
   * Lookup compatible categories for a character (not included 1st category).
   * Only the first UTF-16 code unit of `ch` is examined.
   *
   * @param ch character to look up
   * @returns array of character classes whose bit is set for the character
   */
  lookupCompatibleCategory(ch) {
    const classes = [];
    const code = ch.charCodeAt(0);
    const bitset = code < this.compatibleCategoryMap.length ? this.compatibleCategoryMap[code] : 0;
    if (bitset == null || bitset === 0) {
      return classes;
    }
    for (let bit = 0; bit < 32; bit++) {
      if (((bitset >>> bit) & 1) !== 1) {
        continue;
      }
      const characterClass = this.invokeDefinitionMap?.getCharacterClass(bit);
      if (characterClass == null) {
        continue;
      }
      classes.push(characterClass);
    }
    return classes;
  }

  /**
   * Lookup category for a character.
   * Surrogate pairs, and code units outside the category map, resolve to
   * the DEFAULT category.
   *
   * @param ch character to look up
   * @returns the character class, or undefined when none can be resolved
   */
  lookup(ch) {
    let classId = null;
    const code = ch.charCodeAt(0);
    if (SurrogateAwareString.isSurrogatePair(ch)) {
      classId = this.invokeDefinitionMap?.lookup(DEFAULT_CATEGORY) ?? null;
    } else if (code < this.characterCategoryMap.length) {
      classId = this.characterCategoryMap[code];
    }
    if (classId == null) {
      classId = this.invokeDefinitionMap?.lookup(DEFAULT_CATEGORY) ?? null;
    }
    if (classId == null) {
      return void 0;
    }
    return this.invokeDefinitionMap?.getCharacterClass(classId);
  }
};
|
|
1184
|
+
|
|
1185
|
+
// src/dict/UnknownDictionary.ts
|
|
1186
|
+
var UnknownDictionary = class extends TokenInfoDictionary {
  /**
   * Token info dictionary for unknown words; additionally carries a
   * character definition, which is unset until loaded or assigned.
   */
  constructor() {
    super();
    this.characterDefinition = null;
  }

  /**
   * Assign the character definition.
   *
   * @returns this, for chaining
   */
  setCharacterDefinition(characterDefinition) {
    this.characterDefinition = characterDefinition;
    return this;
  }

  /**
   * Delegate to the character definition's lookup().
   *
   * @returns its result, or undefined when no character definition is set
   */
  lookup(ch) {
    const definition = this.characterDefinition;
    if (definition == null) {
      return void 0;
    }
    return definition.lookup(ch);
  }

  /**
   * Delegate to the character definition's lookupCompatibleCategory().
   *
   * @returns its result, or an empty array when it is unavailable
   */
  lookupCompatibleCategory(ch) {
    const definition = this.characterDefinition;
    const compatible = definition == null ? void 0 : definition.lookupCompatibleCategory(ch);
    return compatible ?? [];
  }

  /**
   * Load the three token-info buffers and build the character definition
   * from the three character-definition buffers.
   */
  loadUnknownDictionaries(unkBuffer, unkPosBuffer, unkMapBuffer, catMapBuffer, compatCatMapBuffer, invokeDefBuffer) {
    this.loadDictionary(unkBuffer);
    this.loadPosVector(unkPosBuffer);
    this.loadTargetMap(unkMapBuffer);
    const definition = CharacterDefinition.load(catMapBuffer, compatCatMapBuffer, invokeDefBuffer);
    this.characterDefinition = definition;
  }
};
|
|
1212
|
+
|
|
1213
|
+
// src/dict/DynamicDictionaries.ts
|
|
1214
|
+
var DynamicDictionaries = class {
  /**
   * Bundle of the four dictionary parts. Every argument that is
   * null/undefined is replaced by an empty default; the default trie
   * matches nothing.
   */
  constructor(trie, tokenInfoDictionary, connectionCosts, unknownDictionary) {
    const emptyTrie = {
      commonPrefixSearch: () => []
    };
    this.trie = trie ?? emptyTrie;
    this.tokenInfoDictionary = tokenInfoDictionary ?? new TokenInfoDictionary();
    this.connectionCosts = connectionCosts ?? new ConnectionCosts(0, 0);
    this.unknownDictionary = unknownDictionary ?? new UnknownDictionary();
  }

  // Load from base.dat & check.dat
  async loadTrie(baseBuffer, checkBuffer) {
    const loaded = await import("doublearray");
    // Use the default export when the module namespace provides one.
    const doublearray = loaded.default || loaded;
    this.trie = doublearray.load(baseBuffer, checkBuffer);
    return this;
  }

  /**
   * Feed the token-info, pos and target-map buffers to the token info
   * dictionary, in that order.
   *
   * @returns this, for chaining
   */
  loadTokenInfoDictionaries(tokenInfoBuffer, posBuffer, targetMapBuffer) {
    const tokenInfo = this.tokenInfoDictionary;
    tokenInfo.loadDictionary(tokenInfoBuffer);
    tokenInfo.loadPosVector(posBuffer);
    tokenInfo.loadTargetMap(targetMapBuffer);
    return this;
  }

  /**
   * Hand the connection cost table to the connection costs object.
   *
   * @returns this, for chaining
   */
  loadConnectionCosts(ccBuffer) {
    const costs = this.connectionCosts;
    costs.loadConnectionCosts(ccBuffer);
    return this;
  }

  /**
   * Forward all six unknown-word buffers to the unknown dictionary.
   *
   * @returns this, for chaining
   */
  loadUnknownDictionaries(unkBuffer, unkPosBuffer, unkMapBuffer, catMapBuffer, compatCatMapBuffer, invokeDefBuffer) {
    const unknown = this.unknownDictionary;
    unknown.loadUnknownDictionaries(unkBuffer, unkPosBuffer, unkMapBuffer, catMapBuffer, compatCatMapBuffer, invokeDefBuffer);
    return this;
  }
};
|
|
1252
|
+
|
|
1253
|
+
// src/loader/DictionaryLoader.ts
|
|
1254
|
+
var DictionaryLoader = class {
  /**
   * @param dicPath directory or base URL of the dictionary files; a trailing
   *   "/" is appended when missing. Paths not starting with "http://" or
   *   "https://" are treated as local.
   */
  constructor(dicPath) {
    this.dic = new DynamicDictionaries();
    this.dicPath = dicPath.endsWith("/") ? dicPath : `${dicPath}/`;
    const isRemote = dicPath.startsWith("http://") || dicPath.startsWith("https://");
    this.isLocalPath = !isRemote;
  }

  /**
   * Load a file as ArrayBuffer, handling both compressed and uncompressed.
   *
   * Local paths under Node are read from the file system; everything else
   * goes through fetch(). Data starting with the bytes 0x1f 0x8b is
   * inflated with pako before being returned.
   *
   * @param filename file name relative to the dictionary path
   * @returns the (possibly decompressed) file contents
   */
  async loadArrayBuffer(filename) {
    const location = this.dicPath + filename;
    const useFileSystem = this.isLocalPath && typeof process !== "undefined" && process.versions?.node;
    let raw;
    if (useFileSystem) {
      const { readFile } = await import("fs/promises");
      const { resolve } = await import("path");
      const contents = await readFile(resolve(location));
      // Copy out exactly the bytes of this file from the underlying ArrayBuffer.
      const begin = contents.byteOffset;
      raw = contents.buffer.slice(begin, begin + contents.byteLength);
    } else {
      const response = await fetch(location);
      if (!response.ok) {
        throw new Error(`Failed to load ${location}: ${response.status} ${response.statusText}`);
      }
      raw = await response.arrayBuffer();
    }
    const bytes = new Uint8Array(raw);
    const startsWithGzipMagic = bytes[0] === 0x1f && bytes[1] === 0x8b;
    if (!startsWithGzipMagic) {
      return raw;
    }
    const pako = await import("pako");
    return pako.inflate(bytes).buffer;
  }

  /**
   * Load all dictionary files.
   *
   * Every file is first requested with a ".gz" suffix; if that fails for
   * any reason, the plain file name is tried instead. All twelve loads are
   * started together.
   *
   * @returns the populated DynamicDictionaries instance
   */
  async load() {
    const withGzipFallback = (name) =>
      this.loadArrayBuffer(name + ".gz").catch(() => this.loadArrayBuffer(name));
    const fileNames = [
      // TRIE
      "base.dat",
      "check.dat",
      // Token info
      "tid.dat",
      "tid_pos.dat",
      "tid_map.dat",
      // Connection costs
      "cc.dat",
      // Unknown words
      "unk.dat",
      "unk_pos.dat",
      "unk_map.dat",
      "unk_char.dat",
      "unk_compat.dat",
      "unk_invoke.dat"
    ];
    const [
      base,
      check,
      tid,
      tidPos,
      tidMap,
      cc,
      unk,
      unkPos,
      unkMap,
      unkChar,
      unkCompat,
      unkInvoke
    ] = await Promise.all(fileNames.map((name) => withGzipFallback(name)));
    const asBytes = (data) => new Uint8Array(data);
    await this.dic.loadTrie(new Int32Array(base), new Int32Array(check));
    this.dic.loadTokenInfoDictionaries(asBytes(tid), asBytes(tidPos), asBytes(tidMap));
    this.dic.loadConnectionCosts(new Int16Array(cc));
    this.dic.loadUnknownDictionaries(
      asBytes(unk),
      asBytes(unkPos),
      asBytes(unkMap),
      asBytes(unkChar),
      new Uint32Array(unkCompat),
      asBytes(unkInvoke)
    );
    return this.dic;
  }
};
|
|
1347
|
+
|
|
1348
|
+
// src/TokenizerBuilder.ts
|
|
1349
|
+
var TokenizerBuilder = class {
  /**
   * @param options optional settings; `options.dicPath` is the dictionary
   *   location and falls back to "dict/" when null or undefined
   */
  constructor(options = {}) {
    const requestedPath = options.dicPath;
    this.dicPath = requestedPath ?? "dict/";
  }

  /**
   * Build and return the tokenizer (async): loads the dictionaries from
   * `dicPath` and wraps them in a Tokenizer.
   */
  async build() {
    const dictionaryLoader = new DictionaryLoader(this.dicPath);
    const dictionaries = await dictionaryLoader.load();
    return new Tokenizer(dictionaries);
  }
};
|
|
1362
|
+
|
|
1363
|
+
// src/index.ts
|
|
1364
|
+
/**
 * Create a TokenizerBuilder for the given options (defaults to an empty
 * options object).
 */
function builder(options = {}) {
  const tokenizerBuilder = new TokenizerBuilder(options);
  return tokenizerBuilder;
}
|
|
1367
|
+
var index_default = { builder, TokenizerBuilder, Tokenizer, KoreanToken, POS_TAGS };
|
|
1368
|
+
export {
|
|
1369
|
+
KoreanToken,
|
|
1370
|
+
POS_TAGS,
|
|
1371
|
+
Tokenizer,
|
|
1372
|
+
TokenizerBuilder,
|
|
1373
|
+
builder,
|
|
1374
|
+
index_default as default
|
|
1375
|
+
};
|