kuromoji-ko 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +225 -0
- package/dict/base.dat.gz +0 -0
- package/dict/cc.dat.gz +0 -0
- package/dict/check.dat.gz +0 -0
- package/dict/tid.dat.gz +0 -0
- package/dict/tid_map.dat.gz +0 -0
- package/dict/tid_pos.dat.gz +0 -0
- package/dict/unk.dat.gz +0 -0
- package/dict/unk_char.dat.gz +0 -0
- package/dict/unk_compat.dat.gz +0 -0
- package/dict/unk_invoke.dat.gz +0 -0
- package/dict/unk_map.dat.gz +0 -0
- package/dict/unk_pos.dat.gz +0 -0
- package/dist/index.cjs +1416 -0
- package/dist/index.d.cts +352 -0
- package/dist/index.d.ts +352 -0
- package/dist/index.js +1375 -0
- package/package.json +63 -0
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
type ViterbiNodeType = 'KNOWN' | 'UNKNOWN' | 'BOS' | 'EOS';
|
|
2
|
+
/**
|
|
3
|
+
* ViterbiNode - a node in the Viterbi lattice
|
|
4
|
+
*/
|
|
5
|
+
declare class ViterbiNode {
|
|
6
|
+
name: number;
|
|
7
|
+
cost: number;
|
|
8
|
+
start_pos: number;
|
|
9
|
+
length: number;
|
|
10
|
+
left_id: number;
|
|
11
|
+
right_id: number;
|
|
12
|
+
prev: ViterbiNode | null;
|
|
13
|
+
surface_form: string;
|
|
14
|
+
shortest_cost: number;
|
|
15
|
+
type: ViterbiNodeType;
|
|
16
|
+
constructor(nodeName: number, nodeCost: number, startPos: number, length: number, type: ViterbiNodeType, leftId: number, rightId: number, surfaceForm: string);
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
/**
|
|
20
|
+
* ViterbiLattice - a word lattice for Viterbi algorithm
|
|
21
|
+
*/
|
|
22
|
+
declare class ViterbiLattice {
|
|
23
|
+
nodesEndAt: (ViterbiNode[] | null)[];
|
|
24
|
+
eosPos: number;
|
|
25
|
+
constructor();
|
|
26
|
+
/**
|
|
27
|
+
* Append node to the lattice
|
|
28
|
+
*/
|
|
29
|
+
append(node: ViterbiNode): void;
|
|
30
|
+
/**
|
|
31
|
+
* Append EOS (End of Sentence) node
|
|
32
|
+
*/
|
|
33
|
+
appendEos(): void;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/**
|
|
37
|
+
* Korean Token - represents a single morpheme from tokenization
|
|
38
|
+
*
|
|
39
|
+
* mecab-ko-dic format (8 features):
|
|
40
|
+
* 0: 품사 태그 (POS tag) - e.g., NNG, VV, JKS
|
|
41
|
+
* 1: 의미 부류 (semantic class) - e.g., 행위, 인물
|
|
42
|
+
* 2: 종성 유무 (final consonant) - T/F/*
|
|
43
|
+
* 3: 읽기 (reading) - pronunciation
|
|
44
|
+
* 4: 타입 (type) - Inflect/Compound/Preanalysis/*
|
|
45
|
+
* 5: 첫번째 품사 (first POS) - for compound words
|
|
46
|
+
* 6: 마지막 품사 (last POS) - for compound words
|
|
47
|
+
* 7: 표현 (expression) - decomposition of compounds
|
|
48
|
+
*/
|
|
49
|
+
declare const POS_TAGS: Record<string, string>;
|
|
50
|
+
interface KoreanTokenOptions {
|
|
51
|
+
word_id?: number;
|
|
52
|
+
word_type?: 'KNOWN' | 'UNKNOWN';
|
|
53
|
+
word_position?: number;
|
|
54
|
+
surface_form?: string;
|
|
55
|
+
pos?: string;
|
|
56
|
+
semantic_class?: string;
|
|
57
|
+
has_final_consonant?: string;
|
|
58
|
+
reading?: string;
|
|
59
|
+
type?: string;
|
|
60
|
+
first_pos?: string;
|
|
61
|
+
last_pos?: string;
|
|
62
|
+
expression?: string;
|
|
63
|
+
}
|
|
64
|
+
interface TokenPart {
|
|
65
|
+
surface: string;
|
|
66
|
+
pos: string;
|
|
67
|
+
}
|
|
68
|
+
declare class KoreanToken {
|
|
69
|
+
word_id: number;
|
|
70
|
+
word_type: 'KNOWN' | 'UNKNOWN';
|
|
71
|
+
word_position: number;
|
|
72
|
+
surface_form: string;
|
|
73
|
+
pos: string;
|
|
74
|
+
semantic_class: string;
|
|
75
|
+
has_final_consonant: string;
|
|
76
|
+
reading: string;
|
|
77
|
+
type: string;
|
|
78
|
+
first_pos: string;
|
|
79
|
+
last_pos: string;
|
|
80
|
+
expression: string;
|
|
81
|
+
constructor(options?: KoreanTokenOptions);
|
|
82
|
+
/**
|
|
83
|
+
* Get human-readable POS description
|
|
84
|
+
*/
|
|
85
|
+
get posDescription(): string;
|
|
86
|
+
/**
|
|
87
|
+
* Check if token ends with a consonant (받침)
|
|
88
|
+
*/
|
|
89
|
+
get hasBatchim(): boolean;
|
|
90
|
+
/**
|
|
91
|
+
* Check if this is a compound word
|
|
92
|
+
*/
|
|
93
|
+
get isCompound(): boolean;
|
|
94
|
+
/**
|
|
95
|
+
* Check if this is an inflected form
|
|
96
|
+
*/
|
|
97
|
+
get isInflected(): boolean;
|
|
98
|
+
/**
|
|
99
|
+
* Get the decomposed parts for compound/inflected words
|
|
100
|
+
*/
|
|
101
|
+
get parts(): TokenPart[];
|
|
102
|
+
/**
|
|
103
|
+
* Create token from features array
|
|
104
|
+
*/
|
|
105
|
+
static fromFeatures(surface: string, features: string[], wordId?: number, position?: number, wordType?: 'KNOWN' | 'UNKNOWN'): KoreanToken;
|
|
106
|
+
/**
|
|
107
|
+
* Convert to plain object
|
|
108
|
+
*/
|
|
109
|
+
toJSON(): Record<string, unknown>;
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
/**
|
|
113
|
+
* ByteBuffer - Utilities to manipulate byte sequences
|
|
114
|
+
*/
|
|
115
|
+
declare class ByteBuffer {
|
|
116
|
+
buffer: Uint8Array;
|
|
117
|
+
position: number;
|
|
118
|
+
constructor(arg?: number | Uint8Array | ArrayBuffer);
|
|
119
|
+
size(): number;
|
|
120
|
+
reallocate(): void;
|
|
121
|
+
shrink(): Uint8Array;
|
|
122
|
+
put(b: number): void;
|
|
123
|
+
get(index?: number): number;
|
|
124
|
+
putShort(num: number): void;
|
|
125
|
+
getShort(index?: number): number;
|
|
126
|
+
putInt(num: number): void;
|
|
127
|
+
getInt(index?: number): number;
|
|
128
|
+
readInt(): number;
|
|
129
|
+
putString(str: string): void;
|
|
130
|
+
getString(index?: number): string;
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
/**
|
|
134
|
+
* TokenInfoDictionary - dictionary for known tokens
|
|
135
|
+
*/
|
|
136
|
+
declare class TokenInfoDictionary {
|
|
137
|
+
dictionary: ByteBuffer;
|
|
138
|
+
targetMap: Record<number, number[]>;
|
|
139
|
+
posBuffer: ByteBuffer;
|
|
140
|
+
constructor();
|
|
141
|
+
/**
|
|
142
|
+
* Build dictionary from entries
|
|
143
|
+
* Entry format: [surface, left_id, right_id, word_cost, ...features]
|
|
144
|
+
*/
|
|
145
|
+
buildDictionary(entries: (string | number)[][]): Record<number, string>;
|
|
146
|
+
put(leftId: number, rightId: number, wordCost: number, surfaceForm: string, feature: string): number;
|
|
147
|
+
addMapping(source: number, target: number): void;
|
|
148
|
+
targetMapToBuffer(): Uint8Array;
|
|
149
|
+
loadDictionary(arrayBuffer: Uint8Array | ArrayBuffer): this;
|
|
150
|
+
loadPosVector(arrayBuffer: Uint8Array | ArrayBuffer): this;
|
|
151
|
+
loadTargetMap(arrayBuffer: Uint8Array | ArrayBuffer): this;
|
|
152
|
+
/**
|
|
153
|
+
* Look up features in the dictionary
|
|
154
|
+
*/
|
|
155
|
+
getFeatures(tokenInfoIdStr: string | number): string;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
/**
|
|
159
|
+
* ConnectionCosts - connection costs matrix from cc.dat file
|
|
160
|
+
* Two-dimensional matrix: [forward_id][backward_id] -> cost
|
|
161
|
+
*/
|
|
162
|
+
declare class ConnectionCosts {
|
|
163
|
+
forwardDimension: number;
|
|
164
|
+
backwardDimension: number;
|
|
165
|
+
buffer: Int16Array;
|
|
166
|
+
constructor(forwardDimension: number, backwardDimension: number);
|
|
167
|
+
put(forwardId: number, backwardId: number, cost: number): void;
|
|
168
|
+
get(forwardId: number, backwardId: number): number;
|
|
169
|
+
loadConnectionCosts(connectionCostsBuffer: Int16Array): void;
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
/**
|
|
173
|
+
* CharacterClass - represents a character category for unknown word processing
|
|
174
|
+
*/
|
|
175
|
+
declare class CharacterClass {
|
|
176
|
+
class_id: number;
|
|
177
|
+
class_name: string;
|
|
178
|
+
is_always_invoke: boolean | number;
|
|
179
|
+
is_grouping: boolean | number;
|
|
180
|
+
max_length: number;
|
|
181
|
+
constructor(classId: number, className: string, isAlwaysInvoke: boolean | number, isGrouping: boolean | number, maxLength: number);
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
/**
|
|
185
|
+
* InvokeDefinitionMap - represents the invoke-definition part of char.def
|
|
186
|
+
*/
|
|
187
|
+
declare class InvokeDefinitionMap {
|
|
188
|
+
map: CharacterClass[];
|
|
189
|
+
lookupTable: Record<string, number>;
|
|
190
|
+
constructor();
|
|
191
|
+
/**
|
|
192
|
+
* Load InvokeDefinitionMap from buffer
|
|
193
|
+
*/
|
|
194
|
+
static load(invokeDefBuffer: Uint8Array): InvokeDefinitionMap;
|
|
195
|
+
/**
|
|
196
|
+
* Initialize with character category definitions
|
|
197
|
+
*/
|
|
198
|
+
init(characterCategoryDefinition: CharacterClass[] | null): void;
|
|
199
|
+
/**
|
|
200
|
+
* Get class information by class ID
|
|
201
|
+
*/
|
|
202
|
+
getCharacterClass(classId: number): CharacterClass | undefined;
|
|
203
|
+
/**
|
|
204
|
+
* Look up a class ID by class name
|
|
205
|
+
*/
|
|
206
|
+
lookup(className: string): number | null;
|
|
207
|
+
/**
|
|
208
|
+
* Transform from map to binary buffer
|
|
209
|
+
*/
|
|
210
|
+
toBuffer(): Uint8Array;
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
interface CategoryMapping {
|
|
214
|
+
start: number;
|
|
215
|
+
end?: number;
|
|
216
|
+
default: string;
|
|
217
|
+
compatible: string[];
|
|
218
|
+
}
|
|
219
|
+
/**
|
|
220
|
+
* CharacterDefinition - represents char.def file and
|
|
221
|
+
* defines behavior of unknown word processing
|
|
222
|
+
*/
|
|
223
|
+
declare class CharacterDefinition {
|
|
224
|
+
characterCategoryMap: Uint8Array;
|
|
225
|
+
compatibleCategoryMap: Uint32Array;
|
|
226
|
+
invokeDefinitionMap: InvokeDefinitionMap | null;
|
|
227
|
+
constructor();
|
|
228
|
+
/**
|
|
229
|
+
* Load CharacterDefinition from buffers
|
|
230
|
+
*/
|
|
231
|
+
static load(catMapBuffer: Uint8Array, compatCatMapBuffer: Uint32Array, invokeDefBuffer: Uint8Array): CharacterDefinition;
|
|
232
|
+
static parseCharCategory(classId: number, parsedCategoryDef: string[]): CharacterClass | null;
|
|
233
|
+
static parseCategoryMapping(parsedCategoryMapping: string[]): CategoryMapping;
|
|
234
|
+
static parseRangeCategoryMapping(parsedCategoryMapping: string[]): CategoryMapping;
|
|
235
|
+
/**
|
|
236
|
+
* Initialize category mappings
|
|
237
|
+
*/
|
|
238
|
+
initCategoryMappings(categoryMapping: CategoryMapping[] | null): void;
|
|
239
|
+
/**
|
|
240
|
+
* Look up the compatible categories for a character (excluding the first category)
|
|
241
|
+
*/
|
|
242
|
+
lookupCompatibleCategory(ch: string): CharacterClass[];
|
|
243
|
+
/**
|
|
244
|
+
* Look up the category for a character
|
|
245
|
+
*/
|
|
246
|
+
lookup(ch: string): CharacterClass | undefined;
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
/**
|
|
250
|
+
* UnknownDictionary - dictionary for unknown words
|
|
251
|
+
*/
|
|
252
|
+
declare class UnknownDictionary extends TokenInfoDictionary {
|
|
253
|
+
characterDefinition: CharacterDefinition | null;
|
|
254
|
+
constructor();
|
|
255
|
+
setCharacterDefinition(characterDefinition: CharacterDefinition): this;
|
|
256
|
+
lookup(ch: string): CharacterClass | undefined;
|
|
257
|
+
lookupCompatibleCategory(ch: string): CharacterClass[];
|
|
258
|
+
loadUnknownDictionaries(unkBuffer: Uint8Array, unkPosBuffer: Uint8Array, unkMapBuffer: Uint8Array, catMapBuffer: Uint8Array, compatCatMapBuffer: Uint32Array, invokeDefBuffer: Uint8Array): void;
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
interface DoubleArrayTrie {
|
|
262
|
+
commonPrefixSearch(key: string): Array<{
|
|
263
|
+
k: string;
|
|
264
|
+
v: number;
|
|
265
|
+
}>;
|
|
266
|
+
}
|
|
267
|
+
/**
|
|
268
|
+
* DynamicDictionaries - container for all dictionaries used by Tokenizer
|
|
269
|
+
*/
|
|
270
|
+
declare class DynamicDictionaries {
|
|
271
|
+
trie: DoubleArrayTrie;
|
|
272
|
+
tokenInfoDictionary: TokenInfoDictionary;
|
|
273
|
+
connectionCosts: ConnectionCosts;
|
|
274
|
+
unknownDictionary: UnknownDictionary;
|
|
275
|
+
constructor(trie?: DoubleArrayTrie | null, tokenInfoDictionary?: TokenInfoDictionary | null, connectionCosts?: ConnectionCosts | null, unknownDictionary?: UnknownDictionary | null);
|
|
276
|
+
loadTrie(baseBuffer: Int32Array, checkBuffer: Int32Array): Promise<this>;
|
|
277
|
+
loadTokenInfoDictionaries(tokenInfoBuffer: Uint8Array, posBuffer: Uint8Array, targetMapBuffer: Uint8Array): this;
|
|
278
|
+
loadConnectionCosts(ccBuffer: Int16Array): this;
|
|
279
|
+
loadUnknownDictionaries(unkBuffer: Uint8Array, unkPosBuffer: Uint8Array, unkMapBuffer: Uint8Array, catMapBuffer: Uint8Array, compatCatMapBuffer: Uint32Array, invokeDefBuffer: Uint8Array): this;
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
/**
|
|
283
|
+
* Tokenizer - Korean morphological analyzer
|
|
284
|
+
*/
|
|
285
|
+
declare class Tokenizer {
|
|
286
|
+
private tokenInfoDictionary;
|
|
287
|
+
private unknownDictionary;
|
|
288
|
+
private viterbiBuilder;
|
|
289
|
+
private viterbiSearcher;
|
|
290
|
+
private formatter;
|
|
291
|
+
constructor(dic: DynamicDictionaries);
|
|
292
|
+
/**
|
|
293
|
+
* Split text by sentence-ending punctuation
|
|
294
|
+
*/
|
|
295
|
+
static splitByPunctuation(input: string): string[];
|
|
296
|
+
/**
|
|
297
|
+
* Tokenize text into morphemes
|
|
298
|
+
*/
|
|
299
|
+
tokenize(text: string): KoreanToken[];
|
|
300
|
+
/**
|
|
301
|
+
* Tokenize a single sentence
|
|
302
|
+
*/
|
|
303
|
+
tokenizeForSentence(sentence: string, tokens?: KoreanToken[]): KoreanToken[];
|
|
304
|
+
/**
|
|
305
|
+
* Get just the surface forms as an array (wakachi-gaki)
|
|
306
|
+
*/
|
|
307
|
+
wakati(text: string): string[];
|
|
308
|
+
/**
|
|
309
|
+
* Get space-separated surface forms
|
|
310
|
+
*/
|
|
311
|
+
wakatiString(text: string): string;
|
|
312
|
+
/**
|
|
313
|
+
* Build word lattice for analysis
|
|
314
|
+
*/
|
|
315
|
+
getLattice(text: string): ViterbiLattice;
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
interface TokenizerBuilderOptions {
|
|
319
|
+
dicPath?: string;
|
|
320
|
+
}
|
|
321
|
+
/**
|
|
322
|
+
* TokenizerBuilder - builds a Tokenizer with loaded dictionaries
|
|
323
|
+
*/
|
|
324
|
+
declare class TokenizerBuilder {
|
|
325
|
+
private dicPath;
|
|
326
|
+
constructor(options?: TokenizerBuilderOptions);
|
|
327
|
+
/**
|
|
328
|
+
* Build and return the tokenizer (async)
|
|
329
|
+
*/
|
|
330
|
+
build(): Promise<Tokenizer>;
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
/**
|
|
334
|
+
* kuromoji-ko - Pure TypeScript Korean Morphological Analyzer
|
|
335
|
+
*
|
|
336
|
+
* A port of kuromoji.js adapted for Korean language processing using mecab-ko-dic.
|
|
337
|
+
*/
|
|
338
|
+
|
|
339
|
+
/**
|
|
340
|
+
* Create a tokenizer builder
|
|
341
|
+
*/
|
|
342
|
+
declare function builder(options?: TokenizerBuilderOptions): TokenizerBuilder;
|
|
343
|
+
|
|
344
|
+
declare const _default: {
|
|
345
|
+
builder: typeof builder;
|
|
346
|
+
TokenizerBuilder: typeof TokenizerBuilder;
|
|
347
|
+
Tokenizer: typeof Tokenizer;
|
|
348
|
+
KoreanToken: typeof KoreanToken;
|
|
349
|
+
POS_TAGS: Record<string, string>;
|
|
350
|
+
};
|
|
351
|
+
|
|
352
|
+
export { KoreanToken, POS_TAGS, Tokenizer, TokenizerBuilder, type TokenizerBuilderOptions, builder, _default as default };
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
type ViterbiNodeType = 'KNOWN' | 'UNKNOWN' | 'BOS' | 'EOS';
|
|
2
|
+
/**
|
|
3
|
+
* ViterbiNode - a node in the Viterbi lattice
|
|
4
|
+
*/
|
|
5
|
+
declare class ViterbiNode {
|
|
6
|
+
name: number;
|
|
7
|
+
cost: number;
|
|
8
|
+
start_pos: number;
|
|
9
|
+
length: number;
|
|
10
|
+
left_id: number;
|
|
11
|
+
right_id: number;
|
|
12
|
+
prev: ViterbiNode | null;
|
|
13
|
+
surface_form: string;
|
|
14
|
+
shortest_cost: number;
|
|
15
|
+
type: ViterbiNodeType;
|
|
16
|
+
constructor(nodeName: number, nodeCost: number, startPos: number, length: number, type: ViterbiNodeType, leftId: number, rightId: number, surfaceForm: string);
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
/**
|
|
20
|
+
* ViterbiLattice - a word lattice for Viterbi algorithm
|
|
21
|
+
*/
|
|
22
|
+
declare class ViterbiLattice {
|
|
23
|
+
nodesEndAt: (ViterbiNode[] | null)[];
|
|
24
|
+
eosPos: number;
|
|
25
|
+
constructor();
|
|
26
|
+
/**
|
|
27
|
+
* Append node to the lattice
|
|
28
|
+
*/
|
|
29
|
+
append(node: ViterbiNode): void;
|
|
30
|
+
/**
|
|
31
|
+
* Append EOS (End of Sentence) node
|
|
32
|
+
*/
|
|
33
|
+
appendEos(): void;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/**
|
|
37
|
+
* Korean Token - represents a single morpheme from tokenization
|
|
38
|
+
*
|
|
39
|
+
* mecab-ko-dic format (8 features):
|
|
40
|
+
* 0: 품사 태그 (POS tag) - e.g., NNG, VV, JKS
|
|
41
|
+
* 1: 의미 부류 (semantic class) - e.g., 행위, 인물
|
|
42
|
+
* 2: 종성 유무 (final consonant) - T/F/*
|
|
43
|
+
* 3: 읽기 (reading) - pronunciation
|
|
44
|
+
* 4: 타입 (type) - Inflect/Compound/Preanalysis/*
|
|
45
|
+
* 5: 첫번째 품사 (first POS) - for compound words
|
|
46
|
+
* 6: 마지막 품사 (last POS) - for compound words
|
|
47
|
+
* 7: 표현 (expression) - decomposition of compounds
|
|
48
|
+
*/
|
|
49
|
+
declare const POS_TAGS: Record<string, string>;
|
|
50
|
+
interface KoreanTokenOptions {
|
|
51
|
+
word_id?: number;
|
|
52
|
+
word_type?: 'KNOWN' | 'UNKNOWN';
|
|
53
|
+
word_position?: number;
|
|
54
|
+
surface_form?: string;
|
|
55
|
+
pos?: string;
|
|
56
|
+
semantic_class?: string;
|
|
57
|
+
has_final_consonant?: string;
|
|
58
|
+
reading?: string;
|
|
59
|
+
type?: string;
|
|
60
|
+
first_pos?: string;
|
|
61
|
+
last_pos?: string;
|
|
62
|
+
expression?: string;
|
|
63
|
+
}
|
|
64
|
+
interface TokenPart {
|
|
65
|
+
surface: string;
|
|
66
|
+
pos: string;
|
|
67
|
+
}
|
|
68
|
+
declare class KoreanToken {
|
|
69
|
+
word_id: number;
|
|
70
|
+
word_type: 'KNOWN' | 'UNKNOWN';
|
|
71
|
+
word_position: number;
|
|
72
|
+
surface_form: string;
|
|
73
|
+
pos: string;
|
|
74
|
+
semantic_class: string;
|
|
75
|
+
has_final_consonant: string;
|
|
76
|
+
reading: string;
|
|
77
|
+
type: string;
|
|
78
|
+
first_pos: string;
|
|
79
|
+
last_pos: string;
|
|
80
|
+
expression: string;
|
|
81
|
+
constructor(options?: KoreanTokenOptions);
|
|
82
|
+
/**
|
|
83
|
+
* Get human-readable POS description
|
|
84
|
+
*/
|
|
85
|
+
get posDescription(): string;
|
|
86
|
+
/**
|
|
87
|
+
* Check if token ends with a consonant (받침)
|
|
88
|
+
*/
|
|
89
|
+
get hasBatchim(): boolean;
|
|
90
|
+
/**
|
|
91
|
+
* Check if this is a compound word
|
|
92
|
+
*/
|
|
93
|
+
get isCompound(): boolean;
|
|
94
|
+
/**
|
|
95
|
+
* Check if this is an inflected form
|
|
96
|
+
*/
|
|
97
|
+
get isInflected(): boolean;
|
|
98
|
+
/**
|
|
99
|
+
* Get the decomposed parts for compound/inflected words
|
|
100
|
+
*/
|
|
101
|
+
get parts(): TokenPart[];
|
|
102
|
+
/**
|
|
103
|
+
* Create token from features array
|
|
104
|
+
*/
|
|
105
|
+
static fromFeatures(surface: string, features: string[], wordId?: number, position?: number, wordType?: 'KNOWN' | 'UNKNOWN'): KoreanToken;
|
|
106
|
+
/**
|
|
107
|
+
* Convert to plain object
|
|
108
|
+
*/
|
|
109
|
+
toJSON(): Record<string, unknown>;
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
/**
|
|
113
|
+
* ByteBuffer - Utilities to manipulate byte sequences
|
|
114
|
+
*/
|
|
115
|
+
declare class ByteBuffer {
|
|
116
|
+
buffer: Uint8Array;
|
|
117
|
+
position: number;
|
|
118
|
+
constructor(arg?: number | Uint8Array | ArrayBuffer);
|
|
119
|
+
size(): number;
|
|
120
|
+
reallocate(): void;
|
|
121
|
+
shrink(): Uint8Array;
|
|
122
|
+
put(b: number): void;
|
|
123
|
+
get(index?: number): number;
|
|
124
|
+
putShort(num: number): void;
|
|
125
|
+
getShort(index?: number): number;
|
|
126
|
+
putInt(num: number): void;
|
|
127
|
+
getInt(index?: number): number;
|
|
128
|
+
readInt(): number;
|
|
129
|
+
putString(str: string): void;
|
|
130
|
+
getString(index?: number): string;
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
/**
|
|
134
|
+
* TokenInfoDictionary - dictionary for known tokens
|
|
135
|
+
*/
|
|
136
|
+
declare class TokenInfoDictionary {
|
|
137
|
+
dictionary: ByteBuffer;
|
|
138
|
+
targetMap: Record<number, number[]>;
|
|
139
|
+
posBuffer: ByteBuffer;
|
|
140
|
+
constructor();
|
|
141
|
+
/**
|
|
142
|
+
* Build dictionary from entries
|
|
143
|
+
* Entry format: [surface, left_id, right_id, word_cost, ...features]
|
|
144
|
+
*/
|
|
145
|
+
buildDictionary(entries: (string | number)[][]): Record<number, string>;
|
|
146
|
+
put(leftId: number, rightId: number, wordCost: number, surfaceForm: string, feature: string): number;
|
|
147
|
+
addMapping(source: number, target: number): void;
|
|
148
|
+
targetMapToBuffer(): Uint8Array;
|
|
149
|
+
loadDictionary(arrayBuffer: Uint8Array | ArrayBuffer): this;
|
|
150
|
+
loadPosVector(arrayBuffer: Uint8Array | ArrayBuffer): this;
|
|
151
|
+
loadTargetMap(arrayBuffer: Uint8Array | ArrayBuffer): this;
|
|
152
|
+
/**
|
|
153
|
+
* Look up features in the dictionary
|
|
154
|
+
*/
|
|
155
|
+
getFeatures(tokenInfoIdStr: string | number): string;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
/**
|
|
159
|
+
* ConnectionCosts - connection costs matrix from cc.dat file
|
|
160
|
+
* Two-dimensional matrix: [forward_id][backward_id] -> cost
|
|
161
|
+
*/
|
|
162
|
+
declare class ConnectionCosts {
|
|
163
|
+
forwardDimension: number;
|
|
164
|
+
backwardDimension: number;
|
|
165
|
+
buffer: Int16Array;
|
|
166
|
+
constructor(forwardDimension: number, backwardDimension: number);
|
|
167
|
+
put(forwardId: number, backwardId: number, cost: number): void;
|
|
168
|
+
get(forwardId: number, backwardId: number): number;
|
|
169
|
+
loadConnectionCosts(connectionCostsBuffer: Int16Array): void;
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
/**
|
|
173
|
+
* CharacterClass - represents a character category for unknown word processing
|
|
174
|
+
*/
|
|
175
|
+
declare class CharacterClass {
|
|
176
|
+
class_id: number;
|
|
177
|
+
class_name: string;
|
|
178
|
+
is_always_invoke: boolean | number;
|
|
179
|
+
is_grouping: boolean | number;
|
|
180
|
+
max_length: number;
|
|
181
|
+
constructor(classId: number, className: string, isAlwaysInvoke: boolean | number, isGrouping: boolean | number, maxLength: number);
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
/**
|
|
185
|
+
* InvokeDefinitionMap - represents the invoke-definition part of char.def
|
|
186
|
+
*/
|
|
187
|
+
declare class InvokeDefinitionMap {
|
|
188
|
+
map: CharacterClass[];
|
|
189
|
+
lookupTable: Record<string, number>;
|
|
190
|
+
constructor();
|
|
191
|
+
/**
|
|
192
|
+
* Load InvokeDefinitionMap from buffer
|
|
193
|
+
*/
|
|
194
|
+
static load(invokeDefBuffer: Uint8Array): InvokeDefinitionMap;
|
|
195
|
+
/**
|
|
196
|
+
* Initialize with character category definitions
|
|
197
|
+
*/
|
|
198
|
+
init(characterCategoryDefinition: CharacterClass[] | null): void;
|
|
199
|
+
/**
|
|
200
|
+
* Get class information by class ID
|
|
201
|
+
*/
|
|
202
|
+
getCharacterClass(classId: number): CharacterClass | undefined;
|
|
203
|
+
/**
|
|
204
|
+
* Look up a class ID by class name
|
|
205
|
+
*/
|
|
206
|
+
lookup(className: string): number | null;
|
|
207
|
+
/**
|
|
208
|
+
* Transform from map to binary buffer
|
|
209
|
+
*/
|
|
210
|
+
toBuffer(): Uint8Array;
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
interface CategoryMapping {
|
|
214
|
+
start: number;
|
|
215
|
+
end?: number;
|
|
216
|
+
default: string;
|
|
217
|
+
compatible: string[];
|
|
218
|
+
}
|
|
219
|
+
/**
|
|
220
|
+
* CharacterDefinition - represents char.def file and
|
|
221
|
+
* defines behavior of unknown word processing
|
|
222
|
+
*/
|
|
223
|
+
declare class CharacterDefinition {
|
|
224
|
+
characterCategoryMap: Uint8Array;
|
|
225
|
+
compatibleCategoryMap: Uint32Array;
|
|
226
|
+
invokeDefinitionMap: InvokeDefinitionMap | null;
|
|
227
|
+
constructor();
|
|
228
|
+
/**
|
|
229
|
+
* Load CharacterDefinition from buffers
|
|
230
|
+
*/
|
|
231
|
+
static load(catMapBuffer: Uint8Array, compatCatMapBuffer: Uint32Array, invokeDefBuffer: Uint8Array): CharacterDefinition;
|
|
232
|
+
static parseCharCategory(classId: number, parsedCategoryDef: string[]): CharacterClass | null;
|
|
233
|
+
static parseCategoryMapping(parsedCategoryMapping: string[]): CategoryMapping;
|
|
234
|
+
static parseRangeCategoryMapping(parsedCategoryMapping: string[]): CategoryMapping;
|
|
235
|
+
/**
|
|
236
|
+
* Initialize category mappings
|
|
237
|
+
*/
|
|
238
|
+
initCategoryMappings(categoryMapping: CategoryMapping[] | null): void;
|
|
239
|
+
/**
|
|
240
|
+
* Look up the compatible categories for a character (excluding the first category)
|
|
241
|
+
*/
|
|
242
|
+
lookupCompatibleCategory(ch: string): CharacterClass[];
|
|
243
|
+
/**
|
|
244
|
+
* Look up the category for a character
|
|
245
|
+
*/
|
|
246
|
+
lookup(ch: string): CharacterClass | undefined;
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
/**
|
|
250
|
+
* UnknownDictionary - dictionary for unknown words
|
|
251
|
+
*/
|
|
252
|
+
declare class UnknownDictionary extends TokenInfoDictionary {
|
|
253
|
+
characterDefinition: CharacterDefinition | null;
|
|
254
|
+
constructor();
|
|
255
|
+
setCharacterDefinition(characterDefinition: CharacterDefinition): this;
|
|
256
|
+
lookup(ch: string): CharacterClass | undefined;
|
|
257
|
+
lookupCompatibleCategory(ch: string): CharacterClass[];
|
|
258
|
+
loadUnknownDictionaries(unkBuffer: Uint8Array, unkPosBuffer: Uint8Array, unkMapBuffer: Uint8Array, catMapBuffer: Uint8Array, compatCatMapBuffer: Uint32Array, invokeDefBuffer: Uint8Array): void;
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
interface DoubleArrayTrie {
|
|
262
|
+
commonPrefixSearch(key: string): Array<{
|
|
263
|
+
k: string;
|
|
264
|
+
v: number;
|
|
265
|
+
}>;
|
|
266
|
+
}
|
|
267
|
+
/**
|
|
268
|
+
* DynamicDictionaries - container for all dictionaries used by Tokenizer
|
|
269
|
+
*/
|
|
270
|
+
declare class DynamicDictionaries {
|
|
271
|
+
trie: DoubleArrayTrie;
|
|
272
|
+
tokenInfoDictionary: TokenInfoDictionary;
|
|
273
|
+
connectionCosts: ConnectionCosts;
|
|
274
|
+
unknownDictionary: UnknownDictionary;
|
|
275
|
+
constructor(trie?: DoubleArrayTrie | null, tokenInfoDictionary?: TokenInfoDictionary | null, connectionCosts?: ConnectionCosts | null, unknownDictionary?: UnknownDictionary | null);
|
|
276
|
+
loadTrie(baseBuffer: Int32Array, checkBuffer: Int32Array): Promise<this>;
|
|
277
|
+
loadTokenInfoDictionaries(tokenInfoBuffer: Uint8Array, posBuffer: Uint8Array, targetMapBuffer: Uint8Array): this;
|
|
278
|
+
loadConnectionCosts(ccBuffer: Int16Array): this;
|
|
279
|
+
loadUnknownDictionaries(unkBuffer: Uint8Array, unkPosBuffer: Uint8Array, unkMapBuffer: Uint8Array, catMapBuffer: Uint8Array, compatCatMapBuffer: Uint32Array, invokeDefBuffer: Uint8Array): this;
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
/**
|
|
283
|
+
* Tokenizer - Korean morphological analyzer
|
|
284
|
+
*/
|
|
285
|
+
declare class Tokenizer {
|
|
286
|
+
private tokenInfoDictionary;
|
|
287
|
+
private unknownDictionary;
|
|
288
|
+
private viterbiBuilder;
|
|
289
|
+
private viterbiSearcher;
|
|
290
|
+
private formatter;
|
|
291
|
+
constructor(dic: DynamicDictionaries);
|
|
292
|
+
/**
|
|
293
|
+
* Split text by sentence-ending punctuation
|
|
294
|
+
*/
|
|
295
|
+
static splitByPunctuation(input: string): string[];
|
|
296
|
+
/**
|
|
297
|
+
* Tokenize text into morphemes
|
|
298
|
+
*/
|
|
299
|
+
tokenize(text: string): KoreanToken[];
|
|
300
|
+
/**
|
|
301
|
+
* Tokenize a single sentence
|
|
302
|
+
*/
|
|
303
|
+
tokenizeForSentence(sentence: string, tokens?: KoreanToken[]): KoreanToken[];
|
|
304
|
+
/**
|
|
305
|
+
* Get just the surface forms as an array (wakachi-gaki)
|
|
306
|
+
*/
|
|
307
|
+
wakati(text: string): string[];
|
|
308
|
+
/**
|
|
309
|
+
* Get space-separated surface forms
|
|
310
|
+
*/
|
|
311
|
+
wakatiString(text: string): string;
|
|
312
|
+
/**
|
|
313
|
+
* Build word lattice for analysis
|
|
314
|
+
*/
|
|
315
|
+
getLattice(text: string): ViterbiLattice;
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
interface TokenizerBuilderOptions {
|
|
319
|
+
dicPath?: string;
|
|
320
|
+
}
|
|
321
|
+
/**
|
|
322
|
+
* TokenizerBuilder - builds a Tokenizer with loaded dictionaries
|
|
323
|
+
*/
|
|
324
|
+
declare class TokenizerBuilder {
|
|
325
|
+
private dicPath;
|
|
326
|
+
constructor(options?: TokenizerBuilderOptions);
|
|
327
|
+
/**
|
|
328
|
+
* Build and return the tokenizer (async)
|
|
329
|
+
*/
|
|
330
|
+
build(): Promise<Tokenizer>;
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
/**
|
|
334
|
+
* kuromoji-ko - Pure TypeScript Korean Morphological Analyzer
|
|
335
|
+
*
|
|
336
|
+
* A port of kuromoji.js adapted for Korean language processing using mecab-ko-dic.
|
|
337
|
+
*/
|
|
338
|
+
|
|
339
|
+
/**
|
|
340
|
+
* Create a tokenizer builder
|
|
341
|
+
*/
|
|
342
|
+
declare function builder(options?: TokenizerBuilderOptions): TokenizerBuilder;
|
|
343
|
+
|
|
344
|
+
declare const _default: {
|
|
345
|
+
builder: typeof builder;
|
|
346
|
+
TokenizerBuilder: typeof TokenizerBuilder;
|
|
347
|
+
Tokenizer: typeof Tokenizer;
|
|
348
|
+
KoreanToken: typeof KoreanToken;
|
|
349
|
+
POS_TAGS: Record<string, string>;
|
|
350
|
+
};
|
|
351
|
+
|
|
352
|
+
export { KoreanToken, POS_TAGS, Tokenizer, TokenizerBuilder, type TokenizerBuilderOptions, builder, _default as default };
|