kuromoji-ko 1.0.0

This diff shows the content of a publicly available package version released to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry. The single hunk below adds the package's generated TypeScript declaration file in full (352 lines, new in 1.0.0).
@@ -0,0 +1,352 @@
+ type ViterbiNodeType = 'KNOWN' | 'UNKNOWN' | 'BOS' | 'EOS';
+ /**
+  * ViterbiNode - a node in the Viterbi lattice
+  */
+ declare class ViterbiNode {
+     name: number;
+     cost: number;
+     start_pos: number;
+     length: number;
+     left_id: number;
+     right_id: number;
+     prev: ViterbiNode | null;
+     surface_form: string;
+     shortest_cost: number;
+     type: ViterbiNodeType;
+     constructor(nodeName: number, nodeCost: number, startPos: number, length: number, type: ViterbiNodeType, leftId: number, rightId: number, surfaceForm: string);
+ }
+
+ /**
+  * ViterbiLattice - a word lattice for Viterbi algorithm
+  */
+ declare class ViterbiLattice {
+     nodesEndAt: (ViterbiNode[] | null)[];
+     eosPos: number;
+     constructor();
+     /**
+      * Append node to the lattice
+      */
+     append(node: ViterbiNode): void;
+     /**
+      * Append EOS (End of Sentence) node
+      */
+     appendEos(): void;
+ }
+
+ /**
+  * Korean Token - represents a single morpheme from tokenization
+  *
+  * mecab-ko-dic format (8 features):
+  * 0: 품사 태그 (POS tag) - e.g., NNG, VV, JKS
+  * 1: 의미 부류 (semantic class) - e.g., 행위, 인물
+  * 2: 종성 유무 (final consonant) - T/F/*
+  * 3: 읽기 (reading) - pronunciation
+  * 4: 타입 (type) - Inflect/Compound/Preanalysis/*
+  * 5: 첫번째 품사 (first POS) - for compound words
+  * 6: 마지막 품사 (last POS) - for compound words
+  * 7: 표현 (expression) - decomposition of compounds
+  */
+ declare const POS_TAGS: Record<string, string>;
+ interface KoreanTokenOptions {
+     word_id?: number;
+     word_type?: 'KNOWN' | 'UNKNOWN';
+     word_position?: number;
+     surface_form?: string;
+     pos?: string;
+     semantic_class?: string;
+     has_final_consonant?: string;
+     reading?: string;
+     type?: string;
+     first_pos?: string;
+     last_pos?: string;
+     expression?: string;
+ }
+ interface TokenPart {
+     surface: string;
+     pos: string;
+ }
+ declare class KoreanToken {
+     word_id: number;
+     word_type: 'KNOWN' | 'UNKNOWN';
+     word_position: number;
+     surface_form: string;
+     pos: string;
+     semantic_class: string;
+     has_final_consonant: string;
+     reading: string;
+     type: string;
+     first_pos: string;
+     last_pos: string;
+     expression: string;
+     constructor(options?: KoreanTokenOptions);
+     /**
+      * Get human-readable POS description
+      */
+     get posDescription(): string;
+     /**
+      * Check if token ends with a consonant (받침)
+      */
+     get hasBatchim(): boolean;
+     /**
+      * Check if this is a compound word
+      */
+     get isCompound(): boolean;
+     /**
+      * Check if this is an inflected form
+      */
+     get isInflected(): boolean;
+     /**
+      * Get the decomposed parts for compound/inflected words
+      */
+     get parts(): TokenPart[];
+     /**
+      * Create token from features array
+      */
+     static fromFeatures(surface: string, features: string[], wordId?: number, position?: number, wordType?: 'KNOWN' | 'UNKNOWN'): KoreanToken;
+     /**
+      * Convert to plain object
+      */
+     toJSON(): Record<string, unknown>;
+ }
+
+ /**
+  * ByteBuffer - Utilities to manipulate byte sequences
+  */
+ declare class ByteBuffer {
+     buffer: Uint8Array;
+     position: number;
+     constructor(arg?: number | Uint8Array | ArrayBuffer);
+     size(): number;
+     reallocate(): void;
+     shrink(): Uint8Array;
+     put(b: number): void;
+     get(index?: number): number;
+     putShort(num: number): void;
+     getShort(index?: number): number;
+     putInt(num: number): void;
+     getInt(index?: number): number;
+     readInt(): number;
+     putString(str: string): void;
+     getString(index?: number): string;
+ }
+
+ /**
+  * TokenInfoDictionary - dictionary for known tokens
+  */
+ declare class TokenInfoDictionary {
+     dictionary: ByteBuffer;
+     targetMap: Record<number, number[]>;
+     posBuffer: ByteBuffer;
+     constructor();
+     /**
+      * Build dictionary from entries
+      * Entry format: [surface, left_id, right_id, word_cost, ...features]
+      */
+     buildDictionary(entries: (string | number)[][]): Record<number, string>;
+     put(leftId: number, rightId: number, wordCost: number, surfaceForm: string, feature: string): number;
+     addMapping(source: number, target: number): void;
+     targetMapToBuffer(): Uint8Array;
+     loadDictionary(arrayBuffer: Uint8Array | ArrayBuffer): this;
+     loadPosVector(arrayBuffer: Uint8Array | ArrayBuffer): this;
+     loadTargetMap(arrayBuffer: Uint8Array | ArrayBuffer): this;
+     /**
+      * Look up features in the dictionary
+      */
+     getFeatures(tokenInfoIdStr: string | number): string;
+ }
+
+ /**
+  * ConnectionCosts - connection costs matrix from cc.dat file
+  * 2 dimension matrix [forward_id][backward_id] -> cost
+  */
+ declare class ConnectionCosts {
+     forwardDimension: number;
+     backwardDimension: number;
+     buffer: Int16Array;
+     constructor(forwardDimension: number, backwardDimension: number);
+     put(forwardId: number, backwardId: number, cost: number): void;
+     get(forwardId: number, backwardId: number): number;
+     loadConnectionCosts(connectionCostsBuffer: Int16Array): void;
+ }
+
+ /**
+  * CharacterClass - represents a character category for unknown word processing
+  */
+ declare class CharacterClass {
+     class_id: number;
+     class_name: string;
+     is_always_invoke: boolean | number;
+     is_grouping: boolean | number;
+     max_length: number;
+     constructor(classId: number, className: string, isAlwaysInvoke: boolean | number, isGrouping: boolean | number, maxLength: number);
+ }
+
+ /**
+  * InvokeDefinitionMap - represents invoke definition part of char.def
+  */
+ declare class InvokeDefinitionMap {
+     map: CharacterClass[];
+     lookupTable: Record<string, number>;
+     constructor();
+     /**
+      * Load InvokeDefinitionMap from buffer
+      */
+     static load(invokeDefBuffer: Uint8Array): InvokeDefinitionMap;
+     /**
+      * Initialize with character category definitions
+      */
+     init(characterCategoryDefinition: CharacterClass[] | null): void;
+     /**
+      * Get class information by class ID
+      */
+     getCharacterClass(classId: number): CharacterClass | undefined;
+     /**
+      * Lookup class ID by class name
+      */
+     lookup(className: string): number | null;
+     /**
+      * Transform from map to binary buffer
+      */
+     toBuffer(): Uint8Array;
+ }
+
+ interface CategoryMapping {
+     start: number;
+     end?: number;
+     default: string;
+     compatible: string[];
+ }
+ /**
+  * CharacterDefinition - represents char.def file and
+  * defines behavior of unknown word processing
+  */
+ declare class CharacterDefinition {
+     characterCategoryMap: Uint8Array;
+     compatibleCategoryMap: Uint32Array;
+     invokeDefinitionMap: InvokeDefinitionMap | null;
+     constructor();
+     /**
+      * Load CharacterDefinition from buffers
+      */
+     static load(catMapBuffer: Uint8Array, compatCatMapBuffer: Uint32Array, invokeDefBuffer: Uint8Array): CharacterDefinition;
+     static parseCharCategory(classId: number, parsedCategoryDef: string[]): CharacterClass | null;
+     static parseCategoryMapping(parsedCategoryMapping: string[]): CategoryMapping;
+     static parseRangeCategoryMapping(parsedCategoryMapping: string[]): CategoryMapping;
+     /**
+      * Initialize category mappings
+      */
+     initCategoryMappings(categoryMapping: CategoryMapping[] | null): void;
+     /**
+      * Lookup compatible categories for a character (not included 1st category)
+      */
+     lookupCompatibleCategory(ch: string): CharacterClass[];
+     /**
+      * Lookup category for a character
+      */
+     lookup(ch: string): CharacterClass | undefined;
+ }
+
+ /**
+  * UnknownDictionary - dictionary for unknown words
+  */
+ declare class UnknownDictionary extends TokenInfoDictionary {
+     characterDefinition: CharacterDefinition | null;
+     constructor();
+     setCharacterDefinition(characterDefinition: CharacterDefinition): this;
+     lookup(ch: string): CharacterClass | undefined;
+     lookupCompatibleCategory(ch: string): CharacterClass[];
+     loadUnknownDictionaries(unkBuffer: Uint8Array, unkPosBuffer: Uint8Array, unkMapBuffer: Uint8Array, catMapBuffer: Uint8Array, compatCatMapBuffer: Uint32Array, invokeDefBuffer: Uint8Array): void;
+ }
+
+ interface DoubleArrayTrie {
+     commonPrefixSearch(key: string): Array<{
+         k: string;
+         v: number;
+     }>;
+ }
+ /**
+  * DynamicDictionaries - container for all dictionaries used by Tokenizer
+  */
+ declare class DynamicDictionaries {
+     trie: DoubleArrayTrie;
+     tokenInfoDictionary: TokenInfoDictionary;
+     connectionCosts: ConnectionCosts;
+     unknownDictionary: UnknownDictionary;
+     constructor(trie?: DoubleArrayTrie | null, tokenInfoDictionary?: TokenInfoDictionary | null, connectionCosts?: ConnectionCosts | null, unknownDictionary?: UnknownDictionary | null);
+     loadTrie(baseBuffer: Int32Array, checkBuffer: Int32Array): Promise<this>;
+     loadTokenInfoDictionaries(tokenInfoBuffer: Uint8Array, posBuffer: Uint8Array, targetMapBuffer: Uint8Array): this;
+     loadConnectionCosts(ccBuffer: Int16Array): this;
+     loadUnknownDictionaries(unkBuffer: Uint8Array, unkPosBuffer: Uint8Array, unkMapBuffer: Uint8Array, catMapBuffer: Uint8Array, compatCatMapBuffer: Uint32Array, invokeDefBuffer: Uint8Array): this;
+ }
+
+ /**
+  * Tokenizer - Korean morphological analyzer
+  */
+ declare class Tokenizer {
+     private tokenInfoDictionary;
+     private unknownDictionary;
+     private viterbiBuilder;
+     private viterbiSearcher;
+     private formatter;
+     constructor(dic: DynamicDictionaries);
+     /**
+      * Split text by sentence-ending punctuation
+      */
+     static splitByPunctuation(input: string): string[];
+     /**
+      * Tokenize text into morphemes
+      */
+     tokenize(text: string): KoreanToken[];
+     /**
+      * Tokenize a single sentence
+      */
+     tokenizeForSentence(sentence: string, tokens?: KoreanToken[]): KoreanToken[];
+     /**
+      * Get just the surface forms as an array (wakachi-gaki)
+      */
+     wakati(text: string): string[];
+     /**
+      * Get space-separated surface forms
+      */
+     wakatiString(text: string): string;
+     /**
+      * Build word lattice for analysis
+      */
+     getLattice(text: string): ViterbiLattice;
+ }
+
+ interface TokenizerBuilderOptions {
+     dicPath?: string;
+ }
+ /**
+  * TokenizerBuilder - builds a Tokenizer with loaded dictionaries
+  */
+ declare class TokenizerBuilder {
+     private dicPath;
+     constructor(options?: TokenizerBuilderOptions);
+     /**
+      * Build and return the tokenizer (async)
+      */
+     build(): Promise<Tokenizer>;
+ }
+
+ /**
+  * mecab-ko - Pure TypeScript Korean Morphological Analyzer
+  *
+  * A port of kuromoji.js adapted for Korean language processing using mecab-ko-dic.
+  */
+
+ /**
+  * Create a tokenizer builder
+  */
+ declare function builder(options?: TokenizerBuilderOptions): TokenizerBuilder;
+
+ declare const _default: {
+     builder: typeof builder;
+     TokenizerBuilder: typeof TokenizerBuilder;
+     Tokenizer: typeof Tokenizer;
+     KoreanToken: typeof KoreanToken;
+     POS_TAGS: Record<string, string>;
+ };
+
+ export { KoreanToken, POS_TAGS, Tokenizer, TokenizerBuilder, type TokenizerBuilderOptions, builder, _default as default };
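
For orientation, a minimal usage sketch inferred from the declarations above, not taken from the package's documentation: builder() returns a TokenizerBuilder, build() resolves to a Tokenizer, and tokenize() yields KoreanToken[]. The default-import style and the 'dict/' value for dicPath are assumptions; point dicPath at wherever the compiled mecab-ko-dic binaries actually live, and run this inside an async context or an ESM module with top-level await.

import kuromojiKo from 'kuromoji-ko';

// dicPath 'dict/' is hypothetical — it must name the directory holding
// the dictionary files the builder loads.
const tokenizer = await kuromojiKo.builder({ dicPath: 'dict/' }).build();

// Each token is a KoreanToken per the declarations above.
for (const token of tokenizer.tokenize('아버지가 방에 들어가신다')) {
    console.log(token.surface_form, token.pos, token.posDescription);
}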
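The 8-slot mecab-ko-dic feature layout documented on KoreanToken can be exercised directly through KoreanToken.fromFeatures. A sketch, assuming the features array follows that layout; the dictionary values for '서울대' are illustrative, not quoted from mecab-ko-dic, and the commented results are expectations, not observed output:

import { KoreanToken } from 'kuromoji-ko';

const token = KoreanToken.fromFeatures('서울대', [
    'NNP',                 // 0: POS tag
    '지명',                 // 1: semantic class (illustrative)
    'F',                   // 2: final consonant — '대' has no 받침
    '서울대',               // 3: reading
    'Compound',            // 4: type
    '*',                   // 5: first POS of the compound
    '*',                   // 6: last POS of the compound
    '서울/NNP/*+대/NNG/*',  // 7: expression (decomposition, illustrative)
]);

console.log(token.isCompound);  // presumably true, since type === 'Compound'
console.log(token.hasBatchim);  // presumably false, matching feature 2 === 'F'
console.log(token.parts);       // expected TokenPart[] shape:
                                // [{ surface: '서울', pos: 'NNP' }, { surface: '대', pos: 'NNG' }]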
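Judging by their signatures, wakati and wakatiString differ only in joining: one returns the surface forms as an array, the other as a single space-separated string. The segmentations in the comments below are guesses; actual output depends on the dictionary:

const tokenizer = await kuromojiKo.builder({ dicPath: 'dict/' }).build();

tokenizer.wakati('오늘 날씨가 좋다');        // e.g. ['오늘', '날씨', '가', '좋', '다']
tokenizer.wakatiString('오늘 날씨가 좋다');  // e.g. '오늘 날씨 가 좋 다'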