mecab-ko-wasm 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,315 @@
1
+ # mecab-ko-wasm
2
+
3
+ WebAssembly bindings for MeCab-Ko, a Korean morphological analyzer.
4
+
5
+ This package enables Korean morphological analysis in web browsers and Node.js environments through WebAssembly.
6
+
7
+ ## Features
8
+
9
+ - **Fast**: Compiled to WebAssembly for near-native performance
10
+ - **Lightweight**: No external dependencies required in the browser
11
+ - **Cross-platform**: Works in both browser and Node.js environments
12
+ - **Type-safe**: Full TypeScript type definitions included
13
+
14
+ ## Installation
15
+
16
+ ### Using npm
17
+
18
+ ```bash
19
+ npm install mecab-ko-wasm
20
+ ```
21
+
22
+ ### Using yarn
23
+
24
+ ```bash
25
+ yarn add mecab-ko-wasm
26
+ ```
27
+
28
+ ## Usage
29
+
30
+ ### Browser (ES Modules)
31
+
32
+ ```javascript
33
+ import init, { Mecab } from 'mecab-ko-wasm';
34
+
35
+ async function analyze() {
36
+ // Initialize the WASM module
37
+ await init();
38
+
39
+ // Create a Mecab instance
40
+ const mecab = new Mecab();
41
+
42
+ // Extract morphemes
43
+ const morphs = mecab.morphs("안녕하세요");
44
+ console.log(morphs); // ["안녕", "하", "세요"]
45
+
46
+ // Get part-of-speech tags
47
+ const posJson = mecab.pos("형태소 분석");
48
+ const pos = JSON.parse(posJson);
49
+ console.log(pos); // [["형태소", "NNG"], ["분석", "NNG"]]
50
+
51
+ // Get detailed token information
52
+ const tokens = mecab.tokenize("한국어 분석기");
53
+ tokens.forEach(token => {
54
+ console.log(`${token.surface}: ${token.pos}`);
55
+ });
56
+ }
57
+
58
+ analyze();
59
+ ```
60
+
61
+ ### Node.js
62
+
63
+ ```javascript
64
+ const { Mecab } = require('mecab-ko-wasm');
65
+
66
+ const mecab = new Mecab();
67
+
68
+ // Extract morphemes
69
+ const morphs = mecab.morphs("안녕하세요");
70
+ console.log(morphs); // ["안녕", "하", "세요"]
71
+
72
+ // Extract nouns
73
+ const nouns = mecab.nouns("형태소 분석기입니다");
74
+ console.log(nouns); // ["형태소", "분석기"]
75
+
76
+ // Wakati tokenization
77
+ const words = mecab.wakati("한국어 처리");
78
+ console.log(words); // ["한국어", "처리"]
79
+ ```
80
+
81
+ ### TypeScript
82
+
83
+ ```typescript
84
+ import init, { Mecab, WasmToken } from 'mecab-ko-wasm';
85
+
86
+ async function analyze(text: string): Promise<void> {
87
+ await init();
88
+
89
+ const mecab = new Mecab();
90
+
91
+ // Tokenize with full information
92
+ const tokens: WasmToken[] = mecab.tokenize(text);
93
+ tokens.forEach((token: WasmToken) => {
94
+ console.log({
95
+ surface: token.surface,
96
+ pos: token.pos,
97
+ start: token.start,
98
+ end: token.end,
99
+ });
100
+ });
101
+
102
+ // Extract morphemes
103
+ const morphs: string[] = mecab.morphs(text);
104
+ console.log('Morphemes:', morphs);
105
+ }
106
+
107
+ analyze("한국어 형태소 분석");
108
+ ```
109
+
110
+ ## API Reference
111
+
112
+ ### `Mecab`
113
+
114
+ The main class for Korean morphological analysis.
115
+
116
+ #### Constructor
117
+
118
+ ```typescript
119
+ new Mecab(): Mecab
120
+ ```
121
+
122
+ Creates a new Mecab instance with the default dictionary.
123
+
124
+ **Throws**: Error if initialization fails
125
+
126
+ #### Methods
127
+
128
+ ##### `tokenize(text: string): WasmToken[]`
129
+
130
+ Tokenizes the input text and returns detailed token information.
131
+
132
+ **Parameters:**
133
+ - `text`: Input text to analyze
134
+
135
+ **Returns**: Array of `WasmToken` objects containing surface form, POS tag, and position information
136
+
137
+ **Example:**
138
+ ```javascript
139
+ const tokens = mecab.tokenize("안녕하세요");
140
+ // [
141
+ // { surface: "안녕", pos: "NNG", start: 0, end: 6, ... },
142
+ // { surface: "하", pos: "XSV", start: 6, end: 9, ... },
143
+ // ...
144
+ // ]
145
+ ```
146
+
147
+ ##### `morphs(text: string): string[]`
148
+
149
+ Extracts morphemes from the input text.
150
+
151
+ **Parameters:**
152
+ - `text`: Input text to analyze
153
+
154
+ **Returns**: Array of morpheme strings
155
+
156
+ **Example:**
157
+ ```javascript
158
+ const morphs = mecab.morphs("안녕하세요");
159
+ // ["안녕", "하", "세요"]
160
+ ```
161
+
162
+ ##### `pos(text: string): string`
163
+
164
+ Extracts part-of-speech tagged pairs as a JSON string.
165
+
166
+ **Parameters:**
167
+ - `text`: Input text to analyze
168
+
169
+ **Returns**: JSON string containing an array of `[surface, pos]` pairs
170
+
171
+ **Example:**
172
+ ```javascript
173
+ const posJson = mecab.pos("안녕하세요");
174
+ const pos = JSON.parse(posJson);
175
+ // [["안녕", "NNG"], ["하", "XSV"], ["세요", "EP+EF"]]
176
+ ```
177
+
178
+ ##### `nouns(text: string): string[]`
179
+
180
+ Extracts only nouns from the input text.
181
+
182
+ **Parameters:**
183
+ - `text`: Input text to analyze
184
+
185
+ **Returns**: Array of noun strings
186
+
187
+ **Example:**
188
+ ```javascript
189
+ const nouns = mecab.nouns("형태소 분석기입니다");
190
+ // ["형태소", "분석기"]
191
+ ```
192
+
193
+ ##### `wakati(text: string): string[]`
194
+
195
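+ Performs wakati-style segmentation. The morphemes are returned as an array of strings (similar to `morphs()`), not as a single space-joined string.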
196
+
197
+ **Parameters:**
198
+ - `text`: Input text to analyze
199
+
200
+ **Returns**: Array of morpheme strings
201
+
202
+ **Example:**
203
+ ```javascript
204
+ const words = mecab.wakati("형태소 분석");
205
+ // ["형태소", "분석"]
206
+ ```
207
+
208
+ ### `WasmToken`
209
+
210
+ Represents a single token with detailed morphological information.
211
+
212
+ #### Properties
213
+
214
+ - `surface: string` - The surface form (표면형) of the token
215
+ - `pos: string` - Part-of-speech tag (품사 태그)
216
+ - `start: number` - Start position in bytes
217
+ - `end: number` - End position in bytes
218
+ - `reading: string | undefined` - Reading of the token (if available)
219
+ - `lemma: string | undefined` - Base form/lemma (if available)
220
+
221
+ #### Methods
222
+
223
+ ##### `toJSON(): string`
224
+
225
+ Converts the token to a JSON string.
226
+
227
+ **Returns**: JSON string representation of the token
228
+
229
+ ## Building from Source
230
+
231
+ ### Prerequisites
232
+
233
+ - Rust (1.75+)
234
+ - wasm-pack
235
+
236
+ ```bash
237
+ cargo install wasm-pack
238
+ ```
239
+
240
+ ### Build
241
+
242
+ ```bash
243
+ # Build for browser
244
+ wasm-pack build --target web
245
+
246
+ # Build for Node.js
247
+ wasm-pack build --target nodejs
248
+
249
+ # Build for bundlers (webpack, etc.)
250
+ wasm-pack build --target bundler
251
+ ```
252
+
253
+ ### Development
254
+
255
+ ```bash
256
+ # Run tests
257
+ wasm-pack test --node
258
+
259
+ # Run tests in browser (requires Chrome/Firefox)
260
+ wasm-pack test --headless --firefox
261
+ ```
262
+
263
+ ## Part-of-Speech Tags
264
+
265
+ MeCab-Ko uses the Sejong corpus POS tag set. Common tags include:
266
+
267
+ - `NNG`: General noun (일반 명사)
268
+ - `NNP`: Proper noun (고유 명사)
269
+ - `VV`: Verb (동사)
270
+ - `VA`: Adjective (형용사)
271
+ - `MAG`: General adverb (일반 부사)
272
+ - `JKS`: Subjective case particle (주격 조사)
273
+ - `JKO`: Objective case particle (목적격 조사)
274
+ - `EP`: Pre-final ending (선어말 어미)
275
+ - `EF`: Sentence-final ending (종결 어미)
276
+
277
+ For a complete list, see [Sejong POS Tags](https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY).
278
+
279
+ ## Performance
280
+
281
+ MeCab-Ko WASM provides near-native performance through WebAssembly compilation:
282
+
283
+ - **Tokenization**: ~1-2ms for typical sentences (10-20 words)
284
+ - **Memory**: ~2-5MB WASM module size (with dictionary)
285
+ - **Initialization**: ~10-50ms first load (cached afterwards)
286
+
287
+ ## Browser Compatibility
288
+
289
+ - Chrome/Edge 57+
290
+ - Firefox 52+
291
+ - Safari 11+
292
+ - Node.js 16+
293
+
294
+ ## License
295
+
296
+ Licensed under either of:
297
+
298
+ - Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
299
+ - MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
300
+
301
+ at your option.
302
+
303
+ ## Contributing
304
+
305
+ Contributions are welcome! Please see [CONTRIBUTING.md](../../CONTRIBUTING.md) for guidelines.
306
+
307
+ ## Related Projects
308
+
309
+ - [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko) - Original C++ implementation
310
+ - [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic) - Korean dictionary for MeCab
311
+ - [konlpy](https://konlpy.org/) - Python Korean NLP library
312
+
313
+ ## Acknowledgments
314
+
315
+ This project is based on MeCab-Ko, originally developed by the Eunjeon project.
package/index.d.ts ADDED
@@ -0,0 +1,90 @@
1
+ /**
2
+ * MeCab-Ko WebAssembly TypeScript Definitions
3
+ * Korean morphological analyzer for browser and Node.js
4
+ */
5
+
6
/**
 * One morpheme produced by the tokenizer, together with its tag and its
 * location in the analyzed text.
 */
export interface Token {
  /** Surface form: the morpheme exactly as it appears in the input. */
  readonly surface: string;
  /** Part-of-speech tag assigned to the morpheme. */
  readonly pos: string;
  /** Byte offset in the input text at which the morpheme begins. */
  readonly start: number;
  /** Byte offset in the input text at which the morpheme ends. */
  readonly end: number;
  /** Reading form; absent when none is available. */
  readonly reading?: string;
  /** Lemma (dictionary form); absent when none is available. */
  readonly lemma?: string;
  /** Serializes the token to a JSON string. */
  toJSON(): string;
}
25
+
26
/**
 * MeCab-Ko tokenizer class.
 *
 * The WebAssembly module must be initialized before an instance is created.
 *
 * @example
 * ```typescript
 * const mecab = new Mecab();
 *
 * const tokens = mecab.tokenize('안녕하세요');
 * console.log(tokens);
 *
 * const morphs = mecab.morphs('형태소 분석');
 * console.log(morphs); // ['형태소', '분석']
 *
 * // pos() returns JSON text, so parse it to get the pairs.
 * const pairs: [string, string][] = JSON.parse(mecab.pos('형태소 분석'));
 * console.log(pairs); // [['형태소', 'NNG'], ['분석', 'NNG']]
 *
 * mecab.free();
 * ```
 */
export class Mecab {
  /**
   * Create a new MeCab tokenizer instance with the default dictionary.
   * @throws If tokenizer initialization fails.
   */
  constructor();

  /**
   * Release the WebAssembly memory owned by this instance.
   * The instance must not be used afterwards.
   */
  free(): void;

  /**
   * Tokenize Korean text into morphemes.
   * @param text - Input Korean text
   * @returns Array of Token objects
   */
  tokenize(text: string): Token[];

  /**
   * Extract morpheme surface forms.
   * @param text - Input Korean text
   * @returns Array of surface form strings
   */
  morphs(text: string): string[];

  /**
   * Extract part-of-speech tagged pairs.
   *
   * The result is a JSON string, not an array: the runtime binding returns
   * serialized text. Pass it to `JSON.parse` to obtain the
   * `[surface, pos]` pairs.
   *
   * @param text - Input Korean text
   * @returns JSON string encoding an array of `[surface, pos]` pairs
   * @throws If JSON serialization fails.
   */
  pos(text: string): string;

  /**
   * Extract only nouns from text.
   * @param text - Input Korean text
   * @returns Array of noun strings
   */
  nouns(text: string): string[];

  /**
   * Split text into morphemes (wakati mode).
   * @param text - Input Korean text
   * @returns Array of morpheme strings
   */
  wakati(text: string): string[];
}
85
+
86
/**
 * Prepares the WebAssembly module for use.
 *
 * Call this, and wait for the returned promise to settle, before constructing
 * any `Mecab` instance.
 *
 * NOTE(review): the package's `mecab_ko_wasm.js` entry re-exports `init` only
 * as a named export, and its generated typings declare it as returning `void`.
 * Confirm that a default, promise-returning export really exists at runtime.
 */
export default function init(): Promise<void>;
package/index.js ADDED
@@ -0,0 +1,19 @@
1
// `./mecab_ko_wasm.js` has no default export; it exposes `Mecab`, `WasmToken`
// and `init` as named exports only, so everything is reached via the namespace.
import * as wasm from './mecab_ko_wasm.js';

let initialized = false;

// Shared analyzer backing the function-style API below; created on first use.
let sharedMecab;

/**
 * One-time library setup.
 *
 * Runs the binding's `init()` (documented there as installing panic hooks) the
 * first time it is called; later calls are no-ops. Kept `async` so existing
 * `await initialize()` callers keep working.
 *
 * @returns {Promise<void>}
 */
export async function initialize() {
  if (!initialized) {
    wasm.init();
    initialized = true;
  }
}

/**
 * Returns the shared `Mecab` instance, constructing it on first use.
 * Construction throws if the tokenizer cannot be initialized.
 */
function getMecab() {
  if (sharedMecab === undefined) {
    sharedMecab = new wasm.Mecab();
  }
  return sharedMecab;
}

// The analysis operations are methods of `Mecab`, not free functions of the
// wasm module, so each export forwards to the shared instance.

/** @param {string} text @returns {WasmToken[]} detailed tokens */
export const tokenize = (text) => getMecab().tokenize(text);

/** @param {string} text @returns {string[]} morpheme strings */
export const morphs = (text) => getMecab().morphs(text);

/** @param {string} text @returns {string[]} noun strings */
export const nouns = (text) => getMecab().nouns(text);

/** @param {string} text @returns {string} JSON string of [surface, pos] pairs */
export const pos = (text) => getMecab().pos(text);

/** @param {string} text @returns {string[]} morpheme strings */
export const wakati = (text) => getMecab().wakati(text);

// NOTE(review): the bindings shipped in this package do not appear to export
// `getVersion`, so this is likely `undefined` — confirm against the Rust crate.
export const getVersion = wasm.getVersion;

export default { initialize, tokenize, morphs, nouns, pos, wakati, getVersion };
@@ -0,0 +1,148 @@
1
+ /* tslint:disable */
2
+ /* eslint-disable */
3
+
4
/**
 * MeCab-Ko tokenizer exposed through WebAssembly.
 *
 * Offers Korean morphological analysis to JavaScript and TypeScript callers.
 */
export class Mecab {
  /** Releases the WebAssembly resources held by this instance. */
  free(): void;
  /** Disposal hook; the runtime binds it to `free`. */
  [Symbol.dispose](): void;
  /**
   * Splits text into morphemes (형태소), without part-of-speech information.
   *
   * @example
   * ```javascript
   * const morphs = mecab.morphs("안녕하세요");
   * console.log(morphs); // ["안녕", "하", "세요"]
   * ```
   */
  morphs(text: string): string[];
  /**
   * Builds a Mecab instance that uses the default dictionary.
   *
   * @example
   * ```javascript
   * const mecab = new Mecab();
   * ```
   *
   * @throws If the tokenizer cannot be initialized.
   */
  constructor();
  /**
   * Collects the nouns (명사) found in the text.
   *
   * @example
   * ```javascript
   * const nouns = mecab.nouns("형태소 분석기입니다");
   * console.log(nouns); // ["형태소", "분석기"]
   * ```
   */
  nouns(text: string): string[];
  /**
   * Tags each morpheme with its part of speech.
   *
   * The result is a JSON string that encodes an array of [surface, pos] pairs.
   *
   * @example
   * ```javascript
   * const posJson = mecab.pos("안녕하세요");
   * const pos = JSON.parse(posJson);
   * console.log(pos); // [["안녕", "NNG"], ["하", "XSV"], ["세요", "EP+EF"]]
   * ```
   *
   * @throws If JSON serialization fails.
   */
  pos(text: string): string;
  /**
   * Analyzes the text and returns one detailed token per morpheme, carrying
   * the surface form, the POS tag and position information.
   *
   * @example
   * ```javascript
   * const tokens = mecab.tokenize("안녕하세요");
   * tokens.forEach(token => {
   *   console.log(`${token.surface}: ${token.pos}`);
   * });
   * ```
   */
  tokenize(text: string): WasmToken[];
  /**
   * Wakati (분리) tokenization: returns the morpheme strings, much like
   * `morphs()`.
   *
   * @example
   * ```javascript
   * const words = mecab.wakati("형태소 분석");
   * console.log(words); // ["형태소", "분석"]
   * ```
   */
  wakati(text: string): string[];
}
100
+
101
/**
 * Token representation designed for consumption from JavaScript.
 *
 * Instances are produced by the tokenizer; the constructor is private.
 */
export class WasmToken {
  private constructor();
  /** Releases the WebAssembly resources held by this token. */
  free(): void;
  /** Disposal hook; the runtime binds it to `free`. */
  [Symbol.dispose](): void;
  /**
   * Serializes the token to a JSON string for easier JavaScript interop.
   *
   * @throws If serialization fails.
   */
  toJSON(): string;
  /** End position, in bytes. */
  readonly end: number;
  /** Lemma / base form, or `undefined` when not available. */
  readonly lemma: string | undefined;
  /** Part-of-speech tag (품사). */
  readonly pos: string;
  /** Reading, or `undefined` when not available. */
  readonly reading: string | undefined;
  /** Start position, in bytes. */
  readonly start: number;
  /** Surface form (표면형). */
  readonly surface: string;
}
141
+
142
/**
 * Sets up the WASM module.
 *
 * Intended to be called once, before the rest of the library is used. It
 * installs panic hooks so that failures during development produce clearer
 * error messages.
 */
export function init(): void;
@@ -0,0 +1,9 @@
1
+ /* @ts-self-types="./mecab_ko_wasm.d.ts" */
2
+
3
+ import * as wasm from "./mecab_ko_wasm_bg.wasm";
4
+ import { __wbg_set_wasm } from "./mecab_ko_wasm_bg.js";
5
+ __wbg_set_wasm(wasm);
6
+ wasm.__wbindgen_start();
7
+ export {
8
+ Mecab, WasmToken, init
9
+ } from "./mecab_ko_wasm_bg.js";
@@ -0,0 +1,548 @@
1
/**
 * MeCab-Ko tokenizer exposed through WebAssembly.
 *
 * Offers Korean morphological analysis to JavaScript and TypeScript callers.
 */
export class Mecab {
    /**
     * Detaches the wasm pointer from this wrapper: zeroes `__wbg_ptr`,
     * removes the wrapper from the finalization registry and hands the raw
     * pointer back to the caller.
     */
    __destroy_into_raw() {
        const rawPtr = this.__wbg_ptr;
        this.__wbg_ptr = 0;
        MecabFinalization.unregister(this);
        return rawPtr;
    }
    /** Frees the wasm-side object backing this instance. */
    free() {
        const rawPtr = this.__destroy_into_raw();
        wasm.__wbg_mecab_free(rawPtr, 0);
    }
    /**
     * Splits text into morphemes (형태소), without part-of-speech information.
     *
     * ```javascript
     * const morphs = mecab.morphs("안녕하세요");
     * console.log(morphs); // ["안녕", "하", "세요"]
     * ```
     * @param {string} text
     * @returns {string[]}
     */
    morphs(text) {
        return callMecabArrayExport(this, text, (...args) => wasm.mecab_morphs(...args));
    }
    /**
     * Builds a Mecab instance that uses the default dictionary.
     *
     * ```javascript
     * const mecab = new Mecab();
     * ```
     *
     * Throws the error produced by the wasm side if tokenizer
     * initialization fails.
     */
    constructor() {
        try {
            const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
            wasm.mecab_new(retptr);
            // Return area layout: [instance pointer, error handle, error flag].
            const view = getDataViewMemory0();
            const instancePtr = view.getInt32(retptr + 4 * 0, true);
            const errorHandle = view.getInt32(retptr + 4 * 1, true);
            const failed = view.getInt32(retptr + 4 * 2, true);
            if (failed) {
                throw takeObject(errorHandle);
            }
            this.__wbg_ptr = instancePtr >>> 0;
            MecabFinalization.register(this, this.__wbg_ptr, this);
            return this;
        } finally {
            wasm.__wbindgen_add_to_stack_pointer(16);
        }
    }
    /**
     * Collects the nouns (명사) found in the text.
     *
     * ```javascript
     * const nouns = mecab.nouns("형태소 분석기입니다");
     * console.log(nouns); // ["형태소", "분석기"]
     * ```
     * @param {string} text
     * @returns {string[]}
     */
    nouns(text) {
        return callMecabArrayExport(this, text, (...args) => wasm.mecab_nouns(...args));
    }
    /**
     * Tags each morpheme with its part of speech.
     *
     * The result is a JSON string that encodes an array of [surface, pos] pairs.
     *
     * ```javascript
     * const posJson = mecab.pos("안녕하세요");
     * const pos = JSON.parse(posJson);
     * console.log(pos); // [["안녕", "NNG"], ["하", "XSV"], ["세요", "EP+EF"]]
     * ```
     *
     * Throws the error produced by the wasm side if JSON serialization fails.
     * @param {string} text
     * @returns {string}
     */
    pos(text) {
        // Stay undefined when the call fails, exactly as the string buffer is
        // never handed over in that case.
        let ownedPtr;
        let ownedLen;
        try {
            const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
            const textPtr = passStringToWasm0(text, wasm.__wbindgen_export2, wasm.__wbindgen_export3);
            const textLen = WASM_VECTOR_LEN;
            wasm.mecab_pos(retptr, this.__wbg_ptr, textPtr, textLen);
            // Return area layout: [string pointer, string length, error handle, error flag].
            const view = getDataViewMemory0();
            const strPtr = view.getInt32(retptr + 4 * 0, true);
            const strLen = view.getInt32(retptr + 4 * 1, true);
            const errorHandle = view.getInt32(retptr + 4 * 2, true);
            const failed = view.getInt32(retptr + 4 * 3, true);
            if (failed) {
                throw takeObject(errorHandle);
            }
            ownedPtr = strPtr;
            ownedLen = strLen;
            return getStringFromWasm0(strPtr, strLen);
        } finally {
            wasm.__wbindgen_add_to_stack_pointer(16);
            wasm.__wbindgen_export(ownedPtr, ownedLen, 1);
        }
    }
    /**
     * Analyzes the text and returns one detailed token per morpheme, carrying
     * the surface form, the POS tag and position information.
     *
     * ```javascript
     * const tokens = mecab.tokenize("안녕하세요");
     * tokens.forEach(token => {
     *   console.log(`${token.surface}: ${token.pos}`);
     * });
     * ```
     * @param {string} text
     * @returns {WasmToken[]}
     */
    tokenize(text) {
        return callMecabArrayExport(this, text, (...args) => wasm.mecab_tokenize(...args));
    }
    /**
     * Wakati (분리) tokenization: returns the morpheme strings, much like
     * `morphs()`.
     *
     * ```javascript
     * const words = mecab.wakati("형태소 분석");
     * console.log(words); // ["형태소", "분석"]
     * ```
     * @param {string} text
     * @returns {string[]}
     */
    wakati(text) {
        return callMecabArrayExport(this, text, (...args) => wasm.mecab_wakati(...args));
    }
}
if (Symbol.dispose) Mecab.prototype[Symbol.dispose] = Mecab.prototype.free;

/**
 * Shared call sequence for the Mecab exports that take a string and return an
 * array of JS values: reserve 16 bytes of return area, pass the text, run the
 * export, read back [array pointer, element count], convert the handles to JS
 * values, free the wasm-side array, and always restore the stack pointer.
 *
 * @param {Mecab} instance wrapper whose `__wbg_ptr` is passed to the export
 * @param {string} text input text
 * @param {Function} invoke calls the wasm export with (retptr, self, ptr, len)
 * @returns {any[]}
 */
function callMecabArrayExport(instance, text, invoke) {
    try {
        const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
        const textPtr = passStringToWasm0(text, wasm.__wbindgen_export2, wasm.__wbindgen_export3);
        const textLen = WASM_VECTOR_LEN;
        invoke(retptr, instance.__wbg_ptr, textPtr, textLen);
        const view = getDataViewMemory0();
        const arrayPtr = view.getInt32(retptr + 4 * 0, true);
        const arrayLen = view.getInt32(retptr + 4 * 1, true);
        const values = getArrayJsValueFromWasm0(arrayPtr, arrayLen).slice();
        wasm.__wbindgen_export(arrayPtr, arrayLen * 4, 4);
        return values;
    } finally {
        wasm.__wbindgen_add_to_stack_pointer(16);
    }
}
213
+
214
/**
 * Token representation designed for consumption from JavaScript.
 */
export class WasmToken {
    /**
     * Wraps a raw wasm pointer in a WasmToken without running a constructor,
     * and registers the wrapper with the finalization registry.
     */
    static __wrap(ptr) {
        const token = Object.create(WasmToken.prototype);
        token.__wbg_ptr = ptr >>> 0;
        WasmTokenFinalization.register(token, token.__wbg_ptr, token);
        return token;
    }
    /**
     * Detaches the wasm pointer from this wrapper: zeroes `__wbg_ptr`,
     * removes the wrapper from the finalization registry and hands the raw
     * pointer back to the caller.
     */
    __destroy_into_raw() {
        const rawPtr = this.__wbg_ptr;
        this.__wbg_ptr = 0;
        WasmTokenFinalization.unregister(this);
        return rawPtr;
    }
    /** Frees the wasm-side object backing this token. */
    free() {
        const rawPtr = this.__destroy_into_raw();
        wasm.__wbg_wasmtoken_free(rawPtr, 0);
    }
    /**
     * End position, in bytes.
     * @returns {number}
     */
    get end() {
        return wasm.wasmtoken_end(this.__wbg_ptr) >>> 0;
    }
    /**
     * Lemma / base form, or `undefined` when not available.
     * @returns {string | undefined}
     */
    get lemma() {
        return readOptionalTokenString(this, (...args) => wasm.wasmtoken_lemma(...args));
    }
    /**
     * Part-of-speech tag (품사).
     * @returns {string}
     */
    get pos() {
        return readTokenString(this, (...args) => wasm.wasmtoken_pos(...args));
    }
    /**
     * Reading, or `undefined` when not available.
     * @returns {string | undefined}
     */
    get reading() {
        return readOptionalTokenString(this, (...args) => wasm.wasmtoken_reading(...args));
    }
    /**
     * Start position, in bytes.
     * @returns {number}
     */
    get start() {
        return wasm.wasmtoken_start(this.__wbg_ptr) >>> 0;
    }
    /**
     * Surface form (표면형).
     * @returns {string}
     */
    get surface() {
        return readTokenString(this, (...args) => wasm.wasmtoken_surface(...args));
    }
    /**
     * Serializes the token to a JSON string for easier JavaScript interop.
     *
     * Throws the error produced by the wasm side if serialization fails.
     * @returns {string}
     */
    toJSON() {
        // Stay undefined when the call fails.
        let ownedPtr;
        let ownedLen;
        try {
            const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
            wasm.wasmtoken_toJSON(retptr, this.__wbg_ptr);
            // Return area layout: [string pointer, string length, error handle, error flag].
            const view = getDataViewMemory0();
            const strPtr = view.getInt32(retptr + 4 * 0, true);
            const strLen = view.getInt32(retptr + 4 * 1, true);
            const errorHandle = view.getInt32(retptr + 4 * 2, true);
            const failed = view.getInt32(retptr + 4 * 3, true);
            if (failed) {
                throw takeObject(errorHandle);
            }
            ownedPtr = strPtr;
            ownedLen = strLen;
            return getStringFromWasm0(strPtr, strLen);
        } finally {
            wasm.__wbindgen_add_to_stack_pointer(16);
            wasm.__wbindgen_export(ownedPtr, ownedLen, 1);
        }
    }
}
if (Symbol.dispose) WasmToken.prototype[Symbol.dispose] = WasmToken.prototype.free;

/**
 * Reads a string-valued token field: runs the export into a 16-byte return
 * area, decodes the [pointer, length] pair it wrote, and in all cases restores
 * the stack pointer and then releases the wasm-side string buffer.
 *
 * @param {WasmToken} token wrapper whose `__wbg_ptr` is passed to the export
 * @param {Function} invoke calls the wasm export with (retptr, self)
 * @returns {string}
 */
function readTokenString(token, invoke) {
    let ownedPtr;
    let ownedLen;
    try {
        const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
        invoke(retptr, token.__wbg_ptr);
        const view = getDataViewMemory0();
        const strPtr = view.getInt32(retptr + 4 * 0, true);
        const strLen = view.getInt32(retptr + 4 * 1, true);
        ownedPtr = strPtr;
        ownedLen = strLen;
        return getStringFromWasm0(strPtr, strLen);
    } finally {
        wasm.__wbindgen_add_to_stack_pointer(16);
        wasm.__wbindgen_export(ownedPtr, ownedLen, 1);
    }
}

/**
 * Reads an optional string-valued token field. A zero pointer in the return
 * area yields `undefined`; otherwise the string is decoded and its wasm-side
 * buffer released. The stack pointer is always restored.
 *
 * @param {WasmToken} token wrapper whose `__wbg_ptr` is passed to the export
 * @param {Function} invoke calls the wasm export with (retptr, self)
 * @returns {string | undefined}
 */
function readOptionalTokenString(token, invoke) {
    try {
        const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
        invoke(retptr, token.__wbg_ptr);
        const view = getDataViewMemory0();
        const strPtr = view.getInt32(retptr + 4 * 0, true);
        const strLen = view.getInt32(retptr + 4 * 1, true);
        let value;
        if (strPtr !== 0) {
            value = getStringFromWasm0(strPtr, strLen).slice();
            wasm.__wbindgen_export(strPtr, strLen * 1, 1);
        }
        return value;
    } finally {
        wasm.__wbindgen_add_to_stack_pointer(16);
    }
}
365
+
366
/**
 * Sets up the WASM module by delegating to the wasm-side `init` export.
 *
 * Intended to be called once, before the rest of the library is used. It
 * installs panic hooks so that failures during development produce clearer
 * error messages.
 */
export function init() {
    wasm.init();
}
375
/**
 * Import shim: decodes the message at (arg0 = pointer, arg1 = length) in wasm
 * memory and throws it as a JavaScript `Error`.
 */
export function __wbg___wbindgen_throw_6ddd609b62940d55(arg0, arg1) {
    const message = getStringFromWasm0(arg0, arg1);
    throw new Error(message);
}
378
/**
 * Import shim: logs the string at (arg0 = pointer, arg1 = length) through
 * `console.error`, then releases that buffer on the wasm side whether or not
 * logging succeeded.
 */
export function __wbg_error_a6fa202b58aa1cd3(arg0, arg1) {
    try {
        console.error(getStringFromWasm0(arg0, arg1));
    } finally {
        wasm.__wbindgen_export(arg0, arg1, 1);
    }
}
389
/**
 * Import shim: creates an empty `Error` and returns the heap index under which
 * it was stored.
 */
export function __wbg_new_227d7c05414eb861() {
    return addHeapObject(new Error());
}
393
/**
 * Import shim: copies the `stack` property of the heap object at index `arg1`
 * into wasm memory and writes the resulting (pointer, length) pair to the
 * return area at `arg0` — pointer at offset 0, length at offset 4.
 */
export function __wbg_stack_3b0d974bbf31e44f(arg0, arg1) {
    const stackText = getObject(arg1).stack;
    const stackPtr = passStringToWasm0(stackText, wasm.__wbindgen_export2, wasm.__wbindgen_export3);
    const stackLen = WASM_VECTOR_LEN;
    // Fetch the view only after the string was passed: allocating it may have
    // replaced the memory buffer.
    const view = getDataViewMemory0();
    view.setInt32(arg0 + 4 * 1, stackLen, true);
    view.setInt32(arg0 + 4 * 0, stackPtr, true);
}
400
/**
 * Import shim: wraps the raw token pointer `arg0` in a `WasmToken` and returns
 * the heap index under which the wrapper was stored.
 */
export function __wbg_wasmtoken_new(arg0) {
    return addHeapObject(WasmToken.__wrap(arg0));
}
404
/**
 * Cast intrinsic for `Ref(String) -> Externref`: decodes the string at
 * (arg0 = pointer, arg1 = length) and returns the heap index holding it.
 */
export function __wbindgen_cast_0000000000000001(arg0, arg1) {
    return addHeapObject(getStringFromWasm0(arg0, arg1));
}
409
/**
 * Import shim: drops the heap reference at index `arg0`; the object that was
 * stored there is discarded.
 */
export function __wbindgen_object_drop_ref(arg0) {
    takeObject(arg0);
}
412
// Frees the wasm-side Mecab object once its JS wrapper has been collected.
// Where FinalizationRegistry is unavailable, a no-op stand-in with the same
// register/unregister surface is used instead.
const MecabFinalization = (typeof FinalizationRegistry !== 'undefined')
    ? new FinalizationRegistry((ptr) => { wasm.__wbg_mecab_free(ptr >>> 0, 1); })
    : { register() {}, unregister() {} };
415
// Frees the wasm-side token once its JS wrapper has been collected. Where
// FinalizationRegistry is unavailable, a no-op stand-in with the same
// register/unregister surface is used instead.
const WasmTokenFinalization = (typeof FinalizationRegistry !== 'undefined')
    ? new FinalizationRegistry((ptr) => { wasm.__wbg_wasmtoken_free(ptr >>> 0, 1); })
    : { register() {}, unregister() {} };
418
+
419
/**
 * Stores `obj` in the next free heap slot and returns that slot's index.
 *
 * Free slots form a linked list: each free slot holds the index of the next
 * free one. When the list is exhausted the heap is extended by one slot.
 */
function addHeapObject(obj) {
    if (heap_next === heap.length) {
        heap.push(heap.length + 1);
    }
    const slot = heap_next;
    heap_next = heap[slot];
    heap[slot] = obj;
    return slot;
}
427
+
428
/**
 * Returns heap slot `idx` to the free list. Indices below 1028 (the slots
 * created when the heap is first set up) are never released.
 */
function dropObject(idx) {
    if (idx >= 1028) {
        heap[idx] = heap_next;
        heap_next = idx;
    }
}
433
+
434
/**
 * Reads `len` little-endian 32-bit heap indices starting at `ptr` in wasm
 * memory and takes (removes and returns) the JS value stored under each one.
 *
 * @returns {any[]} the values, in memory order
 */
function getArrayJsValueFromWasm0(ptr, len) {
    const base = ptr >>> 0;
    const view = getDataViewMemory0();
    const values = [];
    for (let k = 0; k < len; k++) {
        const handle = view.getUint32(base + 4 * k, true);
        values.push(takeObject(handle));
    }
    return values;
}
443
+
444
let cachedDataViewMemory0 = null;
/**
 * Returns a DataView over the wasm memory, reusing the cached view while it is
 * still valid. The view is rebuilt when there is none yet, when its buffer
 * reports `detached === true`, or — where `detached` is not supported — when
 * its buffer is no longer the current `wasm.memory.buffer`.
 */
function getDataViewMemory0() {
    const current = cachedDataViewMemory0;
    let stale = current === null;
    if (!stale) {
        const detached = current.buffer.detached;
        stale = detached === true
            || (detached === undefined && current.buffer !== wasm.memory.buffer);
    }
    if (stale) {
        cachedDataViewMemory0 = new DataView(wasm.memory.buffer);
    }
    return cachedDataViewMemory0;
}
451
+
452
/**
 * Decodes the `len` bytes at `ptr` in wasm memory into a string. The pointer
 * is first coerced to an unsigned 32-bit value.
 */
function getStringFromWasm0(ptr, len) {
    return decodeText(ptr >>> 0, len);
}
456
+
457
let cachedUint8ArrayMemory0 = null;
/**
 * Returns a Uint8Array over the wasm memory. The cached array is reused unless
 * it does not exist yet or its byte length has dropped to zero, in which case
 * a fresh one is created from the current memory buffer.
 */
function getUint8ArrayMemory0() {
    const usable = cachedUint8ArrayMemory0 !== null && cachedUint8ArrayMemory0.byteLength !== 0;
    if (!usable) {
        cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer);
    }
    return cachedUint8ArrayMemory0;
}
464
+
465
/** Looks up the JS value stored at heap index `idx` without removing it. */
function getObject(idx) {
    return heap[idx];
}
466
+
467
// Table of JS values referenced from wasm by index: 1024 empty slots followed
// by four fixed entries (undefined, null, true, false).
let heap = [...new Array(1024).fill(undefined), undefined, null, true, false];

// Index of the next free slot; initially one past the end of the table.
let heap_next = heap.length;
471
+
472
/**
 * Copies a JS string into wasm memory as encoded bytes.
 *
 * The byte length written is published through `WASM_VECTOR_LEN`.
 *
 * @param {string} arg string to copy
 * @param {Function} malloc wasm allocator, called as (size, align)
 * @param {Function} [realloc] wasm reallocator, called as (ptr, oldSize, newSize, align)
 * @returns {number} pointer to the bytes in wasm memory
 */
function passStringToWasm0(arg, malloc, realloc) {
    // Without a reallocator: encode up front, allocate the exact size, copy.
    if (realloc === undefined) {
        const encoded = cachedTextEncoder.encode(arg);
        const dest = malloc(encoded.length, 1) >>> 0;
        getUint8ArrayMemory0().subarray(dest, dest + encoded.length).set(encoded);
        WASM_VECTOR_LEN = encoded.length;
        return dest;
    }

    // Start with one byte per UTF-16 code unit.
    let capacity = arg.length;
    let ptr = malloc(capacity, 1) >>> 0;

    const bytes = getUint8ArrayMemory0();

    // Fast path: copy code units directly for as long as they are <= 0x7F.
    let written = 0;
    while (written < capacity) {
        const unit = arg.charCodeAt(written);
        if (unit > 0x7F) break;
        bytes[ptr + written] = unit;
        written++;
    }

    if (written !== capacity) {
        // Encode only the part that was not copied above.
        if (written !== 0) {
            arg = arg.slice(written);
        }
        // Grow to three bytes per remaining code unit.
        const previousCapacity = capacity;
        capacity = written + arg.length * 3;
        ptr = realloc(ptr, previousCapacity, capacity, 1) >>> 0;
        // Re-fetch the memory view: reallocating may have replaced the buffer.
        const tail = getUint8ArrayMemory0().subarray(ptr + written, ptr + capacity);
        const outcome = cachedTextEncoder.encodeInto(arg, tail);

        written += outcome.written;
        // Shrink the allocation to the bytes actually used.
        ptr = realloc(ptr, capacity, written, 1) >>> 0;
    }

    WASM_VECTOR_LEN = written;
    return ptr;
}
508
+
509
/**
 * Removes and returns the JS value stored at heap index `idx`: the value is
 * read first, then the slot is handed back via `dropObject`.
 */
function takeObject(idx) {
    const value = getObject(idx);
    dropObject(idx);
    return value;
}
514
+
515
// Strict UTF-8 decoder (throws on invalid input, keeps a leading BOM), shared
// by every decode. The argument-less decode() call exercises it once up front.
let cachedTextDecoder = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true });
cachedTextDecoder.decode();
// The decoder is replaced once this many bytes have passed through it (the
// constant's name ties the limit to Safari).
const MAX_SAFARI_DECODE_BYTES = 2146435072;
let numBytesDecoded = 0;
/**
 * Decodes `len` bytes at `ptr` in wasm memory as UTF-8.
 *
 * Tracks the running total of bytes decoded; when the total reaches
 * MAX_SAFARI_DECODE_BYTES a fresh decoder is created and the total restarts
 * at `len`.
 */
function decodeText(ptr, len) {
    numBytesDecoded += len;
    const limitReached = numBytesDecoded >= MAX_SAFARI_DECODE_BYTES;
    if (limitReached) {
        cachedTextDecoder = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true });
        cachedTextDecoder.decode();
        numBytesDecoded = len;
    }
    const source = getUint8ArrayMemory0().subarray(ptr, ptr + len);
    return cachedTextDecoder.decode(source);
}
528
+
529
// Shared encoder used when copying JS strings into wasm memory.
const cachedTextEncoder = new TextEncoder();

// Fallback for environments whose TextEncoder has no encodeInto: encode the
// whole string, copy it into the view, and report the code units read and the
// bytes written.
if (!('encodeInto' in cachedTextEncoder)) {
    cachedTextEncoder.encodeInto = function (arg, view) {
        const encoded = cachedTextEncoder.encode(arg);
        view.set(encoded);
        const read = arg.length;
        const written = encoded.length;
        return { read, written };
    };
}
541
+
542
// Byte length of the string most recently copied into wasm memory.
let WASM_VECTOR_LEN = 0;

// The wasm module's exports; unset until __wbg_set_wasm is called.
let wasm;

/**
 * Installs the wasm exports object that every binding in this file calls into.
 *
 * @param {object} val the wasm module's exports
 */
export function __wbg_set_wasm(val) {
    wasm = val;
}
Binary file
package/package.json ADDED
@@ -0,0 +1,62 @@
1
+ {
2
+ "name": "mecab-ko-wasm",
3
+ "version": "0.3.0",
4
+ "description": "WebAssembly bindings for MeCab-Ko Korean morphological analyzer",
5
+ "main": "index.js",
6
+ "module": "mecab_ko_wasm.js",
7
+ "types": "index.d.ts",
8
+ "sideEffects": [
9
+ "./mecab_ko_wasm.js"
10
+ ],
11
+ "files": [
12
+ "index.js",
13
+ "index.d.ts",
14
+ "mecab_ko_wasm_bg.wasm",
15
+ "mecab_ko_wasm.js",
16
+ "mecab_ko_wasm.d.ts",
17
+ "mecab_ko_wasm_bg.js",
18
+ "README.md"
19
+ ],
20
+ "scripts": {
21
+ "build": "wasm-pack build --target bundler --out-dir pkg",
22
+ "build:node": "wasm-pack build --target nodejs --out-dir pkg-node",
23
+ "build:web": "wasm-pack build --target web --out-dir pkg-web",
24
+ "test": "wasm-pack test --headless --firefox"
25
+ },
26
+ "repository": {
27
+ "type": "git",
28
+ "url": "https://github.com/hephaex/mecab-ko"
29
+ },
30
+ "keywords": [
31
+ "korean",
32
+ "nlp",
33
+ "morphological-analysis",
34
+ "tokenizer",
35
+ "mecab",
36
+ "wasm",
37
+ "webassembly"
38
+ ],
39
+ "author": "hephaex <hephaex@gmail.com>",
40
+ "license": "MIT OR Apache-2.0",
41
+ "bugs": {
42
+ "url": "https://github.com/hephaex/mecab-ko/issues"
43
+ },
44
+ "homepage": "https://github.com/hephaex/mecab-ko#readme",
45
+ "exports": {
46
+ ".": {
47
+ "import": "./mecab_ko_wasm.js",
48
+ "require": "./index.js",
49
+ "types": "./index.d.ts"
50
+ },
51
+ "./package.json": "./package.json"
52
+ },
53
+ "engines": {
54
+ "node": ">=16"
55
+ },
56
+ "publishConfig": {
57
+ "access": "public"
58
+ },
59
+ "devDependencies": {
60
+ "@types/node": "^20.0.0"
61
+ }
62
+ }