lindera-wasm-nodejs-cc-cedict 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 by the project authors.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,135 @@
1
+ # lindera-wasm
2
+
3
+ WebAssembly of Lindera
4
+
5
+ ![Screenshot from 2025-09-13 23-05-49](https://github.com/user-attachments/assets/a6ca165a-825c-4260-ba52-d76cd262a21f)
6
+
7
+ ## Demo Application
8
+
9
+ - <https://lindera.github.io/lindera-wasm/>
10
+
11
+ ## npm
12
+
13
+ ### Web
14
+
15
+ - <https://www.npmjs.com/package/lindera-wasm-web-cjk>
16
+ Lindera WASM with CJK dictionaries (IPADIC, ko-dic, CC-CEDICT) for Web
17
+
18
+ - <https://www.npmjs.com/package/lindera-wasm-web-ipadic>
19
+ Lindera WASM with Japanese dictionary (IPADIC) for Web
20
+
21
+ - <https://www.npmjs.com/package/lindera-wasm-web-unidic>
22
+ Lindera WASM with Japanese dictionary (UniDic) for Web
23
+
24
+ - <https://www.npmjs.com/package/lindera-wasm-web-ko-dic>
25
+ Lindera WASM with Korean dictionary (ko-dic) for Web
26
+
27
+ - <https://www.npmjs.com/package/lindera-wasm-web-cc-cedict>
28
+ Lindera WASM with Chinese dictionary (CC-CEDICT) for Web
29
+
30
+ ### Node.js
31
+
32
+ - <https://www.npmjs.com/package/lindera-wasm-nodejs-cjk>
33
+ Lindera WASM with CJK dictionaries (IPADIC, ko-dic, CC-CEDICT) for Node.js
34
+
35
+ - <https://www.npmjs.com/package/lindera-wasm-nodejs-ipadic>
36
+ Lindera WASM with Japanese dictionary (IPADIC) for Node.js
37
+
38
+ - <https://www.npmjs.com/package/lindera-wasm-nodejs-unidic>
39
+ Lindera WASM with Japanese dictionary (UniDic) for Node.js
40
+
41
+ - <https://www.npmjs.com/package/lindera-wasm-nodejs-ko-dic>
42
+ Lindera WASM with Korean dictionary (ko-dic) for Node.js
43
+
44
+ - <https://www.npmjs.com/package/lindera-wasm-nodejs-cc-cedict>
45
+ Lindera WASM with Chinese dictionary (CC-CEDICT) for Node.js
46
+
47
+ ## Usage
48
+
49
+ Initialize the wasm module before constructing a `TokenizerBuilder`:
50
+
51
+ ```ts
52
+ import __wbg_init, { TokenizerBuilder } from 'lindera-wasm'
53
+
54
+ __wbg_init().then(() => {
55
+ const builder = new TokenizerBuilder()
56
+ //...
57
+ })
58
+ ```
59
+
60
+ ### For [Vite](https://vite.dev/)-based projects
61
+
62
+ You should exclude this package in the `optimizeDeps`:
63
+
64
+ ```ts
65
+ // vite.config.js
66
+ import { defineConfig } from 'vite'
67
+
68
+ export default defineConfig({
69
+ optimizeDeps: {
70
+ exclude: [
71
+ "lindera-wasm"
72
+ ]
73
+ },
74
+ })
75
+ ```
76
+
77
+ ### For browser extension development
78
+
79
+ Set the `cors` config in vite.config.js
80
+
81
+ ```ts
82
+ // vite.config.js
83
+ import { defineConfig } from 'vite'
84
+
85
+ export default defineConfig({
86
+ server: {
87
+ cors: {
88
+ origin: [
89
+ /chrome-extension:\/\//,
90
+ ],
91
+ },
92
+ },
93
+ })
94
+ ```
95
+
96
+ Then set `content_security_policy` in manifest.json so that it contains `wasm-unsafe-eval`:
97
+
98
+ ```json
99
+ // manifest.json
100
+ "content_security_policy": {
101
+ "extension_pages": "script-src 'self' 'wasm-unsafe-eval';"
102
+ }
103
+ ```
104
+
105
+ ## Development
106
+
107
+ ### Install project dependencies
108
+
109
+ - wasm-pack : <https://rustwasm.github.io/wasm-pack/installer/>
110
+
111
+ ### Setup repository
112
+
113
+ ```shell
114
+ # Clone the lindera-wasm project repository
115
+ % git clone git@github.com:lindera/lindera-wasm.git
116
+ % cd lindera-wasm
117
+ ```
118
+
119
+ ### Build project
120
+
121
+ ```shell
122
+ % wasm-pack build --release --features=cjk --target=bundler
123
+ ```
124
+
125
+ ### Build example web application
126
+
127
+ ```shell
128
+ % cd lindera-wasm && npm install && npm run build && cp index.html dist/index.html
129
+ ```
130
+
131
+ ### Run example web application
132
+
133
+ ```shell
134
+ % cd lindera-wasm && npm run start
135
+ ```
@@ -0,0 +1,257 @@
1
+ /* tslint:disable */
2
+ /* eslint-disable */
3
/**
 * Reports the version of the lindera-wasm library.
 *
 * @returns The library version string, e.g. `"1.0.0"`.
 *
 * @example
 * ```javascript
 * import { getVersion } from 'lindera-wasm';
 * console.log(getVersion()); // "1.0.0"
 * ```
 */
export function getVersion(): string;
18
/**
 * Morphological-analysis tokenizer.
 *
 * The constructor is private: instances come from `TokenizerBuilder.build()`
 * and tokenize text according to the configuration that builder carried.
 *
 * @example
 * ```javascript
 * const builder = new TokenizerBuilder();
 * builder.setDictionary("embedded://ipadic");
 * builder.setMode("normal");
 *
 * const tokenizer = builder.build();
 * const tokens = tokenizer.tokenize("関西国際空港");
 * console.log(tokens);
 * // Output: [
 * //   { surface: "関西国際空港", ... },
 * //   ...
 * // ]
 * ```
 */
export class Tokenizer {
  private constructor();
  /** Releases the wasm-side object that backs this tokenizer. */
  free(): void;
  /** Disposal hook for explicit resource management. */
  [Symbol.dispose](): void;
  /**
   * Tokenizes the input text.
   *
   * Each returned token object carries information such as the surface form,
   * part-of-speech tags and reading. Field names are in camelCase.
   *
   * @param input_text - The text to tokenize.
   * @returns A JavaScript array of token objects. Each token object contains:
   *   - `surface`: the surface form of the token
   *   - `pos`: part-of-speech tags
   *   - additional language-specific fields
   * @throws If tokenization fails.
   *
   * @example
   * ```javascript
   * const tokens = tokenizer.tokenize("東京都に行く");
   * tokens.forEach(token => {
   *   console.log(token.surface, token.pos);
   * });
   * ```
   */
  tokenize(input_text: string): any;
}
77
/**
 * Builder for creating a `Tokenizer` instance.
 *
 * Configures dictionary selection, tokenization mode, character filters and
 * token filters, then produces the tokenizer via `build()`.
 *
 * @example
 * ```javascript
 * const builder = new TokenizerBuilder();
 * builder.setDictionary("embedded://ipadic");
 * builder.setMode("normal");
 * builder.setKeepWhitespace(false);
 * builder.appendCharacterFilter("unicode_normalize", { "kind": "nfkc" });
 * builder.appendTokenFilter("lowercase");
 *
 * const tokenizer = builder.build();
 * ```
 */
export class TokenizerBuilder {
  /** Releases the wasm-side object that backs this builder. */
  free(): void;
  /** Disposal hook for explicit resource management. */
  [Symbol.dispose](): void;
  /**
   * Creates a new `TokenizerBuilder` instance.
   *
   * @throws If the builder cannot be initialized.
   *
   * @example
   * ```javascript
   * const builder = new TokenizerBuilder();
   * ```
   */
  constructor();
  /**
   * Builds and returns a configured `Tokenizer` instance.
   *
   * This method consumes the builder: the builder must not be used again
   * after `build()` has been called.
   *
   * @returns A configured `Tokenizer` instance.
   * @throws If the tokenizer cannot be built with the current configuration.
   *
   * @example
   * ```javascript
   * const builder = new TokenizerBuilder();
   * builder.setDictionary("embedded://ipadic");
   * const tokenizer = builder.build();
   * ```
   */
  build(): Tokenizer;
  /**
   * Sets the tokenization mode.
   *
   * @param mode - The tokenization mode. Valid values are:
   *   - `"normal"`: standard tokenization
   *   - `"decompose"`: decomposes compound words into their components
   * @throws If the mode string is invalid.
   *
   * @example
   * ```javascript
   * builder.setMode("normal");
   * // or
   * builder.setMode("decompose");
   * ```
   */
  setMode(mode: string): void;
  /**
   * Sets the dictionary to use for tokenization.
   *
   * @param uri - The dictionary URI. Valid embedded dictionaries are:
   *   - `"embedded://ipadic"`: Japanese IPADIC dictionary
   *   - `"embedded://unidic"`: Japanese UniDic dictionary
   *   - `"embedded://ko-dic"`: Korean ko-dic dictionary
   *   - `"embedded://cc-cedict"`: Chinese CC-CEDICT dictionary
   * @throws If the underlying wasm call reports an error.
   *
   * @example
   * ```javascript
   * builder.setDictionary("embedded://ipadic");
   * ```
   */
  setDictionary(uri: string): void;
  /**
   * Sets a user-defined dictionary.
   *
   * User dictionaries add custom words and their properties to supplement the
   * main dictionary.
   *
   * @param uri - The URI to the user dictionary file.
   * @throws If the underlying wasm call reports an error.
   *
   * @example
   * ```javascript
   * builder.setUserDictionary("path/to/user_dict.csv");
   * ```
   */
  setUserDictionary(uri: string): void;
  /**
   * Sets whether to keep whitespace tokens in the output.
   *
   * @param keep - If `true`, whitespace tokens are preserved; if `false`, they are removed.
   * @throws If the underlying wasm call reports an error.
   *
   * @example
   * ```javascript
   * builder.setKeepWhitespace(false); // Remove whitespace tokens
   * // or
   * builder.setKeepWhitespace(true);  // Keep whitespace tokens
   * ```
   */
  setKeepWhitespace(keep: boolean): void;
  /**
   * Appends a character filter to the tokenization pipeline.
   *
   * Character filters transform the input text before tokenization.
   *
   * @param name - The name of the character filter (e.g., `"unicode_normalize"`).
   * @param args - A JavaScript object containing filter-specific arguments.
   * @throws If the arguments cannot be parsed.
   *
   * @example
   * ```javascript
   * builder.appendCharacterFilter("unicode_normalize", { "kind": "nfkc" });
   * ```
   */
  appendCharacterFilter(name: string, args: any): void;
  /**
   * Appends a token filter to the tokenization pipeline.
   *
   * Token filters transform or filter the tokens after tokenization.
   *
   * @param name - The name of the token filter (e.g., `"lowercase"`, `"japanese_number"`).
   * @param args - A JavaScript object containing filter-specific arguments.
   *   Declared optional so that the one-argument form shown in the example
   *   below type-checks.
   * @throws If the arguments cannot be parsed.
   *
   * @example
   * ```javascript
   * builder.appendTokenFilter("lowercase");
   * builder.appendTokenFilter("japanese_number", { "tags": ["名詞,数"] });
   * ```
   */
  appendTokenFilter(name: string, args?: any): void;
}
@@ -0,0 +1,820 @@
1
+
2
+ let imports = {};
3
+ imports['__wbindgen_placeholder__'] = module.exports;
4
+
5
// Cached byte view over the wasm linear memory; (re)created on demand below.
let cachedUint8ArrayMemory0 = null;

/**
 * Returns a `Uint8Array` view over `wasm.memory.buffer`.
 *
 * The view is cached between calls. It is rebuilt when no view exists yet or
 * when the cached view reports a `byteLength` of 0.
 * NOTE(review): a zero length presumably means the old buffer was detached by
 * a memory grow — confirm.
 *
 * @returns {Uint8Array}
 */
function getUint8ArrayMemory0() {
    const current = cachedUint8ArrayMemory0;
    if (current !== null && current.byteLength !== 0) {
        return current;
    }
    cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer);
    return cachedUint8ArrayMemory0;
}
13
+
14
// Shared strict UTF-8 decoder: `fatal` makes malformed input throw, and
// `ignoreBOM` keeps a leading byte-order mark in the decoded output.
let cachedTextDecoder = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true });

// One argument-less decode call at load time, before any real decoding.
cachedTextDecoder.decode();

/**
 * Decodes `len` bytes of wasm memory starting at `ptr` as UTF-8.
 *
 * @param {number} ptr - Byte offset into the wasm memory view.
 * @param {number} len - Number of bytes to decode.
 * @returns {string}
 */
function decodeText(ptr, len) {
    const end = ptr + len;
    const bytes = getUint8ArrayMemory0().subarray(ptr, end);
    return cachedTextDecoder.decode(bytes);
}
21
+
22
/**
 * Reads a string out of wasm memory.
 *
 * @param {number} ptr - Pointer as received from wasm; coerced to an unsigned
 *   32-bit value before use.
 * @param {number} len - Length in bytes.
 * @returns {string}
 */
function getStringFromWasm0(ptr, len) {
    const unsignedPtr = ptr >>> 0;
    return decodeText(unsignedPtr, len);
}
26
+
27
// Byte length of the string most recently written by passStringToWasm0.
let WASM_VECTOR_LEN = 0;

const cachedTextEncoder = new TextEncoder();

// Fallback for encoders without `encodeInto`: encode into a temporary buffer,
// copy it into the destination view and report the same { read, written } shape.
if (!('encodeInto' in cachedTextEncoder)) {
    cachedTextEncoder.encodeInto = function (arg, view) {
        const encoded = cachedTextEncoder.encode(arg);
        view.set(encoded);
        return { read: arg.length, written: encoded.length };
    };
}

/**
 * Copies a JavaScript string into wasm memory as UTF-8.
 *
 * The resulting byte length is published through `WASM_VECTOR_LEN`.
 *
 * @param {string} arg - The string to copy.
 * @param {Function} malloc - Called as `malloc(size, 1)`; returns a pointer.
 * @param {Function} [realloc] - Called as `realloc(ptr, oldSize, newSize, 1)`.
 *   When omitted, the string is encoded up front and copied in one allocation.
 * @returns {number} Pointer to the encoded bytes.
 */
function passStringToWasm0(arg, malloc, realloc) {
    if (realloc === undefined) {
        const encoded = cachedTextEncoder.encode(arg);
        const dest = malloc(encoded.length, 1) >>> 0;
        getUint8ArrayMemory0().subarray(dest, dest + encoded.length).set(encoded);
        WASM_VECTOR_LEN = encoded.length;
        return dest;
    }

    let capacity = arg.length;
    let ptr = malloc(capacity, 1) >>> 0;
    const mem = getUint8ArrayMemory0();

    // Fast path: copy leading ASCII code units one byte each, stopping at the
    // first code unit above 0x7F.
    let written = 0;
    while (written < capacity) {
        const code = arg.charCodeAt(written);
        if (code > 0x7F) break;
        mem[ptr + written] = code;
        written++;
    }

    if (written !== capacity) {
        // Non-ASCII tail: reserve 3 bytes per remaining UTF-16 code unit, encode
        // into the reserved space, then shrink to what was actually written.
        const rest = written === 0 ? arg : arg.slice(written);
        const grown = written + rest.length * 3;
        ptr = realloc(ptr, capacity, grown, 1) >>> 0;
        capacity = grown;
        // Fetch the memory view again after realloc instead of reusing `mem`.
        const view = getUint8ArrayMemory0().subarray(ptr + written, ptr + capacity);
        const ret = cachedTextEncoder.encodeInto(rest, view);

        written += ret.written;
        ptr = realloc(ptr, capacity, written, 1) >>> 0;
    }

    WASM_VECTOR_LEN = written;
    return ptr;
}
80
+
81
// Cached DataView over the wasm linear memory; (re)created on demand below.
let cachedDataViewMemory0 = null;

/**
 * Returns a `DataView` over `wasm.memory.buffer`.
 *
 * The cached view is replaced when there is none yet, when its buffer reports
 * `detached === true`, or — where the runtime does not expose `detached` —
 * when its buffer is no longer the current `wasm.memory.buffer`.
 *
 * @returns {DataView}
 */
function getDataViewMemory0() {
    let stale = cachedDataViewMemory0 === null;
    if (!stale) {
        const backing = cachedDataViewMemory0.buffer;
        stale = backing.detached === true
            || (backing.detached === undefined && backing !== wasm.memory.buffer);
    }
    if (stale) {
        cachedDataViewMemory0 = new DataView(wasm.memory.buffer);
    }
    return cachedDataViewMemory0;
}
89
+
90
/**
 * Stores a JavaScript value in the wasm externref table.
 *
 * Asks wasm for a free slot via `__externref_table_alloc`, writes `obj` into
 * that slot of `wasm.__wbindgen_export_4`, and returns the slot index.
 *
 * @param {any} obj - The value to store.
 * @returns {number} Index of the slot that now holds `obj`.
 */
function addToExternrefTable0(obj) {
    const slot = wasm.__externref_table_alloc();
    const table = wasm.__wbindgen_export_4;
    table.set(slot, obj);
    return slot;
}
95
+
96
/**
 * Calls `f` with `args` and converts a thrown exception into a stored one.
 *
 * On success the return value of `f` is passed through. If `f` throws, the
 * exception is placed in the externref table, its index is handed to
 * `wasm.__wbindgen_exn_store`, and `undefined` is returned.
 *
 * @param {Function} f - Function to invoke.
 * @param {ArrayLike<any>} args - Arguments forwarded to `f`.
 * @returns {any} Result of `f`, or `undefined` when it threw.
 */
function handleError(f, args) {
    let result;
    try {
        result = f.apply(this, args);
    } catch (thrown) {
        const slot = addToExternrefTable0(thrown);
        wasm.__wbindgen_exn_store(slot);
        return undefined;
    }
    return result;
}
104
+
105
/**
 * Returns a view of `len` bytes of wasm memory starting at `ptr`.
 *
 * The result is a `subarray` of the shared memory view, not a copy.
 *
 * @param {number} ptr - Pointer from wasm; coerced to unsigned 32-bit.
 * @param {number} len - Number of bytes.
 * @returns {Uint8Array}
 */
function getArrayU8FromWasm0(ptr, len) {
    const start = ptr >>> 0;
    return getUint8ArrayMemory0().subarray(start, start + len);
}
109
+
110
/**
 * Tells whether a value is `null` or `undefined`.
 *
 * @param {any} x
 * @returns {boolean}
 */
function isLikeNone(x) {
    if (x === null) return true;
    return x === undefined;
}
113
+
114
/**
 * Produces a short, human-readable description of any JavaScript value.
 *
 * - numbers, booleans, `null`, `undefined`: their string form
 * - strings: wrapped in double quotes
 * - symbols: `Symbol` or `Symbol(description)`
 * - functions: `Function` or `Function(name)`
 * - arrays: `[a, b, ...]` with each element described recursively
 * - plain objects / user classes: `Object(<JSON>)`, or `Object` when
 *   `JSON.stringify` throws (e.g. on cycles)
 * - errors: `name: message` followed by the stack on a new line
 * - anything else: the built-in class name from `Object.prototype.toString`
 *
 * @param {any} val - Value to describe.
 * @returns {string}
 */
function debugString(val) {
    const type = typeof val;

    // `val == null` is the one intentional loose comparison: it matches both
    // null and undefined.
    if (type === 'number' || type === 'boolean' || val == null) {
        return `${val}`;
    }
    if (type === 'string') {
        return `"${val}"`;
    }
    if (type === 'symbol') {
        const description = val.description;
        return description == null ? 'Symbol' : `Symbol(${description})`;
    }
    if (type === 'function') {
        const name = val.name;
        const hasName = typeof name === 'string' && name.length > 0;
        return hasName ? `Function(${name})` : 'Function';
    }

    if (Array.isArray(val)) {
        // Array.from visits every index, so holes are described as `undefined`.
        const parts = Array.from(val, (item) => debugString(item));
        return `[${parts.join(', ')}]`;
    }

    // Use the built-in tag "[object ClassName]" to classify everything else.
    // Object.prototype.toString is referenced explicitly rather than through
    // a bare global `toString`.
    const tag = Object.prototype.toString.call(val);
    const builtInMatches = /\[object ([^\]]+)\]/.exec(tag);
    if (!builtInMatches || builtInMatches.length <= 1) {
        // Failed to match the standard '[object ClassName]'.
        return tag;
    }
    const className = builtInMatches[1];

    if (className === 'Object') {
        // Plain object or user-defined class instance. JSON.stringify is much
        // simpler than walking own properties; it throws on cycles, in which
        // case only the class name is reported.
        try {
            return 'Object(' + JSON.stringify(val) + ')';
        } catch (_) {
            return 'Object';
        }
    }
    if (val instanceof Error) {
        return `${val.name}: ${val.message}\n${val.stack}`;
    }
    // TODO we could test for more things here, like `Set`s and `Map`s.
    return className;
}
178
+ /**
179
+ * Gets the version of the lindera-wasm library.
180
+ *
181
+ * # Returns
182
+ *
183
+ * The version string of the library (e.g., "1.0.0").
184
+ *
185
+ * # Examples
186
+ *
187
+ * ```javascript
188
+ * import { getVersion } from 'lindera-wasm';
189
+ * console.log(getVersion()); // "1.0.0"
190
+ * ```
191
+ * @returns {string}
192
+ */
193
+ exports.getVersion = function() {
194
+ let deferred1_0;
195
+ let deferred1_1;
196
+ try {
197
+ const ret = wasm.getVersion();
198
+ deferred1_0 = ret[0];
199
+ deferred1_1 = ret[1];
200
+ return getStringFromWasm0(ret[0], ret[1]);
201
+ } finally {
202
+ wasm.__wbindgen_free(deferred1_0, deferred1_1, 1);
203
+ }
204
+ };
205
+
206
/**
 * Removes and returns the value stored at `idx` in the wasm externref table.
 *
 * The value is read from `wasm.__wbindgen_export_4` first; the slot is then
 * released with `__externref_table_dealloc`.
 *
 * @param {number} idx - Slot index.
 * @returns {any} The value that was stored in the slot.
 */
function takeFromExternrefTable0(idx) {
    const table = wasm.__wbindgen_export_4;
    const taken = table.get(idx);
    wasm.__externref_table_dealloc(idx);
    return taken;
}
211
+
212
// Registry whose callback frees the wasm-side tokenizer for a registered
// wrapper. Runtimes without FinalizationRegistry get a no-op stand-in with the
// same register/unregister surface.
const TokenizerFinalization = (typeof FinalizationRegistry !== 'undefined')
    ? new FinalizationRegistry((ptr) => wasm.__wbg_tokenizer_free(ptr >>> 0, 1))
    : { register: () => {}, unregister: () => {} };

/**
 * Morphological-analysis tokenizer.
 *
 * Wraps a pointer (`__wbg_ptr`) to a tokenizer living in wasm memory.
 * Instances are produced by `TokenizerBuilder.build()` and tokenize text
 * according to the configuration that builder carried.
 *
 * @example
 * ```javascript
 * const builder = new TokenizerBuilder();
 * builder.setDictionary("embedded://ipadic");
 * builder.setMode("normal");
 *
 * const tokenizer = builder.build();
 * const tokens = tokenizer.tokenize("関西国際空港");
 * console.log(tokens);
 * // Output: [
 * //   { surface: "関西国際空港", ... },
 * //   ...
 * // ]
 * ```
 */
class Tokenizer {

    /**
     * Wraps a raw wasm pointer in a `Tokenizer` without running a constructor
     * and registers the wrapper for finalization.
     *
     * @param {number} ptr - Raw pointer; stored as an unsigned 32-bit value.
     * @returns {Tokenizer}
     */
    static __wrap(ptr) {
        const instance = Object.create(Tokenizer.prototype);
        instance.__wbg_ptr = ptr >>> 0;
        TokenizerFinalization.register(instance, instance.__wbg_ptr, instance);
        return instance;
    }

    /**
     * Detaches this wrapper from its wasm object: zeroes `__wbg_ptr`,
     * unregisters from finalization and returns the previous pointer.
     *
     * @returns {number}
     */
    __destroy_into_raw() {
        const rawPtr = this.__wbg_ptr;
        this.__wbg_ptr = 0;
        TokenizerFinalization.unregister(this);
        return rawPtr;
    }

    /** Releases the wasm-side tokenizer backing this wrapper. */
    free() {
        const rawPtr = this.__destroy_into_raw();
        wasm.__wbg_tokenizer_free(rawPtr, 0);
    }

    /**
     * Tokenizes the input text.
     *
     * Each returned token object carries information such as the surface
     * form, part-of-speech tags and reading. Field names are in camelCase.
     *
     * @example
     * ```javascript
     * const tokens = tokenizer.tokenize("東京都に行く");
     * tokens.forEach(token => {
     *   console.log(token.surface, token.pos);
     * });
     * ```
     * @param {string} input_text - The text to tokenize.
     * @returns {any} A JavaScript array of token objects, each containing
     *   `surface` (surface form), `pos` (part-of-speech tags) and additional
     *   language-specific fields.
     * @throws The error value reported by wasm if tokenization fails.
     */
    tokenize(input_text) {
        const textPtr = passStringToWasm0(input_text, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
        const textLen = WASM_VECTOR_LEN;
        // The wasm call yields [resultRef, errorRef, failedFlag].
        const outcome = wasm.tokenizer_tokenize(this.__wbg_ptr, textPtr, textLen);
        if (!outcome[2]) {
            return takeFromExternrefTable0(outcome[0]);
        }
        throw takeFromExternrefTable0(outcome[1]);
    }
}

// Where the runtime defines Symbol.dispose, make it an alias of free().
if (Symbol.dispose) {
    Tokenizer.prototype[Symbol.dispose] = Tokenizer.prototype.free;
}
302
+
303
+ exports.Tokenizer = Tokenizer;
304
+
305
// Registry whose callback frees the wasm-side builder for a registered
// wrapper. Runtimes without FinalizationRegistry get a no-op stand-in with the
// same register/unregister surface.
const TokenizerBuilderFinalization = (typeof FinalizationRegistry !== 'undefined')
    ? new FinalizationRegistry((ptr) => wasm.__wbg_tokenizerbuilder_free(ptr >>> 0, 1))
    : { register: () => {}, unregister: () => {} };

/**
 * Builder for creating a `Tokenizer` instance.
 *
 * Configures dictionary selection, tokenization mode, character filters and
 * token filters, then produces the tokenizer via `build()`.
 *
 * @example
 * ```javascript
 * const builder = new TokenizerBuilder();
 * builder.setDictionary("embedded://ipadic");
 * builder.setMode("normal");
 * builder.setKeepWhitespace(false);
 * builder.appendCharacterFilter("unicode_normalize", { "kind": "nfkc" });
 * builder.appendTokenFilter("lowercase");
 *
 * const tokenizer = builder.build();
 * ```
 */
class TokenizerBuilder {

    /**
     * Detaches this wrapper from its wasm object: zeroes `__wbg_ptr`,
     * unregisters from finalization and returns the previous pointer.
     *
     * @returns {number}
     */
    __destroy_into_raw() {
        const rawPtr = this.__wbg_ptr;
        this.__wbg_ptr = 0;
        TokenizerBuilderFinalization.unregister(this);
        return rawPtr;
    }

    /** Releases the wasm-side builder backing this wrapper. */
    free() {
        const rawPtr = this.__destroy_into_raw();
        wasm.__wbg_tokenizerbuilder_free(rawPtr, 0);
    }

    /**
     * Creates a new `TokenizerBuilder` instance.
     *
     * @example
     * ```javascript
     * const builder = new TokenizerBuilder();
     * ```
     * @throws The error value reported by wasm if the builder cannot be initialized.
     */
    constructor() {
        // The wasm call yields [pointer, errorRef, failedFlag].
        const outcome = wasm.tokenizerbuilder_new();
        if (outcome[2]) {
            throw takeFromExternrefTable0(outcome[1]);
        }
        this.__wbg_ptr = outcome[0] >>> 0;
        TokenizerBuilderFinalization.register(this, this.__wbg_ptr, this);
    }

    /**
     * Builds and returns a configured `Tokenizer` instance.
     *
     * This method consumes the builder: its pointer is detached before the
     * wasm call, so the builder must not be used afterwards.
     *
     * @example
     * ```javascript
     * const builder = new TokenizerBuilder();
     * builder.setDictionary("embedded://ipadic");
     * const tokenizer = builder.build();
     * ```
     * @returns {Tokenizer} A configured `Tokenizer` instance.
     * @throws The error value reported by wasm if the tokenizer cannot be
     *   built with the current configuration.
     */
    build() {
        const rawPtr = this.__destroy_into_raw();
        const outcome = wasm.tokenizerbuilder_build(rawPtr);
        if (!outcome[2]) {
            return Tokenizer.__wrap(outcome[0]);
        }
        throw takeFromExternrefTable0(outcome[1]);
    }

    /**
     * Sets the tokenization mode.
     *
     * @example
     * ```javascript
     * builder.setMode("normal");
     * // or
     * builder.setMode("decompose");
     * ```
     * @param {string} mode - `"normal"` (standard tokenization) or
     *   `"decompose"` (decomposes compound words into their components).
     * @throws The error value reported by wasm if the mode string is invalid.
     */
    setMode(mode) {
        const modePtr = passStringToWasm0(mode, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
        const modeLen = WASM_VECTOR_LEN;
        // Setter calls yield [errorRef, failedFlag].
        const status = wasm.tokenizerbuilder_setMode(this.__wbg_ptr, modePtr, modeLen);
        if (status[1]) {
            throw takeFromExternrefTable0(status[0]);
        }
    }

    /**
     * Sets the dictionary to use for tokenization.
     *
     * Valid embedded dictionaries are:
     *   - `"embedded://ipadic"`: Japanese IPADIC dictionary
     *   - `"embedded://unidic"`: Japanese UniDic dictionary
     *   - `"embedded://ko-dic"`: Korean ko-dic dictionary
     *   - `"embedded://cc-cedict"`: Chinese CC-CEDICT dictionary
     *
     * @example
     * ```javascript
     * builder.setDictionary("embedded://ipadic");
     * ```
     * @param {string} uri - The dictionary URI.
     * @throws The error value reported by wasm if the call fails.
     */
    setDictionary(uri) {
        const uriPtr = passStringToWasm0(uri, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
        const uriLen = WASM_VECTOR_LEN;
        const status = wasm.tokenizerbuilder_setDictionary(this.__wbg_ptr, uriPtr, uriLen);
        if (status[1]) {
            throw takeFromExternrefTable0(status[0]);
        }
    }

    /**
     * Sets a user-defined dictionary.
     *
     * User dictionaries add custom words and their properties to supplement
     * the main dictionary.
     *
     * @example
     * ```javascript
     * builder.setUserDictionary("path/to/user_dict.csv");
     * ```
     * @param {string} uri - The URI to the user dictionary file.
     * @throws The error value reported by wasm if the call fails.
     */
    setUserDictionary(uri) {
        const uriPtr = passStringToWasm0(uri, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
        const uriLen = WASM_VECTOR_LEN;
        const status = wasm.tokenizerbuilder_setUserDictionary(this.__wbg_ptr, uriPtr, uriLen);
        if (status[1]) {
            throw takeFromExternrefTable0(status[0]);
        }
    }

    /**
     * Sets whether to keep whitespace tokens in the output.
     *
     * @example
     * ```javascript
     * builder.setKeepWhitespace(false); // Remove whitespace tokens
     * // or
     * builder.setKeepWhitespace(true);  // Keep whitespace tokens
     * ```
     * @param {boolean} keep - If `true`, whitespace tokens are preserved; if
     *   `false`, they are removed.
     * @throws The error value reported by wasm if the call fails.
     */
    setKeepWhitespace(keep) {
        const status = wasm.tokenizerbuilder_setKeepWhitespace(this.__wbg_ptr, keep);
        if (status[1]) {
            throw takeFromExternrefTable0(status[0]);
        }
    }

    /**
     * Appends a character filter to the tokenization pipeline.
     *
     * Character filters transform the input text before tokenization.
     *
     * @example
     * ```javascript
     * builder.appendCharacterFilter("unicode_normalize", { "kind": "nfkc" });
     * ```
     * @param {string} name - The name of the character filter (e.g., `"unicode_normalize"`).
     * @param {any} args - A JavaScript object containing filter-specific arguments.
     * @throws The error value reported by wasm if the arguments cannot be parsed.
     */
    appendCharacterFilter(name, args) {
        const namePtr = passStringToWasm0(name, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
        const nameLen = WASM_VECTOR_LEN;
        const status = wasm.tokenizerbuilder_appendCharacterFilter(this.__wbg_ptr, namePtr, nameLen, args);
        if (status[1]) {
            throw takeFromExternrefTable0(status[0]);
        }
    }

    /**
     * Appends a token filter to the tokenization pipeline.
     *
     * Token filters transform or filter the tokens after tokenization.
     *
     * @example
     * ```javascript
     * builder.appendTokenFilter("lowercase");
     * builder.appendTokenFilter("japanese_number", { "tags": ["名詞,数"] });
     * ```
     * @param {string} name - The name of the token filter (e.g., `"lowercase"`, `"japanese_number"`).
     * @param {any} args - A JavaScript object containing filter-specific arguments.
     * @throws The error value reported by wasm if the arguments cannot be parsed.
     */
    appendTokenFilter(name, args) {
        const namePtr = passStringToWasm0(name, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
        const nameLen = WASM_VECTOR_LEN;
        const status = wasm.tokenizerbuilder_appendTokenFilter(this.__wbg_ptr, namePtr, nameLen, args);
        if (status[1]) {
            throw takeFromExternrefTable0(status[0]);
        }
    }
}

// Where the runtime defines Symbol.dispose, make it an alias of free().
if (Symbol.dispose) {
    TokenizerBuilder.prototype[Symbol.dispose] = TokenizerBuilder.prototype.free;
}
564
+
565
+ exports.TokenizerBuilder = TokenizerBuilder;
566
+
567
+ exports.__wbg_Error_e17e777aac105295 = function(arg0, arg1) {
568
+ const ret = Error(getStringFromWasm0(arg0, arg1));
569
+ return ret;
570
+ };
571
+
572
+ exports.__wbg_String_8f0eb39a4a4c2f66 = function(arg0, arg1) {
573
+ const ret = String(arg1);
574
+ const ptr1 = passStringToWasm0(ret, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
575
+ const len1 = WASM_VECTOR_LEN;
576
+ getDataViewMemory0().setInt32(arg0 + 4 * 1, len1, true);
577
+ getDataViewMemory0().setInt32(arg0 + 4 * 0, ptr1, true);
578
+ };
579
+
580
+ exports.__wbg_call_13410aac570ffff7 = function() { return handleError(function (arg0, arg1) {
581
+ const ret = arg0.call(arg1);
582
+ return ret;
583
+ }, arguments) };
584
+
585
+ exports.__wbg_done_75ed0ee6dd243d9d = function(arg0) {
586
+ const ret = arg0.done;
587
+ return ret;
588
+ };
589
+
590
+ exports.__wbg_entries_2be2f15bd5554996 = function(arg0) {
591
+ const ret = Object.entries(arg0);
592
+ return ret;
593
+ };
594
+
595
+ exports.__wbg_get_0da715ceaecea5c8 = function(arg0, arg1) {
596
+ const ret = arg0[arg1 >>> 0];
597
+ return ret;
598
+ };
599
+
600
+ exports.__wbg_get_458e874b43b18b25 = function() { return handleError(function (arg0, arg1) {
601
+ const ret = Reflect.get(arg0, arg1);
602
+ return ret;
603
+ }, arguments) };
604
+
605
+ exports.__wbg_instanceof_ArrayBuffer_67f3012529f6a2dd = function(arg0) {
606
+ let result;
607
+ try {
608
+ result = arg0 instanceof ArrayBuffer;
609
+ } catch (_) {
610
+ result = false;
611
+ }
612
+ const ret = result;
613
+ return ret;
614
+ };
615
+
616
+ exports.__wbg_instanceof_Map_ebb01a5b6b5ffd0b = function(arg0) {
617
+ let result;
618
+ try {
619
+ result = arg0 instanceof Map;
620
+ } catch (_) {
621
+ result = false;
622
+ }
623
+ const ret = result;
624
+ return ret;
625
+ };
626
+
627
+ exports.__wbg_instanceof_Uint8Array_9a8378d955933db7 = function(arg0) {
628
+ let result;
629
+ try {
630
+ result = arg0 instanceof Uint8Array;
631
+ } catch (_) {
632
+ result = false;
633
+ }
634
+ const ret = result;
635
+ return ret;
636
+ };
637
+
638
+ exports.__wbg_isArray_030cce220591fb41 = function(arg0) {
639
+ const ret = Array.isArray(arg0);
640
+ return ret;
641
+ };
642
+
643
+ exports.__wbg_isSafeInteger_1c0d1af5542e102a = function(arg0) {
644
+ const ret = Number.isSafeInteger(arg0);
645
+ return ret;
646
+ };
647
+
648
+ exports.__wbg_iterator_f370b34483c71a1c = function() {
649
+ const ret = Symbol.iterator;
650
+ return ret;
651
+ };
652
+
653
+ exports.__wbg_length_186546c51cd61acd = function(arg0) {
654
+ const ret = arg0.length;
655
+ return ret;
656
+ };
657
+
658
+ exports.__wbg_length_6bb7e81f9d7713e4 = function(arg0) {
659
+ const ret = arg0.length;
660
+ return ret;
661
+ };
662
+
663
+ exports.__wbg_new_19c25a3f2fa63a02 = function() {
664
+ const ret = new Object();
665
+ return ret;
666
+ };
667
+
668
+ exports.__wbg_new_1f3a344cf3123716 = function() {
669
+ const ret = new Array();
670
+ return ret;
671
+ };
672
+
673
+ exports.__wbg_new_638ebfaedbf32a5e = function(arg0) {
674
+ const ret = new Uint8Array(arg0);
675
+ return ret;
676
+ };
677
+
678
+ exports.__wbg_next_5b3530e612fde77d = function(arg0) {
679
+ const ret = arg0.next;
680
+ return ret;
681
+ };
682
+
683
+ exports.__wbg_next_692e82279131b03c = function() { return handleError(function (arg0) {
684
+ const ret = arg0.next();
685
+ return ret;
686
+ }, arguments) };
687
+
688
+ exports.__wbg_prototypesetcall_3d4a26c1ed734349 = function(arg0, arg1, arg2) {
689
+ Uint8Array.prototype.set.call(getArrayU8FromWasm0(arg0, arg1), arg2);
690
+ };
691
+
692
+ exports.__wbg_push_330b2eb93e4e1212 = function(arg0, arg1) {
693
+ const ret = arg0.push(arg1);
694
+ return ret;
695
+ };
696
+
697
+ exports.__wbg_set_453345bcda80b89a = function() { return handleError(function (arg0, arg1, arg2) {
698
+ const ret = Reflect.set(arg0, arg1, arg2);
699
+ return ret;
700
+ }, arguments) };
701
+
702
+ exports.__wbg_value_dd9372230531eade = function(arg0) {
703
+ const ret = arg0.value;
704
+ return ret;
705
+ };
706
+
707
+ exports.__wbg_wbindgenbigintgetasi64_ac743ece6ab9bba1 = function(arg0, arg1) {
708
+ const v = arg1;
709
+ const ret = typeof(v) === 'bigint' ? v : undefined;
710
+ getDataViewMemory0().setBigInt64(arg0 + 8 * 1, isLikeNone(ret) ? BigInt(0) : ret, true);
711
+ getDataViewMemory0().setInt32(arg0 + 4 * 0, !isLikeNone(ret), true);
712
+ };
713
+
714
+ exports.__wbg_wbindgenbooleanget_3fe6f642c7d97746 = function(arg0) {
715
+ const v = arg0;
716
+ const ret = typeof(v) === 'boolean' ? v : undefined;
717
+ return isLikeNone(ret) ? 0xFFFFFF : ret ? 1 : 0;
718
+ };
719
+
720
+ exports.__wbg_wbindgendebugstring_99ef257a3ddda34d = function(arg0, arg1) {
721
+ const ret = debugString(arg1);
722
+ const ptr1 = passStringToWasm0(ret, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
723
+ const len1 = WASM_VECTOR_LEN;
724
+ getDataViewMemory0().setInt32(arg0 + 4 * 1, len1, true);
725
+ getDataViewMemory0().setInt32(arg0 + 4 * 0, ptr1, true);
726
+ };
727
+
728
+ exports.__wbg_wbindgenin_d7a1ee10933d2d55 = function(arg0, arg1) {
729
+ const ret = arg0 in arg1;
730
+ return ret;
731
+ };
732
+
733
+ exports.__wbg_wbindgenisbigint_ecb90cc08a5a9154 = function(arg0) {
734
+ const ret = typeof(arg0) === 'bigint';
735
+ return ret;
736
+ };
737
+
738
+ exports.__wbg_wbindgenisfunction_8cee7dce3725ae74 = function(arg0) {
739
+ const ret = typeof(arg0) === 'function';
740
+ return ret;
741
+ };
742
+
743
+ exports.__wbg_wbindgenisobject_307a53c6bd97fbf8 = function(arg0) {
744
+ const val = arg0;
745
+ const ret = typeof(val) === 'object' && val !== null;
746
+ return ret;
747
+ };
748
+
749
+ exports.__wbg_wbindgenjsvaleq_e6f2ad59ccae1b58 = function(arg0, arg1) {
750
+ const ret = arg0 === arg1;
751
+ return ret;
752
+ };
753
+
754
+ exports.__wbg_wbindgenjsvallooseeq_9bec8c9be826bed1 = function(arg0, arg1) {
755
+ const ret = arg0 == arg1;
756
+ return ret;
757
+ };
758
+
759
+ exports.__wbg_wbindgennumberget_f74b4c7525ac05cb = function(arg0, arg1) {
760
+ const obj = arg1;
761
+ const ret = typeof(obj) === 'number' ? obj : undefined;
762
+ getDataViewMemory0().setFloat64(arg0 + 8 * 1, isLikeNone(ret) ? 0 : ret, true);
763
+ getDataViewMemory0().setInt32(arg0 + 4 * 0, !isLikeNone(ret), true);
764
+ };
765
+
766
+ exports.__wbg_wbindgenstringget_0f16a6ddddef376f = function(arg0, arg1) {
767
+ const obj = arg1;
768
+ const ret = typeof(obj) === 'string' ? obj : undefined;
769
+ var ptr1 = isLikeNone(ret) ? 0 : passStringToWasm0(ret, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
770
+ var len1 = WASM_VECTOR_LEN;
771
+ getDataViewMemory0().setInt32(arg0 + 4 * 1, len1, true);
772
+ getDataViewMemory0().setInt32(arg0 + 4 * 0, ptr1, true);
773
+ };
774
+
775
+ exports.__wbg_wbindgenthrow_451ec1a8469d7eb6 = function(arg0, arg1) {
776
+ throw new Error(getStringFromWasm0(arg0, arg1));
777
+ };
778
+
779
+ exports.__wbindgen_cast_2241b6af4c4b2941 = function(arg0, arg1) {
780
+ // Cast intrinsic for `Ref(String) -> Externref`.
781
+ const ret = getStringFromWasm0(arg0, arg1);
782
+ return ret;
783
+ };
784
+
785
+ exports.__wbindgen_cast_4625c577ab2ec9ee = function(arg0) {
786
+ // Cast intrinsic for `U64 -> Externref`.
787
+ const ret = BigInt.asUintN(64, arg0);
788
+ return ret;
789
+ };
790
+
791
+ exports.__wbindgen_cast_9ae0607507abb057 = function(arg0) {
792
+ // Cast intrinsic for `I64 -> Externref`.
793
+ const ret = arg0;
794
+ return ret;
795
+ };
796
+
797
+ exports.__wbindgen_cast_d6cd19b81560fd6e = function(arg0) {
798
+ // Cast intrinsic for `F64 -> Externref`.
799
+ const ret = arg0;
800
+ return ret;
801
+ };
802
+
803
+ exports.__wbindgen_init_externref_table = function() {
804
+ const table = wasm.__wbindgen_export_4;
805
+ const offset = table.grow(4);
806
+ table.set(0, undefined);
807
+ table.set(offset + 0, undefined);
808
+ table.set(offset + 1, null);
809
+ table.set(offset + 2, true);
810
+ table.set(offset + 3, false);
811
+ ;
812
+ };
813
+
814
// Module bootstrap: load, compile and instantiate the wasm binary
// synchronously at require time, then run its start export.
// The .wasm file is resolved next to this script via `__dirname`.
+ const wasmPath = `${__dirname}/lindera_wasm_bg.wasm`;
815
// Blocking read of the whole binary.
+ const wasmBytes = require('fs').readFileSync(wasmPath);
816
// Synchronous compilation of the bytes into a WebAssembly.Module.
+ const wasmModule = new WebAssembly.Module(wasmBytes);
817
// Instantiate with `imports` (defined outside this block); the instance's
// exports are kept in `wasm` and also published as `exports.__wasm`.
+ const wasm = exports.__wasm = new WebAssembly.Instance(wasmModule, imports).exports;
818
+
819
// Invoke the instance's `__wbindgen_start` export once, after `wasm` is assigned.
+ wasm.__wbindgen_start();
820
+
Binary file
package/package.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "name": "lindera-wasm-nodejs-cc-cedict",
3
+ "description": "Lindera WASM with Chinese dictionary (CC-CEDICT) (nodejs target)",
4
+ "version": "1.2.1",
5
+ "license": "MIT",
6
+ "repository": {
7
+ "type": "git",
8
+ "url": "https://github.com/lindera/lindera-wasm"
9
+ },
10
+ "files": [
11
+ "lindera_wasm_bg.wasm",
12
+ "lindera_wasm.js",
13
+ "lindera_wasm.d.ts"
14
+ ],
15
+ "main": "lindera_wasm.js",
16
+ "homepage": "https://github.com/lindera/lindera-wasm",
17
+ "types": "lindera_wasm.d.ts",
18
+ "keywords": [
19
+ "morphological",
20
+ "analysis",
21
+ "library",
22
+ "wasm",
23
+ "webassembly"
24
+ ]
25
+ }