lindera-wasm-web-ipadic 1.2.1 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -11
- package/lindera_wasm.d.ts +97 -135
- package/lindera_wasm.js +324 -347
- package/lindera_wasm_bg.wasm +0 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -12,36 +12,36 @@ WebAssembly of Lindera
|
|
|
12
12
|
|
|
13
13
|
### Web
|
|
14
14
|
|
|
15
|
-
- <https://www.npmjs.com/package/lindera-wasm-web-cjk>
|
|
15
|
+
- <https://www.npmjs.com/package/lindera-wasm-web-cjk>
|
|
16
16
|
Lindera WASM with CJK dictionaries (IPADIC, ko-dic, CC-CEDICT) for Web
|
|
17
17
|
|
|
18
|
-
- <https://www.npmjs.com/package/lindera-wasm-web-ipadic>
|
|
18
|
+
- <https://www.npmjs.com/package/lindera-wasm-web-ipadic>
|
|
19
19
|
Lindera WASM with Japanese dictionary (IPADIC) for Web
|
|
20
20
|
|
|
21
|
-
- <https://www.npmjs.com/package/lindera-wasm-web-unidic>
|
|
21
|
+
- <https://www.npmjs.com/package/lindera-wasm-web-unidic>
|
|
22
22
|
Lindera WASM with Japanese dictionary (UniDic) for Web
|
|
23
23
|
|
|
24
|
-
- <https://www.npmjs.com/package/lindera-wasm-web-ko-dic>
|
|
24
|
+
- <https://www.npmjs.com/package/lindera-wasm-web-ko-dic>
|
|
25
25
|
Lindera WASM with Korean dictionary (ko-dic) for Web
|
|
26
26
|
|
|
27
|
-
- <https://www.npmjs.com/package/lindera-wasm-web-cc-cedict>
|
|
27
|
+
- <https://www.npmjs.com/package/lindera-wasm-web-cc-cedict>
|
|
28
28
|
Lindera WASM with Chinese dictionary (CC-CEDICT) for Web
|
|
29
29
|
|
|
30
30
|
### Node.js
|
|
31
31
|
|
|
32
|
-
- <https://www.npmjs.com/package/lindera-wasm-nodejs-cjk>
|
|
32
|
+
- <https://www.npmjs.com/package/lindera-wasm-nodejs-cjk>
|
|
33
33
|
Lindera WASM with CJK dictionaries (IPADIC, ko-dic, CC-CEDICT) for Node.js
|
|
34
34
|
|
|
35
|
-
- <https://www.npmjs.com/package/lindera-wasm-nodejs-ipadic>
|
|
35
|
+
- <https://www.npmjs.com/package/lindera-wasm-nodejs-ipadic>
|
|
36
36
|
Lindera WASM with Japanese dictionary (IPADIC) for Node.js
|
|
37
37
|
|
|
38
|
-
- <https://www.npmjs.com/package/lindera-wasm-nodejs-unidic>
|
|
38
|
+
- <https://www.npmjs.com/package/lindera-wasm-nodejs-unidic>
|
|
39
39
|
Lindera WASM with Japanese dictionary (UniDic) for Node.js
|
|
40
40
|
|
|
41
|
-
- <https://www.npmjs.com/package/lindera-wasm-nodejs-ko-dic>
|
|
41
|
+
- <https://www.npmjs.com/package/lindera-wasm-nodejs-ko-dic>
|
|
42
42
|
Lindera WASM with Korean dictionary (ko-dic) for Node.js
|
|
43
43
|
|
|
44
|
-
- <https://www.npmjs.com/package/lindera-wasm-nodejs-cc-cedict>
|
|
44
|
+
- <https://www.npmjs.com/package/lindera-wasm-nodejs-cc-cedict>
|
|
45
45
|
Lindera WASM with Chinese dictionary (CC-CEDICT) for Node.js
|
|
46
46
|
|
|
47
47
|
## Usage
|
|
@@ -96,7 +96,6 @@ export default defineConfig({
|
|
|
96
96
|
and set the `content_security_policy` to contain `wasm-unsafe-eval` in manifest.json:
|
|
97
97
|
|
|
98
98
|
```json
|
|
99
|
-
// manifest.json
|
|
100
99
|
"content_security_policy": {
|
|
101
100
|
"extension_pages": "script-src 'self' 'wasm-unsafe-eval';"
|
|
102
101
|
}
|
package/lindera_wasm.d.ts
CHANGED
|
@@ -1,42 +1,6 @@
|
|
|
1
1
|
/* tslint:disable */
|
|
2
2
|
/* eslint-disable */
|
|
3
|
-
|
|
4
|
-
* Gets the version of the lindera-wasm library.
|
|
5
|
-
*
|
|
6
|
-
* # Returns
|
|
7
|
-
*
|
|
8
|
-
* The version string of the library (e.g., "1.0.0").
|
|
9
|
-
*
|
|
10
|
-
* # Examples
|
|
11
|
-
*
|
|
12
|
-
* ```javascript
|
|
13
|
-
* import { getVersion } from 'lindera-wasm';
|
|
14
|
-
* console.log(getVersion()); // "1.0.0"
|
|
15
|
-
* ```
|
|
16
|
-
*/
|
|
17
|
-
export function getVersion(): string;
|
|
18
|
-
/**
|
|
19
|
-
* A tokenizer for morphological analysis.
|
|
20
|
-
*
|
|
21
|
-
* The `Tokenizer` performs text tokenization based on the configuration
|
|
22
|
-
* provided by [`TokenizerBuilder`].
|
|
23
|
-
*
|
|
24
|
-
* # Examples
|
|
25
|
-
*
|
|
26
|
-
* ```javascript
|
|
27
|
-
* const builder = new TokenizerBuilder();
|
|
28
|
-
* builder.setDictionary("embedded://ipadic");
|
|
29
|
-
* builder.setMode("normal");
|
|
30
|
-
*
|
|
31
|
-
* const tokenizer = builder.build();
|
|
32
|
-
* const tokens = tokenizer.tokenize("関西国際空港");
|
|
33
|
-
* console.log(tokens);
|
|
34
|
-
* // Output: [
|
|
35
|
-
* // { surface: "関西国際空港", ... },
|
|
36
|
-
* // ...
|
|
37
|
-
* // ]
|
|
38
|
-
* ```
|
|
39
|
-
*/
|
|
3
|
+
|
|
40
4
|
export class Tokenizer {
|
|
41
5
|
private constructor();
|
|
42
6
|
free(): void;
|
|
@@ -74,215 +38,213 @@ export class Tokenizer {
|
|
|
74
38
|
*/
|
|
75
39
|
tokenize(input_text: string): any;
|
|
76
40
|
}
|
|
77
|
-
|
|
78
|
-
* Builder for creating a [`Tokenizer`] instance.
|
|
79
|
-
*
|
|
80
|
-
* `TokenizerBuilder` provides a fluent API for configuring and building a tokenizer
|
|
81
|
-
* with various options such as dictionary selection, tokenization mode, character filters,
|
|
82
|
-
* and token filters.
|
|
83
|
-
*
|
|
84
|
-
* # Examples
|
|
85
|
-
*
|
|
86
|
-
* ```javascript
|
|
87
|
-
* const builder = new TokenizerBuilder();
|
|
88
|
-
* builder.setDictionary("embedded://ipadic");
|
|
89
|
-
* builder.setMode("normal");
|
|
90
|
-
* builder.setKeepWhitespace(false);
|
|
91
|
-
* builder.appendCharacterFilter("unicode_normalize", { "kind": "nfkc" });
|
|
92
|
-
* builder.appendTokenFilter("lowercase");
|
|
93
|
-
*
|
|
94
|
-
* const tokenizer = builder.build();
|
|
95
|
-
* ```
|
|
96
|
-
*/
|
|
41
|
+
|
|
97
42
|
export class TokenizerBuilder {
|
|
98
43
|
free(): void;
|
|
99
44
|
[Symbol.dispose](): void;
|
|
100
45
|
/**
|
|
101
|
-
*
|
|
102
|
-
*
|
|
103
|
-
* # Returns
|
|
104
|
-
*
|
|
105
|
-
* A new `TokenizerBuilder` instance.
|
|
46
|
+
* Sets the dictionary to use for tokenization.
|
|
106
47
|
*
|
|
107
|
-
* #
|
|
48
|
+
* # Parameters
|
|
108
49
|
*
|
|
109
|
-
*
|
|
50
|
+
* - `uri`: The dictionary URI. Valid embedded dictionaries are:
|
|
51
|
+
* - `"embedded://ipadic"`: Japanese IPADIC dictionary
|
|
52
|
+
* - `"embedded://unidic"`: Japanese UniDic dictionary
|
|
53
|
+
* - `"embedded://ko-dic"`: Korean ko-dic dictionary
|
|
54
|
+
* - `"embedded://cc-cedict"`: Chinese CC-CEDICT dictionary
|
|
110
55
|
*
|
|
111
56
|
* # Examples
|
|
112
57
|
*
|
|
113
58
|
* ```javascript
|
|
114
|
-
*
|
|
59
|
+
* builder.setDictionary("embedded://ipadic");
|
|
115
60
|
* ```
|
|
116
61
|
*/
|
|
117
|
-
|
|
62
|
+
setDictionary(uri: string): void;
|
|
118
63
|
/**
|
|
119
|
-
*
|
|
64
|
+
* Appends a token filter to the tokenization pipeline.
|
|
120
65
|
*
|
|
121
|
-
*
|
|
122
|
-
* configured settings.
|
|
66
|
+
* Token filters transform or filter the tokens after tokenization.
|
|
123
67
|
*
|
|
124
|
-
* #
|
|
68
|
+
* # Parameters
|
|
125
69
|
*
|
|
126
|
-
*
|
|
70
|
+
* - `name`: The name of the token filter (e.g., `"lowercase"`, `"japanese_number"`).
|
|
71
|
+
* - `args`: A JavaScript object containing filter-specific arguments.
|
|
127
72
|
*
|
|
128
73
|
* # Errors
|
|
129
74
|
*
|
|
130
|
-
* Returns an error if the
|
|
75
|
+
* Returns an error if the arguments cannot be parsed.
|
|
131
76
|
*
|
|
132
77
|
* # Examples
|
|
133
78
|
*
|
|
134
79
|
* ```javascript
|
|
135
|
-
*
|
|
136
|
-
* builder.
|
|
137
|
-
* const tokenizer = builder.build();
|
|
80
|
+
* builder.appendTokenFilter("lowercase");
|
|
81
|
+
* builder.appendTokenFilter("japanese_number", { "tags": ["名詞,数"] });
|
|
138
82
|
* ```
|
|
139
83
|
*/
|
|
140
|
-
|
|
84
|
+
appendTokenFilter(name: string, args: any): void;
|
|
141
85
|
/**
|
|
142
|
-
* Sets the
|
|
86
|
+
* Sets whether to keep whitespace tokens in the output.
|
|
143
87
|
*
|
|
144
88
|
* # Parameters
|
|
145
89
|
*
|
|
146
|
-
* - `
|
|
147
|
-
* - `"normal"`: Standard tokenization
|
|
148
|
-
* - `"decompose"`: Decomposes compound words into their components
|
|
149
|
-
*
|
|
150
|
-
* # Errors
|
|
151
|
-
*
|
|
152
|
-
* Returns an error if the mode string is invalid.
|
|
90
|
+
* - `keep`: If `true`, whitespace tokens are preserved; if `false`, they are removed.
|
|
153
91
|
*
|
|
154
92
|
* # Examples
|
|
155
93
|
*
|
|
156
94
|
* ```javascript
|
|
157
|
-
* builder.
|
|
95
|
+
* builder.setKeepWhitespace(false); // Remove whitespace tokens
|
|
158
96
|
* // or
|
|
159
|
-
* builder.
|
|
97
|
+
* builder.setKeepWhitespace(true); // Keep whitespace tokens
|
|
160
98
|
* ```
|
|
161
99
|
*/
|
|
162
|
-
|
|
100
|
+
setKeepWhitespace(keep: boolean): void;
|
|
163
101
|
/**
|
|
164
|
-
* Sets
|
|
102
|
+
* Sets a user-defined dictionary.
|
|
103
|
+
*
|
|
104
|
+
* User dictionaries allow you to add custom words and their properties
|
|
105
|
+
* to supplement the main dictionary.
|
|
165
106
|
*
|
|
166
107
|
* # Parameters
|
|
167
108
|
*
|
|
168
|
-
* - `uri`: The
|
|
169
|
-
* - `"embedded://ipadic"`: Japanese IPADIC dictionary
|
|
170
|
-
* - `"embedded://unidic"`: Japanese UniDic dictionary
|
|
171
|
-
* - `"embedded://ko-dic"`: Korean ko-dic dictionary
|
|
172
|
-
* - `"embedded://cc-cedict"`: Chinese CC-CEDICT dictionary
|
|
109
|
+
* - `uri`: The URI to the user dictionary file.
|
|
173
110
|
*
|
|
174
111
|
* # Examples
|
|
175
112
|
*
|
|
176
113
|
* ```javascript
|
|
177
|
-
* builder.
|
|
114
|
+
* builder.setUserDictionary("path/to/user_dict.csv");
|
|
178
115
|
* ```
|
|
179
116
|
*/
|
|
180
|
-
|
|
117
|
+
setUserDictionary(uri: string): void;
|
|
181
118
|
/**
|
|
182
|
-
*
|
|
119
|
+
* Appends a character filter to the tokenization pipeline.
|
|
183
120
|
*
|
|
184
|
-
*
|
|
185
|
-
* to supplement the main dictionary.
|
|
121
|
+
* Character filters transform the input text before tokenization.
|
|
186
122
|
*
|
|
187
123
|
* # Parameters
|
|
188
124
|
*
|
|
189
|
-
* - `
|
|
125
|
+
* - `name`: The name of the character filter (e.g., `"unicode_normalize"`).
|
|
126
|
+
* - `args`: A JavaScript object containing filter-specific arguments.
|
|
127
|
+
*
|
|
128
|
+
* # Errors
|
|
129
|
+
*
|
|
130
|
+
* Returns an error if the arguments cannot be parsed.
|
|
190
131
|
*
|
|
191
132
|
* # Examples
|
|
192
133
|
*
|
|
193
134
|
* ```javascript
|
|
194
|
-
* builder.
|
|
135
|
+
* builder.appendCharacterFilter("unicode_normalize", { "kind": "nfkc" });
|
|
195
136
|
* ```
|
|
196
137
|
*/
|
|
197
|
-
|
|
138
|
+
appendCharacterFilter(name: string, args: any): void;
|
|
198
139
|
/**
|
|
199
|
-
*
|
|
140
|
+
* Creates a new `TokenizerBuilder` instance.
|
|
200
141
|
*
|
|
201
|
-
* #
|
|
142
|
+
* # Returns
|
|
202
143
|
*
|
|
203
|
-
*
|
|
144
|
+
* A new `TokenizerBuilder` instance.
|
|
145
|
+
*
|
|
146
|
+
* # Errors
|
|
147
|
+
*
|
|
148
|
+
* Returns an error if the builder cannot be initialized.
|
|
204
149
|
*
|
|
205
150
|
* # Examples
|
|
206
151
|
*
|
|
207
152
|
* ```javascript
|
|
208
|
-
* builder
|
|
209
|
-
* // or
|
|
210
|
-
* builder.setKeepWhitespace(true); // Keep whitespace tokens
|
|
153
|
+
* const builder = new TokenizerBuilder();
|
|
211
154
|
* ```
|
|
212
155
|
*/
|
|
213
|
-
|
|
156
|
+
constructor();
|
|
214
157
|
/**
|
|
215
|
-
*
|
|
158
|
+
* Builds and returns a configured [`Tokenizer`] instance.
|
|
216
159
|
*
|
|
217
|
-
*
|
|
160
|
+
* This method consumes the builder and creates the final tokenizer with all
|
|
161
|
+
* configured settings.
|
|
218
162
|
*
|
|
219
|
-
* #
|
|
163
|
+
* # Returns
|
|
220
164
|
*
|
|
221
|
-
*
|
|
222
|
-
* - `args`: A JavaScript object containing filter-specific arguments.
|
|
165
|
+
* A configured `Tokenizer` instance.
|
|
223
166
|
*
|
|
224
167
|
* # Errors
|
|
225
168
|
*
|
|
226
|
-
* Returns an error if the
|
|
169
|
+
* Returns an error if the tokenizer cannot be built with the current configuration.
|
|
227
170
|
*
|
|
228
171
|
* # Examples
|
|
229
172
|
*
|
|
230
173
|
* ```javascript
|
|
231
|
-
* builder
|
|
174
|
+
* const builder = new TokenizerBuilder();
|
|
175
|
+
* builder.setDictionary("embedded://ipadic");
|
|
176
|
+
* const tokenizer = builder.build();
|
|
232
177
|
* ```
|
|
233
178
|
*/
|
|
234
|
-
|
|
179
|
+
build(): Tokenizer;
|
|
235
180
|
/**
|
|
236
|
-
*
|
|
237
|
-
*
|
|
238
|
-
* Token filters transform or filter the tokens after tokenization.
|
|
181
|
+
* Sets the tokenization mode.
|
|
239
182
|
*
|
|
240
183
|
* # Parameters
|
|
241
184
|
*
|
|
242
|
-
* - `
|
|
243
|
-
*
|
|
185
|
+
* - `mode`: The tokenization mode. Valid values are:
|
|
186
|
+
* - `"normal"`: Standard tokenization
|
|
187
|
+
* - `"decompose"`: Decomposes compound words into their components
|
|
244
188
|
*
|
|
245
189
|
* # Errors
|
|
246
190
|
*
|
|
247
|
-
* Returns an error if the
|
|
191
|
+
* Returns an error if the mode string is invalid.
|
|
248
192
|
*
|
|
249
193
|
* # Examples
|
|
250
194
|
*
|
|
251
195
|
* ```javascript
|
|
252
|
-
* builder.
|
|
253
|
-
*
|
|
196
|
+
* builder.setMode("normal");
|
|
197
|
+
* // or
|
|
198
|
+
* builder.setMode("decompose");
|
|
254
199
|
* ```
|
|
255
200
|
*/
|
|
256
|
-
|
|
201
|
+
setMode(mode: string): void;
|
|
257
202
|
}
|
|
258
203
|
|
|
204
|
+
/**
|
|
205
|
+
* Gets the version of the lindera-wasm library.
|
|
206
|
+
*
|
|
207
|
+
* # Returns
|
|
208
|
+
*
|
|
209
|
+
* The version string of the library (e.g., "1.0.0").
|
|
210
|
+
*
|
|
211
|
+
* # Examples
|
|
212
|
+
*
|
|
213
|
+
* ```javascript
|
|
214
|
+
* import { getVersion } from 'lindera-wasm';
|
|
215
|
+
* console.log(getVersion()); // "1.0.0"
|
|
216
|
+
* ```
|
|
217
|
+
*/
|
|
218
|
+
export function getVersion(): string;
|
|
219
|
+
|
|
259
220
|
export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module;
|
|
260
221
|
|
|
261
222
|
export interface InitOutput {
|
|
262
223
|
readonly memory: WebAssembly.Memory;
|
|
263
|
-
readonly
|
|
224
|
+
readonly __wbg_tokenizer_free: (a: number, b: number) => void;
|
|
264
225
|
readonly __wbg_tokenizerbuilder_free: (a: number, b: number) => void;
|
|
265
|
-
readonly
|
|
226
|
+
readonly getVersion: () => [number, number];
|
|
227
|
+
readonly tokenizer_tokenize: (a: number, b: number, c: number) => [number, number, number];
|
|
228
|
+
readonly tokenizerbuilder_appendCharacterFilter: (a: number, b: number, c: number, d: any) => [number, number];
|
|
229
|
+
readonly tokenizerbuilder_appendTokenFilter: (a: number, b: number, c: number, d: any) => [number, number];
|
|
266
230
|
readonly tokenizerbuilder_build: (a: number) => [number, number, number];
|
|
267
|
-
readonly
|
|
231
|
+
readonly tokenizerbuilder_new: () => [number, number, number];
|
|
268
232
|
readonly tokenizerbuilder_setDictionary: (a: number, b: number, c: number) => [number, number];
|
|
269
|
-
readonly tokenizerbuilder_setUserDictionary: (a: number, b: number, c: number) => [number, number];
|
|
270
233
|
readonly tokenizerbuilder_setKeepWhitespace: (a: number, b: number) => [number, number];
|
|
271
|
-
readonly
|
|
272
|
-
readonly
|
|
273
|
-
readonly __wbg_tokenizer_free: (a: number, b: number) => void;
|
|
274
|
-
readonly tokenizer_tokenize: (a: number, b: number, c: number) => [number, number, number];
|
|
234
|
+
readonly tokenizerbuilder_setMode: (a: number, b: number, c: number) => [number, number];
|
|
235
|
+
readonly tokenizerbuilder_setUserDictionary: (a: number, b: number, c: number) => [number, number];
|
|
275
236
|
readonly __wbindgen_malloc: (a: number, b: number) => number;
|
|
276
237
|
readonly __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
|
|
277
238
|
readonly __wbindgen_exn_store: (a: number) => void;
|
|
278
239
|
readonly __externref_table_alloc: () => number;
|
|
279
|
-
readonly
|
|
280
|
-
readonly __wbindgen_free: (a: number, b: number, c: number) => void;
|
|
240
|
+
readonly __wbindgen_externrefs: WebAssembly.Table;
|
|
281
241
|
readonly __externref_table_dealloc: (a: number) => void;
|
|
242
|
+
readonly __wbindgen_free: (a: number, b: number, c: number) => void;
|
|
282
243
|
readonly __wbindgen_start: () => void;
|
|
283
244
|
}
|
|
284
245
|
|
|
285
246
|
export type SyncInitInput = BufferSource | WebAssembly.Module;
|
|
247
|
+
|
|
286
248
|
/**
|
|
287
249
|
* Instantiates the given `module`, which can either be bytes or
|
|
288
250
|
* a precompiled `WebAssembly.Module`.
|