lindera-nodejs 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/README.md +308 -0
  2. package/package.json +63 -0
package/README.md ADDED
@@ -0,0 +1,308 @@
1
+ # lindera-nodejs
2
+
3
+ Node.js binding for [Lindera](https://github.com/lindera/lindera), a Japanese morphological analysis engine.
4
+
5
+ ## Overview
6
+
7
+ lindera-nodejs provides a comprehensive Node.js interface to the Lindera morphological analysis engine, supporting Japanese, Korean, and Chinese text analysis. This implementation includes all major features:
8
+
9
+ - **Multi-language Support**: Japanese (IPADIC, IPADIC-NEologd, UniDic), Korean (ko-dic), Chinese (CC-CEDICT, Jieba)
10
+ - **Character Filters**: Text preprocessing with mapping, regex, Unicode normalization, and Japanese iteration mark handling
11
+ - **Token Filters**: Post-processing filters including lowercase, length filtering, stop words, and Japanese-specific filters
12
+ - **Flexible Configuration**: Configurable tokenization modes and penalty settings
13
+ - **Metadata Support**: Complete dictionary schema and metadata management
14
+ - **TypeScript Support**: Full type definitions included out of the box
15
+
16
+ ## Features
17
+
18
+ ### Core Components
19
+
20
+ - **TokenizerBuilder**: Fluent API for building customized tokenizers
21
+ - **Tokenizer**: High-performance text tokenization with integrated filtering
22
+ - **CharacterFilter**: Pre-processing filters for text normalization
23
+ - **TokenFilter**: Post-processing filters for token refinement
24
+ - **Metadata & Schema**: Dictionary structure and configuration management
25
+ - **Training & Export** (optional): Train custom morphological analysis models from corpus data
26
+
27
+ ### Supported Dictionaries
28
+
29
+ - **Japanese**: IPADIC, IPADIC-NEologd, UniDic
30
+ - **Korean**: ko-dic
31
+ - **Chinese**: CC-CEDICT, Jieba
32
+ - **Custom**: User dictionary support
33
+
34
+ Pre-built dictionaries are available from [GitHub Releases](https://github.com/lindera/lindera/releases).
35
+ Download a dictionary archive (e.g. `lindera-ipadic-*.zip`) and specify the extracted path when loading.
36
+
37
+ ### Filter Types
38
+
39
+ **Character Filters:**
40
+
41
+ - Mapping filter (character replacement)
42
+ - Regex filter (pattern-based replacement)
43
+ - Unicode normalization (NFKC, etc.)
44
+ - Japanese iteration mark normalization
45
+
46
+ **Token Filters:**
47
+
48
+ - Text case transformation (lowercase, uppercase)
49
+ - Length filtering (min/max character length)
50
+ - Stop words filtering
51
+ - Japanese-specific filters (base form, reading form, etc.)
52
+ - Korean-specific filters
53
+
54
+ ## Install project dependencies
55
+
56
+ - Node.js 18+ : <https://nodejs.org/>
57
+ - Rust : <https://www.rust-lang.org/tools/install>
58
+ - @napi-rs/cli : `npm install -g @napi-rs/cli`
59
+
60
+ ## Setup repository
61
+
62
+ ```shell
63
+ # Clone lindera project repository
64
+ git clone git@github.com:lindera/lindera.git
65
+ cd lindera
66
+ ```
67
+
68
+ ## Install lindera-nodejs
69
+
70
+ This command builds the library with development settings (debug build).
71
+
72
+ ```shell
73
+ cd lindera-nodejs
74
+ npm install
75
+ npm run build
76
+ ```
77
+
78
+ ## Quick Start
79
+
80
+ ### Basic Tokenization
81
+
82
+ ```javascript
83
+ const { loadDictionary, Tokenizer } = require("lindera-nodejs");
84
+
85
+ // Load dictionary (pre-built archives are listed under Supported Dictionaries above)
86
+ // Load dictionary from a local path (download from GitHub Releases)
87
+ const dictionary = loadDictionary("/path/to/ipadic");
88
+
89
+ // Create a tokenizer
90
+ const tokenizer = new Tokenizer(dictionary, "normal");
91
+
92
+ // Tokenize Japanese text
93
+ const text = "すもももももももものうち";
94
+ const tokens = tokenizer.tokenize(text);
95
+
96
+ for (const token of tokens) {
97
+ console.log(`Text: ${token.surface}, Position: ${token.byteStart}-${token.byteEnd}`);
98
+ }
99
+ ```
100
+
101
+ ### Using Character Filters
102
+
103
+ ```javascript
104
+ const { TokenizerBuilder } = require("lindera-nodejs");
105
+
106
+ // Create tokenizer builder
107
+ const builder = new TokenizerBuilder();
108
+ builder.setMode("normal");
109
+ builder.setDictionary("/path/to/ipadic");
110
+
111
+ // Add character filters
112
+ builder.appendCharacterFilter("mapping", { mapping: { "ー": "-" } });
113
+ builder.appendCharacterFilter("unicode_normalize", { kind: "nfkc" });
114
+
115
+ // Build tokenizer with filters
116
+ const tokenizer = builder.build();
117
+ const text = "テストー123";
118
+ const tokens = tokenizer.tokenize(text); // Will apply filters automatically
119
+ ```
120
+
121
+ ### Using Token Filters
122
+
123
+ ```javascript
124
+ const { TokenizerBuilder } = require("lindera-nodejs");
125
+
126
+ // Create tokenizer builder
127
+ const builder = new TokenizerBuilder();
128
+ builder.setMode("normal");
129
+ builder.setDictionary("/path/to/ipadic");
130
+
131
+ // Add token filters
132
+ builder.appendTokenFilter("lowercase");
133
+ builder.appendTokenFilter("length", { min: 2, max: 10 });
134
+ builder.appendTokenFilter("japanese_stop_tags", { tags: ["助詞", "助動詞"] });
135
+
136
+ // Build tokenizer with filters
137
+ const tokenizer = builder.build();
138
+ const tokens = tokenizer.tokenize("テキストの解析");
139
+ ```
140
+
141
+ ### Integrated Pipeline
142
+
143
+ ```javascript
144
+ const { TokenizerBuilder } = require("lindera-nodejs");
145
+
146
+ // Build tokenizer with integrated filters
147
+ const builder = new TokenizerBuilder();
148
+ builder.setMode("normal");
149
+ builder.setDictionary("/path/to/ipadic");
150
+
151
+ // Add character filters
152
+ builder.appendCharacterFilter("mapping", { mapping: { "ー": "-" } });
153
+ builder.appendCharacterFilter("unicode_normalize", { kind: "nfkc" });
154
+
155
+ // Add token filters
156
+ builder.appendTokenFilter("lowercase");
157
+ builder.appendTokenFilter("japanese_base_form");
158
+
159
+ // Build and use
160
+ const tokenizer = builder.build();
161
+ const tokens = tokenizer.tokenize("コーヒーショップ");
162
+ ```
163
+
164
+ ### Working with Metadata
165
+
166
+ ```javascript
167
+ const { Metadata } = require("lindera-nodejs");
168
+
169
+ // Create metadata with default values
170
+ const metadata = new Metadata();
171
+ console.log(`Name: ${metadata.name}`);
172
+ console.log(`Encoding: ${metadata.encoding}`);
173
+
174
+ // Create metadata from a JSON file
175
+ const loaded = Metadata.fromJsonFile("metadata.json");
176
+ console.log(loaded.toObject());
177
+ ```
178
+
179
+ ## Advanced Usage
180
+
181
+ ### Filter Configuration Examples
182
+
183
+ Character filters and token filters accept configuration as object arguments:
184
+
185
+ ```javascript
186
+ const { TokenizerBuilder } = require("lindera-nodejs");
187
+
188
+ const builder = new TokenizerBuilder();
189
+ builder.setDictionary("/path/to/ipadic");
190
+
191
+ // Character filters with object configuration
192
+ builder.appendCharacterFilter("unicode_normalize", { kind: "nfkc" });
193
+ builder.appendCharacterFilter("japanese_iteration_mark", {
194
+ normalize_kanji: true,
195
+ normalize_kana: true,
196
+ });
197
+ builder.appendCharacterFilter("mapping", {
198
+ mapping: { "リンデラ": "lindera", "トウキョウ": "東京" },
199
+ });
200
+
201
+ // Token filters with object configuration
202
+ builder.appendTokenFilter("japanese_katakana_stem", { min: 3 });
203
+ builder.appendTokenFilter("length", { min: 2, max: 10 });
204
+ builder.appendTokenFilter("japanese_stop_tags", {
205
+ tags: ["助詞", "助動詞", "記号"],
206
+ });
207
+
208
+ // Filters without configuration can omit the object
209
+ builder.appendTokenFilter("lowercase");
210
+ builder.appendTokenFilter("japanese_base_form");
211
+
212
+ const tokenizer = builder.build();
213
+ ```
214
+
215
+ See `examples/` directory for comprehensive examples including:
216
+
217
+ - `tokenize.js`: Basic tokenization
218
+ - `tokenize_with_filters.js`: Using character and token filters
219
+ - `tokenize_with_userdict.js`: Custom user dictionary
220
+ - `train_and_export.js`: Train and export custom dictionaries (requires `train` feature)
221
+ - `tokenize_with_decompose.js`: Decompose mode tokenization
222
+
223
+ ## Dictionary Support
224
+
225
+ ### Japanese
226
+
227
+ - **IPADIC**: Default Japanese dictionary, good for general text (IPADIC-NEologd variant adds neologism coverage)
228
+ - **UniDic**: Academic dictionary with detailed morphological information
229
+
230
+ ### Korean
231
+
232
+ - **ko-dic**: Standard Korean dictionary for morphological analysis
233
+
234
+ ### Chinese
235
+
236
+ - **CC-CEDICT**: Community-maintained Chinese-English dictionary (Jieba dictionary is also supported)
237
+
238
+ ### Custom Dictionaries
239
+
240
+ - User dictionary support for domain-specific terms
241
+ - CSV format for easy customization
242
+
243
+ ## Dictionary Training (Experimental)
244
+
245
+ lindera-nodejs supports training custom morphological analysis models from annotated corpus data when built with the `train` feature.
246
+
247
+ ### Building with Training Support
248
+
249
+ ```shell
250
+ npm run build -- --features train
251
+ ```
252
+
253
+ ### Training a Model
254
+
255
+ ```javascript
256
+ const { train } = require("lindera-nodejs");
257
+
258
+ // Train a model from corpus
259
+ train({
260
+ seed: "path/to/seed.csv",
261
+ corpus: "path/to/corpus.txt",
262
+ charDef: "path/to/char.def",
263
+ unkDef: "path/to/unk.def",
264
+ featureDef: "path/to/feature.def",
265
+ rewriteDef: "path/to/rewrite.def",
266
+ output: "model.dat",
267
+ lambda: 0.01,
268
+ maxIter: 100,
269
+ });
270
+ ```
271
+
272
+ ### Exporting Dictionary Files
273
+
274
+ ```javascript
275
+ const { exportModel } = require("lindera-nodejs");
276
+
277
+ // Export trained model to dictionary files
278
+ exportModel({
279
+ model: "model.dat",
280
+ output: "exported_dict/",
281
+ metadata: "metadata.json",
282
+ });
283
+ ```
284
+
285
+ This will create:
286
+
287
+ - `lex.csv`: Lexicon file
288
+ - `matrix.def`: Connection cost matrix
289
+ - `unk.def`: Unknown word definitions
290
+ - `char.def`: Character definitions
291
+ - `metadata.json`: Dictionary metadata (if provided)
292
+
293
+ See `examples/train_and_export.js` for a complete example.
294
+
295
+ ## API Reference
296
+
297
+ ### Core Classes
298
+
299
+ - `TokenizerBuilder`: Fluent builder for tokenizer configuration
300
+ - `Tokenizer`: Main tokenization engine
301
+ - `Token`: Individual token with text, position, and linguistic features
302
+ - `Metadata`: Dictionary metadata and configuration
303
+ - `Schema`: Dictionary schema definition
304
+
305
+ ### Training Functions (requires `train` feature)
306
+
307
+ - `train()`: Train a morphological analysis model from corpus
308
+ - `exportModel()`: Export trained model to dictionary files
package/package.json ADDED
@@ -0,0 +1,63 @@
1
+ {
2
+ "name": "lindera-nodejs",
3
+ "version": "3.0.0",
4
+ "description": "Node.js bindings for Lindera morphological analysis engine",
5
+ "main": "index.js",
6
+ "types": "index.d.ts",
7
+ "exports": {
8
+ ".": {
9
+ "types": "./index.d.ts",
10
+ "require": "./index.js"
11
+ }
12
+ },
13
+ "napi": {
14
+ "binaryName": "lindera-nodejs",
15
+ "package": {
16
+ "name": "lindera-nodejs"
17
+ },
18
+ "targets": [
19
+ "x86_64-apple-darwin",
20
+ "aarch64-apple-darwin",
21
+ "x86_64-unknown-linux-gnu",
22
+ "aarch64-unknown-linux-gnu",
23
+ "x86_64-pc-windows-msvc",
24
+ "aarch64-pc-windows-msvc"
25
+ ]
26
+ },
27
+ "license": "MIT",
28
+ "scripts": {
29
+ "build": "napi build --platform --release -p lindera-nodejs",
30
+ "build:debug": "napi build --platform -p lindera-nodejs",
31
+ "artifacts": "napi artifacts",
32
+ "test": "node --test tests/test_*.js"
33
+ },
34
+ "files": [
35
+ "index.js",
36
+ "index.d.ts"
37
+ ],
38
+ "optionalDependencies": {
39
+ "lindera-nodejs-darwin-arm64": "3.0.0",
40
+ "lindera-nodejs-darwin-x64": "3.0.0",
41
+ "lindera-nodejs-linux-arm64-gnu": "3.0.0",
42
+ "lindera-nodejs-linux-x64-gnu": "3.0.0",
43
+ "lindera-nodejs-win32-arm64-msvc": "3.0.0",
44
+ "lindera-nodejs-win32-x64-msvc": "3.0.0"
45
+ },
46
+ "repository": {
47
+ "type": "git",
48
+ "url": "git+https://github.com/lindera/lindera.git"
49
+ },
50
+ "keywords": [
51
+ "morphological",
52
+ "analysis",
53
+ "tokenizer",
54
+ "japanese",
55
+ "korean",
56
+ "chinese",
57
+ "lindera",
58
+ "napi"
59
+ ],
60
+ "devDependencies": {
61
+ "@napi-rs/cli": "^3.6.0"
62
+ }
63
+ }