lindera-nodejs 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +308 -0
- package/package.json +63 -0
package/README.md
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
# lindera-nodejs
|
|
2
|
+
|
|
3
|
+
Node.js binding for [Lindera](https://github.com/lindera/lindera), a Japanese morphological analysis engine.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
lindera-nodejs provides a comprehensive Node.js interface to the Lindera morphological analysis engine, supporting Japanese, Korean, and Chinese text analysis. This implementation includes all major features:
|
|
8
|
+
|
|
9
|
+
- **Multi-language Support**: Japanese (IPADIC, IPADIC-NEologd, UniDic), Korean (ko-dic), Chinese (CC-CEDICT, Jieba)
|
|
10
|
+
- **Character Filters**: Text preprocessing with mapping, regex, Unicode normalization, and Japanese iteration mark handling
|
|
11
|
+
- **Token Filters**: Post-processing filters including lowercase, length filtering, stop words, and Japanese-specific filters
|
|
12
|
+
- **Flexible Configuration**: Configurable tokenization modes and penalty settings
|
|
13
|
+
- **Metadata Support**: Complete dictionary schema and metadata management
|
|
14
|
+
- **TypeScript Support**: Full type definitions included out of the box
|
|
15
|
+
|
|
16
|
+
## Features
|
|
17
|
+
|
|
18
|
+
### Core Components
|
|
19
|
+
|
|
20
|
+
- **TokenizerBuilder**: Fluent API for building customized tokenizers
|
|
21
|
+
- **Tokenizer**: High-performance text tokenization with integrated filtering
|
|
22
|
+
- **CharacterFilter**: Pre-processing filters for text normalization
|
|
23
|
+
- **TokenFilter**: Post-processing filters for token refinement
|
|
24
|
+
- **Metadata & Schema**: Dictionary structure and configuration management
|
|
25
|
+
- **Training & Export** (optional): Train custom morphological analysis models from corpus data
|
|
26
|
+
|
|
27
|
+
### Supported Dictionaries
|
|
28
|
+
|
|
29
|
+
- **Japanese**: IPADIC, IPADIC-NEologd, UniDic
|
|
30
|
+
- **Korean**: ko-dic
|
|
31
|
+
- **Chinese**: CC-CEDICT, Jieba
|
|
32
|
+
- **Custom**: User dictionary support
|
|
33
|
+
|
|
34
|
+
Pre-built dictionaries are available from [GitHub Releases](https://github.com/lindera/lindera/releases).
|
|
35
|
+
Download a dictionary archive (e.g. `lindera-ipadic-*.zip`) and specify the extracted path when loading.
|
|
36
|
+
|
|
37
|
+
### Filter Types
|
|
38
|
+
|
|
39
|
+
**Character Filters:**
|
|
40
|
+
|
|
41
|
+
- Mapping filter (character replacement)
|
|
42
|
+
- Regex filter (pattern-based replacement)
|
|
43
|
+
- Unicode normalization (NFKC, etc.)
|
|
44
|
+
- Japanese iteration mark normalization
|
|
45
|
+
|
|
46
|
+
**Token Filters:**
|
|
47
|
+
|
|
48
|
+
- Text case transformation (lowercase, uppercase)
|
|
49
|
+
- Length filtering (min/max character length)
|
|
50
|
+
- Stop words filtering
|
|
51
|
+
- Japanese-specific filters (base form, reading form, etc.)
|
|
52
|
+
- Korean-specific filters
|
|
53
|
+
|
|
54
|
+
## Prerequisites
|
|
55
|
+
|
|
56
|
+
- Node.js 18+ : <https://nodejs.org/>
|
|
57
|
+
- Rust : <https://www.rust-lang.org/tools/install>
|
|
58
|
+
- @napi-rs/cli : `npm install -g @napi-rs/cli`
|
|
59
|
+
|
|
60
|
+
## Setup repository
|
|
61
|
+
|
|
62
|
+
```shell
|
|
63
|
+
# Clone lindera project repository
|
|
64
|
+
git clone git@github.com:lindera/lindera.git
|
|
65
|
+
cd lindera
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Install lindera-nodejs
|
|
69
|
+
|
|
70
|
+
This command builds the library in release mode (`npm run build` runs `napi build --release`). Use `npm run build:debug` for a development (debug) build.
|
|
71
|
+
|
|
72
|
+
```shell
|
|
73
|
+
cd lindera-nodejs
|
|
74
|
+
npm install
|
|
75
|
+
npm run build
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Quick Start
|
|
79
|
+
|
|
80
|
+
### Basic Tokenization
|
|
81
|
+
|
|
82
|
+
```javascript
|
|
83
|
+
const { loadDictionary, Tokenizer } = require("lindera-nodejs");
|
|
84
|
+
|
|
85
|
+
// Load dictionary
|
|
86
|
+
// Load dictionary from a local path (download from GitHub Releases)
|
|
87
|
+
const dictionary = loadDictionary("/path/to/ipadic");
|
|
88
|
+
|
|
89
|
+
// Create a tokenizer
|
|
90
|
+
const tokenizer = new Tokenizer(dictionary, "normal");
|
|
91
|
+
|
|
92
|
+
// Tokenize Japanese text
|
|
93
|
+
const text = "すもももももももものうち";
|
|
94
|
+
const tokens = tokenizer.tokenize(text);
|
|
95
|
+
|
|
96
|
+
for (const token of tokens) {
|
|
97
|
+
console.log(`Text: ${token.surface}, Position: ${token.byteStart}-${token.byteEnd}`);
|
|
98
|
+
}
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Using Character Filters
|
|
102
|
+
|
|
103
|
+
```javascript
|
|
104
|
+
const { TokenizerBuilder } = require("lindera-nodejs");
|
|
105
|
+
|
|
106
|
+
// Create tokenizer builder
|
|
107
|
+
const builder = new TokenizerBuilder();
|
|
108
|
+
builder.setMode("normal");
|
|
109
|
+
builder.setDictionary("/path/to/ipadic");
|
|
110
|
+
|
|
111
|
+
// Add character filters
|
|
112
|
+
builder.appendCharacterFilter("mapping", { mapping: { "ー": "-" } });
|
|
113
|
+
builder.appendCharacterFilter("unicode_normalize", { kind: "nfkc" });
|
|
114
|
+
|
|
115
|
+
// Build tokenizer with filters
|
|
116
|
+
const tokenizer = builder.build();
|
|
117
|
+
const text = "テストー123";
|
|
118
|
+
const tokens = tokenizer.tokenize(text); // Will apply filters automatically
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Using Token Filters
|
|
122
|
+
|
|
123
|
+
```javascript
|
|
124
|
+
const { TokenizerBuilder } = require("lindera-nodejs");
|
|
125
|
+
|
|
126
|
+
// Create tokenizer builder
|
|
127
|
+
const builder = new TokenizerBuilder();
|
|
128
|
+
builder.setMode("normal");
|
|
129
|
+
builder.setDictionary("/path/to/ipadic");
|
|
130
|
+
|
|
131
|
+
// Add token filters
|
|
132
|
+
builder.appendTokenFilter("lowercase");
|
|
133
|
+
builder.appendTokenFilter("length", { min: 2, max: 10 });
|
|
134
|
+
builder.appendTokenFilter("japanese_stop_tags", { tags: ["助詞", "助動詞"] });
|
|
135
|
+
|
|
136
|
+
// Build tokenizer with filters
|
|
137
|
+
const tokenizer = builder.build();
|
|
138
|
+
const tokens = tokenizer.tokenize("テキストの解析");
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### Integrated Pipeline
|
|
142
|
+
|
|
143
|
+
```javascript
|
|
144
|
+
const { TokenizerBuilder } = require("lindera-nodejs");
|
|
145
|
+
|
|
146
|
+
// Build tokenizer with integrated filters
|
|
147
|
+
const builder = new TokenizerBuilder();
|
|
148
|
+
builder.setMode("normal");
|
|
149
|
+
builder.setDictionary("/path/to/ipadic");
|
|
150
|
+
|
|
151
|
+
// Add character filters
|
|
152
|
+
builder.appendCharacterFilter("mapping", { mapping: { "ー": "-" } });
|
|
153
|
+
builder.appendCharacterFilter("unicode_normalize", { kind: "nfkc" });
|
|
154
|
+
|
|
155
|
+
// Add token filters
|
|
156
|
+
builder.appendTokenFilter("lowercase");
|
|
157
|
+
builder.appendTokenFilter("japanese_base_form");
|
|
158
|
+
|
|
159
|
+
// Build and use
|
|
160
|
+
const tokenizer = builder.build();
|
|
161
|
+
const tokens = tokenizer.tokenize("コーヒーショップ");
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
### Working with Metadata
|
|
165
|
+
|
|
166
|
+
```javascript
|
|
167
|
+
const { Metadata } = require("lindera-nodejs");
|
|
168
|
+
|
|
169
|
+
// Create metadata with default values
|
|
170
|
+
const metadata = new Metadata();
|
|
171
|
+
console.log(`Name: ${metadata.name}`);
|
|
172
|
+
console.log(`Encoding: ${metadata.encoding}`);
|
|
173
|
+
|
|
174
|
+
// Create metadata from a JSON file
|
|
175
|
+
const loaded = Metadata.fromJsonFile("metadata.json");
|
|
176
|
+
console.log(loaded.toObject());
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
## Advanced Usage
|
|
180
|
+
|
|
181
|
+
### Filter Configuration Examples
|
|
182
|
+
|
|
183
|
+
Character filters and token filters accept configuration as object arguments:
|
|
184
|
+
|
|
185
|
+
```javascript
|
|
186
|
+
const { TokenizerBuilder } = require("lindera-nodejs");
|
|
187
|
+
|
|
188
|
+
const builder = new TokenizerBuilder();
|
|
189
|
+
builder.setDictionary("/path/to/ipadic");
|
|
190
|
+
|
|
191
|
+
// Character filters with object configuration
|
|
192
|
+
builder.appendCharacterFilter("unicode_normalize", { kind: "nfkc" });
|
|
193
|
+
builder.appendCharacterFilter("japanese_iteration_mark", {
|
|
194
|
+
normalize_kanji: true,
|
|
195
|
+
normalize_kana: true,
|
|
196
|
+
});
|
|
197
|
+
builder.appendCharacterFilter("mapping", {
|
|
198
|
+
mapping: { "リンデラ": "lindera", "トウキョウ": "東京" },
|
|
199
|
+
});
|
|
200
|
+
|
|
201
|
+
// Token filters with object configuration
|
|
202
|
+
builder.appendTokenFilter("japanese_katakana_stem", { min: 3 });
|
|
203
|
+
builder.appendTokenFilter("length", { min: 2, max: 10 });
|
|
204
|
+
builder.appendTokenFilter("japanese_stop_tags", {
|
|
205
|
+
tags: ["助詞", "助動詞", "記号"],
|
|
206
|
+
});
|
|
207
|
+
|
|
208
|
+
// Filters without configuration can omit the object
|
|
209
|
+
builder.appendTokenFilter("lowercase");
|
|
210
|
+
builder.appendTokenFilter("japanese_base_form");
|
|
211
|
+
|
|
212
|
+
const tokenizer = builder.build();
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
See `examples/` directory for comprehensive examples including:
|
|
216
|
+
|
|
217
|
+
- `tokenize.js`: Basic tokenization
|
|
218
|
+
- `tokenize_with_filters.js`: Using character and token filters
|
|
219
|
+
- `tokenize_with_userdict.js`: Custom user dictionary
|
|
220
|
+
- `train_and_export.js`: Train and export custom dictionaries (requires `train` feature)
|
|
221
|
+
- `tokenize_with_decompose.js`: Decompose mode tokenization
|
|
222
|
+
|
|
223
|
+
## Dictionary Support
|
|
224
|
+
|
|
225
|
+
### Japanese
|
|
226
|
+
|
|
227
|
+
- **IPADIC**: Default Japanese dictionary, good for general text
|
|
228
|
+
- **IPADIC-NEologd**: IPADIC extended with neologisms and named entities
- **UniDic**: Academic dictionary with detailed morphological information
|
|
229
|
+
|
|
230
|
+
### Korean
|
|
231
|
+
|
|
232
|
+
- **ko-dic**: Standard Korean dictionary for morphological analysis
|
|
233
|
+
|
|
234
|
+
### Chinese
|
|
235
|
+
|
|
236
|
+
- **CC-CEDICT**: Community-maintained Chinese-English dictionary
- **Jieba**: Dictionary from the Jieba Chinese segmentation project
|
|
237
|
+
|
|
238
|
+
### Custom Dictionaries
|
|
239
|
+
|
|
240
|
+
- User dictionary support for domain-specific terms
|
|
241
|
+
- CSV format for easy customization
|
|
242
|
+
|
|
243
|
+
## Dictionary Training (Experimental)
|
|
244
|
+
|
|
245
|
+
lindera-nodejs supports training custom morphological analysis models from annotated corpus data when built with the `train` feature.
|
|
246
|
+
|
|
247
|
+
### Building with Training Support
|
|
248
|
+
|
|
249
|
+
```shell
|
|
250
|
+
npm run build -- --features train
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
### Training a Model
|
|
254
|
+
|
|
255
|
+
```javascript
|
|
256
|
+
const { train } = require("lindera-nodejs");
|
|
257
|
+
|
|
258
|
+
// Train a model from corpus
|
|
259
|
+
train({
|
|
260
|
+
seed: "path/to/seed.csv",
|
|
261
|
+
corpus: "path/to/corpus.txt",
|
|
262
|
+
charDef: "path/to/char.def",
|
|
263
|
+
unkDef: "path/to/unk.def",
|
|
264
|
+
featureDef: "path/to/feature.def",
|
|
265
|
+
rewriteDef: "path/to/rewrite.def",
|
|
266
|
+
output: "model.dat",
|
|
267
|
+
lambda: 0.01,
|
|
268
|
+
maxIter: 100,
|
|
269
|
+
});
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
### Exporting Dictionary Files
|
|
273
|
+
|
|
274
|
+
```javascript
|
|
275
|
+
const { exportModel } = require("lindera-nodejs");
|
|
276
|
+
|
|
277
|
+
// Export trained model to dictionary files
|
|
278
|
+
exportModel({
|
|
279
|
+
model: "model.dat",
|
|
280
|
+
output: "exported_dict/",
|
|
281
|
+
metadata: "metadata.json",
|
|
282
|
+
});
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
This will create:
|
|
286
|
+
|
|
287
|
+
- `lex.csv`: Lexicon file
|
|
288
|
+
- `matrix.def`: Connection cost matrix
|
|
289
|
+
- `unk.def`: Unknown word definitions
|
|
290
|
+
- `char.def`: Character definitions
|
|
291
|
+
- `metadata.json`: Dictionary metadata (if provided)
|
|
292
|
+
|
|
293
|
+
See `examples/train_and_export.js` for a complete example.
|
|
294
|
+
|
|
295
|
+
## API Reference
|
|
296
|
+
|
|
297
|
+
### Core Classes
|
|
298
|
+
|
|
299
|
+
- `TokenizerBuilder`: Fluent builder for tokenizer configuration
|
|
300
|
+
- `Tokenizer`: Main tokenization engine
|
|
301
|
+
- `Token`: Individual token with text, position, and linguistic features
|
|
302
|
+
- `Metadata`: Dictionary metadata and configuration
|
|
303
|
+
- `Schema`: Dictionary schema definition
|
|
304
|
+
|
|
305
|
+
### Training Functions (requires `train` feature)
|
|
306
|
+
|
|
307
|
+
- `train()`: Train a morphological analysis model from corpus
|
|
308
|
+
- `exportModel()`: Export trained model to dictionary files
|
package/package.json
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "lindera-nodejs",
|
|
3
|
+
"version": "3.0.0",
|
|
4
|
+
"description": "Node.js bindings for Lindera morphological analysis engine",
|
|
5
|
+
"main": "index.js",
|
|
6
|
+
"types": "index.d.ts",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": {
|
|
9
|
+
"require": "./index.js",
|
|
10
|
+
"types": "./index.d.ts"
|
|
11
|
+
}
|
|
12
|
+
},
|
|
13
|
+
"napi": {
|
|
14
|
+
"binaryName": "lindera-nodejs",
|
|
15
|
+
"package": {
|
|
16
|
+
"name": "lindera-nodejs"
|
|
17
|
+
},
|
|
18
|
+
"targets": [
|
|
19
|
+
"x86_64-apple-darwin",
|
|
20
|
+
"aarch64-apple-darwin",
|
|
21
|
+
"x86_64-unknown-linux-gnu",
|
|
22
|
+
"aarch64-unknown-linux-gnu",
|
|
23
|
+
"x86_64-pc-windows-msvc",
|
|
24
|
+
"aarch64-pc-windows-msvc"
|
|
25
|
+
]
|
|
26
|
+
},
|
|
27
|
+
"license": "MIT",
|
|
28
|
+
"scripts": {
|
|
29
|
+
"build": "napi build --platform --release -p lindera-nodejs",
|
|
30
|
+
"build:debug": "napi build --platform -p lindera-nodejs",
|
|
31
|
+
"artifacts": "napi artifacts",
|
|
32
|
+
"test": "node --test tests/test_*.js"
|
|
33
|
+
},
|
|
34
|
+
"files": [
|
|
35
|
+
"index.js",
|
|
36
|
+
"index.d.ts"
|
|
37
|
+
],
|
|
38
|
+
"optionalDependencies": {
|
|
39
|
+
"lindera-nodejs-darwin-arm64": "3.0.0",
|
|
40
|
+
"lindera-nodejs-darwin-x64": "3.0.0",
|
|
41
|
+
"lindera-nodejs-linux-arm64-gnu": "3.0.0",
|
|
42
|
+
"lindera-nodejs-linux-x64-gnu": "3.0.0",
|
|
43
|
+
"lindera-nodejs-win32-arm64-msvc": "3.0.0",
|
|
44
|
+
"lindera-nodejs-win32-x64-msvc": "3.0.0"
|
|
45
|
+
},
|
|
46
|
+
"repository": {
|
|
47
|
+
"type": "git",
|
|
48
|
+
"url": "https://github.com/lindera/lindera"
|
|
49
|
+
},
|
|
50
|
+
"keywords": [
|
|
51
|
+
"morphological",
|
|
52
|
+
"analysis",
|
|
53
|
+
"tokenizer",
|
|
54
|
+
"japanese",
|
|
55
|
+
"korean",
|
|
56
|
+
"chinese",
|
|
57
|
+
"lindera",
|
|
58
|
+
"napi"
|
|
59
|
+
],
|
|
60
|
+
"devDependencies": {
|
|
61
|
+
"@napi-rs/cli": "^3.6.0"
|
|
62
|
+
}
|
|
63
|
+
}
|