lindera-python 2.0.0__cp310-abi3-win_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lindera/__init__.py
ADDED
lindera/lindera.pyd
ADDED
|
Binary file
|
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lindera-python
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Classifier: Programming Language :: Python :: 3
|
|
5
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
6
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
7
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.15
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Summary: Python binding for Lindera (no embedded dictionaries)
|
|
15
|
+
Keywords: morphological,analysis,library,python
|
|
16
|
+
Home-Page: https://github.com/lindera/lindera-python
|
|
17
|
+
Author-email: Minoru Osuka <minoru.osuka@gmail.com>
|
|
18
|
+
License: MIT
|
|
19
|
+
Requires-Python: >=3.10, <3.15
|
|
20
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
21
|
+
|
|
22
|
+
# lindera-python
|
|
23
|
+
|
|
24
|
+
Python binding for [Lindera](https://github.com/lindera/lindera), a Japanese morphological analysis engine.
|
|
25
|
+
|
|
26
|
+
## Overview
|
|
27
|
+
|
|
28
|
+
lindera-python provides a comprehensive Python interface to the Lindera morphological analysis engine, supporting Japanese, Korean, and Chinese text analysis. This implementation includes all major features:
|
|
29
|
+
|
|
30
|
+
- **Multi-language Support**: Japanese (IPADIC, UniDic), Korean (ko-dic), Chinese (CC-CEDICT)
|
|
31
|
+
- **Character Filters**: Text preprocessing with mapping, regex, Unicode normalization, and Japanese iteration mark handling
|
|
32
|
+
- **Token Filters**: Post-processing filters including lowercase, length filtering, stop words, and Japanese-specific filters
|
|
33
|
+
- **Flexible Configuration**: Configurable tokenization modes and penalty settings
|
|
34
|
+
- **Metadata Support**: Complete dictionary schema and metadata management
|
|
35
|
+
|
|
36
|
+
## Features
|
|
37
|
+
|
|
38
|
+
### Core Components
|
|
39
|
+
|
|
40
|
+
- **TokenizerBuilder**: Fluent API for building customized tokenizers
|
|
41
|
+
- **Tokenizer**: High-performance text tokenization with integrated filtering
|
|
42
|
+
- **CharacterFilter**: Pre-processing filters for text normalization
|
|
43
|
+
- **TokenFilter**: Post-processing filters for token refinement
|
|
44
|
+
- **Metadata & Schema**: Dictionary structure and configuration management
|
|
45
|
+
- **Training & Export** (optional): Train custom morphological analysis models from corpus data
|
|
46
|
+
|
|
47
|
+
### Supported Dictionaries
|
|
48
|
+
|
|
49
|
+
- **Japanese**: IPADIC (embedded), UniDic (embedded)
|
|
50
|
+
- **Korean**: ko-dic (embedded)
|
|
51
|
+
- **Chinese**: CC-CEDICT (embedded)
|
|
52
|
+
- **Custom**: User dictionary support
|
|
53
|
+
|
|
54
|
+
### Filter Types
|
|
55
|
+
|
|
56
|
+
**Character Filters:**
|
|
57
|
+
|
|
58
|
+
- Mapping filter (character replacement)
|
|
59
|
+
- Regex filter (pattern-based replacement)
|
|
60
|
+
- Unicode normalization (NFKC, etc.)
|
|
61
|
+
- Japanese iteration mark normalization
|
|
62
|
+
|
|
63
|
+
**Token Filters:**
|
|
64
|
+
|
|
65
|
+
- Text case transformation (lowercase, uppercase)
|
|
66
|
+
- Length filtering (min/max character length)
|
|
67
|
+
- Stop words filtering
|
|
68
|
+
- Japanese-specific filters (base form, reading form, etc.)
|
|
69
|
+
- Korean-specific filters
|
|
70
|
+
|
|
71
|
+
## Install project dependencies
|
|
72
|
+
|
|
73
|
+
- pyenv : <https://github.com/pyenv/pyenv?tab=readme-ov-file#installation>
|
|
74
|
+
- Poetry : <https://python-poetry.org/docs/#installation>
|
|
75
|
+
- Rust : <https://www.rust-lang.org/tools/install>
|
|
76
|
+
|
|
77
|
+
## Install Python
|
|
78
|
+
|
|
79
|
+
```shell
|
|
80
|
+
# Install Python
|
|
81
|
+
% pyenv install 3.13.5
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Setup repository and activate virtual environment
|
|
85
|
+
|
|
86
|
+
```shell
|
|
87
|
+
# Clone lindera-python project repository
|
|
88
|
+
% git clone git@github.com:lindera/lindera-python.git
|
|
89
|
+
% cd lindera-python
|
|
90
|
+
|
|
91
|
+
# Set Python version for this project
|
|
92
|
+
% pyenv local 3.13.5
|
|
93
|
+
|
|
94
|
+
# Make Python virtual environment
|
|
95
|
+
% python -m venv .venv
|
|
96
|
+
|
|
97
|
+
# Activate Python virtual environment
|
|
98
|
+
% source .venv/bin/activate
|
|
99
|
+
|
|
100
|
+
# Initialize lindera-python project
|
|
101
|
+
(.venv) % make init
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Install lindera-python as a library in the virtual environment
|
|
105
|
+
|
|
106
|
+
This command takes a long time because it builds a library that includes all the dictionaries.
|
|
107
|
+
|
|
108
|
+
```shell
|
|
109
|
+
(.venv) % make develop
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Quick Start
|
|
113
|
+
|
|
114
|
+
### Basic Tokenization
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from lindera import TokenizerBuilder
|
|
118
|
+
|
|
119
|
+
# Create a tokenizer with default settings
|
|
120
|
+
builder = TokenizerBuilder()
|
|
121
|
+
builder.set_mode("normal")
|
|
122
|
+
builder.set_dictionary("embedded://ipadic")
|
|
123
|
+
tokenizer = builder.build()
|
|
124
|
+
|
|
125
|
+
# Tokenize Japanese text
|
|
126
|
+
text = "すもももももももものうち"
|
|
127
|
+
tokens = tokenizer.tokenize(text)
|
|
128
|
+
|
|
129
|
+
for token in tokens:
|
|
130
|
+
print(f"Text: {token.text}, Position: {token.position}")
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Using Character Filters
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
from lindera import TokenizerBuilder
|
|
137
|
+
|
|
138
|
+
# Create tokenizer builder
|
|
139
|
+
builder = TokenizerBuilder()
|
|
140
|
+
builder.set_mode("normal")
|
|
141
|
+
builder.set_dictionary("embedded://ipadic")
|
|
142
|
+
|
|
143
|
+
# Add character filters
|
|
144
|
+
builder.append_character_filter("mapping", {"mapping": {"ー": "-"}})
|
|
145
|
+
builder.append_character_filter("unicode_normalize", {"kind": "nfkc"})
|
|
146
|
+
|
|
147
|
+
# Build tokenizer with filters
|
|
148
|
+
tokenizer = builder.build()
|
|
149
|
+
text = "テストー123"
|
|
150
|
+
tokens = tokenizer.tokenize(text) # Will apply filters automatically
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### Using Token Filters
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
from lindera import TokenizerBuilder
|
|
157
|
+
|
|
158
|
+
# Create tokenizer builder
|
|
159
|
+
builder = TokenizerBuilder()
|
|
160
|
+
builder.set_mode("normal")
|
|
161
|
+
builder.set_dictionary("embedded://ipadic")
|
|
162
|
+
|
|
163
|
+
# Add token filters
|
|
164
|
+
builder.append_token_filter("lowercase")
|
|
165
|
+
builder.append_token_filter("length", {"min": 2, "max": 10})
|
|
166
|
+
builder.append_token_filter("japanese_stop_tags", {"tags": ["助詞", "助動詞"]})
|
|
167
|
+
|
|
168
|
+
# Build tokenizer with filters
|
|
169
|
+
tokenizer = builder.build()
|
|
170
|
+
tokens = tokenizer.tokenize("テキストの解析")
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### Integrated Pipeline
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
from lindera import TokenizerBuilder
|
|
177
|
+
|
|
178
|
+
# Build tokenizer with integrated filters
|
|
179
|
+
builder = TokenizerBuilder()
|
|
180
|
+
builder.set_mode("normal")
|
|
181
|
+
builder.set_dictionary("embedded://ipadic")
|
|
182
|
+
|
|
183
|
+
# Add character filters
|
|
184
|
+
builder.append_character_filter("mapping", {"mapping": {"ー": "-"}})
|
|
185
|
+
builder.append_character_filter("unicode_normalize", {"kind": "nfkc"})
|
|
186
|
+
|
|
187
|
+
# Add token filters
|
|
188
|
+
builder.append_token_filter("lowercase")
|
|
189
|
+
builder.append_token_filter("japanese_base_form")
|
|
190
|
+
|
|
191
|
+
# Build and use
|
|
192
|
+
tokenizer = builder.build()
|
|
193
|
+
tokens = tokenizer.tokenize("コーヒーショップ")
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
### Working with Metadata
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
from lindera import Metadata
|
|
200
|
+
|
|
201
|
+
# Get metadata for a specific dictionary
|
|
202
|
+
metadata = Metadata.load("embedded://ipadic")
|
|
203
|
+
print(f"Dictionary: {metadata.dictionary_name}")
|
|
204
|
+
print(f"Version: {metadata.dictionary_version}")
|
|
205
|
+
|
|
206
|
+
# Access schema information
|
|
207
|
+
schema = metadata.dictionary_schema
|
|
208
|
+
print(f"Schema has {len(schema.fields)} fields")
|
|
209
|
+
print(f"Fields: {schema.fields[:5]}") # First 5 fields
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
## Advanced Usage
|
|
213
|
+
|
|
214
|
+
### Filter Configuration Examples
|
|
215
|
+
|
|
216
|
+
Character filters and token filters accept configuration as dictionary arguments:
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
from lindera import TokenizerBuilder
|
|
220
|
+
|
|
221
|
+
builder = TokenizerBuilder()
|
|
222
|
+
builder.set_dictionary("embedded://ipadic")
|
|
223
|
+
|
|
224
|
+
# Character filters with dict configuration
|
|
225
|
+
builder.append_character_filter("unicode_normalize", {"kind": "nfkc"})
|
|
226
|
+
builder.append_character_filter("japanese_iteration_mark", {
|
|
227
|
+
"normalize_kanji": "true",
|
|
228
|
+
"normalize_kana": "true"
|
|
229
|
+
})
|
|
230
|
+
builder.append_character_filter("mapping", {
|
|
231
|
+
"mapping": {"リンデラ": "lindera", "トウキョウ": "東京"}
|
|
232
|
+
})
|
|
233
|
+
|
|
234
|
+
# Token filters with dict configuration
|
|
235
|
+
builder.append_token_filter("japanese_katakana_stem", {"min": 3})
|
|
236
|
+
builder.append_token_filter("length", {"min": 2, "max": 10})
|
|
237
|
+
builder.append_token_filter("japanese_stop_tags", {
|
|
238
|
+
"tags": ["助詞", "助動詞", "記号"]
|
|
239
|
+
})
|
|
240
|
+
|
|
241
|
+
# Filters without configuration can omit the dict
|
|
242
|
+
builder.append_token_filter("lowercase")
|
|
243
|
+
builder.append_token_filter("japanese_base_form")
|
|
244
|
+
|
|
245
|
+
tokenizer = builder.build()
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
See `examples/` directory for comprehensive examples including:
|
|
249
|
+
|
|
250
|
+
- `tokenize.py`: Basic tokenization
|
|
251
|
+
- `tokenize_with_filters.py`: Using character and token filters
|
|
252
|
+
- `tokenize_with_userdict.py`: Custom user dictionary
|
|
253
|
+
- `train_and_export.py`: Train and export custom dictionaries (requires `train` feature)
|
|
254
|
+
- Multi-language tokenization
|
|
255
|
+
- Advanced configuration options
|
|
256
|
+
|
|
257
|
+
## Dictionary Support
|
|
258
|
+
|
|
259
|
+
### Japanese
|
|
260
|
+
|
|
261
|
+
- **IPADIC**: Default Japanese dictionary, good for general text
|
|
262
|
+
- **UniDic**: Academic dictionary with detailed morphological information
|
|
263
|
+
|
|
264
|
+
### Korean
|
|
265
|
+
|
|
266
|
+
- **ko-dic**: Standard Korean dictionary for morphological analysis
|
|
267
|
+
|
|
268
|
+
### Chinese
|
|
269
|
+
|
|
270
|
+
- **CC-CEDICT**: Community-maintained Chinese-English dictionary
|
|
271
|
+
|
|
272
|
+
### Custom Dictionaries
|
|
273
|
+
|
|
274
|
+
- User dictionary support for domain-specific terms
|
|
275
|
+
- CSV format for easy customization
|
|
276
|
+
|
|
277
|
+
## Dictionary Training (Experimental)
|
|
278
|
+
|
|
279
|
+
lindera-python supports training custom morphological analysis models from annotated corpus data when built with the `train` feature.
|
|
280
|
+
|
|
281
|
+
### Building with Training Support
|
|
282
|
+
|
|
283
|
+
```shell
|
|
284
|
+
# Install with training support
|
|
285
|
+
(.venv) % maturin develop --features train
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
### Training a Model
|
|
289
|
+
|
|
290
|
+
```python
|
|
291
|
+
import lindera
|
|
292
|
+
|
|
293
|
+
# Train a model from corpus
|
|
294
|
+
lindera.train(
|
|
295
|
+
seed="path/to/seed.csv", # Seed lexicon
|
|
296
|
+
corpus="path/to/corpus.txt", # Training corpus
|
|
297
|
+
char_def="path/to/char.def", # Character definitions
|
|
298
|
+
unk_def="path/to/unk.def", # Unknown word definitions
|
|
299
|
+
feature_def="path/to/feature.def", # Feature templates
|
|
300
|
+
rewrite_def="path/to/rewrite.def", # Rewrite rules
|
|
301
|
+
output="model.dat", # Output model file
|
|
302
|
+
lambda_=0.01, # L1 regularization
|
|
303
|
+
max_iter=100, # Max iterations
|
|
304
|
+
max_threads=None # Auto-detect CPU cores
|
|
305
|
+
)
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
### Exporting Dictionary Files
|
|
309
|
+
|
|
310
|
+
```python
|
|
311
|
+
# Export trained model to dictionary files
|
|
312
|
+
lindera.export(
|
|
313
|
+
model="model.dat", # Trained model
|
|
314
|
+
output="exported_dict/", # Output directory
|
|
315
|
+
metadata="metadata.json" # Optional metadata file
|
|
316
|
+
)
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
This will create:
|
|
320
|
+
|
|
321
|
+
- `lex.csv`: Lexicon file
|
|
322
|
+
- `matrix.def`: Connection cost matrix
|
|
323
|
+
- `unk.def`: Unknown word definitions
|
|
324
|
+
- `char.def`: Character definitions
|
|
325
|
+
- `metadata.json`: Dictionary metadata (if provided)
|
|
326
|
+
|
|
327
|
+
See `examples/train_and_export.py` for a complete example.
|
|
328
|
+
|
|
329
|
+
## API Reference
|
|
330
|
+
|
|
331
|
+
### Core Classes
|
|
332
|
+
|
|
333
|
+
- `TokenizerBuilder`: Fluent builder for tokenizer configuration
|
|
334
|
+
- `Tokenizer`: Main tokenization engine
|
|
335
|
+
- `Token`: Individual token with text, position, and linguistic features
|
|
336
|
+
- `CharacterFilter`: Text preprocessing filters
|
|
337
|
+
- `TokenFilter`: Token post-processing filters
|
|
338
|
+
- `Metadata`: Dictionary metadata and configuration
|
|
339
|
+
- `Schema`: Dictionary schema definition
|
|
340
|
+
|
|
341
|
+
### Training Functions (requires `train` feature)
|
|
342
|
+
|
|
343
|
+
- `train()`: Train a morphological analysis model from corpus
|
|
344
|
+
- `export()`: Export trained model to dictionary files
|
|
345
|
+
|
|
346
|
+
See the `test_basic.py` file for comprehensive API usage examples.
|
|
347
|
+
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
lindera\__init__.py,sha256=789DzXq3kU2cR6OYXrndhkP8w8SvAi4mbS8Aiu6wrkI,111
|
|
2
|
+
lindera\lindera.pyd,sha256=UU16AsU_YkdcoyFZjOp2NgPC3kC1bhemDzcyzm77nuA,4318208
|
|
3
|
+
lindera_python-2.0.0.dist-info\METADATA,sha256=u5MJfIpag8FfbZEkLMKBtigoYZIOUHJJTkA_bQPwjTY,10676
|
|
4
|
+
lindera_python-2.0.0.dist-info\WHEEL,sha256=MufjGu_63ruR8ZgxzOhetQW9p_leDU_zsBcxlIi9DWc,96
|
|
5
|
+
lindera_python-2.0.0.dist-info\licenses\LICENSE,sha256=0mN1ENjb-MS9AfPb8v8YKJMMBxy2ngKHvESZ7iN_HYQ,1096
|
|
6
|
+
lindera_python-2.0.0.dist-info\RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2022 Lindera Morphology
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|