kuromoji-ko 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +225 -0
- package/dict/base.dat.gz +0 -0
- package/dict/cc.dat.gz +0 -0
- package/dict/check.dat.gz +0 -0
- package/dict/tid.dat.gz +0 -0
- package/dict/tid_map.dat.gz +0 -0
- package/dict/tid_pos.dat.gz +0 -0
- package/dict/unk.dat.gz +0 -0
- package/dict/unk_char.dat.gz +0 -0
- package/dict/unk_compat.dat.gz +0 -0
- package/dict/unk_invoke.dat.gz +0 -0
- package/dict/unk_map.dat.gz +0 -0
- package/dict/unk_pos.dat.gz +0 -0
- package/dist/index.cjs +1416 -0
- package/dist/index.d.cts +352 -0
- package/dist/index.d.ts +352 -0
- package/dist/index.js +1375 -0
- package/package.json +63 -0
package/README.md
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# kuromoji-ko
|
|
2
|
+
|
|
3
|
+
**Pure JavaScript Korean Morphological Analyzer**
|
|
4
|
+
|
|
5
|
+
A port of [kuromoji.js](https://github.com/takuyaa/kuromoji.js) adapted for Korean language processing using [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic).
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- 🚀 Pure JavaScript - runs in Node.js, browsers, and serverless (Vercel, Cloudflare Workers)
|
|
10
|
+
- 📦 No native dependencies - no compilation required
|
|
11
|
+
- 🇰🇷 Korean-optimized - uses mecab-ko-dic with Sejong tagset
|
|
12
|
+
- ⚡ Viterbi algorithm - accurate morphological analysis
|
|
13
|
+
- 🔧 Simple API - tokenize Korean text in a few lines
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
npm install kuromoji-ko
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Quick Start
|
|
22
|
+
|
|
23
|
+
```javascript
|
|
24
|
+
import kuromoji from 'kuromoji-ko';
|
|
25
|
+
|
|
26
|
+
const tokenizer = await kuromoji.builder({
|
|
27
|
+
dicPath: './dict'
|
|
28
|
+
}).build();
|
|
29
|
+
|
|
30
|
+
const tokens = tokenizer.tokenize('안녕하세요');
|
|
31
|
+
|
|
32
|
+
for (const token of tokens) {
|
|
33
|
+
console.log(token.surface_form, token.pos, token.posDescription);
|
|
34
|
+
}
|
|
35
|
+
// 안녕 NNG 일반 명사
|
|
36
|
+
// 하 XSV 동사 파생 접미사
|
|
37
|
+
// 세요 EF 종결 어미
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Building the Dictionary
|
|
41
|
+
|
|
42
|
+
The published package already ships prebuilt dictionary files in `dict/`. If you need to build the dictionary files from mecab-ko-dic yourself:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
# Download mecab-ko-dic
|
|
46
|
+
git clone https://bitbucket.org/eunjeon/mecab-ko-dic.git
|
|
47
|
+
|
|
48
|
+
# Build dictionary
|
|
49
|
+
npm run build:dict -- ./mecab-ko-dic ./dict
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
This creates binary dictionary files in the `./dict` directory.
|
|
53
|
+
|
|
54
|
+
## API
|
|
55
|
+
|
|
56
|
+
### `kuromoji.builder(options)`
|
|
57
|
+
|
|
58
|
+
Create a tokenizer builder.
|
|
59
|
+
|
|
60
|
+
```javascript
|
|
61
|
+
const builder = kuromoji.builder({
|
|
62
|
+
dicPath: './dict', // Path to dictionary directory
|
|
63
|
+
loader: customLoader // Optional custom file loader
|
|
64
|
+
});
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### `builder.build()`
|
|
68
|
+
|
|
69
|
+
Build and return the tokenizer (async).
|
|
70
|
+
|
|
71
|
+
```javascript
|
|
72
|
+
const tokenizer = await builder.build();
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### `tokenizer.tokenize(text)`
|
|
76
|
+
|
|
77
|
+
Tokenize Korean text into morphemes.
|
|
78
|
+
|
|
79
|
+
```javascript
|
|
80
|
+
const tokens = tokenizer.tokenize('한국어 형태소 분석');
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### `tokenizer.wakati(text)`
|
|
84
|
+
|
|
85
|
+
Get just the surface forms as an array.
|
|
86
|
+
|
|
87
|
+
```javascript
|
|
88
|
+
const words = tokenizer.wakati('한국어 형태소 분석');
|
|
89
|
+
// ['한국어', '형태소', '분석']
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### `tokenizer.wakatiString(text)`
|
|
93
|
+
|
|
94
|
+
Get space-separated surface forms.
|
|
95
|
+
|
|
96
|
+
```javascript
|
|
97
|
+
const str = tokenizer.wakatiString('한국어 형태소 분석');
|
|
98
|
+
// '한국어 형태소 분석'
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Token Object
|
|
102
|
+
|
|
103
|
+
Each token has the following properties:
|
|
104
|
+
|
|
105
|
+
| Property | Description | Example |
|
|
106
|
+
|----------|-------------|---------|
|
|
107
|
+
| `surface_form` | Surface text | `'한국어'` |
|
|
108
|
+
| `word_position` | Position in text (1-indexed) | `1` |
|
|
109
|
+
| `word_id` | Dictionary word ID | `12345` |
|
|
110
|
+
| `word_type` | KNOWN or UNKNOWN | `'KNOWN'` |
|
|
111
|
+
| `pos` | POS tag (Sejong tagset) | `'NNG'` |
|
|
112
|
+
| `posDescription` | POS description | `'일반 명사'` |
|
|
113
|
+
| `semantic_class` | Semantic category | `'*'` |
|
|
114
|
+
| `has_final_consonant` | Ends with 받침? (T/F/*) | `'F'` |
|
|
115
|
+
| `reading` | Pronunciation | `'한국어'` |
|
|
116
|
+
| `type` | Inflect/Compound/Preanalysis | `'Compound'` |
|
|
117
|
+
| `first_pos` | First POS (compounds) | `'NNG'` |
|
|
118
|
+
| `last_pos` | Last POS (compounds) | `'NNG'` |
|
|
119
|
+
| `expression` | Decomposition | `'한국/NNG/*+어/NNG/*'` |
|
|
120
|
+
|
|
121
|
+
## Korean POS Tags (Sejong Tagset)
|
|
122
|
+
|
|
123
|
+
### 체언 (Substantives)
|
|
124
|
+
| Tag | Description |
|
|
125
|
+
|-----|-------------|
|
|
126
|
+
| NNG | 일반 명사 (General noun) |
|
|
127
|
+
| NNP | 고유 명사 (Proper noun) |
|
|
128
|
+
| NNB | 의존 명사 (Dependent noun) |
|
|
129
|
+
| NR | 수사 (Numeral) |
|
|
130
|
+
| NP | 대명사 (Pronoun) |
|
|
131
|
+
|
|
132
|
+
### 용언 (Predicates)
|
|
133
|
+
| Tag | Description |
|
|
134
|
+
|-----|-------------|
|
|
135
|
+
| VV | 동사 (Verb) |
|
|
136
|
+
| VA | 형용사 (Adjective) |
|
|
137
|
+
| VX | 보조 용언 (Auxiliary) |
|
|
138
|
+
| VCP | 긍정 지정사 (Copula 이다) |
|
|
139
|
+
| VCN | 부정 지정사 (Negative 아니다) |
|
|
140
|
+
|
|
141
|
+
### 조사 (Particles)
|
|
142
|
+
| Tag | Description |
|
|
143
|
+
|-----|-------------|
|
|
144
|
+
| JKS | 주격 조사 (Subject) |
|
|
145
|
+
| JKO | 목적격 조사 (Object) |
|
|
146
|
+
| JKB | 부사격 조사 (Adverbial) |
|
|
147
|
+
| JX | 보조사 (Auxiliary particle) |
|
|
148
|
+
|
|
149
|
+
### 어미 (Endings)
|
|
150
|
+
| Tag | Description |
|
|
151
|
+
|-----|-------------|
|
|
152
|
+
| EP | 선어말 어미 (Pre-final) |
|
|
153
|
+
| EF | 종결 어미 (Final) |
|
|
154
|
+
| EC | 연결 어미 (Connective) |
|
|
155
|
+
| ETN | 명사형 전성 어미 (Nominalizing) |
|
|
156
|
+
| ETM | 관형형 전성 어미 (Adnominalizing) |
|
|
157
|
+
|
|
158
|
+
### 기타 (Others)
|
|
159
|
+
| Tag | Description |
|
|
160
|
+
|-----|-------------|
|
|
161
|
+
| SL | 외국어 (Foreign) |
|
|
162
|
+
| SH | 한자 (Chinese characters) |
|
|
163
|
+
| SN | 숫자 (Numbers) |
|
|
164
|
+
| SW | 기타 기호 (Symbols) |
|
|
165
|
+
|
|
166
|
+
## Browser Usage
|
|
167
|
+
|
|
168
|
+
```html
|
|
169
|
+
<script type="module">
|
|
170
|
+
import kuromoji from 'https://cdn.jsdelivr.net/npm/kuromoji-ko/dist/index.js';
|
|
171
|
+
|
|
172
|
+
const tokenizer = await kuromoji.builder({
|
|
173
|
+
dicPath: 'https://cdn.jsdelivr.net/npm/kuromoji-ko/dict/'
|
|
174
|
+
}).build();
|
|
175
|
+
|
|
176
|
+
console.log(tokenizer.tokenize('안녕하세요'));
|
|
177
|
+
</script>
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Serverless (Vercel) Usage
|
|
181
|
+
|
|
182
|
+
kuromoji-ko runs without native dependencies, making it perfect for serverless:
|
|
183
|
+
|
|
184
|
+
```javascript
|
|
185
|
+
// api/tokenize.js
|
|
186
|
+
import kuromoji from 'kuromoji-ko';
|
|
187
|
+
|
|
188
|
+
let tokenizerPromise = null;
|
|
189
|
+
|
|
190
|
+
function getTokenizer() {
|
|
191
|
+
if (!tokenizerPromise) {
|
|
192
|
+
tokenizerPromise = kuromoji.builder({
|
|
193
|
+
dicPath: './dict'
|
|
194
|
+
}).build();
|
|
195
|
+
}
|
|
196
|
+
return tokenizerPromise;
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
export default async function handler(req, res) {
|
|
200
|
+
const tokenizer = await getTokenizer();
|
|
201
|
+
const tokens = tokenizer.tokenize(req.body.text);
|
|
202
|
+
res.json(tokens);
|
|
203
|
+
}
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## How It Works
|
|
207
|
+
|
|
208
|
+
kuromoji-ko implements morphological analysis using:
|
|
209
|
+
|
|
210
|
+
1. **Double-Array TRIE** - Efficient dictionary lookup for surface forms
|
|
211
|
+
2. **Viterbi Algorithm** - Dynamic programming to find the optimal segmentation
|
|
212
|
+
3. **Connection Costs** - Bigram model for morpheme transitions
|
|
213
|
+
4. **Unknown Word Handling** - Character-type based POS estimation
|
|
214
|
+
|
|
215
|
+
## Credits
|
|
216
|
+
|
|
217
|
+
- [kuromoji.js](https://github.com/takuyaa/kuromoji.js) - Original Japanese implementation
|
|
218
|
+
- [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic) - Korean dictionary
|
|
219
|
+
- [MeCab](https://taku910.github.io/mecab/) - Original C++ morphological analyzer
|
|
220
|
+
|
|
221
|
+
## License
|
|
222
|
+
|
|
223
|
+
Apache-2.0
|
|
224
|
+
|
|
225
|
+
Dictionary files (mecab-ko-dic) are also Apache-2.0 licensed.
|
package/dict/base.dat.gz
ADDED
|
Binary file
|
package/dict/cc.dat.gz
ADDED
|
Binary file
|
|
Binary file
|
package/dict/tid.dat.gz
ADDED
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/dict/unk.dat.gz
ADDED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|