kuromoji-ko 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +76 -3
- package/dist/index.cjs +227 -1
- package/dist/index.d.cts +158 -1
- package/dist/index.d.ts +158 -1
- package/dist/index.js +224 -1
- package/package.json +4 -4
package/README.md
CHANGED
@@ -20,6 +20,24 @@ npm install kuromoji-ko
 
 ## Quick Start
 
+### napi-mecab Compatible API (Recommended)
+
+```javascript
+import { MeCab } from 'kuromoji-ko';
+
+const mecab = await MeCab.create({ engine: 'ko', dictPath: './dict' });
+const tokens = mecab.parse('안녕하세요');
+
+for (const token of tokens) {
+  console.log(token.surface, token.pos, token.lemma);
+}
+// 안녕 ['NNG'] 안녕
+// 하 ['XSV'] 하다
+// 세요 ['EF'] 세요
+```
+
+### Classic API
+
 ```javascript
 import kuromoji from 'kuromoji-ko';
 
@@ -53,7 +71,62 @@ This creates binary dictionary files in the `./dict` directory.
 
 ## API
 
-###
+### MeCab API (napi-mecab compatible)
+
+#### `MeCab.create(options)`
+
+Create a MeCab instance asynchronously.
+
+```javascript
+import { MeCab } from 'kuromoji-ko';
+
+const mecab = await MeCab.create({
+  engine: 'ko', // Only 'ko' is supported
+  dictPath: './dict' // Path to dictionary directory
+});
+```
+
+#### `mecab.parse(text)`
+
+Parse text into an array of Token objects.
+
+```javascript
+const tokens = mecab.parse('아버지가방에들어가신다');
+tokens.forEach(t => console.log(t.surface, t.pos));
+```
+
+### Token Object (napi-mecab compatible)
+
+| Property | Type | Description |
+|----------|------|-------------|
+| `surface` | `string` | How the token looks in the input text |
+| `pos` | `string[]` | Parts of speech as array (split by "+") |
+| `lemma` | `string` | Dictionary headword (adds "다" for verbs) |
+| `pronunciation` | `string \| null` | How the token is pronounced |
+| `hasBatchim` | `boolean \| null` | Whether token has final consonant (받침) |
+| `hasJongseong` | `boolean \| null` | Alias for hasBatchim |
+| `semanticClass` | `string \| null` | Semantic word class or category |
+| `type` | `string \| null` | Token type (Inflect/Compound/Preanalysis) |
+| `expression` | `ExpressionToken[] \| null` | Breakdown of compound/inflected tokens |
+| `features` | `string` | Raw features string (comma-separated) |
+| `raw` | `string` | Raw MeCab output format (surface\tfeatures) |
+
+### ExpressionToken Object
+
+For compound or inflected words, `expression` returns an array of ExpressionToken:
+
+| Property | Type | Description |
+|----------|------|-------------|
+| `morpheme` | `string` | The normalized token |
+| `pos` | `string` | Part of speech |
+| `lemma` | `string` | Dictionary form (adds "다" for verbs) |
+| `semanticClass` | `string \| null` | Semantic category |
+
+---
+
+### Classic API
+
+#### `kuromoji.builder(options)`
 
 Create a tokenizer builder.
 
@@ -98,9 +171,9 @@ const str = tokenizer.wakatiString('한국어 형태소 분석');
 // '한국어 형태소 분석'
 ```
 
-##
+## KoreanToken Object (Classic API)
 
-Each token has the following properties:
+Each token from `tokenizer.tokenize()` has the following properties:
 
 | Property | Description | Example |
 |----------|-------------|---------|
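The tables above pin down the napi-mecab compatible Token shape; the sketch below ties a few of those fields together. It is illustrative only: it assumes the binary dictionary has already been built into `./dict` as described in the README, and the analyses hinted at in the comments depend on mecab-ko-dic, so treat them as approximate.

```javascript
import { MeCab } from 'kuromoji-ko';

const mecab = await MeCab.create({ engine: 'ko', dictPath: './dict' });

for (const token of mecab.parse('하늘을 나는 자동차')) {
  // pos is an array because compound tags are split on "+"
  console.log(token.surface, token.pos.join('+'), token.lemma);

  // hasBatchim reports whether the surface ends in a final consonant (받침),
  // which is what you need to pick particles such as 이/가 or 은/는
  if (token.hasBatchim !== null) {
    console.log('  particle hint:', token.hasBatchim ? '이 / 은' : '가 / 는');
  }

  // Inflected/compound tokens expose their breakdown through expression
  if (token.expression) {
    for (const part of token.expression) {
      console.log('  ', part.morpheme, part.pos, '->', part.lemma);
    }
  }
}
```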
package/dist/index.cjs
CHANGED
@@ -30,8 +30,11 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
 // src/index.ts
 var index_exports = {};
 __export(index_exports, {
+  ExpressionToken: () => ExpressionToken,
   KoreanToken: () => KoreanToken,
+  MeCab: () => MeCab,
   POS_TAGS: () => POS_TAGS,
+  Token: () => Token,
   Tokenizer: () => Tokenizer,
   TokenizerBuilder: () => TokenizerBuilder,
   builder: () => builder,
@@ -1401,15 +1404,238 @@ var TokenizerBuilder = class {
   }
 };
 
+// src/ExpressionToken.ts
+var VERB_TAGS = ["VV", "VA", "VX", "VCP", "VCN"];
+function nullIfStar(value) {
+  return value === "*" ? null : value;
+}
+var ExpressionToken = class {
+  constructor(raw) {
+    const parts = raw.split("/");
+    this._morpheme = parts[0] ?? "";
+    this._pos = parts[1] ?? "";
+    this._semanticClass = parts[2] ?? "*";
+  }
+  /**
+   * The normalized token/morpheme
+   */
+  get morpheme() {
+    return this._morpheme;
+  }
+  /**
+   * The part of speech tag
+   */
+  get pos() {
+    return this._pos;
+  }
+  /**
+   * The dictionary form (adds 다 for verbs)
+   */
+  get lemma() {
+    if (VERB_TAGS.includes(this._pos)) {
+      return this._morpheme + "\uB2E4";
+    }
+    return this._morpheme;
+  }
+  /**
+   * The semantic word class or category
+   */
+  get semanticClass() {
+    return nullIfStar(this._semanticClass);
+  }
+};
+
+// src/Token.ts
+var VERB_TAGS2 = ["VV", "VA", "VX", "VCP", "VCN"];
+function nullIfStar2(value) {
+  return value === "*" ? null : value;
+}
+var Token = class {
+  constructor(token) {
+    this._token = token;
+  }
+  /**
+   * How the token looks in the input text
+   */
+  get surface() {
+    return this._token.surface_form;
+  }
+  /**
+   * The raw features string (comma-separated)
+   */
+  get features() {
+    return [
+      this._token.pos,
+      this._token.semantic_class,
+      this._token.has_final_consonant,
+      this._token.reading,
+      this._token.type,
+      this._token.first_pos,
+      this._token.last_pos,
+      this._token.expression
+    ].join(",");
+  }
+  /**
+   * The raw string in MeCab format (surface\tfeatures)
+   */
+  get raw() {
+    return `${this.surface} ${this.features}`;
+  }
+  /**
+   * Parts of speech as an array (split by "+")
+   */
+  get pos() {
+    return this._token.pos.split("+");
+  }
+  /**
+   * The dictionary headword (adds 다 for verbs)
+   */
+  get lemma() {
+    const basePos = this.pos[0];
+    if (VERB_TAGS2.includes(basePos)) {
+      return this.surface + "\uB2E4";
+    }
+    return this.surface;
+  }
+  /**
+   * How the token is pronounced
+   */
+  get pronunciation() {
+    return nullIfStar2(this._token.reading);
+  }
+  /**
+   * Whether the token has a final consonant (받침/batchim)
+   */
+  get hasBatchim() {
+    const val = this._token.has_final_consonant;
+    if (val === "T") return true;
+    if (val === "F") return false;
+    return null;
+  }
+  /**
+   * Alias for hasBatchim (종성/jongseong)
+   */
+  get hasJongseong() {
+    return this.hasBatchim;
+  }
+  /**
+   * The semantic word class or category
+   */
+  get semanticClass() {
+    return nullIfStar2(this._token.semantic_class);
+  }
+  /**
+   * The type of token (Inflect/Compound/Preanalysis)
+   */
+  get type() {
+    return nullIfStar2(this._token.type);
+  }
+  /**
+   * The broken-down expression tokens for compound/inflected words
+   */
+  get expression() {
+    if (this._token.expression === "*") return null;
+    return this._token.expression.split("+").map((part) => new ExpressionToken(part));
+  }
+  /**
+   * Get the underlying KoreanToken
+   */
+  get koreanToken() {
+    return this._token;
+  }
+};
+
+// src/MeCab.ts
+var MeCab = class _MeCab {
+  constructor(tokenizer) {
+    this.tokenizer = tokenizer;
+  }
+  /**
+   * Create a MeCab instance asynchronously.
+   *
+   * Unlike napi-mecab which uses a synchronous constructor,
+   * this pure JavaScript implementation requires async initialization
+   * to load the dictionary files without blocking.
+   *
+   * @example
+   * ```typescript
+   * const mecab = await MeCab.create({ engine: 'ko' });
+   * const tokens = mecab.parse('안녕하세요');
+   * ```
+   */
+  static async create(opts = {}) {
+    const engine = opts.engine ?? "ko";
+    if (engine !== "ko") {
+      throw new Error(
+        `"${engine}" is not a supported mecab engine. Only "ko" (Korean) is supported.`
+      );
+    }
+    const builder2 = new TokenizerBuilder({
+      dicPath: opts.dictPath
+    });
+    const tokenizer = await builder2.build();
+    return new _MeCab(tokenizer);
+  }
+  /**
+   * Parse text into an array of tokens.
+   *
+   * @param text - The text to parse
+   * @returns Array of Token objects
+   *
+   * @example
+   * ```typescript
+   * const tokens = mecab.parse('아버지가방에들어가신다');
+   * tokens.forEach(t => console.log(t.surface, t.pos));
+   * ```
+   */
+  parse(text) {
+    const koreanTokens = this.tokenizer.tokenize(text);
+    return koreanTokens.map((token) => new Token(token));
+  }
+  /**
+   * Get just the surface forms as an array.
+   * Convenience method equivalent to napi-mecab parse + map surface.
+   */
+  wakati(text) {
+    return this.tokenizer.wakati(text);
+  }
+  /**
+   * Get space-separated surface forms.
+   */
+  wakatiString(text) {
+    return this.tokenizer.wakatiString(text);
+  }
+  /**
+   * Access the underlying Tokenizer for advanced usage.
+   */
+  get underlyingTokenizer() {
+    return this.tokenizer;
+  }
+};
+
 // src/index.ts
 function builder(options = {}) {
   return new TokenizerBuilder(options);
 }
-var index_default = {
+var index_default = {
+  // Original API
+  builder,
+  TokenizerBuilder,
+  Tokenizer,
+  KoreanToken,
+  POS_TAGS,
+  // napi-mecab compatible API
+  MeCab,
+  Token,
+  ExpressionToken
+};
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {
+  ExpressionToken,
   KoreanToken,
+  MeCab,
   POS_TAGS,
+  Token,
   Tokenizer,
   TokenizerBuilder,
   builder
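The classes added to the CommonJS bundle above are thin wrappers over a mecab-ko-dic feature row: `Token#pos` splits the tag on `+`, `Token#expression` splits the compound breakdown on `+` and each part on `/`, and verb-like tags get `다` appended to form the lemma. The snippet below is a standalone sketch of that parsing step, not the shipped code, and the sample `expression` string is hypothetical.

```javascript
// Mirrors the bundled ExpressionToken logic: "morpheme/pos/semanticClass"
// parts joined by "+". The input string here is made up for illustration.
const VERB_TAGS = ['VV', 'VA', 'VX', 'VCP', 'VCN'];
const expression = '들어가/VV/*+시/EP/*+ㄴ다/EC/*';

const parts = expression.split('+').map((raw) => {
  const [morpheme = '', pos = '', semanticClass = '*'] = raw.split('/');
  return {
    morpheme,
    pos,
    // verb-like tags get 다 appended to form the dictionary lemma
    lemma: VERB_TAGS.includes(pos) ? morpheme + '다' : morpheme,
    // "*" is the dictionary's "no value" marker, surfaced as null
    semanticClass: semanticClass === '*' ? null : semanticClass,
  };
});

console.log(parts);
// e.g. [ { morpheme: '들어가', pos: 'VV', lemma: '들어가다', semanticClass: null }, ... ]
```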
package/dist/index.d.cts
CHANGED
@@ -330,6 +330,160 @@ declare class TokenizerBuilder {
   build(): Promise<Tokenizer>;
 }
 
+/**
+ * ExpressionToken - represents a component of an agglutinated Korean token
+ *
+ * Korean compound/inflected words have an expression field in the format:
+ * "morpheme/pos/semanticClass+morpheme/pos/semanticClass+..."
+ *
+ * This class represents a single component of that expression.
+ */
+declare class ExpressionToken {
+  private _morpheme;
+  private _pos;
+  private _semanticClass;
+  constructor(raw: string);
+  /**
+   * The normalized token/morpheme
+   */
+  get morpheme(): string;
+  /**
+   * The part of speech tag
+   */
+  get pos(): string;
+  /**
+   * The dictionary form (adds 다 for verbs)
+   */
+  get lemma(): string;
+  /**
+   * The semantic word class or category
+   */
+  get semanticClass(): string | null;
+}
+
+/**
+ * Token - napi-mecab compatible token wrapper
+ *
+ * Provides getters that match the napi-mecab API for Korean tokens.
+ */
+
+declare class Token {
+  private _token;
+  constructor(token: KoreanToken);
+  /**
+   * How the token looks in the input text
+   */
+  get surface(): string;
+  /**
+   * The raw features string (comma-separated)
+   */
+  get features(): string;
+  /**
+   * The raw string in MeCab format (surface\tfeatures)
+   */
+  get raw(): string;
+  /**
+   * Parts of speech as an array (split by "+")
+   */
+  get pos(): string[];
+  /**
+   * The dictionary headword (adds 다 for verbs)
+   */
+  get lemma(): string | null;
+  /**
+   * How the token is pronounced
+   */
+  get pronunciation(): string | null;
+  /**
+   * Whether the token has a final consonant (받침/batchim)
+   */
+  get hasBatchim(): boolean | null;
+  /**
+   * Alias for hasBatchim (종성/jongseong)
+   */
+  get hasJongseong(): boolean | null;
+  /**
+   * The semantic word class or category
+   */
+  get semanticClass(): string | null;
+  /**
+   * The type of token (Inflect/Compound/Preanalysis)
+   */
+  get type(): string | null;
+  /**
+   * The broken-down expression tokens for compound/inflected words
+   */
+  get expression(): ExpressionToken[] | null;
+  /**
+   * Get the underlying KoreanToken
+   */
+  get koreanToken(): KoreanToken;
+}
+
+/**
+ * MeCab - napi-mecab compatible API wrapper
+ *
+ * Provides a familiar API for users coming from napi-mecab.
+ * Uses async initialization since this is a pure JavaScript implementation.
+ */
+
+interface MeCabOptions {
+  /**
+   * The language engine to use. Only 'ko' (Korean) is supported.
+   * @default 'ko'
+   */
+  engine?: 'ko';
+  /**
+   * Path to the dictionary directory.
+   * @default 'dict/'
+   */
+  dictPath?: string;
+}
+declare class MeCab {
+  private tokenizer;
+  private constructor();
+  /**
+   * Create a MeCab instance asynchronously.
+   *
+   * Unlike napi-mecab which uses a synchronous constructor,
+   * this pure JavaScript implementation requires async initialization
+   * to load the dictionary files without blocking.
+   *
+   * @example
+   * ```typescript
+   * const mecab = await MeCab.create({ engine: 'ko' });
+   * const tokens = mecab.parse('안녕하세요');
+   * ```
+   */
+  static create(opts?: MeCabOptions): Promise<MeCab>;
+  /**
+   * Parse text into an array of tokens.
+   *
+   * @param text - The text to parse
+   * @returns Array of Token objects
+   *
+   * @example
+   * ```typescript
+   * const tokens = mecab.parse('아버지가방에들어가신다');
+   * tokens.forEach(t => console.log(t.surface, t.pos));
+   * ```
+   */
+  parse(text: string): Token[];
+  /**
+   * Get just the surface forms as an array.
+   * Convenience method equivalent to napi-mecab parse + map surface.
+   */
+  wakati(text: string): string[];
+  /**
+   * Get space-separated surface forms.
+   */
+  wakatiString(text: string): string;
+  /**
+   * Access the underlying Tokenizer for advanced usage.
+   */
+  get underlyingTokenizer(): Tokenizer;
+}
+
 /**
  * mecab-ko - Pure TypeScript Korean Morphological Analyzer
  *
@@ -347,6 +501,9 @@ declare const _default: {
   Tokenizer: typeof Tokenizer;
   KoreanToken: typeof KoreanToken;
   POS_TAGS: Record<string, string>;
+  MeCab: typeof MeCab;
+  Token: typeof Token;
+  ExpressionToken: typeof ExpressionToken;
 };
 
-export { KoreanToken, POS_TAGS, Tokenizer, TokenizerBuilder, type TokenizerBuilderOptions, builder, _default as default };
+export { ExpressionToken, KoreanToken, MeCab, type MeCabOptions, POS_TAGS, Token, Tokenizer, TokenizerBuilder, type TokenizerBuilderOptions, builder, _default as default };
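A small consumer-side sketch against the declarations above. The defaults (`engine: 'ko'`, `dictPath: 'dict/'`) come from the `MeCabOptions` JSDoc, the failing call only illustrates the documented "ko"-only restriction, and the example assumes a dictionary has been generated at the default path.

```javascript
import { MeCab } from 'kuromoji-ko';

// With no options the documented defaults apply: engine 'ko', dictPath 'dict/'.
const mecab = await MeCab.create();
console.log(mecab.wakatiString('한국어 형태소 분석'));

// Any engine other than 'ko' is rejected when the instance is created.
try {
  await MeCab.create({ engine: 'ja' });
} catch (err) {
  console.error(err.message); // "ja" is not a supported mecab engine...
}
```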
package/dist/index.d.ts
CHANGED
@@ -330,6 +330,160 @@ declare class TokenizerBuilder {
   build(): Promise<Tokenizer>;
 }
 
+/**
+ * ExpressionToken - represents a component of an agglutinated Korean token
+ *
+ * Korean compound/inflected words have an expression field in the format:
+ * "morpheme/pos/semanticClass+morpheme/pos/semanticClass+..."
+ *
+ * This class represents a single component of that expression.
+ */
+declare class ExpressionToken {
+  private _morpheme;
+  private _pos;
+  private _semanticClass;
+  constructor(raw: string);
+  /**
+   * The normalized token/morpheme
+   */
+  get morpheme(): string;
+  /**
+   * The part of speech tag
+   */
+  get pos(): string;
+  /**
+   * The dictionary form (adds 다 for verbs)
+   */
+  get lemma(): string;
+  /**
+   * The semantic word class or category
+   */
+  get semanticClass(): string | null;
+}
+
+/**
+ * Token - napi-mecab compatible token wrapper
+ *
+ * Provides getters that match the napi-mecab API for Korean tokens.
+ */
+
+declare class Token {
+  private _token;
+  constructor(token: KoreanToken);
+  /**
+   * How the token looks in the input text
+   */
+  get surface(): string;
+  /**
+   * The raw features string (comma-separated)
+   */
+  get features(): string;
+  /**
+   * The raw string in MeCab format (surface\tfeatures)
+   */
+  get raw(): string;
+  /**
+   * Parts of speech as an array (split by "+")
+   */
+  get pos(): string[];
+  /**
+   * The dictionary headword (adds 다 for verbs)
+   */
+  get lemma(): string | null;
+  /**
+   * How the token is pronounced
+   */
+  get pronunciation(): string | null;
+  /**
+   * Whether the token has a final consonant (받침/batchim)
+   */
+  get hasBatchim(): boolean | null;
+  /**
+   * Alias for hasBatchim (종성/jongseong)
+   */
+  get hasJongseong(): boolean | null;
+  /**
+   * The semantic word class or category
+   */
+  get semanticClass(): string | null;
+  /**
+   * The type of token (Inflect/Compound/Preanalysis)
+   */
+  get type(): string | null;
+  /**
+   * The broken-down expression tokens for compound/inflected words
+   */
+  get expression(): ExpressionToken[] | null;
+  /**
+   * Get the underlying KoreanToken
+   */
+  get koreanToken(): KoreanToken;
+}
+
+/**
+ * MeCab - napi-mecab compatible API wrapper
+ *
+ * Provides a familiar API for users coming from napi-mecab.
+ * Uses async initialization since this is a pure JavaScript implementation.
+ */
+
+interface MeCabOptions {
+  /**
+   * The language engine to use. Only 'ko' (Korean) is supported.
+   * @default 'ko'
+   */
+  engine?: 'ko';
+  /**
+   * Path to the dictionary directory.
+   * @default 'dict/'
+   */
+  dictPath?: string;
+}
+declare class MeCab {
+  private tokenizer;
+  private constructor();
+  /**
+   * Create a MeCab instance asynchronously.
+   *
+   * Unlike napi-mecab which uses a synchronous constructor,
+   * this pure JavaScript implementation requires async initialization
+   * to load the dictionary files without blocking.
+   *
+   * @example
+   * ```typescript
+   * const mecab = await MeCab.create({ engine: 'ko' });
+   * const tokens = mecab.parse('안녕하세요');
+   * ```
+   */
+  static create(opts?: MeCabOptions): Promise<MeCab>;
+  /**
+   * Parse text into an array of tokens.
+   *
+   * @param text - The text to parse
+   * @returns Array of Token objects
+   *
+   * @example
+   * ```typescript
+   * const tokens = mecab.parse('아버지가방에들어가신다');
+   * tokens.forEach(t => console.log(t.surface, t.pos));
+   * ```
+   */
+  parse(text: string): Token[];
+  /**
+   * Get just the surface forms as an array.
+   * Convenience method equivalent to napi-mecab parse + map surface.
+   */
+  wakati(text: string): string[];
+  /**
+   * Get space-separated surface forms.
+   */
+  wakatiString(text: string): string;
+  /**
+   * Access the underlying Tokenizer for advanced usage.
+   */
+  get underlyingTokenizer(): Tokenizer;
+}
+
 /**
  * mecab-ko - Pure TypeScript Korean Morphological Analyzer
  *
@@ -347,6 +501,9 @@ declare const _default: {
   Tokenizer: typeof Tokenizer;
   KoreanToken: typeof KoreanToken;
   POS_TAGS: Record<string, string>;
+  MeCab: typeof MeCab;
+  Token: typeof Token;
+  ExpressionToken: typeof ExpressionToken;
 };
 
-export { KoreanToken, POS_TAGS, Tokenizer, TokenizerBuilder, type TokenizerBuilderOptions, builder, _default as default };
+export { ExpressionToken, KoreanToken, MeCab, type MeCabOptions, POS_TAGS, Token, Tokenizer, TokenizerBuilder, type TokenizerBuilderOptions, builder, _default as default };
package/dist/index.js
CHANGED
@@ -1360,14 +1360,237 @@ var TokenizerBuilder = class {
   }
 };
 
+// src/ExpressionToken.ts
+var VERB_TAGS = ["VV", "VA", "VX", "VCP", "VCN"];
+function nullIfStar(value) {
+  return value === "*" ? null : value;
+}
+var ExpressionToken = class {
+  constructor(raw) {
+    const parts = raw.split("/");
+    this._morpheme = parts[0] ?? "";
+    this._pos = parts[1] ?? "";
+    this._semanticClass = parts[2] ?? "*";
+  }
+  /**
+   * The normalized token/morpheme
+   */
+  get morpheme() {
+    return this._morpheme;
+  }
+  /**
+   * The part of speech tag
+   */
+  get pos() {
+    return this._pos;
+  }
+  /**
+   * The dictionary form (adds 다 for verbs)
+   */
+  get lemma() {
+    if (VERB_TAGS.includes(this._pos)) {
+      return this._morpheme + "\uB2E4";
+    }
+    return this._morpheme;
+  }
+  /**
+   * The semantic word class or category
+   */
+  get semanticClass() {
+    return nullIfStar(this._semanticClass);
+  }
+};
+
+// src/Token.ts
+var VERB_TAGS2 = ["VV", "VA", "VX", "VCP", "VCN"];
+function nullIfStar2(value) {
+  return value === "*" ? null : value;
+}
+var Token = class {
+  constructor(token) {
+    this._token = token;
+  }
+  /**
+   * How the token looks in the input text
+   */
+  get surface() {
+    return this._token.surface_form;
+  }
+  /**
+   * The raw features string (comma-separated)
+   */
+  get features() {
+    return [
+      this._token.pos,
+      this._token.semantic_class,
+      this._token.has_final_consonant,
+      this._token.reading,
+      this._token.type,
+      this._token.first_pos,
+      this._token.last_pos,
+      this._token.expression
+    ].join(",");
+  }
+  /**
+   * The raw string in MeCab format (surface\tfeatures)
+   */
+  get raw() {
+    return `${this.surface} ${this.features}`;
+  }
+  /**
+   * Parts of speech as an array (split by "+")
+   */
+  get pos() {
+    return this._token.pos.split("+");
+  }
+  /**
+   * The dictionary headword (adds 다 for verbs)
+   */
+  get lemma() {
+    const basePos = this.pos[0];
+    if (VERB_TAGS2.includes(basePos)) {
+      return this.surface + "\uB2E4";
+    }
+    return this.surface;
+  }
+  /**
+   * How the token is pronounced
+   */
+  get pronunciation() {
+    return nullIfStar2(this._token.reading);
+  }
+  /**
+   * Whether the token has a final consonant (받침/batchim)
+   */
+  get hasBatchim() {
+    const val = this._token.has_final_consonant;
+    if (val === "T") return true;
+    if (val === "F") return false;
+    return null;
+  }
+  /**
+   * Alias for hasBatchim (종성/jongseong)
+   */
+  get hasJongseong() {
+    return this.hasBatchim;
+  }
+  /**
+   * The semantic word class or category
+   */
+  get semanticClass() {
+    return nullIfStar2(this._token.semantic_class);
+  }
+  /**
+   * The type of token (Inflect/Compound/Preanalysis)
+   */
+  get type() {
+    return nullIfStar2(this._token.type);
+  }
+  /**
+   * The broken-down expression tokens for compound/inflected words
+   */
+  get expression() {
+    if (this._token.expression === "*") return null;
+    return this._token.expression.split("+").map((part) => new ExpressionToken(part));
+  }
+  /**
+   * Get the underlying KoreanToken
+   */
+  get koreanToken() {
+    return this._token;
+  }
+};
+
+// src/MeCab.ts
+var MeCab = class _MeCab {
+  constructor(tokenizer) {
+    this.tokenizer = tokenizer;
+  }
+  /**
+   * Create a MeCab instance asynchronously.
+   *
+   * Unlike napi-mecab which uses a synchronous constructor,
+   * this pure JavaScript implementation requires async initialization
+   * to load the dictionary files without blocking.
+   *
+   * @example
+   * ```typescript
+   * const mecab = await MeCab.create({ engine: 'ko' });
+   * const tokens = mecab.parse('안녕하세요');
+   * ```
+   */
+  static async create(opts = {}) {
+    const engine = opts.engine ?? "ko";
+    if (engine !== "ko") {
+      throw new Error(
+        `"${engine}" is not a supported mecab engine. Only "ko" (Korean) is supported.`
+      );
+    }
+    const builder2 = new TokenizerBuilder({
+      dicPath: opts.dictPath
+    });
+    const tokenizer = await builder2.build();
+    return new _MeCab(tokenizer);
+  }
+  /**
+   * Parse text into an array of tokens.
+   *
+   * @param text - The text to parse
+   * @returns Array of Token objects
+   *
+   * @example
+   * ```typescript
+   * const tokens = mecab.parse('아버지가방에들어가신다');
+   * tokens.forEach(t => console.log(t.surface, t.pos));
+   * ```
+   */
+  parse(text) {
+    const koreanTokens = this.tokenizer.tokenize(text);
+    return koreanTokens.map((token) => new Token(token));
+  }
+  /**
+   * Get just the surface forms as an array.
+   * Convenience method equivalent to napi-mecab parse + map surface.
+   */
+  wakati(text) {
+    return this.tokenizer.wakati(text);
+  }
+  /**
+   * Get space-separated surface forms.
+   */
+  wakatiString(text) {
+    return this.tokenizer.wakatiString(text);
+  }
+  /**
+   * Access the underlying Tokenizer for advanced usage.
+   */
+  get underlyingTokenizer() {
+    return this.tokenizer;
+  }
+};
+
 // src/index.ts
 function builder(options = {}) {
   return new TokenizerBuilder(options);
 }
-var index_default = {
+var index_default = {
+  // Original API
+  builder,
+  TokenizerBuilder,
+  Tokenizer,
+  KoreanToken,
+  POS_TAGS,
+  // napi-mecab compatible API
+  MeCab,
+  Token,
+  ExpressionToken
+};
 export {
+  ExpressionToken,
   KoreanToken,
+  MeCab,
   POS_TAGS,
+  Token,
   Tokenizer,
   TokenizerBuilder,
   builder,
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "kuromoji-ko",
-  "version": "1.0.
+  "version": "1.0.2",
   "description": "Pure TypeScript Korean Morphological Analyzer - serverless compatible, based on kuromoji.js and mecab-ko-dic",
   "main": "dist/index.js",
   "module": "dist/index.mjs",
@@ -42,12 +42,12 @@
   "license": "Apache-2.0",
   "repository": {
     "type": "git",
-    "url": "git+https://github.com/
+    "url": "git+https://github.com/elfsmelf/kuromoji-ko.git"
   },
   "bugs": {
-    "url": "https://github.com/
+    "url": "https://github.com/elfsmelf/kuromoji-ko/issues"
   },
-  "homepage": "https://github.com/
+  "homepage": "https://github.com/elfsmelf/kuromoji-ko#readme",
   "devDependencies": {
     "@types/node": "^20.10.0",
     "@types/pako": "^2.0.4",