kuromoji-ko 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -20,6 +20,24 @@ npm install kuromoji-ko
20
20
 
21
21
  ## Quick Start
22
22
 
23
+ ### napi-mecab Compatible API (Recommended)
24
+
25
+ ```javascript
26
+ import { MeCab } from 'kuromoji-ko';
27
+
28
+ const mecab = await MeCab.create({ engine: 'ko', dictPath: './dict' });
29
+ const tokens = mecab.parse('안녕하세요');
30
+
31
+ for (const token of tokens) {
32
+ console.log(token.surface, token.pos, token.lemma);
33
+ }
34
+ // 안녕 ['NNG'] 안녕
35
+ // 하 ['XSV'] 하다
36
+ // 세요 ['EF'] 세요
37
+ ```
38
+
39
+ ### Classic API
40
+
23
41
  ```javascript
24
42
  import kuromoji from 'kuromoji-ko';
25
43
 
@@ -53,7 +71,62 @@ This creates binary dictionary files in the `./dict` directory.
53
71
 
54
72
  ## API
55
73
 
56
- ### `kuromoji.builder(options)`
74
+ ### MeCab API (napi-mecab compatible)
75
+
76
+ #### `MeCab.create(options)`
77
+
78
+ Create a MeCab instance asynchronously.
79
+
80
+ ```javascript
81
+ import { MeCab } from 'kuromoji-ko';
82
+
83
+ const mecab = await MeCab.create({
84
+ engine: 'ko', // Only 'ko' is supported
85
+ dictPath: './dict' // Path to dictionary directory
86
+ });
87
+ ```
88
+
89
+ #### `mecab.parse(text)`
90
+
91
+ Parse text into an array of Token objects.
92
+
93
+ ```javascript
94
+ const tokens = mecab.parse('아버지가방에들어가신다');
95
+ tokens.forEach(t => console.log(t.surface, t.pos));
96
+ ```
97
+
98
+ ### Token Object (napi-mecab compatible)
99
+
100
+ | Property | Type | Description |
101
+ |----------|------|-------------|
102
+ | `surface` | `string` | How the token looks in the input text |
103
+ | `pos` | `string[]` | Parts of speech as array (split by "+") |
104
+ | `lemma` | `string` | Dictionary headword (adds "다" for verbs) |
105
+ | `pronunciation` | `string \| null` | How the token is pronounced |
106
+ | `hasBatchim` | `boolean \| null` | Whether token has final consonant (받침) |
107
+ | `hasJongseong` | `boolean \| null` | Alias for hasBatchim |
108
+ | `semanticClass` | `string \| null` | Semantic word class or category |
109
+ | `type` | `string \| null` | Token type (Inflect/Compound/Preanalysis) |
110
+ | `expression` | `ExpressionToken[] \| null` | Breakdown of compound/inflected tokens |
111
+ | `features` | `string` | Raw features string (comma-separated) |
112
+ | `raw` | `string` | Raw MeCab-style string of surface plus features (note: currently space-separated, not the tab-separated `surface\tfeatures` of native MeCab) |
113
+
114
+ ### ExpressionToken Object
115
+
116
+ For compound or inflected words, `expression` returns an array of ExpressionToken:
117
+
118
+ | Property | Type | Description |
119
+ |----------|------|-------------|
120
+ | `morpheme` | `string` | The normalized token |
121
+ | `pos` | `string` | Part of speech |
122
+ | `lemma` | `string` | Dictionary form (adds "다" for verbs) |
123
+ | `semanticClass` | `string \| null` | Semantic category |
124
+
125
+ ---
126
+
127
+ ### Classic API
128
+
129
+ #### `kuromoji.builder(options)`
57
130
 
58
131
  Create a tokenizer builder.
59
132
 
@@ -98,9 +171,9 @@ const str = tokenizer.wakatiString('한국어 형태소 분석');
98
171
  // '한국어 형태소 분석'
99
172
  ```
100
173
 
101
- ## Token Object
174
+ ## KoreanToken Object (Classic API)
102
175
 
103
- Each token has the following properties:
176
+ Each token from `tokenizer.tokenize()` has the following properties:
104
177
 
105
178
  | Property | Description | Example |
106
179
  |----------|-------------|---------|
package/dist/index.cjs CHANGED
@@ -30,8 +30,11 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
30
30
  // src/index.ts
31
31
  var index_exports = {};
32
32
  __export(index_exports, {
33
+ ExpressionToken: () => ExpressionToken,
33
34
  KoreanToken: () => KoreanToken,
35
+ MeCab: () => MeCab,
34
36
  POS_TAGS: () => POS_TAGS,
37
+ Token: () => Token,
35
38
  Tokenizer: () => Tokenizer,
36
39
  TokenizerBuilder: () => TokenizerBuilder,
37
40
  builder: () => builder,
@@ -1401,15 +1404,238 @@ var TokenizerBuilder = class {
1401
1404
  }
1402
1405
  };
1403
1406
 
1407
+ // src/ExpressionToken.ts
1408
+ var VERB_TAGS = ["VV", "VA", "VX", "VCP", "VCN"];
1409
+ function nullIfStar(value) {
1410
+ return value === "*" ? null : value;
1411
+ }
1412
+ var ExpressionToken = class {
1413
+ constructor(raw) {
1414
+ const parts = raw.split("/");
1415
+ this._morpheme = parts[0] ?? "";
1416
+ this._pos = parts[1] ?? "";
1417
+ this._semanticClass = parts[2] ?? "*";
1418
+ }
1419
+ /**
1420
+ * The normalized token/morpheme
1421
+ */
1422
+ get morpheme() {
1423
+ return this._morpheme;
1424
+ }
1425
+ /**
1426
+ * The part of speech tag
1427
+ */
1428
+ get pos() {
1429
+ return this._pos;
1430
+ }
1431
+ /**
1432
+ * The dictionary form (adds 다 for verbs)
1433
+ */
1434
+ get lemma() {
1435
+ if (VERB_TAGS.includes(this._pos)) {
1436
+ return this._morpheme + "\uB2E4";
1437
+ }
1438
+ return this._morpheme;
1439
+ }
1440
+ /**
1441
+ * The semantic word class or category
1442
+ */
1443
+ get semanticClass() {
1444
+ return nullIfStar(this._semanticClass);
1445
+ }
1446
+ };
1447
+
1448
+ // src/Token.ts
1449
+ var VERB_TAGS2 = ["VV", "VA", "VX", "VCP", "VCN"];
1450
+ function nullIfStar2(value) {
1451
+ return value === "*" ? null : value;
1452
+ }
1453
+ var Token = class {
1454
+ constructor(token) {
1455
+ this._token = token;
1456
+ }
1457
+ /**
1458
+ * How the token looks in the input text
1459
+ */
1460
+ get surface() {
1461
+ return this._token.surface_form;
1462
+ }
1463
+ /**
1464
+ * The raw features string (comma-separated)
1465
+ */
1466
+ get features() {
1467
+ return [
1468
+ this._token.pos,
1469
+ this._token.semantic_class,
1470
+ this._token.has_final_consonant,
1471
+ this._token.reading,
1472
+ this._token.type,
1473
+ this._token.first_pos,
1474
+ this._token.last_pos,
1475
+ this._token.expression
1476
+ ].join(",");
1477
+ }
1478
+ /**
1479
+ * The raw string in MeCab format (surface\tfeatures)
1480
+ */
1481
+ get raw() {
1482
+ return `${this.surface} ${this.features}`;
1483
+ }
1484
+ /**
1485
+ * Parts of speech as an array (split by "+")
1486
+ */
1487
+ get pos() {
1488
+ return this._token.pos.split("+");
1489
+ }
1490
+ /**
1491
+ * The dictionary headword (adds 다 for verbs)
1492
+ */
1493
+ get lemma() {
1494
+ const basePos = this.pos[0];
1495
+ if (VERB_TAGS2.includes(basePos)) {
1496
+ return this.surface + "\uB2E4";
1497
+ }
1498
+ return this.surface;
1499
+ }
1500
+ /**
1501
+ * How the token is pronounced
1502
+ */
1503
+ get pronunciation() {
1504
+ return nullIfStar2(this._token.reading);
1505
+ }
1506
+ /**
1507
+ * Whether the token has a final consonant (받침/batchim)
1508
+ */
1509
+ get hasBatchim() {
1510
+ const val = this._token.has_final_consonant;
1511
+ if (val === "T") return true;
1512
+ if (val === "F") return false;
1513
+ return null;
1514
+ }
1515
+ /**
1516
+ * Alias for hasBatchim (종성/jongseong)
1517
+ */
1518
+ get hasJongseong() {
1519
+ return this.hasBatchim;
1520
+ }
1521
+ /**
1522
+ * The semantic word class or category
1523
+ */
1524
+ get semanticClass() {
1525
+ return nullIfStar2(this._token.semantic_class);
1526
+ }
1527
+ /**
1528
+ * The type of token (Inflect/Compound/Preanalysis)
1529
+ */
1530
+ get type() {
1531
+ return nullIfStar2(this._token.type);
1532
+ }
1533
+ /**
1534
+ * The broken-down expression tokens for compound/inflected words
1535
+ */
1536
+ get expression() {
1537
+ if (this._token.expression === "*") return null;
1538
+ return this._token.expression.split("+").map((part) => new ExpressionToken(part));
1539
+ }
1540
+ /**
1541
+ * Get the underlying KoreanToken
1542
+ */
1543
+ get koreanToken() {
1544
+ return this._token;
1545
+ }
1546
+ };
1547
+
1548
+ // src/MeCab.ts
1549
+ var MeCab = class _MeCab {
1550
+ constructor(tokenizer) {
1551
+ this.tokenizer = tokenizer;
1552
+ }
1553
+ /**
1554
+ * Create a MeCab instance asynchronously.
1555
+ *
1556
+ * Unlike napi-mecab which uses a synchronous constructor,
1557
+ * this pure JavaScript implementation requires async initialization
1558
+ * to load the dictionary files without blocking.
1559
+ *
1560
+ * @example
1561
+ * ```typescript
1562
+ * const mecab = await MeCab.create({ engine: 'ko' });
1563
+ * const tokens = mecab.parse('안녕하세요');
1564
+ * ```
1565
+ */
1566
+ static async create(opts = {}) {
1567
+ const engine = opts.engine ?? "ko";
1568
+ if (engine !== "ko") {
1569
+ throw new Error(
1570
+ `"${engine}" is not a supported mecab engine. Only "ko" (Korean) is supported.`
1571
+ );
1572
+ }
1573
+ const builder2 = new TokenizerBuilder({
1574
+ dicPath: opts.dictPath
1575
+ });
1576
+ const tokenizer = await builder2.build();
1577
+ return new _MeCab(tokenizer);
1578
+ }
1579
+ /**
1580
+ * Parse text into an array of tokens.
1581
+ *
1582
+ * @param text - The text to parse
1583
+ * @returns Array of Token objects
1584
+ *
1585
+ * @example
1586
+ * ```typescript
1587
+ * const tokens = mecab.parse('아버지가방에들어가신다');
1588
+ * tokens.forEach(t => console.log(t.surface, t.pos));
1589
+ * ```
1590
+ */
1591
+ parse(text) {
1592
+ const koreanTokens = this.tokenizer.tokenize(text);
1593
+ return koreanTokens.map((token) => new Token(token));
1594
+ }
1595
+ /**
1596
+ * Get just the surface forms as an array.
1597
+ * Convenience method equivalent to napi-mecab parse + map surface.
1598
+ */
1599
+ wakati(text) {
1600
+ return this.tokenizer.wakati(text);
1601
+ }
1602
+ /**
1603
+ * Get space-separated surface forms.
1604
+ */
1605
+ wakatiString(text) {
1606
+ return this.tokenizer.wakatiString(text);
1607
+ }
1608
+ /**
1609
+ * Access the underlying Tokenizer for advanced usage.
1610
+ */
1611
+ get underlyingTokenizer() {
1612
+ return this.tokenizer;
1613
+ }
1614
+ };
1615
+
1404
1616
  // src/index.ts
1405
1617
  function builder(options = {}) {
1406
1618
  return new TokenizerBuilder(options);
1407
1619
  }
1408
- var index_default = { builder, TokenizerBuilder, Tokenizer, KoreanToken, POS_TAGS };
1620
+ var index_default = {
1621
+ // Original API
1622
+ builder,
1623
+ TokenizerBuilder,
1624
+ Tokenizer,
1625
+ KoreanToken,
1626
+ POS_TAGS,
1627
+ // napi-mecab compatible API
1628
+ MeCab,
1629
+ Token,
1630
+ ExpressionToken
1631
+ };
1409
1632
  // Annotate the CommonJS export names for ESM import in node:
1410
1633
  0 && (module.exports = {
1634
+ ExpressionToken,
1411
1635
  KoreanToken,
1636
+ MeCab,
1412
1637
  POS_TAGS,
1638
+ Token,
1413
1639
  Tokenizer,
1414
1640
  TokenizerBuilder,
1415
1641
  builder
package/dist/index.d.cts CHANGED
@@ -330,6 +330,160 @@ declare class TokenizerBuilder {
330
330
  build(): Promise<Tokenizer>;
331
331
  }
332
332
 
333
+ /**
334
+ * ExpressionToken - represents a component of an agglutinated Korean token
335
+ *
336
+ * Korean compound/inflected words have an expression field in the format:
337
+ * "morpheme/pos/semanticClass+morpheme/pos/semanticClass+..."
338
+ *
339
+ * This class represents a single component of that expression.
340
+ */
341
+ declare class ExpressionToken {
342
+ private _morpheme;
343
+ private _pos;
344
+ private _semanticClass;
345
+ constructor(raw: string);
346
+ /**
347
+ * The normalized token/morpheme
348
+ */
349
+ get morpheme(): string;
350
+ /**
351
+ * The part of speech tag
352
+ */
353
+ get pos(): string;
354
+ /**
355
+ * The dictionary form (adds 다 for verbs)
356
+ */
357
+ get lemma(): string;
358
+ /**
359
+ * The semantic word class or category
360
+ */
361
+ get semanticClass(): string | null;
362
+ }
363
+
364
+ /**
365
+ * Token - napi-mecab compatible token wrapper
366
+ *
367
+ * Provides getters that match the napi-mecab API for Korean tokens.
368
+ */
369
+
370
+ declare class Token {
371
+ private _token;
372
+ constructor(token: KoreanToken);
373
+ /**
374
+ * How the token looks in the input text
375
+ */
376
+ get surface(): string;
377
+ /**
378
+ * The raw features string (comma-separated)
379
+ */
380
+ get features(): string;
381
+ /**
382
+ * The raw string in MeCab format (surface\tfeatures)
383
+ */
384
+ get raw(): string;
385
+ /**
386
+ * Parts of speech as an array (split by "+")
387
+ */
388
+ get pos(): string[];
389
+ /**
390
+ * The dictionary headword (adds 다 for verbs)
391
+ */
392
+ get lemma(): string | null;
393
+ /**
394
+ * How the token is pronounced
395
+ */
396
+ get pronunciation(): string | null;
397
+ /**
398
+ * Whether the token has a final consonant (받침/batchim)
399
+ */
400
+ get hasBatchim(): boolean | null;
401
+ /**
402
+ * Alias for hasBatchim (종성/jongseong)
403
+ */
404
+ get hasJongseong(): boolean | null;
405
+ /**
406
+ * The semantic word class or category
407
+ */
408
+ get semanticClass(): string | null;
409
+ /**
410
+ * The type of token (Inflect/Compound/Preanalysis)
411
+ */
412
+ get type(): string | null;
413
+ /**
414
+ * The broken-down expression tokens for compound/inflected words
415
+ */
416
+ get expression(): ExpressionToken[] | null;
417
+ /**
418
+ * Get the underlying KoreanToken
419
+ */
420
+ get koreanToken(): KoreanToken;
421
+ }
422
+
423
+ /**
424
+ * MeCab - napi-mecab compatible API wrapper
425
+ *
426
+ * Provides a familiar API for users coming from napi-mecab.
427
+ * Uses async initialization since this is a pure JavaScript implementation.
428
+ */
429
+
430
+ interface MeCabOptions {
431
+ /**
432
+ * The language engine to use. Only 'ko' (Korean) is supported.
433
+ * @default 'ko'
434
+ */
435
+ engine?: 'ko';
436
+ /**
437
+ * Path to the dictionary directory.
438
+ * @default 'dict/'
439
+ */
440
+ dictPath?: string;
441
+ }
442
+ declare class MeCab {
443
+ private tokenizer;
444
+ private constructor();
445
+ /**
446
+ * Create a MeCab instance asynchronously.
447
+ *
448
+ * Unlike napi-mecab which uses a synchronous constructor,
449
+ * this pure JavaScript implementation requires async initialization
450
+ * to load the dictionary files without blocking.
451
+ *
452
+ * @example
453
+ * ```typescript
454
+ * const mecab = await MeCab.create({ engine: 'ko' });
455
+ * const tokens = mecab.parse('안녕하세요');
456
+ * ```
457
+ */
458
+ static create(opts?: MeCabOptions): Promise<MeCab>;
459
+ /**
460
+ * Parse text into an array of tokens.
461
+ *
462
+ * @param text - The text to parse
463
+ * @returns Array of Token objects
464
+ *
465
+ * @example
466
+ * ```typescript
467
+ * const tokens = mecab.parse('아버지가방에들어가신다');
468
+ * tokens.forEach(t => console.log(t.surface, t.pos));
469
+ * ```
470
+ */
471
+ parse(text: string): Token[];
472
+ /**
473
+ * Get just the surface forms as an array.
474
+ * Convenience method equivalent to napi-mecab parse + map surface.
475
+ */
476
+ wakati(text: string): string[];
477
+ /**
478
+ * Get space-separated surface forms.
479
+ */
480
+ wakatiString(text: string): string;
481
+ /**
482
+ * Access the underlying Tokenizer for advanced usage.
483
+ */
484
+ get underlyingTokenizer(): Tokenizer;
485
+ }
486
+
333
487
  /**
334
488
  * mecab-ko - Pure TypeScript Korean Morphological Analyzer
335
489
  *
@@ -347,6 +501,9 @@ declare const _default: {
347
501
  Tokenizer: typeof Tokenizer;
348
502
  KoreanToken: typeof KoreanToken;
349
503
  POS_TAGS: Record<string, string>;
504
+ MeCab: typeof MeCab;
505
+ Token: typeof Token;
506
+ ExpressionToken: typeof ExpressionToken;
350
507
  };
351
508
 
352
- export { KoreanToken, POS_TAGS, Tokenizer, TokenizerBuilder, type TokenizerBuilderOptions, builder, _default as default };
509
+ export { ExpressionToken, KoreanToken, MeCab, type MeCabOptions, POS_TAGS, Token, Tokenizer, TokenizerBuilder, type TokenizerBuilderOptions, builder, _default as default };
package/dist/index.d.ts CHANGED
@@ -330,6 +330,160 @@ declare class TokenizerBuilder {
330
330
  build(): Promise<Tokenizer>;
331
331
  }
332
332
 
333
+ /**
334
+ * ExpressionToken - represents a component of an agglutinated Korean token
335
+ *
336
+ * Korean compound/inflected words have an expression field in the format:
337
+ * "morpheme/pos/semanticClass+morpheme/pos/semanticClass+..."
338
+ *
339
+ * This class represents a single component of that expression.
340
+ */
341
+ declare class ExpressionToken {
342
+ private _morpheme;
343
+ private _pos;
344
+ private _semanticClass;
345
+ constructor(raw: string);
346
+ /**
347
+ * The normalized token/morpheme
348
+ */
349
+ get morpheme(): string;
350
+ /**
351
+ * The part of speech tag
352
+ */
353
+ get pos(): string;
354
+ /**
355
+ * The dictionary form (adds 다 for verbs)
356
+ */
357
+ get lemma(): string;
358
+ /**
359
+ * The semantic word class or category
360
+ */
361
+ get semanticClass(): string | null;
362
+ }
363
+
364
+ /**
365
+ * Token - napi-mecab compatible token wrapper
366
+ *
367
+ * Provides getters that match the napi-mecab API for Korean tokens.
368
+ */
369
+
370
+ declare class Token {
371
+ private _token;
372
+ constructor(token: KoreanToken);
373
+ /**
374
+ * How the token looks in the input text
375
+ */
376
+ get surface(): string;
377
+ /**
378
+ * The raw features string (comma-separated)
379
+ */
380
+ get features(): string;
381
+ /**
382
+ * The raw string in MeCab format (surface\tfeatures)
383
+ */
384
+ get raw(): string;
385
+ /**
386
+ * Parts of speech as an array (split by "+")
387
+ */
388
+ get pos(): string[];
389
+ /**
390
+ * The dictionary headword (adds 다 for verbs)
391
+ */
392
+ get lemma(): string | null;
393
+ /**
394
+ * How the token is pronounced
395
+ */
396
+ get pronunciation(): string | null;
397
+ /**
398
+ * Whether the token has a final consonant (받침/batchim)
399
+ */
400
+ get hasBatchim(): boolean | null;
401
+ /**
402
+ * Alias for hasBatchim (종성/jongseong)
403
+ */
404
+ get hasJongseong(): boolean | null;
405
+ /**
406
+ * The semantic word class or category
407
+ */
408
+ get semanticClass(): string | null;
409
+ /**
410
+ * The type of token (Inflect/Compound/Preanalysis)
411
+ */
412
+ get type(): string | null;
413
+ /**
414
+ * The broken-down expression tokens for compound/inflected words
415
+ */
416
+ get expression(): ExpressionToken[] | null;
417
+ /**
418
+ * Get the underlying KoreanToken
419
+ */
420
+ get koreanToken(): KoreanToken;
421
+ }
422
+
423
+ /**
424
+ * MeCab - napi-mecab compatible API wrapper
425
+ *
426
+ * Provides a familiar API for users coming from napi-mecab.
427
+ * Uses async initialization since this is a pure JavaScript implementation.
428
+ */
429
+
430
+ interface MeCabOptions {
431
+ /**
432
+ * The language engine to use. Only 'ko' (Korean) is supported.
433
+ * @default 'ko'
434
+ */
435
+ engine?: 'ko';
436
+ /**
437
+ * Path to the dictionary directory.
438
+ * @default 'dict/'
439
+ */
440
+ dictPath?: string;
441
+ }
442
+ declare class MeCab {
443
+ private tokenizer;
444
+ private constructor();
445
+ /**
446
+ * Create a MeCab instance asynchronously.
447
+ *
448
+ * Unlike napi-mecab which uses a synchronous constructor,
449
+ * this pure JavaScript implementation requires async initialization
450
+ * to load the dictionary files without blocking.
451
+ *
452
+ * @example
453
+ * ```typescript
454
+ * const mecab = await MeCab.create({ engine: 'ko' });
455
+ * const tokens = mecab.parse('안녕하세요');
456
+ * ```
457
+ */
458
+ static create(opts?: MeCabOptions): Promise<MeCab>;
459
+ /**
460
+ * Parse text into an array of tokens.
461
+ *
462
+ * @param text - The text to parse
463
+ * @returns Array of Token objects
464
+ *
465
+ * @example
466
+ * ```typescript
467
+ * const tokens = mecab.parse('아버지가방에들어가신다');
468
+ * tokens.forEach(t => console.log(t.surface, t.pos));
469
+ * ```
470
+ */
471
+ parse(text: string): Token[];
472
+ /**
473
+ * Get just the surface forms as an array.
474
+ * Convenience method equivalent to napi-mecab parse + map surface.
475
+ */
476
+ wakati(text: string): string[];
477
+ /**
478
+ * Get space-separated surface forms.
479
+ */
480
+ wakatiString(text: string): string;
481
+ /**
482
+ * Access the underlying Tokenizer for advanced usage.
483
+ */
484
+ get underlyingTokenizer(): Tokenizer;
485
+ }
486
+
333
487
  /**
334
488
  * mecab-ko - Pure TypeScript Korean Morphological Analyzer
335
489
  *
@@ -347,6 +501,9 @@ declare const _default: {
347
501
  Tokenizer: typeof Tokenizer;
348
502
  KoreanToken: typeof KoreanToken;
349
503
  POS_TAGS: Record<string, string>;
504
+ MeCab: typeof MeCab;
505
+ Token: typeof Token;
506
+ ExpressionToken: typeof ExpressionToken;
350
507
  };
351
508
 
352
- export { KoreanToken, POS_TAGS, Tokenizer, TokenizerBuilder, type TokenizerBuilderOptions, builder, _default as default };
509
+ export { ExpressionToken, KoreanToken, MeCab, type MeCabOptions, POS_TAGS, Token, Tokenizer, TokenizerBuilder, type TokenizerBuilderOptions, builder, _default as default };
package/dist/index.js CHANGED
@@ -1360,14 +1360,237 @@ var TokenizerBuilder = class {
1360
1360
  }
1361
1361
  };
1362
1362
 
1363
+ // src/ExpressionToken.ts
1364
+ var VERB_TAGS = ["VV", "VA", "VX", "VCP", "VCN"];
1365
+ function nullIfStar(value) {
1366
+ return value === "*" ? null : value;
1367
+ }
1368
+ var ExpressionToken = class {
1369
+ constructor(raw) {
1370
+ const parts = raw.split("/");
1371
+ this._morpheme = parts[0] ?? "";
1372
+ this._pos = parts[1] ?? "";
1373
+ this._semanticClass = parts[2] ?? "*";
1374
+ }
1375
+ /**
1376
+ * The normalized token/morpheme
1377
+ */
1378
+ get morpheme() {
1379
+ return this._morpheme;
1380
+ }
1381
+ /**
1382
+ * The part of speech tag
1383
+ */
1384
+ get pos() {
1385
+ return this._pos;
1386
+ }
1387
+ /**
1388
+ * The dictionary form (adds 다 for verbs)
1389
+ */
1390
+ get lemma() {
1391
+ if (VERB_TAGS.includes(this._pos)) {
1392
+ return this._morpheme + "\uB2E4";
1393
+ }
1394
+ return this._morpheme;
1395
+ }
1396
+ /**
1397
+ * The semantic word class or category
1398
+ */
1399
+ get semanticClass() {
1400
+ return nullIfStar(this._semanticClass);
1401
+ }
1402
+ };
1403
+
1404
+ // src/Token.ts
1405
+ var VERB_TAGS2 = ["VV", "VA", "VX", "VCP", "VCN"];
1406
+ function nullIfStar2(value) {
1407
+ return value === "*" ? null : value;
1408
+ }
1409
+ var Token = class {
1410
+ constructor(token) {
1411
+ this._token = token;
1412
+ }
1413
+ /**
1414
+ * How the token looks in the input text
1415
+ */
1416
+ get surface() {
1417
+ return this._token.surface_form;
1418
+ }
1419
+ /**
1420
+ * The raw features string (comma-separated)
1421
+ */
1422
+ get features() {
1423
+ return [
1424
+ this._token.pos,
1425
+ this._token.semantic_class,
1426
+ this._token.has_final_consonant,
1427
+ this._token.reading,
1428
+ this._token.type,
1429
+ this._token.first_pos,
1430
+ this._token.last_pos,
1431
+ this._token.expression
1432
+ ].join(",");
1433
+ }
1434
+ /**
1435
+ * The raw string in MeCab format (surface\tfeatures)
1436
+ */
1437
+ get raw() {
1438
+ return `${this.surface} ${this.features}`;
1439
+ }
1440
+ /**
1441
+ * Parts of speech as an array (split by "+")
1442
+ */
1443
+ get pos() {
1444
+ return this._token.pos.split("+");
1445
+ }
1446
+ /**
1447
+ * The dictionary headword (adds 다 for verbs)
1448
+ */
1449
+ get lemma() {
1450
+ const basePos = this.pos[0];
1451
+ if (VERB_TAGS2.includes(basePos)) {
1452
+ return this.surface + "\uB2E4";
1453
+ }
1454
+ return this.surface;
1455
+ }
1456
+ /**
1457
+ * How the token is pronounced
1458
+ */
1459
+ get pronunciation() {
1460
+ return nullIfStar2(this._token.reading);
1461
+ }
1462
+ /**
1463
+ * Whether the token has a final consonant (받침/batchim)
1464
+ */
1465
+ get hasBatchim() {
1466
+ const val = this._token.has_final_consonant;
1467
+ if (val === "T") return true;
1468
+ if (val === "F") return false;
1469
+ return null;
1470
+ }
1471
+ /**
1472
+ * Alias for hasBatchim (종성/jongseong)
1473
+ */
1474
+ get hasJongseong() {
1475
+ return this.hasBatchim;
1476
+ }
1477
+ /**
1478
+ * The semantic word class or category
1479
+ */
1480
+ get semanticClass() {
1481
+ return nullIfStar2(this._token.semantic_class);
1482
+ }
1483
+ /**
1484
+ * The type of token (Inflect/Compound/Preanalysis)
1485
+ */
1486
+ get type() {
1487
+ return nullIfStar2(this._token.type);
1488
+ }
1489
+ /**
1490
+ * The broken-down expression tokens for compound/inflected words
1491
+ */
1492
+ get expression() {
1493
+ if (this._token.expression === "*") return null;
1494
+ return this._token.expression.split("+").map((part) => new ExpressionToken(part));
1495
+ }
1496
+ /**
1497
+ * Get the underlying KoreanToken
1498
+ */
1499
+ get koreanToken() {
1500
+ return this._token;
1501
+ }
1502
+ };
1503
+
1504
+ // src/MeCab.ts
1505
+ var MeCab = class _MeCab {
1506
+ constructor(tokenizer) {
1507
+ this.tokenizer = tokenizer;
1508
+ }
1509
+ /**
1510
+ * Create a MeCab instance asynchronously.
1511
+ *
1512
+ * Unlike napi-mecab which uses a synchronous constructor,
1513
+ * this pure JavaScript implementation requires async initialization
1514
+ * to load the dictionary files without blocking.
1515
+ *
1516
+ * @example
1517
+ * ```typescript
1518
+ * const mecab = await MeCab.create({ engine: 'ko' });
1519
+ * const tokens = mecab.parse('안녕하세요');
1520
+ * ```
1521
+ */
1522
+ static async create(opts = {}) {
1523
+ const engine = opts.engine ?? "ko";
1524
+ if (engine !== "ko") {
1525
+ throw new Error(
1526
+ `"${engine}" is not a supported mecab engine. Only "ko" (Korean) is supported.`
1527
+ );
1528
+ }
1529
+ const builder2 = new TokenizerBuilder({
1530
+ dicPath: opts.dictPath
1531
+ });
1532
+ const tokenizer = await builder2.build();
1533
+ return new _MeCab(tokenizer);
1534
+ }
1535
+ /**
1536
+ * Parse text into an array of tokens.
1537
+ *
1538
+ * @param text - The text to parse
1539
+ * @returns Array of Token objects
1540
+ *
1541
+ * @example
1542
+ * ```typescript
1543
+ * const tokens = mecab.parse('아버지가방에들어가신다');
1544
+ * tokens.forEach(t => console.log(t.surface, t.pos));
1545
+ * ```
1546
+ */
1547
+ parse(text) {
1548
+ const koreanTokens = this.tokenizer.tokenize(text);
1549
+ return koreanTokens.map((token) => new Token(token));
1550
+ }
1551
+ /**
1552
+ * Get just the surface forms as an array.
1553
+ * Convenience method equivalent to napi-mecab parse + map surface.
1554
+ */
1555
+ wakati(text) {
1556
+ return this.tokenizer.wakati(text);
1557
+ }
1558
+ /**
1559
+ * Get space-separated surface forms.
1560
+ */
1561
+ wakatiString(text) {
1562
+ return this.tokenizer.wakatiString(text);
1563
+ }
1564
+ /**
1565
+ * Access the underlying Tokenizer for advanced usage.
1566
+ */
1567
+ get underlyingTokenizer() {
1568
+ return this.tokenizer;
1569
+ }
1570
+ };
1571
+
1363
1572
  // src/index.ts
1364
1573
  function builder(options = {}) {
1365
1574
  return new TokenizerBuilder(options);
1366
1575
  }
1367
- var index_default = { builder, TokenizerBuilder, Tokenizer, KoreanToken, POS_TAGS };
1576
+ var index_default = {
1577
+ // Original API
1578
+ builder,
1579
+ TokenizerBuilder,
1580
+ Tokenizer,
1581
+ KoreanToken,
1582
+ POS_TAGS,
1583
+ // napi-mecab compatible API
1584
+ MeCab,
1585
+ Token,
1586
+ ExpressionToken
1587
+ };
1368
1588
  export {
1589
+ ExpressionToken,
1369
1590
  KoreanToken,
1591
+ MeCab,
1370
1592
  POS_TAGS,
1593
+ Token,
1371
1594
  Tokenizer,
1372
1595
  TokenizerBuilder,
1373
1596
  builder,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "kuromoji-ko",
3
- "version": "1.0.1",
3
+ "version": "1.0.2",
4
4
  "description": "Pure TypeScript Korean Morphological Analyzer - serverless compatible, based on kuromoji.js and mecab-ko-dic",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.mjs",