khmer-segment 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +63 -24
- package/dist/index.cjs +42 -2
- package/dist/index.d.cts +11 -1
- package/dist/index.d.ts +11 -1
- package/dist/index.js +40 -2
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -27,6 +27,8 @@ import {
|
|
|
27
27
|
countClusters,
|
|
28
28
|
createDictionary,
|
|
29
29
|
segmentWords,
|
|
30
|
+
getCaretBoundaries,
|
|
31
|
+
deleteBackward,
|
|
30
32
|
} from 'khmer-segment';
|
|
31
33
|
|
|
32
34
|
// Detect Khmer text
|
|
@@ -89,10 +91,10 @@ console.log(result.tokens);
|
|
|
89
91
|
|
|
90
92
|
```ts
|
|
91
93
|
interface SegmentOptions {
|
|
92
|
-
strategy?: 'fmm' | 'bmm' | 'bimm' | 'viterbi'; // default: "
|
|
94
|
+
strategy?: 'fmm' | 'bmm' | 'bimm' | 'viterbi'; // default: "viterbi"
|
|
93
95
|
dictionary?: KhmerDictionary;
|
|
94
96
|
normalize?: boolean; // default: true
|
|
95
|
-
viterbiBoundaryPenalty?: number; // default: 0
|
|
97
|
+
viterbiBoundaryPenalty?: number; // default: 10.0 (Viterbi only)
|
|
96
98
|
}
|
|
97
99
|
```
|
|
98
100
|
|
|
@@ -213,11 +215,51 @@ Same idea as FMM, but scans right-to-left. Can produce different segmentation on
|
|
|
213
215
|
|
|
214
216
|
Runs both FMM and BMM, then picks the better result using heuristics: fewer unknown tokens wins; if tied, fewer total tokens (longer matches) wins; if still tied, FMM is preferred. This generally produces better results than either FMM or BMM alone.
|
|
215
217
|
|
|
216
|
-
### Viterbi
|
|
218
|
+
### Viterbi
|
|
217
219
|
|
|
218
220
|
Frequency-weighted dynamic programming segmentation. Finds the globally lowest-cost path through all possible word boundaries using `-log(frequency)` as word cost. Requires a dictionary with frequency data.
|
|
219
221
|
|
|
220
|
-
**
|
|
222
|
+
**Default strategy** as of v0.4.0. With a boundary penalty of 10.0, Viterbi achieves Boundary F1 = 0.8572 (+5.3 points over BiMM's 0.8041) and Token F1 = 0.6744 (+4.2 points over BiMM's 0.6327) while maintaining superior OOV handling (OOV Boundary F1 = 0.8875 vs BiMM's 0.4186).
|
|
223
|
+
|
|
224
|
+
### Text Editing
|
|
225
|
+
|
|
226
|
+
#### `getCaretBoundaries(text, options?)`
|
|
227
|
+
|
|
228
|
+
Returns an array of valid caret positions (indices where the cursor can rest) based on Khmer cluster boundaries.
|
|
229
|
+
|
|
230
|
+
```ts
|
|
231
|
+
import { getCaretBoundaries } from 'khmer-segment';
|
|
232
|
+
|
|
233
|
+
getCaretBoundaries(''); // [0]
|
|
234
|
+
getCaretBoundaries('ក'); // [0, 1]
|
|
235
|
+
getCaretBoundaries('ក្ក'); // [0, 3] — coeng+subscript is one cluster
|
|
236
|
+
getCaretBoundaries('កក'); // [0, 1, 2] — two clusters
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
#### `deleteBackward(text, cursorIndex, options?)`
|
|
240
|
+
|
|
241
|
+
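Deletes backward from the cursor to the nearest preceding cluster boundary: the whole cluster when the cursor sits at the end of a cluster, or only the part of the cluster before the cursor when it sits mid-cluster. An out-of-range `cursorIndex` is clamped to `[0, text.length]`; a non-integer `cursorIndex` throws a `TypeError`.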
|
|
242
|
+
|
|
243
|
+
```ts
|
|
244
|
+
import { deleteBackward } from 'khmer-segment';
|
|
245
|
+
|
|
246
|
+
deleteBackward('កក', 2); // { text: 'ក', cursorIndex: 1 }
|
|
247
|
+
deleteBackward('ក្កក', 4); // { text: 'ក្ក', cursorIndex: 3 } — deletes last cluster
|
|
248
|
+
deleteBackward('ក', 0); // { text: 'ក', cursorIndex: 0 } — no-op at start
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
#### `CaretOptions`
|
|
252
|
+
|
|
253
|
+
```ts
|
|
254
|
+
interface CaretOptions {
|
|
255
|
+
normalize?: boolean; // default: false — operate on raw text
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
interface DeleteResult {
|
|
259
|
+
text: string;
|
|
260
|
+
cursorIndex: number;
|
|
261
|
+
}
|
|
262
|
+
```
|
|
221
263
|
|
|
222
264
|
### Digit Grouping
|
|
223
265
|
|
|
@@ -287,9 +329,7 @@ No framework-specific code in the core. Tree-shakeable with `sideEffects: false`
|
|
|
287
329
|
|
|
288
330
|
## Limitations
|
|
289
331
|
|
|
290
|
-
-
|
|
291
|
-
- No caret/backspace helpers yet
|
|
292
|
-
- Dictionary-based approaches have an inherent accuracy ceiling compared to statistical/ML methods (e.g. CRF achieves ~99.7% accuracy vs ~80% boundary F1 for dictionary-based matching)
|
|
332
|
+
- Dictionary-based approaches have an inherent accuracy ceiling compared to statistical/ML methods (e.g. CRF achieves ~99.7% accuracy vs ~86% boundary F1 for dictionary-based matching)
|
|
293
333
|
|
|
294
334
|
---
|
|
295
335
|
|
|
@@ -297,14 +337,14 @@ No framework-specific code in the core. Tree-shakeable with `sideEffects: false`
|
|
|
297
337
|
|
|
298
338
|
Measured on the `kh_data_10000b` dataset (87,875 sentences from [phylypo/segmentation-crf-khmer](https://github.com/phylypo/segmentation-crf-khmer)) with the default 101,107-word dictionary.
|
|
299
339
|
|
|
300
|
-
| Strategy
|
|
301
|
-
|
|
|
302
|
-
| **
|
|
303
|
-
|
|
|
304
|
-
|
|
|
305
|
-
|
|
|
340
|
+
| Strategy | Boundary F1 | Token F1 | Exact Match | OOV Rate | OOV Boundary F1 | Relative Time (lower is faster) |
|
|
341
|
+
| ----------- | ----------- | ---------- | ----------- | -------- | --------------- | --------------- |
|
|
342
|
+
| **Viterbi** | **0.8572** | **0.6744** | 1.4% | 5.4% | **0.8875** | 1.4x |
|
|
343
|
+
| BiMM | 0.8041 | 0.6327 | 2.0% | 32.6% | 0.4186 | 1.0x (baseline) |
|
|
344
|
+
| FMM | 0.8024 | 0.6304 | 2.0% | 32.8% | — | 0.5x |
|
|
345
|
+
| BMM | 0.7981 | 0.6239 | 1.8% | 32.6% | — | 0.7x |
|
|
306
346
|
|
|
307
|
-
**Recommended:** `strategy: '
|
|
347
|
+
**Recommended:** `strategy: 'viterbi'` (default) for best accuracy. See [`docs/benchmark-results.md`](docs/benchmark-results.md) for full details and [`docs/benchmark-methodology.md`](docs/benchmark-methodology.md) for methodology.
|
|
308
348
|
|
|
309
349
|
---
|
|
310
350
|
|
|
@@ -336,22 +376,20 @@ Measured on the `kh_data_10000b` dataset (87,875 sentences from [phylypo/segment
|
|
|
336
376
|
- Corrected custom dictionary `size` to report unique non-empty words
|
|
337
377
|
- Added changelog, CI checks, and stricter prepublish formatting verification
|
|
338
378
|
|
|
339
|
-
### v0.3.0
|
|
379
|
+
### v0.3.0
|
|
340
380
|
|
|
341
|
-
- **Viterbi algorithm** — frequency-weighted DP segmentation
|
|
381
|
+
- **Viterbi algorithm** — frequency-weighted DP segmentation
|
|
342
382
|
- **Dictionary expansion** — 49,113 → 101,107 words (merged from Sovichea/khmer_segmenter + SIL + Royal Academy)
|
|
343
383
|
- **Full Unicode normalization** — composite vowel fixing, ROBAT ordering, stacked coeng support
|
|
344
384
|
- **Full KCC cluster model** — ROBAT continuation, independent vowel bases
|
|
345
385
|
- **Accuracy benchmarking** — 87,875-sentence gold standard, per-strategy metrics
|
|
346
|
-
- Benchmark results: BiMM Boundary F1 = 0.804, Viterbi = 0.735 (still needs cost model work)
|
|
347
386
|
|
|
348
|
-
### v0.4.0 (
|
|
387
|
+
### v0.4.0 (current)
|
|
349
388
|
|
|
350
|
-
-
|
|
351
|
-
-
|
|
352
|
-
-
|
|
353
|
-
-
|
|
354
|
-
- Compressed dictionary format
|
|
389
|
+
- **Default strategy switched to Viterbi** (penalty=10.0): Boundary F1 = 0.8572, Token F1 = 0.6744
|
|
390
|
+
- **`getCaretBoundaries(text)`** — returns valid caret positions based on Khmer cluster boundaries
|
|
391
|
+
- **`deleteBackward(text, cursorIndex)`** — cluster-safe backspace for text editors
|
|
392
|
+
- **Extended Viterbi penalty sweep** — range [0.25–10.0], documented in `docs/viterbi-penalty-sweep.md`
|
|
355
393
|
|
|
356
394
|
### Future
|
|
357
395
|
|
|
@@ -403,8 +441,9 @@ Features:
|
|
|
403
441
|
|
|
404
442
|
- Live Khmer text input with instant results
|
|
405
443
|
- Editable dictionary (add/remove words on the fly)
|
|
406
|
-
- Strategy selector (FMM / BMM / BiMM)
|
|
444
|
+
- Strategy selector (FMM / BMM / BiMM / Viterbi)
|
|
407
445
|
- Normalize toggle (On/Off)
|
|
446
|
+
- Caret boundary visualization
|
|
408
447
|
- Detection, normalization, cluster splitting, and segmentation panels
|
|
409
448
|
- JSON output with copy button
|
|
410
449
|
|
package/dist/index.cjs
CHANGED
|
@@ -23,6 +23,8 @@ __export(index_exports, {
|
|
|
23
23
|
containsKhmer: () => containsKhmer,
|
|
24
24
|
countClusters: () => countClusters,
|
|
25
25
|
createDictionary: () => createDictionary,
|
|
26
|
+
deleteBackward: () => deleteBackward,
|
|
27
|
+
getCaretBoundaries: () => getCaretBoundaries,
|
|
26
28
|
getClusterBoundaries: () => getClusterBoundaries,
|
|
27
29
|
isKhmerChar: () => isKhmerChar,
|
|
28
30
|
isKhmerText: () => isKhmerText,
|
|
@@ -379,7 +381,7 @@ var DEFAULT_COST = 10;
|
|
|
379
381
|
var UNKNOWN_COST = 20;
|
|
380
382
|
var SINGLE_CONSONANT_PENALTY = 10;
|
|
381
383
|
var ORPHAN_SIGN_PENALTY = 50;
|
|
382
|
-
var DEFAULT_BOUNDARY_PENALTY =
|
|
384
|
+
var DEFAULT_BOUNDARY_PENALTY = 10;
|
|
383
385
|
function isClusterStart(cp) {
|
|
384
386
|
return cp >= 6016 && cp <= 6050 || cp >= 6051 && cp <= 6067;
|
|
385
387
|
}
|
|
@@ -605,7 +607,7 @@ function segmentWords(text, options) {
|
|
|
605
607
|
const dictionary = options?.dictionary;
|
|
606
608
|
let tokens;
|
|
607
609
|
if (dictionary) {
|
|
608
|
-
const strategy = options?.strategy ?? "
|
|
610
|
+
const strategy = options?.strategy ?? "viterbi";
|
|
609
611
|
switch (strategy) {
|
|
610
612
|
case "bmm":
|
|
611
613
|
tokens = bmmSegment(clusters, dictionary);
|
|
@@ -645,6 +647,42 @@ function markKhmerSentencePunctuationKnown(tokens) {
|
|
|
645
647
|
);
|
|
646
648
|
}
|
|
647
649
|
|
|
650
|
+
// src/core/caret.ts
|
|
651
|
+
function getCaretBoundaries(text, options) {
|
|
652
|
+
const src = options?.normalize ? normalizeKhmer(text) : text;
|
|
653
|
+
if (!src) return [0];
|
|
654
|
+
const clusters = splitClusters(src);
|
|
655
|
+
const positions = [0];
|
|
656
|
+
let offset = 0;
|
|
657
|
+
for (const cluster of clusters) {
|
|
658
|
+
offset += cluster.length;
|
|
659
|
+
positions.push(offset);
|
|
660
|
+
}
|
|
661
|
+
return positions;
|
|
662
|
+
}
|
|
663
|
+
function deleteBackward(text, cursorIndex, options) {
|
|
664
|
+
const src = options?.normalize ? normalizeKhmer(text) : text;
|
|
665
|
+
if (!Number.isInteger(cursorIndex)) {
|
|
666
|
+
throw new TypeError(
|
|
667
|
+
`cursorIndex must be an integer, got ${cursorIndex}`
|
|
668
|
+
);
|
|
669
|
+
}
|
|
670
|
+
const clamped = Math.max(0, Math.min(cursorIndex, src.length));
|
|
671
|
+
if (clamped === 0) {
|
|
672
|
+
return { text: src, cursorIndex: 0 };
|
|
673
|
+
}
|
|
674
|
+
const boundaries = getCaretBoundaries(src, { normalize: false });
|
|
675
|
+
let prev = 0;
|
|
676
|
+
for (const b of boundaries) {
|
|
677
|
+
if (b >= clamped) break;
|
|
678
|
+
prev = b;
|
|
679
|
+
}
|
|
680
|
+
return {
|
|
681
|
+
text: src.slice(0, prev) + src.slice(clamped),
|
|
682
|
+
cursorIndex: prev
|
|
683
|
+
};
|
|
684
|
+
}
|
|
685
|
+
|
|
648
686
|
// src/dictionary/trie.ts
|
|
649
687
|
var TrieNode = class {
|
|
650
688
|
constructor() {
|
|
@@ -743,6 +781,8 @@ function createDictionary(words, frequencies) {
|
|
|
743
781
|
containsKhmer,
|
|
744
782
|
countClusters,
|
|
745
783
|
createDictionary,
|
|
784
|
+
deleteBackward,
|
|
785
|
+
getCaretBoundaries,
|
|
746
786
|
getClusterBoundaries,
|
|
747
787
|
isKhmerChar,
|
|
748
788
|
isKhmerText,
|
package/dist/index.d.cts
CHANGED
|
@@ -54,9 +54,19 @@ interface KhmerDictionary {
|
|
|
54
54
|
getFrequency?(word: string): number | undefined;
|
|
55
55
|
size: number;
|
|
56
56
|
}
|
|
57
|
+
interface CaretOptions {
|
|
58
|
+
normalize?: boolean;
|
|
59
|
+
}
|
|
60
|
+
interface DeleteResult {
|
|
61
|
+
text: string;
|
|
62
|
+
cursorIndex: number;
|
|
63
|
+
}
|
|
57
64
|
|
|
58
65
|
declare function segmentWords(text: string, options?: SegmentOptions): SegmentResult;
|
|
59
66
|
|
|
67
|
+
declare function getCaretBoundaries(text: string, options?: CaretOptions): number[];
|
|
68
|
+
declare function deleteBackward(text: string, cursorIndex: number, options?: CaretOptions): DeleteResult;
|
|
69
|
+
|
|
60
70
|
declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
|
|
61
71
|
|
|
62
|
-
export { type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
|
|
72
|
+
export { type CaretOptions, type DeleteResult, type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, deleteBackward, getCaretBoundaries, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
|
package/dist/index.d.ts
CHANGED
|
@@ -54,9 +54,19 @@ interface KhmerDictionary {
|
|
|
54
54
|
getFrequency?(word: string): number | undefined;
|
|
55
55
|
size: number;
|
|
56
56
|
}
|
|
57
|
+
interface CaretOptions {
|
|
58
|
+
normalize?: boolean;
|
|
59
|
+
}
|
|
60
|
+
interface DeleteResult {
|
|
61
|
+
text: string;
|
|
62
|
+
cursorIndex: number;
|
|
63
|
+
}
|
|
57
64
|
|
|
58
65
|
declare function segmentWords(text: string, options?: SegmentOptions): SegmentResult;
|
|
59
66
|
|
|
67
|
+
declare function getCaretBoundaries(text: string, options?: CaretOptions): number[];
|
|
68
|
+
declare function deleteBackward(text: string, cursorIndex: number, options?: CaretOptions): DeleteResult;
|
|
69
|
+
|
|
60
70
|
declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
|
|
61
71
|
|
|
62
|
-
export { type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
|
|
72
|
+
export { type CaretOptions, type DeleteResult, type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, deleteBackward, getCaretBoundaries, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
|
package/dist/index.js
CHANGED
|
@@ -344,7 +344,7 @@ var DEFAULT_COST = 10;
|
|
|
344
344
|
var UNKNOWN_COST = 20;
|
|
345
345
|
var SINGLE_CONSONANT_PENALTY = 10;
|
|
346
346
|
var ORPHAN_SIGN_PENALTY = 50;
|
|
347
|
-
var DEFAULT_BOUNDARY_PENALTY =
|
|
347
|
+
var DEFAULT_BOUNDARY_PENALTY = 10;
|
|
348
348
|
function isClusterStart(cp) {
|
|
349
349
|
return cp >= 6016 && cp <= 6050 || cp >= 6051 && cp <= 6067;
|
|
350
350
|
}
|
|
@@ -570,7 +570,7 @@ function segmentWords(text, options) {
|
|
|
570
570
|
const dictionary = options?.dictionary;
|
|
571
571
|
let tokens;
|
|
572
572
|
if (dictionary) {
|
|
573
|
-
const strategy = options?.strategy ?? "
|
|
573
|
+
const strategy = options?.strategy ?? "viterbi";
|
|
574
574
|
switch (strategy) {
|
|
575
575
|
case "bmm":
|
|
576
576
|
tokens = bmmSegment(clusters, dictionary);
|
|
@@ -610,6 +610,42 @@ function markKhmerSentencePunctuationKnown(tokens) {
|
|
|
610
610
|
);
|
|
611
611
|
}
|
|
612
612
|
|
|
613
|
+
// src/core/caret.ts
|
|
614
|
+
function getCaretBoundaries(text, options) {
|
|
615
|
+
const src = options?.normalize ? normalizeKhmer(text) : text;
|
|
616
|
+
if (!src) return [0];
|
|
617
|
+
const clusters = splitClusters(src);
|
|
618
|
+
const positions = [0];
|
|
619
|
+
let offset = 0;
|
|
620
|
+
for (const cluster of clusters) {
|
|
621
|
+
offset += cluster.length;
|
|
622
|
+
positions.push(offset);
|
|
623
|
+
}
|
|
624
|
+
return positions;
|
|
625
|
+
}
|
|
626
|
+
function deleteBackward(text, cursorIndex, options) {
|
|
627
|
+
const src = options?.normalize ? normalizeKhmer(text) : text;
|
|
628
|
+
if (!Number.isInteger(cursorIndex)) {
|
|
629
|
+
throw new TypeError(
|
|
630
|
+
`cursorIndex must be an integer, got ${cursorIndex}`
|
|
631
|
+
);
|
|
632
|
+
}
|
|
633
|
+
const clamped = Math.max(0, Math.min(cursorIndex, src.length));
|
|
634
|
+
if (clamped === 0) {
|
|
635
|
+
return { text: src, cursorIndex: 0 };
|
|
636
|
+
}
|
|
637
|
+
const boundaries = getCaretBoundaries(src, { normalize: false });
|
|
638
|
+
let prev = 0;
|
|
639
|
+
for (const b of boundaries) {
|
|
640
|
+
if (b >= clamped) break;
|
|
641
|
+
prev = b;
|
|
642
|
+
}
|
|
643
|
+
return {
|
|
644
|
+
text: src.slice(0, prev) + src.slice(clamped),
|
|
645
|
+
cursorIndex: prev
|
|
646
|
+
};
|
|
647
|
+
}
|
|
648
|
+
|
|
613
649
|
// src/dictionary/trie.ts
|
|
614
650
|
var TrieNode = class {
|
|
615
651
|
constructor() {
|
|
@@ -707,6 +743,8 @@ export {
|
|
|
707
743
|
containsKhmer,
|
|
708
744
|
countClusters,
|
|
709
745
|
createDictionary,
|
|
746
|
+
deleteBackward,
|
|
747
|
+
getCaretBoundaries,
|
|
710
748
|
getClusterBoundaries,
|
|
711
749
|
isKhmerChar,
|
|
712
750
|
isKhmerText,
|