khmer-segment 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -27,6 +27,8 @@ import {
27
27
  countClusters,
28
28
  createDictionary,
29
29
  segmentWords,
30
+ getCaretBoundaries,
31
+ deleteBackward,
30
32
  } from 'khmer-segment';
31
33
 
32
34
  // Detect Khmer text
@@ -89,10 +91,10 @@ console.log(result.tokens);
89
91
 
90
92
  ```ts
91
93
  interface SegmentOptions {
92
- strategy?: 'fmm' | 'bmm' | 'bimm' | 'viterbi'; // default: "fmm"
94
+ strategy?: 'fmm' | 'bmm' | 'bimm' | 'viterbi'; // default: "viterbi"
93
95
  dictionary?: KhmerDictionary;
94
96
  normalize?: boolean; // default: true
95
- viterbiBoundaryPenalty?: number; // default: 0.75 (Viterbi only)
97
+ viterbiBoundaryPenalty?: number; // default: 10.0 (Viterbi only)
96
98
  }
97
99
  ```
98
100
 
@@ -213,11 +215,51 @@ Same idea as FMM, but scans right-to-left. Can produce different segmentation on
213
215
 
214
216
  Runs both FMM and BMM, then picks the better result using heuristics: fewer unknown tokens wins; if tied, fewer total tokens (longer matches) wins; if still tied, FMM is preferred. This generally produces better results than either FMM or BMM alone.
215
217
 
216
- ### Viterbi (Experimental)
218
+ ### Viterbi
217
219
 
218
220
  Frequency-weighted dynamic programming segmentation. Finds the globally lowest-cost path through all possible word boundaries using `-log(frequency)` as word cost. Requires a dictionary with frequency data.
219
221
 
220
- **Note:** The current cost model still over-segments on real-world text (Boundary F1 = 0.735 vs BiMM's 0.804). Use `strategy: 'bimm'` for best results until the Viterbi cost model is tuned further.
222
+ **Default strategy** as of v0.4.0. With a boundary penalty of 10.0, Viterbi achieves Boundary F1 = 0.8572 (+5.3 points over BiMM) and Token F1 = 0.6744 (+4.2 points over BiMM) while maintaining superior OOV handling (OOV Boundary F1 = 0.8875 vs BiMM's 0.4186).
223
+
224
+ ### Text Editing
225
+
226
+ #### `getCaretBoundaries(text, options?)`
227
+
228
+ Returns an array of valid caret positions (indices where the cursor can rest) based on Khmer cluster boundaries.
229
+
230
+ ```ts
231
+ import { getCaretBoundaries } from 'khmer-segment';
232
+
233
+ getCaretBoundaries(''); // [0]
234
+ getCaretBoundaries('ក'); // [0, 1]
235
+ getCaretBoundaries('ក្ក'); // [0, 3] — coeng+subscript is one cluster
236
+ getCaretBoundaries('កក'); // [0, 1, 2] — two clusters
237
+ ```
238
+
239
+ #### `deleteBackward(text, cursorIndex, options?)`
240
+
241
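+ Deletes the cluster (or character) before the cursor, respecting cluster boundaries. If the cursor sits inside a cluster, only the part of that cluster before the cursor is removed. An out-of-range `cursorIndex` is clamped to `[0, text.length]`; a non-integer `cursorIndex` throws a `TypeError`.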
242
+
243
+ ```ts
244
+ import { deleteBackward } from 'khmer-segment';
245
+
246
+ deleteBackward('កក', 2); // { text: 'ក', cursorIndex: 1 }
247
+ deleteBackward('ក្កក', 4); // { text: 'ក្ក', cursorIndex: 3 } — deletes last cluster
248
+ deleteBackward('ក', 0); // { text: 'ក', cursorIndex: 0 } — no-op at start
249
+ ```
250
+
251
+ #### `CaretOptions`
252
+
253
+ ```ts
254
+ interface CaretOptions {
255
+ normalize?: boolean; // default: false — operate on raw text
256
+ }
257
+
258
+ interface DeleteResult {
259
+ text: string;
260
+ cursorIndex: number;
261
+ }
262
+ ```
221
263
 
222
264
  ### Digit Grouping
223
265
 
@@ -287,9 +329,7 @@ No framework-specific code in the core. Tree-shakeable with `sideEffects: false`
287
329
 
288
330
  ## Limitations
289
331
 
290
- - Viterbi strategy is experimental — cost model over-segments; BiMM recommended for best accuracy
291
- - No caret/backspace helpers yet
292
- - Dictionary-based approaches have an inherent accuracy ceiling compared to statistical/ML methods (e.g. CRF achieves ~99.7% accuracy vs ~80% boundary F1 for dictionary-based matching)
332
+ - Dictionary-based approaches have an inherent accuracy ceiling compared to statistical/ML methods (e.g. CRF achieves ~99.7% accuracy vs ~86% boundary F1 for dictionary-based matching)
293
333
 
294
334
  ---
295
335
 
@@ -297,14 +337,14 @@ No framework-specific code in the core. Tree-shakeable with `sideEffects: false`
297
337
 
298
338
  Measured on the `kh_data_10000b` dataset (87,875 sentences from [phylypo/segmentation-crf-khmer](https://github.com/phylypo/segmentation-crf-khmer)) with the default 101,107-word dictionary.
299
339
 
300
- | Strategy | Boundary F1 | Token F1 | Exact Match | OOV Rate | Relative Speed |
301
- | -------- | ----------- | ---------- | ----------- | -------- | --------------- |
302
- | **BiMM** | **0.8041** | **0.6327** | **2.0%** | 32.6% | 1.0x (baseline) |
303
- | FMM | 0.8024 | 0.6304 | 2.0% | 32.8% | 0.5x |
304
- | BMM | 0.7981 | 0.6239 | 1.8% | 32.6% | 0.7x |
305
- | Viterbi | 0.7348 | 0.4340 | 0.1% | 5.4% | 1.4x |
340
+ | Strategy | Boundary F1 | Token F1 | Exact Match | OOV Rate | OOV Boundary F1 | Relative Speed |
341
+ | ----------- | ----------- | ---------- | ----------- | -------- | --------------- | --------------- |
342
+ | **Viterbi** | **0.8572** | **0.6744** | 1.4% | 5.4% | **0.8875** | 1.4x |
343
+ | BiMM | 0.8041 | 0.6327 | 2.0% | 32.6% | 0.4186 | 1.0x (baseline) |
344
+ | FMM | 0.8024 | 0.6304 | 2.0% | 32.8% | — | 0.5x |
345
+ | BMM | 0.7981 | 0.6239 | 1.8% | 32.6% | — | 0.7x |
306
346
 
307
- **Recommended:** `strategy: 'bimm'` for best accuracy. See [`docs/benchmark-results.md`](docs/benchmark-results.md) for full details and [`docs/benchmark-methodology.md`](docs/benchmark-methodology.md) for methodology.
347
+ **Recommended:** `strategy: 'viterbi'` (default) for best accuracy. See [`docs/benchmark-results.md`](docs/benchmark-results.md) for full details and [`docs/benchmark-methodology.md`](docs/benchmark-methodology.md) for methodology.
308
348
 
309
349
  ---
310
350
 
@@ -336,22 +376,20 @@ Measured on the `kh_data_10000b` dataset (87,875 sentences from [phylypo/segment
336
376
  - Corrected custom dictionary `size` to report unique non-empty words
337
377
  - Added changelog, CI checks, and stricter prepublish formatting verification
338
378
 
339
- ### v0.3.0 (current)
379
+ ### v0.3.0
340
380
 
341
- - **Viterbi algorithm** — frequency-weighted DP segmentation (experimental; cost model needs tuning)
381
+ - **Viterbi algorithm** — frequency-weighted DP segmentation
342
382
  - **Dictionary expansion** — 49,113 → 101,107 words (merged from Sovichea/khmer_segmenter + SIL + Royal Academy)
343
383
  - **Full Unicode normalization** — composite vowel fixing, ROBAT ordering, stacked coeng support
344
384
  - **Full KCC cluster model** — ROBAT continuation, independent vowel bases
345
385
  - **Accuracy benchmarking** — 87,875-sentence gold standard, per-strategy metrics
346
- - Benchmark results: BiMM Boundary F1 = 0.804, Viterbi = 0.735 (still needs cost model work)
347
386
 
348
- ### v0.4.0 (planned)
387
+ ### v0.4.0 (current)
349
388
 
350
- - `deleteBackward(text, cursorIndex)` — cluster-safe backspace
351
- - `getCaretBoundaries(text)` — caret-safe navigation
352
- - Viterbi cost model tuning (boundary penalty)
353
- - Switch default strategy to Viterbi after tuning
354
- - Compressed dictionary format
389
+ - **Default strategy switched to Viterbi** (penalty=10.0): Boundary F1 = 0.8572, Token F1 = 0.6744
390
+ - **`getCaretBoundaries(text)`** — returns valid caret positions based on Khmer cluster boundaries
391
+ - **`deleteBackward(text, cursorIndex)`** — cluster-safe backspace for text editors
392
+ - **Extended Viterbi penalty sweep** — range [0.25–10.0], documented in `docs/viterbi-penalty-sweep.md`
355
393
 
356
394
  ### Future
357
395
 
@@ -403,8 +441,9 @@ Features:
403
441
 
404
442
  - Live Khmer text input with instant results
405
443
  - Editable dictionary (add/remove words on the fly)
406
- - Strategy selector (FMM / BMM / BiMM)
444
+ - Strategy selector (FMM / BMM / BiMM / Viterbi)
407
445
  - Normalize toggle (On/Off)
446
+ - Caret boundary visualization
408
447
  - Detection, normalization, cluster splitting, and segmentation panels
409
448
  - JSON output with copy button
410
449
 
package/dist/index.cjs CHANGED
@@ -23,6 +23,8 @@ __export(index_exports, {
23
23
  containsKhmer: () => containsKhmer,
24
24
  countClusters: () => countClusters,
25
25
  createDictionary: () => createDictionary,
26
+ deleteBackward: () => deleteBackward,
27
+ getCaretBoundaries: () => getCaretBoundaries,
26
28
  getClusterBoundaries: () => getClusterBoundaries,
27
29
  isKhmerChar: () => isKhmerChar,
28
30
  isKhmerText: () => isKhmerText,
@@ -44,6 +46,9 @@ var DEPENDENT_VOWEL_START = 6068;
44
46
  var DEPENDENT_VOWEL_END = 6085;
45
47
  var SIGN_START = 6086;
46
48
  var SIGN_END = 6099;
49
+ var KHMER_PUNCT_KHAN = 6100;
50
+ var KHMER_PUNCT_BARIYOOSAN = 6101;
51
+ var KHMER_PUNCT_CAMNUC_PII_KUUH = 6102;
47
52
  var KHMER_COENG = 6098;
48
53
  var DIGIT_START = 6112;
49
54
  var DIGIT_END = 6121;
@@ -81,6 +86,12 @@ function isAsciiDigit(cp) {
81
86
  function isDigit(cp) {
82
87
  return isKhmerDigit(cp) || isAsciiDigit(cp);
83
88
  }
89
+ function isKhmerSentencePunctuation(cp) {
90
+ return cp === KHMER_PUNCT_KHAN || cp === KHMER_PUNCT_BARIYOOSAN || cp === KHMER_PUNCT_CAMNUC_PII_KUUH;
91
+ }
92
+ function isKhmerSentencePunctuationToken(value) {
93
+ return value.length === 1 && isKhmerSentencePunctuation(value.codePointAt(0));
94
+ }
84
95
  function isClusterBase(cp) {
85
96
  return isConsonant(cp) || isIndependentVowel(cp);
86
97
  }
@@ -370,7 +381,7 @@ var DEFAULT_COST = 10;
370
381
  var UNKNOWN_COST = 20;
371
382
  var SINGLE_CONSONANT_PENALTY = 10;
372
383
  var ORPHAN_SIGN_PENALTY = 50;
373
- var DEFAULT_BOUNDARY_PENALTY = 0.75;
384
+ var DEFAULT_BOUNDARY_PENALTY = 10;
374
385
  function isClusterStart(cp) {
375
386
  return cp >= 6016 && cp <= 6050 || cp >= 6051 && cp <= 6067;
376
387
  }
@@ -452,7 +463,7 @@ function viterbiSegment(clusters, dictionary, options) {
452
463
  if (cost < dp[i + 1]) {
453
464
  dp[i + 1] = cost;
454
465
  from[i + 1] = i;
455
- fromKnown[i + 1] = false;
466
+ fromKnown[i + 1] = isKhmerSentencePunctuation(cp);
456
467
  }
457
468
  continue;
458
469
  }
@@ -596,7 +607,7 @@ function segmentWords(text, options) {
596
607
  const dictionary = options?.dictionary;
597
608
  let tokens;
598
609
  if (dictionary) {
599
- const strategy = options?.strategy ?? "fmm";
610
+ const strategy = options?.strategy ?? "viterbi";
600
611
  switch (strategy) {
601
612
  case "bmm":
602
613
  tokens = bmmSegment(clusters, dictionary);
@@ -623,12 +634,54 @@ function segmentWords(text, options) {
623
634
  });
624
635
  }
625
636
  tokens = groupDigitTokens(tokens);
637
+ tokens = markKhmerSentencePunctuationKnown(tokens);
626
638
  return {
627
639
  original: text,
628
640
  normalized,
629
641
  tokens
630
642
  };
631
643
  }
644
+ function markKhmerSentencePunctuationKnown(tokens) {
645
+ return tokens.map(
646
+ (token) => isKhmerSentencePunctuationToken(token.value) ? { ...token, isKnown: true } : token
647
+ );
648
+ }
649
+
650
+ // src/core/caret.ts
651
+ function getCaretBoundaries(text, options) {
652
+ const src = options?.normalize ? normalizeKhmer(text) : text;
653
+ if (!src) return [0];
654
+ const clusters = splitClusters(src);
655
+ const positions = [0];
656
+ let offset = 0;
657
+ for (const cluster of clusters) {
658
+ offset += cluster.length;
659
+ positions.push(offset);
660
+ }
661
+ return positions;
662
+ }
663
+ function deleteBackward(text, cursorIndex, options) {
664
+ const src = options?.normalize ? normalizeKhmer(text) : text;
665
+ if (!Number.isInteger(cursorIndex)) {
666
+ throw new TypeError(
667
+ `cursorIndex must be an integer, got ${cursorIndex}`
668
+ );
669
+ }
670
+ const clamped = Math.max(0, Math.min(cursorIndex, src.length));
671
+ if (clamped === 0) {
672
+ return { text: src, cursorIndex: 0 };
673
+ }
674
+ const boundaries = getCaretBoundaries(src, { normalize: false });
675
+ let prev = 0;
676
+ for (const b of boundaries) {
677
+ if (b >= clamped) break;
678
+ prev = b;
679
+ }
680
+ return {
681
+ text: src.slice(0, prev) + src.slice(clamped),
682
+ cursorIndex: prev
683
+ };
684
+ }
632
685
 
633
686
  // src/dictionary/trie.ts
634
687
  var TrieNode = class {
@@ -728,6 +781,8 @@ function createDictionary(words, frequencies) {
728
781
  containsKhmer,
729
782
  countClusters,
730
783
  createDictionary,
784
+ deleteBackward,
785
+ getCaretBoundaries,
731
786
  getClusterBoundaries,
732
787
  isKhmerChar,
733
788
  isKhmerText,
package/dist/index.d.cts CHANGED
@@ -54,9 +54,19 @@ interface KhmerDictionary {
54
54
  getFrequency?(word: string): number | undefined;
55
55
  size: number;
56
56
  }
57
+ interface CaretOptions {
58
+ normalize?: boolean;
59
+ }
60
+ interface DeleteResult {
61
+ text: string;
62
+ cursorIndex: number;
63
+ }
57
64
 
58
65
  declare function segmentWords(text: string, options?: SegmentOptions): SegmentResult;
59
66
 
67
+ declare function getCaretBoundaries(text: string, options?: CaretOptions): number[];
68
+ declare function deleteBackward(text: string, cursorIndex: number, options?: CaretOptions): DeleteResult;
69
+
60
70
  declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
61
71
 
62
- export { type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
72
+ export { type CaretOptions, type DeleteResult, type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, deleteBackward, getCaretBoundaries, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
package/dist/index.d.ts CHANGED
@@ -54,9 +54,19 @@ interface KhmerDictionary {
54
54
  getFrequency?(word: string): number | undefined;
55
55
  size: number;
56
56
  }
57
+ interface CaretOptions {
58
+ normalize?: boolean;
59
+ }
60
+ interface DeleteResult {
61
+ text: string;
62
+ cursorIndex: number;
63
+ }
57
64
 
58
65
  declare function segmentWords(text: string, options?: SegmentOptions): SegmentResult;
59
66
 
67
+ declare function getCaretBoundaries(text: string, options?: CaretOptions): number[];
68
+ declare function deleteBackward(text: string, cursorIndex: number, options?: CaretOptions): DeleteResult;
69
+
60
70
  declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
61
71
 
62
- export { type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
72
+ export { type CaretOptions, type DeleteResult, type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, deleteBackward, getCaretBoundaries, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
package/dist/index.js CHANGED
@@ -9,6 +9,9 @@ var DEPENDENT_VOWEL_START = 6068;
9
9
  var DEPENDENT_VOWEL_END = 6085;
10
10
  var SIGN_START = 6086;
11
11
  var SIGN_END = 6099;
12
+ var KHMER_PUNCT_KHAN = 6100;
13
+ var KHMER_PUNCT_BARIYOOSAN = 6101;
14
+ var KHMER_PUNCT_CAMNUC_PII_KUUH = 6102;
12
15
  var KHMER_COENG = 6098;
13
16
  var DIGIT_START = 6112;
14
17
  var DIGIT_END = 6121;
@@ -46,6 +49,12 @@ function isAsciiDigit(cp) {
46
49
  function isDigit(cp) {
47
50
  return isKhmerDigit(cp) || isAsciiDigit(cp);
48
51
  }
52
+ function isKhmerSentencePunctuation(cp) {
53
+ return cp === KHMER_PUNCT_KHAN || cp === KHMER_PUNCT_BARIYOOSAN || cp === KHMER_PUNCT_CAMNUC_PII_KUUH;
54
+ }
55
+ function isKhmerSentencePunctuationToken(value) {
56
+ return value.length === 1 && isKhmerSentencePunctuation(value.codePointAt(0));
57
+ }
49
58
  function isClusterBase(cp) {
50
59
  return isConsonant(cp) || isIndependentVowel(cp);
51
60
  }
@@ -335,7 +344,7 @@ var DEFAULT_COST = 10;
335
344
  var UNKNOWN_COST = 20;
336
345
  var SINGLE_CONSONANT_PENALTY = 10;
337
346
  var ORPHAN_SIGN_PENALTY = 50;
338
- var DEFAULT_BOUNDARY_PENALTY = 0.75;
347
+ var DEFAULT_BOUNDARY_PENALTY = 10;
339
348
  function isClusterStart(cp) {
340
349
  return cp >= 6016 && cp <= 6050 || cp >= 6051 && cp <= 6067;
341
350
  }
@@ -417,7 +426,7 @@ function viterbiSegment(clusters, dictionary, options) {
417
426
  if (cost < dp[i + 1]) {
418
427
  dp[i + 1] = cost;
419
428
  from[i + 1] = i;
420
- fromKnown[i + 1] = false;
429
+ fromKnown[i + 1] = isKhmerSentencePunctuation(cp);
421
430
  }
422
431
  continue;
423
432
  }
@@ -561,7 +570,7 @@ function segmentWords(text, options) {
561
570
  const dictionary = options?.dictionary;
562
571
  let tokens;
563
572
  if (dictionary) {
564
- const strategy = options?.strategy ?? "fmm";
573
+ const strategy = options?.strategy ?? "viterbi";
565
574
  switch (strategy) {
566
575
  case "bmm":
567
576
  tokens = bmmSegment(clusters, dictionary);
@@ -588,12 +597,54 @@ function segmentWords(text, options) {
588
597
  });
589
598
  }
590
599
  tokens = groupDigitTokens(tokens);
600
+ tokens = markKhmerSentencePunctuationKnown(tokens);
591
601
  return {
592
602
  original: text,
593
603
  normalized,
594
604
  tokens
595
605
  };
596
606
  }
607
+ function markKhmerSentencePunctuationKnown(tokens) {
608
+ return tokens.map(
609
+ (token) => isKhmerSentencePunctuationToken(token.value) ? { ...token, isKnown: true } : token
610
+ );
611
+ }
612
+
613
+ // src/core/caret.ts
614
+ function getCaretBoundaries(text, options) {
615
+ const src = options?.normalize ? normalizeKhmer(text) : text;
616
+ if (!src) return [0];
617
+ const clusters = splitClusters(src);
618
+ const positions = [0];
619
+ let offset = 0;
620
+ for (const cluster of clusters) {
621
+ offset += cluster.length;
622
+ positions.push(offset);
623
+ }
624
+ return positions;
625
+ }
626
+ function deleteBackward(text, cursorIndex, options) {
627
+ const src = options?.normalize ? normalizeKhmer(text) : text;
628
+ if (!Number.isInteger(cursorIndex)) {
629
+ throw new TypeError(
630
+ `cursorIndex must be an integer, got ${cursorIndex}`
631
+ );
632
+ }
633
+ const clamped = Math.max(0, Math.min(cursorIndex, src.length));
634
+ if (clamped === 0) {
635
+ return { text: src, cursorIndex: 0 };
636
+ }
637
+ const boundaries = getCaretBoundaries(src, { normalize: false });
638
+ let prev = 0;
639
+ for (const b of boundaries) {
640
+ if (b >= clamped) break;
641
+ prev = b;
642
+ }
643
+ return {
644
+ text: src.slice(0, prev) + src.slice(clamped),
645
+ cursorIndex: prev
646
+ };
647
+ }
597
648
 
598
649
  // src/dictionary/trie.ts
599
650
  var TrieNode = class {
@@ -692,6 +743,8 @@ export {
692
743
  containsKhmer,
693
744
  countClusters,
694
745
  createDictionary,
746
+ deleteBackward,
747
+ getCaretBoundaries,
695
748
  getClusterBoundaries,
696
749
  isKhmerChar,
697
750
  isKhmerText,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "khmer-segment",
3
- "version": "0.3.1",
3
+ "version": "0.4.0",
4
4
  "description": "Khmer text segmentation, normalization, and cluster utilities for JavaScript and TypeScript.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",