khmer-segment 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +63 -24
- package/dist/index.cjs +58 -3
- package/dist/index.d.cts +11 -1
- package/dist/index.d.ts +11 -1
- package/dist/index.js +56 -3
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -27,6 +27,8 @@ import {
|
|
|
27
27
|
countClusters,
|
|
28
28
|
createDictionary,
|
|
29
29
|
segmentWords,
|
|
30
|
+
getCaretBoundaries,
|
|
31
|
+
deleteBackward,
|
|
30
32
|
} from 'khmer-segment';
|
|
31
33
|
|
|
32
34
|
// Detect Khmer text
|
|
@@ -89,10 +91,10 @@ console.log(result.tokens);
|
|
|
89
91
|
|
|
90
92
|
```ts
|
|
91
93
|
interface SegmentOptions {
|
|
92
|
-
strategy?: 'fmm' | 'bmm' | 'bimm' | 'viterbi'; // default: "
|
|
94
|
+
strategy?: 'fmm' | 'bmm' | 'bimm' | 'viterbi'; // default: "viterbi"
|
|
93
95
|
dictionary?: KhmerDictionary;
|
|
94
96
|
normalize?: boolean; // default: true
|
|
95
|
-
viterbiBoundaryPenalty?: number; // default: 0
|
|
97
|
+
viterbiBoundaryPenalty?: number; // default: 10.0 (Viterbi only)
|
|
96
98
|
}
|
|
97
99
|
```
|
|
98
100
|
|
|
@@ -213,11 +215,51 @@ Same idea as FMM, but scans right-to-left. Can produce different segmentation on
|
|
|
213
215
|
|
|
214
216
|
Runs both FMM and BMM, then picks the better result using heuristics: fewer unknown tokens wins; if tied, fewer total tokens (longer matches) wins; if still tied, FMM is preferred. This generally produces better results than either FMM or BMM alone.
|
|
215
217
|
|
|
216
|
-
### Viterbi
|
|
218
|
+
### Viterbi
|
|
217
219
|
|
|
218
220
|
Frequency-weighted dynamic programming segmentation. Finds the globally lowest-cost path through all possible word boundaries using `-log(frequency)` as word cost. Requires a dictionary with frequency data.
|
|
219
221
|
|
|
220
|
-
**
|
|
222
|
+
**Default strategy** as of v0.4.0. With a boundary penalty of 10.0, Viterbi achieves Boundary F1 = 0.8572 (+5.3 percentage points over BiMM's 0.8041) and Token F1 = 0.6744 (+4.2 percentage points over BiMM's 0.6327) while maintaining superior OOV handling (OOV Boundary F1 = 0.8875 vs BiMM's 0.4186).
|
|
223
|
+
|
|
224
|
+
### Text Editing
|
|
225
|
+
|
|
226
|
+
#### `getCaretBoundaries(text, options?)`
|
|
227
|
+
|
|
228
|
+
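Returns an ascending array of valid caret positions — string indices (UTF-16 code units) where the cursor can rest — derived from Khmer cluster boundaries. The array always starts with `0`, and each later entry is the end offset of the next cluster.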
|
|
229
|
+
|
|
230
|
+
```ts
|
|
231
|
+
import { getCaretBoundaries } from 'khmer-segment';
|
|
232
|
+
|
|
233
|
+
getCaretBoundaries(''); // [0]
|
|
234
|
+
getCaretBoundaries('ក'); // [0, 1]
|
|
235
|
+
getCaretBoundaries('ក្ក'); // [0, 3] — coeng+subscript is one cluster
|
|
236
|
+
getCaretBoundaries('កក'); // [0, 1, 2] — two clusters
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
#### `deleteBackward(text, cursorIndex, options?)`
|
|
240
|
+
|
|
241
|
+
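Deletes backward from the cursor to the nearest cluster boundary before it, so a whole cluster is removed when the cursor sits on a cluster boundary. `cursorIndex` is clamped to the range `[0, text.length]`; a non-integer `cursorIndex` throws a `TypeError`, and a cursor at position 0 is a no-op.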
|
|
242
|
+
|
|
243
|
+
```ts
|
|
244
|
+
import { deleteBackward } from 'khmer-segment';
|
|
245
|
+
|
|
246
|
+
deleteBackward('កក', 2); // { text: 'ក', cursorIndex: 1 }
|
|
247
|
+
deleteBackward('ក្កក', 4); // { text: 'ក្ក', cursorIndex: 3 } — deletes last cluster
|
|
248
|
+
deleteBackward('ក', 0); // { text: 'ក', cursorIndex: 0 } — no-op at start
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
#### `CaretOptions`
|
|
252
|
+
|
|
253
|
+
```ts
|
|
254
|
+
interface CaretOptions {
|
|
255
|
+
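normalize?: boolean; // default: false — operate on raw text; when true, the text is normalized first and the returned text/indices refer to the normalized string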
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
interface DeleteResult {
|
|
259
|
+
text: string;
|
|
260
|
+
cursorIndex: number;
|
|
261
|
+
}
|
|
262
|
+
```
|
|
221
263
|
|
|
222
264
|
### Digit Grouping
|
|
223
265
|
|
|
@@ -287,9 +329,7 @@ No framework-specific code in the core. Tree-shakeable with `sideEffects: false`
|
|
|
287
329
|
|
|
288
330
|
## Limitations
|
|
289
331
|
|
|
290
|
-
-
|
|
291
|
-
- No caret/backspace helpers yet
|
|
292
|
-
- Dictionary-based approaches have an inherent accuracy ceiling compared to statistical/ML methods (e.g. CRF achieves ~99.7% accuracy vs ~80% boundary F1 for dictionary-based matching)
|
|
332
|
+
- Dictionary-based approaches have an inherent accuracy ceiling compared to statistical/ML methods (e.g. CRF achieves ~99.7% accuracy vs ~86% boundary F1 for dictionary-based matching)
|
|
293
333
|
|
|
294
334
|
---
|
|
295
335
|
|
|
@@ -297,14 +337,14 @@ No framework-specific code in the core. Tree-shakeable with `sideEffects: false`
|
|
|
297
337
|
|
|
298
338
|
Measured on the `kh_data_10000b` dataset (87,875 sentences from [phylypo/segmentation-crf-khmer](https://github.com/phylypo/segmentation-crf-khmer)) with the default 101,107-word dictionary.
|
|
299
339
|
|
|
300
|
-
| Strategy
|
|
301
|
-
|
|
|
302
|
-
| **
|
|
303
|
-
|
|
|
304
|
-
|
|
|
305
|
-
|
|
|
340
|
+
| Strategy | Boundary F1 | Token F1 | Exact Match | OOV Rate | OOV Boundary F1 | Relative Speed |
|
|
341
|
+
| ----------- | ----------- | ---------- | ----------- | -------- | --------------- | --------------- |
|
|
342
|
+
| **Viterbi** | **0.8572** | **0.6744** | **1.4%** | 5.4% | **0.8875** | 1.4x |
|
|
343
|
+
| BiMM | 0.8041 | 0.6327 | 2.0% | 32.6% | 0.4186 | 1.0x (baseline) |
|
|
344
|
+
| FMM | 0.8024 | 0.6304 | 2.0% | 32.8% | — | 0.5x |
|
|
345
|
+
| BMM | 0.7981 | 0.6239 | 1.8% | 32.6% | — | 0.7x |
|
|
306
346
|
|
|
307
|
-
**Recommended:** `strategy: '
|
|
347
|
+
**Recommended:** `strategy: 'viterbi'` (default) for best accuracy. See [`docs/benchmark-results.md`](docs/benchmark-results.md) for full details and [`docs/benchmark-methodology.md`](docs/benchmark-methodology.md) for methodology.
|
|
308
348
|
|
|
309
349
|
---
|
|
310
350
|
|
|
@@ -336,22 +376,20 @@ Measured on the `kh_data_10000b` dataset (87,875 sentences from [phylypo/segment
|
|
|
336
376
|
- Corrected custom dictionary `size` to report unique non-empty words
|
|
337
377
|
- Added changelog, CI checks, and stricter prepublish formatting verification
|
|
338
378
|
|
|
339
|
-
### v0.3.0
|
|
379
|
+
### v0.3.0
|
|
340
380
|
|
|
341
|
-
- **Viterbi algorithm** — frequency-weighted DP segmentation
|
|
381
|
+
- **Viterbi algorithm** — frequency-weighted DP segmentation
|
|
342
382
|
- **Dictionary expansion** — 49,113 → 101,107 words (merged from Sovichea/khmer_segmenter + SIL + Royal Academy)
|
|
343
383
|
- **Full Unicode normalization** — composite vowel fixing, ROBAT ordering, stacked coeng support
|
|
344
384
|
- **Full KCC cluster model** — ROBAT continuation, independent vowel bases
|
|
345
385
|
- **Accuracy benchmarking** — 87,875-sentence gold standard, per-strategy metrics
|
|
346
|
-
- Benchmark results: BiMM Boundary F1 = 0.804, Viterbi = 0.735 (still needs cost model work)
|
|
347
386
|
|
|
348
|
-
### v0.4.0 (
|
|
387
|
+
### v0.4.0 (current)
|
|
349
388
|
|
|
350
|
-
-
|
|
351
|
-
-
|
|
352
|
-
-
|
|
353
|
-
-
|
|
354
|
-
- Compressed dictionary format
|
|
389
|
+
- **Default strategy switched to Viterbi** (penalty=10.0): Boundary F1 = 0.8572, Token F1 = 0.6744
|
|
390
|
+
- **`getCaretBoundaries(text)`** — returns valid caret positions based on Khmer cluster boundaries
|
|
391
|
+
- **`deleteBackward(text, cursorIndex)`** — cluster-safe backspace for text editors
|
|
392
|
+
- **Extended Viterbi penalty sweep** — range [0.25–10.0], documented in `docs/viterbi-penalty-sweep.md`
|
|
355
393
|
|
|
356
394
|
### Future
|
|
357
395
|
|
|
@@ -403,8 +441,9 @@ Features:
|
|
|
403
441
|
|
|
404
442
|
- Live Khmer text input with instant results
|
|
405
443
|
- Editable dictionary (add/remove words on the fly)
|
|
406
|
-
- Strategy selector (FMM / BMM / BiMM)
|
|
444
|
+
- Strategy selector (FMM / BMM / BiMM / Viterbi)
|
|
407
445
|
- Normalize toggle (On/Off)
|
|
446
|
+
- Caret boundary visualization
|
|
408
447
|
- Detection, normalization, cluster splitting, and segmentation panels
|
|
409
448
|
- JSON output with copy button
|
|
410
449
|
|
package/dist/index.cjs
CHANGED
|
@@ -23,6 +23,8 @@ __export(index_exports, {
|
|
|
23
23
|
containsKhmer: () => containsKhmer,
|
|
24
24
|
countClusters: () => countClusters,
|
|
25
25
|
createDictionary: () => createDictionary,
|
|
26
|
+
deleteBackward: () => deleteBackward,
|
|
27
|
+
getCaretBoundaries: () => getCaretBoundaries,
|
|
26
28
|
getClusterBoundaries: () => getClusterBoundaries,
|
|
27
29
|
isKhmerChar: () => isKhmerChar,
|
|
28
30
|
isKhmerText: () => isKhmerText,
|
|
@@ -44,6 +46,9 @@ var DEPENDENT_VOWEL_START = 6068;
|
|
|
44
46
|
var DEPENDENT_VOWEL_END = 6085;
|
|
45
47
|
var SIGN_START = 6086;
|
|
46
48
|
var SIGN_END = 6099;
|
|
49
|
+
var KHMER_PUNCT_KHAN = 6100;
|
|
50
|
+
var KHMER_PUNCT_BARIYOOSAN = 6101;
|
|
51
|
+
var KHMER_PUNCT_CAMNUC_PII_KUUH = 6102;
|
|
47
52
|
var KHMER_COENG = 6098;
|
|
48
53
|
var DIGIT_START = 6112;
|
|
49
54
|
var DIGIT_END = 6121;
|
|
@@ -81,6 +86,12 @@ function isAsciiDigit(cp) {
|
|
|
81
86
|
function isDigit(cp) {
|
|
82
87
|
return isKhmerDigit(cp) || isAsciiDigit(cp);
|
|
83
88
|
}
|
|
89
|
+
function isKhmerSentencePunctuation(cp) {
|
|
90
|
+
return cp === KHMER_PUNCT_KHAN || cp === KHMER_PUNCT_BARIYOOSAN || cp === KHMER_PUNCT_CAMNUC_PII_KUUH;
|
|
91
|
+
}
|
|
92
|
+
function isKhmerSentencePunctuationToken(value) {
|
|
93
|
+
return value.length === 1 && isKhmerSentencePunctuation(value.codePointAt(0));
|
|
94
|
+
}
|
|
84
95
|
function isClusterBase(cp) {
|
|
85
96
|
return isConsonant(cp) || isIndependentVowel(cp);
|
|
86
97
|
}
|
|
@@ -370,7 +381,7 @@ var DEFAULT_COST = 10;
|
|
|
370
381
|
var UNKNOWN_COST = 20;
|
|
371
382
|
var SINGLE_CONSONANT_PENALTY = 10;
|
|
372
383
|
var ORPHAN_SIGN_PENALTY = 50;
|
|
373
|
-
var DEFAULT_BOUNDARY_PENALTY =
|
|
384
|
+
var DEFAULT_BOUNDARY_PENALTY = 10;
|
|
374
385
|
function isClusterStart(cp) {
|
|
375
386
|
return cp >= 6016 && cp <= 6050 || cp >= 6051 && cp <= 6067;
|
|
376
387
|
}
|
|
@@ -452,7 +463,7 @@ function viterbiSegment(clusters, dictionary, options) {
|
|
|
452
463
|
if (cost < dp[i + 1]) {
|
|
453
464
|
dp[i + 1] = cost;
|
|
454
465
|
from[i + 1] = i;
|
|
455
|
-
fromKnown[i + 1] =
|
|
466
|
+
fromKnown[i + 1] = isKhmerSentencePunctuation(cp);
|
|
456
467
|
}
|
|
457
468
|
continue;
|
|
458
469
|
}
|
|
@@ -596,7 +607,7 @@ function segmentWords(text, options) {
|
|
|
596
607
|
const dictionary = options?.dictionary;
|
|
597
608
|
let tokens;
|
|
598
609
|
if (dictionary) {
|
|
599
|
-
const strategy = options?.strategy ?? "
|
|
610
|
+
const strategy = options?.strategy ?? "viterbi";
|
|
600
611
|
switch (strategy) {
|
|
601
612
|
case "bmm":
|
|
602
613
|
tokens = bmmSegment(clusters, dictionary);
|
|
@@ -623,12 +634,54 @@ function segmentWords(text, options) {
|
|
|
623
634
|
});
|
|
624
635
|
}
|
|
625
636
|
tokens = groupDigitTokens(tokens);
|
|
637
|
+
tokens = markKhmerSentencePunctuationKnown(tokens);
|
|
626
638
|
return {
|
|
627
639
|
original: text,
|
|
628
640
|
normalized,
|
|
629
641
|
tokens
|
|
630
642
|
};
|
|
631
643
|
}
|
|
644
|
+
function markKhmerSentencePunctuationKnown(tokens) {
|
|
645
|
+
return tokens.map(
|
|
646
|
+
(token) => isKhmerSentencePunctuationToken(token.value) ? { ...token, isKnown: true } : token
|
|
647
|
+
);
|
|
648
|
+
}
|
|
649
|
+
|
|
650
|
+
// src/core/caret.ts
|
|
651
|
+
function getCaretBoundaries(text, options) {
|
|
652
|
+
const src = options?.normalize ? normalizeKhmer(text) : text;
|
|
653
|
+
if (!src) return [0];
|
|
654
|
+
const clusters = splitClusters(src);
|
|
655
|
+
const positions = [0];
|
|
656
|
+
let offset = 0;
|
|
657
|
+
for (const cluster of clusters) {
|
|
658
|
+
offset += cluster.length;
|
|
659
|
+
positions.push(offset);
|
|
660
|
+
}
|
|
661
|
+
return positions;
|
|
662
|
+
}
|
|
663
|
+
function deleteBackward(text, cursorIndex, options) {
|
|
664
|
+
const src = options?.normalize ? normalizeKhmer(text) : text;
|
|
665
|
+
if (!Number.isInteger(cursorIndex)) {
|
|
666
|
+
throw new TypeError(
|
|
667
|
+
`cursorIndex must be an integer, got ${cursorIndex}`
|
|
668
|
+
);
|
|
669
|
+
}
|
|
670
|
+
const clamped = Math.max(0, Math.min(cursorIndex, src.length));
|
|
671
|
+
if (clamped === 0) {
|
|
672
|
+
return { text: src, cursorIndex: 0 };
|
|
673
|
+
}
|
|
674
|
+
const boundaries = getCaretBoundaries(src, { normalize: false });
|
|
675
|
+
let prev = 0;
|
|
676
|
+
for (const b of boundaries) {
|
|
677
|
+
if (b >= clamped) break;
|
|
678
|
+
prev = b;
|
|
679
|
+
}
|
|
680
|
+
return {
|
|
681
|
+
text: src.slice(0, prev) + src.slice(clamped),
|
|
682
|
+
cursorIndex: prev
|
|
683
|
+
};
|
|
684
|
+
}
|
|
632
685
|
|
|
633
686
|
// src/dictionary/trie.ts
|
|
634
687
|
var TrieNode = class {
|
|
@@ -728,6 +781,8 @@ function createDictionary(words, frequencies) {
|
|
|
728
781
|
containsKhmer,
|
|
729
782
|
countClusters,
|
|
730
783
|
createDictionary,
|
|
784
|
+
deleteBackward,
|
|
785
|
+
getCaretBoundaries,
|
|
731
786
|
getClusterBoundaries,
|
|
732
787
|
isKhmerChar,
|
|
733
788
|
isKhmerText,
|
package/dist/index.d.cts
CHANGED
|
@@ -54,9 +54,19 @@ interface KhmerDictionary {
|
|
|
54
54
|
getFrequency?(word: string): number | undefined;
|
|
55
55
|
size: number;
|
|
56
56
|
}
|
|
57
|
+
interface CaretOptions {
|
|
58
|
+
normalize?: boolean;
|
|
59
|
+
}
|
|
60
|
+
interface DeleteResult {
|
|
61
|
+
text: string;
|
|
62
|
+
cursorIndex: number;
|
|
63
|
+
}
|
|
57
64
|
|
|
58
65
|
declare function segmentWords(text: string, options?: SegmentOptions): SegmentResult;
|
|
59
66
|
|
|
67
|
+
declare function getCaretBoundaries(text: string, options?: CaretOptions): number[];
|
|
68
|
+
declare function deleteBackward(text: string, cursorIndex: number, options?: CaretOptions): DeleteResult;
|
|
69
|
+
|
|
60
70
|
declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
|
|
61
71
|
|
|
62
|
-
export { type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
|
|
72
|
+
export { type CaretOptions, type DeleteResult, type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, deleteBackward, getCaretBoundaries, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
|
package/dist/index.d.ts
CHANGED
|
@@ -54,9 +54,19 @@ interface KhmerDictionary {
|
|
|
54
54
|
getFrequency?(word: string): number | undefined;
|
|
55
55
|
size: number;
|
|
56
56
|
}
|
|
57
|
+
interface CaretOptions {
|
|
58
|
+
normalize?: boolean;
|
|
59
|
+
}
|
|
60
|
+
interface DeleteResult {
|
|
61
|
+
text: string;
|
|
62
|
+
cursorIndex: number;
|
|
63
|
+
}
|
|
57
64
|
|
|
58
65
|
declare function segmentWords(text: string, options?: SegmentOptions): SegmentResult;
|
|
59
66
|
|
|
67
|
+
declare function getCaretBoundaries(text: string, options?: CaretOptions): number[];
|
|
68
|
+
declare function deleteBackward(text: string, cursorIndex: number, options?: CaretOptions): DeleteResult;
|
|
69
|
+
|
|
60
70
|
declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
|
|
61
71
|
|
|
62
|
-
export { type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
|
|
72
|
+
export { type CaretOptions, type DeleteResult, type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, deleteBackward, getCaretBoundaries, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
|
package/dist/index.js
CHANGED
|
@@ -9,6 +9,9 @@ var DEPENDENT_VOWEL_START = 6068;
|
|
|
9
9
|
var DEPENDENT_VOWEL_END = 6085;
|
|
10
10
|
var SIGN_START = 6086;
|
|
11
11
|
var SIGN_END = 6099;
|
|
12
|
+
var KHMER_PUNCT_KHAN = 6100;
|
|
13
|
+
var KHMER_PUNCT_BARIYOOSAN = 6101;
|
|
14
|
+
var KHMER_PUNCT_CAMNUC_PII_KUUH = 6102;
|
|
12
15
|
var KHMER_COENG = 6098;
|
|
13
16
|
var DIGIT_START = 6112;
|
|
14
17
|
var DIGIT_END = 6121;
|
|
@@ -46,6 +49,12 @@ function isAsciiDigit(cp) {
|
|
|
46
49
|
function isDigit(cp) {
|
|
47
50
|
return isKhmerDigit(cp) || isAsciiDigit(cp);
|
|
48
51
|
}
|
|
52
|
+
function isKhmerSentencePunctuation(cp) {
|
|
53
|
+
return cp === KHMER_PUNCT_KHAN || cp === KHMER_PUNCT_BARIYOOSAN || cp === KHMER_PUNCT_CAMNUC_PII_KUUH;
|
|
54
|
+
}
|
|
55
|
+
function isKhmerSentencePunctuationToken(value) {
|
|
56
|
+
return value.length === 1 && isKhmerSentencePunctuation(value.codePointAt(0));
|
|
57
|
+
}
|
|
49
58
|
function isClusterBase(cp) {
|
|
50
59
|
return isConsonant(cp) || isIndependentVowel(cp);
|
|
51
60
|
}
|
|
@@ -335,7 +344,7 @@ var DEFAULT_COST = 10;
|
|
|
335
344
|
var UNKNOWN_COST = 20;
|
|
336
345
|
var SINGLE_CONSONANT_PENALTY = 10;
|
|
337
346
|
var ORPHAN_SIGN_PENALTY = 50;
|
|
338
|
-
var DEFAULT_BOUNDARY_PENALTY =
|
|
347
|
+
var DEFAULT_BOUNDARY_PENALTY = 10;
|
|
339
348
|
function isClusterStart(cp) {
|
|
340
349
|
return cp >= 6016 && cp <= 6050 || cp >= 6051 && cp <= 6067;
|
|
341
350
|
}
|
|
@@ -417,7 +426,7 @@ function viterbiSegment(clusters, dictionary, options) {
|
|
|
417
426
|
if (cost < dp[i + 1]) {
|
|
418
427
|
dp[i + 1] = cost;
|
|
419
428
|
from[i + 1] = i;
|
|
420
|
-
fromKnown[i + 1] =
|
|
429
|
+
fromKnown[i + 1] = isKhmerSentencePunctuation(cp);
|
|
421
430
|
}
|
|
422
431
|
continue;
|
|
423
432
|
}
|
|
@@ -561,7 +570,7 @@ function segmentWords(text, options) {
|
|
|
561
570
|
const dictionary = options?.dictionary;
|
|
562
571
|
let tokens;
|
|
563
572
|
if (dictionary) {
|
|
564
|
-
const strategy = options?.strategy ?? "
|
|
573
|
+
const strategy = options?.strategy ?? "viterbi";
|
|
565
574
|
switch (strategy) {
|
|
566
575
|
case "bmm":
|
|
567
576
|
tokens = bmmSegment(clusters, dictionary);
|
|
@@ -588,12 +597,54 @@ function segmentWords(text, options) {
|
|
|
588
597
|
});
|
|
589
598
|
}
|
|
590
599
|
tokens = groupDigitTokens(tokens);
|
|
600
|
+
tokens = markKhmerSentencePunctuationKnown(tokens);
|
|
591
601
|
return {
|
|
592
602
|
original: text,
|
|
593
603
|
normalized,
|
|
594
604
|
tokens
|
|
595
605
|
};
|
|
596
606
|
}
|
|
607
|
+
function markKhmerSentencePunctuationKnown(tokens) {
|
|
608
|
+
return tokens.map(
|
|
609
|
+
(token) => isKhmerSentencePunctuationToken(token.value) ? { ...token, isKnown: true } : token
|
|
610
|
+
);
|
|
611
|
+
}
|
|
612
|
+
|
|
613
|
+
// src/core/caret.ts
|
|
614
|
+
function getCaretBoundaries(text, options) {
|
|
615
|
+
const src = options?.normalize ? normalizeKhmer(text) : text;
|
|
616
|
+
if (!src) return [0];
|
|
617
|
+
const clusters = splitClusters(src);
|
|
618
|
+
const positions = [0];
|
|
619
|
+
let offset = 0;
|
|
620
|
+
for (const cluster of clusters) {
|
|
621
|
+
offset += cluster.length;
|
|
622
|
+
positions.push(offset);
|
|
623
|
+
}
|
|
624
|
+
return positions;
|
|
625
|
+
}
|
|
626
|
+
function deleteBackward(text, cursorIndex, options) {
|
|
627
|
+
const src = options?.normalize ? normalizeKhmer(text) : text;
|
|
628
|
+
if (!Number.isInteger(cursorIndex)) {
|
|
629
|
+
throw new TypeError(
|
|
630
|
+
`cursorIndex must be an integer, got ${cursorIndex}`
|
|
631
|
+
);
|
|
632
|
+
}
|
|
633
|
+
const clamped = Math.max(0, Math.min(cursorIndex, src.length));
|
|
634
|
+
if (clamped === 0) {
|
|
635
|
+
return { text: src, cursorIndex: 0 };
|
|
636
|
+
}
|
|
637
|
+
const boundaries = getCaretBoundaries(src, { normalize: false });
|
|
638
|
+
let prev = 0;
|
|
639
|
+
for (const b of boundaries) {
|
|
640
|
+
if (b >= clamped) break;
|
|
641
|
+
prev = b;
|
|
642
|
+
}
|
|
643
|
+
return {
|
|
644
|
+
text: src.slice(0, prev) + src.slice(clamped),
|
|
645
|
+
cursorIndex: prev
|
|
646
|
+
};
|
|
647
|
+
}
|
|
597
648
|
|
|
598
649
|
// src/dictionary/trie.ts
|
|
599
650
|
var TrieNode = class {
|
|
@@ -692,6 +743,8 @@ export {
|
|
|
692
743
|
containsKhmer,
|
|
693
744
|
countClusters,
|
|
694
745
|
createDictionary,
|
|
746
|
+
deleteBackward,
|
|
747
|
+
getCaretBoundaries,
|
|
695
748
|
getClusterBoundaries,
|
|
696
749
|
isKhmerChar,
|
|
697
750
|
isKhmerText,
|