khmer-segment 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +394 -0
- package/dist/dictionary/index.cjs +196608 -0
- package/dist/dictionary/index.d.cts +24 -0
- package/dist/dictionary/index.d.ts +24 -0
- package/dist/dictionary/index.js +196579 -0
- package/dist/index.cjs +491 -0
- package/dist/index.d.cts +54 -0
- package/dist/index.d.ts +54 -0
- package/dist/index.js +455 -0
- package/package.json +64 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Phalla
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
# khmer-segment
|
|
4
|
+
|
|
5
|
+
A framework-agnostic Khmer text processing library for JavaScript and TypeScript.
|
|
6
|
+
|
|
7
|
+
Works in **Next.js**, **Angular**, **React**, **Vue**, **Node.js**, and the **browser**.
|
|
8
|
+
|
|
9
|
+
Zero external dependencies. Tree-shakeable. Pure functions.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Install
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
npm install khmer-segment
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Quick Start
|
|
22
|
+
|
|
23
|
+
```ts
|
|
24
|
+
import {
|
|
25
|
+
containsKhmer, isKhmerText,
|
|
26
|
+
normalizeKhmer,
|
|
27
|
+
splitClusters,
|
|
28
|
+
countClusters,
|
|
29
|
+
createDictionary,
|
|
30
|
+
segmentWords,
|
|
31
|
+
} from 'khmer-segment';
|
|
32
|
+
|
|
33
|
+
// Detect Khmer text
|
|
34
|
+
containsKhmer('Hello សួស្តី'); // true
|
|
35
|
+
isKhmerText('សួស្តីអ្នក'); // true
|
|
36
|
+
|
|
37
|
+
// Normalize Unicode ordering
|
|
38
|
+
const text = normalizeKhmer('សួស្តីអ្នក');
|
|
39
|
+
|
|
40
|
+
// Split into grapheme clusters (not naive chars)
|
|
41
|
+
const clusters = splitClusters('សួស្តី'); // ["សួ", "ស្តី"]
|
|
42
|
+
countClusters('សួស្តី'); // 2
|
|
43
|
+
|
|
44
|
+
// Segment words with a dictionary
|
|
45
|
+
const dict = createDictionary(['សួស្តី', 'អ្នក', 'ទាំងអស់គ្នា']);
|
|
46
|
+
const result = segmentWords('សួស្តីអ្នកទាំងអស់គ្នា', { dictionary: dict });
|
|
47
|
+
|
|
48
|
+
console.log(result.tokens);
|
|
49
|
+
// [
|
|
50
|
+
// { value: "សួស្តី", start: 0, end: 6, isKnown: true },
|
|
51
|
+
// { value: "អ្នក", start: 6, end: 10, isKnown: true },
|
|
52
|
+
// { value: "ទាំងអស់គ្នា", start: 10, end: 21, isKnown: true },
|
|
53
|
+
// ]
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## API Reference
|
|
59
|
+
|
|
60
|
+
### Detection
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
| Function | Description |
|
|
64
|
+
| --------------------- | --------------------------------------------------------- |
|
|
65
|
+
| `isKhmerChar(char)` | Returns `true` if the character is a Khmer code point |
|
|
66
|
+
| `containsKhmer(text)` | Returns `true` if the text contains any Khmer characters |
|
|
67
|
+
| `isKhmerText(text)` | Returns `true` if all non-whitespace characters are Khmer |
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
### Normalization
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
| Function | Description |
|
|
74
|
+
| -------------------------------- | ------------------------------------------------------------------------------------------ |
|
|
75
|
+
| `normalizeKhmer(text)` | Reorders Khmer characters into canonical order (base → coeng → shift signs → vowel → sign) |
|
|
76
|
+
| `normalizeKhmerCluster(cluster)` | Normalizes a single cluster |
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
### Cluster Utilities
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
| Function | Description |
|
|
83
|
+
| ---------------------------- | ------------------------------------------------- |
|
|
84
|
+
| `splitClusters(text)` | Splits text into Khmer-safe grapheme clusters |
|
|
85
|
+
| `countClusters(text)` | Returns the number of clusters in the text |
|
|
86
|
+
| `getClusterBoundaries(text)` | Returns `{ start, end }` offsets for each cluster |
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
### Segmentation
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
| Function | Description |
|
|
93
|
+
| ------------------------------ | -------------------------------------------------------------- |
|
|
94
|
+
| `segmentWords(text, options?)` | Segments text into word tokens using dictionary-based matching |
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
#### `SegmentOptions`
|
|
98
|
+
|
|
99
|
+
```ts
|
|
100
|
+
interface SegmentOptions {
|
|
101
|
+
strategy?: 'fmm' | 'bmm' | 'bimm'; // default: "fmm"
|
|
102
|
+
dictionary?: KhmerDictionary;
|
|
103
|
+
normalize?: boolean; // default: true
|
|
104
|
+
}
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
#### `SegmentResult`
|
|
108
|
+
|
|
109
|
+
```ts
|
|
110
|
+
interface SegmentResult {
|
|
111
|
+
original: string;
|
|
112
|
+
normalized: string;
|
|
113
|
+
tokens: SegmentToken[];
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
interface SegmentToken {
|
|
117
|
+
value: string;
|
|
118
|
+
start: number;
|
|
119
|
+
end: number;
|
|
120
|
+
isKnown: boolean;
|
|
121
|
+
}
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Dictionary
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
| Function | Description |
|
|
128
|
+
| --------------------------------------- | ------------------------------------------------ |
|
|
129
|
+
| `createDictionary(words, frequencies?)` | Creates an in-memory dictionary from a word list |
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
```ts
|
|
133
|
+
const dict = createDictionary(['សួស្តី', 'អ្នក', 'ខ្មែរ']);
|
|
134
|
+
|
|
135
|
+
dict.has('សួស្តី'); // true
|
|
136
|
+
dict.hasPrefix!('សួ'); // true (trie-based O(k) lookup)
|
|
137
|
+
dict.hasSuffix!('ី'); // true
|
|
138
|
+
dict.size; // 3
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
#### `KhmerDictionary` interface
|
|
142
|
+
|
|
143
|
+
```ts
|
|
144
|
+
interface KhmerDictionary {
|
|
145
|
+
has(word: string): boolean;
|
|
146
|
+
hasPrefix?(value: string): boolean;
|
|
147
|
+
hasSuffix?(value: string): boolean;
|
|
148
|
+
getFrequency?(word: string): number | undefined;
|
|
149
|
+
size: number;
|
|
150
|
+
}
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
You can implement this interface for custom dictionary backends (remote, compressed, etc.).
|
|
154
|
+
|
|
155
|
+
### Default Dictionary (`khmer-segment/dictionary`)
|
|
156
|
+
|
|
157
|
+
A pre-built Khmer dictionary with **49,113 words** sourced from [khmerlbdict](https://github.com/silnrsi/khmerlbdict) (MIT) and the Royal Academy of Cambodia's Khmer Dictionary. Includes frequency data for future frequency-aware segmentation.
|
|
158
|
+
|
|
159
|
+
```ts
|
|
160
|
+
import {
|
|
161
|
+
getDefaultDictionary,
|
|
162
|
+
loadFrequencyDictionary,
|
|
163
|
+
} from 'khmer-segment/dictionary';
|
|
164
|
+
import { segmentWords } from 'khmer-segment';
|
|
165
|
+
|
|
166
|
+
const dict = getDefaultDictionary();
|
|
167
|
+
|
|
168
|
+
console.log(dict.size); // 49113
|
|
169
|
+
console.log(dict.has('កម្ពុជា')); // true
|
|
170
|
+
|
|
171
|
+
const result = segmentWords('សួស្តីអ្នកទាំងអស់គ្នា', { dictionary: dict });
|
|
172
|
+
|
|
173
|
+
const freqData = loadFrequencyDictionary();
|
|
174
|
+
console.log(freqData.words.length); // 49113
|
|
175
|
+
console.log(freqData.frequencies.get('ជា')); // 701541
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
This is a **separate import** — the core `khmer-segment` package stays small (~8KB). Only import the dictionary when you need it.
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
## How It Works
|
|
183
|
+
|
|
184
|
+
### Segmentation Pipeline
|
|
185
|
+
|
|
186
|
+
```
|
|
187
|
+
input text
|
|
188
|
+
→ normalize (reorder Unicode marks into canonical order)
|
|
189
|
+
→ split into clusters (not naive chars)
|
|
190
|
+
→ run segmentation algorithm (FMM, BMM, or BiMM)
|
|
191
|
+
→ group consecutive digits into single tokens
|
|
192
|
+
→ return structured tokens
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### Cluster Splitting
|
|
196
|
+
|
|
197
|
+
Khmer characters combine into grapheme clusters. A naive `text.split("")` breaks them incorrectly.
|
|
198
|
+
|
|
199
|
+
```
|
|
200
|
+
"ស្តី" → naive split: ["ស", "្", "ត", "ី"] (4 pieces, broken)
|
|
201
|
+
"ស្តី" → splitClusters: ["ស្តី"] (1 cluster, correct)
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
A cluster starts with a **base** (consonant or independent vowel) and accumulates:
|
|
205
|
+
|
|
206
|
+
- `្` (coeng) + consonant → subscript pair
|
|
207
|
+
- dependent vowels
|
|
208
|
+
- diacritic signs
|
|
209
|
+
|
|
210
|
+
### FMM (Forward Maximum Matching)
|
|
211
|
+
|
|
212
|
+
Scans left-to-right, greedily matching the **longest** word at each position using trie-based prefix lookup. Falls back to single unknown tokens when no match is found.
|
|
213
|
+
|
|
214
|
+
### BMM (Backward Maximum Matching)
|
|
215
|
+
|
|
216
|
+
Same idea as FMM, but scans right-to-left. Can produce different segmentation on ambiguous input where FMM greedily matches from the left.
|
|
217
|
+
|
|
218
|
+
### BiMM (Bidirectional Maximum Matching)
|
|
219
|
+
|
|
220
|
+
Runs both FMM and BMM, then picks the better result using heuristics: fewer unknown tokens wins; if tied, fewer total tokens (longer matches) wins; if still tied, FMM is preferred. This generally produces better results than either FMM or BMM alone.
|
|
221
|
+
|
|
222
|
+
### Digit Grouping
|
|
223
|
+
|
|
224
|
+
Consecutive Khmer digit clusters (and ASCII digits) are automatically merged into a single token after segmentation, so `១៨៤` or `184` becomes one token instead of three separate tokens.
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
## No Dictionary Provided
|
|
229
|
+
|
|
230
|
+
When no dictionary is passed to `segmentWords()`, it returns each cluster as an unknown token:
|
|
231
|
+
|
|
232
|
+
```ts
|
|
233
|
+
const result = segmentWords('កខគ');
|
|
234
|
+
// tokens: [
|
|
235
|
+
// { value: "ក", isKnown: false },
|
|
236
|
+
// { value: "ខ", isKnown: false },
|
|
237
|
+
// { value: "គ", isKnown: false },
|
|
238
|
+
// ]
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
---
|
|
242
|
+
|
|
243
|
+
## Dictionary Strategy
|
|
244
|
+
|
|
245
|
+
The library ships a **separate optional dictionary** via `khmer-segment/dictionary` with 49,113 Khmer words. This keeps the core package small (~8KB).
|
|
246
|
+
|
|
247
|
+
Options:
|
|
248
|
+
|
|
249
|
+
- Use the pre-built default: `getDefaultDictionary()` from `khmer-segment/dictionary`
|
|
250
|
+
- Provide your own word list via `createDictionary(words)`
|
|
251
|
+
- Load a JSON file at runtime
|
|
252
|
+
- Combine both: spread default words + your custom words
|
|
253
|
+
- Implement the `KhmerDictionary` interface for custom backends
|
|
254
|
+
|
|
255
|
+
```ts
|
|
256
|
+
// Option 1: Use the built-in dictionary
|
|
257
|
+
import { getDefaultDictionary } from 'khmer-segment/dictionary';
|
|
258
|
+
const dict = getDefaultDictionary();
|
|
259
|
+
|
|
260
|
+
// Option 2: Custom word list only
|
|
261
|
+
import { createDictionary } from 'khmer-segment';
|
|
262
|
+
const dict = createDictionary(['សួស្តី', 'អ្នក']);
|
|
263
|
+
|
|
264
|
+
// Option 3: Combine default + custom words
|
|
265
|
+
import { loadFrequencyDictionary } from 'khmer-segment/dictionary';
|
|
266
|
+
import { createDictionary } from 'khmer-segment';
|
|
267
|
+
const { words, frequencies } = loadFrequencyDictionary();
|
|
268
|
+
const dict = createDictionary([...words, 'custom_word'], frequencies);
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
## Framework Compatibility
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
| Environment | Support |
|
|
277
|
+
| ------------------- | ------- |
|
|
278
|
+
| Node.js (ESM + CJS) | Yes |
|
|
279
|
+
| Browser (ESM) | Yes |
|
|
280
|
+
| Next.js | Yes |
|
|
281
|
+
| React | Yes |
|
|
282
|
+
| Angular | Yes |
|
|
283
|
+
| Vue | Yes |
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
No framework-specific code in the core. Tree-shakeable with `sideEffects: false`.
|
|
287
|
+
|
|
288
|
+
---
|
|
289
|
+
|
|
290
|
+
## Limitations
|
|
291
|
+
|
|
292
|
+
- No frequency-aware segmentation yet
|
|
293
|
+
- Normalization covers canonical reordering (base → coeng → shift signs → vowel → sign), not all edge cases
|
|
294
|
+
- No caret/backspace helpers yet
|
|
295
|
+
- Dictionary-based approaches have an inherent accuracy ceiling compared to statistical/ML methods (e.g. CRF achieves ~99.7% accuracy vs ~95–97% for dictionary-based matching)
|
|
296
|
+
- `splitClusters` uses a simplified Khmer Character Cluster (KCC) model — it groups base + coeng + vowel + sign but does not enforce the full KCC specification
|
|
297
|
+
|
|
298
|
+
---
|
|
299
|
+
|
|
300
|
+
## Roadmap
|
|
301
|
+
|
|
302
|
+
### v0.1.0
|
|
303
|
+
|
|
304
|
+
- `isKhmerChar`, `containsKhmer`, `isKhmerText`
|
|
305
|
+
- `normalizeKhmer`, `normalizeKhmerCluster`
|
|
306
|
+
- `splitClusters`, `countClusters`, `getClusterBoundaries`
|
|
307
|
+
- `createDictionary` (trie-based in-memory)
|
|
308
|
+
- `segmentWords` with FMM
|
|
309
|
+
- Default dictionary (34K+ words, separate import)
|
|
310
|
+
|
|
311
|
+
### v0.2.0 (current)
|
|
312
|
+
|
|
313
|
+
- BMM (Backward Maximum Matching) algorithm
|
|
314
|
+
- BiMM (Bidirectional Maximum Matching) algorithm
|
|
315
|
+
- Digit grouping (consecutive Khmer digits merged into single tokens)
|
|
316
|
+
- Fixed normalization for MUUSIKATOAN (៉) and TRIISAP (៊) — shift signs now placed before vowels
|
|
317
|
+
- Fixed Unicode range constants (NIKAHIT, REAHMUK, YUUKEALAKHMOU are signs, not vowels)
|
|
318
|
+
- 149 tests
|
|
319
|
+
- `compareTyping(expected, actual)` for MonkeyType-like apps
|
|
320
|
+
- Better token metadata (`isKhmer`, `clusterCount`)
|
|
321
|
+
|
|
322
|
+
### v0.3.0
|
|
323
|
+
|
|
324
|
+
- `deleteBackward(text, cursorIndex)` — cluster-safe backspace
|
|
325
|
+
- `getCaretBoundaries(text)` — caret-safe navigation
|
|
326
|
+
- Frequency-aware segmentation
|
|
327
|
+
- Compressed dictionary format
|
|
328
|
+
|
|
329
|
+
### Future
|
|
330
|
+
|
|
331
|
+
- `khmer-segment/react` — `useKhmerSegments`, `useKhmerTyping`
|
|
332
|
+
- `khmer-segment/angular` — injectable service, pipe
|
|
333
|
+
- Compressed dictionary format
|
|
334
|
+
- ICU-style line-breaking helpers
|
|
335
|
+
|
|
336
|
+
---
|
|
337
|
+
|
|
338
|
+
## Development
|
|
339
|
+
|
|
340
|
+
```bash
|
|
341
|
+
npm install # install dependencies
|
|
342
|
+
npm run build # build with tsup (ESM + CJS + types)
|
|
343
|
+
npm test # run vitest
|
|
344
|
+
npm run test:watch # watch mode
|
|
345
|
+
npm run lint # TypeScript type check
|
|
346
|
+
```
|
|
347
|
+
|
|
348
|
+
---
|
|
349
|
+
|
|
350
|
+
## Testing
|
|
351
|
+
|
|
352
|
+
### Automated Tests
|
|
353
|
+
|
|
354
|
+
```bash
|
|
355
|
+
npm test # run the full test suite (149 tests) with vitest
|
|
356
|
+
npm run test:watch # watch mode — re-runs on changes
|
|
357
|
+
npm run lint # TypeScript type check
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
### Manual Testing (Playground)
|
|
361
|
+
|
|
362
|
+
An interactive playground is available for live manual testing of all library functions.
|
|
363
|
+
|
|
364
|
+
```bash
|
|
365
|
+
cd playground
|
|
366
|
+
npm install
|
|
367
|
+
npm run dev
|
|
368
|
+
```
|
|
369
|
+
|
|
370
|
+
Open the URL shown (typically **[http://localhost:5173](http://localhost:5173)**) in your browser.
|
|
371
|
+
|
|
372
|
+
Features:
|
|
373
|
+
|
|
374
|
+
- Live Khmer text input with instant results
|
|
375
|
+
- Editable dictionary (add/remove words on the fly)
|
|
376
|
+
- Strategy selector (FMM / BMM / BiMM)
|
|
377
|
+
- Normalize toggle (On/Off)
|
|
378
|
+
- Detection, normalization, cluster splitting, and segmentation panels
|
|
379
|
+
- JSON output with copy button
|
|
380
|
+
|
|
381
|
+
---
|
|
382
|
+
|
|
383
|
+
## References & Further Reading
|
|
384
|
+
|
|
385
|
+
- **[Word Segmentation of Khmer Text Using Conditional Random Fields](https://medium.com/@phylypo/segmentation-of-khmer-text-using-conditional-random-fields-3a2d4d73956a)** — Phylypo Tum (2019). Comprehensive overview of Khmer segmentation approaches from dictionary-based to CRF, achieving 99.7% accuracy with Linear Chain CRF.
|
|
386
|
+
- **[Khmer Word Segmentation Using Conditional Random Fields](https://www.niptict.edu.kh/khmer-word-segmentation-tool/)** — Vichea Chea, Ye Kyaw Thu, et al. (2015). The prior state-of-the-art CRF model for Khmer segmentation (98.5% accuracy, 5-tag system).
|
|
387
|
+
- **[Benchmark dataset and Python notebooks](https://github.com/phylypo/segmentation-crf-khmer)** — 10K+ segmented Khmer news articles useful for evaluating segmentation quality.
|
|
388
|
+
- **[khmerlbdict](https://github.com/silnrsi/khmerlbdict)** — One of the sources of the default dictionary used by this library (MIT license, 34K+ words).
|
|
389
|
+
|
|
390
|
+
---
|
|
391
|
+
|
|
392
|
+
## License
|
|
393
|
+
|
|
394
|
+
MIT
|