khmer-segment 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,5 +1,3 @@
1
-
2
-
3
1
  # khmer-segment
4
2
 
5
3
  A framework-agnostic Khmer text processing library for JavaScript and TypeScript.
@@ -23,6 +21,7 @@ npm install khmer-segment
23
21
  ```ts
24
22
  import {
25
23
  containsKhmer,
24
+ isKhmerText,
26
25
  normalizeKhmer,
27
26
  splitClusters,
28
27
  countClusters,
@@ -59,41 +58,33 @@ console.log(result.tokens);
59
58
 
60
59
  ### Detection
61
60
 
62
-
63
61
  | Function | Description |
64
62
  | --------------------- | --------------------------------------------------------- |
65
63
  | `isKhmerChar(char)` | Returns `true` if the character is a Khmer code point |
66
64
  | `containsKhmer(text)` | Returns `true` if the text contains any Khmer characters |
67
65
  | `isKhmerText(text)` | Returns `true` if all non-whitespace characters are Khmer |
68
66
 
69
-
70
67
  ### Normalization
71
68
 
72
-
73
69
  | Function | Description |
74
70
  | -------------------------------- | ------------------------------------------------------------------------------------------ |
75
71
  | `normalizeKhmer(text)` | Reorders Khmer characters into canonical order (base → coeng → shift signs → vowel → sign) |
76
72
  | `normalizeKhmerCluster(cluster)` | Normalizes a single cluster |
77
73
 
78
-
79
74
  ### Cluster Utilities
80
75
 
81
-
82
76
  | Function | Description |
83
77
  | ---------------------------- | ------------------------------------------------- |
84
78
  | `splitClusters(text)` | Splits text into Khmer-safe grapheme clusters |
85
79
  | `countClusters(text)` | Returns the number of clusters in the text |
86
80
  | `getClusterBoundaries(text)` | Returns `{ start, end }` offsets for each cluster |
87
81
 
88
-
89
82
  ### Segmentation
90
83
 
91
-
92
84
  | Function | Description |
93
85
  | ------------------------------ | -------------------------------------------------------------- |
94
86
  | `segmentWords(text, options?)` | Segments text into word tokens using dictionary-based matching |
95
87
 
96
-
97
88
  #### `SegmentOptions`
98
89
 
99
90
  ```ts
@@ -115,27 +106,27 @@ interface SegmentResult {
115
106
 
116
107
  interface SegmentToken {
117
108
  value: string;
118
- start: number;
119
- end: number;
109
+ start: number; // zero-based offset into result.normalized
110
+ end: number; // exclusive offset into result.normalized
120
111
  isKnown: boolean;
121
112
  }
122
113
  ```
123
114
 
124
- ### Dictionary
115
+ When normalization is enabled, token offsets always refer to `result.normalized`. Invisible characters such as the zero-width space (ZWS), zero-width joiner (ZWJ), and byte-order mark (BOM) may be stripped during normalization, so offsets may not line up with the original input string.
125
116
 
117
+ ### Dictionary
126
118
 
127
119
  | Function | Description |
128
120
  | --------------------------------------- | ------------------------------------------------ |
129
121
  | `createDictionary(words, frequencies?)` | Creates an in-memory dictionary from a word list |
130
122
 
131
-
132
123
  ```ts
133
124
  const dict = createDictionary(['សួស្តី', 'អ្នក', 'ខ្មែរ']);
134
125
 
135
126
  dict.has('សួស្តី'); // true
136
127
  dict.hasPrefix!('សួ'); // true (trie-based O(k) lookup)
137
128
  dict.hasSuffix!('ី'); // true
138
- dict.size; // 3
129
+ dict.size; // 3 unique words
139
130
  ```
140
131
 
141
132
  #### `KhmerDictionary` interface
@@ -175,7 +166,9 @@ console.log(freqData.words.length); // 49113
175
166
  console.log(freqData.frequencies.get('ជា')); // 701541
176
167
  ```
177
168
 
178
- This is a **separate import** — the core `khmer-segment` package stays small (~8KB). Only import the dictionary when you need it.
169
+ This is a **separate import** — the core `khmer-segment` package stays small (~11KB). The dictionary module is ~3.9MB. Only import the dictionary when you need it.
170
+
171
+ `loadFrequencyDictionary()` builds its return value from cached dictionary data, but each call returns fresh arrays and a fresh `Map`. You can safely extend or mutate the returned data without affecting later calls.
179
172
 
180
173
  ---
181
174
 
@@ -242,7 +235,7 @@ const result = segmentWords('កខគ');
242
235
 
243
236
  ## Dictionary Strategy
244
237
 
245
- The library ships a **separate optional dictionary** via `khmer-segment/dictionary` with 49,113 Khmer words. This keeps the core package small (~8KB).
238
+ The library ships a **separate optional dictionary** via `khmer-segment/dictionary` with 49,113 Khmer words. This keeps the core package small (~11KB).
246
239
 
247
240
  Options:
248
241
 
@@ -272,7 +265,6 @@ const dict = createDictionary([...words, 'custom_word'], frequencies);
272
265
 
273
266
  ## Framework Compatibility
274
267
 
275
-
276
268
  | Environment | Support |
277
269
  | ------------------- | ------- |
278
270
  | Node.js (ESM + CJS) | Yes |
@@ -282,7 +274,6 @@ const dict = createDictionary([...words, 'custom_word'], frequencies);
282
274
  | Angular | Yes |
283
275
  | Vue | Yes |
284
276
 
285
-
286
277
  No framework-specific code in the core. Tree-shakeable with `sideEffects: false`.
287
278
 
288
279
  ---
@@ -308,16 +299,22 @@ No framework-specific code in the core. Tree-shakeable with `sideEffects: false`
308
299
  - `segmentWords` with FMM
309
300
  - Default dictionary (34K+ words, separate import)
310
301
 
311
- ### v0.2.0 (current)
302
+ ### v0.2.1
312
303
 
313
304
  - BMM (Backward Maximum Matching) algorithm
314
305
  - BiMM (Bidirectional Maximum Matching) algorithm
315
306
  - Digit grouping (consecutive Khmer digits merged into single tokens)
316
307
  - Fixed normalization for MUUSIKATOAN (៉) and TRIISAP (៊) — shift signs now placed before vowels
317
308
  - Fixed Unicode range constants (NIKAHIT, REAHMUK, YUUKEALAKHMOU are signs, not vowels)
318
- - 149 tests
319
- - `compareTyping(expected, actual)` for MonkeyType-like apps
320
- - Better token metadata (`isKhmer`, `clusterCount`)
309
+ - Rebuilt dictionary with 49,113 words (merged from 10 sources)
310
+
311
+ ### v0.2.2 (current)
312
+
313
+ - Clarified that token offsets are measured against `result.normalized`
314
+ - Expanded Vitest coverage across normalization, dictionary, and segmentation behavior
315
+ - Made `loadFrequencyDictionary()` safe to reuse across calls without shared-state pollution
316
+ - Corrected custom dictionary `size` to report unique non-empty words
317
+ - Added changelog, CI checks, and stricter prepublish formatting verification
321
318
 
322
319
  ### v0.3.0
323
320
 
@@ -341,6 +338,7 @@ No framework-specific code in the core. Tree-shakeable with `sideEffects: false`
341
338
  npm install # install dependencies
342
339
  npm run build # build with tsup (ESM + CJS + types)
343
340
  npm test # run vitest
341
+ npm run test:perf # optional performance-focused checks
344
342
  npm run test:watch # watch mode
345
343
  npm run lint # TypeScript type check
346
344
  ```
@@ -352,7 +350,8 @@ npm run lint # TypeScript type check
352
350
  ### Automated Tests
353
351
 
354
352
  ```bash
355
- npm test # run 98 tests with vitest
353
+ npm test # run the main Vitest correctness suite
354
+ npm run test:perf # optional performance-focused checks
356
355
  npm run test:watch # watch mode — re-runs on changes
357
356
  npm run lint # TypeScript type check
358
357
  ```
@@ -385,10 +384,10 @@ Features:
385
384
  - **[Word Segmentation of Khmer Text Using Conditional Random Fields](https://medium.com/@phylypo/segmentation-of-khmer-text-using-conditional-random-fields-3a2d4d73956a)** — Phylypo Tum (2019). Comprehensive overview of Khmer segmentation approaches from dictionary-based to CRF, achieving 99.7% accuracy with Linear Chain CRF.
386
385
  - **[Khmer Word Segmentation Using Conditional Random Fields](https://www.niptict.edu.kh/khmer-word-segmentation-tool/)** — Vichea Chea, Ye Kyaw Thu, et al. (2015). The prior state-of-the-art CRF model for Khmer segmentation (98.5% accuracy, 5-tag system).
387
386
  - **[Benchmark dataset and Python notebooks](https://github.com/phylypo/segmentation-crf-khmer)** — 10K+ segmented Khmer news articles useful for evaluating segmentation quality.
388
- - **[khmerlbdict](https://github.com/silnrsi/khmerlbdict)** — Source of the default dictionary used by this library (MIT license, 34K+ words).
387
+ - **[khmerlbdict](https://github.com/silnrsi/khmerlbdict)** — Source of the default dictionary used by this library (MIT license). Merged with additional word lists, including the Royal Academy of Cambodia's Khmer Dictionary, for a total of 49,113 words.
389
388
 
390
389
  ---
391
390
 
392
391
  ## License
393
392
 
394
- MIT
393
+ MIT
@@ -87,15 +87,19 @@ var MemoryDictionary = class {
87
87
  this.trie = new Trie();
88
88
  this.reverseTrie = new Trie();
89
89
  this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
90
- let count = 0;
90
+ const uniqueWords = /* @__PURE__ */ new Set();
91
91
  for (const word of words) {
92
+ if (word.length > 0) {
93
+ uniqueWords.add(word);
94
+ }
95
+ }
96
+ for (const word of uniqueWords) {
92
97
  if (word.length > 0) {
93
98
  this.trie.insert(word);
94
99
  this.reverseTrie.insert([...word].reverse().join(""));
95
- count++;
96
100
  }
97
101
  }
98
- this.size = count;
102
+ this.size = uniqueWords.size;
99
103
  }
100
104
  has(word) {
101
105
  return this.trie.has(word);
@@ -196590,14 +196594,28 @@ function getDefaultDictionary() {
196590
196594
  var cached = null;
196591
196595
  function loadFrequencyDictionary() {
196592
196596
  if (!cached) {
196593
- const entries = khmer_words_default;
196594
- const words = entries.map((e) => e.word);
196597
+ const entries = Object.freeze(
196598
+ khmer_words_default.map(
196599
+ (entry) => Object.freeze({
196600
+ word: entry.word,
196601
+ freq: entry.freq
196602
+ })
196603
+ )
196604
+ );
196605
+ const words = Object.freeze(entries.map((entry) => entry.word));
196595
196606
  const frequencies = new Map(
196596
- entries.map((e) => [e.word, e.freq])
196607
+ entries.map((entry) => [entry.word, entry.freq])
196597
196608
  );
196598
- cached = { words, entries, frequencies };
196609
+ cached = Object.freeze({ words, entries, frequencies });
196599
196610
  }
196600
- return cached;
196611
+ return {
196612
+ words: [...cached.words],
196613
+ entries: cached.entries.map((entry) => ({
196614
+ word: entry.word,
196615
+ freq: entry.freq
196616
+ })),
196617
+ frequencies: new Map(cached.frequencies)
196618
+ };
196601
196619
  }
196602
196620
  // Annotate the CommonJS export names for ESM import in node:
196603
196621
  0 && (module.exports = {
@@ -19,6 +19,6 @@ interface FrequencyDictionary {
19
19
  }
20
20
  declare function loadFrequencyDictionary(): FrequencyDictionary;
21
21
 
22
- declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
22
+ declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
23
23
 
24
24
  export { type DictionaryEntry, type FrequencyDictionary, type KhmerDictionary, createDictionary, getDefaultDictionary, loadFrequencyDictionary };
@@ -19,6 +19,6 @@ interface FrequencyDictionary {
19
19
  }
20
20
  declare function loadFrequencyDictionary(): FrequencyDictionary;
21
21
 
22
- declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
22
+ declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
23
23
 
24
24
  export { type DictionaryEntry, type FrequencyDictionary, type KhmerDictionary, createDictionary, getDefaultDictionary, loadFrequencyDictionary };
@@ -59,15 +59,19 @@ var MemoryDictionary = class {
59
59
  this.trie = new Trie();
60
60
  this.reverseTrie = new Trie();
61
61
  this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
62
- let count = 0;
62
+ const uniqueWords = /* @__PURE__ */ new Set();
63
63
  for (const word of words) {
64
+ if (word.length > 0) {
65
+ uniqueWords.add(word);
66
+ }
67
+ }
68
+ for (const word of uniqueWords) {
64
69
  if (word.length > 0) {
65
70
  this.trie.insert(word);
66
71
  this.reverseTrie.insert([...word].reverse().join(""));
67
- count++;
68
72
  }
69
73
  }
70
- this.size = count;
74
+ this.size = uniqueWords.size;
71
75
  }
72
76
  has(word) {
73
77
  return this.trie.has(word);
@@ -196562,14 +196566,28 @@ function getDefaultDictionary() {
196562
196566
  var cached = null;
196563
196567
  function loadFrequencyDictionary() {
196564
196568
  if (!cached) {
196565
- const entries = khmer_words_default;
196566
- const words = entries.map((e) => e.word);
196569
+ const entries = Object.freeze(
196570
+ khmer_words_default.map(
196571
+ (entry) => Object.freeze({
196572
+ word: entry.word,
196573
+ freq: entry.freq
196574
+ })
196575
+ )
196576
+ );
196577
+ const words = Object.freeze(entries.map((entry) => entry.word));
196567
196578
  const frequencies = new Map(
196568
- entries.map((e) => [e.word, e.freq])
196579
+ entries.map((entry) => [entry.word, entry.freq])
196569
196580
  );
196570
- cached = { words, entries, frequencies };
196581
+ cached = Object.freeze({ words, entries, frequencies });
196571
196582
  }
196572
- return cached;
196583
+ return {
196584
+ words: [...cached.words],
196585
+ entries: cached.entries.map((entry) => ({
196586
+ word: entry.word,
196587
+ freq: entry.freq
196588
+ })),
196589
+ frequencies: new Map(cached.frequencies)
196590
+ };
196573
196591
  }
196574
196592
  export {
196575
196593
  createDictionary,
package/dist/index.cjs CHANGED
@@ -447,15 +447,19 @@ var MemoryDictionary = class {
447
447
  this.trie = new Trie();
448
448
  this.reverseTrie = new Trie();
449
449
  this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
450
- let count = 0;
450
+ const uniqueWords = /* @__PURE__ */ new Set();
451
451
  for (const word of words) {
452
+ if (word.length > 0) {
453
+ uniqueWords.add(word);
454
+ }
455
+ }
456
+ for (const word of uniqueWords) {
452
457
  if (word.length > 0) {
453
458
  this.trie.insert(word);
454
459
  this.reverseTrie.insert([...word].reverse().join(""));
455
- count++;
456
460
  }
457
461
  }
458
- this.size = count;
462
+ this.size = uniqueWords.size;
459
463
  }
460
464
  has(word) {
461
465
  return this.trie.has(word);
package/dist/index.d.cts CHANGED
@@ -14,7 +14,9 @@ declare function getClusterBoundaries(text: string): Array<{
14
14
 
15
15
  interface SegmentToken {
16
16
  value: string;
17
+ /** Zero-based start offset into `SegmentResult.normalized`. */
17
18
  start: number;
19
+ /** Zero-based exclusive end offset into `SegmentResult.normalized`. */
18
20
  end: number;
19
21
  isKnown: boolean;
20
22
  }
@@ -25,6 +27,7 @@ interface SegmentOptions {
25
27
  }
26
28
  interface SegmentResult {
27
29
  original: string;
30
+ /** Normalized text used to compute token boundaries and offsets. */
28
31
  normalized: string;
29
32
  tokens: SegmentToken[];
30
33
  }
@@ -49,6 +52,6 @@ interface KhmerDictionary {
49
52
 
50
53
  declare function segmentWords(text: string, options?: SegmentOptions): SegmentResult;
51
54
 
52
- declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
55
+ declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
53
56
 
54
57
  export { type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
package/dist/index.d.ts CHANGED
@@ -14,7 +14,9 @@ declare function getClusterBoundaries(text: string): Array<{
14
14
 
15
15
  interface SegmentToken {
16
16
  value: string;
17
+ /** Zero-based start offset into `SegmentResult.normalized`. */
17
18
  start: number;
19
+ /** Zero-based exclusive end offset into `SegmentResult.normalized`. */
18
20
  end: number;
19
21
  isKnown: boolean;
20
22
  }
@@ -25,6 +27,7 @@ interface SegmentOptions {
25
27
  }
26
28
  interface SegmentResult {
27
29
  original: string;
30
+ /** Normalized text used to compute token boundaries and offsets. */
28
31
  normalized: string;
29
32
  tokens: SegmentToken[];
30
33
  }
@@ -49,6 +52,6 @@ interface KhmerDictionary {
49
52
 
50
53
  declare function segmentWords(text: string, options?: SegmentOptions): SegmentResult;
51
54
 
52
- declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
55
+ declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
53
56
 
54
57
  export { type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
package/dist/index.js CHANGED
@@ -412,15 +412,19 @@ var MemoryDictionary = class {
412
412
  this.trie = new Trie();
413
413
  this.reverseTrie = new Trie();
414
414
  this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
415
- let count = 0;
415
+ const uniqueWords = /* @__PURE__ */ new Set();
416
416
  for (const word of words) {
417
+ if (word.length > 0) {
418
+ uniqueWords.add(word);
419
+ }
420
+ }
421
+ for (const word of uniqueWords) {
417
422
  if (word.length > 0) {
418
423
  this.trie.insert(word);
419
424
  this.reverseTrie.insert([...word].reverse().join(""));
420
- count++;
421
425
  }
422
426
  }
423
- this.size = count;
427
+ this.size = uniqueWords.size;
424
428
  }
425
429
  has(word) {
426
430
  return this.trie.has(word);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "khmer-segment",
3
- "version": "0.2.0",
3
+ "version": "0.2.2",
4
4
  "description": "Khmer text segmentation, normalization, and cluster utilities for JavaScript and TypeScript.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -27,11 +27,12 @@
27
27
  "build": "tsup",
28
28
  "dev": "tsup --watch",
29
29
  "test": "vitest run",
30
+ "test:perf": "vitest run --config vitest.perf.config.ts",
30
31
  "test:watch": "vitest",
31
32
  "lint": "tsc --noEmit",
32
33
  "format": "prettier --write .",
33
34
  "format:check": "prettier --check .",
34
- "prepublishOnly": "npm run build && npm run test && npm run lint",
35
+ "prepublishOnly": "npm run build && npm run test && npm run lint && npm run format:check",
35
36
  "playground:dev": "npm run dev --prefix playground",
36
37
  "playground:build": "npm run build --prefix playground"
37
38
  },