khmer-segment 0.2.1 → 0.2.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/README.md CHANGED
@@ -106,12 +106,14 @@ interface SegmentResult {
106
106
 
107
107
  interface SegmentToken {
108
108
  value: string;
109
- start: number;
110
- end: number;
109
+ start: number; // zero-based offset into result.normalized
110
+ end: number; // exclusive offset into result.normalized
111
111
  isKnown: boolean;
112
112
  }
113
113
  ```
114
114
 
115
+ When normalization is enabled, token offsets always refer to `result.normalized`. Invisible characters such as ZWS/ZWJ/BOM may be stripped during normalization, so offsets may not line up with the original input string.
116
+
115
117
  ### Dictionary
116
118
 
117
119
  | Function | Description |
@@ -124,7 +126,7 @@ const dict = createDictionary(['សួស្តី', 'អ្នក', 'ខ្ម
124
126
  dict.has('សួស្តី'); // true
125
127
  dict.hasPrefix!('សួ'); // true (trie-based O(k) lookup)
126
128
  dict.hasSuffix!('ី'); // true
127
- dict.size; // 3
129
+ dict.size; // 3 unique words
128
130
  ```
129
131
 
130
132
  #### `KhmerDictionary` interface
@@ -166,6 +168,8 @@ console.log(freqData.frequencies.get('ជា')); // 701541
166
168
 
167
169
  This is a **separate import** — the core `khmer-segment` package stays small (~11KB). The dictionary module is ~3.9MB. Only import the dictionary when you need it.
168
170
 
171
+ `loadFrequencyDictionary()` builds its return value from cached dictionary data, but each call returns fresh arrays and a fresh `Map`. You can safely extend or mutate the returned data without affecting later calls.
172
+
169
173
  ---
170
174
 
171
175
  ## How It Works
@@ -295,16 +299,23 @@ No framework-specific code in the core. Tree-shakeable with `sideEffects: false`
295
299
  - `segmentWords` with FMM
296
300
  - Default dictionary (34K+ words, separate import)
297
301
 
298
- ### v0.2.0 (current)
302
+ ### v0.2.1
299
303
 
300
304
  - BMM (Backward Maximum Matching) algorithm
301
305
  - BiMM (Bidirectional Maximum Matching) algorithm
302
306
  - Digit grouping (consecutive Khmer digits merged into single tokens)
303
307
  - Fixed normalization for MUUSIKATOAN (៉) and TRIISAP (៊) — shift signs now placed before vowels
304
308
  - Fixed Unicode range constants (NIKAHIT, REAHMUK, YUUKEALAKHMOU are signs, not vowels)
305
- - 149 tests
306
309
  - Rebuilt dictionary with 49,113 words (merged from 10 sources)
307
310
 
311
+ ### v0.2.2 (current)
312
+
313
+ - Clarified that token offsets are measured against `result.normalized`
314
+ - Expanded Vitest coverage across normalization, dictionary, and segmentation behavior
315
+ - Made `loadFrequencyDictionary()` safe to reuse across calls without shared-state pollution
316
+ - Corrected custom dictionary `size` to report unique non-empty words
317
+ - Added changelog, CI checks, and stricter prepublish formatting verification
318
+
308
319
  ### v0.3.0
309
320
 
310
321
  - `deleteBackward(text, cursorIndex)` — cluster-safe backspace
@@ -327,6 +338,7 @@ No framework-specific code in the core. Tree-shakeable with `sideEffects: false`
327
338
  npm install # install dependencies
328
339
  npm run build # build with tsup (ESM + CJS + types)
329
340
  npm test # run vitest
341
+ npm run test:perf # optional performance-focused checks
330
342
  npm run test:watch # watch mode
331
343
  npm run lint # TypeScript type check
332
344
  ```
@@ -338,7 +350,8 @@ npm run lint # TypeScript type check
338
350
  ### Automated Tests
339
351
 
340
352
  ```bash
341
- npm test # run 149 tests with vitest
353
+ npm test # run the main Vitest correctness suite
354
+ npm run test:perf # optional performance-focused checks
342
355
  npm run test:watch # watch mode — re-runs on changes
343
356
  npm run lint # TypeScript type check
344
357
  ```
@@ -87,15 +87,19 @@ var MemoryDictionary = class {
87
87
  this.trie = new Trie();
88
88
  this.reverseTrie = new Trie();
89
89
  this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
90
- let count = 0;
90
+ const uniqueWords = /* @__PURE__ */ new Set();
91
91
  for (const word of words) {
92
+ if (word.length > 0) {
93
+ uniqueWords.add(word);
94
+ }
95
+ }
96
+ for (const word of uniqueWords) {
92
97
  if (word.length > 0) {
93
98
  this.trie.insert(word);
94
99
  this.reverseTrie.insert([...word].reverse().join(""));
95
- count++;
96
100
  }
97
101
  }
98
- this.size = count;
102
+ this.size = uniqueWords.size;
99
103
  }
100
104
  has(word) {
101
105
  return this.trie.has(word);
@@ -196590,14 +196594,28 @@ function getDefaultDictionary() {
196590
196594
  var cached = null;
196591
196595
  function loadFrequencyDictionary() {
196592
196596
  if (!cached) {
196593
- const entries = khmer_words_default;
196594
- const words = entries.map((e) => e.word);
196597
+ const entries = Object.freeze(
196598
+ khmer_words_default.map(
196599
+ (entry) => Object.freeze({
196600
+ word: entry.word,
196601
+ freq: entry.freq
196602
+ })
196603
+ )
196604
+ );
196605
+ const words = Object.freeze(entries.map((entry) => entry.word));
196595
196606
  const frequencies = new Map(
196596
- entries.map((e) => [e.word, e.freq])
196607
+ entries.map((entry) => [entry.word, entry.freq])
196597
196608
  );
196598
- cached = { words, entries, frequencies };
196609
+ cached = Object.freeze({ words, entries, frequencies });
196599
196610
  }
196600
- return cached;
196611
+ return {
196612
+ words: [...cached.words],
196613
+ entries: cached.entries.map((entry) => ({
196614
+ word: entry.word,
196615
+ freq: entry.freq
196616
+ })),
196617
+ frequencies: new Map(cached.frequencies)
196618
+ };
196601
196619
  }
196602
196620
  // Annotate the CommonJS export names for ESM import in node:
196603
196621
  0 && (module.exports = {
@@ -19,6 +19,6 @@ interface FrequencyDictionary {
19
19
  }
20
20
  declare function loadFrequencyDictionary(): FrequencyDictionary;
21
21
 
22
- declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
22
+ declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
23
23
 
24
24
  export { type DictionaryEntry, type FrequencyDictionary, type KhmerDictionary, createDictionary, getDefaultDictionary, loadFrequencyDictionary };
@@ -19,6 +19,6 @@ interface FrequencyDictionary {
19
19
  }
20
20
  declare function loadFrequencyDictionary(): FrequencyDictionary;
21
21
 
22
- declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
22
+ declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
23
23
 
24
24
  export { type DictionaryEntry, type FrequencyDictionary, type KhmerDictionary, createDictionary, getDefaultDictionary, loadFrequencyDictionary };
@@ -59,15 +59,19 @@ var MemoryDictionary = class {
59
59
  this.trie = new Trie();
60
60
  this.reverseTrie = new Trie();
61
61
  this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
62
- let count = 0;
62
+ const uniqueWords = /* @__PURE__ */ new Set();
63
63
  for (const word of words) {
64
+ if (word.length > 0) {
65
+ uniqueWords.add(word);
66
+ }
67
+ }
68
+ for (const word of uniqueWords) {
64
69
  if (word.length > 0) {
65
70
  this.trie.insert(word);
66
71
  this.reverseTrie.insert([...word].reverse().join(""));
67
- count++;
68
72
  }
69
73
  }
70
- this.size = count;
74
+ this.size = uniqueWords.size;
71
75
  }
72
76
  has(word) {
73
77
  return this.trie.has(word);
@@ -196562,14 +196566,28 @@ function getDefaultDictionary() {
196562
196566
  var cached = null;
196563
196567
  function loadFrequencyDictionary() {
196564
196568
  if (!cached) {
196565
- const entries = khmer_words_default;
196566
- const words = entries.map((e) => e.word);
196569
+ const entries = Object.freeze(
196570
+ khmer_words_default.map(
196571
+ (entry) => Object.freeze({
196572
+ word: entry.word,
196573
+ freq: entry.freq
196574
+ })
196575
+ )
196576
+ );
196577
+ const words = Object.freeze(entries.map((entry) => entry.word));
196567
196578
  const frequencies = new Map(
196568
- entries.map((e) => [e.word, e.freq])
196579
+ entries.map((entry) => [entry.word, entry.freq])
196569
196580
  );
196570
- cached = { words, entries, frequencies };
196581
+ cached = Object.freeze({ words, entries, frequencies });
196571
196582
  }
196572
- return cached;
196583
+ return {
196584
+ words: [...cached.words],
196585
+ entries: cached.entries.map((entry) => ({
196586
+ word: entry.word,
196587
+ freq: entry.freq
196588
+ })),
196589
+ frequencies: new Map(cached.frequencies)
196590
+ };
196573
196591
  }
196574
196592
  export {
196575
196593
  createDictionary,
package/dist/index.cjs CHANGED
@@ -447,15 +447,19 @@ var MemoryDictionary = class {
447
447
  this.trie = new Trie();
448
448
  this.reverseTrie = new Trie();
449
449
  this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
450
- let count = 0;
450
+ const uniqueWords = /* @__PURE__ */ new Set();
451
451
  for (const word of words) {
452
+ if (word.length > 0) {
453
+ uniqueWords.add(word);
454
+ }
455
+ }
456
+ for (const word of uniqueWords) {
452
457
  if (word.length > 0) {
453
458
  this.trie.insert(word);
454
459
  this.reverseTrie.insert([...word].reverse().join(""));
455
- count++;
456
460
  }
457
461
  }
458
- this.size = count;
462
+ this.size = uniqueWords.size;
459
463
  }
460
464
  has(word) {
461
465
  return this.trie.has(word);
package/dist/index.d.cts CHANGED
@@ -14,7 +14,9 @@ declare function getClusterBoundaries(text: string): Array<{
14
14
 
15
15
  interface SegmentToken {
16
16
  value: string;
17
+ /** Zero-based start offset into `SegmentResult.normalized`. */
17
18
  start: number;
19
+ /** Zero-based exclusive end offset into `SegmentResult.normalized`. */
18
20
  end: number;
19
21
  isKnown: boolean;
20
22
  }
@@ -25,6 +27,7 @@ interface SegmentOptions {
25
27
  }
26
28
  interface SegmentResult {
27
29
  original: string;
30
+ /** Normalized text used to compute token boundaries and offsets. */
28
31
  normalized: string;
29
32
  tokens: SegmentToken[];
30
33
  }
@@ -49,6 +52,6 @@ interface KhmerDictionary {
49
52
 
50
53
  declare function segmentWords(text: string, options?: SegmentOptions): SegmentResult;
51
54
 
52
- declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
55
+ declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
53
56
 
54
57
  export { type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
package/dist/index.d.ts CHANGED
@@ -14,7 +14,9 @@ declare function getClusterBoundaries(text: string): Array<{
14
14
 
15
15
  interface SegmentToken {
16
16
  value: string;
17
+ /** Zero-based start offset into `SegmentResult.normalized`. */
17
18
  start: number;
19
+ /** Zero-based exclusive end offset into `SegmentResult.normalized`. */
18
20
  end: number;
19
21
  isKnown: boolean;
20
22
  }
@@ -25,6 +27,7 @@ interface SegmentOptions {
25
27
  }
26
28
  interface SegmentResult {
27
29
  original: string;
30
+ /** Normalized text used to compute token boundaries and offsets. */
28
31
  normalized: string;
29
32
  tokens: SegmentToken[];
30
33
  }
@@ -49,6 +52,6 @@ interface KhmerDictionary {
49
52
 
50
53
  declare function segmentWords(text: string, options?: SegmentOptions): SegmentResult;
51
54
 
52
- declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
55
+ declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
53
56
 
54
57
  export { type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
package/dist/index.js CHANGED
@@ -412,15 +412,19 @@ var MemoryDictionary = class {
412
412
  this.trie = new Trie();
413
413
  this.reverseTrie = new Trie();
414
414
  this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
415
- let count = 0;
415
+ const uniqueWords = /* @__PURE__ */ new Set();
416
416
  for (const word of words) {
417
+ if (word.length > 0) {
418
+ uniqueWords.add(word);
419
+ }
420
+ }
421
+ for (const word of uniqueWords) {
417
422
  if (word.length > 0) {
418
423
  this.trie.insert(word);
419
424
  this.reverseTrie.insert([...word].reverse().join(""));
420
- count++;
421
425
  }
422
426
  }
423
- this.size = count;
427
+ this.size = uniqueWords.size;
424
428
  }
425
429
  has(word) {
426
430
  return this.trie.has(word);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "khmer-segment",
3
- "version": "0.2.1",
3
+ "version": "0.2.2",
4
4
  "description": "Khmer text segmentation, normalization, and cluster utilities for JavaScript and TypeScript.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -27,11 +27,12 @@
27
27
  "build": "tsup",
28
28
  "dev": "tsup --watch",
29
29
  "test": "vitest run",
30
+ "test:perf": "vitest run --config vitest.perf.config.ts",
30
31
  "test:watch": "vitest",
31
32
  "lint": "tsc --noEmit",
32
33
  "format": "prettier --write .",
33
34
  "format:check": "prettier --check .",
34
- "prepublishOnly": "npm run build && npm run test && npm run lint",
35
+ "prepublishOnly": "npm run build && npm run test && npm run lint && npm run format:check",
35
36
  "playground:dev": "npm run dev --prefix playground",
36
37
  "playground:build": "npm run build --prefix playground"
37
38
  },