khmer-segment 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -6
- package/dist/dictionary/index.cjs +26 -8
- package/dist/dictionary/index.d.cts +1 -1
- package/dist/dictionary/index.d.ts +1 -1
- package/dist/dictionary/index.js +26 -8
- package/dist/index.cjs +7 -3
- package/dist/index.d.cts +4 -1
- package/dist/index.d.ts +4 -1
- package/dist/index.js +7 -3
- package/package.json +3 -2
package/README.md
CHANGED
|
@@ -106,12 +106,14 @@ interface SegmentResult {
|
|
|
106
106
|
|
|
107
107
|
interface SegmentToken {
|
|
108
108
|
value: string;
|
|
109
|
-
start: number;
|
|
110
|
-
end: number;
|
|
109
|
+
start: number; // zero-based offset into result.normalized
|
|
110
|
+
end: number; // exclusive offset into result.normalized
|
|
111
111
|
isKnown: boolean;
|
|
112
112
|
}
|
|
113
113
|
```
|
|
114
114
|
|
|
115
|
+
When normalization is enabled, token offsets always refer to `result.normalized`. Invisible characters such as ZWS/ZWJ/BOM may be stripped during normalization, so offsets may not line up with the original input string.
|
|
116
|
+
|
|
115
117
|
### Dictionary
|
|
116
118
|
|
|
117
119
|
| Function | Description |
|
|
@@ -124,7 +126,7 @@ const dict = createDictionary(['សួស្តី', 'អ្នក', 'ខ្ម
|
|
|
124
126
|
dict.has('សួស្តី'); // true
|
|
125
127
|
dict.hasPrefix!('សួ'); // true (trie-based O(k) lookup)
|
|
126
128
|
dict.hasSuffix!('ី'); // true
|
|
127
|
-
dict.size; // 3
|
|
129
|
+
dict.size; // 3 unique words
|
|
128
130
|
```
|
|
129
131
|
|
|
130
132
|
#### `KhmerDictionary` interface
|
|
@@ -166,6 +168,8 @@ console.log(freqData.frequencies.get('ជា')); // 701541
|
|
|
166
168
|
|
|
167
169
|
This is a **separate import** — the core `khmer-segment` package stays small (~11KB). The dictionary module is ~3.9MB. Only import the dictionary when you need it.
|
|
168
170
|
|
|
171
|
+
`loadFrequencyDictionary()` builds its return value from cached dictionary data, but each call returns fresh arrays and a fresh `Map`. You can safely extend or mutate the returned data without affecting later calls.
|
|
172
|
+
|
|
169
173
|
---
|
|
170
174
|
|
|
171
175
|
## How It Works
|
|
@@ -295,16 +299,23 @@ No framework-specific code in the core. Tree-shakeable with `sideEffects: false`
|
|
|
295
299
|
- `segmentWords` with FMM
|
|
296
300
|
- Default dictionary (34K+ words, separate import)
|
|
297
301
|
|
|
298
|
-
### v0.2.1 (current)
|
|
302
|
+
### v0.2.1
|
|
299
303
|
|
|
300
304
|
- BMM (Backward Maximum Matching) algorithm
|
|
301
305
|
- BiMM (Bidirectional Maximum Matching) algorithm
|
|
302
306
|
- Digit grouping (consecutive Khmer digits merged into single tokens)
|
|
303
307
|
- Fixed normalization for MUUSIKATOAN (៉) and TRIISAP (៊) — shift signs now placed before vowels
|
|
304
308
|
- Fixed Unicode range constants (NIKAHIT, REAHMUK, YUUKEALAKHMOU are signs, not vowels)
|
|
305
|
-
- 149 tests
|
|
306
309
|
- Rebuilt dictionary with 49,113 words (merged from 10 sources)
|
|
307
310
|
|
|
311
|
+
### v0.2.2 (current)
|
|
312
|
+
|
|
313
|
+
- Clarified that token offsets are measured against `result.normalized`
|
|
314
|
+
- Expanded Vitest coverage across normalization, dictionary, and segmentation behavior
|
|
315
|
+
- Made `loadFrequencyDictionary()` safe to reuse across calls without shared-state pollution
|
|
316
|
+
- Corrected custom dictionary `size` to report unique non-empty words
|
|
317
|
+
- Added changelog, CI checks, and stricter prepublish formatting verification
|
|
318
|
+
|
|
308
319
|
### v0.3.0
|
|
309
320
|
|
|
310
321
|
- `deleteBackward(text, cursorIndex)` — cluster-safe backspace
|
|
@@ -327,6 +338,7 @@ No framework-specific code in the core. Tree-shakeable with `sideEffects: false`
|
|
|
327
338
|
npm install # install dependencies
|
|
328
339
|
npm run build # build with tsup (ESM + CJS + types)
|
|
329
340
|
npm test # run vitest
|
|
341
|
+
npm run test:perf # optional performance-focused checks
|
|
330
342
|
npm run test:watch # watch mode
|
|
331
343
|
npm run lint # TypeScript type check
|
|
332
344
|
```
|
|
@@ -338,7 +350,8 @@ npm run lint # TypeScript type check
|
|
|
338
350
|
### Automated Tests
|
|
339
351
|
|
|
340
352
|
```bash
|
|
341
|
-
npm test # run
|
|
353
|
+
npm test # run the main Vitest correctness suite
|
|
354
|
+
npm run test:perf # optional performance-focused checks
|
|
342
355
|
npm run test:watch # watch mode — re-runs on changes
|
|
343
356
|
npm run lint # TypeScript type check
|
|
344
357
|
```
|
|
@@ -87,15 +87,19 @@ var MemoryDictionary = class {
|
|
|
87
87
|
this.trie = new Trie();
|
|
88
88
|
this.reverseTrie = new Trie();
|
|
89
89
|
this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
|
|
90
|
-
|
|
90
|
+
const uniqueWords = /* @__PURE__ */ new Set();
|
|
91
91
|
for (const word of words) {
|
|
92
|
+
if (word.length > 0) {
|
|
93
|
+
uniqueWords.add(word);
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
for (const word of uniqueWords) {
|
|
92
97
|
if (word.length > 0) {
|
|
93
98
|
this.trie.insert(word);
|
|
94
99
|
this.reverseTrie.insert([...word].reverse().join(""));
|
|
95
|
-
count++;
|
|
96
100
|
}
|
|
97
101
|
}
|
|
98
|
-
this.size = count;
|
|
102
|
+
this.size = uniqueWords.size;
|
|
99
103
|
}
|
|
100
104
|
has(word) {
|
|
101
105
|
return this.trie.has(word);
|
|
@@ -196590,14 +196594,28 @@ function getDefaultDictionary() {
|
|
|
196590
196594
|
var cached = null;
|
|
196591
196595
|
function loadFrequencyDictionary() {
|
|
196592
196596
|
if (!cached) {
|
|
196593
|
-
const entries = khmer_words_default;
|
|
196594
|
-
|
|
196597
|
+
const entries = Object.freeze(
|
|
196598
|
+
khmer_words_default.map(
|
|
196599
|
+
(entry) => Object.freeze({
|
|
196600
|
+
word: entry.word,
|
|
196601
|
+
freq: entry.freq
|
|
196602
|
+
})
|
|
196603
|
+
)
|
|
196604
|
+
);
|
|
196605
|
+
const words = Object.freeze(entries.map((entry) => entry.word));
|
|
196595
196606
|
const frequencies = new Map(
|
|
196596
|
-
entries.map((
|
|
196607
|
+
entries.map((entry) => [entry.word, entry.freq])
|
|
196597
196608
|
);
|
|
196598
|
-
cached = { words, entries, frequencies };
|
|
196609
|
+
cached = Object.freeze({ words, entries, frequencies });
|
|
196599
196610
|
}
|
|
196600
|
-
return cached;
|
|
196611
|
+
return {
|
|
196612
|
+
words: [...cached.words],
|
|
196613
|
+
entries: cached.entries.map((entry) => ({
|
|
196614
|
+
word: entry.word,
|
|
196615
|
+
freq: entry.freq
|
|
196616
|
+
})),
|
|
196617
|
+
frequencies: new Map(cached.frequencies)
|
|
196618
|
+
};
|
|
196601
196619
|
}
|
|
196602
196620
|
// Annotate the CommonJS export names for ESM import in node:
|
|
196603
196621
|
0 && (module.exports = {
|
|
@@ -19,6 +19,6 @@ interface FrequencyDictionary {
|
|
|
19
19
|
}
|
|
20
20
|
declare function loadFrequencyDictionary(): FrequencyDictionary;
|
|
21
21
|
|
|
22
|
-
declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
|
|
22
|
+
declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
|
|
23
23
|
|
|
24
24
|
export { type DictionaryEntry, type FrequencyDictionary, type KhmerDictionary, createDictionary, getDefaultDictionary, loadFrequencyDictionary };
|
|
@@ -19,6 +19,6 @@ interface FrequencyDictionary {
|
|
|
19
19
|
}
|
|
20
20
|
declare function loadFrequencyDictionary(): FrequencyDictionary;
|
|
21
21
|
|
|
22
|
-
declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
|
|
22
|
+
declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
|
|
23
23
|
|
|
24
24
|
export { type DictionaryEntry, type FrequencyDictionary, type KhmerDictionary, createDictionary, getDefaultDictionary, loadFrequencyDictionary };
|
package/dist/dictionary/index.js
CHANGED
|
@@ -59,15 +59,19 @@ var MemoryDictionary = class {
|
|
|
59
59
|
this.trie = new Trie();
|
|
60
60
|
this.reverseTrie = new Trie();
|
|
61
61
|
this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
|
|
62
|
-
|
|
62
|
+
const uniqueWords = /* @__PURE__ */ new Set();
|
|
63
63
|
for (const word of words) {
|
|
64
|
+
if (word.length > 0) {
|
|
65
|
+
uniqueWords.add(word);
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
for (const word of uniqueWords) {
|
|
64
69
|
if (word.length > 0) {
|
|
65
70
|
this.trie.insert(word);
|
|
66
71
|
this.reverseTrie.insert([...word].reverse().join(""));
|
|
67
|
-
count++;
|
|
68
72
|
}
|
|
69
73
|
}
|
|
70
|
-
this.size = count;
|
|
74
|
+
this.size = uniqueWords.size;
|
|
71
75
|
}
|
|
72
76
|
has(word) {
|
|
73
77
|
return this.trie.has(word);
|
|
@@ -196562,14 +196566,28 @@ function getDefaultDictionary() {
|
|
|
196562
196566
|
var cached = null;
|
|
196563
196567
|
function loadFrequencyDictionary() {
|
|
196564
196568
|
if (!cached) {
|
|
196565
|
-
const entries = khmer_words_default;
|
|
196566
|
-
|
|
196569
|
+
const entries = Object.freeze(
|
|
196570
|
+
khmer_words_default.map(
|
|
196571
|
+
(entry) => Object.freeze({
|
|
196572
|
+
word: entry.word,
|
|
196573
|
+
freq: entry.freq
|
|
196574
|
+
})
|
|
196575
|
+
)
|
|
196576
|
+
);
|
|
196577
|
+
const words = Object.freeze(entries.map((entry) => entry.word));
|
|
196567
196578
|
const frequencies = new Map(
|
|
196568
|
-
entries.map((
|
|
196579
|
+
entries.map((entry) => [entry.word, entry.freq])
|
|
196569
196580
|
);
|
|
196570
|
-
cached = { words, entries, frequencies };
|
|
196581
|
+
cached = Object.freeze({ words, entries, frequencies });
|
|
196571
196582
|
}
|
|
196572
|
-
return cached;
|
|
196583
|
+
return {
|
|
196584
|
+
words: [...cached.words],
|
|
196585
|
+
entries: cached.entries.map((entry) => ({
|
|
196586
|
+
word: entry.word,
|
|
196587
|
+
freq: entry.freq
|
|
196588
|
+
})),
|
|
196589
|
+
frequencies: new Map(cached.frequencies)
|
|
196590
|
+
};
|
|
196573
196591
|
}
|
|
196574
196592
|
export {
|
|
196575
196593
|
createDictionary,
|
package/dist/index.cjs
CHANGED
|
@@ -447,15 +447,19 @@ var MemoryDictionary = class {
|
|
|
447
447
|
this.trie = new Trie();
|
|
448
448
|
this.reverseTrie = new Trie();
|
|
449
449
|
this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
|
|
450
|
-
|
|
450
|
+
const uniqueWords = /* @__PURE__ */ new Set();
|
|
451
451
|
for (const word of words) {
|
|
452
|
+
if (word.length > 0) {
|
|
453
|
+
uniqueWords.add(word);
|
|
454
|
+
}
|
|
455
|
+
}
|
|
456
|
+
for (const word of uniqueWords) {
|
|
452
457
|
if (word.length > 0) {
|
|
453
458
|
this.trie.insert(word);
|
|
454
459
|
this.reverseTrie.insert([...word].reverse().join(""));
|
|
455
|
-
count++;
|
|
456
460
|
}
|
|
457
461
|
}
|
|
458
|
-
this.size = count;
|
|
462
|
+
this.size = uniqueWords.size;
|
|
459
463
|
}
|
|
460
464
|
has(word) {
|
|
461
465
|
return this.trie.has(word);
|
package/dist/index.d.cts
CHANGED
|
@@ -14,7 +14,9 @@ declare function getClusterBoundaries(text: string): Array<{
|
|
|
14
14
|
|
|
15
15
|
interface SegmentToken {
|
|
16
16
|
value: string;
|
|
17
|
+
/** Zero-based start offset into `SegmentResult.normalized`. */
|
|
17
18
|
start: number;
|
|
19
|
+
/** Zero-based exclusive end offset into `SegmentResult.normalized`. */
|
|
18
20
|
end: number;
|
|
19
21
|
isKnown: boolean;
|
|
20
22
|
}
|
|
@@ -25,6 +27,7 @@ interface SegmentOptions {
|
|
|
25
27
|
}
|
|
26
28
|
interface SegmentResult {
|
|
27
29
|
original: string;
|
|
30
|
+
/** Normalized text used to compute token boundaries and offsets. */
|
|
28
31
|
normalized: string;
|
|
29
32
|
tokens: SegmentToken[];
|
|
30
33
|
}
|
|
@@ -49,6 +52,6 @@ interface KhmerDictionary {
|
|
|
49
52
|
|
|
50
53
|
declare function segmentWords(text: string, options?: SegmentOptions): SegmentResult;
|
|
51
54
|
|
|
52
|
-
declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
|
|
55
|
+
declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
|
|
53
56
|
|
|
54
57
|
export { type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
|
package/dist/index.d.ts
CHANGED
|
@@ -14,7 +14,9 @@ declare function getClusterBoundaries(text: string): Array<{
|
|
|
14
14
|
|
|
15
15
|
interface SegmentToken {
|
|
16
16
|
value: string;
|
|
17
|
+
/** Zero-based start offset into `SegmentResult.normalized`. */
|
|
17
18
|
start: number;
|
|
19
|
+
/** Zero-based exclusive end offset into `SegmentResult.normalized`. */
|
|
18
20
|
end: number;
|
|
19
21
|
isKnown: boolean;
|
|
20
22
|
}
|
|
@@ -25,6 +27,7 @@ interface SegmentOptions {
|
|
|
25
27
|
}
|
|
26
28
|
interface SegmentResult {
|
|
27
29
|
original: string;
|
|
30
|
+
/** Normalized text used to compute token boundaries and offsets. */
|
|
28
31
|
normalized: string;
|
|
29
32
|
tokens: SegmentToken[];
|
|
30
33
|
}
|
|
@@ -49,6 +52,6 @@ interface KhmerDictionary {
|
|
|
49
52
|
|
|
50
53
|
declare function segmentWords(text: string, options?: SegmentOptions): SegmentResult;
|
|
51
54
|
|
|
52
|
-
declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
|
|
55
|
+
declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
|
|
53
56
|
|
|
54
57
|
export { type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
|
package/dist/index.js
CHANGED
|
@@ -412,15 +412,19 @@ var MemoryDictionary = class {
|
|
|
412
412
|
this.trie = new Trie();
|
|
413
413
|
this.reverseTrie = new Trie();
|
|
414
414
|
this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
|
|
415
|
-
|
|
415
|
+
const uniqueWords = /* @__PURE__ */ new Set();
|
|
416
416
|
for (const word of words) {
|
|
417
|
+
if (word.length > 0) {
|
|
418
|
+
uniqueWords.add(word);
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
for (const word of uniqueWords) {
|
|
417
422
|
if (word.length > 0) {
|
|
418
423
|
this.trie.insert(word);
|
|
419
424
|
this.reverseTrie.insert([...word].reverse().join(""));
|
|
420
|
-
count++;
|
|
421
425
|
}
|
|
422
426
|
}
|
|
423
|
-
this.size = count;
|
|
427
|
+
this.size = uniqueWords.size;
|
|
424
428
|
}
|
|
425
429
|
has(word) {
|
|
426
430
|
return this.trie.has(word);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "khmer-segment",
|
|
3
|
-
"version": "0.2.1",
|
|
3
|
+
"version": "0.2.2",
|
|
4
4
|
"description": "Khmer text segmentation, normalization, and cluster utilities for JavaScript and TypeScript.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.cjs",
|
|
@@ -27,11 +27,12 @@
|
|
|
27
27
|
"build": "tsup",
|
|
28
28
|
"dev": "tsup --watch",
|
|
29
29
|
"test": "vitest run",
|
|
30
|
+
"test:perf": "vitest run --config vitest.perf.config.ts",
|
|
30
31
|
"test:watch": "vitest",
|
|
31
32
|
"lint": "tsc --noEmit",
|
|
32
33
|
"format": "prettier --write .",
|
|
33
34
|
"format:check": "prettier --check .",
|
|
34
|
-
"prepublishOnly": "npm run build && npm run test && npm run lint",
|
|
35
|
+
"prepublishOnly": "npm run build && npm run test && npm run lint && npm run format:check",
|
|
35
36
|
"playground:dev": "npm run dev --prefix playground",
|
|
36
37
|
"playground:build": "npm run build --prefix playground"
|
|
37
38
|
},
|