khmer-segment 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,5 +1,3 @@
1
-
2
-
3
1
  # khmer-segment
4
2
 
5
3
  A framework-agnostic Khmer text processing library for JavaScript and TypeScript.
@@ -23,6 +21,7 @@ npm install khmer-segment
23
21
  ```ts
24
22
  import {
25
23
  containsKhmer,
24
+ isKhmerText,
26
25
  normalizeKhmer,
27
26
  splitClusters,
28
27
  countClusters,
@@ -59,41 +58,33 @@ console.log(result.tokens);
59
58
 
60
59
  ### Detection
61
60
 
62
-
63
61
  | Function | Description |
64
62
  | --------------------- | --------------------------------------------------------- |
65
63
  | `isKhmerChar(char)` | Returns `true` if the character is a Khmer code point |
66
64
  | `containsKhmer(text)` | Returns `true` if the text contains any Khmer characters |
67
65
  | `isKhmerText(text)` | Returns `true` if all non-whitespace characters are Khmer |
68
66
 
69
-
70
67
  ### Normalization
71
68
 
72
-
73
69
  | Function | Description |
74
70
  | -------------------------------- | ------------------------------------------------------------------------------------------ |
75
71
  | `normalizeKhmer(text)` | Reorders Khmer characters into canonical order (base → coeng → shift signs → vowel → sign) |
76
72
  | `normalizeKhmerCluster(cluster)` | Normalizes a single cluster |
77
73
 
78
-
79
74
  ### Cluster Utilities
80
75
 
81
-
82
76
  | Function | Description |
83
77
  | ---------------------------- | ------------------------------------------------- |
84
78
  | `splitClusters(text)` | Splits text into Khmer-safe grapheme clusters |
85
79
  | `countClusters(text)` | Returns the number of clusters in the text |
86
80
  | `getClusterBoundaries(text)` | Returns `{ start, end }` offsets for each cluster |
87
81
 
88
-
89
82
  ### Segmentation
90
83
 
91
-
92
84
  | Function | Description |
93
85
  | ------------------------------ | -------------------------------------------------------------- |
94
86
  | `segmentWords(text, options?)` | Segments text into word tokens using dictionary-based matching |
95
87
 
96
-
97
88
  #### `SegmentOptions`
98
89
 
99
90
  ```ts
@@ -115,27 +106,27 @@ interface SegmentResult {
115
106
 
116
107
  interface SegmentToken {
117
108
  value: string;
118
- start: number;
119
- end: number;
109
+ start: number; // zero-based offset into result.normalized
110
+ end: number; // exclusive offset into result.normalized
120
111
  isKnown: boolean;
121
112
  }
122
113
  ```
123
114
 
124
- ### Dictionary
115
+ When normalization is enabled, token offsets always refer to `result.normalized`. Invisible characters such as the zero-width space (ZWS), zero-width joiner (ZWJ), and byte-order mark (BOM) may be stripped during normalization, so offsets may not line up with the original input string.
125
116
 
117
+ ### Dictionary
126
118
 
127
119
  | Function | Description |
128
120
  | --------------------------------------- | ------------------------------------------------ |
129
121
  | `createDictionary(words, frequencies?)` | Creates an in-memory dictionary from a word list |
130
122
 
131
-
132
123
  ```ts
133
124
  const dict = createDictionary(['សួស្តី', 'អ្នក', 'ខ្មែរ']);
134
125
 
135
126
  dict.has('សួស្តី'); // true
136
127
  dict.hasPrefix!('សួ'); // true (trie-based O(k) lookup)
137
128
  dict.hasSuffix!('ី'); // true
138
- dict.size; // 3
129
+ dict.size; // 3 unique words
139
130
  ```
140
131
 
141
132
  #### `KhmerDictionary` interface
@@ -175,7 +166,9 @@ console.log(freqData.words.length); // 49113
175
166
  console.log(freqData.frequencies.get('ជា')); // 701541
176
167
  ```
177
168
 
178
- This is a **separate import** — the core `khmer-segment` package stays small (~8KB). Only import the dictionary when you need it.
169
+ This is a **separate import** — the core `khmer-segment` package stays small (~11KB). The dictionary module is ~3.9MB. Only import the dictionary when you need it.
170
+
171
+ `loadFrequencyDictionary()` builds its return value from cached dictionary data, but each call returns fresh arrays and a fresh `Map`. You can safely extend or mutate the returned data without affecting later calls.
179
172
 
180
173
  ---
181
174
 
@@ -242,7 +235,7 @@ const result = segmentWords('កខគ');
242
235
 
243
236
  ## Dictionary Strategy
244
237
 
245
- The library ships a **separate optional dictionary** via `khmer-segment/dictionary` with 49,113 Khmer words. This keeps the core package small (~8KB).
238
+ The library ships a **separate optional dictionary** via `khmer-segment/dictionary` with 49,113 Khmer words. This keeps the core package small (~11KB).
246
239
 
247
240
  Options:
248
241
 
@@ -272,7 +265,6 @@ const dict = createDictionary([...words, 'custom_word'], frequencies);
272
265
 
273
266
  ## Framework Compatibility
274
267
 
275
-
276
268
  | Environment | Support |
277
269
  | ------------------- | ------- |
278
270
  | Node.js (ESM + CJS) | Yes |
@@ -282,7 +274,6 @@ const dict = createDictionary([...words, 'custom_word'], frequencies);
282
274
  | Angular | Yes |
283
275
  | Vue | Yes |
284
276
 
285
-
286
277
  No framework-specific code in the core. Tree-shakeable with `sideEffects: false`.
287
278
 
288
279
  ---
@@ -308,16 +299,22 @@ No framework-specific code in the core. Tree-shakeable with `sideEffects: false`
308
299
  - `segmentWords` with FMM
309
300
  - Default dictionary (34K+ words, separate import)
310
301
 
311
- ### v0.2.0 (current)
302
+ ### v0.2.1
312
303
 
313
304
  - BMM (Backward Maximum Matching) algorithm
314
305
  - BiMM (Bidirectional Maximum Matching) algorithm
315
306
  - Digit grouping (consecutive Khmer digits merged into single tokens)
316
307
  - Fixed normalization for MUUSIKATOAN (៉) and TRIISAP (៊) — shift signs now placed before vowels
317
308
  - Fixed Unicode range constants (NIKAHIT, REAHMUK, YUUKEALAKHMOU are signs, not vowels)
318
- - 149 tests
319
- - `compareTyping(expected, actual)` for MonkeyType-like apps
320
- - Better token metadata (`isKhmer`, `clusterCount`)
309
+ - Rebuilt dictionary with 49,113 words (merged from 10 sources)
310
+
311
+ ### v0.2.2 (current)
312
+
313
+ - Clarified that token offsets are measured against `result.normalized`
314
+ - Expanded Vitest coverage across normalization, dictionary, and segmentation behavior
315
+ - Made `loadFrequencyDictionary()` safe to reuse across calls without shared-state pollution
316
+ - Corrected custom dictionary `size` to report unique non-empty words
317
+ - Added changelog, CI checks, and stricter prepublish formatting verification
321
318
 
322
319
  ### v0.3.0
323
320
 
@@ -341,6 +338,7 @@ No framework-specific code in the core. Tree-shakeable with `sideEffects: false`
341
338
  npm install # install dependencies
342
339
  npm run build # build with tsup (ESM + CJS + types)
343
340
  npm test # run vitest
341
+ npm run test:perf # optional performance-focused checks
344
342
  npm run test:watch # watch mode
345
343
  npm run lint # TypeScript type check
346
344
  ```
@@ -352,7 +350,8 @@ npm run lint # TypeScript type check
352
350
  ### Automated Tests
353
351
 
354
352
  ```bash
355
- npm test # run 98 tests with vitest
353
+ npm test # run the main Vitest correctness suite
354
+ npm run test:perf # optional performance-focused checks
356
355
  npm run test:watch # watch mode — re-runs on changes
357
356
  npm run lint # TypeScript type check
358
357
  ```
@@ -385,10 +384,10 @@ Features:
385
384
  - **[Word Segmentation of Khmer Text Using Conditional Random Fields](https://medium.com/@phylypo/segmentation-of-khmer-text-using-conditional-random-fields-3a2d4d73956a)** — Phylypo Tum (2019). Comprehensive overview of Khmer segmentation approaches from dictionary-based to CRF, achieving 99.7% accuracy with Linear Chain CRF.
386
385
  - **[Khmer Word Segmentation Using Conditional Random Fields](https://www.niptict.edu.kh/khmer-word-segmentation-tool/)** — Vichea Chea, Ye Kyaw Thu, et al. (2015). The prior state-of-the-art CRF model for Khmer segmentation (98.5% accuracy, 5-tag system).
387
386
  - **[Benchmark dataset and Python notebooks](https://github.com/phylypo/segmentation-crf-khmer)** — 10K+ segmented Khmer news articles useful for evaluating segmentation quality.
388
- - **[khmerlbdict](https://github.com/silnrsi/khmerlbdict)** — Source of the default dictionary used by this library (MIT license, 34K+ words).
387
+ - **[khmerlbdict](https://github.com/silnrsi/khmerlbdict)** — Source of the default dictionary used by this library (MIT license). Merged with additional word lists, including the Royal Academy of Cambodia's Khmer Dictionary, for a total of 49,113 words.
389
388
 
390
389
  ---
391
390
 
392
391
  ## License
393
392
 
394
- MIT
393
+ MIT
@@ -87,15 +87,19 @@ var MemoryDictionary = class {
87
87
  this.trie = new Trie();
88
88
  this.reverseTrie = new Trie();
89
89
  this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
90
- let count = 0;
90
+ const uniqueWords = /* @__PURE__ */ new Set();
91
91
  for (const word of words) {
92
+ if (word.length > 0) {
93
+ uniqueWords.add(word);
94
+ }
95
+ }
96
+ for (const word of uniqueWords) {
92
97
  if (word.length > 0) {
93
98
  this.trie.insert(word);
94
99
  this.reverseTrie.insert([...word].reverse().join(""));
95
- count++;
96
100
  }
97
101
  }
98
- this.size = count;
102
+ this.size = uniqueWords.size;
99
103
  }
100
104
  has(word) {
101
105
  return this.trie.has(word);
@@ -196590,14 +196594,28 @@ function getDefaultDictionary() {
196590
196594
  var cached = null;
196591
196595
  function loadFrequencyDictionary() {
196592
196596
  if (!cached) {
196593
- const entries = khmer_words_default;
196594
- const words = entries.map((e) => e.word);
196597
+ const entries = Object.freeze(
196598
+ khmer_words_default.map(
196599
+ (entry) => Object.freeze({
196600
+ word: entry.word,
196601
+ freq: entry.freq
196602
+ })
196603
+ )
196604
+ );
196605
+ const words = Object.freeze(entries.map((entry) => entry.word));
196595
196606
  const frequencies = new Map(
196596
- entries.map((e) => [e.word, e.freq])
196607
+ entries.map((entry) => [entry.word, entry.freq])
196597
196608
  );
196598
- cached = { words, entries, frequencies };
196609
+ cached = Object.freeze({ words, entries, frequencies });
196599
196610
  }
196600
- return cached;
196611
+ return {
196612
+ words: [...cached.words],
196613
+ entries: cached.entries.map((entry) => ({
196614
+ word: entry.word,
196615
+ freq: entry.freq
196616
+ })),
196617
+ frequencies: new Map(cached.frequencies)
196618
+ };
196601
196619
  }
196602
196620
  // Annotate the CommonJS export names for ESM import in node:
196603
196621
  0 && (module.exports = {
@@ -19,6 +19,6 @@ interface FrequencyDictionary {
19
19
  }
20
20
  declare function loadFrequencyDictionary(): FrequencyDictionary;
21
21
 
22
- declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
22
+ declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
23
23
 
24
24
  export { type DictionaryEntry, type FrequencyDictionary, type KhmerDictionary, createDictionary, getDefaultDictionary, loadFrequencyDictionary };
@@ -19,6 +19,6 @@ interface FrequencyDictionary {
19
19
  }
20
20
  declare function loadFrequencyDictionary(): FrequencyDictionary;
21
21
 
22
- declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
22
+ declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
23
23
 
24
24
  export { type DictionaryEntry, type FrequencyDictionary, type KhmerDictionary, createDictionary, getDefaultDictionary, loadFrequencyDictionary };
@@ -59,15 +59,19 @@ var MemoryDictionary = class {
59
59
  this.trie = new Trie();
60
60
  this.reverseTrie = new Trie();
61
61
  this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
62
- let count = 0;
62
+ const uniqueWords = /* @__PURE__ */ new Set();
63
63
  for (const word of words) {
64
+ if (word.length > 0) {
65
+ uniqueWords.add(word);
66
+ }
67
+ }
68
+ for (const word of uniqueWords) {
64
69
  if (word.length > 0) {
65
70
  this.trie.insert(word);
66
71
  this.reverseTrie.insert([...word].reverse().join(""));
67
- count++;
68
72
  }
69
73
  }
70
- this.size = count;
74
+ this.size = uniqueWords.size;
71
75
  }
72
76
  has(word) {
73
77
  return this.trie.has(word);
@@ -196562,14 +196566,28 @@ function getDefaultDictionary() {
196562
196566
  var cached = null;
196563
196567
  function loadFrequencyDictionary() {
196564
196568
  if (!cached) {
196565
- const entries = khmer_words_default;
196566
- const words = entries.map((e) => e.word);
196569
+ const entries = Object.freeze(
196570
+ khmer_words_default.map(
196571
+ (entry) => Object.freeze({
196572
+ word: entry.word,
196573
+ freq: entry.freq
196574
+ })
196575
+ )
196576
+ );
196577
+ const words = Object.freeze(entries.map((entry) => entry.word));
196567
196578
  const frequencies = new Map(
196568
- entries.map((e) => [e.word, e.freq])
196579
+ entries.map((entry) => [entry.word, entry.freq])
196569
196580
  );
196570
- cached = { words, entries, frequencies };
196581
+ cached = Object.freeze({ words, entries, frequencies });
196571
196582
  }
196572
- return cached;
196583
+ return {
196584
+ words: [...cached.words],
196585
+ entries: cached.entries.map((entry) => ({
196586
+ word: entry.word,
196587
+ freq: entry.freq
196588
+ })),
196589
+ frequencies: new Map(cached.frequencies)
196590
+ };
196573
196591
  }
196574
196592
  export {
196575
196593
  createDictionary,
package/dist/index.cjs CHANGED
@@ -447,15 +447,19 @@ var MemoryDictionary = class {
447
447
  this.trie = new Trie();
448
448
  this.reverseTrie = new Trie();
449
449
  this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
450
- let count = 0;
450
+ const uniqueWords = /* @__PURE__ */ new Set();
451
451
  for (const word of words) {
452
+ if (word.length > 0) {
453
+ uniqueWords.add(word);
454
+ }
455
+ }
456
+ for (const word of uniqueWords) {
452
457
  if (word.length > 0) {
453
458
  this.trie.insert(word);
454
459
  this.reverseTrie.insert([...word].reverse().join(""));
455
- count++;
456
460
  }
457
461
  }
458
- this.size = count;
462
+ this.size = uniqueWords.size;
459
463
  }
460
464
  has(word) {
461
465
  return this.trie.has(word);
package/dist/index.d.cts CHANGED
@@ -14,7 +14,9 @@ declare function getClusterBoundaries(text: string): Array<{
14
14
 
15
15
  interface SegmentToken {
16
16
  value: string;
17
+ /** Zero-based start offset into `SegmentResult.normalized`. */
17
18
  start: number;
19
+ /** Zero-based exclusive end offset into `SegmentResult.normalized`. */
18
20
  end: number;
19
21
  isKnown: boolean;
20
22
  }
@@ -25,6 +27,7 @@ interface SegmentOptions {
25
27
  }
26
28
  interface SegmentResult {
27
29
  original: string;
30
+ /** Normalized text used to compute token boundaries and offsets. */
28
31
  normalized: string;
29
32
  tokens: SegmentToken[];
30
33
  }
@@ -49,6 +52,6 @@ interface KhmerDictionary {
49
52
 
50
53
  declare function segmentWords(text: string, options?: SegmentOptions): SegmentResult;
51
54
 
52
- declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
55
+ declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
53
56
 
54
57
  export { type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
package/dist/index.d.ts CHANGED
@@ -14,7 +14,9 @@ declare function getClusterBoundaries(text: string): Array<{
14
14
 
15
15
  interface SegmentToken {
16
16
  value: string;
17
+ /** Zero-based start offset into `SegmentResult.normalized`. */
17
18
  start: number;
19
+ /** Zero-based exclusive end offset into `SegmentResult.normalized`. */
18
20
  end: number;
19
21
  isKnown: boolean;
20
22
  }
@@ -25,6 +27,7 @@ interface SegmentOptions {
25
27
  }
26
28
  interface SegmentResult {
27
29
  original: string;
30
+ /** Normalized text used to compute token boundaries and offsets. */
28
31
  normalized: string;
29
32
  tokens: SegmentToken[];
30
33
  }
@@ -49,6 +52,6 @@ interface KhmerDictionary {
49
52
 
50
53
  declare function segmentWords(text: string, options?: SegmentOptions): SegmentResult;
51
54
 
52
- declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
55
+ declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
53
56
 
54
57
  export { type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
package/dist/index.js CHANGED
@@ -412,15 +412,19 @@ var MemoryDictionary = class {
412
412
  this.trie = new Trie();
413
413
  this.reverseTrie = new Trie();
414
414
  this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
415
- let count = 0;
415
+ const uniqueWords = /* @__PURE__ */ new Set();
416
416
  for (const word of words) {
417
+ if (word.length > 0) {
418
+ uniqueWords.add(word);
419
+ }
420
+ }
421
+ for (const word of uniqueWords) {
417
422
  if (word.length > 0) {
418
423
  this.trie.insert(word);
419
424
  this.reverseTrie.insert([...word].reverse().join(""));
420
- count++;
421
425
  }
422
426
  }
423
- this.size = count;
427
+ this.size = uniqueWords.size;
424
428
  }
425
429
  has(word) {
426
430
  return this.trie.has(word);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "khmer-segment",
3
- "version": "0.2.0",
3
+ "version": "0.2.2",
4
4
  "description": "Khmer text segmentation, normalization, and cluster utilities for JavaScript and TypeScript.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -27,11 +27,12 @@
27
27
  "build": "tsup",
28
28
  "dev": "tsup --watch",
29
29
  "test": "vitest run",
30
+ "test:perf": "vitest run --config vitest.perf.config.ts",
30
31
  "test:watch": "vitest",
31
32
  "lint": "tsc --noEmit",
32
33
  "format": "prettier --write .",
33
34
  "format:check": "prettier --check .",
34
- "prepublishOnly": "npm run build && npm run test && npm run lint",
35
+ "prepublishOnly": "npm run build && npm run test && npm run lint && npm run format:check",
35
36
  "playground:dev": "npm run dev --prefix playground",
36
37
  "playground:build": "npm run build --prefix playground"
37
38
  },