khmer-segment 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -6
- package/dist/dictionary/index.cjs +26 -8
- package/dist/dictionary/index.d.cts +1 -1
- package/dist/dictionary/index.d.ts +1 -1
- package/dist/dictionary/index.js +26 -8
- package/dist/index.cjs +7 -3
- package/dist/index.d.cts +4 -1
- package/dist/index.d.ts +4 -1
- package/dist/index.js +7 -3
- package/package.json +3 -2
package/README.md
CHANGED
|
@@ -106,12 +106,14 @@ interface SegmentResult {
|
|
|
106
106
|
|
|
107
107
|
interface SegmentToken {
|
|
108
108
|
value: string;
|
|
109
|
-
start: number;
|
|
110
|
-
end: number;
|
|
109
|
+
start: number; // zero-based offset into result.normalized
|
|
110
|
+
end: number; // exclusive offset into result.normalized
|
|
111
111
|
isKnown: boolean;
|
|
112
112
|
}
|
|
113
113
|
```
|
|
114
114
|
|
|
115
|
+
When normalization is enabled, token offsets always refer to `result.normalized`. Invisible characters such as ZWS/ZWJ/BOM may be stripped during normalization, so offsets may not line up with the original input string.
|
|
116
|
+
|
|
115
117
|
### Dictionary
|
|
116
118
|
|
|
117
119
|
| Function | Description |
|
|
@@ -124,7 +126,7 @@ const dict = createDictionary(['សួស្តី', 'អ្នក', 'ខ្ម
|
|
|
124
126
|
dict.has('សួស្តី'); // true
|
|
125
127
|
dict.hasPrefix!('សួ'); // true (trie-based O(k) lookup)
|
|
126
128
|
dict.hasSuffix!('ី'); // true
|
|
127
|
-
dict.size; // 3
|
|
129
|
+
dict.size; // 3 unique words
|
|
128
130
|
```
|
|
129
131
|
|
|
130
132
|
#### `KhmerDictionary` interface
|
|
@@ -166,6 +168,8 @@ console.log(freqData.frequencies.get('ជា')); // 701541
|
|
|
166
168
|
|
|
167
169
|
This is a **separate import** — the core `khmer-segment` package stays small (~11KB). The dictionary module is ~3.9MB. Only import the dictionary when you need it.
|
|
168
170
|
|
|
171
|
+
`loadFrequencyDictionary()` builds its return value from cached dictionary data, but each call returns fresh arrays and a fresh `Map`. You can safely extend or mutate the returned data without affecting later calls.
|
|
172
|
+
|
|
169
173
|
---
|
|
170
174
|
|
|
171
175
|
## How It Works
|
|
@@ -295,16 +299,23 @@ No framework-specific code in the core. Tree-shakeable with `sideEffects: false`
|
|
|
295
299
|
- `segmentWords` with FMM
|
|
296
300
|
- Default dictionary (34K+ words, separate import)
|
|
297
301
|
|
|
298
|
-
### v0.2.1 (current)
|
|
302
|
+
### v0.2.1
|
|
299
303
|
|
|
300
304
|
- BMM (Backward Maximum Matching) algorithm
|
|
301
305
|
- BiMM (Bidirectional Maximum Matching) algorithm
|
|
302
306
|
- Digit grouping (consecutive Khmer digits merged into single tokens)
|
|
303
307
|
- Fixed normalization for MUUSIKATOAN (៉) and TRIISAP (៊) — shift signs now placed before vowels
|
|
304
308
|
- Fixed Unicode range constants (NIKAHIT, REAHMUK, YUUKEALAKHMOU are signs, not vowels)
|
|
305
|
-
- 149 tests
|
|
306
309
|
- Rebuilt dictionary with 49,113 words (merged from 10 sources)
|
|
307
310
|
|
|
311
|
+
### v0.2.2 (current)
|
|
312
|
+
|
|
313
|
+
- Clarified that token offsets are measured against `result.normalized`
|
|
314
|
+
- Expanded Vitest coverage across normalization, dictionary, and segmentation behavior
|
|
315
|
+
- Made `loadFrequencyDictionary()` safe to reuse across calls without shared-state pollution
|
|
316
|
+
- Corrected custom dictionary `size` to report unique non-empty words
|
|
317
|
+
- Added changelog, CI checks, and stricter prepublish formatting verification
|
|
318
|
+
|
|
308
319
|
### v0.3.0
|
|
309
320
|
|
|
310
321
|
- `deleteBackward(text, cursorIndex)` — cluster-safe backspace
|
|
@@ -327,6 +338,7 @@ No framework-specific code in the core. Tree-shakeable with `sideEffects: false`
|
|
|
327
338
|
npm install # install dependencies
|
|
328
339
|
npm run build # build with tsup (ESM + CJS + types)
|
|
329
340
|
npm test # run vitest
|
|
341
|
+
npm run test:perf # optional performance-focused checks
|
|
330
342
|
npm run test:watch # watch mode
|
|
331
343
|
npm run lint # TypeScript type check
|
|
332
344
|
```
|
|
@@ -338,7 +350,8 @@ npm run lint # TypeScript type check
|
|
|
338
350
|
### Automated Tests
|
|
339
351
|
|
|
340
352
|
```bash
|
|
341
|
-
npm test # run
|
|
353
|
+
npm test # run the main Vitest correctness suite
|
|
354
|
+
npm run test:perf # optional performance-focused checks
|
|
342
355
|
npm run test:watch # watch mode — re-runs on changes
|
|
343
356
|
npm run lint # TypeScript type check
|
|
344
357
|
```
|
|
@@ -87,15 +87,19 @@ var MemoryDictionary = class {
|
|
|
87
87
|
this.trie = new Trie();
|
|
88
88
|
this.reverseTrie = new Trie();
|
|
89
89
|
this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
|
|
90
|
-
|
|
90
|
+
const uniqueWords = /* @__PURE__ */ new Set();
|
|
91
91
|
for (const word of words) {
|
|
92
|
+
if (word.length > 0) {
|
|
93
|
+
uniqueWords.add(word);
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
for (const word of uniqueWords) {
|
|
92
97
|
if (word.length > 0) {
|
|
93
98
|
this.trie.insert(word);
|
|
94
99
|
this.reverseTrie.insert([...word].reverse().join(""));
|
|
95
|
-
count++;
|
|
96
100
|
}
|
|
97
101
|
}
|
|
98
|
-
this.size = count;
|
|
102
|
+
this.size = uniqueWords.size;
|
|
99
103
|
}
|
|
100
104
|
has(word) {
|
|
101
105
|
return this.trie.has(word);
|
|
@@ -196590,14 +196594,28 @@ function getDefaultDictionary() {
|
|
|
196590
196594
|
var cached = null;
|
|
196591
196595
|
function loadFrequencyDictionary() {
|
|
196592
196596
|
if (!cached) {
|
|
196593
|
-
const entries = khmer_words_default;
|
|
196594
|
-
|
|
196597
|
+
const entries = Object.freeze(
|
|
196598
|
+
khmer_words_default.map(
|
|
196599
|
+
(entry) => Object.freeze({
|
|
196600
|
+
word: entry.word,
|
|
196601
|
+
freq: entry.freq
|
|
196602
|
+
})
|
|
196603
|
+
)
|
|
196604
|
+
);
|
|
196605
|
+
const words = Object.freeze(entries.map((entry) => entry.word));
|
|
196595
196606
|
const frequencies = new Map(
|
|
196596
|
-
entries.map((
|
|
196607
|
+
entries.map((entry) => [entry.word, entry.freq])
|
|
196597
196608
|
);
|
|
196598
|
-
cached = { words, entries, frequencies };
|
|
196609
|
+
cached = Object.freeze({ words, entries, frequencies });
|
|
196599
196610
|
}
|
|
196600
|
-
return cached;
|
|
196611
|
+
return {
|
|
196612
|
+
words: [...cached.words],
|
|
196613
|
+
entries: cached.entries.map((entry) => ({
|
|
196614
|
+
word: entry.word,
|
|
196615
|
+
freq: entry.freq
|
|
196616
|
+
})),
|
|
196617
|
+
frequencies: new Map(cached.frequencies)
|
|
196618
|
+
};
|
|
196601
196619
|
}
|
|
196602
196620
|
// Annotate the CommonJS export names for ESM import in node:
|
|
196603
196621
|
0 && (module.exports = {
|
|
@@ -19,6 +19,6 @@ interface FrequencyDictionary {
|
|
|
19
19
|
}
|
|
20
20
|
declare function loadFrequencyDictionary(): FrequencyDictionary;
|
|
21
21
|
|
|
22
|
-
declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
|
|
22
|
+
declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
|
|
23
23
|
|
|
24
24
|
export { type DictionaryEntry, type FrequencyDictionary, type KhmerDictionary, createDictionary, getDefaultDictionary, loadFrequencyDictionary };
|
|
@@ -19,6 +19,6 @@ interface FrequencyDictionary {
|
|
|
19
19
|
}
|
|
20
20
|
declare function loadFrequencyDictionary(): FrequencyDictionary;
|
|
21
21
|
|
|
22
|
-
declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
|
|
22
|
+
declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
|
|
23
23
|
|
|
24
24
|
export { type DictionaryEntry, type FrequencyDictionary, type KhmerDictionary, createDictionary, getDefaultDictionary, loadFrequencyDictionary };
|
package/dist/dictionary/index.js
CHANGED
|
@@ -59,15 +59,19 @@ var MemoryDictionary = class {
|
|
|
59
59
|
this.trie = new Trie();
|
|
60
60
|
this.reverseTrie = new Trie();
|
|
61
61
|
this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
|
|
62
|
-
|
|
62
|
+
const uniqueWords = /* @__PURE__ */ new Set();
|
|
63
63
|
for (const word of words) {
|
|
64
|
+
if (word.length > 0) {
|
|
65
|
+
uniqueWords.add(word);
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
for (const word of uniqueWords) {
|
|
64
69
|
if (word.length > 0) {
|
|
65
70
|
this.trie.insert(word);
|
|
66
71
|
this.reverseTrie.insert([...word].reverse().join(""));
|
|
67
|
-
count++;
|
|
68
72
|
}
|
|
69
73
|
}
|
|
70
|
-
this.size = count;
|
|
74
|
+
this.size = uniqueWords.size;
|
|
71
75
|
}
|
|
72
76
|
has(word) {
|
|
73
77
|
return this.trie.has(word);
|
|
@@ -196562,14 +196566,28 @@ function getDefaultDictionary() {
|
|
|
196562
196566
|
var cached = null;
|
|
196563
196567
|
function loadFrequencyDictionary() {
|
|
196564
196568
|
if (!cached) {
|
|
196565
|
-
const entries = khmer_words_default;
|
|
196566
|
-
|
|
196569
|
+
const entries = Object.freeze(
|
|
196570
|
+
khmer_words_default.map(
|
|
196571
|
+
(entry) => Object.freeze({
|
|
196572
|
+
word: entry.word,
|
|
196573
|
+
freq: entry.freq
|
|
196574
|
+
})
|
|
196575
|
+
)
|
|
196576
|
+
);
|
|
196577
|
+
const words = Object.freeze(entries.map((entry) => entry.word));
|
|
196567
196578
|
const frequencies = new Map(
|
|
196568
|
-
entries.map((
|
|
196579
|
+
entries.map((entry) => [entry.word, entry.freq])
|
|
196569
196580
|
);
|
|
196570
|
-
cached = { words, entries, frequencies };
|
|
196581
|
+
cached = Object.freeze({ words, entries, frequencies });
|
|
196571
196582
|
}
|
|
196572
|
-
return cached;
|
|
196583
|
+
return {
|
|
196584
|
+
words: [...cached.words],
|
|
196585
|
+
entries: cached.entries.map((entry) => ({
|
|
196586
|
+
word: entry.word,
|
|
196587
|
+
freq: entry.freq
|
|
196588
|
+
})),
|
|
196589
|
+
frequencies: new Map(cached.frequencies)
|
|
196590
|
+
};
|
|
196573
196591
|
}
|
|
196574
196592
|
export {
|
|
196575
196593
|
createDictionary,
|
package/dist/index.cjs
CHANGED
|
@@ -447,15 +447,19 @@ var MemoryDictionary = class {
|
|
|
447
447
|
this.trie = new Trie();
|
|
448
448
|
this.reverseTrie = new Trie();
|
|
449
449
|
this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
|
|
450
|
-
|
|
450
|
+
const uniqueWords = /* @__PURE__ */ new Set();
|
|
451
451
|
for (const word of words) {
|
|
452
|
+
if (word.length > 0) {
|
|
453
|
+
uniqueWords.add(word);
|
|
454
|
+
}
|
|
455
|
+
}
|
|
456
|
+
for (const word of uniqueWords) {
|
|
452
457
|
if (word.length > 0) {
|
|
453
458
|
this.trie.insert(word);
|
|
454
459
|
this.reverseTrie.insert([...word].reverse().join(""));
|
|
455
|
-
count++;
|
|
456
460
|
}
|
|
457
461
|
}
|
|
458
|
-
this.size = count;
|
|
462
|
+
this.size = uniqueWords.size;
|
|
459
463
|
}
|
|
460
464
|
has(word) {
|
|
461
465
|
return this.trie.has(word);
|
package/dist/index.d.cts
CHANGED
|
@@ -14,7 +14,9 @@ declare function getClusterBoundaries(text: string): Array<{
|
|
|
14
14
|
|
|
15
15
|
interface SegmentToken {
|
|
16
16
|
value: string;
|
|
17
|
+
/** Zero-based start offset into `SegmentResult.normalized`. */
|
|
17
18
|
start: number;
|
|
19
|
+
/** Zero-based exclusive end offset into `SegmentResult.normalized`. */
|
|
18
20
|
end: number;
|
|
19
21
|
isKnown: boolean;
|
|
20
22
|
}
|
|
@@ -25,6 +27,7 @@ interface SegmentOptions {
|
|
|
25
27
|
}
|
|
26
28
|
interface SegmentResult {
|
|
27
29
|
original: string;
|
|
30
|
+
/** Normalized text used to compute token boundaries and offsets. */
|
|
28
31
|
normalized: string;
|
|
29
32
|
tokens: SegmentToken[];
|
|
30
33
|
}
|
|
@@ -49,6 +52,6 @@ interface KhmerDictionary {
|
|
|
49
52
|
|
|
50
53
|
declare function segmentWords(text: string, options?: SegmentOptions): SegmentResult;
|
|
51
54
|
|
|
52
|
-
declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
|
|
55
|
+
declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
|
|
53
56
|
|
|
54
57
|
export { type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
|
package/dist/index.d.ts
CHANGED
|
@@ -14,7 +14,9 @@ declare function getClusterBoundaries(text: string): Array<{
|
|
|
14
14
|
|
|
15
15
|
interface SegmentToken {
|
|
16
16
|
value: string;
|
|
17
|
+
/** Zero-based start offset into `SegmentResult.normalized`. */
|
|
17
18
|
start: number;
|
|
19
|
+
/** Zero-based exclusive end offset into `SegmentResult.normalized`. */
|
|
18
20
|
end: number;
|
|
19
21
|
isKnown: boolean;
|
|
20
22
|
}
|
|
@@ -25,6 +27,7 @@ interface SegmentOptions {
|
|
|
25
27
|
}
|
|
26
28
|
interface SegmentResult {
|
|
27
29
|
original: string;
|
|
30
|
+
/** Normalized text used to compute token boundaries and offsets. */
|
|
28
31
|
normalized: string;
|
|
29
32
|
tokens: SegmentToken[];
|
|
30
33
|
}
|
|
@@ -49,6 +52,6 @@ interface KhmerDictionary {
|
|
|
49
52
|
|
|
50
53
|
declare function segmentWords(text: string, options?: SegmentOptions): SegmentResult;
|
|
51
54
|
|
|
52
|
-
declare function createDictionary(words: string[], frequencies?: Map<string, number>): KhmerDictionary;
|
|
55
|
+
declare function createDictionary(words: string[], frequencies?: ReadonlyMap<string, number>): KhmerDictionary;
|
|
53
56
|
|
|
54
57
|
export { type KhmerDictionary, type SegmentOptions, type SegmentResult, type SegmentToken, type TypingComparisonResult, containsKhmer, countClusters, createDictionary, getClusterBoundaries, isKhmerChar, isKhmerText, normalizeKhmer, normalizeKhmerCluster, segmentWords, splitClusters };
|
package/dist/index.js
CHANGED
|
@@ -412,15 +412,19 @@ var MemoryDictionary = class {
|
|
|
412
412
|
this.trie = new Trie();
|
|
413
413
|
this.reverseTrie = new Trie();
|
|
414
414
|
this.freqMap = frequencies ?? /* @__PURE__ */ new Map();
|
|
415
|
-
|
|
415
|
+
const uniqueWords = /* @__PURE__ */ new Set();
|
|
416
416
|
for (const word of words) {
|
|
417
|
+
if (word.length > 0) {
|
|
418
|
+
uniqueWords.add(word);
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
for (const word of uniqueWords) {
|
|
417
422
|
if (word.length > 0) {
|
|
418
423
|
this.trie.insert(word);
|
|
419
424
|
this.reverseTrie.insert([...word].reverse().join(""));
|
|
420
|
-
count++;
|
|
421
425
|
}
|
|
422
426
|
}
|
|
423
|
-
this.size = count;
|
|
427
|
+
this.size = uniqueWords.size;
|
|
424
428
|
}
|
|
425
429
|
has(word) {
|
|
426
430
|
return this.trie.has(word);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "khmer-segment",
|
|
3
|
-
"version": "0.2.1",
|
|
3
|
+
"version": "0.2.2",
|
|
4
4
|
"description": "Khmer text segmentation, normalization, and cluster utilities for JavaScript and TypeScript.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.cjs",
|
|
@@ -27,11 +27,12 @@
|
|
|
27
27
|
"build": "tsup",
|
|
28
28
|
"dev": "tsup --watch",
|
|
29
29
|
"test": "vitest run",
|
|
30
|
+
"test:perf": "vitest run --config vitest.perf.config.ts",
|
|
30
31
|
"test:watch": "vitest",
|
|
31
32
|
"lint": "tsc --noEmit",
|
|
32
33
|
"format": "prettier --write .",
|
|
33
34
|
"format:check": "prettier --check .",
|
|
34
|
-
"prepublishOnly": "npm run build && npm run test && npm run lint",
|
|
35
|
+
"prepublishOnly": "npm run build && npm run test && npm run lint && npm run format:check",
|
|
35
36
|
"playground:dev": "npm run dev --prefix playground",
|
|
36
37
|
"playground:build": "npm run build --prefix playground"
|
|
37
38
|
},
|