khmer-segment 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,455 @@
1
+ // src/constants/unicode.ts
2
// Inclusive code-point boundaries used by the character classifiers below.
// Values are written in hex so they line up with Unicode code charts
// (0x1780..0x17ff is the Khmer block).
const KHMER_RANGE_START = 0x1780;
const KHMER_RANGE_END = 0x17ff;

// Consonants, independent vowels and dependent vowels occupy three adjacent runs.
const CONSONANT_START = 0x1780;
const CONSONANT_END = 0x17a2;
const INDEPENDENT_VOWEL_START = 0x17a3;
const INDEPENDENT_VOWEL_END = 0x17b3;
const DEPENDENT_VOWEL_START = 0x17b4;
const DEPENDENT_VOWEL_END = 0x17c5;

// Sign range; note that KHMER_COENG (0x17d2) falls inside it.
const SIGN_START = 0x17c6;
const SIGN_END = 0x17d3;
const KHMER_COENG = 0x17d2;

// Khmer digits and their ASCII counterparts ("0".."9").
const DIGIT_START = 0x17e0;
const DIGIT_END = 0x17e9;
const ASCII_DIGIT_START = 0x30;
const ASCII_DIGIT_END = 0x39;
17
+
18
+ // src/constants/char-categories.ts
19
/**
 * Checks whether a code point lies in the inclusive
 * KHMER_RANGE_START..KHMER_RANGE_END interval.
 *
 * @param {number} cp - Code point to classify.
 * @returns {boolean} True when `cp` is inside the interval.
 */
function isKhmerCodePoint(cp) {
  return KHMER_RANGE_START <= cp && KHMER_RANGE_END >= cp;
}
22
/**
 * Checks whether a code point lies in the inclusive
 * CONSONANT_START..CONSONANT_END interval.
 *
 * @param {number} cp - Code point to classify.
 * @returns {boolean} True when `cp` is inside the interval.
 */
function isConsonant(cp) {
  return CONSONANT_START <= cp && CONSONANT_END >= cp;
}
25
/**
 * Checks whether a code point lies in the inclusive
 * INDEPENDENT_VOWEL_START..INDEPENDENT_VOWEL_END interval.
 *
 * @param {number} cp - Code point to classify.
 * @returns {boolean} True when `cp` is inside the interval.
 */
function isIndependentVowel(cp) {
  return INDEPENDENT_VOWEL_START <= cp && INDEPENDENT_VOWEL_END >= cp;
}
28
/**
 * Checks whether a code point lies in the inclusive
 * DEPENDENT_VOWEL_START..DEPENDENT_VOWEL_END interval.
 *
 * @param {number} cp - Code point to classify.
 * @returns {boolean} True when `cp` is inside the interval.
 */
function isDependentVowel(cp) {
  return DEPENDENT_VOWEL_START <= cp && DEPENDENT_VOWEL_END >= cp;
}
31
/**
 * Checks whether a code point lies in the inclusive SIGN_START..SIGN_END
 * interval. KHMER_COENG sits inside this interval, so callers that need to
 * treat the coeng separately must test for it before calling this.
 *
 * @param {number} cp - Code point to classify.
 * @returns {boolean} True when `cp` is inside the interval.
 */
function isSign(cp) {
  return SIGN_START <= cp && SIGN_END >= cp;
}
34
/**
 * Checks whether a code point is one of the two shift signs,
 * U+17C9 (6089) or U+17CA (6090).
 *
 * @param {number} cp - Code point to classify.
 * @returns {boolean} True only for those two code points.
 */
function isShiftSign(cp) {
  switch (cp) {
    case 0x17c9:
    case 0x17ca:
      return true;
    default:
      return false;
  }
}
37
/**
 * Checks whether a code point is exactly KHMER_COENG.
 *
 * @param {number} cp - Code point to classify.
 * @returns {boolean} True when `cp` strictly equals KHMER_COENG.
 */
function isCoeng(cp) {
  return KHMER_COENG === cp;
}
40
/**
 * Checks whether a code point lies in the inclusive
 * DIGIT_START..DIGIT_END interval.
 *
 * @param {number} cp - Code point to classify.
 * @returns {boolean} True when `cp` is inside the interval.
 */
function isKhmerDigit(cp) {
  return DIGIT_START <= cp && DIGIT_END >= cp;
}
43
/**
 * Checks whether a code point lies in the inclusive
 * ASCII_DIGIT_START..ASCII_DIGIT_END interval.
 *
 * @param {number} cp - Code point to classify.
 * @returns {boolean} True when `cp` is inside the interval.
 */
function isAsciiDigit(cp) {
  return ASCII_DIGIT_START <= cp && ASCII_DIGIT_END >= cp;
}
46
/**
 * Checks whether a code point is a digit according to either
 * `isKhmerDigit` or `isAsciiDigit`.
 *
 * @param {number} cp - Code point to classify.
 * @returns {boolean} True when either classifier accepts `cp`.
 */
function isDigit(cp) {
  if (isKhmerDigit(cp)) {
    return true;
  }
  return isAsciiDigit(cp);
}
49
/**
 * Checks whether a code point may start a cluster: it must be accepted by
 * `isConsonant` or `isIndependentVowel`.
 *
 * @param {number} cp - Code point to classify.
 * @returns {boolean} True when `cp` can act as a cluster base.
 */
function isClusterBase(cp) {
  if (isConsonant(cp)) {
    return true;
  }
  return isIndependentVowel(cp);
}
52
+
53
+ // src/core/detect.ts
54
/**
 * Checks whether the first code point of `char` passes `isKhmerCodePoint`.
 *
 * @param {string} char - Character (or longer string) to inspect; only its
 *   first code point is examined.
 * @returns {boolean} False for any falsy input, otherwise the result of
 *   `isKhmerCodePoint` on the first code point.
 */
function isKhmerChar(char) {
  return char ? isKhmerCodePoint(char.codePointAt(0)) : false;
}
59
/**
 * Checks whether at least one code point of `text` passes `isKhmerChar`.
 *
 * @param {string} text - Text to scan, iterated code point by code point.
 * @returns {boolean} True as soon as one matching character is found,
 *   false when the scan finishes without a match.
 */
function containsKhmer(text) {
  let found = false;
  for (const symbol of text) {
    found = isKhmerChar(symbol);
    if (found) {
      break;
    }
  }
  return found;
}
65
/**
 * Checks whether `text` consists solely of characters accepted by
 * `isKhmerChar`, ignoring whitespace.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} False for empty or whitespace-only text and for text
 *   containing any non-whitespace character that `isKhmerChar` rejects;
 *   true otherwise.
 */
function isKhmerText(text) {
  if (!text.length) {
    return false;
  }
  const whitespace = /\s/;
  let sawKhmer = false;
  for (const symbol of text) {
    const isBlank = whitespace.test(symbol);
    if (!isBlank) {
      if (!isKhmerChar(symbol)) {
        return false;
      }
      // At least one non-whitespace character has been accepted.
      sawKhmer = true;
    }
  }
  return sawKhmer;
}
75
+
76
+ // src/core/cluster.ts
77
/**
 * Splits text into clusters. A cluster starts at a code point accepted by
 * `isClusterBase` and absorbs every following coeng (plus the consonant right
 * after it, if any), dependent vowel and sign. Every other code point becomes
 * a one-element cluster of its own.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Clusters in input order; `[]` for falsy input.
 */
function splitClusters(text) {
  if (!text) {
    return [];
  }
  const symbols = [...text];
  const total = symbols.length;
  const result = [];
  let pos = 0;
  while (pos < total) {
    const head = symbols[pos];
    pos += 1;
    if (!isClusterBase(head.codePointAt(0))) {
      result.push(head);
      continue;
    }
    const pieces = [head];
    while (pos < total) {
      const follower = symbols[pos];
      const followerCp = follower.codePointAt(0);
      if (isCoeng(followerCp)) {
        pieces.push(follower);
        pos += 1;
        // The coeng is kept even when no consonant follows it.
        const subscript = symbols[pos];
        if (subscript !== undefined && isConsonant(subscript.codePointAt(0))) {
          pieces.push(subscript);
          pos += 1;
        }
        continue;
      }
      if (!isDependentVowel(followerCp) && !isSign(followerCp)) {
        break;
      }
      pieces.push(follower);
      pos += 1;
    }
    result.push(pieces.join(""));
  }
  return result;
}
111
/**
 * Counts the clusters that `splitClusters` produces for `text`.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Number of clusters.
 */
function countClusters(text) {
  const { length } = splitClusters(text);
  return length;
}
114
/**
 * Computes the `[start, end)` span of every cluster of `text`. Offsets are
 * accumulated from each cluster's `.length`, i.e. they are UTF-16 code-unit
 * indices.
 *
 * @param {string} text - Text to split with `splitClusters`.
 * @returns {{start: number, end: number}[]} One span per cluster, in order.
 */
function getClusterBoundaries(text) {
  let cursor = 0;
  return splitClusters(text).map(({ length }) => {
    const span = { start: cursor, end: cursor + length };
    cursor = span.end;
    return span;
  });
}
124
+
125
+ // src/core/normalize.ts
126
// Matches every occurrence of the zero-width / directional code points
// U+200B, U+200C, U+200D, U+2060, U+200E, U+200F and U+FEFF.
const INVISIBLE_CHARS = /[\u200B\u200C\u200D\u2060\u200E\u200F\uFEFF]/g;
127
/**
 * Reorders the marks of a single cluster into a fixed order: first code
 * point, then coeng pairs, shift signs, dependent vowels, remaining signs,
 * and finally anything unclassified. Relative order within each group is
 * preserved.
 *
 * @param {string} cluster - One cluster; its first code point is always
 *   treated as the base without being inspected.
 * @returns {string} The reordered cluster (unchanged when it has at most one
 *   code point).
 */
function normalizeKhmerCluster(cluster) {
  const symbols = [...cluster];
  if (symbols.length <= 1) {
    return symbols.join("");
  }
  const [head, ...tail] = symbols;
  const subscripts = [];
  const shifters = [];
  const vowelMarks = [];
  const remainingSigns = [];
  const leftovers = [];
  for (let idx = 0; idx < tail.length; idx += 1) {
    const symbol = tail[idx];
    const cp = symbol.codePointAt(0);
    if (isCoeng(cp)) {
      // A coeng travels together with the consonant that follows it, if any.
      const follower = tail[idx + 1];
      if (follower !== undefined && isConsonant(follower.codePointAt(0))) {
        subscripts.push(symbol + follower);
        idx += 1;
      } else {
        subscripts.push(symbol);
      }
    } else if (isShiftSign(cp)) {
      shifters.push(symbol);
    } else if (isDependentVowel(cp)) {
      vowelMarks.push(symbol);
    } else if (isSign(cp)) {
      remainingSigns.push(symbol);
    } else {
      leftovers.push(symbol);
    }
  }
  return (
    head +
    subscripts.join("") +
    shifters.join("") +
    vowelMarks.join("") +
    remainingSigns.join("") +
    leftovers.join("")
  );
}
172
/**
 * Normalizes text: strips every INVISIBLE_CHARS match, splits the remainder
 * with `splitClusters`, and passes each cluster whose first code point is
 * Khmer through `normalizeKhmerCluster`. Other clusters are copied verbatim.
 *
 * @param {string} text - Text to normalize.
 * @returns {string} The normalized text.
 */
function normalizeKhmer(text) {
  const visibleOnly = text.replace(INVISIBLE_CHARS, "");
  let output = "";
  for (const cluster of splitClusters(visibleOnly)) {
    output += isKhmerCodePoint(cluster.codePointAt(0))
      ? normalizeKhmerCluster(cluster)
      : cluster;
  }
  return output;
}
183
+
184
+ // src/algorithms/fmm.ts
185
/**
 * Forward maximum matching: scanning left to right, emits the longest run of
 * clusters that `dictionary.has` accepts, or a single unknown cluster when
 * nothing matches.
 *
 * @param {string[]} clusters - Clusters to group into tokens.
 * @param {{has: Function, hasPrefix?: Function}} dictionary - Word lookup.
 *   When `hasPrefix` exists it is used to bound how far a candidate is grown.
 * @returns {{value: string, start: number, end: number, isKnown: boolean}[]}
 *   Tokens in order; `start`/`end` are offsets into the joined clusters,
 *   measured with `.length`.
 */
function fmmSegment(clusters, dictionary) {
  const total = clusters.length;
  const canExtend = dictionary.hasPrefix?.bind(dictionary);
  const tokens = [];
  let cursor = 0;
  let charPos = 0;
  while (cursor < total) {
    const remaining = total - cursor;
    let longest = remaining;
    if (canExtend) {
      // Grow the candidate only while some dictionary entry still starts with it.
      longest = 1;
      let probe = clusters[cursor];
      while (longest < remaining) {
        const extended = probe + clusters[cursor + longest];
        if (!canExtend(extended)) {
          break;
        }
        probe = extended;
        longest += 1;
      }
    }
    // Fallback when no candidate is in the dictionary: one unknown cluster.
    let value = clusters[cursor];
    let consumed = 1;
    let isKnown = false;
    for (let span = longest; span >= 1 && !isKnown; span -= 1) {
      const attempt = clusters.slice(cursor, cursor + span).join("");
      if (dictionary.has(attempt)) {
        value = attempt;
        consumed = span;
        isKnown = true;
      }
    }
    const start = charPos;
    charPos += value.length;
    tokens.push({ value, start, end: charPos, isKnown });
    cursor += consumed;
  }
  return tokens;
}
224
+
225
+ // src/algorithms/bmm.ts
226
/**
 * Backward maximum matching: scanning right to left, emits the longest run of
 * clusters ending at the current position that `dictionary.has` accepts, or a
 * single unknown cluster when nothing matches.
 *
 * @param {string[]} clusters - Clusters to group into tokens.
 * @param {{has: Function, hasSuffix?: Function}} dictionary - Word lookup.
 *   When `hasSuffix` exists it is used to bound how far a candidate is grown.
 * @returns {{value: string, start: number, end: number, isKnown: boolean}[]}
 *   Tokens in reading order; `start`/`end` are offsets into the joined
 *   clusters, measured with `.length`.
 */
function bmmSegment(clusters, dictionary) {
  const canExtend = dictionary.hasSuffix?.bind(dictionary);
  const found = []; // collected right-to-left, reversed before returning
  let tail = clusters.length - 1;
  while (tail >= 0) {
    const available = tail + 1;
    let longest = available;
    if (canExtend) {
      // Keep prepending clusters while the current candidate is still a
      // suffix of some dictionary entry.
      longest = 1;
      let probe = clusters[tail];
      while (longest < available && canExtend(probe)) {
        probe = clusters[tail - longest] + probe;
        longest += 1;
      }
    }
    // Fallback when no candidate is in the dictionary: one unknown cluster.
    let value = clusters[tail];
    let consumed = 1;
    let isKnown = false;
    for (let span = longest; span >= 1 && !isKnown; span -= 1) {
      const attempt = clusters.slice(tail - span + 1, tail + 1).join("");
      if (dictionary.has(attempt)) {
        value = attempt;
        consumed = span;
        isKnown = true;
      }
    }
    found.push({ value, isKnown });
    tail -= consumed;
  }
  // Offsets can only be assigned once the tokens are back in reading order.
  let offset = 0;
  return found.reverse().map(({ value, isKnown }) => {
    const start = offset;
    offset += value.length;
    return { value, start, end: offset, isKnown };
  });
}
269
+
270
+ // src/algorithms/bimm.ts
271
/**
 * Bidirectional maximum matching: runs `fmmSegment` and `bmmSegment` and
 * keeps the better result. Fewer unknown tokens wins; on a tie, fewer tokens
 * wins; if still tied, the forward result is returned.
 *
 * @param {string[]} clusters - Clusters to group into tokens.
 * @param {object} dictionary - Dictionary passed through to both strategies.
 * @returns {object[]} The token list of the preferred strategy.
 */
function bimmSegment(clusters, dictionary) {
  const forward = fmmSegment(clusters, dictionary);
  const backward = bmmSegment(clusters, dictionary);
  const countUnknown = (tokens) =>
    tokens.reduce((sum, token) => (token.isKnown ? sum : sum + 1), 0);
  const forwardUnknown = countUnknown(forward);
  const backwardUnknown = countUnknown(backward);
  if (forwardUnknown < backwardUnknown) {
    return forward;
  }
  if (backwardUnknown < forwardUnknown) {
    return backward;
  }
  return backward.length < forward.length ? backward : forward;
}
284
+
285
+ // src/algorithms/group-digits.ts
286
/**
 * Checks whether `s` is exactly one UTF-16 code unit long and that unit is a
 * digit according to `isDigit`.
 *
 * @param {string} s - Candidate string.
 * @returns {boolean} True for a single digit character.
 */
function isDigitStr(s) {
  return s.length === 1 && isDigit(s.codePointAt(0));
}
291
/**
 * Merges each run of consecutive single-digit tokens into one token. The
 * merged token spans from the first digit's `start` to the last digit's `end`
 * and is known if any of its digits was known. Non-digit tokens are passed
 * through by reference.
 *
 * @param {{value: string, start: number, end: number, isKnown: boolean}[]} tokens
 *   Tokens in reading order.
 * @returns {object[]} A new token list with digit runs collapsed.
 */
function groupDigitTokens(tokens) {
  const grouped = [];
  let run = null; // digit run currently being accumulated, if any
  for (const token of tokens) {
    if (isDigitStr(token.value)) {
      if (run === null) {
        run = {
          value: token.value,
          start: token.start,
          end: token.end,
          isKnown: token.isKnown,
        };
      } else {
        run.value += token.value;
        run.end = token.end;
        run.isKnown = run.isKnown || token.isKnown;
      }
      continue;
    }
    if (run !== null) {
      grouped.push(run);
      run = null;
    }
    grouped.push(token);
  }
  if (run !== null) {
    grouped.push(run);
  }
  return grouped;
}
316
+
317
+ // src/core/segment.ts
318
/**
 * Segments text into tokens.
 *
 * The text is normalized with `normalizeKhmer` unless `options.normalize` is
 * exactly `false`, then split into clusters. With a dictionary the clusters
 * are grouped by the selected strategy ("bmm", "bimm", anything else means
 * forward matching); without one each cluster becomes an unknown token.
 * Digit runs are merged last.
 *
 * @param {string} text - Text to segment.
 * @param {{normalize?: boolean, dictionary?: object, strategy?: string}} [options]
 * @returns {{original: string, normalized: string, tokens: object[]}}
 *   The input, the text that was actually segmented, and the tokens whose
 *   offsets refer to `normalized`.
 */
function segmentWords(text, options) {
  const normalized = options?.normalize === false ? text : normalizeKhmer(text);
  const clusters = splitClusters(normalized);
  const dictionary = options?.dictionary;
  let tokens;
  if (!dictionary) {
    // No dictionary: every cluster is its own unknown token.
    let offset = 0;
    tokens = [];
    for (const cluster of clusters) {
      const start = offset;
      offset += cluster.length;
      tokens.push({ value: cluster, start, end: offset, isKnown: false });
    }
  } else {
    const strategy = options?.strategy ?? "fmm";
    if (strategy === "bmm") {
      tokens = bmmSegment(clusters, dictionary);
    } else if (strategy === "bimm") {
      tokens = bimmSegment(clusters, dictionary);
    } else {
      tokens = fmmSegment(clusters, dictionary);
    }
  }
  return {
    original: text,
    normalized,
    tokens: groupDigitTokens(tokens),
  };
}
353
+
354
+ // src/dictionary/trie.ts
355
/**
 * One trie node: outgoing edges keyed by a single code point, plus a flag
 * telling whether an inserted word ends at this node.
 */
class TrieNode {
  constructor() {
    this.children = new Map();
    this.isEndOfWord = false;
  }
}

/**
 * Prefix tree over the code points of the inserted words.
 */
class Trie {
  constructor() {
    this.root = new TrieNode();
  }

  /**
   * Adds a word, creating nodes as needed.
   *
   * @param {string} word - Word to store; iterated by code point.
   */
  insert(word) {
    let node = this.root;
    for (const ch of word) {
      let next = node.children.get(ch);
      if (next === undefined) {
        next = new TrieNode();
        node.children.set(ch, next);
      }
      node = next;
    }
    node.isEndOfWord = true;
  }

  /**
   * Follows `text` edge by edge starting at `start`.
   *
   * @param {Iterable<string>} text - Code points to follow (string or array).
   * @param {TrieNode} [start] - Node to start from; defaults to the root.
   * @returns {TrieNode|undefined} The node reached, or undefined when the
   *   path does not exist.
   */
  findNode(text, start = this.root) {
    let node = start;
    for (const ch of text) {
      node = node.children.get(ch);
      if (node === undefined) {
        return undefined;
      }
    }
    return node;
  }

  /**
   * @param {string} word - Word to look up.
   * @returns {boolean} True when exactly this word was inserted.
   */
  has(word) {
    return this.findNode(word)?.isEndOfWord ?? false;
  }

  /**
   * @param {string} prefix - Prefix to look up.
   * @returns {boolean} True when the path for `prefix` exists; always true
   *   for the empty string.
   */
  hasPrefix(prefix) {
    return this.findNode(prefix) !== undefined;
  }

  /**
   * @param {string} suffix - Suffix to look up.
   * @returns {boolean} True when at least one inserted word ends with
   *   `suffix`.
   */
  hasSuffix(suffix) {
    return this.hasReverse(suffix, this.root);
  }

  /**
   * Searches the subtree rooted at `node` for a path that spells `suffix`
   * in reading order and finishes on an end-of-word node.
   *
   * The previous implementation consumed the suffix back-to-front while
   * descending a forward trie, so it matched reversed subsequences instead
   * of real suffixes, and it indexed by UTF-16 unit although edges are keyed
   * by code point.
   *
   * This is a full subtree scan, so it costs time proportional to the number
   * of nodes times the suffix length; a trie of reversed words is the better
   * tool for hot paths.
   *
   * @param {string} suffix - Suffix to look for.
   * @param {TrieNode} node - Root of the subtree to search.
   * @returns {boolean} True when such a path exists. For an empty suffix the
   *   result is `node.isEndOfWord`, as before.
   */
  hasReverse(suffix, node) {
    if (suffix.length === 0) {
      return node.isEndOfWord;
    }
    const wanted = [...suffix];
    const pending = [node];
    while (pending.length > 0) {
      const origin = pending.pop();
      // A word ends with `suffix` iff the suffix can be spelled from some
      // node and lands exactly on an end-of-word node.
      if (this.findNode(wanted, origin)?.isEndOfWord) {
        return true;
      }
      for (const child of origin.children.values()) {
        pending.push(child);
      }
    }
    return false;
  }
}
408
+
409
+ // src/dictionary/memory-dictionary.ts
410
/**
 * In-memory dictionary backed by two tries: one over the words as given (for
 * exact and prefix lookups) and one over the code-point-reversed words (for
 * suffix lookups).
 */
class MemoryDictionary {
  /**
   * @param {Iterable<string>} words - Words to load; empty strings are
   *   skipped.
   * @param {Map<string, number>} [frequencies] - Optional per-word values
   *   returned by `getFrequency`; the map is stored as given, not copied.
   */
  constructor(words, frequencies) {
    this.trie = new Trie();
    this.reverseTrie = new Trie();
    this.freqMap = frequencies ?? new Map();
    let count = 0;
    for (const word of words) {
      // Skip empty strings and words already loaded, so `size` reports the
      // number of distinct entries rather than the raw input length.
      if (word.length === 0 || this.trie.has(word)) {
        continue;
      }
      this.trie.insert(word);
      this.reverseTrie.insert([...word].reverse().join(""));
      count += 1;
    }
    this.size = count;
  }

  /**
   * @param {string} word - Word to look up.
   * @returns {boolean} True when the word was loaded.
   */
  has(word) {
    return this.trie.has(word);
  }

  /**
   * @param {string} value - Candidate prefix.
   * @returns {boolean} True when some loaded word starts with `value`.
   */
  hasPrefix(value) {
    return this.trie.hasPrefix(value);
  }

  /**
   * @param {string} value - Candidate suffix.
   * @returns {boolean} True when some loaded word ends with `value`; answered
   *   as a prefix query against the reversed-word trie.
   */
  hasSuffix(value) {
    return this.reverseTrie.hasPrefix([...value].reverse().join(""));
  }

  /**
   * @param {string} word - Word to look up.
   * @returns {*} Whatever `freqMap` holds for the word, or undefined.
   */
  getFrequency(word) {
    return this.freqMap.get(word);
  }
}
438
+
439
+ // src/dictionary/create-dictionary.ts
440
/**
 * Builds a `MemoryDictionary` from a word list.
 *
 * @param {Iterable<string>} words - Words to load.
 * @param {Map<string, number>} [frequencies] - Optional map forwarded
 *   unchanged to the `MemoryDictionary` constructor.
 * @returns {MemoryDictionary} The new dictionary instance.
 */
function createDictionary(words, frequencies) {
  const dictionary = new MemoryDictionary(words, frequencies);
  return dictionary;
}
443
+ export {
444
+ containsKhmer,
445
+ countClusters,
446
+ createDictionary,
447
+ getClusterBoundaries,
448
+ isKhmerChar,
449
+ isKhmerText,
450
+ normalizeKhmer,
451
+ normalizeKhmerCluster,
452
+ segmentWords,
453
+ splitClusters
454
+ };
455
+ //# sourceMappingURL=index.js.map
package/package.json ADDED
@@ -0,0 +1,64 @@
1
+ {
2
+ "name": "khmer-segment",
3
+ "version": "0.2.0",
4
+ "description": "Khmer text segmentation, normalization, and cluster utilities for JavaScript and TypeScript.",
5
+ "type": "module",
6
+ "main": "./dist/index.cjs",
7
+ "module": "./dist/index.js",
8
+ "types": "./dist/index.d.ts",
9
+ "exports": {
10
+ ".": {
11
+ "types": "./dist/index.d.ts",
12
+ "import": "./dist/index.js",
13
+ "require": "./dist/index.cjs"
14
+ },
15
+ "./dictionary": {
16
+ "types": "./dist/dictionary/index.d.ts",
17
+ "import": "./dist/dictionary/index.js",
18
+ "require": "./dist/dictionary/index.cjs"
19
+ }
20
+ },
21
+ "files": [
22
+ "dist",
23
+ "!dist/**/*.map"
24
+ ],
25
+ "sideEffects": false,
26
+ "scripts": {
27
+ "build": "tsup",
28
+ "dev": "tsup --watch",
29
+ "test": "vitest run",
30
+ "test:watch": "vitest",
31
+ "lint": "tsc --noEmit",
32
+ "format": "prettier --write .",
33
+ "format:check": "prettier --check .",
34
+ "prepublishOnly": "npm run build && npm run test && npm run lint",
35
+ "playground:dev": "npm run dev --prefix playground",
36
+ "playground:build": "npm run build --prefix playground"
37
+ },
38
+ "keywords": [
39
+ "khmer",
40
+ "unicode",
41
+ "segmentation",
42
+ "nlp",
43
+ "typescript",
44
+ "javascript"
45
+ ],
46
+ "author": "Phalla Doll",
47
+ "license": "MIT",
48
+ "repository": {
49
+ "type": "git",
50
+ "url": "git+https://github.com/phalla-doll/khmer-segment-js.git"
51
+ },
52
+ "homepage": "https://github.com/phalla-doll/khmer-segment-js#readme",
53
+ "bugs": {
54
+ "url": "https://github.com/phalla-doll/khmer-segment-js/issues"
55
+ },
56
+ "devDependencies": {
57
+ "@huggingface/hub": "^2.11.0",
58
+ "@types/node": "^25.5.2",
59
+ "prettier": "^3.8.1",
60
+ "tsup": "^8.0.0",
61
+ "typescript": "^5.0.0",
62
+ "vitest": "^3.0.0"
63
+ }
64
+ }