khmer-segment 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +18 -5
- package/dist/index.js +18 -5
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -44,6 +44,9 @@ var DEPENDENT_VOWEL_START = 6068;
|
|
|
44
44
|
var DEPENDENT_VOWEL_END = 6085;
|
|
45
45
|
var SIGN_START = 6086;
|
|
46
46
|
var SIGN_END = 6099;
|
|
47
|
+
var KHMER_PUNCT_KHAN = 6100;
|
|
48
|
+
var KHMER_PUNCT_BARIYOOSAN = 6101;
|
|
49
|
+
var KHMER_PUNCT_CAMNUC_PII_KUUH = 6102;
|
|
47
50
|
var KHMER_COENG = 6098;
|
|
48
51
|
var DIGIT_START = 6112;
|
|
49
52
|
var DIGIT_END = 6121;
|
|
@@ -81,6 +84,12 @@ function isAsciiDigit(cp) {
|
|
|
81
84
|
function isDigit(cp) {
|
|
82
85
|
return isKhmerDigit(cp) || isAsciiDigit(cp);
|
|
83
86
|
}
|
|
87
|
+
function isKhmerSentencePunctuation(cp) {
|
|
88
|
+
return cp === KHMER_PUNCT_KHAN || cp === KHMER_PUNCT_BARIYOOSAN || cp === KHMER_PUNCT_CAMNUC_PII_KUUH;
|
|
89
|
+
}
|
|
90
|
+
function isKhmerSentencePunctuationToken(value) {
|
|
91
|
+
return value.length === 1 && isKhmerSentencePunctuation(value.codePointAt(0));
|
|
92
|
+
}
|
|
84
93
|
function isClusterBase(cp) {
|
|
85
94
|
return isConsonant(cp) || isIndependentVowel(cp);
|
|
86
95
|
}
|
|
@@ -443,7 +452,7 @@ function viterbiSegment(clusters, dictionary, options) {
|
|
|
443
452
|
if (cost < dp[j]) {
|
|
444
453
|
dp[j] = cost;
|
|
445
454
|
from[j] = i;
|
|
446
|
-
fromKnown[j] =
|
|
455
|
+
fromKnown[j] = true;
|
|
447
456
|
}
|
|
448
457
|
continue;
|
|
449
458
|
}
|
|
@@ -452,7 +461,7 @@ function viterbiSegment(clusters, dictionary, options) {
|
|
|
452
461
|
if (cost < dp[i + 1]) {
|
|
453
462
|
dp[i + 1] = cost;
|
|
454
463
|
from[i + 1] = i;
|
|
455
|
-
fromKnown[i + 1] =
|
|
464
|
+
fromKnown[i + 1] = isKhmerSentencePunctuation(cp);
|
|
456
465
|
}
|
|
457
466
|
continue;
|
|
458
467
|
}
|
|
@@ -573,15 +582,13 @@ function groupDigitTokens(tokens) {
|
|
|
573
582
|
const start = tokens[i].start;
|
|
574
583
|
let combined = tokens[i].value;
|
|
575
584
|
let end = tokens[i].end;
|
|
576
|
-
let known = tokens[i].isKnown;
|
|
577
585
|
i++;
|
|
578
586
|
while (i < tokens.length && isDigitStr(tokens[i].value)) {
|
|
579
587
|
combined += tokens[i].value;
|
|
580
588
|
end = tokens[i].end;
|
|
581
|
-
known = known || tokens[i].isKnown;
|
|
582
589
|
i++;
|
|
583
590
|
}
|
|
584
|
-
result.push({ value: combined, start, end, isKnown: known });
|
|
591
|
+
result.push({ value: combined, start, end, isKnown: true });
|
|
585
592
|
} else {
|
|
586
593
|
result.push(tokens[i]);
|
|
587
594
|
i++;
|
|
@@ -625,12 +632,18 @@ function segmentWords(text, options) {
|
|
|
625
632
|
});
|
|
626
633
|
}
|
|
627
634
|
tokens = groupDigitTokens(tokens);
|
|
635
|
+
tokens = markKhmerSentencePunctuationKnown(tokens);
|
|
628
636
|
return {
|
|
629
637
|
original: text,
|
|
630
638
|
normalized,
|
|
631
639
|
tokens
|
|
632
640
|
};
|
|
633
641
|
}
|
|
642
|
+
function markKhmerSentencePunctuationKnown(tokens) {
|
|
643
|
+
return tokens.map(
|
|
644
|
+
(token) => isKhmerSentencePunctuationToken(token.value) ? { ...token, isKnown: true } : token
|
|
645
|
+
);
|
|
646
|
+
}
|
|
634
647
|
|
|
635
648
|
// src/dictionary/trie.ts
|
|
636
649
|
var TrieNode = class {
|
package/dist/index.js
CHANGED
|
@@ -9,6 +9,9 @@ var DEPENDENT_VOWEL_START = 6068;
|
|
|
9
9
|
var DEPENDENT_VOWEL_END = 6085;
|
|
10
10
|
var SIGN_START = 6086;
|
|
11
11
|
var SIGN_END = 6099;
|
|
12
|
+
var KHMER_PUNCT_KHAN = 6100;
|
|
13
|
+
var KHMER_PUNCT_BARIYOOSAN = 6101;
|
|
14
|
+
var KHMER_PUNCT_CAMNUC_PII_KUUH = 6102;
|
|
12
15
|
var KHMER_COENG = 6098;
|
|
13
16
|
var DIGIT_START = 6112;
|
|
14
17
|
var DIGIT_END = 6121;
|
|
@@ -46,6 +49,12 @@ function isAsciiDigit(cp) {
|
|
|
46
49
|
function isDigit(cp) {
|
|
47
50
|
return isKhmerDigit(cp) || isAsciiDigit(cp);
|
|
48
51
|
}
|
|
52
|
+
function isKhmerSentencePunctuation(cp) {
|
|
53
|
+
return cp === KHMER_PUNCT_KHAN || cp === KHMER_PUNCT_BARIYOOSAN || cp === KHMER_PUNCT_CAMNUC_PII_KUUH;
|
|
54
|
+
}
|
|
55
|
+
function isKhmerSentencePunctuationToken(value) {
|
|
56
|
+
return value.length === 1 && isKhmerSentencePunctuation(value.codePointAt(0));
|
|
57
|
+
}
|
|
49
58
|
function isClusterBase(cp) {
|
|
50
59
|
return isConsonant(cp) || isIndependentVowel(cp);
|
|
51
60
|
}
|
|
@@ -408,7 +417,7 @@ function viterbiSegment(clusters, dictionary, options) {
|
|
|
408
417
|
if (cost < dp[j]) {
|
|
409
418
|
dp[j] = cost;
|
|
410
419
|
from[j] = i;
|
|
411
|
-
fromKnown[j] =
|
|
420
|
+
fromKnown[j] = true;
|
|
412
421
|
}
|
|
413
422
|
continue;
|
|
414
423
|
}
|
|
@@ -417,7 +426,7 @@ function viterbiSegment(clusters, dictionary, options) {
|
|
|
417
426
|
if (cost < dp[i + 1]) {
|
|
418
427
|
dp[i + 1] = cost;
|
|
419
428
|
from[i + 1] = i;
|
|
420
|
-
fromKnown[i + 1] =
|
|
429
|
+
fromKnown[i + 1] = isKhmerSentencePunctuation(cp);
|
|
421
430
|
}
|
|
422
431
|
continue;
|
|
423
432
|
}
|
|
@@ -538,15 +547,13 @@ function groupDigitTokens(tokens) {
|
|
|
538
547
|
const start = tokens[i].start;
|
|
539
548
|
let combined = tokens[i].value;
|
|
540
549
|
let end = tokens[i].end;
|
|
541
|
-
let known = tokens[i].isKnown;
|
|
542
550
|
i++;
|
|
543
551
|
while (i < tokens.length && isDigitStr(tokens[i].value)) {
|
|
544
552
|
combined += tokens[i].value;
|
|
545
553
|
end = tokens[i].end;
|
|
546
|
-
known = known || tokens[i].isKnown;
|
|
547
554
|
i++;
|
|
548
555
|
}
|
|
549
|
-
result.push({ value: combined, start, end, isKnown: known });
|
|
556
|
+
result.push({ value: combined, start, end, isKnown: true });
|
|
550
557
|
} else {
|
|
551
558
|
result.push(tokens[i]);
|
|
552
559
|
i++;
|
|
@@ -590,12 +597,18 @@ function segmentWords(text, options) {
|
|
|
590
597
|
});
|
|
591
598
|
}
|
|
592
599
|
tokens = groupDigitTokens(tokens);
|
|
600
|
+
tokens = markKhmerSentencePunctuationKnown(tokens);
|
|
593
601
|
return {
|
|
594
602
|
original: text,
|
|
595
603
|
normalized,
|
|
596
604
|
tokens
|
|
597
605
|
};
|
|
598
606
|
}
|
|
607
|
+
function markKhmerSentencePunctuationKnown(tokens) {
|
|
608
|
+
return tokens.map(
|
|
609
|
+
(token) => isKhmerSentencePunctuationToken(token.value) ? { ...token, isKnown: true } : token
|
|
610
|
+
);
|
|
611
|
+
}
|
|
599
612
|
|
|
600
613
|
// src/dictionary/trie.ts
|
|
601
614
|
var TrieNode = class {
|