khmer-segment 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +16 -1
- package/dist/index.js +16 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -44,6 +44,9 @@ var DEPENDENT_VOWEL_START = 6068;
|
|
|
44
44
|
var DEPENDENT_VOWEL_END = 6085;
|
|
45
45
|
var SIGN_START = 6086;
|
|
46
46
|
var SIGN_END = 6099;
|
|
47
|
+
var KHMER_PUNCT_KHAN = 6100;
|
|
48
|
+
var KHMER_PUNCT_BARIYOOSAN = 6101;
|
|
49
|
+
var KHMER_PUNCT_CAMNUC_PII_KUUH = 6102;
|
|
47
50
|
var KHMER_COENG = 6098;
|
|
48
51
|
var DIGIT_START = 6112;
|
|
49
52
|
var DIGIT_END = 6121;
|
|
@@ -81,6 +84,12 @@ function isAsciiDigit(cp) {
|
|
|
81
84
|
function isDigit(cp) {
|
|
82
85
|
return isKhmerDigit(cp) || isAsciiDigit(cp);
|
|
83
86
|
}
|
|
87
|
+
function isKhmerSentencePunctuation(cp) {
|
|
88
|
+
return cp === KHMER_PUNCT_KHAN || cp === KHMER_PUNCT_BARIYOOSAN || cp === KHMER_PUNCT_CAMNUC_PII_KUUH;
|
|
89
|
+
}
|
|
90
|
+
function isKhmerSentencePunctuationToken(value) {
|
|
91
|
+
return value.length === 1 && isKhmerSentencePunctuation(value.codePointAt(0));
|
|
92
|
+
}
|
|
84
93
|
function isClusterBase(cp) {
|
|
85
94
|
return isConsonant(cp) || isIndependentVowel(cp);
|
|
86
95
|
}
|
|
@@ -452,7 +461,7 @@ function viterbiSegment(clusters, dictionary, options) {
|
|
|
452
461
|
if (cost < dp[i + 1]) {
|
|
453
462
|
dp[i + 1] = cost;
|
|
454
463
|
from[i + 1] = i;
|
|
455
|
-
fromKnown[i + 1] = false;
|
|
464
|
+
fromKnown[i + 1] = isKhmerSentencePunctuation(cp);
|
|
456
465
|
}
|
|
457
466
|
continue;
|
|
458
467
|
}
|
|
@@ -623,12 +632,18 @@ function segmentWords(text, options) {
|
|
|
623
632
|
});
|
|
624
633
|
}
|
|
625
634
|
tokens = groupDigitTokens(tokens);
|
|
635
|
+
tokens = markKhmerSentencePunctuationKnown(tokens);
|
|
626
636
|
return {
|
|
627
637
|
original: text,
|
|
628
638
|
normalized,
|
|
629
639
|
tokens
|
|
630
640
|
};
|
|
631
641
|
}
|
|
642
|
+
function markKhmerSentencePunctuationKnown(tokens) {
|
|
643
|
+
return tokens.map(
|
|
644
|
+
(token) => isKhmerSentencePunctuationToken(token.value) ? { ...token, isKnown: true } : token
|
|
645
|
+
);
|
|
646
|
+
}
|
|
632
647
|
|
|
633
648
|
// src/dictionary/trie.ts
|
|
634
649
|
var TrieNode = class {
|
package/dist/index.js
CHANGED
|
@@ -9,6 +9,9 @@ var DEPENDENT_VOWEL_START = 6068;
|
|
|
9
9
|
var DEPENDENT_VOWEL_END = 6085;
|
|
10
10
|
var SIGN_START = 6086;
|
|
11
11
|
var SIGN_END = 6099;
|
|
12
|
+
var KHMER_PUNCT_KHAN = 6100;
|
|
13
|
+
var KHMER_PUNCT_BARIYOOSAN = 6101;
|
|
14
|
+
var KHMER_PUNCT_CAMNUC_PII_KUUH = 6102;
|
|
12
15
|
var KHMER_COENG = 6098;
|
|
13
16
|
var DIGIT_START = 6112;
|
|
14
17
|
var DIGIT_END = 6121;
|
|
@@ -46,6 +49,12 @@ function isAsciiDigit(cp) {
|
|
|
46
49
|
function isDigit(cp) {
|
|
47
50
|
return isKhmerDigit(cp) || isAsciiDigit(cp);
|
|
48
51
|
}
|
|
52
|
+
function isKhmerSentencePunctuation(cp) {
|
|
53
|
+
return cp === KHMER_PUNCT_KHAN || cp === KHMER_PUNCT_BARIYOOSAN || cp === KHMER_PUNCT_CAMNUC_PII_KUUH;
|
|
54
|
+
}
|
|
55
|
+
function isKhmerSentencePunctuationToken(value) {
|
|
56
|
+
return value.length === 1 && isKhmerSentencePunctuation(value.codePointAt(0));
|
|
57
|
+
}
|
|
49
58
|
function isClusterBase(cp) {
|
|
50
59
|
return isConsonant(cp) || isIndependentVowel(cp);
|
|
51
60
|
}
|
|
@@ -417,7 +426,7 @@ function viterbiSegment(clusters, dictionary, options) {
|
|
|
417
426
|
if (cost < dp[i + 1]) {
|
|
418
427
|
dp[i + 1] = cost;
|
|
419
428
|
from[i + 1] = i;
|
|
420
|
-
fromKnown[i + 1] = false;
|
|
429
|
+
fromKnown[i + 1] = isKhmerSentencePunctuation(cp);
|
|
421
430
|
}
|
|
422
431
|
continue;
|
|
423
432
|
}
|
|
@@ -588,12 +597,18 @@ function segmentWords(text, options) {
|
|
|
588
597
|
});
|
|
589
598
|
}
|
|
590
599
|
tokens = groupDigitTokens(tokens);
|
|
600
|
+
tokens = markKhmerSentencePunctuationKnown(tokens);
|
|
591
601
|
return {
|
|
592
602
|
original: text,
|
|
593
603
|
normalized,
|
|
594
604
|
tokens
|
|
595
605
|
};
|
|
596
606
|
}
|
|
607
|
+
function markKhmerSentencePunctuationKnown(tokens) {
|
|
608
|
+
return tokens.map(
|
|
609
|
+
(token) => isKhmerSentencePunctuationToken(token.value) ? { ...token, isKnown: true } : token
|
|
610
|
+
);
|
|
611
|
+
}
|
|
597
612
|
|
|
598
613
|
// src/dictionary/trie.ts
|
|
599
614
|
var TrieNode = class {
|