khmer-segment 0.3.1 → 0.3.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -44,6 +44,9 @@ var DEPENDENT_VOWEL_START = 6068;
44
44
  var DEPENDENT_VOWEL_END = 6085;
45
45
  var SIGN_START = 6086;
46
46
  var SIGN_END = 6099;
47
+ var KHMER_PUNCT_KHAN = 6100;
48
+ var KHMER_PUNCT_BARIYOOSAN = 6101;
49
+ var KHMER_PUNCT_CAMNUC_PII_KUUH = 6102;
47
50
  var KHMER_COENG = 6098;
48
51
  var DIGIT_START = 6112;
49
52
  var DIGIT_END = 6121;
@@ -81,6 +84,12 @@ function isAsciiDigit(cp) {
81
84
  function isDigit(cp) {
82
85
  return isKhmerDigit(cp) || isAsciiDigit(cp);
83
86
  }
87
+ function isKhmerSentencePunctuation(cp) {
88
+ return cp === KHMER_PUNCT_KHAN || cp === KHMER_PUNCT_BARIYOOSAN || cp === KHMER_PUNCT_CAMNUC_PII_KUUH;
89
+ }
90
+ function isKhmerSentencePunctuationToken(value) {
91
+ return value.length === 1 && isKhmerSentencePunctuation(value.codePointAt(0));
92
+ }
84
93
  function isClusterBase(cp) {
85
94
  return isConsonant(cp) || isIndependentVowel(cp);
86
95
  }
@@ -452,7 +461,7 @@ function viterbiSegment(clusters, dictionary, options) {
452
461
  if (cost < dp[i + 1]) {
453
462
  dp[i + 1] = cost;
454
463
  from[i + 1] = i;
455
- fromKnown[i + 1] = false;
464
+ fromKnown[i + 1] = isKhmerSentencePunctuation(cp);
456
465
  }
457
466
  continue;
458
467
  }
@@ -623,12 +632,18 @@ function segmentWords(text, options) {
623
632
  });
624
633
  }
625
634
  tokens = groupDigitTokens(tokens);
635
+ tokens = markKhmerSentencePunctuationKnown(tokens);
626
636
  return {
627
637
  original: text,
628
638
  normalized,
629
639
  tokens
630
640
  };
631
641
  }
642
+ function markKhmerSentencePunctuationKnown(tokens) {
643
+ return tokens.map(
644
+ (token) => isKhmerSentencePunctuationToken(token.value) ? { ...token, isKnown: true } : token
645
+ );
646
+ }
632
647
 
633
648
  // src/dictionary/trie.ts
634
649
  var TrieNode = class {
package/dist/index.js CHANGED
@@ -9,6 +9,9 @@ var DEPENDENT_VOWEL_START = 6068;
9
9
  var DEPENDENT_VOWEL_END = 6085;
10
10
  var SIGN_START = 6086;
11
11
  var SIGN_END = 6099;
12
+ var KHMER_PUNCT_KHAN = 6100;
13
+ var KHMER_PUNCT_BARIYOOSAN = 6101;
14
+ var KHMER_PUNCT_CAMNUC_PII_KUUH = 6102;
12
15
  var KHMER_COENG = 6098;
13
16
  var DIGIT_START = 6112;
14
17
  var DIGIT_END = 6121;
@@ -46,6 +49,12 @@ function isAsciiDigit(cp) {
46
49
  function isDigit(cp) {
47
50
  return isKhmerDigit(cp) || isAsciiDigit(cp);
48
51
  }
52
+ function isKhmerSentencePunctuation(cp) {
53
+ return cp === KHMER_PUNCT_KHAN || cp === KHMER_PUNCT_BARIYOOSAN || cp === KHMER_PUNCT_CAMNUC_PII_KUUH;
54
+ }
55
+ function isKhmerSentencePunctuationToken(value) {
56
+ return value.length === 1 && isKhmerSentencePunctuation(value.codePointAt(0));
57
+ }
49
58
  function isClusterBase(cp) {
50
59
  return isConsonant(cp) || isIndependentVowel(cp);
51
60
  }
@@ -417,7 +426,7 @@ function viterbiSegment(clusters, dictionary, options) {
417
426
  if (cost < dp[i + 1]) {
418
427
  dp[i + 1] = cost;
419
428
  from[i + 1] = i;
420
- fromKnown[i + 1] = false;
429
+ fromKnown[i + 1] = isKhmerSentencePunctuation(cp);
421
430
  }
422
431
  continue;
423
432
  }
@@ -588,12 +597,18 @@ function segmentWords(text, options) {
588
597
  });
589
598
  }
590
599
  tokens = groupDigitTokens(tokens);
600
+ tokens = markKhmerSentencePunctuationKnown(tokens);
591
601
  return {
592
602
  original: text,
593
603
  normalized,
594
604
  tokens
595
605
  };
596
606
  }
607
+ function markKhmerSentencePunctuationKnown(tokens) {
608
+ return tokens.map(
609
+ (token) => isKhmerSentencePunctuationToken(token.value) ? { ...token, isKnown: true } : token
610
+ );
611
+ }
597
612
 
598
613
  // src/dictionary/trie.ts
599
614
  var TrieNode = class {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "khmer-segment",
3
- "version": "0.3.1",
3
+ "version": "0.3.2",
4
4
  "description": "Khmer text segmentation, normalization, and cluster utilities for JavaScript and TypeScript.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",