khmer-segment 0.3.0 → 0.3.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -44,6 +44,9 @@ var DEPENDENT_VOWEL_START = 6068;
44
44
  var DEPENDENT_VOWEL_END = 6085;
45
45
  var SIGN_START = 6086;
46
46
  var SIGN_END = 6099;
47
+ var KHMER_PUNCT_KHAN = 6100;
48
+ var KHMER_PUNCT_BARIYOOSAN = 6101;
49
+ var KHMER_PUNCT_CAMNUC_PII_KUUH = 6102;
47
50
  var KHMER_COENG = 6098;
48
51
  var DIGIT_START = 6112;
49
52
  var DIGIT_END = 6121;
@@ -81,6 +84,12 @@ function isAsciiDigit(cp) {
81
84
  function isDigit(cp) {
82
85
  return isKhmerDigit(cp) || isAsciiDigit(cp);
83
86
  }
87
+ function isKhmerSentencePunctuation(cp) {
88
+ return cp === KHMER_PUNCT_KHAN || cp === KHMER_PUNCT_BARIYOOSAN || cp === KHMER_PUNCT_CAMNUC_PII_KUUH;
89
+ }
90
+ function isKhmerSentencePunctuationToken(value) {
91
+ return value.length === 1 && isKhmerSentencePunctuation(value.codePointAt(0));
92
+ }
84
93
  function isClusterBase(cp) {
85
94
  return isConsonant(cp) || isIndependentVowel(cp);
86
95
  }
@@ -443,7 +452,7 @@ function viterbiSegment(clusters, dictionary, options) {
443
452
  if (cost < dp[j]) {
444
453
  dp[j] = cost;
445
454
  from[j] = i;
446
- fromKnown[j] = false;
455
+ fromKnown[j] = true;
447
456
  }
448
457
  continue;
449
458
  }
@@ -452,7 +461,7 @@ function viterbiSegment(clusters, dictionary, options) {
452
461
  if (cost < dp[i + 1]) {
453
462
  dp[i + 1] = cost;
454
463
  from[i + 1] = i;
455
- fromKnown[i + 1] = false;
464
+ fromKnown[i + 1] = isKhmerSentencePunctuation(cp);
456
465
  }
457
466
  continue;
458
467
  }
@@ -573,15 +582,13 @@ function groupDigitTokens(tokens) {
573
582
  const start = tokens[i].start;
574
583
  let combined = tokens[i].value;
575
584
  let end = tokens[i].end;
576
- let known = tokens[i].isKnown;
577
585
  i++;
578
586
  while (i < tokens.length && isDigitStr(tokens[i].value)) {
579
587
  combined += tokens[i].value;
580
588
  end = tokens[i].end;
581
- known = known || tokens[i].isKnown;
582
589
  i++;
583
590
  }
584
- result.push({ value: combined, start, end, isKnown: known });
591
+ result.push({ value: combined, start, end, isKnown: true });
585
592
  } else {
586
593
  result.push(tokens[i]);
587
594
  i++;
@@ -625,12 +632,18 @@ function segmentWords(text, options) {
625
632
  });
626
633
  }
627
634
  tokens = groupDigitTokens(tokens);
635
+ tokens = markKhmerSentencePunctuationKnown(tokens);
628
636
  return {
629
637
  original: text,
630
638
  normalized,
631
639
  tokens
632
640
  };
633
641
  }
642
+ function markKhmerSentencePunctuationKnown(tokens) {
643
+ return tokens.map(
644
+ (token) => isKhmerSentencePunctuationToken(token.value) ? { ...token, isKnown: true } : token
645
+ );
646
+ }
634
647
 
635
648
  // src/dictionary/trie.ts
636
649
  var TrieNode = class {
package/dist/index.js CHANGED
@@ -9,6 +9,9 @@ var DEPENDENT_VOWEL_START = 6068;
9
9
  var DEPENDENT_VOWEL_END = 6085;
10
10
  var SIGN_START = 6086;
11
11
  var SIGN_END = 6099;
12
+ var KHMER_PUNCT_KHAN = 6100;
13
+ var KHMER_PUNCT_BARIYOOSAN = 6101;
14
+ var KHMER_PUNCT_CAMNUC_PII_KUUH = 6102;
12
15
  var KHMER_COENG = 6098;
13
16
  var DIGIT_START = 6112;
14
17
  var DIGIT_END = 6121;
@@ -46,6 +49,12 @@ function isAsciiDigit(cp) {
46
49
  function isDigit(cp) {
47
50
  return isKhmerDigit(cp) || isAsciiDigit(cp);
48
51
  }
52
+ function isKhmerSentencePunctuation(cp) {
53
+ return cp === KHMER_PUNCT_KHAN || cp === KHMER_PUNCT_BARIYOOSAN || cp === KHMER_PUNCT_CAMNUC_PII_KUUH;
54
+ }
55
+ function isKhmerSentencePunctuationToken(value) {
56
+ return value.length === 1 && isKhmerSentencePunctuation(value.codePointAt(0));
57
+ }
49
58
  function isClusterBase(cp) {
50
59
  return isConsonant(cp) || isIndependentVowel(cp);
51
60
  }
@@ -408,7 +417,7 @@ function viterbiSegment(clusters, dictionary, options) {
408
417
  if (cost < dp[j]) {
409
418
  dp[j] = cost;
410
419
  from[j] = i;
411
- fromKnown[j] = false;
420
+ fromKnown[j] = true;
412
421
  }
413
422
  continue;
414
423
  }
@@ -417,7 +426,7 @@ function viterbiSegment(clusters, dictionary, options) {
417
426
  if (cost < dp[i + 1]) {
418
427
  dp[i + 1] = cost;
419
428
  from[i + 1] = i;
420
- fromKnown[i + 1] = false;
429
+ fromKnown[i + 1] = isKhmerSentencePunctuation(cp);
421
430
  }
422
431
  continue;
423
432
  }
@@ -538,15 +547,13 @@ function groupDigitTokens(tokens) {
538
547
  const start = tokens[i].start;
539
548
  let combined = tokens[i].value;
540
549
  let end = tokens[i].end;
541
- let known = tokens[i].isKnown;
542
550
  i++;
543
551
  while (i < tokens.length && isDigitStr(tokens[i].value)) {
544
552
  combined += tokens[i].value;
545
553
  end = tokens[i].end;
546
- known = known || tokens[i].isKnown;
547
554
  i++;
548
555
  }
549
- result.push({ value: combined, start, end, isKnown: known });
556
+ result.push({ value: combined, start, end, isKnown: true });
550
557
  } else {
551
558
  result.push(tokens[i]);
552
559
  i++;
@@ -590,12 +597,18 @@ function segmentWords(text, options) {
590
597
  });
591
598
  }
592
599
  tokens = groupDigitTokens(tokens);
600
+ tokens = markKhmerSentencePunctuationKnown(tokens);
593
601
  return {
594
602
  original: text,
595
603
  normalized,
596
604
  tokens
597
605
  };
598
606
  }
607
+ function markKhmerSentencePunctuationKnown(tokens) {
608
+ return tokens.map(
609
+ (token) => isKhmerSentencePunctuationToken(token.value) ? { ...token, isKnown: true } : token
610
+ );
611
+ }
599
612
 
600
613
  // src/dictionary/trie.ts
601
614
  var TrieNode = class {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "khmer-segment",
3
- "version": "0.3.0",
3
+ "version": "0.3.2",
4
4
  "description": "Khmer text segmentation, normalization, and cluster utilities for JavaScript and TypeScript.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",