npm - khmer-segment - Versions diffs - 0.3.0 → 0.3.2 - Mend

khmer-segment 0.3.0 → 0.3.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.cjs — CHANGED

@@ -44,6 +44,9 @@ var DEPENDENT_VOWEL_START = 6068;
 var DEPENDENT_VOWEL_END = 6085;
 var SIGN_START = 6086;
 var SIGN_END = 6099;
+var KHMER_PUNCT_KHAN = 6100;
+var KHMER_PUNCT_BARIYOOSAN = 6101;
+var KHMER_PUNCT_CAMNUC_PII_KUUH = 6102;
 var KHMER_COENG = 6098;
 var DIGIT_START = 6112;
 var DIGIT_END = 6121;
@@ -81,6 +84,12 @@ function isAsciiDigit(cp) {
 function isDigit(cp) {
   return isKhmerDigit(cp) || isAsciiDigit(cp);
 }
+function isKhmerSentencePunctuation(cp) {
+  return cp === KHMER_PUNCT_KHAN || cp === KHMER_PUNCT_BARIYOOSAN || cp === KHMER_PUNCT_CAMNUC_PII_KUUH;
+}
+function isKhmerSentencePunctuationToken(value) {
+  return value.length === 1 && isKhmerSentencePunctuation(value.codePointAt(0));
+}
 function isClusterBase(cp) {
   return isConsonant(cp) || isIndependentVowel(cp);
 }
@@ -443,7 +452,7 @@ function viterbiSegment(clusters, dictionary, options) {
       if (cost < dp[j]) {
         dp[j] = cost;
         from[j] = i;
-        fromKnown[j] = false;
+        fromKnown[j] = true;
       }
       continue;
     }
@@ -452,7 +461,7 @@ function viterbiSegment(clusters, dictionary, options) {
       if (cost < dp[i + 1]) {
         dp[i + 1] = cost;
         from[i + 1] = i;
-        fromKnown[i + 1] = false;
+        fromKnown[i + 1] = isKhmerSentencePunctuation(cp);
       }
       continue;
     }
@@ -573,15 +582,13 @@ function groupDigitTokens(tokens) {
       const start = tokens[i].start;
       let combined = tokens[i].value;
       let end = tokens[i].end;
-      let known = tokens[i].isKnown;
       i++;
       while (i < tokens.length && isDigitStr(tokens[i].value)) {
         combined += tokens[i].value;
         end = tokens[i].end;
-        known = known || tokens[i].isKnown;
         i++;
       }
-      result.push({ value: combined, start, end, isKnown: known });
+      result.push({ value: combined, start, end, isKnown: true });
     } else {
       result.push(tokens[i]);
       i++;
@@ -625,12 +632,18 @@ function segmentWords(text, options) {
     });
   }
   tokens = groupDigitTokens(tokens);
+  tokens = markKhmerSentencePunctuationKnown(tokens);
   return {
     original: text,
     normalized,
     tokens
   };
 }
+function markKhmerSentencePunctuationKnown(tokens) {
+  return tokens.map(
+    (token) => isKhmerSentencePunctuationToken(token.value) ? { ...token, isKnown: true } : token
+  );
+}
 // src/dictionary/trie.ts
 var TrieNode = class {

package/dist/index.js — CHANGED

@@ -9,6 +9,9 @@ var DEPENDENT_VOWEL_START = 6068;
 var DEPENDENT_VOWEL_END = 6085;
 var SIGN_START = 6086;
 var SIGN_END = 6099;
+var KHMER_PUNCT_KHAN = 6100;
+var KHMER_PUNCT_BARIYOOSAN = 6101;
+var KHMER_PUNCT_CAMNUC_PII_KUUH = 6102;
 var KHMER_COENG = 6098;
 var DIGIT_START = 6112;
 var DIGIT_END = 6121;
@@ -46,6 +49,12 @@ function isAsciiDigit(cp) {
 function isDigit(cp) {
   return isKhmerDigit(cp) || isAsciiDigit(cp);
 }
+function isKhmerSentencePunctuation(cp) {
+  return cp === KHMER_PUNCT_KHAN || cp === KHMER_PUNCT_BARIYOOSAN || cp === KHMER_PUNCT_CAMNUC_PII_KUUH;
+}
+function isKhmerSentencePunctuationToken(value) {
+  return value.length === 1 && isKhmerSentencePunctuation(value.codePointAt(0));
+}
 function isClusterBase(cp) {
   return isConsonant(cp) || isIndependentVowel(cp);
 }
@@ -408,7 +417,7 @@ function viterbiSegment(clusters, dictionary, options) {
       if (cost < dp[j]) {
         dp[j] = cost;
         from[j] = i;
-        fromKnown[j] = false;
+        fromKnown[j] = true;
       }
       continue;
     }
@@ -417,7 +426,7 @@ function viterbiSegment(clusters, dictionary, options) {
       if (cost < dp[i + 1]) {
         dp[i + 1] = cost;
         from[i + 1] = i;
-        fromKnown[i + 1] = false;
+        fromKnown[i + 1] = isKhmerSentencePunctuation(cp);
       }
       continue;
     }
@@ -538,15 +547,13 @@ function groupDigitTokens(tokens) {
       const start = tokens[i].start;
       let combined = tokens[i].value;
       let end = tokens[i].end;
-      let known = tokens[i].isKnown;
       i++;
       while (i < tokens.length && isDigitStr(tokens[i].value)) {
         combined += tokens[i].value;
         end = tokens[i].end;
-        known = known || tokens[i].isKnown;
         i++;
       }
-      result.push({ value: combined, start, end, isKnown: known });
+      result.push({ value: combined, start, end, isKnown: true });
     } else {
       result.push(tokens[i]);
       i++;
@@ -590,12 +597,18 @@ function segmentWords(text, options) {
     });
   }
   tokens = groupDigitTokens(tokens);
+  tokens = markKhmerSentencePunctuationKnown(tokens);
   return {
     original: text,
     normalized,
     tokens
   };
 }
+function markKhmerSentencePunctuationKnown(tokens) {
+  return tokens.map(
+    (token) => isKhmerSentencePunctuationToken(token.value) ? { ...token, isKnown: true } : token
+  );
+}
 // src/dictionary/trie.ts
 var TrieNode = class {

package/package.json — CHANGED

@@ -1,6 +1,6 @@
 {
     "name": "khmer-segment",
-    "version": "0.3.0",
+    "version": "0.3.2",
     "description": "Khmer text segmentation, normalization, and cluster utilities for JavaScript and TypeScript.",
     "type": "module",
     "main": "./dist/index.cjs",