npm - @shaxpir/duiduidui-models - Versions diffs - 1.16.0 → 1.17.1 - Mend

@shaxpir/duiduidui-models 1.16.0 → 1.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/dist/models/Phrase.d.ts +0 -1
package/dist/models/Term.d.ts +0 -1
package/dist/models/Term.js +0 -2
package/dist/util/PinyinParser.d.ts +13 -3
package/dist/util/PinyinParser.js +48 -13
package/dist/util/PinyinValidator.d.ts +4 -1
package/dist/util/PinyinValidator.js +8 -1
package/package.json +1 -1

package/dist/models/Phrase.d.ts CHANGED Viewed

@@ -17,7 +17,6 @@ export interface Phrase {
     sense_rank: number;
     difficulty: number;
     pinyin: string;
-    pinyin_tokenized: string;
     transliteration: string;
     translation: string;
     notes: string;

package/dist/models/Term.d.ts CHANGED Viewed

@@ -42,7 +42,6 @@ export interface TermPayload extends BayesianScore {
     implied_review_count: number;
     hanzi_count?: number;
     pinyin?: string;
-    pinyin_tokenized?: string;
     transliteration?: string;
     translation?: string;
     notes?: string;

package/dist/models/Term.js CHANGED Viewed

@@ -38,7 +38,6 @@ class Term extends Content_1.Content {
                 difficulty: difficulty,
                 hanzi_count: textOrPhrase.length,
                 pinyin: '',
-                pinyin_tokenized: '',
                 transliteration: '',
                 translation: '',
                 notes: '',
@@ -71,7 +70,6 @@ class Term extends Content_1.Content {
                 hanzi_count: phrase.hanzi_count,
                 difficulty: phrase.difficulty,
                 pinyin: phrase.pinyin,
-                pinyin_tokenized: phrase.pinyin_tokenized,
                 transliteration: phrase.transliteration,
                 translation: phrase.translation,
                 notes: phrase.notes,

package/dist/util/PinyinParser.d.ts CHANGED Viewed

@@ -37,7 +37,10 @@ export declare const PinyinParser: {
      * Take a raw pinyin string (possibly with punctuation, spaces, apostrophes,
      * and erhua) and return it with proper syllable spacing.
      *
-     * Preserves original tone marks and casing. Preserves punctuation in place.
+     * The output is lowercased with punctuation stripped, producing clean
+     * space-separated syllables suitable for search indexing and tokenization.
+     * Tone marks are preserved.
+     *
      * Throws if any letter sequence cannot be parsed as pinyin.
      *
      * Examples:
@@ -45,7 +48,8 @@ export declare const PinyinParser: {
      *   'nǎr'           → 'nǎ er'
      *   "xī'ān"         → 'xī ān'
      *   "gē'rmen"       → 'gē er men'
-     *   'nǐhǎo，shìjiè' → 'nǐ hǎo，shì jiè'
+     *   'Nǐ hǎo!'       → 'nǐ hǎo'
+     *   'tā shēng bìng le, jīng cháng' → 'tā shēng bìng le jīng cháng'
      */
     ensurePinyinSpacing(text: string): string;
     /**
@@ -58,7 +62,13 @@ export declare const PinyinParser: {
      */
     parseWithSpecialCases(text: string): PinyinParseResult[];
     /**
-     * Get the best parsing from multiple options
+     * Get the best parsing from multiple options.
+     *
+     * Primary criterion: fewer syllables (more natural word-level grouping).
+     * Tiebreaker: when two parses have the same syllable count, prefer the
+     * one whose first syllable is shorter. This avoids greedy long matches
+     * (e.g. "dàng" in "dàngāo") that leave uncommon standalone-vowel
+     * remainders (e.g. "āo") when a more balanced split exists ("dàn gāo").
      */
     getBestParsing(results: PinyinParseResult[]): PinyinParseResult | null;
     /**

package/dist/util/PinyinParser.js CHANGED Viewed

@@ -59,8 +59,8 @@ exports.PinyinParser = {
         if (!text || text.length === 0)
             return [];
         let normalized = this.normalizeApostrophes(text.toLowerCase().trim());
-        // Expand 'r (erhua via apostrophe) to 'er before splitting
-        normalized = normalized.replace(/'r/g, "'er");
+        // Expand 'r (erhua via apostrophe) to 'er, but not when followed by a vowel
+        normalized = normalized.replace(/'r(?![aeiouāáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜüv])/gi, "'er");
         if (normalized.includes("'")) {
             return this._parseApostropheSplit(normalized, true);
         }
@@ -70,7 +70,10 @@ exports.PinyinParser = {
      * Take a raw pinyin string (possibly with punctuation, spaces, apostrophes,
      * and erhua) and return it with proper syllable spacing.
      *
-     * Preserves original tone marks and casing. Preserves punctuation in place.
+     * The output is lowercased with punctuation stripped, producing clean
+     * space-separated syllables suitable for search indexing and tokenization.
+     * Tone marks are preserved.
+     *
      * Throws if any letter sequence cannot be parsed as pinyin.
      *
      * Examples:
@@ -78,13 +81,15 @@ exports.PinyinParser = {
      *   'nǎr'           → 'nǎ er'
      *   "xī'ān"         → 'xī ān'
      *   "gē'rmen"       → 'gē er men'
-     *   'nǐhǎo，shìjiè' → 'nǐ hǎo，shì jiè'
+     *   'Nǐ hǎo!'       → 'nǐ hǎo'
+     *   'tā shēng bìng le, jīng cháng' → 'tā shēng bìng le jīng cháng'
      */
     ensurePinyinSpacing(text) {
-        // Normalize apostrophes
-        text = this.normalizeApostrophes(text);
-        // Expand 'r (erhua via apostrophe) to 'er, then replace apostrophes with spaces
-        text = text.replace(/'r/g, "'er");
+        // Normalize apostrophes and lowercase
+        text = this.normalizeApostrophes(text).toLowerCase();
+        // Expand 'r (erhua via apostrophe) to 'er, but not when 'r is followed by
+        // a vowel (which would mean 'r starts a syllable like 'rén, not erhua).
+        text = text.replace(/'r(?![aeiouāáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜüv])/gi, "'er");
         text = text.replace(/'/g, ' ');
         text = text.replace(/ +/g, ' ');
         const parts = [];
@@ -106,14 +111,27 @@ exports.PinyinParser = {
                 i = j;
             }
             else {
+                // Non-letter run: preserve ellipsis (…) as a token (used in patterns
+                // like 太…了), but strip all other punctuation and collapse to a space.
                 let j = i;
                 while (j < text.length && !isLetter(text[j]))
                     j++;
-                parts.push(text.substring(i, j));
+                const nonLetterRun = text.substring(i, j);
+                if (nonLetterRun.includes('…')) {
+                    // Preserve ellipsis with surrounding spaces
+                    if (parts.length > 0)
+                        parts.push(' ');
+                    parts.push('…');
+                    if (j < text.length)
+                        parts.push(' ');
+                }
+                else if (parts.length > 0 && j < text.length) {
+                    parts.push(' ');
+                }
                 i = j;
             }
         }
-        return parts.join('');
+        return parts.join('').trim();
     },
     /**
      * Check if a string could be compound pinyin
@@ -153,13 +171,30 @@ exports.PinyinParser = {
         return this._parseDP(normalized, false);
     },
     /**
-     * Get the best parsing from multiple options
+     * Get the best parsing from multiple options.
+     *
+     * Primary criterion: fewer syllables (more natural word-level grouping).
+     * Tiebreaker: when two parses have the same syllable count, prefer the
+     * one whose first syllable is shorter. This avoids greedy long matches
+     * (e.g. "dàng" in "dàngāo") that leave uncommon standalone-vowel
+     * remainders (e.g. "āo") when a more balanced split exists ("dàn gāo").
      */
     getBestParsing(results) {
         if (results.length === 0)
             return null;
-        // Prefer fewer syllables (more natural parsing)
-        results.sort((a, b) => a.syllables.length - b.syllables.length);
+        results.sort((a, b) => {
+            // Primary: fewer syllables
+            if (a.syllables.length !== b.syllables.length) {
+                return a.syllables.length - b.syllables.length;
+            }
+            // Tiebreaker: shorter first syllable
+            for (let i = 0; i < a.syllables.length; i++) {
+                if (a.syllables[i].length !== b.syllables[i].length) {
+                    return a.syllables[i].length - b.syllables[i].length;
+                }
+            }
+            return 0;
+        });
         return results[0];
     },
     /**

package/dist/util/PinyinValidator.d.ts CHANGED Viewed

@@ -24,7 +24,10 @@ export declare const PinyinValidator: {
      */
     removeAccentMarks(text: string): string;
     /**
-     * Check if a string is a valid pinyin syllable (with or without tone marks)
+     * Check if a string is a valid pinyin syllable (with or without tone marks).
+     * A valid single syllable has at most one tone mark — two tone marks means
+     * two syllables have been merged (e.g. "zhùān" looks like "zhuan" after
+     * stripping tones, but the two marks prove it's "zhù" + "ān").
      */
     isValidPinyin(text: string): boolean;
     /**

package/dist/util/PinyinValidator.js CHANGED Viewed

@@ -244,11 +244,18 @@ exports.PinyinValidator = {
         });
     },
     /**
-     * Check if a string is a valid pinyin syllable (with or without tone marks)
+     * Check if a string is a valid pinyin syllable (with or without tone marks).
+     * A valid single syllable has at most one tone mark — two tone marks means
+     * two syllables have been merged (e.g. "zhùān" looks like "zhuan" after
+     * stripping tones, but the two marks prove it's "zhù" + "ān").
      */
     isValidPinyin(text) {
         if (!text || text.length === 0)
             return false;
+        // Count tone marks: a single syllable can have at most one
+        const toneCount = (text.match(TONE_MARKS) || []).length;
+        if (toneCount > 1)
+            return false;
         const normalized = stripToneMarks(text);
         return VALID_SYLLABLES.has(normalized);
     },

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@shaxpir/duiduidui-models",
-  "version": "1.16.0",
+  "version": "1.17.1",
   "repository": {
     "type": "git",
     "url": "https://github.com/shaxpir/duiduidui-models"