@shaxpir/duiduidui-models 1.10.4 → 1.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,14 @@
1
1
  /**
2
- * Utility for validating pinyin tokens
2
+ * Utility for validating pinyin tokens.
3
+ *
4
+ * Uses a curated set of attested Mandarin syllables rather than
5
+ * combinatorial initial+final matching, which would accept non-existent
6
+ * syllables like "ho", "no", "so", "to", "be", "do", "go", "pe".
7
+ *
8
+ * Accepts three spellings for ü syllables: ü, v, and u (where unambiguous
9
+ * or where the initial only pairs with ü). getSearchVariants() expands any
10
+ * of these into all equivalent forms so downstream search covers every
11
+ * convention the database might use.
3
12
  */
4
13
  export declare const PinyinValidator: {
5
14
  /**
@@ -10,6 +19,18 @@ export declare const PinyinValidator: {
10
19
  * Check if a string could be a pinyin token (more lenient, for prefix matching)
11
20
  */
12
21
  couldBePinyinPrefix(text: string): boolean;
22
+ /**
23
+ * Return all u/ü/v spelling variants for a syllable (with or without tone
24
+ * marks). The input syllable itself is always included (tone-stripped).
25
+ * Syllables with no ü ambiguity return a single-element array.
26
+ *
27
+ * Examples:
28
+ * 'lu' → ['lu', 'lü', 'lv']
29
+ * 'lǜ' → ['lü', 'lu', 'lv']
30
+ * 'jv' → ['jv', 'ju', 'jü']
31
+ * 'ba' → ['ba']
32
+ */
33
+ getSearchVariants(text: string): string[];
13
34
  /**
14
35
  * Split a string into potential pinyin tokens (for compound pinyin like "nihao")
15
36
  * Returns empty array if not valid pinyin
@@ -1,42 +1,209 @@
1
1
  "use strict";
2
2
  /**
3
- * Utility for validating pinyin tokens
3
+ * Utility for validating pinyin tokens.
4
+ *
5
+ * Uses a curated set of attested Mandarin syllables rather than
6
+ * combinatorial initial+final matching, which would accept non-existent
7
+ * syllables like "ho", "no", "so", "to", "be", "do", "go", "pe".
8
+ *
9
+ * Accepts three spellings for ü syllables: ü, v, and u (where unambiguous
10
+ * or where the initial only pairs with ü). getSearchVariants() expands any
11
+ * of these into all equivalent forms so downstream search covers every
12
+ * convention the database might use.
4
13
  */
5
14
  Object.defineProperty(exports, "__esModule", { value: true });
6
15
  exports.PinyinValidator = void 0;
7
- // Valid pinyin initials (including empty string for syllables like 'a', 'e')
8
- const INITIALS = new Set([
9
- '', // empty initial
10
- 'b', 'p', 'm', 'f',
11
- 'd', 't', 'n', 'l',
12
- 'g', 'k', 'h',
13
- 'j', 'q', 'x',
14
- 'zh', 'ch', 'sh', 'r',
15
- 'z', 'c', 's',
16
- 'y', 'w'
17
- ]);
18
- // Valid pinyin finals
19
- const FINALS = new Set([
20
- 'a', 'o', 'e', 'i', 'u', 'ü', 'v', // 'v' is often used instead of 'ü'
21
- 'ai', 'ei', 'ui', 'ao', 'ou', 'iu',
22
- 'ie', 'üe', 've', 'ue', 'er',
23
- 'an', 'en', 'in', 'un', 'ün', 'vn',
24
- 'ang', 'eng', 'ing', 'ong',
25
- 'ia', 'iao', 'ian', 'iang', 'iong',
26
- 'ua', 'uo', 'uai', 'uan', 'uang',
27
- 'üan', 'van', 'yuan'
28
- ]);
29
- // Common standalone syllables
30
- const STANDALONE_SYLLABLES = new Set([
16
+ // All valid Mandarin pinyin syllables (toneless, lowercase).
17
+ // Three spellings accepted for ü: ü, v, and u (for j/q/x/y, plus the nue/lue aliases).
18
+ const VALID_SYLLABLES = new Set([
19
+ // === Zero-initial (standalone vowel syllables) ===
31
20
  'a', 'ai', 'an', 'ang', 'ao',
32
21
  'e', 'ei', 'en', 'eng', 'er',
33
22
  'o', 'ou',
34
- 'yi', 'ya', 'yao', 'ye', 'you', 'yan', 'yang', 'yin', 'ying', 'yong',
35
- 'wu', 'wa', 'wo', 'wai', 'wei', 'wan', 'wang', 'wen', 'weng',
36
- 'yu', 'yue', 'yuan', 'yun'
23
+ // === y- initial (represents i-/ü- standalone) ===
24
+ 'ya', 'yao', 'yan', 'yang',
25
+ 'ye', 'yi', 'yin', 'ying',
26
+ 'yo',
27
+ 'yong', 'you',
28
+ 'yu', 'yuan', 'yue', 'yun', // standard u spelling (actually ü)
29
+ 'yü', 'yüan', 'yüe', 'yün', // explicit ü spelling
30
+ 'yv', 'yvan', 'yve', 'yvn', // v-as-ü spelling
31
+ // === w- initial (represents u- standalone) ===
32
+ 'wa', 'wai', 'wan', 'wang',
33
+ 'wei', 'wen', 'weng',
34
+ 'wo', 'wu',
35
+ // === b- ===
36
+ 'ba', 'bai', 'ban', 'bang', 'bao',
37
+ 'bei', 'ben', 'beng',
38
+ 'bi', 'bian', 'biao', 'bie', 'bin', 'bing',
39
+ 'bo', 'bu',
40
+ // === p- ===
41
+ 'pa', 'pai', 'pan', 'pang', 'pao',
42
+ 'pei', 'pen', 'peng',
43
+ 'pi', 'pian', 'piao', 'pie', 'pin', 'ping',
44
+ 'po', 'pou', 'pu',
45
+ // === m- ===
46
+ 'ma', 'mai', 'man', 'mang', 'mao',
47
+ 'mei', 'men', 'meng',
48
+ 'mi', 'mian', 'miao', 'mie', 'min', 'ming',
49
+ 'miu',
50
+ 'mo', 'mou', 'mu',
51
+ // === f- ===
52
+ 'fa', 'fan', 'fang',
53
+ 'fei', 'fen', 'feng',
54
+ 'fo', 'fou', 'fu',
55
+ // === d- ===
56
+ 'da', 'dai', 'dan', 'dang', 'dao',
57
+ 'de', 'dei', 'den', 'deng',
58
+ 'di', 'dia', 'dian', 'diao', 'die', 'ding', 'diu',
59
+ 'dong', 'dou', 'du', 'duan', 'dui', 'dun', 'duo',
60
+ // === t- ===
61
+ 'ta', 'tai', 'tan', 'tang', 'tao',
62
+ 'te', 'tei', 'teng',
63
+ 'ti', 'tian', 'tiao', 'tie', 'ting',
64
+ 'tong', 'tou', 'tu', 'tuan', 'tui', 'tun', 'tuo',
65
+ // === n- ===
66
+ 'na', 'nai', 'nan', 'nang', 'nao',
67
+ 'ne', 'nei', 'nen', 'neng',
68
+ 'ni', 'nian', 'niang', 'niao', 'nie', 'nin', 'ning', 'niu',
69
+ 'nong', 'nou', 'nu', 'nuan', 'nuo',
70
+ 'nü', 'nüe',
71
+ 'nv', 'nve', // v-as-ü
72
+ 'nue', // u-as-ü (beginner-friendly alias for nüe)
73
+ // === l- ===
74
+ 'la', 'lai', 'lan', 'lang', 'lao',
75
+ 'le', 'lei', 'leng',
76
+ 'li', 'lia', 'lian', 'liang', 'liao', 'lie', 'lin', 'ling', 'liu',
77
+ 'lo',
78
+ 'long', 'lou', 'lu', 'luan', 'lun', 'luo',
79
+ 'lü', 'lüe',
80
+ 'lv', 'lve', // v-as-ü
81
+ 'lue', // u-as-ü (beginner-friendly alias for lüe)
82
+ // === g- ===
83
+ 'ga', 'gai', 'gan', 'gang', 'gao',
84
+ 'ge', 'gei', 'gen', 'geng',
85
+ 'gong', 'gou', 'gu', 'gua', 'guai', 'guan', 'guang', 'gui', 'gun', 'guo',
86
+ // === k- ===
87
+ 'ka', 'kai', 'kan', 'kang', 'kao',
88
+ 'ke', 'kei', 'ken', 'keng',
89
+ 'kong', 'kou', 'ku', 'kua', 'kuai', 'kuan', 'kuang', 'kui', 'kun', 'kuo',
90
+ // === h- ===
91
+ 'ha', 'hai', 'han', 'hang', 'hao',
92
+ 'he', 'hei', 'hen', 'heng',
93
+ 'hong', 'hou', 'hu', 'hua', 'huai', 'huan', 'huang', 'hui', 'hun', 'huo',
94
+ // === j- (u is always ü phonetically) ===
95
+ 'ji', 'jia', 'jian', 'jiang', 'jiao', 'jie', 'jin', 'jing', 'jiong', 'jiu',
96
+ 'ju', 'juan', 'jue', 'jun', // standard u spelling
97
+ 'jü', 'jüan', 'jüe', 'jün', // explicit ü spelling
98
+ 'jv', 'jvan', 'jve', 'jvn', // v-as-ü spelling
99
+ // === q- (u is always ü phonetically) ===
100
+ 'qi', 'qia', 'qian', 'qiang', 'qiao', 'qie', 'qin', 'qing', 'qiong', 'qiu',
101
+ 'qu', 'quan', 'que', 'qun', // standard u spelling
102
+ 'qü', 'qüan', 'qüe', 'qün', // explicit ü spelling
103
+ 'qv', 'qvan', 'qve', 'qvn', // v-as-ü spelling
104
+ // === x- (u is always ü phonetically) ===
105
+ 'xi', 'xia', 'xian', 'xiang', 'xiao', 'xie', 'xin', 'xing', 'xiong', 'xiu',
106
+ 'xu', 'xuan', 'xue', 'xun', // standard u spelling
107
+ 'xü', 'xüan', 'xüe', 'xün', // explicit ü spelling
108
+ 'xv', 'xvan', 'xve', 'xvn', // v-as-ü spelling
109
+ // === zh- ===
110
+ 'zha', 'zhai', 'zhan', 'zhang', 'zhao',
111
+ 'zhe', 'zhei', 'zhen', 'zheng',
112
+ 'zhi',
113
+ 'zhong', 'zhou', 'zhu', 'zhua', 'zhuai', 'zhuan', 'zhuang', 'zhui', 'zhun', 'zhuo',
114
+ // === ch- ===
115
+ 'cha', 'chai', 'chan', 'chang', 'chao',
116
+ 'che', 'chen', 'cheng',
117
+ 'chi',
118
+ 'chong', 'chou', 'chu', 'chua', 'chuai', 'chuan', 'chuang', 'chui', 'chun', 'chuo',
119
+ // === sh- ===
120
+ 'sha', 'shai', 'shan', 'shang', 'shao',
121
+ 'she', 'shei', 'shen', 'sheng',
122
+ 'shi',
123
+ 'shou', 'shu', 'shua', 'shuai', 'shuan', 'shuang', 'shui', 'shun', 'shuo',
124
+ // === r- ===
125
+ 'ran', 'rang', 'rao',
126
+ 're', 'ren', 'reng',
127
+ 'ri',
128
+ 'rong', 'rou', 'ru', 'rua', 'ruan', 'rui', 'run', 'ruo',
129
+ // === z- ===
130
+ 'za', 'zai', 'zan', 'zang', 'zao',
131
+ 'ze', 'zei', 'zen', 'zeng',
132
+ 'zi',
133
+ 'zong', 'zou', 'zu', 'zuan', 'zui', 'zun', 'zuo',
134
+ // === c- ===
135
+ 'ca', 'cai', 'can', 'cang', 'cao',
136
+ 'ce', 'cen', 'ceng',
137
+ 'ci',
138
+ 'cong', 'cou', 'cu', 'cuan', 'cui', 'cun', 'cuo',
139
+ // === s- ===
140
+ 'sa', 'sai', 'san', 'sang', 'sao',
141
+ 'se', 'sen', 'seng',
142
+ 'si',
143
+ 'song', 'sou', 'su', 'suan', 'sui', 'sun', 'suo',
37
144
  ]);
145
+ // Precompute all valid prefixes for O(1) prefix lookup
146
+ const VALID_PREFIXES = (() => {
147
+ const prefixes = new Set();
148
+ for (const syllable of VALID_SYLLABLES) {
149
+ for (let i = 1; i <= syllable.length; i++) {
150
+ prefixes.add(syllable.substring(0, i));
151
+ }
152
+ }
153
+ return prefixes;
154
+ })();
155
+ // ---------------------------------------------------------------------------
156
+ // Precompute u ↔ ü ↔ v variant groups.
157
+ //
158
+ // Two syllables are "umlaut-equivalent" when one can be obtained from the
159
+ // other by swapping every u ↔ ü, u ↔ v, or ü ↔ v. For example:
160
+ // lu ↔ lü ↔ lv (n/l initials — both u and ü exist as distinct words)
161
+ // ju ↔ jü ↔ jv (j/q/x/y — u IS ü, different spellings of same sound)
162
+ // bu → [bu] (b+u has no ü counterpart, so no expansion)
163
+ // ---------------------------------------------------------------------------
164
+ const UMLAUT_VARIANT_MAP = (() => {
165
+ const map = new Map();
166
+ for (const syllable of VALID_SYLLABLES) {
167
+ if (map.has(syllable))
168
+ continue;
169
+ const variants = new Set();
170
+ variants.add(syllable);
171
+ // Try each possible single-character substitution
172
+ const substitutions = [
173
+ [/u/g, 'ü'], [/u/g, 'v'],
174
+ [/ü/g, 'u'], [/ü/g, 'v'],
175
+ [/v/g, 'u'], [/v/g, 'ü'],
176
+ ];
177
+ for (const [pattern, replacement] of substitutions) {
178
+ const candidate = syllable.replace(pattern, replacement);
179
+ if (candidate !== syllable && VALID_SYLLABLES.has(candidate)) {
180
+ variants.add(candidate);
181
+ }
182
+ }
183
+ const variantArray = Array.from(variants);
184
+ for (const v of variantArray) {
185
+ map.set(v, variantArray);
186
+ }
187
+ }
188
+ return map;
189
+ })();
38
190
  // Tone marks that might appear in pinyin (including v with combining tone marks)
39
191
  const TONE_MARKS = /[āáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜ]|v[\u0301\u030C\u0300]?/g;
192
+ const TONE_MAP = {
193
+ 'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
194
+ 'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e',
195
+ 'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
196
+ 'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o',
197
+ 'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u',
198
+ 'ǖ': 'ü', 'ǘ': 'ü', 'ǚ': 'ü', 'ǜ': 'ü'
199
+ };
200
+ function stripToneMarks(text) {
201
+ return text.toLowerCase().replace(TONE_MARKS, (match) => {
202
+ if (match.startsWith('v'))
203
+ return 'v';
204
+ return TONE_MAP[match] || match;
205
+ });
206
+ }
40
207
  exports.PinyinValidator = {
41
208
  /**
42
209
  * Check if a string is a valid pinyin syllable (with or without tone marks)
@@ -44,53 +211,8 @@ exports.PinyinValidator = {
44
211
  isValidPinyin(text) {
45
212
  if (!text || text.length === 0)
46
213
  return false;
47
- // Convert to lowercase and remove tone marks for validation
48
- const normalized = text.toLowerCase().replace(TONE_MARKS, (match) => {
49
- // Convert tone marks back to base vowels
50
- const toneMap = {
51
- 'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
52
- 'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e',
53
- 'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
54
- 'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o',
55
- 'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u',
56
- 'ǖ': 'ü', 'ǘ': 'ü', 'ǚ': 'ü', 'ǜ': 'ü'
57
- };
58
- // Handle v with combining tone marks
59
- if (match.startsWith('v')) {
60
- return 'v';
61
- }
62
- return toneMap[match] || match;
63
- });
64
- // Check if it's a standalone syllable
65
- if (STANDALONE_SYLLABLES.has(normalized)) {
66
- return true;
67
- }
68
- // Try to parse as initial + final
69
- // Check longest possible initial first (2 chars)
70
- if (normalized.length >= 2) {
71
- const possibleInitial2 = normalized.substring(0, 2);
72
- if (INITIALS.has(possibleInitial2)) {
73
- const remaining = normalized.substring(2);
74
- if (FINALS.has(remaining)) {
75
- return true;
76
- }
77
- }
78
- }
79
- // Check single character initial
80
- if (normalized.length >= 1) {
81
- const possibleInitial1 = normalized.substring(0, 1);
82
- if (INITIALS.has(possibleInitial1)) {
83
- const remaining = normalized.substring(1);
84
- if (FINALS.has(remaining)) {
85
- return true;
86
- }
87
- }
88
- }
89
- // Check if the whole string is a valid final (for syllables without initials)
90
- if (INITIALS.has('') && FINALS.has(normalized)) {
91
- return true;
92
- }
93
- return false;
214
+ const normalized = stripToneMarks(text);
215
+ return VALID_SYLLABLES.has(normalized);
94
216
  },
95
217
  /**
96
218
  * Check if a string could be a pinyin token (more lenient, for prefix matching)
@@ -98,45 +220,31 @@ exports.PinyinValidator = {
98
220
  couldBePinyinPrefix(text) {
99
221
  if (!text || text.length === 0)
100
222
  return false;
101
- const normalized = text.toLowerCase().replace(TONE_MARKS, (match) => {
102
- const toneMap = {
103
- 'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
104
- 'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e',
105
- 'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
106
- 'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o',
107
- 'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u',
108
- 'ǖ': 'ü', 'ǘ': 'ü', 'ǚ': 'ü', 'ǜ': 'ü'
109
- };
110
- // Handle v with combining tone marks
111
- if (match.startsWith('v')) {
112
- return 'v';
113
- }
114
- return toneMap[match] || match;
115
- });
116
- // Check if it's already valid pinyin
117
- if (this.isValidPinyin(text))
118
- return true;
119
- // Check if any standalone syllable starts with this prefix
120
- for (const syllable of STANDALONE_SYLLABLES) {
121
- if (syllable.startsWith(normalized))
122
- return true;
123
- }
124
- // Check if it could be the start of initial + final combination
125
- // Check if it matches any initial exactly or partially
126
- for (const initial of INITIALS) {
127
- if (initial.startsWith(normalized) || normalized.startsWith(initial)) {
128
- return true;
129
- }
130
- }
131
- return false;
223
+ const normalized = stripToneMarks(text);
224
+ return VALID_PREFIXES.has(normalized);
225
+ },
226
+ /**
227
+ * Return all u/ü/v spelling variants for a syllable (with or without tone
228
+ * marks). The input syllable itself is always included (tone-stripped).
229
+ * Syllables with no ü ambiguity return a single-element array.
230
+ *
231
+ * Examples:
232
+ * 'lu' → ['lu', 'lü', 'lv']
233
+ * 'lǜ' → ['lü', 'lu', 'lv']
234
+ * 'jv' → ['jv', 'ju', 'jü']
235
+ * 'ba' → ['ba']
236
+ */
237
+ getSearchVariants(text) {
238
+ if (!text || text.length === 0)
239
+ return [];
240
+ const normalized = stripToneMarks(text);
241
+ return UMLAUT_VARIANT_MAP.get(normalized) || (VALID_SYLLABLES.has(normalized) ? [normalized] : []);
132
242
  },
133
243
  /**
134
244
  * Split a string into potential pinyin tokens (for compound pinyin like "nihao")
135
245
  * Returns empty array if not valid pinyin
136
246
  */
137
247
  splitPinyinTokens(text) {
138
- // This is a simplified version - full implementation would need
139
- // more sophisticated parsing to handle ambiguous cases
140
248
  const tokens = [];
141
249
  const normalized = text.toLowerCase();
142
250
  let remaining = normalized;
@@ -65,6 +65,11 @@ export declare class SearchTokenizer {
65
65
  * Classifies a single token
66
66
  */
67
67
  private static classifyToken;
68
+ /**
69
+ * Expand parsed pinyin syllables to include all u/ü/v spelling variants
70
+ * so that downstream search covers every convention the database might use.
71
+ */
72
+ private static expandPinyinVariants;
68
73
  /**
69
74
  * Checks if a string contains only hanzi characters
70
75
  */
@@ -2,6 +2,7 @@
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.SearchTokenizer = exports.TokenType = void 0;
4
4
  const PinyinParser_1 = require("./PinyinParser");
5
+ const PinyinValidator_1 = require("./PinyinValidator");
5
6
  /**
6
7
  * Token types for search query tokenization
7
8
  */
@@ -216,7 +217,7 @@ class SearchTokenizer {
216
217
  type: TokenType.PINYIN,
217
218
  normalized,
218
219
  isPossiblePinyin: true,
219
- pinyinVariants: pinyinWithApostrophe
220
+ pinyinVariants: this.expandPinyinVariants(pinyinWithApostrophe)
220
221
  };
221
222
  }
222
223
  else {
@@ -227,7 +228,7 @@ class SearchTokenizer {
227
228
  type: TokenType.AMBIGUOUS,
228
229
  normalized,
229
230
  isPossiblePinyin: true,
230
- pinyinVariants: pinyinWithApostrophe
231
+ pinyinVariants: this.expandPinyinVariants(pinyinWithApostrophe)
231
232
  };
232
233
  }
233
234
  }
@@ -248,7 +249,7 @@ class SearchTokenizer {
248
249
  type: TokenType.PINYIN,
249
250
  normalized,
250
251
  isPossiblePinyin: true,
251
- pinyinVariants: pinyinParsing
252
+ pinyinVariants: this.expandPinyinVariants(pinyinParsing)
252
253
  };
253
254
  }
254
255
  }
@@ -261,7 +262,7 @@ class SearchTokenizer {
261
262
  type: TokenType.AMBIGUOUS,
262
263
  normalized,
263
264
  isPossiblePinyin: true,
264
- pinyinVariants: pinyinParsing
265
+ pinyinVariants: this.expandPinyinVariants(pinyinParsing)
265
266
  };
266
267
  }
267
268
  // Default to English
@@ -271,6 +272,20 @@ class SearchTokenizer {
271
272
  normalized
272
273
  };
273
274
  }
275
+ /**
276
+ * Expand parsed pinyin syllables to include all u/ü/v spelling variants
277
+ * so that downstream search covers every convention the database might use.
278
+ */
279
+ static expandPinyinVariants(syllables) {
280
+ const result = new Set();
281
+ for (const syllable of syllables) {
282
+ result.add(syllable); // Keep original (possibly with tone marks)
283
+ for (const variant of PinyinValidator_1.PinyinValidator.getSearchVariants(syllable)) {
284
+ result.add(variant);
285
+ }
286
+ }
287
+ return Array.from(result);
288
+ }
274
289
  /**
275
290
  * Checks if a string contains only hanzi characters
276
291
  */
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@shaxpir/duiduidui-models",
3
- "version": "1.10.4",
3
+ "version": "1.11.0",
4
4
  "repository": {
5
5
  "type": "git",
6
6
  "url": "https://github.com/shaxpir/duiduidui-models"