@shaxpir/duiduidui-models 1.10.4 → 1.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,14 @@
1
1
  /**
2
- * Utility for validating pinyin tokens
2
+ * Utility for validating pinyin tokens.
3
+ *
4
+ * Uses a curated set of attested Mandarin syllables rather than
5
+ * combinatorial initial+final matching, which would accept non-existent
6
+ * syllables like "ho", "no", "so", "to", "be", "do", "go", "pe".
7
+ *
8
+ * Accepts three spellings for ü syllables: ü, v, and u (where unambiguous
9
+ * or where the initial only pairs with ü). getSearchVariants() expands any
10
+ * of these into all equivalent forms so downstream search covers every
11
+ * convention the database might use.
3
12
  */
4
13
  export declare const PinyinValidator: {
5
14
  /**
@@ -10,6 +19,18 @@ export declare const PinyinValidator: {
10
19
  * Check if a string could be a pinyin token (more lenient, for prefix matching)
11
20
  */
12
21
  couldBePinyinPrefix(text: string): boolean;
22
+ /**
23
+ * Return all u/ü/v spelling variants for a syllable (with or without tone
24
+ * marks). The input syllable itself is always included (tone-stripped).
25
+ * Syllables with no ü ambiguity return a single-element array.
26
+ *
27
+ * Examples:
28
+ * 'lu' → ['lu', 'lü', 'lv']
29
+ * 'lǜ' → ['lü', 'lu', 'lv']
30
+ * 'jv' → ['jv', 'ju', 'jü']
31
+ * 'ba' → ['ba']
32
+ */
33
+ getSearchVariants(text: string): string[];
13
34
  /**
14
35
  * Split a string into potential pinyin tokens (for compound pinyin like "nihao")
15
36
  * Returns empty array if not valid pinyin
@@ -1,42 +1,209 @@
1
1
  "use strict";
2
2
  /**
3
- * Utility for validating pinyin tokens
3
+ * Utility for validating pinyin tokens.
4
+ *
5
+ * Uses a curated set of attested Mandarin syllables rather than
6
+ * combinatorial initial+final matching, which would accept non-existent
7
+ * syllables like "ho", "no", "so", "to", "be", "do", "go", "pe".
8
+ *
9
+ * Accepts three spellings for ü syllables: ü, v, and u (where unambiguous
10
+ * or where the initial only pairs with ü). getSearchVariants() expands any
11
+ * of these into all equivalent forms so downstream search covers every
12
+ * convention the database might use.
4
13
  */
5
14
  Object.defineProperty(exports, "__esModule", { value: true });
6
15
  exports.PinyinValidator = void 0;
7
- // Valid pinyin initials (including empty string for syllables like 'a', 'e')
8
- const INITIALS = new Set([
9
- '', // empty initial
10
- 'b', 'p', 'm', 'f',
11
- 'd', 't', 'n', 'l',
12
- 'g', 'k', 'h',
13
- 'j', 'q', 'x',
14
- 'zh', 'ch', 'sh', 'r',
15
- 'z', 'c', 's',
16
- 'y', 'w'
17
- ]);
18
- // Valid pinyin finals
19
- const FINALS = new Set([
20
- 'a', 'o', 'e', 'i', 'u', 'ü', 'v', // 'v' is often used instead of 'ü'
21
- 'ai', 'ei', 'ui', 'ao', 'ou', 'iu',
22
- 'ie', 'üe', 've', 'ue', 'er',
23
- 'an', 'en', 'in', 'un', 'ün', 'vn',
24
- 'ang', 'eng', 'ing', 'ong',
25
- 'ia', 'iao', 'ian', 'iang', 'iong',
26
- 'ua', 'uo', 'uai', 'uan', 'uang',
27
- 'üan', 'van', 'yuan'
28
- ]);
29
- // Common standalone syllables
30
- const STANDALONE_SYLLABLES = new Set([
16
+ // All valid Mandarin pinyin syllables (toneless, lowercase).
17
+ // Three spellings accepted for ü: ü, v, and u (for j/q/x/y, plus the nue/lue aliases).
18
+ const VALID_SYLLABLES = new Set([
19
+ // === Zero-initial (standalone vowel syllables) ===
31
20
  'a', 'ai', 'an', 'ang', 'ao',
32
21
  'e', 'ei', 'en', 'eng', 'er',
33
22
  'o', 'ou',
34
- 'yi', 'ya', 'yao', 'ye', 'you', 'yan', 'yang', 'yin', 'ying', 'yong',
35
- 'wu', 'wa', 'wo', 'wai', 'wei', 'wan', 'wang', 'wen', 'weng',
36
- 'yu', 'yue', 'yuan', 'yun'
23
+ // === y- initial (represents i-/ü- standalone) ===
24
+ 'ya', 'yao', 'yan', 'yang',
25
+ 'ye', 'yi', 'yin', 'ying',
26
+ 'yo',
27
+ 'yong', 'you',
28
+ 'yu', 'yuan', 'yue', 'yun', // standard u spelling (actually ü)
29
+ 'yü', 'yüan', 'yüe', 'yün', // explicit ü spelling
30
+ 'yv', 'yvan', 'yve', 'yvn', // v-as-ü spelling
31
+ // === w- initial (represents u- standalone) ===
32
+ 'wa', 'wai', 'wan', 'wang',
33
+ 'wei', 'wen', 'weng',
34
+ 'wo', 'wu',
35
+ // === b- ===
36
+ 'ba', 'bai', 'ban', 'bang', 'bao',
37
+ 'bei', 'ben', 'beng',
38
+ 'bi', 'bian', 'biao', 'bie', 'bin', 'bing',
39
+ 'bo', 'bu',
40
+ // === p- ===
41
+ 'pa', 'pai', 'pan', 'pang', 'pao',
42
+ 'pei', 'pen', 'peng',
43
+ 'pi', 'pian', 'piao', 'pie', 'pin', 'ping',
44
+ 'po', 'pou', 'pu',
45
+ // === m- ===
46
+ 'ma', 'mai', 'man', 'mang', 'mao',
47
+ 'mei', 'men', 'meng',
48
+ 'mi', 'mian', 'miao', 'mie', 'min', 'ming',
49
+ 'miu',
50
+ 'mo', 'mou', 'mu',
51
+ // === f- ===
52
+ 'fa', 'fan', 'fang',
53
+ 'fei', 'fen', 'feng',
54
+ 'fo', 'fou', 'fu',
55
+ // === d- ===
56
+ 'da', 'dai', 'dan', 'dang', 'dao',
57
+ 'de', 'dei', 'den', 'deng',
58
+ 'di', 'dia', 'dian', 'diao', 'die', 'ding', 'diu',
59
+ 'dong', 'dou', 'du', 'duan', 'dui', 'dun', 'duo',
60
+ // === t- ===
61
+ 'ta', 'tai', 'tan', 'tang', 'tao',
62
+ 'te', 'tei', 'teng',
63
+ 'ti', 'tian', 'tiao', 'tie', 'ting',
64
+ 'tong', 'tou', 'tu', 'tuan', 'tui', 'tun', 'tuo',
65
+ // === n- ===
66
+ 'na', 'nai', 'nan', 'nang', 'nao',
67
+ 'ne', 'nei', 'nen', 'neng',
68
+ 'ni', 'nian', 'niang', 'niao', 'nie', 'nin', 'ning', 'niu',
69
+ 'nong', 'nou', 'nu', 'nuan', 'nuo',
70
+ 'nü', 'nüe',
71
+ 'nv', 'nve', // v-as-ü
72
+ 'nue', // u-as-ü (beginner-friendly alias for nüe)
73
+ // === l- ===
74
+ 'la', 'lai', 'lan', 'lang', 'lao',
75
+ 'le', 'lei', 'leng',
76
+ 'li', 'lia', 'lian', 'liang', 'liao', 'lie', 'lin', 'ling', 'liu',
77
+ 'lo',
78
+ 'long', 'lou', 'lu', 'luan', 'lun', 'luo',
79
+ 'lü', 'lüe',
80
+ 'lv', 'lve', // v-as-ü
81
+ 'lue', // u-as-ü (beginner-friendly alias for lüe)
82
+ // === g- ===
83
+ 'ga', 'gai', 'gan', 'gang', 'gao',
84
+ 'ge', 'gei', 'gen', 'geng',
85
+ 'gong', 'gou', 'gu', 'gua', 'guai', 'guan', 'guang', 'gui', 'gun', 'guo',
86
+ // === k- ===
87
+ 'ka', 'kai', 'kan', 'kang', 'kao',
88
+ 'ke', 'kei', 'ken', 'keng',
89
+ 'kong', 'kou', 'ku', 'kua', 'kuai', 'kuan', 'kuang', 'kui', 'kun', 'kuo',
90
+ // === h- ===
91
+ 'ha', 'hai', 'han', 'hang', 'hao',
92
+ 'he', 'hei', 'hen', 'heng',
93
+ 'hong', 'hou', 'hu', 'hua', 'huai', 'huan', 'huang', 'hui', 'hun', 'huo',
94
+ // === j- (u is always ü phonetically) ===
95
+ 'ji', 'jia', 'jian', 'jiang', 'jiao', 'jie', 'jin', 'jing', 'jiong', 'jiu',
96
+ 'ju', 'juan', 'jue', 'jun', // standard u spelling
97
+ 'jü', 'jüan', 'jüe', 'jün', // explicit ü spelling
98
+ 'jv', 'jvan', 'jve', 'jvn', // v-as-ü spelling
99
+ // === q- (u is always ü phonetically) ===
100
+ 'qi', 'qia', 'qian', 'qiang', 'qiao', 'qie', 'qin', 'qing', 'qiong', 'qiu',
101
+ 'qu', 'quan', 'que', 'qun', // standard u spelling
102
+ 'qü', 'qüan', 'qüe', 'qün', // explicit ü spelling
103
+ 'qv', 'qvan', 'qve', 'qvn', // v-as-ü spelling
104
+ // === x- (u is always ü phonetically) ===
105
+ 'xi', 'xia', 'xian', 'xiang', 'xiao', 'xie', 'xin', 'xing', 'xiong', 'xiu',
106
+ 'xu', 'xuan', 'xue', 'xun', // standard u spelling
107
+ 'xü', 'xüan', 'xüe', 'xün', // explicit ü spelling
108
+ 'xv', 'xvan', 'xve', 'xvn', // v-as-ü spelling
109
+ // === zh- ===
110
+ 'zha', 'zhai', 'zhan', 'zhang', 'zhao',
111
+ 'zhe', 'zhei', 'zhen', 'zheng',
112
+ 'zhi',
113
+ 'zhong', 'zhou', 'zhu', 'zhua', 'zhuai', 'zhuan', 'zhuang', 'zhui', 'zhun', 'zhuo',
114
+ // === ch- ===
115
+ 'cha', 'chai', 'chan', 'chang', 'chao',
116
+ 'che', 'chen', 'cheng',
117
+ 'chi',
118
+ 'chong', 'chou', 'chu', 'chua', 'chuai', 'chuan', 'chuang', 'chui', 'chun', 'chuo',
119
+ // === sh- ===
120
+ 'sha', 'shai', 'shan', 'shang', 'shao',
121
+ 'she', 'shei', 'shen', 'sheng',
122
+ 'shi',
123
+ 'shou', 'shu', 'shua', 'shuai', 'shuan', 'shuang', 'shui', 'shun', 'shuo',
124
+ // === r- ===
125
+ 'ran', 'rang', 'rao',
126
+ 're', 'ren', 'reng',
127
+ 'ri',
128
+ 'rong', 'rou', 'ru', 'rua', 'ruan', 'rui', 'run', 'ruo',
129
+ // === z- ===
130
+ 'za', 'zai', 'zan', 'zang', 'zao',
131
+ 'ze', 'zei', 'zen', 'zeng',
132
+ 'zi',
133
+ 'zong', 'zou', 'zu', 'zuan', 'zui', 'zun', 'zuo',
134
+ // === c- ===
135
+ 'ca', 'cai', 'can', 'cang', 'cao',
136
+ 'ce', 'cen', 'ceng',
137
+ 'ci',
138
+ 'cong', 'cou', 'cu', 'cuan', 'cui', 'cun', 'cuo',
139
+ // === s- ===
140
+ 'sa', 'sai', 'san', 'sang', 'sao',
141
+ 'se', 'sen', 'seng',
142
+ 'si',
143
+ 'song', 'sou', 'su', 'suan', 'sui', 'sun', 'suo',
37
144
  ]);
145
+ // Precompute all valid prefixes for O(1) prefix lookup
146
+ const VALID_PREFIXES = (() => {
147
+ const prefixes = new Set();
148
+ for (const syllable of VALID_SYLLABLES) {
149
+ for (let i = 1; i <= syllable.length; i++) {
150
+ prefixes.add(syllable.substring(0, i));
151
+ }
152
+ }
153
+ return prefixes;
154
+ })();
155
+ // ---------------------------------------------------------------------------
156
+ // Precompute u ↔ ü ↔ v variant groups.
157
+ //
158
+ // Two syllables are "umlaut-equivalent" when one can be obtained from the
159
+ // other by swapping every u ↔ ü, u ↔ v, or ü ↔ v. For example:
160
+ // lu ↔ lü ↔ lv (n/l initials — both u and ü exist as distinct words)
161
+ // ju ↔ jü ↔ jv (j/q/x/y — u IS ü, different spellings of same sound)
162
+ // bu → [bu] (b+u has no ü counterpart, so no expansion)
163
+ // ---------------------------------------------------------------------------
164
+ const UMLAUT_VARIANT_MAP = (() => {
165
+ const map = new Map();
166
+ for (const syllable of VALID_SYLLABLES) {
167
+ if (map.has(syllable))
168
+ continue;
169
+ const variants = new Set();
170
+ variants.add(syllable);
171
+ // Try each possible single-character substitution
172
+ const substitutions = [
173
+ [/u/g, 'ü'], [/u/g, 'v'],
174
+ [/ü/g, 'u'], [/ü/g, 'v'],
175
+ [/v/g, 'u'], [/v/g, 'ü'],
176
+ ];
177
+ for (const [pattern, replacement] of substitutions) {
178
+ const candidate = syllable.replace(pattern, replacement);
179
+ if (candidate !== syllable && VALID_SYLLABLES.has(candidate)) {
180
+ variants.add(candidate);
181
+ }
182
+ }
183
+ const variantArray = Array.from(variants);
184
+ for (const v of variantArray) {
185
+ map.set(v, variantArray);
186
+ }
187
+ }
188
+ return map;
189
+ })();
38
190
  // Tone marks that might appear in pinyin (including v with combining tone marks)
39
191
  const TONE_MARKS = /[āáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜ]|v[\u0301\u030C\u0300]?/g;
192
+ const TONE_MAP = {
193
+ 'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
194
+ 'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e',
195
+ 'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
196
+ 'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o',
197
+ 'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u',
198
+ 'ǖ': 'ü', 'ǘ': 'ü', 'ǚ': 'ü', 'ǜ': 'ü'
199
+ };
200
+ function stripToneMarks(text) {
201
+ return text.toLowerCase().replace(TONE_MARKS, (match) => {
202
+ if (match.startsWith('v'))
203
+ return 'v';
204
+ return TONE_MAP[match] || match;
205
+ });
206
+ }
40
207
  exports.PinyinValidator = {
41
208
  /**
42
209
  * Check if a string is a valid pinyin syllable (with or without tone marks)
@@ -44,53 +211,8 @@ exports.PinyinValidator = {
44
211
  isValidPinyin(text) {
45
212
  if (!text || text.length === 0)
46
213
  return false;
47
- // Convert to lowercase and remove tone marks for validation
48
- const normalized = text.toLowerCase().replace(TONE_MARKS, (match) => {
49
- // Convert tone marks back to base vowels
50
- const toneMap = {
51
- 'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
52
- 'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e',
53
- 'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
54
- 'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o',
55
- 'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u',
56
- 'ǖ': 'ü', 'ǘ': 'ü', 'ǚ': 'ü', 'ǜ': 'ü'
57
- };
58
- // Handle v with combining tone marks
59
- if (match.startsWith('v')) {
60
- return 'v';
61
- }
62
- return toneMap[match] || match;
63
- });
64
- // Check if it's a standalone syllable
65
- if (STANDALONE_SYLLABLES.has(normalized)) {
66
- return true;
67
- }
68
- // Try to parse as initial + final
69
- // Check longest possible initial first (2 chars)
70
- if (normalized.length >= 2) {
71
- const possibleInitial2 = normalized.substring(0, 2);
72
- if (INITIALS.has(possibleInitial2)) {
73
- const remaining = normalized.substring(2);
74
- if (FINALS.has(remaining)) {
75
- return true;
76
- }
77
- }
78
- }
79
- // Check single character initial
80
- if (normalized.length >= 1) {
81
- const possibleInitial1 = normalized.substring(0, 1);
82
- if (INITIALS.has(possibleInitial1)) {
83
- const remaining = normalized.substring(1);
84
- if (FINALS.has(remaining)) {
85
- return true;
86
- }
87
- }
88
- }
89
- // Check if the whole string is a valid final (for syllables without initials)
90
- if (INITIALS.has('') && FINALS.has(normalized)) {
91
- return true;
92
- }
93
- return false;
214
+ const normalized = stripToneMarks(text);
215
+ return VALID_SYLLABLES.has(normalized);
94
216
  },
95
217
  /**
96
218
  * Check if a string could be a pinyin token (more lenient, for prefix matching)
@@ -98,45 +220,31 @@ exports.PinyinValidator = {
98
220
  couldBePinyinPrefix(text) {
99
221
  if (!text || text.length === 0)
100
222
  return false;
101
- const normalized = text.toLowerCase().replace(TONE_MARKS, (match) => {
102
- const toneMap = {
103
- 'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
104
- 'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e',
105
- 'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
106
- 'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o',
107
- 'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u',
108
- 'ǖ': 'ü', 'ǘ': 'ü', 'ǚ': 'ü', 'ǜ': 'ü'
109
- };
110
- // Handle v with combining tone marks
111
- if (match.startsWith('v')) {
112
- return 'v';
113
- }
114
- return toneMap[match] || match;
115
- });
116
- // Check if it's already valid pinyin
117
- if (this.isValidPinyin(text))
118
- return true;
119
- // Check if any standalone syllable starts with this prefix
120
- for (const syllable of STANDALONE_SYLLABLES) {
121
- if (syllable.startsWith(normalized))
122
- return true;
123
- }
124
- // Check if it could be the start of initial + final combination
125
- // Check if it matches any initial exactly or partially
126
- for (const initial of INITIALS) {
127
- if (initial.startsWith(normalized) || normalized.startsWith(initial)) {
128
- return true;
129
- }
130
- }
131
- return false;
223
+ const normalized = stripToneMarks(text);
224
+ return VALID_PREFIXES.has(normalized);
225
+ },
226
+ /**
227
+ * Return all u/ü/v spelling variants for a syllable (with or without tone
228
+ * marks). The input syllable itself is always included (tone-stripped).
229
+ * Syllables with no ü ambiguity return a single-element array.
230
+ *
231
+ * Examples:
232
+ * 'lu' → ['lu', 'lü', 'lv']
233
+ * 'lǜ' → ['lü', 'lu', 'lv']
234
+ * 'jv' → ['jv', 'ju', 'jü']
235
+ * 'ba' → ['ba']
236
+ */
237
+ getSearchVariants(text) {
238
+ if (!text || text.length === 0)
239
+ return [];
240
+ const normalized = stripToneMarks(text);
241
+ return UMLAUT_VARIANT_MAP.get(normalized) || (VALID_SYLLABLES.has(normalized) ? [normalized] : []);
132
242
  },
133
243
  /**
134
244
  * Split a string into potential pinyin tokens (for compound pinyin like "nihao")
135
245
  * Returns empty array if not valid pinyin
136
246
  */
137
247
  splitPinyinTokens(text) {
138
- // This is a simplified version - full implementation would need
139
- // more sophisticated parsing to handle ambiguous cases
140
248
  const tokens = [];
141
249
  const normalized = text.toLowerCase();
142
250
  let remaining = normalized;
@@ -65,6 +65,11 @@ export declare class SearchTokenizer {
65
65
  * Classifies a single token
66
66
  */
67
67
  private static classifyToken;
68
+ /**
69
+ * Expand parsed pinyin syllables to include all u/ü/v spelling variants
70
+ * so that downstream search covers every convention the database might use.
71
+ */
72
+ private static expandPinyinVariants;
68
73
  /**
69
74
  * Checks if a string contains only hanzi characters
70
75
  */
@@ -2,6 +2,7 @@
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.SearchTokenizer = exports.TokenType = void 0;
4
4
  const PinyinParser_1 = require("./PinyinParser");
5
+ const PinyinValidator_1 = require("./PinyinValidator");
5
6
  /**
6
7
  * Token types for search query tokenization
7
8
  */
@@ -216,7 +217,7 @@ class SearchTokenizer {
216
217
  type: TokenType.PINYIN,
217
218
  normalized,
218
219
  isPossiblePinyin: true,
219
- pinyinVariants: pinyinWithApostrophe
220
+ pinyinVariants: this.expandPinyinVariants(pinyinWithApostrophe)
220
221
  };
221
222
  }
222
223
  else {
@@ -227,7 +228,7 @@ class SearchTokenizer {
227
228
  type: TokenType.AMBIGUOUS,
228
229
  normalized,
229
230
  isPossiblePinyin: true,
230
- pinyinVariants: pinyinWithApostrophe
231
+ pinyinVariants: this.expandPinyinVariants(pinyinWithApostrophe)
231
232
  };
232
233
  }
233
234
  }
@@ -248,7 +249,7 @@ class SearchTokenizer {
248
249
  type: TokenType.PINYIN,
249
250
  normalized,
250
251
  isPossiblePinyin: true,
251
- pinyinVariants: pinyinParsing
252
+ pinyinVariants: this.expandPinyinVariants(pinyinParsing)
252
253
  };
253
254
  }
254
255
  }
@@ -261,7 +262,7 @@ class SearchTokenizer {
261
262
  type: TokenType.AMBIGUOUS,
262
263
  normalized,
263
264
  isPossiblePinyin: true,
264
- pinyinVariants: pinyinParsing
265
+ pinyinVariants: this.expandPinyinVariants(pinyinParsing)
265
266
  };
266
267
  }
267
268
  // Default to English
@@ -271,6 +272,20 @@ class SearchTokenizer {
271
272
  normalized
272
273
  };
273
274
  }
275
+ /**
276
+ * Expand parsed pinyin syllables to include all u/ü/v spelling variants
277
+ * so that downstream search covers every convention the database might use.
278
+ */
279
+ static expandPinyinVariants(syllables) {
280
+ const result = new Set();
281
+ for (const syllable of syllables) {
282
+ result.add(syllable); // Keep original (possibly with tone marks)
283
+ for (const variant of PinyinValidator_1.PinyinValidator.getSearchVariants(syllable)) {
284
+ result.add(variant);
285
+ }
286
+ }
287
+ return Array.from(result);
288
+ }
274
289
  /**
275
290
  * Checks if a string contains only hanzi characters
276
291
  */
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@shaxpir/duiduidui-models",
3
- "version": "1.10.4",
3
+ "version": "1.11.0",
4
4
  "repository": {
5
5
  "type": "git",
6
6
  "url": "https://github.com/shaxpir/duiduidui-models"