@shaxpir/duiduidui-models 1.10.3 → 1.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1,5 +1,14 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Utility for validating pinyin tokens
|
|
2
|
+
* Utility for validating pinyin tokens.
|
|
3
|
+
*
|
|
4
|
+
* Uses a curated set of attested Mandarin syllables rather than
|
|
5
|
+
* combinatorial initial+final matching, which would accept non-existent
|
|
6
|
+
* syllables like "ho", "no", "so", "to", "be", "do", "go", "pe".
|
|
7
|
+
*
|
|
8
|
+
* Accepts three spellings for ü syllables: ü, v, and u (where unambiguous
|
|
9
|
+
* or where the initial only pairs with ü). getSearchVariants() expands any
|
|
10
|
+
* of these into all equivalent forms so downstream search covers every
|
|
11
|
+
* convention the database might use.
|
|
3
12
|
*/
|
|
4
13
|
export declare const PinyinValidator: {
|
|
5
14
|
/**
|
|
@@ -10,6 +19,18 @@ export declare const PinyinValidator: {
|
|
|
10
19
|
* Check if a string could be a pinyin token (more lenient, for prefix matching)
|
|
11
20
|
*/
|
|
12
21
|
couldBePinyinPrefix(text: string): boolean;
|
|
22
|
+
/**
|
|
23
|
+
* Return all u/ü/v spelling variants for a syllable (with or without tone
|
|
24
|
+
* marks). A valid input is always included (tone-stripped); invalid input returns [].
|
|
25
|
+
* Syllables with no ü ambiguity return a single-element array.
|
|
26
|
+
*
|
|
27
|
+
* Examples:
|
|
28
|
+
* 'lu' → ['lu', 'lü', 'lv']
|
|
29
|
+
* 'lǜ' → ['lu', 'lü', 'lv']
|
|
30
|
+
* 'jv' → ['ju', 'jü', 'jv']
|
|
31
|
+
* 'ba' → ['ba']
|
|
32
|
+
*/
|
|
33
|
+
getSearchVariants(text: string): string[];
|
|
13
34
|
/**
|
|
14
35
|
* Split a string into potential pinyin tokens (for compound pinyin like "nihao")
|
|
15
36
|
* Returns empty array if not valid pinyin
|
|
@@ -1,42 +1,209 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
/**
|
|
3
|
-
* Utility for validating pinyin tokens
|
|
3
|
+
* Utility for validating pinyin tokens.
|
|
4
|
+
*
|
|
5
|
+
* Uses a curated set of attested Mandarin syllables rather than
|
|
6
|
+
* combinatorial initial+final matching, which would accept non-existent
|
|
7
|
+
* syllables like "ho", "no", "so", "to", "be", "do", "go", "pe".
|
|
8
|
+
*
|
|
9
|
+
* Accepts three spellings for ü syllables: ü, v, and u (where unambiguous
|
|
10
|
+
* or where the initial only pairs with ü). getSearchVariants() expands any
|
|
11
|
+
* of these into all equivalent forms so downstream search covers every
|
|
12
|
+
* convention the database might use.
|
|
4
13
|
*/
|
|
5
14
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
15
|
exports.PinyinValidator = void 0;
|
|
7
|
-
//
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
'd', 't', 'n', 'l',
|
|
12
|
-
'g', 'k', 'h',
|
|
13
|
-
'j', 'q', 'x',
|
|
14
|
-
'zh', 'ch', 'sh', 'r',
|
|
15
|
-
'z', 'c', 's',
|
|
16
|
-
'y', 'w'
|
|
17
|
-
]);
|
|
18
|
-
// Valid pinyin finals
|
|
19
|
-
const FINALS = new Set([
|
|
20
|
-
'a', 'o', 'e', 'i', 'u', 'ü', 'v', // 'v' is often used instead of 'ü'
|
|
21
|
-
'ai', 'ei', 'ui', 'ao', 'ou', 'iu',
|
|
22
|
-
'ie', 'üe', 've', 'ue', 'er',
|
|
23
|
-
'an', 'en', 'in', 'un', 'ün', 'vn',
|
|
24
|
-
'ang', 'eng', 'ing', 'ong',
|
|
25
|
-
'ia', 'iao', 'ian', 'iang', 'iong',
|
|
26
|
-
'ua', 'uo', 'uai', 'uan', 'uang',
|
|
27
|
-
'üan', 'van', 'yuan'
|
|
28
|
-
]);
|
|
29
|
-
// Common standalone syllables
|
|
30
|
-
const STANDALONE_SYLLABLES = new Set([
|
|
16
|
+
// All valid Mandarin pinyin syllables (toneless, lowercase).
|
|
17
|
+
// Three spellings accepted for ü: ü, v, and u (for j/q/x/y, plus the aliases nue/lue).
|
|
18
|
+
const VALID_SYLLABLES = new Set([
|
|
19
|
+
// === Zero-initial (standalone vowel syllables) ===
|
|
31
20
|
'a', 'ai', 'an', 'ang', 'ao',
|
|
32
21
|
'e', 'ei', 'en', 'eng', 'er',
|
|
33
22
|
'o', 'ou',
|
|
34
|
-
|
|
35
|
-
'
|
|
36
|
-
'
|
|
23
|
+
// === y- initial (represents i-/ü- standalone) ===
|
|
24
|
+
'ya', 'yao', 'yan', 'yang',
|
|
25
|
+
'ye', 'yi', 'yin', 'ying',
|
|
26
|
+
'yo',
|
|
27
|
+
'yong', 'you',
|
|
28
|
+
'yu', 'yuan', 'yue', 'yun', // standard u spelling (actually ü)
|
|
29
|
+
'yü', 'yüan', 'yüe', 'yün', // explicit ü spelling
|
|
30
|
+
'yv', 'yvan', 'yve', 'yvn', // v-as-ü spelling
|
|
31
|
+
// === w- initial (represents u- standalone) ===
|
|
32
|
+
'wa', 'wai', 'wan', 'wang',
|
|
33
|
+
'wei', 'wen', 'weng',
|
|
34
|
+
'wo', 'wu',
|
|
35
|
+
// === b- ===
|
|
36
|
+
'ba', 'bai', 'ban', 'bang', 'bao',
|
|
37
|
+
'bei', 'ben', 'beng',
|
|
38
|
+
'bi', 'bian', 'biao', 'bie', 'bin', 'bing',
|
|
39
|
+
'bo', 'bu',
|
|
40
|
+
// === p- ===
|
|
41
|
+
'pa', 'pai', 'pan', 'pang', 'pao',
|
|
42
|
+
'pei', 'pen', 'peng',
|
|
43
|
+
'pi', 'pian', 'piao', 'pie', 'pin', 'ping',
|
|
44
|
+
'po', 'pou', 'pu',
|
|
45
|
+
// === m- ===
|
|
46
|
+
'ma', 'mai', 'man', 'mang', 'mao',
|
|
47
|
+
'mei', 'men', 'meng',
|
|
48
|
+
'mi', 'mian', 'miao', 'mie', 'min', 'ming',
|
|
49
|
+
'miu',
|
|
50
|
+
'mo', 'mou', 'mu',
|
|
51
|
+
// === f- ===
|
|
52
|
+
'fa', 'fan', 'fang',
|
|
53
|
+
'fei', 'fen', 'feng',
|
|
54
|
+
'fo', 'fou', 'fu',
|
|
55
|
+
// === d- ===
|
|
56
|
+
'da', 'dai', 'dan', 'dang', 'dao',
|
|
57
|
+
'de', 'dei', 'den', 'deng',
|
|
58
|
+
'di', 'dia', 'dian', 'diao', 'die', 'ding', 'diu',
|
|
59
|
+
'dong', 'dou', 'du', 'duan', 'dui', 'dun', 'duo',
|
|
60
|
+
// === t- ===
|
|
61
|
+
'ta', 'tai', 'tan', 'tang', 'tao',
|
|
62
|
+
'te', 'tei', 'teng',
|
|
63
|
+
'ti', 'tian', 'tiao', 'tie', 'ting',
|
|
64
|
+
'tong', 'tou', 'tu', 'tuan', 'tui', 'tun', 'tuo',
|
|
65
|
+
// === n- ===
|
|
66
|
+
'na', 'nai', 'nan', 'nang', 'nao',
|
|
67
|
+
'ne', 'nei', 'nen', 'neng',
|
|
68
|
+
'ni', 'nian', 'niang', 'niao', 'nie', 'nin', 'ning', 'niu',
|
|
69
|
+
'nong', 'nou', 'nu', 'nuan', 'nuo',
|
|
70
|
+
'nü', 'nüe',
|
|
71
|
+
'nv', 'nve', // v-as-ü
|
|
72
|
+
'nue', // u-as-ü (beginner-friendly alias for nüe)
|
|
73
|
+
// === l- ===
|
|
74
|
+
'la', 'lai', 'lan', 'lang', 'lao',
|
|
75
|
+
'le', 'lei', 'leng',
|
|
76
|
+
'li', 'lia', 'lian', 'liang', 'liao', 'lie', 'lin', 'ling', 'liu',
|
|
77
|
+
'lo',
|
|
78
|
+
'long', 'lou', 'lu', 'luan', 'lun', 'luo',
|
|
79
|
+
'lü', 'lüe',
|
|
80
|
+
'lv', 'lve', // v-as-ü
|
|
81
|
+
'lue', // u-as-ü (beginner-friendly alias for lüe)
|
|
82
|
+
// === g- ===
|
|
83
|
+
'ga', 'gai', 'gan', 'gang', 'gao',
|
|
84
|
+
'ge', 'gei', 'gen', 'geng',
|
|
85
|
+
'gong', 'gou', 'gu', 'gua', 'guai', 'guan', 'guang', 'gui', 'gun', 'guo',
|
|
86
|
+
// === k- ===
|
|
87
|
+
'ka', 'kai', 'kan', 'kang', 'kao',
|
|
88
|
+
'ke', 'kei', 'ken', 'keng',
|
|
89
|
+
'kong', 'kou', 'ku', 'kua', 'kuai', 'kuan', 'kuang', 'kui', 'kun', 'kuo',
|
|
90
|
+
// === h- ===
|
|
91
|
+
'ha', 'hai', 'han', 'hang', 'hao',
|
|
92
|
+
'he', 'hei', 'hen', 'heng',
|
|
93
|
+
'hong', 'hou', 'hu', 'hua', 'huai', 'huan', 'huang', 'hui', 'hun', 'huo',
|
|
94
|
+
// === j- (u is always ü phonetically) ===
|
|
95
|
+
'ji', 'jia', 'jian', 'jiang', 'jiao', 'jie', 'jin', 'jing', 'jiong', 'jiu',
|
|
96
|
+
'ju', 'juan', 'jue', 'jun', // standard u spelling
|
|
97
|
+
'jü', 'jüan', 'jüe', 'jün', // explicit ü spelling
|
|
98
|
+
'jv', 'jvan', 'jve', 'jvn', // v-as-ü spelling
|
|
99
|
+
// === q- (u is always ü phonetically) ===
|
|
100
|
+
'qi', 'qia', 'qian', 'qiang', 'qiao', 'qie', 'qin', 'qing', 'qiong', 'qiu',
|
|
101
|
+
'qu', 'quan', 'que', 'qun', // standard u spelling
|
|
102
|
+
'qü', 'qüan', 'qüe', 'qün', // explicit ü spelling
|
|
103
|
+
'qv', 'qvan', 'qve', 'qvn', // v-as-ü spelling
|
|
104
|
+
// === x- (u is always ü phonetically) ===
|
|
105
|
+
'xi', 'xia', 'xian', 'xiang', 'xiao', 'xie', 'xin', 'xing', 'xiong', 'xiu',
|
|
106
|
+
'xu', 'xuan', 'xue', 'xun', // standard u spelling
|
|
107
|
+
'xü', 'xüan', 'xüe', 'xün', // explicit ü spelling
|
|
108
|
+
'xv', 'xvan', 'xve', 'xvn', // v-as-ü spelling
|
|
109
|
+
// === zh- ===
|
|
110
|
+
'zha', 'zhai', 'zhan', 'zhang', 'zhao',
|
|
111
|
+
'zhe', 'zhei', 'zhen', 'zheng',
|
|
112
|
+
'zhi',
|
|
113
|
+
'zhong', 'zhou', 'zhu', 'zhua', 'zhuai', 'zhuan', 'zhuang', 'zhui', 'zhun', 'zhuo',
|
|
114
|
+
// === ch- ===
|
|
115
|
+
'cha', 'chai', 'chan', 'chang', 'chao',
|
|
116
|
+
'che', 'chen', 'cheng',
|
|
117
|
+
'chi',
|
|
118
|
+
'chong', 'chou', 'chu', 'chua', 'chuai', 'chuan', 'chuang', 'chui', 'chun', 'chuo',
|
|
119
|
+
// === sh- ===
|
|
120
|
+
'sha', 'shai', 'shan', 'shang', 'shao',
|
|
121
|
+
'she', 'shei', 'shen', 'sheng',
|
|
122
|
+
'shi',
|
|
123
|
+
'shou', 'shu', 'shua', 'shuai', 'shuan', 'shuang', 'shui', 'shun', 'shuo',
|
|
124
|
+
// === r- ===
|
|
125
|
+
'ran', 'rang', 'rao',
|
|
126
|
+
're', 'ren', 'reng',
|
|
127
|
+
'ri',
|
|
128
|
+
'rong', 'rou', 'ru', 'rua', 'ruan', 'rui', 'run', 'ruo',
|
|
129
|
+
// === z- ===
|
|
130
|
+
'za', 'zai', 'zan', 'zang', 'zao',
|
|
131
|
+
'ze', 'zei', 'zen', 'zeng',
|
|
132
|
+
'zi',
|
|
133
|
+
'zong', 'zou', 'zu', 'zuan', 'zui', 'zun', 'zuo',
|
|
134
|
+
// === c- ===
|
|
135
|
+
'ca', 'cai', 'can', 'cang', 'cao',
|
|
136
|
+
'ce', 'cen', 'ceng',
|
|
137
|
+
'ci',
|
|
138
|
+
'cong', 'cou', 'cu', 'cuan', 'cui', 'cun', 'cuo',
|
|
139
|
+
// === s- ===
|
|
140
|
+
'sa', 'sai', 'san', 'sang', 'sao',
|
|
141
|
+
'se', 'sen', 'seng',
|
|
142
|
+
'si',
|
|
143
|
+
'song', 'sou', 'su', 'suan', 'sui', 'sun', 'suo',
|
|
37
144
|
]);
|
|
145
|
+
// Precompute all valid prefixes for O(1) prefix lookup
|
|
146
|
+
const VALID_PREFIXES = (() => {
|
|
147
|
+
const prefixes = new Set();
|
|
148
|
+
for (const syllable of VALID_SYLLABLES) {
|
|
149
|
+
for (let i = 1; i <= syllable.length; i++) {
|
|
150
|
+
prefixes.add(syllable.substring(0, i));
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
return prefixes;
|
|
154
|
+
})();
|
|
155
|
+
// ---------------------------------------------------------------------------
|
|
156
|
+
// Precompute u ↔ ü ↔ v variant groups.
|
|
157
|
+
//
|
|
158
|
+
// Two syllables are "umlaut-equivalent" when one can be obtained from the
|
|
159
|
+
// other by swapping every u ↔ ü, u ↔ v, or ü ↔ v. For example:
|
|
160
|
+
// lu ↔ lü ↔ lv (n/l initials — both u and ü exist as distinct words)
|
|
161
|
+
// ju ↔ jü ↔ jv (j/q/x/y — u IS ü, different spellings of same sound)
|
|
162
|
+
// bu → [bu] (b+u has no ü counterpart, so no expansion)
|
|
163
|
+
// ---------------------------------------------------------------------------
|
|
164
|
+
const UMLAUT_VARIANT_MAP = (() => {
|
|
165
|
+
const map = new Map();
|
|
166
|
+
for (const syllable of VALID_SYLLABLES) {
|
|
167
|
+
if (map.has(syllable))
|
|
168
|
+
continue;
|
|
169
|
+
const variants = new Set();
|
|
170
|
+
variants.add(syllable);
|
|
171
|
+
// Try each u/ü/v substitution; the global regexes replace every occurrence in the syllable
|
|
172
|
+
const substitutions = [
|
|
173
|
+
[/u/g, 'ü'], [/u/g, 'v'],
|
|
174
|
+
[/ü/g, 'u'], [/ü/g, 'v'],
|
|
175
|
+
[/v/g, 'u'], [/v/g, 'ü'],
|
|
176
|
+
];
|
|
177
|
+
for (const [pattern, replacement] of substitutions) {
|
|
178
|
+
const candidate = syllable.replace(pattern, replacement);
|
|
179
|
+
if (candidate !== syllable && VALID_SYLLABLES.has(candidate)) {
|
|
180
|
+
variants.add(candidate);
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
const variantArray = Array.from(variants);
|
|
184
|
+
for (const v of variantArray) {
|
|
185
|
+
map.set(v, variantArray);
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
return map;
|
|
189
|
+
})();
|
|
38
190
|
// Tone-marked vowels that might appear in pinyin, plus v optionally followed by a combining acute, caron, or grave mark
|
|
39
191
|
const TONE_MARKS = /[āáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜ]|v[\u0301\u030C\u0300]?/g;
|
|
192
|
+
const TONE_MAP = {
|
|
193
|
+
'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
|
|
194
|
+
'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e',
|
|
195
|
+
'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
|
|
196
|
+
'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o',
|
|
197
|
+
'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u',
|
|
198
|
+
'ǖ': 'ü', 'ǘ': 'ü', 'ǚ': 'ü', 'ǜ': 'ü'
|
|
199
|
+
};
|
|
200
|
+
function stripToneMarks(text) {
|
|
201
|
+
return text.toLowerCase().replace(TONE_MARKS, (match) => {
|
|
202
|
+
if (match.startsWith('v'))
|
|
203
|
+
return 'v';
|
|
204
|
+
return TONE_MAP[match] || match;
|
|
205
|
+
});
|
|
206
|
+
}
|
|
40
207
|
exports.PinyinValidator = {
|
|
41
208
|
/**
|
|
42
209
|
* Check if a string is a valid pinyin syllable (with or without tone marks)
|
|
@@ -44,53 +211,8 @@ exports.PinyinValidator = {
|
|
|
44
211
|
isValidPinyin(text) {
|
|
45
212
|
if (!text || text.length === 0)
|
|
46
213
|
return false;
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
// Convert tone marks back to base vowels
|
|
50
|
-
const toneMap = {
|
|
51
|
-
'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
|
|
52
|
-
'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e',
|
|
53
|
-
'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
|
|
54
|
-
'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o',
|
|
55
|
-
'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u',
|
|
56
|
-
'ǖ': 'ü', 'ǘ': 'ü', 'ǚ': 'ü', 'ǜ': 'ü'
|
|
57
|
-
};
|
|
58
|
-
// Handle v with combining tone marks
|
|
59
|
-
if (match.startsWith('v')) {
|
|
60
|
-
return 'v';
|
|
61
|
-
}
|
|
62
|
-
return toneMap[match] || match;
|
|
63
|
-
});
|
|
64
|
-
// Check if it's a standalone syllable
|
|
65
|
-
if (STANDALONE_SYLLABLES.has(normalized)) {
|
|
66
|
-
return true;
|
|
67
|
-
}
|
|
68
|
-
// Try to parse as initial + final
|
|
69
|
-
// Check longest possible initial first (2 chars)
|
|
70
|
-
if (normalized.length >= 2) {
|
|
71
|
-
const possibleInitial2 = normalized.substring(0, 2);
|
|
72
|
-
if (INITIALS.has(possibleInitial2)) {
|
|
73
|
-
const remaining = normalized.substring(2);
|
|
74
|
-
if (FINALS.has(remaining)) {
|
|
75
|
-
return true;
|
|
76
|
-
}
|
|
77
|
-
}
|
|
78
|
-
}
|
|
79
|
-
// Check single character initial
|
|
80
|
-
if (normalized.length >= 1) {
|
|
81
|
-
const possibleInitial1 = normalized.substring(0, 1);
|
|
82
|
-
if (INITIALS.has(possibleInitial1)) {
|
|
83
|
-
const remaining = normalized.substring(1);
|
|
84
|
-
if (FINALS.has(remaining)) {
|
|
85
|
-
return true;
|
|
86
|
-
}
|
|
87
|
-
}
|
|
88
|
-
}
|
|
89
|
-
// Check if the whole string is a valid final (for syllables without initials)
|
|
90
|
-
if (INITIALS.has('') && FINALS.has(normalized)) {
|
|
91
|
-
return true;
|
|
92
|
-
}
|
|
93
|
-
return false;
|
|
214
|
+
const normalized = stripToneMarks(text);
|
|
215
|
+
return VALID_SYLLABLES.has(normalized);
|
|
94
216
|
},
|
|
95
217
|
/**
|
|
96
218
|
* Check if a string could be a pinyin token (more lenient, for prefix matching)
|
|
@@ -98,45 +220,31 @@ exports.PinyinValidator = {
|
|
|
98
220
|
couldBePinyinPrefix(text) {
|
|
99
221
|
if (!text || text.length === 0)
|
|
100
222
|
return false;
|
|
101
|
-
const normalized = text
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
for (const syllable of STANDALONE_SYLLABLES) {
|
|
121
|
-
if (syllable.startsWith(normalized))
|
|
122
|
-
return true;
|
|
123
|
-
}
|
|
124
|
-
// Check if it could be the start of initial + final combination
|
|
125
|
-
// Check if it matches any initial exactly or partially
|
|
126
|
-
for (const initial of INITIALS) {
|
|
127
|
-
if (initial.startsWith(normalized) || normalized.startsWith(initial)) {
|
|
128
|
-
return true;
|
|
129
|
-
}
|
|
130
|
-
}
|
|
131
|
-
return false;
|
|
223
|
+
const normalized = stripToneMarks(text);
|
|
224
|
+
return VALID_PREFIXES.has(normalized);
|
|
225
|
+
},
|
|
226
|
+
/**
|
|
227
|
+
* Return all u/ü/v spelling variants for a syllable (with or without tone
|
|
228
|
+
* marks). A valid input is always included (tone-stripped); invalid input returns [].
|
|
229
|
+
* Syllables with no ü ambiguity return a single-element array.
|
|
230
|
+
*
|
|
231
|
+
* Examples:
|
|
232
|
+
* 'lu' → ['lu', 'lü', 'lv']
|
|
233
|
+
* 'lǜ' → ['lu', 'lü', 'lv']
|
|
234
|
+
* 'jv' → ['ju', 'jü', 'jv']
|
|
235
|
+
* 'ba' → ['ba']
|
|
236
|
+
*/
|
|
237
|
+
getSearchVariants(text) {
|
|
238
|
+
if (!text || text.length === 0)
|
|
239
|
+
return [];
|
|
240
|
+
const normalized = stripToneMarks(text);
|
|
241
|
+
return UMLAUT_VARIANT_MAP.get(normalized) || (VALID_SYLLABLES.has(normalized) ? [normalized] : []);
|
|
132
242
|
},
|
|
133
243
|
/**
|
|
134
244
|
* Split a string into potential pinyin tokens (for compound pinyin like "nihao")
|
|
135
245
|
* Returns empty array if not valid pinyin
|
|
136
246
|
*/
|
|
137
247
|
splitPinyinTokens(text) {
|
|
138
|
-
// This is a simplified version - full implementation would need
|
|
139
|
-
// more sophisticated parsing to handle ambiguous cases
|
|
140
248
|
const tokens = [];
|
|
141
249
|
const normalized = text.toLowerCase();
|
|
142
250
|
let remaining = normalized;
|
|
@@ -65,6 +65,11 @@ export declare class SearchTokenizer {
|
|
|
65
65
|
* Classifies a single token
|
|
66
66
|
*/
|
|
67
67
|
private static classifyToken;
|
|
68
|
+
/**
|
|
69
|
+
* Expand parsed pinyin syllables to include all u/ü/v spelling variants
|
|
70
|
+
* so that downstream search covers every convention the database might use.
|
|
71
|
+
*/
|
|
72
|
+
private static expandPinyinVariants;
|
|
68
73
|
/**
|
|
69
74
|
* Checks if a string contains only hanzi characters
|
|
70
75
|
*/
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.SearchTokenizer = exports.TokenType = void 0;
|
|
4
4
|
const PinyinParser_1 = require("./PinyinParser");
|
|
5
|
+
const PinyinValidator_1 = require("./PinyinValidator");
|
|
5
6
|
/**
|
|
6
7
|
* Token types for search query tokenization
|
|
7
8
|
*/
|
|
@@ -216,7 +217,7 @@ class SearchTokenizer {
|
|
|
216
217
|
type: TokenType.PINYIN,
|
|
217
218
|
normalized,
|
|
218
219
|
isPossiblePinyin: true,
|
|
219
|
-
pinyinVariants: pinyinWithApostrophe
|
|
220
|
+
pinyinVariants: this.expandPinyinVariants(pinyinWithApostrophe)
|
|
220
221
|
};
|
|
221
222
|
}
|
|
222
223
|
else {
|
|
@@ -227,7 +228,7 @@ class SearchTokenizer {
|
|
|
227
228
|
type: TokenType.AMBIGUOUS,
|
|
228
229
|
normalized,
|
|
229
230
|
isPossiblePinyin: true,
|
|
230
|
-
pinyinVariants: pinyinWithApostrophe
|
|
231
|
+
pinyinVariants: this.expandPinyinVariants(pinyinWithApostrophe)
|
|
231
232
|
};
|
|
232
233
|
}
|
|
233
234
|
}
|
|
@@ -248,7 +249,7 @@ class SearchTokenizer {
|
|
|
248
249
|
type: TokenType.PINYIN,
|
|
249
250
|
normalized,
|
|
250
251
|
isPossiblePinyin: true,
|
|
251
|
-
pinyinVariants: pinyinParsing
|
|
252
|
+
pinyinVariants: this.expandPinyinVariants(pinyinParsing)
|
|
252
253
|
};
|
|
253
254
|
}
|
|
254
255
|
}
|
|
@@ -261,7 +262,7 @@ class SearchTokenizer {
|
|
|
261
262
|
type: TokenType.AMBIGUOUS,
|
|
262
263
|
normalized,
|
|
263
264
|
isPossiblePinyin: true,
|
|
264
|
-
pinyinVariants: pinyinParsing
|
|
265
|
+
pinyinVariants: this.expandPinyinVariants(pinyinParsing)
|
|
265
266
|
};
|
|
266
267
|
}
|
|
267
268
|
// Default to English
|
|
@@ -271,6 +272,20 @@ class SearchTokenizer {
|
|
|
271
272
|
normalized
|
|
272
273
|
};
|
|
273
274
|
}
|
|
275
|
+
/**
|
|
276
|
+
* Expand parsed pinyin syllables to include all u/ü/v spelling variants
|
|
277
|
+
* so that downstream search covers every convention the database might use.
|
|
278
|
+
*/
|
|
279
|
+
static expandPinyinVariants(syllables) {
|
|
280
|
+
const result = new Set();
|
|
281
|
+
for (const syllable of syllables) {
|
|
282
|
+
result.add(syllable); // Keep original (possibly with tone marks)
|
|
283
|
+
for (const variant of PinyinValidator_1.PinyinValidator.getSearchVariants(syllable)) {
|
|
284
|
+
result.add(variant);
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
return Array.from(result);
|
|
288
|
+
}
|
|
274
289
|
/**
|
|
275
290
|
* Checks if a string contains only hanzi characters
|
|
276
291
|
*/
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@shaxpir/duiduidui-models",
|
|
3
|
-
"version": "1.10.3",
|
|
3
|
+
"version": "1.11.0",
|
|
4
4
|
"repository": {
|
|
5
5
|
"type": "git",
|
|
6
6
|
"url": "https://github.com/shaxpir/duiduidui-models"
|
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
"dist/"
|
|
17
17
|
],
|
|
18
18
|
"dependencies": {
|
|
19
|
-
"@shaxpir/duiduidui-models": "^1.
|
|
19
|
+
"@shaxpir/duiduidui-models": "^1.10.3",
|
|
20
20
|
"@shaxpir/sharedb": "^6.0.6",
|
|
21
21
|
"@shaxpir/shaxpir-common": "^1.4.1",
|
|
22
22
|
"ot-json1": "1.0.1",
|