@shaxpir/duiduidui-models 1.15.0 → 1.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/models/Phrase.d.ts +0 -1
- package/dist/models/Term.d.ts +0 -1
- package/dist/models/Term.js +0 -2
- package/dist/util/PinyinParser.d.ts +55 -9
- package/dist/util/PinyinParser.js +203 -73
- package/dist/util/PinyinValidator.d.ts +12 -0
- package/dist/util/PinyinValidator.js +38 -0
- package/package.json +1 -1
package/dist/models/Phrase.d.ts
CHANGED
package/dist/models/Term.d.ts
CHANGED
package/dist/models/Term.js
CHANGED
|
@@ -38,7 +38,6 @@ class Term extends Content_1.Content {
|
|
|
38
38
|
difficulty: difficulty,
|
|
39
39
|
hanzi_count: textOrPhrase.length,
|
|
40
40
|
pinyin: '',
|
|
41
|
-
pinyin_tokenized: '',
|
|
42
41
|
transliteration: '',
|
|
43
42
|
translation: '',
|
|
44
43
|
notes: '',
|
|
@@ -71,7 +70,6 @@ class Term extends Content_1.Content {
|
|
|
71
70
|
hanzi_count: phrase.hanzi_count,
|
|
72
71
|
difficulty: phrase.difficulty,
|
|
73
72
|
pinyin: phrase.pinyin,
|
|
74
|
-
pinyin_tokenized: phrase.pinyin_tokenized,
|
|
75
73
|
transliteration: phrase.transliteration,
|
|
76
74
|
translation: phrase.translation,
|
|
77
75
|
notes: phrase.notes,
|
|
@@ -16,32 +16,78 @@ export declare const PinyinParser: {
|
|
|
16
16
|
*/
|
|
17
17
|
parseAll(text: string): PinyinParseResult[];
|
|
18
18
|
/**
|
|
19
|
-
* Parse
|
|
19
|
+
* Parse and return only the best parsing
|
|
20
|
+
* Returns null if the text cannot be parsed as pinyin
|
|
20
21
|
*/
|
|
21
|
-
|
|
22
|
+
parse(text: string): string[] | null;
|
|
22
23
|
/**
|
|
23
|
-
* Parse
|
|
24
|
+
* Parse with erhua (儿化) awareness.
|
|
25
|
+
* Recognizes patterns like 'nǎr' where the trailing 'r' is an erhua suffix
|
|
26
|
+
* (the base syllable is valid but base+r is not).
|
|
27
|
+
* Erhua 'r' is expanded to 'er' in the output.
|
|
28
|
+
*
|
|
29
|
+
* Also handles 'r after apostrophe: nǎ'r → ['nǎ', 'er']
|
|
24
30
|
*/
|
|
25
|
-
|
|
31
|
+
parseWithErhua(text: string): string[] | null;
|
|
26
32
|
/**
|
|
27
|
-
*
|
|
33
|
+
* Parse with erhua awareness, returning all possible parsings.
|
|
28
34
|
*/
|
|
29
|
-
|
|
35
|
+
parseAllWithErhua(text: string): PinyinParseResult[];
|
|
30
36
|
/**
|
|
31
|
-
*
|
|
32
|
-
*
|
|
37
|
+
* Take a raw pinyin string (possibly with punctuation, spaces, apostrophes,
|
|
38
|
+
* and erhua) and return it with proper syllable spacing.
|
|
39
|
+
*
|
|
40
|
+
* The output is lowercased with punctuation stripped, producing clean
|
|
41
|
+
* space-separated syllables suitable for search indexing and tokenization.
|
|
42
|
+
* Tone marks are preserved.
|
|
43
|
+
*
|
|
44
|
+
* Throws if any letter sequence cannot be parsed as pinyin.
|
|
45
|
+
*
|
|
46
|
+
* Examples:
|
|
47
|
+
* 'nǐhǎo' → 'nǐ hǎo'
|
|
48
|
+
* 'nǎr' → 'nǎ er'
|
|
49
|
+
* "xī'ān" → 'xī ān'
|
|
50
|
+
* "gē'rmen" → 'gē er men'
|
|
51
|
+
* 'Nǐ hǎo!' → 'nǐ hǎo'
|
|
52
|
+
* 'tā shēng bìng le, jīng cháng' → 'tā shēng bìng le jīng cháng'
|
|
33
53
|
*/
|
|
34
|
-
|
|
54
|
+
ensurePinyinSpacing(text: string): string;
|
|
35
55
|
/**
|
|
36
56
|
* Check if a string could be compound pinyin
|
|
37
57
|
*/
|
|
38
58
|
couldBeCompoundPinyin(text: string): boolean;
|
|
39
59
|
/**
|
|
40
60
|
* Handle special cases like 'zhèr', 'nǎr', 'zhèlǐ', etc.
|
|
61
|
+
* @deprecated Use parseWithErhua() instead for erhua handling
|
|
41
62
|
*/
|
|
42
63
|
parseWithSpecialCases(text: string): PinyinParseResult[];
|
|
64
|
+
/**
|
|
65
|
+
* Get the best parsing from multiple options.
|
|
66
|
+
*
|
|
67
|
+
* Primary criterion: fewer syllables (more natural word-level grouping).
|
|
68
|
+
* Tiebreaker: when two parses have the same syllable count, prefer the
|
|
69
|
+
* one whose first syllable is shorter. This avoids greedy long matches
|
|
70
|
+
* (e.g. "dàng" in "dàngāo") that leave uncommon standalone-vowel
|
|
71
|
+
* remainders (e.g. "āo") when a more balanced split exists ("dàn gāo").
|
|
72
|
+
*/
|
|
73
|
+
getBestParsing(results: PinyinParseResult[]): PinyinParseResult | null;
|
|
43
74
|
/**
|
|
44
75
|
* Validate that all syllables in a parsing are legitimate
|
|
45
76
|
*/
|
|
46
77
|
validateParsing(syllables: string[]): boolean;
|
|
78
|
+
/**
|
|
79
|
+
* Parse pinyin string that contains apostrophes.
|
|
80
|
+
* Splits on apostrophes and parses each segment independently.
|
|
81
|
+
*/
|
|
82
|
+
_parseApostropheSplit(text: string, erhua: boolean): PinyinParseResult[];
|
|
83
|
+
/**
|
|
84
|
+
* Core DP parser. Finds all valid syllable decompositions of a pinyin string.
|
|
85
|
+
* When erhua=true, also recognizes trailing 'r' as erhua suffix (expanded to 'er')
|
|
86
|
+
* when the base syllable is valid but base+r is not.
|
|
87
|
+
*
|
|
88
|
+
* Works on original text (preserves case and tone marks) because
|
|
89
|
+
* PinyinValidator.isValidPinyin() handles normalization internally.
|
|
90
|
+
*/
|
|
91
|
+
_parseDP(text: string, erhua: boolean): PinyinParseResult[];
|
|
92
|
+
parseAmbiguous(text: string): PinyinParseResult[];
|
|
47
93
|
};
|
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.PinyinParser = void 0;
|
|
4
4
|
const PinyinValidator_1 = require("./PinyinValidator");
|
|
5
|
+
// Check if a character is a Unicode letter (including diacritics)
|
|
6
|
+
const isLetter = (char) => /\p{L}/u.test(char);
|
|
5
7
|
exports.PinyinParser = {
|
|
6
8
|
/**
|
|
7
9
|
* Normalize apostrophes to straight apostrophe for pinyin parsing.
|
|
@@ -23,90 +25,112 @@ exports.PinyinParser = {
|
|
|
23
25
|
const normalized = this.normalizeApostrophes(text.toLowerCase().trim());
|
|
24
26
|
// Handle explicit apostrophes first
|
|
25
27
|
if (normalized.includes("'")) {
|
|
26
|
-
return this.
|
|
28
|
+
return this._parseApostropheSplit(normalized, false);
|
|
27
29
|
}
|
|
28
30
|
// For strings without apostrophes, try all possible parsings
|
|
29
|
-
return this.
|
|
31
|
+
return this._parseDP(normalized, false);
|
|
30
32
|
},
|
|
31
33
|
/**
|
|
32
|
-
* Parse
|
|
34
|
+
* Parse and return only the best parsing
|
|
35
|
+
* Returns null if the text cannot be parsed as pinyin
|
|
33
36
|
*/
|
|
34
|
-
|
|
35
|
-
const
|
|
36
|
-
const
|
|
37
|
-
|
|
38
|
-
let firstPartParsings = this.parseAmbiguous(parts[0]);
|
|
39
|
-
// For subsequent parts, we know they start with a vowel (that's why there's an apostrophe)
|
|
40
|
-
for (let i = 1; i < parts.length; i++) {
|
|
41
|
-
const part = parts[i];
|
|
42
|
-
const partParsings = this.parseAmbiguous(part);
|
|
43
|
-
// Combine all previous results with all current part results
|
|
44
|
-
const newResults = [];
|
|
45
|
-
if (firstPartParsings.length === 0) {
|
|
46
|
-
firstPartParsings = [{ syllables: [] }];
|
|
47
|
-
}
|
|
48
|
-
for (const prevResult of firstPartParsings) {
|
|
49
|
-
for (const partResult of partParsings) {
|
|
50
|
-
newResults.push({
|
|
51
|
-
syllables: [...prevResult.syllables, ...partResult.syllables]
|
|
52
|
-
});
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
firstPartParsings = newResults;
|
|
56
|
-
}
|
|
57
|
-
return firstPartParsings.filter((result) => result.syllables.length > 0);
|
|
37
|
+
parse(text) {
|
|
38
|
+
const results = this.parseAll(text);
|
|
39
|
+
const best = this.getBestParsing(results);
|
|
40
|
+
return best ? best.syllables : null;
|
|
58
41
|
},
|
|
59
42
|
/**
|
|
60
|
-
* Parse
|
|
43
|
+
* Parse with erhua (儿化) awareness.
|
|
44
|
+
* Recognizes patterns like 'nǎr' where the trailing 'r' is an erhua suffix
|
|
45
|
+
* (the base syllable is valid but base+r is not).
|
|
46
|
+
* Erhua 'r' is expanded to 'er' in the output.
|
|
47
|
+
*
|
|
48
|
+
* Also handles 'r after apostrophe: nǎ'r → ['nǎ', 'er']
|
|
61
49
|
*/
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
const memo = new Map();
|
|
67
|
-
const parseRecursive = (remaining) => {
|
|
68
|
-
if (remaining.length === 0) {
|
|
69
|
-
return [{ syllables: [] }];
|
|
70
|
-
}
|
|
71
|
-
if (memo.has(remaining)) {
|
|
72
|
-
return memo.get(remaining);
|
|
73
|
-
}
|
|
74
|
-
const results = [];
|
|
75
|
-
// Try all possible syllable lengths from longest to shortest
|
|
76
|
-
for (let len = Math.min(6, remaining.length); len >= 1; len--) {
|
|
77
|
-
const candidate = remaining.substring(0, len);
|
|
78
|
-
if (PinyinValidator_1.PinyinValidator.isValidPinyin(candidate)) {
|
|
79
|
-
const restResults = parseRecursive(remaining.substring(len));
|
|
80
|
-
for (const restResult of restResults) {
|
|
81
|
-
results.push({
|
|
82
|
-
syllables: [candidate, ...restResult.syllables]
|
|
83
|
-
});
|
|
84
|
-
}
|
|
85
|
-
}
|
|
86
|
-
}
|
|
87
|
-
memo.set(remaining, results);
|
|
88
|
-
return results;
|
|
89
|
-
};
|
|
90
|
-
return parseRecursive(text);
|
|
50
|
+
parseWithErhua(text) {
|
|
51
|
+
const results = this.parseAllWithErhua(text);
|
|
52
|
+
const best = this.getBestParsing(results);
|
|
53
|
+
return best ? best.syllables : null;
|
|
91
54
|
},
|
|
92
55
|
/**
|
|
93
|
-
*
|
|
56
|
+
* Parse with erhua awareness, returning all possible parsings.
|
|
94
57
|
*/
|
|
95
|
-
|
|
96
|
-
if (
|
|
97
|
-
return
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
58
|
+
parseAllWithErhua(text) {
|
|
59
|
+
if (!text || text.length === 0)
|
|
60
|
+
return [];
|
|
61
|
+
let normalized = this.normalizeApostrophes(text.toLowerCase().trim());
|
|
62
|
+
// Expand 'r (erhua via apostrophe) to 'er before splitting
|
|
63
|
+
normalized = normalized.replace(/'r/g, "'er");
|
|
64
|
+
if (normalized.includes("'")) {
|
|
65
|
+
return this._parseApostropheSplit(normalized, true);
|
|
66
|
+
}
|
|
67
|
+
return this._parseDP(normalized, true);
|
|
101
68
|
},
|
|
102
69
|
/**
|
|
103
|
-
*
|
|
104
|
-
*
|
|
70
|
+
* Take a raw pinyin string (possibly with punctuation, spaces, apostrophes,
|
|
71
|
+
* and erhua) and return it with proper syllable spacing.
|
|
72
|
+
*
|
|
73
|
+
* The output is lowercased with punctuation stripped, producing clean
|
|
74
|
+
* space-separated syllables suitable for search indexing and tokenization.
|
|
75
|
+
* Tone marks are preserved.
|
|
76
|
+
*
|
|
77
|
+
* Throws if any letter sequence cannot be parsed as pinyin.
|
|
78
|
+
*
|
|
79
|
+
* Examples:
|
|
80
|
+
* 'nǐhǎo' → 'nǐ hǎo'
|
|
81
|
+
* 'nǎr' → 'nǎ er'
|
|
82
|
+
* "xī'ān" → 'xī ān'
|
|
83
|
+
* "gē'rmen" → 'gē er men'
|
|
84
|
+
* 'Nǐ hǎo!' → 'nǐ hǎo'
|
|
85
|
+
* 'tā shēng bìng le, jīng cháng' → 'tā shēng bìng le jīng cháng'
|
|
105
86
|
*/
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
87
|
+
ensurePinyinSpacing(text) {
|
|
88
|
+
// Normalize apostrophes and lowercase
|
|
89
|
+
text = this.normalizeApostrophes(text).toLowerCase();
|
|
90
|
+
// Expand 'r (erhua via apostrophe) to 'er, then replace apostrophes with spaces
|
|
91
|
+
text = text.replace(/'r/g, "'er");
|
|
92
|
+
text = text.replace(/'/g, ' ');
|
|
93
|
+
text = text.replace(/ +/g, ' ');
|
|
94
|
+
const parts = [];
|
|
95
|
+
let i = 0;
|
|
96
|
+
while (i < text.length) {
|
|
97
|
+
if (isLetter(text[i])) {
|
|
98
|
+
let j = i;
|
|
99
|
+
while (j < text.length && isLetter(text[j]))
|
|
100
|
+
j++;
|
|
101
|
+
const letterRun = text.substring(i, j);
|
|
102
|
+
const results = this._parseDP(letterRun, true);
|
|
103
|
+
const best = this.getBestParsing(results);
|
|
104
|
+
if (best) {
|
|
105
|
+
parts.push(best.syllables.join(' '));
|
|
106
|
+
}
|
|
107
|
+
else {
|
|
108
|
+
throw new Error(`Unable to parse as pinyin: "${letterRun}" in "${text}"`);
|
|
109
|
+
}
|
|
110
|
+
i = j;
|
|
111
|
+
}
|
|
112
|
+
else {
|
|
113
|
+
// Non-letter run: preserve ellipsis (…) as a token (used in patterns
|
|
114
|
+
// like 太…了), but strip all other punctuation and collapse to a space.
|
|
115
|
+
let j = i;
|
|
116
|
+
while (j < text.length && !isLetter(text[j]))
|
|
117
|
+
j++;
|
|
118
|
+
const nonLetterRun = text.substring(i, j);
|
|
119
|
+
if (nonLetterRun.includes('…')) {
|
|
120
|
+
// Preserve ellipsis with surrounding spaces
|
|
121
|
+
if (parts.length > 0)
|
|
122
|
+
parts.push(' ');
|
|
123
|
+
parts.push('…');
|
|
124
|
+
if (j < text.length)
|
|
125
|
+
parts.push(' ');
|
|
126
|
+
}
|
|
127
|
+
else if (parts.length > 0 && j < text.length) {
|
|
128
|
+
parts.push(' ');
|
|
129
|
+
}
|
|
130
|
+
i = j;
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
return parts.join('').trim();
|
|
110
134
|
},
|
|
111
135
|
/**
|
|
112
136
|
* Check if a string could be compound pinyin
|
|
@@ -119,6 +143,7 @@ exports.PinyinParser = {
|
|
|
119
143
|
},
|
|
120
144
|
/**
|
|
121
145
|
* Handle special cases like 'zhèr', 'nǎr', 'zhèlǐ', etc.
|
|
146
|
+
* @deprecated Use parseWithErhua() instead for erhua handling
|
|
122
147
|
*/
|
|
123
148
|
parseWithSpecialCases(text) {
|
|
124
149
|
const normalized = text.toLowerCase();
|
|
@@ -129,7 +154,7 @@ exports.PinyinParser = {
|
|
|
129
154
|
if (PinyinValidator_1.PinyinValidator.isValidPinyin(match)) {
|
|
130
155
|
return [{ syllables: [match] }];
|
|
131
156
|
}
|
|
132
|
-
const baseResults = this.
|
|
157
|
+
const baseResults = this._parseDP(base, false);
|
|
133
158
|
return baseResults.map((result) => ({
|
|
134
159
|
syllables: [...result.syllables, 'r']
|
|
135
160
|
}));
|
|
@@ -142,12 +167,117 @@ exports.PinyinParser = {
|
|
|
142
167
|
}
|
|
143
168
|
}
|
|
144
169
|
// If no special cases match, fall back to regular parsing
|
|
145
|
-
return this.
|
|
170
|
+
return this._parseDP(normalized, false);
|
|
171
|
+
},
|
|
172
|
+
/**
|
|
173
|
+
* Get the best parsing from multiple options.
|
|
174
|
+
*
|
|
175
|
+
* Primary criterion: fewer syllables (more natural word-level grouping).
|
|
176
|
+
* Tiebreaker: when two parses have the same syllable count, prefer the
|
|
177
|
+
* one whose first syllable is shorter. This avoids greedy long matches
|
|
178
|
+
* (e.g. "dàng" in "dàngāo") that leave uncommon standalone-vowel
|
|
179
|
+
* remainders (e.g. "āo") when a more balanced split exists ("dàn gāo").
|
|
180
|
+
*/
|
|
181
|
+
getBestParsing(results) {
|
|
182
|
+
if (results.length === 0)
|
|
183
|
+
return null;
|
|
184
|
+
results.sort((a, b) => {
|
|
185
|
+
// Primary: fewer syllables
|
|
186
|
+
if (a.syllables.length !== b.syllables.length) {
|
|
187
|
+
return a.syllables.length - b.syllables.length;
|
|
188
|
+
}
|
|
189
|
+
// Tiebreaker: shorter first syllable
|
|
190
|
+
for (let i = 0; i < a.syllables.length; i++) {
|
|
191
|
+
if (a.syllables[i].length !== b.syllables[i].length) {
|
|
192
|
+
return a.syllables[i].length - b.syllables[i].length;
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
return 0;
|
|
196
|
+
});
|
|
197
|
+
return results[0];
|
|
146
198
|
},
|
|
147
199
|
/**
|
|
148
200
|
* Validate that all syllables in a parsing are legitimate
|
|
149
201
|
*/
|
|
150
202
|
validateParsing(syllables) {
|
|
151
203
|
return syllables.every(syllable => PinyinValidator_1.PinyinValidator.isValidPinyin(syllable));
|
|
152
|
-
}
|
|
204
|
+
},
|
|
205
|
+
// ---------------------------------------------------------------------------
|
|
206
|
+
// Internal methods
|
|
207
|
+
// ---------------------------------------------------------------------------
|
|
208
|
+
/**
|
|
209
|
+
* Parse pinyin string that contains apostrophes.
|
|
210
|
+
* Splits on apostrophes and parses each segment independently.
|
|
211
|
+
*/
|
|
212
|
+
_parseApostropheSplit(text, erhua) {
|
|
213
|
+
const parts = text.split("'");
|
|
214
|
+
let combined = [{ syllables: [] }];
|
|
215
|
+
for (const part of parts) {
|
|
216
|
+
if (!part)
|
|
217
|
+
continue; // skip empty segments from consecutive apostrophes
|
|
218
|
+
const partResults = this._parseDP(part, erhua);
|
|
219
|
+
if (partResults.length === 0)
|
|
220
|
+
return [];
|
|
221
|
+
const newCombined = [];
|
|
222
|
+
for (const prev of combined) {
|
|
223
|
+
for (const curr of partResults) {
|
|
224
|
+
newCombined.push({
|
|
225
|
+
syllables: [...prev.syllables, ...curr.syllables]
|
|
226
|
+
});
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
combined = newCombined;
|
|
230
|
+
}
|
|
231
|
+
return combined.filter((result) => result.syllables.length > 0);
|
|
232
|
+
},
|
|
233
|
+
/**
|
|
234
|
+
* Core DP parser. Finds all valid syllable decompositions of a pinyin string.
|
|
235
|
+
* When erhua=true, also recognizes trailing 'r' as erhua suffix (expanded to 'er')
|
|
236
|
+
* when the base syllable is valid but base+r is not.
|
|
237
|
+
*
|
|
238
|
+
* Works on original text (preserves case and tone marks) because
|
|
239
|
+
* PinyinValidator.isValidPinyin() handles normalization internally.
|
|
240
|
+
*/
|
|
241
|
+
_parseDP(text, erhua) {
|
|
242
|
+
if (!text)
|
|
243
|
+
return [];
|
|
244
|
+
const memo = new Map();
|
|
245
|
+
const parseFromPos = (pos) => {
|
|
246
|
+
if (pos >= text.length)
|
|
247
|
+
return [{ syllables: [] }];
|
|
248
|
+
if (memo.has(pos))
|
|
249
|
+
return memo.get(pos);
|
|
250
|
+
const results = [];
|
|
251
|
+
const maxLen = Math.min(6, text.length - pos);
|
|
252
|
+
for (let len = maxLen; len >= 1; len--) {
|
|
253
|
+
const candidate = text.substring(pos, pos + len);
|
|
254
|
+
if (PinyinValidator_1.PinyinValidator.isValidPinyin(candidate)) {
|
|
255
|
+
// Standard match
|
|
256
|
+
for (const rest of parseFromPos(pos + len)) {
|
|
257
|
+
results.push({ syllables: [candidate, ...rest.syllables] });
|
|
258
|
+
}
|
|
259
|
+
// Erhua: if next char is 'r' and candidate+'r' is NOT a valid syllable,
|
|
260
|
+
// treat the 'r' as an erhua suffix → expand to 'er'
|
|
261
|
+
if (erhua) {
|
|
262
|
+
const nextPos = pos + len;
|
|
263
|
+
if (nextPos < text.length && text[nextPos].toLowerCase() === 'r') {
|
|
264
|
+
const withR = text.substring(pos, nextPos + 1);
|
|
265
|
+
if (!PinyinValidator_1.PinyinValidator.isValidPinyin(withR)) {
|
|
266
|
+
for (const rest of parseFromPos(nextPos + 1)) {
|
|
267
|
+
results.push({ syllables: [candidate, 'er', ...rest.syllables] });
|
|
268
|
+
}
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
memo.set(pos, results);
|
|
275
|
+
return results;
|
|
276
|
+
};
|
|
277
|
+
return parseFromPos(0);
|
|
278
|
+
},
|
|
279
|
+
// Legacy aliases for backward compatibility
|
|
280
|
+
parseAmbiguous(text) {
|
|
281
|
+
return this._parseDP(text, false);
|
|
282
|
+
},
|
|
153
283
|
};
|
|
@@ -11,6 +11,18 @@
|
|
|
11
11
|
* convention the database might use.
|
|
12
12
|
*/
|
|
13
13
|
export declare const PinyinValidator: {
|
|
14
|
+
/**
|
|
15
|
+
* Strip tone marks from pinyin, lowercasing the result.
|
|
16
|
+
* Maps ǖǘǚǜ → ü, toned vowels → base vowels, v with combining marks → v.
|
|
17
|
+
* Used for search/matching where tone distinctions are irrelevant.
|
|
18
|
+
*/
|
|
19
|
+
stripToneMarks(text: string): string;
|
|
20
|
+
/**
|
|
21
|
+
* Remove all accent/tone marks from pinyin without lowercasing.
|
|
22
|
+
* Maps ü and ǖǘǚǜ → v (pipeline convention for normalized pinyin).
|
|
23
|
+
* Used by the data pipeline to create pinyin_normalized fields.
|
|
24
|
+
*/
|
|
25
|
+
removeAccentMarks(text: string): string;
|
|
14
26
|
/**
|
|
15
27
|
* Check if a string is a valid pinyin syllable (with or without tone marks)
|
|
16
28
|
*/
|
|
@@ -197,6 +197,24 @@ const TONE_MAP = {
|
|
|
197
197
|
'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u',
|
|
198
198
|
'ǖ': 'ü', 'ǘ': 'ü', 'ǚ': 'ü', 'ǜ': 'ü'
|
|
199
199
|
};
|
|
200
|
+
// Maps toned vowels to base letters with ü→v (pipeline convention)
|
|
201
|
+
// Includes both lowercase and uppercase variants.
|
|
202
|
+
const ACCENT_MAP = {
|
|
203
|
+
'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
|
|
204
|
+
'Ā': 'A', 'Á': 'A', 'Ǎ': 'A', 'À': 'A',
|
|
205
|
+
'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e',
|
|
206
|
+
'Ē': 'E', 'É': 'E', 'Ě': 'E', 'È': 'E',
|
|
207
|
+
'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
|
|
208
|
+
'Ī': 'I', 'Í': 'I', 'Ǐ': 'I', 'Ì': 'I',
|
|
209
|
+
'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o',
|
|
210
|
+
'Ō': 'O', 'Ó': 'O', 'Ǒ': 'O', 'Ò': 'O',
|
|
211
|
+
'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u',
|
|
212
|
+
'Ū': 'U', 'Ú': 'U', 'Ǔ': 'U', 'Ù': 'U',
|
|
213
|
+
'ǖ': 'v', 'ǘ': 'v', 'ǚ': 'v', 'ǜ': 'v',
|
|
214
|
+
'Ǖ': 'V', 'Ǘ': 'V', 'Ǚ': 'V', 'Ǜ': 'V',
|
|
215
|
+
'ü': 'v', 'Ü': 'V', 'ǹ': 'n', 'Ǹ': 'N',
|
|
216
|
+
};
|
|
217
|
+
const ACCENT_MARKS = /[āáǎàĀÁǍÀēéěèĒÉĚÈīíǐìĪÍǏÌōóǒòŌÓǑÒūúǔùŪÚǓÙǖǘǚǜǕǗǙǛüÜǹǸ]|v[\u0301\u030C\u0300]?/g;
|
|
200
218
|
function stripToneMarks(text) {
|
|
201
219
|
return text.toLowerCase().replace(TONE_MARKS, (match) => {
|
|
202
220
|
if (match.startsWith('v'))
|
|
@@ -205,6 +223,26 @@ function stripToneMarks(text) {
|
|
|
205
223
|
});
|
|
206
224
|
}
|
|
207
225
|
exports.PinyinValidator = {
|
|
226
|
+
/**
|
|
227
|
+
* Strip tone marks from pinyin, lowercasing the result.
|
|
228
|
+
* Maps ǖǘǚǜ → ü, toned vowels → base vowels, v with combining marks → v.
|
|
229
|
+
* Used for search/matching where tone distinctions are irrelevant.
|
|
230
|
+
*/
|
|
231
|
+
stripToneMarks(text) {
|
|
232
|
+
return stripToneMarks(text);
|
|
233
|
+
},
|
|
234
|
+
/**
|
|
235
|
+
* Remove all accent/tone marks from pinyin without lowercasing.
|
|
236
|
+
* Maps ü and ǖǘǚǜ → v (pipeline convention for normalized pinyin).
|
|
237
|
+
* Used by the data pipeline to create pinyin_normalized fields.
|
|
238
|
+
*/
|
|
239
|
+
removeAccentMarks(text) {
|
|
240
|
+
return text.replace(ACCENT_MARKS, (match) => {
|
|
241
|
+
if (match.startsWith('v'))
|
|
242
|
+
return 'v';
|
|
243
|
+
return ACCENT_MAP[match] || match;
|
|
244
|
+
});
|
|
245
|
+
},
|
|
208
246
|
/**
|
|
209
247
|
* Check if a string is a valid pinyin syllable (with or without tone marks)
|
|
210
248
|
*/
|