@shaxpir/duiduidui-models 1.15.0 → 1.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,32 +16,68 @@ export declare const PinyinParser: {
16
16
  */
17
17
  parseAll(text: string): PinyinParseResult[];
18
18
  /**
19
- * Parse pinyin string that contains apostrophes
19
+ * Parse and return only the best parsing
20
+ * Returns null if the text cannot be parsed as pinyin
20
21
  */
21
- parseWithApostrophes(text: string): PinyinParseResult[];
22
+ parse(text: string): string[] | null;
22
23
  /**
23
- * Parse ambiguous pinyin string (no apostrophes) into all possible valid combinations
24
+ * Parse with erhua (儿化) awareness.
25
+ * Recognizes patterns like 'nǎr' where the trailing 'r' is an erhua suffix
26
+ * (the base syllable is valid but base+r is not).
27
+ * Erhua 'r' is expanded to 'er' in the output.
28
+ *
29
+ * Also handles 'r after apostrophe: nǎ'r → ['nǎ', 'er']
24
30
  */
25
- parseAmbiguous(text: string): PinyinParseResult[];
31
+ parseWithErhua(text: string): string[] | null;
26
32
  /**
27
- * Get the best parsing from multiple options
33
+ * Parse with erhua awareness, returning all possible parsings.
28
34
  */
29
- getBestParsing(results: PinyinParseResult[]): PinyinParseResult | null;
35
+ parseAllWithErhua(text: string): PinyinParseResult[];
30
36
  /**
31
- * Parse and return only the best parsing
32
- * Returns null if the text cannot be parsed as pinyin
37
+ * Take a raw pinyin string (possibly with punctuation, spaces, apostrophes,
38
+ * and erhua) and return it with proper syllable spacing.
39
+ *
40
+ * Preserves original tone marks and casing. Preserves punctuation in place.
41
+ * Throws if any letter sequence cannot be parsed as pinyin.
42
+ *
43
+ * Examples:
44
+ * 'nǐhǎo' → 'nǐ hǎo'
45
+ * 'nǎr' → 'nǎ er'
46
+ * "xī'ān" → 'xī ān'
47
+ * "gē'rmen" → 'gē er men'
48
+ * 'nǐhǎo,shìjiè' → 'nǐ hǎo,shì jiè'
33
49
  */
34
- parse(text: string): string[] | null;
50
+ ensurePinyinSpacing(text: string): string;
35
51
  /**
36
52
  * Check if a string could be compound pinyin
37
53
  */
38
54
  couldBeCompoundPinyin(text: string): boolean;
39
55
  /**
40
56
  * Handle special cases like 'zhèr', 'nǎr', 'zhèlǐ', etc.
57
+ * @deprecated Use parseWithErhua() instead for erhua handling
41
58
  */
42
59
  parseWithSpecialCases(text: string): PinyinParseResult[];
60
+ /**
61
+ * Get the best parsing from multiple options
62
+ */
63
+ getBestParsing(results: PinyinParseResult[]): PinyinParseResult | null;
43
64
  /**
44
65
  * Validate that all syllables in a parsing are legitimate
45
66
  */
46
67
  validateParsing(syllables: string[]): boolean;
68
+ /**
69
+ * Parse pinyin string that contains apostrophes.
70
+ * Splits on apostrophes and parses each segment independently.
71
+ */
72
+ _parseApostropheSplit(text: string, erhua: boolean): PinyinParseResult[];
73
+ /**
74
+ * Core DP parser. Finds all valid syllable decompositions of a pinyin string.
75
+ * When erhua=true, also recognizes trailing 'r' as erhua suffix (expanded to 'er')
76
+ * when the base syllable is valid but base+r is not.
77
+ *
78
+ * Works on original text (preserves case and tone marks) because
79
+ * PinyinValidator.isValidPinyin() handles normalization internally.
80
+ */
81
+ _parseDP(text: string, erhua: boolean): PinyinParseResult[];
82
+ parseAmbiguous(text: string): PinyinParseResult[];
47
83
  };
@@ -2,6 +2,8 @@
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.PinyinParser = void 0;
4
4
  const PinyinValidator_1 = require("./PinyinValidator");
5
// Check whether a single character belongs to a pinyin letter run.
// Matches Unicode letters (\p{L}, which covers precomposed toned vowels such as 'ǎ')
// and combining marks (\p{M}). Combining marks are included because the 'v'
// convention for ü writes tones as v + U+0301/U+030C/U+0300; without \p{M} a run
// like "lv\u030Cshi" would be cut at the mark and the syllable boundary lost.
const isLetter = (char) => /[\p{L}\p{M}]/u.test(char);
5
7
  exports.PinyinParser = {
6
8
  /**
7
9
  * Normalize apostrophes to straight apostrophe for pinyin parsing.
@@ -23,90 +25,95 @@ exports.PinyinParser = {
23
25
  const normalized = this.normalizeApostrophes(text.toLowerCase().trim());
24
26
  // Handle explicit apostrophes first
25
27
  if (normalized.includes("'")) {
26
- return this.parseWithApostrophes(normalized);
28
+ return this._parseApostropheSplit(normalized, false);
27
29
  }
28
30
  // For strings without apostrophes, try all possible parsings
29
- return this.parseAmbiguous(normalized);
31
+ return this._parseDP(normalized, false);
30
32
  },
31
33
  /**
32
- * Parse pinyin string that contains apostrophes
34
+ * Parse and return only the best parsing
35
+ * Returns null if the text cannot be parsed as pinyin
33
36
  */
34
- parseWithApostrophes(text) {
35
- const parts = text.split("'");
36
- const results = [];
37
- // The first part doesn't have a preceding apostrophe
38
- let firstPartParsings = this.parseAmbiguous(parts[0]);
39
- // For subsequent parts, we know they start with a vowel (that's why there's an apostrophe)
40
- for (let i = 1; i < parts.length; i++) {
41
- const part = parts[i];
42
- const partParsings = this.parseAmbiguous(part);
43
- // Combine all previous results with all current part results
44
- const newResults = [];
45
- if (firstPartParsings.length === 0) {
46
- firstPartParsings = [{ syllables: [] }];
47
- }
48
- for (const prevResult of firstPartParsings) {
49
- for (const partResult of partParsings) {
50
- newResults.push({
51
- syllables: [...prevResult.syllables, ...partResult.syllables]
52
- });
53
- }
54
- }
55
- firstPartParsings = newResults;
56
- }
57
- return firstPartParsings.filter((result) => result.syllables.length > 0);
37
+ parse(text) {
38
+ const results = this.parseAll(text);
39
+ const best = this.getBestParsing(results);
40
+ return best ? best.syllables : null;
58
41
  },
59
42
  /**
60
- * Parse ambiguous pinyin string (no apostrophes) into all possible valid combinations
43
+ * Parse with erhua (儿化) awareness.
44
+ * Recognizes patterns like 'nǎr' where the trailing 'r' is an erhua suffix
45
+ * (the base syllable is valid but base+r is not).
46
+ * Erhua 'r' is expanded to 'er' in the output.
47
+ *
48
+ * Also handles 'r after apostrophe: nǎ'r → ['nǎ', 'er']
61
49
  */
62
- parseAmbiguous(text) {
63
- if (!text)
64
- return [];
65
- // Use dynamic programming to find all valid parsings
66
- const memo = new Map();
67
- const parseRecursive = (remaining) => {
68
- if (remaining.length === 0) {
69
- return [{ syllables: [] }];
70
- }
71
- if (memo.has(remaining)) {
72
- return memo.get(remaining);
73
- }
74
- const results = [];
75
- // Try all possible syllable lengths from longest to shortest
76
- for (let len = Math.min(6, remaining.length); len >= 1; len--) {
77
- const candidate = remaining.substring(0, len);
78
- if (PinyinValidator_1.PinyinValidator.isValidPinyin(candidate)) {
79
- const restResults = parseRecursive(remaining.substring(len));
80
- for (const restResult of restResults) {
81
- results.push({
82
- syllables: [candidate, ...restResult.syllables]
83
- });
84
- }
85
- }
86
- }
87
- memo.set(remaining, results);
88
- return results;
89
- };
90
- return parseRecursive(text);
50
+ parseWithErhua(text) {
51
+ const results = this.parseAllWithErhua(text);
52
+ const best = this.getBestParsing(results);
53
+ return best ? best.syllables : null;
91
54
  },
92
55
  /**
93
- * Get the best parsing from multiple options
56
+ * Parse with erhua awareness, returning all possible parsings.
94
57
  */
95
- getBestParsing(results) {
96
- if (results.length === 0)
97
- return null;
98
- // Prefer fewer syllables (more natural parsing)
99
- results.sort((a, b) => a.syllables.length - b.syllables.length);
100
- return results[0];
58
+ parseAllWithErhua(text) {
59
+ if (!text || text.length === 0)
60
+ return [];
61
+ let normalized = this.normalizeApostrophes(text.toLowerCase().trim());
62
+ // Expand 'r (erhua via apostrophe) to 'er before splitting
63
+ normalized = normalized.replace(/'r/g, "'er");
64
+ if (normalized.includes("'")) {
65
+ return this._parseApostropheSplit(normalized, true);
66
+ }
67
+ return this._parseDP(normalized, true);
101
68
  },
102
69
  /**
103
- * Parse and return only the best parsing
104
- * Returns null if the text cannot be parsed as pinyin
70
+ * Take a raw pinyin string (possibly with punctuation, spaces, apostrophes,
71
+ * and erhua) and return it with proper syllable spacing.
72
+ *
73
+ * Preserves original tone marks and casing. Preserves punctuation in place.
74
+ * Throws if any letter sequence cannot be parsed as pinyin.
75
+ *
76
+ * Examples:
77
+ * 'nǐhǎo' → 'nǐ hǎo'
78
+ * 'nǎr' → 'nǎ er'
79
+ * "xī'ān" → 'xī ān'
80
+ * "gē'rmen" → 'gē er men'
81
+ * 'nǐhǎo,shìjiè' → 'nǐ hǎo,shì jiè'
105
82
  */
106
- parse(text) {
107
- const results = this.parseAll(text);
108
- const best = this.getBestParsing(results);
109
- return best ? best.syllables : null;
83
+ ensurePinyinSpacing(text) {
84
+ // Normalize apostrophes
85
+ text = this.normalizeApostrophes(text);
86
+ // Expand 'r (erhua via apostrophe) to 'er, then replace apostrophes with spaces
87
+ text = text.replace(/'r/g, "'er");
88
+ text = text.replace(/'/g, ' ');
89
+ text = text.replace(/ +/g, ' ');
90
+ const parts = [];
91
+ let i = 0;
92
+ while (i < text.length) {
93
+ if (isLetter(text[i])) {
94
+ let j = i;
95
+ while (j < text.length && isLetter(text[j]))
96
+ j++;
97
+ const letterRun = text.substring(i, j);
98
+ const results = this._parseDP(letterRun, true);
99
+ const best = this.getBestParsing(results);
100
+ if (best) {
101
+ parts.push(best.syllables.join(' '));
102
+ }
103
+ else {
104
+ throw new Error(`Unable to parse as pinyin: "${letterRun}" in "${text}"`);
105
+ }
106
+ i = j;
107
+ }
108
+ else {
109
+ let j = i;
110
+ while (j < text.length && !isLetter(text[j]))
111
+ j++;
112
+ parts.push(text.substring(i, j));
113
+ i = j;
114
+ }
115
+ }
116
+ return parts.join('');
110
117
  },
111
118
  /**
112
119
  * Check if a string could be compound pinyin
@@ -119,6 +126,7 @@ exports.PinyinParser = {
119
126
  },
120
127
  /**
121
128
  * Handle special cases like 'zhèr', 'nǎr', 'zhèlǐ', etc.
129
+ * @deprecated Use parseWithErhua() instead for erhua handling
122
130
  */
123
131
  parseWithSpecialCases(text) {
124
132
  const normalized = text.toLowerCase();
@@ -129,7 +137,7 @@ exports.PinyinParser = {
129
137
  if (PinyinValidator_1.PinyinValidator.isValidPinyin(match)) {
130
138
  return [{ syllables: [match] }];
131
139
  }
132
- const baseResults = this.parseAmbiguous(base);
140
+ const baseResults = this._parseDP(base, false);
133
141
  return baseResults.map((result) => ({
134
142
  syllables: [...result.syllables, 'r']
135
143
  }));
@@ -142,12 +150,100 @@ exports.PinyinParser = {
142
150
  }
143
151
  }
144
152
  // If no special cases match, fall back to regular parsing
145
- return this.parseAmbiguous(normalized);
153
+ return this._parseDP(normalized, false);
154
+ },
155
+ /**
156
+ * Get the best parsing from multiple options
157
+ */
158
+ getBestParsing(results) {
159
+ if (results.length === 0)
160
+ return null;
161
+ // Prefer fewer syllables (more natural parsing)
162
+ results.sort((a, b) => a.syllables.length - b.syllables.length);
163
+ return results[0];
146
164
  },
147
165
  /**
148
166
  * Validate that all syllables in a parsing are legitimate
149
167
  */
150
168
  validateParsing(syllables) {
151
169
  return syllables.every(syllable => PinyinValidator_1.PinyinValidator.isValidPinyin(syllable));
152
- }
170
+ },
171
+ // ---------------------------------------------------------------------------
172
+ // Internal methods
173
+ // ---------------------------------------------------------------------------
174
+ /**
175
+ * Parse pinyin string that contains apostrophes.
176
+ * Splits on apostrophes and parses each segment independently.
177
+ */
178
+ _parseApostropheSplit(text, erhua) {
179
+ const parts = text.split("'");
180
+ let combined = [{ syllables: [] }];
181
+ for (const part of parts) {
182
+ if (!part)
183
+ continue; // skip empty segments from consecutive apostrophes
184
+ const partResults = this._parseDP(part, erhua);
185
+ if (partResults.length === 0)
186
+ return [];
187
+ const newCombined = [];
188
+ for (const prev of combined) {
189
+ for (const curr of partResults) {
190
+ newCombined.push({
191
+ syllables: [...prev.syllables, ...curr.syllables]
192
+ });
193
+ }
194
+ }
195
+ combined = newCombined;
196
+ }
197
+ return combined.filter((result) => result.syllables.length > 0);
198
+ },
199
+ /**
200
+ * Core DP parser. Finds all valid syllable decompositions of a pinyin string.
201
+ * When erhua=true, also recognizes trailing 'r' as erhua suffix (expanded to 'er')
202
+ * when the base syllable is valid but base+r is not.
203
+ *
204
+ * Works on original text (preserves case and tone marks) because
205
+ * PinyinValidator.isValidPinyin() handles normalization internally.
206
+ */
207
+ _parseDP(text, erhua) {
208
+ if (!text)
209
+ return [];
210
+ const memo = new Map();
211
+ const parseFromPos = (pos) => {
212
+ if (pos >= text.length)
213
+ return [{ syllables: [] }];
214
+ if (memo.has(pos))
215
+ return memo.get(pos);
216
+ const results = [];
217
+ const maxLen = Math.min(6, text.length - pos);
218
+ for (let len = maxLen; len >= 1; len--) {
219
+ const candidate = text.substring(pos, pos + len);
220
+ if (PinyinValidator_1.PinyinValidator.isValidPinyin(candidate)) {
221
+ // Standard match
222
+ for (const rest of parseFromPos(pos + len)) {
223
+ results.push({ syllables: [candidate, ...rest.syllables] });
224
+ }
225
+ // Erhua: if next char is 'r' and candidate+'r' is NOT a valid syllable,
226
+ // treat the 'r' as an erhua suffix → expand to 'er'
227
+ if (erhua) {
228
+ const nextPos = pos + len;
229
+ if (nextPos < text.length && text[nextPos].toLowerCase() === 'r') {
230
+ const withR = text.substring(pos, nextPos + 1);
231
+ if (!PinyinValidator_1.PinyinValidator.isValidPinyin(withR)) {
232
+ for (const rest of parseFromPos(nextPos + 1)) {
233
+ results.push({ syllables: [candidate, 'er', ...rest.syllables] });
234
+ }
235
+ }
236
+ }
237
+ }
238
+ }
239
+ }
240
+ memo.set(pos, results);
241
+ return results;
242
+ };
243
+ return parseFromPos(0);
244
+ },
245
+ // Legacy aliases for backward compatibility
246
+ parseAmbiguous(text) {
247
+ return this._parseDP(text, false);
248
+ },
153
249
  };
@@ -11,6 +11,18 @@
11
11
  * convention the database might use.
12
12
  */
13
13
  export declare const PinyinValidator: {
14
+ /**
15
+ * Strip tone marks from pinyin, lowercasing the result.
16
+ * Maps ǖǘǚǜ → ü, toned vowels → base vowels, v with combining marks → v.
17
+ * Used for search/matching where tone distinctions are irrelevant.
18
+ */
19
+ stripToneMarks(text: string): string;
20
+ /**
21
+ * Remove all accent/tone marks from pinyin without lowercasing.
22
+ * Maps ü and ǖǘǚǜ → v (pipeline convention for normalized pinyin).
23
+ * Used by the data pipeline to create pinyin_normalized fields.
24
+ */
25
+ removeAccentMarks(text: string): string;
14
26
  /**
15
27
  * Check if a string is a valid pinyin syllable (with or without tone marks)
16
28
  */
@@ -197,6 +197,24 @@ const TONE_MAP = {
197
197
  'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u',
198
198
  'ǖ': 'ü', 'ǘ': 'ü', 'ǚ': 'ü', 'ǜ': 'ü'
199
199
  };
200
+ // Maps toned vowels to base letters with ü→v (pipeline convention)
201
+ // Includes both lowercase and uppercase variants.
202
+ const ACCENT_MAP = {
203
+ 'ā': 'a', 'á': 'a', 'ǎ': 'a', 'à': 'a',
204
+ 'Ā': 'A', 'Á': 'A', 'Ǎ': 'A', 'À': 'A',
205
+ 'ē': 'e', 'é': 'e', 'ě': 'e', 'è': 'e',
206
+ 'Ē': 'E', 'É': 'E', 'Ě': 'E', 'È': 'E',
207
+ 'ī': 'i', 'í': 'i', 'ǐ': 'i', 'ì': 'i',
208
+ 'Ī': 'I', 'Í': 'I', 'Ǐ': 'I', 'Ì': 'I',
209
+ 'ō': 'o', 'ó': 'o', 'ǒ': 'o', 'ò': 'o',
210
+ 'Ō': 'O', 'Ó': 'O', 'Ǒ': 'O', 'Ò': 'O',
211
+ 'ū': 'u', 'ú': 'u', 'ǔ': 'u', 'ù': 'u',
212
+ 'Ū': 'U', 'Ú': 'U', 'Ǔ': 'U', 'Ù': 'U',
213
+ 'ǖ': 'v', 'ǘ': 'v', 'ǚ': 'v', 'ǜ': 'v',
214
+ 'Ǖ': 'V', 'Ǘ': 'V', 'Ǚ': 'V', 'Ǜ': 'V',
215
+ 'ü': 'v', 'Ü': 'V', 'ǹ': 'n', 'Ǹ': 'N',
216
+ };
217
+ const ACCENT_MARKS = /[āáǎàĀÁǍÀēéěèĒÉĚÈīíǐìĪÍǏÌōóǒòŌÓǑÒūúǔùŪÚǓÙǖǘǚǜǕǗǙǛüÜǹǸ]|v[\u0301\u030C\u0300]?/g;
200
218
  function stripToneMarks(text) {
201
219
  return text.toLowerCase().replace(TONE_MARKS, (match) => {
202
220
  if (match.startsWith('v'))
@@ -205,6 +223,26 @@ function stripToneMarks(text) {
205
223
  });
206
224
  }
207
225
  exports.PinyinValidator = {
226
+ /**
227
+ * Strip tone marks from pinyin, lowercasing the result.
228
+ * Maps ǖǘǚǜ → ü, toned vowels → base vowels, v with combining marks → v.
229
+ * Used for search/matching where tone distinctions are irrelevant.
230
+ */
231
+ stripToneMarks(text) {
232
+ return stripToneMarks(text);
233
+ },
234
+ /**
235
+ * Remove all accent/tone marks from pinyin without lowercasing.
236
+ * Maps ü and ǖǘǚǜ → v (pipeline convention for normalized pinyin).
237
+ * Used by the data pipeline to create pinyin_normalized fields.
238
+ */
239
+ removeAccentMarks(text) {
240
+ return text.replace(ACCENT_MARKS, (match) => {
241
+ if (match.startsWith('v'))
242
+ return 'v';
243
+ return ACCENT_MAP[match] || match;
244
+ });
245
+ },
208
246
  /**
209
247
  * Check if a string is a valid pinyin syllable (with or without tone marks)
210
248
  */
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@shaxpir/duiduidui-models",
3
- "version": "1.15.0",
3
+ "version": "1.16.0",
4
4
  "repository": {
5
5
  "type": "git",
6
6
  "url": "https://github.com/shaxpir/duiduidui-models"