@shaxpir/duiduidui-models 1.16.0 → 1.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,7 +17,6 @@ export interface Phrase {
17
17
  sense_rank: number;
18
18
  difficulty: number;
19
19
  pinyin: string;
20
- pinyin_tokenized: string;
21
20
  transliteration: string;
22
21
  translation: string;
23
22
  notes: string;
@@ -42,7 +42,6 @@ export interface TermPayload extends BayesianScore {
42
42
  implied_review_count: number;
43
43
  hanzi_count?: number;
44
44
  pinyin?: string;
45
- pinyin_tokenized?: string;
46
45
  transliteration?: string;
47
46
  translation?: string;
48
47
  notes?: string;
@@ -38,7 +38,6 @@ class Term extends Content_1.Content {
38
38
  difficulty: difficulty,
39
39
  hanzi_count: textOrPhrase.length,
40
40
  pinyin: '',
41
- pinyin_tokenized: '',
42
41
  transliteration: '',
43
42
  translation: '',
44
43
  notes: '',
@@ -71,7 +70,6 @@ class Term extends Content_1.Content {
71
70
  hanzi_count: phrase.hanzi_count,
72
71
  difficulty: phrase.difficulty,
73
72
  pinyin: phrase.pinyin,
74
- pinyin_tokenized: phrase.pinyin_tokenized,
75
73
  transliteration: phrase.transliteration,
76
74
  translation: phrase.translation,
77
75
  notes: phrase.notes,
@@ -37,7 +37,10 @@ export declare const PinyinParser: {
37
37
  * Take a raw pinyin string (possibly with punctuation, spaces, apostrophes,
38
38
  * and erhua) and return it with proper syllable spacing.
39
39
  *
40
- * Preserves original tone marks and casing. Preserves punctuation in place.
40
+ * The output is lowercased with punctuation stripped (except the ellipsis '…', which is kept as its own token), producing clean
41
+ * space-separated syllables suitable for search indexing and tokenization.
42
+ * Tone marks are preserved.
43
+ *
41
44
  * Throws if any letter sequence cannot be parsed as pinyin.
42
45
  *
43
46
  * Examples:
@@ -45,7 +48,8 @@ export declare const PinyinParser: {
45
48
  * 'nǎr' → 'nǎ er'
46
49
  * "xī'ān" → 'xī ān'
47
50
  * "gē'rmen" → 'gē er men'
48
- * 'nǐhǎo,shìjiè' → 'nǐ hǎo,shì jiè'
51
+ * 'Nǐ hǎo!' → 'nǐ hǎo'
52
+ * 'tā shēng bìng le, jīng cháng' → 'tā shēng bìng le jīng cháng'
49
53
  */
50
54
  ensurePinyinSpacing(text: string): string;
51
55
  /**
@@ -58,7 +62,13 @@ export declare const PinyinParser: {
58
62
  */
59
63
  parseWithSpecialCases(text: string): PinyinParseResult[];
60
64
  /**
61
- * Get the best parsing from multiple options
65
+ * Get the best parsing from multiple options.
66
+ *
67
+ * Primary criterion: fewer syllables (more natural word-level grouping).
68
+ * Tiebreaker: when two parses have the same syllable count, prefer the
69
+ * one whose syllable is shorter at the first position where the lengths differ (usually the first syllable). This avoids greedy long matches
70
+ * (e.g. "dàng" in "dàngāo") that leave uncommon standalone-vowel
71
+ * remainders (e.g. "āo") when a more balanced split exists ("dàn gāo").
62
72
  */
63
73
  getBestParsing(results: PinyinParseResult[]): PinyinParseResult | null;
64
74
  /**
@@ -59,8 +59,8 @@ exports.PinyinParser = {
59
59
  if (!text || text.length === 0)
60
60
  return [];
61
61
  let normalized = this.normalizeApostrophes(text.toLowerCase().trim());
62
- // Expand 'r (erhua via apostrophe) to 'er before splitting
63
- normalized = normalized.replace(/'r/g, "'er");
62
+ // Expand 'r (erhua via apostrophe) to 'er, but not when followed by a vowel
63
+ normalized = normalized.replace(/'r(?![aeiouāáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜüv])/gi, "'er");
64
64
  if (normalized.includes("'")) {
65
65
  return this._parseApostropheSplit(normalized, true);
66
66
  }
@@ -70,7 +70,10 @@ exports.PinyinParser = {
70
70
  * Take a raw pinyin string (possibly with punctuation, spaces, apostrophes,
71
71
  * and erhua) and return it with proper syllable spacing.
72
72
  *
73
- * Preserves original tone marks and casing. Preserves punctuation in place.
73
+ * The output is lowercased with punctuation stripped (except the ellipsis '…', which is kept as its own token), producing clean
74
+ * space-separated syllables suitable for search indexing and tokenization.
75
+ * Tone marks are preserved.
76
+ *
74
77
  * Throws if any letter sequence cannot be parsed as pinyin.
75
78
  *
76
79
  * Examples:
@@ -78,13 +81,15 @@ exports.PinyinParser = {
78
81
  * 'nǎr' → 'nǎ er'
79
82
  * "xī'ān" → 'xī ān'
80
83
  * "gē'rmen" → 'gē er men'
81
- * 'nǐhǎo,shìjiè' → 'nǐ hǎo,shì jiè'
84
+ * 'Nǐ hǎo!' → 'nǐ hǎo'
85
+ * 'tā shēng bìng le, jīng cháng' → 'tā shēng bìng le jīng cháng'
82
86
  */
83
87
  ensurePinyinSpacing(text) {
84
- // Normalize apostrophes
85
- text = this.normalizeApostrophes(text);
86
- // Expand 'r (erhua via apostrophe) to 'er, then replace apostrophes with spaces
87
- text = text.replace(/'r/g, "'er");
88
+ // Normalize apostrophes and lowercase
89
+ text = this.normalizeApostrophes(text).toLowerCase();
90
+ // Expand 'r (erhua via apostrophe) to 'er, but not when 'r is followed by
91
+ // a vowel (which would mean 'r starts a syllable like 'rén, not erhua).
92
+ text = text.replace(/'r(?![aeiouāáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜüv])/gi, "'er");
88
93
  text = text.replace(/'/g, ' ');
89
94
  text = text.replace(/ +/g, ' ');
90
95
  const parts = [];
@@ -106,14 +111,27 @@ exports.PinyinParser = {
106
111
  i = j;
107
112
  }
108
113
  else {
114
+ // Non-letter run: preserve ellipsis (…) as a token (used in patterns
115
+ // like 太…了), but strip all other punctuation and collapse to a space.
109
116
  let j = i;
110
117
  while (j < text.length && !isLetter(text[j]))
111
118
  j++;
112
- parts.push(text.substring(i, j));
119
+ const nonLetterRun = text.substring(i, j);
120
+ if (nonLetterRun.includes('…')) {
121
+ // Preserve ellipsis with surrounding spaces
122
+ if (parts.length > 0)
123
+ parts.push(' ');
124
+ parts.push('…');
125
+ if (j < text.length)
126
+ parts.push(' ');
127
+ }
128
+ else if (parts.length > 0 && j < text.length) {
129
+ parts.push(' ');
130
+ }
113
131
  i = j;
114
132
  }
115
133
  }
116
- return parts.join('');
134
+ return parts.join('').trim();
117
135
  },
118
136
  /**
119
137
  * Check if a string could be compound pinyin
@@ -153,13 +171,30 @@ exports.PinyinParser = {
153
171
  return this._parseDP(normalized, false);
154
172
  },
155
173
  /**
156
- * Get the best parsing from multiple options
174
+ * Get the best parsing from multiple options.
175
+ *
176
+ * Primary criterion: fewer syllables (more natural word-level grouping).
177
+ * Tiebreaker: when two parses have the same syllable count, prefer the
178
+ * one whose syllable is shorter at the first position where the lengths differ (usually the first syllable). This avoids greedy long matches
179
+ * (e.g. "dàng" in "dàngāo") that leave uncommon standalone-vowel
180
+ * remainders (e.g. "āo") when a more balanced split exists ("dàn gāo").
157
181
  */
158
182
  getBestParsing(results) {
159
183
  if (results.length === 0)
160
184
  return null;
161
- // Prefer fewer syllables (more natural parsing)
162
- results.sort((a, b) => a.syllables.length - b.syllables.length);
185
+ results.sort((a, b) => {
186
+ // Primary: fewer syllables
187
+ if (a.syllables.length !== b.syllables.length) {
188
+ return a.syllables.length - b.syllables.length;
189
+ }
190
+ // Tiebreaker: shorter syllable at the first position where lengths differ
191
+ for (let i = 0; i < a.syllables.length; i++) {
192
+ if (a.syllables[i].length !== b.syllables[i].length) {
193
+ return a.syllables[i].length - b.syllables[i].length;
194
+ }
195
+ }
196
+ return 0;
197
+ });
163
198
  return results[0];
164
199
  },
165
200
  /**
@@ -24,7 +24,10 @@ export declare const PinyinValidator: {
24
24
  */
25
25
  removeAccentMarks(text: string): string;
26
26
  /**
27
- * Check if a string is a valid pinyin syllable (with or without tone marks)
27
+ * Check if a string is a valid pinyin syllable (with or without tone marks).
28
+ * A valid single syllable has at most one tone mark — two tone marks means
29
+ * two syllables have been merged (e.g. "zhùān" looks like "zhuan" after
30
+ * stripping tones, but the two marks prove it's "zhù" + "ān").
28
31
  */
29
32
  isValidPinyin(text: string): boolean;
30
33
  /**
@@ -244,11 +244,18 @@ exports.PinyinValidator = {
244
244
  });
245
245
  },
246
246
  /**
247
- * Check if a string is a valid pinyin syllable (with or without tone marks)
247
+ * Check if a string is a valid pinyin syllable (with or without tone marks).
248
+ * A valid single syllable has at most one tone mark — two tone marks means
249
+ * two syllables have been merged (e.g. "zhùān" looks like "zhuan" after
250
+ * stripping tones, but the two marks prove it's "zhù" + "ān").
248
251
  */
249
252
  isValidPinyin(text) {
250
253
  if (!text || text.length === 0)
251
254
  return false;
255
+ // Count tone marks: a single syllable can have at most one
256
+ const toneCount = (text.match(TONE_MARKS) || []).length;
257
+ if (toneCount > 1)
258
+ return false;
252
259
  const normalized = stripToneMarks(text);
253
260
  return VALID_SYLLABLES.has(normalized);
254
261
  },
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@shaxpir/duiduidui-models",
3
- "version": "1.16.0",
3
+ "version": "1.17.1",
4
4
  "repository": {
5
5
  "type": "git",
6
6
  "url": "https://github.com/shaxpir/duiduidui-models"