@shaxpir/duiduidui-models 1.16.0 → 1.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,7 +17,6 @@ export interface Phrase {
17
17
  sense_rank: number;
18
18
  difficulty: number;
19
19
  pinyin: string;
20
- pinyin_tokenized: string;
21
20
  transliteration: string;
22
21
  translation: string;
23
22
  notes: string;
@@ -42,7 +42,6 @@ export interface TermPayload extends BayesianScore {
42
42
  implied_review_count: number;
43
43
  hanzi_count?: number;
44
44
  pinyin?: string;
45
- pinyin_tokenized?: string;
46
45
  transliteration?: string;
47
46
  translation?: string;
48
47
  notes?: string;
@@ -38,7 +38,6 @@ class Term extends Content_1.Content {
38
38
  difficulty: difficulty,
39
39
  hanzi_count: textOrPhrase.length,
40
40
  pinyin: '',
41
- pinyin_tokenized: '',
42
41
  transliteration: '',
43
42
  translation: '',
44
43
  notes: '',
@@ -71,7 +70,6 @@ class Term extends Content_1.Content {
71
70
  hanzi_count: phrase.hanzi_count,
72
71
  difficulty: phrase.difficulty,
73
72
  pinyin: phrase.pinyin,
74
- pinyin_tokenized: phrase.pinyin_tokenized,
75
73
  transliteration: phrase.transliteration,
76
74
  translation: phrase.translation,
77
75
  notes: phrase.notes,
@@ -37,7 +37,10 @@ export declare const PinyinParser: {
37
37
  * Take a raw pinyin string (possibly with punctuation, spaces, apostrophes,
38
38
  * and erhua) and return it with proper syllable spacing.
39
39
  *
40
- * Preserves original tone marks and casing. Preserves punctuation in place.
40
+ * The output is lowercased with punctuation stripped, producing clean
41
+ * space-separated syllables suitable for search indexing and tokenization.
42
+ * Tone marks are preserved, and an ellipsis (…) is kept as its own token.
43
+ *
41
44
  * Throws if any letter sequence cannot be parsed as pinyin.
42
45
  *
43
46
  * Examples:
@@ -45,7 +48,8 @@ export declare const PinyinParser: {
45
48
  * 'nǎr' → 'nǎ er'
46
49
  * "xī'ān" → 'xī ān'
47
50
  * "gē'rmen" → 'gē er men'
48
- * 'nǐhǎo,shìjiè' → 'nǐ hǎo,shì jiè'
51
+ * 'Nǐ hǎo!' → 'nǐ hǎo'
52
+ * 'tā shēng bìng le, jīng cháng' → 'tā shēng bìng le jīng cháng'
49
53
  */
50
54
  ensurePinyinSpacing(text: string): string;
51
55
  /**
@@ -58,7 +62,13 @@ export declare const PinyinParser: {
58
62
  */
59
63
  parseWithSpecialCases(text: string): PinyinParseResult[];
60
64
  /**
61
- * Get the best parsing from multiple options
65
+ * Get the best parsing from multiple options.
66
+ *
67
+ * Primary criterion: fewer syllables (more natural word-level grouping).
68
+ * Tiebreaker: when two parses have the same syllable count, prefer the
69
+ * one with the shorter syllable at the first position where lengths differ. This avoids greedy long matches
70
+ * (e.g. "dàng" in "dàngāo") that leave uncommon standalone-vowel
71
+ * remainders (e.g. "āo") when a more balanced split exists ("dàn gāo").
62
72
  */
63
73
  getBestParsing(results: PinyinParseResult[]): PinyinParseResult | null;
64
74
  /**
@@ -70,7 +70,10 @@ exports.PinyinParser = {
70
70
  * Take a raw pinyin string (possibly with punctuation, spaces, apostrophes,
71
71
  * and erhua) and return it with proper syllable spacing.
72
72
  *
73
- * Preserves original tone marks and casing. Preserves punctuation in place.
73
+ * The output is lowercased with punctuation stripped, producing clean
74
+ * space-separated syllables suitable for search indexing and tokenization.
75
+ * Tone marks are preserved, and an ellipsis (…) is kept as its own token.
76
+ *
74
77
  * Throws if any letter sequence cannot be parsed as pinyin.
75
78
  *
76
79
  * Examples:
@@ -78,11 +81,12 @@ exports.PinyinParser = {
78
81
  * 'nǎr' → 'nǎ er'
79
82
  * "xī'ān" → 'xī ān'
80
83
  * "gē'rmen" → 'gē er men'
81
- * 'nǐhǎo,shìjiè' → 'nǐ hǎo,shì jiè'
84
+ * 'Nǐ hǎo!' → 'nǐ hǎo'
85
+ * 'tā shēng bìng le, jīng cháng' → 'tā shēng bìng le jīng cháng'
82
86
  */
83
87
  ensurePinyinSpacing(text) {
84
- // Normalize apostrophes
85
- text = this.normalizeApostrophes(text);
88
+ // Normalize apostrophes and lowercase
89
+ text = this.normalizeApostrophes(text).toLowerCase();
86
90
  // Expand 'r (erhua via apostrophe) to 'er, then replace apostrophes with spaces
87
91
  text = text.replace(/'r/g, "'er");
88
92
  text = text.replace(/'/g, ' ');
@@ -106,14 +110,27 @@ exports.PinyinParser = {
106
110
  i = j;
107
111
  }
108
112
  else {
113
+ // Non-letter run: preserve ellipsis (…) as a token (used in patterns
114
+ // like 太…了), but strip all other punctuation and collapse to a space.
109
115
  let j = i;
110
116
  while (j < text.length && !isLetter(text[j]))
111
117
  j++;
112
- parts.push(text.substring(i, j));
118
+ const nonLetterRun = text.substring(i, j);
119
+ if (nonLetterRun.includes('…')) {
120
+ // Preserve ellipsis with surrounding spaces
121
+ if (parts.length > 0)
122
+ parts.push(' ');
123
+ parts.push('…');
124
+ if (j < text.length)
125
+ parts.push(' ');
126
+ }
127
+ else if (parts.length > 0 && j < text.length) {
128
+ parts.push(' ');
129
+ }
113
130
  i = j;
114
131
  }
115
132
  }
116
- return parts.join('');
133
+ return parts.join('').trim();
117
134
  },
118
135
  /**
119
136
  * Check if a string could be compound pinyin
@@ -153,13 +170,30 @@ exports.PinyinParser = {
153
170
  return this._parseDP(normalized, false);
154
171
  },
155
172
  /**
156
- * Get the best parsing from multiple options
173
+ * Get the best parsing from multiple options.
174
+ *
175
+ * Primary criterion: fewer syllables (more natural word-level grouping).
176
+ * Tiebreaker: when two parses have the same syllable count, prefer the
177
+ * one with the shorter syllable at the first position where lengths differ. This avoids greedy long matches
178
+ * (e.g. "dàng" in "dàngāo") that leave uncommon standalone-vowel
179
+ * remainders (e.g. "āo") when a more balanced split exists ("dàn gāo").
157
180
  */
158
181
  getBestParsing(results) {
159
182
  if (results.length === 0)
160
183
  return null;
161
- // Prefer fewer syllables (more natural parsing)
162
- results.sort((a, b) => a.syllables.length - b.syllables.length);
184
+ results.sort((a, b) => {
185
+ // Primary: fewer syllables
186
+ if (a.syllables.length !== b.syllables.length) {
187
+ return a.syllables.length - b.syllables.length;
188
+ }
189
+ // Tiebreaker: shorter syllable at the first position where lengths differ
190
+ for (let i = 0; i < a.syllables.length; i++) {
191
+ if (a.syllables[i].length !== b.syllables[i].length) {
192
+ return a.syllables[i].length - b.syllables[i].length;
193
+ }
194
+ }
195
+ return 0;
196
+ });
163
197
  return results[0];
164
198
  },
165
199
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@shaxpir/duiduidui-models",
3
- "version": "1.16.0",
3
+ "version": "1.17.0",
4
4
  "repository": {
5
5
  "type": "git",
6
6
  "url": "https://github.com/shaxpir/duiduidui-models"