euparliamentmonitor 0.9.25 → 0.9.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -59,19 +59,16 @@ export function budgetFor(lang, surface) {
|
|
|
59
59
|
/**
|
|
60
60
|
* CJK full-width clause boundaries — the breakpoints CJK readers
|
|
61
61
|
* expect a snippet to end at. Listed in preferred-break order: a
|
|
62
|
-
* sentence-final mark beats a
|
|
62
|
+
* sentence-final mark beats a semicolon which beats a middle-dot.
|
|
63
|
+
*
|
|
64
|
+
* **Note**: `、` (U+3001, enumeration comma) is deliberately excluded.
|
|
65
|
+
* Cutting at an enumeration comma leaves a grammatically broken list
|
|
66
|
+
* fragment (e.g. "民主问责、") which downstream `ensureTerminator`
|
|
67
|
+
* closes with `。`, producing nonsensical "民主问责、。". The enumeration
|
|
68
|
+
* comma is semantically equivalent to Latin `,` — a list separator,
|
|
69
|
+
* not a sentence boundary.
|
|
63
70
|
*/
|
|
64
|
-
const CJK_CLAUSE_BOUNDARIES = [
|
|
65
|
-
'。',
|
|
66
|
-
'!',
|
|
67
|
-
'?',
|
|
68
|
-
'、',
|
|
69
|
-
';',
|
|
70
|
-
':',
|
|
71
|
-
'——',
|
|
72
|
-
'—',
|
|
73
|
-
'・',
|
|
74
|
-
];
|
|
71
|
+
const CJK_CLAUSE_BOUNDARIES = ['。', '!', '?', ';', ':', '——', '—', '・'];
|
|
75
72
|
/**
|
|
76
73
|
* RTL sentence punctuation. Arabic uses U+061F (؟) for question mark
|
|
77
74
|
* and U+060C (،) for comma; full stop is the ASCII `.` (Hebrew uses
|
|
@@ -91,11 +88,15 @@ const SOFT_MIN_RATIO = 0.55;
|
|
|
91
88
|
* `text-utils.ts::TRAILING_PUNCT` but keeps full-width CJK marks
|
|
92
89
|
* intact when they sit at a natural sentence boundary.
|
|
93
90
|
*
|
|
91
|
+
* Includes `、` (U+3001, CJK enumeration comma) which should never
|
|
92
|
+
* appear at the end of a truncated snippet — it signals a list
|
|
93
|
+
* continuation that never arrives.
|
|
94
|
+
*
|
|
94
95
|
* @param s - Input string to trim
|
|
95
96
|
* @returns Input with trailing separator-class characters removed
|
|
96
97
|
*/
|
|
97
98
|
function trimTrailingSeparators(s) {
|
|
98
|
-
return s.replace(/[\s
|
|
99
|
+
return s.replace(/[\s,;:—\-–·•…、]+$/u, '');
|
|
99
100
|
}
|
|
100
101
|
/**
|
|
101
102
|
* Pick the highest-priority clause boundary inside a candidate window.
|
|
@@ -141,20 +142,28 @@ export function clampForBudget(text, lang, surface) {
|
|
|
141
142
|
const softMin = Math.floor(budget * SOFT_MIN_RATIO);
|
|
142
143
|
// Reserve one char for the ellipsis we may append.
|
|
143
144
|
const window = trimmed.slice(0, budget - 1);
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
145
|
+
// Korean uses Western-style punctuation (. ! ? ,) and inter-word
|
|
146
|
+
// spaces despite being classified as CJK for pixel-budget purposes.
|
|
147
|
+
// Use Latin clause boundaries and allow the whitespace fallback so we
|
|
148
|
+
// don't hard-cut mid-token (e.g. splitting "2026-04-26" → "2026-0").
|
|
149
|
+
const useLatinBoundaries = lang === 'ko';
|
|
150
|
+
const boundaries = useLatinBoundaries
|
|
151
|
+
? HEADLINE_CLAUSE_BOUNDARIES
|
|
152
|
+
: family === 'cjk'
|
|
153
|
+
? CJK_CLAUSE_BOUNDARIES
|
|
154
|
+
: family === 'rtl'
|
|
155
|
+
? RTL_CLAUSE_BOUNDARIES
|
|
156
|
+
: HEADLINE_CLAUSE_BOUNDARIES;
|
|
149
157
|
const clauseCut = findClauseCut(window, boundaries, softMin);
|
|
150
158
|
if (clauseCut > 0) {
|
|
151
159
|
const cleaned = trimTrailingSeparators(trimmed.slice(0, clauseCut));
|
|
152
160
|
if (cleaned.length >= softMin)
|
|
153
161
|
return cleaned;
|
|
154
162
|
}
|
|
155
|
-
// Whitespace-aware fallback.
|
|
156
|
-
// skip this step for
|
|
157
|
-
|
|
163
|
+
// Whitespace-aware fallback. Chinese and Japanese text often has no
|
|
164
|
+
// ASCII spaces, so skip this step for them and fall straight through
|
|
165
|
+
// to the hard cut. Korean is the exception — it uses inter-word spaces.
|
|
166
|
+
if (family !== 'cjk' || lang === 'ko') {
|
|
158
167
|
const lastSpace = window.lastIndexOf(' ');
|
|
159
168
|
if (lastSpace >= softMin) {
|
|
160
169
|
const safe = trimTrailingSeparators(window.slice(0, lastSpace));
|
|
@@ -68,7 +68,14 @@ export function resolveUniqueAnalysisDir(baseDir) {
|
|
|
68
68
|
}
|
|
69
69
|
suffix++;
|
|
70
70
|
}
|
|
71
|
-
|
|
71
|
+
// Guarantee the UUID suffix contains at least one hex letter (a-f) so that
|
|
72
|
+
// it never looks like a pure numeric suffix and can't collide with the
|
|
73
|
+
// numeric range above. The probability of retrying is (10/16)^8 ≈ 2.3% (< 2.5%).
|
|
74
|
+
let uuidSuffix;
|
|
75
|
+
do {
|
|
76
|
+
uuidSuffix = randomUUID().slice(0, 8);
|
|
77
|
+
} while (/^\d+$/.test(uuidSuffix));
|
|
78
|
+
const candidate = `${baseDir}-${uuidSuffix}`;
|
|
72
79
|
fs.mkdirSync(candidate, { recursive: true });
|
|
73
80
|
return candidate;
|
|
74
81
|
}
|