npm - euparliamentmonitor - Versions diffs - 0.9.25 → 0.9.26 - Mend

euparliamentmonitor 0.9.25 → 0.9.26

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/package.json +1 -1
package/scripts/aggregator/metadata/seo-budgets.js +30 -21
package/scripts/utils/fs/directory.js +8 -1

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "euparliamentmonitor",
-  "version": "0.9.25",
+  "version": "0.9.26",
   "type": "module",
   "description": "European Parliament Intelligence Platform - Monitor political activity with systematic transparency",
   "main": "scripts/index.js",

package/scripts/aggregator/metadata/seo-budgets.js CHANGED Viewed

@@ -59,19 +59,16 @@ export function budgetFor(lang, surface) {
 /**
  * CJK full-width clause boundaries — the breakpoints CJK readers
  * expect a snippet to end at. Listed in preferred-break order: a
- * sentence-final mark beats a comma which beats a middle-dot.
+ * sentence-final mark beats a semicolon which beats a middle-dot.
+ *
+ * **Note**: `、` (U+3001, enumeration comma) is deliberately excluded.
+ * Cutting at an enumeration comma leaves a grammatically broken list
+ * fragment (e.g. "民主问责、") which downstream `ensureTerminator`
+ * closes with `。`, producing nonsensical "民主问责、。". The enumeration
+ * comma is semantically equivalent to Latin `,` — a list separator,
+ * not a sentence boundary.
  */
-const CJK_CLAUSE_BOUNDARIES = [
-    '。',
-    '！',
-    '？',
-    '、',
-    '；',
-    '：',
-    '——',
-    '—',
-    '・',
-];
+const CJK_CLAUSE_BOUNDARIES = ['。', '！', '？', '；', '：', '——', '—', '・'];
 /**
  * RTL sentence punctuation. Arabic uses U+061F (؟) for question mark
  * and U+060C (،) for comma; full stop is the ASCII `.` (Hebrew uses
@@ -91,11 +88,15 @@ const SOFT_MIN_RATIO = 0.55;
  * `text-utils.ts::TRAILING_PUNCT` but keeps full-width CJK marks
  * intact when they sit at a natural sentence boundary.
  *
+ * Includes `、` (U+3001, CJK enumeration comma) which should never
+ * appear at the end of a truncated snippet — it signals a list
+ * continuation that never arrives.
+ *
  * @param s - Input string to trim
  * @returns Input with trailing separator-class characters removed
  */
 function trimTrailingSeparators(s) {
-    return s.replace(/[\s,;:—\-–·•…]+$/u, '');
+    return s.replace(/[\s,;:—\-–·•…、]+$/u, '');
 }
 /**
  * Pick the highest-priority clause boundary inside a candidate window.
@@ -141,20 +142,28 @@ export function clampForBudget(text, lang, surface) {
     const softMin = Math.floor(budget * SOFT_MIN_RATIO);
     // Reserve one char for the ellipsis we may append.
     const window = trimmed.slice(0, budget - 1);
-    const boundaries = family === 'cjk'
-        ? CJK_CLAUSE_BOUNDARIES
-        : family === 'rtl'
-            ? RTL_CLAUSE_BOUNDARIES
-            : HEADLINE_CLAUSE_BOUNDARIES;
+    // Korean uses Western-style punctuation (. ! ? ,) and inter-word
+    // spaces despite being classified as CJK for pixel-budget purposes.
+    // Use Latin clause boundaries and allow the whitespace fallback so we
+    // don't hard-cut mid-token (e.g. splitting "2026-04-26" → "2026-0").
+    const useLatinBoundaries = lang === 'ko';
+    const boundaries = useLatinBoundaries
+        ? HEADLINE_CLAUSE_BOUNDARIES
+        : family === 'cjk'
+            ? CJK_CLAUSE_BOUNDARIES
+            : family === 'rtl'
+                ? RTL_CLAUSE_BOUNDARIES
+                : HEADLINE_CLAUSE_BOUNDARIES;
     const clauseCut = findClauseCut(window, boundaries, softMin);
     if (clauseCut > 0) {
         const cleaned = trimTrailingSeparators(trimmed.slice(0, clauseCut));
         if (cleaned.length >= softMin)
             return cleaned;
     }
-    // Whitespace-aware fallback. CJK text often has no ASCII spaces, so
-    // skip this step for CJK and fall straight through to the hard cut.
-    if (family !== 'cjk') {
+    // Whitespace-aware fallback. Chinese and Japanese text often has no
+    // ASCII spaces, so skip this step for them and fall straight through
+    // to the hard cut. Korean is the exception — it uses inter-word spaces.
+    if (family !== 'cjk' || lang === 'ko') {
         const lastSpace = window.lastIndexOf(' ');
         if (lastSpace >= softMin) {
             const safe = trimTrailingSeparators(window.slice(0, lastSpace));

package/scripts/utils/fs/directory.js CHANGED Viewed

@@ -68,7 +68,14 @@ export function resolveUniqueAnalysisDir(baseDir) {
         }
         suffix++;
     }
-    const candidate = `${baseDir}-${randomUUID().slice(0, 8)}`;
+    // Guarantee the UUID suffix contains at least one hex letter (a-f) so that
+    // it never looks like a pure numeric suffix and can't collide with the
+    // numeric range above.  The probability of retrying is (10/16)^8 ≈ 2.3% (< 2.5%).
+    let uuidSuffix;
+    do {
+        uuidSuffix = randomUUID().slice(0, 8);
+    } while (/^\d+$/.test(uuidSuffix));
+    const candidate = `${baseDir}-${uuidSuffix}`;
     fs.mkdirSync(candidate, { recursive: true });
     return candidate;
 }