euparliamentmonitor 0.9.25 → 0.9.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "euparliamentmonitor",
3
- "version": "0.9.25",
3
+ "version": "0.9.26",
4
4
  "type": "module",
5
5
  "description": "European Parliament Intelligence Platform - Monitor political activity with systematic transparency",
6
6
  "main": "scripts/index.js",
@@ -59,19 +59,16 @@ export function budgetFor(lang, surface) {
59
59
  /**
60
60
  * CJK full-width clause boundaries — the breakpoints CJK readers
61
61
  * expect a snippet to end at. Listed in preferred-break order: a
62
- * sentence-final mark beats a comma which beats a middle-dot.
62
+ * sentence-final mark beats a semicolon which beats a middle-dot.
63
+ *
64
+ * **Note**: `、` (U+3001, enumeration comma) is deliberately excluded.
65
+ * Cutting at an enumeration comma leaves a grammatically broken list
66
+ * fragment (e.g. "民主问责、") which downstream `ensureTerminator`
67
+ * closes with `。`, producing nonsensical "民主问责、。". The enumeration
68
+ * comma is semantically equivalent to Latin `,` — a list separator,
69
+ * not a sentence boundary.
63
70
  */
64
- const CJK_CLAUSE_BOUNDARIES = [
65
- '。',
66
- '!',
67
- '?',
68
- '、',
69
- ';',
70
- ':',
71
- '——',
72
- '—',
73
- '・',
74
- ];
71
+ const CJK_CLAUSE_BOUNDARIES = ['。', '!', '?', ';', ':', '——', '—', '・'];
75
72
  /**
76
73
  * RTL sentence punctuation. Arabic uses U+061F (؟) for question mark
77
74
  * and U+060C (،) for comma; full stop is the ASCII `.` (Hebrew uses
@@ -91,11 +88,15 @@ const SOFT_MIN_RATIO = 0.55;
91
88
  * `text-utils.ts::TRAILING_PUNCT` but keeps full-width CJK marks
92
89
  * intact when they sit at a natural sentence boundary.
93
90
  *
91
+ * Includes `、` (U+3001, CJK enumeration comma) which should never
92
+ * appear at the end of a truncated snippet — it signals a list
93
+ * continuation that never arrives.
94
+ *
94
95
  * @param s - Input string to trim
95
96
  * @returns Input with trailing separator-class characters removed
96
97
  */
97
98
  function trimTrailingSeparators(s) {
98
- return s.replace(/[\s,;:—\-–·•…]+$/u, '');
99
+ return s.replace(/[\s,;:—\-–·•…、]+$/u, '');
99
100
  }
100
101
  /**
101
102
  * Pick the highest-priority clause boundary inside a candidate window.
@@ -141,20 +142,28 @@ export function clampForBudget(text, lang, surface) {
141
142
  const softMin = Math.floor(budget * SOFT_MIN_RATIO);
142
143
  // Reserve one char for the ellipsis we may append.
143
144
  const window = trimmed.slice(0, budget - 1);
144
- const boundaries = family === 'cjk'
145
- ? CJK_CLAUSE_BOUNDARIES
146
- : family === 'rtl'
147
- ? RTL_CLAUSE_BOUNDARIES
148
- : HEADLINE_CLAUSE_BOUNDARIES;
145
+ // Korean uses Western-style punctuation (. ! ? ,) and inter-word
146
+ // spaces despite being classified as CJK for pixel-budget purposes.
147
+ // Use Latin clause boundaries and allow the whitespace fallback so we
148
+ // don't hard-cut mid-token (e.g. splitting "2026-04-26" → "2026-0").
149
+ const useLatinBoundaries = lang === 'ko';
150
+ const boundaries = useLatinBoundaries
151
+ ? HEADLINE_CLAUSE_BOUNDARIES
152
+ : family === 'cjk'
153
+ ? CJK_CLAUSE_BOUNDARIES
154
+ : family === 'rtl'
155
+ ? RTL_CLAUSE_BOUNDARIES
156
+ : HEADLINE_CLAUSE_BOUNDARIES;
149
157
  const clauseCut = findClauseCut(window, boundaries, softMin);
150
158
  if (clauseCut > 0) {
151
159
  const cleaned = trimTrailingSeparators(trimmed.slice(0, clauseCut));
152
160
  if (cleaned.length >= softMin)
153
161
  return cleaned;
154
162
  }
155
- // Whitespace-aware fallback. CJK text often has no ASCII spaces, so
156
- // skip this step for CJK and fall straight through to the hard cut.
157
- if (family !== 'cjk') {
163
+ // Whitespace-aware fallback. Chinese and Japanese text often has no
164
+ // ASCII spaces, so skip this step for them and fall straight through
165
+ // to the hard cut. Korean is the exception — it uses inter-word spaces.
166
+ if (family !== 'cjk' || lang === 'ko') {
158
167
  const lastSpace = window.lastIndexOf(' ');
159
168
  if (lastSpace >= softMin) {
160
169
  const safe = trimTrailingSeparators(window.slice(0, lastSpace));
@@ -68,7 +68,14 @@ export function resolveUniqueAnalysisDir(baseDir) {
68
68
  }
69
69
  suffix++;
70
70
  }
71
- const candidate = `${baseDir}-${randomUUID().slice(0, 8)}`;
71
+ // Guarantee the UUID suffix contains at least one hex letter (a-f) so that
72
+ // it never looks like a pure numeric suffix and can't collide with the
73
+ // numeric range above. The probability of retrying is (10/16)^8 ≈ 2.3%.
74
+ let uuidSuffix;
75
+ do {
76
+ uuidSuffix = randomUUID().slice(0, 8);
77
+ } while (/^\d+$/.test(uuidSuffix));
78
+ const candidate = `${baseDir}-${uuidSuffix}`;
72
79
  fs.mkdirSync(candidate, { recursive: true });
73
80
  return candidate;
74
81
  }