euparliamentmonitor 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/package.json +6 -2
  2. package/scripts/aggregator/article-metadata.js +69 -14
  3. package/scripts/aggregator/editorial-brief-resolver.js +23 -0
  4. package/scripts/aggregator/html/headline.d.ts +41 -9
  5. package/scripts/aggregator/html/headline.js +69 -10
  6. package/scripts/aggregator/html/shell.js +73 -17
  7. package/scripts/aggregator/manifest/index.d.ts +1 -1
  8. package/scripts/aggregator/manifest/index.js +1 -1
  9. package/scripts/aggregator/manifest/resolver.d.ts +28 -1
  10. package/scripts/aggregator/manifest/resolver.js +61 -5
  11. package/scripts/aggregator/markdown-renderer.js +11 -0
  12. package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
  13. package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
  14. package/scripts/aggregator/metadata/artifact-walker.js +29 -10
  15. package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
  16. package/scripts/aggregator/metadata/brief-body.js +69 -0
  17. package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
  18. package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
  19. package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
  20. package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
  21. package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
  22. package/scripts/aggregator/metadata/heading-rules.js +78 -269
  23. package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
  24. package/scripts/aggregator/metadata/keyword-filters.js +156 -0
  25. package/scripts/aggregator/metadata/lede-extractor.js +11 -2
  26. package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
  27. package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
  28. package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
  29. package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
  30. package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
  31. package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
  32. package/scripts/aggregator/metadata/seo-budgets.js +202 -0
  33. package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
  34. package/scripts/aggregator/metadata/text-truncate.js +277 -0
  35. package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
  36. package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
  37. package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
  38. package/scripts/aggregator/metadata/text-utils.js +119 -439
  39. package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
  40. package/scripts/aggregator/metadata/title-rejection.js +179 -0
  41. package/scripts/copy-vendor.js +84 -112
  42. package/scripts/dump-article-seo.js +640 -0
  43. package/scripts/fix-mermaid-diagrams.js +931 -0
  44. package/scripts/generators/news-indexes/backfill.d.ts +6 -1
  45. package/scripts/generators/news-indexes/backfill.js +71 -4
  46. package/scripts/validate-article-seo.js +534 -0
  47. package/scripts/validate-mermaid-diagrams.js +306 -0
@@ -0,0 +1,202 @@
1
+ // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ import { HEADLINE_CLAUSE_BOUNDARIES } from './text-utils.js';
4
+ /**
5
+ * Iteration helper — all three script families in a deterministic
6
+ * order (latin → cjk → rtl). Exported so test matrices and downstream
7
+ * tooling can walk every column of {@link SEO_BUDGETS} without
8
+ * duplicating the literal list.
9
+ */
10
+ export const ALL_SCRIPT_FAMILIES = ['latin', 'cjk', 'rtl'];
11
+ /**
12
+ * Classify a locale code into a script family. Used to look up the
13
+ * correct byte cap in {@link SEO_BUDGETS}.
14
+ *
15
+ * @param lang - BCP-47 language tag (one of the 14 publishing locales)
16
+ * @returns Script family for SEO budget lookup
17
+ */
18
+ export function classifyScript(lang) {
19
+ if (lang === 'ar' || lang === 'he')
20
+ return 'rtl';
21
+ if (lang === 'ja' || lang === 'ko' || lang === 'zh')
22
+ return 'cjk';
23
+ return 'latin';
24
+ }
25
+ /**
26
+ * Per-surface × per-script byte cap table. Numbers reflect the
27
+ * narrower of Google / Bing / Facebook / Twitter documented envelopes,
28
+ * with a ~5 % safety margin so a snippet on the edge of the budget
29
+ * isn't truncated mid-glyph by the rendering platform.
30
+ *
31
+ * For `jsonLdHeadline` the Schema.org `NewsArticle.headline` cap is
32
+ * script-independent (Google validates the literal character count at
33
+ * 110) — same value across the row.
34
+ */
35
+ export const SEO_BUDGETS = {
36
+ title: { latin: 60, cjk: 30, rtl: 55 },
37
+ metaDescription: { latin: 155, cjk: 78, rtl: 150 },
38
+ ogTitle: { latin: 95, cjk: 47, rtl: 90 },
39
+ ogDescription: { latin: 200, cjk: 100, rtl: 195 },
40
+ twitterTitle: { latin: 70, cjk: 35, rtl: 70 },
41
+ twitterDescription: { latin: 200, cjk: 100, rtl: 195 },
42
+ imageAlt: { latin: 125, cjk: 60, rtl: 120 },
43
+ jsonLdHeadline: { latin: 110, cjk: 110, rtl: 110 },
44
+ };
45
+ /**
46
+ * Resolve the byte cap for one `(lang, surface)` pair.
47
+ *
48
+ * @param lang - Publishing locale
49
+ * @param surface - SEO surface (see {@link SeoSurface})
50
+ * @returns Byte cap (positive integer)
51
+ */
52
+ export function budgetFor(lang, surface) {
53
+ const family = classifyScript(lang);
54
+ return SEO_BUDGETS[surface][family];
55
+ }
56
+ // ────────────────────────────────────────────────────────────────────────
57
+ // Script-aware truncator
58
+ // ────────────────────────────────────────────────────────────────────────
59
+ /**
60
+ * CJK full-width clause boundaries — the breakpoints CJK readers
61
+ * expect a snippet to end at. Listed in preferred-break order: a
62
+ * sentence-final mark beats a comma which beats a middle-dot.
63
+ */
64
+ const CJK_CLAUSE_BOUNDARIES = [
65
+ '。',
66
+ '!',
67
+ '?',
68
+ '、',
69
+ ';',
70
+ ':',
71
+ '——',
72
+ '—',
73
+ '・',
74
+ ];
75
+ /**
76
+ * RTL sentence punctuation. Arabic uses U+061F (؟) for question mark
77
+ * and U+060C (،) for comma; full stop is the ASCII `.` (Hebrew uses
78
+ * `.` and `,` directly). Listed in preferred-break order.
79
+ */
80
+ const RTL_CLAUSE_BOUNDARIES = ['. ', '؟ ', '! ', '، ', '؛ ', ' — ', ' – '];
81
+ /**
82
+ * Soft-minimum fraction of the budget at which a clause-boundary break
83
+ * is acceptable. Below this fraction we fall through to whitespace
84
+ * truncation so we never ship a near-empty snippet just because the
85
+ * input started with a short clause.
86
+ */
87
+ const SOFT_MIN_RATIO = 0.55;
88
+ /**
89
+ * Trim trailing punctuation that would otherwise leave a snippet
90
+ * ending on a dangling separator or ellipsis. Mirrors the spirit of
91
+ * `text-utils.ts::TRAILING_PUNCT` but keeps full-width CJK marks
92
+ * intact when they sit at a natural sentence boundary.
93
+ *
94
+ * @param s - Input string to trim
95
+ * @returns Input with trailing separator-class characters removed
96
+ */
97
+ function trimTrailingSeparators(s) {
98
+ return s.replace(/[\s,;:—\-–·•…]+$/u, '');
99
+ }
100
+ /**
101
+ * Pick the highest-priority clause boundary inside a candidate window.
102
+ * Iterates the boundary vocabulary in declared (preference) order and
103
+ * returns the first index that sits past the soft minimum.
104
+ *
105
+ * @param window - Candidate cut window (`text.slice(0, budget)`)
106
+ * @param boundaries - Boundary vocabulary, in preference order
107
+ * @param softMin - Soft-minimum cut position (chars)
108
+ * @returns Cut index, or -1 when no boundary qualifies
109
+ */
110
+ function findClauseCut(window, boundaries, softMin) {
111
+ for (const boundary of boundaries) {
112
+ const idx = window.lastIndexOf(boundary);
113
+ if (idx >= softMin) {
114
+ return idx + boundary.length;
115
+ }
116
+ }
117
+ return -1;
118
+ }
119
+ /**
120
+ * Truncate `text` to fit the `(lang, surface)` SEO budget, measured with `String.length` (UTF-16 code units, not bytes). Prefers a
121
+ * natural clause boundary inside the script's punctuation vocabulary
122
+ * (CJK / RTL / Latin) before falling back to a whitespace break.
123
+ *
124
+ * Returns the trimmed `text` unchanged when it already fits (no ellipsis
125
+ * appended). A cut that lands on a clause boundary is returned without an
126
+ * ellipsis; the whitespace and hard-cut fallbacks append `…` for every
127
+ * script family, CJK included — Schema.org and Google accept that glyph
128
+ * in `headline` / `description`.
129
+ *
130
+ * @param text - Source text (already plain-text — no Markdown / HTML)
131
+ * @param lang - Publishing locale
132
+ * @param surface - Target SEO surface
133
+ * @returns Clamped text ≤ `budgetFor(lang, surface)` characters
134
+ */
135
+ export function clampForBudget(text, lang, surface) {
136
+ const trimmed = text.trim();
137
+ const budget = budgetFor(lang, surface);
138
+ if (trimmed.length <= budget)
139
+ return trimmed;
140
+ const family = classifyScript(lang);
141
+ const softMin = Math.floor(budget * SOFT_MIN_RATIO);
142
+ // Reserve one char for the ellipsis we may append.
143
+ const window = trimmed.slice(0, budget - 1);
144
+ const boundaries = family === 'cjk'
145
+ ? CJK_CLAUSE_BOUNDARIES
146
+ : family === 'rtl'
147
+ ? RTL_CLAUSE_BOUNDARIES
148
+ : HEADLINE_CLAUSE_BOUNDARIES;
149
+ const clauseCut = findClauseCut(window, boundaries, softMin);
150
+ if (clauseCut > 0) {
151
+ const cleaned = trimTrailingSeparators(trimmed.slice(0, clauseCut));
152
+ if (cleaned.length >= softMin)
153
+ return cleaned;
154
+ }
155
+ // Whitespace-aware fallback. CJK text often has no ASCII spaces, so
156
+ // skip this step for CJK and fall straight through to the hard cut.
157
+ if (family !== 'cjk') {
158
+ const lastSpace = window.lastIndexOf(' ');
159
+ if (lastSpace >= softMin) {
160
+ const safe = trimTrailingSeparators(window.slice(0, lastSpace));
161
+ return `${safe}…`;
162
+ }
163
+ }
164
+ const hardCut = trimTrailingSeparators(window);
165
+ return `${hardCut}…`;
166
+ }
167
+ /**
168
+ * Compose `{title}{separator}{siteTitle}` while honouring the
169
+ * `(lang, surface)` budget. Drops the brand suffix entirely when the
170
+ * article title alone is already at or past the budget. Prefers the
171
+ * short site title when supplied and the full suffix doesn't fit.
172
+ *
173
+ * @param title - Article title (plain text)
174
+ * @param lang - Publishing locale
175
+ * @param surface - Target SEO surface (`title` / `ogTitle` / `twitterTitle`)
176
+ * @param opts - Optional brand suffix wiring
177
+ * @returns Composed title ≤ budget
178
+ */
179
+ export function clampTitleForSurface(title, lang, surface, opts = {}) {
180
+ const budget = budgetFor(lang, surface);
181
+ const cleanTitle = title.trim();
182
+ const sep = opts.separator ?? '';
183
+ const full = opts.siteTitle ?? '';
184
+ const short = opts.shortSiteTitle ?? '';
185
+ // No brand suffix wiring — just clamp the title in isolation.
186
+ if (!full)
187
+ return clampForBudget(cleanTitle, lang, surface);
188
+ const fullSuffix = `${sep}${full}`;
189
+ const shortSuffix = short ? `${sep}${short}` : '';
190
+ // Best case: title + full suffix fits.
191
+ if (cleanTitle.length + fullSuffix.length <= budget) {
192
+ return `${cleanTitle}${fullSuffix}`;
193
+ }
194
+ // Second best: title + short suffix fits.
195
+ if (shortSuffix && cleanTitle.length + shortSuffix.length <= budget) {
196
+ return `${cleanTitle}${shortSuffix}`;
197
+ }
198
+ // Third: keep the title (clamped), drop the brand. Better SERP than
199
+ // a truncated headline followed by a clipped brand suffix.
200
+ return clampForBudget(cleanTitle, lang, surface);
201
+ }
202
+ //# sourceMappingURL=seo-budgets.js.map
@@ -0,0 +1,75 @@
1
+ /**
2
+ * Remove any trailing whitespace, stop-words (the/a/an/of/…) and
3
+ * trailing punctuation (including any pre-existing ellipsis). Implemented
4
+ * imperatively to avoid super-linear regex backtracking on the
5
+ * `(?:\s+stop-word)+$` pattern flagged by `security/detect-unsafe-regex`.
6
+ *
7
+ * @param input - Pre-clipped string to clean up
8
+ * @returns Cleaned string with no trailing stop-words or punctuation
9
+ */
10
+ export declare function stripTrailingStopWordsAndPunctuation(input: string): string;
11
+ /**
12
+ * Clamp a string to `DESCRIPTION_MAX_LENGTH` characters. No ellipsis is
13
+ * appended here: the cut ends on the last sentence terminator when one
14
+ * sits far enough into the text, otherwise on a word boundary (when a
15
+ * space falls in the final 60 characters) with dangling stop-words and
16
+ * trailing punctuation stripped.
17
+ * @param text - Raw description text
18
+ * @returns `text` unchanged when it fits, otherwise the clipped prefix
19
+ */
20
+ export declare function truncateDescription(text: string): string;
21
+ /**
22
+ * Clamp an extended description to {@link EXTENDED_DESCRIPTION_MAX_LENGTH}
23
+ * characters using the same sentence-boundary-preserving logic as
24
+ * {@link truncateDescription}. Returns `''` when the input is empty
25
+ * or shorter than the meta-description maximum (no point in emitting
26
+ * an "extended" description that's actually shorter than the regular
27
+ * one).
28
+ *
29
+ * @param text - Raw extended-description text (e.g. full BLUF paragraph)
30
+ * @returns Truncated extended description, or `''` when not worth emitting
31
+ */
32
+ export declare function truncateExtendedDescription(text: string): string;
33
+ /**
34
+ * Clamp a title to `TITLE_MAX_LENGTH` characters in the same
35
+ * word-boundary-preserving fashion as {@link truncateDescription}.
36
+ *
37
+ * **No mid-sentence ellipsis.** When the title overruns the budget and
38
+ * no natural clause boundary exists inside the
39
+ * `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]` window, this function returns
40
+ * an empty string instead of a mid-sentence `…` truncation. The empty
41
+ * return tells the caller to fall through to the next tier of the
42
+ * resolver ladder (template-fallback title with category + date),
43
+ * producing a complete, scan-friendly title rather than a clipped
44
+ * editorial fragment. Live-site regression (2026-05): titles such as
45
+ * `AI Trade Strategy: A Legislative First with Structural…` and
46
+ * `The European Parliament's 24 standing committees continued…`
47
+ * were emitted before this guard.
48
+ *
49
+ * @param text - Raw title text
50
+ * @returns Clause-truncated title (no ellipsis), or `''` when no
51
+ * editorial clause boundary exists in the window
52
+ */
53
+ export declare function truncateTitle(text: string): string;
54
+ /**
55
+ * Return the first complete sentence from a prose paragraph, suitable
56
+ * for use as a fallback editorial title when the artefact H1 is
57
+ * categorical (e.g. `# EU Parliament Committee Reports`) and the
58
+ * resolver must derive `<title>` from the BLUF / lede summary instead.
59
+ *
60
+ * A "sentence" is the prefix up to the first sentence-terminator
61
+ * (`. `, `! `, `? `, `; `) inside the `[HEADLINE_SOFT_MIN,
62
+ * TITLE_MAX_LENGTH * 1.5]` window. Common abbreviations (`Q1.`, `Q2.`,
63
+ * `H1.`, `H2.`, `Mr.`, `Mrs.`, `e.g.`, `i.e.`, `vs.`) are skipped
64
+ * so they don't terminate the sentence prematurely. When no
65
+ * acceptable terminator exists in the window, returns `''` so the
66
+ * resolver falls through to the next tier instead of feeding an
67
+ * over-budget paragraph into {@link truncateTitle} (which would also
68
+ * return `''`).
69
+ *
70
+ * @param paragraph - Prose paragraph (post-`stripInlineMarkdown`)
71
+ * @returns First sentence, or `''` when none can be identified within
72
+ * the soft-min window
73
+ */
74
+ export declare function extractFirstSentence(paragraph: string): string;
75
+ //# sourceMappingURL=text-truncate.d.ts.map
@@ -0,0 +1,277 @@
1
+ // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ /**
4
+ * @module Aggregator/Metadata/TextTruncate
5
+ * @description Byte-budget truncators and sentence-extraction helpers
6
+ * extracted from `text-utils.ts` to keep both modules under the 600-line
7
+ * drift-guard budget enforced by `test/unit/source-file-size.test.js`.
8
+ *
9
+ * This file is the **clamping layer** of the metadata text pipeline —
10
+ * after `shouldSkipDescriptionLine`/`stripInlineMarkdown` produce a
11
+ * candidate description / title, the helpers here apply the SEO-budget
12
+ * shape rules:
13
+ *
14
+ * - {@link truncateDescription} — clamp to `DESCRIPTION_MAX_LENGTH` on a
15
+ * sentence/word boundary, appending `…` when truncation occurs.
16
+ * - {@link truncateExtendedDescription} — clamp to the longer
17
+ * `EXTENDED_DESCRIPTION_MAX_LENGTH` (used by `og:description`).
18
+ * - {@link truncateTitle} — clamp to `TITLE_MAX_LENGTH` on a
19
+ * **clause** boundary, returning `''` rather than emitting a
20
+ * mid-sentence ellipsised title.
21
+ * - {@link extractFirstSentence} — return the first complete sentence
22
+ * from a prose paragraph, or `''` when no clean terminator is
23
+ * available within the soft-min window.
24
+ *
25
+ * Bounded-context rules match `text-utils.ts`:
26
+ * - **No upward imports** — pure helpers, no I/O, no globals.
27
+ * - **Deterministic** — same input always produces same output.
28
+ * - **Locale-agnostic** — operates on raw prose in any of the 14
29
+ * publishing languages.
30
+ */
31
+ import { ABBREVIATION_PREFIXES, DESCRIPTION_MAX_LENGTH, DESCRIPTION_MIN_LENGTH, EXTENDED_DESCRIPTION_MAX_LENGTH, EXTENDED_DESCRIPTION_MIN_LENGTH, HEADLINE_CLAUSE_BOUNDARIES, HEADLINE_HARD_MIN, HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH, TRAILING_PUNCT, TRAILING_STOP_WORDS, } from './text-utils-constants.js';
32
+ /**
33
+ * Remove any trailing whitespace, stop-words (the/a/an/of/…) and
34
+ * trailing punctuation (including any pre-existing ellipsis). Implemented
35
+ * imperatively to avoid super-linear regex backtracking on the
36
+ * `(?:\s+stop-word)+$` pattern flagged by `security/detect-unsafe-regex`.
37
+ *
38
+ * @param input - Pre-clipped string to clean up
39
+ * @returns Cleaned string with no trailing stop-words or punctuation
40
+ */
41
+ export function stripTrailingStopWordsAndPunctuation(input) {
42
+ let result = input;
43
+ let changed = true;
44
+ while (changed) {
45
+ changed = false;
46
+ while (result.length > 0 && TRAILING_PUNCT.test(result.charAt(result.length - 1))) {
47
+ result = result.slice(0, -1);
48
+ changed = true;
49
+ }
50
+ const lastSpace = result.lastIndexOf(' ');
51
+ if (lastSpace >= 0) {
52
+ const tail = result.slice(lastSpace + 1).toLowerCase();
53
+ if (TRAILING_STOP_WORDS.has(tail)) {
54
+ result = result.slice(0, lastSpace);
55
+ changed = true;
56
+ }
57
+ }
58
+ }
59
+ return result;
60
+ }
61
+ /**
62
+ * Clamp a string to `DESCRIPTION_MAX_LENGTH` characters. No ellipsis is
63
+ * appended here: the cut ends on the last sentence terminator when one
64
+ * sits far enough into the text, otherwise on a word boundary (when a
65
+ * space falls in the final 60 characters) with dangling stop-words and
66
+ * trailing punctuation stripped.
67
+ * @param text - Raw description text
68
+ * @returns `text` unchanged when it fits, otherwise the clipped prefix
69
+ */
70
+ export function truncateDescription(text) {
71
+ if (text.length <= DESCRIPTION_MAX_LENGTH)
72
+ return text;
73
+ const cut = text.slice(0, DESCRIPTION_MAX_LENGTH);
74
+ // Prefer the last full sentence terminator within the cut so we don't
75
+ // end on a dangling determiner ("…year. The"). Period/!/? followed by
76
+ // a space marks a clean boundary. Only honour the boundary when it
77
+ // sits past the soft minimum so we keep enough body text to be useful.
78
+ const sentenceEnd = Math.max(cut.lastIndexOf('. '), cut.lastIndexOf('! '), cut.lastIndexOf('? '));
79
+ if (sentenceEnd >= DESCRIPTION_MIN_LENGTH) {
80
+ return cut.slice(0, sentenceEnd + 1).replace(/\s+$/, '');
81
+ }
82
+ const earlySentenceEnd = Math.max(cut.lastIndexOf('. '), cut.lastIndexOf('! '), cut.lastIndexOf('? '));
83
+ if (earlySentenceEnd >= Math.floor(DESCRIPTION_MIN_LENGTH / 3)) {
84
+ return cut.slice(0, earlySentenceEnd + 1).replace(/\s+$/, '');
85
+ }
86
+ const lastSpace = cut.lastIndexOf(' ');
87
+ let safe = lastSpace > DESCRIPTION_MAX_LENGTH - 60 ? cut.slice(0, lastSpace) : cut;
88
+ // Drop dangling stop-words and trailing punctuation/ellipsis so we
89
+ // never emit broken copy ("…year. The" → "…year.") or double-ellipsis
90
+ // ("The……") when the upstream input already carried an ellipsis.
91
+ safe = stripTrailingStopWordsAndPunctuation(safe);
92
+ return safe;
93
+ }
94
+ /**
95
+ * Clamp an extended description to {@link EXTENDED_DESCRIPTION_MAX_LENGTH}
96
+ * characters using the same sentence-boundary-preserving logic as
97
+ * {@link truncateDescription}. Returns `''` when the input is empty
98
+ * or shorter than the meta-description maximum (no point in emitting
99
+ * an "extended" description that's actually shorter than the regular
100
+ * one).
101
+ *
102
+ * @param text - Raw extended-description text (e.g. full BLUF paragraph)
103
+ * @returns Truncated extended description, or `''` when not worth emitting
104
+ */
105
+ export function truncateExtendedDescription(text) {
106
+ const trimmed = text.trim();
107
+ if (!trimmed)
108
+ return '';
109
+ // Don't emit an extended description that is shorter than the
110
+ // short meta-description budget — there's no SEO win and it would
111
+ // make `og:description` shorter than `<meta description>`.
112
+ if (trimmed.length <= DESCRIPTION_MAX_LENGTH)
113
+ return '';
114
+ if (trimmed.length <= EXTENDED_DESCRIPTION_MAX_LENGTH)
115
+ return trimmed;
116
+ const cut = trimmed.slice(0, EXTENDED_DESCRIPTION_MAX_LENGTH);
117
+ const sentenceEnd = Math.max(cut.lastIndexOf('. '), cut.lastIndexOf('! '), cut.lastIndexOf('? '));
118
+ if (sentenceEnd >= EXTENDED_DESCRIPTION_MIN_LENGTH) {
119
+ return cut.slice(0, sentenceEnd + 1).replace(/\s+$/, '');
120
+ }
121
+ const earlySentenceEnd = Math.max(cut.lastIndexOf('. '), cut.lastIndexOf('! '), cut.lastIndexOf('? '));
122
+ if (earlySentenceEnd >= Math.floor(EXTENDED_DESCRIPTION_MIN_LENGTH / 2)) {
123
+ return cut.slice(0, earlySentenceEnd + 1).replace(/\s+$/, '');
124
+ }
125
+ const lastSpace = cut.lastIndexOf(' ');
126
+ let safe = lastSpace > EXTENDED_DESCRIPTION_MAX_LENGTH - 60 ? cut.slice(0, lastSpace) : cut;
127
+ safe = stripTrailingStopWordsAndPunctuation(safe);
128
+ return safe;
129
+ }
130
+ /**
131
+ * Clamp a title to `TITLE_MAX_LENGTH` characters in the same
132
+ * word-boundary-preserving fashion as {@link truncateDescription}.
133
+ *
134
+ * **No mid-sentence ellipsis.** When the title overruns the budget and
135
+ * no natural clause boundary exists inside the
136
+ * `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]` window, this function returns
137
+ * an empty string instead of a mid-sentence `…` truncation. The empty
138
+ * return tells the caller to fall through to the next tier of the
139
+ * resolver ladder (template-fallback title with category + date),
140
+ * producing a complete, scan-friendly title rather than a clipped
141
+ * editorial fragment. Live-site regression (2026-05): titles such as
142
+ * `AI Trade Strategy: A Legislative First with Structural…` and
143
+ * `The European Parliament's 24 standing committees continued…`
144
+ * were emitted before this guard.
145
+ *
146
+ * @param text - Raw title text
147
+ * @returns Clause-truncated title (no ellipsis), or `''` when no
148
+ * editorial clause boundary exists in the window
149
+ */
150
+ export function truncateTitle(text) {
151
+ if (text.length <= TITLE_MAX_LENGTH)
152
+ return text;
153
+ // Prefer ending at a natural clause boundary inside the
154
+ // `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]` window so the truncated
155
+ // title reads as a complete journalistic clause rather than a
156
+ // mid-sentence prose snippet. Iterate boundaries in priority order;
157
+ // when a candidate falls in the window, break there and drop the
158
+ // ellipsis since the result is grammatically complete.
159
+ const search = text.slice(0, TITLE_MAX_LENGTH);
160
+ for (const boundary of HEADLINE_CLAUSE_BOUNDARIES) {
161
+ const idx = search.lastIndexOf(boundary);
162
+ if (idx >= HEADLINE_SOFT_MIN) {
163
+ const clean = stripTrailingStopWordsAndPunctuation(text.slice(0, idx));
164
+ if (clean.length >= HEADLINE_SOFT_MIN)
165
+ return clean;
166
+ }
167
+ }
168
+ // Second-tier fallback: when nothing landed in the soft window, look
169
+ // for the strongest boundary (`: ` or ` — `) inside the harder
170
+ // `[HEADLINE_HARD_MIN, HEADLINE_SOFT_MIN]` floor. This rescues
171
+ // Reader-Briefing-style ledes like
172
+ // `Immediate priority: DMA enforcement — …` whose clauses cluster in
173
+ // the opening 30-60 chars, while still keeping the soft-min guard
174
+ // active for runaway prose. We restrict the boundary set to `: `,
175
+ // ` — ` and ` – ` (the strongest semantic breaks) to avoid emitting trivial
176
+ // comma-split or full-stop-split fragments from short prose.
177
+ const STRONG_BOUNDARIES = [': ', ' — ', ' – '];
178
+ for (const boundary of STRONG_BOUNDARIES) {
179
+ const idx = search.indexOf(boundary);
180
+ if (idx >= HEADLINE_HARD_MIN && idx < HEADLINE_SOFT_MIN) {
181
+ const clean = stripTrailingStopWordsAndPunctuation(text.slice(0, idx));
182
+ if (clean.length >= HEADLINE_HARD_MIN)
183
+ return clean;
184
+ }
185
+ }
186
+ // No clause boundary in either window — refuse to emit a mid-sentence
187
+ // truncation. Caller falls through to template-fallback composition.
188
+ return '';
189
+ }
190
+ // ────────────────────────────────────────────────────────────────────────
191
+ // Sentence extraction
192
+ // ────────────────────────────────────────────────────────────────────────
193
+ /**
194
+ * Return the first complete sentence from a prose paragraph, suitable
195
+ * for use as a fallback editorial title when the artefact H1 is
196
+ * categorical (e.g. `# EU Parliament Committee Reports`) and the
197
+ * resolver must derive `<title>` from the BLUF / lede summary instead.
198
+ *
199
+ * A "sentence" is the prefix up to the first sentence-terminator
200
+ * (`. `, `! `, `? `, `; `) inside the `[HEADLINE_SOFT_MIN,
201
+ * TITLE_MAX_LENGTH * 1.5]` window. Common abbreviations (`Q1.`, `Q2.`,
202
+ * `H1.`, `H2.`, `Mr.`, `Mrs.`, `e.g.`, `i.e.`, `vs.`) are skipped
203
+ * so they don't terminate the sentence prematurely. When no
204
+ * acceptable terminator exists in the window, returns `''` so the
205
+ * resolver falls through to the next tier instead of feeding an
206
+ * over-budget paragraph into {@link truncateTitle} (which would also
207
+ * return `''`).
208
+ *
209
+ * @param paragraph - Prose paragraph (post-`stripInlineMarkdown`)
210
+ * @returns First sentence, or `''` when none can be identified within
211
+ * the soft-min window
212
+ */
213
+ export function extractFirstSentence(paragraph) {
214
+ const trimmed = paragraph.trim();
215
+ if (trimmed.length <= HEADLINE_SOFT_MIN)
216
+ return trimmed;
217
+ // Limit terminator search to TITLE_MAX_LENGTH * 1.5 — beyond that
218
+ // we'd rather let truncateTitle clause-truncate the original
219
+ // paragraph than return a too-long first sentence.
220
+ const window = trimmed.slice(0, Math.floor(TITLE_MAX_LENGTH * 1.5));
221
+ // Skip common abbreviations that contain a period inside a token
222
+ // (Q1., e.g., i.e., vs., Mr., Mrs., No., U.S., E.U.). We walk
223
+ // candidate terminator positions; a position counts only when the
224
+ // char before it is *not* part of a known abbreviation token.
225
+ const terminators = ['. ', '! ', '? ', '; '];
226
+ let bestIdx = -1;
227
+ for (const t of terminators) {
228
+ let from = HEADLINE_SOFT_MIN;
229
+ let idx;
230
+ while ((idx = window.indexOf(t, from)) !== -1) {
231
+ if (!isAbbreviationBoundary(window, idx) && idx < window.length - 1) {
232
+ if (bestIdx === -1 || idx < bestIdx)
233
+ bestIdx = idx;
234
+ break;
235
+ }
236
+ from = idx + t.length;
237
+ }
238
+ }
239
+ if (bestIdx >= HEADLINE_SOFT_MIN) {
240
+ return trimmed.slice(0, bestIdx + 1).trim();
241
+ }
242
+ // No sentence terminator inside the window — return `''` so the
243
+ // resolver falls through to the next tier instead of feeding a full
244
+ // paragraph into {@link truncateTitle} (which would now return `''`
245
+ // anyway). Being explicit here keeps the tier-1/2 split obvious.
246
+ return '';
247
+ }
248
+ /**
249
+ * Check whether the character preceding the `.` at `idx` in `text`
250
+ * indicates an abbreviation (so the `.` is not a sentence terminator).
251
+ * Matches the {@link ABBREVIATION_PREFIXES} table and the all-caps
252
+ * single-letter initials pattern (`U.S.`, `E.U.`).
253
+ *
254
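+ * @param text - Source text in its original mixed case (the preceding token is lowercased internally for the table lookup)
255
+ * @param idx - Index of the candidate terminator character in `text` (the `.` in the abbreviation cases)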
256
+ * @returns `true` when the period at `idx` is part of an abbreviation
257
+ */
258
+ function isAbbreviationBoundary(text, idx) {
259
+ // All-caps single-letter initial like `U.S.` or `E.U.` — char at
260
+ // idx-1 is a capital letter, and idx-2 is either start of string,
261
+ // whitespace, or another single-letter+period pair.
262
+ if (idx >= 1) {
263
+ const prev = text.charCodeAt(idx - 1);
264
+ const isUpperLetter = prev >= 65 && prev <= 90;
265
+ if (isUpperLetter && (idx === 1 || text[idx - 2] === ' ' || text[idx - 2] === '.')) {
266
+ return true;
267
+ }
268
+ }
269
+ // ABBREVIATION_PREFIXES lookup — scan backwards from `.` to find the
270
+ // start of the word, then compare lowercased.
271
+ let start = idx;
272
+ while (start > 0 && /[a-zA-Z]/u.test(text[start - 1] ?? ''))
273
+ start--;
274
+ const token = text.slice(start, idx + 1).toLowerCase();
275
+ return ABBREVIATION_PREFIXES.includes(token);
276
+ }
277
+ //# sourceMappingURL=text-truncate.js.map
@@ -0,0 +1,96 @@
1
+ /**
2
+ * @module Aggregator/Metadata/TextUtilsConstants
3
+ * @description Shared byte-budget constants and vocabularies used by
4
+ * the metadata text helpers. Extracted from `text-utils.ts` so the
5
+ * truncation/extraction helpers can live in `text-truncate.ts`
6
+ * without creating a circular import — both modules import from
7
+ * here, and `text-utils.ts` re-exports the truncators for back-compat
8
+ * with existing call-sites.
9
+ *
10
+ * **No imports.** This is a pure leaf module: only constants and
11
+ * vocabularies, no functions, no I/O.
12
+ */
13
+ /** Maximum `<meta description>` length we will emit. */
14
+ export declare const DESCRIPTION_MAX_LENGTH = 180;
15
+ /**
16
+ * Maximum `og:description` / `twitter:description` length we will
17
+ * emit. Facebook truncates at ~300 characters in the preview card;
18
+ * Twitter at ~200. We aim for the longer cap so LinkedIn / Slack
19
+ * (which use the full OG payload) get the full BLUF context, then
20
+ * let Twitter clip naturally. Below this length the extended
21
+ * description is emitted verbatim; above it we sentence-boundary
22
+ * truncate.
23
+ */
24
+ export declare const EXTENDED_DESCRIPTION_MAX_LENGTH = 300;
25
+ /** Target minimum extended-description length before we even emit it. */
26
+ export declare const EXTENDED_DESCRIPTION_MIN_LENGTH = 200;
27
+ /** Target minimum `<meta description>` length before we append context. */
28
+ export declare const DESCRIPTION_MIN_LENGTH = 140;
29
+ /**
30
+ * Length below which a raw description is considered too short to stand
31
+ * on its own and gets enriched with date/context. Independent from
32
+ * {@link DESCRIPTION_MIN_LENGTH} (which controls sentence-boundary
33
+ * truncation behaviour). Set lower than DESCRIPTION_MIN_LENGTH so a
34
+ * clean 100-140 char prose lede is preserved verbatim instead of being
35
+ * padded with date/context boilerplate.
36
+ */
37
+ export declare const ENRICHMENT_TRIGGER_LENGTH = 100;
38
+ /** Maximum `<title>` length — anything longer is clause-truncated by `truncateTitle` (no ellipsis) or rejected with `''`. */
39
+ export declare const TITLE_MAX_LENGTH = 140;
40
+ /**
41
+ * Soft target for headline-style titles produced as a fallback from
42
+ * BLUF/lede prose. When the candidate exceeds `TITLE_MAX_LENGTH`, the
43
+ * truncator first looks for a natural clause boundary
44
+ * (`.`, `:`, `—`, `;`) inside the `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]`
45
+ * window and breaks there instead of mid-clause-with-ellipsis. This
46
+ * turns a 137-character truncated prose paragraph into a complete
47
+ * journalistic clause, which scans much better in news cards and SERP
48
+ * snippets without sacrificing the keyword-rich opening.
49
+ */
50
+ export declare const HEADLINE_SOFT_MIN = 60;
51
+ /**
52
+ * Lower floor for clause-boundary acceptance when the soft-min window
53
+ * returns nothing. Used by {@link truncateTitle} as a second-tier
54
+ * fallback: when a long prose paragraph has its only natural clause
55
+ * boundaries (`: `, ` — `) clustered in the opening 30-60 characters
56
+ * (typical of Reader-Briefing-style ledes like `Immediate priority:
57
+ * DMA enforcement — …`), accept the strongest such boundary rather
58
+ * than fall through to template-fallback composition. This keeps
59
+ * scan-friendly editorial fragments intact while still rejecting
60
+ * fragments shorter than a typical news-card title.
61
+ */
62
+ export declare const HEADLINE_HARD_MIN = 30;
63
+ /**
64
+ * Punctuation marks that signal a natural clause boundary inside a
65
+ * BLUF / lede paragraph. Listed in preferred-break order: a colon or
66
+ * em-dash that introduces a list of consequences is the best break,
67
+ * full stops are next, and semicolons last. Single ASCII space is
68
+ * always a fallback boundary handled separately.
69
+ */
70
+ export declare const HEADLINE_CLAUSE_BOUNDARIES: readonly string[];
71
+ /**
72
+ * Emoji-banner prefixes that Stage-B agents use to decorate metadata rows
73
+ * (e.g. `📋 Analysis Owner:`). Any line starting with one of these is
74
+ * metadata, never prose.
75
+ */
76
+ export declare const EMOJI_BANNER_CHARS: string[];
77
+ /**
78
+ * Label prefixes that a prose description must never start with. Every
79
+ * entry matches case-insensitively at the start of a trimmed line, followed
80
+ * by optional space and a colon.
81
+ */
82
+ export declare const METADATA_LINE_PREFIXES: readonly string[];
83
+ /** Connector / determiner words that read as broken copy when they are
84
+ * the final token before a truncation ellipsis. */
85
+ export declare const TRAILING_STOP_WORDS: Set<string>;
86
+ /** Trailing characters we always strip before appending our own ellipsis,
87
+ * so we never emit double-ellipsis or stray punctuation. */
88
+ export declare const TRAILING_PUNCT: RegExp;
89
+ /**
90
+ * Abbreviation tokens (lowercase, including the trailing period) that
91
+ * should NOT count as sentence terminators when `extractFirstSentence`
92
+ * scans for a `.` boundary. Single-letter all-caps initials
93
+ * (`U.S.`, `E.U.`) are handled by the all-caps-initial check.
94
+ */
95
+ export declare const ABBREVIATION_PREFIXES: readonly string[];
96
+ //# sourceMappingURL=text-utils-constants.d.ts.map