euparliamentmonitor 0.9.12 → 0.9.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,48 +9,82 @@
9
9
  * published article carry a unique, content-reflective headline and
10
10
  * description in every language variant.
11
11
  *
12
- * Priority ladder (per language, highest wins):
12
+ * Priority ladder (per language, highest wins) — matches the editorial
13
+ * contract documented in
14
+ * [`.github/prompts/04-article-generation.md`](../../.github/prompts/04-article-generation.md) § 6.2:
13
15
  *
14
16
  * 1. **Manifest override** — `manifest.title` / `manifest.description` on
15
17
  * the analysis-run manifest, either as a plain string (applied to every
16
18
  * language) or a `LanguageMap<string>` object for explicit per-language
17
- * values. Authored by Stage-B agents when they have an editorial
18
- * headline for the day.
19
- * 2. **Artefact editorial H1** — first `# …` heading from the first
19
+ * values.
20
+ * 2. **Localized executive brief** — for non-English `<lang>`, the
21
+ * translated sibling `executive-brief_<lang>.md` (or
22
+ * `extended/executive-brief_<lang>.md`) under the run directory.
23
+ * Resolved via `editorial-brief-resolver.ts`. This is the authoritative
24
+ * localized source produced by the `news-translate` workflow.
25
+ * 3. **English executive brief, verbatim** — the English brief
26
+ * (`executive-brief.md` / `extended/executive-brief.md`) used as a
27
+ * fall-through when a locale has no translated brief yet. Recorded in
28
+ * `metadataFallback[<lang>] = "en"` so editors can audit which locales
29
+ * fell through.
30
+ * 4. **Artefact editorial H1** — first `# …` heading from the first
20
31
  * substantive artefact under the run directory (e.g.
21
32
  * `intelligence/synthesis-summary.md`, `breaking-news-analysis.md`).
22
33
  * Accepted only when the heading is not a generic
23
34
  * `${humanize(articleType)} — ${date}` form.
24
- * 3. **Aggregated-markdown H1** — the first `# …` heading in the aggregator
25
- * output, accepted under the same non-generic rule. In practice this
26
- * tier rarely fires because the aggregator itself writes the generic
27
- * default, but it covers hand-edited or historic aggregates.
28
- * 4. **First strong prose paragraph** — the first line of the aggregated
29
- * Markdown that survives {@link shouldSkipDescriptionLine}. Used for
30
- * `description`; also used for `title` as a last editorial-content
31
- * resort when every heading-level source is generic.
32
- * 5. **Localized template** the per-article-type `*_TITLES` generator
33
- * from `src/constants/language-articles.ts`. Always parameterised by
34
- * date (or derived values), so the title changes from run to run even
35
- * when this last tier fires but still the "boring repeated" option.
36
- *
37
- * Artifact-derived highlights (tiers 2–4) are used as page-specific
38
- * context across all 14 variants: English can use them directly, while
39
- * non-English variants keep the localized article-type template and append
40
- * the editorial topic/summary. This prevents duplicate metadata across
41
- * same-type pages while keeping the surrounding snippet language-specific
42
- * until full per-language body translations are present.
35
+ * 5. **Aggregated-markdown H1** — the first `# …` heading in the aggregator
36
+ * output, accepted under the same non-generic rule.
37
+ * 6. **First strong prose paragraph** — the first line of the aggregated
38
+ * Markdown that survives {@link shouldSkipDescriptionLine}.
39
+ * 7. **Localized template** — the per-article-type `*_TITLES` generator
40
+ * from `src/constants/language-articles.ts`. Last resort.
41
+ *
42
+ * Tiers 2–6 produce the same shape ({headline, summary}); the resolver
43
+ * picks the highest-available tier per language. When a localized brief
44
+ * (tier 2) is present, the headline replaces the localized template
45
+ * verbatim — no concatenation. Locales without a translated brief inherit
46
+ * the English brief content (tier 3) so SEO surfaces never fall back to
47
+ * boring type-level templates while real editorial content exists.
43
48
  */
44
49
  import fs from 'fs';
45
50
  import path from 'path';
46
51
  import { ALL_LANGUAGES, getLocalizedString } from '../constants/language-core.js';
47
52
  import { BREAKING_NEWS_TITLES, COMMITTEE_REPORTS_TITLES, ELECTION_CYCLE_TITLES, LOCALIZED_KEYWORDS, MONTH_AHEAD_TITLES, MONTHLY_REVIEW_TITLES, MOTIONS_TITLES, PROPOSITIONS_TITLES, QUARTER_AHEAD_TITLES, QUARTER_IN_REVIEW_TITLES, TERM_OUTLOOK_TITLES, WEEK_AHEAD_TITLES, WEEKLY_REVIEW_TITLES, YEAR_AHEAD_TITLES, YEAR_IN_REVIEW_TITLES, } from '../constants/language-articles.js';
53
+ import { resolveLocalizedBriefHighlight } from './editorial-brief-resolver.js';
48
54
  /** Maximum `<meta description>` length we will emit. */
49
55
  const DESCRIPTION_MAX_LENGTH = 180;
50
56
  /** Target minimum `<meta description>` length before we append context. */
51
57
  const DESCRIPTION_MIN_LENGTH = 140;
58
+ /**
59
+ * Length below which a raw description is considered too short to stand
60
+ * on its own and gets enriched with date/context. Independent from
61
+ * {@link DESCRIPTION_MIN_LENGTH} (which controls sentence-boundary
62
+ * truncation behaviour). Set lower than DESCRIPTION_MIN_LENGTH so a
63
+ * clean 100-140 char prose lede is preserved verbatim instead of being
64
+ * padded with date/context boilerplate.
65
+ */
66
+ const ENRICHMENT_TRIGGER_LENGTH = 100;
52
67
  /** Maximum `<title>` length — anything longer is truncated with an ellipsis. */
53
68
  const TITLE_MAX_LENGTH = 140;
69
+ /**
70
+ * Soft target for headline-style titles produced as a fallback from
71
+ * BLUF/lede prose. When the candidate exceeds `TITLE_MAX_LENGTH`, the
72
+ * truncator first looks for a natural clause boundary
73
+ * (`.`, `:`, `—`, `;`) inside the `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]`
74
+ * window and breaks there instead of mid-clause-with-ellipsis. This
75
+ * turns a 137-character truncated prose paragraph into a complete
76
+ * journalistic clause, which scans much better in news cards and SERP
77
+ * snippets without sacrificing the keyword-rich opening.
78
+ */
79
+ const HEADLINE_SOFT_MIN = 60;
80
+ /**
81
+ * Punctuation marks that signal a natural clause boundary inside a
82
+ * BLUF / lede paragraph. Listed in preferred-break order: a colon or
83
+ * em-dash that introduces a list of consequences is the best break,
84
+ * full stops are next, and semicolons last. Single ASCII space is
85
+ * always a fallback boundary handled separately.
86
+ */
87
+ const HEADLINE_CLAUSE_BOUNDARIES = [': ', ' — ', ' – ', '. ', '; '];
54
88
  /** Localized labels used to enrich short or duplicate-prone meta descriptions. */
55
89
  const SEO_CONTEXT_LABELS = {
56
90
  en: {
@@ -236,6 +270,8 @@ const ARTIFACT_CATEGORY_PREFIXES = [
236
270
  'economic context',
237
271
  'executive brief',
238
272
  'executive briefing',
273
+ 'executive intelligence brief',
274
+ 'executive intelligence briefing',
239
275
  'executive summary',
240
276
  'forward indicators',
241
277
  'historical baseline',
@@ -393,8 +429,37 @@ export function shouldSkipDescriptionLine(line) {
393
429
  }
394
430
  if (/^[-*_=~.]{3,}$/.test(line))
395
431
  return true;
432
+ if (isLocalizedBannerRow(line))
433
+ return true;
396
434
  return false;
397
435
  }
436
+ /**
437
+ * Language-agnostic banner-row detector. Stage-B artefacts open with a
438
+ * metadata banner of the shape
439
+ * `**Date:** 2026-05-15 | **Type:** Breaking | **Run:** breaking-run-001`
440
+ * and its localized siblings — notably Japanese / Chinese / Korean briefs
441
+ * which place the full-width colon `:` **inside** the bold span
442
+ * (`**日付:**`) rather than after it. The `METADATA_LINE_PREFIXES` table
443
+ * only covers the English vocabulary; this helper catches the structural
444
+ * shape directly: a line that starts with `**`, contains at least one
445
+ * `|` separator, and carries two-or-more bold key markers that end with
446
+ * — or are followed by — an ASCII colon `:` or full-width colon `:`.
447
+ * Banner rows look identical in every language we publish, so detecting
448
+ * them here keeps localized briefs from leaking their first banner line
449
+ * into the `<meta description>`.
450
+ *
451
+ * @param line - Trimmed source line
452
+ * @returns `true` when the line is a banner row in any locale
453
+ */
454
+ function isLocalizedBannerRow(line) {
455
+ if (!line.startsWith('**'))
456
+ return false;
457
+ if (!line.includes('|'))
458
+ return false;
459
+ const inside = (line.match(/\*\*[^*]+[::]\s*\*\*/g) ?? []).length;
460
+ const after = (line.match(/\*\*[^*]+\*\*\s*[::]/g) ?? []).length;
461
+ return inside + after >= 2;
462
+ }
398
463
  /**
399
464
  * Strip inline Markdown decorations so we can use the remaining text as
400
465
  * plain-text meta-tag content. Removes link syntax, emphasis, inline code
@@ -544,12 +609,141 @@ export function truncateDescription(text) {
544
609
  export function truncateTitle(text) {
545
610
  if (text.length <= TITLE_MAX_LENGTH)
546
611
  return text;
612
+ // Prefer ending at a natural clause boundary inside the
613
+ // `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]` window so the truncated
614
+ // title reads as a complete journalistic clause rather than a
615
+ // mid-sentence prose snippet. Iterate boundaries in priority order;
616
+ // when a candidate falls in the window, break there and drop the
617
+ // ellipsis since the result is grammatically complete.
618
+ const search = text.slice(0, TITLE_MAX_LENGTH);
619
+ for (const boundary of HEADLINE_CLAUSE_BOUNDARIES) {
620
+ const idx = search.lastIndexOf(boundary);
621
+ if (idx >= HEADLINE_SOFT_MIN) {
622
+ const clean = stripTrailingStopWordsAndPunctuation(text.slice(0, idx));
623
+ if (clean.length >= HEADLINE_SOFT_MIN)
624
+ return clean;
625
+ }
626
+ }
547
627
  const cut = text.slice(0, TITLE_MAX_LENGTH - 1);
548
628
  const lastSpace = cut.lastIndexOf(' ');
549
629
  let safe = lastSpace > TITLE_MAX_LENGTH - 40 ? cut.slice(0, lastSpace) : cut;
550
630
  safe = stripTrailingStopWordsAndPunctuation(safe);
551
631
  return `${safe}…`;
552
632
  }
633
+ /**
634
+ * Return the first complete sentence from a prose paragraph, suitable
635
+ * for use as a fallback editorial title when the artefact H1 is
636
+ * categorical (e.g. `# EU Parliament Committee Reports`) and the
637
+ * resolver must derive `<title>` from the BLUF / lede summary instead.
638
+ *
639
+ * A "sentence" is the prefix up to the first sentence-terminator
640
+ * (`. `, `! `, `? `, `; `) inside the `[HEADLINE_SOFT_MIN,
641
+ * 1.5 × TITLE_MAX_LENGTH]` window. Common abbreviations (`Q1.`, `Q2.`,
642
+ * `H1.`, `H2.`, `Mr.`, `Mrs.`, `e.g.`, `i.e.`, `vs.`) are skipped
643
+ * so they don't terminate the sentence prematurely. When no
644
+ * acceptable terminator exists in the window, returns the entire
645
+ * input unchanged so {@link truncateTitle} can handle clause-boundary
646
+ * truncation downstream.
647
+ *
648
+ * This produces journalistically clean titles even for the
649
+ * propositions / committee-reports cases where the BLUF paragraph
650
+ * opens with a single long sentence that exceeds 140 chars —
651
+ * `truncateTitle` then breaks on a clause boundary, and the result is
652
+ * still grammatical because the input was a sentence prefix rather
653
+ * than an arbitrary paragraph slice.
654
+ *
655
+ * @param paragraph - Prose paragraph (post-{@link stripInlineMarkdown})
656
+ * @returns First sentence, or the original paragraph when none can be
657
+ * identified within the soft-min window
658
+ */
659
+ export function extractFirstSentence(paragraph) {
660
+ const trimmed = paragraph.trim();
661
+ if (trimmed.length <= HEADLINE_SOFT_MIN)
662
+ return trimmed;
663
+ // Limit terminator search to TITLE_MAX_LENGTH * 1.5 — beyond that
664
+ // we'd rather let truncateTitle clause-truncate the original
665
+ // paragraph than return a too-long first sentence.
666
+ const window = trimmed.slice(0, Math.floor(TITLE_MAX_LENGTH * 1.5));
667
+ // Skip common abbreviations that contain a period inside a token
668
+ // (Q1., e.g., i.e., vs., Mr., Mrs., No., U.S., E.U.). We walk
669
+ // candidate terminator positions; a position counts only when the
670
+ // char before it is *not* part of a known abbreviation token.
671
+ const terminators = ['. ', '! ', '? ', '; '];
672
+ let bestIdx = -1;
673
+ for (const t of terminators) {
674
+ let from = HEADLINE_SOFT_MIN;
675
+ let idx;
676
+ while ((idx = window.indexOf(t, from)) !== -1) {
677
+ if (!isAbbreviationBoundary(window, idx) && idx < window.length - 1) {
678
+ if (bestIdx === -1 || idx < bestIdx)
679
+ bestIdx = idx;
680
+ break;
681
+ }
682
+ from = idx + t.length;
683
+ }
684
+ }
685
+ if (bestIdx >= HEADLINE_SOFT_MIN) {
686
+ return trimmed.slice(0, bestIdx + 1).trim();
687
+ }
688
+ return trimmed;
689
+ }
690
+ /**
691
+ * Abbreviation tokens (lowercase, including the trailing period) that
692
+ * should NOT count as sentence terminators when {@link extractFirstSentence}
693
+ * scans for a `.` boundary. Single-letter all-caps initials
694
+ * (`U.S.`, `E.U.`) are handled by the all-caps-initial check below.
695
+ */
696
+ const ABBREVIATION_PREFIXES = [
697
+ 'mr.',
698
+ 'mrs.',
699
+ 'ms.',
700
+ 'dr.',
701
+ 'st.',
702
+ 'no.',
703
+ 'vs.',
704
+ 'e.g.',
705
+ 'i.e.',
706
+ 'etc.',
707
+ 'cf.',
708
+ 'al.',
709
+ // EP fiscal-year and quarter shorthand: Q1., Q2., Q3., Q4., H1., H2., FY.
710
+ 'q1.',
711
+ 'q2.',
712
+ 'q3.',
713
+ 'q4.',
714
+ 'h1.',
715
+ 'h2.',
716
+ 'fy.',
717
+ ];
718
+ /**
719
+ * Check whether the character preceding the `.` at `idx` in `text`
720
+ * indicates an abbreviation (so the `.` is not a sentence terminator).
721
+ * Matches the {@link ABBREVIATION_PREFIXES} table and the all-caps
722
+ * single-letter initials pattern (`U.S.`, `E.U.`).
723
+ *
724
+ * @param text - Source text (lowercased segment + original mixed-case)
725
+ * @param idx - Index of the `.` character in `text`
726
+ * @returns `true` when the period at `idx` is part of an abbreviation
727
+ */
728
+ function isAbbreviationBoundary(text, idx) {
729
+ // All-caps single-letter initial like `U.S.` or `E.U.` — char at
730
+ // idx-1 is a capital letter, and idx-2 is either start of string,
731
+ // whitespace, or another single-letter+period pair.
732
+ if (idx >= 1) {
733
+ const prev = text.charCodeAt(idx - 1);
734
+ const isUpperLetter = prev >= 65 && prev <= 90;
735
+ if (isUpperLetter && (idx === 1 || text[idx - 2] === ' ' || text[idx - 2] === '.')) {
736
+ return true;
737
+ }
738
+ }
739
+ // ABBREVIATION_PREFIXES lookup — scan backwards from `.` to find the
740
+ // start of the word, then compare lowercased.
741
+ let start = idx;
742
+ while (start > 0 && /[a-zA-Z]/u.test(text[start - 1] ?? ''))
743
+ start--;
744
+ const token = text.slice(start, idx + 1).toLowerCase();
745
+ return ABBREVIATION_PREFIXES.includes(token);
746
+ }
553
747
  /**
554
748
  * Return the first Markdown H1 (`# …`) in the supplied text, stripped of
555
749
  * the leading `#` and trailing anchor syntax. Returns an empty string when
@@ -573,15 +767,48 @@ export function extractFirstH1(markdown) {
573
767
  return '';
574
768
  }
575
769
  /**
576
- * Walk every line of the Markdown source and return the first line that
577
- * survives {@link shouldSkipDescriptionLine}. Inline Markdown decorations
578
- * are stripped and the result is truncated to fit `<meta description>`.
770
+ * Process one Markdown line against the in-progress paragraph buffer.
771
+ * Returns the desired loop control: `'continue'` (skip silently),
772
+ * `'break'` (paragraph terminated — emit), or `'collected'` (line was
773
+ * pushed into the buffer; caller checks the cap separately).
774
+ *
775
+ * Factored out of the two extractors to reduce cognitive complexity.
776
+ *
777
+ * @param line - Trimmed Markdown line
778
+ * @param buf - In-progress paragraph buffer (mutated on `'collected'`)
779
+ * @returns Loop control directive
780
+ */
781
+ function collectProseLine(line, buf) {
782
+ const hasBuffer = buf.lines.length > 0;
783
+ if (hasBuffer && line === '')
784
+ return 'break';
785
+ if (line === '')
786
+ return 'continue';
787
+ if (shouldSkipDescriptionLine(line))
788
+ return hasBuffer ? 'break' : 'continue';
789
+ const plain = stripLeadingProseLabel(stripInlineMarkdown(line));
790
+ if (!hasBuffer && plain.length < 40)
791
+ return 'continue';
792
+ buf.lines.push(plain);
793
+ buf.byteCount += plain.length + 1;
794
+ return 'collected';
795
+ }
796
+ /**
797
+ * Walk every line of the Markdown source and return the first paragraph
798
+ * that survives {@link shouldSkipDescriptionLine}. Consecutive non-blank
799
+ * prose lines are joined with a single space so hard-wrapped ledes
800
+ * (column-95 conventional wrap) produce a clean 140-180-character
801
+ * description rather than just the first 60-90-char line.
802
+ *
803
+ * Inline Markdown decorations are stripped and the result is truncated
804
+ * to fit `<meta description>`.
579
805
  *
580
806
  * @param markdown - Markdown source
581
807
  * @returns Prose description, or empty string when nothing qualifies
582
808
  */
583
809
  export function extractStrongProseLine(markdown) {
584
810
  let inFence = false;
811
+ const buf = { lines: [], byteCount: 0 };
585
812
  for (const raw of markdown.split('\n')) {
586
813
  const line = raw.trim();
587
814
  if (line.startsWith('```') || line.startsWith('~~~')) {
@@ -590,58 +817,93 @@ export function extractStrongProseLine(markdown) {
590
817
  }
591
818
  if (inFence)
592
819
  continue;
593
- if (shouldSkipDescriptionLine(line))
594
- continue;
595
- const plain = stripLeadingProseLabel(stripInlineMarkdown(line));
596
- if (plain.length < 40)
820
+ const directive = collectProseLine(line, buf);
821
+ if (directive === 'continue')
597
822
  continue;
598
- return truncateDescription(plain);
823
+ if (directive === 'break')
824
+ break;
825
+ if (buf.byteCount >= DESCRIPTION_MAX_LENGTH)
826
+ break;
599
827
  }
600
- return '';
828
+ if (buf.lines.length === 0)
829
+ return '';
830
+ return truncateDescription(buf.lines.join(' '));
601
831
  }
602
832
  /**
603
- * Walk the body of an editorial artefact and, when it contains a `## …`
604
- * heading whose text matches one of `EDITORIAL_LEDE_HEADINGS`,
605
- * return the first prose paragraph that follows that heading. This is
606
- * the journalist's lede ("60-Second Read", "TL;DR", "BLUF — …", …) and
607
- * is exactly the sentence that should power `<meta description>` and
608
- * the OG/Twitter description fields.
833
+ * Classify one Markdown line for the {@link extractLedeAfterHeading}
834
+ * walker. The returned directive is then applied to walker state by
835
+ * {@link applyLedeDirective}.
609
836
  *
610
- * Returns the empty string when no lede heading is found or no qualifying
611
- * prose follows it. Inline Markdown is stripped and the result is
612
- * truncated to fit `<meta description>`.
837
+ * @param line - Trimmed Markdown line
838
+ * @param isInFence - True when the previous line opened a fenced block
839
+ * @param inLede - True when the previous line was inside a lede heading block
840
+ * @param hasBuffered - True when at least one prose line has been collected
841
+ * @returns Directive describing how the walker should treat this line
842
+ */
843
+ function classifyLedeLine(line, isInFence, inLede, hasBuffered) {
844
+ if (line.startsWith('```') || line.startsWith('~~~'))
845
+ return { kind: 'fence' };
846
+ if (isInFence)
847
+ return { kind: 'pause' };
848
+ if (/^#{2,3}\s+/.test(line)) {
849
+ if (hasBuffered)
850
+ return { kind: 'pause' };
851
+ const headingText = normaliseHeadingText(line.replace(/^#{2,3}\s+/, ''));
852
+ const match = EDITORIAL_LEDE_HEADINGS.some((h) => isLedeHeadingMatch(headingText, h));
853
+ return { kind: 'heading', inLede: match };
854
+ }
855
+ return inLede ? { kind: 'collect' } : { kind: 'pause' };
856
+ }
857
+ /**
858
+ * Apply one directive emitted by {@link classifyLedeLine} to the walk
859
+ * state. Returns `'break'` to stop the walk, `'continue'` to skip to
860
+ * the next line, or `'collect'` when the caller should now run
861
+ * {@link collectProseLine}. Mutates `state` for fence/in-lede toggles.
613
862
  *
614
- * @param markdown - Editorial artefact source
615
- * @returns Lede paragraph, or empty string when none matched
863
+ * @param directive - Classification of the current line
864
+ * @param state - Walk state (mutated in place)
865
+ * @param state.inFence - True when the current line is inside a fenced block
866
+ * @param state.inLede - True when the current line is inside a lede heading block
867
+ * @param hasBuffered - Whether any prose has already been collected
868
+ * @returns Loop control directive
616
869
  */
870
+ function applyLedeDirective(directive, state, hasBuffered) {
871
+ if (directive.kind === 'fence') {
872
+ state.inFence = !state.inFence;
873
+ return 'continue';
874
+ }
875
+ if (directive.kind === 'heading') {
876
+ if (hasBuffered)
877
+ return 'break';
878
+ state.inLede = directive.inLede;
879
+ return 'continue';
880
+ }
881
+ if (directive.kind === 'pause')
882
+ return 'continue';
883
+ return 'collect';
884
+ }
617
885
  export function extractLedeAfterHeading(markdown) {
618
- const lines = markdown.split('\n');
619
- let inLede = false;
620
- let inFence = false;
621
- for (let i = 0; i < lines.length; i++) {
622
- const raw = lines[i] ?? '';
886
+ const state = { inFence: false, inLede: false };
887
+ const buf = { lines: [], byteCount: 0 };
888
+ for (const raw of markdown.split('\n')) {
623
889
  const line = raw.trim();
624
- if (line.startsWith('```') || line.startsWith('~~~')) {
625
- inFence = !inFence;
626
- continue;
627
- }
628
- if (inFence)
629
- continue;
630
- if (/^#{2,3}\s+/.test(line)) {
631
- const headingText = normaliseHeadingText(line.replace(/^#{2,3}\s+/, ''));
632
- inLede = EDITORIAL_LEDE_HEADINGS.some((h) => headingText === h || headingText.startsWith(`${h} `) || headingText.startsWith(`${h}:`));
633
- continue;
634
- }
635
- if (!inLede)
890
+ const directive = classifyLedeLine(line, state.inFence, state.inLede, buf.lines.length > 0);
891
+ const action = applyLedeDirective(directive, state, buf.lines.length > 0);
892
+ if (action === 'break')
893
+ break;
894
+ if (action === 'continue')
636
895
  continue;
637
- if (shouldSkipDescriptionLine(line))
896
+ const collect = collectProseLine(line, buf);
897
+ if (collect === 'continue')
638
898
  continue;
639
- const plain = stripLeadingProseLabel(stripInlineMarkdown(line));
640
- if (plain.length < 40)
641
- continue;
642
- return truncateDescription(plain);
899
+ if (collect === 'break')
900
+ break;
901
+ if (buf.byteCount >= DESCRIPTION_MAX_LENGTH)
902
+ break;
643
903
  }
644
- return '';
904
+ if (buf.lines.length === 0)
905
+ return '';
906
+ return truncateDescription(buf.lines.join(' '));
645
907
  }
646
908
  /**
647
909
  * Normalise a Markdown heading's text for comparison against the
@@ -660,6 +922,32 @@ function normaliseHeadingText(raw) {
660
922
  .trim()
661
923
  .toLowerCase();
662
924
  }
925
+ /**
926
+ * Word-boundary match against an editorial-lede whitelist entry. Matches
927
+ * when the normalised heading equals the whitelist entry exactly, or when
928
+ * the entry is followed by any non-alphanumeric character — covering
929
+ * localized parenthetical glosses written with ASCII or full-width
930
+ * punctuation (e.g. `bluf (bottom line up front)`, `bluf(結論先出し)`,
931
+ * `bluf — 핵심 결론`, `60-second read — what happened`).
932
+ *
933
+ * @param headingText - Normalised heading text (lower-case, decoration-stripped)
934
+ * @param whitelistEntry - Lower-case whitelist entry from
935
+ * {@link EDITORIAL_LEDE_HEADINGS}
936
+ * @returns `true` when `headingText` begins with `whitelistEntry` at a
937
+ * word boundary
938
+ */
939
+ function isLedeHeadingMatch(headingText, whitelistEntry) {
940
+ if (headingText === whitelistEntry)
941
+ return true;
942
+ if (!headingText.startsWith(whitelistEntry))
943
+ return false;
944
+ const next = headingText.charAt(whitelistEntry.length);
945
+ // Word boundary — anything that is not an ASCII letter/digit is a
946
+ // separator we accept. This works uniformly across ASCII parentheses,
947
+ // CJK full-width brackets `(`, dashes `— – -`, colons `:`, and the
948
+ // ideographic full-width colon `:`.
949
+ return next === '' || !/[a-z0-9]/.test(next);
950
+ }
663
951
  /**
664
952
  * Return `true` when an artefact-H1 begins with one of the
665
953
  * `ARTIFACT_CATEGORY_PREFIXES` followed by a separator. Such H1s
@@ -830,8 +1118,151 @@ export function isGenericHeading(heading, articleType, date) {
830
1118
  if (trailingDateOnly.test(normalized)) {
831
1119
  return true;
832
1120
  }
1121
+ if (isCategoryNounHeading(normalized, articleType))
1122
+ return true;
1123
+ if (isBareInstitutionalHeading(normalized))
1124
+ return true;
833
1125
  return false;
834
1126
  }
1127
+ /**
1128
+ * Lower-cased institutional self-references that an executive-brief
1129
+ * authoring template sometimes emits as the H1 when the agent forgot to
1130
+ * substitute a real headline. They identify the publisher / institution
1131
+ * but carry **zero editorial information** — they would produce
1132
+ * pathological `<title>EU Parliament</title>` strings if surfaced.
1133
+ * Matched after whitespace collapse + lowercase, with any trailing
1134
+ * punctuation / single-date qualifier stripped so `EU Parliament ·
1135
+ * 2026-05-15` and `Hack23 AB —` both resolve here. Date *ranges*
1136
+ * (e.g. `: 19–22 May 2026`) are preserved as editorial
1137
+ * content, matching the {@link isCategoryNounHeading} contract.
1138
+ */
1139
+ const BARE_INSTITUTIONAL_HEADINGS = [
1140
+ 'eu parliament',
1141
+ 'european parliament',
1142
+ 'the european parliament',
1143
+ 'ep',
1144
+ 'ep10',
1145
+ 'ep11',
1146
+ 'hack23',
1147
+ 'hack23 ab',
1148
+ 'eu parliament monitor',
1149
+ 'european parliament monitor',
1150
+ 'executive brief',
1151
+ 'briefing',
1152
+ 'intelligence brief',
1153
+ 'intelligence briefing',
1154
+ ];
1155
+ /**
1156
+ * Return `true` when the heading is one of {@link BARE_INSTITUTIONAL_HEADINGS}
1157
+ * — an institutional self-reference with no editorial content. Strips a
1158
+ * trailing single-date qualifier first so `EU Parliament — 2026-05-15`
1159
+ * and `Hack23 AB · 2026-05-15` are caught. Date ranges and any token
1160
+ * after the institutional noun are preserved (so
1161
+ * `EU Parliament Week Ahead: 19–22 May 2026` is *not* flagged here —
1162
+ * that path is owned by {@link isCategoryNounHeading} for `week-ahead`).
1163
+ *
1164
+ * @param normalized - Heading text after whitespace collapse
1165
+ * @returns `true` when the heading is bare institutional boilerplate
1166
+ */
1167
+ function isBareInstitutionalHeading(normalized) {
1168
+ let core = normalized.toLowerCase();
1169
+ // Same single-date / parenthetical stripping as isCategoryNounHeading
1170
+ // so the same heading shape is recognized via either gate.
1171
+ core = core.replace(/\s*[·:—–-]\s*\d{4}-\d{2}-\d{2}\s*$/u, '');
1172
+ core = core.replace(/\s*\(\s*[a-z]{3,9}\s+\d{4}\s*\)\s*$/u, '');
1173
+ core = core.replace(/\s*\(\s*\d{4}\s*\)\s*$/u, '');
1174
+ core = core.replace(/[\s\-—–:·.]+$/u, '').trim();
1175
+ return BARE_INSTITUTIONAL_HEADINGS.includes(core);
1176
+ }
1177
+ /**
1178
+ * Curated category-noun whitelist per article-type slug. These are the
1179
+ * boring "EU Parliament &lt;Type&gt;" / "EP10 &lt;Type&gt;" headings that the
1180
+ * executive-brief authoring conventions allow as decorative H1s but
1181
+ * which carry **no editorial information** — they merely restate the
1182
+ * article category. When such a heading reaches the metadata resolver
1183
+ * it must be flagged generic so the resolver falls through to the
1184
+ * BLUF / lede summary instead of using the category noun as `<title>`.
1185
+ *
1186
+ * Keys are slugs (`article-type` form). Values are lowercase category
1187
+ * cores, matched after stripping institutional prefixes
1188
+ * (`eu parliament `, `european parliament `, `ep `, `ep10 `, `ep11 `)
1189
+ * and trailing date qualifiers (`· 2026-05-15`, `— 2026-05-15`,
1190
+ * `(May 2026)`; `: 19–22 May 2026` is **kept** because date ranges
1191
+ * carry editorial info — only single-date suffixes are stripped).
1192
+ */
1193
+ const CATEGORY_NOUN_CORES = {
1194
+ breaking: ['breaking', 'breaking news'],
1195
+ 'week-in-review': ['week in review'],
1196
+ 'week-ahead': ['week ahead'],
1197
+ 'month-in-review': ['month in review'],
1198
+ 'month-ahead': ['month ahead'],
1199
+ 'quarter-in-review': ['quarter in review'],
1200
+ 'quarter-ahead': ['quarter ahead'],
1201
+ 'year-in-review': ['year in review'],
1202
+ 'year-ahead': ['year ahead'],
1203
+ 'committee-reports': [
1204
+ 'committee reports',
1205
+ 'committee activity',
1206
+ 'committee activity report',
1207
+ 'committee activity reports',
1208
+ ],
1209
+ motions: [
1210
+ 'motions',
1211
+ 'motions and adopted texts',
1212
+ 'plenary votes and resolutions',
1213
+ 'plenary votes resolutions',
1214
+ ],
1215
+ propositions: ['propositions', 'legislative propositions', 'legislative procedures'],
1216
+ 'election-cycle': ['election cycle'],
1217
+ 'term-outlook': ['term outlook'],
1218
+ };
1219
+ /**
1220
+ * Return `true` when the heading is a bare category-noun string for the
1221
+ * supplied `articleType` slug, regardless of the institutional prefix
1222
+ * (`EU Parliament `, `European Parliament `, `EP `, `EP10 `, `EP11 `).
1223
+ * Strips a trailing single-date qualifier (` · YYYY-MM-DD`,
1224
+ * ` — YYYY-MM-DD`, `(May 2026)`, `(2026)`) before matching; date-range
1225
+ * qualifiers (`: 19–22 May 2026`) carry editorial information and are
1226
+ * NOT stripped, so headings like `EP Week Ahead: 19–22 May 2026` are
1227
+ * preserved as legitimate editorial headlines.
1228
+ *
1229
+ * @param normalized - Heading text after whitespace collapse
1230
+ * @param articleType - Article-type slug
1231
+ * @returns `true` when the heading is category-noun boilerplate
1232
+ */
1233
+ function isCategoryNounHeading(normalized, articleType) {
1234
+ const cores = CATEGORY_NOUN_CORES[articleType];
1235
+ if (!cores || cores.length === 0)
1236
+ return false;
1237
+ let core = normalized.toLowerCase();
1238
+ // Strip institutional prefix (longest-first match).
1239
+ const prefixes = [
1240
+ "the european parliament's ",
1241
+ 'european parliament ',
1242
+ 'eu parliament ',
1243
+ 'ep11 ',
1244
+ 'ep10 ',
1245
+ 'ep ',
1246
+ ];
1247
+ for (const p of prefixes) {
1248
+ if (core.startsWith(p)) {
1249
+ core = core.slice(p.length);
1250
+ break;
1251
+ }
1252
+ }
1253
+ // Strip trailing single-date qualifier. We deliberately do NOT strip
1254
+ // date *ranges* (`19–22 may 2026`, `28-30 april 2026`) because those
1255
+ // identify a specific reporting window — that IS editorial content.
1256
+ // Patterns stripped:
1257
+ // ` · 2026-05-15`, ` — 2026-05-15`, ` - 2026-05-15`, `: 2026-05-15`
1258
+ // ` (may 2026)`, ` (2026)`
1259
+ core = core.replace(/\s*[·:—–-]\s*\d{4}-\d{2}-\d{2}\s*$/u, '');
1260
+ core = core.replace(/\s*\(\s*[a-z]{3,9}\s+\d{4}\s*\)\s*$/u, '');
1261
+ core = core.replace(/\s*\(\s*\d{4}\s*\)\s*$/u, '');
1262
+ // Trailing punctuation residue.
1263
+ core = core.replace(/[\s\-—–:·]+$/u, '').trim();
1264
+ return cores.includes(core);
1265
+ }
835
1266
  /**
836
1267
  * Escape regex metacharacters so a dynamic string can be embedded safely
837
1268
  * in a pattern built at runtime.
@@ -858,7 +1289,17 @@ export function extractArtifactHighlight(runDir, articleType, date) {
858
1289
  const direct = scanCandidatesForHighlight(runDir, EDITORIAL_ARTEFACT_CANDIDATES, articleType, date);
859
1290
  if (direct.headline)
860
1291
  return { headline: direct.headline, summary: direct.summary };
861
- const topLevel = safeReaddir(runDir).filter((f) => f.endsWith('.md') && f !== 'manifest.json');
1292
+ // Top-level fallback scan used only when none of the canonical
1293
+ // editorial artefacts produced a non-generic H1. We must NOT pick up
1294
+ // translated sibling briefs (`executive-brief_<lang>.md`,
1295
+ // `synthesis-summary_<lang>.md`, …) here, because their H1s are
1296
+ // legitimate localized headlines that the English-only
1297
+ // {@link isGenericHeading} detector cannot recognise as boilerplate.
1298
+ // Letting them through poisoned the English `<title>` and
1299
+ // `<meta description>` for the 2026-05-15 batch with Arabic content
1300
+ // from `executive-brief_ar.md`. See {@link isTranslatedSiblingBrief}
1301
+ // and the regression test in `test/unit/article-metadata.test.js`.
1302
+ const topLevel = safeReaddir(runDir).filter((f) => f.endsWith('.md') && f !== 'manifest.json' && !isTranslatedSiblingBrief(f));
862
1303
  const fallback = scanCandidatesForHighlight(runDir, topLevel, articleType, date);
863
1304
  if (fallback.headline)
864
1305
  return { headline: fallback.headline, summary: fallback.summary };
@@ -868,6 +1309,28 @@ export function extractArtifactHighlight(runDir, articleType, date) {
868
1309
  }
869
1310
  return null;
870
1311
  }
1312
/**
 * Case-insensitive matcher for filenames that end in `_<lang>.md`, where
 * `<lang>` is one of the codes listed in {@link ALL_LANGUAGES}
 * (e.g. `executive-brief_ar.md`, `synthesis-summary_zh.md`). Restricting
 * the suffix to known language codes keeps files whose names merely end
 * in some other `_xx.md` token from matching.
 */
const TRANSLATED_SIBLING_SUFFIX_RE = new RegExp(['_(', ALL_LANGUAGES.join('|'), ')\\.md$'].join(''), 'i');
/**
 * Tell whether a `.md` filename carries a `_<lang>` suffix and is thus a
 * translated sibling of an editorial artefact. The top-level fallback
 * scan in {@link extractArtifactHighlight} uses this to leave such files
 * out, since their localized H1s are not recognised by the English-only
 * generic-heading detector.
 *
 * @param filename - Run-relative `.md` filename (no path separators)
 * @returns `true` when the filename ends in `_<lang>.md`
 */
export function isTranslatedSiblingBrief(filename) {
    const hasLanguageSuffix = TRANSLATED_SIBLING_SUFFIX_RE.test(filename);
    return hasLanguageSuffix;
}
871
1334
  /**
872
1335
  * Walk a list of candidate artefact paths and return the first
873
1336
  * non-generic headline + summary pair, plus the first usable lede
@@ -925,6 +1388,25 @@ function probeCandidateForHighlight(runDir, rel, articleType, date) {
925
1388
  if (headline && !isGenericHeading(headline, articleType, date)) {
926
1389
  return { cleanHighlight: { headline: truncateTitle(headline), summary } };
927
1390
  }
1391
+ // The artefact H1 is generic boilerplate (`Executive Brief — EU Parliament
1392
+ // Breaking News`). Before falling back to a stripped category-core
1393
+ // headline, try to surface the FIRST NAMED PRIORITY FINDING from the
1394
+ // brief's `## Key Developments` / `## Priority Dossiers` /
1395
+ // `## Top Findings` block. This is the canonical Stage-B authoring
1396
+ // pattern (see `analysis/templates/executive-brief.md`) — every brief
1397
+ // lists its top dossiers as `**Name** (procedure-code, date) — paragraph`
1398
+ // or `### N. Name (committee)`. Surfacing that name produces a
1399
+ // distinctive editorial headline ("Digital Markets Act Enforcement",
1400
+ // "Ukraine War Accountability") instead of a stripped category noun.
1401
+ const priority = extractPriorityFindingHighlight(body);
1402
+ if (priority?.headline) {
1403
+ return {
1404
+ cleanHighlight: {
1405
+ headline: truncateTitle(priority.headline),
1406
+ summary: priority.summary || summary,
1407
+ },
1408
+ };
1409
+ }
928
1410
  if (headline) {
929
1411
  const stripped = stripArtifactCategoryAffix(headline);
930
1412
  if (stripped && !isGenericHeading(stripped, articleType, date)) {
@@ -933,6 +1415,565 @@ function probeCandidateForHighlight(runDir, rel, articleType, date) {
933
1415
  }
934
1416
  return { summary };
935
1417
  }
1418
/**
 * Lower-case heading phrases that open the named-priority-finding block
 * of an executive brief. {@link headingMatchesPriorityProbe} compares
 * them against heading text that has already been passed through
 * {@link normaliseHeadingText}, accepting either an exact match or a
 * word-bounded occurrence inside a longer heading.
 */
const PRIORITY_FINDING_SECTION_HEADINGS = [
    // "key …" / "headline …" / "lead story" family
    'key developments',
    'key findings',
    'key intelligence summary',
    'key judgements',
    'key judgments',
    'headline intelligence',
    'headline judgements',
    'headline judgments',
    'lead story',
    // alert / "priority …" family
    'policy intelligence alerts',
    'priority dossiers',
    'priority dossiers under committee scrutiny',
    'priority findings',
    'priority intelligence assessment',
    'priority items',
    // "top …" / trigger family
    'top findings',
    'top developments',
    'top dossiers',
    'top trigger events',
    'top triggers',
    'trigger events',
    'top documents',
    'top procedures',
    'top 3 triggers',
    // assessment / priority-level headings
    'wep assessment',
    'high priority',
    'highest priority',
];
1452
/**
 * Derive a headline/summary pair from the first named priority finding
 * of an executive-brief–style artefact body.
 *
 * Two strategies are tried in order:
 *
 *   1. {@link scanPrioritySection} — look inside the first section whose
 *      heading is listed in {@link PRIORITY_FINDING_SECTION_HEADINGS},
 *      e.g. a numbered bold item
 *      `1. **Digital Markets Act Enforcement** (TA-10-2026-0160, 2026-04-30)`,
 *      a numbered subheading
 *      `### 1. Clean Industrial Deal Implementation (ITRE/ENVI)`,
 *      or a bold-leading paragraph
 *      `**Trigger 1: DMA Enforcement Resolution** (TA-10-2026-0160)`.
 *   2. {@link scanH2StoryHeadings} — when strategy 1 yields nothing, look
 *      for a story-style `## …` heading such as `## Lead Story: …`.
 *
 * @param body - Editorial artefact body
 * @returns `{headline, summary}` from the first strategy that succeeds;
 *   `null` when the body is empty or neither strategy finds an item
 */
export function extractPriorityFindingHighlight(body) {
    if (!body) {
        return null;
    }
    const bodyLines = body.split('\n');
    const fromSection = scanPrioritySection(bodyLines);
    if (fromSection !== null && fromSection !== undefined) {
        return fromSection;
    }
    return scanH2StoryHeadings(bodyLines);
}
1484
/**
 * Strategy 1 — find the first recognised priority-finding section
 * heading and return the first item below it that
 * {@link extractPriorityFindingItem} can parse.
 *
 * The walk ends (with `null`) at the next `## …` heading; deeper
 * `### …` / `#### …` headings are passed to the item extractor like any
 * other line, so a subheading such as `### 🔴 HIGH PRIORITY` sitting
 * between the section header and the first item does not stop the scan.
 *
 * @param lines - Body lines (already split on `\n`)
 * @returns `{headline, summary}` for the first parsable item, or `null`
 *   when there is no such section or it holds no parsable item
 */
function scanPrioritySection(lines) {
    const headingIndex = findPrioritySectionStart(lines);
    if (headingIndex < 0) {
        return null;
    }
    let cursor = headingIndex + 1;
    while (cursor < lines.length) {
        const current = (lines[cursor] ?? '').trim();
        if (current) {
            // Exactly two leading hashes: a sibling section begins here.
            if (/^##(?!#)/.test(current)) {
                return null;
            }
            const item = extractPriorityFindingItem(lines, cursor);
            if (item) {
                return item;
            }
        }
        cursor += 1;
    }
    return null;
}
1511
/**
 * Keywords that may open a story-style H2 heading (`## Lead Story: …`,
 * `## Story 1 — …`, `## Trigger 2: …`). Consumed by
 * {@link extractH2StoryHeadline}; matching is case-insensitive and the
 * list is walked in order.
 */
const H2_STORY_TOKENS = ['Lead Story', 'Story', 'Trigger', 'Alert', 'Judgement', 'Judgment'];
/**
 * Strategy 2 — inspect every `## …` heading in order and return the
 * result for the first one that both looks like a story heading
 * (`## 📌 Lead Story: Russia Accountability`, `## Story 1 — DMA
 * Enforcement`) and survives {@link buildPriorityResult}.
 *
 * @param lines - Body lines (already split on `\n`)
 * @returns `{headline, summary}` for the first usable story heading, or
 *   `null` when none is found
 */
function scanH2StoryHeadings(lines) {
    for (const [index, rawLine] of lines.entries()) {
        const trimmed = (rawLine ?? '').trim();
        if (!trimmed.startsWith('## ')) {
            continue;
        }
        const title = extractH2StoryHeadline(trimmed.replace(/^##\s+/u, ''));
        if (!title) {
            continue;
        }
        const outcome = buildPriorityResult(title, '', lines, index);
        if (outcome?.headline) {
            return outcome;
        }
    }
    return null;
}
/**
 * Pull the title out of a story-shaped H2 heading.
 *
 * Accepted shapes, after dropping up to four leading non-alphanumeric
 * characters (emoji / decoration):
 *   - `Lead Story<sep> Title`
 *   - `<Keyword> <number><sep> Title` for every other keyword in
 *     {@link H2_STORY_TOKENS}
 * where `<sep>` is one of `:` `—` `–` `-` `.` followed by whitespace. The
 * separator is mandatory, so ordinary prose headings never match.
 *
 * @param headingText - Heading text with the leading `## ` already removed
 * @returns The title portion, or an empty string when the heading is not
 *   story-shaped
 */
function extractH2StoryHeadline(headingText) {
    const undecorated = headingText.replace(/^[^A-Za-z0-9]{0,4}\s*/u, '');
    const lowered = undecorated.toLowerCase();
    for (const keyword of H2_STORY_TOKENS) {
        if (!lowered.startsWith(keyword.toLowerCase())) {
            continue;
        }
        let remainder = undecorated.slice(keyword.length).trim();
        if (keyword !== 'Lead Story') {
            // Every keyword except `Lead Story` must carry an ordinal.
            const ordinal = /^\d+\b/u.exec(remainder);
            if (ordinal === null) {
                continue;
            }
            remainder = remainder.slice(ordinal[0].length).trim();
        }
        const separated = /^[:—–\-.]\s+(.+)$/u.exec(remainder);
        if (separated?.[1]) {
            return separated[1].trim();
        }
    }
    return '';
}
1583
/**
 * Find the first heading of level 2–4 (`##` … `####`) whose normalised
 * text is recognised by {@link headingMatchesPriorityProbe}.
 *
 * @param lines - Body lines (already split on `\n`)
 * @returns Index of the matching heading line, or `-1` when there is none
 */
function findPrioritySectionStart(lines) {
    return lines.findIndex((rawLine) => {
        const heading = /^#{2,4}\s+(.+)$/u.exec((rawLine ?? '').trim());
        if (heading === null) {
            return false;
        }
        const label = normaliseHeadingText(heading[1] ?? '');
        // Headings that normalise to nothing can never be a section start.
        return Boolean(label) && headingMatchesPriorityProbe(label);
    });
}
1604
/**
 * Word-boundary substring matcher for the priority-finding section
 * detector.
 *
 * A probe matches when it equals the heading text, or when ANY of its
 * occurrences inside the text is delimited on both sides by the start /
 * end of the string or by a character outside `[A-Za-z0-9]`. Every
 * occurrence is examined — checking only the first one would miss a
 * heading in which the phrase first appears embedded in a longer word
 * and only later as a standalone phrase.
 *
 * @param text - Heading text already normalised by {@link normaliseHeadingText}
 * @param probes - Phrases to look for; defaults to
 *   {@link PRIORITY_FINDING_SECTION_HEADINGS}
 * @returns `true` when at least one probe appears as a word-bounded
 *   substring of `text`
 */
function headingMatchesPriorityProbe(text, probes = PRIORITY_FINDING_SECTION_HEADINGS) {
    // `undefined` (index out of range) counts as a boundary.
    const isWordChar = (ch) => ch !== undefined && /[A-Za-z0-9]/.test(ch);
    return probes.some((probe) => {
        if (text === probe) {
            return true;
        }
        if (probe.length === 0) {
            // An empty probe would match at every position and never advance.
            return false;
        }
        let idx = text.indexOf(probe);
        while (idx >= 0) {
            if (!isWordChar(text[idx - 1]) && !isWordChar(text[idx + probe.length])) {
                return true;
            }
            idx = text.indexOf(probe, idx + 1);
        }
        return false;
    });
}
1627
/**
 * Try to read the line at index `i` as a priority-finding item and, on
 * success, hand it to {@link buildPriorityResult}.
 *
 * Recognised shapes, tried in this order:
 *   - Pattern A — numbered list item with a bold title:
 *     `1. **Digital Markets Act Enforcement** (TA-10-2026-0160, 2026-04-30)`
 *   - Pattern B — `###` / `####` subheading opened by a number or a short
 *     tag plus a separator:
 *     `### 1. Clean Industrial Deal Implementation (ITRE/ENVI)`,
 *     `### 1 · Headline Judgements`, `### KJ-1: …`, `### T-2: …`
 *   - Pattern D — `###` / `####` subheading opened by a keyword and a
 *     number: `### Alert 1 — Title 🔴`, `### Trigger 1: DMA Enforcement`
 *   - Pattern C — paragraph that starts with a bold span:
 *     `**Trigger 1: DMA Enforcement Resolution** (TA-10-2026-0160)`
 *
 * @param lines - Body lines (already split on `\n`)
 * @param i - Index of the candidate line
 * @returns The value returned by {@link buildPriorityResult} when a
 *   pattern matches, `null` otherwise
 */
function extractPriorityFindingItem(lines, i) {
    const candidateLine = (lines[i] ?? '').trim();
    // Pattern A: bold title inside a numbered list item; whatever follows
    // the closing `**` on the same line is passed on as the tail.
    const patternA = /^\d+\.\s+\*\*([^*]+?)\*\*\s*(.*)$/u.exec(candidateLine);
    if (patternA !== null) {
        const [, boldTitle, sameLineTail] = patternA;
        return buildPriorityResult(boldTitle ?? '', sameLineTail ?? '', lines, i);
    }
    // Patterns B (numeric, then tag) and D (keyword + number). They are
    // kept as separate narrow expressions and evaluated in this order.
    const subheadingPatterns = [
        /^#{3,4}\s+\d+[:.)·–—\s-]\s*(.+)$/u,
        /^#{3,4}\s+[A-Z]{1,3}-?\d+[:.)·–—\s-]\s*(.+)$/u,
        /^#{3,4}\s+(?:Alert|Judgement|Judgment|Finding|Story|Item|Trigger|Highlight|Dossier|Priority|Top)\s+\d+\s*[:.)·–—\s-]+(.+)$/iu,
    ];
    for (const pattern of subheadingPatterns) {
        const hit = pattern.exec(candidateLine);
        if (hit !== null) {
            return buildPriorityResult(hit[1] ?? '', '', lines, i);
        }
    }
    // Pattern C: bold-leading paragraph.
    const patternC = /^\*\*([^*]+?)\*\*\s*(.*)$/u.exec(candidateLine);
    if (patternC === null) {
        return null;
    }
    // Confidence rows and metadata banners (`**Date:** …`) are not headlines.
    if (candidateLine.startsWith('**Confidence') || isMetadataBoldLine(candidateLine)) {
        return null;
    }
    const boldText = (patternC[1] ?? '').trim();
    // More than 110 characters is a bold lede paragraph, not a headline.
    if (boldText.length === 0 || boldText.length > 110) {
        return null;
    }
    return buildPriorityResult(boldText, patternC[2] ?? '', lines, i);
}
1689
/**
 * Lower-case labels that, when they open the bold span of a line, mark
 * that line as a metadata banner row (`**Reporting Window:** 3 Apr – 1
 * May 2026`, `**Admiralty Grade:** B/2`, `**Date:** 2026-05-15`) rather
 * than an editorial headline. Consumed by {@link isMetadataBoldLine}.
 */
const PRIORITY_METADATA_BOLD_PREFIXES = [
    'admiralty',
    'classification',
    'confidence',
    'data sources',
    'data quality',
    'date',
    'generated',
    'lead author',
    'methodology',
    'reporting window',
    'run',
    'session',
    'source',
    'sources',
    'time horizon',
    'wep',
];
/**
 * Decide whether a bold-leading line is a metadata banner.
 *
 * The text of the leading bold span is lower-cased and compared with
 * each entry of {@link PRIORITY_METADATA_BOLD_PREFIXES}. It counts as
 * metadata when it
 *   - equals the label,
 *   - starts with the label directly followed by `:`,
 *   - starts with the label plus a space and contains a `:` somewhere, or
 *   - starts with the label followed by an em dash (with or without one
 *     space in between).
 *
 * @param line - Trimmed source line (already known to start with `**`)
 * @returns `true` when the line is a metadata banner that Pattern C must
 *   skip
 */
function isMetadataBoldLine(line) {
    const boldText = line
        .replace(/^\*\*([^*]+?)\*\*.*$/u, '$1')
        .trim()
        .toLowerCase();
    const hasColon = boldText.includes(':');
    return PRIORITY_METADATA_BOLD_PREFIXES.some((label) => boldText === label
        || boldText.startsWith(`${label}:`)
        || (hasColon && boldText.startsWith(`${label} `))
        || boldText.startsWith(`${label}—`)
        || boldText.startsWith(`${label} —`));
}
1741
/**
 * Turn one matched priority-finding item into a `{headline, summary}`
 * pair. The raw title is cleaned with {@link cleanPriorityHeadline}; the
 * summary is assembled by {@link collectPrioritySummaryLines}, joined
 * with single spaces and passed through `truncateDescription`.
 *
 * @param rawHeadline - Raw bold title or numbered-heading text
 * @param tail - Same-line trailing text (after the bold close / heading)
 * @param lines - Body lines (already split on `\n`)
 * @param i - Index of the matched line
 * @returns `{headline, summary}`, or `null` when the cleaned headline is
 *   shorter than five characters so the caller can move on
 */
function buildPriorityResult(rawHeadline, tail, lines, i) {
    const headline = cleanPriorityHeadline(rawHeadline);
    if (headline.length >= 5) {
        const segments = collectPrioritySummaryLines(tail, lines, i);
        const summary = truncateDescription(segments.join(' '));
        return { headline, summary };
    }
    return null;
}
1763
/**
 * Tell whether a follow-up line ends summary gathering in
 * {@link collectPrioritySummaryLines}: a Markdown heading (`#` … `######`
 * followed by whitespace), a numbered list item (`1. `) or a bullet
 * (`- ` / `* `).
 *
 * @param line - Trimmed follow-up line
 * @returns `true` when the gathering loop must break
 */
function isPrioritySummaryStopper(line) {
    const stopperPatterns = [/^#{1,6}\s/, /^\d+\.\s/, /^[-*]\s/];
    return stopperPatterns.some((pattern) => pattern.test(line));
}
1780
/**
 * Collect the prose that describes a priority-finding item.
 *
 * The first segment is the same-line tail with inline Markdown removed
 * and a leading as well as a trailing parenthesised metadata group
 * (`(TA-10-2026-0160, 2026-04-30)`) stripped. Lines after the item are
 * then appended until one of the following happens:
 *   - a blank line is reached after at least one segment was collected
 *     (blank lines before the first segment are skipped),
 *   - {@link isPrioritySummaryStopper} reports a heading / list item,
 *   - the joined text reaches `DESCRIPTION_MAX_LENGTH`.
 * Confidence rows and lines rejected by `shouldSkipDescriptionLine` are
 * skipped without ending the walk.
 *
 * @param tail - Same-line text that trails the bold/heading
 * @param lines - Full body lines
 * @param i - Index of the matched headline line
 * @returns Ordered list of summary segments
 */
function collectPrioritySummaryLines(tail, lines, i) {
    const segments = [];
    const tailWithoutLeadingParens = stripInlineMarkdown(tail)
        .trim()
        .replace(/^\([^()]{3,80}\)\s*/u, '');
    const tailProse = stripPriorityTailMetadata(tailWithoutLeadingParens).trim();
    if (tailProse) {
        segments.push(tailProse);
    }
    let cursor = i + 1;
    while (cursor < lines.length) {
        const followUp = (lines[cursor] ?? '').trim();
        cursor += 1;
        if (!followUp) {
            if (segments.length > 0) {
                break;
            }
            continue;
        }
        if (isPrioritySummaryStopper(followUp)) {
            break;
        }
        const isConfidenceRow = followUp.startsWith('**Confidence') || followUp.startsWith('- **Confidence');
        if (isConfidenceRow || shouldSkipDescriptionLine(followUp)) {
            continue;
        }
        segments.push(stripInlineMarkdown(followUp));
        if (segments.join(' ').length >= DESCRIPTION_MAX_LENGTH) {
            break;
        }
    }
    return segments;
}
1819
+ /**
1820
+ * Normalise a priority-finding headline: drop the
1821
+ * `Trigger N:` / `Dossier N:` / leading-numeric prefix, strip trailing
1822
+ * parenthesised metadata (`(TA-10-2026-0160, 2026-04-30)`,
1823
+ * `(ITRE/ENVI)`), and trim residual punctuation. The result is a
1824
+ * headline-shaped string suitable for `<title>` use.
1825
+ *
1826
+ * @param raw - Raw bold-title or heading text
1827
+ * @returns Cleaned headline (may be empty after stripping)
1828
+ */
1829
/**
 * Leading priority-label words removed by
 * {@link stripPriorityLeadingDecoration} (`🔴 CRITICAL — Title` →
 * `Title`). Multi-word labels are listed before their one-word prefix so
 * that `HIGH PRIORITY` is tried before `HIGH`.
 */
const PRIORITY_LABEL_TOKENS = [
    'CRITICAL',
    'HIGH PRIORITY',
    'HIGH',
    'MEDIUM PRIORITY',
    'MEDIUM',
    'LOW PRIORITY',
    'LOW',
    'URGENT',
    'ALERT',
    'PRIORITY',
];
/**
 * Trailing confidence-marker words removed by
 * {@link stripPriorityTrailingMarker}, ordered like
 * {@link PRIORITY_LABEL_TOKENS} (multi-word before one-word).
 */
const PRIORITY_TRAILING_TOKENS = [
    'CRITICAL',
    'HIGH PRIORITY',
    'HIGH',
    'MEDIUM PRIORITY',
    'MEDIUM',
    'LOW PRIORITY',
    'LOW',
];
/**
 * Leading editorial keywords removed, together with their ordinal, by
 * {@link stripPriorityLeadingPrefix} (`Trigger 1: Title` → `Title`).
 */
const PRIORITY_LEADING_PREFIX_TOKENS = [
    'Trigger',
    'Dossier',
    'Priority',
    'Finding',
    'Item',
    'Highlight',
    'Top',
    'Story',
    'Alert',
    'Judgement',
    'Judgment',
];
/**
 * One pre-compiled, case-insensitive expression per entry of
 * {@link PRIORITY_TRAILING_TOKENS}: whitespace, an optional single
 * symbol (e.g. an emoji), the marker word, end of string. Built once so
 * {@link stripPriorityTrailingMarker} does not recompile them per call.
 */
const PRIORITY_TRAILING_MARKER_PATTERNS = PRIORITY_TRAILING_TOKENS.map((token) => new RegExp(`\\s+[^\\p{L}\\p{N}\\s]?\\s*${token}\\s*$`, 'iu'));
/**
 * Strip a leading priority decoration (`🔴 `, `CRITICAL — `) from a
 * candidate headline. Two passes are made so a stacked decoration such
 * as `🔴 URGENT: HIGH — Title` is fully removed.
 *
 * A label word is only removed when a separator (`:` `—` `–` `-`)
 * follows it. A bare hyphen with no whitespace on either side is treated
 * as part of a hyphenated compound, so `Low-carbon steel` and
 * `High-risk AI systems` are left intact instead of being cut down to
 * `carbon steel` / `risk AI systems`.
 *
 * @param text - Candidate headline (already trimmed)
 * @returns Headline with the leading decoration removed
 */
function stripPriorityLeadingDecoration(text) {
    let out = text;
    for (let pass = 0; pass < 2; pass++) {
        // Drop leading emoji / punctuation before looking for a label word.
        out = out.replace(/^[^\p{L}\p{N}]+/u, '').trim();
        const lowered = out.toLowerCase();
        for (const token of PRIORITY_LABEL_TOKENS) {
            if (!lowered.startsWith(token.toLowerCase())) {
                continue;
            }
            const afterToken = out.slice(token.length);
            const sep = afterToken.trim().match(/^([:—–-])(\s*)(.+)$/u);
            if (!sep) {
                continue;
            }
            const [, mark, gapAfter, title] = sep;
            const gapBefore = /^\s/u.test(afterToken);
            if (mark === '-' && !gapBefore && gapAfter === '') {
                // `Low-carbon`, `High-risk`: a compound word, not a label.
                continue;
            }
            out = title.trim();
            break;
        }
    }
    return out;
}
/**
 * Strip a leading editorial prefix (`Trigger 1: `, `Dossier 2: `) and a
 * stray leading ordinal (`1. `, `2) `, `2.1 `) from a candidate headline.
 * Dotted ordinals are removed as a whole, so `2.1 Close to Adoption`
 * becomes `Close to Adoption` rather than `1 Close to Adoption`.
 *
 * @param text - Candidate headline
 * @returns Headline with the leading editorial decoration removed
 */
function stripPriorityLeadingPrefix(text) {
    let out = text;
    const lowered = out.toLowerCase();
    for (const token of PRIORITY_LEADING_PREFIX_TOKENS) {
        if (!lowered.startsWith(token.toLowerCase())) {
            continue;
        }
        // Keyword must be followed by whitespace, a number and a separator.
        const match = out.slice(token.length).match(/^\s+\d+\s*[:–—-]\s*(.+)$/u);
        if (match?.[1]) {
            out = match[1];
            break;
        }
    }
    return out.replace(/^\d+(?:\.\d+)*[.):·\s]\s*/u, '');
}
/**
 * Strip one trailing confidence marker (`🔴 CRITICAL`, `🟡 MEDIUM`) from
 * a candidate headline. Only the first matching marker is removed per
 * call; {@link cleanPriorityHeadline} invokes this inside a fixed-point
 * loop.
 *
 * @param text - Candidate headline
 * @returns Headline with the trailing confidence marker removed
 */
function stripPriorityTrailingMarker(text) {
    for (const pattern of PRIORITY_TRAILING_MARKER_PATTERNS) {
        const next = text.replace(pattern, '');
        if (next !== text) {
            return next;
        }
    }
    return text;
}
1945
/**
 * Normalise a priority-finding headline so it is fit for `<title>` use.
 *
 * Inline Markdown is removed first, then the leading decoration
 * ({@link stripPriorityLeadingDecoration}) and the leading
 * `Trigger N:` / ordinal prefix ({@link stripPriorityLeadingPrefix}).
 * The tail is then cleaned repeatedly — confidence marker, parenthesised
 * metadata, trailing symbol run, trailing colons / dashes — until a
 * round changes nothing, so combinations such as
 * `Title (Confidence, 80%): 🔴` collapse all the way down to `Title`.
 *
 * @param raw - Raw bold-title or heading text
 * @returns Cleaned headline (may be empty after stripping)
 */
function cleanPriorityHeadline(raw) {
    // One round of tail cleanup.
    const trimTail = (value) => stripPriorityTailMetadata(stripPriorityTrailingMarker(value))
        // Trailing symbol run (e.g. an emoji) left after metadata stripping.
        .replace(/\s+[^\p{L}\p{N}\s]+\s*$/u, '')
        // Trailing colons / dashes left over.
        .replace(/[\s:—–-]+$/u, '')
        .trim();
    const plain = stripInlineMarkdown(raw).trim();
    let headline = stripPriorityLeadingPrefix(stripPriorityLeadingDecoration(plain));
    let before;
    do {
        before = headline;
        headline = trimTail(before);
    } while (headline !== before);
    return headline;
}
1965
/**
 * Remove the parenthesised metadata that briefs append to a
 * priority-finding name — procedure codes, dates, committee tags.
 *
 * Only a group that closes the string is removed: the pattern is
 * anchored at the end, the group must hold 3–80 characters and may not
 * contain nested parentheses. Earlier groups on the line are untouched.
 * The result is trimmed on both sides.
 *
 * @param text - Headline or paragraph text
 * @returns Text with the trailing `(…)` group stripped
 */
function stripPriorityTailMetadata(text) {
    const trailingParens = /\s*\([^()]{3,80}\)\s*$/u;
    const withoutParens = text.replace(trailingParens, '');
    return withoutParens.trim();
}
936
1977
  /**
937
1978
  * Read an artefact file, skipping any SPDX HTML-comment header rows so the
938
1979
  * first-H1 / first-prose logic is never derailed by the REUSE preamble.
@@ -1309,49 +2350,73 @@ function resolveEditorialContent(opts) {
1309
2350
  }
1310
2351
  const summary = artefactSummary || aggregatedSummary;
1311
2352
  if (summary) {
1312
- return { headline: truncateTitle(summary), summary };
2353
+ // The H1 is generic (category-noun, bare-institutional, or
2354
+ // template-style) so we have to derive `<title>` from the BLUF/
2355
+ // lede paragraph. Extract the first complete sentence so the
2356
+ // resulting title is grammatically self-contained — falling back
2357
+ // to clause-boundary truncation downstream when the sentence
2358
+ // itself overruns TITLE_MAX_LENGTH.
2359
+ const firstSentence = extractFirstSentence(summary);
2360
+ return { headline: truncateTitle(firstSentence), summary };
1313
2361
  }
1314
2362
  return { headline: '', summary: '' };
1315
2363
  }
1316
2364
  /**
1317
- * Enrich a localized fallback title with the article-specific editorial
1318
- * headline so translated variants are not reduced to duplicate type/date
1319
- * templates when the source artifacts carry a real story.
2365
+ * Pick the per-language SEO title from the resolved editorial pair and
2366
+ * the localized template fallback. The decision tree mirrors the priority
2367
+ * ladder in the module header:
1320
2368
  *
1321
- * @param lang - Target language code
1322
- * @param fallbackTitle - Localized article-type fallback title
1323
- * @param editorialHeadline - Artifact-derived editorial headline
2369
+ * - When an editorial headline exists (either translated brief or
2370
+ * English brief / aggregated source), use it **verbatim** — no
2371
+ * concatenation with the localized type/date template. Concatenation
2372
+ * historically produced strings like
2373
+ * `Senaste Nytt: Betydande Parlamentariska Händelser — 2026-05-15 — Breaking News: EP April 2026 Plenary Outcomes`
2374
+ * which mix two languages in a single `<title>` and are blocked by
2375
+ * `scripts/validate-manifest-seo.js`'s `english-fallthrough` gate.
2376
+ * - When no editorial headline exists at all, fall back to the
2377
+ * localized type/date template plus a run qualifier so same-type pages
2378
+ * remain distinguishable.
2379
+ *
2380
+ * @param fallbackTitle - Localized article-type template title
2381
+ * @param editorialHeadline - Editorial headline (localized or English)
1324
2382
  * @param runId - Optional run id used only when no editorial headline exists
1325
2383
  * @returns SEO title candidate
1326
2384
  */
1327
- function composeContextualTitle(lang, fallbackTitle, editorialHeadline, runId) {
1328
- if (lang === 'en') {
1329
- return editorialHeadline || withRunQualifier(fallbackTitle, runId);
1330
- }
1331
- if (editorialHeadline) {
1332
- return `${fallbackTitle} — ${editorialHeadline}`;
1333
- }
2385
function composeContextualTitle(fallbackTitle, editorialHeadline, runId) {
    // An editorial headline is used verbatim; the localized template plus
    // run qualifier is only the last resort when no headline exists.
    return editorialHeadline || withRunQualifier(fallbackTitle, runId);
}
1336
2390
  /**
1337
- * Add localized article context, date, run id and evidence language to short
1338
- * meta descriptions. This turns generic type-level subtitles into
2391
+ * Add localized article context to short or duplicate-prone meta
2392
+ * descriptions. This turns generic type-level subtitles into
1339
2393
  * page-specific descriptions suitable for search snippets.
1340
2394
  *
2395
+ * Internal artefact identifiers (`runId`) are deliberately NOT included
2396
+ * in the description: they leak into Google snippets as opaque tokens
2397
+ * like `breaking-run255-1778894853` and provide no value to readers.
2398
+ * The verbose `evidence` boilerplate (`with source-linked voting,
2399
+ * committee and legislative intelligence`) is also dropped — it pads
2400
+ * bytes without adding editorial information and was the dominant
2401
+ * source of mid-sentence ellipsis truncation observed in production.
2402
+ *
2403
+ * The reader-hint suffix (`labels.reader`) is preserved because it
2404
+ * supplies a stable localized intent signal even when the lede is
2405
+ * very short.
2406
+ *
1341
2407
  * @param lang - Target language code
1342
2408
  * @param baseDescription - Best description from manifest/editorial/template
1343
2409
  * @param editorial - Artifact-derived headline and summary
1344
2410
  * @param editorial.headline - Artifact-derived headline
1345
2411
  * @param editorial.summary - Artifact-derived summary
1346
2412
  * @param date - ISO article date
1347
- * @param runId - Optional analysis run id
2413
+ * @param _runId - Reserved (formerly emitted; no longer used)
1348
2414
  * @returns Description in the target language context, capped for SEO snippets
1349
2415
  */
1350
- function composeContextualDescription(lang, baseDescription, editorial, date, runId) {
2416
+ function composeContextualDescription(lang, baseDescription, editorial, date, _runId) {
1351
2417
  const labels = getLocalizedString(SEO_CONTEXT_LABELS, lang);
1352
2418
  const parts = [baseDescription.trim()];
1353
- const runPart = runId ? ` · ${labels.run} ${runId}` : '';
1354
- parts.push(`${labels.date} ${date}${runPart}, ${labels.evidence}`);
2419
+ parts.push(`${labels.date} ${date}.`);
1355
2420
  const context = pickFirstNonEmpty([editorial.summary, editorial.headline]);
1356
2421
  if (context && !containsNormalized(parts[0] ?? '', context)) {
1357
2422
  parts.push(`${labels.context}: ${context}`);
@@ -1360,14 +2425,46 @@ function composeContextualDescription(lang, baseDescription, editorial, date, ru
1360
2425
  return truncateDescription(parts.join(' '));
1361
2426
  }
1362
2427
  /**
1363
- * Append a run qualifier to otherwise duplicate-prone fallback titles.
2428
+ * Append a short run qualifier to otherwise duplicate-prone fallback
2429
+ * titles. Sanitizes the raw `runId` (which is an internal artefact
2430
+ * identifier of the shape `<slug>-run<N>[-<unix-ts>]`) so user-facing
2431
+ * `<title>` strings never expose Unix timestamps or the full opaque
2432
+ * token. Only the short ordinal `N` is retained.
2433
+ *
2434
+ * Examples:
2435
+ * - `breaking-run255-1778894853` → `Run 255`
2436
+ * - `committee-reports-run330-1778735854` → `Run 330`
2437
+ * - `breaking-run-001` → `Run 001`
2438
+ *
2439
+ * When the runId does not match the canonical shape, the qualifier is
2440
+ * omitted entirely rather than leak an unknown-format token into SEO
2441
+ * surfaces.
1364
2442
  *
1365
2443
  * @param title - Base title
1366
- * @param runId - Optional run id
1367
- * @returns Title with run qualifier when available
2444
+ * @param runId - Optional run id (sanitized before use)
2445
+ * @returns Title with short run qualifier, or unchanged when sanitization fails
1368
2446
  */
1369
2447
  function withRunQualifier(title, runId) {
1370
- return runId ? `${title} — Run ${runId}` : title;
2448
+ if (!runId)
2449
+ return title;
2450
+ // Walk segments in order: find the first `run<digits>` token. The
2451
+ // runId shape is `<slug>-run<N>[-<unix-ts>]` — we explicitly avoid a
2452
+ // single regex with overlapping `\d+` groups, which the SonarJS
2453
+ // unsafe-regex rule flags as catastrophic-backtracking-prone.
2454
+ const segments = runId.split('-');
2455
+ for (const seg of segments) {
2456
+ const m = /^run(\d+)$/u.exec(seg);
2457
+ if (m)
2458
+ return `${title} — Run ${m[1]}`;
2459
+ const m2 = /^run$/u.exec(seg);
2460
+ if (m2) {
2461
+ const idx = segments.indexOf(seg);
2462
+ const next = segments[idx + 1];
2463
+ if (next && /^\d+$/u.test(next))
2464
+ return `${title} — Run ${next}`;
2465
+ }
2466
+ }
2467
+ return title;
1371
2468
  }
1372
2469
  /**
1373
2470
  * Case-insensitive containment check after whitespace normalization.
@@ -1449,35 +2546,23 @@ function dedupeKeywords(candidates) {
1449
2546
  */
1450
2547
  export function resolveArticleMetadata(opts) {
1451
2548
  const manifest = opts.manifest ?? {};
1452
- const editorial = resolveEditorialContent(opts);
2549
+ const englishEditorial = resolveEditorialContent(opts);
1453
2550
  const template = buildTemplateFallback(opts.articleType, opts.date, manifest.committee);
1454
2551
  const runId = manifest.runId?.trim() ?? '';
1455
2552
  const result = Object.create(null);
1456
2553
  for (const lang of ALL_LANGUAGES) {
1457
- const manifestTitle = manifestOverrideFor(manifest.title, lang);
1458
- const manifestDescription = manifestOverrideFor(manifest.description, lang);
1459
- const fallback = template[lang];
1460
- const contextualTitle = composeContextualTitle(lang, fallback.title, editorial.headline, runId);
1461
- const titleCandidates = [manifestTitle, contextualTitle, fallback.title];
1462
- const descCandidates = [
1463
- manifestDescription,
1464
- lang === 'en' ? editorial.summary : '',
1465
- fallback.subtitle,
1466
- ];
1467
- const title = pickFirstNonEmpty(titleCandidates) || fallback.title;
1468
- const rawDescription = pickFirstNonEmpty(descCandidates) || fallback.subtitle;
1469
- const description = rawDescription.length >= DESCRIPTION_MIN_LENGTH &&
1470
- containsNormalized(rawDescription, opts.date)
1471
- ? rawDescription
1472
- : composeContextualDescription(lang, rawDescription, editorial, opts.date, runId);
1473
- const truncatedTitle = truncateTitle(title);
1474
- const truncatedDescription = truncateDescription(description);
2554
+ const entry = resolveOneLanguage({
2555
+ lang,
2556
+ manifest,
2557
+ englishEditorial,
2558
+ template: template[lang],
2559
+ runDir: opts.runDir,
2560
+ articleType: opts.articleType,
2561
+ date: opts.date,
2562
+ runId,
2563
+ });
1475
2564
  Object.defineProperty(result, lang, {
1476
- value: {
1477
- title: truncatedTitle,
1478
- description: truncatedDescription,
1479
- keywords: buildSeoKeywords(lang, opts.articleType, opts.date, runId, truncatedTitle, truncatedDescription),
1480
- },
2565
+ value: entry,
1481
2566
  enumerable: true,
1482
2567
  writable: true,
1483
2568
  configurable: true,
@@ -1485,6 +2570,96 @@ export function resolveArticleMetadata(opts) {
1485
2570
  }
1486
2571
  return result;
1487
2572
  }
2573
+ /**
2574
+ * Resolve `{title, description, keywords, source}` for one language. The
2575
+ * priority ladder is:
2576
+ *
2577
+ * 1. manifest override (per-language wins, then string fall-through)
2578
+ * 2. localized executive brief (`executive-brief_<lang>.md`) headline +
2579
+ * summary — only for non-English `<lang>`
2580
+ * 3. English executive brief / aggregated editorial — verbatim for
2581
+ * non-English locales that have no translated brief yet, so the
2582
+ * SEO surfaces never collapse to a boring type/date template while a
2583
+ * real editorial highlight exists
2584
+ * 4. localized template fallback
2585
+ *
2586
+ * @param input - Per-language inputs
2587
+ * @returns One resolved metadata entry
2588
+ */
2589
+ function resolveOneLanguage(input) {
2590
+ const manifestTitle = manifestOverrideFor(input.manifest.title, input.lang);
2591
+ const manifestDescription = manifestOverrideFor(input.manifest.description, input.lang);
2592
+ const perLanguage = resolvePerLanguageEditorial(input);
2593
+ const editorial = perLanguage.editorial;
2594
+ const contextualTitle = composeContextualTitle(input.template.title, editorial.headline, input.runId);
2595
+ const title = pickFirstNonEmpty([manifestTitle, contextualTitle, input.template.title]);
2596
+ const rawDescription = pickFirstNonEmpty([
2597
+ manifestDescription,
2598
+ editorial.summary,
2599
+ input.template.subtitle,
2600
+ ]);
2601
+ const description = rawDescription.length >= ENRICHMENT_TRIGGER_LENGTH
2602
+ ? rawDescription
2603
+ : composeContextualDescription(input.lang, rawDescription, editorial, input.date, input.runId);
2604
+ const truncatedTitle = truncateTitle(title);
2605
+ const truncatedDescription = truncateDescription(description);
2606
+ const source = manifestTitle || manifestDescription ? 'manifest' : perLanguage.source;
2607
+ return {
2608
+ title: truncatedTitle,
2609
+ description: truncatedDescription,
2610
+ keywords: buildSeoKeywords(input.lang, input.articleType, input.date, input.runId, truncatedTitle, truncatedDescription),
2611
+ source,
2612
+ };
2613
+ }
2614
+ /**
2615
+ * Select the editorial `{headline, summary}` pair for one language,
2616
+ * preferring the translated `executive-brief_<lang>.md` over the English
2617
+ * brief. Records which tier provided the content so the caller can wire
2618
+ * up the editorial fallback note and the manifest-SEO validator without
2619
+ * re-scanning the run directory.
2620
+ *
2621
+ * - For `lang === 'en'`: always returns the English `englishEditorial`
2622
+ * pair (whose source is the canonical English brief / aggregated
2623
+ * Markdown / artefact ladder in {@link resolveEditorialContent}).
2624
+ * - For non-English `<lang>`: probes `runDir` for
2625
+ * `executive-brief_<lang>.md` (and the `extended/` sibling) and
2626
+ * prefers its headline + lede. Falls through to the English editorial
2627
+ * when no translated brief exists.
2628
+ *
2629
+ * @param input - Per-language inputs
2630
+ * @returns Editorial pair plus the tier that produced it
2631
+ */
2632
+ function resolvePerLanguageEditorial(input) {
2633
+ if (input.lang !== 'en' && input.runDir) {
2634
+ const localized = resolveLocalizedBriefHighlight(input.runDir, input.lang, input.articleType, input.date);
2635
+ if (localized && (localized.headline || localized.summary)) {
2636
+ // Prefer the localized headline; if missing, the title falls back
2637
+ // through {@link composeContextualTitle}'s
2638
+ // `editorialHeadline || fallbackTitle` path while the localized
2639
+ // summary still feeds the description.
2640
+ return {
2641
+ editorial: {
2642
+ headline: localized.headline,
2643
+ summary: localized.summary,
2644
+ },
2645
+ source: 'localized-brief',
2646
+ };
2647
+ }
2648
+ }
2649
+ // No localized brief — fall through to the English editorial pair.
2650
+ if (input.englishEditorial.headline || input.englishEditorial.summary) {
2651
+ return {
2652
+ editorial: input.englishEditorial,
2653
+ source: input.lang === 'en' ? 'english-editorial' : 'english-brief',
2654
+ };
2655
+ }
2656
+ // Nothing editorial at all → caller will fall back to the localized
2657
+ // template.
2658
+ return {
2659
+ editorial: { headline: '', summary: '' },
2660
+ source: 'template',
2661
+ };
2662
+ }
1488
2663
  /**
1489
2664
  * Return the first non-empty, trimmed entry from a candidate list, or
1490
2665
  * the empty string when every entry is blank.