euparliamentmonitor 0.9.12 → 0.9.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -3
- package/scripts/aggregator/analysis-aggregator.js +7 -2
- package/scripts/aggregator/article-generator.js +2 -1
- package/scripts/aggregator/article-metadata.d.ts +90 -18
- package/scripts/aggregator/article-metadata.js +1290 -115
- package/scripts/aggregator/editorial-brief-resolver.d.ts +67 -0
- package/scripts/aggregator/editorial-brief-resolver.js +218 -0
- package/scripts/discover-untranslated-briefs.js +329 -0
- package/scripts/generators/news-indexes.d.ts +28 -0
- package/scripts/generators/news-indexes.js +110 -20
- package/scripts/templates/sync-template-frontmatter.js +4 -4
- package/scripts/validate-brief-translations.js +517 -0
- package/scripts/validate-manifest-seo.js +581 -0
|
@@ -9,48 +9,82 @@
|
|
|
9
9
|
* published article carry a unique, content-reflective headline and
|
|
10
10
|
* description in every language variant.
|
|
11
11
|
*
|
|
12
|
-
* Priority ladder (per language, highest wins)
|
|
12
|
+
* Priority ladder (per language, highest wins) — matches the editorial
|
|
13
|
+
* contract documented in
|
|
14
|
+
* [`.github/prompts/04-article-generation.md`](../../.github/prompts/04-article-generation.md) § 6.2:
|
|
13
15
|
*
|
|
14
16
|
* 1. **Manifest override** — `manifest.title` / `manifest.description` on
|
|
15
17
|
* the analysis-run manifest, either as a plain string (applied to every
|
|
16
18
|
* language) or a `LanguageMap<string>` object for explicit per-language
|
|
17
|
-
* values.
|
|
18
|
-
*
|
|
19
|
-
*
|
|
19
|
+
* values.
|
|
20
|
+
* 2. **Localized executive brief** — for non-English `<lang>`, the
|
|
21
|
+
* translated sibling `executive-brief_<lang>.md` (or
|
|
22
|
+
* `extended/executive-brief_<lang>.md`) under the run directory.
|
|
23
|
+
* Resolved via `editorial-brief-resolver.ts`. This is the authoritative
|
|
24
|
+
* localized source produced by the `news-translate` workflow.
|
|
25
|
+
* 3. **English executive brief, verbatim** — the English brief
|
|
26
|
+
* (`executive-brief.md` / `extended/executive-brief.md`) used as a
|
|
27
|
+
* fall-through when a locale has no translated brief yet. Recorded in
|
|
28
|
+
* `metadataFallback[<lang>] = "en"` so editors can audit which locales
|
|
29
|
+
* fell through.
|
|
30
|
+
* 4. **Artefact editorial H1** — first `# …` heading from the first
|
|
20
31
|
* substantive artefact under the run directory (e.g.
|
|
21
32
|
* `intelligence/synthesis-summary.md`, `breaking-news-analysis.md`).
|
|
22
33
|
* Accepted only when the heading is not a generic
|
|
23
34
|
* `${humanize(articleType)} — ${date}` form.
|
|
24
|
-
*
|
|
25
|
-
* output, accepted under the same non-generic rule.
|
|
26
|
-
*
|
|
27
|
-
*
|
|
28
|
-
*
|
|
29
|
-
*
|
|
30
|
-
*
|
|
31
|
-
*
|
|
32
|
-
*
|
|
33
|
-
*
|
|
34
|
-
*
|
|
35
|
-
*
|
|
36
|
-
*
|
|
37
|
-
* Artifact-derived highlights (tiers 2–4) are used as page-specific
|
|
38
|
-
* context across all 14 variants: English can use them directly, while
|
|
39
|
-
* non-English variants keep the localized article-type template and append
|
|
40
|
-
* the editorial topic/summary. This prevents duplicate metadata across
|
|
41
|
-
* same-type pages while keeping the surrounding snippet language-specific
|
|
42
|
-
* until full per-language body translations are present.
|
|
35
|
+
* 5. **Aggregated-markdown H1** — the first `# …` heading in the aggregator
|
|
36
|
+
* output, accepted under the same non-generic rule.
|
|
37
|
+
* 6. **First strong prose paragraph** — the first line of the aggregated
|
|
38
|
+
* Markdown that survives {@link shouldSkipDescriptionLine}.
|
|
39
|
+
* 7. **Localized template** — the per-article-type `*_TITLES` generator
|
|
40
|
+
* from `src/constants/language-articles.ts`. Last resort.
|
|
41
|
+
*
|
|
42
|
+
* Tiers 2–6 produce the same shape ({headline, summary}); the resolver
|
|
43
|
+
* picks the highest-available tier per language. When a localized brief
|
|
44
|
+
* (tier 2) is present, the headline replaces the localized template
|
|
45
|
+
* verbatim — no concatenation. Locales without a translated brief inherit
|
|
46
|
+
* the English brief content (tier 3) so SEO surfaces never fall back to
|
|
47
|
+
* boring type-level templates while real editorial content exists.
|
|
43
48
|
*/
|
|
44
49
|
import fs from 'fs';
|
|
45
50
|
import path from 'path';
|
|
46
51
|
import { ALL_LANGUAGES, getLocalizedString } from '../constants/language-core.js';
|
|
47
52
|
import { BREAKING_NEWS_TITLES, COMMITTEE_REPORTS_TITLES, ELECTION_CYCLE_TITLES, LOCALIZED_KEYWORDS, MONTH_AHEAD_TITLES, MONTHLY_REVIEW_TITLES, MOTIONS_TITLES, PROPOSITIONS_TITLES, QUARTER_AHEAD_TITLES, QUARTER_IN_REVIEW_TITLES, TERM_OUTLOOK_TITLES, WEEK_AHEAD_TITLES, WEEKLY_REVIEW_TITLES, YEAR_AHEAD_TITLES, YEAR_IN_REVIEW_TITLES, } from '../constants/language-articles.js';
|
|
53
|
+
import { resolveLocalizedBriefHighlight } from './editorial-brief-resolver.js';
|
|
48
54
|
/** Maximum `<meta description>` length we will emit. */
|
|
49
55
|
const DESCRIPTION_MAX_LENGTH = 180;
|
|
50
56
|
/** Target minimum `<meta description>` length before we append context. */
|
|
51
57
|
const DESCRIPTION_MIN_LENGTH = 140;
|
|
58
|
+
/**
|
|
59
|
+
* Length below which a raw description is considered too short to stand
|
|
60
|
+
* on its own and gets enriched with date/context. Independent from
|
|
61
|
+
* {@link DESCRIPTION_MIN_LENGTH} (which controls sentence-boundary
|
|
62
|
+
* truncation behaviour). Set lower than DESCRIPTION_MIN_LENGTH so a
|
|
63
|
+
* clean 100-140 char prose lede is preserved verbatim instead of being
|
|
64
|
+
* padded with date/context boilerplate.
|
|
65
|
+
*/
|
|
66
|
+
const ENRICHMENT_TRIGGER_LENGTH = 100;
|
|
52
67
|
/** Maximum `<title>` length — anything longer is truncated with an ellipsis. */
|
|
53
68
|
const TITLE_MAX_LENGTH = 140;
|
|
69
|
+
/**
|
|
70
|
+
* Soft target for headline-style titles produced as a fallback from
|
|
71
|
+
* BLUF/lede prose. When the candidate exceeds `TITLE_MAX_LENGTH`, the
|
|
72
|
+
* truncator first looks for a natural clause boundary
|
|
73
|
+
* (`.`, `:`, `—`, `;`) inside the `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]`
|
|
74
|
+
* window and breaks there instead of mid-clause-with-ellipsis. This
|
|
75
|
+
* turns a 137-character truncated prose paragraph into a complete
|
|
76
|
+
* journalistic clause, which scans much better in news cards and SERP
|
|
77
|
+
* snippets without sacrificing the keyword-rich opening.
|
|
78
|
+
*/
|
|
79
|
+
const HEADLINE_SOFT_MIN = 60;
|
|
80
|
+
/**
|
|
81
|
+
* Punctuation marks that signal a natural clause boundary inside a
|
|
82
|
+
* BLUF / lede paragraph. Listed in preferred-break order: a colon or
|
|
83
|
+
* em-dash that introduces a list of consequences is the best break,
|
|
84
|
+
* full stops are next, and semicolons last. Single ASCII space is
|
|
85
|
+
* always a fallback boundary handled separately.
|
|
86
|
+
*/
|
|
87
|
+
const HEADLINE_CLAUSE_BOUNDARIES = [': ', ' — ', ' – ', '. ', '; '];
|
|
54
88
|
/** Localized labels used to enrich short or duplicate-prone meta descriptions. */
|
|
55
89
|
const SEO_CONTEXT_LABELS = {
|
|
56
90
|
en: {
|
|
@@ -236,6 +270,8 @@ const ARTIFACT_CATEGORY_PREFIXES = [
|
|
|
236
270
|
'economic context',
|
|
237
271
|
'executive brief',
|
|
238
272
|
'executive briefing',
|
|
273
|
+
'executive intelligence brief',
|
|
274
|
+
'executive intelligence briefing',
|
|
239
275
|
'executive summary',
|
|
240
276
|
'forward indicators',
|
|
241
277
|
'historical baseline',
|
|
@@ -393,8 +429,37 @@ export function shouldSkipDescriptionLine(line) {
|
|
|
393
429
|
}
|
|
394
430
|
if (/^[-*_=~.]{3,}$/.test(line))
|
|
395
431
|
return true;
|
|
432
|
+
if (isLocalizedBannerRow(line))
|
|
433
|
+
return true;
|
|
396
434
|
return false;
|
|
397
435
|
}
|
|
436
|
+
/**
 * Language-agnostic banner-row detector. Stage-B artefacts open with a
 * metadata banner of the shape
 * `**Date:** 2026-05-15 | **Type:** Breaking | **Run:** breaking-run-001`
 * and localized siblings of it. Some locales (notably CJK briefs) write
 * the full-width colon U+FF1A instead of the ASCII colon, and may place it
 * inside the bold span rather than after it.
 *
 * Instead of relying on a vocabulary table, this helper matches the
 * structural shape: the line starts with `**`, contains at least one `|`
 * separator, and carries two or more bold key markers whose colon (ASCII
 * `:` or full-width U+FF1A) sits either just inside the closing `**` or
 * directly after it.
 *
 * @param line - Trimmed source line
 * @returns `true` when the line is a banner row in any locale
 */
function isLocalizedBannerRow(line) {
    if (!line.startsWith('**') || !line.includes('|')) {
        return false;
    }
    // Colon inside the bold span, e.g. `**Date:**`. The full-width colon is
    // written as an escape so the pattern survives encoding normalisation.
    const colonInside = line.match(/\*\*[^*]+[:\uFF1A]\s*\*\*/g) ?? [];
    // Colon immediately after the bold span, e.g. `**Date**:`.
    const colonAfter = line.match(/\*\*[^*]+\*\*\s*[:\uFF1A]/g) ?? [];
    return colonInside.length + colonAfter.length >= 2;
}
|
|
398
463
|
/**
|
|
399
464
|
* Strip inline Markdown decorations so we can use the remaining text as
|
|
400
465
|
* plain-text meta-tag content. Removes link syntax, emphasis, inline code
|
|
@@ -544,12 +609,141 @@ export function truncateDescription(text) {
|
|
|
544
609
|
export function truncateTitle(text) {
    // Titles that already fit are returned untouched.
    if (text.length <= TITLE_MAX_LENGTH) {
        return text;
    }
    // First choice: end on a natural clause boundary. Boundaries are tried
    // in the priority order of HEADLINE_CLAUSE_BOUNDARIES; for each one the
    // LAST occurrence inside the first TITLE_MAX_LENGTH characters is used,
    // provided it sits at or beyond HEADLINE_SOFT_MIN. No ellipsis is added
    // on this path.
    const head = text.slice(0, TITLE_MAX_LENGTH);
    for (const mark of HEADLINE_CLAUSE_BOUNDARIES) {
        const at = head.lastIndexOf(mark);
        if (at < HEADLINE_SOFT_MIN) {
            continue;
        }
        const clause = stripTrailingStopWordsAndPunctuation(text.slice(0, at));
        // Cleaning may shorten the clause below the soft minimum; if so,
        // move on to the next boundary type.
        if (clause.length >= HEADLINE_SOFT_MIN) {
            return clause;
        }
    }
    // Fallback: hard cut one character short of the limit (room for the
    // ellipsis), backing up to the last space only when that space lies
    // within the final 40 characters of the limit.
    const hardCut = text.slice(0, TITLE_MAX_LENGTH - 1);
    const spaceAt = hardCut.lastIndexOf(' ');
    const wordSafe = spaceAt > TITLE_MAX_LENGTH - 40 ? hardCut.slice(0, spaceAt) : hardCut;
    return `${stripTrailingStopWordsAndPunctuation(wordSafe)}…`;
}
|
|
633
|
+
/**
 * Return the first complete sentence from a prose paragraph, suitable
 * for use as a fallback editorial title when the artefact H1 is
 * categorical (e.g. `# EU Parliament Committee Reports`) and the
 * resolver must derive `<title>` from the BLUF / lede summary instead.
 *
 * A "sentence" is the prefix up to and including the earliest sentence
 * terminator (`. `, `! `, `? `, `; `) found at index `HEADLINE_SOFT_MIN`
 * or later within the first `floor(TITLE_MAX_LENGTH * 1.5)` characters.
 * A `. ` that {@link isAbbreviationBoundary} identifies as part of an
 * abbreviation is skipped. The abbreviation check applies to periods
 * only: `!`, `?` and `;` never belong to an abbreviation, so a clause
 * such as `... Plan B; ...` still terminates at the semicolon.
 *
 * Paragraphs no longer than `HEADLINE_SOFT_MIN`, and paragraphs with no
 * acceptable terminator in the search window, are returned whole
 * (trimmed) so {@link truncateTitle} can apply clause-boundary
 * truncation downstream. The returned sentence may itself exceed
 * `TITLE_MAX_LENGTH`; callers are expected to pass it through
 * `truncateTitle`.
 *
 * @param paragraph - Prose paragraph (post-{@link stripInlineMarkdown})
 * @returns First sentence including its terminator character, or the
 *   trimmed paragraph when no sentence boundary qualifies
 */
export function extractFirstSentence(paragraph) {
    const trimmed = paragraph.trim();
    if (trimmed.length <= HEADLINE_SOFT_MIN) {
        return trimmed;
    }
    // Cap the scan at 1.5x the title limit: beyond that it is better to let
    // truncateTitle clause-truncate the paragraph than to return an
    // over-long "first sentence".
    const searchWindow = trimmed.slice(0, Math.floor(TITLE_MAX_LENGTH * 1.5));
    const terminators = ['. ', '! ', '? ', '; '];
    let bestIdx = -1;
    for (const terminator of terminators) {
        let from = HEADLINE_SOFT_MIN;
        let idx = searchWindow.indexOf(terminator, from);
        while (idx !== -1) {
            // Only a period can close an abbreviation; other terminators
            // are always genuine boundaries.
            const isAbbreviation = terminator === '. ' && isAbbreviationBoundary(searchWindow, idx);
            if (!isAbbreviation) {
                if (bestIdx === -1 || idx < bestIdx) {
                    bestIdx = idx;
                }
                break;
            }
            from = idx + terminator.length;
            idx = searchWindow.indexOf(terminator, from);
        }
    }
    if (bestIdx === -1) {
        return trimmed;
    }
    // Keep the terminator character itself, drop the following space.
    return trimmed.slice(0, bestIdx + 1).trim();
}
|
|
690
|
+
/**
 * Abbreviation tokens (lowercase, including the trailing period) that
 * should NOT count as sentence terminators when a `.` boundary is being
 * evaluated by {@link isAbbreviationBoundary}. Single-letter all-caps
 * initials (`U.S.`, `E.U.`) are not listed here; they are recognised by
 * the capital-initial check inside {@link isAbbreviationBoundary}.
 */
const ABBREVIATION_PREFIXES = [
    'mr.',
    'mrs.',
    'ms.',
    'dr.',
    'st.',
    'no.',
    'vs.',
    'e.g.',
    'i.e.',
    'etc.',
    'cf.',
    'al.',
    // Quarter / half-year / fiscal-year shorthand: Q1., Q2., Q3., Q4., H1., H2., FY.
    'q1.',
    'q2.',
    'q3.',
    'q4.',
    'h1.',
    'h2.',
    'fy.',
];
/**
 * Decide whether the `.` at `idx` in `text` closes an abbreviation
 * rather than a sentence.
 *
 * Two rules are applied:
 *  1. Capital initial: the character before the period is an ASCII
 *     capital letter that is itself at the start of the string or
 *     preceded by a space or another period (`U.S.`, `E.U.`, `J. Smith`).
 *  2. Table lookup: the token ending at the period, lower-cased, is
 *     listed in {@link ABBREVIATION_PREFIXES}. The token is delimited by
 *     scanning backwards over ASCII letters, digits and periods, so
 *     entries that contain a digit (`q1.`) or an inner period (`e.g.`)
 *     are matched as whole tokens.
 *
 * @param text - Source text in its original casing
 * @param idx - Index of the `.` character in `text`
 * @returns `true` when the period at `idx` is part of an abbreviation
 */
function isAbbreviationBoundary(text, idx) {
    if (idx >= 1) {
        const prev = text.charCodeAt(idx - 1);
        const isUpperLetter = prev >= 65 && prev <= 90;
        if (isUpperLetter && (idx === 1 || text[idx - 2] === ' ' || text[idx - 2] === '.')) {
            return true;
        }
    }
    // Walk back to the start of the token. Digits and periods must be part
    // of the token, otherwise `Q1.` collapses to `.` and `e.g.` to `g.`,
    // neither of which is in the table.
    let start = idx;
    while (start > 0 && /[a-zA-Z0-9.]/.test(text[start - 1])) {
        start--;
    }
    const token = text.slice(start, idx + 1).toLowerCase();
    return ABBREVIATION_PREFIXES.includes(token);
}
|
|
553
747
|
/**
|
|
554
748
|
* Return the first Markdown H1 (`# …`) in the supplied text, stripped of
|
|
555
749
|
* the leading `#` and trailing anchor syntax. Returns an empty string when
|
|
@@ -573,15 +767,48 @@ export function extractFirstH1(markdown) {
|
|
|
573
767
|
return '';
|
|
574
768
|
}
|
|
575
769
|
/**
 * Feed one trimmed Markdown line into the paragraph buffer shared by the
 * prose extractors and report what the calling loop should do next.
 *
 * - `'continue'`: the line was ignored (blank or skippable before any
 *   prose was buffered, or a first line shorter than 40 characters after
 *   decoration stripping).
 * - `'break'`: a blank or skippable line arrived after prose was already
 *   buffered, i.e. the paragraph has ended.
 * - `'collected'`: the cleaned line was appended to `buf.lines` and
 *   `buf.byteCount` grew by its length plus one (for the joining space).
 *   Enforcing a size cap is left to the caller.
 *
 * @param line - Trimmed Markdown line
 * @param buf - In-progress paragraph buffer (mutated on `'collected'`)
 * @returns Loop control directive
 */
function collectProseLine(line, buf) {
    const paragraphStarted = buf.lines.length > 0;
    // Blank and skippable lines behave the same way: they end a started
    // paragraph and are silently passed over otherwise.
    if (line === '' || shouldSkipDescriptionLine(line)) {
        return paragraphStarted ? 'break' : 'continue';
    }
    const cleaned = stripLeadingProseLabel(stripInlineMarkdown(line));
    // The opening line must be substantial; continuation lines may be short.
    if (cleaned.length < 40 && !paragraphStarted) {
        return 'continue';
    }
    buf.lines.push(cleaned);
    buf.byteCount += cleaned.length + 1;
    return 'collected';
}
|
|
796
|
+
/**
|
|
797
|
+
* Walk every line of the Markdown source and return the first paragraph
|
|
798
|
+
* that survives {@link shouldSkipDescriptionLine}. Consecutive non-blank
|
|
799
|
+
* prose lines are joined with a single space so hard-wrapped ledes
|
|
800
|
+
* (column-95 conventional wrap) produce a clean 140-180-character
|
|
801
|
+
* description rather than just the first 60-90-char line.
|
|
802
|
+
*
|
|
803
|
+
* Inline Markdown decorations are stripped and the result is truncated
|
|
804
|
+
* to fit `<meta description>`.
|
|
579
805
|
*
|
|
580
806
|
* @param markdown - Markdown source
|
|
581
807
|
* @returns Prose description, or empty string when nothing qualifies
|
|
582
808
|
*/
|
|
583
809
|
export function extractStrongProseLine(markdown) {
|
|
584
810
|
let inFence = false;
|
|
811
|
+
const buf = { lines: [], byteCount: 0 };
|
|
585
812
|
for (const raw of markdown.split('\n')) {
|
|
586
813
|
const line = raw.trim();
|
|
587
814
|
if (line.startsWith('```') || line.startsWith('~~~')) {
|
|
@@ -590,58 +817,93 @@ export function extractStrongProseLine(markdown) {
|
|
|
590
817
|
}
|
|
591
818
|
if (inFence)
|
|
592
819
|
continue;
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
const plain = stripLeadingProseLabel(stripInlineMarkdown(line));
|
|
596
|
-
if (plain.length < 40)
|
|
820
|
+
const directive = collectProseLine(line, buf);
|
|
821
|
+
if (directive === 'continue')
|
|
597
822
|
continue;
|
|
598
|
-
|
|
823
|
+
if (directive === 'break')
|
|
824
|
+
break;
|
|
825
|
+
if (buf.byteCount >= DESCRIPTION_MAX_LENGTH)
|
|
826
|
+
break;
|
|
599
827
|
}
|
|
600
|
-
|
|
828
|
+
if (buf.lines.length === 0)
|
|
829
|
+
return '';
|
|
830
|
+
return truncateDescription(buf.lines.join(' '));
|
|
601
831
|
}
|
|
602
832
|
/**
 * Classify one Markdown line for the {@link extractLedeAfterHeading}
 * walker. The returned directive is then applied to walker state by
 * {@link applyLedeDirective}.
 *
 * Directive kinds:
 * - `fence`: the line opens or closes a fenced block (three backticks or
 *   `~~~`).
 * - `pause`: the line must be ignored (inside a fence, or outside any
 *   lede section).
 * - `heading`: the line is a level-2 or level-3 heading. `inLede` says
 *   whether it matches {@link EDITORIAL_LEDE_HEADINGS}. When prose has
 *   already been buffered, the heading is reported with `inLede: false`
 *   without consulting the whitelist: {@link applyLedeDirective} stops
 *   the walk on a heading in that situation, so the lede cannot absorb
 *   text from the following section.
 * - `collect`: the line is body text inside a lede section.
 *
 * @param line - Trimmed Markdown line
 * @param isInFence - True when the walker is currently inside a fenced block
 * @param inLede - True when the walker is currently inside a lede heading block
 * @param hasBuffered - True when at least one prose line has been collected
 * @returns Directive describing how the walker should treat this line
 */
function classifyLedeLine(line, isInFence, inLede, hasBuffered) {
    if (line.startsWith('```') || line.startsWith('~~~')) {
        return { kind: 'fence' };
    }
    if (isInFence) {
        return { kind: 'pause' };
    }
    const headingPrefix = /^#{2,3}\s+/;
    if (headingPrefix.test(line)) {
        // A new section heading after buffered prose ends the lede. Report
        // it as a heading (not a pause) so the walker can terminate.
        if (hasBuffered) {
            return { kind: 'heading', inLede: false };
        }
        const headingText = normaliseHeadingText(line.replace(headingPrefix, ''));
        const match = EDITORIAL_LEDE_HEADINGS.some((h) => isLedeHeadingMatch(headingText, h));
        return { kind: 'heading', inLede: match };
    }
    return inLede ? { kind: 'collect' } : { kind: 'pause' };
}
|
|
857
|
+
/**
 * Translate a directive produced by {@link classifyLedeLine} into a loop
 * action for the lede walker, updating `state` where the directive calls
 * for it.
 *
 * - `fence` flips `state.inFence` and yields `'continue'`.
 * - `heading` yields `'break'` when prose is already buffered; otherwise
 *   it copies `directive.inLede` into `state.inLede` and yields
 *   `'continue'`.
 * - `pause` yields `'continue'`.
 * - Any other kind yields `'collect'`, telling the caller to run
 *   {@link collectProseLine} on the line.
 *
 * @param directive - Classification of the current line
 * @param state - Walk state (mutated in place)
 * @param state.inFence - True when the walker is inside a fenced block
 * @param state.inLede - True when the walker is inside a lede heading block
 * @param hasBuffered - Whether any prose has already been collected
 * @returns Loop control directive
 */
function applyLedeDirective(directive, state, hasBuffered) {
    switch (directive.kind) {
        case 'fence':
            state.inFence = !state.inFence;
            return 'continue';
        case 'heading':
            if (hasBuffered) {
                return 'break';
            }
            state.inLede = directive.inLede;
            return 'continue';
        case 'pause':
            return 'continue';
        default:
            return 'collect';
    }
}
|
|
617
885
|
export function extractLedeAfterHeading(markdown) {
    // Walker state: fence tracking plus whether we are below a lede heading.
    const state = { inFence: false, inLede: false };
    // Paragraph accumulator shared with collectProseLine.
    const buf = { lines: [], byteCount: 0 };
    for (const raw of markdown.split('\n')) {
        const line = raw.trim();
        const hasBuffered = buf.lines.length > 0;
        // Step 1: classify the line, then let the directive update state.
        const directive = classifyLedeLine(line, state.inFence, state.inLede, hasBuffered);
        const action = applyLedeDirective(directive, state, hasBuffered);
        if (action === 'break') {
            break;
        }
        if (action === 'continue') {
            continue;
        }
        // Step 2: the line is lede body text; try to buffer it.
        const outcome = collectProseLine(line, buf);
        if (outcome === 'continue') {
            continue;
        }
        // Stop when the paragraph ended or enough text has been gathered
        // to fill a meta description.
        if (outcome === 'break' || buf.byteCount >= DESCRIPTION_MAX_LENGTH) {
            break;
        }
    }
    return buf.lines.length === 0 ? '' : truncateDescription(buf.lines.join(' '));
}
|
|
646
908
|
/**
|
|
647
909
|
* Normalise a Markdown heading's text for comparison against the
|
|
@@ -660,6 +922,32 @@ function normaliseHeadingText(raw) {
|
|
|
660
922
|
.trim()
|
|
661
923
|
.toLowerCase();
|
|
662
924
|
}
|
|
925
|
+
/**
 * Test whether a normalised heading begins with an editorial-lede
 * whitelist entry at a word boundary.
 *
 * The heading matches when it equals the entry, or when the character
 * right after the entry is anything other than an ASCII lower-case
 * letter or digit. That accepts ASCII and full-width brackets, dashes,
 * colons and spaces (`bluf (bottom line up front)`, `bluf: summary`,
 * `60-second read - what happened`) while rejecting longer words that
 * merely share the prefix (`bluffing`).
 *
 * @param headingText - Normalised heading text (lower-case, decoration-stripped)
 * @param whitelistEntry - Lower-case whitelist entry from
 *   {@link EDITORIAL_LEDE_HEADINGS}
 * @returns `true` when `headingText` begins with `whitelistEntry` at a
 *   word boundary
 */
function isLedeHeadingMatch(headingText, whitelistEntry) {
    if (!headingText.startsWith(whitelistEntry)) {
        return false;
    }
    // Empty string means the heading ends exactly where the entry ends.
    const follower = headingText.charAt(whitelistEntry.length);
    if (follower === '') {
        return true;
    }
    return !/[a-z0-9]/.test(follower);
}
|
|
663
951
|
/**
|
|
664
952
|
* Return `true` when an artefact-H1 begins with one of the
|
|
665
953
|
* `ARTIFACT_CATEGORY_PREFIXES` followed by a separator. Such H1s
|
|
@@ -830,8 +1118,151 @@ export function isGenericHeading(heading, articleType, date) {
|
|
|
830
1118
|
if (trailingDateOnly.test(normalized)) {
|
|
831
1119
|
return true;
|
|
832
1120
|
}
|
|
1121
|
+
if (isCategoryNounHeading(normalized, articleType))
|
|
1122
|
+
return true;
|
|
1123
|
+
if (isBareInstitutionalHeading(normalized))
|
|
1124
|
+
return true;
|
|
833
1125
|
return false;
|
|
834
1126
|
}
|
|
1127
|
+
/**
 * Lower-cased institutional self-references that sometimes end up as the
 * H1 of an executive brief when no real headline was substituted. They
 * name the publisher or institution but carry no editorial information,
 * so surfacing one would yield a title such as `EU Parliament`.
 *
 * Entries are compared by {@link isBareInstitutionalHeading} after the
 * heading has been lower-cased and had its trailing qualifiers removed.
 */
const BARE_INSTITUTIONAL_HEADINGS = [
    'eu parliament',
    'european parliament',
    'the european parliament',
    'ep',
    'ep10',
    'ep11',
    'hack23',
    'hack23 ab',
    'eu parliament monitor',
    'european parliament monitor',
    'executive brief',
    'briefing',
    'intelligence brief',
    'intelligence briefing',
];
/**
 * Return `true` when the heading is one of
 * {@link BARE_INSTITUTIONAL_HEADINGS}, i.e. institutional boilerplate
 * with no editorial content.
 *
 * Before the lookup the heading is lower-cased and, in this order, has
 * the following trailing qualifiers removed:
 *  1. a separator plus single ISO date (`— 2026-05-15`, `· 2026-05-15`),
 *  2. a parenthesised month and year (`(may 2026)`),
 *  3. a parenthesised year (`(2026)`),
 *  4. leftover whitespace, dashes, colons, middle dots and periods.
 *
 * Anything else after the institutional noun, including a date range
 * such as `: 19–22 May 2026`, is left in place, so
 * `EU Parliament Week Ahead: 19–22 May 2026` is not flagged here.
 *
 * @param normalized - Heading text after whitespace collapse
 * @returns `true` when the heading is bare institutional boilerplate
 */
function isBareInstitutionalHeading(normalized) {
    // Applied sequentially; order matters because each pattern is anchored
    // to the end of whatever the previous one left behind.
    const trailingQualifiers = [
        /\s*[·:—–-]\s*\d{4}-\d{2}-\d{2}\s*$/u,
        /\s*\(\s*[a-z]{3,9}\s+\d{4}\s*\)\s*$/u,
        /\s*\(\s*\d{4}\s*\)\s*$/u,
        /[\s\-—–:·.]+$/u,
    ];
    const stripped = trailingQualifiers.reduce(
        (text, pattern) => text.replace(pattern, ''),
        normalized.toLowerCase(),
    );
    return BARE_INSTITUTIONAL_HEADINGS.includes(stripped.trim());
}
|
|
1177
|
+
/**
 * Curated category-noun whitelist per article-type slug. These are the
 * "EU Parliament <Type>" / "EP10 <Type>" style headings that merely
 * restate the article category and carry no editorial information. When
 * such a heading reaches the metadata resolver it must be flagged
 * generic so the resolver falls through to the BLUF / lede summary
 * instead of using the category noun as `<title>`.
 *
 * Keys are slugs (`article-type` form). Values are lowercase category
 * cores, compared by {@link isCategoryNounHeading} after the
 * institutional prefix and trailing single-date qualifiers have been
 * stripped from the heading.
 */
const CATEGORY_NOUN_CORES = {
    breaking: ['breaking', 'breaking news'],
    'week-in-review': ['week in review'],
    'week-ahead': ['week ahead'],
    'month-in-review': ['month in review'],
    'month-ahead': ['month ahead'],
    'quarter-in-review': ['quarter in review'],
    'quarter-ahead': ['quarter ahead'],
    'year-in-review': ['year in review'],
    'year-ahead': ['year ahead'],
    'committee-reports': [
        'committee reports',
        'committee activity',
        'committee activity report',
        'committee activity reports',
    ],
    motions: [
        'motions',
        'motions and adopted texts',
        'plenary votes and resolutions',
        'plenary votes resolutions',
    ],
    propositions: ['propositions', 'legislative propositions', 'legislative procedures'],
    'election-cycle': ['election cycle'],
    'term-outlook': ['term outlook'],
};
/**
 * Return `true` when the heading is a bare category-noun string for the
 * supplied `articleType` slug.
 *
 * The heading is lower-cased, then:
 *  1. the first matching institutional prefix is removed
 *     (`the european parliament's `, `european parliament `,
 *     `eu parliament `, `ep11 `, `ep10 `, `ep `);
 *  2. a trailing single-date qualifier is removed: separator plus ISO
 *     date (` · 2026-05-15`, ` — 2026-05-15`, ` - 2026-05-15`,
 *     `: 2026-05-15`), `(may 2026)` or `(2026)`;
 *  3. trailing whitespace, dashes, colons and middle dots are removed.
 *
 * Date ranges (`: 19–22 May 2026`) are NOT stripped, so headings like
 * `EP Week Ahead: 19–22 May 2026` are kept as legitimate editorial
 * headlines.
 *
 * Slugs without an own entry in {@link CATEGORY_NOUN_CORES} always yield
 * `false`. The lookup ignores inherited object properties, so slugs such
 * as `constructor` or `__proto__` cannot resolve to a non-array value.
 *
 * @param normalized - Heading text after whitespace collapse
 * @param articleType - Article-type slug
 * @returns `true` when the heading is category-noun boilerplate
 */
function isCategoryNounHeading(normalized, articleType) {
    // Own-property lookup only: a plain object also exposes prototype
    // members under arbitrary string keys.
    const cores = Object.prototype.hasOwnProperty.call(CATEGORY_NOUN_CORES, articleType)
        ? CATEGORY_NOUN_CORES[articleType]
        : undefined;
    if (!Array.isArray(cores) || cores.length === 0) {
        return false;
    }
    let core = normalized.toLowerCase();
    // Strip institutional prefix (first match wins; longer forms listed first).
    const prefixes = [
        "the european parliament's ",
        'european parliament ',
        'eu parliament ',
        'ep11 ',
        'ep10 ',
        'ep ',
    ];
    const prefix = prefixes.find((p) => core.startsWith(p));
    if (prefix !== undefined) {
        core = core.slice(prefix.length);
    }
    // Strip a trailing single-date qualifier. Date ranges are deliberately
    // left alone because they identify a specific reporting window.
    core = core.replace(/\s*[·:—–-]\s*\d{4}-\d{2}-\d{2}\s*$/u, '');
    core = core.replace(/\s*\(\s*[a-z]{3,9}\s+\d{4}\s*\)\s*$/u, '');
    core = core.replace(/\s*\(\s*\d{4}\s*\)\s*$/u, '');
    // Trailing punctuation residue.
    core = core.replace(/[\s\-—–:·]+$/u, '').trim();
    return cores.includes(core);
}
|
|
835
1266
|
/**
|
|
836
1267
|
* Escape regex metacharacters so a dynamic string can be embedded safely
|
|
837
1268
|
* in a pattern built at runtime.
|
|
@@ -858,7 +1289,17 @@ export function extractArtifactHighlight(runDir, articleType, date) {
|
|
|
858
1289
|
const direct = scanCandidatesForHighlight(runDir, EDITORIAL_ARTEFACT_CANDIDATES, articleType, date);
|
|
859
1290
|
if (direct.headline)
|
|
860
1291
|
return { headline: direct.headline, summary: direct.summary };
|
|
861
|
-
|
|
1292
|
+
// Top-level fallback scan — used only when none of the canonical
|
|
1293
|
+
// editorial artefacts produced a non-generic H1. We must NOT pick up
|
|
1294
|
+
// translated sibling briefs (`executive-brief_<lang>.md`,
|
|
1295
|
+
// `synthesis-summary_<lang>.md`, …) here, because their H1s are
|
|
1296
|
+
// legitimate localized headlines that the English-only
|
|
1297
|
+
// {@link isGenericHeading} detector cannot recognise as boilerplate.
|
|
1298
|
+
// Letting them through poisoned the English `<title>` and
|
|
1299
|
+
// `<meta description>` for the 2026-05-15 batch with Arabic content
|
|
1300
|
+
// from `executive-brief_ar.md`. See {@link isTranslatedSiblingBrief}
|
|
1301
|
+
// and the regression test in `test/unit/article-metadata.test.js`.
|
|
1302
|
+
const topLevel = safeReaddir(runDir).filter((f) => f.endsWith('.md') && f !== 'manifest.json' && !isTranslatedSiblingBrief(f));
|
|
862
1303
|
const fallback = scanCandidatesForHighlight(runDir, topLevel, articleType, date);
|
|
863
1304
|
if (fallback.headline)
|
|
864
1305
|
return { headline: fallback.headline, summary: fallback.summary };
|
|
@@ -868,6 +1309,28 @@ export function extractArtifactHighlight(runDir, articleType, date) {
|
|
|
868
1309
|
}
|
|
869
1310
|
return null;
|
|
870
1311
|
}
|
|
1312
|
+
/**
 * Alternation of every language code in {@link ALL_LANGUAGES}, used as
 * the capture group of {@link TRANSLATED_SIBLING_SUFFIX_RE}.
 */
const TRANSLATED_SIBLING_LANG_ALTERNATION = ALL_LANGUAGES.join('|');
/**
 * Matches a filename ending in `_<lang>.md` (case-insensitive), where
 * `<lang>` is one of the codes in {@link ALL_LANGUAGES} — e.g.
 * `executive-brief_ar.md`, `synthesis-summary_zh.md`. Restricting the
 * suffix to known language codes keeps artefacts that merely end in some
 * other `_<suffix>.md` out of the match.
 */
const TRANSLATED_SIBLING_SUFFIX_RE = new RegExp(`_(${TRANSLATED_SIBLING_LANG_ALTERNATION})\\.md$`, 'i');
/**
 * Tell whether a top-level `.md` filename is a translated sibling of a
 * canonical editorial artefact (e.g. `executive-brief_ar.md`).
 *
 * The top-level fallback scan in {@link extractArtifactHighlight} filters
 * these files out: their localized H1s are not recognised by the
 * English-only generic-heading detector and would otherwise end up on the
 * English SEO surfaces.
 *
 * @param filename - Run-relative `.md` filename (no path separators)
 * @returns `true` when the filename carries a `_<lang>.md` suffix
 */
export function isTranslatedSiblingBrief(filename) {
    const looksTranslated = TRANSLATED_SIBLING_SUFFIX_RE.test(filename);
    return looksTranslated;
}
|
|
871
1334
|
/**
|
|
872
1335
|
* Walk a list of candidate artefact paths and return the first
|
|
873
1336
|
* non-generic headline + summary pair, plus the first usable lede
|
|
@@ -925,6 +1388,25 @@ function probeCandidateForHighlight(runDir, rel, articleType, date) {
|
|
|
925
1388
|
if (headline && !isGenericHeading(headline, articleType, date)) {
|
|
926
1389
|
return { cleanHighlight: { headline: truncateTitle(headline), summary } };
|
|
927
1390
|
}
|
|
1391
|
+
// The artefact H1 is generic boilerplate (`Executive Brief — EU Parliament
|
|
1392
|
+
// Breaking News`). Before falling back to a stripped category-core
|
|
1393
|
+
// headline, try to surface the FIRST NAMED PRIORITY FINDING from the
|
|
1394
|
+
// brief's `## Key Developments` / `## Priority Dossiers` /
|
|
1395
|
+
// `## Top Findings` block. This is the canonical Stage-B authoring
|
|
1396
|
+
// pattern (see `analysis/templates/executive-brief.md`) — every brief
|
|
1397
|
+
// lists its top dossiers as `**Name** (procedure-code, date) — paragraph`
|
|
1398
|
+
// or `### N. Name (committee)`. Surfacing that name produces a
|
|
1399
|
+
// distinctive editorial headline ("Digital Markets Act Enforcement",
|
|
1400
|
+
// "Ukraine War Accountability") instead of a stripped category noun.
|
|
1401
|
+
const priority = extractPriorityFindingHighlight(body);
|
|
1402
|
+
if (priority?.headline) {
|
|
1403
|
+
return {
|
|
1404
|
+
cleanHighlight: {
|
|
1405
|
+
headline: truncateTitle(priority.headline),
|
|
1406
|
+
summary: priority.summary || summary,
|
|
1407
|
+
},
|
|
1408
|
+
};
|
|
1409
|
+
}
|
|
928
1410
|
if (headline) {
|
|
929
1411
|
const stripped = stripArtifactCategoryAffix(headline);
|
|
930
1412
|
if (stripped && !isGenericHeading(stripped, articleType, date)) {
|
|
@@ -933,6 +1415,565 @@ function probeCandidateForHighlight(runDir, rel, articleType, date) {
|
|
|
933
1415
|
}
|
|
934
1416
|
return { summary };
|
|
935
1417
|
}
|
|
1418
|
+
/**
 * Lower-case section titles that open the named-priority-finding block of
 * an executive brief. {@link headingMatchesPriorityProbe} looks for each
 * entry as a word-bounded substring of the heading text produced by
 * {@link normaliseHeadingText}.
 */
const PRIORITY_FINDING_SECTION_HEADINGS = [
    // "Key …" sections
    'key developments', 'key findings', 'key intelligence summary',
    'key judgements', 'key judgments',
    // "Headline …" / lead-story sections
    'headline intelligence', 'headline judgements', 'headline judgments',
    'lead story',
    // Alert / priority sections
    'policy intelligence alerts',
    'priority dossiers', 'priority dossiers under committee scrutiny',
    'priority findings', 'priority intelligence assessment', 'priority items',
    // "Top …" / trigger sections
    'top findings', 'top developments', 'top dossiers',
    'top trigger events', 'top triggers', 'trigger events',
    'top documents', 'top procedures', 'top 3 triggers',
    // Assessment / priority-band sections
    'wep assessment', 'high priority', 'highest priority',
];
|
|
1452
|
+
/**
 * Mine the first named priority finding from an executive-brief–style
 * artefact body.
 *
 * Two strategies are tried in order:
 *
 *   1. {@link scanPrioritySection} — look inside the first section whose
 *      heading is listed in {@link PRIORITY_FINDING_SECTION_HEADINGS} for
 *      an item in one of the recognised shapes, e.g.
 *        `1. **Digital Markets Act Enforcement** (TA-10-2026-0160, 2026-04-30)`
 *        `### 1. Clean Industrial Deal Implementation (ITRE/ENVI)`
 *        `### Alert 1 — Title`
 *        `**Trigger 1: DMA Enforcement Resolution** (TA-10-2026-0160)`
 *   2. {@link scanH2StoryHeadings} — otherwise look for a story-style H2
 *      such as `## Lead Story: Russia Accountability`.
 *
 * @param body - Editorial artefact body
 * @returns `{headline, summary}` from the first strategy that yields a
 *   result; `null` when the body is empty or neither strategy matches
 */
export function extractPriorityFindingHighlight(body) {
    const bodyLines = body ? body.split('\n') : null;
    if (bodyLines === null) {
        return null;
    }
    const fromSection = scanPrioritySection(bodyLines);
    return fromSection ?? scanH2StoryHeadings(bodyLines);
}
|
|
1484
|
+
/**
 * Strategy 1 — starting just below the heading located by
 * {@link findPrioritySectionStart}, hand each non-blank line to
 * {@link extractPriorityFindingItem} until one yields a result.
 *
 * Scanning ends with `null` as soon as a sibling H2 (`## …`) is reached;
 * deeper headings (`### …`, `#### …`) are passed through because briefs
 * place sub-headings such as `### 🔴 HIGH PRIORITY` between the section
 * header and the first item.
 *
 * @param lines - Body lines (already split on `\n`)
 * @returns `{headline, summary}` for the first recognised item, or `null`
 *   when there is no priority section or it holds no matchable item
 */
function scanPrioritySection(lines) {
    const headingIndex = findPrioritySectionStart(lines);
    if (headingIndex < 0) {
        return null;
    }
    let cursor = headingIndex + 1;
    while (cursor < lines.length) {
        const current = (lines[cursor] ?? '').trim();
        if (current !== '') {
            // Exactly two leading hashes: the next sibling section starts here.
            if (/^##(?!#)/.test(current)) {
                return null;
            }
            const found = extractPriorityFindingItem(lines, cursor);
            if (found) {
                return found;
            }
        }
        cursor += 1;
    }
    return null;
}
|
|
1511
|
+
/**
 * Keywords that may open a story-style H2 (`## Lead Story: …`,
 * `## Story N: …`, `## Trigger N: …`). {@link extractH2StoryHeadline}
 * walks the list in order and compares case-insensitively; keeping the
 * keywords in a list avoids one regex with a deep alternation, which the
 * unsafe-regex lint would flag.
 */
const H2_STORY_TOKENS = ['Lead Story', 'Story', 'Trigger', 'Alert', 'Judgement', 'Judgment'];
|
|
1525
|
+
/**
 * Strategy 2 — visit every `## …` line and return the first one that
 * {@link extractH2StoryHeadline} recognises as a story heading
 * (`## 📌 Lead Story: Russia Accountability`, `## Story 1 — DMA
 * Enforcement`) and that {@link buildPriorityResult} turns into a usable
 * headline. Serves briefs that publish each lead story as its own H2
 * rather than under a parent priority section.
 *
 * @param lines - Body lines (already split on `\n`)
 * @returns `{headline, summary}` for the first usable story heading, or
 *   `null` when none is found
 */
function scanH2StoryHeadings(lines) {
    for (const [index, raw] of lines.entries()) {
        const trimmed = (raw ?? '').trim();
        if (trimmed.startsWith('## ')) {
            const title = extractH2StoryHeadline(trimmed.replace(/^##\s+/u, ''));
            const built = title ? buildPriorityResult(title, '', lines, index) : null;
            if (built?.headline) {
                return built;
            }
        }
    }
    return null;
}
|
|
1550
|
+
/**
 * Pull the headline out of a story-shaped H2 (`📌 Lead Story: Title`,
 * `Story 1 — Title`, `Trigger 2: Title`).
 *
 * Steps: drop up to four leading non-alphanumeric characters (emoji /
 * decoration), match one of {@link H2_STORY_TOKENS} case-insensitively,
 * require a number after every keyword except `Lead Story`, then require
 * a `:` / `—` / `–` / `-` / `.` separator followed by whitespace. Plain
 * prose headings therefore never match. Written as discrete string steps
 * rather than one dense regex to stay within the unsafe-regex and
 * cognitive-complexity lint budgets.
 *
 * @param headingText - Heading text with the leading `## ` already removed
 * @returns The text after the separator, or an empty string when the
 *   heading is not story-shaped
 */
function extractH2StoryHeadline(headingText) {
    const bare = headingText.replace(/^[^A-Za-z0-9]{0,4}\s*/u, '');
    const bareLower = bare.toLowerCase();
    for (const keyword of H2_STORY_TOKENS) {
        if (bareLower.startsWith(keyword.toLowerCase())) {
            let remainder = bare.slice(keyword.length).trim();
            if (keyword !== 'Lead Story') {
                // `Story 1` / `Trigger 2` — the ordinal is mandatory and consumed.
                const ordinal = /^\d+\b/u.exec(remainder);
                if (ordinal === null) {
                    continue;
                }
                remainder = remainder.slice(ordinal[0].length).trim();
            }
            const residual = /^[:—–\-.]\s+(.+)$/u.exec(remainder)?.[1];
            if (residual) {
                return residual.trim();
            }
        }
    }
    return '';
}
|
|
1583
|
+
/**
 * Find the first `##`–`####` heading whose text, once passed through
 * {@link normaliseHeadingText}, is accepted by
 * {@link headingMatchesPriorityProbe}.
 *
 * @param lines - Body lines (already split on `\n`)
 * @returns Index of the matching heading line, or `-1` when there is none
 */
function findPrioritySectionStart(lines) {
    return lines.findIndex((raw) => {
        const heading = (raw ?? '').trim().match(/^#{2,4}\s+(.+)$/u);
        if (heading === null) {
            return false;
        }
        const text = normaliseHeadingText(heading[1] ?? '');
        // Headings that normalise to nothing (pure decoration) are ignored.
        return text ? headingMatchesPriorityProbe(text) : false;
    });
}
|
|
1604
|
+
/**
 * Word-boundary substring matcher for the priority-finding section
 * detector. Extracted from {@link findPrioritySectionStart} to keep its
 * cognitive complexity within budget.
 *
 * Every occurrence of a probe is examined, not just the first one, so a
 * probe that first shows up embedded in a longer word
 * (`monkey findings … key findings`) does not hide a later, properly
 * bounded occurrence. A boundary is the start/end of the text or any
 * character outside `[A-Za-z0-9]`.
 *
 * @param text - Heading text already normalised by {@link normaliseHeadingText}
 * @returns `true` when one of {@link PRIORITY_FINDING_SECTION_HEADINGS}
 *   appears as a word-bounded substring of {@link text}
 */
function headingMatchesPriorityProbe(text) {
    const isWordChar = (ch) => ch !== undefined && /[A-Za-z0-9]/.test(ch);
    return PRIORITY_FINDING_SECTION_HEADINGS.some((probe) => {
        // An empty probe would match at every index and never advance.
        if (probe.length === 0)
            return false;
        for (let idx = text.indexOf(probe); idx >= 0; idx = text.indexOf(probe, idx + 1)) {
            // Out-of-range reads yield `undefined`, which counts as a boundary.
            if (!isWordChar(text[idx - 1]) && !isWordChar(text[idx + probe.length]))
                return true;
        }
        return false;
    });
}
|
|
1627
|
+
/**
 * Try to recognise a priority-finding item starting at {@link i}. Returns
 * the `{headline, summary}` pair built by {@link buildPriorityResult}
 * when the line matches one of the four authoring patterns (A–D below);
 * returns `null` otherwise so the caller can advance to the next line.
 *
 * @param lines - Body lines (already split on `\n`)
 * @param i - Index of the candidate line
 * @returns Priority-finding pair when matched, `null` otherwise
 */
function extractPriorityFindingItem(lines, i) {
    const line = (lines[i] ?? '').trim();
    // Pattern A — numbered list item with bold title:
    //   `1. **Digital Markets Act Enforcement** (TA-10-2026-0160, 2026-04-30)`
    const numberedBold = line.match(/^\d+\.\s+\*\*([^*]+?)\*\*\s*(.*)$/u);
    if (numberedBold) {
        return buildPriorityResult(numberedBold[1] ?? '', numberedBold[2] ?? '', lines, i);
    }
    // Pattern B — numbered subheading. Requires an explicit separator
    // (`:` / `.` / `)` / `·` / `–` / `—` / `-` / whitespace) after the
    // number. A `.` separator must NOT be followed by another digit, so
    // dotted decimal section labels like `### 2.1 Close to Adoption` are
    // rejected instead of being read as item `2` titled `1 Close to …`.
    // Examples that match:
    //   `### 1. Clean Industrial Deal Implementation (ITRE/ENVI)`
    //   `### 1 · Headline Judgements`              (middle dot)
    //   `### KJ-1: Digital Regulation Enforcement …`
    //   `### KF-3: Banking Union Completion`
    //   `### T-2: DMA Enforcement Resolution`
    // Two narrow patterns instead of one wide alternation to keep the
    // pattern within the unsafe-regex linter's complexity budget.
    const numericHeading = line.match(/^#{3,4}\s+\d+(?:\.(?!\d)|[:)·–—\s-])\s*(.+)$/u);
    if (numericHeading) {
        return buildPriorityResult(numericHeading[1] ?? '', '', lines, i);
    }
    const tagHeading = line.match(/^#{3,4}\s+[A-Z]{1,3}-?\d+(?:\.(?!\d)|[:)·–—\s-])\s*(.+)$/u);
    if (tagHeading) {
        return buildPriorityResult(tagHeading[1] ?? '', '', lines, i);
    }
    // Pattern D — word-prefixed subheading (`### Alert 1 — Title 🔴`,
    // `### Judgement 1 — Title`, `### Trigger 1: DMA Enforcement`):
    const wordTaggedHeading = line.match(/^#{3,4}\s+(?:Alert|Judgement|Judgment|Finding|Story|Item|Trigger|Highlight|Dossier|Priority|Top)\s+\d+\s*[:.)·–—\s-]+(.+)$/iu);
    if (wordTaggedHeading) {
        return buildPriorityResult(wordTaggedHeading[1] ?? '', '', lines, i);
    }
    // Pattern C — bold-leading paragraph trigger:
    //   `**Trigger 1: DMA Enforcement Resolution** (TA-10-2026-0160)`
    //   `**Digital Markets Act Enforcement**`
    // Rejected when:
    //   - the bold body is longer than a plausible headline (>110 chars) —
    //     that's a bold paragraph lede masquerading as a headline (e.g.
    //     `**This period captures the April 2026 Strasbourg …**`)
    //   - the bold body is a metadata key (`**Admiralty Grade: B/2**`,
    //     `**Reporting Window:** …`, `**Date:** …`) — these are banner
    //     rows, not editorial headlines
    const boldOnly = line.match(/^\*\*([^*]+?)\*\*\s*(.*)$/u);
    if (boldOnly && !line.startsWith('**Confidence') && !isMetadataBoldLine(line)) {
        const candidate = (boldOnly[1] ?? '').trim();
        if (candidate.length > 0 && candidate.length <= 110) {
            return buildPriorityResult(candidate, boldOnly[2] ?? '', lines, i);
        }
    }
    return null;
}
|
|
1689
|
+
/**
 * Lower-case keys that mark a bold line as a metadata banner row
 * (`**Reporting Window:** 3 Apr – 1 May 2026`, `**Admiralty Grade:** B/2`,
 * `**Date:** 2026-05-15`) rather than an editorial headline. Lines that
 * {@link isMetadataBoldLine} identifies with these keys are kept out of
 * `<title>`.
 */
const PRIORITY_METADATA_BOLD_PREFIXES = [
    'admiralty', 'classification', 'confidence',
    'data sources', 'data quality', 'date',
    'generated', 'lead author', 'methodology',
    'reporting window', 'run', 'session',
    'source', 'sources', 'time horizon', 'wep',
];
/**
 * Decide whether a bold-leading line is a metadata banner
 * (`**Admiralty Grade: B/2**`, `**Reporting Window:** 3 Apr – 1 May 2026`).
 *
 * The first bold span is lower-cased and compared with each key in
 * {@link PRIORITY_METADATA_BOLD_PREFIXES}. It counts as metadata when it
 * equals the key, starts with `key:`, starts with `key—` / `key —`, or
 * starts with `key ` and contains a colon anywhere.
 *
 * @param line - Trimmed source line (already known to start with `**`)
 * @returns `true` when the line is a metadata banner that Pattern C must
 *   skip
 */
function isMetadataBoldLine(line) {
    const boldText = line
        .replace(/^\*\*([^*]+?)\*\*.*$/u, '$1')
        .trim()
        .toLowerCase();
    return PRIORITY_METADATA_BOLD_PREFIXES.some((key) => {
        if (boldText === key || boldText.startsWith(`${key}:`)) {
            return true;
        }
        if (boldText.startsWith(`${key}—`) || boldText.startsWith(`${key} —`)) {
            return true;
        }
        return boldText.startsWith(`${key} `) && boldText.includes(':');
    });
}
|
|
1741
|
+
/**
 * Assemble the `{headline, summary}` pair for one matched
 * priority-finding item.
 *
 * The headline is `rawHeadline` after {@link cleanPriorityHeadline}; the
 * summary is the segments from {@link collectPrioritySummaryLines},
 * joined with single spaces and passed through `truncateDescription`.
 *
 * @param rawHeadline - Raw bold title or numbered-heading text
 * @param tail - Same-line trailing text (after the bold close / heading)
 * @param lines - Body lines (already split on `\n`)
 * @param i - Index of the matched line
 * @returns The cleaned pair, or `null` when the cleaned headline is
 *   shorter than five characters (the caller then keeps scanning)
 */
function buildPriorityResult(rawHeadline, tail, lines, i) {
    const headline = cleanPriorityHeadline(rawHeadline);
    if (headline.length >= 5) {
        const segments = collectPrioritySummaryLines(tail, lines, i);
        return { headline, summary: truncateDescription(segments.join(' ')) };
    }
    return null;
}
|
|
1763
|
+
/**
 * Tell whether a follow-up line ends summary gathering for a
 * priority-finding item: a markdown heading (`#`–`######` then
 * whitespace), a numbered list item (`N.` then whitespace) or a bullet
 * (`-` / `*` then whitespace).
 *
 * @param line - Trimmed follow-up line
 * @returns `true` when the gathering loop must break
 */
function isPrioritySummaryStopper(line) {
    return /^(?:#{1,6}|\d+\.|[-*])\s/.test(line);
}
|
|
1780
|
+
/**
 * Gather the summary prose for a priority-finding item — the same-line
 * tail (with a leading and a trailing parenthesised metadata group
 * stripped) plus the prose lines that follow.
 *
 * Gathering ends at the first blank line after some text was collected,
 * at a heading / numbered item / bullet (see
 * {@link isPrioritySummaryStopper}), or once the joined text reaches
 * `DESCRIPTION_MAX_LENGTH`. Confidence rows (`**Confidence…`,
 * `- **Confidence…`) and lines rejected by `shouldSkipDescriptionLine`
 * are skipped without ending the gathering.
 *
 * @param tail - Same-line text that trails the bold/heading
 * @param lines - Full body lines
 * @param i - Index of the matched headline line
 * @returns Ordered list of summary segments
 */
function collectPrioritySummaryLines(tail, lines, i) {
    const summaryLines = [];
    // Strip leading parens-metadata (`(TA-10-2026-0160, 2026-04-30)`) and
    // trailing parens-metadata from the tail so the summary starts with
    // editorial prose, not a procedure-code citation.
    let tailText = stripInlineMarkdown(tail).trim();
    tailText = tailText.replace(/^\([^()]{3,80}\)\s*/u, '');
    tailText = stripPriorityTailMetadata(tailText).trim();
    if (tailText)
        summaryLines.push(tailText);
    for (let j = i + 1; j < lines.length; j++) {
        const next = (lines[j] ?? '').trim();
        if (!next) {
            // Blank lines before any text are padding; after text they end the item.
            if (summaryLines.length > 0)
                break;
            continue;
        }
        // Confidence rows must be tested BEFORE the stopper: the bulleted
        // form (`- **Confidence…`) also looks like a list item, and the
        // stopper would end gathering before this skip could ever run.
        if (next.startsWith('**Confidence') || next.startsWith('- **Confidence'))
            continue;
        if (isPrioritySummaryStopper(next))
            break;
        if (shouldSkipDescriptionLine(next))
            continue;
        summaryLines.push(stripInlineMarkdown(next));
        if (summaryLines.join(' ').length >= DESCRIPTION_MAX_LENGTH)
            break;
    }
    return summaryLines;
}
|
|
1819
|
+
/*
 * Contract of `cleanPriorityHeadline`, which is defined further down,
 * after the token tables and strip helpers it relies on:
 *
 * Normalise a priority-finding headline: drop the
 * `Trigger N:` / `Dossier N:` / leading-numeric prefix, strip trailing
 * parenthesised metadata (`(TA-10-2026-0160, 2026-04-30)`,
 * `(ITRE/ENVI)`), and trim residual punctuation. The result is a
 * headline-shaped string suitable for `<title>` use.
 *
 * Parameter `raw`: raw bold-title or heading text.
 * Returns: the cleaned headline (may be empty after stripping).
 */
|
|
1829
|
+
/**
 * Priority labels that {@link stripPriorityLeadingDecoration} removes
 * from the front of a headline (`🔴 CRITICAL — Title` → `Title`). The
 * two-word forms are listed before their one-word stems so they are tried
 * first. A list is used instead of one regex with a deep alternation to
 * stay clear of the unsafe-regex lint.
 */
const PRIORITY_LABEL_TOKENS = [
    'CRITICAL',
    'HIGH PRIORITY', 'HIGH',
    'MEDIUM PRIORITY', 'MEDIUM',
    'LOW PRIORITY', 'LOW',
    'URGENT', 'ALERT', 'PRIORITY',
];
/**
 * Confidence / severity markers that {@link stripPriorityTrailingMarker}
 * removes from the end of a headline. Kept as a list for the same reason
 * as {@link PRIORITY_LABEL_TOKENS}.
 */
const PRIORITY_TRAILING_TOKENS = [
    'CRITICAL',
    'HIGH PRIORITY', 'HIGH',
    'MEDIUM PRIORITY', 'MEDIUM',
    'LOW PRIORITY', 'LOW',
];
/**
 * Editorial prefix words that {@link stripPriorityLeadingPrefix} removes
 * together with their ordinal (`Trigger 1: Title` → `Title`).
 */
const PRIORITY_LEADING_PREFIX_TOKENS = [
    'Trigger', 'Dossier', 'Priority', 'Finding', 'Item', 'Highlight',
    'Top', 'Story', 'Alert', 'Judgement', 'Judgment',
];
|
|
1877
|
+
/**
 * Remove leading priority decoration (`🔴 `, `CRITICAL — `) from a
 * candidate headline.
 *
 * Runs two rounds. Each round first drops any leading characters that
 * are neither letters nor digits, then removes at most one label from
 * {@link PRIORITY_LABEL_TOKENS} — but only when the label is followed by
 * a `:` / `—` / `–` / `-` separator and some text. Two rounds let
 * stacked labels such as `URGENT - ALERT: Title` collapse to `Title`.
 *
 * @param text - Candidate headline (already trimmed)
 * @returns Headline with the leading decoration removed
 */
function stripPriorityLeadingDecoration(text) {
    const peelLabel = (value) => {
        const lowered = value.toLowerCase();
        for (const label of PRIORITY_LABEL_TOKENS) {
            if (!lowered.startsWith(label.toLowerCase())) {
                continue;
            }
            const afterLabel = value.slice(label.length).trim();
            const separated = /^[:—–-]\s*(.+)$/u.exec(afterLabel);
            if (separated?.[1]) {
                return separated[1].trim();
            }
        }
        return value;
    };
    let result = text;
    for (let round = 0; round < 2; round += 1) {
        result = peelLabel(result.replace(/^[^\p{L}\p{N}]+/u, '').trim());
    }
    return result;
}
|
|
1902
|
+
/**
 * Strip a leading editorial prefix (`Trigger 1: `, `Dossier 2 — `) and a
 * stray leading ordinal (`1. `, `2) `, `2.1 `) from a candidate headline.
 *
 * Dotted ordinals are consumed as a whole, so `2.1 Close to Adoption`
 * becomes `Close to Adoption` rather than `1 Close to Adoption`.
 *
 * @param text - Candidate headline
 * @returns Headline with the leading editorial decoration removed
 */
function stripPriorityLeadingPrefix(text) {
    let out = text;
    const lowered = out.toLowerCase();
    for (const token of PRIORITY_LEADING_PREFIX_TOKENS) {
        if (!lowered.startsWith(token.toLowerCase()))
            continue;
        // The prefix word only counts when an ordinal and a separator follow.
        const match = out.slice(token.length).match(/^\s+\d+\s*[:–—-]\s*(.+)$/u);
        if (match?.[1]) {
            out = match[1];
            break;
        }
    }
    // Drop a stray leading "1. " / "2) " / "2.1 " ordinal.
    // NOTE(review): a bare number followed by whitespace is also treated as
    // an ordinal (original behaviour), so a headline opening with a year
    // (`2026 Budget …`) loses it — confirm this is intended.
    out = out.replace(/^\d+(?:\.\d+)*[.):·\s]\s*/u, '');
    return out;
}
|
|
1925
|
+
/**
 * Remove one trailing confidence marker (`🔴 CRITICAL`, `🟡 MEDIUM`)
 * from a candidate headline.
 *
 * Each entry of {@link PRIORITY_TRAILING_TOKENS} is tried in order,
 * case-insensitively, as: whitespace, an optional single symbol, the
 * token, end of text. The first token that changes the text wins. Only
 * one marker is removed per call — the caller invokes this inside a
 * fixed-point loop.
 *
 * @param text - Candidate headline
 * @returns Headline with the trailing confidence marker removed, or the
 *   input unchanged when no marker is present
 */
function stripPriorityTrailingMarker(text) {
    for (const marker of PRIORITY_TRAILING_TOKENS) {
        const withoutMarker = text.replace(new RegExp(`\\s+[^\\p{L}\\p{N}\\s]?\\s*${marker}\\s*$`, 'iu'), '');
        if (withoutMarker !== text) {
            return withoutMarker;
        }
    }
    return text;
}
|
|
1945
|
+
/**
 * Turn a raw priority-finding title into a headline-shaped string.
 *
 * Inline markdown is stripped first, then leading decoration
 * ({@link stripPriorityLeadingDecoration}) and leading editorial
 * prefixes ({@link stripPriorityLeadingPrefix}). The tail is then cleaned
 * repeatedly until nothing changes: trailing confidence marker, trailing
 * parenthesised metadata, a trailing run of symbols, and leftover
 * colons / dashes / whitespace. Repeating lets combined endings such as
 * `Title (TA-10-2026-0160): 🔴` collapse all the way to `Title`.
 *
 * @param raw - Raw bold-title or heading text
 * @returns Cleaned headline (may be empty after stripping)
 */
function cleanPriorityHeadline(raw) {
    let headline = stripPriorityLeadingPrefix(stripPriorityLeadingDecoration(stripInlineMarkdown(raw).trim()));
    let settled = '';
    while (settled !== headline) {
        settled = headline;
        headline = stripPriorityTailMetadata(stripPriorityTrailingMarker(headline))
            // Symbol run (e.g. an emoji) left at the end after metadata stripping.
            .replace(/\s+[^\p{L}\p{N}\s]+\s*$/u, '')
            // Colons / dashes / whitespace left dangling.
            .replace(/[\s:—–-]+$/u, '')
            .trim();
    }
    return headline;
}
|
|
1965
|
+
/**
 * Remove the parenthesised metadata that briefs append to a
 * priority-finding name — procedure codes, dates, committee tags.
 *
 * Only a group sitting at the very end of the text is removed, and only
 * when its content is 3–80 characters long and contains no nested
 * parentheses; earlier groups on the line are left alone. The result is
 * trimmed in every case.
 *
 * @param text - Headline or paragraph text
 * @returns Text with the trailing `(…)` stripped
 */
function stripPriorityTailMetadata(text) {
    const trailingGroup = /\s*\([^()]{3,80}\)\s*$/u.exec(text);
    const kept = trailingGroup === null ? text : text.slice(0, trailingGroup.index);
    return kept.trim();
}
|
|
936
1977
|
/**
|
|
937
1978
|
* Read an artefact file, skipping any SPDX HTML-comment header rows so the
|
|
938
1979
|
* first-H1 / first-prose logic is never derailed by the REUSE preamble.
|
|
@@ -1309,49 +2350,73 @@ function resolveEditorialContent(opts) {
|
|
|
1309
2350
|
}
|
|
1310
2351
|
const summary = artefactSummary || aggregatedSummary;
|
|
1311
2352
|
if (summary) {
|
|
1312
|
-
|
|
2353
|
+
// The H1 is generic (category-noun, bare-institutional, or
|
|
2354
|
+
// template-style) so we have to derive `<title>` from the BLUF/
|
|
2355
|
+
// lede paragraph. Extract the first complete sentence so the
|
|
2356
|
+
// resulting title is grammatically self-contained — falling back
|
|
2357
|
+
// to clause-boundary truncation downstream when the sentence
|
|
2358
|
+
// itself overruns TITLE_MAX_LENGTH.
|
|
2359
|
+
const firstSentence = extractFirstSentence(summary);
|
|
2360
|
+
return { headline: truncateTitle(firstSentence), summary };
|
|
1313
2361
|
}
|
|
1314
2362
|
return { headline: '', summary: '' };
|
|
1315
2363
|
}
|
|
1316
2364
|
/**
|
|
1317
|
-
*
|
|
1318
|
-
*
|
|
1319
|
-
*
|
|
2365
|
+
* Pick the per-language SEO title from the resolved editorial pair and
|
|
2366
|
+
* the localized template fallback. The decision tree mirrors the priority
|
|
2367
|
+
* ladder in the module header:
|
|
1320
2368
|
*
|
|
1321
|
-
*
|
|
1322
|
-
*
|
|
1323
|
-
*
|
|
2369
|
+
* - When an editorial headline exists (either translated brief or
|
|
2370
|
+
* English brief / aggregated source), use it **verbatim** — no
|
|
2371
|
+
* concatenation with the localized type/date template. Concatenation
|
|
2372
|
+
* historically produced strings like
|
|
2373
|
+
* `Senaste Nytt: Betydande Parlamentariska Händelser — 2026-05-15 — Breaking News: EP April 2026 Plenary Outcomes`
|
|
2374
|
+
* which mix two languages in a single `<title>` and are blocked by
|
|
2375
|
+
* `scripts/validate-manifest-seo.js`'s `english-fallthrough` gate.
|
|
2376
|
+
* - When no editorial headline exists at all, fall back to the
|
|
2377
|
+
* localized type/date template plus a run qualifier so same-type pages
|
|
2378
|
+
* remain distinguishable.
|
|
2379
|
+
*
|
|
2380
|
+
* @param fallbackTitle - Localized article-type template title
|
|
2381
|
+
* @param editorialHeadline - Editorial headline (localized or English)
|
|
1324
2382
|
* @param runId - Optional run id used only when no editorial headline exists
|
|
1325
2383
|
* @returns SEO title candidate
|
|
1326
2384
|
*/
|
|
1327
|
-
function composeContextualTitle(
|
|
1328
|
-
if (
|
|
1329
|
-
return editorialHeadline
|
|
1330
|
-
}
|
|
1331
|
-
if (editorialHeadline) {
|
|
1332
|
-
return `${fallbackTitle} — ${editorialHeadline}`;
|
|
1333
|
-
}
|
|
2385
|
+
function composeContextualTitle(fallbackTitle, editorialHeadline, runId) {
    // An editorial headline is used verbatim and never combined with the
    // template; only the template path receives a run qualifier.
    return editorialHeadline
        ? editorialHeadline
        : withRunQualifier(fallbackTitle, runId);
}
|
|
1336
2390
|
/**
|
|
1337
|
-
* Add localized article context
|
|
1338
|
-
*
|
|
2391
|
+
* Add localized article context to short or duplicate-prone meta
|
|
2392
|
+
* descriptions. This turns generic type-level subtitles into
|
|
1339
2393
|
* page-specific descriptions suitable for search snippets.
|
|
1340
2394
|
*
|
|
2395
|
+
* Internal artefact identifiers (`runId`) are deliberately NOT included
|
|
2396
|
+
* in the description: they leak into Google snippets as opaque tokens
|
|
2397
|
+
* like `breaking-run255-1778894853` and provide no value to readers.
|
|
2398
|
+
* The verbose `evidence` boilerplate (`with source-linked voting,
|
|
2399
|
+
* committee and legislative intelligence`) is also dropped — it pads
|
|
2400
|
+
* bytes without adding editorial information and was the dominant
|
|
2401
|
+
* source of mid-sentence ellipsis truncation observed in production.
|
|
2402
|
+
*
|
|
2403
|
+
* The reader-hint suffix (`labels.reader`) is preserved because it
|
|
2404
|
+
* supplies a stable localized intent signal even when the lede is
|
|
2405
|
+
* very short.
|
|
2406
|
+
*
|
|
1341
2407
|
* @param lang - Target language code
|
|
1342
2408
|
* @param baseDescription - Best description from manifest/editorial/template
|
|
1343
2409
|
* @param editorial - Artifact-derived headline and summary
|
|
1344
2410
|
* @param editorial.headline - Artifact-derived headline
|
|
1345
2411
|
* @param editorial.summary - Artifact-derived summary
|
|
1346
2412
|
* @param date - ISO article date
|
|
1347
|
-
* @param
|
|
2413
|
+
* @param _runId - Reserved (formerly emitted; no longer used)
|
|
1348
2414
|
* @returns Description in the target language context, capped for SEO snippets
|
|
1349
2415
|
*/
|
|
1350
|
-
function composeContextualDescription(lang, baseDescription, editorial, date,
|
|
2416
|
+
function composeContextualDescription(lang, baseDescription, editorial, date, _runId) {
|
|
1351
2417
|
const labels = getLocalizedString(SEO_CONTEXT_LABELS, lang);
|
|
1352
2418
|
const parts = [baseDescription.trim()];
|
|
1353
|
-
|
|
1354
|
-
parts.push(`${labels.date} ${date}${runPart}, ${labels.evidence}`);
|
|
2419
|
+
parts.push(`${labels.date} ${date}.`);
|
|
1355
2420
|
const context = pickFirstNonEmpty([editorial.summary, editorial.headline]);
|
|
1356
2421
|
if (context && !containsNormalized(parts[0] ?? '', context)) {
|
|
1357
2422
|
parts.push(`${labels.context}: ${context}`);
|
|
@@ -1360,14 +2425,46 @@ function composeContextualDescription(lang, baseDescription, editorial, date, ru
|
|
|
1360
2425
|
return truncateDescription(parts.join(' '));
|
|
1361
2426
|
}
|
|
1362
2427
|
/**
|
|
1363
|
-
* Append a run qualifier to otherwise duplicate-prone fallback
|
|
2428
|
+
* Append a short run qualifier to otherwise duplicate-prone fallback
|
|
2429
|
+
* titles. Sanitizes the raw `runId` (which is an internal artefact
|
|
2430
|
+
* identifier of the shape `<slug>-run<N>[-<unix-ts>]`) so user-facing
|
|
2431
|
+
* `<title>` strings never expose Unix timestamps or the full opaque
|
|
2432
|
+
* token. Only the short ordinal `N` is retained.
|
|
2433
|
+
*
|
|
2434
|
+
* Examples:
|
|
2435
|
+
* - `breaking-run255-1778894853` → `Run 255`
|
|
2436
|
+
* - `committee-reports-run330-1778735854` → `Run 330`
|
|
2437
|
+
* - `breaking-run-001` → `Run 001`
|
|
2438
|
+
*
|
|
2439
|
+
* When the runId does not match the canonical shape, the qualifier is
|
|
2440
|
+
* omitted entirely rather than leak an unknown-format token into SEO
|
|
2441
|
+
* surfaces.
|
|
1364
2442
|
*
|
|
1365
2443
|
* @param title - Base title
|
|
1366
|
-
* @param runId - Optional run id
|
|
1367
|
-
* @returns Title with run qualifier when
|
|
2444
|
+
* @param runId - Optional run id (sanitized before use)
|
|
2445
|
+
* @returns Title with short run qualifier, or unchanged when sanitization fails
|
|
1368
2446
|
*/
|
|
1369
2447
|
function withRunQualifier(title, runId) {
    if (!runId) {
        return title;
    }
    // Scan the dash-separated segments left to right and use the first run
    // ordinal found. The runId shape is `<slug>-run<N>[-<unix-ts>]`; two
    // small anchored patterns are used instead of one regex with
    // overlapping `\d+` groups, which the SonarJS unsafe-regex rule flags
    // as catastrophic-backtracking-prone.
    const segments = runId.split('-');
    for (const [index, segment] of segments.entries()) {
        // Fused form: `run255`.
        const fused = /^run(\d+)$/u.exec(segment);
        if (fused) {
            return `${title} — Run ${fused[1]}`;
        }
        // Split form: `run-001`. The loop's own index is used here; looking
        // the segment up by value would always land on the FIRST bare `run`
        // and miss a later `run-<N>` pair in ids such as `run-club-run-007`.
        if (segment === 'run') {
            const ordinal = segments[index + 1];
            if (ordinal && /^\d+$/u.test(ordinal)) {
                return `${title} — Run ${ordinal}`;
            }
        }
    }
    // Unknown shape: omit the qualifier rather than expose the raw token.
    return title;
}
|
|
1372
2469
|
/**
|
|
1373
2470
|
* Case-insensitive containment check after whitespace normalization.
|
|
@@ -1449,35 +2546,23 @@ function dedupeKeywords(candidates) {
|
|
|
1449
2546
|
*/
|
|
1450
2547
|
export function resolveArticleMetadata(opts) {
|
|
1451
2548
|
const manifest = opts.manifest ?? {};
|
|
1452
|
-
const
|
|
2549
|
+
const englishEditorial = resolveEditorialContent(opts);
|
|
1453
2550
|
const template = buildTemplateFallback(opts.articleType, opts.date, manifest.committee);
|
|
1454
2551
|
const runId = manifest.runId?.trim() ?? '';
|
|
1455
2552
|
const result = Object.create(null);
|
|
1456
2553
|
for (const lang of ALL_LANGUAGES) {
|
|
1457
|
-
const
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
const title = pickFirstNonEmpty(titleCandidates) || fallback.title;
|
|
1468
|
-
const rawDescription = pickFirstNonEmpty(descCandidates) || fallback.subtitle;
|
|
1469
|
-
const description = rawDescription.length >= DESCRIPTION_MIN_LENGTH &&
|
|
1470
|
-
containsNormalized(rawDescription, opts.date)
|
|
1471
|
-
? rawDescription
|
|
1472
|
-
: composeContextualDescription(lang, rawDescription, editorial, opts.date, runId);
|
|
1473
|
-
const truncatedTitle = truncateTitle(title);
|
|
1474
|
-
const truncatedDescription = truncateDescription(description);
|
|
2554
|
+
const entry = resolveOneLanguage({
|
|
2555
|
+
lang,
|
|
2556
|
+
manifest,
|
|
2557
|
+
englishEditorial,
|
|
2558
|
+
template: template[lang],
|
|
2559
|
+
runDir: opts.runDir,
|
|
2560
|
+
articleType: opts.articleType,
|
|
2561
|
+
date: opts.date,
|
|
2562
|
+
runId,
|
|
2563
|
+
});
|
|
1475
2564
|
Object.defineProperty(result, lang, {
|
|
1476
|
-
value:
|
|
1477
|
-
title: truncatedTitle,
|
|
1478
|
-
description: truncatedDescription,
|
|
1479
|
-
keywords: buildSeoKeywords(lang, opts.articleType, opts.date, runId, truncatedTitle, truncatedDescription),
|
|
1480
|
-
},
|
|
2565
|
+
value: entry,
|
|
1481
2566
|
enumerable: true,
|
|
1482
2567
|
writable: true,
|
|
1483
2568
|
configurable: true,
|
|
@@ -1485,6 +2570,96 @@ export function resolveArticleMetadata(opts) {
|
|
|
1485
2570
|
}
|
|
1486
2571
|
return result;
|
|
1487
2572
|
}
|
|
2573
|
+
/**
 * Build the `{title, description, keywords, source}` entry for a single
 * language. Candidates are consulted in this order:
 *
 *   1. manifest override for the language
 *   2. editorial headline / summary chosen by
 *      {@link resolvePerLanguageEditorial}
 *   3. localized template title / subtitle
 *
 * A description shorter than `ENRICHMENT_TRIGGER_LENGTH` is passed through
 * {@link composeContextualDescription}; longer ones are used as-is. Both
 * title and description are truncated before keywords are derived from
 * them. `source` is `'manifest'` when either manifest override is
 * present, otherwise the tier reported by the editorial resolver.
 *
 * @param input - Per-language inputs
 * @returns One resolved metadata entry
 */
function resolveOneLanguage(input) {
    const { lang, manifest, template, articleType, date, runId } = input;
    const manifestTitle = manifestOverrideFor(manifest.title, lang);
    const manifestDescription = manifestOverrideFor(manifest.description, lang);
    const { editorial, source: editorialSource } = resolvePerLanguageEditorial(input);
    // Title: manifest override, then editorial/template composition, then
    // the bare template title as a last resort.
    const contextualTitle = composeContextualTitle(template.title, editorial.headline, runId);
    const title = pickFirstNonEmpty([manifestTitle, contextualTitle, template.title]);
    // Description: manifest override, then editorial summary, then the
    // template subtitle.
    const rawDescription = pickFirstNonEmpty([
        manifestDescription,
        editorial.summary,
        template.subtitle,
    ]);
    let description = rawDescription;
    if (rawDescription.length < ENRICHMENT_TRIGGER_LENGTH) {
        // Too short to stand alone — add localized date/context.
        description = composeContextualDescription(lang, rawDescription, editorial, date, runId);
    }
    const truncatedTitle = truncateTitle(title);
    const truncatedDescription = truncateDescription(description);
    const hasManifestOverride = Boolean(manifestTitle || manifestDescription);
    return {
        title: truncatedTitle,
        description: truncatedDescription,
        keywords: buildSeoKeywords(lang, articleType, date, runId, truncatedTitle, truncatedDescription),
        source: hasManifestOverride ? 'manifest' : editorialSource,
    };
}
|
|
2614
|
+
/**
 * Choose the editorial `{headline, summary}` pair for one language and
 * report which tier supplied it.
 *
 *   - Non-English `lang` with a `runDir`: ask
 *     `resolveLocalizedBriefHighlight` for a translated highlight and use
 *     it when it carries a headline or a summary
 *     (`source: 'localized-brief'`).
 *   - Otherwise, when the English editorial pair has a headline or a
 *     summary, return it as-is (`source: 'english-editorial'` for `en`,
 *     `'english-brief'` for every other language).
 *   - Otherwise return an empty pair with `source: 'template'`.
 *
 * @param input - Per-language inputs
 * @returns Editorial pair plus the tier that produced it
 */
function resolvePerLanguageEditorial(input) {
    const isEnglish = input.lang === 'en';
    if (!isEnglish && input.runDir) {
        const brief = resolveLocalizedBriefHighlight(input.runDir, input.lang, input.articleType, input.date);
        if (brief && (brief.headline || brief.summary)) {
            // Only the two editorial fields are forwarded. When the
            // localized headline is empty the caller's title falls back to
            // the template path, while the localized summary still feeds
            // the description.
            const { headline, summary } = brief;
            return {
                editorial: { headline, summary },
                source: 'localized-brief',
            };
        }
    }
    const english = input.englishEditorial;
    if (!english.headline && !english.summary) {
        // Nothing editorial at all — the caller uses the localized template.
        return {
            editorial: { headline: '', summary: '' },
            source: 'template',
        };
    }
    // No usable localized brief: reuse the English editorial pair.
    return {
        editorial: english,
        source: isEnglish ? 'english-editorial' : 'english-brief',
    };
}
|
|
1488
2663
|
/**
|
|
1489
2664
|
* Return the first non-empty, trimmed entry from a candidate list, or
|
|
1490
2665
|
* the empty string when every entry is blank.
|