euparliamentmonitor 0.9.13 → 0.9.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/package.json +6 -4
  2. package/scripts/aggregator/article-generator.js +2 -1
  3. package/scripts/aggregator/article-html.d.ts +9 -0
  4. package/scripts/aggregator/article-html.js +134 -13
  5. package/scripts/aggregator/article-metadata.d.ts +25 -161
  6. package/scripts/aggregator/article-metadata.js +71 -649
  7. package/scripts/aggregator/editorial-brief-resolver.d.ts +9 -0
  8. package/scripts/aggregator/editorial-brief-resolver.js +3 -1
  9. package/scripts/aggregator/metadata/date-labels.d.ts +122 -0
  10. package/scripts/aggregator/metadata/date-labels.js +209 -0
  11. package/scripts/aggregator/metadata/text-utils.d.ts +188 -0
  12. package/scripts/aggregator/metadata/text-utils.js +542 -0
  13. package/scripts/constants/og-locales.d.ts +15 -0
  14. package/scripts/constants/og-locales.js +17 -0
  15. package/scripts/constants/seo/index.d.ts +21 -0
  16. package/scripts/constants/seo/index.js +23 -0
  17. package/scripts/constants/seo/og-locales.d.ts +59 -0
  18. package/scripts/constants/seo/og-locales.js +59 -0
  19. package/scripts/constants/seo/social-handles.d.ts +50 -0
  20. package/scripts/constants/seo/social-handles.js +65 -0
  21. package/scripts/constants/social-handles.d.ts +11 -0
  22. package/scripts/constants/social-handles.js +13 -0
  23. package/scripts/discover-untranslated-briefs.js +224 -19
  24. package/scripts/generators/news-indexes.d.ts +35 -0
  25. package/scripts/generators/news-indexes.js +67 -6
  26. package/scripts/generators/political-intelligence/html.js +14 -6
  27. package/scripts/generators/seo-copy.js +42 -0
  28. package/scripts/generators/sitemap/html.js +13 -5
  29. package/scripts/lint-src-todos.js +124 -0
  30. package/scripts/utils/copy-test-reports.js +1 -1
  31. package/scripts/utils/generate-docs-index.js +1 -1
  32. package/scripts/validate-brief-translations.js +158 -18
@@ -51,40 +51,12 @@ import path from 'path';
51
51
  import { ALL_LANGUAGES, getLocalizedString } from '../constants/language-core.js';
52
52
  import { BREAKING_NEWS_TITLES, COMMITTEE_REPORTS_TITLES, ELECTION_CYCLE_TITLES, LOCALIZED_KEYWORDS, MONTH_AHEAD_TITLES, MONTHLY_REVIEW_TITLES, MOTIONS_TITLES, PROPOSITIONS_TITLES, QUARTER_AHEAD_TITLES, QUARTER_IN_REVIEW_TITLES, TERM_OUTLOOK_TITLES, WEEK_AHEAD_TITLES, WEEKLY_REVIEW_TITLES, YEAR_AHEAD_TITLES, YEAR_IN_REVIEW_TITLES, } from '../constants/language-articles.js';
53
53
  import { resolveLocalizedBriefHighlight } from './editorial-brief-resolver.js';
54
- /** Maximum `<meta description>` length we will emit. */
55
- const DESCRIPTION_MAX_LENGTH = 180;
56
- /** Target minimum `<meta description>` length before we append context. */
57
- const DESCRIPTION_MIN_LENGTH = 140;
58
- /**
59
- * Length below which a raw description is considered too short to stand
60
- * on its own and gets enriched with date/context. Independent from
61
- * {@link DESCRIPTION_MIN_LENGTH} (which controls sentence-boundary
62
- * truncation behaviour). Set lower than DESCRIPTION_MIN_LENGTH so a
63
- * clean 100-140 char prose lede is preserved verbatim instead of being
64
- * padded with date/context boilerplate.
65
- */
66
- const ENRICHMENT_TRIGGER_LENGTH = 100;
67
- /** Maximum `<title>` length — anything longer is truncated with an ellipsis. */
68
- const TITLE_MAX_LENGTH = 140;
69
- /**
70
- * Soft target for headline-style titles produced as a fallback from
71
- * BLUF/lede prose. When the candidate exceeds `TITLE_MAX_LENGTH`, the
72
- * truncator first looks for a natural clause boundary
73
- * (`.`, `:`, `—`, `;`) inside the `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]`
74
- * window and breaks there instead of mid-clause-with-ellipsis. This
75
- * turns a 137-character truncated prose paragraph into a complete
76
- * journalistic clause, which scans much better in news cards and SERP
77
- * snippets without sacrificing the keyword-rich opening.
78
- */
79
- const HEADLINE_SOFT_MIN = 60;
80
- /**
81
- * Punctuation marks that signal a natural clause boundary inside a
82
- * BLUF / lede paragraph. Listed in preferred-break order: a colon or
83
- * em-dash that introduces a list of consequences is the best break,
84
- * full stops are next, and semicolons last. Single ASCII space is
85
- * always a fallback boundary handled separately.
86
- */
87
- const HEADLINE_CLAUSE_BOUNDARIES = [': ', ' — ', ' – ', '. ', '; '];
54
+ // Text-utility constants + helpers extracted into the `metadata/`
55
+ // bounded context as pure leaf-module functions. Re-exported here for
56
+ // back-compat with existing call sites; new code should import directly
57
+ // from `./metadata/text-utils.js`.
58
+ import { DESCRIPTION_MAX_LENGTH, EXTENDED_DESCRIPTION_MAX_LENGTH, ENRICHMENT_TRIGGER_LENGTH, shouldSkipDescriptionLine, stripLeadingProseLabel, stripInlineMarkdown, truncateDescription, truncateExtendedDescription, truncateTitle, extractFirstSentence, } from './metadata/text-utils.js';
59
+ export { shouldSkipDescriptionLine, stripLeadingProseLabel, stripInlineMarkdown, truncateDescription, truncateExtendedDescription, truncateTitle, extractFirstSentence, } from './metadata/text-utils.js';
88
60
  /** Localized labels used to enrich short or duplicate-prone meta descriptions. */
89
61
  const SEO_CONTEXT_LABELS = {
90
62
  en: {
@@ -315,435 +287,6 @@ const ARTIFACT_CATEGORY_PREFIXES = [
315
287
  'weekly outlook',
316
288
  'wildcards blackswans',
317
289
  ];
318
- /**
319
- * Emoji-banner prefixes that Stage-B agents use to decorate metadata rows
320
- * (e.g. `📋 Analysis Owner:`). Any line starting with one of these is
321
- * metadata, never prose.
322
- */
323
- const EMOJI_BANNER_CHARS = [
324
- '📋',
325
- '📅',
326
- '🔍',
327
- '🏛',
328
- '📰',
329
- '📊',
330
- '🏷',
331
- '📈',
332
- '📉',
333
- '⚠',
334
- '🔔',
335
- '🎯',
336
- '🗳',
337
- '🏢',
338
- '📄',
339
- ];
340
- /**
341
- * Label prefixes that a prose description must never start with. Every
342
- * entry matches case-insensitively at the start of a trimmed line, followed
343
- * by optional space and a colon.
344
- */
345
- const METADATA_LINE_PREFIXES = [
346
- 'Admiralty Grade',
347
- 'Analysis Date',
348
- 'Analysis Owner',
349
- 'Article Type',
350
- 'Article Window',
351
- 'Assessment Date',
352
- 'Briefing',
353
- 'Briefing Date',
354
- 'Classification',
355
- 'Classification Date',
356
- 'Confidence',
357
- 'Confidence in Evidence',
358
- 'Data Sources',
359
- 'Date',
360
- 'Document Type',
361
- 'Filing Date',
362
- 'Generated',
363
- 'Horizon',
364
- 'IMF Status',
365
- 'Last Updated',
366
- 'Parliamentary Status',
367
- 'Parliamentary Term',
368
- 'Period',
369
- 'Prepared',
370
- 'Purpose',
371
- 'Region',
372
- 'Reporting',
373
- 'Reporting Period',
374
- 'Reporting Window',
375
- 'Run',
376
- 'Run ID',
377
- 'Series',
378
- 'Series Run',
379
- 'Source',
380
- 'Sources',
381
- 'SPDX-FileCopyrightText',
382
- 'SPDX-License-Identifier',
383
- 'Topic',
384
- 'Type',
385
- 'WEP Band',
386
- 'WEP Grade',
387
- 'Window',
388
- ];
389
- /**
390
- * Return `true` when a line cannot serve as a prose description. Rejects
391
- * Markdown structural lines (headings, blockquotes, tables, HTML),
392
- * mermaid/chart directives, emoji-banner metadata rows, and the known
393
- * `Key: value` banners that Stage-B agents emit as artefact preamble.
394
- *
395
- * @param line - Trimmed line from the aggregated Markdown source
396
- * @returns `true` when the line is not prose and should be skipped
397
- */
398
- export function shouldSkipDescriptionLine(line) {
399
- if (line.length === 0)
400
- return true;
401
- if (line.startsWith('#'))
402
- return true;
403
- if (line.startsWith('>'))
404
- return true;
405
- if (line.startsWith('<'))
406
- return true;
407
- if (line.startsWith('|'))
408
- return true;
409
- if (line.startsWith('---') || line.startsWith('==='))
410
- return true;
411
- if (line.startsWith('```') || line.startsWith('~~~'))
412
- return true;
413
- if (line.startsWith('%%'))
414
- return true;
415
- if (/^title\s/i.test(line))
416
- return true;
417
- if (EMOJI_BANNER_CHARS.some((char) => line.startsWith(char)))
418
- return true;
419
- const labelSource = line.replace(/^\*+/, '').replace(/^\*\*/, '').replace(/^_+/, '').trim();
420
- for (const prefix of METADATA_LINE_PREFIXES) {
421
- const lower = labelSource.toLowerCase();
422
- const prefixLower = prefix.toLowerCase();
423
- if (lower.startsWith(`${prefixLower}:`) ||
424
- lower.startsWith(`${prefixLower} :`) ||
425
- lower.startsWith(`${prefixLower}**:`) ||
426
- lower.startsWith(`${prefixLower}*:`)) {
427
- return true;
428
- }
429
- }
430
- if (/^[-*_=~.]{3,}$/.test(line))
431
- return true;
432
- if (isLocalizedBannerRow(line))
433
- return true;
434
- return false;
435
- }
436
- /**
437
- * Language-agnostic banner-row detector. Stage-B artefacts open with a
438
- * metadata banner of the shape
439
- * `**Date:** 2026-05-15 | **Type:** Breaking | **Run:** breaking-run-001`
440
- * and its localized siblings — notably Japanese / Chinese / Korean briefs
441
- * which place the full-width colon `:` **inside** the bold span
442
- * (`**日付:**`) rather than after it. The `METADATA_LINE_PREFIXES` table
443
- * only covers the English vocabulary; this helper catches the structural
444
- * shape directly: a line that starts with `**`, contains at least one
445
- * `|` separator, and carries two-or-more bold key markers that end with
446
- * — or are followed by — an ASCII colon `:` or full-width colon `:`.
447
- * Banner rows look identical in every language we publish, so detecting
448
- * them here keeps localized briefs from leaking their first banner line
449
- * into the `<meta description>`.
450
- *
451
- * @param line - Trimmed source line
452
- * @returns `true` when the line is a banner row in any locale
453
- */
454
- function isLocalizedBannerRow(line) {
455
- if (!line.startsWith('**'))
456
- return false;
457
- if (!line.includes('|'))
458
- return false;
459
- const inside = (line.match(/\*\*[^*]+[::]\s*\*\*/g) ?? []).length;
460
- const after = (line.match(/\*\*[^*]+\*\*\s*[::]/g) ?? []).length;
461
- return inside + after >= 2;
462
- }
463
- /**
464
- * Strip inline Markdown decorations so we can use the remaining text as
465
- * plain-text meta-tag content. Removes link syntax, emphasis, inline code
466
- * backticks, and HTML-entity fragments that the Markdown source sometimes
467
- * smuggles in. Keeps the visible text readable.
468
- *
469
- * @param raw - Trimmed Markdown line
470
- * @returns Plain-text variant
471
- */
472
- /**
473
- * Strip a leading all-caps prose label (e.g. `SITUATION:`, `KEY MOTION:`,
474
- * `BLUF:`, `BOTTOM LINE:`, `TIER-1:`) from a prose line. These labels
475
- * are common in BLUF-style editorial writing — they survive
476
- * {@link stripInlineMarkdown} (which strips the `**bold**` wrapper but
477
- * keeps the literal text) and would otherwise leak into the SEO
478
- * description as a confusing all-caps shout.
479
- *
480
- * Matches up to 4 hyphenated all-caps tokens, optionally followed by a
481
- * digit suffix (`TIER-1`), terminating at a colon. Returns the original
482
- * line when no opener is present.
483
- *
484
- * @param line - Plain prose line (post-{@link stripInlineMarkdown})
485
- * @returns Line with the all-caps opener removed
486
- */
487
- export function stripLeadingProseLabel(line) {
488
- const colonIdx = line.indexOf(': ');
489
- if (colonIdx < 2 || colonIdx > 80)
490
- return line;
491
- const label = line.slice(0, colonIdx);
492
- const rest = line.slice(colonIdx + 2).trim();
493
- if (rest.length < 20)
494
- return line;
495
- if (!/^[A-Z][A-Z0-9 -]{1,79}$/.test(label))
496
- return line;
497
- if (label.length < 3)
498
- return line;
499
- return rest;
500
- }
501
- /**
502
- * Strip inline Markdown decorations so we can use the remaining text as
503
- * plain-text meta-tag content. Removes link syntax, emphasis, inline code
504
- * backticks, and HTML-entity fragments that the Markdown source sometimes
505
- * smuggles in. Keeps the visible text readable.
506
- *
507
- * @param raw - Trimmed Markdown line
508
- * @returns Plain-text variant
509
- */
510
- export function stripInlineMarkdown(raw) {
511
- return raw
512
- .replace(/!\[([^\]\n]{0,500})\]\(([^)\n]{0,500})\)/g, '$1')
513
- .replace(/\[([^\]\n]{1,500})\]\(([^)\n]{0,500})\)/g, '$1')
514
- .replace(/`([^`\n]{1,500})`/g, '$1')
515
- .replace(/\*\*([^*\n]{1,500})\*\*/g, '$1')
516
- .replace(/__([^_\n]{1,500})__/g, '$1')
517
- .replace(/\*([^*\n]{1,500})\*/g, '$1')
518
- .replace(/_([^_\n]{1,500})_/g, '$1')
519
- .replace(/~~([^~\n]{1,500})~~/g, '$1')
520
- .replace(/\s+/g, ' ')
521
- .trim();
522
- }
523
- /** Connector / determiner words that read as broken copy when they are
524
- * the final token before a truncation ellipsis. */
525
- const TRAILING_STOP_WORDS = new Set([
526
- 'the',
527
- 'a',
528
- 'an',
529
- 'of',
530
- 'to',
531
- 'for',
532
- 'in',
533
- 'on',
534
- 'at',
535
- 'by',
536
- 'and',
537
- 'or',
538
- 'with',
539
- 'from',
540
- ]);
541
- /** Trailing characters we always strip before appending our own ellipsis,
542
- * so we never emit double-ellipsis or stray punctuation. */
543
- const TRAILING_PUNCT = /[.,;:—\-…\s]/u;
544
- /**
545
- * Repeatedly strip trailing stop-words (separated by a single space) and
546
- * trailing punctuation (including any pre-existing ellipsis). Implemented
547
- * imperatively to avoid super-linear regex backtracking on the
548
- * `(?:\s+stop-word)+$` pattern flagged by `security/detect-unsafe-regex`.
549
- *
550
- * @param input - Pre-clipped string to clean up
551
- * @returns Cleaned string with no trailing stop-words or punctuation
552
- */
553
- function stripTrailingStopWordsAndPunctuation(input) {
554
- let result = input;
555
- let changed = true;
556
- while (changed) {
557
- changed = false;
558
- while (result.length > 0 && TRAILING_PUNCT.test(result.charAt(result.length - 1))) {
559
- result = result.slice(0, -1);
560
- changed = true;
561
- }
562
- const lastSpace = result.lastIndexOf(' ');
563
- if (lastSpace >= 0) {
564
- const tail = result.slice(lastSpace + 1).toLowerCase();
565
- if (TRAILING_STOP_WORDS.has(tail)) {
566
- result = result.slice(0, lastSpace);
567
- changed = true;
568
- }
569
- }
570
- }
571
- return result;
572
- }
573
- /**
574
- * Clamp a string to `DESCRIPTION_MAX_LENGTH` characters, appending
575
- * an ellipsis when truncation actually happens. Does not break words if
576
- * avoidable — a trailing partial word is trimmed back to the previous
577
- * space first.
578
- *
579
- * @param text - Raw description text
580
- * @returns Description ending at a sentence terminator (no ellipsis) when one sits past the soft minimum, otherwise clipped with a trailing ellipsis
581
- */
582
- export function truncateDescription(text) {
583
- if (text.length <= DESCRIPTION_MAX_LENGTH)
584
- return text;
585
- const cut = text.slice(0, DESCRIPTION_MAX_LENGTH - 1);
586
- // Prefer the last full sentence terminator within the cut so we don't
587
- // end on a dangling determiner ("…year. The"). Period/!/? followed by
588
- // a space marks a clean boundary. Only honour the boundary when it
589
- // sits past the soft minimum so we keep enough body text to be useful.
590
- const sentenceEnd = Math.max(cut.lastIndexOf('. '), cut.lastIndexOf('! '), cut.lastIndexOf('? '));
591
- if (sentenceEnd >= DESCRIPTION_MIN_LENGTH) {
592
- return cut.slice(0, sentenceEnd + 1).replace(/\s+$/, '');
593
- }
594
- const lastSpace = cut.lastIndexOf(' ');
595
- let safe = lastSpace > DESCRIPTION_MAX_LENGTH - 60 ? cut.slice(0, lastSpace) : cut;
596
- // Drop dangling stop-words and trailing punctuation/ellipsis so we
597
- // never emit broken copy ("…year. The" → "…year.") or double-ellipsis
598
- // ("The……") when the upstream input already carried an ellipsis.
599
- safe = stripTrailingStopWordsAndPunctuation(safe);
600
- return `${safe}…`;
601
- }
602
- /**
603
- * Clamp a title to `TITLE_MAX_LENGTH` characters in the same
604
- * word-boundary-preserving fashion as {@link truncateDescription}.
605
- *
606
- * @param text - Raw title text
607
- * @returns Title ending at a clause boundary (no ellipsis) when one falls in the soft-min window, otherwise clipped with a trailing ellipsis
608
- */
609
- export function truncateTitle(text) {
610
- if (text.length <= TITLE_MAX_LENGTH)
611
- return text;
612
- // Prefer ending at a natural clause boundary inside the
613
- // `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]` window so the truncated
614
- // title reads as a complete journalistic clause rather than a
615
- // mid-sentence prose snippet. Iterate boundaries in priority order;
616
- // when a candidate falls in the window, break there and drop the
617
- // ellipsis since the result is grammatically complete.
618
- const search = text.slice(0, TITLE_MAX_LENGTH);
619
- for (const boundary of HEADLINE_CLAUSE_BOUNDARIES) {
620
- const idx = search.lastIndexOf(boundary);
621
- if (idx >= HEADLINE_SOFT_MIN) {
622
- const clean = stripTrailingStopWordsAndPunctuation(text.slice(0, idx));
623
- if (clean.length >= HEADLINE_SOFT_MIN)
624
- return clean;
625
- }
626
- }
627
- const cut = text.slice(0, TITLE_MAX_LENGTH - 1);
628
- const lastSpace = cut.lastIndexOf(' ');
629
- let safe = lastSpace > TITLE_MAX_LENGTH - 40 ? cut.slice(0, lastSpace) : cut;
630
- safe = stripTrailingStopWordsAndPunctuation(safe);
631
- return `${safe}…`;
632
- }
633
- /**
634
- * Return the first complete sentence from a prose paragraph, suitable
635
- * for use as a fallback editorial title when the artefact H1 is
636
- * categorical (e.g. `# EU Parliament Committee Reports`) and the
637
- * resolver must derive `<title>` from the BLUF / lede summary instead.
638
- *
639
- * A "sentence" is the prefix up to the first sentence-terminator
640
- * (`. `, `! `, `? `, `; `) inside the `[HEADLINE_SOFT_MIN,
641
- * TITLE_MAX_LENGTH * 1.5]` window. Common abbreviations (`Q1.`, `Q2.`,
642
- * `H1.`, `H2.`, `Mr.`, `Mrs.`, `e.g.`, `i.e.`, `vs.`) are skipped
643
- * so they don't terminate the sentence prematurely. When no
644
- * acceptable terminator exists in the window, returns the entire
645
- * input unchanged so {@link truncateTitle} can handle clause-boundary
646
- * truncation downstream.
647
- *
648
- * This produces journalistically clean titles even for the
649
- * propositions / committee-reports cases where the BLUF paragraph
650
- * opens with a single long sentence that exceeds 140 chars —
651
- * `truncateTitle` then breaks on a clause boundary, and the result is
652
- * still grammatical because the input was a sentence prefix rather
653
- * than an arbitrary paragraph slice.
654
- *
655
- * @param paragraph - Prose paragraph (post-{@link stripInlineMarkdown})
656
- * @returns First sentence, or the original paragraph when none can be
657
- * identified within the soft-min window
658
- */
659
- export function extractFirstSentence(paragraph) {
660
- const trimmed = paragraph.trim();
661
- if (trimmed.length <= HEADLINE_SOFT_MIN)
662
- return trimmed;
663
- // Limit terminator search to TITLE_MAX_LENGTH * 1.5 — beyond that
664
- // we'd rather let truncateTitle clause-truncate the original
665
- // paragraph than return a too-long first sentence.
666
- const window = trimmed.slice(0, Math.floor(TITLE_MAX_LENGTH * 1.5));
667
- // Skip common abbreviations that contain a period inside a token
668
- // (Q1., e.g., i.e., vs., Mr., Mrs., No., U.S., E.U.). We walk
669
- // candidate terminator positions; a position counts only when the
670
- // char before it is *not* part of a known abbreviation token.
671
- const terminators = ['. ', '! ', '? ', '; '];
672
- let bestIdx = -1;
673
- for (const t of terminators) {
674
- let from = HEADLINE_SOFT_MIN;
675
- let idx;
676
- while ((idx = window.indexOf(t, from)) !== -1) {
677
- if (!isAbbreviationBoundary(window, idx) && idx < window.length - 1) {
678
- if (bestIdx === -1 || idx < bestIdx)
679
- bestIdx = idx;
680
- break;
681
- }
682
- from = idx + t.length;
683
- }
684
- }
685
- if (bestIdx >= HEADLINE_SOFT_MIN) {
686
- return trimmed.slice(0, bestIdx + 1).trim();
687
- }
688
- return trimmed;
689
- }
690
- /**
691
- * Abbreviation tokens (lowercase, including the trailing period) that
692
- * should NOT count as sentence terminators when {@link extractFirstSentence}
693
- * scans for a `.` boundary. Single-letter all-caps initials
694
- * (`U.S.`, `E.U.`) are handled by the all-caps-initial check below.
695
- */
696
- const ABBREVIATION_PREFIXES = [
697
- 'mr.',
698
- 'mrs.',
699
- 'ms.',
700
- 'dr.',
701
- 'st.',
702
- 'no.',
703
- 'vs.',
704
- 'e.g.',
705
- 'i.e.',
706
- 'etc.',
707
- 'cf.',
708
- 'al.',
709
- // EP fiscal-year and quarter shorthand: Q1., Q2., Q3., Q4., H1., H2., FY.
710
- 'q1.',
711
- 'q2.',
712
- 'q3.',
713
- 'q4.',
714
- 'h1.',
715
- 'h2.',
716
- 'fy.',
717
- ];
718
- /**
719
- * Check whether the character preceding the `.` at `idx` in `text`
720
- * indicates an abbreviation (so the `.` is not a sentence terminator).
721
- * Matches the {@link ABBREVIATION_PREFIXES} table and the all-caps
722
- * single-letter initials pattern (`U.S.`, `E.U.`).
723
- *
724
- * @param text - Source text in its original mixed case (the candidate token is lowercased internally before the table lookup)
725
- * @param idx - Index of the `.` character in `text`
726
- * @returns `true` when the period at `idx` is part of an abbreviation
727
- */
728
- function isAbbreviationBoundary(text, idx) {
729
- // All-caps single-letter initial like `U.S.` or `E.U.` — char at
730
- // idx-1 is a capital letter, and idx-2 is either start of string,
731
- // whitespace, or another single-letter+period pair.
732
- if (idx >= 1) {
733
- const prev = text.charCodeAt(idx - 1);
734
- const isUpperLetter = prev >= 65 && prev <= 90;
735
- if (isUpperLetter && (idx === 1 || text[idx - 2] === ' ' || text[idx - 2] === '.')) {
736
- return true;
737
- }
738
- }
739
- // ABBREVIATION_PREFIXES lookup — scan backwards from `.` to find the
740
- // start of the word, then compare lowercased.
741
- let start = idx;
742
- while (start > 0 && /[a-zA-Z]/u.test(text[start - 1] ?? ''))
743
- start--;
744
- const token = text.slice(start, idx + 1).toLowerCase();
745
- return ABBREVIATION_PREFIXES.includes(token);
746
- }
747
290
  /**
748
291
  * Return the first Markdown H1 (`# …`) in the supplied text, stripped of
749
292
  * the leading `#` and trailing anchor syntax. Returns an empty string when
@@ -905,6 +448,41 @@ export function extractLedeAfterHeading(markdown) {
905
448
  return '';
906
449
  return truncateDescription(buf.lines.join(' '));
907
450
  }
451
+ /**
452
+ * Same parsing rules as {@link extractLedeAfterHeading} but with a
453
+ * larger byte budget so the full BLUF paragraph (typically 200-300
454
+ * characters in the editorial style guide) is captured for use as
455
+ * `og:description` / `twitter:description`. Returns the joined
456
+ * paragraph clamped via {@link truncateExtendedDescription} (which
457
+ * returns `''` when the result wouldn't be longer than the regular
458
+ * meta description).
459
+ *
460
+ * @param markdown - Brief body (SPDX preamble already stripped)
461
+ * @returns Extended lede paragraph, or `''` when not worth emitting
462
+ */
463
+ export function extractExtendedLedeAfterHeading(markdown) {
464
+ const state = { inFence: false, inLede: false };
465
+ const buf = { lines: [], byteCount: 0 };
466
+ for (const raw of markdown.split('\n')) {
467
+ const line = raw.trim();
468
+ const directive = classifyLedeLine(line, state.inFence, state.inLede, buf.lines.length > 0);
469
+ const action = applyLedeDirective(directive, state, buf.lines.length > 0);
470
+ if (action === 'break')
471
+ break;
472
+ if (action === 'continue')
473
+ continue;
474
+ const collect = collectProseLine(line, buf);
475
+ if (collect === 'continue')
476
+ continue;
477
+ if (collect === 'break')
478
+ break;
479
+ if (buf.byteCount >= EXTENDED_DESCRIPTION_MAX_LENGTH)
480
+ break;
481
+ }
482
+ if (buf.lines.length === 0)
483
+ return '';
484
+ return truncateExtendedDescription(buf.lines.join(' '));
485
+ }
908
486
  /**
909
487
  * Normalise a Markdown heading's text for comparison against the
910
488
  * editorial-lede heading whitelist. Strips inline Markdown decorations
@@ -2112,189 +1690,12 @@ function templateForType(lang, articleType, inputs) {
2112
1690
  };
2113
1691
  }
2114
1692
  }
2115
- /** Milliseconds in one UTC day used by date-window derivation helpers. */
2116
- const MS_PER_DAY = 86_400_000;
2117
- /**
2118
- * Parse an ISO date and return the `[start, end]` week range as ISO
2119
- * strings. Week starts on Monday and ends on the following Sunday.
2120
- *
2121
- * @param date - ISO date string (`YYYY-MM-DD`)
2122
- * @returns `{ start, end }` both in `YYYY-MM-DD` form
2123
- */
2124
- export function deriveWeekRange(date) {
2125
- const parsed = parseIsoDate(date);
2126
- if (!parsed)
2127
- return { start: date, end: date };
2128
- const day = parsed.getUTCDay();
2129
- const shift = (day + 6) % 7;
2130
- const startMs = parsed.getTime() - shift * MS_PER_DAY;
2131
- const endMs = startMs + 6 * MS_PER_DAY;
2132
- return { start: formatIsoDate(new Date(startMs)), end: formatIsoDate(new Date(endMs)) };
2133
- }
2134
- /**
2135
- * Return the D-36 → D-8 reporting window for the `week-in-review`
2136
- * article type. EP roll-call voting data is published with a 2–6 week
2137
- * lag, so using the most-recent 7 days structurally produces a
2138
- * vote-empty dataset. Shifting 8 days back and widening to 28 days
2139
- * (start = D-36, end = D-8) ensures the window always contains at
2140
- * least one full EP plenary week with published roll-call data
2141
- * (ADR-006). Direction is consistent with the workflow's
2142
- * `DATE_FROM` (start = D-36) → `DATE_TO` (end = D-8) variables.
2143
- *
2144
- * @param date - ISO article date string (`YYYY-MM-DD`) — typically TODAY
2145
- * @returns `{ start: D-36, end: D-8 }` both as `YYYY-MM-DD` ISO strings
2146
- */
2147
- export function deriveReportingWindowForWeekInReview(date) {
2148
- const parsed = parseIsoDate(date);
2149
- if (!parsed)
2150
- return { start: date, end: date };
2151
- return {
2152
- start: formatIsoDate(new Date(parsed.getTime() - 36 * MS_PER_DAY)),
2153
- end: formatIsoDate(new Date(parsed.getTime() - 8 * MS_PER_DAY)),
2154
- };
2155
- }
2156
- /**
2157
- * Return a human-friendly month label for an ISO date — English month
2158
- * name + four-digit year (e.g. `April 2026`). The non-English template
2159
- * generators accept this same label verbatim because they interpolate it
2160
- * into a localized sentence rather than translating the month itself.
2161
- *
2162
- * @param date - ISO date string
2163
- * @returns Month label, or the input when parsing fails
2164
- */
2165
- export function deriveMonthLabel(date) {
2166
- const parsed = parseIsoDate(date);
2167
- if (!parsed)
2168
- return date;
2169
- const monthNames = [
2170
- 'January',
2171
- 'February',
2172
- 'March',
2173
- 'April',
2174
- 'May',
2175
- 'June',
2176
- 'July',
2177
- 'August',
2178
- 'September',
2179
- 'October',
2180
- 'November',
2181
- 'December',
2182
- ];
2183
- const name = monthNames[parsed.getUTCMonth()] ?? '';
2184
- return `${name} ${parsed.getUTCFullYear()}`.trim();
2185
- }
2186
- /**
2187
- * Return a quarter label for an ISO date — `Q<n> <YYYY>` (e.g. `Q2 2026`).
2188
- * Used by `quarter-ahead` and `quarter-in-review` title generators.
2189
- *
2190
- * @param date - ISO date string
2191
- * @returns Quarter label, or the input when parsing fails
2192
- */
2193
- export function deriveQuarterLabel(date) {
2194
- const parsed = parseIsoDate(date);
2195
- if (!parsed)
2196
- return date;
2197
- const quarter = Math.floor(parsed.getUTCMonth() / 3) + 1;
2198
- return `Q${quarter} ${parsed.getUTCFullYear()}`;
2199
- }
2200
- /**
2201
- * Return a four-digit year label for an ISO date. Used by `year-ahead`
2202
- * and `year-in-review` title generators.
2203
- *
2204
- * @param date - ISO date string
2205
- * @returns Year label, or the input when parsing fails
2206
- */
2207
- export function deriveYearLabel(date) {
2208
- const parsed = parseIsoDate(date);
2209
- if (!parsed)
2210
- return date;
2211
- return String(parsed.getUTCFullYear());
2212
- }
2213
- /**
2214
- * EP-term constants — keep these in sync with
2215
- * {@link analysis/methodologies/electoral-cycle-methodology.md}.
2216
- * - EP10: 16 Jul 2024 → ~end of June 2029
2217
- * - EP11: ~Jul 2029 → ~Jun 2034
2218
- */
2219
- const EP10_START_YEAR = 2024;
2220
- const EP10_END_YEAR = 2029;
2221
- const EP11_END_YEAR = 2034;
2222
- const EP_ELECTION_MONTH = 6; // June
2223
- /**
2224
- * Return the EP-term label for an ISO date — `EP10 → 2029` or `EP11 → 2034`.
2225
- * Used by `term-outlook` title generator.
2226
- *
2227
- * @param date - ISO date string
2228
- * @returns Term label, or the input when parsing fails
2229
- */
2230
- export function deriveTermLabel(date) {
2231
- const parsed = parseIsoDate(date);
2232
- if (!parsed)
2233
- return date;
2234
- const year = parsed.getUTCFullYear();
2235
- const month = parsed.getUTCMonth() + 1;
2236
- if (year < EP10_START_YEAR)
2237
- return `EP9 → ${EP10_START_YEAR}`;
2238
- if (year < EP10_END_YEAR || (year === EP10_END_YEAR && month <= EP_ELECTION_MONTH)) {
2239
- return `EP10 → ${EP10_END_YEAR}`;
2240
- }
2241
- if (year < EP11_END_YEAR || (year === EP11_END_YEAR && month <= EP_ELECTION_MONTH)) {
2242
- return `EP11 → ${EP11_END_YEAR}`;
2243
- }
2244
- const yearsBeyond = year - EP11_END_YEAR;
2245
- const offset = month <= EP_ELECTION_MONTH ? 0 : 1;
2246
- const termsBeyond = Math.floor((yearsBeyond - 1 + offset) / 5) + 1;
2247
- const termIndex = 11 + termsBeyond;
2248
- const termEnd = EP11_END_YEAR + 5 * termsBeyond;
2249
- return `EP${termIndex} → ${termEnd}`;
2250
- }
2251
- /**
2252
- * Return the election-cycle label for an ISO date — pairs the outgoing
2253
- * and incoming EP terms with the election year (e.g. `EP10 → EP11 (2029)`).
2254
- * Used by the `election-cycle` title generator.
2255
- *
2256
- * @param date - ISO date string
2257
- * @returns Cycle label, or the input when parsing fails
2258
- */
2259
- export function deriveElectionCycleLabel(date) {
2260
- const parsed = parseIsoDate(date);
2261
- if (!parsed)
2262
- return date;
2263
- const year = parsed.getUTCFullYear();
2264
- if (year <= EP10_END_YEAR)
2265
- return `EP10 → EP11 (${EP10_END_YEAR})`;
2266
- if (year <= EP11_END_YEAR)
2267
- return `EP11 → EP12 (${EP11_END_YEAR})`;
2268
- const cyclesBeyond = Math.ceil((year - EP11_END_YEAR) / 5);
2269
- const electionYear = EP11_END_YEAR + 5 * cyclesBeyond;
2270
- const out = 11 + cyclesBeyond;
2271
- return `EP${out} → EP${out + 1} (${electionYear})`;
2272
- }
2273
- /**
2274
- * Parse an ISO date string as UTC midnight. Returns `null` for malformed
2275
- * input so callers can skip month/week derivation gracefully.
2276
- *
2277
- * @param iso - ISO date string
2278
- * @returns Parsed `Date` or `null`
2279
- */
2280
- function parseIsoDate(iso) {
2281
- if (!/^\d{4}-\d{2}-\d{2}$/.test(iso))
2282
- return null;
2283
- const parsed = new Date(`${iso}T00:00:00Z`);
2284
- return Number.isNaN(parsed.getTime()) ? null : parsed;
2285
- }
2286
- /**
2287
- * Format a `Date` as `YYYY-MM-DD` in UTC.
2288
- *
2289
- * @param d - Date object
2290
- * @returns ISO date string
2291
- */
2292
- function formatIsoDate(d) {
2293
- const y = d.getUTCFullYear();
2294
- const m = String(d.getUTCMonth() + 1).padStart(2, '0');
2295
- const day = String(d.getUTCDate()).padStart(2, '0');
2296
- return `${y}-${m}-${day}`;
2297
- }
1693
+ // Date-label helpers extracted into the `metadata/` bounded context as
1694
+ // pure leaf-module functions. Re-exported here for back-compat with
1695
+ // existing call sites; new code should import directly from
1696
+ // `./metadata/date-labels.js`.
1697
+ import { deriveWeekRange, deriveReportingWindowForWeekInReview, deriveMonthLabel, deriveQuarterLabel, deriveYearLabel, deriveTermLabel, deriveElectionCycleLabel, } from './metadata/date-labels.js';
1698
+ export { deriveWeekRange, deriveReportingWindowForWeekInReview, deriveMonthLabel, deriveQuarterLabel, deriveYearLabel, deriveTermLabel, deriveElectionCycleLabel, } from './metadata/date-labels.js';
2298
1699
  /**
2299
1700
  * Extract a manifest override value for a single language. Accepts either
2300
1701
  * a plain string (applied to every language) or a `LanguageMap` object.
@@ -2334,6 +1735,7 @@ function resolveEditorialContent(opts) {
2334
1735
  return {
2335
1736
  headline: highlight.headline,
2336
1737
  summary: highlight.summary,
1738
+ extendedSummary: extractExtendedLedeAfterHeading(markdown),
2337
1739
  };
2338
1740
  }
2339
1741
  if (highlight?.summary) {
@@ -2342,10 +1744,12 @@ function resolveEditorialContent(opts) {
2342
1744
  }
2343
1745
  const aggregatedH1 = extractFirstH1(markdown);
2344
1746
  const aggregatedSummary = extractStrongProseLine(markdown);
1747
+ const aggregatedExtended = extractExtendedLedeAfterHeading(markdown);
2345
1748
  if (aggregatedH1 && !isGenericHeading(aggregatedH1, articleType, date)) {
2346
1749
  return {
2347
1750
  headline: truncateTitle(aggregatedH1),
2348
1751
  summary: artefactSummary || aggregatedSummary,
1752
+ extendedSummary: aggregatedExtended,
2349
1753
  };
2350
1754
  }
2351
1755
  const summary = artefactSummary || aggregatedSummary;
@@ -2357,9 +1761,13 @@ function resolveEditorialContent(opts) {
2357
1761
  // to clause-boundary truncation downstream when the sentence
2358
1762
  // itself overruns TITLE_MAX_LENGTH.
2359
1763
  const firstSentence = extractFirstSentence(summary);
2360
- return { headline: truncateTitle(firstSentence), summary };
1764
+ return {
1765
+ headline: truncateTitle(firstSentence),
1766
+ summary,
1767
+ extendedSummary: aggregatedExtended,
1768
+ };
2361
1769
  }
2362
- return { headline: '', summary: '' };
1770
+ return { headline: '', summary: '', extendedSummary: '' };
2363
1771
  }
2364
1772
  /**
2365
1773
  * Pick the per-language SEO title from the resolved editorial pair and
@@ -2603,10 +2011,23 @@ function resolveOneLanguage(input) {
2603
2011
  : composeContextualDescription(input.lang, rawDescription, editorial, input.date, input.runId);
2604
2012
  const truncatedTitle = truncateTitle(title);
2605
2013
  const truncatedDescription = truncateDescription(description);
2014
+ // The extended description tracks the same source as the short
2015
+ // description: when a manifest description overrides, use it
2016
+ // verbatim (no point synthesising an extended form from the brief
2017
+ // when the editor explicitly chose the manifest copy); otherwise
2018
+ // use the editorial extended summary lifted from the brief BLUF.
2019
+ // `truncateExtendedDescription` returns `''` when the candidate
2020
+ // wouldn't be longer than the regular meta description, so callers
2021
+ // can fall back to {@link description} via a simple `||`.
2022
+ const extendedSource = manifestDescription
2023
+ ? manifestDescription
2024
+ : editorial.extendedSummary || rawDescription;
2025
+ const truncatedExtendedDescription = truncateExtendedDescription(extendedSource);
2606
2026
  const source = manifestTitle || manifestDescription ? 'manifest' : perLanguage.source;
2607
2027
  return {
2608
2028
  title: truncatedTitle,
2609
2029
  description: truncatedDescription,
2030
+ extendedDescription: truncatedExtendedDescription,
2610
2031
  keywords: buildSeoKeywords(input.lang, input.articleType, input.date, input.runId, truncatedTitle, truncatedDescription),
2611
2032
  source,
2612
2033
  };
@@ -2641,6 +2062,7 @@ function resolvePerLanguageEditorial(input) {
2641
2062
  editorial: {
2642
2063
  headline: localized.headline,
2643
2064
  summary: localized.summary,
2065
+ extendedSummary: localized.extendedSummary,
2644
2066
  },
2645
2067
  source: 'localized-brief',
2646
2068
  };
@@ -2656,7 +2078,7 @@ function resolvePerLanguageEditorial(input) {
2656
2078
  // Nothing editorial at all → caller will fall back to the localized
2657
2079
  // template.
2658
2080
  return {
2659
- editorial: { headline: '', summary: '' },
2081
+ editorial: { headline: '', summary: '', extendedSummary: '' },
2660
2082
  source: 'template',
2661
2083
  };
2662
2084
  }