npm - euparliamentmonitor - Versions diffs - 0.9.22 → 0.9.24 - Mend

euparliamentmonitor 0.9.22 → 0.9.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

package/package.json +2 -1
package/scripts/aggregator/article-metadata.js +69 -14
package/scripts/aggregator/editorial-brief-resolver.js +23 -0
package/scripts/aggregator/generator/slug.js +0 -22
package/scripts/aggregator/html/headline.d.ts +41 -9
package/scripts/aggregator/html/headline.js +69 -10
package/scripts/aggregator/html/shell.js +73 -17
package/scripts/aggregator/manifest/index.d.ts +1 -1
package/scripts/aggregator/manifest/index.js +1 -1
package/scripts/aggregator/manifest/resolver.d.ts +28 -1
package/scripts/aggregator/manifest/resolver.js +61 -5
package/scripts/aggregator/markdown-renderer.js +11 -0
package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
package/scripts/aggregator/metadata/artifact-walker.js +27 -8
package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
package/scripts/aggregator/metadata/brief-body.js +69 -0
package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
package/scripts/aggregator/metadata/heading-rules.js +78 -270
package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
package/scripts/aggregator/metadata/keyword-filters.js +156 -0
package/scripts/aggregator/metadata/lede-extractor.js +11 -2
package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
package/scripts/aggregator/metadata/seo-budgets.js +202 -0
package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
package/scripts/aggregator/metadata/text-truncate.js +277 -0
package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
package/scripts/aggregator/metadata/text-utils.js +119 -439
package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
package/scripts/aggregator/metadata/title-rejection.js +179 -0
package/scripts/dump-article-seo.js +75 -2
package/scripts/fix-mermaid-diagrams.js +931 -0
package/scripts/validate-article-seo.js +534 -0
package/scripts/validate-mermaid-diagrams.js +306 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "euparliamentmonitor",
-  "version": "0.9.22",
+  "version": "0.9.24",
   "type": "module",
   "description": "European Parliament Intelligence Platform - Monitor political activity with systematic transparency",
   "main": "scripts/index.js",
@@ -66,6 +66,7 @@
     "discover:untranslated-briefs": "node scripts/discover-untranslated-briefs.js",
     "validate:translations": "node scripts/validate-brief-translations.js",
     "validate:manifest-seo": "node scripts/validate-manifest-seo.js",
+    "validate:seo": "node scripts/validate-article-seo.js",
     "sync:templates": "node scripts/templates/sync-template-frontmatter.js",
     "sync:templates:check": "node scripts/templates/sync-template-frontmatter.js --check",
     "prior-run-diff": "node scripts/aggregator/prior-run-diff.js",

package/scripts/aggregator/article-metadata.js CHANGED Viewed

@@ -64,7 +64,7 @@
 import { ALL_LANGUAGES } from '../constants/language-core.js';
 import { resolveLocalizedBriefHighlight } from './editorial-brief-resolver.js';
 import { buildTemplateFallback } from './metadata/template-fallback.js';
-import { buildSeoKeywords, composeContextualDescription, composeContextualTitle, manifestOverrideFor, pickFirstNonEmpty, resolveEditorialContent, } from './metadata/resolve-helpers.js';
+import { buildSeoKeywords, composeContextualDescription, composeContextualExtendedDescription, composeContextualTitle, deriveHeadlineFromSummary, hasLeakySeoToken, isUsableResolvedTitle, manifestOverrideFor, pickFirstNonEmpty, resolveEditorialContent, sanitizeDescriptionCandidate, } from './metadata/resolve-helpers.js';
 import { ENRICHMENT_TRIGGER_LENGTH, truncateDescription, truncateExtendedDescription, truncateTitle, } from './metadata/text-utils.js';
 export { shouldSkipDescriptionLine, stripLeadingProseLabel, stripInlineMarkdown, truncateDescription, truncateExtendedDescription, truncateTitle, extractFirstSentence, } from './metadata/text-utils.js';
 export { isArtifactCategoryHeading, stripArtifactCategoryAffix, isGenericHeading, } from './metadata/heading-rules.js';
@@ -109,6 +109,7 @@ export function resolveArticleMetadata(opts) {
     }
     return result;
 }
+const LOCALIZED_BRIEF_SOURCE = 'localized-brief';
 /**
  * Resolve `{title, description, keywords, source}` for one language.
  *
@@ -122,20 +123,74 @@ function resolveOneLanguage(input) {
     const editorial = perLanguage.editorial;
     const contextualTitle = composeContextualTitle(input.template.title, editorial.headline, input.runId);
     const title = pickFirstNonEmpty([manifestTitle, contextualTitle, input.template.title]);
-    const rawDescription = pickFirstNonEmpty([
-        manifestDescription,
-        editorial.summary,
-        input.template.subtitle,
+    const rawDescription = sanitizeDescriptionCandidate(pickFirstNonEmpty([manifestDescription, editorial.summary, input.template.subtitle]));
+    const safeEditorial = {
+        headline: isUsableResolvedTitle(editorial.headline) ? editorial.headline.trim() : '',
+        summary: sanitizeDescriptionCandidate(editorial.summary),
+        extendedSummary: sanitizeDescriptionCandidate(editorial.extendedSummary),
+    };
+    const normalizedRawDescription = rawDescription || sanitizeDescriptionCandidate(input.template.subtitle);
+    const skipEnrichment = perLanguage.source === LOCALIZED_BRIEF_SOURCE && normalizedRawDescription.length > 0;
+    const description = skipEnrichment || normalizedRawDescription.length >= ENRICHMENT_TRIGGER_LENGTH
+        ? normalizedRawDescription
+        : composeContextualDescription(input.lang, normalizedRawDescription, safeEditorial, input.date, input.runId);
+    const clippedTitle = truncateTitle(title).trim();
+    const explicitTitle = manifestTitle && !hasLeakySeoToken(manifestTitle) ? truncateTitle(manifestTitle).trim() : '';
+    const allowShortResolvedTitle = perLanguage.source === LOCALIZED_BRIEF_SOURCE;
+    const resolvedTitleCandidate = clippedTitle &&
+        !hasLeakySeoToken(clippedTitle) &&
+        (allowShortResolvedTitle || isUsableResolvedTitle(clippedTitle))
+        ? clippedTitle
+        : '';
+    const summaryDerivedTitle = deriveHeadlineFromSummary(safeEditorial.summary || normalizedRawDescription);
+    // `truncateTitle` returns '' when an editorial title overruns the
+    // budget with no acceptable clause boundary — fall back to the
+    // localized template title in that case so we never emit an empty
+    // `<title>`. Live regression: 2026-05-22 breaking
+    // `AI Trade Strategy: A Legislative First with Structural…` clipped
+    // to '' after the no-ellipsis guard landed; template fallback
+    // (`Extended Executive Brief — Breaking News`) is preferable to a
+    // blank `<title>`.
+    //
+    // The fallback path passes the template title back through
+    // {@link composeContextualTitle} (with an empty editorial headline)
+    // so `withRunQualifier` re-appends the `— Run N` suffix. Without
+    // this, two same-date / same-articleType runs (republish, hot-fix
+    // re-run) would collapse to byte-identical `<title>` strings, and
+    // the duplicate-title gate in `scripts/validate-article-seo.js`
+    // would (correctly) fail CI.
+    const contextualFallback = composeContextualTitle(input.template.title, '', input.runId);
+    const truncatedTitle = pickFirstNonEmpty([
+        explicitTitle,
+        resolvedTitleCandidate,
+        isUsableResolvedTitle(summaryDerivedTitle, { allowFullSentence: true })
+            ? summaryDerivedTitle
+            : '',
+        truncateTitle(contextualFallback),
+        contextualFallback,
     ]);
-    const description = rawDescription.length >= ENRICHMENT_TRIGGER_LENGTH
-        ? rawDescription
-        : composeContextualDescription(input.lang, rawDescription, editorial, input.date, input.runId);
-    const truncatedTitle = truncateTitle(title);
     const truncatedDescription = truncateDescription(description);
-    const extendedSource = manifestDescription
-        ? manifestDescription
-        : editorial.extendedSummary || rawDescription;
-    const truncatedExtendedDescription = truncateExtendedDescription(extendedSource);
+    const extendedSource = sanitizeDescriptionCandidate(manifestDescription || safeEditorial.extendedSummary || normalizedRawDescription);
+    // Two-tier extended-description resolution:
+    // 1. Direct truncation — preferred when the editorial source paragraph
+    //    is already ≥181 chars (the truncator's gating threshold). This
+    //    yields the highest-fidelity og:description text.
+    // 2. Contextual synthesis — when direct truncation returns '' (source
+    //    was too short), synthesize a longer string by stitching together
+    //    `<source> + Date: YYYY-MM-DD + Context: <editorial> + <reader>`.
+    //    This is the **only** SEO path that surfaces the localized
+    //    "for democratic-accountability readers …" framing (the short
+    //    <meta description> no longer carries it — see comment in
+    //    {@link composeContextualDescription}). The synthesized string is
+    //    re-clamped to the 200–300 char og:description budget.
+    //
+    // Live regression (2026-05): 56 breaking briefs shipped with empty
+    // extendedDescription because their lead paragraph was only 80–150
+    // chars. AI-overview and Discover surfaces dropped them entirely.
+    let truncatedExtendedDescription = truncateExtendedDescription(extendedSource);
+    if (!truncatedExtendedDescription) {
+        truncatedExtendedDescription = composeContextualExtendedDescription(input.lang, extendedSource || normalizedRawDescription, safeEditorial, input.date);
+    }
     const source = manifestTitle || manifestDescription ? 'manifest' : perLanguage.source;
     return {
         title: truncatedTitle,
@@ -163,7 +218,7 @@ function resolvePerLanguageEditorial(input) {
                     summary: localized.summary,
                     extendedSummary: localized.extendedSummary,
                 },
-                source: 'localized-brief',
+                source: LOCALIZED_BRIEF_SOURCE,
             };
         }
     }

package/scripts/aggregator/editorial-brief-resolver.js CHANGED Viewed

@@ -30,6 +30,7 @@
 import fs from 'fs';
 import path from 'path';
 import { extractFirstH1, extractLedeAfterHeading, extractExtendedLedeAfterHeading, extractStrongProseLine, isGenericHeading, stripArtifactCategoryAffix, truncateTitle, } from './article-metadata.js';
+import { extractBriefingHighlight } from './metadata/briefing-highlight.js';
 /**
  * Run-relative candidate paths for a translated brief, in precedence
  * order. Mirrors the `executive-brief.md` → `extended/executive-brief.md`
@@ -174,6 +175,28 @@ export function resolveLocalizedBriefHighlight(runDir, lang, articleType, date)
         const body = readArtefactBody(abs);
         if (!body)
             continue;
+        // Tier 1 (NEW, May-2026): structural extraction of `## Strategic
+        // Intelligence Summary` / `## Reader Briefing` sections. The
+        // briefing extractor does not depend on the language of the body
+        // prose: it matches only on the English section headings, which
+        // the translation pipeline preserves verbatim under the localized
+        // brief contract. A Swedish brief whose synthesis section is
+        // still headed `## Strategic Intelligence Summary` (with
+        // translated body prose) therefore resolves correctly here. When
+        // the translator has also localized the section heading, this
+        // tier yields no headline or summary and resolution falls
+        // through to the legacy lede/H1 path below, producing the
+        // localized H1 as headline.
+        const briefing = extractBriefingHighlight(body);
+        if (briefing && (briefing.headline || briefing.summary)) {
+            const fallbackHeadline = deriveHeadline(body, articleType, date);
+            return {
+                headline: briefing.headline || fallbackHeadline,
+                summary: briefing.summary,
+                extendedSummary: briefing.extendedSummary || extractExtendedLedeAfterHeading(body),
+                sourceFile: rel,
+                sourceLang: lang,
+            };
+        }
         const headline = deriveHeadline(body, articleType, date);
         const lede = extractLedeAfterHeading(body);
         const summary = lede || extractStrongProseLine(body);

package/scripts/aggregator/generator/slug.js CHANGED Viewed

@@ -37,27 +37,6 @@ export function buildArticleSlug(date, articleType, runSuffix) {
 export function sanitizeRunSuffix(runId) {
     return _sanitizeRunSuffix(runId);
 }
-/**
- * Return `true` when a line should be skipped when hunting for the default
- * description. Thin wrapper preserved for back-compat — real logic lives
- * in `src/aggregator/article-metadata.ts`'s `shouldSkipDescriptionLine`.
- *
- * @param line - Trimmed line from the aggregated Markdown source
- * @returns `true` when the line is not prose and should be skipped
- */
-function shouldSkipDescriptionLine(line) {
-    if (line.length === 0)
-        return true;
-    if (line.startsWith('#'))
-        return true;
-    if (line.startsWith('>'))
-        return true;
-    if (line.startsWith('<'))
-        return true;
-    if (line.startsWith('|'))
-        return true;
-    return false;
-}
 /** Description used when no prose paragraph qualifies. */
 const FALLBACK_DESCRIPTION = 'EU Parliament intelligence summary derived from committed analysis artifacts.';
 /**
@@ -73,7 +52,6 @@ const FALLBACK_DESCRIPTION = 'EU Parliament intelligence summary derived from co
  * @returns Plain-text description, truncated to ≤300 characters
  */
 export function extractDefaultDescription(markdown) {
-    void shouldSkipDescriptionLine;
     const strong = extractStrongProseLine(markdown);
     return strong.length > 0 ? strong : FALLBACK_DESCRIPTION;
 }

package/scripts/aggregator/html/headline.d.ts CHANGED Viewed

@@ -1,4 +1,5 @@
-import type { LanguageCode } from '../../types/index.js';
+import type { LanguageCode, LanguageMap } from '../../types/index.js';
+import { type SeoSurface } from '../metadata/seo-budgets.js';
 /**
  * Resolve a localized article type label *without* the leading icon
  * emoji. Used for the OpenGraph `article:section` meta and the JSON-LD
@@ -28,19 +29,50 @@ export declare const HEADLINE_LIMIT = 110;
 export declare function truncateHeadline(title: string): string;
 /**
  * Build the localized `<title>` separator for the
- * `{articleTitle} {sep} {siteTitle}` pattern. LTR locales use the
- * right-pointing guillemet (»); RTL locales (Arabic, Hebrew) use the
- * left-pointing guillemet («) so the visual hierarchy reads from the
- * primary title towards the site name without breaking bidi flow.
+ * `{articleTitle} {sep} {siteTitle}` pattern.
  *
- * The previous em-dash separator collided with em-dashes inside
- * article titles (the editorial style uses `Title — Subtitle`) and
- * rendered ambiguously in screen readers.
+ * Latin scripts use the policy-mandated ASCII pipe (`" | "`), which
+ * scans cleanly in SERP cards and never collides with em-dashes that
+ * the editorial style routinely uses inside titles. CJK locales use
+ * the Katakana middle-dot (`" ・ "`, U+30FB) which is the documented
+ * Google CJK separator and renders correctly in JP / KO / ZH fonts.
+ * RTL locales use the Hebrew paseq (`" ׀ "`, U+05C0) — a vertical
+ * stroke that preserves bidi flow without injecting a Latin guillemet
+ * that would force a direction change mid-title.
  *
  * @param lang - Target language code
- * @returns `" » "` for LTR locales, `" « "` for RTL
+ * @returns Per-script separator
  */
 export declare function getTitleSeparator(lang: LanguageCode): string;
+/**
+ * Short brand fallback per script family. Used by
+ * {@link buildPageTitle} when the full `SITE_NAME` would push the
+ * `<title>` past the SERP budget but a shorter variant would fit.
+ *
+ * - Latin → "EPM" (3 chars, ASCII-safe in news cards)
+ * - CJK → "EPM" (Latin abbreviation reads correctly in JP / KO / ZH SERPs)
+ * - RTL → "EPM" (the same Latin abbreviation is used for both Arabic and
+ *   Hebrew SERP cards; Bing/Google render the Latin token RTL-isolated)
+ *
+ * Per-locale overrides live in {@link SHORT_SITE_NAMES} below so a
+ * future editorial change (e.g. a registered Arabic brand) only
+ * touches the table.
+ */
+export declare const SHORT_SITE_NAMES: LanguageMap;
+/**
+ * Compose a title for one SEO surface using the per-script byte
+ * budget from `metadata/seo-budgets.ts`. Drops the brand suffix when
+ * the article title alone fills the budget (better SERP than a
+ * truncated headline followed by a clipped brand) and falls through
+ * to a short-brand variant when that fits but the full one does not.
+ *
+ * @param title - Article title (plain text, already markdown-stripped)
+ * @param lang - Target language code
+ * @param siteTitle - Full brand suffix (e.g. "EU Parliament Monitor")
+ * @param surface - Optional SEO surface; defaults to `<title>` budget
+ * @returns Composed, budget-clamped title
+ */
+export declare function buildPageTitle(title: string, lang: LanguageCode, siteTitle: string, surface?: SeoSurface): string;
 /**
  * Resolve a localized article type label with icon. Falls back to the
  * humanised slug when a translation isn't available.

package/scripts/aggregator/html/headline.js CHANGED Viewed

@@ -7,8 +7,9 @@
  * icon), the page-title separator that respects bidi direction, and the
  * Schema.org-compatible truncated headline used in JSON-LD.
  */
-import { ARTICLE_TYPE_LABELS, ARTICLE_TYPE_ICONS, getLocalizedString, getTextDirection, } from '../../constants/languages.js';
+import { ARTICLE_TYPE_LABELS, ARTICLE_TYPE_ICONS, getLocalizedString, } from '../../constants/languages.js';
 import { ArticleCategory } from '../../types/index.js';
+import { classifyScript, clampTitleForSurface } from '../metadata/seo-budgets.js';
 /**
  * Resolve a localized article type label *without* the leading icon
  * emoji. Used for the OpenGraph `article:section` meta and the JSON-LD
@@ -51,20 +52,78 @@ export function truncateHeadline(title) {
 }
 /**
  * Build the localized `<title>` separator for the
- * `{articleTitle} {sep} {siteTitle}` pattern. LTR locales use the
- * right-pointing guillemet (»); RTL locales (Arabic, Hebrew) use the
- * left-pointing guillemet («) so the visual hierarchy reads from the
- * primary title towards the site name without breaking bidi flow.
+ * `{articleTitle} {sep} {siteTitle}` pattern.
  *
- * The previous em-dash separator collided with em-dashes inside
- * article titles (the editorial style uses `Title — Subtitle`) and
- * rendered ambiguously in screen readers.
+ * Latin scripts use the policy-mandated ASCII pipe (`" | "`), which
+ * scans cleanly in SERP cards and never collides with em-dashes that
+ * the editorial style routinely uses inside titles. CJK locales use
+ * the Katakana middle-dot (`" ・ "`, U+30FB) which is the documented
+ * Google CJK separator and renders correctly in JP / KO / ZH fonts.
+ * RTL locales use the Hebrew paseq (`" ׀ "`, U+05C0) — a vertical
+ * stroke that preserves bidi flow without injecting a Latin guillemet
+ * that would force a direction change mid-title.
  *
  * @param lang - Target language code
- * @returns `" » "` for LTR locales, `" « "` for RTL
+ * @returns Per-script separator
  */
 export function getTitleSeparator(lang) {
-    return getTextDirection(lang) === 'rtl' ? ' « ' : ' » ';
+    const family = classifyScript(lang);
+    if (family === 'cjk')
+        return ' ・ ';
+    if (family === 'rtl')
+        return ' ׀ ';
+    return ' | ';
+}
+/**
+ * Short brand fallback per script family. Used by
+ * {@link buildPageTitle} when the full `SITE_NAME` would push the
+ * `<title>` past the SERP budget but a shorter variant would fit.
+ *
+ * - Latin → "EPM" (3 chars, ASCII-safe in news cards)
+ * - CJK → "EPM" (Latin abbreviation reads correctly in JP / KO / ZH SERPs)
+ * - RTL → "EPM" (the same Latin abbreviation is used for both Arabic and
+ *   Hebrew SERP cards; Bing/Google render the Latin token RTL-isolated)
+ *
+ * Per-locale overrides live in {@link SHORT_SITE_NAMES} below so a
+ * future editorial change (e.g. a registered Arabic brand) only
+ * touches the table.
+ */
+export const SHORT_SITE_NAMES = {
+    en: 'EPM',
+    sv: 'EPM',
+    da: 'EPM',
+    no: 'EPM',
+    fi: 'EPM',
+    de: 'EPM',
+    fr: 'EPM',
+    es: 'EPM',
+    nl: 'EPM',
+    ar: 'EPM',
+    he: 'EPM',
+    ja: 'EPM',
+    ko: 'EPM',
+    zh: 'EPM',
+};
+/**
+ * Compose a title for one SEO surface using the per-script byte
+ * budget from `metadata/seo-budgets.ts`. Drops the brand suffix when
+ * the article title alone fills the budget (better SERP than a
+ * truncated headline followed by a clipped brand) and falls through
+ * to a short-brand variant when that fits but the full one does not.
+ *
+ * @param title - Article title (plain text, already markdown-stripped)
+ * @param lang - Target language code
+ * @param siteTitle - Full brand suffix (e.g. "EU Parliament Monitor")
+ * @param surface - Optional SEO surface; defaults to `<title>` budget
+ * @returns Composed, budget-clamped title
+ */
+export function buildPageTitle(title, lang, siteTitle, surface = 'title') {
+    const shortSiteTitle = getLocalizedString(SHORT_SITE_NAMES, lang);
+    return clampTitleForSurface(title, lang, surface, {
+        siteTitle,
+        shortSiteTitle,
+        separator: getTitleSeparator(lang),
+    });
 }
 /**
  * Resolve a localized article type label with icon. Falls back to the

package/scripts/aggregator/html/shell.js CHANGED Viewed

@@ -20,7 +20,8 @@ import { stripHtmlTags } from '../../utils/html-sanitize.js';
 import { buildResponsiveIconLinks, buildResponsiveSocialImageMeta, buildSiteFooter, buildSiteHeader, buildPageBanner, } from '../../templates/section-builders.js';
 import { getPoliticalIntelligenceFilename } from '../../generators/political-intelligence.js';
 import { getSitemapFilename } from '../../generators/sitemap/index.js';
-import { truncateHeadline, getTitleSeparator, getLocalizedArticleType, getLocalizedArticleTypePlain, } from './headline.js';
+import { truncateHeadline, getTitleSeparator, buildPageTitle, getLocalizedArticleType, getLocalizedArticleTypePlain, } from './headline.js';
+import { clampForBudget } from '../metadata/seo-budgets.js';
 import { getArticleFilename, buildArticleHreflangLinks, buildLanguageSwitcher, } from './hreflang.js';
 import { buildArticleToc } from './toc.js';
 import { blobUrl } from '../infra/github-urls.js';
@@ -28,6 +29,57 @@ import { blobUrl } from '../infra/github-urls.js';
 export const PUBLISHER_NAME = 'Hack23 AB';
 /** Site name used across meta tags and structured data. */
 export const SITE_NAME = 'EU Parliament Monitor';
+/**
+ * Compute the per-surface SEO-budget-clamped variants of the article
+ * title and description for a single render. See
+ * `analysis/methodologies/seo-headers-policy.md` § 1.1 for the
+ * documented sources of every cap.
+ *
+ * @param options - The {@link WrapArticleOptions} carrying title /
+ *                  description / extendedDescription
+ * @param lang - Validated publishing locale (already coerced to a
+ *               supported `LanguageCode`)
+ * @param siteTitle - Resolved localized site title used as the brand
+ *                    suffix
+ * @returns One {@link SeoClampedSurfaces} record per article render
+ */
+function computeSeoClamps(options, lang, siteTitle) {
+    const pageTitle = buildPageTitle(options.title, lang, siteTitle);
+    const ogTitleClamped = clampForBudget(options.title, lang, 'ogTitle');
+    const twitterTitleClamped = clampForBudget(options.title, lang, 'twitterTitle');
+    const metaDescriptionClamped = clampForBudget(options.description, lang, 'metaDescription');
+    // og:description and twitter:description prefer the longer BLUF
+    // paragraph (extendedDescription) so social-card previews show the
+    // full lede; fall back to the short meta description when the
+    // extended one is empty.
+    const socialSource = options.extendedDescription && options.extendedDescription.length > 0
+        ? options.extendedDescription
+        : options.description;
+    const ogDescriptionClamped = clampForBudget(socialSource, lang, 'ogDescription');
+    const twitterDescriptionClamped = clampForBudget(socialSource, lang, 'twitterDescription');
+    const imageAltClamped = clampForBudget(`${options.title}${getTitleSeparator(lang)}${siteTitle}`, lang, 'imageAlt');
+    const jsonLdHeadline = truncateHeadline(options.title);
+    // Emit an `alternativeHeadline` whenever the headline truncator
+    // dropped more than a handful of characters from the full title.
+    // Schema.org's `NewsArticle.alternativeHeadline` field is exactly
+    // for the long-form variant of `headline` and lets Google's
+    // Knowledge Graph keep both versions for retrieval. The 5-char
+    // threshold avoids emitting trivially redundant pairs when the
+    // truncator only trimmed trailing whitespace or punctuation.
+    const fullTitleTrimmed = options.title.trim();
+    const altCandidate = fullTitleTrimmed.length - jsonLdHeadline.length > 5 ? fullTitleTrimmed : undefined;
+    return {
+        pageTitle,
+        ogTitleClamped,
+        twitterTitleClamped,
+        metaDescriptionClamped,
+        ogDescriptionClamped,
+        twitterDescriptionClamped,
+        imageAltClamped,
+        jsonLdHeadline,
+        ...(altCandidate ? { alternativeHeadline: altCandidate } : {}),
+    };
+}
 /**
  * Render the full article HTML document with the shared chrome.
  *
@@ -65,6 +117,17 @@ export function wrapArticleHtml(options) {
     // CodeQL-safe.
     const bodyText = stripHtmlTags(options.body);
     const wordCount = bodyText.split(/\s+/u).filter((w) => w.length > 0).length;
+    // Pre-compute the per-surface SEO-budget-clamped variants of title
+    // and description. Each surface gets its own clamp tuned to the
+    // documented platform envelope (Google/Bing SERP, Facebook/LinkedIn
+    // OG, Twitter card) and the script family (Latin / CJK / RTL —
+    // CJK glyphs render at ~2× Latin pixel width, so the same byte
+    // count occupies twice the SERP width). See
+    // `src/aggregator/metadata/seo-budgets.ts` for the budget table and
+    // `analysis/methodologies/seo-headers-policy.md` § 1.1 for the
+    // documented sources of every cap.
+    const seoClamps = computeSeoClamps(options, safeLang, siteTitle);
+    const { pageTitle, ogTitleClamped, twitterTitleClamped, metaDescriptionClamped, ogDescriptionClamped, twitterDescriptionClamped, imageAltClamped, jsonLdHeadline, alternativeHeadline, } = seoClamps;
     // Build the JSON-LD image graph. Google requires NewsArticle.image
     // to be an array (or single ImageObject) with explicit width/height
     // covering at least one of the 1:1, 4:3, 16:9 aspect ratios for
@@ -92,8 +155,9 @@ export function wrapArticleHtml(options) {
     const jsonLd = {
         '@context': 'https://schema.org',
         '@type': 'NewsArticle',
-        headline: truncateHeadline(options.title),
-        description: options.description,
+        headline: jsonLdHeadline,
+        ...(alternativeHeadline ? { alternativeHeadline } : {}),
+        description: metaDescriptionClamped,
         datePublished: options.date,
         dateModified: options.date,
         inLanguage: safeLang,
@@ -165,18 +229,10 @@ export function wrapArticleHtml(options) {
     };
     const structuredData = [jsonLd, breadcrumbLd];
     const jsonLdString = JSON.stringify(structuredData).replace(/</g, '\\u003c');
-    const pageTitle = `${options.title}${getTitleSeparator(safeLang)}${siteTitle}`;
     const keywords = (options.keywords ?? []).map((keyword) => keyword.trim()).filter(Boolean);
     const keywordsMeta = keywords.length > 0
         ? `  <meta name="keywords" content="${escapeHTML(keywords.join(', '))}">\n`
         : '';
-    // Use the longer extended description for og:description/twitter:description
-    // when available so social-card previews show the full BLUF
-    // paragraph; the short meta description stays within Google's
-    // ~160-char snippet budget.
-    const socialDescription = options.extendedDescription && options.extendedDescription.length > 0
-        ? options.extendedDescription
-        : options.description;
     const ogLocaleTags = buildOgLocaleTags(safeLang);
     const twitterAttribution = buildTwitterAttributionTags();
     const twitterAttributionBlock = twitterAttribution ? `\n${twitterAttribution}` : '';
@@ -196,7 +252,7 @@ export function wrapArticleHtml(options) {
   <meta http-equiv="Content-Language" content="${safeLang}">
   <meta name="referrer" content="no-referrer">
   <title>${escapeHTML(pageTitle)}</title>
-  <meta name="description" content="${escapeHTML(options.description)}">
+  <meta name="description" content="${escapeHTML(metaDescriptionClamped)}">
 ${keywordsMeta}  <meta name="robots" content="index, follow, max-snippet:-1, max-image-preview:large">
   <meta name="author" content="${PUBLISHER_NAME}">
   <meta name="publisher" content="${PUBLISHER_NAME}">
@@ -211,15 +267,15 @@ ${hreflangLinks}
   <link rel="alternate" type="application/rss+xml" title="EU Parliament Monitor RSS" href="${BASE_URL}/rss.xml">
   <link rel="preconnect" href="https://hack23.com" crossorigin>
   <meta property="og:type" content="article">
-  <meta property="og:title" content="${escapeHTML(options.title)}">
-  <meta property="og:description" content="${escapeHTML(socialDescription)}">
+  <meta property="og:title" content="${escapeHTML(ogTitleClamped)}">
+  <meta property="og:description" content="${escapeHTML(ogDescriptionClamped)}">
   <meta property="og:url" content="${canonicalUrl}">
   <meta property="og:site_name" content="EU Parliament Monitor">
 ${ogLocaleTags}
-${buildResponsiveSocialImageMeta(`${options.title}${getTitleSeparator(safeLang)}EU Parliament Monitor`)}
+${buildResponsiveSocialImageMeta(imageAltClamped)}
   <meta name="twitter:card" content="summary_large_image">
-  <meta name="twitter:title" content="${escapeHTML(options.title)}">
-  <meta name="twitter:description" content="${escapeHTML(socialDescription)}">${twitterAttributionBlock}
+  <meta name="twitter:title" content="${escapeHTML(twitterTitleClamped)}">
+  <meta name="twitter:description" content="${escapeHTML(twitterDescriptionClamped)}">${twitterAttributionBlock}
 ${buildResponsiveIconLinks('../')}
   <link rel="manifest" href="../site.webmanifest">
   <meta name="color-scheme" content="light dark">

package/scripts/aggregator/manifest/index.d.ts CHANGED Viewed

@@ -3,7 +3,7 @@
  * @description Public re-exports for the manifest bounded context.
  */
 export type { HorizonProfile, Manifest, ManifestFiles, ManifestHistoryEntry, ManifestMetadataOverride, MetadataManifest, } from './types.js';
-export { resolveArticleType, resolveDate, resolveRunId, latestGateResult, flattenManifestFiles, UNKNOWN_ARTICLE_TYPE, } from './resolver.js';
+export { resolveArticleType, resolveDate, resolveRunId, latestGateResult, flattenManifestFiles, stripRunSuffix, RUN_SUFFIX_PATTERN, UNKNOWN_ARTICLE_TYPE, } from './resolver.js';
 export { readManifest, parseManifest, type ReadManifestResult } from './reader.js';
 export { applyHorizonProfile, buildHorizonProfile } from './manifest-writer.js';
 //# sourceMappingURL=index.d.ts.map

package/scripts/aggregator/manifest/index.js CHANGED Viewed

@@ -1,6 +1,6 @@
 // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
 // SPDX-License-Identifier: Apache-2.0
-export { resolveArticleType, resolveDate, resolveRunId, latestGateResult, flattenManifestFiles, UNKNOWN_ARTICLE_TYPE, } from './resolver.js';
+export { resolveArticleType, resolveDate, resolveRunId, latestGateResult, flattenManifestFiles, stripRunSuffix, RUN_SUFFIX_PATTERN, UNKNOWN_ARTICLE_TYPE, } from './resolver.js';
 export { readManifest, parseManifest } from './reader.js';
 export { applyHorizonProfile, buildHorizonProfile } from './manifest-writer.js';
 //# sourceMappingURL=index.js.map

package/scripts/aggregator/manifest/resolver.d.ts CHANGED Viewed

@@ -10,6 +10,30 @@
 import type { Manifest, ManifestFiles } from './types.js';
 /** Sentinel used when no schema variant supplies a usable article type. */
 export declare const UNKNOWN_ARTICLE_TYPE = "unknown";
+/**
+ * Pattern matching trailing `-run<N>` taxonomy noise that historic
+ * Stage-B writers occasionally encode into `articleType` (e.g.
+ * `committee-reports-run47`, `motions-run41`, `breaking-run193`). Also
+ * tolerates the legacy double-prefixed `motions-runmotions-run-1777010709`
+ * pattern observed in 2025 manifests where the writer concatenated the
+ * articleType and runId. The leading `-run` makes the match greedy enough
+ * to catch both single-suffix and double-prefixed forms.
+ *
+ * Exported for unit tests.
+ */
+export declare const RUN_SUFFIX_PATTERN: RegExp;
+/**
+ * Strip a trailing `-run<N>` taxonomy-noise suffix from an article-type
+ * slug, but only when doing so yields a {@link CANONICAL_ARTICLE_TYPES}
+ * token. This is conservative: a non-canonical leading token (e.g.
+ * `custom-type-run5`) is returned untouched so we never silently
+ * collapse a genuinely new article type into something it isn't.
+ *
+ * @param slug - Raw article-type slug from a manifest field
+ * @returns Canonical slug when the suffix was successfully stripped,
+ *   otherwise the original input
+ */
+export declare function stripRunSuffix(slug: string): string;
 /**
  * Resolve the article-type slug from a manifest, tolerating historic schemas.
  *
@@ -19,7 +43,10 @@ export declare const UNKNOWN_ARTICLE_TYPE = "unknown";
  *   3. `articleTypes[0]` — pre-aggregator-pipeline plural array
  *   4. `runType` — historic field on older breaking-run manifests
  *
- * Falls back to `'unknown'` when none of the above is a non-empty string.
+ * Each candidate is passed through {@link stripRunSuffix} so trailing
+ * `-run<N>` taxonomy noise never leaks into JSON-LD `articleSection`,
+ * the filesystem slug, or the SEO dump's article-type histogram. Falls
+ * back to `'unknown'` when none of the above is a non-empty string.
  *
  * @param manifest - Parsed manifest (any of the supported schemas)
  * @returns Article-type slug usable as a filename component