euparliamentmonitor 0.9.22 → 0.9.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/package.json +2 -1
  2. package/scripts/aggregator/article-metadata.js +69 -14
  3. package/scripts/aggregator/editorial-brief-resolver.js +23 -0
  4. package/scripts/aggregator/generator/slug.js +0 -22
  5. package/scripts/aggregator/html/headline.d.ts +41 -9
  6. package/scripts/aggregator/html/headline.js +69 -10
  7. package/scripts/aggregator/html/shell.js +73 -17
  8. package/scripts/aggregator/manifest/index.d.ts +1 -1
  9. package/scripts/aggregator/manifest/index.js +1 -1
  10. package/scripts/aggregator/manifest/resolver.d.ts +28 -1
  11. package/scripts/aggregator/manifest/resolver.js +61 -5
  12. package/scripts/aggregator/markdown-renderer.js +11 -0
  13. package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
  14. package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
  15. package/scripts/aggregator/metadata/artifact-walker.js +27 -8
  16. package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
  17. package/scripts/aggregator/metadata/brief-body.js +69 -0
  18. package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
  19. package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
  20. package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
  21. package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
  22. package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
  23. package/scripts/aggregator/metadata/heading-rules.js +78 -270
  24. package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
  25. package/scripts/aggregator/metadata/keyword-filters.js +156 -0
  26. package/scripts/aggregator/metadata/lede-extractor.js +11 -2
  27. package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
  28. package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
  29. package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
  30. package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
  31. package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
  32. package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
  33. package/scripts/aggregator/metadata/seo-budgets.js +202 -0
  34. package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
  35. package/scripts/aggregator/metadata/text-truncate.js +277 -0
  36. package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
  37. package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
  38. package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
  39. package/scripts/aggregator/metadata/text-utils.js +119 -439
  40. package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
  41. package/scripts/aggregator/metadata/title-rejection.js +179 -0
  42. package/scripts/dump-article-seo.js +75 -2
  43. package/scripts/fix-mermaid-diagrams.js +931 -0
  44. package/scripts/validate-article-seo.js +534 -0
  45. package/scripts/validate-mermaid-diagrams.js +306 -0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "euparliamentmonitor",
3
- "version": "0.9.22",
3
+ "version": "0.9.24",
4
4
  "type": "module",
5
5
  "description": "European Parliament Intelligence Platform - Monitor political activity with systematic transparency",
6
6
  "main": "scripts/index.js",
@@ -66,6 +66,7 @@
66
66
  "discover:untranslated-briefs": "node scripts/discover-untranslated-briefs.js",
67
67
  "validate:translations": "node scripts/validate-brief-translations.js",
68
68
  "validate:manifest-seo": "node scripts/validate-manifest-seo.js",
69
+ "validate:seo": "node scripts/validate-article-seo.js",
69
70
  "sync:templates": "node scripts/templates/sync-template-frontmatter.js",
70
71
  "sync:templates:check": "node scripts/templates/sync-template-frontmatter.js --check",
71
72
  "prior-run-diff": "node scripts/aggregator/prior-run-diff.js",
@@ -64,7 +64,7 @@
64
64
  import { ALL_LANGUAGES } from '../constants/language-core.js';
65
65
  import { resolveLocalizedBriefHighlight } from './editorial-brief-resolver.js';
66
66
  import { buildTemplateFallback } from './metadata/template-fallback.js';
67
- import { buildSeoKeywords, composeContextualDescription, composeContextualTitle, manifestOverrideFor, pickFirstNonEmpty, resolveEditorialContent, } from './metadata/resolve-helpers.js';
67
+ import { buildSeoKeywords, composeContextualDescription, composeContextualExtendedDescription, composeContextualTitle, deriveHeadlineFromSummary, hasLeakySeoToken, isUsableResolvedTitle, manifestOverrideFor, pickFirstNonEmpty, resolveEditorialContent, sanitizeDescriptionCandidate, } from './metadata/resolve-helpers.js';
68
68
  import { ENRICHMENT_TRIGGER_LENGTH, truncateDescription, truncateExtendedDescription, truncateTitle, } from './metadata/text-utils.js';
69
69
  export { shouldSkipDescriptionLine, stripLeadingProseLabel, stripInlineMarkdown, truncateDescription, truncateExtendedDescription, truncateTitle, extractFirstSentence, } from './metadata/text-utils.js';
70
70
  export { isArtifactCategoryHeading, stripArtifactCategoryAffix, isGenericHeading, } from './metadata/heading-rules.js';
@@ -109,6 +109,7 @@ export function resolveArticleMetadata(opts) {
109
109
  }
110
110
  return result;
111
111
  }
112
+ const LOCALIZED_BRIEF_SOURCE = 'localized-brief';
112
113
  /**
113
114
  * Resolve `{title, description, keywords, source}` for one language.
114
115
  *
@@ -122,20 +123,74 @@ function resolveOneLanguage(input) {
122
123
  const editorial = perLanguage.editorial;
123
124
  const contextualTitle = composeContextualTitle(input.template.title, editorial.headline, input.runId);
124
125
  const title = pickFirstNonEmpty([manifestTitle, contextualTitle, input.template.title]);
125
- const rawDescription = pickFirstNonEmpty([
126
- manifestDescription,
127
- editorial.summary,
128
- input.template.subtitle,
126
+ const rawDescription = sanitizeDescriptionCandidate(pickFirstNonEmpty([manifestDescription, editorial.summary, input.template.subtitle]));
127
+ const safeEditorial = {
128
+ headline: isUsableResolvedTitle(editorial.headline) ? editorial.headline.trim() : '',
129
+ summary: sanitizeDescriptionCandidate(editorial.summary),
130
+ extendedSummary: sanitizeDescriptionCandidate(editorial.extendedSummary),
131
+ };
132
+ const normalizedRawDescription = rawDescription || sanitizeDescriptionCandidate(input.template.subtitle);
133
+ const skipEnrichment = perLanguage.source === LOCALIZED_BRIEF_SOURCE && normalizedRawDescription.length > 0;
134
+ const description = skipEnrichment || normalizedRawDescription.length >= ENRICHMENT_TRIGGER_LENGTH
135
+ ? normalizedRawDescription
136
+ : composeContextualDescription(input.lang, normalizedRawDescription, safeEditorial, input.date, input.runId);
137
+ const clippedTitle = truncateTitle(title).trim();
138
+ const explicitTitle = manifestTitle && !hasLeakySeoToken(manifestTitle) ? truncateTitle(manifestTitle).trim() : '';
139
+ const allowShortResolvedTitle = perLanguage.source === LOCALIZED_BRIEF_SOURCE;
140
+ const resolvedTitleCandidate = clippedTitle &&
141
+ !hasLeakySeoToken(clippedTitle) &&
142
+ (allowShortResolvedTitle || isUsableResolvedTitle(clippedTitle))
143
+ ? clippedTitle
144
+ : '';
145
+ const summaryDerivedTitle = deriveHeadlineFromSummary(safeEditorial.summary || normalizedRawDescription);
146
+ // `truncateTitle` returns '' when an editorial title overruns the
147
+ // budget with no acceptable clause boundary — fall back to the
148
+ // localized template title in that case so we never emit an empty
149
+ // `<title>`. Live regression: 2026-05-22 breaking
150
+ // `AI Trade Strategy: A Legislative First with Structural…` clipped
151
+ // to '' after the no-ellipsis guard landed; template fallback
152
+ // (`Extended Executive Brief — Breaking News`) is preferable to a
153
+ // blank `<title>`.
154
+ //
155
+ // The fallback path passes the template title back through
156
+ // {@link composeContextualTitle} (with an empty editorial headline)
157
+ // so `withRunQualifier` re-appends the `— Run N` suffix. Without
158
+ // this, two same-date / same-articleType runs (republish, hot-fix
159
+ // re-run) would collapse to byte-identical `<title>` strings, and
160
+ // the duplicate-title gate in `scripts/validate-article-seo.js`
161
+ // would (correctly) fail CI.
162
+ const contextualFallback = composeContextualTitle(input.template.title, '', input.runId);
163
+ const truncatedTitle = pickFirstNonEmpty([
164
+ explicitTitle,
165
+ resolvedTitleCandidate,
166
+ isUsableResolvedTitle(summaryDerivedTitle, { allowFullSentence: true })
167
+ ? summaryDerivedTitle
168
+ : '',
169
+ truncateTitle(contextualFallback),
170
+ contextualFallback,
129
171
  ]);
130
- const description = rawDescription.length >= ENRICHMENT_TRIGGER_LENGTH
131
- ? rawDescription
132
- : composeContextualDescription(input.lang, rawDescription, editorial, input.date, input.runId);
133
- const truncatedTitle = truncateTitle(title);
134
172
  const truncatedDescription = truncateDescription(description);
135
- const extendedSource = manifestDescription
136
- ? manifestDescription
137
- : editorial.extendedSummary || rawDescription;
138
- const truncatedExtendedDescription = truncateExtendedDescription(extendedSource);
173
+ const extendedSource = sanitizeDescriptionCandidate(manifestDescription || safeEditorial.extendedSummary || normalizedRawDescription);
174
+ // Two-tier extended-description resolution:
175
+ // 1. Direct truncation — preferred when the editorial source paragraph
176
+ // is already ≥181 chars (the truncator's gating threshold). This
177
+ // yields the highest-fidelity og:description text.
178
+ // 2. Contextual synthesis — when direct truncation returns '' (source
179
+ // was too short), synthesize a longer string by stitching together
180
+ // `<source> + Date: YYYY-MM-DD + Context: <editorial> + <reader>`.
181
+ // This is the **only** SEO path that surfaces the localized
182
+ // "for democratic-accountability readers …" framing (the short
183
+ // <meta description> no longer carries it — see comment in
184
+ // {@link composeContextualDescription}). The synthesized string is
185
+ // re-clamped to the 200–300 char og:description budget.
186
+ //
187
+ // Live regression (2026-05): 56 breaking briefs shipped with empty
188
+ // extendedDescription because their lead paragraph was only 80–150
189
+ // chars. AI-overview and Discover surfaces dropped them entirely.
190
+ let truncatedExtendedDescription = truncateExtendedDescription(extendedSource);
191
+ if (!truncatedExtendedDescription) {
192
+ truncatedExtendedDescription = composeContextualExtendedDescription(input.lang, extendedSource || normalizedRawDescription, safeEditorial, input.date);
193
+ }
139
194
  const source = manifestTitle || manifestDescription ? 'manifest' : perLanguage.source;
140
195
  return {
141
196
  title: truncatedTitle,
@@ -163,7 +218,7 @@ function resolvePerLanguageEditorial(input) {
163
218
  summary: localized.summary,
164
219
  extendedSummary: localized.extendedSummary,
165
220
  },
166
- source: 'localized-brief',
221
+ source: LOCALIZED_BRIEF_SOURCE,
167
222
  };
168
223
  }
169
224
  }
@@ -30,6 +30,7 @@
30
30
  import fs from 'fs';
31
31
  import path from 'path';
32
32
  import { extractFirstH1, extractLedeAfterHeading, extractExtendedLedeAfterHeading, extractStrongProseLine, isGenericHeading, stripArtifactCategoryAffix, truncateTitle, } from './article-metadata.js';
33
+ import { extractBriefingHighlight } from './metadata/briefing-highlight.js';
33
34
  /**
34
35
  * Run-relative candidate paths for a translated brief, in precedence
35
36
  * order. Mirrors the `executive-brief.md` → `extended/executive-brief.md`
@@ -174,6 +175,28 @@ export function resolveLocalizedBriefHighlight(runDir, lang, articleType, date)
174
175
  const body = readArtefactBody(abs);
175
176
  if (!body)
176
177
  continue;
178
+ // Tier 1 (NEW, May-2026): structural extraction of `## Strategic
179
+ // Intelligence Summary` / `## Reader Briefing` sections. The
180
+ // briefing extractor is language-agnostic — it matches on the
181
+ // English section headings, which the translation pipeline
182
+ // preserves verbatim under the localized brief contract — so a
183
+ // Swedish brief whose synthesis section is still written as
184
+ // `## Strategic Intelligence Summary` (with translated body
185
+ // prose) will resolve correctly here. When the translator has
186
+ // additionally localized the section heading, the matcher falls
187
+ // back to the legacy lede/H1 path below, producing the
188
+ // localized H1 as headline.
189
+ const briefing = extractBriefingHighlight(body);
190
+ if (briefing && (briefing.headline || briefing.summary)) {
191
+ const fallbackHeadline = deriveHeadline(body, articleType, date);
192
+ return {
193
+ headline: briefing.headline || fallbackHeadline,
194
+ summary: briefing.summary,
195
+ extendedSummary: briefing.extendedSummary || extractExtendedLedeAfterHeading(body),
196
+ sourceFile: rel,
197
+ sourceLang: lang,
198
+ };
199
+ }
177
200
  const headline = deriveHeadline(body, articleType, date);
178
201
  const lede = extractLedeAfterHeading(body);
179
202
  const summary = lede || extractStrongProseLine(body);
@@ -37,27 +37,6 @@ export function buildArticleSlug(date, articleType, runSuffix) {
37
37
  export function sanitizeRunSuffix(runId) {
38
38
  return _sanitizeRunSuffix(runId);
39
39
  }
40
- /**
41
- * Return `true` when a line should be skipped when hunting for the default
42
- * description. Thin wrapper preserved for back-compat — real logic lives
43
- * in `src/aggregator/article-metadata.ts`'s `shouldSkipDescriptionLine`.
44
- *
45
- * @param line - Trimmed line from the aggregated Markdown source
46
- * @returns `true` when the line is not prose and should be skipped
47
- */
48
- function shouldSkipDescriptionLine(line) {
49
- if (line.length === 0)
50
- return true;
51
- if (line.startsWith('#'))
52
- return true;
53
- if (line.startsWith('>'))
54
- return true;
55
- if (line.startsWith('<'))
56
- return true;
57
- if (line.startsWith('|'))
58
- return true;
59
- return false;
60
- }
61
40
  /** Description used when no prose paragraph qualifies. */
62
41
  const FALLBACK_DESCRIPTION = 'EU Parliament intelligence summary derived from committed analysis artifacts.';
63
42
  /**
@@ -73,7 +52,6 @@ const FALLBACK_DESCRIPTION = 'EU Parliament intelligence summary derived from co
73
52
  * @returns Plain-text description, truncated to ≤300 characters
74
53
  */
75
54
  export function extractDefaultDescription(markdown) {
76
- void shouldSkipDescriptionLine;
77
55
  const strong = extractStrongProseLine(markdown);
78
56
  return strong.length > 0 ? strong : FALLBACK_DESCRIPTION;
79
57
  }
@@ -1,4 +1,5 @@
1
- import type { LanguageCode } from '../../types/index.js';
1
+ import type { LanguageCode, LanguageMap } from '../../types/index.js';
2
+ import { type SeoSurface } from '../metadata/seo-budgets.js';
2
3
  /**
3
4
  * Resolve a localized article type label *without* the leading icon
4
5
  * emoji. Used for the OpenGraph `article:section` meta and the JSON-LD
@@ -28,19 +29,50 @@ export declare const HEADLINE_LIMIT = 110;
28
29
  export declare function truncateHeadline(title: string): string;
29
30
  /**
30
31
  * Build the localized `<title>` separator for the
31
- * `{articleTitle} {sep} {siteTitle}` pattern. LTR locales use the
32
- * right-pointing guillemet (»); RTL locales (Arabic, Hebrew) use the
33
- * left-pointing guillemet («) so the visual hierarchy reads from the
34
- * primary title towards the site name without breaking bidi flow.
32
+ * `{articleTitle} {sep} {siteTitle}` pattern.
35
33
  *
36
- * The previous em-dash separator collided with em-dashes inside
37
- * article titles (the editorial style uses `Title Subtitle`) and
38
- * rendered ambiguously in screen readers.
34
+ * Latin scripts use the policy-mandated ASCII pipe (`" | "`), which
35
+ * scans cleanly in SERP cards and never collides with em-dashes that
36
+ * the editorial style routinely uses inside titles. CJK locales use
37
+ * the Katakana middle-dot (`" ・ "`, U+30FB) which is the documented
38
+ * Google CJK separator and renders correctly in JP / KO / ZH fonts.
39
+ * RTL locales use the Hebrew paseq (`" ׀ "`, U+05C0) — a vertical
40
+ * stroke that preserves bidi flow without injecting a Latin guillemet
41
+ * that would force a direction change mid-title.
39
42
  *
40
43
  * @param lang - Target language code
41
- * @returns `" » "` for LTR locales, `" « "` for RTL
44
+ * @returns Per-script separator
42
45
  */
43
46
  export declare function getTitleSeparator(lang: LanguageCode): string;
47
+ /**
48
+ * Short brand fallback per script family. Used by
49
+ * {@link buildPageTitle} when the full `SITE_NAME` would push the
50
+ * `<title>` past the SERP budget but a shorter variant would fit.
51
+ *
52
+ * - Latin → "EPM" (3 chars, ASCII-safe in news cards)
53
+ * - CJK → "EPM" (Latin abbreviation reads correctly in JP / KO / ZH SERPs)
54
+ * - RTL → Latin abbreviation "EPM" works in both Arabic and Hebrew
55
+ * SERP cards (Bing/Google render the Latin token RTL-isolated)
56
+ *
57
+ * Per-locale overrides live in {@link SHORT_SITE_NAMES} below so a
58
+ * future editorial change (e.g. a registered Arabic brand) only
59
+ * touches the table.
60
+ */
61
+ export declare const SHORT_SITE_NAMES: LanguageMap;
62
+ /**
63
+ * Compose a title for one SEO surface using the per-script byte
64
+ * budget from `metadata/seo-budgets.ts`. Drops the brand suffix when
65
+ * the article title alone fills the budget (better SERP than a
66
+ * truncated headline followed by a clipped brand) and falls through
67
+ * to a short-brand variant when that fits but the full one does not.
68
+ *
69
+ * @param title - Article title (plain text, already markdown-stripped)
70
+ * @param lang - Target language code
71
+ * @param siteTitle - Full brand suffix (e.g. "EU Parliament Monitor")
72
+ * @param surface - Optional SEO surface; defaults to `<title>` budget
73
+ * @returns Composed, budget-clamped title
74
+ */
75
+ export declare function buildPageTitle(title: string, lang: LanguageCode, siteTitle: string, surface?: SeoSurface): string;
44
76
  /**
45
77
  * Resolve a localized article type label with icon. Falls back to the
46
78
  * humanised slug when a translation isn't available.
@@ -7,8 +7,9 @@
7
7
  * icon), the page-title separator that respects bidi direction, and the
8
8
  * Schema.org-compatible truncated headline used in JSON-LD.
9
9
  */
10
- import { ARTICLE_TYPE_LABELS, ARTICLE_TYPE_ICONS, getLocalizedString, getTextDirection, } from '../../constants/languages.js';
10
+ import { ARTICLE_TYPE_LABELS, ARTICLE_TYPE_ICONS, getLocalizedString, } from '../../constants/languages.js';
11
11
  import { ArticleCategory } from '../../types/index.js';
12
+ import { classifyScript, clampTitleForSurface } from '../metadata/seo-budgets.js';
12
13
  /**
13
14
  * Resolve a localized article type label *without* the leading icon
14
15
  * emoji. Used for the OpenGraph `article:section` meta and the JSON-LD
@@ -51,20 +52,78 @@ export function truncateHeadline(title) {
51
52
  }
52
53
  /**
53
54
  * Build the localized `<title>` separator for the
54
- * `{articleTitle} {sep} {siteTitle}` pattern. LTR locales use the
55
- * right-pointing guillemet (»); RTL locales (Arabic, Hebrew) use the
56
- * left-pointing guillemet («) so the visual hierarchy reads from the
57
- * primary title towards the site name without breaking bidi flow.
55
+ * `{articleTitle} {sep} {siteTitle}` pattern.
58
56
  *
59
- * The previous em-dash separator collided with em-dashes inside
60
- * article titles (the editorial style uses `Title Subtitle`) and
61
- * rendered ambiguously in screen readers.
57
+ * Latin scripts use the policy-mandated ASCII pipe (`" | "`), which
58
+ * scans cleanly in SERP cards and never collides with em-dashes that
59
+ * the editorial style routinely uses inside titles. CJK locales use
60
+ * the Katakana middle-dot (`" ・ "`, U+30FB) which is the documented
61
+ * Google CJK separator and renders correctly in JP / KO / ZH fonts.
62
+ * RTL locales use the Hebrew paseq (`" ׀ "`, U+05C0) — a vertical
63
+ * stroke that preserves bidi flow without injecting a Latin guillemet
64
+ * that would force a direction change mid-title.
62
65
  *
63
66
  * @param lang - Target language code
64
- * @returns `" » "` for LTR locales, `" « "` for RTL
67
+ * @returns Per-script separator
65
68
  */
66
69
  export function getTitleSeparator(lang) {
67
- return getTextDirection(lang) === 'rtl' ? ' « ' : ' » ';
70
+ const family = classifyScript(lang);
71
+ if (family === 'cjk')
72
+ return ' ・ ';
73
+ if (family === 'rtl')
74
+ return ' ׀ ';
75
+ return ' | ';
76
+ }
77
+ /**
78
+ * Short brand fallback per script family. Used by
79
+ * {@link buildPageTitle} when the full `SITE_NAME` would push the
80
+ * `<title>` past the SERP budget but a shorter variant would fit.
81
+ *
82
+ * - Latin → "EPM" (3 chars, ASCII-safe in news cards)
83
+ * - CJK → "EPM" (Latin abbreviation reads correctly in JP / KO / ZH SERPs)
84
+ * - RTL → Latin abbreviation "EPM" works in both Arabic and Hebrew
85
+ * SERP cards (Bing/Google render the Latin token RTL-isolated)
86
+ *
87
+ * Per-locale overrides live in {@link SHORT_SITE_NAMES} below so a
88
+ * future editorial change (e.g. a registered Arabic brand) only
89
+ * touches the table.
90
+ */
91
+ export const SHORT_SITE_NAMES = {
92
+ en: 'EPM',
93
+ sv: 'EPM',
94
+ da: 'EPM',
95
+ no: 'EPM',
96
+ fi: 'EPM',
97
+ de: 'EPM',
98
+ fr: 'EPM',
99
+ es: 'EPM',
100
+ nl: 'EPM',
101
+ ar: 'EPM',
102
+ he: 'EPM',
103
+ ja: 'EPM',
104
+ ko: 'EPM',
105
+ zh: 'EPM',
106
+ };
107
+ /**
108
+ * Compose a title for one SEO surface using the per-script byte
109
+ * budget from `metadata/seo-budgets.ts`. Drops the brand suffix when
110
+ * the article title alone fills the budget (better SERP than a
111
+ * truncated headline followed by a clipped brand) and falls through
112
+ * to a short-brand variant when that fits but the full one does not.
113
+ *
114
+ * @param title - Article title (plain text, already markdown-stripped)
115
+ * @param lang - Target language code
116
+ * @param siteTitle - Full brand suffix (e.g. "EU Parliament Monitor")
117
+ * @param surface - Optional SEO surface; defaults to `<title>` budget
118
+ * @returns Composed, budget-clamped title
119
+ */
120
+ export function buildPageTitle(title, lang, siteTitle, surface = 'title') {
121
+ const shortSiteTitle = getLocalizedString(SHORT_SITE_NAMES, lang);
122
+ return clampTitleForSurface(title, lang, surface, {
123
+ siteTitle,
124
+ shortSiteTitle,
125
+ separator: getTitleSeparator(lang),
126
+ });
68
127
  }
69
128
  /**
70
129
  * Resolve a localized article type label with icon. Falls back to the
@@ -20,7 +20,8 @@ import { stripHtmlTags } from '../../utils/html-sanitize.js';
20
20
  import { buildResponsiveIconLinks, buildResponsiveSocialImageMeta, buildSiteFooter, buildSiteHeader, buildPageBanner, } from '../../templates/section-builders.js';
21
21
  import { getPoliticalIntelligenceFilename } from '../../generators/political-intelligence.js';
22
22
  import { getSitemapFilename } from '../../generators/sitemap/index.js';
23
- import { truncateHeadline, getTitleSeparator, getLocalizedArticleType, getLocalizedArticleTypePlain, } from './headline.js';
23
+ import { truncateHeadline, getTitleSeparator, buildPageTitle, getLocalizedArticleType, getLocalizedArticleTypePlain, } from './headline.js';
24
+ import { clampForBudget } from '../metadata/seo-budgets.js';
24
25
  import { getArticleFilename, buildArticleHreflangLinks, buildLanguageSwitcher, } from './hreflang.js';
25
26
  import { buildArticleToc } from './toc.js';
26
27
  import { blobUrl } from '../infra/github-urls.js';
@@ -28,6 +29,57 @@ import { blobUrl } from '../infra/github-urls.js';
28
29
  export const PUBLISHER_NAME = 'Hack23 AB';
29
30
  /** Site name used across meta tags and structured data. */
30
31
  export const SITE_NAME = 'EU Parliament Monitor';
32
+ /**
33
+ * Compute the per-surface SEO-budget-clamped variants of the article
34
+ * title and description for a single render. See
35
+ * `analysis/methodologies/seo-headers-policy.md` § 1.1 for the
36
+ * documented sources of every cap.
37
+ *
38
+ * @param options - The {@link WrapArticleOptions} carrying title /
39
+ * description / extendedDescription
40
+ * @param lang - Validated publishing locale (already coerced to a
41
+ * supported `LanguageCode`)
42
+ * @param siteTitle - Resolved localized site title used as the brand
43
+ * suffix
44
+ * @returns One {@link SeoClampedSurfaces} record per article render
45
+ */
46
+ function computeSeoClamps(options, lang, siteTitle) {
47
+ const pageTitle = buildPageTitle(options.title, lang, siteTitle);
48
+ const ogTitleClamped = clampForBudget(options.title, lang, 'ogTitle');
49
+ const twitterTitleClamped = clampForBudget(options.title, lang, 'twitterTitle');
50
+ const metaDescriptionClamped = clampForBudget(options.description, lang, 'metaDescription');
51
+ // og:description and twitter:description prefer the longer BLUF
52
+ // paragraph (extendedDescription) so social-card previews show the
53
+ // full lede; fall back to the short meta description when the
54
+ // extended one is empty.
55
+ const socialSource = options.extendedDescription && options.extendedDescription.length > 0
56
+ ? options.extendedDescription
57
+ : options.description;
58
+ const ogDescriptionClamped = clampForBudget(socialSource, lang, 'ogDescription');
59
+ const twitterDescriptionClamped = clampForBudget(socialSource, lang, 'twitterDescription');
60
+ const imageAltClamped = clampForBudget(`${options.title}${getTitleSeparator(lang)}${siteTitle}`, lang, 'imageAlt');
61
+ const jsonLdHeadline = truncateHeadline(options.title);
62
+ // Emit an `alternativeHeadline` whenever the headline truncator
63
+ // dropped more than a handful of characters from the full title.
64
+ // Schema.org's `NewsArticle.alternativeHeadline` field is exactly
65
+ // for the long-form variant of `headline` and lets Google's
66
+ // Knowledge Graph keep both versions for retrieval. The 5-char
67
+ // threshold avoids emitting trivially redundant pairs when the
68
+ // truncator only trimmed trailing whitespace or punctuation.
69
+ const fullTitleTrimmed = options.title.trim();
70
+ const altCandidate = fullTitleTrimmed.length - jsonLdHeadline.length > 5 ? fullTitleTrimmed : undefined;
71
+ return {
72
+ pageTitle,
73
+ ogTitleClamped,
74
+ twitterTitleClamped,
75
+ metaDescriptionClamped,
76
+ ogDescriptionClamped,
77
+ twitterDescriptionClamped,
78
+ imageAltClamped,
79
+ jsonLdHeadline,
80
+ ...(altCandidate ? { alternativeHeadline: altCandidate } : {}),
81
+ };
82
+ }
31
83
  /**
32
84
  * Render the full article HTML document with the shared chrome.
33
85
  *
@@ -65,6 +117,17 @@ export function wrapArticleHtml(options) {
65
117
  // CodeQL-safe.
66
118
  const bodyText = stripHtmlTags(options.body);
67
119
  const wordCount = bodyText.split(/\s+/u).filter((w) => w.length > 0).length;
120
+ // Pre-compute the per-surface SEO-budget-clamped variants of title
121
+ // and description. Each surface gets its own clamp tuned to the
122
+ // documented platform envelope (Google/Bing SERP, Facebook/LinkedIn
123
+ // OG, Twitter card) and the script family (Latin / CJK / RTL —
124
+ // CJK glyphs render at ~2× Latin pixel width, so the same character
125
+ // count occupies twice the SERP width). See
126
+ // `src/aggregator/metadata/seo-budgets.ts` for the budget table and
127
+ // `analysis/methodologies/seo-headers-policy.md` § 1.1 for the
128
+ // documented sources of every cap.
129
+ const seoClamps = computeSeoClamps(options, safeLang, siteTitle);
130
+ const { pageTitle, ogTitleClamped, twitterTitleClamped, metaDescriptionClamped, ogDescriptionClamped, twitterDescriptionClamped, imageAltClamped, jsonLdHeadline, alternativeHeadline, } = seoClamps;
68
131
  // Build the JSON-LD image graph. Google requires NewsArticle.image
69
132
  // to be an array (or single ImageObject) with explicit width/height
70
133
  // covering at least one of the 1:1, 4:3, 16:9 aspect ratios for
@@ -92,8 +155,9 @@ export function wrapArticleHtml(options) {
92
155
  const jsonLd = {
93
156
  '@context': 'https://schema.org',
94
157
  '@type': 'NewsArticle',
95
- headline: truncateHeadline(options.title),
96
- description: options.description,
158
+ headline: jsonLdHeadline,
159
+ ...(alternativeHeadline ? { alternativeHeadline } : {}),
160
+ description: metaDescriptionClamped,
97
161
  datePublished: options.date,
98
162
  dateModified: options.date,
99
163
  inLanguage: safeLang,
@@ -165,18 +229,10 @@ export function wrapArticleHtml(options) {
165
229
  };
166
230
  const structuredData = [jsonLd, breadcrumbLd];
167
231
  const jsonLdString = JSON.stringify(structuredData).replace(/</g, '\\u003c');
168
- const pageTitle = `${options.title}${getTitleSeparator(safeLang)}${siteTitle}`;
169
232
  const keywords = (options.keywords ?? []).map((keyword) => keyword.trim()).filter(Boolean);
170
233
  const keywordsMeta = keywords.length > 0
171
234
  ? ` <meta name="keywords" content="${escapeHTML(keywords.join(', '))}">\n`
172
235
  : '';
173
- // Use the longer extended description for og:description/twitter:description
174
- // when available so social-card previews show the full BLUF
175
- // paragraph; the short meta description stays within Google's
176
- // ~160-char snippet budget.
177
- const socialDescription = options.extendedDescription && options.extendedDescription.length > 0
178
- ? options.extendedDescription
179
- : options.description;
180
236
  const ogLocaleTags = buildOgLocaleTags(safeLang);
181
237
  const twitterAttribution = buildTwitterAttributionTags();
182
238
  const twitterAttributionBlock = twitterAttribution ? `\n${twitterAttribution}` : '';
@@ -196,7 +252,7 @@ export function wrapArticleHtml(options) {
196
252
  <meta http-equiv="Content-Language" content="${safeLang}">
197
253
  <meta name="referrer" content="no-referrer">
198
254
  <title>${escapeHTML(pageTitle)}</title>
199
- <meta name="description" content="${escapeHTML(options.description)}">
255
+ <meta name="description" content="${escapeHTML(metaDescriptionClamped)}">
200
256
  ${keywordsMeta} <meta name="robots" content="index, follow, max-snippet:-1, max-image-preview:large">
201
257
  <meta name="author" content="${PUBLISHER_NAME}">
202
258
  <meta name="publisher" content="${PUBLISHER_NAME}">
@@ -211,15 +267,15 @@ ${hreflangLinks}
211
267
  <link rel="alternate" type="application/rss+xml" title="EU Parliament Monitor RSS" href="${BASE_URL}/rss.xml">
212
268
  <link rel="preconnect" href="https://hack23.com" crossorigin>
213
269
  <meta property="og:type" content="article">
214
- <meta property="og:title" content="${escapeHTML(options.title)}">
215
- <meta property="og:description" content="${escapeHTML(socialDescription)}">
270
+ <meta property="og:title" content="${escapeHTML(ogTitleClamped)}">
271
+ <meta property="og:description" content="${escapeHTML(ogDescriptionClamped)}">
216
272
  <meta property="og:url" content="${canonicalUrl}">
217
273
  <meta property="og:site_name" content="EU Parliament Monitor">
218
274
  ${ogLocaleTags}
219
- ${buildResponsiveSocialImageMeta(`${options.title}${getTitleSeparator(safeLang)}EU Parliament Monitor`)}
275
+ ${buildResponsiveSocialImageMeta(imageAltClamped)}
220
276
  <meta name="twitter:card" content="summary_large_image">
221
- <meta name="twitter:title" content="${escapeHTML(options.title)}">
222
- <meta name="twitter:description" content="${escapeHTML(socialDescription)}">${twitterAttributionBlock}
277
+ <meta name="twitter:title" content="${escapeHTML(twitterTitleClamped)}">
278
+ <meta name="twitter:description" content="${escapeHTML(twitterDescriptionClamped)}">${twitterAttributionBlock}
223
279
  ${buildResponsiveIconLinks('../')}
224
280
  <link rel="manifest" href="../site.webmanifest">
225
281
  <meta name="color-scheme" content="light dark">
@@ -3,7 +3,7 @@
3
3
  * @description Public re-exports for the manifest bounded context.
4
4
  */
5
5
  export type { HorizonProfile, Manifest, ManifestFiles, ManifestHistoryEntry, ManifestMetadataOverride, MetadataManifest, } from './types.js';
6
- export { resolveArticleType, resolveDate, resolveRunId, latestGateResult, flattenManifestFiles, UNKNOWN_ARTICLE_TYPE, } from './resolver.js';
6
+ export { resolveArticleType, resolveDate, resolveRunId, latestGateResult, flattenManifestFiles, stripRunSuffix, RUN_SUFFIX_PATTERN, UNKNOWN_ARTICLE_TYPE, } from './resolver.js';
7
7
  export { readManifest, parseManifest, type ReadManifestResult } from './reader.js';
8
8
  export { applyHorizonProfile, buildHorizonProfile } from './manifest-writer.js';
9
9
  //# sourceMappingURL=index.d.ts.map
@@ -1,6 +1,6 @@
1
1
  // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
2
2
  // SPDX-License-Identifier: Apache-2.0
3
- export { resolveArticleType, resolveDate, resolveRunId, latestGateResult, flattenManifestFiles, UNKNOWN_ARTICLE_TYPE, } from './resolver.js';
3
+ export { resolveArticleType, resolveDate, resolveRunId, latestGateResult, flattenManifestFiles, stripRunSuffix, RUN_SUFFIX_PATTERN, UNKNOWN_ARTICLE_TYPE, } from './resolver.js';
4
4
  export { readManifest, parseManifest } from './reader.js';
5
5
  export { applyHorizonProfile, buildHorizonProfile } from './manifest-writer.js';
6
6
  //# sourceMappingURL=index.js.map
@@ -10,6 +10,30 @@
10
10
  import type { Manifest, ManifestFiles } from './types.js';
11
11
  /** Sentinel used when no schema variant supplies a usable article type. */
12
12
  export declare const UNKNOWN_ARTICLE_TYPE = "unknown";
13
+ /**
14
+ * Pattern matching trailing `-run<N>` taxonomy noise that historic
15
+ * Stage-B writers occasionally encode into `articleType` (e.g.
16
+ * `committee-reports-run47`, `motions-run41`, `breaking-run193`). Also
17
+ * tolerates the legacy double-prefixed `motions-runmotions-run-1777010709`
18
+ * pattern observed in 2025 manifests where the writer concatenated the
19
+ * articleType and runId. The leading `-run` makes the match greedy enough
20
+ * to catch both single-suffix and double-prefixed forms.
21
+ *
22
+ * Exported for unit tests.
23
+ */
24
+ export declare const RUN_SUFFIX_PATTERN: RegExp;
25
+ /**
26
+ * Strip a trailing `-run<N>` taxonomy-noise suffix from an article-type
27
+ * slug, but only when doing so yields a {@link CANONICAL_ARTICLE_TYPES}
28
+ * token. This is conservative: a non-canonical leading token (e.g.
29
+ * `custom-type-run5`) is returned untouched so we never silently
30
+ * collapse a genuinely new article type into something it isn't.
31
+ *
32
+ * @param slug - Raw article-type slug from a manifest field
33
+ * @returns Canonical slug when the suffix was successfully stripped,
34
+ * otherwise the original input
35
+ */
36
+ export declare function stripRunSuffix(slug: string): string;
13
37
  /**
14
38
  * Resolve the article-type slug from a manifest, tolerating historic schemas.
15
39
  *
@@ -19,7 +43,10 @@ export declare const UNKNOWN_ARTICLE_TYPE = "unknown";
19
43
  * 3. `articleTypes[0]` — pre-aggregator-pipeline plural array
20
44
  * 4. `runType` — historic field on older breaking-run manifests
21
45
  *
22
- * Falls back to `'unknown'` when none of the above is a non-empty string.
46
+ * Each candidate is passed through {@link stripRunSuffix} so trailing
47
+ * `-run<N>` taxonomy noise never leaks into JSON-LD `articleSection`,
48
+ * the filesystem slug, or the SEO dump's article-type histogram. Falls
49
+ * back to `'unknown'` when none of the above is a non-empty string.
23
50
  *
24
51
  * @param manifest - Parsed manifest (any of the supported schemas)
25
52
  * @returns Article-type slug usable as a filename component