euparliamentmonitor 0.9.22 → 0.9.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/scripts/aggregator/article-metadata.js +69 -14
- package/scripts/aggregator/editorial-brief-resolver.js +23 -0
- package/scripts/aggregator/generator/slug.js +0 -22
- package/scripts/aggregator/html/headline.d.ts +41 -9
- package/scripts/aggregator/html/headline.js +69 -10
- package/scripts/aggregator/html/shell.js +73 -17
- package/scripts/aggregator/manifest/index.d.ts +1 -1
- package/scripts/aggregator/manifest/index.js +1 -1
- package/scripts/aggregator/manifest/resolver.d.ts +28 -1
- package/scripts/aggregator/manifest/resolver.js +61 -5
- package/scripts/aggregator/markdown-renderer.js +11 -0
- package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
- package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
- package/scripts/aggregator/metadata/artifact-walker.js +27 -8
- package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
- package/scripts/aggregator/metadata/brief-body.js +69 -0
- package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
- package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
- package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
- package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
- package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
- package/scripts/aggregator/metadata/heading-rules.js +78 -270
- package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
- package/scripts/aggregator/metadata/keyword-filters.js +156 -0
- package/scripts/aggregator/metadata/lede-extractor.js +11 -2
- package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
- package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
- package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
- package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
- package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
- package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
- package/scripts/aggregator/metadata/seo-budgets.js +202 -0
- package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
- package/scripts/aggregator/metadata/text-truncate.js +277 -0
- package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
- package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
- package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
- package/scripts/aggregator/metadata/text-utils.js +119 -439
- package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
- package/scripts/aggregator/metadata/title-rejection.js +179 -0
- package/scripts/dump-article-seo.js +75 -2
- package/scripts/fix-mermaid-diagrams.js +931 -0
- package/scripts/validate-article-seo.js +534 -0
- package/scripts/validate-mermaid-diagrams.js +306 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "euparliamentmonitor",
|
|
3
|
-
"version": "0.9.22",
|
|
3
|
+
"version": "0.9.24",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "European Parliament Intelligence Platform - Monitor political activity with systematic transparency",
|
|
6
6
|
"main": "scripts/index.js",
|
|
@@ -66,6 +66,7 @@
|
|
|
66
66
|
"discover:untranslated-briefs": "node scripts/discover-untranslated-briefs.js",
|
|
67
67
|
"validate:translations": "node scripts/validate-brief-translations.js",
|
|
68
68
|
"validate:manifest-seo": "node scripts/validate-manifest-seo.js",
|
|
69
|
+
"validate:seo": "node scripts/validate-article-seo.js",
|
|
69
70
|
"sync:templates": "node scripts/templates/sync-template-frontmatter.js",
|
|
70
71
|
"sync:templates:check": "node scripts/templates/sync-template-frontmatter.js --check",
|
|
71
72
|
"prior-run-diff": "node scripts/aggregator/prior-run-diff.js",
|
|
@@ -64,7 +64,7 @@
|
|
|
64
64
|
import { ALL_LANGUAGES } from '../constants/language-core.js';
|
|
65
65
|
import { resolveLocalizedBriefHighlight } from './editorial-brief-resolver.js';
|
|
66
66
|
import { buildTemplateFallback } from './metadata/template-fallback.js';
|
|
67
|
-
import { buildSeoKeywords, composeContextualDescription, composeContextualTitle, manifestOverrideFor, pickFirstNonEmpty, resolveEditorialContent, } from './metadata/resolve-helpers.js';
|
|
67
|
+
import { buildSeoKeywords, composeContextualDescription, composeContextualExtendedDescription, composeContextualTitle, deriveHeadlineFromSummary, hasLeakySeoToken, isUsableResolvedTitle, manifestOverrideFor, pickFirstNonEmpty, resolveEditorialContent, sanitizeDescriptionCandidate, } from './metadata/resolve-helpers.js';
|
|
68
68
|
import { ENRICHMENT_TRIGGER_LENGTH, truncateDescription, truncateExtendedDescription, truncateTitle, } from './metadata/text-utils.js';
|
|
69
69
|
export { shouldSkipDescriptionLine, stripLeadingProseLabel, stripInlineMarkdown, truncateDescription, truncateExtendedDescription, truncateTitle, extractFirstSentence, } from './metadata/text-utils.js';
|
|
70
70
|
export { isArtifactCategoryHeading, stripArtifactCategoryAffix, isGenericHeading, } from './metadata/heading-rules.js';
|
|
@@ -109,6 +109,7 @@ export function resolveArticleMetadata(opts) {
|
|
|
109
109
|
}
|
|
110
110
|
return result;
|
|
111
111
|
}
|
|
112
|
+
const LOCALIZED_BRIEF_SOURCE = 'localized-brief';
|
|
112
113
|
/**
|
|
113
114
|
* Resolve `{title, description, keywords, source}` for one language.
|
|
114
115
|
*
|
|
@@ -122,20 +123,74 @@ function resolveOneLanguage(input) {
|
|
|
122
123
|
const editorial = perLanguage.editorial;
|
|
123
124
|
const contextualTitle = composeContextualTitle(input.template.title, editorial.headline, input.runId);
|
|
124
125
|
const title = pickFirstNonEmpty([manifestTitle, contextualTitle, input.template.title]);
|
|
125
|
-
const rawDescription = pickFirstNonEmpty([
|
|
126
|
-
|
|
127
|
-
editorial.
|
|
128
|
-
|
|
126
|
+
const rawDescription = sanitizeDescriptionCandidate(pickFirstNonEmpty([manifestDescription, editorial.summary, input.template.subtitle]));
|
|
127
|
+
const safeEditorial = {
|
|
128
|
+
headline: isUsableResolvedTitle(editorial.headline) ? editorial.headline.trim() : '',
|
|
129
|
+
summary: sanitizeDescriptionCandidate(editorial.summary),
|
|
130
|
+
extendedSummary: sanitizeDescriptionCandidate(editorial.extendedSummary),
|
|
131
|
+
};
|
|
132
|
+
const normalizedRawDescription = rawDescription || sanitizeDescriptionCandidate(input.template.subtitle);
|
|
133
|
+
const skipEnrichment = perLanguage.source === LOCALIZED_BRIEF_SOURCE && normalizedRawDescription.length > 0;
|
|
134
|
+
const description = skipEnrichment || normalizedRawDescription.length >= ENRICHMENT_TRIGGER_LENGTH
|
|
135
|
+
? normalizedRawDescription
|
|
136
|
+
: composeContextualDescription(input.lang, normalizedRawDescription, safeEditorial, input.date, input.runId);
|
|
137
|
+
const clippedTitle = truncateTitle(title).trim();
|
|
138
|
+
const explicitTitle = manifestTitle && !hasLeakySeoToken(manifestTitle) ? truncateTitle(manifestTitle).trim() : '';
|
|
139
|
+
const allowShortResolvedTitle = perLanguage.source === LOCALIZED_BRIEF_SOURCE;
|
|
140
|
+
const resolvedTitleCandidate = clippedTitle &&
|
|
141
|
+
!hasLeakySeoToken(clippedTitle) &&
|
|
142
|
+
(allowShortResolvedTitle || isUsableResolvedTitle(clippedTitle))
|
|
143
|
+
? clippedTitle
|
|
144
|
+
: '';
|
|
145
|
+
const summaryDerivedTitle = deriveHeadlineFromSummary(safeEditorial.summary || normalizedRawDescription);
|
|
146
|
+
// `truncateTitle` returns '' when an editorial title overruns the
|
|
147
|
+
// budget with no acceptable clause boundary — fall back to the
|
|
148
|
+
// localized template title in that case so we never emit an empty
|
|
149
|
+
// `<title>`. Live regression: 2026-05-22 breaking
|
|
150
|
+
// `AI Trade Strategy: A Legislative First with Structural…` clipped
|
|
151
|
+
// to '' after the no-ellipsis guard landed; template fallback
|
|
152
|
+
// (`Extended Executive Brief — Breaking News`) is preferable to a
|
|
153
|
+
// blank `<title>`.
|
|
154
|
+
//
|
|
155
|
+
// The fallback path passes the template title back through
|
|
156
|
+
// {@link composeContextualTitle} (with an empty editorial headline)
|
|
157
|
+
// so `withRunQualifier` re-appends the `— Run N` suffix. Without
|
|
158
|
+
// this, two same-date / same-articleType runs (republish, hot-fix
|
|
159
|
+
// re-run) would collapse to byte-identical `<title>` strings, and
|
|
160
|
+
// the duplicate-title gate in `scripts/validate-article-seo.js`
|
|
161
|
+
// would (correctly) fail CI.
|
|
162
|
+
const contextualFallback = composeContextualTitle(input.template.title, '', input.runId);
|
|
163
|
+
const truncatedTitle = pickFirstNonEmpty([
|
|
164
|
+
explicitTitle,
|
|
165
|
+
resolvedTitleCandidate,
|
|
166
|
+
isUsableResolvedTitle(summaryDerivedTitle, { allowFullSentence: true })
|
|
167
|
+
? summaryDerivedTitle
|
|
168
|
+
: '',
|
|
169
|
+
truncateTitle(contextualFallback),
|
|
170
|
+
contextualFallback,
|
|
129
171
|
]);
|
|
130
|
-
const description = rawDescription.length >= ENRICHMENT_TRIGGER_LENGTH
|
|
131
|
-
? rawDescription
|
|
132
|
-
: composeContextualDescription(input.lang, rawDescription, editorial, input.date, input.runId);
|
|
133
|
-
const truncatedTitle = truncateTitle(title);
|
|
134
172
|
const truncatedDescription = truncateDescription(description);
|
|
135
|
-
const extendedSource = manifestDescription
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
173
|
+
const extendedSource = sanitizeDescriptionCandidate(manifestDescription || safeEditorial.extendedSummary || normalizedRawDescription);
|
|
174
|
+
// Two-tier extended-description resolution:
|
|
175
|
+
// 1. Direct truncation — preferred when the editorial source paragraph
|
|
176
|
+
// is already ≥181 chars (the truncator's gating threshold). This
|
|
177
|
+
// yields the highest-fidelity og:description text.
|
|
178
|
+
// 2. Contextual synthesis — when direct truncation returns '' (source
|
|
179
|
+
// was too short), synthesize a longer string by stitching together
|
|
180
|
+
// `<source> + Date: YYYY-MM-DD + Context: <editorial> + <reader>`.
|
|
181
|
+
// This is the **only** SEO path that surfaces the localized
|
|
182
|
+
// "for democratic-accountability readers …" framing (the short
|
|
183
|
+
// <meta description> no longer carries it — see comment in
|
|
184
|
+
// {@link composeContextualDescription}). The synthesized string is
|
|
185
|
+
// re-clamped to the 200–300 char og:description budget.
|
|
186
|
+
//
|
|
187
|
+
// Live regression (2026-05): 56 breaking briefs shipped with empty
|
|
188
|
+
// extendedDescription because their lead paragraph was only 80–150
|
|
189
|
+
// chars. AI-overview and Discover surfaces dropped them entirely.
|
|
190
|
+
let truncatedExtendedDescription = truncateExtendedDescription(extendedSource);
|
|
191
|
+
if (!truncatedExtendedDescription) {
|
|
192
|
+
truncatedExtendedDescription = composeContextualExtendedDescription(input.lang, extendedSource || normalizedRawDescription, safeEditorial, input.date);
|
|
193
|
+
}
|
|
139
194
|
const source = manifestTitle || manifestDescription ? 'manifest' : perLanguage.source;
|
|
140
195
|
return {
|
|
141
196
|
title: truncatedTitle,
|
|
@@ -163,7 +218,7 @@ function resolvePerLanguageEditorial(input) {
|
|
|
163
218
|
summary: localized.summary,
|
|
164
219
|
extendedSummary: localized.extendedSummary,
|
|
165
220
|
},
|
|
166
|
-
source:
|
|
221
|
+
source: LOCALIZED_BRIEF_SOURCE,
|
|
167
222
|
};
|
|
168
223
|
}
|
|
169
224
|
}
|
|
@@ -30,6 +30,7 @@
|
|
|
30
30
|
import fs from 'fs';
|
|
31
31
|
import path from 'path';
|
|
32
32
|
import { extractFirstH1, extractLedeAfterHeading, extractExtendedLedeAfterHeading, extractStrongProseLine, isGenericHeading, stripArtifactCategoryAffix, truncateTitle, } from './article-metadata.js';
|
|
33
|
+
import { extractBriefingHighlight } from './metadata/briefing-highlight.js';
|
|
33
34
|
/**
|
|
34
35
|
* Run-relative candidate paths for a translated brief, in precedence
|
|
35
36
|
* order. Mirrors the `executive-brief.md` → `extended/executive-brief.md`
|
|
@@ -174,6 +175,28 @@ export function resolveLocalizedBriefHighlight(runDir, lang, articleType, date)
|
|
|
174
175
|
const body = readArtefactBody(abs);
|
|
175
176
|
if (!body)
|
|
176
177
|
continue;
|
|
178
|
+
// Tier 1 (NEW, May-2026): structural extraction of `## Strategic
|
|
179
|
+
// Intelligence Summary` / `## Reader Briefing` sections. The
|
|
180
|
+
// briefing extractor is language-agnostic — it matches on the
|
|
181
|
+
// English section headings, which the translation pipeline
|
|
182
|
+
// preserves verbatim under the localized brief contract — so a
|
|
183
|
+
// Swedish brief whose synthesis section is still written as
|
|
184
|
+
// `## Strategic Intelligence Summary` (with translated body
|
|
185
|
+
// prose) will resolve correctly here. When the translator has
|
|
186
|
+
// additionally localized the section heading the matcher falls
|
|
187
|
+
// back to the legacy lede/H1 path below, producing the
|
|
188
|
+
// localized H1 as headline.
|
|
189
|
+
const briefing = extractBriefingHighlight(body);
|
|
190
|
+
if (briefing && (briefing.headline || briefing.summary)) {
|
|
191
|
+
const fallbackHeadline = deriveHeadline(body, articleType, date);
|
|
192
|
+
return {
|
|
193
|
+
headline: briefing.headline || fallbackHeadline,
|
|
194
|
+
summary: briefing.summary,
|
|
195
|
+
extendedSummary: briefing.extendedSummary || extractExtendedLedeAfterHeading(body),
|
|
196
|
+
sourceFile: rel,
|
|
197
|
+
sourceLang: lang,
|
|
198
|
+
};
|
|
199
|
+
}
|
|
177
200
|
const headline = deriveHeadline(body, articleType, date);
|
|
178
201
|
const lede = extractLedeAfterHeading(body);
|
|
179
202
|
const summary = lede || extractStrongProseLine(body);
|
|
@@ -37,27 +37,6 @@ export function buildArticleSlug(date, articleType, runSuffix) {
|
|
|
37
37
|
export function sanitizeRunSuffix(runId) {
|
|
38
38
|
return _sanitizeRunSuffix(runId);
|
|
39
39
|
}
|
|
40
|
-
/**
|
|
41
|
-
* Return `true` when a line should be skipped when hunting for the default
|
|
42
|
-
* description. Thin wrapper preserved for back-compat — real logic lives
|
|
43
|
-
* in `src/aggregator/article-metadata.ts`'s `shouldSkipDescriptionLine`.
|
|
44
|
-
*
|
|
45
|
-
* @param line - Trimmed line from the aggregated Markdown source
|
|
46
|
-
* @returns `true` when the line is not prose and should be skipped
|
|
47
|
-
*/
|
|
48
|
-
function shouldSkipDescriptionLine(line) {
|
|
49
|
-
if (line.length === 0)
|
|
50
|
-
return true;
|
|
51
|
-
if (line.startsWith('#'))
|
|
52
|
-
return true;
|
|
53
|
-
if (line.startsWith('>'))
|
|
54
|
-
return true;
|
|
55
|
-
if (line.startsWith('<'))
|
|
56
|
-
return true;
|
|
57
|
-
if (line.startsWith('|'))
|
|
58
|
-
return true;
|
|
59
|
-
return false;
|
|
60
|
-
}
|
|
61
40
|
/** Description used when no prose paragraph qualifies. */
|
|
62
41
|
const FALLBACK_DESCRIPTION = 'EU Parliament intelligence summary derived from committed analysis artifacts.';
|
|
63
42
|
/**
|
|
@@ -73,7 +52,6 @@ const FALLBACK_DESCRIPTION = 'EU Parliament intelligence summary derived from co
|
|
|
73
52
|
* @returns Plain-text description, truncated to ≤300 characters
|
|
74
53
|
*/
|
|
75
54
|
export function extractDefaultDescription(markdown) {
|
|
76
|
-
void shouldSkipDescriptionLine;
|
|
77
55
|
const strong = extractStrongProseLine(markdown);
|
|
78
56
|
return strong.length > 0 ? strong : FALLBACK_DESCRIPTION;
|
|
79
57
|
}
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import type { LanguageCode } from '../../types/index.js';
|
|
1
|
+
import type { LanguageCode, LanguageMap } from '../../types/index.js';
|
|
2
|
+
import { type SeoSurface } from '../metadata/seo-budgets.js';
|
|
2
3
|
/**
|
|
3
4
|
* Resolve a localized article type label *without* the leading icon
|
|
4
5
|
* emoji. Used for the OpenGraph `article:section` meta and the JSON-LD
|
|
@@ -28,19 +29,50 @@ export declare const HEADLINE_LIMIT = 110;
|
|
|
28
29
|
export declare function truncateHeadline(title: string): string;
|
|
29
30
|
/**
|
|
30
31
|
* Build the localized `<title>` separator for the
|
|
31
|
-
* `{articleTitle} {sep} {siteTitle}` pattern.
|
|
32
|
-
* right-pointing guillemet (»); RTL locales (Arabic, Hebrew) use the
|
|
33
|
-
* left-pointing guillemet («) so the visual hierarchy reads from the
|
|
34
|
-
* primary title towards the site name without breaking bidi flow.
|
|
32
|
+
* `{articleTitle} {sep} {siteTitle}` pattern.
|
|
35
33
|
*
|
|
36
|
-
*
|
|
37
|
-
*
|
|
38
|
-
*
|
|
34
|
+
* Latin scripts use the policy-mandated ASCII pipe (`" | "`), which
|
|
35
|
+
* scans cleanly in SERP cards and never collides with em-dashes that
|
|
36
|
+
* the editorial style routinely uses inside titles. CJK locales use
|
|
37
|
+
* the Katakana middle-dot (`" ・ "`, U+30FB) which is the documented
|
|
38
|
+
* Google CJK separator and renders correctly in JP / KO / ZH fonts.
|
|
39
|
+
* RTL locales use the Hebrew paseq (`" ׀ "`, U+05C0) — a vertical
|
|
40
|
+
* stroke that preserves bidi flow without injecting a Latin guillemet
|
|
41
|
+
* that would force a direction change mid-title.
|
|
39
42
|
*
|
|
40
43
|
* @param lang - Target language code
|
|
41
|
-
* @returns
|
|
44
|
+
* @returns Per-script separator
|
|
42
45
|
*/
|
|
43
46
|
export declare function getTitleSeparator(lang: LanguageCode): string;
|
|
47
|
+
/**
|
|
48
|
+
* Short brand fallback per script family. Used by
|
|
49
|
+
* {@link buildPageTitle} when the full `SITE_NAME` would push the
|
|
50
|
+
* `<title>` past the SERP budget but a shorter variant would fit.
|
|
51
|
+
*
|
|
52
|
+
* - Latin → "EPM" (3 chars, ASCII-safe in news cards)
|
|
53
|
+
* - CJK → "EPM" (Latin abbreviation reads correctly in JP / KO / ZH SERPs)
|
|
54
|
+
* - RTL → Latin abbreviation "EPM" works in both Arabic and Hebrew
|
|
55
|
+
* SERP cards (Bing/Google render the Latin token RTL-isolated)
|
|
56
|
+
*
|
|
57
|
+
* Per-locale overrides live in {@link SHORT_SITE_NAMES} below so a
|
|
58
|
+
* future editorial change (e.g. a registered Arabic brand) only
|
|
59
|
+
* touches the table.
|
|
60
|
+
*/
|
|
61
|
+
export declare const SHORT_SITE_NAMES: LanguageMap;
|
|
62
|
+
/**
|
|
63
|
+
* Compose a title for one SEO surface using the per-script byte
|
|
64
|
+
* budget from `metadata/seo-budgets.ts`. Drops the brand suffix when
|
|
65
|
+
* the article title alone fills the budget (better SERP than a
|
|
66
|
+
* truncated headline followed by a clipped brand) and falls through
|
|
67
|
+
* to a short-brand variant when that fits but the full one does not.
|
|
68
|
+
*
|
|
69
|
+
* @param title - Article title (plain text, already markdown-stripped)
|
|
70
|
+
* @param lang - Target language code
|
|
71
|
+
* @param siteTitle - Full brand suffix (e.g. "EU Parliament Monitor")
|
|
72
|
+
* @param surface - Optional SEO surface; defaults to `<title>` budget
|
|
73
|
+
* @returns Composed, budget-clamped title
|
|
74
|
+
*/
|
|
75
|
+
export declare function buildPageTitle(title: string, lang: LanguageCode, siteTitle: string, surface?: SeoSurface): string;
|
|
44
76
|
/**
|
|
45
77
|
* Resolve a localized article type label with icon. Falls back to the
|
|
46
78
|
* humanised slug when a translation isn't available.
|
|
@@ -7,8 +7,9 @@
|
|
|
7
7
|
* icon), the page-title separator that respects bidi direction, and the
|
|
8
8
|
* Schema.org-compatible truncated headline used in JSON-LD.
|
|
9
9
|
*/
|
|
10
|
-
import { ARTICLE_TYPE_LABELS, ARTICLE_TYPE_ICONS, getLocalizedString,
|
|
10
|
+
import { ARTICLE_TYPE_LABELS, ARTICLE_TYPE_ICONS, getLocalizedString, } from '../../constants/languages.js';
|
|
11
11
|
import { ArticleCategory } from '../../types/index.js';
|
|
12
|
+
import { classifyScript, clampTitleForSurface } from '../metadata/seo-budgets.js';
|
|
12
13
|
/**
|
|
13
14
|
* Resolve a localized article type label *without* the leading icon
|
|
14
15
|
* emoji. Used for the OpenGraph `article:section` meta and the JSON-LD
|
|
@@ -51,20 +52,78 @@ export function truncateHeadline(title) {
|
|
|
51
52
|
}
|
|
52
53
|
/**
|
|
53
54
|
* Build the localized `<title>` separator for the
|
|
54
|
-
* `{articleTitle} {sep} {siteTitle}` pattern.
|
|
55
|
-
* right-pointing guillemet (»); RTL locales (Arabic, Hebrew) use the
|
|
56
|
-
* left-pointing guillemet («) so the visual hierarchy reads from the
|
|
57
|
-
* primary title towards the site name without breaking bidi flow.
|
|
55
|
+
* `{articleTitle} {sep} {siteTitle}` pattern.
|
|
58
56
|
*
|
|
59
|
-
*
|
|
60
|
-
*
|
|
61
|
-
*
|
|
57
|
+
* Latin scripts use the policy-mandated ASCII pipe (`" | "`), which
|
|
58
|
+
* scans cleanly in SERP cards and never collides with em-dashes that
|
|
59
|
+
* the editorial style routinely uses inside titles. CJK locales use
|
|
60
|
+
* the Katakana middle-dot (`" ・ "`, U+30FB) which is the documented
|
|
61
|
+
* Google CJK separator and renders correctly in JP / KO / ZH fonts.
|
|
62
|
+
* RTL locales use the Hebrew paseq (`" ׀ "`, U+05C0) — a vertical
|
|
63
|
+
* stroke that preserves bidi flow without injecting a Latin guillemet
|
|
64
|
+
* that would force a direction change mid-title.
|
|
62
65
|
*
|
|
63
66
|
* @param lang - Target language code
|
|
64
|
-
* @returns
|
|
67
|
+
* @returns Per-script separator
|
|
65
68
|
*/
|
|
66
69
|
export function getTitleSeparator(lang) {
|
|
67
|
-
|
|
70
|
+
const family = classifyScript(lang);
|
|
71
|
+
if (family === 'cjk')
|
|
72
|
+
return ' ・ ';
|
|
73
|
+
if (family === 'rtl')
|
|
74
|
+
return ' ׀ ';
|
|
75
|
+
return ' | ';
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Short brand fallback per script family. Used by
|
|
79
|
+
* {@link buildPageTitle} when the full `SITE_NAME` would push the
|
|
80
|
+
* `<title>` past the SERP budget but a shorter variant would fit.
|
|
81
|
+
*
|
|
82
|
+
* - Latin → "EPM" (3 chars, ASCII-safe in news cards)
|
|
83
|
+
* - CJK → "EPM" (Latin abbreviation reads correctly in JP / KO / ZH SERPs)
|
|
84
|
+
* - RTL → Latin abbreviation "EPM" works in both Arabic and Hebrew
|
|
85
|
+
* SERP cards (Bing/Google render the Latin token RTL-isolated)
|
|
86
|
+
*
|
|
87
|
+
* Per-locale overrides live in {@link SHORT_SITE_NAMES} below so a
|
|
88
|
+
* future editorial change (e.g. a registered Arabic brand) only
|
|
89
|
+
* touches the table.
|
|
90
|
+
*/
|
|
91
|
+
export const SHORT_SITE_NAMES = {
|
|
92
|
+
en: 'EPM',
|
|
93
|
+
sv: 'EPM',
|
|
94
|
+
da: 'EPM',
|
|
95
|
+
no: 'EPM',
|
|
96
|
+
fi: 'EPM',
|
|
97
|
+
de: 'EPM',
|
|
98
|
+
fr: 'EPM',
|
|
99
|
+
es: 'EPM',
|
|
100
|
+
nl: 'EPM',
|
|
101
|
+
ar: 'EPM',
|
|
102
|
+
he: 'EPM',
|
|
103
|
+
ja: 'EPM',
|
|
104
|
+
ko: 'EPM',
|
|
105
|
+
zh: 'EPM',
|
|
106
|
+
};
|
|
107
|
+
/**
|
|
108
|
+
* Compose a title for one SEO surface using the per-script byte
|
|
109
|
+
* budget from `metadata/seo-budgets.ts`. Drops the brand suffix when
|
|
110
|
+
* the article title alone fills the budget (better SERP than a
|
|
111
|
+
* truncated headline followed by a clipped brand) and falls through
|
|
112
|
+
* to a short-brand variant when that fits but the full one does not.
|
|
113
|
+
*
|
|
114
|
+
* @param title - Article title (plain text, already markdown-stripped)
|
|
115
|
+
* @param lang - Target language code
|
|
116
|
+
* @param siteTitle - Full brand suffix (e.g. "EU Parliament Monitor")
|
|
117
|
+
* @param surface - Optional SEO surface; defaults to `<title>` budget
|
|
118
|
+
* @returns Composed, budget-clamped title
|
|
119
|
+
*/
|
|
120
|
+
export function buildPageTitle(title, lang, siteTitle, surface = 'title') {
|
|
121
|
+
const shortSiteTitle = getLocalizedString(SHORT_SITE_NAMES, lang);
|
|
122
|
+
return clampTitleForSurface(title, lang, surface, {
|
|
123
|
+
siteTitle,
|
|
124
|
+
shortSiteTitle,
|
|
125
|
+
separator: getTitleSeparator(lang),
|
|
126
|
+
});
|
|
68
127
|
}
|
|
69
128
|
/**
|
|
70
129
|
* Resolve a localized article type label with icon. Falls back to the
|
|
@@ -20,7 +20,8 @@ import { stripHtmlTags } from '../../utils/html-sanitize.js';
|
|
|
20
20
|
import { buildResponsiveIconLinks, buildResponsiveSocialImageMeta, buildSiteFooter, buildSiteHeader, buildPageBanner, } from '../../templates/section-builders.js';
|
|
21
21
|
import { getPoliticalIntelligenceFilename } from '../../generators/political-intelligence.js';
|
|
22
22
|
import { getSitemapFilename } from '../../generators/sitemap/index.js';
|
|
23
|
-
import { truncateHeadline, getTitleSeparator, getLocalizedArticleType, getLocalizedArticleTypePlain, } from './headline.js';
|
|
23
|
+
import { truncateHeadline, getTitleSeparator, buildPageTitle, getLocalizedArticleType, getLocalizedArticleTypePlain, } from './headline.js';
|
|
24
|
+
import { clampForBudget } from '../metadata/seo-budgets.js';
|
|
24
25
|
import { getArticleFilename, buildArticleHreflangLinks, buildLanguageSwitcher, } from './hreflang.js';
|
|
25
26
|
import { buildArticleToc } from './toc.js';
|
|
26
27
|
import { blobUrl } from '../infra/github-urls.js';
|
|
@@ -28,6 +29,57 @@ import { blobUrl } from '../infra/github-urls.js';
|
|
|
28
29
|
export const PUBLISHER_NAME = 'Hack23 AB';
|
|
29
30
|
/** Site name used across meta tags and structured data. */
|
|
30
31
|
export const SITE_NAME = 'EU Parliament Monitor';
|
|
32
|
+
/**
|
|
33
|
+
* Compute the per-surface SEO-budget-clamped variants of the article
|
|
34
|
+
* title and description for a single render. See
|
|
35
|
+
* `analysis/methodologies/seo-headers-policy.md` § 1.1 for the
|
|
36
|
+
* documented sources of every cap.
|
|
37
|
+
*
|
|
38
|
+
* @param options - The {@link WrapArticleOptions} carrying title /
|
|
39
|
+
* description / extendedDescription
|
|
40
|
+
* @param lang - Validated publishing locale (already coerced to a
|
|
41
|
+
* supported `LanguageCode`)
|
|
42
|
+
* @param siteTitle - Resolved localized site title used as the brand
|
|
43
|
+
* suffix
|
|
44
|
+
* @returns One {@link SeoClampedSurfaces} record per article render
|
|
45
|
+
*/
|
|
46
|
+
function computeSeoClamps(options, lang, siteTitle) {
|
|
47
|
+
const pageTitle = buildPageTitle(options.title, lang, siteTitle);
|
|
48
|
+
const ogTitleClamped = clampForBudget(options.title, lang, 'ogTitle');
|
|
49
|
+
const twitterTitleClamped = clampForBudget(options.title, lang, 'twitterTitle');
|
|
50
|
+
const metaDescriptionClamped = clampForBudget(options.description, lang, 'metaDescription');
|
|
51
|
+
// og:description and twitter:description prefer the longer BLUF
|
|
52
|
+
// paragraph (extendedDescription) so social-card previews show the
|
|
53
|
+
// full lede; fall back to the short meta description when the
|
|
54
|
+
// extended one is empty.
|
|
55
|
+
const socialSource = options.extendedDescription && options.extendedDescription.length > 0
|
|
56
|
+
? options.extendedDescription
|
|
57
|
+
: options.description;
|
|
58
|
+
const ogDescriptionClamped = clampForBudget(socialSource, lang, 'ogDescription');
|
|
59
|
+
const twitterDescriptionClamped = clampForBudget(socialSource, lang, 'twitterDescription');
|
|
60
|
+
const imageAltClamped = clampForBudget(`${options.title}${getTitleSeparator(lang)}${siteTitle}`, lang, 'imageAlt');
|
|
61
|
+
const jsonLdHeadline = truncateHeadline(options.title);
|
|
62
|
+
// Emit an `alternativeHeadline` whenever the headline truncator
|
|
63
|
+
// dropped more than a handful of characters from the full title.
|
|
64
|
+
// Schema.org's `NewsArticle.alternativeHeadline` field is exactly
|
|
65
|
+
// for the long-form variant of `headline` and lets Google's
|
|
66
|
+
// Knowledge Graph keep both versions for retrieval. The 5-char
|
|
67
|
+
// threshold avoids emitting trivially redundant pairs when the
|
|
68
|
+
// truncator only trimmed trailing whitespace or punctuation.
|
|
69
|
+
const fullTitleTrimmed = options.title.trim();
|
|
70
|
+
const altCandidate = fullTitleTrimmed.length - jsonLdHeadline.length > 5 ? fullTitleTrimmed : undefined;
|
|
71
|
+
return {
|
|
72
|
+
pageTitle,
|
|
73
|
+
ogTitleClamped,
|
|
74
|
+
twitterTitleClamped,
|
|
75
|
+
metaDescriptionClamped,
|
|
76
|
+
ogDescriptionClamped,
|
|
77
|
+
twitterDescriptionClamped,
|
|
78
|
+
imageAltClamped,
|
|
79
|
+
jsonLdHeadline,
|
|
80
|
+
...(altCandidate ? { alternativeHeadline: altCandidate } : {}),
|
|
81
|
+
};
|
|
82
|
+
}
|
|
31
83
|
/**
|
|
32
84
|
* Render the full article HTML document with the shared chrome.
|
|
33
85
|
*
|
|
@@ -65,6 +117,17 @@ export function wrapArticleHtml(options) {
|
|
|
65
117
|
// CodeQL-safe.
|
|
66
118
|
const bodyText = stripHtmlTags(options.body);
|
|
67
119
|
const wordCount = bodyText.split(/\s+/u).filter((w) => w.length > 0).length;
|
|
120
|
+
// Pre-compute the per-surface SEO-budget-clamped variants of title
|
|
121
|
+
// and description. Each surface gets its own clamp tuned to the
|
|
122
|
+
// documented platform envelope (Google/Bing SERP, Facebook/LinkedIn
|
|
123
|
+
// OG, Twitter card) and the script family (Latin / CJK / RTL —
|
|
124
|
+
// CJK glyphs render at ~2× Latin pixel width, so the same byte
|
|
125
|
+
// count occupies twice the SERP width). See
|
|
126
|
+
// `src/aggregator/metadata/seo-budgets.ts` for the budget table and
|
|
127
|
+
// `analysis/methodologies/seo-headers-policy.md` § 1.1 for the
|
|
128
|
+
// documented sources of every cap.
|
|
129
|
+
const seoClamps = computeSeoClamps(options, safeLang, siteTitle);
|
|
130
|
+
const { pageTitle, ogTitleClamped, twitterTitleClamped, metaDescriptionClamped, ogDescriptionClamped, twitterDescriptionClamped, imageAltClamped, jsonLdHeadline, alternativeHeadline, } = seoClamps;
|
|
68
131
|
// Build the JSON-LD image graph. Google requires NewsArticle.image
|
|
69
132
|
// to be an array (or single ImageObject) with explicit width/height
|
|
70
133
|
// covering at least one of the 1:1, 4:3, 16:9 aspect ratios for
|
|
@@ -92,8 +155,9 @@ export function wrapArticleHtml(options) {
|
|
|
92
155
|
const jsonLd = {
|
|
93
156
|
'@context': 'https://schema.org',
|
|
94
157
|
'@type': 'NewsArticle',
|
|
95
|
-
headline:
|
|
96
|
-
|
|
158
|
+
headline: jsonLdHeadline,
|
|
159
|
+
...(alternativeHeadline ? { alternativeHeadline } : {}),
|
|
160
|
+
description: metaDescriptionClamped,
|
|
97
161
|
datePublished: options.date,
|
|
98
162
|
dateModified: options.date,
|
|
99
163
|
inLanguage: safeLang,
|
|
@@ -165,18 +229,10 @@ export function wrapArticleHtml(options) {
|
|
|
165
229
|
};
|
|
166
230
|
const structuredData = [jsonLd, breadcrumbLd];
|
|
167
231
|
const jsonLdString = JSON.stringify(structuredData).replace(/</g, '\\u003c');
|
|
168
|
-
const pageTitle = `${options.title}${getTitleSeparator(safeLang)}${siteTitle}`;
|
|
169
232
|
const keywords = (options.keywords ?? []).map((keyword) => keyword.trim()).filter(Boolean);
|
|
170
233
|
const keywordsMeta = keywords.length > 0
|
|
171
234
|
? ` <meta name="keywords" content="${escapeHTML(keywords.join(', '))}">\n`
|
|
172
235
|
: '';
|
|
173
|
-
// Use the longer extended description for og:description/twitter:description
|
|
174
|
-
// when available so social-card previews show the full BLUF
|
|
175
|
-
// paragraph; the short meta description stays within Google's
|
|
176
|
-
// ~160-char snippet budget.
|
|
177
|
-
const socialDescription = options.extendedDescription && options.extendedDescription.length > 0
|
|
178
|
-
? options.extendedDescription
|
|
179
|
-
: options.description;
|
|
180
236
|
const ogLocaleTags = buildOgLocaleTags(safeLang);
|
|
181
237
|
const twitterAttribution = buildTwitterAttributionTags();
|
|
182
238
|
const twitterAttributionBlock = twitterAttribution ? `\n${twitterAttribution}` : '';
|
|
@@ -196,7 +252,7 @@ export function wrapArticleHtml(options) {
|
|
|
196
252
|
<meta http-equiv="Content-Language" content="${safeLang}">
|
|
197
253
|
<meta name="referrer" content="no-referrer">
|
|
198
254
|
<title>${escapeHTML(pageTitle)}</title>
|
|
199
|
-
<meta name="description" content="${escapeHTML(
|
|
255
|
+
<meta name="description" content="${escapeHTML(metaDescriptionClamped)}">
|
|
200
256
|
${keywordsMeta} <meta name="robots" content="index, follow, max-snippet:-1, max-image-preview:large">
|
|
201
257
|
<meta name="author" content="${PUBLISHER_NAME}">
|
|
202
258
|
<meta name="publisher" content="${PUBLISHER_NAME}">
|
|
@@ -211,15 +267,15 @@ ${hreflangLinks}
|
|
|
211
267
|
<link rel="alternate" type="application/rss+xml" title="EU Parliament Monitor RSS" href="${BASE_URL}/rss.xml">
|
|
212
268
|
<link rel="preconnect" href="https://hack23.com" crossorigin>
|
|
213
269
|
<meta property="og:type" content="article">
|
|
214
|
-
<meta property="og:title" content="${escapeHTML(
|
|
215
|
-
<meta property="og:description" content="${escapeHTML(
|
|
270
|
+
<meta property="og:title" content="${escapeHTML(ogTitleClamped)}">
|
|
271
|
+
<meta property="og:description" content="${escapeHTML(ogDescriptionClamped)}">
|
|
216
272
|
<meta property="og:url" content="${canonicalUrl}">
|
|
217
273
|
<meta property="og:site_name" content="EU Parliament Monitor">
|
|
218
274
|
${ogLocaleTags}
|
|
219
|
-
${buildResponsiveSocialImageMeta(
|
|
275
|
+
${buildResponsiveSocialImageMeta(imageAltClamped)}
|
|
220
276
|
<meta name="twitter:card" content="summary_large_image">
|
|
221
|
-
<meta name="twitter:title" content="${escapeHTML(
|
|
222
|
-
<meta name="twitter:description" content="${escapeHTML(
|
|
277
|
+
<meta name="twitter:title" content="${escapeHTML(twitterTitleClamped)}">
|
|
278
|
+
<meta name="twitter:description" content="${escapeHTML(twitterDescriptionClamped)}">${twitterAttributionBlock}
|
|
223
279
|
${buildResponsiveIconLinks('../')}
|
|
224
280
|
<link rel="manifest" href="../site.webmanifest">
|
|
225
281
|
<meta name="color-scheme" content="light dark">
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
* @description Public re-exports for the manifest bounded context.
|
|
4
4
|
*/
|
|
5
5
|
export type { HorizonProfile, Manifest, ManifestFiles, ManifestHistoryEntry, ManifestMetadataOverride, MetadataManifest, } from './types.js';
|
|
6
|
-
export { resolveArticleType, resolveDate, resolveRunId, latestGateResult, flattenManifestFiles, UNKNOWN_ARTICLE_TYPE, } from './resolver.js';
|
|
6
|
+
export { resolveArticleType, resolveDate, resolveRunId, latestGateResult, flattenManifestFiles, stripRunSuffix, RUN_SUFFIX_PATTERN, UNKNOWN_ARTICLE_TYPE, } from './resolver.js';
|
|
7
7
|
export { readManifest, parseManifest, type ReadManifestResult } from './reader.js';
|
|
8
8
|
export { applyHorizonProfile, buildHorizonProfile } from './manifest-writer.js';
|
|
9
9
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
2
2
|
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
export { resolveArticleType, resolveDate, resolveRunId, latestGateResult, flattenManifestFiles, UNKNOWN_ARTICLE_TYPE, } from './resolver.js';
|
|
3
|
+
export { resolveArticleType, resolveDate, resolveRunId, latestGateResult, flattenManifestFiles, stripRunSuffix, RUN_SUFFIX_PATTERN, UNKNOWN_ARTICLE_TYPE, } from './resolver.js';
|
|
4
4
|
export { readManifest, parseManifest } from './reader.js';
|
|
5
5
|
export { applyHorizonProfile, buildHorizonProfile } from './manifest-writer.js';
|
|
6
6
|
//# sourceMappingURL=index.js.map
|
|
@@ -10,6 +10,30 @@
|
|
|
10
10
|
import type { Manifest, ManifestFiles } from './types.js';
|
|
11
11
|
/** Sentinel used when no schema variant supplies a usable article type. */
|
|
12
12
|
export declare const UNKNOWN_ARTICLE_TYPE = "unknown";
|
|
13
|
+
/**
|
|
14
|
+
* Pattern matching trailing `-run<N>` taxonomy noise that historic
|
|
15
|
+
* Stage-B writers occasionally encode into `articleType` (e.g.
|
|
16
|
+
* `committee-reports-run47`, `motions-run41`, `breaking-run193`). Also
|
|
17
|
+
* tolerates the legacy double-prefixed `motions-runmotions-run-1777010709`
|
|
18
|
+
* pattern observed in 2025 manifests where the writer concatenated the
|
|
19
|
+
* articleType and runId. The leading `-run` makes the match greedy enough
|
|
20
|
+
* to catch both single-suffix and double-prefixed forms.
|
|
21
|
+
*
|
|
22
|
+
* Exported for unit tests.
|
|
23
|
+
*/
|
|
24
|
+
export declare const RUN_SUFFIX_PATTERN: RegExp;
|
|
25
|
+
/**
|
|
26
|
+
* Strip a trailing `-run<N>` taxonomy-noise suffix from an article-type
|
|
27
|
+
* slug, but only when doing so yields a {@link CANONICAL_ARTICLE_TYPES}
|
|
28
|
+
* token. This is conservative: a non-canonical leading token (e.g.
|
|
29
|
+
* `custom-type-run5`) is returned untouched so we never silently
|
|
30
|
+
* collapse a genuinely new article type into something it isn't.
|
|
31
|
+
*
|
|
32
|
+
* @param slug - Raw article-type slug from a manifest field
|
|
33
|
+
* @returns Canonical slug when the suffix was successfully stripped,
|
|
34
|
+
* otherwise the original input
|
|
35
|
+
*/
|
|
36
|
+
export declare function stripRunSuffix(slug: string): string;
|
|
13
37
|
/**
|
|
14
38
|
* Resolve the article-type slug from a manifest, tolerating historic schemas.
|
|
15
39
|
*
|
|
@@ -19,7 +43,10 @@ export declare const UNKNOWN_ARTICLE_TYPE = "unknown";
|
|
|
19
43
|
* 3. `articleTypes[0]` — pre-aggregator-pipeline plural array
|
|
20
44
|
* 4. `runType` — historic field on older breaking-run manifests
|
|
21
45
|
*
|
|
22
|
-
*
|
|
46
|
+
* Each candidate is passed through {@link stripRunSuffix} so trailing
|
|
47
|
+
* `-run<N>` taxonomy noise never leaks into JSON-LD `articleSection`,
|
|
48
|
+
* the filesystem slug, or the SEO dump's article-type histogram. Falls
|
|
49
|
+
* back to `'unknown'` when none of the above is a non-empty string.
|
|
23
50
|
*
|
|
24
51
|
* @param manifest - Parsed manifest (any of the supported schemas)
|
|
25
52
|
* @returns Article-type slug usable as a filename component
|