euparliamentmonitor 0.9.21 → 0.9.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -2
- package/scripts/aggregator/article-metadata.js +69 -14
- package/scripts/aggregator/editorial-brief-resolver.js +23 -0
- package/scripts/aggregator/html/headline.d.ts +41 -9
- package/scripts/aggregator/html/headline.js +69 -10
- package/scripts/aggregator/html/shell.js +73 -17
- package/scripts/aggregator/manifest/index.d.ts +1 -1
- package/scripts/aggregator/manifest/index.js +1 -1
- package/scripts/aggregator/manifest/resolver.d.ts +28 -1
- package/scripts/aggregator/manifest/resolver.js +61 -5
- package/scripts/aggregator/markdown-renderer.js +11 -0
- package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
- package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
- package/scripts/aggregator/metadata/artifact-walker.js +29 -10
- package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
- package/scripts/aggregator/metadata/brief-body.js +69 -0
- package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
- package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
- package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
- package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
- package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
- package/scripts/aggregator/metadata/heading-rules.js +78 -269
- package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
- package/scripts/aggregator/metadata/keyword-filters.js +156 -0
- package/scripts/aggregator/metadata/lede-extractor.js +11 -2
- package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
- package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
- package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
- package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
- package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
- package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
- package/scripts/aggregator/metadata/seo-budgets.js +202 -0
- package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
- package/scripts/aggregator/metadata/text-truncate.js +277 -0
- package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
- package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
- package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
- package/scripts/aggregator/metadata/text-utils.js +119 -439
- package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
- package/scripts/aggregator/metadata/title-rejection.js +179 -0
- package/scripts/copy-vendor.js +84 -112
- package/scripts/dump-article-seo.js +640 -0
- package/scripts/fix-mermaid-diagrams.js +931 -0
- package/scripts/generators/news-indexes/backfill.d.ts +6 -1
- package/scripts/generators/news-indexes/backfill.js +71 -4
- package/scripts/validate-article-seo.js +534 -0
- package/scripts/validate-mermaid-diagrams.js +306 -0
|
@@ -49,6 +49,39 @@ export declare function composeContextualDescription(lang: LanguageCode, baseDes
|
|
|
49
49
|
readonly headline: string;
|
|
50
50
|
readonly summary: string;
|
|
51
51
|
}, date: string, _runId: string): string;
|
|
52
|
+
/**
|
|
53
|
+
* Build a per-article `extendedDescription` (used for
|
|
54
|
+
* `og:description`, Twitter cards, and AI-overview surfaces) that is
|
|
55
|
+
* still emitted (no minimum length is enforced here) when the
|
|
56
|
+
* editorial source paragraph is too short to satisfy
|
|
57
|
+
* {@link truncateExtendedDescription} on its own.
|
|
58
|
+
*
|
|
59
|
+
* This is the *only* code path that surfaces the localized
|
|
60
|
+
* `labels.reader` framing — the short `<meta description>` no longer
|
|
61
|
+
* carries it (see comment in {@link composeContextualDescription}).
|
|
62
|
+
* The structure is: `<base> <Date: YYYY-MM-DD.> <Context: …> <reader>`,
|
|
63
|
+
* passed through {@link truncateExtendedDescription} only when it
|
|
64
|
+
* overruns the 300-char cap, so it occupies the Open Graph / Discover budget
|
|
65
|
+
* without exceeding it.
|
|
66
|
+
*
|
|
67
|
+
* @param lang - Target language code
|
|
68
|
+
* @param baseDescription - Best description from manifest/editorial/template
|
|
69
|
+
* @param editorial - Artifact-derived headline and summary
|
|
70
|
+
* @param editorial.headline - Artifact-derived headline
|
|
71
|
+
* @param editorial.summary - Artifact-derived summary
|
|
72
|
+
* @param date - ISO article date
|
|
73
|
+
* @returns Composed description verbatim when it fits the 300-char cap, otherwise the {@link truncateExtendedDescription} result; `''` when nothing could be composed
|
|
74
|
+
*/
|
|
75
|
+
export declare function composeContextualExtendedDescription(lang: LanguageCode, baseDescription: string, editorial: {
|
|
76
|
+
readonly headline: string;
|
|
77
|
+
readonly summary: string;
|
|
78
|
+
}, date: string): string;
|
|
79
|
+
export declare function hasLeakySeoToken(value: string): boolean;
|
|
80
|
+
declare function sanitizeDescriptionCandidate(value: string): string;
|
|
81
|
+
declare function isUsableResolvedTitle(value: string, options?: {
|
|
82
|
+
readonly allowFullSentence?: boolean;
|
|
83
|
+
}): boolean;
|
|
84
|
+
declare function deriveHeadlineFromSummary(summary: string): string;
|
|
52
85
|
/**
|
|
53
86
|
* Append a short run qualifier to otherwise duplicate-prone fallback
|
|
54
87
|
* titles. Sanitizes the raw `runId` so user-facing `<title>` strings
|
|
@@ -88,4 +121,5 @@ export declare function buildSeoKeywords(lang: LanguageCode, articleType: string
|
|
|
88
121
|
* @returns First non-empty entry
|
|
89
122
|
*/
|
|
90
123
|
export declare function pickFirstNonEmpty(candidates: readonly string[]): string;
|
|
124
|
+
export { deriveHeadlineFromSummary, isUsableResolvedTitle, sanitizeDescriptionCandidate };
|
|
91
125
|
//# sourceMappingURL=resolve-helpers.d.ts.map
|
|
@@ -20,7 +20,14 @@ import { extractExtendedLedeAfterHeading, extractStrongProseLine } from './lede-
|
|
|
20
20
|
import { isGenericHeading } from './heading-rules.js';
|
|
21
21
|
import { humanizeSlug } from './slug.js';
|
|
22
22
|
import { SEO_CONTEXT_LABELS } from './template-fallback.js';
|
|
23
|
-
import {
|
|
23
|
+
import { EXTENDED_DESCRIPTION_MAX_LENGTH } from './text-utils-constants.js';
|
|
24
|
+
import { extractFirstSentence, shouldSkipDescriptionLine, truncateDescription, truncateExtendedDescription, truncateTitle, } from './text-utils.js';
|
|
25
|
+
import { readEnglishBriefBody } from './brief-body.js';
|
|
26
|
+
import { extractBriefingHighlight } from './briefing-highlight.js';
|
|
27
|
+
import { CROSS_SITE_KEYWORDS, isNoiseKeywordToken } from './keyword-filters.js';
|
|
28
|
+
import { findTitleRejectionReason } from './title-rejection.js';
|
|
29
|
+
const LEAKY_RUNID_RE = /\b[a-z][a-z-]*-run-?\d+-\d{8,}\b/iu;
|
|
30
|
+
const SEO_TITLE_FLOOR = 20;
|
|
24
31
|
/**
|
|
25
32
|
* Extract a manifest override value for a single language. Accepts either
|
|
26
33
|
* a plain string (applied to every language) or a `LanguageMap` object.
|
|
@@ -53,31 +60,80 @@ export function manifestOverrideFor(value, lang) {
|
|
|
53
60
|
*/
|
|
54
61
|
export function resolveEditorialContent(opts) {
|
|
55
62
|
const { articleType, date, markdown, runDir } = opts;
|
|
63
|
+
// Tier 1 (NEW, May-2026): structural extraction of `## Strategic
|
|
64
|
+
// Intelligence Summary` and `## Reader Briefing` from the English
|
|
65
|
+
// brief. These two sections are the editorial heart of every
|
|
66
|
+
// current-style executive brief — they are journalistically richer
|
|
67
|
+
// than the first non-generic H1 the legacy walker picks up, so we
|
|
68
|
+
// try them first. Returns `null` for the ~200 historical briefs
|
|
69
|
+
// that pre-date the style guide, in which case we fall through.
|
|
70
|
+
const briefBody = readEnglishBriefBody(runDir ?? '');
|
|
71
|
+
const briefing = briefBody ? extractBriefingHighlight(briefBody) : null;
|
|
72
|
+
// Bridge the briefing's `string | undefined` fields into plain
|
|
73
|
+
// strings so the downstream `||` fallback chains satisfy the
|
|
74
|
+
// `prefer-nullish-coalescing` lint rule (no nullable LHS).
|
|
75
|
+
const briefingHeadline = briefing?.headline ?? '';
|
|
76
|
+
const briefingSummary = briefing?.summary ?? '';
|
|
77
|
+
const briefingExtended = briefing?.extendedSummary ?? '';
|
|
78
|
+
if (briefingHeadline) {
|
|
79
|
+
return {
|
|
80
|
+
headline: briefingHeadline,
|
|
81
|
+
summary: briefingSummary,
|
|
82
|
+
extendedSummary: briefingExtended || extractExtendedLedeAfterHeading(markdown),
|
|
83
|
+
};
|
|
84
|
+
}
|
|
56
85
|
let artefactSummary = '';
|
|
57
86
|
if (runDir) {
|
|
58
87
|
const highlight = extractArtifactHighlight(runDir, articleType, date);
|
|
59
|
-
|
|
88
|
+
const highlightHeadline = highlight?.headline ?? '';
|
|
89
|
+
const highlightSummary = highlight?.summary ?? '';
|
|
90
|
+
if (highlightHeadline) {
|
|
60
91
|
return {
|
|
61
|
-
headline:
|
|
62
|
-
summary:
|
|
63
|
-
extendedSummary: extractExtendedLedeAfterHeading(markdown),
|
|
92
|
+
headline: highlightHeadline,
|
|
93
|
+
summary: briefingSummary || highlightSummary,
|
|
94
|
+
extendedSummary: briefingExtended || extractExtendedLedeAfterHeading(markdown),
|
|
64
95
|
};
|
|
65
96
|
}
|
|
66
|
-
if (
|
|
67
|
-
artefactSummary =
|
|
97
|
+
if (highlightSummary) {
|
|
98
|
+
artefactSummary = highlightSummary;
|
|
68
99
|
}
|
|
69
100
|
}
|
|
101
|
+
// Per the brief-only SEO contract (2026-05-24): when an executive
|
|
102
|
+
// brief is present, we **never** fall through to the aggregated
|
|
103
|
+
// `markdown` content (which is the assembled `article.md` body
|
|
104
|
+
// including all artefact prose). The brief is the only sanctioned
|
|
105
|
+
// source for `<title>` / `<meta description>` / keywords; if it
|
|
106
|
+
// failed to yield a usable headline above, the headline is derived
|
|
107
|
+
// from the artefact summary when one exists; otherwise it is left
|
|
108
|
+
// empty so the localized template fallback (Breaking | YYYY-MM-DD,
|
|
109
|
+
// etc.) wins. Only legacy runs without a brief reach the aggregated-H1 fallback.
|
|
110
|
+
const briefPresent = briefBody.trim().length > 0;
|
|
111
|
+
if (briefPresent) {
|
|
112
|
+
if (artefactSummary) {
|
|
113
|
+
const firstSentence = extractFirstSentence(artefactSummary);
|
|
114
|
+
return {
|
|
115
|
+
headline: truncateTitle(firstSentence || artefactSummary),
|
|
116
|
+
summary: briefingSummary || artefactSummary,
|
|
117
|
+
extendedSummary: briefingExtended || extractExtendedLedeAfterHeading(markdown),
|
|
118
|
+
};
|
|
119
|
+
}
|
|
120
|
+
return {
|
|
121
|
+
headline: '',
|
|
122
|
+
summary: briefingSummary,
|
|
123
|
+
extendedSummary: briefingExtended,
|
|
124
|
+
};
|
|
125
|
+
}
|
|
70
126
|
const aggregatedH1 = extractFirstH1(markdown);
|
|
71
127
|
const aggregatedSummary = extractStrongProseLine(markdown);
|
|
72
128
|
const aggregatedExtended = extractExtendedLedeAfterHeading(markdown);
|
|
73
129
|
if (aggregatedH1 && !isGenericHeading(aggregatedH1, articleType, date)) {
|
|
74
130
|
return {
|
|
75
131
|
headline: truncateTitle(aggregatedH1),
|
|
76
|
-
summary: artefactSummary || aggregatedSummary,
|
|
77
|
-
extendedSummary: aggregatedExtended,
|
|
132
|
+
summary: briefingSummary || artefactSummary || aggregatedSummary,
|
|
133
|
+
extendedSummary: briefingExtended || aggregatedExtended,
|
|
78
134
|
};
|
|
79
135
|
}
|
|
80
|
-
const summary = artefactSummary || aggregatedSummary;
|
|
136
|
+
const summary = briefingSummary || artefactSummary || aggregatedSummary;
|
|
81
137
|
if (summary) {
|
|
82
138
|
// The H1 is generic (category-noun, bare-institutional, or
|
|
83
139
|
// template-style) so we have to derive `<title>` from the BLUF/
|
|
@@ -85,11 +141,15 @@ export function resolveEditorialContent(opts) {
|
|
|
85
141
|
// resulting title is grammatically self-contained — falling back
|
|
86
142
|
// to clause-boundary truncation downstream when the sentence
|
|
87
143
|
// itself overruns TITLE_MAX_LENGTH.
|
|
144
|
+
// Fall back to the raw summary when the first-sentence extractor
|
|
145
|
+
// returns '' — happens when the source is a single sentence with no
|
|
146
|
+
// `. ` terminator inside the soft-min window. `truncateTitle` will
|
|
147
|
+
// still apply clause-boundary truncation downstream.
|
|
88
148
|
const firstSentence = extractFirstSentence(summary);
|
|
89
149
|
return {
|
|
90
|
-
headline: truncateTitle(firstSentence),
|
|
150
|
+
headline: truncateTitle(firstSentence || summary),
|
|
91
151
|
summary,
|
|
92
|
-
extendedSummary: aggregatedExtended,
|
|
152
|
+
extendedSummary: briefingExtended || aggregatedExtended,
|
|
93
153
|
};
|
|
94
154
|
}
|
|
95
155
|
return { headline: '', summary: '', extendedSummary: '' };
|
|
@@ -133,10 +193,123 @@ export function composeContextualDescription(lang, baseDescription, editorial, d
|
|
|
133
193
|
if (context && !containsNormalized(parts[0] ?? '', context)) {
|
|
134
194
|
parts.push(`${labels.context}: ${context}`);
|
|
135
195
|
}
|
|
196
|
+
// NOTE: the localized `labels.reader` "for democratic-accountability
|
|
197
|
+
// readers …" hint is intentionally **not** appended here. That
|
|
198
|
+
// boilerplate inflates `<meta description>` past the 160-char SERP
|
|
199
|
+
// cutoff without surfacing any article-specific signal, so it is
|
|
200
|
+
// restricted to the longer {@link composeContextualExtendedDescription}
|
|
201
|
+
// path (used by `og:description` / AI-overview surfaces, which have
|
|
202
|
+
// a 250–300 char budget where the framing carries real value).
|
|
203
|
+
return truncateDescription(parts.join(' '));
|
|
204
|
+
}
|
|
205
|
+
/**
|
|
206
|
+
* Build a per-article `extendedDescription` (used for
|
|
207
|
+
* `og:description`, Twitter cards, and AI-overview surfaces) that is
|
|
208
|
+
* still emitted (no minimum length is enforced here) when the
|
|
209
|
+
* editorial source paragraph is too short to satisfy
|
|
210
|
+
* {@link truncateExtendedDescription} on its own.
|
|
211
|
+
*
|
|
212
|
+
* This is the *only* code path that surfaces the localized
|
|
213
|
+
* `labels.reader` framing — the short `<meta description>` no longer
|
|
214
|
+
* carries it (see comment in {@link composeContextualDescription}).
|
|
215
|
+
* The structure is: `<base> <Date: YYYY-MM-DD.> <Context: …> <reader>`,
|
|
216
|
+
* passed through {@link truncateExtendedDescription} only when it
|
|
217
|
+
* overruns the 300-char cap, so it occupies the Open Graph / Discover budget
|
|
218
|
+
* without exceeding it.
|
|
219
|
+
*
|
|
220
|
+
* @param lang - Target language code
|
|
221
|
+
* @param baseDescription - Best description from manifest/editorial/template
|
|
222
|
+
* @param editorial - Artifact-derived headline and summary
|
|
223
|
+
* @param editorial.headline - Artifact-derived headline
|
|
224
|
+
* @param editorial.summary - Artifact-derived summary
|
|
225
|
+
* @param date - ISO article date
|
|
226
|
+
* @returns Composed description verbatim when it fits the 300-char cap, otherwise the {@link truncateExtendedDescription} result; `''` when nothing could be composed
|
|
227
|
+
*/
|
|
228
|
+
export function composeContextualExtendedDescription(lang, baseDescription, editorial, date) {
|
|
229
|
+
const labels = getLocalizedString(SEO_CONTEXT_LABELS, lang);
|
|
230
|
+
const base = baseDescription.trim();
|
|
231
|
+
const parts = base ? [base] : [];
|
|
232
|
+
const datePart = `${labels.date} ${date}.`;
|
|
233
|
+
if (!containsNormalized(base, `${labels.date} ${date}`)) {
|
|
234
|
+
parts.push(datePart);
|
|
235
|
+
}
|
|
236
|
+
const context = pickFirstNonEmpty([editorial.summary, editorial.headline]);
|
|
237
|
+
if (context && !containsNormalized(parts.join(' '), context)) {
|
|
238
|
+
parts.push(`${labels.context}: ${context}`);
|
|
239
|
+
}
|
|
136
240
|
if (!containsNormalized(parts.join(' '), labels.reader)) {
|
|
137
241
|
parts.push(labels.reader);
|
|
138
242
|
}
|
|
139
|
-
|
|
243
|
+
// Synthesizer path: clamp to the 300-char og:description budget
|
|
244
|
+
// *without* enforcing the 181-char sentence-boundary floor that
|
|
245
|
+
// {@link truncateExtendedDescription} applies. The whole point of
|
|
246
|
+
// this helper is to produce a non-empty extended description when
|
|
247
|
+
// the editorial source paragraph was too short — accepting a
|
|
248
|
+
// 130-char synthesized string is strictly better than the empty
|
|
249
|
+
// fallback that was previously emitted on 56 breaking briefs.
|
|
250
|
+
// The joined buffer is returned verbatim when it already fits
|
|
251
|
+
// EXTENDED_DESCRIPTION_MAX_LENGTH; only when it overruns that
|
|
252
|
+
// budget is it handed to {@link truncateExtendedDescription} for
|
|
253
|
+
// sentence-boundary truncation.
|
|
254
|
+
const joined = parts.join(' ').trim();
|
|
255
|
+
if (!joined)
|
|
256
|
+
return '';
|
|
257
|
+
if (joined.length <= EXTENDED_DESCRIPTION_MAX_LENGTH)
|
|
258
|
+
return joined;
|
|
259
|
+
// Overran the 300-char budget — apply the same sentence-boundary
|
|
260
|
+
// preserving truncation as truncateExtendedDescription.
|
|
261
|
+
return truncateExtendedDescription(joined);
|
|
262
|
+
}
|
|
263
|
+
export function hasLeakySeoToken(value) {
|
|
264
|
+
if (!value)
|
|
265
|
+
return false;
|
|
266
|
+
return value.toLowerCase().includes('analysis run') || LEAKY_RUNID_RE.test(value);
|
|
267
|
+
}
|
|
268
|
+
function stripLeadingFragmentSeparator(value) {
|
|
269
|
+
return value.replace(/^[:;—–-]\s+/u, '').trim();
|
|
270
|
+
}
|
|
271
|
+
function stripLeakySentences(value) {
|
|
272
|
+
if (!value)
|
|
273
|
+
return '';
|
|
274
|
+
const parts = value
|
|
275
|
+
.split(/(?<=[.!?])\s+/u)
|
|
276
|
+
.map((part) => part.trim())
|
|
277
|
+
.filter(Boolean);
|
|
278
|
+
const clean = parts.filter((part) => !hasLeakySeoToken(part));
|
|
279
|
+
return (clean.length > 0 ? clean : parts).join(' ').trim();
|
|
280
|
+
}
|
|
281
|
+
function sanitizeDescriptionCandidate(value) {
|
|
282
|
+
const cleaned = stripLeadingFragmentSeparator(stripLeakySentences(value));
|
|
283
|
+
return cleaned && !shouldSkipDescriptionLine(cleaned) ? cleaned : '';
|
|
284
|
+
}
|
|
285
|
+
function isUsableResolvedTitle(value, options) {
|
|
286
|
+
const cleaned = stripLeadingFragmentSeparator(value);
|
|
287
|
+
if (cleaned.length < SEO_TITLE_FLOOR)
|
|
288
|
+
return false;
|
|
289
|
+
if (hasLeakySeoToken(cleaned))
|
|
290
|
+
return false;
|
|
291
|
+
// Reject section-header leaks, ellipsis-truncated strings, doc-IDs,
|
|
292
|
+
// and full-sentence fragments. See `title-rejection.ts` for the
|
|
293
|
+
// canonical denylist + structural rules. Without these guards, the
|
|
294
|
+
// 216-article audit (2026-05-24) showed `Strategic significance`,
|
|
295
|
+
// `Threat Level`, `Convergence themes`, `TA-10-2026-0160`, and
|
|
296
|
+
// ellipsis-cut paragraphs reaching the `<title>` surface.
|
|
297
|
+
//
|
|
298
|
+
// When `allowFullSentence` is true, the `sentence-fragment` reason is
|
|
299
|
+
// tolerated. This is used for summary-derived titles where the first
|
|
300
|
+
// sentence of the summary is the intended payload (e.g. recess days
|
|
301
|
+
// whose summary leads with `No new breaking developments on …`).
|
|
302
|
+
const reason = findTitleRejectionReason(cleaned);
|
|
303
|
+
if (reason && !(options?.allowFullSentence && reason === 'sentence-fragment')) {
|
|
304
|
+
return false;
|
|
305
|
+
}
|
|
306
|
+
return true;
|
|
307
|
+
}
|
|
308
|
+
function deriveHeadlineFromSummary(summary) {
|
|
309
|
+
const cleaned = sanitizeDescriptionCandidate(summary);
|
|
310
|
+
if (!cleaned)
|
|
311
|
+
return '';
|
|
312
|
+
return truncateTitle(extractFirstSentence(cleaned) || cleaned);
|
|
140
313
|
}
|
|
141
314
|
/**
|
|
142
315
|
* Append a short run qualifier to otherwise duplicate-prone fallback
|
|
@@ -190,14 +363,22 @@ export function containsNormalized(haystack, needle) {
|
|
|
190
363
|
* @returns De-duplicated keywords for `<meta name="keywords">`
|
|
191
364
|
*/
|
|
192
365
|
export function buildSeoKeywords(lang, articleType, date, runId, title, description) {
|
|
366
|
+
// `runId` is intentionally unused: the previous implementation
|
|
367
|
+
// emitted `run <runId>` as a synthetic keyword, which surfaced
|
|
368
|
+
// opaque tokens like `run propositions-run261-1779431162` in
|
|
369
|
+
// `<meta name="keywords">`. The argument is preserved for callsite
|
|
370
|
+
// backward compatibility.
|
|
371
|
+
void runId;
|
|
193
372
|
const localized = getLocalizedString(LOCALIZED_KEYWORDS, lang);
|
|
194
373
|
const base = Object.getOwnPropertyDescriptor(localized, articleType)?.value;
|
|
195
374
|
const fallback = ['EU Parliament', 'European Parliament', 'political intelligence'];
|
|
196
375
|
const candidates = [
|
|
376
|
+
// Always-on cross-site portfolio keywords lead the list so they
|
|
377
|
+
// are guaranteed to survive the 16-entry budget cap.
|
|
378
|
+
...CROSS_SITE_KEYWORDS,
|
|
197
379
|
...(base ?? fallback),
|
|
198
380
|
humanizeSlug(articleType),
|
|
199
381
|
date,
|
|
200
|
-
...(runId ? [`run ${runId}`] : []),
|
|
201
382
|
...extractKeywordTerms(`${title} ${description}`),
|
|
202
383
|
];
|
|
203
384
|
return dedupeKeywords(candidates).slice(0, 16);
|
|
@@ -205,6 +386,11 @@ export function buildSeoKeywords(lang, articleType, date, runId, title, descript
|
|
|
205
386
|
/**
|
|
206
387
|
* Extract short keyword terms from resolved SEO copy.
|
|
207
388
|
*
|
|
389
|
+
* Filters out tokens that look like UUID hex fragments, run-id slugs,
|
|
390
|
+
* or digit-dominated noise (see {@link isNoiseKeywordToken}) so the
|
|
391
|
+
* keyword list never leaks internal aggregator identifiers into
|
|
392
|
+
* `<meta name="keywords">`.
|
|
393
|
+
*
|
|
208
394
|
* @param text - Title and description text
|
|
209
395
|
* @returns Candidate terms
|
|
210
396
|
*/
|
|
@@ -212,7 +398,7 @@ function extractKeywordTerms(text) {
|
|
|
212
398
|
return text
|
|
213
399
|
.split(/[^\p{L}\p{N}]+/u)
|
|
214
400
|
.map((token) => token.trim())
|
|
215
|
-
.filter((token) => token.length >= 4 &&
|
|
401
|
+
.filter((token) => token.length >= 4 && !isNoiseKeywordToken(token))
|
|
216
402
|
.slice(0, 18);
|
|
217
403
|
}
|
|
218
404
|
/**
|
|
@@ -250,4 +436,5 @@ export function pickFirstNonEmpty(candidates) {
|
|
|
250
436
|
}
|
|
251
437
|
return '';
|
|
252
438
|
}
|
|
439
|
+
export { deriveHeadlineFromSummary, isUsableResolvedTitle, sanitizeDescriptionCandidate };
|
|
253
440
|
//# sourceMappingURL=resolve-helpers.js.map
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module Aggregator/Metadata/SeoBudgets
|
|
3
|
+
* @description Per-script SEO byte budgets and a script-aware clamp.
|
|
4
|
+
*
|
|
5
|
+
* Background. Google Search Central and Bing Webmaster Guidelines both
|
|
6
|
+
* document SERP snippet limits in **pixels**, not characters. Latin
|
|
7
|
+
* glyphs render at roughly half the pixel width of CJK glyphs, while
|
|
8
|
+
* Arabic/Hebrew letterforms sit between the two. A single `length`
|
|
9
|
+
* budget for `<title>` / `<meta description>` will always be wrong for
|
|
10
|
+
* at least one of the 14 publishing languages — typically over-truncating
|
|
11
|
+
* Latin copy and over-running CJK by a factor of two.
|
|
12
|
+
*
|
|
13
|
+
* This module provides:
|
|
14
|
+
*
|
|
15
|
+
* - {@link classifyScript} — three-way `latin | cjk | rtl` family
|
|
16
|
+
* classifier driven by the locale code (no glyph inspection — the
|
|
17
|
+
* BCP-47 language tag is authoritative because every publishing
|
|
18
|
+
* pipeline emits one full output per language).
|
|
19
|
+
* - {@link SEO_BUDGETS} — per-surface × per-script byte caps derived
|
|
20
|
+
* from the documented platform envelopes (Google ≤580 px title /
|
|
21
|
+
* ≤155 char description; Bing slightly more generous; Facebook ≤95
|
|
22
|
+
* chars on `og:title`; Twitter ≤70 / ≤200; LinkedIn shares OG).
|
|
23
|
+
* - {@link budgetFor} — typed accessor returning the byte cap for a
|
|
24
|
+
* `(lang, surface)` pair, with a uniform fallback to the strictest
|
|
25
|
+
* Latin budget when the locale is unknown.
|
|
26
|
+
* - {@link clampForBudget} — script-aware truncator that prefers
|
|
27
|
+
* natural clause boundaries (CJK full-width punctuation, RTL
|
|
28
|
+
* sentence punctuation, Latin clause separators) before falling
|
|
29
|
+
* back to whitespace breaks. Returns the input verbatim when it
|
|
30
|
+
* already fits.
|
|
31
|
+
*
|
|
32
|
+
* Pure, leaf module. No I/O, no dependencies on other aggregator
|
|
33
|
+
* modules beyond the existing `text-utils.ts` clause-boundary
|
|
34
|
+
* vocabulary.
|
|
35
|
+
*/
|
|
36
|
+
import type { LanguageCode } from '../../types/index.js';
|
|
37
|
+
/**
|
|
38
|
+
* Three-way script family used as the column key in {@link SEO_BUDGETS}.
|
|
39
|
+
* `cjk` covers Chinese / Japanese / Korean (~2× Latin pixel width per
|
|
40
|
+
* glyph); `rtl` covers Arabic / Hebrew (bidi + ligature handling).
|
|
41
|
+
*/
|
|
42
|
+
export type ScriptFamily = 'latin' | 'cjk' | 'rtl';
|
|
43
|
+
/**
|
|
44
|
+
* Iteration helper — all three script families in a deterministic
|
|
45
|
+
* order (latin → cjk → rtl). Exported so test matrices and downstream
|
|
46
|
+
* tooling can walk every column of {@link SEO_BUDGETS} without
|
|
47
|
+
* duplicating the literal list.
|
|
48
|
+
*/
|
|
49
|
+
export declare const ALL_SCRIPT_FAMILIES: readonly ScriptFamily[];
|
|
50
|
+
/**
|
|
51
|
+
* Classify a locale code into a script family. Used to look up the
|
|
52
|
+
* correct byte cap in {@link SEO_BUDGETS}.
|
|
53
|
+
*
|
|
54
|
+
* @param lang - BCP-47 language tag (one of the 14 publishing locales)
|
|
55
|
+
* @returns Script family for SEO budget lookup
|
|
56
|
+
*/
|
|
57
|
+
export declare function classifyScript(lang: string): ScriptFamily;
|
|
58
|
+
/**
|
|
59
|
+
* Public SEO surfaces this module budgets for. Each one has documented
|
|
60
|
+
* truncation behaviour by at least one major search engine or social
|
|
61
|
+
* platform.
|
|
62
|
+
*
|
|
63
|
+
* - `title` — HTML `<title>` (Google ≤580 px ≈ 60 Latin / 30 CJK / 55 RTL)
|
|
64
|
+
* - `metaDescription` — `<meta name="description">` (Google ≤~155 char)
|
|
65
|
+
* - `ogTitle` — Facebook / LinkedIn `og:title` (~95 Latin)
|
|
66
|
+
* - `ogDescription` — Facebook / LinkedIn `og:description` (~200 Latin)
|
|
67
|
+
* - `twitterTitle` — Twitter card title (≤70 Latin)
|
|
68
|
+
* - `twitterDescription` — Twitter card description (≤200 Latin)
|
|
69
|
+
* - `imageAlt` — `og:image:alt` / social card alt text (≤125 Latin)
|
|
70
|
+
* - `jsonLdHeadline` — Schema.org `NewsArticle.headline` (Google ≤110)
|
|
71
|
+
*/
|
|
72
|
+
export type SeoSurface = 'title' | 'metaDescription' | 'ogTitle' | 'ogDescription' | 'twitterTitle' | 'twitterDescription' | 'imageAlt' | 'jsonLdHeadline';
|
|
73
|
+
/**
|
|
74
|
+
* Per-surface × per-script byte cap table. Numbers reflect the
|
|
75
|
+
* narrower of Google / Bing / Facebook / Twitter documented envelopes,
|
|
76
|
+
* with a ~5 % safety margin so a snippet on the edge of the budget
|
|
77
|
+
* isn't truncated mid-glyph by the rendering platform.
|
|
78
|
+
*
|
|
79
|
+
* For `jsonLdHeadline` the Schema.org `NewsArticle.headline` cap is
|
|
80
|
+
* script-independent (Google validates the literal character count at
|
|
81
|
+
* 110) — same value across the row.
|
|
82
|
+
*/
|
|
83
|
+
export declare const SEO_BUDGETS: Readonly<Record<SeoSurface, Readonly<Record<ScriptFamily, number>>>>;
|
|
84
|
+
/**
|
|
85
|
+
* Resolve the byte cap for one `(lang, surface)` pair.
|
|
86
|
+
*
|
|
87
|
+
* @param lang - Publishing locale
|
|
88
|
+
* @param surface - SEO surface (see {@link SeoSurface})
|
|
89
|
+
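* @returns Cap as a positive integer; {@link clampForBudget} documents its result as ≤ this value in characters, despite the "byte" wording used in this module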
|
|
90
|
+
*/
|
|
91
|
+
export declare function budgetFor(lang: LanguageCode | string, surface: SeoSurface): number;
|
|
92
|
+
/**
|
|
93
|
+
* Truncate `text` to fit `(lang, surface)` SEO byte budget. Prefers a
|
|
94
|
+
* natural clause boundary inside the script's punctuation vocabulary
|
|
95
|
+
* (CJK / RTL / Latin) before falling back to a whitespace break.
|
|
96
|
+
*
|
|
97
|
+
* Always returns `text` verbatim when it already fits (no ellipsis
|
|
98
|
+
* appended). When truncation happens an ellipsis (`…`) is appended for
|
|
99
|
+
* Latin / RTL; for CJK the full-width ellipsis (`…`) reads as a
|
|
100
|
+
* partial-thought marker and is also appended — Schema.org and Google
|
|
101
|
+
* accept either glyph in `headline` / `description`.
|
|
102
|
+
*
|
|
103
|
+
* @param text - Source text (already plain-text — no Markdown / HTML)
|
|
104
|
+
* @param lang - Publishing locale
|
|
105
|
+
* @param surface - Target SEO surface
|
|
106
|
+
* @returns Clamped text ≤ `budgetFor(lang, surface)` characters
|
|
107
|
+
*/
|
|
108
|
+
export declare function clampForBudget(text: string, lang: LanguageCode | string, surface: SeoSurface): string;
|
|
109
|
+
/**
|
|
110
|
+
* Optional inputs to {@link clampTitleForSurface}.
|
|
111
|
+
*
|
|
112
|
+
* `siteTitle` is the brand suffix (e.g. "EU Parliament Monitor") and
|
|
113
|
+
* `separator` is the localized glue (e.g. `" | "` / `" ・ "` / `" ׀ "`).
|
|
114
|
+
* When both are provided the function tries to keep the brand suffix
|
|
115
|
+
* inside the budget; when the article title alone already fills the
|
|
116
|
+
* budget the suffix is *dropped* (better SERP outcome than a truncated
|
|
117
|
+
* headline followed by a clipped brand).
|
|
118
|
+
*
|
|
119
|
+
* `shortSiteTitle` is the optional fallback used when the full brand
|
|
120
|
+
* suffix can't fit but a shorter variant would (e.g. `"EPM"` for CJK).
|
|
121
|
+
*/
|
|
122
|
+
export interface TitleSurfaceOptions {
|
|
123
|
+
readonly siteTitle?: string;
|
|
124
|
+
readonly shortSiteTitle?: string;
|
|
125
|
+
readonly separator?: string;
|
|
126
|
+
}
|
|
127
|
+
/**
|
|
128
|
+
* Compose `{title}{separator}{siteTitle}` while honouring the
|
|
129
|
+
* `(lang, surface)` budget. Drops the brand suffix entirely when the
|
|
130
|
+
* article title alone is already at or past the budget. Prefers the
|
|
131
|
+
* short site title when supplied and the full suffix doesn't fit.
|
|
132
|
+
*
|
|
133
|
+
* @param title - Article title (plain text)
|
|
134
|
+
* @param lang - Publishing locale
|
|
135
|
+
* @param surface - Target SEO surface (`title` / `ogTitle` / `twitterTitle`)
|
|
136
|
+
* @param opts - Optional brand suffix wiring
|
|
137
|
+
* @returns Composed title ≤ budget
|
|
138
|
+
*/
|
|
139
|
+
export declare function clampTitleForSurface(title: string, lang: LanguageCode | string, surface: SeoSurface, opts?: TitleSurfaceOptions): string;
|
|
140
|
+
//# sourceMappingURL=seo-budgets.d.ts.map
|