euparliamentmonitor 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/package.json +6 -2
  2. package/scripts/aggregator/article-metadata.js +69 -14
  3. package/scripts/aggregator/editorial-brief-resolver.js +23 -0
  4. package/scripts/aggregator/html/headline.d.ts +41 -9
  5. package/scripts/aggregator/html/headline.js +69 -10
  6. package/scripts/aggregator/html/shell.js +73 -17
  7. package/scripts/aggregator/manifest/index.d.ts +1 -1
  8. package/scripts/aggregator/manifest/index.js +1 -1
  9. package/scripts/aggregator/manifest/resolver.d.ts +28 -1
  10. package/scripts/aggregator/manifest/resolver.js +61 -5
  11. package/scripts/aggregator/markdown-renderer.js +11 -0
  12. package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
  13. package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
  14. package/scripts/aggregator/metadata/artifact-walker.js +29 -10
  15. package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
  16. package/scripts/aggregator/metadata/brief-body.js +69 -0
  17. package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
  18. package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
  19. package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
  20. package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
  21. package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
  22. package/scripts/aggregator/metadata/heading-rules.js +78 -269
  23. package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
  24. package/scripts/aggregator/metadata/keyword-filters.js +156 -0
  25. package/scripts/aggregator/metadata/lede-extractor.js +11 -2
  26. package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
  27. package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
  28. package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
  29. package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
  30. package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
  31. package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
  32. package/scripts/aggregator/metadata/seo-budgets.js +202 -0
  33. package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
  34. package/scripts/aggregator/metadata/text-truncate.js +277 -0
  35. package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
  36. package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
  37. package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
  38. package/scripts/aggregator/metadata/text-utils.js +119 -439
  39. package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
  40. package/scripts/aggregator/metadata/title-rejection.js +179 -0
  41. package/scripts/copy-vendor.js +84 -112
  42. package/scripts/dump-article-seo.js +640 -0
  43. package/scripts/fix-mermaid-diagrams.js +931 -0
  44. package/scripts/generators/news-indexes/backfill.d.ts +6 -1
  45. package/scripts/generators/news-indexes/backfill.js +71 -4
  46. package/scripts/validate-article-seo.js +534 -0
  47. package/scripts/validate-mermaid-diagrams.js +306 -0
@@ -49,6 +49,39 @@ export declare function composeContextualDescription(lang: LanguageCode, baseDes
49
49
  readonly headline: string;
50
50
  readonly summary: string;
51
51
  }, date: string, _runId: string): string;
52
+ /**
53
+ * Build a per-article `extendedDescription` (used for
54
+ * `og:description`, Twitter cards, and AI-overview surfaces) for use
55
+ * when the editorial source paragraph is too short to satisfy
56
+ * {@link truncateExtendedDescription} on its own. The result is
57
+ * synthesized from the parts below and has no guaranteed minimum length.
58
+ *
59
+ * This is the *only* code path that surfaces the localized
60
+ * `labels.reader` framing — the short `<meta description>` no longer
61
+ * carries it (see comment in {@link composeContextualDescription}).
62
+ * The structure is: `<base> <Date: YYYY-MM-DD.> <Context: …> <reader>`,
63
+ * passed through {@link truncateExtendedDescription} only when the
64
+ * joined text exceeds the extended-description maximum; shorter text
65
+ * is returned as-is, with no minimum-length floor applied.
66
+ *
67
+ * @param lang - Target language code
68
+ * @param baseDescription - Best description from manifest/editorial/template
69
+ * @param editorial - Artifact-derived headline and summary
70
+ * @param editorial.headline - Artifact-derived headline
71
+ * @param editorial.summary - Artifact-derived summary
72
+ * @param date - ISO article date
73
+ * @returns Joined description, truncated via {@link truncateExtendedDescription} only when it exceeds the extended maximum; `''` when nothing was composed
74
+ */
75
+ export declare function composeContextualExtendedDescription(lang: LanguageCode, baseDescription: string, editorial: {
76
+ readonly headline: string;
77
+ readonly summary: string;
78
+ }, date: string): string;
79
+ export declare function hasLeakySeoToken(value: string): boolean;
80
+ declare function sanitizeDescriptionCandidate(value: string): string;
81
+ declare function isUsableResolvedTitle(value: string, options?: {
82
+ readonly allowFullSentence?: boolean;
83
+ }): boolean;
84
+ declare function deriveHeadlineFromSummary(summary: string): string;
52
85
  /**
53
86
  * Append a short run qualifier to otherwise duplicate-prone fallback
54
87
  * titles. Sanitizes the raw `runId` so user-facing `<title>` strings
@@ -88,4 +121,5 @@ export declare function buildSeoKeywords(lang: LanguageCode, articleType: string
88
121
  * @returns First non-empty entry
89
122
  */
90
123
  export declare function pickFirstNonEmpty(candidates: readonly string[]): string;
124
+ export { deriveHeadlineFromSummary, isUsableResolvedTitle, sanitizeDescriptionCandidate };
91
125
  //# sourceMappingURL=resolve-helpers.d.ts.map
@@ -20,7 +20,14 @@ import { extractExtendedLedeAfterHeading, extractStrongProseLine } from './lede-
20
20
  import { isGenericHeading } from './heading-rules.js';
21
21
  import { humanizeSlug } from './slug.js';
22
22
  import { SEO_CONTEXT_LABELS } from './template-fallback.js';
23
- import { extractFirstSentence, truncateDescription, truncateTitle } from './text-utils.js';
23
+ import { EXTENDED_DESCRIPTION_MAX_LENGTH } from './text-utils-constants.js';
24
+ import { extractFirstSentence, shouldSkipDescriptionLine, truncateDescription, truncateExtendedDescription, truncateTitle, } from './text-utils.js';
25
+ import { readEnglishBriefBody } from './brief-body.js';
26
+ import { extractBriefingHighlight } from './briefing-highlight.js';
27
+ import { CROSS_SITE_KEYWORDS, isNoiseKeywordToken } from './keyword-filters.js';
28
+ import { findTitleRejectionReason } from './title-rejection.js';
29
+ const LEAKY_RUNID_RE = /\b[a-z][a-z-]*-run-?\d+-\d{8,}\b/iu;
30
+ const SEO_TITLE_FLOOR = 20;
24
31
  /**
25
32
  * Extract a manifest override value for a single language. Accepts either
26
33
  * a plain string (applied to every language) or a `LanguageMap` object.
@@ -53,31 +60,80 @@ export function manifestOverrideFor(value, lang) {
53
60
  */
54
61
  export function resolveEditorialContent(opts) {
55
62
  const { articleType, date, markdown, runDir } = opts;
63
+ // Tier 1 (NEW, May-2026): structural extraction of `## Strategic
64
+ // Intelligence Summary` and `## Reader Briefing` from the English
65
+ // brief. These two sections are the editorial heart of every
66
+ // current-style executive brief — they are journalistically richer
67
+ // than the first non-generic H1 the legacy walker picks up, so we
68
+ // try them first. Returns `null` for the ~200 historical briefs
69
+ // that pre-date the style guide, in which case we fall through.
70
+ const briefBody = readEnglishBriefBody(runDir ?? '');
71
+ const briefing = briefBody ? extractBriefingHighlight(briefBody) : null;
72
+ // Bridge the briefing's `string | undefined` fields into plain
73
+ // strings so the downstream `||` fallback chains satisfy the
74
+ // `prefer-nullish-coalescing` lint rule (no nullable LHS).
75
+ const briefingHeadline = briefing?.headline ?? '';
76
+ const briefingSummary = briefing?.summary ?? '';
77
+ const briefingExtended = briefing?.extendedSummary ?? '';
78
+ if (briefingHeadline) {
79
+ return {
80
+ headline: briefingHeadline,
81
+ summary: briefingSummary,
82
+ extendedSummary: briefingExtended || extractExtendedLedeAfterHeading(markdown),
83
+ };
84
+ }
56
85
  let artefactSummary = '';
57
86
  if (runDir) {
58
87
  const highlight = extractArtifactHighlight(runDir, articleType, date);
59
- if (highlight?.headline) {
88
+ const highlightHeadline = highlight?.headline ?? '';
89
+ const highlightSummary = highlight?.summary ?? '';
90
+ if (highlightHeadline) {
60
91
  return {
61
- headline: highlight.headline,
62
- summary: highlight.summary,
63
- extendedSummary: extractExtendedLedeAfterHeading(markdown),
92
+ headline: highlightHeadline,
93
+ summary: briefingSummary || highlightSummary,
94
+ extendedSummary: briefingExtended || extractExtendedLedeAfterHeading(markdown),
64
95
  };
65
96
  }
66
- if (highlight?.summary) {
67
- artefactSummary = highlight.summary;
97
+ if (highlightSummary) {
98
+ artefactSummary = highlightSummary;
68
99
  }
69
100
  }
101
+ // Per the brief-only SEO contract (2026-05-24): when an executive
102
+ // brief is present, we **never** fall through to the aggregated
103
+ // `markdown` content (which is the assembled `article.md` body
104
+ // including all artefact prose). The brief is the only sanctioned
105
+ // source for `<title>` / `<meta description>` / keywords; if it
106
+ // failed to yield a usable headline above, the headline is derived
107
+ // from the artefact summary when one exists, otherwise it is left
108
+ // empty so the localized template fallback (Breaking | YYYY-MM-DD,
109
+ // etc.) wins. Only legacy runs without a brief reach the H1 fallback.
110
+ const briefPresent = briefBody.trim().length > 0;
111
+ if (briefPresent) {
112
+ if (artefactSummary) {
113
+ const firstSentence = extractFirstSentence(artefactSummary);
114
+ return {
115
+ headline: truncateTitle(firstSentence || artefactSummary),
116
+ summary: briefingSummary || artefactSummary,
117
+ extendedSummary: briefingExtended || extractExtendedLedeAfterHeading(markdown),
118
+ };
119
+ }
120
+ return {
121
+ headline: '',
122
+ summary: briefingSummary,
123
+ extendedSummary: briefingExtended,
124
+ };
125
+ }
70
126
  const aggregatedH1 = extractFirstH1(markdown);
71
127
  const aggregatedSummary = extractStrongProseLine(markdown);
72
128
  const aggregatedExtended = extractExtendedLedeAfterHeading(markdown);
73
129
  if (aggregatedH1 && !isGenericHeading(aggregatedH1, articleType, date)) {
74
130
  return {
75
131
  headline: truncateTitle(aggregatedH1),
76
- summary: artefactSummary || aggregatedSummary,
77
- extendedSummary: aggregatedExtended,
132
+ summary: briefingSummary || artefactSummary || aggregatedSummary,
133
+ extendedSummary: briefingExtended || aggregatedExtended,
78
134
  };
79
135
  }
80
- const summary = artefactSummary || aggregatedSummary;
136
+ const summary = briefingSummary || artefactSummary || aggregatedSummary;
81
137
  if (summary) {
82
138
  // The H1 is generic (category-noun, bare-institutional, or
83
139
  // template-style) so we have to derive `<title>` from the BLUF/
@@ -85,11 +141,15 @@ export function resolveEditorialContent(opts) {
85
141
  // resulting title is grammatically self-contained — falling back
86
142
  // to clause-boundary truncation downstream when the sentence
87
143
  // itself overruns TITLE_MAX_LENGTH.
144
+ // Fall back to the raw summary when the first-sentence extractor
145
+ // returns '' — happens when the source is a single sentence with no
146
+ // `. ` terminator inside the soft-min window. `truncateTitle` will
147
+ // still apply clause-boundary truncation downstream.
88
148
  const firstSentence = extractFirstSentence(summary);
89
149
  return {
90
- headline: truncateTitle(firstSentence),
150
+ headline: truncateTitle(firstSentence || summary),
91
151
  summary,
92
- extendedSummary: aggregatedExtended,
152
+ extendedSummary: briefingExtended || aggregatedExtended,
93
153
  };
94
154
  }
95
155
  return { headline: '', summary: '', extendedSummary: '' };
@@ -133,10 +193,123 @@ export function composeContextualDescription(lang, baseDescription, editorial, d
133
193
  if (context && !containsNormalized(parts[0] ?? '', context)) {
134
194
  parts.push(`${labels.context}: ${context}`);
135
195
  }
196
+ // NOTE: the localized `labels.reader` "for democratic-accountability
197
+ // readers …" hint is intentionally **not** appended here. That
198
+ // boilerplate inflates `<meta description>` past the 160-char SERP
199
+ // cutoff without surfacing any article-specific signal, so it is
200
+ // restricted to the longer {@link composeContextualExtendedDescription}
201
+ // path (used by `og:description` / AI-overview surfaces, which have
202
+ // a 250–300 char budget where the framing carries real value).
203
+ return truncateDescription(parts.join(' '));
204
+ }
205
+ /**
206
+ * Build a per-article `extendedDescription` (used for
207
+ * `og:description`, Twitter cards, and AI-overview surfaces) for use
208
+ * when the editorial source paragraph is too short to satisfy
209
+ * {@link truncateExtendedDescription} on its own. The result is
210
+ * synthesized from the parts below and has no guaranteed minimum length.
211
+ *
212
+ * This is the *only* code path that surfaces the localized
213
+ * `labels.reader` framing — the short `<meta description>` no longer
214
+ * carries it (see comment in {@link composeContextualDescription}).
215
+ * The structure is: `<base> <Date: YYYY-MM-DD.> <Context: …> <reader>`,
216
+ * passed through {@link truncateExtendedDescription} only when the
217
+ * joined text exceeds the extended-description maximum; shorter text
218
+ * is returned as-is, with no minimum-length floor applied.
219
+ *
220
+ * @param lang - Target language code
221
+ * @param baseDescription - Best description from manifest/editorial/template
222
+ * @param editorial - Artifact-derived headline and summary
223
+ * @param editorial.headline - Artifact-derived headline
224
+ * @param editorial.summary - Artifact-derived summary
225
+ * @param date - ISO article date
226
+ * @returns Joined description, truncated via {@link truncateExtendedDescription} only when it exceeds the extended maximum; `''` when nothing was composed
227
+ */
228
+ export function composeContextualExtendedDescription(lang, baseDescription, editorial, date) {
229
+ const labels = getLocalizedString(SEO_CONTEXT_LABELS, lang);
230
+ const base = baseDescription.trim();
231
+ const parts = base ? [base] : [];
232
+ const datePart = `${labels.date} ${date}.`;
233
+ if (!containsNormalized(base, `${labels.date} ${date}`)) {
234
+ parts.push(datePart);
235
+ }
236
+ const context = pickFirstNonEmpty([editorial.summary, editorial.headline]);
237
+ if (context && !containsNormalized(parts.join(' '), context)) {
238
+ parts.push(`${labels.context}: ${context}`);
239
+ }
136
240
  if (!containsNormalized(parts.join(' '), labels.reader)) {
137
241
  parts.push(labels.reader);
138
242
  }
139
- return truncateDescription(parts.join(' '));
243
+ // Synthesizer path: clamp to the 300-char og:description budget
244
+ // *without* enforcing the 181-char sentence-boundary floor that
245
+ // {@link truncateExtendedDescription} applies. The whole point of
246
+ // this helper is to produce a non-empty extended description when
247
+ // the editorial source paragraph was too short — accepting a
248
+ // 130-char synthesized string is strictly better than the empty
249
+ // fallback that was previously emitted on 56 breaking briefs.
250
+ // When the joined buffer already fits within
251
+ // {@link EXTENDED_DESCRIPTION_MAX_LENGTH} it is returned verbatim
252
+ // with no further clamp; {@link truncateExtendedDescription} runs
253
+ // only when the buffer overruns that budget.
254
+ const joined = parts.join(' ').trim();
255
+ if (!joined)
256
+ return '';
257
+ if (joined.length <= EXTENDED_DESCRIPTION_MAX_LENGTH)
258
+ return joined;
259
+ // Overran the 300-char budget — apply the same sentence-boundary
260
+ // preserving truncation as truncateExtendedDescription.
261
+ return truncateExtendedDescription(joined);
262
+ }
263
+ export function hasLeakySeoToken(value) {
264
+ if (!value)
265
+ return false;
266
+ return value.toLowerCase().includes('analysis run') || LEAKY_RUNID_RE.test(value);
267
+ }
268
+ function stripLeadingFragmentSeparator(value) {
269
+ return value.replace(/^[:;—–-]\s+/u, '').trim();
270
+ }
271
+ function stripLeakySentences(value) {
272
+ if (!value)
273
+ return '';
274
+ const parts = value
275
+ .split(/(?<=[.!?])\s+/u)
276
+ .map((part) => part.trim())
277
+ .filter(Boolean);
278
+ const clean = parts.filter((part) => !hasLeakySeoToken(part));
279
+ return (clean.length > 0 ? clean : parts).join(' ').trim();
280
+ }
281
+ function sanitizeDescriptionCandidate(value) {
282
+ const cleaned = stripLeadingFragmentSeparator(stripLeakySentences(value));
283
+ return cleaned && !shouldSkipDescriptionLine(cleaned) ? cleaned : '';
284
+ }
285
+ function isUsableResolvedTitle(value, options) {
286
+ const cleaned = stripLeadingFragmentSeparator(value);
287
+ if (cleaned.length < SEO_TITLE_FLOOR)
288
+ return false;
289
+ if (hasLeakySeoToken(cleaned))
290
+ return false;
291
+ // Reject section-header leaks, ellipsis-truncated strings, doc-IDs,
292
+ // and full-sentence fragments. See `title-rejection.ts` for the
293
+ // canonical denylist + structural rules. Without these guards, the
294
+ // 216-article audit (2026-05-24) showed `Strategic significance`,
295
+ // `Threat Level`, `Convergence themes`, `TA-10-2026-0160`, and
296
+ // ellipsis-cut paragraphs reaching the `<title>` surface.
297
+ //
298
+ // When `allowFullSentence` is true, the `sentence-fragment` reason is
299
+ // tolerated. This is used for summary-derived titles where the first
300
+ // sentence of the summary is the intended payload (e.g. recess days
301
+ // whose summary leads with `No new breaking developments on …`).
302
+ const reason = findTitleRejectionReason(cleaned);
303
+ if (reason && !(options?.allowFullSentence && reason === 'sentence-fragment')) {
304
+ return false;
305
+ }
306
+ return true;
307
+ }
308
+ function deriveHeadlineFromSummary(summary) {
309
+ const cleaned = sanitizeDescriptionCandidate(summary);
310
+ if (!cleaned)
311
+ return '';
312
+ return truncateTitle(extractFirstSentence(cleaned) || cleaned);
140
313
  }
141
314
  /**
142
315
  * Append a short run qualifier to otherwise duplicate-prone fallback
@@ -190,14 +363,22 @@ export function containsNormalized(haystack, needle) {
190
363
  * @returns De-duplicated keywords for `<meta name="keywords">`
191
364
  */
192
365
  export function buildSeoKeywords(lang, articleType, date, runId, title, description) {
366
+ // `runId` is intentionally unused: the previous implementation
367
+ // emitted `run <runId>` as a synthetic keyword, which surfaced
368
+ // opaque tokens like `run propositions-run261-1779431162` in
369
+ // `<meta name="keywords">`. The argument is preserved for callsite
370
+ // backward compatibility.
371
+ void runId;
193
372
  const localized = getLocalizedString(LOCALIZED_KEYWORDS, lang);
194
373
  const base = Object.getOwnPropertyDescriptor(localized, articleType)?.value;
195
374
  const fallback = ['EU Parliament', 'European Parliament', 'political intelligence'];
196
375
  const candidates = [
376
+ // Always-on cross-site portfolio keywords lead the list so they
377
+ // are guaranteed to survive the 16-entry budget cap.
378
+ ...CROSS_SITE_KEYWORDS,
197
379
  ...(base ?? fallback),
198
380
  humanizeSlug(articleType),
199
381
  date,
200
- ...(runId ? [`run ${runId}`] : []),
201
382
  ...extractKeywordTerms(`${title} ${description}`),
202
383
  ];
203
384
  return dedupeKeywords(candidates).slice(0, 16);
@@ -205,6 +386,11 @@ export function buildSeoKeywords(lang, articleType, date, runId, title, descript
205
386
  /**
206
387
  * Extract short keyword terms from resolved SEO copy.
207
388
  *
389
+ * Filters out tokens that look like UUID hex fragments, run-id slugs,
390
+ * or digit-dominated noise (see {@link isNoiseKeywordToken}) so the
391
+ * keyword list never leaks internal aggregator identifiers into
392
+ * `<meta name="keywords">`.
393
+ *
208
394
  * @param text - Title and description text
209
395
  * @returns Candidate terms
210
396
  */
@@ -212,7 +398,7 @@ function extractKeywordTerms(text) {
212
398
  return text
213
399
  .split(/[^\p{L}\p{N}]+/u)
214
400
  .map((token) => token.trim())
215
- .filter((token) => token.length >= 4 && !/^\d+$/.test(token))
401
+ .filter((token) => token.length >= 4 && !isNoiseKeywordToken(token))
216
402
  .slice(0, 18);
217
403
  }
218
404
  /**
@@ -250,4 +436,5 @@ export function pickFirstNonEmpty(candidates) {
250
436
  }
251
437
  return '';
252
438
  }
439
+ export { deriveHeadlineFromSummary, isUsableResolvedTitle, sanitizeDescriptionCandidate };
253
440
  //# sourceMappingURL=resolve-helpers.js.map
@@ -0,0 +1,140 @@
1
+ /**
2
+ * @module Aggregator/Metadata/SeoBudgets
3
+ * @description Per-script SEO byte budgets and a script-aware clamp.
4
+ *
5
+ * Background. Google Search Central and Bing Webmaster Guidelines both
6
+ * document SERP snippet limits in **pixels**, not characters. Latin
7
+ * glyphs render at roughly half the pixel width of CJK glyphs, while
8
+ * Arabic/Hebrew letterforms sit between the two. A single `length`
9
+ * budget for `<title>` / `<meta description>` will always be wrong for
10
+ * at least one of the 14 publishing languages — typically over-truncating
11
+ * Latin copy and over-running CJK by a factor of two.
12
+ *
13
+ * This module provides:
14
+ *
15
+ * - {@link classifyScript} — three-way `latin | cjk | rtl` family
16
+ * classifier driven by the locale code (no glyph inspection — the
17
+ * BCP-47 language tag is authoritative because every publishing
18
+ * pipeline emits one full output per language).
19
+ * - {@link SEO_BUDGETS} — per-surface × per-script byte caps derived
20
+ * from the documented platform envelopes (Google ≤580 px title /
21
+ * ≤155 char description; Bing slightly more generous; Facebook ≤95
22
+ * chars on `og:title`; Twitter ≤70 / ≤200; LinkedIn shares OG).
23
+ * - {@link budgetFor} — typed accessor returning the byte cap for a
24
+ * `(lang, surface)` pair, with a uniform fallback to the strictest
25
+ * Latin budget when the locale is unknown.
26
+ * - {@link clampForBudget} — script-aware truncator that prefers
27
+ * natural clause boundaries (CJK full-width punctuation, RTL
28
+ * sentence punctuation, Latin clause separators) before falling
29
+ * back to whitespace breaks. Returns the input verbatim when it
30
+ * already fits.
31
+ *
32
+ * Pure, leaf module. No I/O, no dependencies on other aggregator
33
+ * modules beyond the existing `text-utils.ts` clause-boundary
34
+ * vocabulary.
35
+ */
36
+ import type { LanguageCode } from '../../types/index.js';
37
+ /**
38
+ * Three-way script family used as the column key in {@link SEO_BUDGETS}.
39
+ * `cjk` covers Chinese / Japanese / Korean (~2× Latin pixel width per
40
+ * glyph); `rtl` covers Arabic / Hebrew (bidi + ligature handling).
41
+ */
42
+ export type ScriptFamily = 'latin' | 'cjk' | 'rtl';
43
+ /**
44
+ * Iteration helper — all three script families in a deterministic
45
+ * order (latin → cjk → rtl). Exported so test matrices and downstream
46
+ * tooling can walk every column of {@link SEO_BUDGETS} without
47
+ * duplicating the literal list.
48
+ */
49
+ export declare const ALL_SCRIPT_FAMILIES: readonly ScriptFamily[];
50
+ /**
51
+ * Classify a locale code into a script family. Used to look up the
52
+ * correct byte cap in {@link SEO_BUDGETS}.
53
+ *
54
+ * @param lang - BCP-47 language tag (one of the 14 publishing locales)
55
+ * @returns Script family for SEO budget lookup
56
+ */
57
+ export declare function classifyScript(lang: string): ScriptFamily;
58
+ /**
59
+ * Public SEO surfaces this module budgets for. Each one has documented
60
+ * truncation behaviour by at least one major search engine or social
61
+ * platform.
62
+ *
63
+ * - `title` — HTML `<title>` (Google ≤580 px ≈ 60 Latin / 30 CJK / 55 RTL)
64
+ * - `metaDescription` — `<meta name="description">` (Google ≤~155 char)
65
+ * - `ogTitle` — Facebook / LinkedIn `og:title` (~95 Latin)
66
+ * - `ogDescription` — Facebook / LinkedIn `og:description` (~200 Latin)
67
+ * - `twitterTitle` — Twitter card title (≤70 Latin)
68
+ * - `twitterDescription` — Twitter card description (≤200 Latin)
69
+ * - `imageAlt` — `og:image:alt` / social card alt text (≤125 Latin)
70
+ * - `jsonLdHeadline` — Schema.org `NewsArticle.headline` (Google ≤110)
71
+ */
72
+ export type SeoSurface = 'title' | 'metaDescription' | 'ogTitle' | 'ogDescription' | 'twitterTitle' | 'twitterDescription' | 'imageAlt' | 'jsonLdHeadline';
73
+ /**
74
+ * Per-surface × per-script byte cap table. Numbers reflect the
75
+ * narrower of Google / Bing / Facebook / Twitter documented envelopes,
76
+ * with a ~5 % safety margin so a snippet on the edge of the budget
77
+ * isn't truncated mid-glyph by the rendering platform.
78
+ *
79
+ * For `jsonLdHeadline` the Schema.org `NewsArticle.headline` cap is
80
+ * script-independent (Google validates the literal character count at
81
+ * 110) — same value across the row.
82
+ */
83
+ export declare const SEO_BUDGETS: Readonly<Record<SeoSurface, Readonly<Record<ScriptFamily, number>>>>;
84
+ /**
85
+ * Resolve the byte cap for one `(lang, surface)` pair.
86
+ *
87
+ * @param lang - Publishing locale
88
+ * @param surface - SEO surface (see {@link SeoSurface})
89
+ * @returns Byte cap (positive integer)
90
+ */
91
+ export declare function budgetFor(lang: LanguageCode | string, surface: SeoSurface): number;
92
+ /**
93
+ * Truncate `text` to fit `(lang, surface)` SEO byte budget. Prefers a
94
+ * natural clause boundary inside the script's punctuation vocabulary
95
+ * (CJK / RTL / Latin) before falling back to a whitespace break.
96
+ *
97
+ * Always returns `text` verbatim when it already fits (no ellipsis
98
+ * appended). When truncation happens an ellipsis (`…`) is appended for
99
+ * Latin / RTL; for CJK the full-width ellipsis (`…`) reads as a
100
+ * partial-thought marker and is also appended — Schema.org and Google
101
+ * accept either glyph in `headline` / `description`.
102
+ *
103
+ * @param text - Source text (already plain-text — no Markdown / HTML)
104
+ * @param lang - Publishing locale
105
+ * @param surface - Target SEO surface
106
+ * @returns Clamped text ≤ `budgetFor(lang, surface)` characters
107
+ */
108
+ export declare function clampForBudget(text: string, lang: LanguageCode | string, surface: SeoSurface): string;
109
+ /**
110
+ * Optional inputs to {@link clampTitleForSurface}.
111
+ *
112
+ * `siteTitle` is the brand suffix (e.g. "EU Parliament Monitor") and
113
+ * `separator` is the localized glue (e.g. `" | "` / `" ・ "` / `" ׀ "`).
114
+ * When both are provided the function tries to keep the brand suffix
115
+ * inside the budget; when the article title alone already fills the
116
+ * budget the suffix is *dropped* (better SERP outcome than a truncated
117
+ * headline followed by a clipped brand).
118
+ *
119
+ * `shortSiteTitle` is the optional fallback used when the full brand
120
+ * suffix can't fit but a shorter variant would (e.g. `"EPM"` for CJK).
121
+ */
122
+ export interface TitleSurfaceOptions {
123
+ readonly siteTitle?: string;
124
+ readonly shortSiteTitle?: string;
125
+ readonly separator?: string;
126
+ }
127
+ /**
128
+ * Compose `{title}{separator}{siteTitle}` while honouring the
129
+ * `(lang, surface)` budget. Drops the brand suffix entirely when the
130
+ * article title alone is already at or past the budget. Prefers the
131
+ * short site title when supplied and the full suffix doesn't fit.
132
+ *
133
+ * @param title - Article title (plain text)
134
+ * @param lang - Publishing locale
135
+ * @param surface - Target SEO surface (`title` / `ogTitle` / `twitterTitle`)
136
+ * @param opts - Optional brand suffix wiring
137
+ * @returns Composed title ≤ budget
138
+ */
139
+ export declare function clampTitleForSurface(title: string, lang: LanguageCode | string, surface: SeoSurface, opts?: TitleSurfaceOptions): string;
140
+ //# sourceMappingURL=seo-budgets.d.ts.map