euparliamentmonitor 0.9.21 → 0.9.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -2
- package/scripts/aggregator/article-metadata.js +69 -14
- package/scripts/aggregator/editorial-brief-resolver.js +23 -0
- package/scripts/aggregator/html/headline.d.ts +41 -9
- package/scripts/aggregator/html/headline.js +69 -10
- package/scripts/aggregator/html/shell.js +73 -17
- package/scripts/aggregator/manifest/index.d.ts +1 -1
- package/scripts/aggregator/manifest/index.js +1 -1
- package/scripts/aggregator/manifest/resolver.d.ts +28 -1
- package/scripts/aggregator/manifest/resolver.js +61 -5
- package/scripts/aggregator/markdown-renderer.js +11 -0
- package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
- package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
- package/scripts/aggregator/metadata/artifact-walker.js +29 -10
- package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
- package/scripts/aggregator/metadata/brief-body.js +69 -0
- package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
- package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
- package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
- package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
- package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
- package/scripts/aggregator/metadata/heading-rules.js +78 -269
- package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
- package/scripts/aggregator/metadata/keyword-filters.js +156 -0
- package/scripts/aggregator/metadata/lede-extractor.js +11 -2
- package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
- package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
- package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
- package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
- package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
- package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
- package/scripts/aggregator/metadata/seo-budgets.js +202 -0
- package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
- package/scripts/aggregator/metadata/text-truncate.js +277 -0
- package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
- package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
- package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
- package/scripts/aggregator/metadata/text-utils.js +119 -439
- package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
- package/scripts/aggregator/metadata/title-rejection.js +179 -0
- package/scripts/copy-vendor.js +84 -112
- package/scripts/dump-article-seo.js +640 -0
- package/scripts/fix-mermaid-diagrams.js +931 -0
- package/scripts/generators/news-indexes/backfill.d.ts +6 -1
- package/scripts/generators/news-indexes/backfill.js +71 -4
- package/scripts/validate-article-seo.js +534 -0
- package/scripts/validate-mermaid-diagrams.js +306 -0
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
import { HEADLINE_CLAUSE_BOUNDARIES } from './text-utils.js';
|
|
4
|
+
/**
|
|
5
|
+
* Iteration helper — all three script families in a deterministic
|
|
6
|
+
* order (latin → cjk → rtl). Exported so test matrices and downstream
|
|
7
|
+
* tooling can walk every column of {@link SEO_BUDGETS} without
|
|
8
|
+
* duplicating the literal list.
|
|
9
|
+
*/
|
|
10
|
+
export const ALL_SCRIPT_FAMILIES = ['latin', 'cjk', 'rtl']; // the three values classifyScript() can return, and the column keys of every SEO_BUDGETS row
|
|
11
|
+
/**
 * Classify a locale code into a script family. Used to look up the
 * correct cap in {@link SEO_BUDGETS}.
 *
 * Only the primary language subtag is inspected, and the comparison is
 * case-insensitive, so region- or script-qualified tags (`zh-CN`,
 * `zh-Hant-TW`, `ar-EG`, `ja_JP`) land in the same family as the bare
 * code. Arabic and Hebrew are `'rtl'`; Japanese, Korean and Chinese are
 * `'cjk'`; everything else — including non-string input — is `'latin'`.
 *
 * @param lang - BCP-47 language tag (one of the 14 publishing locales)
 * @returns Script family for SEO budget lookup
 */
export function classifyScript(lang) {
    // Keep the historical default for anything that is not a string.
    if (typeof lang !== 'string')
        return 'latin';
    // A split limit of 1 yields just the primary subtag; `_` is accepted
    // alongside `-` because POSIX-style locale ids use it.
    const [primary] = lang.trim().toLowerCase().split(/[-_]/u, 1);
    switch (primary) {
        case 'ar':
        case 'he':
            return 'rtl';
        case 'ja':
        case 'ko':
        case 'zh':
            return 'cjk';
        default:
            return 'latin';
    }
}
|
|
25
|
+
/**
 * Length cap for every SEO surface, one column per script family
 * (`latin` / `cjk` / `rtl`).
 *
 * The numbers follow the narrower of the Google / Bing / Facebook /
 * Twitter documented envelopes, less a ~5 % safety margin so a snippet
 * sitting on the edge of the budget is not cut mid-glyph by the
 * rendering platform.
 *
 * `clampForBudget` compares these caps against `String.length`, so in
 * practice they count UTF-16 code units rather than encoded bytes.
 *
 * `jsonLdHeadline` is the same in every column: the Schema.org
 * `NewsArticle.headline` limit is script-independent (Google validates
 * the literal character count at 110).
 */
export const SEO_BUDGETS = {
    // Title-like surfaces.
    title: {
        latin: 60,
        cjk: 30,
        rtl: 55,
    },
    metaDescription: {
        latin: 155,
        cjk: 78,
        rtl: 150,
    },
    // Open Graph card.
    ogTitle: {
        latin: 95,
        cjk: 47,
        rtl: 90,
    },
    ogDescription: {
        latin: 200,
        cjk: 100,
        rtl: 195,
    },
    // Twitter card.
    twitterTitle: {
        latin: 70,
        cjk: 35,
        rtl: 70,
    },
    twitterDescription: {
        latin: 200,
        cjk: 100,
        rtl: 195,
    },
    // Image alt text and structured data.
    imageAlt: {
        latin: 125,
        cjk: 60,
        rtl: 120,
    },
    jsonLdHeadline: {
        latin: 110,
        cjk: 110,
        rtl: 110,
    },
};
|
|
45
|
+
/**
 * Look up the length cap for one `(lang, surface)` pair: the locale is
 * mapped to its script family, which selects the column of the
 * `surface` row in {@link SEO_BUDGETS}.
 *
 * An unknown `surface` has no row, so the lookup throws a `TypeError`.
 *
 * @param lang - Publishing locale
 * @param surface - SEO surface (see {@link SeoSurface})
 * @returns Cap for that surface and script family (positive integer)
 */
export function budgetFor(lang, surface) {
    const scriptFamily = classifyScript(lang);
    const surfaceRow = SEO_BUDGETS[surface];
    return surfaceRow[scriptFamily];
}
|
|
56
|
+
// ────────────────────────────────────────────────────────────────────────
|
|
57
|
+
// Script-aware truncator
|
|
58
|
+
// ────────────────────────────────────────────────────────────────────────
|
|
59
|
+
/**
 * CJK clause boundaries — the breakpoints CJK readers expect a snippet
 * to end at. Listed in preferred-break order: a sentence-final mark
 * beats a comma which beats a middle-dot.
 *
 * Exclamation mark, question mark, semicolon and colon are listed in
 * both their full-width form (what native CJK typesetting produces) and
 * their ASCII form (common in mixed-script copy); the full-width form
 * comes first within each pair. The full-width entries are written as
 * escapes so they cannot be folded to ASCII by tooling that normalises
 * the source text.
 *
 * The two-character dash `——` precedes the single `—` so the longer
 * mark is tried before the shorter one.
 */
const CJK_CLAUSE_BOUNDARIES = [
    '。',
    '\uFF01', // U+FF01 FULLWIDTH EXCLAMATION MARK
    '!',
    '\uFF1F', // U+FF1F FULLWIDTH QUESTION MARK
    '?',
    '、',
    '\uFF1B', // U+FF1B FULLWIDTH SEMICOLON
    ';',
    '\uFF1A', // U+FF1A FULLWIDTH COLON
    ':',
    '——',
    '—',
    '・',
];
|
|
75
|
+
/**
 * RTL sentence punctuation, in preferred-break order. Arabic uses
 * U+061F for the question mark, U+060C for the comma and U+061B for the
 * semicolon; the full stop is the ASCII `.`. Hebrew uses the ASCII
 * marks directly, so `? `, `, ` and `; ` are listed right after their
 * Arabic counterparts.
 *
 * Every entry carries its trailing (or surrounding) space so that only
 * punctuation followed by another word counts as a boundary. The Arabic
 * marks are written as escapes to keep the literal readable in editors
 * that reorder bidirectional text.
 */
const RTL_CLAUSE_BOUNDARIES = [
    '. ',
    '\u061F ', // ARABIC QUESTION MARK
    '? ',
    '! ',
    '\u060C ', // ARABIC COMMA
    ', ',
    '\u061B ', // ARABIC SEMICOLON
    '; ',
    ' — ',
    ' – ',
];
|
|
81
|
+
/**
|
|
82
|
+
* Soft-minimum fraction of the budget at which a clause-boundary break
|
|
83
|
+
* is acceptable. Below this fraction we fall through to whitespace
|
|
84
|
+
* truncation so we never ship a near-empty snippet just because the
|
|
85
|
+
* input started with a short clause.
|
|
86
|
+
*/
|
|
87
|
+
const SOFT_MIN_RATIO = 0.55; // clampForBudget uses Math.floor(budget * SOFT_MIN_RATIO) as the smallest acceptable cut index
|
|
88
|
+
/**
 * Strip a trailing run of separator-class characters so a snippet never
 * ends on a dangling comma, colon, dash, bullet, ellipsis or whitespace.
 *
 * Sentence-final marks such as `.`, `!`, `?` and the CJK `。` are not in
 * the character class, so a snippet that ends on one keeps it.
 *
 * @param s - Input string to trim
 * @returns Input with trailing separator-class characters removed
 */
function trimTrailingSeparators(s) {
    // Whitespace, `,` `;` `:`, em dash, hyphen, en dash, middle dot,
    // bullet and ellipsis — one or more, anchored at the end.
    const trailingSeparatorRun = /[\s,;:—\-–·•…]+$/u;
    return s.replace(trailingSeparatorRun, '');
}
|
|
100
|
+
/**
 * Choose where to cut a candidate window on a clause boundary.
 *
 * The boundary vocabulary is walked in its declared (preference) order.
 * For each mark, its last occurrence in the window is located; the
 * first mark whose last occurrence sits at or past `softMin` wins, and
 * the returned cut position is just after that mark.
 *
 * @param window - Candidate cut window (`text.slice(0, budget)`)
 * @param boundaries - Boundary vocabulary, in preference order
 * @param softMin - Soft-minimum cut position (chars)
 * @returns Cut index, or -1 when no boundary qualifies
 */
function findClauseCut(window, boundaries, softMin) {
    for (const mark of boundaries) {
        const lastHit = window.lastIndexOf(mark);
        // Absent marks give -1, which falls below any sensible soft minimum.
        if (!(lastHit >= softMin))
            continue;
        return lastHit + mark.length;
    }
    return -1;
}
|
|
119
|
+
/**
 * Truncate `text` to fit the `(lang, surface)` SEO budget.
 *
 * `text` is trimmed first and returned verbatim (no ellipsis) when it
 * already fits. Otherwise the first of these tiers that applies wins:
 *
 * 1. **Clause boundary** — cut just after the preferred boundary from
 *    the script's punctuation vocabulary (CJK / RTL / Latin), provided
 *    the result keeps at least the soft minimum. Trailing separators are
 *    stripped and **no ellipsis is appended**.
 * 2. **Whitespace** (skipped for CJK, which often has no ASCII spaces) —
 *    cut at the last space past the soft minimum and append `…`.
 * 3. **Hard cut** — cut at `budget - 1` and append `…`.
 *
 * Lengths are measured with `String.length` (UTF-16 code units). Because
 * of that the cut could otherwise fall between the two halves of a
 * surrogate pair; a dangling high surrogate is dropped so the output
 * never ends in an unpaired code unit.
 *
 * @param text - Source text (already plain-text — no Markdown / HTML)
 * @param lang - Publishing locale
 * @param surface - Target SEO surface
 * @returns Clamped text ≤ `budgetFor(lang, surface)` characters
 */
export function clampForBudget(text, lang, surface) {
    const trimmed = text.trim();
    const budget = budgetFor(lang, surface);
    if (trimmed.length <= budget)
        return trimmed;
    const family = classifyScript(lang);
    const softMin = Math.floor(budget * SOFT_MIN_RATIO);
    // Reserve one char for the ellipsis we may append.
    let head = trimmed.slice(0, budget - 1);
    // `slice` works on code units: if the last unit kept is a high
    // surrogate its partner was cut off, so drop the orphan as well.
    // (`charCodeAt` yields NaN on an empty string, which fails both tests.)
    const lastUnit = head.charCodeAt(head.length - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff)
        head = head.slice(0, -1);
    let boundaries = HEADLINE_CLAUSE_BOUNDARIES;
    if (family === 'cjk')
        boundaries = CJK_CLAUSE_BOUNDARIES;
    else if (family === 'rtl')
        boundaries = RTL_CLAUSE_BOUNDARIES;
    // Tier 1: natural clause boundary — reads as complete, so no ellipsis.
    const clauseCut = findClauseCut(head, boundaries, softMin);
    if (clauseCut > 0) {
        const cleaned = trimTrailingSeparators(trimmed.slice(0, clauseCut));
        // Stripping separators may have shrunk the snippet below the floor.
        if (cleaned.length >= softMin)
            return cleaned;
    }
    // Tier 2: whitespace-aware fallback. CJK text often has no ASCII
    // spaces, so skip this step for CJK and fall through to the hard cut.
    if (family !== 'cjk') {
        const lastSpace = head.lastIndexOf(' ');
        if (lastSpace >= softMin) {
            return `${trimTrailingSeparators(head.slice(0, lastSpace))}…`;
        }
    }
    // Tier 3: hard cut at the reserved width.
    return `${trimTrailingSeparators(head)}…`;
}
|
|
167
|
+
/**
 * Build `{title}{separator}{siteTitle}` without exceeding the
 * `(lang, surface)` budget.
 *
 * Candidates are tried from most to least branded: the full site title,
 * then the short site title (when one is supplied). The first suffix
 * that fits alongside the trimmed title is used unchanged. When neither
 * fits — or no `siteTitle` was supplied at all — the brand is dropped
 * and the title alone is clamped with `clampForBudget`, which gives a
 * better SERP result than a truncated headline followed by a clipped
 * brand suffix.
 *
 * @param title - Article title (plain text)
 * @param lang - Publishing locale
 * @param surface - Target SEO surface (`title` / `ogTitle` / `twitterTitle`)
 * @param opts - Optional brand suffix wiring (`separator`, `siteTitle`,
 *   `shortSiteTitle`); each missing field is treated as `''`
 * @returns Composed title ≤ budget
 */
export function clampTitleForSurface(title, lang, surface, opts = {}) {
    const budget = budgetFor(lang, surface);
    const headline = title.trim();
    const brand = opts.siteTitle ?? '';
    if (brand) {
        const joiner = opts.separator ?? '';
        const shortBrand = opts.shortSiteTitle ?? '';
        // Suffixes in order of preference; the short one is optional.
        const suffixes = [`${joiner}${brand}`];
        if (shortBrand)
            suffixes.push(`${joiner}${shortBrand}`);
        for (const suffix of suffixes) {
            if (headline.length + suffix.length <= budget) {
                return `${headline}${suffix}`;
            }
        }
    }
    // No brand wiring, or no suffix fits: keep the title (clamped) alone.
    return clampForBudget(headline, lang, surface);
}
|
|
202
|
+
//# sourceMappingURL=seo-budgets.js.map
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Remove any trailing whitespace, stop-words (the/a/an/of/…) and
|
|
3
|
+
* trailing punctuation (including any pre-existing ellipsis). Implemented
|
|
4
|
+
* imperatively to avoid super-linear regex backtracking on the
|
|
5
|
+
* `(?:\s+stop-word)+$` pattern flagged by `security/detect-unsafe-regex`.
|
|
6
|
+
*
|
|
7
|
+
* @param input - Pre-clipped string to clean up
|
|
8
|
+
* @returns Cleaned string with no trailing stop-words or punctuation
|
|
9
|
+
*/
|
|
10
|
+
export declare function stripTrailingStopWordsAndPunctuation(input: string): string;
|
|
11
|
+
/**
|
|
12
|
+
 * Clamp a string to `DESCRIPTION_MAX_LENGTH` characters. The current
 * implementation returns the clipped text without appending an ellipsis. Does not break words if
|
|
14
|
+
* avoidable — a trailing partial word is trimmed back to the previous
|
|
15
|
+
* space first.
|
|
16
|
+
*
|
|
17
|
+
* @param text - Raw description text
|
|
18
|
+
 * @returns Description clipped on a sentence or word boundary (the current implementation appends no ellipsis)
|
|
19
|
+
*/
|
|
20
|
+
export declare function truncateDescription(text: string): string;
|
|
21
|
+
/**
|
|
22
|
+
* Clamp an extended description to {@link EXTENDED_DESCRIPTION_MAX_LENGTH}
|
|
23
|
+
* characters using the same sentence-boundary-preserving logic as
|
|
24
|
+
* {@link truncateDescription}. Returns `''` when the input is empty
|
|
25
|
+
* or shorter than the meta-description maximum (no point in emitting
|
|
26
|
+
* an "extended" description that's actually shorter than the regular
|
|
27
|
+
* one).
|
|
28
|
+
*
|
|
29
|
+
* @param text - Raw extended-description text (e.g. full BLUF paragraph)
|
|
30
|
+
* @returns Truncated extended description, or `''` when not worth emitting
|
|
31
|
+
*/
|
|
32
|
+
export declare function truncateExtendedDescription(text: string): string;
|
|
33
|
+
/**
|
|
34
|
+
* Clamp a title to `TITLE_MAX_LENGTH` characters in the same
|
|
35
|
+
* word-boundary-preserving fashion as {@link truncateDescription}.
|
|
36
|
+
*
|
|
37
|
+
* **No mid-sentence ellipsis.** When the title overruns the budget and
|
|
38
|
+
* no natural clause boundary exists inside the
|
|
39
|
+
* `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]` window, this function returns
|
|
40
|
+
* an empty string instead of a mid-sentence `…` truncation. The empty
|
|
41
|
+
* return tells the caller to fall through to the next tier of the
|
|
42
|
+
* resolver ladder (template-fallback title with category + date),
|
|
43
|
+
* producing a complete, scan-friendly title rather than a clipped
|
|
44
|
+
* editorial fragment. Live-site regression (2026-05): titles such as
|
|
45
|
+
* `AI Trade Strategy: A Legislative First with Structural…` and
|
|
46
|
+
* `The European Parliament's 24 standing committees continued…`
|
|
47
|
+
* were emitted before this guard.
|
|
48
|
+
*
|
|
49
|
+
* @param text - Raw title text
|
|
50
|
+
* @returns Clause-truncated title (no ellipsis), or `''` when no
|
|
51
|
+
* editorial clause boundary exists in the window
|
|
52
|
+
*/
|
|
53
|
+
export declare function truncateTitle(text: string): string;
|
|
54
|
+
/**
|
|
55
|
+
* Return the first complete sentence from a prose paragraph, suitable
|
|
56
|
+
* for use as a fallback editorial title when the artefact H1 is
|
|
57
|
+
* categorical (e.g. `# EU Parliament Committee Reports`) and the
|
|
58
|
+
* resolver must derive `<title>` from the BLUF / lede summary instead.
|
|
59
|
+
*
|
|
60
|
+
* A "sentence" is the prefix up to the first sentence-terminator
|
|
61
|
+
* (`. `, `! `, `? `, `; `) inside the `[HEADLINE_SOFT_MIN,
|
|
62
|
+
* TITLE_MAX_LENGTH]` window. Common abbreviations (`Q1.`, `Q2.`,
|
|
63
|
+
* `H1.`, `H2.`, `Mr.`, `Mrs.`, `e.g.`, `i.e.`, `vs.`) are skipped
|
|
64
|
+
* so they don't terminate the sentence prematurely. When no
|
|
65
|
+
* acceptable terminator exists in the window, returns `''` so the
|
|
66
|
+
* resolver falls through to the next tier instead of feeding an
|
|
67
|
+
* over-budget paragraph into {@link truncateTitle} (which would also
|
|
68
|
+
* return `''`).
|
|
69
|
+
*
|
|
70
|
+
* @param paragraph - Prose paragraph (post-`stripInlineMarkdown`)
|
|
71
|
+
* @returns First sentence, or `''` when none can be identified within
|
|
72
|
+
* the soft-min window
|
|
73
|
+
*/
|
|
74
|
+
export declare function extractFirstSentence(paragraph: string): string;
|
|
75
|
+
//# sourceMappingURL=text-truncate.d.ts.map
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
/**
|
|
4
|
+
* @module Aggregator/Metadata/TextTruncate
|
|
5
|
+
* @description Byte-budget truncators and sentence-extraction helpers
|
|
6
|
+
* extracted from `text-utils.ts` to keep both modules under the 600-line
|
|
7
|
+
* drift-guard budget enforced by `test/unit/source-file-size.test.js`.
|
|
8
|
+
*
|
|
9
|
+
* This file is the **clamping layer** of the metadata text pipeline —
|
|
10
|
+
* after `shouldSkipDescriptionLine`/`stripInlineMarkdown` produce a
|
|
11
|
+
* candidate description / title, the helpers here apply the SEO-budget
|
|
12
|
+
* shape rules:
|
|
13
|
+
*
|
|
14
|
+
* - {@link truncateDescription} — clamp to `DESCRIPTION_MAX_LENGTH` on a
|
|
15
|
+
 *   sentence/word boundary (the current implementation appends no `…`).
|
|
16
|
+
* - {@link truncateExtendedDescription} — clamp to the longer
|
|
17
|
+
* `EXTENDED_DESCRIPTION_MAX_LENGTH` (used by `og:description`).
|
|
18
|
+
* - {@link truncateTitle} — clamp to `TITLE_MAX_LENGTH` on a
|
|
19
|
+
* **clause** boundary, returning `''` rather than emitting a
|
|
20
|
+
* mid-sentence ellipsised title.
|
|
21
|
+
* - {@link extractFirstSentence} — return the first complete sentence
|
|
22
|
+
* from a prose paragraph, or `''` when no clean terminator is
|
|
23
|
+
* available within the soft-min window.
|
|
24
|
+
*
|
|
25
|
+
* Bounded-context rules match `text-utils.ts`:
|
|
26
|
+
* - **No upward imports** — pure helpers, no I/O, no globals.
|
|
27
|
+
* - **Deterministic** — same input always produces same output.
|
|
28
|
+
* - **Locale-agnostic** — operates on raw prose in any of the 14
|
|
29
|
+
* publishing languages.
|
|
30
|
+
*/
|
|
31
|
+
import { ABBREVIATION_PREFIXES, DESCRIPTION_MAX_LENGTH, DESCRIPTION_MIN_LENGTH, EXTENDED_DESCRIPTION_MAX_LENGTH, EXTENDED_DESCRIPTION_MIN_LENGTH, HEADLINE_CLAUSE_BOUNDARIES, HEADLINE_HARD_MIN, HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH, TRAILING_PUNCT, TRAILING_STOP_WORDS, } from './text-utils-constants.js';
|
|
32
|
+
/**
 * Clean the tail of a pre-clipped string: repeatedly peel trailing
 * characters matched by `TRAILING_PUNCT` and then a trailing word found
 * in `TRAILING_STOP_WORDS`, until a full pass changes nothing.
 *
 * Written as explicit loops rather than a single regex to avoid the
 * super-linear backtracking of a `(?:\s+stop-word)+$` pattern, which
 * `security/detect-unsafe-regex` flags.
 *
 * A stop-word is only removed when a space precedes it, so a string
 * consisting of a single word is never emptied by the stop-word step.
 *
 * @param input - Pre-clipped string to clean up
 * @returns Cleaned string with no trailing stop-words or punctuation
 */
export function stripTrailingStopWordsAndPunctuation(input) {
    let remaining = input;
    for (;;) {
        // Both steps only ever shorten the string, so an unchanged length
        // after a pass means nothing more can be stripped.
        const lengthBefore = remaining.length;
        // Step 1: peel punctuation one character at a time.
        while (remaining.length > 0 && TRAILING_PUNCT.test(remaining.charAt(remaining.length - 1))) {
            remaining = remaining.slice(0, -1);
        }
        // Step 2: drop the last word (and the space before it) if it is a
        // stop-word; the next pass re-checks punctuation exposed by this.
        const gap = remaining.lastIndexOf(' ');
        if (gap >= 0 && TRAILING_STOP_WORDS.has(remaining.slice(gap + 1).toLowerCase())) {
            remaining = remaining.slice(0, gap);
        }
        if (remaining.length === lengthBefore)
            return remaining;
    }
}
|
|
61
|
+
/**
 * Shared clipping routine behind {@link truncateDescription} and
 * {@link truncateExtendedDescription}.
 *
 * 1. Take the first `maxLength` characters.
 * 2. If the last sentence terminator (`. `, `! `, `? `) inside that cut
 *    sits at or past the acceptance floor, end the result on that
 *    terminator.
 * 3. Otherwise back up to the last space when it lies within the final
 *    60 characters of the cut (so a partial word is not emitted), then
 *    strip dangling stop-words and trailing punctuation / ellipsis.
 *
 * The acceptance floor is the lower of `minLength` and
 * `floor(minLength / earlyDivisor)`. Earlier revisions tested the same
 * terminator index against those two thresholds in turn; taking the
 * lower one is equivalent.
 *
 * @param text - Text already known to be longer than `maxLength`
 * @param maxLength - Hard cap on the result length
 * @param minLength - Preferred minimum length for a sentence-boundary cut
 * @param earlyDivisor - Divisor producing the relaxed minimum
 * @returns Clipped text; no ellipsis is appended
 */
function clipAtSentenceOrWord(text, maxLength, minLength, earlyDivisor) {
    const cut = text.slice(0, maxLength);
    // Prefer the last full sentence terminator within the cut so we don't
    // end on a dangling determiner ("…year. The").
    const sentenceEnd = Math.max(cut.lastIndexOf('. '), cut.lastIndexOf('! '), cut.lastIndexOf('? '));
    const acceptFrom = Math.min(minLength, Math.floor(minLength / earlyDivisor));
    if (sentenceEnd >= acceptFrom) {
        // Ends on the terminator itself; the space after it is excluded.
        return cut.slice(0, sentenceEnd + 1);
    }
    const lastSpace = cut.lastIndexOf(' ');
    // Only back up to a word boundary when that costs fewer than 60 chars.
    const wordSafe = lastSpace > maxLength - 60 ? cut.slice(0, lastSpace) : cut;
    // Drop dangling stop-words and trailing punctuation/ellipsis so we
    // never emit broken copy ("…year. The" → "…year").
    return stripTrailingStopWordsAndPunctuation(wordSafe);
}
/**
 * Clamp a string to `DESCRIPTION_MAX_LENGTH` characters on a sentence
 * boundary where possible, otherwise on a word boundary. Text that
 * already fits is returned untouched.
 *
 * NOTE(review): older documentation for this function promises a
 * trailing ellipsis on truncation; the implementation returns the
 * clipped text without one. Confirm which is intended.
 *
 * @param text - Raw description text
 * @returns Description no longer than `DESCRIPTION_MAX_LENGTH`
 */
export function truncateDescription(text) {
    if (text.length <= DESCRIPTION_MAX_LENGTH)
        return text;
    return clipAtSentenceOrWord(text, DESCRIPTION_MAX_LENGTH, DESCRIPTION_MIN_LENGTH, 3);
}
/**
 * Clamp an extended description to {@link EXTENDED_DESCRIPTION_MAX_LENGTH}
 * characters using the same sentence-then-word boundary logic as
 * {@link truncateDescription}.
 *
 * Returns `''` when the trimmed input is empty or no longer than
 * `DESCRIPTION_MAX_LENGTH`: there is no SEO win in an "extended"
 * description that is not longer than the regular one, and it would
 * make `og:description` shorter than `<meta description>`.
 *
 * @param text - Raw extended-description text (e.g. full BLUF paragraph)
 * @returns Truncated extended description, or `''` when not worth emitting
 */
export function truncateExtendedDescription(text) {
    const trimmed = text.trim();
    if (!trimmed)
        return '';
    if (trimmed.length <= DESCRIPTION_MAX_LENGTH)
        return '';
    if (trimmed.length <= EXTENDED_DESCRIPTION_MAX_LENGTH)
        return trimmed;
    return clipAtSentenceOrWord(trimmed, EXTENDED_DESCRIPTION_MAX_LENGTH, EXTENDED_DESCRIPTION_MIN_LENGTH, 2);
}
|
|
130
|
+
/**
 * Clamp a title to `TITLE_MAX_LENGTH` characters on a **clause**
 * boundary. Titles that already fit are returned untouched.
 *
 * **No mid-sentence ellipsis.** When the title overruns the budget and
 * no usable clause boundary exists, this function returns an empty
 * string instead of a mid-sentence `…` truncation. The empty return
 * tells the caller to fall through to the next tier of the resolver
 * ladder (template-fallback title with category + date), producing a
 * complete, scan-friendly title rather than a clipped editorial
 * fragment. Live-site regression (2026-05): titles such as
 * `AI Trade Strategy: A Legislative First with Structural…` and
 * `The European Parliament's 24 standing committees continued…`
 * were emitted before this guard.
 *
 * Two tiers are tried:
 *
 * 1. Every entry of `HEADLINE_CLAUSE_BOUNDARIES`, in priority order,
 *    using its last occurrence inside the first `TITLE_MAX_LENGTH`
 *    characters, accepted when at or past `HEADLINE_SOFT_MIN`.
 * 2. The strong boundaries `: `, ` — ` and ` – `, using the first
 *    occurrence inside `[HEADLINE_HARD_MIN, HEADLINE_SOFT_MIN)`.
 *
 * In both tiers the text before the boundary is cleaned with
 * `stripTrailingStopWordsAndPunctuation` and must still meet the tier's
 * minimum length.
 *
 * @param text - Raw title text
 * @returns Clause-truncated title (no ellipsis), or `''` when no
 *   editorial clause boundary exists in either window
 */
export function truncateTitle(text) {
    if (text.length <= TITLE_MAX_LENGTH)
        return text;
    const search = text.slice(0, TITLE_MAX_LENGTH);
    // Tier 1: prefer ending at a natural clause boundary inside the
    // `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]` window so the truncated
    // title reads as a complete journalistic clause. No ellipsis is
    // added since the result is grammatically complete.
    for (const boundary of HEADLINE_CLAUSE_BOUNDARIES) {
        const idx = search.lastIndexOf(boundary);
        if (idx >= HEADLINE_SOFT_MIN) {
            const clean = stripTrailingStopWordsAndPunctuation(text.slice(0, idx));
            if (clean.length >= HEADLINE_SOFT_MIN)
                return clean;
        }
    }
    // Tier 2: nothing landed in the soft window, so look for one of the
    // strongest semantic breaks inside the harder
    // `[HEADLINE_HARD_MIN, HEADLINE_SOFT_MIN)` floor. This rescues
    // Reader-Briefing-style ledes like
    // `Immediate priority: DMA enforcement — …` whose clauses cluster in
    // the opening chars. Commas and full stops are deliberately excluded
    // to avoid emitting trivial fragments from short prose.
    const STRONG_BOUNDARIES = [': ', ' — ', ' – '];
    for (const boundary of STRONG_BOUNDARIES) {
        // Start the scan at the hard minimum: a plain `indexOf(boundary)`
        // would stop at an earlier occurrence (e.g. a short `EU: ` prefix)
        // and never see a later one that does fall inside the window.
        const idx = search.indexOf(boundary, HEADLINE_HARD_MIN);
        if (idx >= HEADLINE_HARD_MIN && idx < HEADLINE_SOFT_MIN) {
            const clean = stripTrailingStopWordsAndPunctuation(text.slice(0, idx));
            if (clean.length >= HEADLINE_HARD_MIN)
                return clean;
        }
    }
    // No clause boundary in either window — refuse to emit a mid-sentence
    // truncation. Caller falls through to template-fallback composition.
    return '';
}
|
|
190
|
+
// ────────────────────────────────────────────────────────────────────────
|
|
191
|
+
// Sentence extraction
|
|
192
|
+
// ────────────────────────────────────────────────────────────────────────
|
|
193
|
+
/**
 * Return the first complete sentence from a prose paragraph, suitable
 * for use as a fallback editorial title when the artefact H1 is
 * categorical (e.g. `# EU Parliament Committee Reports`) and the
 * resolver must derive `<title>` from the BLUF / lede summary instead.
 *
 * A paragraph no longer than `HEADLINE_SOFT_MIN` (after trimming) is
 * returned whole. Otherwise a "sentence" is the prefix up to and
 * including the earliest sentence terminator (`. `, `! `, `? `, `; `)
 * found at or after `HEADLINE_SOFT_MIN` within the first
 * `TITLE_MAX_LENGTH * 1.5` characters — so the result may exceed
 * `TITLE_MAX_LENGTH` and still need clause truncation by the caller.
 *
 * Periods that belong to an abbreviation (as judged by
 * `isAbbreviationBoundary`) are skipped so they don't terminate the
 * sentence prematurely. That check is applied to `. ` only: it is
 * written for periods, and running it on `!`, `?` or `;` would wrongly
 * skip terminators that follow a single capital letter (`Annex I; …`).
 *
 * When no acceptable terminator exists in the window, returns `''` so
 * the resolver falls through to the next tier instead of feeding an
 * over-budget paragraph into {@link truncateTitle}.
 *
 * @param paragraph - Prose paragraph (post-`stripInlineMarkdown`)
 * @returns First sentence, or `''` when none can be identified within
 *   the search window
 */
export function extractFirstSentence(paragraph) {
    const trimmed = paragraph.trim();
    if (trimmed.length <= HEADLINE_SOFT_MIN)
        return trimmed;
    // Limit terminator search to TITLE_MAX_LENGTH * 1.5 — beyond that
    // a first sentence would be too long to be worth returning.
    const searchArea = trimmed.slice(0, Math.floor(TITLE_MAX_LENGTH * 1.5));
    const terminators = ['. ', '! ', '? ', '; '];
    let bestIdx = -1;
    for (const t of terminators) {
        // Only a period can be part of an abbreviation token.
        const mayBeAbbreviation = t === '. ';
        // First occurrence at or after the soft minimum, stepping past
        // any period that merely closes an abbreviation.
        let idx = searchArea.indexOf(t, HEADLINE_SOFT_MIN);
        while (idx !== -1 && mayBeAbbreviation && isAbbreviationBoundary(searchArea, idx)) {
            idx = searchArea.indexOf(t, idx + t.length);
        }
        // Keep the earliest qualifying terminator across all kinds.
        if (idx !== -1 && (bestIdx === -1 || idx < bestIdx))
            bestIdx = idx;
    }
    if (bestIdx >= HEADLINE_SOFT_MIN) {
        // `+ 1` keeps the terminator character, not the space after it.
        return trimmed.slice(0, bestIdx + 1).trim();
    }
    // No sentence terminator inside the window — return `''` so the
    // resolver falls through to the next tier.
    return '';
}
|
|
248
|
+
/**
 * Decide whether the `.` at `idx` in `text` closes an abbreviation
 * rather than a sentence.
 *
 * Three checks, first hit wins:
 *
 * 1. **Single capital initial** (`U.S.`, `E.U.`) — the character before
 *    the period is an ASCII capital and the one before that is the start
 *    of the string, a space, or another period.
 * 2. **Letters-only token** — the run of ASCII letters immediately
 *    before the period, plus the period, lowercased, is listed in
 *    {@link ABBREVIATION_PREFIXES} (e.g. `mr.`, `vs.`).
 * 3. **Extended token** — the same, but the run may also contain digits
 *    and inner periods, so tokens shaped like `q1.` or `e.g.` can match.
 *    A letters-only scan reduces those to `.` and `g.` respectively.
 *    Presumably the table holds such entries, as the
 *    `extractFirstSentence` docs list them — verify against
 *    `text-utils-constants`.
 *
 * @param text - Source text in its original mixed case
 * @param idx - Index of the `.` character in `text`
 * @returns `true` when the period at `idx` is part of an abbreviation
 */
function isAbbreviationBoundary(text, idx) {
    // Check 1: all-caps single-letter initial like `U.S.` or `E.U.`.
    if (idx >= 1) {
        const prev = text.charCodeAt(idx - 1);
        const isUpperLetter = prev >= 65 && prev <= 90; // 'A'..'Z'
        if (isUpperLetter && (idx === 1 || text[idx - 2] === ' ' || text[idx - 2] === '.')) {
            return true;
        }
    }
    // Check 2: scan backwards over ASCII letters to find the word start,
    // then compare the lowercased `word.` against the table.
    let lettersStart = idx;
    while (lettersStart > 0 && /[a-zA-Z]/u.test(text[lettersStart - 1] ?? ''))
        lettersStart--;
    if (ABBREVIATION_PREFIXES.includes(text.slice(lettersStart, idx + 1).toLowerCase())) {
        return true;
    }
    // Check 3: keep scanning over digits and inner periods so that
    // `Q1.` → `q1.` and `e.g.` → `e.g.` are looked up as whole tokens.
    let tokenStart = lettersStart;
    while (tokenStart > 0 && /[a-zA-Z0-9.]/u.test(text[tokenStart - 1] ?? ''))
        tokenStart--;
    if (tokenStart === lettersStart)
        return false; // nothing new to look up
    return ABBREVIATION_PREFIXES.includes(text.slice(tokenStart, idx + 1).toLowerCase());
}
|
|
277
|
+
//# sourceMappingURL=text-truncate.js.map
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module Aggregator/Metadata/TextUtilsConstants
|
|
3
|
+
* @description Shared byte-budget constants and vocabularies used by
|
|
4
|
+
* the metadata text helpers. Extracted from `text-utils.ts` so the
|
|
5
|
+
* truncation/extraction helpers can live in `text-truncate.ts`
|
|
6
|
+
* without creating a circular import — both modules import from
|
|
7
|
+
* here, and `text-utils.ts` re-exports the truncators for back-compat
|
|
8
|
+
* with existing call-sites.
|
|
9
|
+
*
|
|
10
|
+
* **No imports.** This is a pure leaf module: only constants and
|
|
11
|
+
* vocabularies, no functions, no I/O.
|
|
12
|
+
*/
|
|
13
|
+
/** Maximum length of the `<meta name="description">` content we will emit. */
|
|
14
|
+
export declare const DESCRIPTION_MAX_LENGTH = 180;
|
|
15
|
+
/**
|
|
16
|
+
* Maximum `og:description` / `twitter:description` length we will
|
|
17
|
+
* emit. Facebook truncates at ~300 characters in the preview card;
|
|
18
|
+
* Twitter at ~200. We aim for the longer cap so LinkedIn / Slack
|
|
19
|
+
* (which use the full OG payload) get the full BLUF context, then
|
|
20
|
+
* let Twitter clip naturally. Below this length the extended
|
|
21
|
+
* description is emitted verbatim; above it we sentence-boundary
|
|
22
|
+
* truncate.
|
|
23
|
+
*/
|
|
24
|
+
export declare const EXTENDED_DESCRIPTION_MAX_LENGTH = 300;
|
|
25
|
+
/** Target minimum extended-description length before we even emit it. */
|
|
26
|
+
export declare const EXTENDED_DESCRIPTION_MIN_LENGTH = 200;
|
|
27
|
+
/** Target minimum `<meta name="description">` content length before we append context. */
|
|
28
|
+
export declare const DESCRIPTION_MIN_LENGTH = 140;
|
|
29
|
+
/**
|
|
30
|
+
* Length below which a raw description is considered too short to stand
|
|
31
|
+
* on its own and gets enriched with date/context. Independent from
|
|
32
|
+
* {@link DESCRIPTION_MIN_LENGTH} (which controls sentence-boundary
|
|
33
|
+
* truncation behaviour). Set lower than DESCRIPTION_MIN_LENGTH so a
|
|
34
|
+
* clean 100-140 char prose lede is preserved verbatim instead of being
|
|
35
|
+
* padded with date/context boilerplate.
|
|
36
|
+
*/
|
|
37
|
+
export declare const ENRICHMENT_TRIGGER_LENGTH = 100;
|
|
38
|
+
/** Maximum `<title>` length — anything longer is truncated with an ellipsis. */
|
|
39
|
+
export declare const TITLE_MAX_LENGTH = 140;
|
|
40
|
+
/**
|
|
41
|
+
* Soft target for headline-style titles produced as a fallback from
|
|
42
|
+
* BLUF/lede prose. When the candidate exceeds `TITLE_MAX_LENGTH`, the
|
|
43
|
+
* truncator first looks for a natural clause boundary
|
|
44
|
+
* (`.`, `:`, `—`, `;`) inside the `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]`
|
|
45
|
+
* window and breaks there instead of mid-clause-with-ellipsis. This
|
|
46
|
+
* turns a 137-character truncated prose paragraph into a complete
|
|
47
|
+
* journalistic clause, which scans much better in news cards and SERP
|
|
48
|
+
* snippets without sacrificing the keyword-rich opening.
|
|
49
|
+
*/
|
|
50
|
+
export declare const HEADLINE_SOFT_MIN = 60;
|
|
51
|
+
/**
|
|
52
|
+
* Lower floor for clause-boundary acceptance when the soft-min window
|
|
53
|
+
* returns nothing. Used by {@link truncateTitle} as a second-tier
|
|
54
|
+
* fallback: when a long prose paragraph has its only natural clause
|
|
55
|
+
* boundaries (`: `, ` — `) clustered in the opening 30-60 characters
|
|
56
|
+
* (typical of Reader-Briefing-style ledes like `Immediate priority:
|
|
57
|
+
* DMA enforcement — …`), accept the strongest such boundary rather
|
|
58
|
+
* than fall through to template-fallback composition. This keeps
|
|
59
|
+
* scan-friendly editorial fragments intact while still rejecting
|
|
60
|
+
* fragments shorter than a typical news-card title.
|
|
61
|
+
*/
|
|
62
|
+
export declare const HEADLINE_HARD_MIN = 30;
|
|
63
|
+
/**
|
|
64
|
+
* Punctuation marks that signal a natural clause boundary inside a
|
|
65
|
+
* BLUF / lede paragraph. Listed in preferred-break order: a colon or
|
|
66
|
+
* em-dash that introduces a list of consequences is the best break,
|
|
67
|
+
* full stops are next, and semicolons last. Single ASCII space is
|
|
68
|
+
* always a fallback boundary handled separately.
|
|
69
|
+
*/
|
|
70
|
+
export declare const HEADLINE_CLAUSE_BOUNDARIES: readonly string[];
|
|
71
|
+
/**
|
|
72
|
+
* Emoji-banner prefixes that Stage-B agents use to decorate metadata rows
|
|
73
|
+
* (e.g. `📋 Analysis Owner:`). Any line starting with one of these is
|
|
74
|
+
* metadata, never prose.
|
|
75
|
+
*/
|
|
76
|
+
export declare const EMOJI_BANNER_CHARS: string[];
|
|
77
|
+
/**
|
|
78
|
+
* Label prefixes that a prose description must never start with. Every
|
|
79
|
+
* entry matches case-insensitively at the start of a trimmed line, followed
|
|
80
|
+
* by optional space and a colon.
|
|
81
|
+
*/
|
|
82
|
+
export declare const METADATA_LINE_PREFIXES: readonly string[];
|
|
83
|
+
/** Connector / determiner words that read as broken copy when they are
|
|
84
|
+
* the final token before a truncation ellipsis. */
|
|
85
|
+
export declare const TRAILING_STOP_WORDS: Set<string>;
|
|
86
|
+
/** Trailing characters we always strip before appending our own ellipsis,
|
|
87
|
+
* so we never emit double-ellipsis or stray punctuation. */
|
|
88
|
+
export declare const TRAILING_PUNCT: RegExp;
|
|
89
|
+
/**
|
|
90
|
+
* Abbreviation tokens (lowercase, including the trailing period) that
|
|
91
|
+
* should NOT count as sentence terminators when `extractFirstSentence`
|
|
92
|
+
* scans for a `.` boundary. Single-letter all-caps initials
|
|
93
|
+
* (`U.S.`, `E.U.`) are handled by the all-caps-initial check.
|
|
94
|
+
*/
|
|
95
|
+
export declare const ABBREVIATION_PREFIXES: readonly string[];
|
|
96
|
+
//# sourceMappingURL=text-utils-constants.d.ts.map
|