euparliamentmonitor 0.9.13 → 0.9.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -4
- package/scripts/aggregator/article-generator.js +2 -1
- package/scripts/aggregator/article-html.d.ts +9 -0
- package/scripts/aggregator/article-html.js +134 -13
- package/scripts/aggregator/article-metadata.d.ts +25 -161
- package/scripts/aggregator/article-metadata.js +71 -649
- package/scripts/aggregator/editorial-brief-resolver.d.ts +9 -0
- package/scripts/aggregator/editorial-brief-resolver.js +3 -1
- package/scripts/aggregator/metadata/date-labels.d.ts +122 -0
- package/scripts/aggregator/metadata/date-labels.js +209 -0
- package/scripts/aggregator/metadata/text-utils.d.ts +188 -0
- package/scripts/aggregator/metadata/text-utils.js +542 -0
- package/scripts/constants/og-locales.d.ts +15 -0
- package/scripts/constants/og-locales.js +17 -0
- package/scripts/constants/seo/index.d.ts +21 -0
- package/scripts/constants/seo/index.js +23 -0
- package/scripts/constants/seo/og-locales.d.ts +59 -0
- package/scripts/constants/seo/og-locales.js +59 -0
- package/scripts/constants/seo/social-handles.d.ts +50 -0
- package/scripts/constants/seo/social-handles.js +65 -0
- package/scripts/constants/social-handles.d.ts +11 -0
- package/scripts/constants/social-handles.js +13 -0
- package/scripts/discover-untranslated-briefs.js +224 -19
- package/scripts/generators/news-indexes.d.ts +35 -0
- package/scripts/generators/news-indexes.js +67 -6
- package/scripts/generators/political-intelligence/html.js +14 -6
- package/scripts/generators/seo-copy.js +42 -0
- package/scripts/generators/sitemap/html.js +13 -5
- package/scripts/lint-src-todos.js +124 -0
- package/scripts/utils/copy-test-reports.js +1 -1
- package/scripts/utils/generate-docs-index.js +1 -1
- package/scripts/validate-brief-translations.js +158 -18
|
@@ -0,0 +1,542 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
/**
|
|
4
|
+
* @module Aggregator/Metadata/TextUtils
|
|
5
|
+
* @description Pure text / Markdown utility helpers extracted from
|
|
6
|
+
* `article-metadata.ts` as a leaf module in the `metadata/` bounded
|
|
7
|
+
* context. Every helper here is concerned with **how to massage a
|
|
8
|
+
* string** into a meta-tag-safe shape — strip Markdown decorations,
|
|
9
|
+
* recognise banner / metadata rows that must never reach the
|
|
10
|
+
* description, clamp text to byte budgets without producing broken
|
|
11
|
+
* copy, and identify the first complete sentence in a prose paragraph.
|
|
12
|
+
*
|
|
13
|
+
* Bounded-context rules for this file:
|
|
14
|
+
* - **No upward imports** — pure helpers, no dependencies on other
|
|
15
|
+
* `src/aggregator/` modules, no I/O, no globals.
|
|
16
|
+
* - **Deterministic** — same input always produces same output; safe to
|
|
17
|
+
* property-test.
|
|
18
|
+
* - **Locale-agnostic** — every helper works on raw Markdown / prose
|
|
19
|
+
* in any of the 14 publishing languages. Banner-row detection is
|
|
20
|
+
* driven by structural shape (double-bold + pipe-separator), not by
|
|
21
|
+
* a hard-coded English vocabulary.
|
|
22
|
+
*
|
|
23
|
+
* The companion file `article-metadata.ts` re-exports the public surface
|
|
24
|
+
* for back-compat. New code should import directly from this module.
|
|
25
|
+
*/
|
|
26
|
+
// ────────────────────────────────────────────────────────────────────────
|
|
27
|
+
// Length budgets — meta description / title size envelopes
|
|
28
|
+
// ────────────────────────────────────────────────────────────────────────
|
|
29
|
+
/**
 * Maximum `<meta description>` length we will emit, measured with
 * `String.prototype.length`. {@link truncateDescription} returns longer
 * input clipped so that the result, ellipsis included, fits this cap.
 */
export const DESCRIPTION_MAX_LENGTH = 180;
/**
 * Maximum `og:description` / `twitter:description` length we will
 * emit. Facebook truncates at ~300 characters in the preview card;
 * Twitter at ~200. We aim for the longer cap so LinkedIn / Slack
 * (which use the full OG payload) get the full BLUF context, then
 * let Twitter clip naturally. At or below this length the extended
 * description is emitted verbatim; above it we sentence-boundary
 * truncate the same way as {@link truncateDescription}.
 */
export const EXTENDED_DESCRIPTION_MAX_LENGTH = 300;
/**
 * Soft minimum for an extended description. In this module it is the
 * smallest index at which {@link truncateExtendedDescription} accepts a
 * sentence terminator as the cut point; an earlier terminator is ignored
 * and the text is clipped at a word boundary instead.
 */
export const EXTENDED_DESCRIPTION_MIN_LENGTH = 200;
/**
 * Soft minimum for `<meta description>`. In this module it is the
 * smallest index at which {@link truncateDescription} accepts a sentence
 * terminator as the cut point; an earlier terminator is ignored and the
 * text is clipped at a word boundary instead.
 */
export const DESCRIPTION_MIN_LENGTH = 140;
/**
 * Length below which a raw description is considered too short to stand
 * on its own and gets enriched with date/context. Independent from
 * {@link DESCRIPTION_MIN_LENGTH} (which controls sentence-boundary
 * truncation behaviour). Set lower than DESCRIPTION_MIN_LENGTH so a
 * clean 100-140 char prose lede is preserved verbatim instead of being
 * padded with date/context boilerplate.
 *
 * Not referenced by any helper in this module; it is exported for the
 * enrichment logic that lives elsewhere.
 */
export const ENRICHMENT_TRIGGER_LENGTH = 100;
/**
 * Maximum `<title>` length. {@link truncateTitle} shortens anything
 * longer: either at a clause boundary (no ellipsis) or at a word
 * boundary with a trailing ellipsis.
 */
export const TITLE_MAX_LENGTH = 140;
/**
 * Soft target for headline-style titles produced as a fallback from
 * BLUF/lede prose. When the candidate exceeds `TITLE_MAX_LENGTH`, the
 * truncator first looks for a natural clause boundary
 * (`.`, `:`, `—`, `–`, `;`) inside the `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]`
 * window and breaks there instead of mid-clause-with-ellipsis. This
 * turns a 137-character truncated prose paragraph into a complete
 * journalistic clause, which scans much better in news cards and SERP
 * snippets without sacrificing the keyword-rich opening.
 */
export const HEADLINE_SOFT_MIN = 60;
/**
 * Separator strings that signal a natural clause boundary inside a
 * BLUF / lede paragraph. Listed in preferred-break order: a colon or
 * em/en dash that introduces a list of consequences is the best break,
 * full stops are next, and semicolons last. Every entry includes its
 * surrounding space(s), so bare punctuation inside a token (e.g. a
 * time such as `12:30`) is not treated as a boundary.
 */
export const HEADLINE_CLAUSE_BOUNDARIES = [': ', ' — ', ' – ', '. ', '; '];
|
|
75
|
+
// ────────────────────────────────────────────────────────────────────────
|
|
76
|
+
// Banner / metadata-row vocabularies
|
|
77
|
+
// ────────────────────────────────────────────────────────────────────────
|
|
78
|
+
/**
 * Emoji-banner prefixes that Stage-B agents use to decorate metadata rows
 * (e.g. `📋 Analysis Owner:`). Any line starting with one of these is
 * metadata, never prose.
 *
 * Consumed by {@link shouldSkipDescriptionLine}, which tests each entry
 * with `line.startsWith(...)` — so an entry only matches at the very
 * start of the (already trimmed) line.
 */
export const EMOJI_BANNER_CHARS = [
    '📋',
    '📅',
    '🔍',
    '🏛',
    '📰',
    '📊',
    '🏷',
    '📈',
    '📉',
    '⚠',
    '🔔',
    '🎯',
    '🗳',
    '🏢',
    '📄',
];
|
|
100
|
+
/**
 * Label prefixes that a prose description must never start with.
 * {@link shouldSkipDescriptionLine} lowercases both the line and each
 * entry, strips leading emphasis markers from the line, and rejects the
 * line when it starts with an entry that is followed by `:`, ` :`,
 * `*:` or `**:` (the last two cover a bold/italic label whose closing
 * marker sits before the colon).
 *
 * Entries are English-only; localized banner rows are caught
 * structurally by `isLocalizedBannerRow` instead.
 */
export const METADATA_LINE_PREFIXES = [
    'Admiralty Grade',
    'Analysis Date',
    'Analysis Owner',
    'Article Type',
    'Article Window',
    'Assessment Date',
    'Briefing',
    'Briefing Date',
    'Classification',
    'Classification Date',
    'Confidence',
    'Confidence in Evidence',
    'Data Sources',
    'Date',
    'Document Type',
    'Filing Date',
    'Generated',
    'Horizon',
    'IMF Status',
    'Last Updated',
    'Parliamentary Status',
    'Parliamentary Term',
    'Period',
    'Prepared',
    'Purpose',
    'Region',
    'Reporting',
    'Reporting Period',
    'Reporting Window',
    'Run',
    'Run ID',
    'Series',
    'Series Run',
    'Source',
    'Sources',
    'SPDX-FileCopyrightText',
    'SPDX-License-Identifier',
    'Topic',
    'Type',
    'WEP Band',
    'WEP Grade',
    'Window',
];
|
|
149
|
+
// ────────────────────────────────────────────────────────────────────────
|
|
150
|
+
// Trailing-cleanup vocabularies (used by truncation helpers)
|
|
151
|
+
// ────────────────────────────────────────────────────────────────────────
|
|
152
|
+
/**
 * Connector / determiner words that read as broken copy when they are
 * the final token before a truncation ellipsis.
 *
 * Entries are lowercase; `stripTrailingStopWordsAndPunctuation`
 * lowercases the last space-delimited token of the candidate before
 * looking it up here, so the match is case-insensitive. English only.
 */
export const TRAILING_STOP_WORDS = new Set([
    'the',
    'a',
    'an',
    'of',
    'to',
    'for',
    'in',
    'on',
    'at',
    'by',
    'and',
    'or',
    'with',
    'from',
]);
|
|
170
|
+
/**
 * Single-character matcher for trailing characters we always strip
 * before appending our own ellipsis, so we never emit double-ellipsis
 * or stray punctuation.
 *
 * Covers `.`, `,`, `;`, `:`, em dash (U+2014), en dash (U+2013),
 * hyphen-minus, horizontal ellipsis (U+2026) and any whitespace. The en
 * dash is included because `HEADLINE_CLAUSE_BOUNDARIES` treats ` – ` as
 * a clause separator on a par with the em dash; without it a clipped
 * "… 2024 – and" would be emitted as "… 2024 –…".
 *
 * Non-ASCII members are written as `\u` escapes so the pattern survives
 * tooling that normalizes or re-encodes source text. The regex has no
 * `g`/`y` flag, so `.test()` is stateless and safe to call in a loop.
 */
export const TRAILING_PUNCT = /[.,;:\u2014\u2013\-\u2026\s]/u;
|
|
173
|
+
/**
 * Abbreviation tokens (lowercase, including the trailing period) that
 * should NOT count as sentence terminators when {@link extractFirstSentence}
 * scans for a `.` boundary. Single-letter all-caps initials
 * (`U.S.`, `E.U.`) are not listed here; they are recognised separately
 * by the capital-initial check in `isAbbreviationBoundary`.
 *
 * NOTE(review): entries that contain a digit or an inner period
 * (`q1.`, `h2.`, `e.g.`, `i.e.`) can only match if the consumer's token
 * scan spans those characters — confirm against `isAbbreviationBoundary`.
 */
export const ABBREVIATION_PREFIXES = [
    'mr.',
    'mrs.',
    'ms.',
    'dr.',
    'st.',
    'no.',
    'vs.',
    'e.g.',
    'i.e.',
    'etc.',
    'cf.',
    'al.',
    // EP fiscal-year and quarter shorthand: Q1., Q2., Q3., Q4., H1., H2., FY.
    'q1.',
    'q2.',
    'q3.',
    'q4.',
    'h1.',
    'h2.',
    'fy.',
];
|
|
201
|
+
// ────────────────────────────────────────────────────────────────────────
|
|
202
|
+
// Line-classification helpers
|
|
203
|
+
// ────────────────────────────────────────────────────────────────────────
|
|
204
|
+
/**
 * Return `true` when a line cannot serve as a prose description.
 *
 * A line is rejected when it is:
 * - empty;
 * - a Markdown / HTML structural line — starts with `#`, `>`, `<`, `|`,
 *   `---`, `===`, a code fence (three backticks or `~~~`), or `%%`;
 * - a `title …` directive (the word `title` followed by whitespace,
 *   case-insensitive — used by chart/diagram blocks);
 * - an emoji-banner row (starts with an `EMOJI_BANNER_CHARS` entry);
 * - a known `Key: value` banner: after leading emphasis markers
 *   (`*` and/or `_`, in any mix) are removed, the line starts with a
 *   `METADATA_LINE_PREFIXES` entry followed by optional closing emphasis
 *   markers, at most one space, and an ASCII colon — so `Date:`,
 *   `Date :`, `**Date:**`, `**Date**:`, `__Date__:` and `_**Date:**_`
 *   are all caught;
 * - a horizontal rule made of three or more of `- * _ = ~ .`;
 * - a localized banner row (see `isLocalizedBannerRow`).
 *
 * @param line - Trimmed line from the aggregated Markdown source
 * @returns `true` when the line is not prose and should be skipped
 */
export function shouldSkipDescriptionLine(line) {
    if (line.length === 0)
        return true;
    const structuralOpeners = ['#', '>', '<', '|', '---', '===', '```', '~~~', '%%'];
    if (structuralOpeners.some((opener) => line.startsWith(opener)))
        return true;
    if (/^title\s/i.test(line))
        return true;
    if (EMOJI_BANNER_CHARS.some((char) => line.startsWith(char)))
        return true;
    // Normalize once, outside the prefix loop: drop every leading emphasis
    // marker (asterisks and underscores may be mixed), then lowercase.
    const label = line.replace(/^[*_]+/, '').trim().toLowerCase();
    const isMetadataLabel = METADATA_LINE_PREFIXES.some((prefix) => {
        const key = prefix.toLowerCase();
        if (!label.startsWith(key))
            return false;
        let pos = key.length;
        // Closing emphasis markers may sit between the label and the colon
        // (`**Date**:`, `__Date__:`).
        while (label[pos] === '*' || label[pos] === '_')
            pos += 1;
        if (label[pos] === ' ')
            pos += 1;
        return label[pos] === ':';
    });
    if (isMetadataLabel)
        return true;
    if (/^[-*_=~.]{3,}$/.test(line))
        return true;
    return isLocalizedBannerRow(line);
}
|
|
251
|
+
/**
 * Language-agnostic banner-row detector. Stage-B artefacts open with a
 * metadata banner of the shape
 * `**Date:** 2026-05-15 | **Type:** Breaking | **Run:** breaking-run-001`
 * and its localized siblings — notably Japanese / Chinese / Korean briefs,
 * which use the full-width colon (U+FF1A) and place it **inside** the
 * bold span rather than after it. The `METADATA_LINE_PREFIXES` table
 * only covers the English vocabulary; this helper catches the structural
 * shape directly: a line that starts with `**`, contains at least one
 * `|` separator, and carries two-or-more bold key markers that end with
 * — or are followed by — an ASCII colon (U+003A) or a full-width colon
 * (U+FF1A).
 *
 * The full-width colon is written as the escape `\uFF1A` on purpose: a
 * literal character is easily folded to an ASCII colon by Unicode
 * normalization in editors or build tooling, which silently disables
 * the CJK branch.
 *
 * @param line - Trimmed source line
 * @returns `true` when the line is a banner row in any locale
 */
function isLocalizedBannerRow(line) {
    if (!line.startsWith('**') || !line.includes('|'))
        return false;
    // Colon inside the bold span: `**Key:**`
    const inside = (line.match(/\*\*[^*]+[:\uFF1A]\s*\*\*/g) ?? []).length;
    // Colon right after the bold span: `**Key**:`
    const after = (line.match(/\*\*[^*]+\*\*\s*[:\uFF1A]/g) ?? []).length;
    return inside + after >= 2;
}
|
|
278
|
+
/**
 * Remove a leading all-caps prose label (e.g. `SITUATION:`, `KEY MOTION:`,
 * `BLUF:`, `BOTTOM LINE:`, `TIER-1:`) from a prose line. Such labels are
 * common in BLUF-style editorial writing; they survive
 * {@link stripInlineMarkdown} (which removes the `**bold**` wrapper but
 * keeps the text) and would otherwise open the SEO description with an
 * all-caps shout.
 *
 * The label is everything before the first `": "` in the line. It is
 * stripped only when all of the following hold:
 * - it is 3 to 80 characters long;
 * - it starts with `A-Z` and otherwise contains only `A-Z`, `0-9`,
 *   spaces and hyphens;
 * - at least 20 characters of text remain after it (trimmed).
 *
 * In every other case the original line is returned untouched.
 *
 * @param line - Plain prose line (post-{@link stripInlineMarkdown})
 * @returns Line with the all-caps opener removed, or the input unchanged
 */
export function stripLeadingProseLabel(line) {
    const labelShape = /^[A-Z][A-Z0-9 -]{1,79}$/;
    const separatorAt = line.indexOf(': ');
    const labelLengthOk = separatorAt >= 3 && separatorAt <= 80;
    if (!labelLengthOk)
        return line;
    const opener = line.slice(0, separatorAt);
    const body = line.slice(separatorAt + 2).trim();
    return body.length >= 20 && labelShape.test(opener) ? body : line;
}
|
|
307
|
+
/**
 * Strip inline Markdown decorations so the remaining text can be used
 * as plain-text meta-tag content. In order, it:
 * 1. replaces images `![alt](src)` with their alt text;
 * 2. replaces links `[label](href)` with their label;
 * 3. unwraps inline code spans;
 * 4. unwraps strong emphasis (`**x**`, `__x__`);
 * 5. unwraps emphasis (`*x*`, `_x_`);
 * 6. unwraps strikethrough (`~~x~~`);
 * 7. collapses every whitespace run to a single space and trims.
 *
 * Single-underscore emphasis is only recognised when the underscores
 * are not glued to a letter or digit on the outside. Intraword
 * underscores are not emphasis in CommonMark, and treating them as such
 * mangles identifiers (`snake_case_name` would become `snakecasename`).
 *
 * Every span is capped at 500 characters and never crosses a newline,
 * which keeps each pattern linear-time on hostile input. HTML entities
 * are NOT decoded or removed here.
 *
 * @param raw - Trimmed Markdown line
 * @returns Plain-text variant
 */
export function stripInlineMarkdown(raw) {
    return raw
        .replace(/!\[([^\]\n]{0,500})\]\(([^)\n]{0,500})\)/g, '$1')
        .replace(/\[([^\]\n]{1,500})\]\(([^)\n]{0,500})\)/g, '$1')
        .replace(/`([^`\n]{1,500})`/g, '$1')
        .replace(/\*\*([^*\n]{1,500})\*\*/g, '$1')
        .replace(/__([^_\n]{1,500})__/g, '$1')
        .replace(/\*([^*\n]{1,500})\*/g, '$1')
        // Word-boundary guarded: leave `snake_case_name` alone.
        .replace(/(?<![\p{L}\p{N}])_([^_\n]{1,500})_(?![\p{L}\p{N}])/gu, '$1')
        .replace(/~~([^~\n]{1,500})~~/g, '$1')
        .replace(/\s+/g, ' ')
        .trim();
}
|
|
329
|
+
// ────────────────────────────────────────────────────────────────────────
|
|
330
|
+
// Truncation helpers
|
|
331
|
+
// ────────────────────────────────────────────────────────────────────────
|
|
332
|
+
/**
 * Repeatedly strip, from the end of a pre-clipped string:
 * - a dangling UTF-16 high surrogate (U+D800–U+DBFF) — the truncation
 *   helpers clip with `String.prototype.slice`, which counts code units,
 *   so text without a usable word boundary (e.g. CJK prose containing an
 *   emoji or supplementary-plane ideograph) can be cut in the middle of
 *   a surrogate pair; emitting the lone half before the ellipsis would
 *   put an invalid character into the meta tag;
 * - trailing punctuation / whitespace as defined by `TRAILING_PUNCT`
 *   (including any pre-existing ellipsis);
 * - a trailing stop-word from `TRAILING_STOP_WORDS` together with the
 *   single space that precedes it.
 *
 * The passes repeat until none of them changes the string. Implemented
 * imperatively to avoid super-linear regex backtracking on the
 * `(?:\s+stop-word)+$` pattern flagged by `security/detect-unsafe-regex`.
 *
 * A string that consists of a single stop-word with no preceding space
 * is returned as is.
 *
 * @param input - Pre-clipped string to clean up
 * @returns Cleaned string with no trailing stop-words, punctuation or
 *   broken surrogate half
 */
function stripTrailingStopWordsAndPunctuation(input) {
    let result = input;
    let changed = true;
    while (changed) {
        changed = false;
        // `charCodeAt` yields NaN on an empty string, so both comparisons
        // are false and nothing is removed.
        const lastUnit = result.charCodeAt(result.length - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
            result = result.slice(0, -1);
            changed = true;
        }
        while (result.length > 0 && TRAILING_PUNCT.test(result.charAt(result.length - 1))) {
            result = result.slice(0, -1);
            changed = true;
        }
        const lastSpace = result.lastIndexOf(' ');
        if (lastSpace >= 0) {
            const tail = result.slice(lastSpace + 1).toLowerCase();
            if (TRAILING_STOP_WORDS.has(tail)) {
                result = result.slice(0, lastSpace);
                changed = true;
            }
        }
    }
    return result;
}
|
|
361
|
+
/**
 * Fit a description into `DESCRIPTION_MAX_LENGTH` characters.
 *
 * Text that already fits is returned unchanged. Otherwise the text is
 * clipped to one character less than the budget and shortened by the
 * first rule that applies:
 * 1. If the clipped text contains a sentence terminator (`. `, `! `,
 *    `? `) at or beyond `DESCRIPTION_MIN_LENGTH`, cut right after the
 *    last such terminator and return the result without an ellipsis.
 * 2. Otherwise back up to the last space — but only when that space lies
 *    within the final 60 characters of the budget, so a very long
 *    unbroken tail does not cost most of the text — then drop dangling
 *    stop-words / punctuation and append `…`.
 *
 * @param text - Raw description text
 * @returns The input when it fits, else a shortened copy that ends at a
 *   sentence boundary or with a trailing ellipsis
 */
export function truncateDescription(text) {
    const budget = DESCRIPTION_MAX_LENGTH;
    if (text.length <= budget)
        return text;
    const head = text.slice(0, budget - 1);
    // Latest sentence terminator inside the clipped text (-1 when none).
    const boundary = ['. ', '! ', '? '].reduce((latest, mark) => Math.max(latest, head.lastIndexOf(mark)), -1);
    if (boundary >= DESCRIPTION_MIN_LENGTH) {
        return head.slice(0, boundary + 1).replace(/\s+$/, '');
    }
    const wordBreak = head.lastIndexOf(' ');
    const kept = wordBreak > budget - 60 ? head.slice(0, wordBreak) : head;
    // Cleaning first prevents broken copy ("…year. The…") and a doubled
    // ellipsis when the input already ended in one.
    return `${stripTrailingStopWordsAndPunctuation(kept)}…`;
}
|
|
390
|
+
/**
 * Fit an extended (`og:` / `twitter:`) description into
 * {@link EXTENDED_DESCRIPTION_MAX_LENGTH} characters, using the same
 * sentence-boundary-first strategy as {@link truncateDescription}.
 *
 * The input is trimmed first. The result is `''` when the trimmed text
 * is empty or no longer than `DESCRIPTION_MAX_LENGTH`: an "extended"
 * description that is not actually longer than the regular one brings
 * no benefit and would make `og:description` shorter than
 * `<meta description>`. Text that fits the extended budget is returned
 * (trimmed) as is. Longer text is cut after the last sentence
 * terminator at or beyond `EXTENDED_DESCRIPTION_MIN_LENGTH`, or failing
 * that at a word boundary with a trailing `…`.
 *
 * @param text - Raw extended-description text (e.g. full BLUF paragraph)
 * @returns Truncated extended description, or `''` when not worth emitting
 */
export function truncateExtendedDescription(text) {
    const candidate = text.trim();
    const worthEmitting = candidate.length > DESCRIPTION_MAX_LENGTH;
    if (!candidate || !worthEmitting)
        return '';
    const budget = EXTENDED_DESCRIPTION_MAX_LENGTH;
    if (candidate.length <= budget)
        return candidate;
    const head = candidate.slice(0, budget - 1);
    const boundary = ['. ', '! ', '? '].reduce((latest, mark) => Math.max(latest, head.lastIndexOf(mark)), -1);
    if (boundary >= EXTENDED_DESCRIPTION_MIN_LENGTH) {
        return head.slice(0, boundary + 1).replace(/\s+$/, '');
    }
    const wordBreak = head.lastIndexOf(' ');
    // Only back up to the last space when it sits in the final 60
    // characters of the budget.
    const kept = wordBreak > budget - 60 ? head.slice(0, wordBreak) : head;
    return `${stripTrailingStopWordsAndPunctuation(kept)}…`;
}
|
|
422
|
+
/**
 * Fit a title into `TITLE_MAX_LENGTH` characters.
 *
 * A title that already fits is returned unchanged. Otherwise:
 * 1. Clause break — for each separator in `HEADLINE_CLAUSE_BOUNDARIES`,
 *    in priority order, find its last occurrence within the first
 *    `TITLE_MAX_LENGTH` characters. If it lies at or beyond
 *    `HEADLINE_SOFT_MIN`, and the text before it is still at least
 *    `HEADLINE_SOFT_MIN` long after dangling stop-words / punctuation
 *    are removed, return that text. No ellipsis is added because the
 *    result reads as a complete clause.
 * 2. Word break — clip to one character under the budget, back up to
 *    the last space when it lies within the final 40 characters of the
 *    budget, clean the tail and append `…`.
 *
 * @param text - Raw title text
 * @returns The input when it fits, else a clause-complete prefix or a
 *   word-boundary prefix with a trailing ellipsis
 */
export function truncateTitle(text) {
    if (text.length <= TITLE_MAX_LENGTH)
        return text;
    const searchable = text.slice(0, TITLE_MAX_LENGTH);
    for (const mark of HEADLINE_CLAUSE_BOUNDARIES) {
        const at = searchable.lastIndexOf(mark);
        if (at < HEADLINE_SOFT_MIN)
            continue;
        const clause = stripTrailingStopWordsAndPunctuation(text.slice(0, at));
        if (clause.length >= HEADLINE_SOFT_MIN)
            return clause;
    }
    const head = text.slice(0, TITLE_MAX_LENGTH - 1);
    const wordBreak = head.lastIndexOf(' ');
    const kept = wordBreak > TITLE_MAX_LENGTH - 40 ? head.slice(0, wordBreak) : head;
    return `${stripTrailingStopWordsAndPunctuation(kept)}…`;
}
|
|
453
|
+
// ────────────────────────────────────────────────────────────────────────
|
|
454
|
+
// Sentence extraction
|
|
455
|
+
// ────────────────────────────────────────────────────────────────────────
|
|
456
|
+
/**
 * Return the first complete sentence from a prose paragraph, suitable
 * for use as a fallback editorial title when the artefact H1 is
 * categorical (e.g. `# EU Parliament Committee Reports`) and the
 * resolver must derive `<title>` from the BLUF / lede summary instead.
 *
 * The paragraph is trimmed; when it is no longer than
 * `HEADLINE_SOFT_MIN` it is returned as is. Otherwise the earliest
 * sentence terminator (`. `, `! `, `? `, `; `) is searched for between
 * index `HEADLINE_SOFT_MIN` and 1.5 × `TITLE_MAX_LENGTH` — beyond that
 * it is better to let {@link truncateTitle} clause-truncate the
 * paragraph than to return an over-long first sentence. Periods that
 * `isAbbreviationBoundary` attributes to an abbreviation are skipped so
 * they don't end the sentence prematurely.
 *
 * The returned sentence keeps a closing `.`, `!` or `?`, but a `;`
 * terminator is dropped: a headline ending in a semicolon reads as
 * broken copy. When no acceptable terminator exists in the search
 * range, the whole trimmed paragraph is returned so that
 * {@link truncateTitle} can handle clause-boundary truncation
 * downstream.
 *
 * @param paragraph - Prose paragraph (post-{@link stripInlineMarkdown})
 * @returns First sentence, or the trimmed paragraph when none can be
 *   identified within the search range
 */
export function extractFirstSentence(paragraph) {
    const trimmed = paragraph.trim();
    if (trimmed.length <= HEADLINE_SOFT_MIN)
        return trimmed;
    const searchArea = trimmed.slice(0, Math.floor(TITLE_MAX_LENGTH * 1.5));
    const terminators = ['. ', '! ', '? ', '; '];
    let bestIdx = -1;
    for (const terminator of terminators) {
        let from = HEADLINE_SOFT_MIN;
        let idx;
        // Walk this terminator's occurrences until one is not an
        // abbreviation period; keep the earliest hit across all terminators.
        while ((idx = searchArea.indexOf(terminator, from)) !== -1) {
            if (!isAbbreviationBoundary(searchArea, idx)) {
                if (bestIdx === -1 || idx < bestIdx)
                    bestIdx = idx;
                break;
            }
            from = idx + terminator.length;
        }
    }
    if (bestIdx === -1)
        return trimmed;
    // Keep `.`, `!`, `?` as the closing mark; exclude a `;`.
    const end = trimmed[bestIdx] === ';' ? bestIdx : bestIdx + 1;
    return trimmed.slice(0, end).trim();
}
|
|
513
|
+
/**
|
|
514
|
+
* Check whether the character preceding the `.` at `idx` in `text`
|
|
515
|
+
* indicates an abbreviation (so the `.` is not a sentence terminator).
|
|
516
|
+
* Matches the {@link ABBREVIATION_PREFIXES} table and the all-caps
|
|
517
|
+
* single-letter initials pattern (`U.S.`, `E.U.`).
|
|
518
|
+
*
|
|
519
|
+
* @param text - Source text (lowercased segment + original mixed-case)
|
|
520
|
+
* @param idx - Index of the `.` character in `text`
|
|
521
|
+
* @returns `true` when the period at `idx` is part of an abbreviation
|
|
522
|
+
*/
|
|
523
|
+
function isAbbreviationBoundary(text, idx) {
|
|
524
|
+
// All-caps single-letter initial like `U.S.` or `E.U.` — char at
|
|
525
|
+
// idx-1 is a capital letter, and idx-2 is either start of string,
|
|
526
|
+
// whitespace, or another single-letter+period pair.
|
|
527
|
+
if (idx >= 1) {
|
|
528
|
+
const prev = text.charCodeAt(idx - 1);
|
|
529
|
+
const isUpperLetter = prev >= 65 && prev <= 90;
|
|
530
|
+
if (isUpperLetter && (idx === 1 || text[idx - 2] === ' ' || text[idx - 2] === '.')) {
|
|
531
|
+
return true;
|
|
532
|
+
}
|
|
533
|
+
}
|
|
534
|
+
// ABBREVIATION_PREFIXES lookup — scan backwards from `.` to find the
|
|
535
|
+
// start of the word, then compare lowercased.
|
|
536
|
+
let start = idx;
|
|
537
|
+
while (start > 0 && /[a-zA-Z]/u.test(text[start - 1] ?? ''))
|
|
538
|
+
start--;
|
|
539
|
+
const token = text.slice(start, idx + 1).toLowerCase();
|
|
540
|
+
return ABBREVIATION_PREFIXES.includes(token);
|
|
541
|
+
}
|
|
542
|
+
//# sourceMappingURL=text-utils.js.map
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module Constants/OgLocales
|
|
3
|
+
* @description Backward-compatible re-export shim. The canonical
|
|
4
|
+
* location is `src/constants/seo/og-locales.ts`; this file remains so
|
|
5
|
+
* existing imports `from '../constants/og-locales.js'` keep working
|
|
6
|
+
* through the May-2026 architecture refactor.
|
|
7
|
+
*
|
|
8
|
+
* New code SHOULD import from `src/constants/seo/index.js`:
|
|
9
|
+
*
|
|
10
|
+
* ```ts
|
|
11
|
+
* import { OG_LOCALES, getOgLocale, buildOgLocaleTags } from '../constants/seo/index.js';
|
|
12
|
+
* ```
|
|
13
|
+
*/
|
|
14
|
+
export { OG_LOCALES, getOgLocale, buildOgLocaleTags } from './seo/og-locales.js';
|
|
15
|
+
//# sourceMappingURL=og-locales.d.ts.map
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
/**
|
|
4
|
+
* @module Constants/OgLocales
|
|
5
|
+
* @description Backward-compatible re-export shim. The canonical
|
|
6
|
+
* location is `src/constants/seo/og-locales.ts`; this file remains so
|
|
7
|
+
* existing imports `from '../constants/og-locales.js'` keep working
|
|
8
|
+
* through the May-2026 architecture refactor.
|
|
9
|
+
*
|
|
10
|
+
* New code SHOULD import from `src/constants/seo/index.js`:
|
|
11
|
+
*
|
|
12
|
+
* ```ts
|
|
13
|
+
* import { OG_LOCALES, getOgLocale, buildOgLocaleTags } from '../constants/seo/index.js';
|
|
14
|
+
* ```
|
|
15
|
+
*/
|
|
16
|
+
export { OG_LOCALES, getOgLocale, buildOgLocaleTags } from './seo/og-locales.js';
|
|
17
|
+
//# sourceMappingURL=og-locales.js.map
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module Constants/Seo
|
|
3
|
+
* @description Bounded-context barrel for the SEO-header constants
|
|
4
|
+
* shared by the four public HTML surfaces (news article, news index,
|
|
5
|
+
* sitemap, political-intelligence landing).
|
|
6
|
+
*
|
|
7
|
+
* Public API:
|
|
8
|
+
* - `OG_LOCALES`, `getOgLocale`, `buildOgLocaleTags` — BCP-47 locale
|
|
9
|
+
* mapping and tag emitters for the OpenGraph `og:locale[:alternate]`
|
|
10
|
+
* block.
|
|
11
|
+
* - `ORG_SAME_AS`, `TWITTER_SITE_HANDLE`, `TWITTER_CREATOR_HANDLE`,
|
|
12
|
+
* `buildTwitterAttributionTags` — canonical publisher/handles.
|
|
13
|
+
*
|
|
14
|
+
* Consumers MUST import from `src/constants/seo/index.js` (this
|
|
15
|
+
* barrel), not from the individual files inside `src/constants/seo/`.
|
|
16
|
+
* The drift-guard unit test in `test/unit/bounded-contexts.test.js`
|
|
17
|
+
* enforces this contract.
|
|
18
|
+
*/
|
|
19
|
+
export { OG_LOCALES, getOgLocale, buildOgLocaleTags } from './og-locales.js';
|
|
20
|
+
export { TWITTER_SITE_HANDLE, TWITTER_CREATOR_HANDLE, ORG_SAME_AS, buildTwitterAttributionTags, } from './social-handles.js';
|
|
21
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
/**
|
|
4
|
+
* @module Constants/Seo
|
|
5
|
+
* @description Bounded-context barrel for the SEO-header constants
|
|
6
|
+
* shared by the four public HTML surfaces (news article, news index,
|
|
7
|
+
* sitemap, political-intelligence landing).
|
|
8
|
+
*
|
|
9
|
+
* Public API:
|
|
10
|
+
* - `OG_LOCALES`, `getOgLocale`, `buildOgLocaleTags` — BCP-47 locale
|
|
11
|
+
* mapping and tag emitters for the OpenGraph `og:locale[:alternate]`
|
|
12
|
+
* block.
|
|
13
|
+
* - `ORG_SAME_AS`, `TWITTER_SITE_HANDLE`, `TWITTER_CREATOR_HANDLE`,
|
|
14
|
+
* `buildTwitterAttributionTags` — canonical publisher/handles.
|
|
15
|
+
*
|
|
16
|
+
* Consumers MUST import from `src/constants/seo/index.js` (this
|
|
17
|
+
* barrel), not from the individual files inside `src/constants/seo/`.
|
|
18
|
+
* The drift-guard unit test in `test/unit/bounded-contexts.test.js`
|
|
19
|
+
* enforces this contract.
|
|
20
|
+
*/
|
|
21
|
+
export { OG_LOCALES, getOgLocale, buildOgLocaleTags } from './og-locales.js';
|
|
22
|
+
export { TWITTER_SITE_HANDLE, TWITTER_CREATOR_HANDLE, ORG_SAME_AS, buildTwitterAttributionTags, } from './social-handles.js';
|
|
23
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module Constants/OgLocales
|
|
3
|
+
* @description OpenGraph locale codes (`en_GB`, `sv_SE`, …) for
|
|
4
|
+
* the 14 supported languages, plus helpers to emit the canonical
|
|
5
|
+
* `og:locale` and 13 `og:locale:alternate` meta tags on every page.
|
|
6
|
+
*
|
|
7
|
+
* **Why this matters.** OpenGraph (Facebook, LinkedIn, Slack, Discord)
|
|
8
|
+
* and Twitter/X expect `og:locale` to follow the underscore-separated
|
|
9
|
+
* form (`language_TERRITORY`). The ISO 639-1 bare code (`en`, `sv`)
|
|
10
|
+
* that the rest of the site uses internally is accepted by Google's
|
|
11
|
+
* `inLanguage` schema but breaks social-card locale routing — Facebook
|
|
12
|
+
* silently falls back to `en_US` and serves the English preview to
|
|
13
|
+
* Swedish/German/Arabic users.
|
|
14
|
+
*
|
|
15
|
+
* **Choice of region tag.** Where a language has an obvious primary EU
|
|
16
|
+
* jurisdiction we use it (`sv_SE`, `de_DE`, `fr_FR`, `es_ES`, `nl_NL`).
|
|
17
|
+
* For Arabic, Hebrew, Japanese, Korean, Chinese, Norwegian, Danish,
|
|
18
|
+
* Finnish we pick the canonical CLDR/ISO 3166 region. English is
|
|
19
|
+
* `en_GB` (not `en_US`) — the editorial voice targets EU readers.
|
|
20
|
+
*
|
|
21
|
+
* The mapping table is the **single source of truth** for the social-
|
|
22
|
+
* card surface and is consumed by every HTML generator in
|
|
23
|
+
* `src/generators/` plus `src/aggregator/article-html.ts`. Tests
|
|
24
|
+
* assert byte-equivalent output across the four surfaces.
|
|
25
|
+
*/
|
|
26
|
+
import type { LanguageCode } from '../../types/index.js';
|
|
27
|
+
/**
|
|
28
|
+
* OpenGraph locale code per supported language.
|
|
29
|
+
*
|
|
30
|
+
* The values follow `<language>_<TERRITORY>` (underscore-separated)
|
|
31
|
+
* as required by the OpenGraph protocol. Use the helpers below rather
|
|
32
|
+
* than reading the map directly so the locale logic stays in one
|
|
33
|
+
* place.
|
|
34
|
+
*/
|
|
35
|
+
export declare const OG_LOCALES: Readonly<Record<LanguageCode, string>>;
|
|
36
|
+
/**
|
|
37
|
+
* Return the OpenGraph locale code for a given ISO 639-1 language code.
|
|
38
|
+
* Falls back to `en_GB` for unknown languages — the same fallback the
|
|
39
|
+
* rest of the site uses for missing translations.
|
|
40
|
+
*
|
|
41
|
+
* @param lang - ISO 639-1 language code (e.g., `"en"`, `"sv"`)
|
|
42
|
+
* @returns OpenGraph `language_TERRITORY` locale (e.g., `"en_GB"`)
|
|
43
|
+
*/
|
|
44
|
+
export declare function getOgLocale(lang: string): string;
|
|
45
|
+
/**
|
|
46
|
+
* Build the OpenGraph locale meta tag block — one canonical
|
|
47
|
+
* `og:locale` for the current language plus an `og:locale:alternate`
|
|
48
|
+
* for every other supported language. Emitting the alternates lets the
|
|
49
|
+
* Facebook/LinkedIn crawler discover the localized siblings without
|
|
50
|
+
* having to follow the `<link rel="alternate" hreflang>` chain.
|
|
51
|
+
*
|
|
52
|
+
* The output is intentionally indented with two spaces to match the
|
|
53
|
+
* surrounding `<head>` formatting in the four generators.
|
|
54
|
+
*
|
|
55
|
+
* @param currentLang - Language being rendered (drives `og:locale`)
|
|
56
|
+
* @returns Multi-line HTML fragment ready to drop into `<head>`
|
|
57
|
+
*/
|
|
58
|
+
export declare function buildOgLocaleTags(currentLang: string): string;
|
|
59
|
+
//# sourceMappingURL=og-locales.d.ts.map
|