euparliamentmonitor 0.9.13 → 0.9.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. package/package.json +6 -4
  2. package/scripts/aggregator/article-generator.js +2 -1
  3. package/scripts/aggregator/article-html.d.ts +9 -0
  4. package/scripts/aggregator/article-html.js +134 -13
  5. package/scripts/aggregator/article-metadata.d.ts +25 -161
  6. package/scripts/aggregator/article-metadata.js +71 -649
  7. package/scripts/aggregator/editorial-brief-resolver.d.ts +9 -0
  8. package/scripts/aggregator/editorial-brief-resolver.js +3 -1
  9. package/scripts/aggregator/metadata/date-labels.d.ts +122 -0
  10. package/scripts/aggregator/metadata/date-labels.js +209 -0
  11. package/scripts/aggregator/metadata/text-utils.d.ts +188 -0
  12. package/scripts/aggregator/metadata/text-utils.js +542 -0
  13. package/scripts/constants/og-locales.d.ts +15 -0
  14. package/scripts/constants/og-locales.js +17 -0
  15. package/scripts/constants/seo/index.d.ts +21 -0
  16. package/scripts/constants/seo/index.js +23 -0
  17. package/scripts/constants/seo/og-locales.d.ts +59 -0
  18. package/scripts/constants/seo/og-locales.js +59 -0
  19. package/scripts/constants/seo/social-handles.d.ts +50 -0
  20. package/scripts/constants/seo/social-handles.js +65 -0
  21. package/scripts/constants/social-handles.d.ts +11 -0
  22. package/scripts/constants/social-handles.js +13 -0
  23. package/scripts/discover-untranslated-briefs.js +224 -19
  24. package/scripts/generators/news-indexes.d.ts +35 -0
  25. package/scripts/generators/news-indexes.js +67 -6
  26. package/scripts/generators/political-intelligence/html.js +14 -6
  27. package/scripts/generators/seo-copy.js +42 -0
  28. package/scripts/generators/sitemap/html.js +13 -5
  29. package/scripts/lint-src-todos.js +124 -0
  30. package/scripts/utils/copy-test-reports.js +1 -1
  31. package/scripts/utils/generate-docs-index.js +1 -1
  32. package/scripts/validate-brief-translations.js +158 -18
@@ -0,0 +1,542 @@
1
+ // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ /**
4
+ * @module Aggregator/Metadata/TextUtils
5
+ * @description Pure text / Markdown utility helpers extracted from
6
+ * `article-metadata.ts` as a leaf module in the `metadata/` bounded
7
+ * context. Every helper here is concerned with **how to massage a
8
+ * string** into a meta-tag-safe shape — strip Markdown decorations,
9
+ * recognise banner / metadata rows that must never reach the
10
+ * description, clamp text to character-length budgets without producing broken
11
+ * copy, and identify the first complete sentence in a prose paragraph.
12
+ *
13
+ * Bounded-context rules for this file:
14
+ * - **No upward imports** — pure helpers, no dependencies on other
15
+ * `src/aggregator/` modules, no I/O, no globals.
16
+ * - **Deterministic** — same input always produces same output; safe to
17
+ * property-test.
18
+ * - **Locale-agnostic** — every helper works on raw Markdown / prose
19
+ * in any of the 14 publishing languages. Banner-row detection is
20
+ * driven by structural shape (double-bold + pipe-separator), not by
21
+ * a hard-coded English vocabulary.
22
+ *
23
+ * The companion file `article-metadata.ts` re-exports the public surface
24
+ * for back-compat. New code should import directly from this module.
25
+ */
26
+ // ────────────────────────────────────────────────────────────────────────
27
+ // Length budgets — meta description / title size envelopes
28
+ // ────────────────────────────────────────────────────────────────────────
29
/** Upper bound (in UTF-16 code units) for an emitted `<meta description>`. */
export const DESCRIPTION_MAX_LENGTH = 180;

/**
 * Upper bound for the extended (`og:description` / `twitter:description`)
 * text. Inputs at or below this length are emitted verbatim by
 * {@link truncateExtendedDescription}; longer inputs are clipped at a
 * sentence boundary the same way {@link truncateDescription} does.
 *
 * NOTE(review): the value follows the original author's rationale of
 * targeting the longer (~300 character) social preview-card limit and
 * letting shorter-limit consumers clip on their own — the platform limits
 * themselves are not verifiable from this module.
 */
export const EXTENDED_DESCRIPTION_MAX_LENGTH = 300;

/**
 * Earliest position at which a sentence terminator is accepted as the cut
 * point when clipping an extended description.
 */
export const EXTENDED_DESCRIPTION_MIN_LENGTH = 200;

/**
 * Earliest position at which a sentence terminator is accepted as the cut
 * point when clipping a regular `<meta description>`.
 */
export const DESCRIPTION_MIN_LENGTH = 140;

/**
 * Threshold below which a raw description is treated as too short to stand
 * alone and gets enriched with date/context by the caller. Deliberately
 * lower than {@link DESCRIPTION_MIN_LENGTH} so that a clean 100-140
 * character lede is kept verbatim rather than padded with boilerplate.
 * Not referenced inside this module.
 */
export const ENRICHMENT_TRIGGER_LENGTH = 100;

/** Upper bound for an emitted `<title>`; longer titles are clipped. */
export const TITLE_MAX_LENGTH = 140;

/**
 * Lower edge of the `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]` window in which
 * {@link truncateTitle} looks for a natural clause boundary before falling
 * back to a hard cut with an ellipsis. Breaking on a clause boundary yields
 * a complete clause instead of a mid-sentence snippet.
 */
export const HEADLINE_SOFT_MIN = 60;

/**
 * Clause-boundary markers, ordered from most to least preferred break:
 * colon, em-dash, en-dash, full stop, semicolon. Plain spaces are handled
 * separately by the truncation helpers as the last-resort boundary.
 */
export const HEADLINE_CLAUSE_BOUNDARIES = [
    ': ',
    ' — ',
    ' – ',
    '. ',
    '; ',
];
75
+ // ────────────────────────────────────────────────────────────────────────
76
+ // Banner / metadata-row vocabularies
77
+ // ────────────────────────────────────────────────────────────────────────
78
/**
 * Emoji that open decorated metadata rows (for example
 * `📋 Analysis Owner:`). {@link shouldSkipDescriptionLine} treats any line
 * beginning with one of these as metadata rather than prose.
 */
export const EMOJI_BANNER_CHARS = [
    '📋', '📅', '🔍', '🏛', '📰',
    '📊', '🏷', '📈', '📉', '⚠',
    '🔔', '🎯', '🗳', '🏢', '📄',
];

/**
 * English `Key:` labels that mark a line as a metadata banner. The
 * comparison in {@link shouldSkipDescriptionLine} is case-insensitive and
 * anchored at the start of the line (after leading emphasis markers), with
 * the label followed by a colon, optionally separated by one space or by
 * closing emphasis markers.
 */
export const METADATA_LINE_PREFIXES = [
    'Admiralty Grade', 'Analysis Date', 'Analysis Owner',
    'Article Type', 'Article Window', 'Assessment Date',
    'Briefing', 'Briefing Date',
    'Classification', 'Classification Date',
    'Confidence', 'Confidence in Evidence',
    'Data Sources', 'Date', 'Document Type',
    'Filing Date', 'Generated', 'Horizon',
    'IMF Status', 'Last Updated',
    'Parliamentary Status', 'Parliamentary Term',
    'Period', 'Prepared', 'Purpose', 'Region',
    'Reporting', 'Reporting Period', 'Reporting Window',
    'Run', 'Run ID',
    'Series', 'Series Run',
    'Source', 'Sources',
    'SPDX-FileCopyrightText', 'SPDX-License-Identifier',
    'Topic', 'Type',
    'WEP Band', 'WEP Grade',
    'Window',
];
149
+ // ────────────────────────────────────────────────────────────────────────
150
+ // Trailing-cleanup vocabularies (used by truncation helpers)
151
+ // ────────────────────────────────────────────────────────────────────────
152
/**
 * Connector and determiner words that must not be the last token left in
 * front of a truncation ellipsis. Compared in lowercase by
 * {@link stripTrailingStopWordsAndPunctuation}.
 */
export const TRAILING_STOP_WORDS = new Set([
    'the', 'a', 'an',
    'of', 'to', 'for', 'in', 'on', 'at', 'by',
    'and', 'or', 'with', 'from',
]);

/**
 * Single-character test for trailing characters that are removed before the
 * module appends its own ellipsis (punctuation, dashes, an existing
 * ellipsis, whitespace), so the output never ends in doubled punctuation.
 */
export const TRAILING_PUNCT = /[.,;:—\-…\s]/u;

/**
 * Lowercase abbreviation tokens, each including its trailing period, whose
 * final `.` is not to be read as a sentence terminator when
 * {@link extractFirstSentence} scans for a boundary. Capital-letter
 * initials such as `U.S.` are recognised separately by
 * `isAbbreviationBoundary`.
 */
export const ABBREVIATION_PREFIXES = [
    'mr.', 'mrs.', 'ms.', 'dr.', 'st.', 'no.', 'vs.',
    'e.g.', 'i.e.', 'etc.', 'cf.', 'al.',
    // Quarter / half-year / fiscal-year shorthand.
    'q1.', 'q2.', 'q3.', 'q4.', 'h1.', 'h2.', 'fy.',
];
201
+ // ────────────────────────────────────────────────────────────────────────
202
+ // Line-classification helpers
203
+ // ────────────────────────────────────────────────────────────────────────
204
/**
 * Return `true` when a line cannot serve as a prose description.
 *
 * A line is rejected when it is empty, when it opens with a Markdown / HTML
 * structural marker (heading, blockquote, tag, table row, rule, code fence),
 * a `%%` directive or a `title ` chart directive, when it opens with one of
 * the {@link EMOJI_BANNER_CHARS}, when it is a `Key: value` row whose key is
 * listed in {@link METADATA_LINE_PREFIXES}, when it consists only of rule
 * characters, or when `isLocalizedBannerRow` recognises it as a banner row.
 *
 * Metadata keys are matched case-insensitively after removing leading `*`
 * and `_` emphasis markers. The colon may follow the key directly, after a
 * single space, or after closing emphasis markers (`**:`, `*:`, `__:`,
 * `_:`), so `**Date:**`, `**Date**:`, `_Date:_` and `__Date__:` are all
 * recognised.
 *
 * @param line - Trimmed line from the aggregated Markdown source
 * @returns `true` when the line is not prose and should be skipped
 */
export function shouldSkipDescriptionLine(line) {
    if (line.length === 0)
        return true;
    const structuralOpeners = ['#', '>', '<', '|', '---', '===', '```', '~~~', '%%'];
    if (structuralOpeners.some((opener) => line.startsWith(opener)))
        return true;
    if (/^title\s/i.test(line))
        return true;
    if (EMOJI_BANNER_CHARS.some((char) => line.startsWith(char)))
        return true;
    // Normalise once, outside the prefix scan: drop leading emphasis
    // markers so `**Date:**` and `_Date:_` compare against the bare key.
    const label = line.replace(/^\*+/, '').replace(/^_+/, '').trim().toLowerCase();
    // What may sit between the key and the end of the `Key:` marker.
    const keyClosers = [':', ' :', '**:', '*:', '__:', '_:'];
    const isMetadataRow = METADATA_LINE_PREFIXES.some((prefix) => {
        const key = prefix.toLowerCase();
        return keyClosers.some((closer) => label.startsWith(`${key}${closer}`));
    });
    if (isMetadataRow)
        return true;
    // A line made only of rule / underline characters.
    if (/^[-*_=~.]{3,}$/.test(line))
        return true;
    return isLocalizedBannerRow(line);
}
251
/**
 * Language-agnostic banner-row detector.
 *
 * Metadata banners have the shape
 * `**Date:** 2026-05-15 | **Type:** Breaking | **Run:** breaking-run-001`.
 * Localized briefs use the same layout with translated keys, and some
 * (notably Japanese / Chinese / Korean) use the full-width colon U+FF1A
 * instead of the ASCII colon. Because {@link METADATA_LINE_PREFIXES} only
 * lists English keys, this helper looks at structure instead: the line must
 * start with `**`, contain a `|` separator, and carry at least two bold key
 * markers where the colon sits either inside the bold span (`**Key:**`) or
 * directly after it (`**Key**:`).
 *
 * The full-width colon is written as the escape `\uFF1A` so the pattern
 * survives editors and pipelines that normalise it to an ASCII colon.
 *
 * @param line - Trimmed source line
 * @returns `true` when the line is a banner row in any locale
 */
function isLocalizedBannerRow(line) {
    if (!line.startsWith('**') || !line.includes('|'))
        return false;
    const colonInsideBold = /\*\*[^*]+[:\uFF1A]\s*\*\*/g;
    const colonAfterBold = /\*\*[^*]+\*\*\s*[:\uFF1A]/g;
    const countMatches = (pattern) => (line.match(pattern) ?? []).length;
    return countMatches(colonInsideBold) + countMatches(colonAfterBold) >= 2;
}
278
/**
 * Remove a leading all-caps label such as `SITUATION:`, `KEY MOTION:`,
 * `BLUF:` or `TIER-1:` from a prose line, so the label does not end up at
 * the front of a description.
 *
 * The label is the text before the first `": "` in the line. It is removed
 * only when all of the following hold:
 * - it is 3 to 80 characters long;
 * - it starts with `A-Z` and continues with `A-Z`, digits, spaces or `-`;
 * - at least 20 characters of trimmed text remain after it.
 *
 * Otherwise the line is returned untouched.
 *
 * @param line - Plain prose line (post-{@link stripInlineMarkdown})
 * @returns The text after the label, or the original line when no label
 *   qualifies
 */
export function stripLeadingProseLabel(line) {
    const separatorAt = line.indexOf(': ');
    // The label's length equals the separator index, so one range check
    // covers both "no separator / too short" and "too long".
    if (separatorAt < 3 || separatorAt > 80)
        return line;
    const candidate = line.slice(0, separatorAt);
    const remainder = line.slice(separatorAt + 2).trim();
    const isAllCapsLabel = /^[A-Z][A-Z0-9 -]{1,79}$/.test(candidate);
    return isAllCapsLabel && remainder.length >= 20 ? remainder : line;
}
307
/**
 * Strip inline Markdown decorations so the remaining text can be used as
 * plain-text meta-tag content.
 *
 * Applied in order: images and links are reduced to their alt / link text,
 * then inline-code backticks, `**` / `__` strong markers, `*` / `_`
 * emphasis markers and `~~` strikethrough markers are removed, and finally
 * runs of whitespace collapse to a single space and the result is trimmed.
 *
 * Underscore markers are only treated as emphasis when they are not glued
 * to a letter or digit on the outside. Intraword underscores are left
 * alone, so snake_case identifiers (for example `get_adopted_texts`, also
 * after their backticks were removed) keep their underscores instead of
 * being collapsed into one word.
 *
 * @param raw - Trimmed Markdown line
 * @returns Plain-text variant
 */
export function stripInlineMarkdown(raw) {
    return raw
        .replace(/!\[([^\]\n]{0,500})\]\(([^)\n]{0,500})\)/g, '$1')
        .replace(/\[([^\]\n]{1,500})\]\(([^)\n]{0,500})\)/g, '$1')
        .replace(/`([^`\n]{1,500})`/g, '$1')
        .replace(/\*\*([^*\n]{1,500})\*\*/g, '$1')
        // `__strong__`, but not `snake__case__name`.
        .replace(/(?<![\p{L}\p{N}])__([^_\n]{1,500})__(?![\p{L}\p{N}])/gu, '$1')
        .replace(/\*([^*\n]{1,500})\*/g, '$1')
        // `_em_`, but not `snake_case_name`.
        .replace(/(?<![\p{L}\p{N}])_([^_\n]{1,500})_(?![\p{L}\p{N}])/gu, '$1')
        .replace(/~~([^~\n]{1,500})~~/g, '$1')
        .replace(/\s+/g, ' ')
        .trim();
}
329
+ // ────────────────────────────────────────────────────────────────────────
330
+ // Truncation helpers
331
+ // ────────────────────────────────────────────────────────────────────────
332
/**
 * Peel trailing punctuation and trailing stop-words off a clipped string
 * until neither is left at the end.
 *
 * Each pass first removes every trailing character accepted by
 * {@link TRAILING_PUNCT}, then removes the final space-separated token when
 * its lowercase form is in {@link TRAILING_STOP_WORDS}. Passes repeat until
 * one leaves the string unchanged. A string with no space is never reduced
 * by the stop-word step. Written as plain loops rather than a single
 * `(?:\s+stop-word)+$` regex, which the `security/detect-unsafe-regex` lint
 * rule flags for backtracking.
 *
 * @param input - Pre-clipped string to clean up
 * @returns Cleaned string with no trailing stop-words or punctuation
 */
function stripTrailingStopWordsAndPunctuation(input) {
    let current = input;
    for (;;) {
        const lengthBefore = current.length;
        // Step 1: walk the end index left across punctuation / whitespace.
        let end = current.length;
        while (end > 0 && TRAILING_PUNCT.test(current.charAt(end - 1))) {
            end -= 1;
        }
        current = current.slice(0, end);
        // Step 2: drop a dangling stop-word, together with its space.
        const gap = current.lastIndexOf(' ');
        if (gap >= 0 && TRAILING_STOP_WORDS.has(current.slice(gap + 1).toLowerCase())) {
            current = current.slice(0, gap);
        }
        // Both steps only ever shorten the string, so an unchanged length
        // means nothing more can be removed.
        if (current.length === lengthBefore) {
            return current;
        }
    }
}
361
/**
 * Clamp a description to at most {@link DESCRIPTION_MAX_LENGTH} UTF-16 code
 * units.
 *
 * Text within the limit is returned unchanged. Longer text is cut one unit
 * short of the limit and then shortened further, in order of preference:
 * 1. at the last `. `, `! ` or `? ` inside the cut, provided it lies at or
 *    beyond {@link DESCRIPTION_MIN_LENGTH} — the result ends on that
 *    terminator and gets no ellipsis;
 * 2. otherwise at the last space, when that space falls within the final
 *    60 units of the budget; otherwise at the raw cut. Trailing stop-words
 *    and punctuation are then removed and `…` is appended.
 *
 * A cut that lands between the two halves of a surrogate pair drops the
 * orphaned high surrogate, so the output never contains half of an
 * astral-plane character (emoji, rare CJK ideographs).
 *
 * @param text - Raw description text
 * @returns The original text, or a clipped variant
 */
export function truncateDescription(text) {
    if (text.length <= DESCRIPTION_MAX_LENGTH)
        return text;
    // A high surrogate left at the very end has lost its partner to the cut.
    const cut = text.slice(0, DESCRIPTION_MAX_LENGTH - 1).replace(/[\uD800-\uDBFF]$/, '');
    // Prefer closing on a full sentence, but only when enough body text
    // precedes it to make a useful description.
    const sentenceEnd = Math.max(cut.lastIndexOf('. '), cut.lastIndexOf('! '), cut.lastIndexOf('? '));
    if (sentenceEnd >= DESCRIPTION_MIN_LENGTH) {
        return cut.slice(0, sentenceEnd + 1);
    }
    // Back off to a word boundary only when that does not cost too much text.
    const lastSpace = cut.lastIndexOf(' ');
    const wordSafe = lastSpace > DESCRIPTION_MAX_LENGTH - 60 ? cut.slice(0, lastSpace) : cut;
    // Drop dangling stop-words / punctuation so the ellipsis never follows
    // broken copy or an ellipsis that was already in the input.
    return `${stripTrailingStopWordsAndPunctuation(wordSafe)}…`;
}
390
/**
 * Clamp an extended description to at most
 * {@link EXTENDED_DESCRIPTION_MAX_LENGTH} UTF-16 code units.
 *
 * Returns `''` when the trimmed input is empty or no longer than
 * {@link DESCRIPTION_MAX_LENGTH} — an "extended" description that is not
 * longer than the regular one is not worth emitting. Input within the
 * extended limit is returned trimmed. Longer input is cut one unit short of
 * the limit and shortened like {@link truncateDescription}: at the last
 * `. ` / `! ` / `? ` at or beyond {@link EXTENDED_DESCRIPTION_MIN_LENGTH}
 * (no ellipsis), otherwise at a late-enough space or the raw cut, cleaned
 * of trailing stop-words / punctuation and suffixed with `…`.
 *
 * A cut that lands between the two halves of a surrogate pair drops the
 * orphaned high surrogate.
 *
 * @param text - Raw extended-description text (e.g. full BLUF paragraph)
 * @returns Truncated extended description, or `''` when not worth emitting
 */
export function truncateExtendedDescription(text) {
    const trimmed = text.trim();
    if (trimmed.length <= DESCRIPTION_MAX_LENGTH)
        return '';
    if (trimmed.length <= EXTENDED_DESCRIPTION_MAX_LENGTH)
        return trimmed;
    // A high surrogate left at the very end has lost its partner to the cut.
    const cut = trimmed
        .slice(0, EXTENDED_DESCRIPTION_MAX_LENGTH - 1)
        .replace(/[\uD800-\uDBFF]$/, '');
    const sentenceEnd = Math.max(cut.lastIndexOf('. '), cut.lastIndexOf('! '), cut.lastIndexOf('? '));
    if (sentenceEnd >= EXTENDED_DESCRIPTION_MIN_LENGTH) {
        return cut.slice(0, sentenceEnd + 1);
    }
    const lastSpace = cut.lastIndexOf(' ');
    const wordSafe = lastSpace > EXTENDED_DESCRIPTION_MAX_LENGTH - 60 ? cut.slice(0, lastSpace) : cut;
    return `${stripTrailingStopWordsAndPunctuation(wordSafe)}…`;
}
422
/**
 * Clamp a title to at most {@link TITLE_MAX_LENGTH} UTF-16 code units.
 *
 * Titles within the limit are returned unchanged. For longer titles the
 * function first tries each marker in {@link HEADLINE_CLAUSE_BOUNDARIES},
 * in priority order: the last occurrence inside the first
 * `TITLE_MAX_LENGTH` units is used when it lies at or beyond
 * {@link HEADLINE_SOFT_MIN} and the text before it, once cleaned of
 * trailing stop-words / punctuation, is still at least `HEADLINE_SOFT_MIN`
 * long. That result is a complete clause and gets no ellipsis.
 *
 * When no marker qualifies, the title is cut one unit short of the limit,
 * backed off to the last space if that space falls within the final 40
 * units of the budget, cleaned, and suffixed with `…`. A cut that lands
 * between the two halves of a surrogate pair drops the orphaned high
 * surrogate.
 *
 * @param text - Raw title text
 * @returns The original title, a clause-boundary prefix, or a clipped
 *   variant ending in an ellipsis
 */
export function truncateTitle(text) {
    if (text.length <= TITLE_MAX_LENGTH)
        return text;
    const search = text.slice(0, TITLE_MAX_LENGTH);
    for (const boundary of HEADLINE_CLAUSE_BOUNDARIES) {
        const idx = search.lastIndexOf(boundary);
        if (idx < HEADLINE_SOFT_MIN)
            continue;
        const clause = stripTrailingStopWordsAndPunctuation(text.slice(0, idx));
        if (clause.length >= HEADLINE_SOFT_MIN)
            return clause;
    }
    // Hard cut. A high surrogate left at the very end has lost its partner.
    const cut = text.slice(0, TITLE_MAX_LENGTH - 1).replace(/[\uD800-\uDBFF]$/, '');
    const lastSpace = cut.lastIndexOf(' ');
    const wordSafe = lastSpace > TITLE_MAX_LENGTH - 40 ? cut.slice(0, lastSpace) : cut;
    return `${stripTrailingStopWordsAndPunctuation(wordSafe)}…`;
}
453
+ // ────────────────────────────────────────────────────────────────────────
454
+ // Sentence extraction
455
+ // ────────────────────────────────────────────────────────────────────────
456
/**
 * Return the first complete sentence of a prose paragraph, for use as a
 * fallback editorial title when the title has to be derived from the
 * BLUF / lede summary.
 *
 * Paragraphs no longer than {@link HEADLINE_SOFT_MIN} (after trimming) are
 * returned whole. Otherwise the function looks for the earliest `. `,
 * `! `, `? ` or `; ` located at or after `HEADLINE_SOFT_MIN` and within the
 * first `TITLE_MAX_LENGTH * 1.5` units, and returns the text up to and
 * including that terminator character. A `. ` that
 * `isAbbreviationBoundary` attributes to an abbreviation or initial is
 * skipped. The abbreviation test is applied to periods only, so a `?`,
 * `!` or `;` that follows a single capital letter ("Plan B?") still ends
 * the sentence.
 *
 * When no terminator qualifies, the trimmed paragraph is returned unchanged
 * so that {@link truncateTitle} can clause-truncate it downstream.
 *
 * @param paragraph - Prose paragraph (post-{@link stripInlineMarkdown})
 * @returns First sentence, or the trimmed paragraph when none can be
 *   identified within the search window
 */
export function extractFirstSentence(paragraph) {
    const trimmed = paragraph.trim();
    if (trimmed.length <= HEADLINE_SOFT_MIN)
        return trimmed;
    // Beyond 1.5x the title budget a "first sentence" is too long to be
    // useful; leave such paragraphs to truncateTitle.
    const scanWindow = trimmed.slice(0, Math.floor(TITLE_MAX_LENGTH * 1.5));
    let bestIdx = -1;
    for (const terminator of ['. ', '! ', '? ', '; ']) {
        const isPeriod = terminator.startsWith('.');
        let from = HEADLINE_SOFT_MIN;
        let idx;
        while ((idx = scanWindow.indexOf(terminator, from)) !== -1) {
            // Only a period can belong to an abbreviation.
            const insideAbbreviation = isPeriod && isAbbreviationBoundary(scanWindow, idx);
            if (!insideAbbreviation) {
                if (bestIdx === -1 || idx < bestIdx)
                    bestIdx = idx;
                break;
            }
            from = idx + terminator.length;
        }
    }
    // Keep the terminator character itself, drop the space after it.
    return bestIdx === -1 ? trimmed : trimmed.slice(0, bestIdx + 1).trim();
}
513
/**
 * Decide whether the `.` at `idx` belongs to an abbreviation or initial
 * rather than ending a sentence.
 *
 * Returns `true` in two cases:
 * - the period follows a single capital ASCII letter that itself sits at
 *   the start of the text, after a space, or after another period
 *   (`U.S.`, `E.U.`, `J. Smith`);
 * - the text up to and including the period ends, case-insensitively, with
 *   an entry of {@link ABBREVIATION_PREFIXES} that is not preceded by a
 *   letter. Matching whole entries (instead of scanning back over letters
 *   only) lets entries containing digits or interior periods — `q1.`,
 *   `h2.`, `e.g.`, `i.e.` — be recognised.
 *
 * Returns `false` when the character at `idx` is not a period, so callers
 * may pass the position of any terminator.
 *
 * @param text - Source text
 * @param idx - Index of the candidate `.` character in `text`
 * @returns `true` when the period at `idx` is part of an abbreviation
 */
function isAbbreviationBoundary(text, idx) {
    if (text[idx] !== '.')
        return false;
    // Capital-letter initial: `X.` at start of text, after a space, or
    // chained after another period.
    if (idx >= 1) {
        const prev = text.charCodeAt(idx - 1);
        const isUpperLetter = prev >= 65 && prev <= 90;
        if (isUpperLetter && (idx === 1 || text[idx - 2] === ' ' || text[idx - 2] === '.')) {
            return true;
        }
    }
    // Table lookup: the text must end with the abbreviation, and the
    // abbreviation must start a token (no letter directly in front of it).
    const head = text.slice(0, idx + 1).toLowerCase();
    return ABBREVIATION_PREFIXES.some((abbreviation) => {
        if (!head.endsWith(abbreviation))
            return false;
        const before = head[head.length - abbreviation.length - 1];
        return before === undefined || !/[a-z]/.test(before);
    });
}
542
+ //# sourceMappingURL=text-utils.js.map
@@ -0,0 +1,15 @@
1
+ /**
2
+ * @module Constants/OgLocales
3
+ * @description Backward-compatible re-export shim. The canonical
4
+ * location is `src/constants/seo/og-locales.ts`; this file remains so
5
+ * existing imports `from '../constants/og-locales.js'` keep working
6
+ * through the May-2026 architecture refactor.
7
+ *
8
+ * New code SHOULD import from `src/constants/seo/index.js`:
9
+ *
10
+ * ```ts
11
+ * import { OG_LOCALES, getOgLocale, buildOgLocaleTags } from '../constants/seo/index.js';
12
+ * ```
13
+ */
14
+ export { OG_LOCALES, getOgLocale, buildOgLocaleTags } from './seo/og-locales.js';
15
+ //# sourceMappingURL=og-locales.d.ts.map
@@ -0,0 +1,17 @@
1
+ // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ /**
4
+ * @module Constants/OgLocales
5
+ * @description Backward-compatible re-export shim. The canonical
6
+ * location is `src/constants/seo/og-locales.ts`; this file remains so
7
+ * existing imports `from '../constants/og-locales.js'` keep working
8
+ * through the May-2026 architecture refactor.
9
+ *
10
+ * New code SHOULD import from `src/constants/seo/index.js`:
11
+ *
12
+ * ```ts
13
+ * import { OG_LOCALES, getOgLocale, buildOgLocaleTags } from '../constants/seo/index.js';
14
+ * ```
15
+ */
16
+ export { OG_LOCALES, getOgLocale, buildOgLocaleTags } from './seo/og-locales.js';
17
+ //# sourceMappingURL=og-locales.js.map
@@ -0,0 +1,21 @@
1
+ /**
2
+ * @module Constants/Seo
3
+ * @description Bounded-context barrel for the SEO-header constants
4
+ * shared by the four public HTML surfaces (news article, news index,
5
+ * sitemap, political-intelligence landing).
6
+ *
7
+ * Public API:
8
+ * - `OG_LOCALES`, `getOgLocale`, `buildOgLocaleTags` — BCP-47 locale
9
+ * mapping and tag emitters for the OpenGraph `og:locale[:alternate]`
10
+ * block.
11
+ * - `ORG_SAME_AS`, `TWITTER_SITE_HANDLE`, `TWITTER_CREATOR_HANDLE`,
12
+ * `buildTwitterAttributionTags` — canonical publisher/handles.
13
+ *
14
+ * Consumers MUST import from `src/constants/seo/index.js` (this
15
+ * barrel), not from the individual files inside `src/constants/seo/`.
16
+ * The drift-guard unit test in `test/unit/bounded-contexts.test.js`
17
+ * enforces this contract.
18
+ */
19
+ export { OG_LOCALES, getOgLocale, buildOgLocaleTags } from './og-locales.js';
20
+ export { TWITTER_SITE_HANDLE, TWITTER_CREATOR_HANDLE, ORG_SAME_AS, buildTwitterAttributionTags, } from './social-handles.js';
21
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1,23 @@
1
+ // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ /**
4
+ * @module Constants/Seo
5
+ * @description Bounded-context barrel for the SEO-header constants
6
+ * shared by the four public HTML surfaces (news article, news index,
7
+ * sitemap, political-intelligence landing).
8
+ *
9
+ * Public API:
10
+ * - `OG_LOCALES`, `getOgLocale`, `buildOgLocaleTags` — BCP-47 locale
11
+ * mapping and tag emitters for the OpenGraph `og:locale[:alternate]`
12
+ * block.
13
+ * - `ORG_SAME_AS`, `TWITTER_SITE_HANDLE`, `TWITTER_CREATOR_HANDLE`,
14
+ * `buildTwitterAttributionTags` — canonical publisher/handles.
15
+ *
16
+ * Consumers MUST import from `src/constants/seo/index.js` (this
17
+ * barrel), not from the individual files inside `src/constants/seo/`.
18
+ * The drift-guard unit test in `test/unit/bounded-contexts.test.js`
19
+ * enforces this contract.
20
+ */
21
+ export { OG_LOCALES, getOgLocale, buildOgLocaleTags } from './og-locales.js';
22
+ export { TWITTER_SITE_HANDLE, TWITTER_CREATOR_HANDLE, ORG_SAME_AS, buildTwitterAttributionTags, } from './social-handles.js';
23
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1,59 @@
1
+ /**
2
+ * @module Constants/OgLocales
3
+ * @description BCP-47 OpenGraph locale codes (`en_GB`, `sv_SE`, …) for
4
+ * the 14 supported languages, plus helpers to emit the canonical
5
+ * `og:locale` and 13 `og:locale:alternate` meta tags on every page.
6
+ *
7
+ * **Why this matters.** OpenGraph (Facebook, LinkedIn, Slack, Discord)
8
+ * and Twitter/X expect `og:locale` to follow the BCP-47 underscore
9
+ * form (`language_TERRITORY`). The ISO 639-1 bare code (`en`, `sv`)
10
+ * that the rest of the site uses internally is accepted by Google's
11
+ * `inLanguage` schema but breaks social-card locale routing — Facebook
12
+ * silently falls back to `en_US` and serves the English preview to
13
+ * Swedish/German/Arabic users.
14
+ *
15
+ * **Choice of region tag.** Where a language has an obvious primary EU
16
+ * jurisdiction we use it (`sv_SE`, `de_DE`, `fr_FR`, `es_ES`, `nl_NL`).
17
+ * For Arabic, Hebrew, Japanese, Korean, Chinese, Norwegian, Danish,
18
+ * Finnish we pick the canonical CLDR/ISO 3166 region. English is
19
+ * `en_GB` (not `en_US`) — the editorial voice targets EU readers.
20
+ *
21
+ * The mapping table is the **single source of truth** for the social-
22
+ * card surface and is consumed by every HTML generator in
23
+ * `src/generators/` plus `src/aggregator/article-html.ts`. Tests
24
+ * assert byte-equivalent output across the four surfaces.
25
+ */
26
+ import type { LanguageCode } from '../../types/index.js';
27
+ /**
28
+ * BCP-47 OpenGraph locale code per supported language.
29
+ *
30
+ * The values follow `<language>_<TERRITORY>` (underscore-separated)
31
+ * as required by the OpenGraph protocol. Use the helpers below rather
32
+ * than reading the map directly so the locale logic stays in one
33
+ * place.
34
+ */
35
+ export declare const OG_LOCALES: Readonly<Record<LanguageCode, string>>;
36
+ /**
37
+ * Return the BCP-47 locale code for a given ISO 639-1 language code.
38
+ * Falls back to `en_GB` for unknown languages — the same fallback the
39
+ * rest of the site uses for missing translations.
40
+ *
41
+ * @param lang - ISO 639-1 language code (e.g., `"en"`, `"sv"`)
42
+ * @returns BCP-47 `language_TERRITORY` locale (e.g., `"en_GB"`)
43
+ */
44
+ export declare function getOgLocale(lang: string): string;
45
+ /**
46
+ * Build the OpenGraph locale meta tag block — one canonical
47
+ * `og:locale` for the current language plus an `og:locale:alternate`
48
+ * for every other supported language. Emitting the alternates lets the
49
+ * Facebook/LinkedIn crawler discover the localized siblings without
50
+ * having to follow the `<link rel="alternate" hreflang>` chain.
51
+ *
52
+ * The output is intentionally indented with two spaces to match the
53
+ * surrounding `<head>` formatting in the four generators.
54
+ *
55
+ * @param currentLang - Language being rendered (drives `og:locale`)
56
+ * @returns Multi-line HTML fragment ready to drop into `<head>`
57
+ */
58
+ export declare function buildOgLocaleTags(currentLang: string): string;
59
+ //# sourceMappingURL=og-locales.d.ts.map