euparliamentmonitor 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/package.json +6 -2
  2. package/scripts/aggregator/article-metadata.js +69 -14
  3. package/scripts/aggregator/editorial-brief-resolver.js +23 -0
  4. package/scripts/aggregator/html/headline.d.ts +41 -9
  5. package/scripts/aggregator/html/headline.js +69 -10
  6. package/scripts/aggregator/html/shell.js +73 -17
  7. package/scripts/aggregator/manifest/index.d.ts +1 -1
  8. package/scripts/aggregator/manifest/index.js +1 -1
  9. package/scripts/aggregator/manifest/resolver.d.ts +28 -1
  10. package/scripts/aggregator/manifest/resolver.js +61 -5
  11. package/scripts/aggregator/markdown-renderer.js +11 -0
  12. package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
  13. package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
  14. package/scripts/aggregator/metadata/artifact-walker.js +29 -10
  15. package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
  16. package/scripts/aggregator/metadata/brief-body.js +69 -0
  17. package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
  18. package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
  19. package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
  20. package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
  21. package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
  22. package/scripts/aggregator/metadata/heading-rules.js +78 -269
  23. package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
  24. package/scripts/aggregator/metadata/keyword-filters.js +156 -0
  25. package/scripts/aggregator/metadata/lede-extractor.js +11 -2
  26. package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
  27. package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
  28. package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
  29. package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
  30. package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
  31. package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
  32. package/scripts/aggregator/metadata/seo-budgets.js +202 -0
  33. package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
  34. package/scripts/aggregator/metadata/text-truncate.js +277 -0
  35. package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
  36. package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
  37. package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
  38. package/scripts/aggregator/metadata/text-utils.js +119 -439
  39. package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
  40. package/scripts/aggregator/metadata/title-rejection.js +179 -0
  41. package/scripts/copy-vendor.js +84 -112
  42. package/scripts/dump-article-seo.js +640 -0
  43. package/scripts/fix-mermaid-diagrams.js +931 -0
  44. package/scripts/generators/news-indexes/backfill.d.ts +6 -1
  45. package/scripts/generators/news-indexes/backfill.js +71 -4
  46. package/scripts/validate-article-seo.js +534 -0
  47. package/scripts/validate-mermaid-diagrams.js +306 -0
@@ -2,209 +2,25 @@
2
2
  // SPDX-License-Identifier: Apache-2.0
3
3
  /**
4
4
  * @module Aggregator/Metadata/TextUtils
5
- * @description Pure text / Markdown utility helpers extracted from
6
- * `article-metadata.ts` as a leaf module in the `metadata/` bounded
7
- * context. Every helper here is concerned with **how to massage a
8
- * string** into a meta-tag-safe shape — strip Markdown decorations,
9
- * recognise banner / metadata rows that must never reach the
10
- * description, clamp text to byte budgets without producing broken
11
- * copy, and identify the first complete sentence in a prose paragraph.
5
+ * @description Pure text / Markdown classification + label-stripping
6
+ * helpers used by the metadata resolver chain. Constants live in
7
+ * `text-utils-constants.ts`; byte-budget truncators and sentence-
8
+ * extraction live in `text-truncate.ts`. This file re-exports the
9
+ * full public surface so existing call-sites keep working.
12
10
  *
13
- * Bounded-context rules for this file:
14
- * - **No upward imports** — pure helpers, no dependencies on other
15
- * `src/aggregator/` modules, no I/O, no globals.
16
- * - **Deterministic** — same input always produces same output; safe to
17
- * property-test.
11
+ * Bounded-context rules:
12
+ * - **No upward imports** — pure helpers, no I/O, no globals.
13
+ * - **Deterministic** — same input always produces same output.
18
14
  * - **Locale-agnostic** — every helper works on raw Markdown / prose
19
15
  * in any of the 14 publishing languages. Banner-row detection is
20
16
  * driven by structural shape (double-bold + pipe-separator), not by
21
17
  * a hard-coded English vocabulary.
22
- *
23
- * The companion file `article-metadata.ts` re-exports the public surface
24
- * for back-compat. New code should import directly from this module.
25
- */
26
- // ────────────────────────────────────────────────────────────────────────
27
- // Length budgets — meta description / title size envelopes
28
- // ────────────────────────────────────────────────────────────────────────
29
- /** Maximum `<meta description>` length we will emit. */
30
- export const DESCRIPTION_MAX_LENGTH = 180;
31
- /**
32
- * Maximum `og:description` / `twitter:description` length we will
33
- * emit. Facebook truncates at ~300 characters in the preview card;
34
- * Twitter at ~200. We aim for the longer cap so LinkedIn / Slack
35
- * (which use the full OG payload) get the full BLUF context, then
36
- * let Twitter clip naturally. Below this length the extended
37
- * description is emitted verbatim; above it we sentence-boundary
38
- * truncate the same way as {@link truncateDescription}.
39
- */
40
- export const EXTENDED_DESCRIPTION_MAX_LENGTH = 300;
41
- /** Target minimum extended-description length before we even emit it. */
42
- export const EXTENDED_DESCRIPTION_MIN_LENGTH = 200;
43
- /** Target minimum `<meta description>` length before we append context. */
44
- export const DESCRIPTION_MIN_LENGTH = 140;
45
- /**
46
- * Length below which a raw description is considered too short to stand
47
- * on its own and gets enriched with date/context. Independent from
48
- * {@link DESCRIPTION_MIN_LENGTH} (which controls sentence-boundary
49
- * truncation behaviour). Set lower than DESCRIPTION_MIN_LENGTH so a
50
- * clean 100-140 char prose lede is preserved verbatim instead of being
51
- * padded with date/context boilerplate.
52
- */
53
- export const ENRICHMENT_TRIGGER_LENGTH = 100;
54
- /** Maximum `<title>` length — anything longer is truncated with an ellipsis. */
55
- export const TITLE_MAX_LENGTH = 140;
56
- /**
57
- * Soft target for headline-style titles produced as a fallback from
58
- * BLUF/lede prose. When the candidate exceeds `TITLE_MAX_LENGTH`, the
59
- * truncator first looks for a natural clause boundary
60
- * (`.`, `:`, `—`, `;`) inside the `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]`
61
- * window and breaks there instead of mid-clause-with-ellipsis. This
62
- * turns a 137-character truncated prose paragraph into a complete
63
- * journalistic clause, which scans much better in news cards and SERP
64
- * snippets without sacrificing the keyword-rich opening.
65
- */
66
- export const HEADLINE_SOFT_MIN = 60;
67
- /**
68
- * Punctuation marks that signal a natural clause boundary inside a
69
- * BLUF / lede paragraph. Listed in preferred-break order: a colon or
70
- * em-dash that introduces a list of consequences is the best break,
71
- * full stops are next, and semicolons last. Single ASCII space is
72
- * always a fallback boundary handled separately.
73
- */
74
- export const HEADLINE_CLAUSE_BOUNDARIES = [': ', ' — ', ' – ', '. ', '; '];
75
- // ────────────────────────────────────────────────────────────────────────
76
- // Banner / metadata-row vocabularies
77
- // ────────────────────────────────────────────────────────────────────────
78
- /**
79
- * Emoji-banner prefixes that Stage-B agents use to decorate metadata rows
80
- * (e.g. `📋 Analysis Owner:`). Any line starting with one of these is
81
- * metadata, never prose.
82
- */
83
- export const EMOJI_BANNER_CHARS = [
84
- '📋',
85
- '📅',
86
- '🔍',
87
- '🏛',
88
- '📰',
89
- '📊',
90
- '🏷',
91
- '📈',
92
- '📉',
93
- '⚠',
94
- '🔔',
95
- '🎯',
96
- '🗳',
97
- '🏢',
98
- '📄',
99
- ];
100
- /**
101
- * Label prefixes that a prose description must never start with. Every
102
- * entry matches case-insensitively at the start of a trimmed line, followed
103
- * by optional space and a colon.
104
18
  */
105
- export const METADATA_LINE_PREFIXES = [
106
- 'Admiralty Grade',
107
- 'Analysis Date',
108
- 'Analysis Owner',
109
- 'Article Type',
110
- 'Article Window',
111
- 'Assessment Date',
112
- 'Briefing',
113
- 'Briefing Date',
114
- 'Classification',
115
- 'Classification Date',
116
- 'Confidence',
117
- 'Confidence in Evidence',
118
- 'Data Sources',
119
- 'Date',
120
- 'Document Type',
121
- 'Filing Date',
122
- 'Generated',
123
- 'Horizon',
124
- 'IMF Status',
125
- 'Last Updated',
126
- 'Parliamentary Status',
127
- 'Parliamentary Term',
128
- 'Period',
129
- 'Prepared',
130
- 'Purpose',
131
- 'Region',
132
- 'Reporting',
133
- 'Reporting Period',
134
- 'Reporting Window',
135
- 'Run',
136
- 'Run ID',
137
- 'Series',
138
- 'Series Run',
139
- 'Source',
140
- 'Sources',
141
- 'SPDX-FileCopyrightText',
142
- 'SPDX-License-Identifier',
143
- 'Topic',
144
- 'Type',
145
- // Bare `WEP:` (Words of Estimative Probability) lines appear in
146
- // `intelligence/synthesis-summary.md` between a KJ-N heading and its
147
- // prose body (e.g. `**WEP: ALMOST CERTAINLY (>95%)** | Admiralty: A1`).
148
- // The line is grade/confidence metadata, not editorial prose — without
149
- // this prefix it leaked into `<meta description>` as an all-caps shout
150
- // (run #26223932441, propositions 2026-05-21).
151
- 'WEP',
152
- 'WEP Band',
153
- 'WEP Grade',
154
- 'Window',
155
- ];
156
- // ────────────────────────────────────────────────────────────────────────
157
- // Trailing-cleanup vocabularies (used by truncation helpers)
158
- // ────────────────────────────────────────────────────────────────────────
159
- /** Connector / determiner words that read as broken copy when they are
160
- * the final token before a truncation ellipsis. */
161
- export const TRAILING_STOP_WORDS = new Set([
162
- 'the',
163
- 'a',
164
- 'an',
165
- 'of',
166
- 'to',
167
- 'for',
168
- 'in',
169
- 'on',
170
- 'at',
171
- 'by',
172
- 'and',
173
- 'or',
174
- 'with',
175
- 'from',
176
- ]);
177
- /** Trailing characters we always strip before appending our own ellipsis,
178
- * so we never emit double-ellipsis or stray punctuation. */
179
- export const TRAILING_PUNCT = /[.,;:—\-…\s]/u;
180
- /**
181
- * Abbreviation tokens (lowercase, including the trailing period) that
182
- * should NOT count as sentence terminators when {@link extractFirstSentence}
183
- * scans for a `.` boundary. Single-letter all-caps initials
184
- * (`U.S.`, `E.U.`) are handled by the all-caps-initial check below.
185
- */
186
- export const ABBREVIATION_PREFIXES = [
187
- 'mr.',
188
- 'mrs.',
189
- 'ms.',
190
- 'dr.',
191
- 'st.',
192
- 'no.',
193
- 'vs.',
194
- 'e.g.',
195
- 'i.e.',
196
- 'etc.',
197
- 'cf.',
198
- 'al.',
199
- // EP fiscal-year and quarter shorthand: Q1., Q2., Q3., Q4., H1., H2., FY.
200
- 'q1.',
201
- 'q2.',
202
- 'q3.',
203
- 'q4.',
204
- 'h1.',
205
- 'h2.',
206
- 'fy.',
207
- ];
19
+ export { ABBREVIATION_PREFIXES, DESCRIPTION_MAX_LENGTH, DESCRIPTION_MIN_LENGTH, EMOJI_BANNER_CHARS, ENRICHMENT_TRIGGER_LENGTH, EXTENDED_DESCRIPTION_MAX_LENGTH, EXTENDED_DESCRIPTION_MIN_LENGTH, HEADLINE_CLAUSE_BOUNDARIES, HEADLINE_SOFT_MIN, METADATA_LINE_PREFIXES, TITLE_MAX_LENGTH, TRAILING_PUNCT, TRAILING_STOP_WORDS, } from './text-utils-constants.js';
20
+ export { extractFirstSentence, stripTrailingStopWordsAndPunctuation, truncateDescription, truncateExtendedDescription, truncateTitle, } from './text-truncate.js';
21
+ import { EMOJI_BANNER_CHARS, METADATA_LINE_PREFIXES } from './text-utils-constants.js';
22
+ const STRUCTURAL_LINE_PREFIXES = ['#', '>', '<', '|'];
23
+ const FENCE_LINE_PREFIXES = ['```', '~~~'];
208
24
  // ────────────────────────────────────────────────────────────────────────
209
25
  // Line-classification helpers
210
26
  // ────────────────────────────────────────────────────────────────────────
@@ -220,40 +36,75 @@ export const ABBREVIATION_PREFIXES = [
220
36
  export function shouldSkipDescriptionLine(line) {
221
37
  if (line.length === 0)
222
38
  return true;
223
- if (line.startsWith('#'))
224
- return true;
225
- if (line.startsWith('>'))
226
- return true;
227
- if (line.startsWith('<'))
228
- return true;
229
- if (line.startsWith('|'))
230
- return true;
231
- if (line.startsWith('---') || line.startsWith('==='))
232
- return true;
233
- if (line.startsWith('```') || line.startsWith('~~~'))
234
- return true;
235
- if (line.startsWith('%%'))
236
- return true;
237
- if (/^title\s/i.test(line))
238
- return true;
239
- if (EMOJI_BANNER_CHARS.some((char) => line.startsWith(char)))
240
- return true;
39
+ return DESCRIPTION_SKIP_CHECKS.some((check) => check(line));
40
+ }
41
+ const DESCRIPTION_SKIP_CHECKS = [
42
+ (line) => startsWithAny(line, STRUCTURAL_LINE_PREFIXES),
43
+ (line) => line.startsWith('---') || line.startsWith('==='),
44
+ (line) => startsWithAny(line, FENCE_LINE_PREFIXES),
45
+ (line) => line.startsWith('%%'),
46
+ (line) => /^title\s/i.test(line),
47
+ (line) => EMOJI_BANNER_CHARS.some((char) => line.startsWith(char)),
48
+ startsWithSeparatorFragment,
49
+ isStructuralListLeader,
50
+ startsWithContinuationConjunction,
51
+ hasTrailingEllipsis,
52
+ isPublishedBanner,
53
+ startsWithMetadataLabel,
54
+ (line) => /^[-*_=~.]{3,}$/.test(line),
55
+ isLocalizedBannerRow,
56
+ isPlainPipeBannerRow,
57
+ ];
58
+ function startsWithAny(line, prefixes) {
59
+ return prefixes.some((prefix) => line.startsWith(prefix));
60
+ }
61
+ function startsWithSeparatorFragment(line) {
62
+ return /^[:;,—–-]\s/u.test(line);
63
+ }
64
+ function isStructuralListLeader(line) {
65
+ return /^\(?[0-9]{1,2}[.):]\s/u.test(line) || /^\(?[a-z][.)]\s/iu.test(line);
66
+ }
67
+ function startsWithContinuationConjunction(line) {
68
+ return /^(that|which|while|whereas|and|but|for|yet|so|nor|or)\s/iu.test(line);
69
+ }
70
+ function hasTrailingEllipsis(line) {
71
+ return line.endsWith('…') || /\.{3,}$/u.test(line);
72
+ }
73
+ function isPublishedBanner(line) {
74
+ return /^published\s+\d{4}-\d{2}-\d{2}\b/iu.test(line);
75
+ }
76
+ function startsWithMetadataLabel(line) {
241
77
  const labelSource = line.replace(/^\*+/, '').replace(/^\*\*/, '').replace(/^_+/, '').trim();
242
- for (const prefix of METADATA_LINE_PREFIXES) {
243
- const lower = labelSource.toLowerCase();
78
+ const lower = labelSource.toLowerCase();
79
+ return METADATA_LINE_PREFIXES.some((prefix) => {
244
80
  const prefixLower = prefix.toLowerCase();
245
- if (lower.startsWith(`${prefixLower}:`) ||
81
+ return (lower.startsWith(`${prefixLower}:`) ||
246
82
  lower.startsWith(`${prefixLower} :`) ||
247
83
  lower.startsWith(`${prefixLower}**:`) ||
248
- lower.startsWith(`${prefixLower}*:`)) {
249
- return true;
250
- }
84
+ lower.startsWith(`${prefixLower}*:`));
85
+ });
86
+ }
87
+ /**
88
+ * Detect a plain (non-bold) pipe-delimited banner row of the shape
89
+ * `Tag: Value | Tag: Value | Tag: Value`. Requires three-or-more
90
+ * `|`-separated segments, at least two shaped `Word: …`, so legitimate prose containing
91
+ * a single colon (`The Commission's view: …`) is preserved.
92
+ *
93
+ * @param line - Trimmed source line
94
+ * @returns `true` when the line is a plain pipe-banner row
95
+ */
96
+ function isPlainPipeBannerRow(line) {
97
+ if (!line.includes('|'))
98
+ return false;
99
+ const segments = line.split('|').map((s) => s.trim());
100
+ if (segments.length < 3)
101
+ return false;
102
+ let labeledSegments = 0;
103
+ for (const seg of segments) {
104
+ if (/^[A-Z][\p{L}\p{M}\p{N}\- ]{1,30}[:\uFF1A]\s+\S/u.test(seg))
105
+ labeledSegments += 1;
251
106
  }
252
- if (/^[-*_=~.]{3,}$/.test(line))
253
- return true;
254
- if (isLocalizedBannerRow(line))
255
- return true;
256
- return false;
107
+ return labeledSegments >= 2;
257
108
  }
258
109
  /**
259
110
  * Language-agnostic banner-row detector. Stage-B artefacts open with a
@@ -311,6 +162,48 @@ export function stripLeadingProseLabel(line) {
311
162
  return line;
312
163
  return rest;
313
164
  }
165
+ /**
166
+ * Strip a leading `**Label:**` / `**Label：**` prefix from a Markdown
167
+ * BLUF line, in any of the 14 publishing languages. Translated
168
+ * executive briefs open the `## FOR IMMEDIATE ACTION` section with
169
+ * patterns such as `**Issue:** …`, `**Fråga:** …`, `**Asunto:** …`,
170
+ * `**主題：** …`, `**الموضوع:** …`, `**Thema:** …`, `**Sujet :** …` —
171
+ * without this stripper the localized label leaked into
172
+ * `<meta description>` for every non-English locale (the English
173
+ * `**Issue:**` line is already filtered by `METADATA_LINE_PREFIXES`).
174
+ *
175
+ * The matcher is *structural*, not vocabulary-driven: it accepts up to
176
+ * 5 word/glyph tokens (letters, marks, digits, spaces, hyphens),
177
+ * followed by either an ASCII colon `:` or full-width colon `：` (U+FF1A),
178
+ * followed by `**`, followed by whitespace. Returns the line verbatim
179
+ * when no qualifying opener is present so it is safe to apply
180
+ * unconditionally.
181
+ *
182
+ * @param raw - Raw Markdown line (still carrying `**…**` decorations)
183
+ * @returns Line with the leading `**Label:**` prefix removed, or the
184
+ * original input when no such prefix exists
185
+ */
186
+ export function stripLeadingBoldLabel(raw) {
187
+ // Allowed label characters: any Unicode letter, mark, digit, space, hyphen.
188
+ // 1–5 tokens (≤ 40 chars total) to avoid swallowing long inline-bold prose.
189
+ // Both `**Label:**` (colon inside the bold span) and `**Label**:` are
190
+ // observed in translations — match both shapes.
191
+ const pattern = /^\*\*([\p{L}\p{M}\p{N}][\p{L}\p{M}\p{N} -]{0,38})[:\uFF1A]\*\*\s+|^\*\*([\p{L}\p{M}\p{N}][\p{L}\p{M}\p{N} -]{0,38})\*\*\s*[:\uFF1A]\s+/u;
192
+ const match = pattern.exec(raw);
193
+ if (!match) {
194
+ // Defense in depth: even when no `**Label**` decoration is present,
195
+ // strip a residual orphan separator at the line start. Upstream
196
+ // strippers (e.g. {@link stripInlineMarkdown} applied after a
197
+ // partial bold-label removal) can leave `: rest of sentence…`
198
+ // shapes; we never want those leading punctuation glyphs to survive
199
+ // into the description or title.
200
+ return raw.replace(/^[:;—–-]\s+/u, '');
201
+ }
202
+ // After the bold-label match, also strip any *additional* residual
203
+ // separator that may follow (rare, but observed when authors write
204
+ // `**Issue**: : `).
205
+ return raw.slice(match[0].length).replace(/^[:;—–-]\s+/u, '');
206
+ }
314
207
  /**
315
208
  * Strip inline Markdown decorations so we can use the remaining text as
316
209
  * plain-text meta-tag content. Removes link syntax, emphasis, inline code
@@ -333,217 +226,4 @@ export function stripInlineMarkdown(raw) {
333
226
  .replace(/\s+/g, ' ')
334
227
  .trim();
335
228
  }
336
- // ────────────────────────────────────────────────────────────────────────
337
- // Truncation helpers
338
- // ────────────────────────────────────────────────────────────────────────
339
- /**
340
- * Repeatedly strip trailing stop-words (separated by a single space) and
341
- * trailing punctuation (including any pre-existing ellipsis). Implemented
342
- * imperatively to avoid super-linear regex backtracking on the
343
- * `(?:\s+stop-word)+$` pattern flagged by `security/detect-unsafe-regex`.
344
- *
345
- * @param input - Pre-clipped string to clean up
346
- * @returns Cleaned string with no trailing stop-words or punctuation
347
- */
348
- function stripTrailingStopWordsAndPunctuation(input) {
349
- let result = input;
350
- let changed = true;
351
- while (changed) {
352
- changed = false;
353
- while (result.length > 0 && TRAILING_PUNCT.test(result.charAt(result.length - 1))) {
354
- result = result.slice(0, -1);
355
- changed = true;
356
- }
357
- const lastSpace = result.lastIndexOf(' ');
358
- if (lastSpace >= 0) {
359
- const tail = result.slice(lastSpace + 1).toLowerCase();
360
- if (TRAILING_STOP_WORDS.has(tail)) {
361
- result = result.slice(0, lastSpace);
362
- changed = true;
363
- }
364
- }
365
- }
366
- return result;
367
- }
368
- /**
369
- * Clamp a string to `DESCRIPTION_MAX_LENGTH` characters, appending
370
- * an ellipsis when truncation actually happens. Does not break words if
371
- * avoidable — a trailing partial word is trimmed back to the previous
372
- * space first.
373
- *
374
- * @param text - Raw description text
375
- * @returns Truncated description with trailing ellipsis when clipped
376
- */
377
- export function truncateDescription(text) {
378
- if (text.length <= DESCRIPTION_MAX_LENGTH)
379
- return text;
380
- const cut = text.slice(0, DESCRIPTION_MAX_LENGTH - 1);
381
- // Prefer the last full sentence terminator within the cut so we don't
382
- // end on a dangling determiner ("…year. The"). Period/!/? followed by
383
- // a space marks a clean boundary. Only honour the boundary when it
384
- // sits past the soft minimum so we keep enough body text to be useful.
385
- const sentenceEnd = Math.max(cut.lastIndexOf('. '), cut.lastIndexOf('! '), cut.lastIndexOf('? '));
386
- if (sentenceEnd >= DESCRIPTION_MIN_LENGTH) {
387
- return cut.slice(0, sentenceEnd + 1).replace(/\s+$/, '');
388
- }
389
- const lastSpace = cut.lastIndexOf(' ');
390
- let safe = lastSpace > DESCRIPTION_MAX_LENGTH - 60 ? cut.slice(0, lastSpace) : cut;
391
- // Drop dangling stop-words and trailing punctuation/ellipsis so we
392
- // never emit broken copy ("…year. The" → "…year.") or double-ellipsis
393
- // ("The……") when the upstream input already carried an ellipsis.
394
- safe = stripTrailingStopWordsAndPunctuation(safe);
395
- return `${safe}…`;
396
- }
397
- /**
398
- * Clamp an extended description to {@link EXTENDED_DESCRIPTION_MAX_LENGTH}
399
- * characters using the same sentence-boundary-preserving logic as
400
- * {@link truncateDescription}. Returns `''` when the input is empty
401
- * or shorter than the meta-description maximum (no point in emitting
402
- * an "extended" description that's actually shorter than the regular
403
- * one).
404
- *
405
- * @param text - Raw extended-description text (e.g. full BLUF paragraph)
406
- * @returns Truncated extended description, or `''` when not worth emitting
407
- */
408
- export function truncateExtendedDescription(text) {
409
- const trimmed = text.trim();
410
- if (!trimmed)
411
- return '';
412
- // Don't emit an extended description that is shorter than the
413
- // short meta-description budget — there's no SEO win and it would
414
- // make `og:description` shorter than `<meta description>`.
415
- if (trimmed.length <= DESCRIPTION_MAX_LENGTH)
416
- return '';
417
- if (trimmed.length <= EXTENDED_DESCRIPTION_MAX_LENGTH)
418
- return trimmed;
419
- const cut = trimmed.slice(0, EXTENDED_DESCRIPTION_MAX_LENGTH - 1);
420
- const sentenceEnd = Math.max(cut.lastIndexOf('. '), cut.lastIndexOf('! '), cut.lastIndexOf('? '));
421
- if (sentenceEnd >= EXTENDED_DESCRIPTION_MIN_LENGTH) {
422
- return cut.slice(0, sentenceEnd + 1).replace(/\s+$/, '');
423
- }
424
- const lastSpace = cut.lastIndexOf(' ');
425
- let safe = lastSpace > EXTENDED_DESCRIPTION_MAX_LENGTH - 60 ? cut.slice(0, lastSpace) : cut;
426
- safe = stripTrailingStopWordsAndPunctuation(safe);
427
- return `${safe}…`;
428
- }
429
- /**
430
- * Clamp a title to `TITLE_MAX_LENGTH` characters in the same
431
- * word-boundary-preserving fashion as {@link truncateDescription}.
432
- *
433
- * @param text - Raw title text
434
- * @returns Truncated title with trailing ellipsis when clipped
435
- */
436
- export function truncateTitle(text) {
437
- if (text.length <= TITLE_MAX_LENGTH)
438
- return text;
439
- // Prefer ending at a natural clause boundary inside the
440
- // `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]` window so the truncated
441
- // title reads as a complete journalistic clause rather than a
442
- // mid-sentence prose snippet. Iterate boundaries in priority order;
443
- // when a candidate falls in the window, break there and drop the
444
- // ellipsis since the result is grammatically complete.
445
- const search = text.slice(0, TITLE_MAX_LENGTH);
446
- for (const boundary of HEADLINE_CLAUSE_BOUNDARIES) {
447
- const idx = search.lastIndexOf(boundary);
448
- if (idx >= HEADLINE_SOFT_MIN) {
449
- const clean = stripTrailingStopWordsAndPunctuation(text.slice(0, idx));
450
- if (clean.length >= HEADLINE_SOFT_MIN)
451
- return clean;
452
- }
453
- }
454
- const cut = text.slice(0, TITLE_MAX_LENGTH - 1);
455
- const lastSpace = cut.lastIndexOf(' ');
456
- let safe = lastSpace > TITLE_MAX_LENGTH - 40 ? cut.slice(0, lastSpace) : cut;
457
- safe = stripTrailingStopWordsAndPunctuation(safe);
458
- return `${safe}…`;
459
- }
460
- // ────────────────────────────────────────────────────────────────────────
461
- // Sentence extraction
462
- // ────────────────────────────────────────────────────────────────────────
463
- /**
464
- * Return the first complete sentence from a prose paragraph, suitable
465
- * for use as a fallback editorial title when the artefact H1 is
466
- * categorical (e.g. `# EU Parliament Committee Reports`) and the
467
- * resolver must derive `<title>` from the BLUF / lede summary instead.
468
- *
469
- * A "sentence" is the prefix up to the first sentence-terminator
470
- * (`. `, `! `, `? `, `; `) inside the `[HEADLINE_SOFT_MIN,
471
- * TITLE_MAX_LENGTH]` window. Common abbreviations (`Q1.`, `Q2.`,
472
- * `H1.`, `H2.`, `Mr.`, `Mrs.`, `e.g.`, `i.e.`, `vs.`) are skipped
473
- * so they don't terminate the sentence prematurely. When no
474
- * acceptable terminator exists in the window, returns the entire
475
- * input unchanged so {@link truncateTitle} can handle clause-boundary
476
- * truncation downstream.
477
- *
478
- * This produces journalistically clean titles even for the
479
- * propositions / committee-reports cases where the BLUF paragraph
480
- * opens with a single long sentence that exceeds 140 chars —
481
- * `truncateTitle` then breaks on a clause boundary, and the result is
482
- * still grammatical because the input was a sentence prefix rather
483
- * than an arbitrary paragraph slice.
484
- *
485
- * @param paragraph - Prose paragraph (post-{@link stripInlineMarkdown})
486
- * @returns First sentence, or the original paragraph when none can be
487
- * identified within the soft-min window
488
- */
489
- export function extractFirstSentence(paragraph) {
490
- const trimmed = paragraph.trim();
491
- if (trimmed.length <= HEADLINE_SOFT_MIN)
492
- return trimmed;
493
- // Limit terminator search to TITLE_MAX_LENGTH * 1.5 — beyond that
494
- // we'd rather let truncateTitle clause-truncate the original
495
- // paragraph than return a too-long first sentence.
496
- const window = trimmed.slice(0, Math.floor(TITLE_MAX_LENGTH * 1.5));
497
- // Skip common abbreviations that contain a period inside a token
498
- // (Q1., e.g., i.e., vs., Mr., Mrs., No., U.S., E.U.). We walk
499
- // candidate terminator positions; a position counts only when the
500
- // char before it is *not* part of a known abbreviation token.
501
- const terminators = ['. ', '! ', '? ', '; '];
502
- let bestIdx = -1;
503
- for (const t of terminators) {
504
- let from = HEADLINE_SOFT_MIN;
505
- let idx;
506
- while ((idx = window.indexOf(t, from)) !== -1) {
507
- if (!isAbbreviationBoundary(window, idx) && idx < window.length - 1) {
508
- if (bestIdx === -1 || idx < bestIdx)
509
- bestIdx = idx;
510
- break;
511
- }
512
- from = idx + t.length;
513
- }
514
- }
515
- if (bestIdx >= HEADLINE_SOFT_MIN) {
516
- return trimmed.slice(0, bestIdx + 1).trim();
517
- }
518
- return trimmed;
519
- }
520
- /**
521
- * Check whether the character preceding the `.` at `idx` in `text`
522
- * indicates an abbreviation (so the `.` is not a sentence terminator).
523
- * Matches the {@link ABBREVIATION_PREFIXES} table and the all-caps
524
- * single-letter initials pattern (`U.S.`, `E.U.`).
525
- *
526
- * @param text - Source text (lowercased segment + original mixed-case)
527
- * @param idx - Index of the `.` character in `text`
528
- * @returns `true` when the period at `idx` is part of an abbreviation
529
- */
530
- function isAbbreviationBoundary(text, idx) {
531
- // All-caps single-letter initial like `U.S.` or `E.U.` — char at
532
- // idx-1 is a capital letter, and idx-2 is either start of string,
533
- // whitespace, or another single-letter+period pair.
534
- if (idx >= 1) {
535
- const prev = text.charCodeAt(idx - 1);
536
- const isUpperLetter = prev >= 65 && prev <= 90;
537
- if (isUpperLetter && (idx === 1 || text[idx - 2] === ' ' || text[idx - 2] === '.')) {
538
- return true;
539
- }
540
- }
541
- // ABBREVIATION_PREFIXES lookup — scan backwards from `.` to find the
542
- // start of the word, then compare lowercased.
543
- let start = idx;
544
- while (start > 0 && /[a-zA-Z]/u.test(text[start - 1] ?? ''))
545
- start--;
546
- const token = text.slice(start, idx + 1).toLowerCase();
547
- return ABBREVIATION_PREFIXES.includes(token);
548
- }
549
229
  //# sourceMappingURL=text-utils.js.map
@@ -0,0 +1,37 @@
1
+ /**
2
+ * `true` when the candidate is a bold-prose section header that
3
+ * leaked through the priority-finding extractor (e.g. `Strategic
4
+ * significance`, `Threat Level`).
5
+ *
6
+ * @param value - Title candidate
7
+ * @returns `true` when the candidate matches the section-header denylist.
8
+ */
9
+ export declare function looksLikeSectionHeader(value: string): boolean;
10
+ /**
11
+ * `true` when the candidate ends with `…` or `...` (was truncated
12
+ * over the title budget).
13
+ *
14
+ * @param value - Title candidate
15
+ * @returns `true` when the candidate has a trailing ellipsis.
16
+ */
17
+ export declare function looksLikeEllipsisCut(value: string): boolean;
18
+ /**
19
+ * `true` when the candidate is a bare adopted-text doc-ID.
20
+ *
21
+ * @param value - Title candidate
22
+ * @returns `true` when the candidate matches the `TA-NN-YYYY-NNNN` shape.
23
+ */
24
+ export declare function looksLikeDocId(value: string): boolean;
25
+ /**
26
+ * Master rejection predicate. Returns the reason code (one of
27
+ * `section-header`, `ellipsis-cut`, `doc-id`, `sentence-fragment`)
28
+ * when the candidate should be rejected, or `null` when it is
29
+ * usable.
30
+ *
31
+ * @param value - Title candidate
32
+ * @returns Reason code, or `null` when the candidate is usable.
33
+ */
34
+ export declare function findTitleRejectionReason(value: string): 'section-header' | 'ellipsis-cut' | 'doc-id' | 'sentence-fragment' | null;
35
+ /** Exposed for unit tests + the SEO validator. */
36
+ export declare const TITLE_REJECTION_DENYLIST: readonly string[];
37
+ //# sourceMappingURL=title-rejection.d.ts.map