euparliamentmonitor 0.9.21 → 0.9.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -2
- package/scripts/aggregator/article-metadata.js +69 -14
- package/scripts/aggregator/editorial-brief-resolver.js +23 -0
- package/scripts/aggregator/html/headline.d.ts +41 -9
- package/scripts/aggregator/html/headline.js +69 -10
- package/scripts/aggregator/html/shell.js +73 -17
- package/scripts/aggregator/manifest/index.d.ts +1 -1
- package/scripts/aggregator/manifest/index.js +1 -1
- package/scripts/aggregator/manifest/resolver.d.ts +28 -1
- package/scripts/aggregator/manifest/resolver.js +61 -5
- package/scripts/aggregator/markdown-renderer.js +11 -0
- package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
- package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
- package/scripts/aggregator/metadata/artifact-walker.js +29 -10
- package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
- package/scripts/aggregator/metadata/brief-body.js +69 -0
- package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
- package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
- package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
- package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
- package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
- package/scripts/aggregator/metadata/heading-rules.js +78 -269
- package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
- package/scripts/aggregator/metadata/keyword-filters.js +156 -0
- package/scripts/aggregator/metadata/lede-extractor.js +11 -2
- package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
- package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
- package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
- package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
- package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
- package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
- package/scripts/aggregator/metadata/seo-budgets.js +202 -0
- package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
- package/scripts/aggregator/metadata/text-truncate.js +277 -0
- package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
- package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
- package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
- package/scripts/aggregator/metadata/text-utils.js +119 -439
- package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
- package/scripts/aggregator/metadata/title-rejection.js +179 -0
- package/scripts/copy-vendor.js +84 -112
- package/scripts/dump-article-seo.js +640 -0
- package/scripts/fix-mermaid-diagrams.js +931 -0
- package/scripts/generators/news-indexes/backfill.d.ts +6 -1
- package/scripts/generators/news-indexes/backfill.js +71 -4
- package/scripts/validate-article-seo.js +534 -0
- package/scripts/validate-mermaid-diagrams.js +306 -0
|
@@ -2,209 +2,25 @@
|
|
|
2
2
|
// SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
/**
|
|
4
4
|
* @module Aggregator/Metadata/TextUtils
|
|
5
|
-
* @description Pure text / Markdown
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
* description, clamp text to byte budgets without producing broken
|
|
11
|
-
* copy, and identify the first complete sentence in a prose paragraph.
|
|
5
|
+
* @description Pure text / Markdown classification + label-stripping
|
|
6
|
+
* helpers used by the metadata resolver chain. Constants live in
|
|
7
|
+
* `text-utils-constants.ts`; byte-budget truncators and sentence-
|
|
8
|
+
* extraction live in `text-truncate.ts`. This file re-exports the
|
|
9
|
+
* full public surface so existing call-sites keep working.
|
|
12
10
|
*
|
|
13
|
-
* Bounded-context rules
|
|
14
|
-
* - **No upward imports** — pure helpers, no
|
|
15
|
-
*
|
|
16
|
-
* - **Deterministic** — same input always produces same output; safe to
|
|
17
|
-
* property-test.
|
|
11
|
+
* Bounded-context rules:
|
|
12
|
+
* - **No upward imports** — pure helpers, no I/O, no globals.
|
|
13
|
+
* - **Deterministic** — same input always produces same output.
|
|
18
14
|
* - **Locale-agnostic** — every helper works on raw Markdown / prose
|
|
19
15
|
* in any of the 14 publishing languages. Banner-row detection is
|
|
20
16
|
* driven by structural shape (double-bold + pipe-separator), not by
|
|
21
17
|
* a hard-coded English vocabulary.
|
|
22
|
-
*
|
|
23
|
-
* The companion file `article-metadata.ts` re-exports the public surface
|
|
24
|
-
* for back-compat. New code should import directly from this module.
|
|
25
|
-
*/
|
|
26
|
-
// ────────────────────────────────────────────────────────────────────────
|
|
27
|
-
// Length budgets — meta description / title size envelopes
|
|
28
|
-
// ────────────────────────────────────────────────────────────────────────
|
|
29
|
-
/** Maximum `<meta description>` length we will emit. */
|
|
30
|
-
export const DESCRIPTION_MAX_LENGTH = 180;
|
|
31
|
-
/**
|
|
32
|
-
* Maximum `og:description` / `twitter:description` length we will
|
|
33
|
-
* emit. Facebook truncates at ~300 characters in the preview card;
|
|
34
|
-
* Twitter at ~200. We aim for the longer cap so LinkedIn / Slack
|
|
35
|
-
* (which use the full OG payload) get the full BLUF context, then
|
|
36
|
-
* let Twitter clip naturally. Below this length the extended
|
|
37
|
-
* description is emitted verbatim; above it we sentence-boundary
|
|
38
|
-
* truncate the same way as {@link truncateDescription}.
|
|
39
|
-
*/
|
|
40
|
-
export const EXTENDED_DESCRIPTION_MAX_LENGTH = 300;
|
|
41
|
-
/** Target minimum extended-description length before we even emit it. */
|
|
42
|
-
export const EXTENDED_DESCRIPTION_MIN_LENGTH = 200;
|
|
43
|
-
/** Target minimum `<meta description>` length before we append context. */
|
|
44
|
-
export const DESCRIPTION_MIN_LENGTH = 140;
|
|
45
|
-
/**
|
|
46
|
-
* Length below which a raw description is considered too short to stand
|
|
47
|
-
* on its own and gets enriched with date/context. Independent from
|
|
48
|
-
* {@link DESCRIPTION_MIN_LENGTH} (which controls sentence-boundary
|
|
49
|
-
* truncation behaviour). Set lower than DESCRIPTION_MIN_LENGTH so a
|
|
50
|
-
* clean 100-140 char prose lede is preserved verbatim instead of being
|
|
51
|
-
* padded with date/context boilerplate.
|
|
52
|
-
*/
|
|
53
|
-
export const ENRICHMENT_TRIGGER_LENGTH = 100;
|
|
54
|
-
/** Maximum `<title>` length — anything longer is truncated with an ellipsis. */
|
|
55
|
-
export const TITLE_MAX_LENGTH = 140;
|
|
56
|
-
/**
|
|
57
|
-
* Soft target for headline-style titles produced as a fallback from
|
|
58
|
-
* BLUF/lede prose. When the candidate exceeds `TITLE_MAX_LENGTH`, the
|
|
59
|
-
* truncator first looks for a natural clause boundary
|
|
60
|
-
* (`.`, `:`, `—`, `;`) inside the `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]`
|
|
61
|
-
* window and breaks there instead of mid-clause-with-ellipsis. This
|
|
62
|
-
* turns a 137-character truncated prose paragraph into a complete
|
|
63
|
-
* journalistic clause, which scans much better in news cards and SERP
|
|
64
|
-
* snippets without sacrificing the keyword-rich opening.
|
|
65
|
-
*/
|
|
66
|
-
export const HEADLINE_SOFT_MIN = 60;
|
|
67
|
-
/**
|
|
68
|
-
* Punctuation marks that signal a natural clause boundary inside a
|
|
69
|
-
* BLUF / lede paragraph. Listed in preferred-break order: a colon or
|
|
70
|
-
* em-dash that introduces a list of consequences is the best break,
|
|
71
|
-
* full stops are next, and semicolons last. Single ASCII space is
|
|
72
|
-
* always a fallback boundary handled separately.
|
|
73
|
-
*/
|
|
74
|
-
export const HEADLINE_CLAUSE_BOUNDARIES = [': ', ' — ', ' – ', '. ', '; '];
|
|
75
|
-
// ────────────────────────────────────────────────────────────────────────
|
|
76
|
-
// Banner / metadata-row vocabularies
|
|
77
|
-
// ────────────────────────────────────────────────────────────────────────
|
|
78
|
-
/**
|
|
79
|
-
* Emoji-banner prefixes that Stage-B agents use to decorate metadata rows
|
|
80
|
-
* (e.g. `📋 Analysis Owner:`). Any line starting with one of these is
|
|
81
|
-
* metadata, never prose.
|
|
82
|
-
*/
|
|
83
|
-
export const EMOJI_BANNER_CHARS = [
|
|
84
|
-
'📋',
|
|
85
|
-
'📅',
|
|
86
|
-
'🔍',
|
|
87
|
-
'🏛',
|
|
88
|
-
'📰',
|
|
89
|
-
'📊',
|
|
90
|
-
'🏷',
|
|
91
|
-
'📈',
|
|
92
|
-
'📉',
|
|
93
|
-
'⚠',
|
|
94
|
-
'🔔',
|
|
95
|
-
'🎯',
|
|
96
|
-
'🗳',
|
|
97
|
-
'🏢',
|
|
98
|
-
'📄',
|
|
99
|
-
];
|
|
100
|
-
/**
|
|
101
|
-
* Label prefixes that a prose description must never start with. Every
|
|
102
|
-
* entry matches case-insensitively at the start of a trimmed line, followed
|
|
103
|
-
* by optional space and a colon.
|
|
104
18
|
*/
|
|
105
|
-
export
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
'Article Window',
|
|
111
|
-
'Assessment Date',
|
|
112
|
-
'Briefing',
|
|
113
|
-
'Briefing Date',
|
|
114
|
-
'Classification',
|
|
115
|
-
'Classification Date',
|
|
116
|
-
'Confidence',
|
|
117
|
-
'Confidence in Evidence',
|
|
118
|
-
'Data Sources',
|
|
119
|
-
'Date',
|
|
120
|
-
'Document Type',
|
|
121
|
-
'Filing Date',
|
|
122
|
-
'Generated',
|
|
123
|
-
'Horizon',
|
|
124
|
-
'IMF Status',
|
|
125
|
-
'Last Updated',
|
|
126
|
-
'Parliamentary Status',
|
|
127
|
-
'Parliamentary Term',
|
|
128
|
-
'Period',
|
|
129
|
-
'Prepared',
|
|
130
|
-
'Purpose',
|
|
131
|
-
'Region',
|
|
132
|
-
'Reporting',
|
|
133
|
-
'Reporting Period',
|
|
134
|
-
'Reporting Window',
|
|
135
|
-
'Run',
|
|
136
|
-
'Run ID',
|
|
137
|
-
'Series',
|
|
138
|
-
'Series Run',
|
|
139
|
-
'Source',
|
|
140
|
-
'Sources',
|
|
141
|
-
'SPDX-FileCopyrightText',
|
|
142
|
-
'SPDX-License-Identifier',
|
|
143
|
-
'Topic',
|
|
144
|
-
'Type',
|
|
145
|
-
// Bare `WEP:` (Words of Estimative Probability) lines appear in
|
|
146
|
-
// `intelligence/synthesis-summary.md` between a KJ-N heading and its
|
|
147
|
-
// prose body (e.g. `**WEP: ALMOST CERTAINLY (>95%)** | Admiralty: A1`).
|
|
148
|
-
// The line is grade/confidence metadata, not editorial prose — without
|
|
149
|
-
// this prefix it leaked into `<meta description>` as an all-caps shout
|
|
150
|
-
// (run #26223932441, propositions 2026-05-21).
|
|
151
|
-
'WEP',
|
|
152
|
-
'WEP Band',
|
|
153
|
-
'WEP Grade',
|
|
154
|
-
'Window',
|
|
155
|
-
];
|
|
156
|
-
// ────────────────────────────────────────────────────────────────────────
|
|
157
|
-
// Trailing-cleanup vocabularies (used by truncation helpers)
|
|
158
|
-
// ────────────────────────────────────────────────────────────────────────
|
|
159
|
-
/** Connector / determiner words that read as broken copy when they are
|
|
160
|
-
* the final token before a truncation ellipsis. */
|
|
161
|
-
export const TRAILING_STOP_WORDS = new Set([
|
|
162
|
-
'the',
|
|
163
|
-
'a',
|
|
164
|
-
'an',
|
|
165
|
-
'of',
|
|
166
|
-
'to',
|
|
167
|
-
'for',
|
|
168
|
-
'in',
|
|
169
|
-
'on',
|
|
170
|
-
'at',
|
|
171
|
-
'by',
|
|
172
|
-
'and',
|
|
173
|
-
'or',
|
|
174
|
-
'with',
|
|
175
|
-
'from',
|
|
176
|
-
]);
|
|
177
|
-
/** Trailing characters we always strip before appending our own ellipsis,
|
|
178
|
-
* so we never emit double-ellipsis or stray punctuation. */
|
|
179
|
-
export const TRAILING_PUNCT = /[.,;:—\-…\s]/u;
|
|
180
|
-
/**
|
|
181
|
-
* Abbreviation tokens (lowercase, including the trailing period) that
|
|
182
|
-
* should NOT count as sentence terminators when {@link extractFirstSentence}
|
|
183
|
-
* scans for a `.` boundary. Single-letter all-caps initials
|
|
184
|
-
* (`U.S.`, `E.U.`) are handled by the all-caps-initial check below.
|
|
185
|
-
*/
|
|
186
|
-
export const ABBREVIATION_PREFIXES = [
|
|
187
|
-
'mr.',
|
|
188
|
-
'mrs.',
|
|
189
|
-
'ms.',
|
|
190
|
-
'dr.',
|
|
191
|
-
'st.',
|
|
192
|
-
'no.',
|
|
193
|
-
'vs.',
|
|
194
|
-
'e.g.',
|
|
195
|
-
'i.e.',
|
|
196
|
-
'etc.',
|
|
197
|
-
'cf.',
|
|
198
|
-
'al.',
|
|
199
|
-
// EP fiscal-year and quarter shorthand: Q1., Q2., Q3., Q4., H1., H2., FY.
|
|
200
|
-
'q1.',
|
|
201
|
-
'q2.',
|
|
202
|
-
'q3.',
|
|
203
|
-
'q4.',
|
|
204
|
-
'h1.',
|
|
205
|
-
'h2.',
|
|
206
|
-
'fy.',
|
|
207
|
-
];
|
|
19
|
+
export { ABBREVIATION_PREFIXES, DESCRIPTION_MAX_LENGTH, DESCRIPTION_MIN_LENGTH, EMOJI_BANNER_CHARS, ENRICHMENT_TRIGGER_LENGTH, EXTENDED_DESCRIPTION_MAX_LENGTH, EXTENDED_DESCRIPTION_MIN_LENGTH, HEADLINE_CLAUSE_BOUNDARIES, HEADLINE_SOFT_MIN, METADATA_LINE_PREFIXES, TITLE_MAX_LENGTH, TRAILING_PUNCT, TRAILING_STOP_WORDS, } from './text-utils-constants.js';
|
|
20
|
+
export { extractFirstSentence, stripTrailingStopWordsAndPunctuation, truncateDescription, truncateExtendedDescription, truncateTitle, } from './text-truncate.js';
|
|
21
|
+
import { EMOJI_BANNER_CHARS, METADATA_LINE_PREFIXES } from './text-utils-constants.js';
|
|
22
|
+
const STRUCTURAL_LINE_PREFIXES = ['#', '>', '<', '|'];
|
|
23
|
+
const FENCE_LINE_PREFIXES = ['```', '~~~'];
|
|
208
24
|
// ────────────────────────────────────────────────────────────────────────
|
|
209
25
|
// Line-classification helpers
|
|
210
26
|
// ────────────────────────────────────────────────────────────────────────
|
|
@@ -220,40 +36,75 @@ export const ABBREVIATION_PREFIXES = [
|
|
|
220
36
|
export function shouldSkipDescriptionLine(line) {
|
|
221
37
|
if (line.length === 0)
|
|
222
38
|
return true;
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
39
|
+
return DESCRIPTION_SKIP_CHECKS.some((check) => check(line));
|
|
40
|
+
}
|
|
41
|
+
const DESCRIPTION_SKIP_CHECKS = [
|
|
42
|
+
(line) => startsWithAny(line, STRUCTURAL_LINE_PREFIXES),
|
|
43
|
+
(line) => line.startsWith('---') || line.startsWith('==='),
|
|
44
|
+
(line) => startsWithAny(line, FENCE_LINE_PREFIXES),
|
|
45
|
+
(line) => line.startsWith('%%'),
|
|
46
|
+
(line) => /^title\s/i.test(line),
|
|
47
|
+
(line) => EMOJI_BANNER_CHARS.some((char) => line.startsWith(char)),
|
|
48
|
+
startsWithSeparatorFragment,
|
|
49
|
+
isStructuralListLeader,
|
|
50
|
+
startsWithContinuationConjunction,
|
|
51
|
+
hasTrailingEllipsis,
|
|
52
|
+
isPublishedBanner,
|
|
53
|
+
startsWithMetadataLabel,
|
|
54
|
+
(line) => /^[-*_=~.]{3,}$/.test(line),
|
|
55
|
+
isLocalizedBannerRow,
|
|
56
|
+
isPlainPipeBannerRow,
|
|
57
|
+
];
|
|
58
|
+
function startsWithAny(line, prefixes) {
|
|
59
|
+
return prefixes.some((prefix) => line.startsWith(prefix));
|
|
60
|
+
}
|
|
61
|
+
function startsWithSeparatorFragment(line) {
|
|
62
|
+
return /^[:;,—–-]\s/u.test(line);
|
|
63
|
+
}
|
|
64
|
+
function isStructuralListLeader(line) {
|
|
65
|
+
return /^\(?[0-9]{1,2}[.):]\s/u.test(line) || /^\(?[a-z][.)]\s/iu.test(line);
|
|
66
|
+
}
|
|
67
|
+
function startsWithContinuationConjunction(line) {
|
|
68
|
+
return /^(that|which|while|whereas|and|but|for|yet|so|nor|or)\s/iu.test(line);
|
|
69
|
+
}
|
|
70
|
+
function hasTrailingEllipsis(line) {
|
|
71
|
+
return line.endsWith('…') || /\.{3,}$/u.test(line);
|
|
72
|
+
}
|
|
73
|
+
function isPublishedBanner(line) {
|
|
74
|
+
return /^published\s+\d{4}-\d{2}-\d{2}\b/iu.test(line);
|
|
75
|
+
}
|
|
76
|
+
function startsWithMetadataLabel(line) {
|
|
241
77
|
const labelSource = line.replace(/^\*+/, '').replace(/^\*\*/, '').replace(/^_+/, '').trim();
|
|
242
|
-
|
|
243
|
-
|
|
78
|
+
const lower = labelSource.toLowerCase();
|
|
79
|
+
return METADATA_LINE_PREFIXES.some((prefix) => {
|
|
244
80
|
const prefixLower = prefix.toLowerCase();
|
|
245
|
-
|
|
81
|
+
return (lower.startsWith(`${prefixLower}:`) ||
|
|
246
82
|
lower.startsWith(`${prefixLower} :`) ||
|
|
247
83
|
lower.startsWith(`${prefixLower}**:`) ||
|
|
248
|
-
lower.startsWith(`${prefixLower}*:`))
|
|
249
|
-
|
|
250
|
-
|
|
84
|
+
lower.startsWith(`${prefixLower}*:`));
|
|
85
|
+
});
|
|
86
|
+
}
|
|
87
|
+
/**
|
|
88
|
+
* Detect a plain (non-bold) pipe-delimited banner row of the shape
|
|
89
|
+
* `Tag: Value | Tag: Value | Tag: Value`. Matches three-or-more
|
|
90
|
+
* `Word: …` segments separated by ` | ` so legitimate prose containing
|
|
91
|
+
* a single colon (`The Commission's view: …`) is preserved.
|
|
92
|
+
*
|
|
93
|
+
* @param line - Trimmed source line
|
|
94
|
+
* @returns `true` when the line is a plain pipe-banner row
|
|
95
|
+
*/
|
|
96
|
+
function isPlainPipeBannerRow(line) {
|
|
97
|
+
if (!line.includes('|'))
|
|
98
|
+
return false;
|
|
99
|
+
const segments = line.split('|').map((s) => s.trim());
|
|
100
|
+
if (segments.length < 3)
|
|
101
|
+
return false;
|
|
102
|
+
let labeledSegments = 0;
|
|
103
|
+
for (const seg of segments) {
|
|
104
|
+
if (/^[A-Z][\p{L}\p{M}\p{N}\- ]{1,30}[::]\s+\S/u.test(seg))
|
|
105
|
+
labeledSegments += 1;
|
|
251
106
|
}
|
|
252
|
-
|
|
253
|
-
return true;
|
|
254
|
-
if (isLocalizedBannerRow(line))
|
|
255
|
-
return true;
|
|
256
|
-
return false;
|
|
107
|
+
return labeledSegments >= 2;
|
|
257
108
|
}
|
|
258
109
|
/**
|
|
259
110
|
* Language-agnostic banner-row detector. Stage-B artefacts open with a
|
|
@@ -311,6 +162,48 @@ export function stripLeadingProseLabel(line) {
|
|
|
311
162
|
return line;
|
|
312
163
|
return rest;
|
|
313
164
|
}
|
|
165
|
+
/**
|
|
166
|
+
* Strip a leading `**Label:**` / `**Label：**` prefix from a Markdown
|
|
167
|
+
* BLUF line, in any of the 14 publishing languages. Translated
|
|
168
|
+
* executive briefs open the `## FOR IMMEDIATE ACTION` section with
|
|
169
|
+
* patterns such as `**Issue:** …`, `**Fråga:** …`, `**Asunto:** …`,
|
|
170
|
+
* `**主題:** …`, `**الموضوع:** …`, `**Thema:** …`, `**Sujet :** …` —
|
|
171
|
+
* without this stripper the localized label leaked into
|
|
172
|
+
* `<meta description>` for every non-English locale (the English
|
|
173
|
+
* `**Issue:**` line is already filtered by `METADATA_LINE_PREFIXES`).
|
|
174
|
+
*
|
|
175
|
+
* The matcher is *structural*, not vocabulary-driven: it accepts up to
|
|
176
|
+
* 5 word/glyph tokens (letters, marks, digits, spaces, hyphens),
|
|
177
|
+
* followed by either an ASCII colon `:` or full-width colon `：`,
|
|
178
|
+
* followed by `**`, followed by whitespace. Returns the line verbatim
|
|
179
|
+
* when no qualifying opener is present so it is safe to apply
|
|
180
|
+
* unconditionally.
|
|
181
|
+
*
|
|
182
|
+
* @param raw - Raw Markdown line (still carrying `**…**` decorations)
|
|
183
|
+
* @returns Line with the leading `**Label:**` prefix removed, or the
|
|
184
|
+
* original input when no such prefix exists
|
|
185
|
+
*/
|
|
186
|
+
export function stripLeadingBoldLabel(raw) {
|
|
187
|
+
// Allowed label characters: any Unicode letter, mark, digit, space, hyphen.
|
|
188
|
+
// 1–5 tokens (≤ 40 chars total) to avoid swallowing long inline-bold prose.
|
|
189
|
+
// Both `**Label:**` (colon inside the bold span) and `**Label**:` are
|
|
190
|
+
// observed in translations — match both shapes.
|
|
191
|
+
const pattern = /^\*\*([\p{L}\p{M}\p{N}][\p{L}\p{M}\p{N} -]{0,38})[::]\*\*\s+|^\*\*([\p{L}\p{M}\p{N}][\p{L}\p{M}\p{N} -]{0,38})\*\*\s*[::]\s+/u;
|
|
192
|
+
const match = pattern.exec(raw);
|
|
193
|
+
if (!match) {
|
|
194
|
+
// Defense in depth: even when no `**Label**` decoration is present,
|
|
195
|
+
// strip a residual orphan separator at the line start. Upstream
|
|
196
|
+
// strippers (e.g. {@link stripInlineMarkdown} applied after a
|
|
197
|
+
// partial bold-label removal) can leave `: rest of sentence…`
|
|
198
|
+
// shapes; we never want those leading punctuation glyphs to survive
|
|
199
|
+
// into the description or title.
|
|
200
|
+
return raw.replace(/^[:;—–-]\s+/u, '');
|
|
201
|
+
}
|
|
202
|
+
// After the bold-label match, also strip any *additional* residual
|
|
203
|
+
// separator that may follow (rare, but observed when authors write
|
|
204
|
+
// `**Issue**: : `).
|
|
205
|
+
return raw.slice(match[0].length).replace(/^[:;—–-]\s+/u, '');
|
|
206
|
+
}
|
|
314
207
|
/**
|
|
315
208
|
* Strip inline Markdown decorations so we can use the remaining text as
|
|
316
209
|
* plain-text meta-tag content. Removes link syntax, emphasis, inline code
|
|
@@ -333,217 +226,4 @@ export function stripInlineMarkdown(raw) {
|
|
|
333
226
|
.replace(/\s+/g, ' ')
|
|
334
227
|
.trim();
|
|
335
228
|
}
|
|
336
|
-
// ────────────────────────────────────────────────────────────────────────
|
|
337
|
-
// Truncation helpers
|
|
338
|
-
// ────────────────────────────────────────────────────────────────────────
|
|
339
|
-
/**
|
|
340
|
-
* Repeatedly strip trailing stop-words (separated by a single space) and
|
|
341
|
-
* trailing punctuation (including any pre-existing ellipsis). Implemented
|
|
342
|
-
* imperatively to avoid super-linear regex backtracking on the
|
|
343
|
-
* `(?:\s+stop-word)+$` pattern flagged by `security/detect-unsafe-regex`.
|
|
344
|
-
*
|
|
345
|
-
* @param input - Pre-clipped string to clean up
|
|
346
|
-
* @returns Cleaned string with no trailing stop-words or punctuation
|
|
347
|
-
*/
|
|
348
|
-
function stripTrailingStopWordsAndPunctuation(input) {
|
|
349
|
-
let result = input;
|
|
350
|
-
let changed = true;
|
|
351
|
-
while (changed) {
|
|
352
|
-
changed = false;
|
|
353
|
-
while (result.length > 0 && TRAILING_PUNCT.test(result.charAt(result.length - 1))) {
|
|
354
|
-
result = result.slice(0, -1);
|
|
355
|
-
changed = true;
|
|
356
|
-
}
|
|
357
|
-
const lastSpace = result.lastIndexOf(' ');
|
|
358
|
-
if (lastSpace >= 0) {
|
|
359
|
-
const tail = result.slice(lastSpace + 1).toLowerCase();
|
|
360
|
-
if (TRAILING_STOP_WORDS.has(tail)) {
|
|
361
|
-
result = result.slice(0, lastSpace);
|
|
362
|
-
changed = true;
|
|
363
|
-
}
|
|
364
|
-
}
|
|
365
|
-
}
|
|
366
|
-
return result;
|
|
367
|
-
}
|
|
368
|
-
/**
|
|
369
|
-
* Clamp a string to `DESCRIPTION_MAX_LENGTH` characters, appending
|
|
370
|
-
* an ellipsis when truncation actually happens. Does not break words if
|
|
371
|
-
* avoidable — a trailing partial word is trimmed back to the previous
|
|
372
|
-
* space first.
|
|
373
|
-
*
|
|
374
|
-
* @param text - Raw description text
|
|
375
|
-
* @returns Truncated description with trailing ellipsis when clipped
|
|
376
|
-
*/
|
|
377
|
-
export function truncateDescription(text) {
|
|
378
|
-
if (text.length <= DESCRIPTION_MAX_LENGTH)
|
|
379
|
-
return text;
|
|
380
|
-
const cut = text.slice(0, DESCRIPTION_MAX_LENGTH - 1);
|
|
381
|
-
// Prefer the last full sentence terminator within the cut so we don't
|
|
382
|
-
// end on a dangling determiner ("…year. The"). Period/!/? followed by
|
|
383
|
-
// a space marks a clean boundary. Only honour the boundary when it
|
|
384
|
-
// sits past the soft minimum so we keep enough body text to be useful.
|
|
385
|
-
const sentenceEnd = Math.max(cut.lastIndexOf('. '), cut.lastIndexOf('! '), cut.lastIndexOf('? '));
|
|
386
|
-
if (sentenceEnd >= DESCRIPTION_MIN_LENGTH) {
|
|
387
|
-
return cut.slice(0, sentenceEnd + 1).replace(/\s+$/, '');
|
|
388
|
-
}
|
|
389
|
-
const lastSpace = cut.lastIndexOf(' ');
|
|
390
|
-
let safe = lastSpace > DESCRIPTION_MAX_LENGTH - 60 ? cut.slice(0, lastSpace) : cut;
|
|
391
|
-
// Drop dangling stop-words and trailing punctuation/ellipsis so we
|
|
392
|
-
// never emit broken copy ("…year. The" → "…year.") or double-ellipsis
|
|
393
|
-
// ("The……") when the upstream input already carried an ellipsis.
|
|
394
|
-
safe = stripTrailingStopWordsAndPunctuation(safe);
|
|
395
|
-
return `${safe}…`;
|
|
396
|
-
}
|
|
397
|
-
/**
|
|
398
|
-
* Clamp an extended description to {@link EXTENDED_DESCRIPTION_MAX_LENGTH}
|
|
399
|
-
* characters using the same sentence-boundary-preserving logic as
|
|
400
|
-
* {@link truncateDescription}. Returns `''` when the input is empty
|
|
401
|
-
* or shorter than the meta-description maximum (no point in emitting
|
|
402
|
-
* an "extended" description that's actually shorter than the regular
|
|
403
|
-
* one).
|
|
404
|
-
*
|
|
405
|
-
* @param text - Raw extended-description text (e.g. full BLUF paragraph)
|
|
406
|
-
* @returns Truncated extended description, or `''` when not worth emitting
|
|
407
|
-
*/
|
|
408
|
-
export function truncateExtendedDescription(text) {
|
|
409
|
-
const trimmed = text.trim();
|
|
410
|
-
if (!trimmed)
|
|
411
|
-
return '';
|
|
412
|
-
// Don't emit an extended description that is shorter than the
|
|
413
|
-
// short meta-description budget — there's no SEO win and it would
|
|
414
|
-
// make `og:description` shorter than `<meta description>`.
|
|
415
|
-
if (trimmed.length <= DESCRIPTION_MAX_LENGTH)
|
|
416
|
-
return '';
|
|
417
|
-
if (trimmed.length <= EXTENDED_DESCRIPTION_MAX_LENGTH)
|
|
418
|
-
return trimmed;
|
|
419
|
-
const cut = trimmed.slice(0, EXTENDED_DESCRIPTION_MAX_LENGTH - 1);
|
|
420
|
-
const sentenceEnd = Math.max(cut.lastIndexOf('. '), cut.lastIndexOf('! '), cut.lastIndexOf('? '));
|
|
421
|
-
if (sentenceEnd >= EXTENDED_DESCRIPTION_MIN_LENGTH) {
|
|
422
|
-
return cut.slice(0, sentenceEnd + 1).replace(/\s+$/, '');
|
|
423
|
-
}
|
|
424
|
-
const lastSpace = cut.lastIndexOf(' ');
|
|
425
|
-
let safe = lastSpace > EXTENDED_DESCRIPTION_MAX_LENGTH - 60 ? cut.slice(0, lastSpace) : cut;
|
|
426
|
-
safe = stripTrailingStopWordsAndPunctuation(safe);
|
|
427
|
-
return `${safe}…`;
|
|
428
|
-
}
|
|
429
|
-
/**
|
|
430
|
-
* Clamp a title to `TITLE_MAX_LENGTH` characters in the same
|
|
431
|
-
* word-boundary-preserving fashion as {@link truncateDescription}.
|
|
432
|
-
*
|
|
433
|
-
* @param text - Raw title text
|
|
434
|
-
* @returns Truncated title with trailing ellipsis when clipped
|
|
435
|
-
*/
|
|
436
|
-
export function truncateTitle(text) {
|
|
437
|
-
if (text.length <= TITLE_MAX_LENGTH)
|
|
438
|
-
return text;
|
|
439
|
-
// Prefer ending at a natural clause boundary inside the
|
|
440
|
-
// `[HEADLINE_SOFT_MIN, TITLE_MAX_LENGTH]` window so the truncated
|
|
441
|
-
// title reads as a complete journalistic clause rather than a
|
|
442
|
-
// mid-sentence prose snippet. Iterate boundaries in priority order;
|
|
443
|
-
// when a candidate falls in the window, break there and drop the
|
|
444
|
-
// ellipsis since the result is grammatically complete.
|
|
445
|
-
const search = text.slice(0, TITLE_MAX_LENGTH);
|
|
446
|
-
for (const boundary of HEADLINE_CLAUSE_BOUNDARIES) {
|
|
447
|
-
const idx = search.lastIndexOf(boundary);
|
|
448
|
-
if (idx >= HEADLINE_SOFT_MIN) {
|
|
449
|
-
const clean = stripTrailingStopWordsAndPunctuation(text.slice(0, idx));
|
|
450
|
-
if (clean.length >= HEADLINE_SOFT_MIN)
|
|
451
|
-
return clean;
|
|
452
|
-
}
|
|
453
|
-
}
|
|
454
|
-
const cut = text.slice(0, TITLE_MAX_LENGTH - 1);
|
|
455
|
-
const lastSpace = cut.lastIndexOf(' ');
|
|
456
|
-
let safe = lastSpace > TITLE_MAX_LENGTH - 40 ? cut.slice(0, lastSpace) : cut;
|
|
457
|
-
safe = stripTrailingStopWordsAndPunctuation(safe);
|
|
458
|
-
return `${safe}…`;
|
|
459
|
-
}
|
|
460
|
-
// ────────────────────────────────────────────────────────────────────────
|
|
461
|
-
// Sentence extraction
|
|
462
|
-
// ────────────────────────────────────────────────────────────────────────
|
|
463
|
-
/**
|
|
464
|
-
* Return the first complete sentence from a prose paragraph, suitable
|
|
465
|
-
* for use as a fallback editorial title when the artefact H1 is
|
|
466
|
-
* categorical (e.g. `# EU Parliament Committee Reports`) and the
|
|
467
|
-
* resolver must derive `<title>` from the BLUF / lede summary instead.
|
|
468
|
-
*
|
|
469
|
-
* A "sentence" is the prefix up to the first sentence-terminator
|
|
470
|
-
* (`. `, `! `, `? `, `; `) inside the `[HEADLINE_SOFT_MIN,
|
|
471
|
-
* TITLE_MAX_LENGTH]` window. Common abbreviations (`Q1.`, `Q2.`,
|
|
472
|
-
* `H1.`, `H2.`, `Mr.`, `Mrs.`, `e.g.`, `i.e.`, `vs.`) are skipped
|
|
473
|
-
* so they don't terminate the sentence prematurely. When no
|
|
474
|
-
* acceptable terminator exists in the window, returns the entire
|
|
475
|
-
* input unchanged so {@link truncateTitle} can handle clause-boundary
|
|
476
|
-
* truncation downstream.
|
|
477
|
-
*
|
|
478
|
-
* This produces journalistically clean titles even for the
|
|
479
|
-
* propositions / committee-reports cases where the BLUF paragraph
|
|
480
|
-
* opens with a single long sentence that exceeds 140 chars —
|
|
481
|
-
* `truncateTitle` then breaks on a clause boundary, and the result is
|
|
482
|
-
* still grammatical because the input was a sentence prefix rather
|
|
483
|
-
* than an arbitrary paragraph slice.
|
|
484
|
-
*
|
|
485
|
-
* @param paragraph - Prose paragraph (post-{@link stripInlineMarkdown})
|
|
486
|
-
* @returns First sentence, or the original paragraph when none can be
|
|
487
|
-
* identified within the soft-min window
|
|
488
|
-
*/
|
|
489
|
-
export function extractFirstSentence(paragraph) {
|
|
490
|
-
const trimmed = paragraph.trim();
|
|
491
|
-
if (trimmed.length <= HEADLINE_SOFT_MIN)
|
|
492
|
-
return trimmed;
|
|
493
|
-
// Limit terminator search to TITLE_MAX_LENGTH * 1.5 — beyond that
|
|
494
|
-
// we'd rather let truncateTitle clause-truncate the original
|
|
495
|
-
// paragraph than return a too-long first sentence.
|
|
496
|
-
const window = trimmed.slice(0, Math.floor(TITLE_MAX_LENGTH * 1.5));
|
|
497
|
-
// Skip common abbreviations that contain a period inside a token
|
|
498
|
-
// (Q1., e.g., i.e., vs., Mr., Mrs., No., U.S., E.U.). We walk
|
|
499
|
-
// candidate terminator positions; a position counts only when the
|
|
500
|
-
// char before it is *not* part of a known abbreviation token.
|
|
501
|
-
const terminators = ['. ', '! ', '? ', '; '];
|
|
502
|
-
let bestIdx = -1;
|
|
503
|
-
for (const t of terminators) {
|
|
504
|
-
let from = HEADLINE_SOFT_MIN;
|
|
505
|
-
let idx;
|
|
506
|
-
while ((idx = window.indexOf(t, from)) !== -1) {
|
|
507
|
-
if (!isAbbreviationBoundary(window, idx) && idx < window.length - 1) {
|
|
508
|
-
if (bestIdx === -1 || idx < bestIdx)
|
|
509
|
-
bestIdx = idx;
|
|
510
|
-
break;
|
|
511
|
-
}
|
|
512
|
-
from = idx + t.length;
|
|
513
|
-
}
|
|
514
|
-
}
|
|
515
|
-
if (bestIdx >= HEADLINE_SOFT_MIN) {
|
|
516
|
-
return trimmed.slice(0, bestIdx + 1).trim();
|
|
517
|
-
}
|
|
518
|
-
return trimmed;
|
|
519
|
-
}
|
|
520
|
-
/**
|
|
521
|
-
* Check whether the character preceding the `.` at `idx` in `text`
|
|
522
|
-
* indicates an abbreviation (so the `.` is not a sentence terminator).
|
|
523
|
-
* Matches the {@link ABBREVIATION_PREFIXES} table and the all-caps
|
|
524
|
-
* single-letter initials pattern (`U.S.`, `E.U.`).
|
|
525
|
-
*
|
|
526
|
-
* @param text - Source text (lowercased segment + original mixed-case)
|
|
527
|
-
* @param idx - Index of the `.` character in `text`
|
|
528
|
-
* @returns `true` when the period at `idx` is part of an abbreviation
|
|
529
|
-
*/
|
|
530
|
-
function isAbbreviationBoundary(text, idx) {
|
|
531
|
-
// All-caps single-letter initial like `U.S.` or `E.U.` — char at
|
|
532
|
-
// idx-1 is a capital letter, and idx-2 is either start of string,
|
|
533
|
-
// whitespace, or another single-letter+period pair.
|
|
534
|
-
if (idx >= 1) {
|
|
535
|
-
const prev = text.charCodeAt(idx - 1);
|
|
536
|
-
const isUpperLetter = prev >= 65 && prev <= 90;
|
|
537
|
-
if (isUpperLetter && (idx === 1 || text[idx - 2] === ' ' || text[idx - 2] === '.')) {
|
|
538
|
-
return true;
|
|
539
|
-
}
|
|
540
|
-
}
|
|
541
|
-
// ABBREVIATION_PREFIXES lookup — scan backwards from `.` to find the
|
|
542
|
-
// start of the word, then compare lowercased.
|
|
543
|
-
let start = idx;
|
|
544
|
-
while (start > 0 && /[a-zA-Z]/u.test(text[start - 1] ?? ''))
|
|
545
|
-
start--;
|
|
546
|
-
const token = text.slice(start, idx + 1).toLowerCase();
|
|
547
|
-
return ABBREVIATION_PREFIXES.includes(token);
|
|
548
|
-
}
|
|
549
229
|
//# sourceMappingURL=text-utils.js.map
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `true` when the candidate is a bold-prose section header that
|
|
3
|
+
* leaked through the priority-finding extractor (e.g. `Strategic
|
|
4
|
+
* significance`, `Threat Level`).
|
|
5
|
+
*
|
|
6
|
+
* @param value - Title candidate
|
|
7
|
+
* @returns `true` when the candidate matches the section-header denylist.
|
|
8
|
+
*/
|
|
9
|
+
export declare function looksLikeSectionHeader(value: string): boolean;
|
|
10
|
+
/**
|
|
11
|
+
* `true` when the candidate ends with `…` or `...` (was truncated
|
|
12
|
+
* over the title budget).
|
|
13
|
+
*
|
|
14
|
+
* @param value - Title candidate
|
|
15
|
+
* @returns `true` when the candidate has a trailing ellipsis.
|
|
16
|
+
*/
|
|
17
|
+
export declare function looksLikeEllipsisCut(value: string): boolean;
|
|
18
|
+
/**
|
|
19
|
+
* `true` when the candidate is a bare adopted-text doc-ID.
|
|
20
|
+
*
|
|
21
|
+
* @param value - Title candidate
|
|
22
|
+
* @returns `true` when the candidate matches the `TA-NN-YYYY-NNNN` shape.
|
|
23
|
+
*/
|
|
24
|
+
export declare function looksLikeDocId(value: string): boolean;
|
|
25
|
+
/**
|
|
26
|
+
* Master rejection predicate. Returns the reason code (one of
|
|
27
|
+
* `section-header`, `ellipsis-cut`, `doc-id`, `sentence-fragment`)
|
|
28
|
+
* when the candidate should be rejected, or `null` when it is
|
|
29
|
+
* usable.
|
|
30
|
+
*
|
|
31
|
+
* @param value - Title candidate
|
|
32
|
+
* @returns Reason code, or `null` when the candidate is usable.
|
|
33
|
+
*/
|
|
34
|
+
export declare function findTitleRejectionReason(value: string): 'section-header' | 'ellipsis-cut' | 'doc-id' | 'sentence-fragment' | null;
|
|
35
|
+
/** Exposed for unit tests + the SEO validator. */
|
|
36
|
+
export declare const TITLE_REJECTION_DENYLIST: readonly string[];
|
|
37
|
+
//# sourceMappingURL=title-rejection.d.ts.map
|