euparliamentmonitor 0.9.21 → 0.9.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -2
- package/scripts/aggregator/article-metadata.js +69 -14
- package/scripts/aggregator/editorial-brief-resolver.js +23 -0
- package/scripts/aggregator/html/headline.d.ts +41 -9
- package/scripts/aggregator/html/headline.js +69 -10
- package/scripts/aggregator/html/shell.js +73 -17
- package/scripts/aggregator/manifest/index.d.ts +1 -1
- package/scripts/aggregator/manifest/index.js +1 -1
- package/scripts/aggregator/manifest/resolver.d.ts +28 -1
- package/scripts/aggregator/manifest/resolver.js +61 -5
- package/scripts/aggregator/markdown-renderer.js +11 -0
- package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
- package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
- package/scripts/aggregator/metadata/artifact-walker.js +29 -10
- package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
- package/scripts/aggregator/metadata/brief-body.js +69 -0
- package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
- package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
- package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
- package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
- package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
- package/scripts/aggregator/metadata/heading-rules.js +78 -269
- package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
- package/scripts/aggregator/metadata/keyword-filters.js +156 -0
- package/scripts/aggregator/metadata/lede-extractor.js +11 -2
- package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
- package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
- package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
- package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
- package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
- package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
- package/scripts/aggregator/metadata/seo-budgets.js +202 -0
- package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
- package/scripts/aggregator/metadata/text-truncate.js +277 -0
- package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
- package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
- package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
- package/scripts/aggregator/metadata/text-utils.js +119 -439
- package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
- package/scripts/aggregator/metadata/title-rejection.js +179 -0
- package/scripts/copy-vendor.js +84 -112
- package/scripts/dump-article-seo.js +640 -0
- package/scripts/fix-mermaid-diagrams.js +931 -0
- package/scripts/generators/news-indexes/backfill.d.ts +6 -1
- package/scripts/generators/news-indexes/backfill.js +71 -4
- package/scripts/validate-article-seo.js +534 -0
- package/scripts/validate-mermaid-diagrams.js +306 -0
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
/**
|
|
4
|
+
* @module Aggregator/Metadata/KeywordFilters
|
|
5
|
+
* @description Cross-site keyword catalogue and noise-token filter used
|
|
6
|
+
* by {@link buildSeoKeywords} in `resolve-helpers.ts`.
|
|
7
|
+
*
|
|
8
|
+
* Two responsibilities:
|
|
9
|
+
*
|
|
10
|
+
* 1. **Always-on cross-site keywords** ({@link CROSS_SITE_KEYWORDS})
|
|
11
|
+
* are prepended to every article's `<meta name="keywords">` list
|
|
12
|
+
* regardless of language, so search-engine discovery of the
|
|
13
|
+
* Hack23 civic-tech portfolio (EU Parliament Monitor +
|
|
14
|
+
* Riksdagsmonitor + CIA) is consistent across all 14 localized
|
|
15
|
+
* surfaces. The user explicitly requested
|
|
16
|
+
* `riksdagsmonitor, political intelligence, riksdag, regeringen`
|
|
17
|
+
* (the sister Swedish-Parliament project) plus EP analogues.
|
|
18
|
+
*
|
|
19
|
+
* 2. **Noise-token rejection** ({@link isNoiseKeywordToken}) drops
|
|
20
|
+
* the UUID-fragment tokens (`77fc920c`, `3a76`, `9db5`, …) and
|
|
21
|
+
* synthetic run-id slugs (`propositions-run261-1779431162`) that
|
|
22
|
+
* the previous keyword extractor leaked into `<head>` when a
|
|
23
|
+
* brief mentioned its own run id editorially (e.g.
|
|
24
|
+
* `Analysis run 77fc920c-3a76-4813-9db5-43a7e9acc25e returned
|
|
25
|
+
* 0 classified actors`).
|
|
26
|
+
*
|
|
27
|
+
* Pure leaf module — no imports.
|
|
28
|
+
*/
|
|
29
|
+
/**
|
|
30
|
+
* Cross-site SEO keywords prepended to every article in every
|
|
31
|
+
* language. Order is meaningful: stronger civic-tech-portfolio terms
|
|
32
|
+
* first so they appear ahead of the per-article-type keywords when
|
|
33
|
+
* the 16-entry budget is exceeded.
|
|
34
|
+
*/
|
|
35
|
+
export const CROSS_SITE_KEYWORDS = [
|
|
36
|
+
'EU Parliament Monitor',
|
|
37
|
+
'European Parliament',
|
|
38
|
+
'European Commission',
|
|
39
|
+
'political intelligence',
|
|
40
|
+
'Riksdagsmonitor',
|
|
41
|
+
'Riksdag',
|
|
42
|
+
'Regeringen',
|
|
43
|
+
];
|
|
44
|
+
/**
|
|
45
|
+
* Lower-case allowlist of common English words that the noise filter
|
|
46
|
+
* must always keep, even when their shape would otherwise match the
|
|
47
|
+
* "looks like a hex token" heuristic (e.g. `face`, `dead`, `beef`).
|
|
48
|
+
* Kept intentionally tiny to avoid lexicon drift.
|
|
49
|
+
*/
|
|
50
|
+
const HEX_ALPHABETIC_ALLOWLIST = new Set([
|
|
51
|
+
'face',
|
|
52
|
+
'fade',
|
|
53
|
+
'dead',
|
|
54
|
+
'beef',
|
|
55
|
+
'cafe',
|
|
56
|
+
'feed',
|
|
57
|
+
'deed',
|
|
58
|
+
'fed',
|
|
59
|
+
'add',
|
|
60
|
+
'dad',
|
|
61
|
+
'bad',
|
|
62
|
+
]);
|
|
63
|
+
/**
 * Recognise synthetic run-id slugs shaped like
 * `<letters>(-<letters>)*-run<digits>(-<digits>)*`, for example
 * `propositions-run261-1779431162` or `breaking-news-run17-1234567890`.
 *
 * The token is split on `-` and every segment is validated on its own
 * rather than matched by one large pattern, which keeps the
 * `security/detect-unsafe-regex` lint rule satisfied.
 *
 * @param lower - Lower-case candidate token
 * @returns `true` when the token matches the run-id slug shape
 */
function isRunSlugChain(lower) {
    const segments = lower.split('-');
    // Position of the first `run<digits>` segment. It must exist and must
    // not be the leading segment: at least one word has to precede it.
    const pivot = segments.findIndex((segment) => /^run\d+$/u.test(segment));
    if (pivot <= 0) {
        return false;
    }
    // Everything in front of the pivot is all-letters, everything behind
    // it is all-digits.
    const wordsOnly = segments
        .slice(0, pivot)
        .every((segment) => /^[a-z]+$/u.test(segment));
    const digitsOnly = segments
        .slice(pivot + 1)
        .every((segment) => /^\d+$/u.test(segment));
    return wordsOnly && digitsOnly;
}
|
|
99
|
+
/**
 * Classify a single keyword candidate as noise (drop) or vocabulary
 * (keep). Matching is case-insensitive and ignores surrounding
 * whitespace.
 *
 * A token is treated as noise when any of the following holds:
 *
 * - It is empty, or shorter than four characters once trimmed.
 * - It is all digits, or at least 80 % of its characters are digits
 *   (epoch suffixes such as `1779431162`, mashes like `2024k1234`).
 * - It is a bare `run<digits>` slug (`run261`, `run17`) or a full
 *   `<type>-run<digits>-<digits>` run-id chain.
 * - It consists solely of `[0-9a-f]` characters and is either eight or
 *   more characters long or contains at least one digit (UUID chunks
 *   such as `77fc920c`, `3a76`).
 *
 * Everything else — including short all-letter hex-shaped words such
 * as `face`, `dead` or `beef` — is kept.
 *
 * @param token - Single token candidate
 * @returns `true` when the token should be dropped from keywords
 */
export function isNoiseKeywordToken(token) {
    const candidate = token ? token.trim() : '';
    if (candidate.length < 4) {
        return true;
    }
    const lower = candidate.toLowerCase();
    // Digit share of the token; a pure-digit token has a share of 1 and
    // is therefore covered by the same threshold.
    const digitCount = lower.replace(/\D/gu, '').length;
    if (digitCount > 0 && digitCount / lower.length >= 0.8) {
        return true;
    }
    // Bare `run<digits>` slugs and `…-run<digits>-<digits>` chains.
    if (/^run\d+$/u.test(lower) || isRunSlugChain(lower)) {
        return true;
    }
    // Anything that is not hex-shaped is ordinary vocabulary.
    if (!/^[0-9a-f]+$/u.test(lower)) {
        return false;
    }
    // Hex-shaped: long runs and digit-bearing chunks are UUID fragments.
    if (lower.length >= 8 || digitCount > 0) {
        return true;
    }
    if (HEX_ALPHABETIC_ALLOWLIST.has(lower)) {
        return false;
    }
    // Remaining short all-letter hex words are kept as well, so the
    // filter does not overfit.
    return false;
}
|
|
156
|
+
//# sourceMappingURL=keyword-filters.js.map
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
* helpers) and heading-rules (for the editorial-lede whitelist and the
|
|
14
14
|
* heading-text normaliser).
|
|
15
15
|
*/
|
|
16
|
-
import { DESCRIPTION_MAX_LENGTH, EXTENDED_DESCRIPTION_MAX_LENGTH, shouldSkipDescriptionLine, stripInlineMarkdown, stripLeadingProseLabel, truncateDescription, truncateExtendedDescription, } from './text-utils.js';
|
|
16
|
+
import { DESCRIPTION_MAX_LENGTH, EXTENDED_DESCRIPTION_MAX_LENGTH, shouldSkipDescriptionLine, stripInlineMarkdown, stripLeadingBoldLabel, stripLeadingProseLabel, truncateDescription, truncateExtendedDescription, } from './text-utils.js';
|
|
17
17
|
import { EDITORIAL_LEDE_HEADINGS, isLedeHeadingMatch, normaliseHeadingText, } from './heading-rules.js';
|
|
18
18
|
/**
|
|
19
19
|
* Process one Markdown line against the in-progress paragraph buffer.
|
|
@@ -35,7 +35,16 @@ function collectProseLine(line, buf) {
|
|
|
35
35
|
return 'continue';
|
|
36
36
|
if (shouldSkipDescriptionLine(line))
|
|
37
37
|
return hasBuffer ? 'break' : 'continue';
|
|
38
|
-
|
|
38
|
+
// Strip the leading `**Label:**` opener (any language) *before*
|
|
39
|
+
// running the inline-markdown stripper, so localized BLUF labels
|
|
40
|
+
// like `**Fråga:**` / `**主題:**` / `**الموضوع:**` are removed
|
|
41
|
+
// structurally rather than leaking into the description as plain
|
|
42
|
+
// text (`"Fråga: …"`). The English `**Issue:**` line is already
|
|
43
|
+
// skipped earlier by METADATA_LINE_PREFIXES; this code path covers
|
|
44
|
+
// the 13 non-English locales for which the label vocabulary is
|
|
45
|
+
// open-ended.
|
|
46
|
+
const stripped = stripLeadingBoldLabel(line);
|
|
47
|
+
const plain = stripLeadingProseLabel(stripInlineMarkdown(stripped));
|
|
39
48
|
if (!hasBuffer && plain.length < 40)
|
|
40
49
|
return 'continue';
|
|
41
50
|
buf.lines.push(plain);
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Strip the trailing parenthesised metadata that briefs append to every
|
|
3
|
+
* priority-finding name — procedure codes, dates, committee tags. The
|
|
4
|
+
* regex is anchored to the end of the line and excludes nested parentheses, so it removes only the LAST
|
|
5
|
+
* parenthesised group on the line.
|
|
6
|
+
*
|
|
7
|
+
* @param text - Headline or paragraph text
|
|
8
|
+
* @returns Text with the trailing `(…)` stripped
|
|
9
|
+
*/
|
|
10
|
+
export declare function stripPriorityTailMetadata(text: string): string;
|
|
11
|
+
/**
|
|
12
|
+
* Normalise a priority-finding headline: drop the
|
|
13
|
+
* `Trigger N:` / `Dossier N:` / leading-numeric prefix, strip trailing
|
|
14
|
+
* parenthesised metadata (`(TA-10-2026-0160, 2026-04-30)`,
|
|
15
|
+
* `(ITRE/ENVI)`), and trim residual punctuation. The result is a
|
|
16
|
+
* headline-shaped string suitable for `<title>` use.
|
|
17
|
+
*
|
|
18
|
+
* @param raw - Raw bold-title or heading text
|
|
19
|
+
* @returns Cleaned headline (may be empty after stripping)
|
|
20
|
+
*/
|
|
21
|
+
export declare function cleanPriorityHeadline(raw: string): string;
|
|
22
|
+
//# sourceMappingURL=priority-finding-cleaning.d.ts.map
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
/**
|
|
4
|
+
* @module Aggregator/Metadata/PriorityFindingCleaning
|
|
5
|
+
* @description Headline-cleaning helpers extracted from
|
|
6
|
+
* `priority-finding-highlight.ts` to keep that module under the
|
|
7
|
+
* 600-line drift-guard budget enforced by
|
|
8
|
+
* `test/unit/source-file-size.test.js`.
|
|
9
|
+
*
|
|
10
|
+
* Public entry point: {@link cleanPriorityHeadline} — normalises a
|
|
11
|
+
* raw bold-title / heading string by stripping priority decorations
|
|
12
|
+
* (`🔴 CRITICAL — `), editorial prefixes (`Trigger 1: `), trailing
|
|
13
|
+
* confidence markers (`🔴 CRITICAL`), and parenthesised tail metadata
|
|
14
|
+
* (`(TA-10-2026-0160, 2026-04-30)`).
|
|
15
|
+
*
|
|
16
|
+
* Bounded-context rules:
|
|
17
|
+
* - **Pure helpers** — no I/O, no globals.
|
|
18
|
+
* - **Deterministic** — same input always produces same output.
|
|
19
|
+
* - **Reusable** — every helper accepts a plain string and returns a
|
|
20
|
+
* plain string; suitable for property-tests.
|
|
21
|
+
*/
|
|
22
|
+
import { stripInlineMarkdown } from './text-utils.js';
|
|
23
|
+
/**
|
|
24
|
+
* Leading priority-label tokens stripped by {@link cleanPriorityHeadline}
|
|
25
|
+
* (`🔴 CRITICAL — Title` → `Title`). Kept as a list to bypass the
|
|
26
|
+
* unsafe-regex lint by avoiding deep alternation in a single pattern.
|
|
27
|
+
*/
|
|
28
|
+
const PRIORITY_LABEL_TOKENS = [
|
|
29
|
+
'CRITICAL',
|
|
30
|
+
'HIGH PRIORITY',
|
|
31
|
+
'HIGH',
|
|
32
|
+
'MEDIUM PRIORITY',
|
|
33
|
+
'MEDIUM',
|
|
34
|
+
'LOW PRIORITY',
|
|
35
|
+
'LOW',
|
|
36
|
+
'URGENT',
|
|
37
|
+
'ALERT',
|
|
38
|
+
'PRIORITY',
|
|
39
|
+
];
|
|
40
|
+
/**
|
|
41
|
+
* Trailing confidence-marker tokens stripped by
|
|
42
|
+
* {@link cleanPriorityHeadline}. Same rationale as
|
|
43
|
+
* {@link PRIORITY_LABEL_TOKENS}.
|
|
44
|
+
*/
|
|
45
|
+
const PRIORITY_TRAILING_TOKENS = [
|
|
46
|
+
'CRITICAL',
|
|
47
|
+
'HIGH PRIORITY',
|
|
48
|
+
'HIGH',
|
|
49
|
+
'MEDIUM PRIORITY',
|
|
50
|
+
'MEDIUM',
|
|
51
|
+
'LOW PRIORITY',
|
|
52
|
+
'LOW',
|
|
53
|
+
];
|
|
54
|
+
/**
|
|
55
|
+
* Leading editorial-prefix tokens stripped by
|
|
56
|
+
* {@link cleanPriorityHeadline} (`Trigger 1: Title` → `Title`).
|
|
57
|
+
*/
|
|
58
|
+
const PRIORITY_LEADING_PREFIX_TOKENS = [
|
|
59
|
+
'Trigger',
|
|
60
|
+
'Dossier',
|
|
61
|
+
'Priority',
|
|
62
|
+
'Finding',
|
|
63
|
+
'Item',
|
|
64
|
+
'Highlight',
|
|
65
|
+
'Top',
|
|
66
|
+
'Story',
|
|
67
|
+
'Alert',
|
|
68
|
+
'Judgement',
|
|
69
|
+
'Judgment',
|
|
70
|
+
];
|
|
71
|
+
/**
 * Remove leading priority decoration from a candidate headline: any
 * run of non-letter / non-digit characters (emoji, dashes, whitespace)
 * followed by a priority label and a separator, e.g.
 * `🔴 CRITICAL — Title` → `Title`.
 *
 * Two rounds are performed so that two stacked labels can be peeled
 * off. A label is only removed when a `:` or dash separator and some
 * remaining text follow it; otherwise the headline is left as it is.
 *
 * @param text - Candidate headline (already trimmed)
 * @returns Headline with the leading decoration removed
 */
function stripPriorityLeadingDecoration(text) {
    let headline = text;
    for (let round = 0; round < 2; round += 1) {
        headline = headline.replace(/^[^\p{L}\p{N}]+/u, '').trim();
        const folded = headline.toLowerCase();
        for (const label of PRIORITY_LABEL_TOKENS) {
            if (!folded.startsWith(label.toLowerCase())) {
                continue;
            }
            const remainder = headline.slice(label.length).trim();
            const [, title] = /^[:—–-]\s*(.+)$/u.exec(remainder) ?? [];
            if (!title) {
                // Label is not followed by a separator; try the next label.
                continue;
            }
            headline = title.trim();
            break;
        }
    }
    return headline;
}
|
|
96
|
+
/**
 * Strip a leading editorial prefix (`Trigger 1: `, `Dossier 2: `) and a
 * stray leading ordinal (`1. `, `2) `, `2.1 `) from a candidate headline.
 *
 * The editorial prefix is only removed when the token is followed by
 * whitespace, a number, a `:` / dash separator and some remaining text;
 * the first token in {@link PRIORITY_LEADING_PREFIX_TOKENS} that
 * satisfies this wins. Matching of the token itself is case-insensitive.
 *
 * @param text - Candidate headline
 * @returns Headline with the leading editorial decoration removed
 */
function stripPriorityLeadingPrefix(text) {
    let out = text;
    for (const token of PRIORITY_LEADING_PREFIX_TOKENS) {
        if (!out.toLowerCase().startsWith(token.toLowerCase())) {
            continue;
        }
        const match = out.slice(token.length).match(/^\s+\d+\s*[:–—-]\s*(.+)$/u);
        if (match?.[1]) {
            out = match[1];
            break;
        }
    }
    // Dotted ordinals ("2.1 Title", "1.2.3 Title"): consume the leading
    // `<digits>.` segments one at a time so the final segment is handled
    // by the single-ordinal rule below. Previously only the first segment
    // was removed, leaving "1 Title". A loop is used instead of a nested
    // quantifier to stay clear of the unsafe-regex lint.
    while (/^\d+\.\d/u.test(out)) {
        out = out.replace(/^\d+\./u, '');
    }
    // Drop a stray leading "1. " / "2) " ordinal.
    out = out.replace(/^\d+[.):·\s]\s*/u, '');
    return out;
}
|
|
119
|
+
/**
 * Remove one trailing confidence marker (`🔴 CRITICAL`, `🟡 MEDIUM`)
 * from a candidate headline. The marker must be preceded by whitespace
 * and may carry a single symbol in front of it; matching is
 * case-insensitive. Only the first matching token from
 * {@link PRIORITY_TRAILING_TOKENS} is removed — the caller re-invokes
 * this inside its fixed-point loop.
 *
 * @param text - Candidate headline
 * @returns Headline with the trailing confidence marker removed
 */
function stripPriorityTrailingMarker(text) {
    for (const marker of PRIORITY_TRAILING_TOKENS) {
        const tailPattern = new RegExp(`\\s+[^\\p{L}\\p{N}\\s]?\\s*${marker}\\s*$`, 'iu');
        if (tailPattern.test(text)) {
            return text.replace(tailPattern, '');
        }
    }
    return text;
}
|
|
139
|
+
/**
 * Remove the trailing parenthesised metadata that briefs append to a
 * priority-finding name — procedure codes, dates, committee tags.
 *
 * The pattern is anchored to the end of the text and its body excludes
 * `(` and `)`, so only the LAST parenthesised group is removed and
 * nested groups are left alone. The group body must be 3–80 characters
 * long. The result is trimmed.
 *
 * @param text - Headline or paragraph text
 * @returns Text with the trailing `(…)` stripped
 */
export function stripPriorityTailMetadata(text) {
    const trailingGroup = /\s*\([^()]{3,80}\)\s*$/u;
    const withoutTail = text.replace(trailingGroup, '');
    return withoutTail.trim();
}
|
|
151
|
+
/**
 * Turn a raw bold-title / heading string into a headline-shaped string
 * suitable for `<title>` use.
 *
 * Steps, in order: inline markdown is removed via `stripInlineMarkdown`,
 * then leading priority decoration, then a leading `Trigger N:` /
 * `Dossier N:` / numeric prefix. After that the tail is cleaned
 * repeatedly — confidence marker, parenthesised metadata
 * (`(TA-10-2026-0160, 2026-04-30)`, `(ITRE/ENVI)`), trailing symbols,
 * trailing colons / dashes — until a pass changes nothing.
 *
 * @param raw - Raw bold-title or heading text
 * @returns Cleaned headline (may be empty after stripping)
 */
export function cleanPriorityHeadline(raw) {
    const plain = stripInlineMarkdown(raw).trim();
    let headline = stripPriorityLeadingPrefix(stripPriorityLeadingDecoration(plain));
    // Fixed-point tail cleanup: combined patterns such as
    // "Title (Confidence, 80%): 🔴" need several passes to collapse all
    // the way down to "Title".
    let before;
    do {
        before = headline;
        headline = stripPriorityTailMetadata(stripPriorityTrailingMarker(headline))
            // Trailing symbol run (e.g. an emoji) left behind after the
            // metadata was stripped.
            .replace(/\s+[^\p{L}\p{N}\s]+\s*$/u, '')
            // Left-over trailing colons / dashes.
            .replace(/[\s:—–-]+$/u, '')
            .trim();
    } while (headline !== before);
    return headline;
}
|
|
181
|
+
//# sourceMappingURL=priority-finding-cleaning.js.map
|
|
@@ -9,7 +9,9 @@
|
|
|
9
9
|
* artifact-highlight.ts when an artefact has no usable H1.
|
|
10
10
|
*/
|
|
11
11
|
import { normaliseHeadingText } from './heading-rules.js';
|
|
12
|
+
import { cleanPriorityHeadline, stripPriorityTailMetadata } from './priority-finding-cleaning.js';
|
|
12
13
|
import { DESCRIPTION_MAX_LENGTH, shouldSkipDescriptionLine, stripInlineMarkdown, stripLeadingProseLabel, truncateDescription, } from './text-utils.js';
|
|
14
|
+
import { findTitleRejectionReason } from './title-rejection.js';
|
|
13
15
|
/**
|
|
14
16
|
* Section headings inside the executive brief that introduce the
|
|
15
17
|
* named-priority-finding block (matched case-insensitively against the
|
|
@@ -290,21 +292,69 @@ function extractPriorityFindingItem(lines, i) {
|
|
|
290
292
|
*/
|
|
291
293
|
const PRIORITY_METADATA_BOLD_PREFIXES = [
|
|
292
294
|
'admiralty',
|
|
295
|
+
'admiralty scale',
|
|
296
|
+
'admiralty scale used',
|
|
297
|
+
'analysis owner',
|
|
298
|
+
'analyst note',
|
|
299
|
+
'analytical quality',
|
|
300
|
+
'bluf',
|
|
301
|
+
'bottom line up front',
|
|
302
|
+
'caveats and gaps',
|
|
293
303
|
'classification',
|
|
304
|
+
'composition layer',
|
|
294
305
|
'confidence',
|
|
295
|
-
'
|
|
306
|
+
'confidence summary',
|
|
296
307
|
'data quality',
|
|
308
|
+
'data sources',
|
|
297
309
|
'date',
|
|
310
|
+
'emerging patterns',
|
|
311
|
+
'forward indicators',
|
|
312
|
+
'gate target',
|
|
298
313
|
'generated',
|
|
314
|
+
'headline judgement',
|
|
315
|
+
'headline judgment',
|
|
316
|
+
'horizon',
|
|
317
|
+
'imf status',
|
|
318
|
+
'issue',
|
|
319
|
+
'key assumptions',
|
|
320
|
+
'key assumptions check',
|
|
321
|
+
'key intelligence',
|
|
322
|
+
'key risk indicators',
|
|
299
323
|
'lead author',
|
|
324
|
+
'master assumptions',
|
|
325
|
+
'master narrative',
|
|
300
326
|
'methodology',
|
|
327
|
+
'parliamentary status',
|
|
328
|
+
'period',
|
|
329
|
+
'prepared',
|
|
330
|
+
'purpose',
|
|
331
|
+
'quality of information check',
|
|
332
|
+
'reporting',
|
|
301
333
|
'reporting window',
|
|
302
334
|
'run',
|
|
335
|
+
'sat documentation',
|
|
336
|
+
'sat documentation below',
|
|
337
|
+
'scope',
|
|
303
338
|
'session',
|
|
339
|
+
'signal assessment',
|
|
304
340
|
'source',
|
|
305
341
|
'sources',
|
|
342
|
+
'threat level',
|
|
343
|
+
'tier 1 priority issues',
|
|
344
|
+
'tier 2 priority issues',
|
|
345
|
+
'tier 3 priority issues',
|
|
346
|
+
'tier 1 priority',
|
|
347
|
+
'tier 2 priority',
|
|
348
|
+
'tier 3 priority',
|
|
306
349
|
'time horizon',
|
|
350
|
+
'top line',
|
|
351
|
+
'top-line judgement',
|
|
352
|
+
'top-line judgment',
|
|
307
353
|
'wep',
|
|
354
|
+
'wep band',
|
|
355
|
+
'wep bands',
|
|
356
|
+
'wep bands applied',
|
|
357
|
+
'window',
|
|
308
358
|
];
|
|
309
359
|
/**
|
|
310
360
|
* Recognise a metadata-banner bold line (`**Admiralty Grade: B/2**`,
|
|
@@ -331,6 +381,22 @@ function isMetadataBoldLine(line) {
|
|
|
331
381
|
if (inner.startsWith(`${prefix}—`) || inner.startsWith(`${prefix} —`))
|
|
332
382
|
return true;
|
|
333
383
|
}
|
|
384
|
+
// Pipe-banner shape: three or more `|`-separated segments inside the
|
|
385
|
+
// bold body indicate a methodology / SAT-tag banner row, never an
|
|
386
|
+
// editorial headline (e.g.
|
|
387
|
+
// `**WEP Bands Applied | Admiralty Scale Used | SAT Documentation**`).
|
|
388
|
+
// Single `|` is allowed because it occurs in legitimate headlines
|
|
389
|
+
// ("Brexit | A Decade On"). Three or more non-empty segments is the threshold.
|
|
390
|
+
const pipeSegments = inner
|
|
391
|
+
.split('|')
|
|
392
|
+
.map((s) => s.trim())
|
|
393
|
+
.filter((s) => s.length > 0);
|
|
394
|
+
if (pipeSegments.length >= 3)
|
|
395
|
+
return true;
|
|
396
|
+
// Trailing-ellipsis bold: `**Some long banner line…**` was clipped by
|
|
397
|
+
// the brief author and is not a usable editorial headline.
|
|
398
|
+
if (inner.endsWith('…') || inner.endsWith('...'))
|
|
399
|
+
return true;
|
|
334
400
|
return false;
|
|
335
401
|
}
|
|
336
402
|
/**
|
|
@@ -351,6 +417,14 @@ function buildPriorityResult(rawHeadline, tail, lines, i) {
|
|
|
351
417
|
const cleaned = cleanPriorityHeadline(rawHeadline);
|
|
352
418
|
if (cleaned.length < 5)
|
|
353
419
|
return null;
|
|
420
|
+
// Reject bold-prose section labels (`Strategic significance`,
|
|
421
|
+
// `Threat Level`, `Convergence themes`, …) and other denylisted
|
|
422
|
+
// tokens. Without this, the priority-finding loop would surface a
|
|
423
|
+
// `**Strategic significance:** …` line — which the executive-brief
|
|
424
|
+
// template uses inside every dossier subsection — as the article
|
|
425
|
+
// title. See `title-rejection.ts` for the full denylist.
|
|
426
|
+
if (findTitleRejectionReason(cleaned))
|
|
427
|
+
return null;
|
|
354
428
|
const summaryLines = collectPrioritySummaryLines(tail, lines, i);
|
|
355
429
|
const summary = truncateDescription(summaryLines.join(' '));
|
|
356
430
|
return { headline: cleaned, summary };
|
|
@@ -416,162 +490,4 @@ function collectPrioritySummaryLines(tail, lines, i) {
|
|
|
416
490
|
}
|
|
417
491
|
return summaryLines;
|
|
418
492
|
}
|
|
419
|
-
/**
|
|
420
|
-
* Leading priority-label tokens stripped by {@link cleanPriorityHeadline}
|
|
421
|
-
* (`🔴 CRITICAL — Title` → `Title`). Kept as a list to bypass the
|
|
422
|
-
* unsafe-regex lint by avoiding deep alternation in a single pattern.
|
|
423
|
-
*/
|
|
424
|
-
const PRIORITY_LABEL_TOKENS = [
|
|
425
|
-
'CRITICAL',
|
|
426
|
-
'HIGH PRIORITY',
|
|
427
|
-
'HIGH',
|
|
428
|
-
'MEDIUM PRIORITY',
|
|
429
|
-
'MEDIUM',
|
|
430
|
-
'LOW PRIORITY',
|
|
431
|
-
'LOW',
|
|
432
|
-
'URGENT',
|
|
433
|
-
'ALERT',
|
|
434
|
-
'PRIORITY',
|
|
435
|
-
];
|
|
436
|
-
/**
|
|
437
|
-
* Trailing confidence-marker tokens stripped by
|
|
438
|
-
* {@link cleanPriorityHeadline}. Same rationale as
|
|
439
|
-
* {@link PRIORITY_LABEL_TOKENS}.
|
|
440
|
-
*/
|
|
441
|
-
const PRIORITY_TRAILING_TOKENS = [
|
|
442
|
-
'CRITICAL',
|
|
443
|
-
'HIGH PRIORITY',
|
|
444
|
-
'HIGH',
|
|
445
|
-
'MEDIUM PRIORITY',
|
|
446
|
-
'MEDIUM',
|
|
447
|
-
'LOW PRIORITY',
|
|
448
|
-
'LOW',
|
|
449
|
-
];
|
|
450
|
-
/**
|
|
451
|
-
* Leading editorial-prefix tokens stripped by
|
|
452
|
-
* {@link cleanPriorityHeadline} (`Trigger 1: Title` → `Title`).
|
|
453
|
-
*/
|
|
454
|
-
const PRIORITY_LEADING_PREFIX_TOKENS = [
|
|
455
|
-
'Trigger',
|
|
456
|
-
'Dossier',
|
|
457
|
-
'Priority',
|
|
458
|
-
'Finding',
|
|
459
|
-
'Item',
|
|
460
|
-
'Highlight',
|
|
461
|
-
'Top',
|
|
462
|
-
'Story',
|
|
463
|
-
'Alert',
|
|
464
|
-
'Judgement',
|
|
465
|
-
'Judgment',
|
|
466
|
-
];
|
|
467
|
-
/**
|
|
468
|
-
* Strip a leading priority decoration (`🔴 `, `CRITICAL — `) from a
|
|
469
|
-
* candidate headline. Extracted from {@link cleanPriorityHeadline} to
|
|
470
|
-
* keep cognitive complexity within budget.
|
|
471
|
-
*
|
|
472
|
-
* @param text - Candidate headline (already trimmed)
|
|
473
|
-
* @returns Headline with the leading decoration removed
|
|
474
|
-
*/
|
|
475
|
-
function stripPriorityLeadingDecoration(text) {
|
|
476
|
-
let out = text;
|
|
477
|
-
for (let pass = 0; pass < 2; pass++) {
|
|
478
|
-
out = out.replace(/^[^\p{L}\p{N}]+/u, '').trim();
|
|
479
|
-
for (const token of PRIORITY_LABEL_TOKENS) {
|
|
480
|
-
if (out.toLowerCase().startsWith(token.toLowerCase())) {
|
|
481
|
-
const rest = out.slice(token.length).trim();
|
|
482
|
-
const sep = rest.match(/^[:—–-]\s*(.+)$/u);
|
|
483
|
-
if (sep?.[1]) {
|
|
484
|
-
out = sep[1].trim();
|
|
485
|
-
break;
|
|
486
|
-
}
|
|
487
|
-
}
|
|
488
|
-
}
|
|
489
|
-
}
|
|
490
|
-
return out;
|
|
491
|
-
}
|
|
492
|
-
/**
|
|
493
|
-
* Strip a leading editorial prefix (`Trigger 1: `, `Dossier 2: `) and a
|
|
494
|
-
* stray leading ordinal (`1. `, `2.1 `) from a candidate headline.
|
|
495
|
-
*
|
|
496
|
-
* @param text - Candidate headline
|
|
497
|
-
* @returns Headline with the leading editorial decoration removed
|
|
498
|
-
*/
|
|
499
|
-
function stripPriorityLeadingPrefix(text) {
|
|
500
|
-
let out = text;
|
|
501
|
-
for (const token of PRIORITY_LEADING_PREFIX_TOKENS) {
|
|
502
|
-
if (!out.toLowerCase().startsWith(token.toLowerCase()))
|
|
503
|
-
continue;
|
|
504
|
-
const rest = out.slice(token.length);
|
|
505
|
-
const match = rest.match(/^\s+\d+\s*[:–—-]\s*(.+)$/u);
|
|
506
|
-
if (match?.[1]) {
|
|
507
|
-
out = match[1];
|
|
508
|
-
break;
|
|
509
|
-
}
|
|
510
|
-
}
|
|
511
|
-
// Drop a stray leading "1. " / "2) " ordinal.
|
|
512
|
-
out = out.replace(/^\d+[.):·\s]\s*/u, '');
|
|
513
|
-
return out;
|
|
514
|
-
}
|
|
515
|
-
/**
|
|
516
|
-
* Strip a trailing confidence marker (`🔴 CRITICAL`, `🟡 MEDIUM`) from a
|
|
517
|
-
* candidate headline. Single pass — caller invokes inside a fixed-point
|
|
518
|
-
* loop.
|
|
519
|
-
*
|
|
520
|
-
* @param text - Candidate headline
|
|
521
|
-
* @returns Headline with the trailing confidence marker removed
|
|
522
|
-
*/
|
|
523
|
-
function stripPriorityTrailingMarker(text) {
|
|
524
|
-
let out = text;
|
|
525
|
-
for (const token of PRIORITY_TRAILING_TOKENS) {
|
|
526
|
-
const pattern = new RegExp(`\\s+[^\\p{L}\\p{N}\\s]?\\s*${token}\\s*$`, 'iu');
|
|
527
|
-
const next = out.replace(pattern, '');
|
|
528
|
-
if (next !== out) {
|
|
529
|
-
out = next;
|
|
530
|
-
break;
|
|
531
|
-
}
|
|
532
|
-
}
|
|
533
|
-
return out;
|
|
534
|
-
}
|
|
535
|
-
/**
|
|
536
|
-
* Normalise a priority-finding headline: drop the
|
|
537
|
-
* `Trigger N:` / `Dossier N:` / leading-numeric prefix, strip trailing
|
|
538
|
-
* parenthesised metadata (`(TA-10-2026-0160, 2026-04-30)`,
|
|
539
|
-
* `(ITRE/ENVI)`), and trim residual punctuation. The result is a
|
|
540
|
-
* headline-shaped string suitable for `<title>` use.
|
|
541
|
-
*
|
|
542
|
-
* @param raw - Raw bold-title or heading text
|
|
543
|
-
* @returns Cleaned headline (may be empty after stripping)
|
|
544
|
-
*/
|
|
545
|
-
function cleanPriorityHeadline(raw) {
|
|
546
|
-
let text = stripInlineMarkdown(raw).trim();
|
|
547
|
-
text = stripPriorityLeadingDecoration(text);
|
|
548
|
-
text = stripPriorityLeadingPrefix(text);
|
|
549
|
-
// Trailing cleanup runs in a fixed-point loop so combined patterns
|
|
550
|
-
// like "Title (Confidence, 80%): 🔴" collapse all the way down to
|
|
551
|
-
// "Title".
|
|
552
|
-
let previous = '';
|
|
553
|
-
while (previous !== text) {
|
|
554
|
-
previous = text;
|
|
555
|
-
text = stripPriorityTrailingMarker(text);
|
|
556
|
-
text = stripPriorityTailMetadata(text);
|
|
557
|
-
// Drop a single trailing emoji left after metadata stripping.
|
|
558
|
-
text = text.replace(/\s+[^\p{L}\p{N}\s]+\s*$/u, '');
|
|
559
|
-
// Drop trailing colons / dashes left over.
|
|
560
|
-
text = text.replace(/[\s:—–-]+$/u, '');
|
|
561
|
-
text = text.trim();
|
|
562
|
-
}
|
|
563
|
-
return text;
|
|
564
|
-
}
|
|
565
|
-
/**
|
|
566
|
-
* Strip the trailing parenthesised metadata that briefs append to every
|
|
567
|
-
* priority-finding name — procedure codes, dates, committee tags. The
|
|
568
|
-
* regex is intentionally non-greedy so it removes only the LAST
|
|
569
|
-
* parenthesised group on the line.
|
|
570
|
-
*
|
|
571
|
-
* @param text - Headline or paragraph text
|
|
572
|
-
* @returns Text with the trailing `(…)` stripped
|
|
573
|
-
*/
|
|
574
|
-
function stripPriorityTailMetadata(text) {
|
|
575
|
-
return text.replace(/\s*\([^()]{3,80}\)\s*$/u, '').trim();
|
|
576
|
-
}
|
|
577
493
|
//# sourceMappingURL=priority-finding-highlight.js.map
|