euparliamentmonitor 0.9.21 → 0.9.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -2
- package/scripts/aggregator/article-metadata.js +69 -14
- package/scripts/aggregator/editorial-brief-resolver.js +23 -0
- package/scripts/aggregator/html/headline.d.ts +41 -9
- package/scripts/aggregator/html/headline.js +69 -10
- package/scripts/aggregator/html/shell.js +73 -17
- package/scripts/aggregator/manifest/index.d.ts +1 -1
- package/scripts/aggregator/manifest/index.js +1 -1
- package/scripts/aggregator/manifest/resolver.d.ts +28 -1
- package/scripts/aggregator/manifest/resolver.js +61 -5
- package/scripts/aggregator/markdown-renderer.js +11 -0
- package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
- package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
- package/scripts/aggregator/metadata/artifact-walker.js +29 -10
- package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
- package/scripts/aggregator/metadata/brief-body.js +69 -0
- package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
- package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
- package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
- package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
- package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
- package/scripts/aggregator/metadata/heading-rules.js +78 -269
- package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
- package/scripts/aggregator/metadata/keyword-filters.js +156 -0
- package/scripts/aggregator/metadata/lede-extractor.js +11 -2
- package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
- package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
- package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
- package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
- package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
- package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
- package/scripts/aggregator/metadata/seo-budgets.js +202 -0
- package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
- package/scripts/aggregator/metadata/text-truncate.js +277 -0
- package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
- package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
- package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
- package/scripts/aggregator/metadata/text-utils.js +119 -439
- package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
- package/scripts/aggregator/metadata/title-rejection.js +179 -0
- package/scripts/copy-vendor.js +84 -112
- package/scripts/dump-article-seo.js +640 -0
- package/scripts/fix-mermaid-diagrams.js +931 -0
- package/scripts/generators/news-indexes/backfill.d.ts +6 -1
- package/scripts/generators/news-indexes/backfill.js +71 -4
- package/scripts/validate-article-seo.js +534 -0
- package/scripts/validate-mermaid-diagrams.js +306 -0
|
@@ -2,6 +2,59 @@
|
|
|
2
2
|
// SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
/** Sentinel used when no schema variant supplies a usable article type. */
|
|
4
4
|
export const UNKNOWN_ARTICLE_TYPE = 'unknown';
|
|
5
|
+
/**
|
|
6
|
+
* Canonical article-type slugs published by the EU Parliament Monitor
|
|
7
|
+
* aggregator. Used by {@link stripRunSuffix} to reject any normalisation
|
|
8
|
+
* that would yield a non-canonical leading token.
|
|
9
|
+
*/
|
|
10
|
+
const CANONICAL_ARTICLE_TYPES = new Set([
|
|
11
|
+
'breaking',
|
|
12
|
+
'committee-reports',
|
|
13
|
+
'motions',
|
|
14
|
+
'propositions',
|
|
15
|
+
'week-ahead',
|
|
16
|
+
'week-in-review',
|
|
17
|
+
'month-ahead',
|
|
18
|
+
'month-in-review',
|
|
19
|
+
'quarter-in-review',
|
|
20
|
+
'year-ahead',
|
|
21
|
+
'year-in-review',
|
|
22
|
+
'term-outlook',
|
|
23
|
+
'election-cycle',
|
|
24
|
+
]);
|
|
25
|
+
/**
|
|
26
|
+
* Pattern matching trailing `-run<N>` taxonomy noise that historic
|
|
27
|
+
* Stage-B writers occasionally encode into `articleType` (e.g.
|
|
28
|
+
* `committee-reports-run47`, `motions-run41`, `breaking-run193`). Also
|
|
29
|
+
* tolerates the legacy double-prefixed `motions-runmotions-run-1777010709`
|
|
30
|
+
* pattern observed in 2025 manifests where the writer concatenated the
|
|
31
|
+
* articleType and runId. The `[a-zA-Z0-9-]*` run after `-run` is permissive enough
|
|
32
|
+
* to catch both single-suffix and double-prefixed forms.
|
|
33
|
+
*
|
|
34
|
+
* Exported for unit tests.
|
|
35
|
+
*/
|
|
36
|
+
export const RUN_SUFFIX_PATTERN = /-run[a-zA-Z0-9-]*\d+$/u;
|
|
37
|
+
/**
|
|
38
|
+
* Strip a trailing `-run<N>` taxonomy-noise suffix from an article-type
|
|
39
|
+
* slug, but only when doing so yields a {@link CANONICAL_ARTICLE_TYPES}
|
|
40
|
+
* token. This is conservative: a non-canonical leading token (e.g.
|
|
41
|
+
* `custom-type-run5`) is returned untouched so we never silently
|
|
42
|
+
* collapse a genuinely new article type into something it isn't.
|
|
43
|
+
*
|
|
44
|
+
* @param slug - Raw article-type slug from a manifest field
|
|
45
|
+
* @returns Canonical slug when the suffix was successfully stripped,
|
|
46
|
+
* otherwise the original input
|
|
47
|
+
*/
|
|
48
|
+
export function stripRunSuffix(slug) {
|
|
49
|
+
if (!slug || !RUN_SUFFIX_PATTERN.test(slug)) {
|
|
50
|
+
return slug;
|
|
51
|
+
}
|
|
52
|
+
const stripped = slug.replace(RUN_SUFFIX_PATTERN, '');
|
|
53
|
+
if (CANONICAL_ARTICLE_TYPES.has(stripped)) {
|
|
54
|
+
return stripped;
|
|
55
|
+
}
|
|
56
|
+
return slug;
|
|
57
|
+
}
|
|
5
58
|
/**
|
|
6
59
|
* Resolve the article-type slug from a manifest, tolerating historic schemas.
|
|
7
60
|
*
|
|
@@ -11,24 +64,27 @@ export const UNKNOWN_ARTICLE_TYPE = 'unknown';
|
|
|
11
64
|
* 3. `articleTypes[0]` — pre-aggregator-pipeline plural array
|
|
12
65
|
* 4. `runType` — historic field on older breaking-run manifests
|
|
13
66
|
*
|
|
14
|
-
*
|
|
67
|
+
* Each candidate is passed through {@link stripRunSuffix} so trailing
|
|
68
|
+
* `-run<N>` taxonomy noise never leaks into JSON-LD `articleSection`,
|
|
69
|
+
* the filesystem slug, or the SEO dump's article-type histogram. Falls
|
|
70
|
+
* back to `'unknown'` when none of the above is a non-empty string.
|
|
15
71
|
*
|
|
16
72
|
* @param manifest - Parsed manifest (any of the supported schemas)
|
|
17
73
|
* @returns Article-type slug usable as a filename component
|
|
18
74
|
*/
|
|
19
75
|
export function resolveArticleType(manifest) {
|
|
20
76
|
if (typeof manifest.articleType === 'string' && manifest.articleType) {
|
|
21
|
-
return manifest.articleType;
|
|
77
|
+
return stripRunSuffix(manifest.articleType);
|
|
22
78
|
}
|
|
23
79
|
if (typeof manifest.articleTypeSlug === 'string' && manifest.articleTypeSlug) {
|
|
24
|
-
return manifest.articleTypeSlug;
|
|
80
|
+
return stripRunSuffix(manifest.articleTypeSlug);
|
|
25
81
|
}
|
|
26
82
|
const first = manifest.articleTypes?.[0];
|
|
27
83
|
if (typeof first === 'string' && first) {
|
|
28
|
-
return first;
|
|
84
|
+
return stripRunSuffix(first);
|
|
29
85
|
}
|
|
30
86
|
if (typeof manifest.runType === 'string' && manifest.runType) {
|
|
31
|
-
return manifest.runType;
|
|
87
|
+
return stripRunSuffix(manifest.runType);
|
|
32
88
|
}
|
|
33
89
|
return UNKNOWN_ARTICLE_TYPE;
|
|
34
90
|
}
|
|
@@ -159,6 +159,17 @@ function quoteMermaidLabel(raw) {
|
|
|
159
159
|
function rewriteQuadrantChartLine(line) {
|
|
160
160
|
let m = line.match(/^(\s*(?:x-axis|y-axis)\s+)(.+?)\s*-{2}>\s*(.+?)\s*$/);
|
|
161
161
|
if (m) {
|
|
162
|
+
// If the line already has a quoted label followed by a numeric
|
|
163
|
+
// axis-start (e.g. `x-axis "Probability" 0 --> 100`), leave it
|
|
164
|
+
// alone — re-quoting would swallow the numeric token into the
|
|
165
|
+
// label string and produce a broken `x-axis "\"…\" 0" --> "100"`.
|
|
166
|
+
const lhs = (m[2] ?? '').trim();
|
|
167
|
+
const rhs = (m[3] ?? '').trim();
|
|
168
|
+
const lhsHasQuotedLabel = /^"[^"]*"\s+\S/.test(lhs);
|
|
169
|
+
const rhsIsBareNumber = /^-?\d+$/.test(rhs) || /^-?\d+\.\d+$/.test(rhs);
|
|
170
|
+
if (lhsHasQuotedLabel && rhsIsBareNumber) {
|
|
171
|
+
return line;
|
|
172
|
+
}
|
|
162
173
|
return `${m[1]}${quoteMermaidLabel(m[2] ?? '')} --> ${quoteMermaidLabel(m[3] ?? '')}`;
|
|
163
174
|
}
|
|
164
175
|
m = line.match(/^(\s*(?:x-axis|y-axis)\s+)(.+?)\s*$/);
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Headings inside an editorial artefact that carry the journalist's lede
|
|
3
|
+
* paragraph (a one-paragraph summary of "what happened, why it matters").
|
|
4
|
+
* When the resolver sees one of these as a `## …` heading inside the
|
|
5
|
+
* editorial artefact, it prefers the first prose paragraph that follows
|
|
6
|
+
* it as the description (and as a title fallback) over a generic line
|
|
7
|
+
* walk. Names are matched case-insensitively against the heading text
|
|
8
|
+
* (after stripping inline Markdown).
|
|
9
|
+
*/
|
|
10
|
+
export declare const EDITORIAL_LEDE_HEADINGS: readonly string[];
|
|
11
|
+
/**
|
|
12
|
+
* Artifact-category prefixes that appear inside editorial-artefact H1s as
|
|
13
|
+
* a structural label rather than an editorial headline (e.g. `# Synthesis
|
|
14
|
+
* Summary — Week in Review (3 Apr – 1 May 2026)`). When a candidate H1
|
|
15
|
+
* starts with one of these prefixes followed by a separator (em/en dash,
|
|
16
|
+
* hyphen, or colon), the resolver treats it as **generic** so it does
|
|
17
|
+
* not leak into the article `<title>`. Compared lower-case, with leading
|
|
18
|
+
* punctuation stripped.
|
|
19
|
+
*/
|
|
20
|
+
export declare const ARTIFACT_CATEGORY_PREFIXES: readonly string[];
|
|
21
|
+
/**
|
|
22
|
+
* Normalise a Markdown heading's text for comparison against the
|
|
23
|
+
* editorial-lede heading whitelist. Strips inline Markdown decorations
|
|
24
|
+
* (`*`, `_`, `` ` ``, `#`), then strips any leading non-alphanumeric
|
|
25
|
+
* characters (emoji, punctuation, spaces) so a heading like
|
|
26
|
+
* `🎯 Headline Judgement` compares equal to `headline judgement`.
|
|
27
|
+
*
|
|
28
|
+
* @param raw - Raw heading text (no leading hashes)
|
|
29
|
+
* @returns Lower-cased, decoration-stripped heading text
|
|
30
|
+
*/
|
|
31
|
+
export declare function normaliseHeadingText(raw: string): string;
|
|
32
|
+
/**
|
|
33
|
+
* Word-boundary match against an editorial-lede whitelist entry. Matches
|
|
34
|
+
* when the normalised heading equals the whitelist entry exactly, or when
|
|
35
|
+
* the entry is followed by any non-alphanumeric character — covering
|
|
36
|
+
* localized parenthetical glosses written with ASCII or full-width
|
|
37
|
+
* punctuation (e.g. `bluf (bottom line up front)`, `bluf(結論先出し)`,
|
|
38
|
+
* `bluf — 핵심 결론`, `60-second read — what happened`).
|
|
39
|
+
*
|
|
40
|
+
* @param headingText - Normalised heading text (lower-case, decoration-stripped)
|
|
41
|
+
* @param whitelistEntry - Lower-case whitelist entry from
|
|
42
|
+
* {@link EDITORIAL_LEDE_HEADINGS}
|
|
43
|
+
* @returns `true` when `headingText` begins with `whitelistEntry` at a
|
|
44
|
+
* word boundary
|
|
45
|
+
*/
|
|
46
|
+
export declare function isLedeHeadingMatch(headingText: string, whitelistEntry: string): boolean;
|
|
47
|
+
/**
|
|
48
|
+
* Return `true` when an artefact-H1 begins with one of the
|
|
49
|
+
* `ARTIFACT_CATEGORY_PREFIXES` followed by a separator (or equals one, or ends with a separator plus one). Such H1s
|
|
50
|
+
* carry the artefact's structural label rather than a journalist's
|
|
51
|
+
* headline (e.g. `# Synthesis Summary — Week in Review (3 Apr – 1 May
|
|
52
|
+
* 2026)`) and must not leak into the article `<title>`.
|
|
53
|
+
*
|
|
54
|
+
* @param heading - Plain-text H1 (after `stripInlineMarkdown`)
|
|
55
|
+
* @returns `true` when the heading is an artefact-category label
|
|
56
|
+
*/
|
|
57
|
+
export declare function isArtifactCategoryHeading(heading: string): boolean;
|
|
58
|
+
/**
|
|
59
|
+
* Strip a leading or trailing artifact-category label from a heading and
|
|
60
|
+
* return the editorial-topic core. When neither end carries a category
|
|
61
|
+
* label, the heading is returned unchanged. When the category label is
|
|
62
|
+
* the **entire** heading (e.g. `# Executive Brief`) the result is the
|
|
63
|
+
* empty string.
|
|
64
|
+
*
|
|
65
|
+
* Examples:
|
|
66
|
+
* - `Executive Brief — EU Parliament Motions` → `EU Parliament Motions`
|
|
67
|
+
* - `EU Parliament Propositions — Executive Brief` → `EU Parliament Propositions`
|
|
68
|
+
* - `EP10 Term Outlook — Executive Brief` → `EP10 Term Outlook`
|
|
69
|
+
* - `Key Legislative Developments — Deep Analysis (2026-05-08)` → `Key Legislative Developments`
|
|
70
|
+
* - `Synthesis Summary — EP Motions & Adopted Texts` → `EP Motions & Adopted Texts`
|
|
71
|
+
*
|
|
72
|
+
* Trailing parenthesised metadata (`(2026-05-08)`, `(May 2026)`) is also
|
|
73
|
+
* stripped because it functions as a date stamp rather than editorial
|
|
74
|
+
* copy. The returned core is trimmed of whitespace and trailing
|
|
75
|
+
* punctuation.
|
|
76
|
+
*
|
|
77
|
+
* @param heading - Raw heading text (post-{@link stripInlineMarkdown})
|
|
78
|
+
* @returns Editorial-topic core, or empty string when only the category survived
|
|
79
|
+
*/
|
|
80
|
+
export declare function stripArtifactCategoryAffix(heading: string): string;
|
|
81
|
+
//# sourceMappingURL=artifact-category-heading.d.ts.map
|
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
/**
|
|
4
|
+
* @module Aggregator/Metadata/ArtifactCategoryHeading
|
|
5
|
+
* @description Artifact-category and editorial-lede heading helpers
|
|
6
|
+
* extracted from {@link ./heading-rules.ts}. Owns:
|
|
7
|
+
*
|
|
8
|
+
* - {@link EDITORIAL_LEDE_HEADINGS} — whitelist of `##` headings that
|
|
9
|
+
* carry the journalist's lede paragraph.
|
|
10
|
+
* - {@link ARTIFACT_CATEGORY_PREFIXES} — structural-label H1 prefixes
|
|
11
|
+
* that must not leak into the article `<title>`.
|
|
12
|
+
* - {@link normaliseHeadingText} / {@link isLedeHeadingMatch} — the
|
|
13
|
+
* lede whitelist matcher used by `lede-extractor.ts`.
|
|
14
|
+
* - {@link isArtifactCategoryHeading} / {@link stripArtifactCategoryAffix}
|
|
15
|
+
* — predicates used by the resolver's generic-heading classifier.
|
|
16
|
+
*
|
|
17
|
+
* Pure leaf module. Re-exported through {@link ./heading-rules.ts} for
|
|
18
|
+
* back-compat with existing call sites. Split out of `heading-rules.ts`
|
|
19
|
+
* in May 2026 to keep both files under the 600-raw-line drift-guard.
|
|
20
|
+
*/
|
|
21
|
+
import { stripInlineMarkdown } from './text-utils.js';
|
|
22
|
+
/**
|
|
23
|
+
* Headings inside an editorial artefact that carry the journalist's lede
|
|
24
|
+
* paragraph (a one-paragraph summary of "what happened, why it matters").
|
|
25
|
+
* When the resolver sees one of these as a `## …` heading inside the
|
|
26
|
+
* editorial artefact, it prefers the first prose paragraph that follows
|
|
27
|
+
* it as the description (and as a title fallback) over a generic line
|
|
28
|
+
* walk. Names are matched case-insensitively against the heading text
|
|
29
|
+
* (after stripping inline Markdown).
|
|
30
|
+
*/
|
|
31
|
+
export const EDITORIAL_LEDE_HEADINGS = [
|
|
32
|
+
'60-second read',
|
|
33
|
+
'60 second read',
|
|
34
|
+
'sixty-second read',
|
|
35
|
+
'lede',
|
|
36
|
+
'lead',
|
|
37
|
+
'tl;dr',
|
|
38
|
+
'tldr',
|
|
39
|
+
'synopsis',
|
|
40
|
+
'in brief',
|
|
41
|
+
'at a glance',
|
|
42
|
+
'bottom line',
|
|
43
|
+
'bluf',
|
|
44
|
+
'bluf — bottom line up front',
|
|
45
|
+
'bottom line up front',
|
|
46
|
+
'executive summary',
|
|
47
|
+
'executive briefing',
|
|
48
|
+
'master narrative',
|
|
49
|
+
'overview',
|
|
50
|
+
'headline judgement',
|
|
51
|
+
'headline judgment',
|
|
52
|
+
'key findings',
|
|
53
|
+
'key judgements',
|
|
54
|
+
'key judgments',
|
|
55
|
+
'situation summary',
|
|
56
|
+
'situation report',
|
|
57
|
+
'situation update',
|
|
58
|
+
// ── Editorial-brief specific headings introduced in the May-2026
|
|
59
|
+
// executive-brief style guide. These sections carry the most
|
|
60
|
+
// publishable journalism in the brief and are the user-visible
|
|
61
|
+
// source of the title / description after this refactor.
|
|
62
|
+
'reader briefing',
|
|
63
|
+
'strategic intelligence summary',
|
|
64
|
+
'strategic assessment',
|
|
65
|
+
'top-line summary',
|
|
66
|
+
'top line summary',
|
|
67
|
+
'headline intelligence',
|
|
68
|
+
'key intelligence judgment',
|
|
69
|
+
'key intelligence judgement',
|
|
70
|
+
'key intelligence judgments',
|
|
71
|
+
'key intelligence judgements',
|
|
72
|
+
'key intelligence judgements summary',
|
|
73
|
+
'key intelligence judgments summary',
|
|
74
|
+
'intelligence assessment',
|
|
75
|
+
'intelligence assessment summary',
|
|
76
|
+
'priority intelligence items',
|
|
77
|
+
'lead intelligence assessment',
|
|
78
|
+
// ── May-2026 executive-brief "FOR IMMEDIATE ACTION" pattern. Every
|
|
79
|
+
// 14-language brief in `analysis/daily/**/propositions/` opens
|
|
80
|
+
// the post-banner body with this H2 (translated per locale), and
|
|
81
|
+
// its first row is the BLUF (`**Issue:** …` / `**Fråga:** …` /
|
|
82
|
+
// `**主題:** …` / `**الموضوع:** …` …). The English header is
|
|
83
|
+
// whitelisted here so the extractor catches it directly; the 13
|
|
84
|
+
// translated equivalents fall through to the generic strong-prose
|
|
85
|
+
// walker, which now strips the localized bold label via
|
|
86
|
+
// {@link stripLeadingBoldLabel} so the same BLUF copy lands in
|
|
87
|
+
// `<meta description>` regardless of locale.
|
|
88
|
+
'for immediate action',
|
|
89
|
+
];
|
|
90
|
+
/**
|
|
91
|
+
* Artifact-category prefixes that appear inside editorial-artefact H1s as
|
|
92
|
+
* a structural label rather than an editorial headline (e.g. `# Synthesis
|
|
93
|
+
* Summary — Week in Review (3 Apr – 1 May 2026)`). When a candidate H1
|
|
94
|
+
* starts with one of these prefixes followed by a separator (em/en dash,
|
|
95
|
+
* hyphen, or colon), the resolver treats it as **generic** so it does
|
|
96
|
+
* not leak into the article `<title>`. Compared lower-case, with leading
|
|
97
|
+
* punctuation stripped.
|
|
98
|
+
*/
|
|
99
|
+
export const ARTIFACT_CATEGORY_PREFIXES = [
|
|
100
|
+
'actor mapping',
|
|
101
|
+
'analytical quality',
|
|
102
|
+
'breaking news analysis',
|
|
103
|
+
'coalition dynamics',
|
|
104
|
+
'commission wp alignment',
|
|
105
|
+
'committee activity report',
|
|
106
|
+
'cross run continuity',
|
|
107
|
+
'data availability assessment',
|
|
108
|
+
'deep analysis',
|
|
109
|
+
'economic context',
|
|
110
|
+
'executive brief',
|
|
111
|
+
'executive briefing',
|
|
112
|
+
'executive intelligence brief',
|
|
113
|
+
'executive intelligence briefing',
|
|
114
|
+
'executive summary',
|
|
115
|
+
'forward indicators',
|
|
116
|
+
'historical baseline',
|
|
117
|
+
'impact matrix',
|
|
118
|
+
'intelligence assessment',
|
|
119
|
+
'intelligence briefing',
|
|
120
|
+
'intelligence synthesis summary',
|
|
121
|
+
'legislative output analysis',
|
|
122
|
+
'legislative pipeline analysis',
|
|
123
|
+
'legislative pipeline forecast',
|
|
124
|
+
'mandate fulfilment scorecard',
|
|
125
|
+
'master intelligence synthesis',
|
|
126
|
+
'mcp reliability audit',
|
|
127
|
+
'methodology reflection',
|
|
128
|
+
'monthly outlook',
|
|
129
|
+
'motions analysis',
|
|
130
|
+
'parliamentary calendar projection',
|
|
131
|
+
'pestle analysis',
|
|
132
|
+
'political intelligence brief',
|
|
133
|
+
'political risk',
|
|
134
|
+
'political threat landscape',
|
|
135
|
+
'presidency trio context',
|
|
136
|
+
'propositions analysis',
|
|
137
|
+
'quantitative swot',
|
|
138
|
+
'risk assessment',
|
|
139
|
+
'risk matrix',
|
|
140
|
+
'risk scoring',
|
|
141
|
+
'scenario forecast',
|
|
142
|
+
'seat projection',
|
|
143
|
+
'significance classification',
|
|
144
|
+
'situation report',
|
|
145
|
+
'situation summary',
|
|
146
|
+
'stakeholder analysis',
|
|
147
|
+
'stakeholder impact',
|
|
148
|
+
'stakeholder map',
|
|
149
|
+
'swot analysis',
|
|
150
|
+
'synthesis summary',
|
|
151
|
+
'threat assessment',
|
|
152
|
+
'threat model',
|
|
153
|
+
'voting patterns',
|
|
154
|
+
'weekly outlook',
|
|
155
|
+
'wildcards blackswans',
|
|
156
|
+
];
|
|
157
|
+
/**
|
|
158
|
+
* Match a single calendar month name (English) with optional `-uary` /
|
|
159
|
+
* `-ruary` / … full-name suffix, used as a building block for the date-stamp parenthetical
|
|
160
|
+
* detector. Split out of the parent regex so the alternation never appears
|
|
161
|
+
* inside an optional / repeated subgroup (which would trigger
|
|
162
|
+
* security/detect-unsafe-regex on the wider pattern).
|
|
163
|
+
*/
|
|
164
|
+
const MONTH_NAME_SOURCE = 'Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?';
|
|
165
|
+
/**
|
|
166
|
+
* Single-date stamp inside a trailing parenthetical — ISO date,
|
|
167
|
+
* `<day> <Month> [<year>]`, `<Month> <year>`, or `Week of <ISO>`.
|
|
168
|
+
* Each alternative is a fixed-shape literal sequence so the resulting
|
|
169
|
+
* pattern carries no nested optional/repeated alternation.
|
|
170
|
+
*/
|
|
171
|
+
const TRAILING_DATE_PAREN_RE = new RegExp('\\s*\\(\\s*(?:' +
|
|
172
|
+
[
|
|
173
|
+
'\\d{4}-\\d{2}-\\d{2}',
|
|
174
|
+
`\\d{1,2}\\s+(?:${MONTH_NAME_SOURCE})\\s+\\d{4}`,
|
|
175
|
+
`\\d{1,2}\\s+(?:${MONTH_NAME_SOURCE})`,
|
|
176
|
+
`(?:${MONTH_NAME_SOURCE})\\s+\\d{4}`,
|
|
177
|
+
'Week\\s+of\\s+\\d{4}-\\d{2}-\\d{2}',
|
|
178
|
+
].join('|') +
|
|
179
|
+
')\\s*\\)\\s*$', 'iu');
|
|
180
|
+
/**
|
|
181
|
+
* Module-private helper (not exported) used by {@link stripArtifactCategoryAffix} to strip a trailing
|
|
182
|
+
* single-date parenthetical from an artefact-category core, retaining
|
|
183
|
+
* substantive parentheticals such as `(May 2026 – May 2027)` or
|
|
184
|
+
* `(2024-2029 Mandate, Mid-Term Review)`. Returns the empty string when
|
|
185
|
+
* the cleaned core falls below the 5-character editorial floor.
|
|
186
|
+
*
|
|
187
|
+
* @param core - Heading with the category label already stripped
|
|
188
|
+
* @returns Cleaned editorial-topic core, or empty string when too short
|
|
189
|
+
*/
|
|
190
|
+
function cleanupAffixCore(core) {
|
|
191
|
+
// Only strip parenthetical content that is a pure date stamp
|
|
192
|
+
// (e.g. `(2026-05-08)`, `(May 2026)`, `(8 May)`). Substantive
|
|
193
|
+
// parentheticals such as `(May 2026 – May 2027)`, `(2024-2029
|
|
194
|
+
// Mandate, Mid-Term Review)`, or `(2026 → 2031)` carry editorial
|
|
195
|
+
// context and stay in the title.
|
|
196
|
+
const withoutDateParen = core.replace(TRAILING_DATE_PAREN_RE, '').trim();
|
|
197
|
+
const withoutTrailingPunct = withoutDateParen.replace(/[—–:;,.\s-]+$/u, '').trim();
|
|
198
|
+
if (withoutTrailingPunct.length < 5)
|
|
199
|
+
return '';
|
|
200
|
+
return withoutTrailingPunct;
|
|
201
|
+
}
|
|
202
|
+
/**
|
|
203
|
+
* Lower-case, decoration-stripped form used by the artifact-category
|
|
204
|
+
* matchers. Strips inline Markdown, leading non-alphanumeric runs (emoji,
|
|
205
|
+
* decoration), and collapses whitespace to a single space.
|
|
206
|
+
*
|
|
207
|
+
* @param raw - Raw heading text
|
|
208
|
+
* @returns Lower-case normalised form
|
|
209
|
+
*/
|
|
210
|
+
function normaliseCategoryHeading(raw) {
|
|
211
|
+
return stripInlineMarkdown(raw)
|
|
212
|
+
.trim()
|
|
213
|
+
.toLowerCase()
|
|
214
|
+
.replace(/^[^a-z0-9]+/, '')
|
|
215
|
+
.replace(/\s+/g, ' ');
|
|
216
|
+
}
|
|
217
|
+
/**
|
|
218
|
+
* Normalise a Markdown heading's text for comparison against the
|
|
219
|
+
* editorial-lede heading whitelist. Strips inline Markdown decorations
|
|
220
|
+
* (`*`, `_`, `` ` ``, `#`), then strips any leading non-`[A-Za-z0-9]`
|
|
221
|
+
* characters (emoji, punctuation, spaces) so a heading like
|
|
222
|
+
* `🎯 Headline Judgement` compares equal to `headline judgement`.
|
|
223
|
+
*
|
|
224
|
+
* @param raw - Raw heading text (no leading hashes)
|
|
225
|
+
* @returns Lower-cased, decoration-stripped heading text
|
|
226
|
+
*/
|
|
227
|
+
export function normaliseHeadingText(raw) {
|
|
228
|
+
return stripInlineMarkdown(raw)
|
|
229
|
+
.replace(/[*_`#]+/g, '')
|
|
230
|
+
.replace(/^[^A-Za-z0-9]+/, '')
|
|
231
|
+
.trim()
|
|
232
|
+
.toLowerCase();
|
|
233
|
+
}
|
|
234
|
+
/**
|
|
235
|
+
* Word-boundary match against an editorial-lede whitelist entry. Matches
|
|
236
|
+
* when the normalised heading equals the whitelist entry exactly, or when
|
|
237
|
+
* the entry is followed by any non-alphanumeric character — covering
|
|
238
|
+
* localized parenthetical glosses written with ASCII or full-width
|
|
239
|
+
* punctuation (e.g. `bluf (bottom line up front)`, `bluf(結論先出し)`,
|
|
240
|
+
* `bluf — 핵심 결론`, `60-second read — what happened`).
|
|
241
|
+
*
|
|
242
|
+
* @param headingText - Normalised heading text (lower-case, decoration-stripped)
|
|
243
|
+
* @param whitelistEntry - Lower-case whitelist entry from
|
|
244
|
+
* {@link EDITORIAL_LEDE_HEADINGS}
|
|
245
|
+
* @returns `true` when `headingText` begins with `whitelistEntry` at a
|
|
246
|
+
* word boundary
|
|
247
|
+
*/
|
|
248
|
+
export function isLedeHeadingMatch(headingText, whitelistEntry) {
|
|
249
|
+
if (headingText === whitelistEntry)
|
|
250
|
+
return true;
|
|
251
|
+
if (!headingText.startsWith(whitelistEntry))
|
|
252
|
+
return false;
|
|
253
|
+
const next = headingText.charAt(whitelistEntry.length);
|
|
254
|
+
// Word boundary — anything that is not an ASCII letter/digit is a
|
|
255
|
+
// separator we accept. This works uniformly across ASCII parentheses,
|
|
256
|
+
// CJK full-width brackets `(`, dashes `— – -`, colons `:`, and the
|
|
257
|
+
// ideographic full-width colon `:`.
|
|
258
|
+
return next === '' || !/[a-z0-9]/.test(next);
|
|
259
|
+
}
|
|
260
|
+
/**
|
|
261
|
+
* Return `true` when an artefact-H1 begins with one of the
|
|
262
|
+
* `ARTIFACT_CATEGORY_PREFIXES` followed by a separator (or equals one, or ends with a separator plus one). Such H1s
|
|
263
|
+
* carry the artefact's structural label rather than a journalist's
|
|
264
|
+
* headline (e.g. `# Synthesis Summary — Week in Review (3 Apr – 1 May
|
|
265
|
+
* 2026)`) and must not leak into the article `<title>`.
|
|
266
|
+
*
|
|
267
|
+
* @param heading - Plain-text H1 (after `stripInlineMarkdown`)
|
|
268
|
+
* @returns `true` when the heading is an artefact-category label
|
|
269
|
+
*/
|
|
270
|
+
export function isArtifactCategoryHeading(heading) {
|
|
271
|
+
const normalized = normaliseCategoryHeading(heading);
|
|
272
|
+
if (normalized === '')
|
|
273
|
+
return false;
|
|
274
|
+
for (const prefix of ARTIFACT_CATEGORY_PREFIXES) {
|
|
275
|
+
if (normalized === prefix)
|
|
276
|
+
return true;
|
|
277
|
+
if (normalized.startsWith(`${prefix} —`) ||
|
|
278
|
+
normalized.startsWith(`${prefix} –`) ||
|
|
279
|
+
normalized.startsWith(`${prefix} -`) ||
|
|
280
|
+
normalized.startsWith(`${prefix}:`)) {
|
|
281
|
+
return true;
|
|
282
|
+
}
|
|
283
|
+
if (normalized.endsWith(` — ${prefix}`) ||
|
|
284
|
+
normalized.endsWith(` – ${prefix}`) ||
|
|
285
|
+
normalized.endsWith(` - ${prefix}`) ||
|
|
286
|
+
normalized.endsWith(`: ${prefix}`)) {
|
|
287
|
+
return true;
|
|
288
|
+
}
|
|
289
|
+
}
|
|
290
|
+
return false;
|
|
291
|
+
}
|
|
292
|
+
/**
|
|
293
|
+
* Strip a leading or trailing artifact-category label from a heading and
|
|
294
|
+
* return the editorial-topic core. When neither end carries a category
|
|
295
|
+
* label, the heading is returned unchanged. When the category label is
|
|
296
|
+
* the **entire** heading (e.g. `# Executive Brief`) the result is the
|
|
297
|
+
* empty string.
|
|
298
|
+
*
|
|
299
|
+
* Examples:
|
|
300
|
+
* - `Executive Brief — EU Parliament Motions` → `EU Parliament Motions`
|
|
301
|
+
* - `EU Parliament Propositions — Executive Brief` → `EU Parliament Propositions`
|
|
302
|
+
* - `EP10 Term Outlook — Executive Brief` → `EP10 Term Outlook`
|
|
303
|
+
* - `Key Legislative Developments — Deep Analysis (2026-05-08)` → `Key Legislative Developments`
|
|
304
|
+
* - `Synthesis Summary — EP Motions & Adopted Texts` → `EP Motions & Adopted Texts`
|
|
305
|
+
*
|
|
306
|
+
* Trailing parenthesised metadata (`(2026-05-08)`, `(May 2026)`) is also
|
|
307
|
+
* stripped because it functions as a date stamp rather than editorial
|
|
308
|
+
* copy. The returned core is trimmed of whitespace and trailing
|
|
309
|
+
* punctuation.
|
|
310
|
+
*
|
|
311
|
+
* @param heading - Raw heading text (post-{@link stripInlineMarkdown})
|
|
312
|
+
* @returns Editorial-topic core, or empty string when only the category survived
|
|
313
|
+
*/
|
|
314
|
+
export function stripArtifactCategoryAffix(heading) {
|
|
315
|
+
const trimmed = heading.trim();
|
|
316
|
+
if (trimmed === '')
|
|
317
|
+
return '';
|
|
318
|
+
const sortedPrefixes = [...ARTIFACT_CATEGORY_PREFIXES].sort((a, b) => b.length - a.length);
|
|
319
|
+
const normalized = normaliseCategoryHeading(trimmed);
|
|
320
|
+
const skip = trimmed.length - normalized.length;
|
|
321
|
+
const visible = trimmed.slice(skip < 0 ? 0 : skip);
|
|
322
|
+
// For trailing-prefix detection (e.g. `Topic — Deep Analysis (date)`),
|
|
323
|
+
// we strip ANY trailing parenthetical (up to 80 characters) because both the prefix and its
|
|
324
|
+
// date stamp are noise to remove. For leading-prefix detection (e.g.
|
|
325
|
+
// `Executive Brief — Year Ahead (May 2026 – May 2027)`), we keep the
|
|
326
|
+
// trailing parenthetical so substantive context survives into
|
|
327
|
+
// `cleanupAffixCore`, which only strips pure date stamps.
|
|
328
|
+
const visibleParenStripped = visible.replace(/\s*\([^)]{1,80}\)\s*$/u, '').trim();
|
|
329
|
+
const normalizedVisible = normaliseCategoryHeading(visible);
|
|
330
|
+
const normalizedParenStripped = normaliseCategoryHeading(visibleParenStripped);
|
|
331
|
+
for (const prefix of sortedPrefixes) {
|
|
332
|
+
for (const sep of [' — ', ' – ', ' - ', ': ']) {
|
|
333
|
+
const candidate = `${prefix}${sep}`;
|
|
334
|
+
if (normalizedVisible.startsWith(candidate)) {
|
|
335
|
+
const core = visible.slice(candidate.length).trim();
|
|
336
|
+
return cleanupAffixCore(core);
|
|
337
|
+
}
|
|
338
|
+
}
|
|
339
|
+
for (const sep of [' — ', ' – ', ' - ', ': ']) {
|
|
340
|
+
const candidate = `${sep}${prefix}`;
|
|
341
|
+
if (normalizedParenStripped.endsWith(candidate)) {
|
|
342
|
+
const core = visibleParenStripped
|
|
343
|
+
.slice(0, visibleParenStripped.length - candidate.length)
|
|
344
|
+
.trim();
|
|
345
|
+
return cleanupAffixCore(core);
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
if (normalizedParenStripped === prefix)
|
|
349
|
+
return '';
|
|
350
|
+
}
|
|
351
|
+
return trimmed;
|
|
352
|
+
}
|
|
353
|
+
//# sourceMappingURL=artifact-category-heading.js.map
|
|
@@ -17,7 +17,7 @@ import fs from 'fs';
|
|
|
17
17
|
import path from 'path';
|
|
18
18
|
import { extractFirstH1 } from './h1-extractor.js';
|
|
19
19
|
import { extractLedeAfterHeading, extractStrongProseLine } from './lede-extractor.js';
|
|
20
|
-
import { isGenericHeading, stripArtifactCategoryAffix } from './heading-rules.js';
|
|
20
|
+
import { isGenericHeading, isArtifactCategoryHeading, stripArtifactCategoryAffix, } from './heading-rules.js';
|
|
21
21
|
import { truncateTitle } from './text-utils.js';
|
|
22
22
|
import { extractPriorityFindingHighlight } from './priority-finding-highlight.js';
|
|
23
23
|
/** Ordered list of artefact filenames that typically carry the editorial H1. */
|
|
@@ -121,18 +121,37 @@ function probeCandidateForHighlight(runDir, rel, articleType, date) {
|
|
|
121
121
|
if (headline && !isGenericHeading(headline, articleType, date)) {
|
|
122
122
|
return { cleanHighlight: { headline: truncateTitle(headline), summary } };
|
|
123
123
|
}
|
|
124
|
-
// The artefact H1 is generic
|
|
125
|
-
//
|
|
126
|
-
//
|
|
127
|
-
//
|
|
128
|
-
//
|
|
129
|
-
//
|
|
130
|
-
//
|
|
131
|
-
//
|
|
124
|
+
// The artefact H1 is classified generic by the boilerplate matcher
|
|
125
|
+
// (`Executive Brief — EU Parliament Motions | 28 April – 5 May 2026`
|
|
126
|
+
// matches because it starts with the `Executive Brief —` affix). Before
|
|
127
|
+
// falling through to deeper inference, try the *stripped-affix* form
|
|
128
|
+
// FIRST — when authors hand-craft a brief H1 with date / session
|
|
129
|
+
// context (e.g. `… EU Parliament Motions | 28 April – 5 May 2026`,
|
|
130
|
+
// `… EP Committee Reports · Week of 2026-05-14–21`,
|
|
131
|
+
// `… Year Ahead — May 2026–May 2027`), the stripped tail is the
|
|
132
|
+
// canonical editorial title and must win over priority-finding
|
|
133
|
+
// inference. This fixes title-leaks where the priority-finding
|
|
134
|
+
// extractor would otherwise surface a bold-prose section label such
|
|
135
|
+
// as `Strategic significance`, `Event description`, `Threat Level`.
|
|
136
|
+
if (headline) {
|
|
137
|
+
const stripped = stripArtifactCategoryAffix(headline);
|
|
138
|
+
if (stripped && stripped !== headline && !isGenericHeading(stripped, articleType, date)) {
|
|
139
|
+
return { cleanHighlight: { headline: truncateTitle(stripped), summary } };
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
// Only when the brief H1 is both generic AND its stripped form is
|
|
143
|
+
// still generic (e.g. bare `Executive Brief — EU Parliament
|
|
144
|
+
// Propositions` with no date) do we attempt to surface the FIRST
|
|
145
|
+
// NAMED PRIORITY FINDING from the brief's `## Key Developments` /
|
|
146
|
+
// `## Priority Dossiers` / `## Top Findings` block. This is the
|
|
147
|
+
// canonical Stage-B authoring pattern (see
|
|
148
|
+
// `analysis/templates/executive-brief.md`) — every brief lists its
|
|
149
|
+
// top dossiers as `**Name** (procedure-code, date) — paragraph` or
|
|
150
|
+
// `### N. Name (committee)`. Surfacing that name produces a
|
|
132
151
|
// distinctive editorial headline ("Digital Markets Act Enforcement",
|
|
133
152
|
// "Ukraine War Accountability") instead of a stripped category noun.
|
|
134
153
|
const priority = extractPriorityFindingHighlight(body);
|
|
135
|
-
if (priority?.headline) {
|
|
154
|
+
if (priority?.headline && !isArtifactCategoryHeading(priority.headline)) {
|
|
136
155
|
return {
|
|
137
156
|
cleanHighlight: {
|
|
138
157
|
headline: truncateTitle(priority.headline),
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Read the first existing English brief artefact under `runDir` and
|
|
3
|
+
* return its SPDX-stripped body. Returns the empty string when none of
|
|
4
|
+
* the candidate artefacts exists or the run directory is missing —
|
|
5
|
+
* callers should treat the empty string as "no brief content
|
|
6
|
+
* available" and fall back to their existing extraction ladder.
|
|
7
|
+
*
|
|
8
|
+
* @param runDir - Absolute run directory, or empty string when unavailable
|
|
9
|
+
* @returns Brief body text with SPDX preamble removed
|
|
10
|
+
*/
|
|
11
|
+
export declare function readEnglishBriefBody(runDir: string): string;
|
|
12
|
+
//# sourceMappingURL=brief-body.d.ts.map
|