euparliamentmonitor 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/package.json +6 -2
  2. package/scripts/aggregator/article-metadata.js +69 -14
  3. package/scripts/aggregator/editorial-brief-resolver.js +23 -0
  4. package/scripts/aggregator/html/headline.d.ts +41 -9
  5. package/scripts/aggregator/html/headline.js +69 -10
  6. package/scripts/aggregator/html/shell.js +73 -17
  7. package/scripts/aggregator/manifest/index.d.ts +1 -1
  8. package/scripts/aggregator/manifest/index.js +1 -1
  9. package/scripts/aggregator/manifest/resolver.d.ts +28 -1
  10. package/scripts/aggregator/manifest/resolver.js +61 -5
  11. package/scripts/aggregator/markdown-renderer.js +11 -0
  12. package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
  13. package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
  14. package/scripts/aggregator/metadata/artifact-walker.js +29 -10
  15. package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
  16. package/scripts/aggregator/metadata/brief-body.js +69 -0
  17. package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
  18. package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
  19. package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
  20. package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
  21. package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
  22. package/scripts/aggregator/metadata/heading-rules.js +78 -269
  23. package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
  24. package/scripts/aggregator/metadata/keyword-filters.js +156 -0
  25. package/scripts/aggregator/metadata/lede-extractor.js +11 -2
  26. package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
  27. package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
  28. package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
  29. package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
  30. package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
  31. package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
  32. package/scripts/aggregator/metadata/seo-budgets.js +202 -0
  33. package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
  34. package/scripts/aggregator/metadata/text-truncate.js +277 -0
  35. package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
  36. package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
  37. package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
  38. package/scripts/aggregator/metadata/text-utils.js +119 -439
  39. package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
  40. package/scripts/aggregator/metadata/title-rejection.js +179 -0
  41. package/scripts/copy-vendor.js +84 -112
  42. package/scripts/dump-article-seo.js +640 -0
  43. package/scripts/fix-mermaid-diagrams.js +931 -0
  44. package/scripts/generators/news-indexes/backfill.d.ts +6 -1
  45. package/scripts/generators/news-indexes/backfill.js +71 -4
  46. package/scripts/validate-article-seo.js +534 -0
  47. package/scripts/validate-mermaid-diagrams.js +306 -0
@@ -2,6 +2,59 @@
2
2
  // SPDX-License-Identifier: Apache-2.0
3
3
  /** Sentinel used when no schema variant supplies a usable article type. */
4
4
  export const UNKNOWN_ARTICLE_TYPE = 'unknown';
5
+ /**
6
+ * Canonical article-type slugs published by the EU Parliament Monitor
7
+ * aggregator. Used by {@link stripRunSuffix} to reject any normalisation
8
+ * that would yield a non-canonical leading token.
9
+ */
10
+ const CANONICAL_ARTICLE_TYPES = new Set([
11
+ 'breaking',
12
+ 'committee-reports',
13
+ 'motions',
14
+ 'propositions',
15
+ 'week-ahead',
16
+ 'week-in-review',
17
+ 'month-ahead',
18
+ 'month-in-review',
19
+ 'quarter-in-review',
20
+ 'year-ahead',
21
+ 'year-in-review',
22
+ 'term-outlook',
23
+ 'election-cycle',
24
+ ]);
25
+ /**
26
+ * Pattern matching trailing `-run<N>` taxonomy noise that historic
27
+ * Stage-B writers occasionally encode into `articleType` (e.g.
28
+ * `committee-reports-run47`, `motions-run41`, `breaking-run193`). Also
29
+ * tolerates the legacy double-prefixed `motions-runmotions-run-1777010709`
30
+ * pattern observed in 2025 manifests where the writer concatenated the
31
+ * articleType and runId. The match begins at the first `-run` and the
32
+ * `[a-zA-Z0-9-]*` class absorbs everything up to the trailing digits, so both single-suffix and double-prefixed forms are caught.
33
+ *
34
+ * Exported for unit tests.
35
+ */
36
+ export const RUN_SUFFIX_PATTERN = /-run[a-zA-Z0-9-]*\d+$/u;
37
+ /**
38
+ * Strip a trailing `-run<N>` taxonomy-noise suffix from an article-type
39
+ * slug, but only when doing so yields a {@link CANONICAL_ARTICLE_TYPES}
40
+ * token. This is conservative: a non-canonical leading token (e.g.
41
+ * `custom-type-run5`) is returned untouched so we never silently
42
+ * collapse a genuinely new article type into something it isn't.
43
+ *
44
+ * @param slug - Raw article-type slug from a manifest field
45
+ * @returns Canonical slug when the suffix was successfully stripped,
46
+ * otherwise the original input
47
+ */
48
+ export function stripRunSuffix(slug) {
49
+ if (!slug || !RUN_SUFFIX_PATTERN.test(slug)) {
50
+ return slug;
51
+ }
52
+ const stripped = slug.replace(RUN_SUFFIX_PATTERN, '');
53
+ if (CANONICAL_ARTICLE_TYPES.has(stripped)) {
54
+ return stripped;
55
+ }
56
+ return slug;
57
+ }
5
58
  /**
6
59
  * Resolve the article-type slug from a manifest, tolerating historic schemas.
7
60
  *
@@ -11,24 +64,27 @@ export const UNKNOWN_ARTICLE_TYPE = 'unknown';
11
64
  * 3. `articleTypes[0]` — pre-aggregator-pipeline plural array
12
65
  * 4. `runType` — historic field on older breaking-run manifests
13
66
  *
14
- * Falls back to `'unknown'` when none of the above is a non-empty string.
67
+ * Each candidate is passed through {@link stripRunSuffix} so trailing
68
+ * `-run<N>` taxonomy noise never leaks into JSON-LD `articleSection`,
69
+ * the filesystem slug, or the SEO dump's article-type histogram. Falls
70
+ * back to `'unknown'` when none of the above is a non-empty string.
15
71
  *
16
72
  * @param manifest - Parsed manifest (any of the supported schemas)
17
73
  * @returns Article-type slug usable as a filename component
18
74
  */
19
75
  export function resolveArticleType(manifest) {
20
76
  if (typeof manifest.articleType === 'string' && manifest.articleType) {
21
- return manifest.articleType;
77
+ return stripRunSuffix(manifest.articleType);
22
78
  }
23
79
  if (typeof manifest.articleTypeSlug === 'string' && manifest.articleTypeSlug) {
24
- return manifest.articleTypeSlug;
80
+ return stripRunSuffix(manifest.articleTypeSlug);
25
81
  }
26
82
  const first = manifest.articleTypes?.[0];
27
83
  if (typeof first === 'string' && first) {
28
- return first;
84
+ return stripRunSuffix(first);
29
85
  }
30
86
  if (typeof manifest.runType === 'string' && manifest.runType) {
31
- return manifest.runType;
87
+ return stripRunSuffix(manifest.runType);
32
88
  }
33
89
  return UNKNOWN_ARTICLE_TYPE;
34
90
  }
@@ -159,6 +159,17 @@ function quoteMermaidLabel(raw) {
159
159
  function rewriteQuadrantChartLine(line) {
160
160
  let m = line.match(/^(\s*(?:x-axis|y-axis)\s+)(.+?)\s*-{2}>\s*(.+?)\s*$/);
161
161
  if (m) {
162
+ // If the line already has a quoted label followed by a numeric
163
+ // axis-start (e.g. `x-axis "Probability" 0 --> 100`), leave it
164
+ // alone — re-quoting would swallow the numeric token into the
165
+ // label string and produce a broken `x-axis "\"…\" 0" --> "100"`.
166
+ const lhs = (m[2] ?? '').trim();
167
+ const rhs = (m[3] ?? '').trim();
168
+ const lhsHasQuotedLabel = /^"[^"]*"\s+\S/.test(lhs);
169
+ const rhsIsBareNumber = /^-?\d+$/.test(rhs) || /^-?\d+\.\d+$/.test(rhs);
170
+ if (lhsHasQuotedLabel && rhsIsBareNumber) {
171
+ return line;
172
+ }
162
173
  return `${m[1]}${quoteMermaidLabel(m[2] ?? '')} --> ${quoteMermaidLabel(m[3] ?? '')}`;
163
174
  }
164
175
  m = line.match(/^(\s*(?:x-axis|y-axis)\s+)(.+?)\s*$/);
@@ -0,0 +1,81 @@
1
+ /**
2
+ * Headings inside an editorial artefact that carry the journalist's lede
3
+ * paragraph (a one-paragraph summary of "what happened, why it matters").
4
+ * When the resolver sees one of these as a `## …` heading inside the
5
+ * editorial artefact, it prefers the first prose paragraph that follows
6
+ * it as the description (and as a title fallback) over a generic line
7
+ * walk. Names are matched case-insensitively against the heading text
8
+ * (after stripping inline Markdown).
9
+ */
10
+ export declare const EDITORIAL_LEDE_HEADINGS: readonly string[];
11
+ /**
12
+ * Artifact-category prefixes that appear inside editorial-artefact H1s as
13
+ * a structural label rather than an editorial headline (e.g. `# Synthesis
14
+ * Summary — Week in Review (3 Apr – 1 May 2026)`). When a candidate H1
15
+ * starts with one of these prefixes followed by a separator (em/en dash,
16
+ * hyphen, or colon), the resolver treats it as **generic** so it does
17
+ * not leak into the article `<title>`. Compared lower-case, with leading
18
+ * punctuation stripped.
19
+ */
20
+ export declare const ARTIFACT_CATEGORY_PREFIXES: readonly string[];
21
+ /**
22
+ * Normalise a Markdown heading's text for comparison against the
23
+ * editorial-lede heading whitelist. Strips inline Markdown decorations
24
+ * (`*`, `_`, `` ` ``, `#`), then strips any leading non-alphanumeric
25
+ * characters (emoji, punctuation, spaces) so a heading like
26
+ * `🎯 Headline Judgement` compares equal to `headline judgement`.
27
+ *
28
+ * @param raw - Raw heading text (no leading hashes)
29
+ * @returns Lower-cased, decoration-stripped heading text
30
+ */
31
+ export declare function normaliseHeadingText(raw: string): string;
32
+ /**
33
+ * Word-boundary match against an editorial-lede whitelist entry. Matches
34
+ * when the normalised heading equals the whitelist entry exactly, or when
35
+ * the entry is followed by any non-alphanumeric character — covering
36
+ * localized parenthetical glosses written with ASCII or full-width
37
+ * punctuation (e.g. `bluf (bottom line up front)`, `bluf（結論先出し）`,
38
+ * `bluf — 핵심 결론`, `60-second read — what happened`).
39
+ *
40
+ * @param headingText - Normalised heading text (lower-case, decoration-stripped)
41
+ * @param whitelistEntry - Lower-case whitelist entry from
42
+ * {@link EDITORIAL_LEDE_HEADINGS}
43
+ * @returns `true` when `headingText` begins with `whitelistEntry` at a
44
+ * word boundary
45
+ */
46
+ export declare function isLedeHeadingMatch(headingText: string, whitelistEntry: string): boolean;
47
+ /**
48
+ * Return `true` when an artefact-H1 equals one of the
49
+ * `ARTIFACT_CATEGORY_PREFIXES`, or begins or ends with one set off by a separator. Such H1s
50
+ * carry the artefact's structural label rather than a journalist's
51
+ * headline (e.g. `# Synthesis Summary — Week in Review (3 Apr – 1 May
52
+ * 2026)`) and must not leak into the article `<title>`.
53
+ *
54
+ * @param heading - Plain-text H1 (after `stripInlineMarkdown`)
55
+ * @returns `true` when the heading is an artefact-category label
56
+ */
57
+ export declare function isArtifactCategoryHeading(heading: string): boolean;
58
+ /**
59
+ * Strip a leading or trailing artifact-category label from a heading and
60
+ * return the editorial-topic core. When neither end carries a category
61
+ * label, the heading is returned unchanged. When the category label is
62
+ * the **entire** heading (e.g. `# Executive Brief`) the result is the
63
+ * empty string.
64
+ *
65
+ * Examples:
66
+ * - `Executive Brief — EU Parliament Motions` → `EU Parliament Motions`
67
+ * - `EU Parliament Propositions — Executive Brief` → `EU Parliament Propositions`
68
+ * - `EP10 Term Outlook — Executive Brief` → `EP10 Term Outlook`
69
+ * - `Key Legislative Developments — Deep Analysis (2026-05-08)` → `Key Legislative Developments`
70
+ * - `Synthesis Summary — EP Motions & Adopted Texts` → `EP Motions & Adopted Texts`
71
+ *
72
+ * Trailing parenthesised metadata (`(2026-05-08)`, `(May 2026)`) is also
73
+ * stripped because it functions as a date stamp rather than editorial
74
+ * copy. The returned core is trimmed of whitespace and trailing
75
+ * punctuation.
76
+ *
77
+ * @param heading - Raw heading text (post-{@link stripInlineMarkdown})
78
+ * @returns Editorial-topic core, or empty string when only the category survived
79
+ */
80
+ export declare function stripArtifactCategoryAffix(heading: string): string;
81
+ //# sourceMappingURL=artifact-category-heading.d.ts.map
@@ -0,0 +1,353 @@
1
+ // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ /**
4
+ * @module Aggregator/Metadata/ArtifactCategoryHeading
5
+ * @description Artifact-category and editorial-lede heading helpers
6
+ * extracted from {@link ./heading-rules.ts}. Owns:
7
+ *
8
+ * - {@link EDITORIAL_LEDE_HEADINGS} — whitelist of `##` headings that
9
+ * carry the journalist's lede paragraph.
10
+ * - {@link ARTIFACT_CATEGORY_PREFIXES} — structural-label H1 prefixes
11
+ * that must not leak into the article `<title>`.
12
+ * - {@link normaliseHeadingText} / {@link isLedeHeadingMatch} — the
13
+ * lede whitelist matcher used by `lede-extractor.ts`.
14
+ * - {@link isArtifactCategoryHeading} / {@link stripArtifactCategoryAffix}
15
+ * — predicates used by the resolver's generic-heading classifier.
16
+ *
17
+ * Pure leaf module. Re-exported through {@link ./heading-rules.ts} for
18
+ * back-compat with existing call sites. Split out of `heading-rules.ts`
19
+ * in May 2026 to keep both files under the 600-raw-line drift-guard.
20
+ */
21
+ import { stripInlineMarkdown } from './text-utils.js';
22
+ /**
23
+ * Headings inside an editorial artefact that carry the journalist's lede
24
+ * paragraph (a one-paragraph summary of "what happened, why it matters").
25
+ * When the resolver sees one of these as a `## …` heading inside the
26
+ * editorial artefact, it prefers the first prose paragraph that follows
27
+ * it as the description (and as a title fallback) over a generic line
28
+ * walk. Names are matched case-insensitively against the heading text
29
+ * (after stripping inline Markdown).
30
+ */
31
+ export const EDITORIAL_LEDE_HEADINGS = [
32
+ '60-second read',
33
+ '60 second read',
34
+ 'sixty-second read',
35
+ 'lede',
36
+ 'lead',
37
+ 'tl;dr',
38
+ 'tldr',
39
+ 'synopsis',
40
+ 'in brief',
41
+ 'at a glance',
42
+ 'bottom line',
43
+ 'bluf',
44
+ 'bluf — bottom line up front',
45
+ 'bottom line up front',
46
+ 'executive summary',
47
+ 'executive briefing',
48
+ 'master narrative',
49
+ 'overview',
50
+ 'headline judgement',
51
+ 'headline judgment',
52
+ 'key findings',
53
+ 'key judgements',
54
+ 'key judgments',
55
+ 'situation summary',
56
+ 'situation report',
57
+ 'situation update',
58
+ // ── Editorial-brief specific headings introduced in the May-2026
59
+ // executive-brief style guide. These sections carry the most
60
+ // publishable journalism in the brief and are the user-visible
61
+ // source of the title / description after this refactor.
62
+ 'reader briefing',
63
+ 'strategic intelligence summary',
64
+ 'strategic assessment',
65
+ 'top-line summary',
66
+ 'top line summary',
67
+ 'headline intelligence',
68
+ 'key intelligence judgment',
69
+ 'key intelligence judgement',
70
+ 'key intelligence judgments',
71
+ 'key intelligence judgements',
72
+ 'key intelligence judgements summary',
73
+ 'key intelligence judgments summary',
74
+ 'intelligence assessment',
75
+ 'intelligence assessment summary',
76
+ 'priority intelligence items',
77
+ 'lead intelligence assessment',
78
+ // ── May-2026 executive-brief "FOR IMMEDIATE ACTION" pattern. Every
79
+ // 14-language brief in `analysis/daily/**/propositions/` opens
80
+ // the post-banner body with this H2 (translated per locale), and
81
+ // its first row is the BLUF (`**Issue:** …` / `**Fråga:** …` /
82
+ // `**主題:** …` / `**الموضوع:** …` …). The English header is
83
+ // whitelisted here so the extractor catches it directly; the 13
84
+ // translated equivalents fall through to the generic strong-prose
85
+ // walker, which now strips the localized bold label via
86
+ // {@link stripLeadingBoldLabel} so the same BLUF copy lands in
87
+ // `<meta description>` regardless of locale.
88
+ 'for immediate action',
89
+ ];
90
+ /**
91
+ * Artifact-category prefixes that appear inside editorial-artefact H1s as
92
+ * a structural label rather than an editorial headline (e.g. `# Synthesis
93
+ * Summary — Week in Review (3 Apr – 1 May 2026)`). When a candidate H1
94
+ * starts with one of these prefixes followed by a separator (em/en dash,
95
+ * hyphen, or colon), the resolver treats it as **generic** so it does
96
+ * not leak into the article `<title>`. Compared lower-case, with leading
97
+ * punctuation stripped.
98
+ */
99
+ export const ARTIFACT_CATEGORY_PREFIXES = [
100
+ 'actor mapping',
101
+ 'analytical quality',
102
+ 'breaking news analysis',
103
+ 'coalition dynamics',
104
+ 'commission wp alignment',
105
+ 'committee activity report',
106
+ 'cross run continuity',
107
+ 'data availability assessment',
108
+ 'deep analysis',
109
+ 'economic context',
110
+ 'executive brief',
111
+ 'executive briefing',
112
+ 'executive intelligence brief',
113
+ 'executive intelligence briefing',
114
+ 'executive summary',
115
+ 'forward indicators',
116
+ 'historical baseline',
117
+ 'impact matrix',
118
+ 'intelligence assessment',
119
+ 'intelligence briefing',
120
+ 'intelligence synthesis summary',
121
+ 'legislative output analysis',
122
+ 'legislative pipeline analysis',
123
+ 'legislative pipeline forecast',
124
+ 'mandate fulfilment scorecard',
125
+ 'master intelligence synthesis',
126
+ 'mcp reliability audit',
127
+ 'methodology reflection',
128
+ 'monthly outlook',
129
+ 'motions analysis',
130
+ 'parliamentary calendar projection',
131
+ 'pestle analysis',
132
+ 'political intelligence brief',
133
+ 'political risk',
134
+ 'political threat landscape',
135
+ 'presidency trio context',
136
+ 'propositions analysis',
137
+ 'quantitative swot',
138
+ 'risk assessment',
139
+ 'risk matrix',
140
+ 'risk scoring',
141
+ 'scenario forecast',
142
+ 'seat projection',
143
+ 'significance classification',
144
+ 'situation report',
145
+ 'situation summary',
146
+ 'stakeholder analysis',
147
+ 'stakeholder impact',
148
+ 'stakeholder map',
149
+ 'swot analysis',
150
+ 'synthesis summary',
151
+ 'threat assessment',
152
+ 'threat model',
153
+ 'voting patterns',
154
+ 'weekly outlook',
155
+ 'wildcards blackswans',
156
+ ];
157
+ /**
158
+ * Match a single calendar month name (English) in either its three-letter
159
+ * abbreviated or its full form, used as a building block for the date-stamp parenthetical
160
+ * detector. Split out of the parent regex so the alternation never appears
161
+ * inside an optional / repeated subgroup (which would trigger
162
+ * security/detect-unsafe-regex on the wider pattern).
163
+ */
164
+ const MONTH_NAME_SOURCE = 'Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?';
165
+ /**
166
+ * Single-date stamp inside a trailing parenthetical — ISO date,
167
+ * `<day> <Month> [<year>]`, `<Month> <year>`, or `Week of <ISO>`.
168
+ * Each alternative is a fixed-shape literal sequence so the resulting
169
+ * pattern carries no nested optional/repeated alternation.
170
+ */
171
+ const TRAILING_DATE_PAREN_RE = new RegExp('\\s*\\(\\s*(?:' +
172
+ [
173
+ '\\d{4}-\\d{2}-\\d{2}',
174
+ `\\d{1,2}\\s+(?:${MONTH_NAME_SOURCE})\\s+\\d{4}`,
175
+ `\\d{1,2}\\s+(?:${MONTH_NAME_SOURCE})`,
176
+ `(?:${MONTH_NAME_SOURCE})\\s+\\d{4}`,
177
+ 'Week\\s+of\\s+\\d{4}-\\d{2}-\\d{2}',
178
+ ].join('|') +
179
+ ')\\s*\\)\\s*$', 'iu');
180
+ /**
181
+ * Module-private helper used by `stripArtifactCategoryAffix` to strip a trailing
182
+ * single-date parenthetical from an artefact-category core, retaining
183
+ * substantive parentheticals such as `(May 2026 – May 2027)` or
184
+ * `(2024-2029 Mandate, Mid-Term Review)`. Returns the empty string when
185
+ * the cleaned core falls below the 5-character editorial floor.
186
+ *
187
+ * @param core - Heading with the category label already stripped
188
+ * @returns Cleaned editorial-topic core, or empty string when too short
189
+ */
190
+ function cleanupAffixCore(core) {
191
+ // Only strip parenthetical content that is a pure date stamp
192
+ // (e.g. `(2026-05-08)`, `(May 2026)`, `(8 May)`). Substantive
193
+ // parentheticals such as `(May 2026 – May 2027)`, `(2024-2029
194
+ // Mandate, Mid-Term Review)`, or `(2026 → 2031)` carry editorial
195
+ // context and stay in the title.
196
+ const withoutDateParen = core.replace(TRAILING_DATE_PAREN_RE, '').trim();
197
+ const withoutTrailingPunct = withoutDateParen.replace(/[—–:;,.\s-]+$/u, '').trim();
198
+ if (withoutTrailingPunct.length < 5)
199
+ return '';
200
+ return withoutTrailingPunct;
201
+ }
202
+ /**
203
+ * Lower-case, decoration-stripped form used by the artifact-category
204
+ * matchers. Strips inline Markdown, leading non-alphanumeric runs (emoji,
205
+ * decoration), and collapses whitespace to a single space.
206
+ *
207
+ * @param raw - Raw heading text
208
+ * @returns Lower-case normalised form
209
+ */
210
+ function normaliseCategoryHeading(raw) {
211
+ return stripInlineMarkdown(raw)
212
+ .trim()
213
+ .toLowerCase()
214
+ .replace(/^[^a-z0-9]+/, '')
215
+ .replace(/\s+/g, ' ');
216
+ }
217
+ /**
218
+ * Normalise a Markdown heading's text for comparison against the
219
+ * editorial-lede heading whitelist. Strips inline Markdown decorations
220
+ * (`*`, `_`, `` ` ``, `#`), then strips any leading non-alphanumeric
221
+ * characters (emoji, punctuation, spaces) so a heading like
222
+ * `🎯 Headline Judgement` compares equal to `headline judgement`.
223
+ *
224
+ * @param raw - Raw heading text (no leading hashes)
225
+ * @returns Lower-cased, decoration-stripped heading text
226
+ */
227
+ export function normaliseHeadingText(raw) {
228
+ return stripInlineMarkdown(raw)
229
+ .replace(/[*_`#]+/g, '')
230
+ .replace(/^[^A-Za-z0-9]+/, '')
231
+ .trim()
232
+ .toLowerCase();
233
+ }
234
+ /**
235
+ * Word-boundary match against an editorial-lede whitelist entry. Matches
236
+ * when the normalised heading equals the whitelist entry exactly, or when
237
+ * the entry is followed by any non-alphanumeric character — covering
238
+ * localized parenthetical glosses written with ASCII or full-width
239
+ * punctuation (e.g. `bluf (bottom line up front)`, `bluf（結論先出し）`,
240
+ * `bluf — 핵심 결론`, `60-second read — what happened`).
241
+ *
242
+ * @param headingText - Normalised heading text (lower-case, decoration-stripped)
243
+ * @param whitelistEntry - Lower-case whitelist entry from
244
+ * {@link EDITORIAL_LEDE_HEADINGS}
245
+ * @returns `true` when `headingText` begins with `whitelistEntry` at a
246
+ * word boundary
247
+ */
248
+ export function isLedeHeadingMatch(headingText, whitelistEntry) {
249
+ if (headingText === whitelistEntry)
250
+ return true;
251
+ if (!headingText.startsWith(whitelistEntry))
252
+ return false;
253
+ const next = headingText.charAt(whitelistEntry.length);
254
+ // Word boundary — anything that is not an ASCII letter/digit is a
255
+ // separator we accept. This works uniformly across ASCII parentheses,
256
+ // CJK full-width brackets `（`, dashes `— – -`, colons `:`, and the
257
+ // ideographic full-width colon `：`.
258
+ return next === '' || !/[a-z0-9]/.test(next);
259
+ }
260
+ /**
261
+ * Return `true` when an artefact-H1 equals one of the
262
+ * `ARTIFACT_CATEGORY_PREFIXES`, or begins or ends with one set off by a separator. Such H1s
263
+ * carry the artefact's structural label rather than a journalist's
264
+ * headline (e.g. `# Synthesis Summary — Week in Review (3 Apr – 1 May
265
+ * 2026)`) and must not leak into the article `<title>`.
266
+ *
267
+ * @param heading - Plain-text H1 (after `stripInlineMarkdown`)
268
+ * @returns `true` when the heading is an artefact-category label
269
+ */
270
+ export function isArtifactCategoryHeading(heading) {
271
+ const normalized = normaliseCategoryHeading(heading);
272
+ if (normalized === '')
273
+ return false;
274
+ for (const prefix of ARTIFACT_CATEGORY_PREFIXES) {
275
+ if (normalized === prefix)
276
+ return true;
277
+ if (normalized.startsWith(`${prefix} —`) ||
278
+ normalized.startsWith(`${prefix} –`) ||
279
+ normalized.startsWith(`${prefix} -`) ||
280
+ normalized.startsWith(`${prefix}:`)) {
281
+ return true;
282
+ }
283
+ if (normalized.endsWith(` — ${prefix}`) ||
284
+ normalized.endsWith(` – ${prefix}`) ||
285
+ normalized.endsWith(` - ${prefix}`) ||
286
+ normalized.endsWith(`: ${prefix}`)) {
287
+ return true;
288
+ }
289
+ }
290
+ return false;
291
+ }
292
+ /**
293
+ * Strip a leading or trailing artifact-category label from a heading and
294
+ * return the editorial-topic core. When neither end carries a category
295
+ * label, the heading is returned unchanged. When the category label is
296
+ * the **entire** heading (e.g. `# Executive Brief`) the result is the
297
+ * empty string.
298
+ *
299
+ * Examples:
300
+ * - `Executive Brief — EU Parliament Motions` → `EU Parliament Motions`
301
+ * - `EU Parliament Propositions — Executive Brief` → `EU Parliament Propositions`
302
+ * - `EP10 Term Outlook — Executive Brief` → `EP10 Term Outlook`
303
+ * - `Key Legislative Developments — Deep Analysis (2026-05-08)` → `Key Legislative Developments`
304
+ * - `Synthesis Summary — EP Motions & Adopted Texts` → `EP Motions & Adopted Texts`
305
+ *
306
+ * Trailing parenthesised metadata (`(2026-05-08)`, `(May 2026)`) is also
307
+ * stripped because it functions as a date stamp rather than editorial
308
+ * copy. The returned core is trimmed of whitespace and trailing
309
+ * punctuation.
310
+ *
311
+ * @param heading - Raw heading text (post-{@link stripInlineMarkdown})
312
+ * @returns Editorial-topic core, or empty string when only the category survived
313
+ */
314
+ export function stripArtifactCategoryAffix(heading) {
315
+ const trimmed = heading.trim();
316
+ if (trimmed === '')
317
+ return '';
318
+ const sortedPrefixes = [...ARTIFACT_CATEGORY_PREFIXES].sort((a, b) => b.length - a.length);
319
+ const normalized = normaliseCategoryHeading(trimmed);
320
+ const skip = trimmed.length - normalized.length;
321
+ const visible = trimmed.slice(skip < 0 ? 0 : skip);
322
+ // For trailing-prefix detection (e.g. `Topic — Deep Analysis (date)`),
323
+ // we strip ANY trailing parenthetical because both the prefix and its
324
+ // date stamp are noise to remove. For leading-prefix detection (e.g.
325
+ // `Executive Brief — Year Ahead (May 2026 – May 2027)`), we keep the
326
+ // trailing parenthetical so substantive context survives into
327
+ // `cleanupAffixCore`, which only strips pure date stamps.
328
+ const visibleParenStripped = visible.replace(/\s*\([^)]{1,80}\)\s*$/u, '').trim();
329
+ const normalizedVisible = normaliseCategoryHeading(visible);
330
+ const normalizedParenStripped = normaliseCategoryHeading(visibleParenStripped);
331
+ for (const prefix of sortedPrefixes) {
332
+ for (const sep of [' — ', ' – ', ' - ', ': ']) {
333
+ const candidate = `${prefix}${sep}`;
334
+ if (normalizedVisible.startsWith(candidate)) {
335
+ const core = visible.slice(candidate.length).trim();
336
+ return cleanupAffixCore(core);
337
+ }
338
+ }
339
+ for (const sep of [' — ', ' – ', ' - ', ': ']) {
340
+ const candidate = `${sep}${prefix}`;
341
+ if (normalizedParenStripped.endsWith(candidate)) {
342
+ const core = visibleParenStripped
343
+ .slice(0, visibleParenStripped.length - candidate.length)
344
+ .trim();
345
+ return cleanupAffixCore(core);
346
+ }
347
+ }
348
+ if (normalizedParenStripped === prefix)
349
+ return '';
350
+ }
351
+ return trimmed;
352
+ }
353
+ //# sourceMappingURL=artifact-category-heading.js.map
@@ -17,7 +17,7 @@ import fs from 'fs';
17
17
  import path from 'path';
18
18
  import { extractFirstH1 } from './h1-extractor.js';
19
19
  import { extractLedeAfterHeading, extractStrongProseLine } from './lede-extractor.js';
20
- import { isGenericHeading, stripArtifactCategoryAffix } from './heading-rules.js';
20
+ import { isGenericHeading, isArtifactCategoryHeading, stripArtifactCategoryAffix, } from './heading-rules.js';
21
21
  import { truncateTitle } from './text-utils.js';
22
22
  import { extractPriorityFindingHighlight } from './priority-finding-highlight.js';
23
23
  /** Ordered list of artefact filenames that typically carry the editorial H1. */
@@ -121,18 +121,37 @@ function probeCandidateForHighlight(runDir, rel, articleType, date) {
121
121
  if (headline && !isGenericHeading(headline, articleType, date)) {
122
122
  return { cleanHighlight: { headline: truncateTitle(headline), summary } };
123
123
  }
124
- // The artefact H1 is generic boilerplate (`Executive Brief — EU Parliament
125
- // Breaking News`). Before falling back to a stripped category-core
126
- // headline, try to surface the FIRST NAMED PRIORITY FINDING from the
127
- // brief's `## Key Developments` / `## Priority Dossiers` /
128
- // `## Top Findings` block. This is the canonical Stage-B authoring
129
- // pattern (see `analysis/templates/executive-brief.md`) every brief
130
- // lists its top dossiers as `**Name** (procedure-code, date) — paragraph`
131
- // or `### N. Name (committee)`. Surfacing that name produces a
124
+ // The artefact H1 is classified generic by the boilerplate matcher
125
+ // (`Executive Brief — EU Parliament Motions | 28 April – 5 May 2026`
126
+ // matches because it starts with the `Executive Brief —` affix). Before
127
+ // falling through to deeper inference, try the *stripped-affix* form
128
+ // FIRST when authors hand-craft a brief H1 with date / session
129
+ // context (e.g. `… EU Parliament Motions | 28 April – 5 May 2026`,
130
+ // `… EP Committee Reports · Week of 2026-05-14–21`,
131
+ // `… Year Ahead May 2026–May 2027`), the stripped tail is the
132
+ // canonical editorial title and must win over priority-finding
133
+ // inference. This fixes title-leaks where the priority-finding
134
+ // extractor would otherwise surface a bold-prose section label such
135
+ // as `Strategic significance`, `Event description`, `Threat Level`.
136
+ if (headline) {
137
+ const stripped = stripArtifactCategoryAffix(headline);
138
+ if (stripped && stripped !== headline && !isGenericHeading(stripped, articleType, date)) {
139
+ return { cleanHighlight: { headline: truncateTitle(stripped), summary } };
140
+ }
141
+ }
142
+ // Only when the brief H1 is both generic AND its stripped form is
143
+ // still generic (e.g. bare `Executive Brief — EU Parliament
144
+ // Propositions` with no date) do we attempt to surface the FIRST
145
+ // NAMED PRIORITY FINDING from the brief's `## Key Developments` /
146
+ // `## Priority Dossiers` / `## Top Findings` block. This is the
147
+ // canonical Stage-B authoring pattern (see
148
+ // `analysis/templates/executive-brief.md`) — every brief lists its
149
+ // top dossiers as `**Name** (procedure-code, date) — paragraph` or
150
+ // `### N. Name (committee)`. Surfacing that name produces a
132
151
  // distinctive editorial headline ("Digital Markets Act Enforcement",
133
152
  // "Ukraine War Accountability") instead of a stripped category noun.
134
153
  const priority = extractPriorityFindingHighlight(body);
135
- if (priority?.headline) {
154
+ if (priority?.headline && !isArtifactCategoryHeading(priority.headline)) {
136
155
  return {
137
156
  cleanHighlight: {
138
157
  headline: truncateTitle(priority.headline),
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Read the first existing English brief artefact under `runDir` and
3
+ * return its SPDX-stripped body. Returns the empty string when none of
4
+ * the candidate artefacts exists or the run directory is missing —
5
+ * callers should treat the empty string as "no brief content
6
+ * available" and fall back to their existing extraction ladder.
7
+ *
8
+ * @param runDir - Absolute run directory, or empty string when unavailable
9
+ * @returns Brief body text with SPDX preamble removed
10
+ */
11
+ export declare function readEnglishBriefBody(runDir: string): string;
12
+ //# sourceMappingURL=brief-body.d.ts.map