euparliamentmonitor 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/package.json +6 -2
  2. package/scripts/aggregator/article-metadata.js +69 -14
  3. package/scripts/aggregator/editorial-brief-resolver.js +23 -0
  4. package/scripts/aggregator/html/headline.d.ts +41 -9
  5. package/scripts/aggregator/html/headline.js +69 -10
  6. package/scripts/aggregator/html/shell.js +73 -17
  7. package/scripts/aggregator/manifest/index.d.ts +1 -1
  8. package/scripts/aggregator/manifest/index.js +1 -1
  9. package/scripts/aggregator/manifest/resolver.d.ts +28 -1
  10. package/scripts/aggregator/manifest/resolver.js +61 -5
  11. package/scripts/aggregator/markdown-renderer.js +11 -0
  12. package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
  13. package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
  14. package/scripts/aggregator/metadata/artifact-walker.js +29 -10
  15. package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
  16. package/scripts/aggregator/metadata/brief-body.js +69 -0
  17. package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
  18. package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
  19. package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
  20. package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
  21. package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
  22. package/scripts/aggregator/metadata/heading-rules.js +78 -269
  23. package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
  24. package/scripts/aggregator/metadata/keyword-filters.js +156 -0
  25. package/scripts/aggregator/metadata/lede-extractor.js +11 -2
  26. package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
  27. package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
  28. package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
  29. package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
  30. package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
  31. package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
  32. package/scripts/aggregator/metadata/seo-budgets.js +202 -0
  33. package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
  34. package/scripts/aggregator/metadata/text-truncate.js +277 -0
  35. package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
  36. package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
  37. package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
  38. package/scripts/aggregator/metadata/text-utils.js +119 -439
  39. package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
  40. package/scripts/aggregator/metadata/title-rejection.js +179 -0
  41. package/scripts/copy-vendor.js +84 -112
  42. package/scripts/dump-article-seo.js +640 -0
  43. package/scripts/fix-mermaid-diagrams.js +931 -0
  44. package/scripts/generators/news-indexes/backfill.d.ts +6 -1
  45. package/scripts/generators/news-indexes/backfill.js +71 -4
  46. package/scripts/validate-article-seo.js +534 -0
  47. package/scripts/validate-mermaid-diagrams.js +306 -0
@@ -1,83 +1,4 @@
1
- /**
2
- * Headings inside an editorial artefact that carry the journalist's lede
3
- * paragraph (a one-paragraph summary of "what happened, why it matters").
4
- * When the resolver sees one of these as a `## …` heading inside the
5
- * editorial artefact, it prefers the first prose paragraph that follows
6
- * it as the description (and as a title fallback) over a generic line
7
- * walk. Names are matched case-insensitively against the heading text
8
- * (after stripping inline Markdown).
9
- */
10
- export declare const EDITORIAL_LEDE_HEADINGS: readonly string[];
11
- /**
12
- * Artifact-category prefixes that appear inside editorial-artefact H1s as
13
- * a structural label rather than an editorial headline (e.g. `# Synthesis
14
- * Summary — Week in Review (3 Apr – 1 May 2026)`). When a candidate H1
15
- * starts with one of these prefixes followed by a separator (em/en dash,
16
- * hyphen, or colon), the resolver treats it as **generic** so it does
17
- * not leak into the article `<title>`. Compared lower-case, with leading
18
- * punctuation stripped.
19
- */
20
- export declare const ARTIFACT_CATEGORY_PREFIXES: readonly string[];
21
- /**
22
- * Normalise a Markdown heading's text for comparison against the
23
- * editorial-lede heading whitelist. Strips inline Markdown decorations
24
- * (`*`, `_`, `` ` ``, `#`), then strips any leading non-alphanumeric
25
- * characters (emoji, punctuation, spaces) so a heading like
26
- * `🎯 Headline Judgement` compares equal to `headline judgement`.
27
- *
28
- * @param raw - Raw heading text (no leading hashes)
29
- * @returns Lower-cased, decoration-stripped heading text
30
- */
31
- export declare function normaliseHeadingText(raw: string): string;
32
- /**
33
- * Word-boundary match against an editorial-lede whitelist entry. Matches
34
- * when the normalised heading equals the whitelist entry exactly, or when
35
- * the entry is followed by any non-alphanumeric character — covering
36
- * localized parenthetical glosses written with ASCII or full-width
37
- * punctuation (e.g. `bluf (bottom line up front)`, `bluf(結論先出し)`,
38
- * `bluf — 핵심 결론`, `60-second read — what happened`).
39
- *
40
- * @param headingText - Normalised heading text (lower-case, decoration-stripped)
41
- * @param whitelistEntry - Lower-case whitelist entry from
42
- * {@link EDITORIAL_LEDE_HEADINGS}
43
- * @returns `true` when `headingText` begins with `whitelistEntry` at a
44
- * word boundary
45
- */
46
- export declare function isLedeHeadingMatch(headingText: string, whitelistEntry: string): boolean;
47
- /**
48
- * Return `true` when an artefact-H1 begins with one of the
49
- * `ARTIFACT_CATEGORY_PREFIXES` followed by a separator. Such H1s
50
- * carry the artefact's structural label rather than a journalist's
51
- * headline (e.g. `# Synthesis Summary — Week in Review (3 Apr – 1 May
52
- * 2026)`) and must not leak into the article `<title>`.
53
- *
54
- * @param heading - Plain-text H1 (after `stripInlineMarkdown`)
55
- * @returns `true` when the heading is an artefact-category label
56
- */
57
- export declare function isArtifactCategoryHeading(heading: string): boolean;
58
- /**
59
- * Strip a leading or trailing artifact-category label from a heading and
60
- * return the editorial-topic core. When neither end carries a category
61
- * label, the heading is returned unchanged. When the category label is
62
- * the **entire** heading (e.g. `# Executive Brief`) the result is the
63
- * empty string.
64
- *
65
- * Examples:
66
- * - `Executive Brief — EU Parliament Motions` → `EU Parliament Motions`
67
- * - `EU Parliament Propositions — Executive Brief` → `EU Parliament Propositions`
68
- * - `EP10 Term Outlook — Executive Brief` → `EP10 Term Outlook`
69
- * - `Key Legislative Developments — Deep Analysis (2026-05-08)` → `Key Legislative Developments`
70
- * - `Synthesis Summary — EP Motions & Adopted Texts` → `EP Motions & Adopted Texts`
71
- *
72
- * Trailing parenthesised metadata (`(2026-05-08)`, `(May 2026)`) is also
73
- * stripped because it functions as a date stamp rather than editorial
74
- * copy. The returned core is trimmed of whitespace and trailing
75
- * punctuation.
76
- *
77
- * @param heading - Raw heading text (post-{@link stripInlineMarkdown})
78
- * @returns Editorial-topic core, or empty string when only the category survived
79
- */
80
- export declare function stripArtifactCategoryAffix(heading: string): string;
1
+ export { EDITORIAL_LEDE_HEADINGS, ARTIFACT_CATEGORY_PREFIXES, normaliseHeadingText, isLedeHeadingMatch, isArtifactCategoryHeading, stripArtifactCategoryAffix, } from './artifact-category-heading.js';
81
2
  /**
82
3
  * Return `true` when the supplied heading matches the generic
83
4
  * `${humanize(articleType)} — ${date}` form that the aggregator writes as
@@ -85,7 +6,7 @@ export declare function stripArtifactCategoryAffix(heading: string): string;
85
6
  * separators, and matches the `breaking-breaking` variant that some
86
7
  * same-day collision runs produce.
87
8
  *
88
- * @param heading - Plain-text heading (post-{@link stripInlineMarkdown})
9
+ * @param heading - Plain-text heading (post-`stripInlineMarkdown`)
89
10
  * @param articleType - Article type slug
90
11
  * @param date - ISO date string
91
12
  * @returns `true` when the heading carries no editorial information
@@ -2,280 +2,104 @@
2
2
  // SPDX-License-Identifier: Apache-2.0
3
3
  /**
4
4
  * @module Aggregator/Metadata/HeadingRules
5
- * @description Heading-classification helpers extracted from
6
- * `article-metadata.ts`. Owns the editorial-lede whitelist, the artefact
7
- * category prefix list, the institutional-noun whitelist, and the
8
- * `isGenericHeading` / `stripArtifactCategoryAffix` predicates that
9
- * drive title-tier selection in the resolver.
5
+ * @description Heading-classification helpers used by the article
6
+ * metadata resolver. Owns:
10
7
  *
11
- * Pure leaf module — the only runtime dependencies are
12
- * {@link stripInlineMarkdown} (text-utils) and {@link humanizeSlug}
13
- * (slug). Re-exported through `article-metadata.ts` for back-compat
14
- * with existing call sites.
8
+ * - {@link isGenericHeading} — the resolver's master generic-heading
9
+ * predicate (drives title-tier selection).
10
+ * - Internal helpers for institutional-noun, category-noun, and
11
+ * `<label><sep><date>` boilerplate detection.
12
+ *
13
+ * The editorial-lede whitelist, artifact-category prefix list, and the
14
+ * {@link isArtifactCategoryHeading} / {@link stripArtifactCategoryAffix}
15
+ * helpers were extracted to `./artifact-category-heading.ts` in May 2026
16
+ * to keep this file under the 600-raw-line drift-guard. They are
17
+ * **re-exported here** so existing call sites (`lede-extractor.ts`,
18
+ * `artifact-walker.ts`, `article-metadata.ts`) keep working unchanged.
19
+ *
20
+ * Pure leaf module. The only runtime dependencies are
21
+ * {@link humanizeSlug} (slug) and the helpers re-exported from
22
+ * `./artifact-category-heading.js`.
15
23
  */
16
- import { stripInlineMarkdown } from './text-utils.js';
17
24
  import { humanizeSlug } from './slug.js';
25
+ import { isArtifactCategoryHeading } from './artifact-category-heading.js';
26
+ // Re-export the artifact-category surface so existing imports continue
27
+ // to work without touching consumers.
28
+ export { EDITORIAL_LEDE_HEADINGS, ARTIFACT_CATEGORY_PREFIXES, normaliseHeadingText, isLedeHeadingMatch, isArtifactCategoryHeading, stripArtifactCategoryAffix, } from './artifact-category-heading.js';
18
29
  /**
19
- * Headings inside an editorial artefact that carry the journalist's lede
20
- * paragraph (a one-paragraph summary of "what happened, why it matters").
21
- * When the resolver sees one of these as a `## …` heading inside the
22
- * editorial artefact, it prefers the first prose paragraph that follows
23
- * it as the description (and as a title fallback) over a generic line
24
- * walk. Names are matched case-insensitively against the heading text
25
- * (after stripping inline Markdown).
30
+ * Article-type aliases that author-templates use interchangeably with
31
+ * the humanized slug. `breaking` runs in particular alternate between
32
+ * `Breaking` and `Breaking News` in brief H1s. The aliases are matched
33
+ * alongside the canonical `humanizeSlug(articleType)` value so the
34
+ * downstream pattern + trailing-date regex pick them all up.
26
35
  */
27
- export const EDITORIAL_LEDE_HEADINGS = [
28
- '60-second read',
29
- '60 second read',
30
- 'sixty-second read',
31
- 'lede',
32
- 'lead',
33
- 'tl;dr',
34
- 'tldr',
35
- 'synopsis',
36
- 'in brief',
37
- 'at a glance',
38
- 'bottom line',
39
- 'bluf',
40
- 'bluf — bottom line up front',
41
- 'bottom line up front',
42
- 'executive summary',
43
- 'executive briefing',
44
- 'master narrative',
45
- 'overview',
46
- 'headline judgement',
47
- 'headline judgment',
48
- 'key findings',
49
- 'key judgements',
50
- 'key judgments',
51
- 'situation summary',
52
- 'situation report',
53
- 'situation update',
54
- ];
36
+ const ARTICLE_TYPE_ALIASES = {
37
+ breaking: ['Breaking News'],
38
+ };
55
39
  /**
56
- * Artifact-category prefixes that appear inside editorial-artefact H1s as
57
- * a structural label rather than an editorial headline (e.g. `# Synthesis
58
- * Summary — Week in Review (3 Apr – 1 May 2026)`). When a candidate H1
59
- * starts with one of these prefixes followed by a separator (em/en dash,
60
- * hyphen, or colon), the resolver treats it as **generic** so it does
61
- * not leak into the article `<title>`. Compared lower-case, with leading
62
- * punctuation stripped.
40
+ * Separators observed in the wild for brief H1s mixing the
41
+ * article-type label with a single ISO or human-friendly date.
63
42
  */
64
- export const ARTIFACT_CATEGORY_PREFIXES = [
65
- 'actor mapping',
66
- 'analytical quality',
67
- 'breaking news analysis',
68
- 'coalition dynamics',
69
- 'commission wp alignment',
70
- 'committee activity report',
71
- 'cross run continuity',
72
- 'deep analysis',
73
- 'economic context',
74
- 'executive brief',
75
- 'executive briefing',
76
- 'executive intelligence brief',
77
- 'executive intelligence briefing',
78
- 'executive summary',
79
- 'forward indicators',
80
- 'historical baseline',
81
- 'impact matrix',
82
- 'intelligence assessment',
83
- 'intelligence briefing',
84
- 'intelligence synthesis summary',
85
- 'legislative output analysis',
86
- 'legislative pipeline analysis',
87
- 'legislative pipeline forecast',
88
- 'mandate fulfilment scorecard',
89
- 'master intelligence synthesis',
90
- 'mcp reliability audit',
91
- 'methodology reflection',
92
- 'monthly outlook',
93
- 'motions analysis',
94
- 'parliamentary calendar projection',
95
- 'pestle analysis',
96
- 'political intelligence brief',
97
- 'political risk',
98
- 'political threat landscape',
99
- 'presidency trio context',
100
- 'propositions analysis',
101
- 'quantitative swot',
102
- 'risk assessment',
103
- 'risk matrix',
104
- 'risk scoring',
105
- 'scenario forecast',
106
- 'seat projection',
107
- 'significance classification',
108
- 'situation report',
109
- 'situation summary',
110
- 'stakeholder analysis',
111
- 'stakeholder impact',
112
- 'stakeholder map',
113
- 'swot analysis',
114
- 'synthesis summary',
115
- 'threat assessment',
116
- 'threat model',
117
- 'voting patterns',
118
- 'weekly outlook',
119
- 'wildcards blackswans',
120
- ];
43
+ const GENERIC_HEADING_SEPARATORS = [' — ', ' - ', ' – ', ': ', ' ', ' | ', ', '];
121
44
  /**
122
- * Normalise a Markdown heading's text for comparison against the
123
- * editorial-lede heading whitelist. Strips inline Markdown decorations
124
- * (`*`, `_`, `` ` ``, `#`), then strips any leading non-alphanumeric
125
- * characters (emoji, punctuation, spaces) so a heading like
126
- * `🎯 Headline Judgement` compares equal to `headline judgement`.
127
- *
128
- * @param raw - Raw heading text (no leading hashes)
129
- * @returns Lower-cased, decoration-stripped heading text
45
+ * Date-shape pattern: an ISO-style run of digits and dashes, or a
46
+ * human-friendly day / month-name / year form (`8 April 2026`). Single-day
47
+ * only — date *ranges* are preserved as editorial scope-window content.
130
48
  */
131
- export function normaliseHeadingText(raw) {
132
- return stripInlineMarkdown(raw)
133
- .replace(/[*_`#]+/g, '')
134
- .replace(/^[^A-Za-z0-9]+/, '')
135
- .trim()
136
- .toLowerCase();
137
- }
49
+ const GENERIC_HEADING_DATE_SHAPE = '[\\d][\\d\\-]*|\\d{1,2}\\s+[A-Za-z]+\\s+\\d{4}';
138
50
  /**
139
- * Word-boundary match against an editorial-lede whitelist entry. Matches
140
- * when the normalised heading equals the whitelist entry exactly, or when
141
- * the entry is followed by any non-alphanumeric character — covering
142
- * localized parenthetical glosses written with ASCII or full-width
143
- * punctuation (e.g. `bluf (bottom line up front)`, `bluf(結論先出し)`,
144
- * `bluf — 핵심 결론`, `60-second read — what happened`).
51
+ * Aliases used for one article-type slug, including the canonical
52
+ * humanised slug plus any registered aliases.
145
53
  *
146
- * @param headingText - Normalised heading text (lower-case, decoration-stripped)
147
- * @param whitelistEntry - Lower-case whitelist entry from
148
- * {@link EDITORIAL_LEDE_HEADINGS}
149
- * @returns `true` when `headingText` begins with `whitelistEntry` at a
150
- * word boundary
54
+ * @param articleType - Article-type slug
55
+ * @returns Ordered list of label aliases
151
56
  */
152
- export function isLedeHeadingMatch(headingText, whitelistEntry) {
153
- if (headingText === whitelistEntry)
154
- return true;
155
- if (!headingText.startsWith(whitelistEntry))
156
- return false;
157
- const next = headingText.charAt(whitelistEntry.length);
158
- // Word boundary — anything that is not an ASCII letter/digit is a
159
- // separator we accept. This works uniformly across ASCII parentheses,
160
- // CJK full-width brackets `(`, dashes `— – -`, colons `:`, and the
161
- // ideographic full-width colon `:`.
162
- return next === '' || !/[a-z0-9]/.test(next);
57
+ function resolveLabelAliases(articleType) {
58
+ const human = humanizeSlug(articleType);
59
+ return [human, ...(ARTICLE_TYPE_ALIASES[articleType] ?? [])];
163
60
  }
164
61
  /**
165
- * Return `true` when an artefact-H1 begins with one of the
166
- * `ARTIFACT_CATEGORY_PREFIXES` followed by a separator. Such H1s
167
- * carry the artefact's structural label rather than a journalist's
168
- * headline (e.g. `# Synthesis Summary — Week in Review (3 Apr – 1 May
169
- * 2026)`) and must not leak into the article `<title>`.
62
+ * Match an exact `<prefix?><label><sep><date>` shape, including the
63
+ * `EU Parliament ` / `EP ` prefix variants and the redundant
64
+ * `<label> <label> — <date>` form occasionally emitted by same-day
65
+ * collision runs.
170
66
  *
171
- * @param heading - Plain-text H1 (after `stripInlineMarkdown`)
172
- * @returns `true` when the heading is an artefact-category label
67
+ * @param normalized - Heading text after whitespace collapse
68
+ * @param label - Article-type label to test against
69
+ * @param date - ISO date string
70
+ * @returns `true` when the heading matches a known literal shape
173
71
  */
174
- export function isArtifactCategoryHeading(heading) {
175
- const normalized = normaliseCategoryHeading(heading);
176
- if (normalized === '')
177
- return false;
178
- for (const prefix of ARTIFACT_CATEGORY_PREFIXES) {
179
- if (normalized === prefix)
72
+ function matchesLiteralLabelDateShape(normalized, label, date) {
73
+ for (const sep of GENERIC_HEADING_SEPARATORS) {
74
+ const p = `${label}${sep}${date}`;
75
+ if (normalized === p)
180
76
  return true;
181
- if (normalized.startsWith(`${prefix} —`) ||
182
- normalized.startsWith(`${prefix} –`) ||
183
- normalized.startsWith(`${prefix} -`) ||
184
- normalized.startsWith(`${prefix}:`)) {
77
+ if (normalized === `EU Parliament ${p}`)
185
78
  return true;
186
- }
187
- if (normalized.endsWith(` — ${prefix}`) ||
188
- normalized.endsWith(` – ${prefix}`) ||
189
- normalized.endsWith(` - ${prefix}`) ||
190
- normalized.endsWith(`: ${prefix}`)) {
79
+ if (normalized === `EP ${p}`)
191
80
  return true;
192
- }
193
- }
194
- return false;
195
- }
196
- /**
197
- * Strip a leading or trailing artifact-category label from a heading and
198
- * return the editorial-topic core. When neither end carries a category
199
- * label, the heading is returned unchanged. When the category label is
200
- * the **entire** heading (e.g. `# Executive Brief`) the result is the
201
- * empty string.
202
- *
203
- * Examples:
204
- * - `Executive Brief — EU Parliament Motions` → `EU Parliament Motions`
205
- * - `EU Parliament Propositions — Executive Brief` → `EU Parliament Propositions`
206
- * - `EP10 Term Outlook — Executive Brief` → `EP10 Term Outlook`
207
- * - `Key Legislative Developments — Deep Analysis (2026-05-08)` → `Key Legislative Developments`
208
- * - `Synthesis Summary — EP Motions & Adopted Texts` → `EP Motions & Adopted Texts`
209
- *
210
- * Trailing parenthesised metadata (`(2026-05-08)`, `(May 2026)`) is also
211
- * stripped because it functions as a date stamp rather than editorial
212
- * copy. The returned core is trimmed of whitespace and trailing
213
- * punctuation.
214
- *
215
- * @param heading - Raw heading text (post-{@link stripInlineMarkdown})
216
- * @returns Editorial-topic core, or empty string when only the category survived
217
- */
218
- export function stripArtifactCategoryAffix(heading) {
219
- const trimmed = heading.trim();
220
- if (trimmed === '')
221
- return '';
222
- const sortedPrefixes = [...ARTIFACT_CATEGORY_PREFIXES].sort((a, b) => b.length - a.length);
223
- const normalized = normaliseCategoryHeading(trimmed);
224
- const skip = trimmed.length - normalized.length;
225
- const visible = trimmed.slice(skip < 0 ? 0 : skip);
226
- const visibleClean = visible.replace(/\s*\([^)]{1,80}\)\s*$/u, '').trim();
227
- const normalizedClean = normaliseCategoryHeading(visibleClean);
228
- for (const prefix of sortedPrefixes) {
229
- for (const sep of [' — ', ' – ', ' - ', ': ']) {
230
- const candidate = `${prefix}${sep}`;
231
- if (normalizedClean.startsWith(candidate)) {
232
- const core = visibleClean.slice(candidate.length).trim();
233
- return cleanupAffixCore(core);
234
- }
235
- }
236
- for (const sep of [' — ', ' – ', ' - ', ': ']) {
237
- const candidate = `${sep}${prefix}`;
238
- if (normalizedClean.endsWith(candidate)) {
239
- const core = visibleClean.slice(0, visibleClean.length - candidate.length).trim();
240
- return cleanupAffixCore(core);
241
- }
242
- }
243
- if (normalizedClean === prefix)
244
- return '';
245
81
  }
246
- return trimmed;
247
- }
248
- /**
249
- * Tidy the editorial-topic core returned by
250
- * {@link stripArtifactCategoryAffix}: drop trailing parenthesised
251
- * metadata (`(2026-05-08)`, `(May 2026)`) and trailing punctuation. When
252
- * stripping leaves the string too short to be meaningful (<5 chars),
253
- * return the empty string so callers fall through to lower tiers.
254
- *
255
- * @param core - Heading with the category label already stripped
256
- * @returns Cleaned editorial-topic core, or empty string when too short
257
- */
258
- function cleanupAffixCore(core) {
259
- const withoutTrailingParens = core.replace(/\s*\([^)]{1,80}\)\s*$/u, '').trim();
260
- const withoutTrailingPunct = withoutTrailingParens.replace(/[—–:;,.\s-]+$/u, '').trim();
261
- if (withoutTrailingPunct.length < 5)
262
- return '';
263
- return withoutTrailingPunct;
82
+ const labelRedundant = `${label} ${label}`;
83
+ return normalized === `${labelRedundant} — ${date}`;
264
84
  }
265
85
  /**
266
- * Lower-case, decoration-stripped form used by the artifact-category
267
- * matchers. Strips inline Markdown, leading non-alphanumeric runs (emoji,
268
- * decoration), and collapses whitespace to a single space.
86
+ * Match `<prefix?><label><sep-or-space><any-date>` patterns where the
87
+ * date token can be any ISO or human-friendly single-day shape. Anchored
88
+ * to end-of-string so it cannot fire on editorial sentences that
89
+ * happen to contain a date token mid-clause.
269
90
  *
270
- * @param raw - Raw heading text
271
- * @returns Lower-case normalised form
91
+ * @param normalized - Heading text after whitespace collapse
92
+ * @param label - Article-type label to test against
93
+ * @returns `true` when the heading matches the trailing-date shape
272
94
  */
273
- function normaliseCategoryHeading(raw) {
274
- return stripInlineMarkdown(raw)
275
- .trim()
276
- .toLowerCase()
277
- .replace(/^[^a-z0-9]+/, '')
278
- .replace(/\s+/g, ' ');
95
+ function matchesTrailingDateShape(normalized, label) {
96
+ const trailingDateOnly = new RegExp(`^(?:EU Parliament |EP )?${escapeRegex(label)}\\s*[—–\\-|,:]\\s*(?:${GENERIC_HEADING_DATE_SHAPE})$`, 'u');
97
+ if (trailingDateOnly.test(normalized))
98
+ return true;
99
+ // Same shape but label followed directly by a date with whitespace only
100
+ // (e.g. `Breaking News 2026-04-01`).
101
+ const labelSpaceDate = new RegExp(`^(?:EU Parliament |EP )?${escapeRegex(label)}\\s+(?:${GENERIC_HEADING_DATE_SHAPE})$`, 'u');
102
+ return labelSpaceDate.test(normalized);
279
103
  }
280
104
  /**
281
105
  * Return `true` when the supplied heading matches the generic
@@ -284,7 +108,7 @@ function normaliseCategoryHeading(raw) {
284
108
  * separators, and matches the `breaking-breaking` variant that some
285
109
  * same-day collision runs produce.
286
110
  *
287
- * @param heading - Plain-text heading (post-{@link stripInlineMarkdown})
111
+ * @param heading - Plain-text heading (post-`stripInlineMarkdown`)
288
112
  * @param articleType - Article type slug
289
113
  * @param date - ISO date string
290
114
  * @returns `true` when the heading carries no editorial information
@@ -295,27 +119,12 @@ export function isGenericHeading(heading, articleType, date) {
295
119
  return true;
296
120
  if (isArtifactCategoryHeading(normalized))
297
121
  return true;
298
- const human = humanizeSlug(articleType);
299
- const patterns = [
300
- `${human} — ${date}`,
301
- `${human} - ${date}`,
302
- `${human} – ${date}`,
303
- `${human}: ${date}`,
304
- `${human} ${date}`,
305
- ];
306
- const humanRedundant = `${human} ${human}`;
307
- for (const p of patterns) {
308
- if (normalized === p)
309
- return true;
310
- if (normalized === `EU Parliament ${p}`)
122
+ for (const label of resolveLabelAliases(articleType)) {
123
+ if (matchesLiteralLabelDateShape(normalized, label, date))
311
124
  return true;
312
- if (normalized === `${humanRedundant} — ${date}`)
125
+ if (matchesTrailingDateShape(normalized, label))
313
126
  return true;
314
127
  }
315
- const trailingDateOnly = new RegExp(`^${escapeRegex(human)}\\s*[—–-]\\s*[\\d-]+$`, 'u');
316
- if (trailingDateOnly.test(normalized)) {
317
- return true;
318
- }
319
128
  if (isCategoryNounHeading(normalized, articleType))
320
129
  return true;
321
130
  if (isBareInstitutionalHeading(normalized))
@@ -0,0 +1,60 @@
1
+ /**
2
+ * @module Aggregator/Metadata/KeywordFilters
3
+ * @description Cross-site keyword catalogue and noise-token filter used
4
+ * by {@link buildSeoKeywords} in `resolve-helpers.ts`.
5
+ *
6
+ * Two responsibilities:
7
+ *
8
+ * 1. **Always-on cross-site keywords** ({@link CROSS_SITE_KEYWORDS})
9
+ * are prepended to every article's `<meta name="keywords">` list
10
+ * regardless of language, so search-engine discovery of the
11
+ * Hack23 civic-tech portfolio (EU Parliament Monitor +
12
+ * Riksdagsmonitor + CIA) is consistent across all 14 localized
13
+ * surfaces. The user explicitly requested
14
+ * `riksdagsmonitor, political intelligence, riksdag, regeringen`
15
+ * (the sister Swedish-Parliament project) plus EP analogues.
16
+ *
17
+ * 2. **Noise-token rejection** ({@link isNoiseKeywordToken}) drops
18
+ * the UUID-fragment tokens (`77fc920c`, `3a76`, `9db5`, …) and
19
+ * synthetic run-id slugs (`propositions-run261-1779431162`) that
20
+ * the previous keyword extractor leaked into `<head>` when a
21
+ * brief mentioned its own run id editorially (e.g.
22
+ * `Analysis run 77fc920c-3a76-4813-9db5-43a7e9acc25e returned
23
+ * 0 classified actors`).
24
+ *
25
+ * Pure leaf module — no imports.
26
+ */
27
+ /**
28
+ * Cross-site SEO keywords prepended to every article in every
29
+ * language. Order is meaningful: stronger civic-tech-portfolio terms
30
+ * first so they appear ahead of the per-article-type keywords when
31
+ * the 16-entry budget is exceeded.
32
+ */
33
+ export declare const CROSS_SITE_KEYWORDS: readonly string[];
34
+ /**
35
+ * Decide whether a single keyword token should be discarded as noise.
36
+ *
37
+ * The current rules reject tokens that:
38
+ *
39
+ * - Look like a UUID hex chunk: ≥4 chars and consist solely of the
40
+ * `[0-9a-f]` alphabet **and** contain at least one digit (so
41
+ * real English words like `dead` / `face` survive). Tokens of
42
+ * length ≥8 are always rejected (a real English word of that
43
+ * length composed exclusively of hex letters is vanishingly rare;
44
+ * the allowlist guards the short cases).
45
+ * - Are mostly digits (≥80 % digit characters) — runtime epoch
46
+ * suffixes such as `1779431162` and committee-code-like mashes like
47
+ * `2024k1234`.
48
+ * - Start with `run` and end with all-digits (`run261`, `run17`),
49
+ * the per-run slug suffix the aggregator stamps onto run ids.
50
+ * - Match the full opaque-runId shape `<type>-run<digits>-<digits>`
51
+ * after a strip / normalization round-trip.
52
+ *
53
+ * Returns `false` for normal vocabulary so the keyword list stays
54
+ * useful — every reject path is intentionally narrow.
55
+ *
56
+ * @param token - Single token candidate
57
+ * @returns `true` when the token should be dropped from keywords
58
+ */
59
+ export declare function isNoiseKeywordToken(token: string): boolean;
60
+ //# sourceMappingURL=keyword-filters.d.ts.map