euparliamentmonitor 0.9.21 → 0.9.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -2
- package/scripts/aggregator/article-metadata.js +69 -14
- package/scripts/aggregator/editorial-brief-resolver.js +23 -0
- package/scripts/aggregator/html/headline.d.ts +41 -9
- package/scripts/aggregator/html/headline.js +69 -10
- package/scripts/aggregator/html/shell.js +73 -17
- package/scripts/aggregator/manifest/index.d.ts +1 -1
- package/scripts/aggregator/manifest/index.js +1 -1
- package/scripts/aggregator/manifest/resolver.d.ts +28 -1
- package/scripts/aggregator/manifest/resolver.js +61 -5
- package/scripts/aggregator/markdown-renderer.js +11 -0
- package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
- package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
- package/scripts/aggregator/metadata/artifact-walker.js +29 -10
- package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
- package/scripts/aggregator/metadata/brief-body.js +69 -0
- package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
- package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
- package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
- package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
- package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
- package/scripts/aggregator/metadata/heading-rules.js +78 -269
- package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
- package/scripts/aggregator/metadata/keyword-filters.js +156 -0
- package/scripts/aggregator/metadata/lede-extractor.js +11 -2
- package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
- package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
- package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
- package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
- package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
- package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
- package/scripts/aggregator/metadata/seo-budgets.js +202 -0
- package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
- package/scripts/aggregator/metadata/text-truncate.js +277 -0
- package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
- package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
- package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
- package/scripts/aggregator/metadata/text-utils.js +119 -439
- package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
- package/scripts/aggregator/metadata/title-rejection.js +179 -0
- package/scripts/copy-vendor.js +84 -112
- package/scripts/dump-article-seo.js +640 -0
- package/scripts/fix-mermaid-diagrams.js +931 -0
- package/scripts/generators/news-indexes/backfill.d.ts +6 -1
- package/scripts/generators/news-indexes/backfill.js +71 -4
- package/scripts/validate-article-seo.js +534 -0
- package/scripts/validate-mermaid-diagrams.js +306 -0
|
@@ -1,83 +1,4 @@
|
|
|
1
|
-
|
|
2
|
-
* Headings inside an editorial artefact that carry the journalist's lede
|
|
3
|
-
* paragraph (a one-paragraph summary of "what happened, why it matters").
|
|
4
|
-
* When the resolver sees one of these as a `## …` heading inside the
|
|
5
|
-
* editorial artefact, it prefers the first prose paragraph that follows
|
|
6
|
-
* it as the description (and as a title fallback) over a generic line
|
|
7
|
-
* walk. Names are matched case-insensitively against the heading text
|
|
8
|
-
* (after stripping inline Markdown).
|
|
9
|
-
*/
|
|
10
|
-
export declare const EDITORIAL_LEDE_HEADINGS: readonly string[];
|
|
11
|
-
/**
|
|
12
|
-
* Artifact-category prefixes that appear inside editorial-artefact H1s as
|
|
13
|
-
* a structural label rather than an editorial headline (e.g. `# Synthesis
|
|
14
|
-
* Summary — Week in Review (3 Apr – 1 May 2026)`). When a candidate H1
|
|
15
|
-
* starts with one of these prefixes followed by a separator (em/en dash,
|
|
16
|
-
* hyphen, or colon), the resolver treats it as **generic** so it does
|
|
17
|
-
* not leak into the article `<title>`. Compared lower-case, with leading
|
|
18
|
-
* punctuation stripped.
|
|
19
|
-
*/
|
|
20
|
-
export declare const ARTIFACT_CATEGORY_PREFIXES: readonly string[];
|
|
21
|
-
/**
|
|
22
|
-
* Normalise a Markdown heading's text for comparison against the
|
|
23
|
-
* editorial-lede heading whitelist. Strips inline Markdown decorations
|
|
24
|
-
* (`*`, `_`, `` ` ``, `#`), then strips any leading non-alphanumeric
|
|
25
|
-
* characters (emoji, punctuation, spaces) so a heading like
|
|
26
|
-
* `🎯 Headline Judgement` compares equal to `headline judgement`.
|
|
27
|
-
*
|
|
28
|
-
* @param raw - Raw heading text (no leading hashes)
|
|
29
|
-
* @returns Lower-cased, decoration-stripped heading text
|
|
30
|
-
*/
|
|
31
|
-
export declare function normaliseHeadingText(raw: string): string;
|
|
32
|
-
/**
|
|
33
|
-
* Word-boundary match against an editorial-lede whitelist entry. Matches
|
|
34
|
-
* when the normalised heading equals the whitelist entry exactly, or when
|
|
35
|
-
* the entry is followed by any non-alphanumeric character — covering
|
|
36
|
-
* localized parenthetical glosses written with ASCII or full-width
|
|
37
|
-
* punctuation (e.g. `bluf (bottom line up front)`, `bluf(結論先出し)`,
|
|
38
|
-
* `bluf — 핵심 결론`, `60-second read — what happened`).
|
|
39
|
-
*
|
|
40
|
-
* @param headingText - Normalised heading text (lower-case, decoration-stripped)
|
|
41
|
-
* @param whitelistEntry - Lower-case whitelist entry from
|
|
42
|
-
* {@link EDITORIAL_LEDE_HEADINGS}
|
|
43
|
-
* @returns `true` when `headingText` begins with `whitelistEntry` at a
|
|
44
|
-
* word boundary
|
|
45
|
-
*/
|
|
46
|
-
export declare function isLedeHeadingMatch(headingText: string, whitelistEntry: string): boolean;
|
|
47
|
-
/**
|
|
48
|
-
* Return `true` when an artefact-H1 begins with one of the
|
|
49
|
-
* `ARTIFACT_CATEGORY_PREFIXES` followed by a separator. Such H1s
|
|
50
|
-
* carry the artefact's structural label rather than a journalist's
|
|
51
|
-
* headline (e.g. `# Synthesis Summary — Week in Review (3 Apr – 1 May
|
|
52
|
-
* 2026)`) and must not leak into the article `<title>`.
|
|
53
|
-
*
|
|
54
|
-
* @param heading - Plain-text H1 (after `stripInlineMarkdown`)
|
|
55
|
-
* @returns `true` when the heading is an artefact-category label
|
|
56
|
-
*/
|
|
57
|
-
export declare function isArtifactCategoryHeading(heading: string): boolean;
|
|
58
|
-
/**
|
|
59
|
-
* Strip a leading or trailing artifact-category label from a heading and
|
|
60
|
-
* return the editorial-topic core. When neither end carries a category
|
|
61
|
-
* label, the heading is returned unchanged. When the category label is
|
|
62
|
-
* the **entire** heading (e.g. `# Executive Brief`) the result is the
|
|
63
|
-
* empty string.
|
|
64
|
-
*
|
|
65
|
-
* Examples:
|
|
66
|
-
* - `Executive Brief — EU Parliament Motions` → `EU Parliament Motions`
|
|
67
|
-
* - `EU Parliament Propositions — Executive Brief` → `EU Parliament Propositions`
|
|
68
|
-
* - `EP10 Term Outlook — Executive Brief` → `EP10 Term Outlook`
|
|
69
|
-
* - `Key Legislative Developments — Deep Analysis (2026-05-08)` → `Key Legislative Developments`
|
|
70
|
-
* - `Synthesis Summary — EP Motions & Adopted Texts` → `EP Motions & Adopted Texts`
|
|
71
|
-
*
|
|
72
|
-
* Trailing parenthesised metadata (`(2026-05-08)`, `(May 2026)`) is also
|
|
73
|
-
* stripped because it functions as a date stamp rather than editorial
|
|
74
|
-
* copy. The returned core is trimmed of whitespace and trailing
|
|
75
|
-
* punctuation.
|
|
76
|
-
*
|
|
77
|
-
* @param heading - Raw heading text (post-{@link stripInlineMarkdown})
|
|
78
|
-
* @returns Editorial-topic core, or empty string when only the category survived
|
|
79
|
-
*/
|
|
80
|
-
export declare function stripArtifactCategoryAffix(heading: string): string;
|
|
1
|
+
export { EDITORIAL_LEDE_HEADINGS, ARTIFACT_CATEGORY_PREFIXES, normaliseHeadingText, isLedeHeadingMatch, isArtifactCategoryHeading, stripArtifactCategoryAffix, } from './artifact-category-heading.js';
|
|
81
2
|
/**
|
|
82
3
|
* Return `true` when the supplied heading matches the generic
|
|
83
4
|
* `${humanize(articleType)} — ${date}` form that the aggregator writes as
|
|
@@ -85,7 +6,7 @@ export declare function stripArtifactCategoryAffix(heading: string): string;
|
|
|
85
6
|
* separators, and matches the `breaking-breaking` variant that some
|
|
86
7
|
* same-day collision runs produce.
|
|
87
8
|
*
|
|
88
|
-
* @param heading - Plain-text heading (post
|
|
9
|
+
* @param heading - Plain-text heading (post-`stripInlineMarkdown`)
|
|
89
10
|
* @param articleType - Article type slug
|
|
90
11
|
* @param date - ISO date string
|
|
91
12
|
* @returns `true` when the heading carries no editorial information
|
|
@@ -2,280 +2,104 @@
|
|
|
2
2
|
// SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
/**
|
|
4
4
|
* @module Aggregator/Metadata/HeadingRules
|
|
5
|
-
* @description Heading-classification helpers
|
|
6
|
-
*
|
|
7
|
-
* category prefix list, the institutional-noun whitelist, and the
|
|
8
|
-
* `isGenericHeading` / `stripArtifactCategoryAffix` predicates that
|
|
9
|
-
* drive title-tier selection in the resolver.
|
|
5
|
+
* @description Heading-classification helpers used by the article
|
|
6
|
+
* metadata resolver. Owns:
|
|
10
7
|
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
8
|
+
* - {@link isGenericHeading} — the resolver's master generic-heading
|
|
9
|
+
* predicate (drives title-tier selection).
|
|
10
|
+
* - Internal helpers for institutional-noun, category-noun, and
|
|
11
|
+
* `<label><sep><date>` boilerplate detection.
|
|
12
|
+
*
|
|
13
|
+
* The editorial-lede whitelist, artifact-category prefix list, and the
|
|
14
|
+
* {@link isArtifactCategoryHeading} / {@link stripArtifactCategoryAffix}
|
|
15
|
+
* helpers were extracted to `./artifact-category-heading.ts` in May 2026
|
|
16
|
+
* to keep this file under the 600-raw-line drift-guard. They are
|
|
17
|
+
* **re-exported here** so existing call sites (`lede-extractor.ts`,
|
|
18
|
+
* `artifact-walker.ts`, `article-metadata.ts`) keep working unchanged.
|
|
19
|
+
*
|
|
20
|
+
* Pure leaf module. The only runtime dependencies are
|
|
21
|
+
* {@link humanizeSlug} (slug) and the helpers re-exported from
|
|
22
|
+
* `./artifact-category-heading.js`.
|
|
15
23
|
*/
|
|
16
|
-
import { stripInlineMarkdown } from './text-utils.js';
|
|
17
24
|
import { humanizeSlug } from './slug.js';
|
|
25
|
+
import { isArtifactCategoryHeading } from './artifact-category-heading.js';
|
|
26
|
+
// Re-export the artifact-category surface so existing imports continue
|
|
27
|
+
// to work without touching consumers.
|
|
28
|
+
export { EDITORIAL_LEDE_HEADINGS, ARTIFACT_CATEGORY_PREFIXES, normaliseHeadingText, isLedeHeadingMatch, isArtifactCategoryHeading, stripArtifactCategoryAffix, } from './artifact-category-heading.js';
|
|
18
29
|
/**
|
|
19
|
-
*
|
|
20
|
-
*
|
|
21
|
-
*
|
|
22
|
-
*
|
|
23
|
-
*
|
|
24
|
-
* walk. Names are matched case-insensitively against the heading text
|
|
25
|
-
* (after stripping inline Markdown).
|
|
30
|
+
* Article-type aliases that author-templates use interchangeably with
|
|
31
|
+
* the humanized slug. `breaking` runs in particular alternate between
|
|
32
|
+
* `Breaking` and `Breaking News` in brief H1s. The aliases are matched
|
|
33
|
+
* alongside the canonical `humanizeSlug(articleType)` value so the
|
|
34
|
+
* downstream pattern + trailing-date regex pick them all up.
|
|
26
35
|
*/
|
|
27
|
-
|
|
28
|
-
'
|
|
29
|
-
|
|
30
|
-
'sixty-second read',
|
|
31
|
-
'lede',
|
|
32
|
-
'lead',
|
|
33
|
-
'tl;dr',
|
|
34
|
-
'tldr',
|
|
35
|
-
'synopsis',
|
|
36
|
-
'in brief',
|
|
37
|
-
'at a glance',
|
|
38
|
-
'bottom line',
|
|
39
|
-
'bluf',
|
|
40
|
-
'bluf — bottom line up front',
|
|
41
|
-
'bottom line up front',
|
|
42
|
-
'executive summary',
|
|
43
|
-
'executive briefing',
|
|
44
|
-
'master narrative',
|
|
45
|
-
'overview',
|
|
46
|
-
'headline judgement',
|
|
47
|
-
'headline judgment',
|
|
48
|
-
'key findings',
|
|
49
|
-
'key judgements',
|
|
50
|
-
'key judgments',
|
|
51
|
-
'situation summary',
|
|
52
|
-
'situation report',
|
|
53
|
-
'situation update',
|
|
54
|
-
];
|
|
36
|
+
const ARTICLE_TYPE_ALIASES = {
|
|
37
|
+
breaking: ['Breaking News'],
|
|
38
|
+
};
|
|
55
39
|
/**
|
|
56
|
-
*
|
|
57
|
-
*
|
|
58
|
-
* Summary — Week in Review (3 Apr – 1 May 2026)`). When a candidate H1
|
|
59
|
-
* starts with one of these prefixes followed by a separator (em/en dash,
|
|
60
|
-
* hyphen, or colon), the resolver treats it as **generic** so it does
|
|
61
|
-
* not leak into the article `<title>`. Compared lower-case, with leading
|
|
62
|
-
* punctuation stripped.
|
|
40
|
+
* Separators observed in the wild for brief H1s mixing the
|
|
41
|
+
* article-type label with a single ISO or human-friendly date.
|
|
63
42
|
*/
|
|
64
|
-
|
|
65
|
-
'actor mapping',
|
|
66
|
-
'analytical quality',
|
|
67
|
-
'breaking news analysis',
|
|
68
|
-
'coalition dynamics',
|
|
69
|
-
'commission wp alignment',
|
|
70
|
-
'committee activity report',
|
|
71
|
-
'cross run continuity',
|
|
72
|
-
'deep analysis',
|
|
73
|
-
'economic context',
|
|
74
|
-
'executive brief',
|
|
75
|
-
'executive briefing',
|
|
76
|
-
'executive intelligence brief',
|
|
77
|
-
'executive intelligence briefing',
|
|
78
|
-
'executive summary',
|
|
79
|
-
'forward indicators',
|
|
80
|
-
'historical baseline',
|
|
81
|
-
'impact matrix',
|
|
82
|
-
'intelligence assessment',
|
|
83
|
-
'intelligence briefing',
|
|
84
|
-
'intelligence synthesis summary',
|
|
85
|
-
'legislative output analysis',
|
|
86
|
-
'legislative pipeline analysis',
|
|
87
|
-
'legislative pipeline forecast',
|
|
88
|
-
'mandate fulfilment scorecard',
|
|
89
|
-
'master intelligence synthesis',
|
|
90
|
-
'mcp reliability audit',
|
|
91
|
-
'methodology reflection',
|
|
92
|
-
'monthly outlook',
|
|
93
|
-
'motions analysis',
|
|
94
|
-
'parliamentary calendar projection',
|
|
95
|
-
'pestle analysis',
|
|
96
|
-
'political intelligence brief',
|
|
97
|
-
'political risk',
|
|
98
|
-
'political threat landscape',
|
|
99
|
-
'presidency trio context',
|
|
100
|
-
'propositions analysis',
|
|
101
|
-
'quantitative swot',
|
|
102
|
-
'risk assessment',
|
|
103
|
-
'risk matrix',
|
|
104
|
-
'risk scoring',
|
|
105
|
-
'scenario forecast',
|
|
106
|
-
'seat projection',
|
|
107
|
-
'significance classification',
|
|
108
|
-
'situation report',
|
|
109
|
-
'situation summary',
|
|
110
|
-
'stakeholder analysis',
|
|
111
|
-
'stakeholder impact',
|
|
112
|
-
'stakeholder map',
|
|
113
|
-
'swot analysis',
|
|
114
|
-
'synthesis summary',
|
|
115
|
-
'threat assessment',
|
|
116
|
-
'threat model',
|
|
117
|
-
'voting patterns',
|
|
118
|
-
'weekly outlook',
|
|
119
|
-
'wildcards blackswans',
|
|
120
|
-
];
|
|
43
|
+
const GENERIC_HEADING_SEPARATORS = [' — ', ' - ', ' – ', ': ', ' ', ' | ', ', '];
|
|
121
44
|
/**
|
|
122
|
-
*
|
|
123
|
-
*
|
|
124
|
-
*
|
|
125
|
-
* characters (emoji, punctuation, spaces) so a heading like
|
|
126
|
-
* `🎯 Headline Judgement` compares equal to `headline judgement`.
|
|
127
|
-
*
|
|
128
|
-
* @param raw - Raw heading text (no leading hashes)
|
|
129
|
-
* @returns Lower-cased, decoration-stripped heading text
|
|
45
|
+
* Date-shape character class: digits, dashes (ISO) plus letters and
|
|
46
|
+
* single spaces (human-friendly forms like `8 April 2026`). Single-day
|
|
47
|
+
* only — date *ranges* are preserved as editorial scope-window content.
|
|
130
48
|
*/
|
|
131
|
-
|
|
132
|
-
return stripInlineMarkdown(raw)
|
|
133
|
-
.replace(/[*_`#]+/g, '')
|
|
134
|
-
.replace(/^[^A-Za-z0-9]+/, '')
|
|
135
|
-
.trim()
|
|
136
|
-
.toLowerCase();
|
|
137
|
-
}
|
|
49
|
+
const GENERIC_HEADING_DATE_SHAPE = '[\\d][\\d\\-]*|\\d{1,2}\\s+[A-Za-z]+\\s+\\d{4}';
|
|
138
50
|
/**
|
|
139
|
-
*
|
|
140
|
-
*
|
|
141
|
-
* the entry is followed by any non-alphanumeric character — covering
|
|
142
|
-
* localized parenthetical glosses written with ASCII or full-width
|
|
143
|
-
* punctuation (e.g. `bluf (bottom line up front)`, `bluf(結論先出し)`,
|
|
144
|
-
* `bluf — 핵심 결론`, `60-second read — what happened`).
|
|
51
|
+
* Aliases used for one article-type slug, including the canonical
|
|
52
|
+
* humanised slug plus any registered aliases.
|
|
145
53
|
*
|
|
146
|
-
* @param
|
|
147
|
-
* @
|
|
148
|
-
* {@link EDITORIAL_LEDE_HEADINGS}
|
|
149
|
-
* @returns `true` when `headingText` begins with `whitelistEntry` at a
|
|
150
|
-
* word boundary
|
|
54
|
+
* @param articleType - Article-type slug
|
|
55
|
+
* @returns Ordered list of label aliases
|
|
151
56
|
*/
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
if (!headingText.startsWith(whitelistEntry))
|
|
156
|
-
return false;
|
|
157
|
-
const next = headingText.charAt(whitelistEntry.length);
|
|
158
|
-
// Word boundary — anything that is not an ASCII letter/digit is a
|
|
159
|
-
// separator we accept. This works uniformly across ASCII parentheses,
|
|
160
|
-
// CJK full-width brackets `(`, dashes `— – -`, colons `:`, and the
|
|
161
|
-
// ideographic full-width colon `:`.
|
|
162
|
-
return next === '' || !/[a-z0-9]/.test(next);
|
|
57
|
+
function resolveLabelAliases(articleType) {
|
|
58
|
+
const human = humanizeSlug(articleType);
|
|
59
|
+
return [human, ...(ARTICLE_TYPE_ALIASES[articleType] ?? [])];
|
|
163
60
|
}
|
|
164
61
|
/**
|
|
165
|
-
*
|
|
166
|
-
* `
|
|
167
|
-
*
|
|
168
|
-
*
|
|
169
|
-
* 2026)`) and must not leak into the article `<title>`.
|
|
62
|
+
* Match an exact `<prefix?><label><sep><date>` shape, including the
|
|
63
|
+
* `EU Parliament ` / `EP ` prefix variants and the redundant
|
|
64
|
+
* `<label> <label> — <date>` form occasionally emitted by same-day
|
|
65
|
+
* collision runs.
|
|
170
66
|
*
|
|
171
|
-
* @param
|
|
172
|
-
* @
|
|
67
|
+
* @param normalized - Heading text after whitespace collapse
|
|
68
|
+
* @param label - Article-type label to test against
|
|
69
|
+
* @param date - ISO date string
|
|
70
|
+
* @returns `true` when the heading matches a known literal shape
|
|
173
71
|
*/
|
|
174
|
-
|
|
175
|
-
const
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
for (const prefix of ARTIFACT_CATEGORY_PREFIXES) {
|
|
179
|
-
if (normalized === prefix)
|
|
72
|
+
function matchesLiteralLabelDateShape(normalized, label, date) {
|
|
73
|
+
for (const sep of GENERIC_HEADING_SEPARATORS) {
|
|
74
|
+
const p = `${label}${sep}${date}`;
|
|
75
|
+
if (normalized === p)
|
|
180
76
|
return true;
|
|
181
|
-
if (normalized
|
|
182
|
-
normalized.startsWith(`${prefix} –`) ||
|
|
183
|
-
normalized.startsWith(`${prefix} -`) ||
|
|
184
|
-
normalized.startsWith(`${prefix}:`)) {
|
|
77
|
+
if (normalized === `EU Parliament ${p}`)
|
|
185
78
|
return true;
|
|
186
|
-
}
|
|
187
|
-
if (normalized.endsWith(` — ${prefix}`) ||
|
|
188
|
-
normalized.endsWith(` – ${prefix}`) ||
|
|
189
|
-
normalized.endsWith(` - ${prefix}`) ||
|
|
190
|
-
normalized.endsWith(`: ${prefix}`)) {
|
|
79
|
+
if (normalized === `EP ${p}`)
|
|
191
80
|
return true;
|
|
192
|
-
}
|
|
193
|
-
}
|
|
194
|
-
return false;
|
|
195
|
-
}
|
|
196
|
-
/**
|
|
197
|
-
* Strip a leading or trailing artifact-category label from a heading and
|
|
198
|
-
* return the editorial-topic core. When neither end carries a category
|
|
199
|
-
* label, the heading is returned unchanged. When the category label is
|
|
200
|
-
* the **entire** heading (e.g. `# Executive Brief`) the result is the
|
|
201
|
-
* empty string.
|
|
202
|
-
*
|
|
203
|
-
* Examples:
|
|
204
|
-
* - `Executive Brief — EU Parliament Motions` → `EU Parliament Motions`
|
|
205
|
-
* - `EU Parliament Propositions — Executive Brief` → `EU Parliament Propositions`
|
|
206
|
-
* - `EP10 Term Outlook — Executive Brief` → `EP10 Term Outlook`
|
|
207
|
-
* - `Key Legislative Developments — Deep Analysis (2026-05-08)` → `Key Legislative Developments`
|
|
208
|
-
* - `Synthesis Summary — EP Motions & Adopted Texts` → `EP Motions & Adopted Texts`
|
|
209
|
-
*
|
|
210
|
-
* Trailing parenthesised metadata (`(2026-05-08)`, `(May 2026)`) is also
|
|
211
|
-
* stripped because it functions as a date stamp rather than editorial
|
|
212
|
-
* copy. The returned core is trimmed of whitespace and trailing
|
|
213
|
-
* punctuation.
|
|
214
|
-
*
|
|
215
|
-
* @param heading - Raw heading text (post-{@link stripInlineMarkdown})
|
|
216
|
-
* @returns Editorial-topic core, or empty string when only the category survived
|
|
217
|
-
*/
|
|
218
|
-
export function stripArtifactCategoryAffix(heading) {
|
|
219
|
-
const trimmed = heading.trim();
|
|
220
|
-
if (trimmed === '')
|
|
221
|
-
return '';
|
|
222
|
-
const sortedPrefixes = [...ARTIFACT_CATEGORY_PREFIXES].sort((a, b) => b.length - a.length);
|
|
223
|
-
const normalized = normaliseCategoryHeading(trimmed);
|
|
224
|
-
const skip = trimmed.length - normalized.length;
|
|
225
|
-
const visible = trimmed.slice(skip < 0 ? 0 : skip);
|
|
226
|
-
const visibleClean = visible.replace(/\s*\([^)]{1,80}\)\s*$/u, '').trim();
|
|
227
|
-
const normalizedClean = normaliseCategoryHeading(visibleClean);
|
|
228
|
-
for (const prefix of sortedPrefixes) {
|
|
229
|
-
for (const sep of [' — ', ' – ', ' - ', ': ']) {
|
|
230
|
-
const candidate = `${prefix}${sep}`;
|
|
231
|
-
if (normalizedClean.startsWith(candidate)) {
|
|
232
|
-
const core = visibleClean.slice(candidate.length).trim();
|
|
233
|
-
return cleanupAffixCore(core);
|
|
234
|
-
}
|
|
235
|
-
}
|
|
236
|
-
for (const sep of [' — ', ' – ', ' - ', ': ']) {
|
|
237
|
-
const candidate = `${sep}${prefix}`;
|
|
238
|
-
if (normalizedClean.endsWith(candidate)) {
|
|
239
|
-
const core = visibleClean.slice(0, visibleClean.length - candidate.length).trim();
|
|
240
|
-
return cleanupAffixCore(core);
|
|
241
|
-
}
|
|
242
|
-
}
|
|
243
|
-
if (normalizedClean === prefix)
|
|
244
|
-
return '';
|
|
245
81
|
}
|
|
246
|
-
|
|
247
|
-
}
|
|
248
|
-
/**
|
|
249
|
-
* Tidy the editorial-topic core returned by
|
|
250
|
-
* {@link stripArtifactCategoryAffix}: drop trailing parenthesised
|
|
251
|
-
* metadata (`(2026-05-08)`, `(May 2026)`) and trailing punctuation. When
|
|
252
|
-
* stripping leaves the string too short to be meaningful (<5 chars),
|
|
253
|
-
* return the empty string so callers fall through to lower tiers.
|
|
254
|
-
*
|
|
255
|
-
* @param core - Heading with the category label already stripped
|
|
256
|
-
* @returns Cleaned editorial-topic core, or empty string when too short
|
|
257
|
-
*/
|
|
258
|
-
function cleanupAffixCore(core) {
|
|
259
|
-
const withoutTrailingParens = core.replace(/\s*\([^)]{1,80}\)\s*$/u, '').trim();
|
|
260
|
-
const withoutTrailingPunct = withoutTrailingParens.replace(/[—–:;,.\s-]+$/u, '').trim();
|
|
261
|
-
if (withoutTrailingPunct.length < 5)
|
|
262
|
-
return '';
|
|
263
|
-
return withoutTrailingPunct;
|
|
82
|
+
const labelRedundant = `${label} ${label}`;
|
|
83
|
+
return normalized === `${labelRedundant} — ${date}`;
|
|
264
84
|
}
|
|
265
85
|
/**
|
|
266
|
-
*
|
|
267
|
-
*
|
|
268
|
-
*
|
|
86
|
+
* Match `<prefix?><label><sep-or-space><any-date>` patterns where the
|
|
87
|
+
* date token can be any ISO / human / single-day-range shape. Anchored
|
|
88
|
+
* to end-of-string so it cannot fire on editorial sentences that
|
|
89
|
+
* happen to contain a date token mid-clause.
|
|
269
90
|
*
|
|
270
|
-
* @param
|
|
271
|
-
* @
|
|
91
|
+
* @param normalized - Heading text after whitespace collapse
|
|
92
|
+
* @param label - Article-type label to test against
|
|
93
|
+
* @returns `true` when the heading matches the trailing-date shape
|
|
272
94
|
*/
|
|
273
|
-
function
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
95
|
+
function matchesTrailingDateShape(normalized, label) {
|
|
96
|
+
const trailingDateOnly = new RegExp(`^(?:EU Parliament |EP )?${escapeRegex(label)}\\s*[—–\\-|,:]\\s*(?:${GENERIC_HEADING_DATE_SHAPE})$`, 'u');
|
|
97
|
+
if (trailingDateOnly.test(normalized))
|
|
98
|
+
return true;
|
|
99
|
+
// Same shape but label followed directly by a date with whitespace only
|
|
100
|
+
// (e.g. `Breaking News 2026-04-01`).
|
|
101
|
+
const labelSpaceDate = new RegExp(`^(?:EU Parliament |EP )?${escapeRegex(label)}\\s+(?:${GENERIC_HEADING_DATE_SHAPE})$`, 'u');
|
|
102
|
+
return labelSpaceDate.test(normalized);
|
|
279
103
|
}
|
|
280
104
|
/**
|
|
281
105
|
* Return `true` when the supplied heading matches the generic
|
|
@@ -284,7 +108,7 @@ function normaliseCategoryHeading(raw) {
|
|
|
284
108
|
* separators, and matches the `breaking-breaking` variant that some
|
|
285
109
|
* same-day collision runs produce.
|
|
286
110
|
*
|
|
287
|
-
* @param heading - Plain-text heading (post
|
|
111
|
+
* @param heading - Plain-text heading (post-`stripInlineMarkdown`)
|
|
288
112
|
* @param articleType - Article type slug
|
|
289
113
|
* @param date - ISO date string
|
|
290
114
|
* @returns `true` when the heading carries no editorial information
|
|
@@ -295,27 +119,12 @@ export function isGenericHeading(heading, articleType, date) {
|
|
|
295
119
|
return true;
|
|
296
120
|
if (isArtifactCategoryHeading(normalized))
|
|
297
121
|
return true;
|
|
298
|
-
const
|
|
299
|
-
|
|
300
|
-
`${human} — ${date}`,
|
|
301
|
-
`${human} - ${date}`,
|
|
302
|
-
`${human} – ${date}`,
|
|
303
|
-
`${human}: ${date}`,
|
|
304
|
-
`${human} ${date}`,
|
|
305
|
-
];
|
|
306
|
-
const humanRedundant = `${human} ${human}`;
|
|
307
|
-
for (const p of patterns) {
|
|
308
|
-
if (normalized === p)
|
|
309
|
-
return true;
|
|
310
|
-
if (normalized === `EU Parliament ${p}`)
|
|
122
|
+
for (const label of resolveLabelAliases(articleType)) {
|
|
123
|
+
if (matchesLiteralLabelDateShape(normalized, label, date))
|
|
311
124
|
return true;
|
|
312
|
-
if (normalized
|
|
125
|
+
if (matchesTrailingDateShape(normalized, label))
|
|
313
126
|
return true;
|
|
314
127
|
}
|
|
315
|
-
const trailingDateOnly = new RegExp(`^${escapeRegex(human)}\\s*[—–-]\\s*[\\d-]+$`, 'u');
|
|
316
|
-
if (trailingDateOnly.test(normalized)) {
|
|
317
|
-
return true;
|
|
318
|
-
}
|
|
319
128
|
if (isCategoryNounHeading(normalized, articleType))
|
|
320
129
|
return true;
|
|
321
130
|
if (isBareInstitutionalHeading(normalized))
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module Aggregator/Metadata/KeywordFilters
|
|
3
|
+
* @description Cross-site keyword catalogue and noise-token filter used
|
|
4
|
+
* by {@link buildSeoKeywords} in `resolve-helpers.ts`.
|
|
5
|
+
*
|
|
6
|
+
* Two responsibilities:
|
|
7
|
+
*
|
|
8
|
+
* 1. **Always-on cross-site keywords** ({@link CROSS_SITE_KEYWORDS})
|
|
9
|
+
* are prepended to every article's `<meta name="keywords">` list
|
|
10
|
+
* regardless of language, so search-engine discovery of the
|
|
11
|
+
* Hack23 civic-tech portfolio (EU Parliament Monitor +
|
|
12
|
+
* Riksdagsmonitor + CIA) is consistent across all 14 localized
|
|
13
|
+
* surfaces. The user explicitly requested
|
|
14
|
+
* `riksdagsmonitor, political intelligence, riksdag, regeringen`
|
|
15
|
+
* (the sister Swedish-Parliament project) plus EP analogues.
|
|
16
|
+
*
|
|
17
|
+
* 2. **Noise-token rejection** ({@link isNoiseKeywordToken}) drops
|
|
18
|
+
* the UUID-fragment tokens (`77fc920c`, `3a76`, `9db5`, …) and
|
|
19
|
+
* synthetic run-id slugs (`propositions-run261-1779431162`) that
|
|
20
|
+
* the previous keyword extractor leaked into `<head>` when a
|
|
21
|
+
* brief mentioned its own run id editorially (e.g.
|
|
22
|
+
* `Analysis run 77fc920c-3a76-4813-9db5-43a7e9acc25e returned
|
|
23
|
+
* 0 classified actors`).
|
|
24
|
+
*
|
|
25
|
+
* Pure leaf module — no imports.
|
|
26
|
+
*/
|
|
27
|
+
/**
|
|
28
|
+
* Cross-site SEO keywords prepended to every article in every
|
|
29
|
+
* language. Order is meaningful: stronger civic-tech-portfolio terms
|
|
30
|
+
* first so they appear ahead of the per-article-type keywords when
|
|
31
|
+
* the 16-entry budget is exceeded.
|
|
32
|
+
*/
|
|
33
|
+
export declare const CROSS_SITE_KEYWORDS: readonly string[];
|
|
34
|
+
/**
|
|
35
|
+
* Decide whether a single keyword token should be discarded as noise.
|
|
36
|
+
*
|
|
37
|
+
* The current rules reject tokens that:
|
|
38
|
+
*
|
|
39
|
+
* - Look like a UUID hex chunk: ≥4 chars and consist solely of the
|
|
40
|
+
* `[0-9a-f]` alphabet **and** contain at least one digit (so
|
|
41
|
+
* real English words like `dead` / `face` survive). Tokens of
|
|
42
|
+
* length ≥8 are always rejected (a real English word of that
|
|
43
|
+
* length composed exclusively of hex letters is vanishingly rare;
|
|
44
|
+
* the allowlist guards the short cases).
|
|
45
|
+
* - Are mostly digits (≥80 % digit characters) — runtime epoch
|
|
46
|
+
* suffixes such as `1779431162` and committee-codeoid mashes like
|
|
47
|
+
* `2024k1234`.
|
|
48
|
+
* - Start with `run` and end with all-digits (`run261`, `run17`),
|
|
49
|
+
* the per-run slug suffix the aggregator stamps onto run ids.
|
|
50
|
+
* - Match the full opaque-runId shape `<type>-run<digits>-<digits>`
|
|
51
|
+
* after a strip / normalization round-trip.
|
|
52
|
+
*
|
|
53
|
+
* Returns `false` for normal vocabulary so the keyword list stays
|
|
54
|
+
* useful — every reject path is intentionally narrow.
|
|
55
|
+
*
|
|
56
|
+
* @param token - Single token candidate
|
|
57
|
+
* @returns `true` when the token should be dropped from keywords
|
|
58
|
+
*/
|
|
59
|
+
export declare function isNoiseKeywordToken(token: string): boolean;
|
|
60
|
+
//# sourceMappingURL=keyword-filters.d.ts.map
|