npm - euparliamentmonitor - Versions diffs - 0.9.21 → 0.9.23 - Mend

euparliamentmonitor 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

package/package.json +6 -2
package/scripts/aggregator/article-metadata.js +69 -14
package/scripts/aggregator/editorial-brief-resolver.js +23 -0
package/scripts/aggregator/html/headline.d.ts +41 -9
package/scripts/aggregator/html/headline.js +69 -10
package/scripts/aggregator/html/shell.js +73 -17
package/scripts/aggregator/manifest/index.d.ts +1 -1
package/scripts/aggregator/manifest/index.js +1 -1
package/scripts/aggregator/manifest/resolver.d.ts +28 -1
package/scripts/aggregator/manifest/resolver.js +61 -5
package/scripts/aggregator/markdown-renderer.js +11 -0
package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
package/scripts/aggregator/metadata/artifact-walker.js +29 -10
package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
package/scripts/aggregator/metadata/brief-body.js +69 -0
package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
package/scripts/aggregator/metadata/heading-rules.js +78 -269
package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
package/scripts/aggregator/metadata/keyword-filters.js +156 -0
package/scripts/aggregator/metadata/lede-extractor.js +11 -2
package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
package/scripts/aggregator/metadata/seo-budgets.js +202 -0
package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
package/scripts/aggregator/metadata/text-truncate.js +277 -0
package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
package/scripts/aggregator/metadata/text-utils.js +119 -439
package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
package/scripts/aggregator/metadata/title-rejection.js +179 -0
package/scripts/copy-vendor.js +84 -112
package/scripts/dump-article-seo.js +640 -0
package/scripts/fix-mermaid-diagrams.js +931 -0
package/scripts/generators/news-indexes/backfill.d.ts +6 -1
package/scripts/generators/news-indexes/backfill.js +71 -4
package/scripts/validate-article-seo.js +534 -0
package/scripts/validate-mermaid-diagrams.js +306 -0

package/scripts/aggregator/metadata/heading-rules.d.ts CHANGED Viewed

@@ -1,83 +1,4 @@
-/**
- * Headings inside an editorial artefact that carry the journalist's lede
- * paragraph (a one-paragraph summary of "what happened, why it matters").
- * When the resolver sees one of these as a `## …` heading inside the
- * editorial artefact, it prefers the first prose paragraph that follows
- * it as the description (and as a title fallback) over a generic line
- * walk. Names are matched case-insensitively against the heading text
- * (after stripping inline Markdown).
- */
-export declare const EDITORIAL_LEDE_HEADINGS: readonly string[];
-/**
- * Artifact-category prefixes that appear inside editorial-artefact H1s as
- * a structural label rather than an editorial headline (e.g. `# Synthesis
- * Summary — Week in Review (3 Apr – 1 May 2026)`). When a candidate H1
- * starts with one of these prefixes followed by a separator (em/en dash,
- * hyphen, or colon), the resolver treats it as **generic** so it does
- * not leak into the article `<title>`. Compared lower-case, with leading
- * punctuation stripped.
- */
-export declare const ARTIFACT_CATEGORY_PREFIXES: readonly string[];
-/**
- * Normalise a Markdown heading's text for comparison against the
- * editorial-lede heading whitelist. Strips inline Markdown decorations
- * (`*`, `_`, `` ` ``, `#`), then strips any leading non-alphanumeric
- * characters (emoji, punctuation, spaces) so a heading like
- * `🎯 Headline Judgement` compares equal to `headline judgement`.
- *
- * @param raw - Raw heading text (no leading hashes)
- * @returns Lower-cased, decoration-stripped heading text
- */
-export declare function normaliseHeadingText(raw: string): string;
-/**
- * Word-boundary match against an editorial-lede whitelist entry. Matches
- * when the normalised heading equals the whitelist entry exactly, or when
- * the entry is followed by any non-alphanumeric character — covering
- * localized parenthetical glosses written with ASCII or full-width
- * punctuation (e.g. `bluf (bottom line up front)`, `bluf（結論先出し）`,
- * `bluf — 핵심 결론`, `60-second read — what happened`).
- *
- * @param headingText - Normalised heading text (lower-case, decoration-stripped)
- * @param whitelistEntry - Lower-case whitelist entry from
- *                        {@link EDITORIAL_LEDE_HEADINGS}
- * @returns `true` when `headingText` begins with `whitelistEntry` at a
- *          word boundary
- */
-export declare function isLedeHeadingMatch(headingText: string, whitelistEntry: string): boolean;
-/**
- * Return `true` when an artefact-H1 begins with one of the
- * `ARTIFACT_CATEGORY_PREFIXES` followed by a separator. Such H1s
- * carry the artefact's structural label rather than a journalist's
- * headline (e.g. `# Synthesis Summary — Week in Review (3 Apr – 1 May
- * 2026)`) and must not leak into the article `<title>`.
- *
- * @param heading - Plain-text H1 (after `stripInlineMarkdown`)
- * @returns `true` when the heading is an artefact-category label
- */
-export declare function isArtifactCategoryHeading(heading: string): boolean;
-/**
- * Strip a leading or trailing artifact-category label from a heading and
- * return the editorial-topic core. When neither end carries a category
- * label, the heading is returned unchanged. When the category label is
- * the **entire** heading (e.g. `# Executive Brief`) the result is the
- * empty string.
- *
- * Examples:
- * - `Executive Brief — EU Parliament Motions` → `EU Parliament Motions`
- * - `EU Parliament Propositions — Executive Brief` → `EU Parliament Propositions`
- * - `EP10 Term Outlook — Executive Brief` → `EP10 Term Outlook`
- * - `Key Legislative Developments — Deep Analysis (2026-05-08)` → `Key Legislative Developments`
- * - `Synthesis Summary — EP Motions & Adopted Texts` → `EP Motions & Adopted Texts`
- *
- * Trailing parenthesised metadata (`(2026-05-08)`, `(May 2026)`) is also
- * stripped because it functions as a date stamp rather than editorial
- * copy. The returned core is trimmed of whitespace and trailing
- * punctuation.
- *
- * @param heading - Raw heading text (post-{@link stripInlineMarkdown})
- * @returns Editorial-topic core, or empty string when only the category survived
- */
-export declare function stripArtifactCategoryAffix(heading: string): string;
+export { EDITORIAL_LEDE_HEADINGS, ARTIFACT_CATEGORY_PREFIXES, normaliseHeadingText, isLedeHeadingMatch, isArtifactCategoryHeading, stripArtifactCategoryAffix, } from './artifact-category-heading.js';
 /**
  * Return `true` when the supplied heading matches the generic
  * `${humanize(articleType)} — ${date}` form that the aggregator writes as
@@ -85,7 +6,7 @@ export declare function stripArtifactCategoryAffix(heading: string): string;
  * separators, and matches the `breaking-breaking` variant that some
  * same-day collision runs produce.
  *
- * @param heading - Plain-text heading (post-{@link stripInlineMarkdown})
+ * @param heading - Plain-text heading (post-`stripInlineMarkdown`)
  * @param articleType - Article type slug
  * @param date - ISO date string
  * @returns `true` when the heading carries no editorial information

package/scripts/aggregator/metadata/heading-rules.js CHANGED Viewed

@@ -2,280 +2,104 @@
 // SPDX-License-Identifier: Apache-2.0
 /**
  * @module Aggregator/Metadata/HeadingRules
- * @description Heading-classification helpers extracted from
- * `article-metadata.ts`. Owns the editorial-lede whitelist, the artefact
- * category prefix list, the institutional-noun whitelist, and the
- * `isGenericHeading` / `stripArtifactCategoryAffix` predicates that
- * drive title-tier selection in the resolver.
+ * @description Heading-classification helpers used by the article
+ * metadata resolver. Owns:
  *
- * Pure leaf module — the only runtime dependencies are
- * {@link stripInlineMarkdown} (text-utils) and {@link humanizeSlug}
- * (slug). Re-exported through `article-metadata.ts` for back-compat
- * with existing call sites.
+ *   - {@link isGenericHeading} — the resolver's master generic-heading
+ *     predicate (drives title-tier selection).
+ *   - Internal helpers for institutional-noun, category-noun, and
+ *     `<label><sep><date>` boilerplate detection.
+ *
+ * The editorial-lede whitelist, artifact-category prefix list, and the
+ * {@link isArtifactCategoryHeading} / {@link stripArtifactCategoryAffix}
+ * helpers were extracted to `./artifact-category-heading.ts` in May 2026
+ * to keep this file under the 600-raw-line drift-guard. They are
+ * **re-exported here** so existing call sites (`lede-extractor.ts`,
+ * `artifact-walker.ts`, `article-metadata.ts`) keep working unchanged.
+ *
+ * Pure leaf module. The only runtime dependencies are
+ * {@link humanizeSlug} (slug) and the helpers re-exported from
+ * `./artifact-category-heading.js`.
  */
-import { stripInlineMarkdown } from './text-utils.js';
 import { humanizeSlug } from './slug.js';
+import { isArtifactCategoryHeading } from './artifact-category-heading.js';
+// Re-export the artifact-category surface so existing imports continue
+// to work without touching consumers.
+export { EDITORIAL_LEDE_HEADINGS, ARTIFACT_CATEGORY_PREFIXES, normaliseHeadingText, isLedeHeadingMatch, isArtifactCategoryHeading, stripArtifactCategoryAffix, } from './artifact-category-heading.js';
 /**
- * Headings inside an editorial artefact that carry the journalist's lede
- * paragraph (a one-paragraph summary of "what happened, why it matters").
- * When the resolver sees one of these as a `## …` heading inside the
- * editorial artefact, it prefers the first prose paragraph that follows
- * it as the description (and as a title fallback) over a generic line
- * walk. Names are matched case-insensitively against the heading text
- * (after stripping inline Markdown).
+ * Article-type aliases that author-templates use interchangeably with
+ * the humanized slug. `breaking` runs in particular alternate between
+ * `Breaking` and `Breaking News` in brief H1s. The aliases are matched
+ * alongside the canonical `humanizeSlug(articleType)` value so the
+ * downstream pattern + trailing-date regex pick them all up.
  */
-export const EDITORIAL_LEDE_HEADINGS = [
-    '60-second read',
-    '60 second read',
-    'sixty-second read',
-    'lede',
-    'lead',
-    'tl;dr',
-    'tldr',
-    'synopsis',
-    'in brief',
-    'at a glance',
-    'bottom line',
-    'bluf',
-    'bluf — bottom line up front',
-    'bottom line up front',
-    'executive summary',
-    'executive briefing',
-    'master narrative',
-    'overview',
-    'headline judgement',
-    'headline judgment',
-    'key findings',
-    'key judgements',
-    'key judgments',
-    'situation summary',
-    'situation report',
-    'situation update',
-];
+const ARTICLE_TYPE_ALIASES = {
+    breaking: ['Breaking News'],
+};
 /**
- * Artifact-category prefixes that appear inside editorial-artefact H1s as
- * a structural label rather than an editorial headline (e.g. `# Synthesis
- * Summary — Week in Review (3 Apr – 1 May 2026)`). When a candidate H1
- * starts with one of these prefixes followed by a separator (em/en dash,
- * hyphen, or colon), the resolver treats it as **generic** so it does
- * not leak into the article `<title>`. Compared lower-case, with leading
- * punctuation stripped.
+ * Separators observed in the wild for brief H1s mixing the
+ * article-type label with a single ISO or human-friendly date.
  */
-export const ARTIFACT_CATEGORY_PREFIXES = [
-    'actor mapping',
-    'analytical quality',
-    'breaking news analysis',
-    'coalition dynamics',
-    'commission wp alignment',
-    'committee activity report',
-    'cross run continuity',
-    'deep analysis',
-    'economic context',
-    'executive brief',
-    'executive briefing',
-    'executive intelligence brief',
-    'executive intelligence briefing',
-    'executive summary',
-    'forward indicators',
-    'historical baseline',
-    'impact matrix',
-    'intelligence assessment',
-    'intelligence briefing',
-    'intelligence synthesis summary',
-    'legislative output analysis',
-    'legislative pipeline analysis',
-    'legislative pipeline forecast',
-    'mandate fulfilment scorecard',
-    'master intelligence synthesis',
-    'mcp reliability audit',
-    'methodology reflection',
-    'monthly outlook',
-    'motions analysis',
-    'parliamentary calendar projection',
-    'pestle analysis',
-    'political intelligence brief',
-    'political risk',
-    'political threat landscape',
-    'presidency trio context',
-    'propositions analysis',
-    'quantitative swot',
-    'risk assessment',
-    'risk matrix',
-    'risk scoring',
-    'scenario forecast',
-    'seat projection',
-    'significance classification',
-    'situation report',
-    'situation summary',
-    'stakeholder analysis',
-    'stakeholder impact',
-    'stakeholder map',
-    'swot analysis',
-    'synthesis summary',
-    'threat assessment',
-    'threat model',
-    'voting patterns',
-    'weekly outlook',
-    'wildcards blackswans',
-];
+const GENERIC_HEADING_SEPARATORS = [' — ', ' - ', ' – ', ': ', ' ', ' | ', ', '];
 /**
- * Normalise a Markdown heading's text for comparison against the
- * editorial-lede heading whitelist. Strips inline Markdown decorations
- * (`*`, `_`, `` ` ``, `#`), then strips any leading non-alphanumeric
- * characters (emoji, punctuation, spaces) so a heading like
- * `🎯 Headline Judgement` compares equal to `headline judgement`.
- *
- * @param raw - Raw heading text (no leading hashes)
- * @returns Lower-cased, decoration-stripped heading text
+ * Date-shape regex alternation: either a digit followed by further
+ * digits / dashes (ISO forms like `2026-04-08`), or a human-friendly
+ * `<day> <month-name> <year>` form like `8 April 2026`. Single-day
+ * only — date *ranges* are preserved as editorial scope-window content.
  */
-export function normaliseHeadingText(raw) {
-    return stripInlineMarkdown(raw)
-        .replace(/[*_`#]+/g, '')
-        .replace(/^[^A-Za-z0-9]+/, '')
-        .trim()
-        .toLowerCase();
-}
+const GENERIC_HEADING_DATE_SHAPE = '[\\d][\\d\\-]*|\\d{1,2}\\s+[A-Za-z]+\\s+\\d{4}';
 /**
- * Word-boundary match against an editorial-lede whitelist entry. Matches
- * when the normalised heading equals the whitelist entry exactly, or when
- * the entry is followed by any non-alphanumeric character — covering
- * localized parenthetical glosses written with ASCII or full-width
- * punctuation (e.g. `bluf (bottom line up front)`, `bluf（結論先出し）`,
- * `bluf — 핵심 결론`, `60-second read — what happened`).
+ * Aliases used for one article-type slug, including the canonical
+ * humanised slug plus any registered aliases.
  *
- * @param headingText - Normalised heading text (lower-case, decoration-stripped)
- * @param whitelistEntry - Lower-case whitelist entry from
- *                        {@link EDITORIAL_LEDE_HEADINGS}
- * @returns `true` when `headingText` begins with `whitelistEntry` at a
- *          word boundary
+ * @param articleType - Article-type slug
+ * @returns Ordered list of label aliases
  */
-export function isLedeHeadingMatch(headingText, whitelistEntry) {
-    if (headingText === whitelistEntry)
-        return true;
-    if (!headingText.startsWith(whitelistEntry))
-        return false;
-    const next = headingText.charAt(whitelistEntry.length);
-    // Word boundary — anything that is not an ASCII letter/digit is a
-    // separator we accept. This works uniformly across ASCII parentheses,
-    // CJK full-width brackets `（`, dashes `— – -`, colons `:`, and the
-    // ideographic full-width colon `：`.
-    return next === '' || !/[a-z0-9]/.test(next);
+function resolveLabelAliases(articleType) {
+    const human = humanizeSlug(articleType);
+    return [human, ...(ARTICLE_TYPE_ALIASES[articleType] ?? [])];
 }
 /**
- * Return `true` when an artefact-H1 begins with one of the
- * `ARTIFACT_CATEGORY_PREFIXES` followed by a separator. Such H1s
- * carry the artefact's structural label rather than a journalist's
- * headline (e.g. `# Synthesis Summary — Week in Review (3 Apr – 1 May
- * 2026)`) and must not leak into the article `<title>`.
+ * Match an exact `<prefix?><label><sep><date>` shape, including the
+ * `EU Parliament ` / `EP ` prefix variants and the redundant
+ * `<label> <label> — <date>` form occasionally emitted by same-day
+ * collision runs.
  *
- * @param heading - Plain-text H1 (after `stripInlineMarkdown`)
- * @returns `true` when the heading is an artefact-category label
+ * @param normalized - Heading text after whitespace collapse
+ * @param label - Article-type label to test against
+ * @param date - ISO date string
+ * @returns `true` when the heading matches a known literal shape
  */
-export function isArtifactCategoryHeading(heading) {
-    const normalized = normaliseCategoryHeading(heading);
-    if (normalized === '')
-        return false;
-    for (const prefix of ARTIFACT_CATEGORY_PREFIXES) {
-        if (normalized === prefix)
+function matchesLiteralLabelDateShape(normalized, label, date) {
+    for (const sep of GENERIC_HEADING_SEPARATORS) {
+        const p = `${label}${sep}${date}`;
+        if (normalized === p)
             return true;
-        if (normalized.startsWith(`${prefix} —`) ||
-            normalized.startsWith(`${prefix} –`) ||
-            normalized.startsWith(`${prefix} -`) ||
-            normalized.startsWith(`${prefix}:`)) {
+        if (normalized === `EU Parliament ${p}`)
             return true;
-        }
-        if (normalized.endsWith(` — ${prefix}`) ||
-            normalized.endsWith(` – ${prefix}`) ||
-            normalized.endsWith(` - ${prefix}`) ||
-            normalized.endsWith(`: ${prefix}`)) {
+        if (normalized === `EP ${p}`)
             return true;
-        }
-    }
-    return false;
-}
-/**
- * Strip a leading or trailing artifact-category label from a heading and
- * return the editorial-topic core. When neither end carries a category
- * label, the heading is returned unchanged. When the category label is
- * the **entire** heading (e.g. `# Executive Brief`) the result is the
- * empty string.
- *
- * Examples:
- * - `Executive Brief — EU Parliament Motions` → `EU Parliament Motions`
- * - `EU Parliament Propositions — Executive Brief` → `EU Parliament Propositions`
- * - `EP10 Term Outlook — Executive Brief` → `EP10 Term Outlook`
- * - `Key Legislative Developments — Deep Analysis (2026-05-08)` → `Key Legislative Developments`
- * - `Synthesis Summary — EP Motions & Adopted Texts` → `EP Motions & Adopted Texts`
- *
- * Trailing parenthesised metadata (`(2026-05-08)`, `(May 2026)`) is also
- * stripped because it functions as a date stamp rather than editorial
- * copy. The returned core is trimmed of whitespace and trailing
- * punctuation.
- *
- * @param heading - Raw heading text (post-{@link stripInlineMarkdown})
- * @returns Editorial-topic core, or empty string when only the category survived
- */
-export function stripArtifactCategoryAffix(heading) {
-    const trimmed = heading.trim();
-    if (trimmed === '')
-        return '';
-    const sortedPrefixes = [...ARTIFACT_CATEGORY_PREFIXES].sort((a, b) => b.length - a.length);
-    const normalized = normaliseCategoryHeading(trimmed);
-    const skip = trimmed.length - normalized.length;
-    const visible = trimmed.slice(skip < 0 ? 0 : skip);
-    const visibleClean = visible.replace(/\s*\([^)]{1,80}\)\s*$/u, '').trim();
-    const normalizedClean = normaliseCategoryHeading(visibleClean);
-    for (const prefix of sortedPrefixes) {
-        for (const sep of [' — ', ' – ', ' - ', ': ']) {
-            const candidate = `${prefix}${sep}`;
-            if (normalizedClean.startsWith(candidate)) {
-                const core = visibleClean.slice(candidate.length).trim();
-                return cleanupAffixCore(core);
-            }
-        }
-        for (const sep of [' — ', ' – ', ' - ', ': ']) {
-            const candidate = `${sep}${prefix}`;
-            if (normalizedClean.endsWith(candidate)) {
-                const core = visibleClean.slice(0, visibleClean.length - candidate.length).trim();
-                return cleanupAffixCore(core);
-            }
-        }
-        if (normalizedClean === prefix)
-            return '';
     }
-    return trimmed;
-}
-/**
- * Tidy the editorial-topic core returned by
- * {@link stripArtifactCategoryAffix}: drop trailing parenthesised
- * metadata (`(2026-05-08)`, `(May 2026)`) and trailing punctuation. When
- * stripping leaves the string too short to be meaningful (<5 chars),
- * return the empty string so callers fall through to lower tiers.
- *
- * @param core - Heading with the category label already stripped
- * @returns Cleaned editorial-topic core, or empty string when too short
- */
-function cleanupAffixCore(core) {
-    const withoutTrailingParens = core.replace(/\s*\([^)]{1,80}\)\s*$/u, '').trim();
-    const withoutTrailingPunct = withoutTrailingParens.replace(/[—–:;,.\s-]+$/u, '').trim();
-    if (withoutTrailingPunct.length < 5)
-        return '';
-    return withoutTrailingPunct;
+    const labelRedundant = `${label} ${label}`;
+    return normalized === `${labelRedundant} — ${date}`;
 }
 /**
- * Lower-case, decoration-stripped form used by the artifact-category
- * matchers. Strips inline Markdown, leading non-alphanumeric runs (emoji,
- * decoration), and collapses whitespace to a single space.
+ * Match `<prefix?><label><sep-or-space><any-date>` patterns where the
+ * date token can be any ISO or human-friendly single-day shape. Anchored
+ * to end-of-string so it cannot fire on editorial sentences that
+ * happen to contain a date token mid-clause.
  *
- * @param raw - Raw heading text
- * @returns Lower-case normalised form
+ * @param normalized - Heading text after whitespace collapse
+ * @param label - Article-type label to test against
+ * @returns `true` when the heading matches the trailing-date shape
  */
-function normaliseCategoryHeading(raw) {
-    return stripInlineMarkdown(raw)
-        .trim()
-        .toLowerCase()
-        .replace(/^[^a-z0-9]+/, '')
-        .replace(/\s+/g, ' ');
+function matchesTrailingDateShape(normalized, label) {
+    const trailingDateOnly = new RegExp(`^(?:EU Parliament |EP )?${escapeRegex(label)}\\s*[—–\\-|,:]\\s*(?:${GENERIC_HEADING_DATE_SHAPE})$`, 'u');
+    if (trailingDateOnly.test(normalized))
+        return true;
+    // Same shape but label followed directly by a date with whitespace only
+    // (e.g. `Breaking News 2026-04-01`).
+    const labelSpaceDate = new RegExp(`^(?:EU Parliament |EP )?${escapeRegex(label)}\\s+(?:${GENERIC_HEADING_DATE_SHAPE})$`, 'u');
+    return labelSpaceDate.test(normalized);
 }
 /**
  * Return `true` when the supplied heading matches the generic
@@ -284,7 +108,7 @@ function normaliseCategoryHeading(raw) {
  * separators, and matches the `breaking-breaking` variant that some
  * same-day collision runs produce.
  *
- * @param heading - Plain-text heading (post-{@link stripInlineMarkdown})
+ * @param heading - Plain-text heading (post-`stripInlineMarkdown`)
  * @param articleType - Article type slug
  * @param date - ISO date string
  * @returns `true` when the heading carries no editorial information
@@ -295,27 +119,12 @@ export function isGenericHeading(heading, articleType, date) {
         return true;
     if (isArtifactCategoryHeading(normalized))
         return true;
-    const human = humanizeSlug(articleType);
-    const patterns = [
-        `${human} — ${date}`,
-        `${human} - ${date}`,
-        `${human} – ${date}`,
-        `${human}: ${date}`,
-        `${human} ${date}`,
-    ];
-    const humanRedundant = `${human} ${human}`;
-    for (const p of patterns) {
-        if (normalized === p)
-            return true;
-        if (normalized === `EU Parliament ${p}`)
+    for (const label of resolveLabelAliases(articleType)) {
+        if (matchesLiteralLabelDateShape(normalized, label, date))
             return true;
-        if (normalized === `${humanRedundant} — ${date}`)
+        if (matchesTrailingDateShape(normalized, label))
             return true;
     }
-    const trailingDateOnly = new RegExp(`^${escapeRegex(human)}\\s*[—–-]\\s*[\\d-]+$`, 'u');
-    if (trailingDateOnly.test(normalized)) {
-        return true;
-    }
     if (isCategoryNounHeading(normalized, articleType))
         return true;
     if (isBareInstitutionalHeading(normalized))

package/scripts/aggregator/metadata/keyword-filters.d.ts ADDED Viewed

@@ -0,0 +1,60 @@
+/**
+ * @module Aggregator/Metadata/KeywordFilters
+ * @description Cross-site keyword catalogue and noise-token filter used
+ * by {@link buildSeoKeywords} in `resolve-helpers.ts`.
+ *
+ * Two responsibilities:
+ *
+ *   1. **Always-on cross-site keywords** ({@link CROSS_SITE_KEYWORDS})
+ *      are prepended to every article's `<meta name="keywords">` list
+ *      regardless of language, so search-engine discovery of the
+ *      Hack23 civic-tech portfolio (EU Parliament Monitor +
+ *      Riksdagsmonitor + CIA) is consistent across all 14 localized
+ *      surfaces. The user explicitly requested
+ *      `riksdagsmonitor, political intelligence, riksdag, regeringen`
+ *      (the sister Swedish-Parliament project) plus EP analogues.
+ *
+ *   2. **Noise-token rejection** ({@link isNoiseKeywordToken}) drops
+ *      the UUID-fragment tokens (`77fc920c`, `3a76`, `9db5`, …) and
+ *      synthetic run-id slugs (`propositions-run261-1779431162`) that
+ *      the previous keyword extractor leaked into `<head>` when a
+ *      brief mentioned its own run id editorially (e.g.
+ *      `Analysis run 77fc920c-3a76-4813-9db5-43a7e9acc25e returned
+ *      0 classified actors`).
+ *
+ * Pure leaf module — no imports.
+ */
+/**
+ * Cross-site SEO keywords prepended to every article in every
+ * language. Order is meaningful: stronger civic-tech-portfolio terms
+ * first so they appear ahead of the per-article-type keywords when
+ * the 16-entry budget is exceeded.
+ */
+export declare const CROSS_SITE_KEYWORDS: readonly string[];
+/**
+ * Decide whether a single keyword token should be discarded as noise.
+ *
+ * The current rules reject tokens that:
+ *
+ *   - Look like a UUID hex chunk: ≥4 chars, consisting solely of the
+ *     `[0-9a-f]` alphabet **and** containing at least one digit (so
+ *     real English words like `dead` / `face` survive). Hex-only
+ *     tokens of length ≥8 are always rejected (a real English word of
+ *     that length composed exclusively of hex letters is vanishingly
+ *     rare; the allowlist guards the short cases).
+ *   - Are mostly digits (≥80 % digit characters) — runtime epoch
+ *     suffixes such as `1779431162` and committee-code-like mashes
+ *     such as `2024k1234`.
+ *   - Start with `run` and end with all-digits (`run261`, `run17`),
+ *     the per-run slug suffix the aggregator stamps onto run ids.
+ *   - Match the full opaque-runId shape `<type>-run<digits>-<digits>`
+ *     after a strip / normalization round-trip.
+ *
+ * Returns `false` for normal vocabulary so the keyword list stays
+ * useful — every reject path is intentionally narrow.
+ *
+ * @param token - Single token candidate
+ * @returns `true` when the token should be dropped from keywords
+ */
+export declare function isNoiseKeywordToken(token: string): boolean;
+//# sourceMappingURL=keyword-filters.d.ts.map