npm - euparliamentmonitor - Versions diffs - 0.9.28 → 1.0.0 - Mend

euparliamentmonitor 0.9.28 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

package/package.json +3 -3
package/scripts/aggregator/html/localize-body.d.ts +28 -4
package/scripts/aggregator/html/localize-body.js +79 -21
package/scripts/aggregator/html/shell.js +2 -1
package/scripts/aggregator/metadata/artifact-category-heading.js +8 -1
package/scripts/aggregator/metadata/heading-rules.js +11 -0
package/scripts/aggregator/metadata/seo-budgets.js +12 -9
package/scripts/aggregator/reader-friendly-transform.js +1 -1
package/scripts/generators/news-indexes/backfill-hreflang.d.ts +13 -0
package/scripts/generators/news-indexes/backfill-hreflang.js +112 -0
package/scripts/generators/news-indexes/backfill-reader-label.d.ts +47 -0
package/scripts/generators/news-indexes/backfill-reader-label.js +86 -0
package/scripts/generators/news-indexes/backfill.d.ts +19 -18
package/scripts/generators/news-indexes/backfill.js +118 -111
package/scripts/generators/news-indexes/per-language.js +2 -1
package/scripts/generators/political-intelligence/html.js +2 -1
package/scripts/generators/sitemap/html.js +2 -1
package/scripts/generators/sitemap/index.d.ts +1 -1
package/scripts/generators/sitemap/index.js +1 -1
package/scripts/generators/sitemap/rss.d.ts +38 -2
package/scripts/generators/sitemap/rss.js +54 -10
package/scripts/generators/sitemap/xml.js +21 -6
package/scripts/generators/sitemap.js +42 -9
package/scripts/mcp/ep/error-classifier.d.ts +38 -0
package/scripts/mcp/ep/error-classifier.js +49 -0
package/scripts/mcp/ep/tools-feeds.js +27 -2
package/scripts/templates/sections/footer.js +3 -1
package/scripts/templates/sections/rss-discovery.d.ts +22 -0
package/scripts/templates/sections/rss-discovery.js +48 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "euparliamentmonitor",
-  "version": "0.9.28",
+  "version": "1.0.0",
   "type": "module",
   "description": "European Parliament Intelligence Platform - Monitor political activity with systematic transparency",
   "main": "scripts/index.js",
@@ -167,7 +167,7 @@
     "clean-css": "^5.3.3",
     "d3": "7.9.0",
     "esbuild": "0.28.0",
-    "eslint": "10.4.0",
+    "eslint": "10.4.1",
     "eslint-config-prettier": "10.1.8",
     "eslint-plugin-jsdoc": "63.0.0",
     "eslint-plugin-security": "4.0.0",
@@ -179,7 +179,7 @@
     "husky": "9.1.7",
     "jscpd": "4.2.4",
     "knip": "^6.7.0",
-    "lint-staged": "17.0.5",
+    "lint-staged": "17.0.6",
     "mermaid": "11.15.0",
     "papaparse": "5.5.3",
     "prettier": "3.8.3",

package/scripts/aggregator/html/localize-body.d.ts CHANGED Viewed

@@ -22,11 +22,32 @@ export declare function localizeArticleBody(bodyHtml: string, lang: LanguageCode
  * @returns Modified string, or `haystack` unchanged when `needle` is absent
  */
 export declare function replaceFirstStringIn(haystack: string, needle: string, replacement: string): string;
+/**
+ * Locate the cut point that ends the Executive Brief body — the start of
+ * the next top-level boundary heading after `afterHeading`. A boundary is
+ * any `<h2>` whose `id` either starts with the canonical `section-` prefix
+ * or exactly matches one of {@link EXECUTIVE_BRIEF_BOUNDARY_ID_MARKERS}
+ * (Reader Guide / Tradecraft / Analysis Index / Supplementary appendices).
+ *
+ * Critically, this only matches **top-level** section anchors — never the
+ * brief's own internal `<h2>` sub-headings (`## BLUF`, `## 60-Second Read`,
+ * …), which carry slugified ids without the `section-` prefix. That is why
+ * we cannot simply look for the next `<h2`.
+ *
+ * Uses `indexOf`/`lastIndexOf` exclusively (no regex) to stay within
+ * CodeQL's safe-regex envelope.
+ *
+ * @param html - Full article body HTML
+ * @param afterHeading - Index immediately after the Executive Brief `</h2>`
+ * @returns Index of the next boundary `<h2`, or `-1` when the Executive
+ *          Brief is the last block in the body.
+ */
+export declare function findExecutiveBriefSectionCut(html: string, afterHeading: number): number;
 /**
  * Replace the **inner body** of the Executive Brief section (the
  * `<h2 id="section-executive-brief">…</h2>` heading and everything that
- * follows it up to — but not including — the next `<h2 id="section-…">`
- * sibling) with the supplied replacement HTML. The Executive Brief
+ * follows it up to — but not including — the next top-level boundary
+ * heading) with the supplied replacement HTML. The Executive Brief
  * heading itself is preserved by emitting it inline ahead of the
  * replacement, so the in-page anchor (`#section-executive-brief`) and
  * the table-of-contents link continue to work.
@@ -39,8 +60,11 @@ export declare function replaceFirstStringIn(haystack: string, needle: string, r
  * `render-one.writeLanguageVariant`.
  *
  * Implementation uses `indexOf`/slice exclusively to stay within
- * CodeQL's safe-regex envelope. Returns `html` unchanged when the
- * Executive Brief heading is absent or malformed.
+ * CodeQL's safe-regex envelope. The replacement spans from the heading to
+ * the next top-level boundary (see {@link findExecutiveBriefSectionCut});
+ * when the Executive Brief is the last block in the body the replacement
+ * extends to end-of-body. Returns `html` unchanged only when the Executive
+ * Brief heading is absent or malformed.
  *
  * @param html - Full article body HTML
  * @param localizedHeading - Localized text for the Executive Brief H2

package/scripts/aggregator/html/localize-body.js CHANGED Viewed

@@ -12,6 +12,23 @@ import { TRADECRAFT_HEADING_LABELS, TRADECRAFT_INTRO_LABELS, TRADECRAFT_METHODOL
 import { escapeHTML } from '../../utils/file-utils.js';
 import { TRADECRAFT_SECTION_ID, MANIFEST_SECTION_ID, SUPPLEMENTARY_SECTION_ID, } from '../artifact-order.js';
 import { KEY_TAKEAWAYS_SECTION_ID } from '../key-takeaways.js';
+import { READER_GUIDE_SECTION_ID } from '../reader-guide-constants.js';
+/**
+ * Top-level section anchors that mark the **end** of the Executive Brief
+ * body. Canonical analysis sections are matched by the shared
+ * `id="section-…"` prefix (see {@link findExecutiveBriefSectionCut});
+ * the appendix and reader-guide sections below carry bespoke ids that do
+ * **not** share that prefix, so they are matched explicitly. Including
+ * them ensures the localized brief splice also fires on sparse runs where
+ * the Executive Brief is the last canonical section and only appendix
+ * blocks follow it.
+ */
+const EXECUTIVE_BRIEF_BOUNDARY_ID_MARKERS = [
+    `id="${READER_GUIDE_SECTION_ID}"`,
+    `id="${TRADECRAFT_SECTION_ID}"`,
+    `id="${MANIFEST_SECTION_ID}"`,
+    `id="${SUPPLEMENTARY_SECTION_ID}"`,
+];
 /**
  * Localize the Tradecraft References and Analysis Index sections in the
  * rendered article body HTML. Replaces English headings, introductions,
@@ -102,11 +119,48 @@ export function replaceFirstStringIn(haystack, needle, replacement) {
         return haystack;
     return haystack.slice(0, idx) + replacement + haystack.slice(idx + needle.length);
 }
+/**
+ * Locate the cut point that ends the Executive Brief body — the start of
+ * the next top-level boundary heading after `afterHeading`. A boundary is
+ * any `<h2>` whose `id` either starts with the canonical `section-` prefix
+ * or exactly matches one of {@link EXECUTIVE_BRIEF_BOUNDARY_ID_MARKERS}
+ * (Reader Guide / Tradecraft / Analysis Index / Supplementary appendices).
+ *
+ * Critically, this only matches **top-level** section anchors — never the
+ * brief's own internal `<h2>` sub-headings (`## BLUF`, `## 60-Second Read`,
+ * …), which carry slugified ids without the `section-` prefix. That is why
+ * we cannot simply look for the next `<h2`.
+ *
+ * Uses `indexOf`/`lastIndexOf` exclusively (no regex) to stay within
+ * CodeQL's safe-regex envelope.
+ *
+ * @param html - Full article body HTML
+ * @param afterHeading - Index immediately after the Executive Brief `</h2>`
+ * @returns Index of the next boundary `<h2`, or `-1` when the Executive
+ *          Brief is the last block in the body.
+ */
+export function findExecutiveBriefSectionCut(html, afterHeading) {
+    let best = -1;
+    const consider = (markerIdx) => {
+        if (markerIdx === -1)
+            return;
+        const h2 = html.lastIndexOf('<h2', markerIdx);
+        if (h2 === -1 || h2 < afterHeading)
+            return;
+        if (best === -1 || h2 < best)
+            best = h2;
+    };
+    consider(html.indexOf('id="section-', afterHeading));
+    for (const marker of EXECUTIVE_BRIEF_BOUNDARY_ID_MARKERS) {
+        consider(html.indexOf(marker, afterHeading));
+    }
+    return best;
+}
 /**
  * Replace the **inner body** of the Executive Brief section (the
  * `<h2 id="section-executive-brief">…</h2>` heading and everything that
- * follows it up to — but not including — the next `<h2 id="section-…">`
- * sibling) with the supplied replacement HTML. The Executive Brief
+ * follows it up to — but not including — the next top-level boundary
+ * heading) with the supplied replacement HTML. The Executive Brief
  * heading itself is preserved by emitting it inline ahead of the
  * replacement, so the in-page anchor (`#section-executive-brief`) and
  * the table-of-contents link continue to work.
@@ -119,8 +173,11 @@ export function replaceFirstStringIn(haystack, needle, replacement) {
  * `render-one.writeLanguageVariant`.
  *
  * Implementation uses `indexOf`/slice exclusively to stay within
- * CodeQL's safe-regex envelope. Returns `html` unchanged when the
- * Executive Brief heading is absent or malformed.
+ * CodeQL's safe-regex envelope. The replacement spans from the heading to
+ * the next top-level boundary (see {@link findExecutiveBriefSectionCut});
+ * when the Executive Brief is the last block in the body the replacement
+ * extends to end-of-body. Returns `html` unchanged only when the Executive
+ * Brief heading is absent or malformed.
  *
  * @param html - Full article body HTML
  * @param localizedHeading - Localized text for the Executive Brief H2
@@ -147,23 +204,24 @@ export function replaceExecutiveBriefSection(html, localizedHeading, replacement
     if (h2CloseTagIdx === -1)
         return html;
     const afterHeading = h2CloseTagIdx + '</h2>'.length;
-    // Find the next `<h2 id="section-...">` boundary — the start of the
-    // following article section. If there is no further section heading
-    // we conservatively bail out (replacing through end-of-body would
-    // also drop appendix content like Reader Guide / Key Takeaways).
-    const nextSectionId = html.indexOf('id="section-', afterHeading);
-    if (nextSectionId === -1)
-        return html;
-    const nextH2 = html.lastIndexOf('<h2', nextSectionId);
-    if (nextH2 === -1 || nextH2 <= afterHeading)
-        return html;
-    // Find the start of the line containing the next `<h2` so we don't
-    // strip leading whitespace from the next section. We look at most
-    // one newline back.
-    let cutEnd = nextH2;
-    const prevNewline = html.lastIndexOf('\n', nextH2 - 1);
-    if (prevNewline !== -1 && prevNewline >= afterHeading) {
-        cutEnd = prevNewline + 1;
+    // Find the next top-level boundary heading — the start of the following
+    // article section or appendix. When none exists the Executive Brief is
+    // the last block, so we replace through end-of-body. This guarantees the
+    // localized brief is spliced even on sparse runs (previously the splice
+    // bailed and non-English readers were stranded on the English brief).
+    const nextH2 = findExecutiveBriefSectionCut(html, afterHeading);
+    let cutEnd;
+    if (nextH2 === -1) {
+        cutEnd = html.length;
+    }
+    else {
+        // Start of the line containing the next `<h2` so we don't strip
+        // leading whitespace from the next section.
+        cutEnd = nextH2;
+        const prevNewline = html.lastIndexOf('\n', nextH2 - 1);
+        if (prevNewline !== -1 && prevNewline >= afterHeading) {
+            cutEnd = prevNewline + 1;
+        }
     }
     const newHeading = `<h2 id="section-executive-brief">${escapeHTML(localizedHeading)}</h2>\n`;
     const trimmedReplacement = replacementBodyHtml.endsWith('\n')

package/scripts/aggregator/html/shell.js CHANGED Viewed

@@ -19,6 +19,7 @@ import { escapeHTML } from '../../utils/file-utils.js';
 import { buildResponsiveIconLinks, buildResponsiveSocialImageMeta, buildSiteFooter, buildSiteHeader, buildPageBanner, } from '../../templates/section-builders.js';
 import { getPoliticalIntelligenceFilename } from '../../generators/political-intelligence.js';
 import { getSitemapFilename } from '../../generators/sitemap/index.js';
+import { buildRssAlternateLink } from '../../templates/sections/rss-discovery.js';
 import { truncateHeadline, getTitleSeparator, buildPageTitle, getLocalizedArticleType, getLocalizedArticleTypePlain, } from './headline.js';
 import { clampForBudget } from '../metadata/seo-budgets.js';
 import { getArticleFilename, buildArticleHreflangLinks, buildLanguageSwitcher, } from './hreflang.js';
@@ -352,7 +353,7 @@ ${keywordsMeta}  <meta name="robots" content="index, follow, max-snippet:-1, max
   <meta property="article:publisher" content="https://hack23.com">
   <link rel="canonical" href="${canonicalUrl}">
 ${hreflangLinks}
-  <link rel="alternate" type="application/rss+xml" title="EU Parliament Monitor RSS" href="${BASE_URL}/rss.xml">
+  ${buildRssAlternateLink(safeLang, `${BASE_URL}/`)}
   <link rel="preconnect" href="https://hack23.com" crossorigin>
   <meta property="og:type" content="article">
   <meta property="og:title" content="${escapeHTML(ogTitleClamped)}">

package/scripts/aggregator/metadata/artifact-category-heading.js CHANGED Viewed

@@ -153,6 +153,13 @@ export const ARTIFACT_CATEGORY_PREFIXES = [
     'voting patterns',
     'weekly outlook',
     'wildcards blackswans',
+    // CJK localized category prefixes (translations of "executive briefing")
+    'エグゼクティブ・ブリーフィング',
+    'エグゼクティブブリーフィング',
+    'エグゼクティブ・ブリーフ',
+    '행정 브리핑',
+    '执行简报',
+    '執行簡報',
 ];
 /**
  * Match a single calendar month name (English) with optional `-uary` /
@@ -211,7 +218,7 @@ function normaliseCategoryHeading(raw) {
     return stripInlineMarkdown(raw)
         .trim()
         .toLowerCase()
-        .replace(/^[^a-z0-9]+/, '')
+        .replace(/^[^a-z0-9\p{L}]+/u, '')
         .replace(/\s+/g, ' ');
 }
 /**

package/scripts/aggregator/metadata/heading-rules.js CHANGED Viewed

@@ -158,6 +158,17 @@ const BARE_INSTITUTIONAL_HEADINGS = [
     'briefing',
     'intelligence brief',
     'intelligence briefing',
+    // CJK / localized translations of generic headings
+    'エグゼクティブ・ブリーフィング',
+    'エグゼクティブブリーフィング',
+    'エグゼクティブ・ブリーフ',
+    'ブリーフィング',
+    '행정 브리핑',
+    '브리핑',
+    '执行简报',
+    '简报',
+    '執行簡報',
+    '簡報',
 ];
 /**
  * Return `true` when the heading is one of {@link BARE_INSTITUTIONAL_HEADINGS}

package/scripts/aggregator/metadata/seo-budgets.js CHANGED Viewed

@@ -160,15 +160,18 @@ export function clampForBudget(text, lang, surface) {
         if (cleaned.length >= softMin)
             return cleaned;
     }
-    // Whitespace-aware fallback. Chinese and Japanese text often has no
-    // ASCII spaces, so skip this step for them and fall straight through
-    // to the hard cut. Korean is the exception — it uses inter-word spaces.
-    if (family !== 'cjk' || lang === 'ko') {
-        const lastSpace = window.lastIndexOf(' ');
-        if (lastSpace >= softMin) {
-            const safe = trimTrailingSeparators(window.slice(0, lastSpace));
-            return `${safe}…`;
-        }
+    // Whitespace-aware fallback. Runs for every script: an ASCII space
+    // past the soft minimum is a safe break that drops a partial trailing
+    // segment whole rather than slicing it mid-token. Chinese and Japanese
+    // prose has no inter-word spaces, so `lastIndexOf(' ')` returns -1 and
+    // this is a no-op for them — but composed SEO snippets join clauses
+    // (body, dateline, reader label) with ASCII spaces, so honouring that
+    // boundary prevents hard-cutting the reader label mid-word. Korean
+    // uses inter-word spaces natively and benefits the same way.
+    const lastSpace = window.lastIndexOf(' ');
+    if (lastSpace >= softMin) {
+        const safe = trimTrailingSeparators(window.slice(0, lastSpace));
+        return `${safe}…`;
     }
     const hardCut = trimTrailingSeparators(window);
     return `${hardCut}…`;

package/scripts/aggregator/reader-friendly-transform.js CHANGED Viewed

@@ -46,7 +46,7 @@ const ADMIRALTY_LABELS = {
 export function applyReaderFriendlyTransform(html) {
     const state = createInitialState(html);
     const withGlossary = injectReaderGlossary(html);
-    const parts = withGlossary.split(/(<[^>]+>)/g);
+    const parts = withGlossary.split(/(<[^<>]+>)/g);
     for (let i = 0; i < parts.length; i++) {
         const part = parts[i] ?? '';
         if (part.startsWith('<')) {

package/scripts/generators/news-indexes/backfill-hreflang.d.ts ADDED Viewed

@@ -0,0 +1,13 @@
+/**
+ * Backfill hreflang alternate links for all article HTML files.
+ *
+ * Handles three cases:
+ * 1. Articles with no hreflang links at all → inject the full block before `</head>`
+ * 2. Articles with relative hreflang URLs → replace with absolute URLs
+ * 3. Articles already correct → skip
+ *
+ * @param filenames - News article filenames
+ * @returns Number of HTML files updated
+ */
+export declare function backfillArticleHreflang(filenames: readonly string[]): number;
+//# sourceMappingURL=backfill-hreflang.d.ts.map

package/scripts/generators/news-indexes/backfill-hreflang.js ADDED Viewed

@@ -0,0 +1,112 @@
+// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * @module Generators/NewsIndexes/BackfillHreflang
+ * @description Hreflang alternate-link backfill for article HTML files.
+ * Extracted from `backfill.ts` to keep source files ≤600 lines.
+ */
+import path from 'path';
+import fs from 'fs';
+import { NEWS_DIR, BASE_URL } from '../../constants/config.js';
+import { ALL_LANGUAGES } from '../../constants/languages.js';
+import { parseArticleFilename, atomicWrite } from '../../utils/file-utils.js';
+/**
+ * Read an article HTML file, returning an empty string when unavailable.
+ *
+ * @param filepath - Absolute HTML file path
+ * @returns File content or empty string
+ */
+function readArticleHtml(filepath) {
+    try {
+        return path.isAbsolute(filepath) ? fs.readFileSync(filepath, 'utf8') : '';
+    }
+    catch {
+        return '';
+    }
+}
+/**
+ * Build hreflang `<link rel="alternate">` tags for an article slug.
+ * Produces one tag per supported language plus an `x-default` pointing at
+ * the English variant, all using absolute URLs.
+ *
+ * @param articleSlug - Slug without language suffix (e.g. `2026-02-24-propositions`)
+ * @returns Newline-joined `<link>` tags
+ */
+function buildArticleHreflang(articleSlug) {
+    const entries = ALL_LANGUAGES.map((code) => `  <link rel="alternate" hreflang="${code}" href="${BASE_URL}/news/${articleSlug}-${code}.html">`);
+    entries.push(`  <link rel="alternate" hreflang="x-default" href="${BASE_URL}/news/${articleSlug}-en.html">`);
+    return entries.join('\n');
+}
+/**
+ * Inject hreflang links into an article that has none.
+ *
+ * @param html - Article HTML content
+ * @param hreflangBlock - Pre-built hreflang link block
+ * @returns Updated HTML, or original if no change needed
+ */
+function injectHreflangLinks(html, hreflangBlock) {
+    return html.replace(/(<\/head>)/u, `${hreflangBlock}\n$1`);
+}
+/**
+ * Replace existing relative hreflang links with absolute URLs.
+ *
+ * @param html - Article HTML content
+ * @param hreflangBlock - Pre-built hreflang link block with absolute URLs
+ * @returns Updated HTML, or original if no change needed
+ */
+function fixRelativeHreflangLinks(html, hreflangBlock) {
+    const stripped = html.replace(/\s*<link\s+rel="alternate"\s+hreflang="[^"]*"\s+href="[^"]*">\n?/gu, '');
+    return stripped.replace(/(<\/head>)/u, `${hreflangBlock}\n$1`);
+}
+/**
+ * Backfill hreflang alternate links for all article HTML files.
+ *
+ * Handles three cases:
+ * 1. Articles with no hreflang links at all → inject the full block before `</head>`
+ * 2. Articles with relative hreflang URLs → replace with absolute URLs
+ * 3. Articles already correct → skip
+ *
+ * @param filenames - News article filenames
+ * @returns Number of HTML files updated
+ */
+export function backfillArticleHreflang(filenames) {
+    let updated = 0;
+    for (const filename of filenames) {
+        if (backfillOneArticleHreflang(filename))
+            updated++;
+    }
+    return updated;
+}
+/**
+ * Backfill hreflang for a single article file.
+ *
+ * @param filename - News article filename
+ * @returns True when the file was updated
+ */
+function backfillOneArticleHreflang(filename) {
+    const parsed = parseArticleFilename(filename);
+    if (!parsed)
+        return false;
+    const filepath = path.join(NEWS_DIR, filename);
+    const html = readArticleHtml(filepath);
+    if (!html)
+        return false;
+    const articleSlug = `${parsed.date}-${parsed.slug}`;
+    const hreflangBlock = buildArticleHreflang(articleSlug);
+    const hasHreflang = /<link\s+rel="alternate"\s+hreflang="/u.test(html);
+    let next;
+    if (!hasHreflang) {
+        next = injectHreflangLinks(html, hreflangBlock);
+    }
+    else {
+        const hasRelative = /<link\s+rel="alternate"\s+hreflang="[^"]*"\s+href="(?!https?:\/\/)/u.test(html);
+        if (!hasRelative)
+            return false;
+        next = fixRelativeHreflangLinks(html, hreflangBlock);
+    }
+    if (next === html)
+        return false;
+    atomicWrite(filepath, next);
+    return true;
+}
+//# sourceMappingURL=backfill-hreflang.js.map

package/scripts/generators/news-indexes/backfill-reader-label.d.ts ADDED Viewed

@@ -0,0 +1,47 @@
+import type { LanguageCode } from '../../types/index.js';
+/**
+ * Remove a trailing **truncated** copy of the localized reader label
+ * (`SEO_CONTEXT_LABELS[lang].reader`) from a candidate description.
+ *
+ * Earlier backfill passes appended the reader label and then clamped the
+ * whole buffer to the per-script `metaDescription` budget, hard-cutting
+ * the label mid-word (e.g. zh `…政策后果的读` instead of `…政策后果的读者`,
+ * ja `…追跡する読`, ko dangling `…추적하는.`). Those mangled fragments were
+ * persisted to `<meta description>` and survive a plain prefix/date-label
+ * strip, so re-feeding them to the resolver re-emits the broken tail.
+ *
+ * A trailing copy that matches the label **in full** is left intact — it
+ * is a complete, reader-facing clause we want to preserve. Only a partial
+ * (truncated) prefix of the label is dropped, leaving the clean body for
+ * the resolver to re-enrich with a budget-aware (whole-label-or-nothing)
+ * reader clause.
+ *
+ * @param description - Candidate description (prefix/date-label removed)
+ * @param langCode - Article language code
+ * @returns Description with any truncated trailing reader label removed
+ */
+export declare function stripTruncatedReaderLabel(description: string, langCode: LanguageCode): string;
+/**
+ * Locate a trailing **truncated** copy of the localized reader label and
+ * return the index at which the description body ends (i.e. where the
+ * partial label begins). Returns -1 when no partial label is present or
+ * when the label is present in full (a complete clause we keep).
+ *
+ * @param text - Trimmed candidate description
+ * @param langCode - Article language code
+ * @returns Cut index for the partial label, or -1 when none applies
+ */
+export declare function findTruncatedReaderLabelCut(text: string, langCode: LanguageCode): number;
+/**
+ * Detect whether a legacy `<meta description>` ends with a **truncated**
+ * reader label once its dateline prefix and redundant date-label clause
+ * are removed. Long, unique legacy descriptions otherwise bypass
+ * `shouldBackfillDescription`, leaving a persisted mid-word cut
+ * (e.g. zh `…政策后果的读`, ja `…追跡する読`, ko `…추적하는.`) in place.
+ *
+ * @param body - Stripped description body (prefix/date-label removed)
+ * @param langCode - Article language code
+ * @returns True when a truncated reader label remains in the body
+ */
+export declare function hasTruncatedReaderLabelInBody(body: string, langCode: LanguageCode): boolean;
+//# sourceMappingURL=backfill-reader-label.d.ts.map

package/scripts/generators/news-indexes/backfill-reader-label.js ADDED Viewed

@@ -0,0 +1,86 @@
+// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * @module Generators/NewsIndexes/BackfillReaderLabel
+ * @description Truncated reader-label detection and stripping helpers,
+ * extracted from `backfill.ts` to keep source files ≤600 lines.
+ */
+import { getLocalizedString } from '../../constants/languages.js';
+import { SEO_CONTEXT_LABELS } from '../../aggregator/metadata/template-fallback.js';
+/**
+ * Remove a trailing **truncated** copy of the localized reader label
+ * (`SEO_CONTEXT_LABELS[lang].reader`) from a candidate description.
+ *
+ * Earlier backfill passes appended the reader label and then clamped the
+ * whole buffer to the per-script `metaDescription` budget, hard-cutting
+ * the label mid-word (e.g. zh `…政策后果的读` instead of `…政策后果的读者`,
+ * ja `…追跡する読`, ko dangling `…추적하는.`). Those mangled fragments were
+ * persisted to `<meta description>` and survive a plain prefix/date-label
+ * strip, so re-feeding them to the resolver re-emits the broken tail.
+ *
+ * A trailing copy that matches the label **in full** is left intact — it
+ * is a complete, reader-facing clause we want to preserve. Only a partial
+ * (truncated) prefix of the label is dropped, leaving the clean body for
+ * the resolver to re-enrich with a budget-aware (whole-label-or-nothing)
+ * reader clause.
+ *
+ * @param description - Candidate description (prefix/date-label removed)
+ * @param langCode - Article language code
+ * @returns Description with any truncated trailing reader label removed
+ */
+export function stripTruncatedReaderLabel(description, langCode) {
+    const text = description.trim();
+    const cut = findTruncatedReaderLabelCut(text, langCode);
+    if (cut < 0)
+        return text;
+    return text
+        .replace(/[.。！？!?…]+$/u, '')
+        .slice(0, cut)
+        .replace(/[\s,;:—\-–·。、]+$/u, '')
+        .trim();
+}
+/**
+ * Locate a trailing **truncated** copy of the localized reader label and
+ * return the index at which the description body ends (i.e. where the
+ * partial label begins). Returns -1 when no partial label is present or
+ * when the label is present in full (a complete clause we keep).
+ *
+ * @param text - Trimmed candidate description
+ * @param langCode - Article language code
+ * @returns Cut index for the partial label, or -1 when none applies
+ */
+export function findTruncatedReaderLabelCut(text, langCode) {
+    const labels = getLocalizedString(SEO_CONTEXT_LABELS, langCode);
+    const reader = (labels.reader ?? '').trim();
+    // Require a reasonably long label so we never strip on a coincidental
+    // short suffix match; real labels are 40+ chars (Latin) / 11+ (CJK).
+    if (reader.length < 8 || text.length < 8)
+        return -1;
+    // Tolerate a terminator the resolver/healer appended after the cut.
+    const core = text.replace(/[.。！？!?…]+$/u, '');
+    const maxK = Math.min(core.length, reader.length);
+    for (let k = maxK; k >= 8; k -= 1) {
+        if (core.slice(core.length - k) === reader.slice(0, k)) {
+            // Full label present at the tail — keep it (not a truncation).
+            if (k === reader.length)
+                return -1;
+            return core.length - k;
+        }
+    }
+    return -1;
+}
+/**
+ * Detect whether a legacy `<meta description>` ends with a **truncated**
+ * reader label once its dateline prefix and redundant date-label clause
+ * are removed. Long, unique legacy descriptions otherwise bypass
+ * `shouldBackfillDescription`, leaving a persisted mid-word cut
+ * (e.g. zh `…政策后果的读`, ja `…追跡する読`, ko `…추적하는.`) in place.
+ *
+ * @param body - Stripped description body (prefix/date-label removed)
+ * @param langCode - Article language code
+ * @returns True when a truncated reader label remains in the body
+ */
+export function hasTruncatedReaderLabelInBody(body, langCode) {
+    return findTruncatedReaderLabelCut(body, langCode) >= 0;
+}
+//# sourceMappingURL=backfill-reader-label.js.map

package/scripts/generators/news-indexes/backfill.d.ts CHANGED Viewed

@@ -60,14 +60,26 @@ export declare function buildLegacyBackfillDescription(date: string, slug: strin
     readonly forceContextPrefix?: boolean;
 }): string;
 /**
- * Apply SEO meta tag replacements to a complete article HTML document.
+ * Strip the legacy dateline prefix **and** the redundant localized
+ * date-label clause from a candidate description, returning the
+ * reader-facing body in isolation. Used to clean a previously-backfilled
+ * `<meta description>` before it is re-fed to the per-language SEO
+ * resolver — without this, the resolver re-clamps the prefixed buffer
+ * against the CJK metaDescription budget and truncates the reader label
+ * mid-clause (live regression in `news/2026-04-26-week-ahead-ko.html`,
+ * a dangling "추적하는." participle).
  *
+ * @param date - Article date (ISO YYYY-MM-DD)
+ * @param slug - Article slug
+ * @param lang - Article language code
+ * @param description - Candidate description (possibly already prefixed)
+ * @returns Reader-facing body with prefix + date label removed
+ */
+export declare function stripLegacyBackfillContext(date: string, slug: string, lang: string, description: string): string;
+/**
+ * Apply SEO meta tag replacements to a complete article HTML document.
  * Exported for the regression test in
- * `test/unit/news-indexes-jsonld-description-regex.test.js`, which
- * locks in the JSON-LD description regex against the duplicate-tail
- * bug (the legacy `"description":"[^"]*"` pattern terminated at the
- * first JSON-escaped quote `\"` and left the previous description's
- * tail in place, accumulating duplicates on every prebuild run).
+ * `test/unit/news-indexes-jsonld-description-regex.test.js`.
  *
  * @param html - Existing article HTML
  * @param description - Backfilled meta description
@@ -75,16 +87,5 @@ export declare function buildLegacyBackfillDescription(date: string, slug: strin
  * @returns Updated HTML
  */
 export declare function applyArticleSeoBackfill(html: string, description: string, keywords: readonly string[]): string;
-/**
- * Backfill hreflang alternate links for all article HTML files.
- *
- * Handles three cases:
- * 1. Articles with no hreflang links at all → inject the full block before `</head>`
- * 2. Articles with relative hreflang URLs → replace with absolute URLs
- * 3. Articles already correct → skip
- *
- * @param filenames - News article filenames
- * @returns Number of HTML files updated
- */
-export declare function backfillArticleHreflang(filenames: readonly string[]): number;
+export { backfillArticleHreflang } from './backfill-hreflang.js';
 //# sourceMappingURL=backfill.d.ts.map