npm - euparliamentmonitor - Versions diffs - 0.9.21 → 0.9.22 - Mend

euparliamentmonitor 0.9.21 → 0.9.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/package.json +5 -2
package/scripts/aggregator/metadata/artifact-walker.js +2 -2
package/scripts/aggregator/metadata/heading-rules.js +1 -0
package/scripts/copy-vendor.js +84 -112
package/scripts/dump-article-seo.js +567 -0
package/scripts/generators/news-indexes/backfill.d.ts +6 -1
package/scripts/generators/news-indexes/backfill.js +71 -4

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "euparliamentmonitor",
-  "version": "0.9.21",
+  "version": "0.9.22",
   "type": "module",
   "description": "European Parliament Intelligence Platform - Monitor political activity with systematic transparency",
   "main": "scripts/index.js",
@@ -71,6 +71,7 @@
     "prior-run-diff": "node scripts/aggregator/prior-run-diff.js",
     "generate-article": "node scripts/aggregator/article-generator.js",
     "generate-article:all": "node scripts/aggregator/article-generator.js --all",
+    "dump:article-seo": "node scripts/dump-article-seo.js",
     "generate-news-indexes": "node scripts/generators/news-indexes.js",
     "generate-sitemap": "node scripts/generators/sitemap.js",
     "image:generate": "node scripts/generate-responsive-images.js",
@@ -164,6 +165,7 @@
     "chartjs-plugin-annotation": "3.1.0",
     "clean-css": "^5.3.3",
     "d3": "7.9.0",
+    "esbuild": "0.28.0",
     "eslint": "10.4.0",
     "eslint-config-prettier": "10.1.8",
     "eslint-plugin-jsdoc": "63.0.0",
@@ -208,6 +210,7 @@
     "flatted": ">=3.4.2",
     "path-to-regexp": ">=8.4.0",
     "ip-address": ">=10.1.1",
-    "uuid": ">=11.1.1"
+    "uuid": ">=11.1.1",
+    "qs": "6.15.2"
   }
 }

package/scripts/aggregator/metadata/artifact-walker.js CHANGED Viewed

@@ -17,7 +17,7 @@ import fs from 'fs';
 import path from 'path';
 import { extractFirstH1 } from './h1-extractor.js';
 import { extractLedeAfterHeading, extractStrongProseLine } from './lede-extractor.js';
-import { isGenericHeading, stripArtifactCategoryAffix } from './heading-rules.js';
+import { isGenericHeading, isArtifactCategoryHeading, stripArtifactCategoryAffix, } from './heading-rules.js';
 import { truncateTitle } from './text-utils.js';
 import { extractPriorityFindingHighlight } from './priority-finding-highlight.js';
 /** Ordered list of artefact filenames that typically carry the editorial H1. */
@@ -132,7 +132,7 @@ function probeCandidateForHighlight(runDir, rel, articleType, date) {
     // distinctive editorial headline ("Digital Markets Act Enforcement",
     // "Ukraine War Accountability") instead of a stripped category noun.
     const priority = extractPriorityFindingHighlight(body);
-    if (priority?.headline) {
+    if (priority?.headline && !isArtifactCategoryHeading(priority.headline)) {
         return {
             cleanHighlight: {
                 headline: truncateTitle(priority.headline),

package/scripts/aggregator/metadata/heading-rules.js CHANGED Viewed

@@ -69,6 +69,7 @@ export const ARTIFACT_CATEGORY_PREFIXES = [
     'commission wp alignment',
     'committee activity report',
     'cross run continuity',
+    'data availability assessment',
     'deep analysis',
     'economic context',
     'executive brief',

package/scripts/copy-vendor.js CHANGED Viewed

@@ -12,15 +12,25 @@
  *   - chart.js                 → js/vendor/chart.umd.min.js
  *   - chartjs-plugin-annotation → js/vendor/chartjs-plugin-annotation.min.js
  *   - d3                        → js/vendor/d3.min.js
- *   - mermaid                   → js/vendor/mermaid/  (entry + chunks/)
+ *   - mermaid                   → js/vendor/mermaid/mermaid.esm.min.mjs
  *
- * Mermaid is special: v11+ ships as code-split ESM. The entry
- * `mermaid.esm.min.mjs` does dynamic `import()` on diagram-specific chunks
- * under `dist/chunks/mermaid.esm.min/*.mjs`. To make every diagram type render
- * without external network calls, we copy the **entire mermaid `dist/`
- * directory** (filtered to the `.esm.min` flavour to keep payload small).
+ * Mermaid is special: v11+ ships as a **code-split ESM bundle**. The entry
+ * `mermaid.esm.min.mjs` (28 KB) statically imports 81 diagram-specific chunks
+ * from `dist/chunks/mermaid.esm.min/*.mjs`. Empirically (May 2026), serving
+ * those chunks through S3 + CloudFront has been unreliable — the entry returns
+ * 200 OK but every chunk URL returns 403 from CloudFront, breaking every
+ * article that references the loader.
  *
- * Idempotent: rerunning overwrites prior copies and leaves licenses in place.
+ * To eliminate that failure mode, we **bundle Mermaid into a single
+ * self-contained ESM file at copy-vendor time using esbuild** (devDependency).
+ * The output is written to the same path / filename that the loader and the
+ * existing article HTML already reference (`mermaid.esm.min.mjs`), so the
+ * loader (`js/mermaid-init.js`) and the generated articles continue to work
+ * unchanged — only the file's content changes (3.2 MB self-contained vs.
+ * 28 KB entry-plus-81-chunks).
+ *
+ * Idempotent: rerunning overwrites prior copies and leaves licenses in place;
+ * stale `chunks/` directories from prior layouts are pruned.
  *
  * Failure modes:
  *   - Missing chart.js / d3 / chartjs-plugin-annotation → hard error (these
@@ -28,11 +38,12 @@
  *   - Missing mermaid → soft error (logged, exit 0). Mermaid is also a pinned
  *     `devDependency`, but optional installs (e.g. `npm ci --omit=dev`) may
  *     skip it; we want the deploy to succeed without diagrams rather than fail.
+ *   - Bundling failure → hard error: mermaid is present but unusable, which
+ *     would silently ship a broken page; fail fast at build time instead.
  */
 import {
   copyFileSync,
-  cpSync,
   existsSync,
   mkdirSync,
   readdirSync,
@@ -43,6 +54,7 @@ import {
 } from 'node:fs';
 import path from 'node:path';
 import process from 'node:process';
+import * as esbuild from 'esbuild';
 const ROOT = process.cwd();
 const NODE_MODULES = path.join(ROOT, 'node_modules');
@@ -120,7 +132,8 @@ function copyOrFail(label, srcRel, dstRel, license) {
 function copyMermaid() {
   const mermaidDist = path.join(NODE_MODULES, 'mermaid', 'dist');
   const target = path.join(VENDOR_DIR, 'mermaid');
-  if (!existsSync(mermaidDist)) {
+  const entryPoint = path.join(mermaidDist, 'mermaid.esm.min.mjs');
+  if (!existsSync(entryPoint)) {
     process.stdout.write(
       '  ⚠ mermaid not installed (devDependency); skipping diagram bundle.\n',
     );
@@ -128,84 +141,68 @@ function copyMermaid() {
   }
   ensureDir(target);
-  // Per-file idempotency: walk the source tree and only copy files whose
-  // bytes differ from what's already in `js/vendor/mermaid/`. Replaces the
-  // earlier `rmSync` + `cpSync` approach which always touched every chunk's
-  // mtime — `aws s3 sync` (size+mtime by default) then re-uploaded the
-  // entire mermaid bundle on every deploy even though the bundle is byte-
-  // identical until the pinned mermaid version in package.json changes.
+  // Bundle mermaid's code-split ESM entry plus all of the chunks it imports
+  // into a SINGLE self-contained ESM file. esbuild follows every
+  // static and dynamic `import` from the entry and inlines the transitive
+  // closure, so the resulting file has no external module references —
+  // exactly what the static-site origin needs.
   //
-  // Filename contract preserved exactly: entry stays at
-  // `js/vendor/mermaid/mermaid.esm.min.mjs` and chunks stay at
-  // `js/vendor/mermaid/chunks/mermaid.esm.min/*.mjs` so every existing
-  // `<script type="module" src="../js/vendor/mermaid/mermaid.esm.min.mjs">`
-  // and dynamic `import()` from the entry continues to resolve.
-  // Build the set of source files we want to ship (filter mirrors the
-  // previous cpSync filter exactly).
-  const wantedTopLevel = new Set(['mermaid.esm.min.mjs']);
-  const wantedFiles = []; // { src, rel } — `rel` is relative to mermaidDist
-  function shouldShip(rel) {
-    if (rel.endsWith('.map')) return false;
-    const segments = rel.split(path.sep);
-    const top = segments[0];
-    if (top === 'chunks') {
-      if (segments.length === 1) return false; // directory itself, not a file
-      const flavour = segments[1];
-      return flavour === 'mermaid.esm.min';
-    }
-    if (segments.length === 1) {
-      return wantedTopLevel.has(top);
-    }
-    return false;
-  }
-  function walkSource(dir) {
-    for (const entry of readdirSync(dir, { withFileTypes: true })) {
-      const full = path.join(dir, entry.name);
-      const rel = path.relative(mermaidDist, full);
-      if (entry.isDirectory()) {
-        walkSource(full);
-      } else if (entry.isFile() && shouldShip(rel)) {
-        wantedFiles.push({ src: full, rel });
+  // We write the output under the same filename the loader and existing
+  // article HTML already reference (`mermaid.esm.min.mjs`), so this script
+  // is the only place that changes when we switch from "entry + 81 chunks"
+  // to "single bundle". The previous chunk-shipping layout (`chunks/`) is
+  // pruned below.
+  const outFile = path.join(target, 'mermaid.esm.min.mjs');
+  try {
+    esbuild.buildSync({
+      entryPoints: [entryPoint],
+      outfile: outFile,
+      bundle: true,
+      format: 'esm',
+      minify: true,
+      // `browser` keeps mermaid's runtime-detection paths (e.g. `document`
+      // checks) intact — same target as the upstream `.esm.min.mjs` build.
+      platform: 'browser',
+      target: 'es2022',
+      // Resolve `import.meta.url` at runtime (relative to the served bundle
+      // location) rather than baking in the build-time path.
+      supported: { 'import-meta': true },
+      // Drop sourcemaps; the upstream bundle ships them as `.map` siblings
+      // and we previously excluded those from vendor copy.
+      sourcemap: false,
+      legalComments: 'none',
+      // Use 'error' so esbuild prints its own detailed diagnostics (file,
+      // line, column) on failure — 'silent' previously swallowed all context.
+      logLevel: 'error',
+    });
+  } catch (err) {
+    // esbuild attaches structured diagnostics on `err.errors`; print them
+    // so CI logs are actionable without re-running locally.
+    if (err && Array.isArray(err.errors)) {
+      for (const e of err.errors) {
+        const loc = e.location
+          ? `${e.location.file}:${e.location.line}:${e.location.column}: `
+          : '';
+        process.stderr.write(`  ${loc}${e.text}\n`);
       }
     }
-  }
-  walkSource(mermaidDist);
-  // Copy only-if-changed.
-  let copied = 0;
-  let unchanged = 0;
-  for (const { src, rel } of wantedFiles) {
-    const dst = path.join(target, rel);
-    ensureDir(path.dirname(dst));
-    if (copyFileIfChanged(src, dst)) {
-      copied++;
-    } else {
-      unchanged++;
-    }
+    process.stderr.write(
+      `error: mermaid bundle failed: ${err && err.message ? err.message : err}\n` +
+        '       Check that node_modules/mermaid is installed (run `npm ci`) and that\n' +
+        '       esbuild can resolve the ESM entry point at node_modules/mermaid/dist/mermaid.esm.min.mjs.\n',
+    );
+    process.exit(1);
   }
-  // Remove orphaned files in the destination tree that no longer have a
-  // matching wanted source — this preserves the "no stale chunks from a
-  // previous mermaid version" guarantee that the old `rmSync` provided,
-  // without touching any current chunk's mtime.
-  const wantedDstSet = new Set(
-    wantedFiles.map(({ rel }) => path.join(target, rel)),
-  );
-  // Allow our REUSE sidecar files alongside their primary file.
+  // Prune the obsolete chunks layout (and any other orphans) from previous
+  // copy-vendor runs. The bundled file is fully self-contained, so anything
+  // other than the bundle itself + its REUSE sidecar is stale.
+  const wantedDstSet = new Set([outFile]);
   function isAllowedSidecar(absPath) {
     if (!absPath.endsWith('.license')) return false;
     const primary = absPath.slice(0, -'.license'.length);
     return wantedDstSet.has(primary);
   }
-  // Also allow the chunks-dir flavour-level license sidecar we drop below.
-  const flavourLicensePath = path.join(
-    target,
-    'chunks',
-    'mermaid.esm.min.license',
-  );
   function pruneOrphans(dir) {
     if (!existsSync(dir)) return;
@@ -213,7 +210,6 @@ function copyMermaid() {
       const full = path.join(dir, entry.name);
       if (entry.isDirectory()) {
         pruneOrphans(full);
-        // Remove now-empty directories so a flavour rename leaves no shell.
         try {
           if (readdirSync(full).length === 0) {
             rmSync(full, { recursive: true, force: true });
@@ -222,11 +218,7 @@ function copyMermaid() {
           // best-effort
         }
       } else if (entry.isFile()) {
-        if (
-          !wantedDstSet.has(full) &&
-          !isAllowedSidecar(full) &&
-          full !== flavourLicensePath
-        ) {
+        if (!wantedDstSet.has(full) && !isAllowedSidecar(full)) {
           rmSync(full, { force: true });
         }
       }
@@ -234,39 +226,19 @@ function copyMermaid() {
   }
   pruneOrphans(target);
-  // REUSE sidecar for the entry file + flavour directory.
-  const entry = path.join(target, 'mermaid.esm.min.mjs');
-  if (existsSync(entry)) {
-    writeLicense(entry, '2014-2026 Mermaid contributors', 'MIT');
-  }
-  // Also drop a license file at the chunks dir so REUSE lint passes for the
-  // generated tree without us having to enumerate every chunk by name.
-  const chunksDir = path.join(target, 'chunks', 'mermaid.esm.min');
-  if (existsSync(chunksDir)) {
-    writeIfChanged(
-      flavourLicensePath,
-      'SPDX-FileCopyrightText: 2014-2026 Mermaid contributors\nSPDX-License-Identifier: MIT\n',
-    );
-  }
+  // REUSE sidecar for the bundled file. The bundle contains code from
+  // mermaid + its transitive ESM deps; mermaid's own MIT license header
+  // remains intact in the dependency tree (REUSE.toml covers the vendored
+  // artifact via path-level annotation; this sidecar keeps the file
+  // self-documenting).
+  writeLicense(outFile, '2014-2026 Mermaid contributors', 'MIT');
+  const size = statSync(outFile).size;
   process.stdout.write(
-    `  ✓ mermaid/ (${copied} copied, ${unchanged} unchanged; ${countMjs(target)} total mjs chunks)\n`,
+    `  ✓ mermaid/mermaid.esm.min.mjs (${(size / 1024).toFixed(0)} KB self-contained bundle)\n`,
   );
 }
-function countMjs(dir) {
-  let n = 0;
-  function walk(d) {
-    if (!existsSync(d)) return;
-    for (const entry of readdirSync(d, { withFileTypes: true })) {
-      const p = path.join(d, entry.name);
-      if (entry.isDirectory()) walk(p);
-      else if (entry.isFile() && entry.name.endsWith('.mjs')) n += 1;
-    }
-  }
-  walk(dir);
-  return n;
-}
 function main() {
   ensureDir(VENDOR_DIR);
   process.stdout.write(`Copying vendor JS libraries to ${path.relative(ROOT, VENDOR_DIR)}/\n`);

package/scripts/dump-article-seo.js ADDED Viewed

@@ -0,0 +1,567 @@
+#!/usr/bin/env node
+// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * @module scripts/dump-article-seo
+ * @description Read-only preview of the SEO `<head>` metadata that the
+ * deterministic article generator **would produce** for every executive
+ * brief committed under `analysis/daily/`. Use this before running
+ * `npm run generate-article:all` to audit and improve titles,
+ * descriptions, and keywords without touching any `news/*.html` file.
+ *
+ * **Source: executive briefs, not HTML.**
+ * The script reads each analysis run's `executive-brief.md` (and its
+ * translated siblings) via the same resolver chain that the real
+ * article generator uses. No HTML files are read or written; the script
+ * is purely additive and fully idempotent.
+ *
+ * **Identical code path to the real renderer.** The script intentionally
+ * imports the same helpers that `scripts/aggregator/article-generator.js`
+ * (the engine behind `npm run generate-article:all` and the
+ * `regenerate-articles.yml` workflow) uses:
+ *
+ *   1. `discoverAnalysisRuns(repoRoot)` — same run discovery as the batch
+ *      renderer (`generator/render-batch.js`).
+ *   2. `aggregateAnalysisRun({ runDir, repoRoot })` — same Markdown
+ *      aggregation that feeds `resolveArticleMetadata`, which in turn
+ *      reads `executive-brief.md` and its translated siblings.
+ *   3. `resolveArticleMetadata({ articleType, date, markdown, manifest,
+ *      runDir })` — the single source of truth for per-language `(title,
+ *      description, extendedDescription, keywords, source)` documented in
+ *      `src/aggregator/article-metadata.ts`. The entry returned here is
+ *      *bit-for-bit identical* to the one passed into
+ *      `src/aggregator/html/shell.ts` for the `<title>`,
+ *      `<meta name="description">`, and `<meta name="keywords">` tags.
+ *
+ * **Two-part output per run.**
+ *   - *Field analysis* — human-readable breakdown of each SEO field
+ *     (length, content, resolution tier) for quick editorial review.
+ *   - *HTML head snippet* — the **complete `<head>` block** that the
+ *     article generator will emit, produced by calling
+ *     `wrapArticleHtml()` from `src/aggregator/html/shell.ts` with an
+ *     empty body and slicing out `<head>...</head>`. This includes the
+ *     `<title>`, `<meta name="description">`, `<meta name="keywords">`,
+ *     all `<meta property="og:*">` and `<meta name="twitter:*">` tags,
+ *     `<link rel="canonical">`, hreflang alternates, JSON-LD
+ *     `NewsArticle` + `BreadcrumbList`, and every other tag the real
+ *     renderer emits — because the snippet *is* the real renderer's
+ *     output. Copy-paste these into a browser extension or SEO tool to
+ *     preview how the article will appear in search results and social
+ *     cards before committing to HTML generation.
+ *
+ * Invocation:
+ *   node scripts/dump-article-seo.js \
+ *     [--repo-root <path>]   # defaults to process.cwd()
+ *     [--lang en]            # defaults to en
+ *     [--out <path>]         # also write the human-readable dump here
+ *     [--json <path>]        # also write a machine-readable JSONL dump
+ *     [--limit <N>]          # only process the first N runs (debug)
+ *     [--quiet]              # suppress per-run stdout (file output only)
+ */
+import fs from 'node:fs';
+import path from 'node:path';
+import process from 'node:process';
+import { discoverAnalysisRuns } from './aggregator/generator/discovery.js';
+import {
+  aggregateAnalysisRun,
+  resolveArticleTypeFromManifest,
+} from './aggregator/analysis-aggregator.js';
+import { resolveArticleMetadata } from './aggregator/article-metadata.js';
+import { buildArticleSlug } from './aggregator/generator/slug.js';
+import { getArticleFilename } from './aggregator/html/hreflang.js';
+import { wrapArticleHtml } from './aggregator/html/shell.js';
+import { ALL_LANGUAGES, isSupportedLanguage } from './constants/language-core.js';
+const SUPPORTED_LANGS = new Set(ALL_LANGUAGES);
+/**
+ * Parse the small CLI surface used by this script. Kept inline so the
+ * dumper has no extra dependencies beyond the same compiled-from-TS
+ * helpers the real renderer uses.
+ *
+ * Tokens are scanned left to right, so a flag that is repeated keeps its
+ * last value. Flags that take a value (`--repo-root`, `--lang`, `--out`,
+ * `--json`, `--limit`) read the next token through `requireValue` and then
+ * step over it. `--repo-root`, `--out` and `--json` are passed through
+ * `path.resolve`; `--out` and `--json` stay `null` when not supplied.
+ * `--limit` defaults to `Number.POSITIVE_INFINITY` (no cap) and otherwise
+ * must be a run of decimal digits whose value is at least 1. `--help` and
+ * `-h` hand off to `printHelpAndExit`, which ends the process.
+ * The language is validated once, after every token has been consumed.
+ *
+ * @param {readonly string[]} argv - `process.argv.slice(2)`
+ * @returns {{repoRoot: string, lang: string, outPath: string|null,
+ *           jsonPath: string|null, limit: number, quiet: boolean}}
+ * @throws {Error} On an unrecognised argument, a value flag with nothing
+ *   after it, a `--limit` that is not a positive integer, or a `--lang`
+ *   rejected by `isSupportedLanguage`.
+ */
+export function parseArgs(argv) {
+  let repoRoot = process.cwd();
+  let lang = 'en';
+  let outPath = null;
+  let jsonPath = null;
+  let limit = Number.POSITIVE_INFINITY;
+  let quiet = false;
+  for (let i = 0; i < argv.length; i += 1) {
+    const arg = argv[i];
+    switch (arg) {
+      case '--repo-root':
+        repoRoot = path.resolve(requireValue(argv, i, arg));
+        i += 1; // step past the value that was just consumed
+        break;
+      case '--lang':
+        lang = requireValue(argv, i, arg); // validated after the loop, not here
+        i += 1;
+        break;
+      case '--out':
+        outPath = path.resolve(requireValue(argv, i, arg));
+        i += 1;
+        break;
+      case '--json':
+        jsonPath = path.resolve(requireValue(argv, i, arg));
+        i += 1;
+        break;
+      case '--limit': {
+        const raw = requireValue(argv, i, arg);
+        if (!/^\d+$/u.test(raw)) { // digits only: rejects signs, decimals and blanks
+          throw new Error(`--limit expects a positive integer, got "${raw}"`);
+        }
+        const parsed = Number.parseInt(raw, 10);
+        if (!Number.isFinite(parsed) || parsed < 1) { // catches 0, which the digit check lets through
+          throw new Error(`--limit expects a positive integer, got "${raw}"`);
+        }
+        limit = parsed;
+        i += 1;
+        break;
+      }
+      case '--quiet':
+        quiet = true;
+        break;
+      case '--help':
+      case '-h':
+        printHelpAndExit();
+        break;
+      default:
+        throw new Error(`Unknown argument: ${arg}`);
+    }
+  }
+  if (!isSupportedLanguage(lang)) {
+    throw new Error(
+      `Unsupported --lang "${lang}". Expected one of: ${[...SUPPORTED_LANGS].join(', ')}`
+    );
+  }
+  return { repoRoot, lang, outPath, jsonPath, limit, quiet };
+}
+function requireValue(argv, i, flag) { // Fetch the token that follows the flag found at argv[i].
+  const value = argv[i + 1];
+  if (value === undefined) { // the flag was the last token, so it has no value
+    throw new Error(`${flag} requires a value`);
+  }
+  return value; // NOTE(review): returned unchecked, so a following flag would be taken as the value
+}
+function printHelpAndExit() { // Write the usage text to stdout, then end the process.
+  process.stdout.write(
+    [
+      'Usage: node scripts/dump-article-seo.js [options]',
+      '',
+      'Read-only preview of the SEO <head> metadata (title, description,',
+      'keywords, og:*, twitter:*) that the article generator would produce',
+      'from each executive brief — without generating any HTML files.',
+      '',
+      'Options:',
+      '  --repo-root <path>   Repository root (default: cwd)',
+      '  --lang <code>        Language to dump (default: en)',
+      '  --out <path>         Write the human-readable report here',
+      '  --json <path>        Also write a JSONL record per run',
+      '  --limit <N>          Process only the first N runs (debug)',
+      '  --quiet              Suppress per-run stdout',
+      '  -h, --help           Show this help',
+      '',
+    ].join('\n')
+  );
+  process.exit(0); // exit status 0; control does not return to parseArgs
+}
+/**
+ * Mirror of the private `readManifestMetadata` helper inside
+ * `scripts/aggregator/generator/render-one.js`. We re-implement it here
+ * rather than export it from the renderer because the metadata-relevant
+ * subset of a manifest is intentionally a *contract*, not a public API:
+ * the resolver only consumes the keys copied below and silently
+ * ignores everything else. Re-implementing keeps the dumper aligned
+ * with that contract without leaking unrelated manifest fields into
+ * `resolveArticleMetadata`.
+ *
+ * Keys copied, each only when its type check passes: `articleType`
+ * (taken from `resolveArticleTypeFromManifest`, skipped when that result
+ * is falsy or `'unknown'`), `date`, `runId` and `committee` (strings
+ * only), and `title` / `description` (a string, or a non-array object
+ * whose values are all strings).
+ *
+ * NOTE(review): this comment previously said "seven keys", but the body
+ * copies six — confirm against the helper in render-one.js.
+ * NOTE(review): a manifest whose JSON is the literal `null` parses without
+ * error, so the property reads on `parsed` would throw unless
+ * `resolveArticleTypeFromManifest` rejects it first — confirm whether that
+ * input can occur.
+ *
+ * @param {string} runDir - Absolute path to the analysis run
+ * @returns {object} Metadata-relevant manifest fields; an empty object when
+ *   `manifest.json` is missing or is not valid JSON
+ */
+export function readManifestMetadata(runDir) {
+  const manifestPath = path.join(runDir, 'manifest.json');
+  if (!fs.existsSync(manifestPath)) return {};
+  let parsed;
+  try {
+    parsed = JSON.parse(fs.readFileSync(manifestPath, 'utf8'));
+  } catch { // read or parse failure is treated the same as a missing manifest
+    return {};
+  }
+  const manifest = {};
+  const resolvedType = resolveArticleTypeFromManifest(parsed);
+  if (resolvedType && resolvedType !== 'unknown') {
+    manifest.articleType = resolvedType;
+  }
+  if (typeof parsed.date === 'string') manifest.date = parsed.date;
+  if (typeof parsed.runId === 'string') manifest.runId = parsed.runId;
+  if (typeof parsed.title === 'string' || isLanguageMapLike(parsed.title)) {
+    manifest.title = parsed.title;
+  }
+  if (
+    typeof parsed.description === 'string' ||
+    isLanguageMapLike(parsed.description)
+  ) {
+    manifest.description = parsed.description;
+  }
+  if (typeof parsed.committee === 'string') {
+    manifest.committee = parsed.committee;
+  }
+  return manifest;
+}
+function isLanguageMapLike(value) { // True for a non-array object whose values are all strings; keys are not inspected.
+  if (!value || typeof value !== 'object' || Array.isArray(value)) return false; // rejects null, primitives and arrays
+  for (const entry of Object.values(value)) {
+    if (typeof entry !== 'string') return false; // a single non-string value disqualifies the map
+  }
+  return true; // also reached for an object with no own enumerable values
+}
+/**
+ * Resolve the SEO metadata for one analysis run by reading its executive
+ * brief and applying the same resolver chain as the article generator.
+ * Nothing in this function body writes files or prints to stdout; the
+ * imported helpers are presumed to be read-only as well (TODO confirm).
+ *
+ * Steps: aggregate the run with `aggregateAnalysisRun`, read the manifest
+ * subset with `readManifestMetadata`, feed both to
+ * `resolveArticleMetadata`, then pick the entry for `lang`. The slug comes
+ * from `buildArticleSlug(date, articleType)` and the filename from
+ * `getArticleFilename(slug, lang)`. `runDirRel` is the run directory
+ * relative to `repoRoot`, rewritten to use forward slashes.
+ *
+ * @param {object} opts
+ * @param {string} opts.runDir  - Absolute path to the analysis run
+ * @param {string} opts.repoRoot - Repository root (for relative paths)
+ * @param {string} opts.lang    - Language code to extract
+ * @returns {{
+ *   runDir: string,
+ *   runDirRel: string,
+ *   date: string,
+ *   articleType: string,
+ *   slug: string,
+ *   filename: string,
+ *   entry: {title: string, description: string,
+ *           extendedDescription: string, keywords: readonly string[],
+ *           source: string}
+ * }}
+ * @throws {Error} When the resolver result has no entry for `lang`. Errors
+ *   raised by the imported helpers are not caught here.
+ */
+export function resolveRunSeo({ runDir, repoRoot, lang }) {
+  const aggregated = aggregateAnalysisRun({ runDir, repoRoot });
+  const manifestMetadata = readManifestMetadata(runDir);
+  const resolved = resolveArticleMetadata({
+    articleType: aggregated.articleType,
+    date: aggregated.date,
+    markdown: aggregated.markdown,
+    manifest: manifestMetadata,
+    runDir,
+  });
+  const entry = resolved[lang];
+  if (!entry) {
+    throw new Error(
+      `resolveArticleMetadata returned no entry for lang="${lang}" in ${runDir}`
+    );
+  }
+  const slug = buildArticleSlug(aggregated.date, aggregated.articleType);
+  const filename = getArticleFilename(slug, lang);
+  const runDirRel = path.relative(repoRoot, runDir).split(path.sep).join('/'); // same text on every platform
+  return {
+    runDir,
+    runDirRel,
+    date: aggregated.date,
+    articleType: aggregated.articleType,
+    slug,
+    filename,
+    entry: {
+      title: entry.title,
+      description: entry.description,
+      extendedDescription: entry.extendedDescription,
+      keywords: entry.keywords ?? [], // a missing keyword list becomes an empty array
+      source: entry.source,
+    },
+  };
+}
+/**
+ * Build the full `<head>` block that the article generator will emit for
+ * this run. The output is **bit-for-bit identical** to the `<head>`
+ * produced by `wrapArticleHtml()` in `src/aggregator/html/shell.ts` for
+ * the same metadata — including all SEO, Open Graph, Twitter card,
+ * `<link rel>`, theme-color, and JSON-LD tags — because this function
+ * literally invokes `wrapArticleHtml()` with an empty body and slices
+ * out the `<head>...</head>` block from the resulting document. There
+ * is no duplicated head-rendering code path.
+ *
+ * NOTE(review): the equivalence with a real render presumes the head does
+ * not depend on the article body, since the body passed here is the empty
+ * string — confirm against `wrapArticleHtml`.
+ *
+ * The slice is the shortest span from a bare `<head>` opening tag to the
+ * first `</head>`. An opening tag carrying attributes would not match and
+ * would surface as the error described below.
+ *
+ * Use this to preview how the article will appear in search results and
+ * social-card previews **before** running the full HTML generator.
+ *
+ * @param {ReturnType<typeof resolveRunSeo>} record
+ * @param {string} lang - Language code passed through to `wrapArticleHtml`
+ * @returns {string} The complete `<head>...</head>` block from the
+ *                   real article renderer, ready to paste for review.
+ * @throws {Error} When no `<head>...</head>` span is found in the output
+ *   of `wrapArticleHtml`.
+ */
+export function buildHtmlHeadSnippet(record, lang) {
+  const { entry } = record;
+  const html = wrapArticleHtml({
+    lang,
+    articleSlug: record.slug,
+    body: '',
+    title: entry.title,
+    description: entry.description,
+    extendedDescription: entry.extendedDescription,
+    keywords: entry.keywords ?? [],
+    date: record.date,
+    articleType: record.articleType,
+  });
+  const match = html.match(/<head>[\s\S]*?<\/head>/);
+  if (!match) {
+    throw new Error(
+      `buildHtmlHeadSnippet: could not locate <head> block in wrapArticleHtml output for ${record.slug}`
+    );
+  }
+  return match[0];
+}
+/**
+ * Format one resolved-SEO record as the human/AI-readable block used in
+ * the stdout dump. Each block contains two sections:
+ *   1. *Field analysis* — per-field character/term counts and the
+ *      resolution tier so editors can spot template fallbacks instantly.
+ *   2. *HTML head snippet* — the exact tags the article generator will
+ *      emit, ready to paste into a browser/SEO tool for preview.
+ *
+ * The character counts are the `length` of each string (UTF-16 code
+ * units). The title, description and extended description are read with
+ * `.length` directly, so a record in which any of them is not a string
+ * makes this function throw. `index` and `total` are only interpolated
+ * into the banner line; they are not validated.
+ *
+ * @param {ReturnType<typeof resolveRunSeo>} record
+ * @param {number} index - 1-based position within the dump
+ * @param {number} total - Total number of records being dumped
+ * @param {string} [lang] - Language code (used for the HTML snippet; defaults to 'en')
+ * @returns {string} The block's lines joined with newlines; the last line
+ *   is empty, so the text ends with a single newline
+ * @throws {Error} Whatever `buildHtmlHeadSnippet` throws for this record.
+ */
+export function formatRecord(record, index, total, lang = 'en') {
+  const lines = [];
+  lines.push('='.repeat(80));
+  lines.push(`[${index}/${total}] ${record.slug}`);
+  lines.push('='.repeat(80));
+  lines.push(`run-dir         : ${record.runDirRel}`);
+  lines.push(`date            : ${record.date}`);
+  lines.push(`article-type    : ${record.articleType}`);
+  lines.push(`resolution-tier : ${record.entry.source}`);
+  lines.push(`html-file       : news/${record.filename}`);
+  lines.push('');
+  lines.push('--- Field analysis (from executive-brief.md → resolveArticleMetadata) ---');
+  lines.push(
+    `<title>            (${record.entry.title.length} chars): ${formatInline(record.entry.title)}`
+  );
+  lines.push(
+    `<meta description> (${record.entry.description.length} chars): ${formatInline(record.entry.description)}`
+  );
+  lines.push(
+    `<meta description-extended> (${record.entry.extendedDescription.length} chars): ${formatInline(record.entry.extendedDescription)}`
+  );
+  const keywords = record.entry.keywords;
+  lines.push(
+    `<meta keywords>    (${keywords.length} terms): ${keywords.length ? keywords.join(', ') : '(empty)'}`
+  );
+  lines.push('');
+  lines.push('--- HTML <head> block (verbatim output of wrapArticleHtml — same code path as the article generator) ---');
+  lines.push(buildHtmlHeadSnippet(record, lang));
+  lines.push('');
+  return lines.join('\n');
+}
+function formatInline(value) { // Render one field on a single line for the text report.
+  if (!value) return '(empty)'; // covers undefined, null and the empty string
+  // Collapse every whitespace run (newlines included) to one space and trim the ends, so each field stays on one line.
+  return value.replace(/\s+/g, ' ').trim();
+}
+/**
+ * Run the full dump: discover analysis runs, resolve SEO metadata from
+ * each executive brief, print field analysis + HTML head snippet, and
+ * optionally write to disk. Returns summary statistics so unit tests and
+ * downstream tooling can assert on histograms without re-parsing stdout.
+ *
+ * @param {ReturnType<typeof parseArgs>} opts
+ * @returns {{
+ *   discovered: number,
+ *   total: number,
+ *   processed: number,
+ *   resolutionTiers: Record<string, number>,
+ *   emptyKeywordCount: number,
+ *   shortDescriptionCount: number,
+ *   records: ReadonlyArray<ReturnType<typeof resolveRunSeo>>
+ * }}
+ */
+export function dumpArticleSeo(opts) {
+  const { repoRoot, lang, outPath, jsonPath, limit, quiet } = opts;
+  const allRuns = discoverAnalysisRuns(repoRoot).map((run) => run.runDir);
+  const discovered = allRuns.length;
+  const targetRuns = Number.isFinite(limit) ? allRuns.slice(0, limit) : allRuns;
+  const total = targetRuns.length;
+  const records = [];
+  const failures = [];
+  const resolutionTiers = Object.create(null);
+  let emptyKeywordCount = 0;
+  let shortDescriptionCount = 0;
+  const textChunks = [];
+  const jsonLines = [];
+  const header =
+    `# Executive Brief SEO Preview\n` +
+    `# Source         : executive-brief.md under analysis/daily/*/\n` +
+    `# repo-root      : ${repoRoot}\n` +
+    `# language       : ${lang}\n` +
+    `# total runs     : ${discovered}\n` +
+    `# selected runs  : ${total}\n` +
+    `# generated by   : scripts/dump-article-seo.js\n` +
+    `# resolver       : src/aggregator/article-metadata.ts → resolveArticleMetadata()\n` +
+    `# rendered by    : src/aggregator/html/shell.ts (same call path as npm run generate-article:all)\n` +
+    `# purpose        : review and improve SEO before generating HTML\n\n`;
+  if (!quiet) process.stdout.write(header);
+  textChunks.push(header);
+  for (let i = 0; i < targetRuns.length; i += 1) {
+    const runDir = targetRuns[i];
+    let record;
+    try {
+      record = resolveRunSeo({ runDir, repoRoot, lang });
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      failures.push({ runDir, message });
+      const failBlock = `--- FAILED ${path.relative(repoRoot, runDir)}: ${message}\n\n`;
+      if (!quiet) process.stderr.write(failBlock);
+      textChunks.push(failBlock);
+      continue;
+    }
+    records.push(record);
+    const tier = record.entry.source;
+    resolutionTiers[tier] = (resolutionTiers[tier] ?? 0) + 1;
+    if (record.entry.keywords.length === 0) emptyKeywordCount += 1;
+    if (record.entry.description.length < 70) shortDescriptionCount += 1;
+    const block = formatRecord(record, i + 1, total, lang);
+    if (!quiet) process.stdout.write(`${block}\n`);
+    textChunks.push(`${block}\n`);
+    jsonLines.push(
+      JSON.stringify({
+        slug: record.slug,
+        runDir: record.runDirRel,
+        date: record.date,
+        articleType: record.articleType,
+        lang,
+        filename: record.filename,
+        source: record.entry.source,
+        title: record.entry.title,
+        description: record.entry.description,
+        extendedDescription: record.entry.extendedDescription,
+        keywords: record.entry.keywords,
+        htmlHeadSnippet: buildHtmlHeadSnippet(record, lang),
+      })
+    );
+  }
+  const summary = buildSummary({
+    discovered,
+    total,
+    processed: records.length,
+    failures,
+    resolutionTiers,
+    emptyKeywordCount,
+    shortDescriptionCount,
+  });
+  if (!quiet) process.stdout.write(summary);
+  textChunks.push(summary);
+  if (outPath) {
+    fs.mkdirSync(path.dirname(outPath), { recursive: true });
+    fs.writeFileSync(outPath, textChunks.join(''), 'utf8');
+  }
+  if (jsonPath) {
+    fs.mkdirSync(path.dirname(jsonPath), { recursive: true });
+    fs.writeFileSync(jsonPath, `${jsonLines.join('\n')}\n`, 'utf8');
+  }
+  return {
+    discovered,
+    total,
+    processed: records.length,
+    resolutionTiers,
+    emptyKeywordCount,
+    shortDescriptionCount,
+    records,
+  };
+}
+function buildSummary({
+  discovered,
+  total,
+  processed,
+  failures,
+  resolutionTiers,
+  emptyKeywordCount,
+  shortDescriptionCount,
+}) {
+  const tierEntries = Object.entries(resolutionTiers).sort(
+    ([a], [b]) => a.localeCompare(b)
+  );
+  const lines = [];
+  lines.push('='.repeat(80));
+  lines.push('SUMMARY');
+  lines.push('='.repeat(80));
+  lines.push(`total runs discovered : ${discovered}`);
+  lines.push(`selected for preview  : ${total}`);
+  lines.push(`successfully resolved : ${processed}`);
+  lines.push(`failed runs           : ${failures.length}`);
+  lines.push('');
+  lines.push('Resolution-tier histogram (alphabetical by source label):');
+  if (tierEntries.length === 0) {
+    lines.push('  (no runs resolved)');
+  } else {
+    for (const [tier, count] of tierEntries) {
+      lines.push(`  ${tier.padEnd(20)} ${count}`);
+    }
+  }
+  lines.push('');
+  lines.push('Quality flags:');
+  lines.push(`  runs with empty <meta keywords>           : ${emptyKeywordCount}`);
+  lines.push(`  runs with <meta description> shorter than 70 chars : ${shortDescriptionCount}`);
+  if (failures.length > 0) {
+    lines.push('');
+    lines.push('Failures:');
+    for (const fail of failures) {
+      lines.push(`  - ${fail.runDir}: ${fail.message}`);
+    }
+  }
+  lines.push('');
+  return `${lines.join('\n')}`;
+}
+// Run as a script only when invoked directly (not when imported by tests).
+const invokedDirectly =
+  import.meta.url === `file://${process.argv[1]}` ||
+  process.argv[1]?.endsWith('dump-article-seo.js');
+if (invokedDirectly) {
+  try {
+    const opts = parseArgs(process.argv.slice(2));
+    dumpArticleSeo(opts);
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error);
+    process.stderr.write(`dump-article-seo: ${message}\n`);
+    process.exit(1);
+  }
+}

package/scripts/generators/news-indexes/backfill.d.ts CHANGED Viewed

@@ -50,10 +50,15 @@ export declare function healJsonLdDescriptionCorruption(filenames: readonly stri
  * @param slug - Article slug (used to derive the category)
  * @param lang - Article language (ISO 639-1 lower-case code)
  * @param description - Candidate description (resolver output preferred)
+ * @param options - Backfill options
+ * @param options.forceContextPrefix - Force date/language/category prefix
+ *   even when the description is already substantive
  * @returns Page-specific description; prefix-free only when description is
  *   already substantive and `options.forceContextPrefix` is not set
  */
-export declare function buildLegacyBackfillDescription(date: string, slug: string, lang: string, description: string): string;
+export declare function buildLegacyBackfillDescription(date: string, slug: string, lang: string, description: string, options?: {
+    readonly forceContextPrefix?: boolean;
+}): string;
 /**
  * Apply SEO meta tag replacements to a complete article HTML document.
  *

package/scripts/generators/news-indexes/backfill.js CHANGED Viewed

@@ -17,6 +17,23 @@ import { formatSlug, parseArticleFilename, extractArticleMeta, escapeHTML, atomi
 import { detectCategory } from '../../utils/article-category.js';
 import { buildSeoKeywords, resolveArticleMetadata } from '../../aggregator/article-metadata.js';
 const MIN_ARTICLE_DESCRIPTION_LENGTH = 120;
+/**
+ * Native-language display names keyed by lower-case language code.
+ * Read by `legacyLanguageLabel` whenever a legacy backfill description is
+ * given a date/language/category context prefix.
+ */
+const LEGACY_LANGUAGE_LABELS = {
+    en: 'English',
+    sv: 'Svenska',
+    da: 'Dansk',
+    no: 'Norsk',
+    fi: 'Suomi',
+    de: 'Deutsch',
+    fr: 'Français',
+    es: 'Español',
+    nl: 'Nederlands',
+    ar: 'العربية',
+    he: 'עברית',
+    ja: '日本語',
+    ko: '한국어',
+    zh: '中文',
+};
 /**
  * Regex pattern that flags internal artefact identifiers
  * (`<slug>-run<N>-<unix-ts>`). Used by
@@ -127,7 +144,9 @@ function backfillOneLegacyArticleSeo(filename, descriptions) {
         ? resolverDescription
         : safeDescription || formatSlug(parsed.slug);
     const description = needsDescription
-        ? buildLegacyBackfillDescription(parsed.date, parsed.slug, parsed.lang, baseDescription)
+        ? buildLegacyBackfillDescription(parsed.date, parsed.slug, parsed.lang, baseDescription, {
+            forceContextPrefix: true,
+        })
         : meta.description;
     const keywords = entry?.keywords ?? fallbackKeywords;
     const nextHtml = applyArticleSeoBackfill(html, description, keywords);
@@ -160,23 +179,71 @@ function backfillOneLegacyArticleSeo(filename, descriptions) {
  * @param slug - Article slug (used to derive the category)
  * @param lang - Article language (ISO 639-1 lower-case code)
  * @param description - Candidate description (resolver output preferred)
+ * @param options - Backfill options
+ * @param options.forceContextPrefix - Force date/language/category prefix
+ *   even when the description is already substantive
  * @returns Page-specific description; prefix-free only when description is
  *   already substantive and `options.forceContextPrefix` is not set
  */
-export function buildLegacyBackfillDescription(date, slug, lang, description) {
+export function buildLegacyBackfillDescription(date, slug, lang, description, options = {}) {
     const trimmedDescription = description.trim();
-    if (trimmedDescription.length >= MIN_ARTICLE_DESCRIPTION_LENGTH) {
+    if (trimmedDescription.length >= MIN_ARTICLE_DESCRIPTION_LENGTH && !options.forceContextPrefix) { // a forced prefix bypasses this prefix-free early return
         return capDescriptionLength(trimmedDescription);
     }
     const category = detectCategory(slug);
     const langCode = (lang || 'en').toLowerCase();
     const categoryLabels = getLocalizedString(ARTICLE_TYPE_LABELS, langCode);
     const label = categoryLabels[category] ?? formatSlug(slug);
-    const prefix = `${date} — ${label}`;
+    const qualifier = buildLegacySlugQualifier(slug, label);
+    const languageLabel = legacyLanguageLabel(langCode);
+    const prefix = [date, languageLabel, label, qualifier]
+        .filter((part) => part.length > 0) // drops the qualifier when it came back empty
+        .join(' — ');
     const body = trimmedDescription || label;
     const contextual = `${prefix} — ${body}`.replace(/\s+/g, ' ').trim();
     return capDescriptionLength(contextual);
 }
+/**
+ * Resolve the human language label used to make otherwise-identical
+ * cross-locale legacy descriptions unique.
+ *
+ * @param lang - Language code
+ * @returns Local language name, or the raw code if unknown
+ */
+function legacyLanguageLabel(lang) {
+    const descriptor = Object.getOwnPropertyDescriptor(LEGACY_LANGUAGE_LABELS, lang);
+    return typeof descriptor?.value === 'string' ? descriptor.value : lang;
+}
+/**
+ * Build an optional slug-derived qualifier for legacy pages that share the
+ * same date and article category (for example same-day `*-run2` variants).
+ *
+ * @param slug - Article slug without date/language suffix
+ * @param localizedLabel - Localized category label already present in prefix
+ * @returns Human-readable qualifier, or empty when it would duplicate label
+ */
+function buildLegacySlugQualifier(slug, localizedLabel) {
+    const formatted = formatSlug(slug).trim();
+    if (!formatted)
+        return '';
+    const normalizedFormatted = normalizeLegacyQualifier(formatted);
+    const normalizedLabel = normalizeLegacyQualifier(localizedLabel);
+    if (!normalizedFormatted || normalizedFormatted === normalizedLabel)
+        return '';
+    return formatted;
+}
+/**
+ * Normalize a prefix component for duplicate detection.
+ *
+ * @param value - Candidate text
+ * @returns Lower-case alphanumeric text
+ */
+function normalizeLegacyQualifier(value) {
+    return value
+        .toLowerCase()
+        .replace(/[^\p{L}\p{N}]+/gu, ' ')
+        .trim();
+}
 /**
  * Clamp a description to the 180-character SERP-friendly cap with a
  * trailing ellipsis when truncated. Extracted from