euparliamentmonitor 0.9.21 → 0.9.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "euparliamentmonitor",
3
- "version": "0.9.21",
3
+ "version": "0.9.22",
4
4
  "type": "module",
5
5
  "description": "European Parliament Intelligence Platform - Monitor political activity with systematic transparency",
6
6
  "main": "scripts/index.js",
@@ -71,6 +71,7 @@
71
71
  "prior-run-diff": "node scripts/aggregator/prior-run-diff.js",
72
72
  "generate-article": "node scripts/aggregator/article-generator.js",
73
73
  "generate-article:all": "node scripts/aggregator/article-generator.js --all",
74
+ "dump:article-seo": "node scripts/dump-article-seo.js",
74
75
  "generate-news-indexes": "node scripts/generators/news-indexes.js",
75
76
  "generate-sitemap": "node scripts/generators/sitemap.js",
76
77
  "image:generate": "node scripts/generate-responsive-images.js",
@@ -164,6 +165,7 @@
164
165
  "chartjs-plugin-annotation": "3.1.0",
165
166
  "clean-css": "^5.3.3",
166
167
  "d3": "7.9.0",
168
+ "esbuild": "0.28.0",
167
169
  "eslint": "10.4.0",
168
170
  "eslint-config-prettier": "10.1.8",
169
171
  "eslint-plugin-jsdoc": "63.0.0",
@@ -208,6 +210,7 @@
208
210
  "flatted": ">=3.4.2",
209
211
  "path-to-regexp": ">=8.4.0",
210
212
  "ip-address": ">=10.1.1",
211
- "uuid": ">=11.1.1"
213
+ "uuid": ">=11.1.1",
214
+ "qs": "6.15.2"
212
215
  }
213
216
  }
@@ -17,7 +17,7 @@ import fs from 'fs';
17
17
  import path from 'path';
18
18
  import { extractFirstH1 } from './h1-extractor.js';
19
19
  import { extractLedeAfterHeading, extractStrongProseLine } from './lede-extractor.js';
20
- import { isGenericHeading, stripArtifactCategoryAffix } from './heading-rules.js';
20
+ import { isGenericHeading, isArtifactCategoryHeading, stripArtifactCategoryAffix, } from './heading-rules.js';
21
21
  import { truncateTitle } from './text-utils.js';
22
22
  import { extractPriorityFindingHighlight } from './priority-finding-highlight.js';
23
23
  /** Ordered list of artefact filenames that typically carry the editorial H1. */
@@ -132,7 +132,7 @@ function probeCandidateForHighlight(runDir, rel, articleType, date) {
132
132
  // distinctive editorial headline ("Digital Markets Act Enforcement",
133
133
  // "Ukraine War Accountability") instead of a stripped category noun.
134
134
  const priority = extractPriorityFindingHighlight(body);
135
- if (priority?.headline) {
135
+ if (priority?.headline && !isArtifactCategoryHeading(priority.headline)) {
136
136
  return {
137
137
  cleanHighlight: {
138
138
  headline: truncateTitle(priority.headline),
@@ -69,6 +69,7 @@ export const ARTIFACT_CATEGORY_PREFIXES = [
69
69
  'commission wp alignment',
70
70
  'committee activity report',
71
71
  'cross run continuity',
72
+ 'data availability assessment',
72
73
  'deep analysis',
73
74
  'economic context',
74
75
  'executive brief',
@@ -12,15 +12,25 @@
12
12
  * - chart.js → js/vendor/chart.umd.min.js
13
13
  * - chartjs-plugin-annotation → js/vendor/chartjs-plugin-annotation.min.js
14
14
  * - d3 → js/vendor/d3.min.js
15
- * - mermaid → js/vendor/mermaid/ (entry + chunks/)
15
+ * - mermaid → js/vendor/mermaid/mermaid.esm.min.mjs
16
16
  *
17
- * Mermaid is special: v11+ ships as code-split ESM. The entry
18
- * `mermaid.esm.min.mjs` does dynamic `import()` on diagram-specific chunks
19
- * under `dist/chunks/mermaid.esm.min/*.mjs`. To make every diagram type render
20
- * without external network calls, we copy the **entire mermaid `dist/`
21
- * directory** (filtered to the `.esm.min` flavour to keep payload small).
17
+ * Mermaid is special: v11+ ships as a **code-split ESM bundle**. The entry
18
+ * `mermaid.esm.min.mjs` (28 KB) statically imports 81 diagram-specific chunks
19
+ * from `dist/chunks/mermaid.esm.min/*.mjs`. Empirically (May 2026), serving
20
+ * those chunks through S3 + CloudFront has been unreliable — the entry returns
21
+ * 200 OK but every chunk URL returns 403 from CloudFront, breaking every
22
+ * article that references the loader.
22
23
  *
23
- * Idempotent: rerunning overwrites prior copies and leaves licenses in place.
24
+ * To eliminate that failure mode, we **bundle Mermaid into a single
25
+ * self-contained ESM file at copy-vendor time using esbuild** (devDependency).
26
+ * The output is written to the same path / filename that the loader and the
27
+ * existing article HTML already reference (`mermaid.esm.min.mjs`), so the
28
+ * loader (`js/mermaid-init.js`) and the generated articles continue to work
29
+ * unchanged — only the file's content changes (3.2 MB self-contained vs.
30
+ * 28 KB entry-plus-81-chunks).
31
+ *
32
+ * Idempotent: rerunning overwrites prior copies and leaves licenses in place;
33
+ * stale `chunks/` directories from prior layouts are pruned.
24
34
  *
25
35
  * Failure modes:
26
36
  * - Missing chart.js / d3 / chartjs-plugin-annotation → hard error (these
@@ -28,11 +38,12 @@
28
38
  * - Missing mermaid → soft error (logged, exit 0). Mermaid is also a pinned
29
39
  * `devDependency`, but optional installs (e.g. `npm ci --omit=dev`) may
30
40
  * skip it; we want the deploy to succeed without diagrams rather than fail.
41
+ * - Bundling failure → hard error: mermaid is present but unusable, which
42
+ * would silently ship a broken page; fail fast at build time instead.
31
43
  */
32
44
 
33
45
  import {
34
46
  copyFileSync,
35
- cpSync,
36
47
  existsSync,
37
48
  mkdirSync,
38
49
  readdirSync,
@@ -43,6 +54,7 @@ import {
43
54
  } from 'node:fs';
44
55
  import path from 'node:path';
45
56
  import process from 'node:process';
57
+ import * as esbuild from 'esbuild';
46
58
 
47
59
  const ROOT = process.cwd();
48
60
  const NODE_MODULES = path.join(ROOT, 'node_modules');
@@ -120,7 +132,8 @@ function copyOrFail(label, srcRel, dstRel, license) {
120
132
  function copyMermaid() {
121
133
  const mermaidDist = path.join(NODE_MODULES, 'mermaid', 'dist');
122
134
  const target = path.join(VENDOR_DIR, 'mermaid');
123
- if (!existsSync(mermaidDist)) {
135
+ const entryPoint = path.join(mermaidDist, 'mermaid.esm.min.mjs');
136
+ if (!existsSync(entryPoint)) {
124
137
  process.stdout.write(
125
138
  ' ⚠ mermaid not installed (devDependency); skipping diagram bundle.\n',
126
139
  );
@@ -128,84 +141,68 @@ function copyMermaid() {
128
141
  }
129
142
  ensureDir(target);
130
143
 
131
- // Per-file idempotency: walk the source tree and only copy files whose
132
- // bytes differ from what's already in `js/vendor/mermaid/`. Replaces the
133
- // earlier `rmSync` + `cpSync` approach which always touched every chunk's
134
- // mtime — `aws s3 sync` (size+mtime by default) then re-uploaded the
135
- // entire mermaid bundle on every deploy even though the bundle is byte-
136
- // identical until the pinned mermaid version in package.json changes.
144
+ // Bundle mermaid's code-split ESM entry plus all of its dynamic-import
145
+ // chunks into a SINGLE self-contained ESM file. esbuild follows every
146
+ // static and dynamic `import` from the entry and inlines the transitive
147
+ // closure, so the resulting file has no external module references —
148
+ // exactly what the static-site origin needs.
137
149
  //
138
- // Filename contract preserved exactly: entry stays at
139
- // `js/vendor/mermaid/mermaid.esm.min.mjs` and chunks stay at
140
- // `js/vendor/mermaid/chunks/mermaid.esm.min/*.mjs` so every existing
141
- // `<script type="module" src="../js/vendor/mermaid/mermaid.esm.min.mjs">`
142
- // and dynamic `import()` from the entry continues to resolve.
143
-
144
- // Build the set of source files we want to ship (filter mirrors the
145
- // previous cpSync filter exactly).
146
- const wantedTopLevel = new Set(['mermaid.esm.min.mjs']);
147
- const wantedFiles = []; // { src, rel } — `rel` is relative to mermaidDist
148
-
149
- function shouldShip(rel) {
150
- if (rel.endsWith('.map')) return false;
151
- const segments = rel.split(path.sep);
152
- const top = segments[0];
153
- if (top === 'chunks') {
154
- if (segments.length === 1) return false; // directory itself, not a file
155
- const flavour = segments[1];
156
- return flavour === 'mermaid.esm.min';
157
- }
158
- if (segments.length === 1) {
159
- return wantedTopLevel.has(top);
160
- }
161
- return false;
162
- }
163
-
164
- function walkSource(dir) {
165
- for (const entry of readdirSync(dir, { withFileTypes: true })) {
166
- const full = path.join(dir, entry.name);
167
- const rel = path.relative(mermaidDist, full);
168
- if (entry.isDirectory()) {
169
- walkSource(full);
170
- } else if (entry.isFile() && shouldShip(rel)) {
171
- wantedFiles.push({ src: full, rel });
150
+ // We write the output under the same filename the loader and existing
151
+ // article HTML already reference (`mermaid.esm.min.mjs`), so this script
152
+ // is the only place that changes when we switch from "entry + 81 chunks"
153
+ // to "single bundle". The previous chunk-shipping layout (`chunks/`) is
154
+ // pruned below.
155
+ const outFile = path.join(target, 'mermaid.esm.min.mjs');
156
+ try {
157
+ esbuild.buildSync({
158
+ entryPoints: [entryPoint],
159
+ outfile: outFile,
160
+ bundle: true,
161
+ format: 'esm',
162
+ minify: true,
163
+ // `browser` keeps mermaid's runtime-detection paths (e.g. `document`
164
+ // checks) intact — same target as the upstream `.esm.min.mjs` build.
165
+ platform: 'browser',
166
+ target: 'es2022',
167
+ // Resolve `import.meta.url` at runtime (relative to the served bundle
168
+ // location) rather than baking in the build-time path.
169
+ supported: { 'import-meta': true },
170
+ // Drop sourcemaps; the upstream bundle ships them as `.map` siblings
171
+ // and we previously excluded those from vendor copy.
172
+ sourcemap: false,
173
+ legalComments: 'none',
174
+ // Use 'error' so esbuild prints its own detailed diagnostics (file,
175
+ // line, column) on failure — 'silent' previously swallowed all context.
176
+ logLevel: 'error',
177
+ });
178
+ } catch (err) {
179
+ // esbuild attaches structured diagnostics on `err.errors`; print them
180
+ // so CI logs are actionable without re-running locally.
181
+ if (err && Array.isArray(err.errors)) {
182
+ for (const e of err.errors) {
183
+ const loc = e.location
184
+ ? `${e.location.file}:${e.location.line}:${e.location.column}: `
185
+ : '';
186
+ process.stderr.write(` ${loc}${e.text}\n`);
172
187
  }
173
188
  }
174
- }
175
- walkSource(mermaidDist);
176
-
177
- // Copy only-if-changed.
178
- let copied = 0;
179
- let unchanged = 0;
180
- for (const { src, rel } of wantedFiles) {
181
- const dst = path.join(target, rel);
182
- ensureDir(path.dirname(dst));
183
- if (copyFileIfChanged(src, dst)) {
184
- copied++;
185
- } else {
186
- unchanged++;
187
- }
189
+ process.stderr.write(
190
+ `error: mermaid bundle failed: ${err && err.message ? err.message : err}\n` +
191
+ ' Check that node_modules/mermaid is installed (run `npm ci`) and that\n' +
192
+ ' esbuild can resolve the ESM entry point at node_modules/mermaid/dist/mermaid.esm.min.mjs.\n',
193
+ );
194
+ process.exit(1);
188
195
  }
189
196
 
190
- // Remove orphaned files in the destination tree that no longer have a
191
- // matching wanted source — this preserves the "no stale chunks from a
192
- // previous mermaid version" guarantee that the old `rmSync` provided,
193
- // without touching any current chunk's mtime.
194
- const wantedDstSet = new Set(
195
- wantedFiles.map(({ rel }) => path.join(target, rel)),
196
- );
197
- // Allow our REUSE sidecar files alongside their primary file.
197
+ // Prune the obsolete chunks layout (and any other orphans) from previous
198
+ // copy-vendor runs. The bundled file is fully self-contained, so anything
199
+ // other than the bundle itself + its REUSE sidecar is stale.
200
+ const wantedDstSet = new Set([outFile]);
198
201
  function isAllowedSidecar(absPath) {
199
202
  if (!absPath.endsWith('.license')) return false;
200
203
  const primary = absPath.slice(0, -'.license'.length);
201
204
  return wantedDstSet.has(primary);
202
205
  }
203
- // Also allow the chunks-dir flavour-level license sidecar we drop below.
204
- const flavourLicensePath = path.join(
205
- target,
206
- 'chunks',
207
- 'mermaid.esm.min.license',
208
- );
209
206
 
210
207
  function pruneOrphans(dir) {
211
208
  if (!existsSync(dir)) return;
@@ -213,7 +210,6 @@ function copyMermaid() {
213
210
  const full = path.join(dir, entry.name);
214
211
  if (entry.isDirectory()) {
215
212
  pruneOrphans(full);
216
- // Remove now-empty directories so a flavour rename leaves no shell.
217
213
  try {
218
214
  if (readdirSync(full).length === 0) {
219
215
  rmSync(full, { recursive: true, force: true });
@@ -222,11 +218,7 @@ function copyMermaid() {
222
218
  // best-effort
223
219
  }
224
220
  } else if (entry.isFile()) {
225
- if (
226
- !wantedDstSet.has(full) &&
227
- !isAllowedSidecar(full) &&
228
- full !== flavourLicensePath
229
- ) {
221
+ if (!wantedDstSet.has(full) && !isAllowedSidecar(full)) {
230
222
  rmSync(full, { force: true });
231
223
  }
232
224
  }
@@ -234,39 +226,19 @@ function copyMermaid() {
234
226
  }
235
227
  pruneOrphans(target);
236
228
 
237
- // REUSE sidecar for the entry file + flavour directory.
238
- const entry = path.join(target, 'mermaid.esm.min.mjs');
239
- if (existsSync(entry)) {
240
- writeLicense(entry, '2014-2026 Mermaid contributors', 'MIT');
241
- }
242
- // Also drop a license file at the chunks dir so REUSE lint passes for the
243
- // generated tree without us having to enumerate every chunk by name.
244
- const chunksDir = path.join(target, 'chunks', 'mermaid.esm.min');
245
- if (existsSync(chunksDir)) {
246
- writeIfChanged(
247
- flavourLicensePath,
248
- 'SPDX-FileCopyrightText: 2014-2026 Mermaid contributors\nSPDX-License-Identifier: MIT\n',
249
- );
250
- }
229
+ // REUSE sidecar for the bundled file. The bundle contains code from
230
+ // mermaid + its transitive ESM deps; mermaid's own MIT license header
231
+ // remains intact in the dependency tree (REUSE.toml covers the vendored
232
+ // artifact via path-level annotation; this sidecar keeps the file
233
+ // self-documenting).
234
+ writeLicense(outFile, '2014-2026 Mermaid contributors', 'MIT');
235
+
236
+ const size = statSync(outFile).size;
251
237
  process.stdout.write(
252
- ` ✓ mermaid/ (${copied} copied, ${unchanged} unchanged; ${countMjs(target)} total mjs chunks)\n`,
238
+ ` ✓ mermaid/mermaid.esm.min.mjs (${(size / 1024).toFixed(0)} KB self-contained bundle)\n`,
253
239
  );
254
240
  }
255
241
 
256
- function countMjs(dir) {
257
- let n = 0;
258
- function walk(d) {
259
- if (!existsSync(d)) return;
260
- for (const entry of readdirSync(d, { withFileTypes: true })) {
261
- const p = path.join(d, entry.name);
262
- if (entry.isDirectory()) walk(p);
263
- else if (entry.isFile() && entry.name.endsWith('.mjs')) n += 1;
264
- }
265
- }
266
- walk(dir);
267
- return n;
268
- }
269
-
270
242
  function main() {
271
243
  ensureDir(VENDOR_DIR);
272
244
  process.stdout.write(`Copying vendor JS libraries to ${path.relative(ROOT, VENDOR_DIR)}/\n`);
@@ -0,0 +1,567 @@
1
+ #!/usr/bin/env node
2
+ // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ /**
6
+ * @module scripts/dump-article-seo
7
+ * @description Read-only preview of the SEO `<head>` metadata that the
8
+ * deterministic article generator **would produce** for every executive
9
+ * brief committed under `analysis/daily/`. Use this before running
10
+ * `npm run generate-article:all` to audit and improve titles,
11
+ * descriptions, and keywords without touching any `news/*.html` file.
12
+ *
13
+ * **Source: executive briefs, not HTML.**
14
+ * The script reads each analysis run's `executive-brief.md` (and its
15
+ * translated siblings) via the same resolver chain that the real
16
+ * article generator uses. No HTML files are read or written; the script
17
+ * is purely additive and fully idempotent.
18
+ *
19
+ * **Identical code path to the real renderer.** The script intentionally
20
+ * imports the same helpers that `scripts/aggregator/article-generator.js`
21
+ * (the engine behind `npm run generate-article:all` and the
22
+ * `regenerate-articles.yml` workflow) uses:
23
+ *
24
+ * 1. `discoverAnalysisRuns(repoRoot)` — same run discovery as the batch
25
+ * renderer (`generator/render-batch.js`).
26
+ * 2. `aggregateAnalysisRun({ runDir, repoRoot })` — same Markdown
27
+ * aggregation that feeds `resolveArticleMetadata`, which in turn
28
+ * reads `executive-brief.md` and its translated siblings.
29
+ * 3. `resolveArticleMetadata({ articleType, date, markdown, manifest,
30
+ * runDir })` — the single source of truth for per-language `(title,
31
+ * description, extendedDescription, keywords, source)` documented in
32
+ * `src/aggregator/article-metadata.ts`. The entry returned here is
33
+ * *bit-for-bit identical* to the one passed into
34
+ * `src/aggregator/html/shell.ts` for the `<title>`,
35
+ * `<meta name="description">`, and `<meta name="keywords">` tags.
36
+ *
37
+ * **Two-part output per run.**
38
+ * - *Field analysis* — human-readable breakdown of each SEO field
39
+ * (length, content, resolution tier) for quick editorial review.
40
+ * - *HTML head snippet* — the **complete `<head>` block** that the
41
+ * article generator will emit, produced by calling
42
+ * `wrapArticleHtml()` from `src/aggregator/html/shell.ts` with an
43
+ * empty body and slicing out `<head>...</head>`. This includes the
44
+ * `<title>`, `<meta name="description">`, `<meta name="keywords">`,
45
+ * all `<meta property="og:*">` and `<meta name="twitter:*">` tags,
46
+ * `<link rel="canonical">`, hreflang alternates, JSON-LD
47
+ * `NewsArticle` + `BreadcrumbList`, and every other tag the real
48
+ * renderer emits — because the snippet *is* the real renderer's
49
+ * output. Copy-paste these into a browser extension or SEO tool to
50
+ * preview how the article will appear in search results and social
51
+ * cards before committing to HTML generation.
52
+ *
53
+ * Invocation:
54
+ * node scripts/dump-article-seo.js \
55
+ * [--repo-root <path>] # defaults to process.cwd()
56
+ * [--lang en] # defaults to en
57
+ * [--out <path>] # also write the human-readable dump here
58
+ * [--json <path>] # also write a machine-readable JSONL dump
59
+ * [--limit <N>] # only process the first N runs (debug)
60
+ * [--quiet] # suppress per-run stdout (file output only)
61
+ */
62
+
63
+ import fs from 'node:fs';
64
+ import path from 'node:path';
65
+ import process from 'node:process';
66
+
67
+ import { discoverAnalysisRuns } from './aggregator/generator/discovery.js';
68
+ import {
69
+ aggregateAnalysisRun,
70
+ resolveArticleTypeFromManifest,
71
+ } from './aggregator/analysis-aggregator.js';
72
+ import { resolveArticleMetadata } from './aggregator/article-metadata.js';
73
+ import { buildArticleSlug } from './aggregator/generator/slug.js';
74
+ import { getArticleFilename } from './aggregator/html/hreflang.js';
75
+ import { wrapArticleHtml } from './aggregator/html/shell.js';
76
+ import { ALL_LANGUAGES, isSupportedLanguage } from './constants/language-core.js';
77
+
78
+ const SUPPORTED_LANGS = new Set(ALL_LANGUAGES);
79
+
/**
 * Turn the script's command-line flags into an options record.
 *
 * The parser is deliberately hand-rolled so the dumper needs nothing
 * beyond the compiled-from-TS helpers the real renderer already uses.
 * Flags that take a value consume the following token; `--help` / `-h`
 * print the usage text and terminate the process.
 *
 * @param {readonly string[]} argv - `process.argv.slice(2)`
 * @returns {{repoRoot: string, lang: string, outPath: string|null,
 *   jsonPath: string|null, limit: number, quiet: boolean}}
 * @throws {Error} On an unknown flag, a flag missing its value, a
 *   `--limit` that is not a positive integer, or an unsupported `--lang`.
 */
export function parseArgs(argv) {
  const options = {
    repoRoot: process.cwd(),
    lang: 'en',
    outPath: null,
    jsonPath: null,
    limit: Number.POSITIVE_INFINITY,
    quiet: false,
  };

  // Flags whose value is a filesystem path that must be made absolute.
  const pathOptionByFlag = new Map([
    ['--repo-root', 'repoRoot'],
    ['--out', 'outPath'],
    ['--json', 'jsonPath'],
  ]);

  let cursor = 0;
  while (cursor < argv.length) {
    const flag = argv[cursor];

    if (flag === '--quiet') {
      options.quiet = true;
      cursor += 1;
      continue;
    }

    if (flag === '--help' || flag === '-h') {
      printHelpAndExit();
      cursor += 1;
      continue;
    }

    if (pathOptionByFlag.has(flag)) {
      options[pathOptionByFlag.get(flag)] = path.resolve(requireValue(argv, cursor, flag));
      cursor += 2;
      continue;
    }

    if (flag === '--lang') {
      options.lang = requireValue(argv, cursor, flag);
      cursor += 2;
      continue;
    }

    if (flag === '--limit') {
      const raw = requireValue(argv, cursor, flag);
      // Digits only, then a numeric sanity check: rejects "0" and digit
      // strings so long that they overflow to Infinity.
      const parsed = /^\d+$/u.test(raw) ? Number.parseInt(raw, 10) : Number.NaN;
      if (!Number.isFinite(parsed) || parsed < 1) {
        throw new Error(`--limit expects a positive integer, got "${raw}"`);
      }
      options.limit = parsed;
      cursor += 2;
      continue;
    }

    throw new Error(`Unknown argument: ${flag}`);
  }

  // Validate the language last so every flag has been seen first.
  if (!isSupportedLanguage(options.lang)) {
    throw new Error(
      `Unsupported --lang "${options.lang}". Expected one of: ${[...SUPPORTED_LANGS].join(', ')}`
    );
  }

  return options;
}
149
+
/**
 * Fetch the token that follows a value-taking flag.
 *
 * @param {readonly string[]} argv - Full argument list being parsed
 * @param {number} i - Index of the flag within `argv`
 * @param {string} flag - Flag name, used only in the error message
 * @returns {string} The token at `argv[i + 1]`
 * @throws {Error} When there is no token after the flag
 */
function requireValue(argv, i, flag) {
  const next = argv[i + 1];
  if (next !== undefined) {
    return next;
  }
  throw new Error(`${flag} requires a value`);
}
157
+
/**
 * Write the usage text to stdout and terminate the process with exit
 * code 0. Never returns.
 */
function printHelpAndExit() {
  const usageLines = [
    'Usage: node scripts/dump-article-seo.js [options]',
    '',
    'Read-only preview of the SEO <head> metadata (title, description,',
    'keywords, og:*, twitter:*) that the article generator would produce',
    'from each executive brief — without generating any HTML files.',
    '',
    'Options:',
    ' --repo-root <path> Repository root (default: cwd)',
    ' --lang <code> Language to dump (default: en)',
    ' --out <path> Write the human-readable report here',
    ' --json <path> Also write a JSONL record per run',
    ' --limit <N> Process only the first N runs (debug)',
    ' --quiet Suppress per-run stdout',
    ' -h, --help Show this help',
  ];
  // One trailing newline after the last line of the usage text.
  process.stdout.write(`${usageLines.join('\n')}\n`);
  process.exit(0);
}
180
+
/**
 * Mirror of the private `readManifestMetadata` helper inside
 * `scripts/aggregator/generator/render-one.js`. We re-implement it here
 * rather than export it from the renderer because the metadata-relevant
 * subset of a manifest is intentionally a *contract*, not a public API:
 * only the keys copied below are forwarded and everything else in the
 * manifest is dropped. Re-implementing keeps the dumper aligned with
 * that contract without leaking unrelated manifest fields into
 * `resolveArticleMetadata`.
 *
 * A missing, unreadable, unparseable, or non-object manifest (for
 * example a file containing just `null` or a JSON array) yields an
 * empty object rather than an exception.
 *
 * @param {string} runDir - Absolute path to the analysis run
 * @returns {object} Metadata-relevant manifest fields (possibly empty)
 */
export function readManifestMetadata(runDir) {
  const manifestPath = path.join(runDir, 'manifest.json');
  if (!fs.existsSync(manifestPath)) return {};

  let parsed;
  try {
    parsed = JSON.parse(fs.readFileSync(manifestPath, 'utf8'));
  } catch {
    // Best-effort: an unreadable or malformed manifest is treated as absent.
    return {};
  }

  // JSON.parse also succeeds for `null`, primitives, and arrays. None of
  // those can carry manifest fields, and `null` would make the property
  // reads below throw a TypeError, so treat them like a malformed file.
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return {};
  }

  const manifest = {};

  const resolvedType = resolveArticleTypeFromManifest(parsed);
  if (resolvedType && resolvedType !== 'unknown') {
    manifest.articleType = resolvedType;
  }

  if (typeof parsed.date === 'string') manifest.date = parsed.date;
  if (typeof parsed.runId === 'string') manifest.runId = parsed.runId;

  // `title` and `description` may be a plain string or a map of strings.
  if (typeof parsed.title === 'string' || isLanguageMapLike(parsed.title)) {
    manifest.title = parsed.title;
  }
  if (
    typeof parsed.description === 'string' ||
    isLanguageMapLike(parsed.description)
  ) {
    manifest.description = parsed.description;
  }

  if (typeof parsed.committee === 'string') {
    manifest.committee = parsed.committee;
  }
  return manifest;
}
226
+
/**
 * Report whether `value` is a non-array object whose own enumerable
 * values are all strings. An object with no values qualifies.
 *
 * @param {unknown} value - Candidate manifest field
 * @returns {boolean} `true` when the value has that shape
 */
function isLanguageMapLike(value) {
  const isObjectRecord =
    Boolean(value) && typeof value === 'object' && !Array.isArray(value);
  return (
    isObjectRecord &&
    Object.values(value).every((entry) => typeof entry === 'string')
  );
}
234
+
/**
 * Compute the SEO metadata record for a single analysis run.
 *
 * The run is aggregated with `aggregateAnalysisRun`, its manifest subset
 * is read with `readManifestMetadata`, and both are handed to
 * `resolveArticleMetadata`; the entry for `lang` is then combined with
 * the article slug and filename. This function itself writes no files
 * and prints nothing.
 *
 * @param {object} opts
 * @param {string} opts.runDir - Absolute path to the analysis run
 * @param {string} opts.repoRoot - Repository root (for relative paths)
 * @param {string} opts.lang - Language code to extract
 * @returns {{
 *   runDir: string,
 *   runDirRel: string,
 *   date: string,
 *   articleType: string,
 *   slug: string,
 *   filename: string,
 *   entry: {title: string, description: string,
 *     extendedDescription: string, keywords: readonly string[],
 *     source: string}
 * }}
 * @throws {Error} When the resolver returns no entry for `lang`
 */
export function resolveRunSeo({ runDir, repoRoot, lang }) {
  const { articleType, date, markdown } = aggregateAnalysisRun({ runDir, repoRoot });
  const manifest = readManifestMetadata(runDir);
  const perLanguage = resolveArticleMetadata({
    articleType,
    date,
    markdown,
    manifest,
    runDir,
  });

  const entry = perLanguage[lang];
  if (!entry) {
    throw new Error(
      `resolveArticleMetadata returned no entry for lang="${lang}" in ${runDir}`
    );
  }

  const slug = buildArticleSlug(date, articleType);
  const filename = getArticleFilename(slug, lang);
  // Always report the run directory with forward slashes, whatever the OS.
  const runDirRel = path.relative(repoRoot, runDir).split(path.sep).join('/');

  const { title, description, extendedDescription, keywords, source } = entry;
  return {
    runDir,
    runDirRel,
    date,
    articleType,
    slug,
    filename,
    entry: {
      title,
      description,
      extendedDescription,
      keywords: keywords ?? [],
      source,
    },
  };
}
294
+
/**
 * Produce the `<head>...</head>` block for a resolved run.
 *
 * The block is not rendered here: `wrapArticleHtml()` is called with an
 * empty body and the first `<head>...</head>` span is cut out of the
 * document it returns, so there is no second head-rendering code path
 * to keep in sync.
 *
 * @param {ReturnType<typeof resolveRunSeo>} record
 * @param {string} lang - Language code passed through to `wrapArticleHtml`
 * @returns {string} The `<head>...</head>` block of the rendered document
 * @throws {Error} When the rendered document has no `<head>` block
 */
export function buildHtmlHeadSnippet(record, lang) {
  const { slug, date, articleType, entry } = record;
  const documentHtml = wrapArticleHtml({
    lang,
    articleSlug: slug,
    body: '',
    title: entry.title,
    description: entry.description,
    extendedDescription: entry.extendedDescription,
    keywords: entry.keywords ?? [],
    date,
    articleType,
  });

  const headBlock = /<head>[\s\S]*?<\/head>/.exec(documentHtml);
  if (headBlock) {
    return headBlock[0];
  }
  throw new Error(
    `buildHtmlHeadSnippet: could not locate <head> block in wrapArticleHtml output for ${record.slug}`
  );
}
334
+
/**
 * Render one resolved-SEO record as the text block used in the dump.
 *
 * The block has a banner, a short run summary, a field analysis section
 * (lengths as reported by `.length`, plus the resolution tier), and the
 * `<head>` block returned by `buildHtmlHeadSnippet`.
 *
 * @param {ReturnType<typeof resolveRunSeo>} record
 * @param {number} index - 1-based position within the dump
 * @param {number} total - Total number of records being dumped
 * @param {string} [lang] - Language code (used for the HTML snippet; defaults to 'en')
 * @returns {string}
 */
export function formatRecord(record, index, total, lang = 'en') {
  const { title, description, extendedDescription, keywords, source } = record.entry;
  const rule = '='.repeat(80);
  const keywordList = keywords.length ? keywords.join(', ') : '(empty)';

  return [
    rule,
    `[${index}/${total}] ${record.slug}`,
    rule,
    `run-dir : ${record.runDirRel}`,
    `date : ${record.date}`,
    `article-type : ${record.articleType}`,
    `resolution-tier : ${source}`,
    `html-file : news/${record.filename}`,
    '',
    '--- Field analysis (from executive-brief.md → resolveArticleMetadata) ---',
    `<title> (${title.length} chars): ${formatInline(title)}`,
    `<meta description> (${description.length} chars): ${formatInline(description)}`,
    `<meta description-extended> (${extendedDescription.length} chars): ${formatInline(extendedDescription)}`,
    `<meta keywords> (${keywords.length} terms): ${keywordList}`,
    '',
    '--- HTML <head> block (verbatim output of wrapArticleHtml — same code path as the article generator) ---',
    buildHtmlHeadSnippet(record, lang),
    '',
  ].join('\n');
}
380
+
/**
 * Collapse a field onto a single line for the report.
 *
 * @param {string|null|undefined} value - Raw field text
 * @returns {string} `'(empty)'` for a falsy value; otherwise the text
 *   with every whitespace run replaced by one space and the ends trimmed
 */
function formatInline(value) {
  if (value) {
    // Newlines count as whitespace, so multi-line fields become one line.
    const singleLine = value.replace(/\s+/g, ' ');
    return singleLine.trim();
  }
  return '(empty)';
}
386
+
387
/**
 * Run the full dump: discover analysis runs, resolve SEO metadata for each
 * run, print the per-run field analysis and HTML head snippet, and
 * optionally write the text report and an NDJSON file to disk. Returns
 * summary statistics so unit tests and downstream tooling can assert on
 * histograms without re-parsing stdout.
 *
 * A run whose metadata resolution throws is recorded as a failure and
 * skipped; it does not abort the dump.
 *
 * @param {ReturnType<typeof parseArgs>} opts
 * @returns {{
 *   discovered: number,
 *   total: number,
 *   processed: number,
 *   resolutionTiers: Record<string, number>,
 *   emptyKeywordCount: number,
 *   shortDescriptionCount: number,
 *   records: ReadonlyArray<ReturnType<typeof resolveRunSeo>>,
 *   failures: ReadonlyArray<{ runDir: string, message: string }>
 * }}
 */
export function dumpArticleSeo(opts) {
  const { repoRoot, lang, outPath, jsonPath, limit, quiet } = opts;

  const allRuns = discoverAnalysisRuns(repoRoot).map((run) => run.runDir);
  const discovered = allRuns.length;
  // Clamp the limit: a negative slice end would silently drop runs from the
  // tail instead of limiting the selection.
  const targetRuns = Number.isFinite(limit)
    ? allRuns.slice(0, Math.max(0, Math.trunc(limit)))
    : allRuns;
  const total = targetRuns.length;

  const records = [];
  const failures = [];
  const resolutionTiers = Object.create(null);
  let emptyKeywordCount = 0;
  let shortDescriptionCount = 0;

  const textChunks = [];
  const jsonLines = [];
  // JSON lines (and the extra head render they need) are only built when an
  // NDJSON file was actually requested.
  const wantJson = Boolean(jsonPath);

  // Mirror every chunk into the on-disk report, echoing it unless quiet.
  const emit = (text, stream = process.stdout) => {
    if (!quiet) stream.write(text);
    textChunks.push(text);
  };

  const header = `${[
    `# Executive Brief SEO Preview`,
    `# Source : executive-brief.md under analysis/daily/*/`,
    `# repo-root : ${repoRoot}`,
    `# language : ${lang}`,
    `# total runs : ${discovered}`,
    `# selected runs : ${total}`,
    `# generated by : scripts/dump-article-seo.js`,
    `# resolver : src/aggregator/article-metadata.ts → resolveArticleMetadata()`,
    `# rendered by : src/aggregator/html/shell.ts (same call path as npm run generate-article:all)`,
    `# purpose : review and improve SEO before generating HTML`,
  ].join('\n')}\n\n`;
  emit(header);

  for (const [index, runDir] of targetRuns.entries()) {
    let record;
    try {
      record = resolveRunSeo({ runDir, repoRoot, lang });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      failures.push({ runDir, message });
      emit(`--- FAILED ${path.relative(repoRoot, runDir)}: ${message}\n\n`, process.stderr);
      continue;
    }
    records.push(record);

    const { entry } = record;
    const tier = entry.source;
    resolutionTiers[tier] = (resolutionTiers[tier] ?? 0) + 1;
    if (entry.keywords.length === 0) emptyKeywordCount += 1;
    if (entry.description.length < 70) shortDescriptionCount += 1;

    emit(`${formatRecord(record, index + 1, total, lang)}\n`);

    if (wantJson) {
      jsonLines.push(
        JSON.stringify({
          slug: record.slug,
          runDir: record.runDirRel,
          date: record.date,
          articleType: record.articleType,
          lang,
          filename: record.filename,
          source: entry.source,
          title: entry.title,
          description: entry.description,
          extendedDescription: entry.extendedDescription,
          keywords: entry.keywords,
          htmlHeadSnippet: buildHtmlHeadSnippet(record, lang),
        })
      );
    }
  }

  emit(
    buildSummary({
      discovered,
      total,
      processed: records.length,
      failures,
      resolutionTiers,
      emptyKeywordCount,
      shortDescriptionCount,
    })
  );

  if (outPath) {
    fs.mkdirSync(path.dirname(outPath), { recursive: true });
    fs.writeFileSync(outPath, textChunks.join(''), 'utf8');
  }
  if (jsonPath) {
    fs.mkdirSync(path.dirname(jsonPath), { recursive: true });
    // One newline-terminated object per line; no records yields an empty
    // file rather than a lone blank line.
    fs.writeFileSync(jsonPath, jsonLines.map((line) => `${line}\n`).join(''), 'utf8');
  }

  return {
    discovered,
    total,
    processed: records.length,
    resolutionTiers,
    emptyKeywordCount,
    shortDescriptionCount,
    records,
    failures,
  };
}
508
+
509
/**
 * Render the closing SUMMARY section of the SEO dump as plain text.
 *
 * @param {object} stats
 * @param {number} stats.discovered - Analysis runs found
 * @param {number} stats.total - Runs selected for the preview
 * @param {number} stats.processed - Runs whose metadata resolved
 * @param {ReadonlyArray<{ runDir: string, message: string }>} stats.failures -
 *   Runs that failed to resolve
 * @param {Record<string, number>} stats.resolutionTiers - Resolved-run count
 *   per source label
 * @param {number} stats.emptyKeywordCount - Runs with no keywords
 * @param {number} stats.shortDescriptionCount - Runs whose description is
 *   shorter than 70 characters
 * @returns {string} Newline-terminated summary text
 */
function buildSummary({
  discovered,
  total,
  processed,
  failures,
  resolutionTiers,
  emptyKeywordCount,
  shortDescriptionCount,
}) {
  const tierEntries = Object.entries(resolutionTiers).sort(([a], [b]) => a.localeCompare(b));
  // Keep the historical 20-column label width, but widen it when a label is
  // longer so the count column stays aligned.
  const labelWidth = tierEntries.reduce((width, [tier]) => Math.max(width, tier.length), 20);
  const tierLines =
    tierEntries.length === 0
      ? [' (no runs resolved)']
      : tierEntries.map(([tier, count]) => ` ${tier.padEnd(labelWidth)} ${count}`);

  const failureLines =
    failures.length > 0
      ? ['', 'Failures:', ...failures.map((fail) => ` - ${fail.runDir}: ${fail.message}`)]
      : [];

  const rule = '='.repeat(80);
  const lines = [
    rule,
    'SUMMARY',
    rule,
    `total runs discovered : ${discovered}`,
    `selected for preview : ${total}`,
    `successfully resolved : ${processed}`,
    `failed runs : ${failures.length}`,
    '',
    'Resolution-tier histogram (alphabetical by source label):',
    ...tierLines,
    '',
    'Quality flags:',
    ` runs with empty <meta keywords> : ${emptyKeywordCount}`,
    ` runs with <meta description> shorter than 70 chars : ${shortDescriptionCount}`,
    ...failureLines,
    // The trailing empty element makes the joined text end with a newline.
    '',
  ];
  return lines.join('\n');
}
552
+
553
// Run as a script only when invoked directly (not when imported by tests).
// `pathToFileURL` yields the same percent-encoded, platform-correct URL form
// that Node uses for `import.meta.url`; a hand-built `file://${path}` string
// never matches for Windows paths or paths containing spaces.
const { pathToFileURL } = await import('node:url');
const entryPath = process.argv[1];
const invokedDirectly =
  typeof entryPath === 'string' &&
  (import.meta.url === pathToFileURL(entryPath).href ||
    // Fallback for launches where the entry path differs from the resolved
    // module URL (e.g. through a symlink). Compare the whole basename so an
    // unrelated script such as `x-dump-article-seo.js` does not match.
    path.basename(entryPath) === 'dump-article-seo.js');

if (invokedDirectly) {
  try {
    const opts = parseArgs(process.argv.slice(2));
    dumpArticleSeo(opts);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    process.stderr.write(`dump-article-seo: ${message}\n`);
    // Set the exit code instead of calling process.exit() so report output
    // that is still buffered can flush before the process ends.
    process.exitCode = 1;
  }
}
@@ -50,10 +50,15 @@ export declare function healJsonLdDescriptionCorruption(filenames: readonly stri
50
50
  * @param slug - Article slug (used to derive the category)
51
51
  * @param lang - Article language (ISO 639-1 lower-case code)
52
52
  * @param description - Candidate description (resolver output preferred)
53
+ * @param options - Backfill options
54
+ * @param options.forceContextPrefix - Force date/language/category prefix
55
+ * even when the description is already substantive
53
56
 * @returns Page-specific description; prefix-free only when the description
 * is already substantive and `options.forceContextPrefix` is not set
55
58
  */
56
- export declare function buildLegacyBackfillDescription(date: string, slug: string, lang: string, description: string): string;
59
+ export declare function buildLegacyBackfillDescription(date: string, slug: string, lang: string, description: string, options?: {
60
+ readonly forceContextPrefix?: boolean;
61
+ }): string;
57
62
  /**
58
63
  * Apply SEO meta tag replacements to a complete article HTML document.
59
64
  *
@@ -17,6 +17,23 @@ import { formatSlug, parseArticleFilename, extractArticleMeta, escapeHTML, atomi
17
17
  import { detectCategory } from '../../utils/article-category.js';
18
18
  import { buildSeoKeywords, resolveArticleMetadata } from '../../aggregator/article-metadata.js';
19
19
  const MIN_ARTICLE_DESCRIPTION_LENGTH = 120;
20
/**
 * Language labels used only in forced legacy backfill prefixes, keyed by
 * lower-case language code; each value is the language's own name for itself.
 * Frozen because this is a shared module-level lookup table that must not be
 * mutated at runtime.
 */
const LEGACY_LANGUAGE_LABELS = Object.freeze({
  en: 'English',
  sv: 'Svenska',
  da: 'Dansk',
  no: 'Norsk',
  fi: 'Suomi',
  de: 'Deutsch',
  fr: 'Français',
  es: 'Español',
  nl: 'Nederlands',
  ar: 'العربية',
  he: 'עברית',
  ja: '日本語',
  ko: '한국어',
  zh: '中文',
});
20
37
  /**
21
38
  * Regex pattern that flags internal artefact identifiers
22
39
  * (`<slug>-run<N>-<unix-ts>`). Used by
@@ -127,7 +144,9 @@ function backfillOneLegacyArticleSeo(filename, descriptions) {
127
144
  ? resolverDescription
128
145
  : safeDescription || formatSlug(parsed.slug);
129
146
  const description = needsDescription
130
- ? buildLegacyBackfillDescription(parsed.date, parsed.slug, parsed.lang, baseDescription)
147
+ ? buildLegacyBackfillDescription(parsed.date, parsed.slug, parsed.lang, baseDescription, {
148
+ forceContextPrefix: true,
149
+ })
131
150
  : meta.description;
132
151
  const keywords = entry?.keywords ?? fallbackKeywords;
133
152
  const nextHtml = applyArticleSeoBackfill(html, description, keywords);
@@ -160,23 +179,71 @@ function backfillOneLegacyArticleSeo(filename, descriptions) {
160
179
  * @param slug - Article slug (used to derive the category)
161
180
  * @param lang - Article language (ISO 639-1 lower-case code)
162
181
  * @param description - Candidate description (resolver output preferred)
182
+ * @param options - Backfill options
183
+ * @param options.forceContextPrefix - Force date/language/category prefix
184
+ * even when the description is already substantive
163
185
 * @returns Page-specific description; prefix-free only when the description
 * is already substantive and `options.forceContextPrefix` is not set
165
187
  */
166
export function buildLegacyBackfillDescription(date, slug, lang, description, options = {}) {
    // Tolerate a missing candidate or a null options bag instead of throwing.
    const trimmedDescription = String(description ?? '').trim();
    const forcePrefix = Boolean(options?.forceContextPrefix);
    // A substantive description is returned as-is (only capped) unless the
    // caller forces the context prefix.
    if (trimmedDescription.length >= MIN_ARTICLE_DESCRIPTION_LENGTH && !forcePrefix) {
        return capDescriptionLength(trimmedDescription);
    }
    const category = detectCategory(slug);
    const langCode = (lang || 'en').toLowerCase();
    const categoryLabels = getLocalizedString(ARTICLE_TYPE_LABELS, langCode);
    const label = categoryLabels[category] ?? formatSlug(slug);
    const qualifier = buildLegacySlugQualifier(slug, label);
    const languageLabel = legacyLanguageLabel(langCode);
    const separator = ' — ';
    // Prefix parts in order: date, language name, category label, then the
    // slug-derived qualifier; empty parts are dropped.
    const prefix = [date, languageLabel, label, qualifier]
        .filter((part) => part.length > 0)
        .join(separator);
    // Keep the function idempotent: if the candidate already starts with this
    // exact prefix (for example it came from a page that was backfilled
    // before), strip it so repeated runs do not stack prefixes.
    const lead = `${prefix}${separator}`.replace(/\s+/g, ' ');
    let body = trimmedDescription.replace(/\s+/g, ' ');
    while (lead.trim().length > 0 && body.startsWith(lead)) {
        body = body.slice(lead.length).trimStart();
    }
    // With no usable body text, fall back to the category label.
    const contextual = `${prefix}${separator}${body || label}`.replace(/\s+/g, ' ').trim();
    return capDescriptionLength(contextual);
}
206
/**
 * Look up the display label for a language code so legacy descriptions that
 * would otherwise be identical across locales become unique.
 *
 * Only own properties of the label table are consulted.
 *
 * @param lang - Language code
 * @returns The table's label for the code, or the raw code when the table
 * has no string entry for it
 */
function legacyLanguageLabel(lang) {
    const ownEntry = Object.getOwnPropertyDescriptor(LEGACY_LANGUAGE_LABELS, lang);
    if (ownEntry === undefined || typeof ownEntry.value !== 'string') {
        return lang;
    }
    return ownEntry.value;
}
217
/**
 * Derive an optional qualifier from the slug so legacy pages sharing a date
 * and article category (for example same-day `*-run2` variants) can be told
 * apart.
 *
 * @param slug - Article slug without date/language suffix
 * @param localizedLabel - Localized category label already present in prefix
 * @returns The formatted slug, or an empty string when it is blank or
 * normalizes to the same text as the label
 */
function buildLegacySlugQualifier(slug, localizedLabel) {
    const readableSlug = formatSlug(slug).trim();
    if (readableSlug.length === 0) {
        return '';
    }
    const slugKey = normalizeLegacyQualifier(readableSlug);
    const labelKey = normalizeLegacyQualifier(localizedLabel);
    const addsInformation = slugKey.length > 0 && slugKey !== labelKey;
    return addsInformation ? readableSlug : '';
}
235
/**
 * Normalize a prefix component for duplicate detection.
 *
 * The text is put into Unicode NFKC form first, so a letter written as base
 * character plus combining mark compares equal to its precomposed form.
 * Without that step the combining mark (neither a letter nor a number) would
 * be treated as a separator and split the word. The text is then lower-cased
 * and every run of characters that are not letters or numbers becomes a
 * single space.
 *
 * @param value - Candidate text
 * @returns Lower-case text containing only letters, numbers and single
 * spaces, with no leading or trailing space
 */
function normalizeLegacyQualifier(value) {
    return value
        .normalize('NFKC')
        .toLowerCase()
        .replace(/[^\p{L}\p{N}]+/gu, ' ')
        .trim();
}
180
247
  /**
181
248
  * Clamp a description to the 180-character SERP-friendly cap with a
182
249
  * trailing ellipsis when truncated. Extracted from