euparliamentmonitor 0.9.21 → 0.9.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -2
- package/scripts/aggregator/metadata/artifact-walker.js +2 -2
- package/scripts/aggregator/metadata/heading-rules.js +1 -0
- package/scripts/copy-vendor.js +84 -112
- package/scripts/dump-article-seo.js +567 -0
- package/scripts/generators/news-indexes/backfill.d.ts +6 -1
- package/scripts/generators/news-indexes/backfill.js +71 -4
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "euparliamentmonitor",
|
|
3
|
-
"version": "0.9.
|
|
3
|
+
"version": "0.9.22",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "European Parliament Intelligence Platform - Monitor political activity with systematic transparency",
|
|
6
6
|
"main": "scripts/index.js",
|
|
@@ -71,6 +71,7 @@
|
|
|
71
71
|
"prior-run-diff": "node scripts/aggregator/prior-run-diff.js",
|
|
72
72
|
"generate-article": "node scripts/aggregator/article-generator.js",
|
|
73
73
|
"generate-article:all": "node scripts/aggregator/article-generator.js --all",
|
|
74
|
+
"dump:article-seo": "node scripts/dump-article-seo.js",
|
|
74
75
|
"generate-news-indexes": "node scripts/generators/news-indexes.js",
|
|
75
76
|
"generate-sitemap": "node scripts/generators/sitemap.js",
|
|
76
77
|
"image:generate": "node scripts/generate-responsive-images.js",
|
|
@@ -164,6 +165,7 @@
|
|
|
164
165
|
"chartjs-plugin-annotation": "3.1.0",
|
|
165
166
|
"clean-css": "^5.3.3",
|
|
166
167
|
"d3": "7.9.0",
|
|
168
|
+
"esbuild": "0.28.0",
|
|
167
169
|
"eslint": "10.4.0",
|
|
168
170
|
"eslint-config-prettier": "10.1.8",
|
|
169
171
|
"eslint-plugin-jsdoc": "63.0.0",
|
|
@@ -208,6 +210,7 @@
|
|
|
208
210
|
"flatted": ">=3.4.2",
|
|
209
211
|
"path-to-regexp": ">=8.4.0",
|
|
210
212
|
"ip-address": ">=10.1.1",
|
|
211
|
-
"uuid": ">=11.1.1"
|
|
213
|
+
"uuid": ">=11.1.1",
|
|
214
|
+
"qs": "6.15.2"
|
|
212
215
|
}
|
|
213
216
|
}
|
|
@@ -17,7 +17,7 @@ import fs from 'fs';
|
|
|
17
17
|
import path from 'path';
|
|
18
18
|
import { extractFirstH1 } from './h1-extractor.js';
|
|
19
19
|
import { extractLedeAfterHeading, extractStrongProseLine } from './lede-extractor.js';
|
|
20
|
-
import { isGenericHeading, stripArtifactCategoryAffix } from './heading-rules.js';
|
|
20
|
+
import { isGenericHeading, isArtifactCategoryHeading, stripArtifactCategoryAffix, } from './heading-rules.js';
|
|
21
21
|
import { truncateTitle } from './text-utils.js';
|
|
22
22
|
import { extractPriorityFindingHighlight } from './priority-finding-highlight.js';
|
|
23
23
|
/** Ordered list of artefact filenames that typically carry the editorial H1. */
|
|
@@ -132,7 +132,7 @@ function probeCandidateForHighlight(runDir, rel, articleType, date) {
|
|
|
132
132
|
// distinctive editorial headline ("Digital Markets Act Enforcement",
|
|
133
133
|
// "Ukraine War Accountability") instead of a stripped category noun.
|
|
134
134
|
const priority = extractPriorityFindingHighlight(body);
|
|
135
|
-
if (priority?.headline) {
|
|
135
|
+
if (priority?.headline && !isArtifactCategoryHeading(priority.headline)) {
|
|
136
136
|
return {
|
|
137
137
|
cleanHighlight: {
|
|
138
138
|
headline: truncateTitle(priority.headline),
|
package/scripts/copy-vendor.js
CHANGED
|
@@ -12,15 +12,25 @@
|
|
|
12
12
|
* - chart.js → js/vendor/chart.umd.min.js
|
|
13
13
|
* - chartjs-plugin-annotation → js/vendor/chartjs-plugin-annotation.min.js
|
|
14
14
|
* - d3 → js/vendor/d3.min.js
|
|
15
|
-
* - mermaid → js/vendor/mermaid/
|
|
15
|
+
* - mermaid → js/vendor/mermaid/mermaid.esm.min.mjs
|
|
16
16
|
*
|
|
17
|
-
* Mermaid is special: v11+ ships as code-split ESM
|
|
18
|
-
* `mermaid.esm.min.mjs`
|
|
19
|
-
*
|
|
20
|
-
*
|
|
21
|
-
*
|
|
17
|
+
* Mermaid is special: v11+ ships as a **code-split ESM bundle**. The entry
|
|
18
|
+
* `mermaid.esm.min.mjs` (28 KB) statically imports 81 diagram-specific chunks
|
|
19
|
+
* from `dist/chunks/mermaid.esm.min/*.mjs`. Empirically (May 2026), serving
|
|
20
|
+
* those chunks through S3 + CloudFront has been unreliable — the entry returns
|
|
21
|
+
* 200 OK but every chunk URL returns 403 from CloudFront, breaking every
|
|
22
|
+
* article that references the loader.
|
|
22
23
|
*
|
|
23
|
-
*
|
|
24
|
+
* To eliminate that failure mode, we **bundle Mermaid into a single
|
|
25
|
+
* self-contained ESM file at copy-vendor time using esbuild** (devDependency).
|
|
26
|
+
* The output is written to the same path / filename that the loader and the
|
|
27
|
+
* existing article HTML already reference (`mermaid.esm.min.mjs`), so the
|
|
28
|
+
* loader (`js/mermaid-init.js`) and the generated articles continue to work
|
|
29
|
+
* unchanged — only the file's content changes (3.2 MB self-contained vs.
|
|
30
|
+
* 28 KB entry-plus-81-chunks).
|
|
31
|
+
*
|
|
32
|
+
* Idempotent: rerunning overwrites prior copies and leaves licenses in place;
|
|
33
|
+
* stale `chunks/` directories from prior layouts are pruned.
|
|
24
34
|
*
|
|
25
35
|
* Failure modes:
|
|
26
36
|
* - Missing chart.js / d3 / chartjs-plugin-annotation → hard error (these
|
|
@@ -28,11 +38,12 @@
|
|
|
28
38
|
* - Missing mermaid → soft error (logged, exit 0). Mermaid is also a pinned
|
|
29
39
|
* `devDependency`, but optional installs (e.g. `npm ci --omit=dev`) may
|
|
30
40
|
* skip it; we want the deploy to succeed without diagrams rather than fail.
|
|
41
|
+
* - Bundling failure → hard error: mermaid is present but unusable, which
|
|
42
|
+
* would silently ship a broken page; fail fast at build time instead.
|
|
31
43
|
*/
|
|
32
44
|
|
|
33
45
|
import {
|
|
34
46
|
copyFileSync,
|
|
35
|
-
cpSync,
|
|
36
47
|
existsSync,
|
|
37
48
|
mkdirSync,
|
|
38
49
|
readdirSync,
|
|
@@ -43,6 +54,7 @@ import {
|
|
|
43
54
|
} from 'node:fs';
|
|
44
55
|
import path from 'node:path';
|
|
45
56
|
import process from 'node:process';
|
|
57
|
+
import * as esbuild from 'esbuild';
|
|
46
58
|
|
|
47
59
|
const ROOT = process.cwd();
|
|
48
60
|
const NODE_MODULES = path.join(ROOT, 'node_modules');
|
|
@@ -120,7 +132,8 @@ function copyOrFail(label, srcRel, dstRel, license) {
|
|
|
120
132
|
function copyMermaid() {
|
|
121
133
|
const mermaidDist = path.join(NODE_MODULES, 'mermaid', 'dist');
|
|
122
134
|
const target = path.join(VENDOR_DIR, 'mermaid');
|
|
123
|
-
|
|
135
|
+
const entryPoint = path.join(mermaidDist, 'mermaid.esm.min.mjs');
|
|
136
|
+
if (!existsSync(entryPoint)) {
|
|
124
137
|
process.stdout.write(
|
|
125
138
|
' ⚠ mermaid not installed (devDependency); skipping diagram bundle.\n',
|
|
126
139
|
);
|
|
@@ -128,84 +141,68 @@ function copyMermaid() {
|
|
|
128
141
|
}
|
|
129
142
|
ensureDir(target);
|
|
130
143
|
|
|
131
|
-
//
|
|
132
|
-
//
|
|
133
|
-
//
|
|
134
|
-
//
|
|
135
|
-
//
|
|
136
|
-
// identical until the pinned mermaid version in package.json changes.
|
|
144
|
+
// Bundle mermaid's code-split ESM entry plus all of its dynamic-import
|
|
145
|
+
// chunks into a SINGLE self-contained ESM file. esbuild follows every
|
|
146
|
+
// static and dynamic `import` from the entry and inlines the transitive
|
|
147
|
+
// closure, so the resulting file has no external module references —
|
|
148
|
+
// exactly what the static-site origin needs.
|
|
137
149
|
//
|
|
138
|
-
//
|
|
139
|
-
// `
|
|
140
|
-
//
|
|
141
|
-
//
|
|
142
|
-
//
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
150
|
+
// We write the output under the same filename the loader and existing
|
|
151
|
+
// article HTML already reference (`mermaid.esm.min.mjs`), so this script
|
|
152
|
+
// is the only place that changes when we switch from "entry + 81 chunks"
|
|
153
|
+
// to "single bundle". The previous chunk-shipping layout (`chunks/`) is
|
|
154
|
+
// pruned below.
|
|
155
|
+
const outFile = path.join(target, 'mermaid.esm.min.mjs');
|
|
156
|
+
try {
|
|
157
|
+
esbuild.buildSync({
|
|
158
|
+
entryPoints: [entryPoint],
|
|
159
|
+
outfile: outFile,
|
|
160
|
+
bundle: true,
|
|
161
|
+
format: 'esm',
|
|
162
|
+
minify: true,
|
|
163
|
+
// `browser` keeps mermaid's runtime-detection paths (e.g. `document`
|
|
164
|
+
// checks) intact — same target as the upstream `.esm.min.mjs` build.
|
|
165
|
+
platform: 'browser',
|
|
166
|
+
target: 'es2022',
|
|
167
|
+
// Resolve `import.meta.url` at runtime (relative to the served bundle
|
|
168
|
+
// location) rather than baking in the build-time path.
|
|
169
|
+
supported: { 'import-meta': true },
|
|
170
|
+
// Drop sourcemaps; the upstream bundle ships them as `.map` siblings
|
|
171
|
+
// and we previously excluded those from vendor copy.
|
|
172
|
+
sourcemap: false,
|
|
173
|
+
legalComments: 'none',
|
|
174
|
+
// Use 'error' so esbuild prints its own detailed diagnostics (file,
|
|
175
|
+
// line, column) on failure — 'silent' previously swallowed all context.
|
|
176
|
+
logLevel: 'error',
|
|
177
|
+
});
|
|
178
|
+
} catch (err) {
|
|
179
|
+
// esbuild attaches structured diagnostics on `err.errors`; print them
|
|
180
|
+
// so CI logs are actionable without re-running locally.
|
|
181
|
+
if (err && Array.isArray(err.errors)) {
|
|
182
|
+
for (const e of err.errors) {
|
|
183
|
+
const loc = e.location
|
|
184
|
+
? `${e.location.file}:${e.location.line}:${e.location.column}: `
|
|
185
|
+
: '';
|
|
186
|
+
process.stderr.write(` ${loc}${e.text}\n`);
|
|
172
187
|
}
|
|
173
188
|
}
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
for (const { src, rel } of wantedFiles) {
|
|
181
|
-
const dst = path.join(target, rel);
|
|
182
|
-
ensureDir(path.dirname(dst));
|
|
183
|
-
if (copyFileIfChanged(src, dst)) {
|
|
184
|
-
copied++;
|
|
185
|
-
} else {
|
|
186
|
-
unchanged++;
|
|
187
|
-
}
|
|
189
|
+
process.stderr.write(
|
|
190
|
+
`error: mermaid bundle failed: ${err && err.message ? err.message : err}\n` +
|
|
191
|
+
' Check that node_modules/mermaid is installed (run `npm ci`) and that\n' +
|
|
192
|
+
' esbuild can resolve the ESM entry point at node_modules/mermaid/dist/mermaid.esm.min.mjs.\n',
|
|
193
|
+
);
|
|
194
|
+
process.exit(1);
|
|
188
195
|
}
|
|
189
196
|
|
|
190
|
-
//
|
|
191
|
-
//
|
|
192
|
-
//
|
|
193
|
-
|
|
194
|
-
const wantedDstSet = new Set(
|
|
195
|
-
wantedFiles.map(({ rel }) => path.join(target, rel)),
|
|
196
|
-
);
|
|
197
|
-
// Allow our REUSE sidecar files alongside their primary file.
|
|
197
|
+
// Prune the obsolete chunks layout (and any other orphans) from previous
|
|
198
|
+
// copy-vendor runs. The bundled file is fully self-contained, so anything
|
|
199
|
+
// other than the bundle itself + its REUSE sidecar is stale.
|
|
200
|
+
const wantedDstSet = new Set([outFile]);
|
|
198
201
|
function isAllowedSidecar(absPath) {
|
|
199
202
|
if (!absPath.endsWith('.license')) return false;
|
|
200
203
|
const primary = absPath.slice(0, -'.license'.length);
|
|
201
204
|
return wantedDstSet.has(primary);
|
|
202
205
|
}
|
|
203
|
-
// Also allow the chunks-dir flavour-level license sidecar we drop below.
|
|
204
|
-
const flavourLicensePath = path.join(
|
|
205
|
-
target,
|
|
206
|
-
'chunks',
|
|
207
|
-
'mermaid.esm.min.license',
|
|
208
|
-
);
|
|
209
206
|
|
|
210
207
|
function pruneOrphans(dir) {
|
|
211
208
|
if (!existsSync(dir)) return;
|
|
@@ -213,7 +210,6 @@ function copyMermaid() {
|
|
|
213
210
|
const full = path.join(dir, entry.name);
|
|
214
211
|
if (entry.isDirectory()) {
|
|
215
212
|
pruneOrphans(full);
|
|
216
|
-
// Remove now-empty directories so a flavour rename leaves no shell.
|
|
217
213
|
try {
|
|
218
214
|
if (readdirSync(full).length === 0) {
|
|
219
215
|
rmSync(full, { recursive: true, force: true });
|
|
@@ -222,11 +218,7 @@ function copyMermaid() {
|
|
|
222
218
|
// best-effort
|
|
223
219
|
}
|
|
224
220
|
} else if (entry.isFile()) {
|
|
225
|
-
if (
|
|
226
|
-
!wantedDstSet.has(full) &&
|
|
227
|
-
!isAllowedSidecar(full) &&
|
|
228
|
-
full !== flavourLicensePath
|
|
229
|
-
) {
|
|
221
|
+
if (!wantedDstSet.has(full) && !isAllowedSidecar(full)) {
|
|
230
222
|
rmSync(full, { force: true });
|
|
231
223
|
}
|
|
232
224
|
}
|
|
@@ -234,39 +226,19 @@ function copyMermaid() {
|
|
|
234
226
|
}
|
|
235
227
|
pruneOrphans(target);
|
|
236
228
|
|
|
237
|
-
// REUSE sidecar for the
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
const
|
|
245
|
-
if (existsSync(chunksDir)) {
|
|
246
|
-
writeIfChanged(
|
|
247
|
-
flavourLicensePath,
|
|
248
|
-
'SPDX-FileCopyrightText: 2014-2026 Mermaid contributors\nSPDX-License-Identifier: MIT\n',
|
|
249
|
-
);
|
|
250
|
-
}
|
|
229
|
+
// REUSE sidecar for the bundled file. The bundle contains code from
|
|
230
|
+
// mermaid + its transitive ESM deps; mermaid's own MIT license header
|
|
231
|
+
// remains intact in the dependency tree (REUSE.toml covers the vendored
|
|
232
|
+
// artifact via path-level annotation; this sidecar keeps the file
|
|
233
|
+
// self-documenting).
|
|
234
|
+
writeLicense(outFile, '2014-2026 Mermaid contributors', 'MIT');
|
|
235
|
+
|
|
236
|
+
const size = statSync(outFile).size;
|
|
251
237
|
process.stdout.write(
|
|
252
|
-
` ✓ mermaid/ (${
|
|
238
|
+
` ✓ mermaid/mermaid.esm.min.mjs (${(size / 1024).toFixed(0)} KB self-contained bundle)\n`,
|
|
253
239
|
);
|
|
254
240
|
}
|
|
255
241
|
|
|
256
|
-
function countMjs(dir) {
|
|
257
|
-
let n = 0;
|
|
258
|
-
function walk(d) {
|
|
259
|
-
if (!existsSync(d)) return;
|
|
260
|
-
for (const entry of readdirSync(d, { withFileTypes: true })) {
|
|
261
|
-
const p = path.join(d, entry.name);
|
|
262
|
-
if (entry.isDirectory()) walk(p);
|
|
263
|
-
else if (entry.isFile() && entry.name.endsWith('.mjs')) n += 1;
|
|
264
|
-
}
|
|
265
|
-
}
|
|
266
|
-
walk(dir);
|
|
267
|
-
return n;
|
|
268
|
-
}
|
|
269
|
-
|
|
270
242
|
function main() {
|
|
271
243
|
ensureDir(VENDOR_DIR);
|
|
272
244
|
process.stdout.write(`Copying vendor JS libraries to ${path.relative(ROOT, VENDOR_DIR)}/\n`);
|
|
@@ -0,0 +1,567 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* @module scripts/dump-article-seo
|
|
7
|
+
* @description Read-only preview of the SEO `<head>` metadata that the
|
|
8
|
+
* deterministic article generator **would produce** for every executive
|
|
9
|
+
* brief committed under `analysis/daily/`. Use this before running
|
|
10
|
+
* `npm run generate-article:all` to audit and improve titles,
|
|
11
|
+
* descriptions, and keywords without touching any `news/*.html` file.
|
|
12
|
+
*
|
|
13
|
+
* **Source: executive briefs, not HTML.**
|
|
14
|
+
* The script reads each analysis run's `executive-brief.md` (and its
|
|
15
|
+
* translated siblings) via the same resolver chain that the real
|
|
16
|
+
* article generator uses. No HTML files are read or written; the script
|
|
17
|
+
* is purely additive and fully idempotent.
|
|
18
|
+
*
|
|
19
|
+
* **Identical code path to the real renderer.** The script intentionally
|
|
20
|
+
* imports the same helpers that `scripts/aggregator/article-generator.js`
|
|
21
|
+
* (the engine behind `npm run generate-article:all` and the
|
|
22
|
+
* `regenerate-articles.yml` workflow) uses:
|
|
23
|
+
*
|
|
24
|
+
* 1. `discoverAnalysisRuns(repoRoot)` — same run discovery as the batch
|
|
25
|
+
* renderer (`generator/render-batch.js`).
|
|
26
|
+
* 2. `aggregateAnalysisRun({ runDir, repoRoot })` — same Markdown
|
|
27
|
+
* aggregation that feeds `resolveArticleMetadata`, which in turn
|
|
28
|
+
* reads `executive-brief.md` and its translated siblings.
|
|
29
|
+
* 3. `resolveArticleMetadata({ articleType, date, markdown, manifest,
|
|
30
|
+
* runDir })` — the single source of truth for per-language `(title,
|
|
31
|
+
* description, extendedDescription, keywords, source)` documented in
|
|
32
|
+
* `src/aggregator/article-metadata.ts`. The entry returned here is
|
|
33
|
+
* *bit-for-bit identical* to the one passed into
|
|
34
|
+
* `src/aggregator/html/shell.ts` for the `<title>`,
|
|
35
|
+
* `<meta name="description">`, and `<meta name="keywords">` tags.
|
|
36
|
+
*
|
|
37
|
+
* **Two-part output per run.**
|
|
38
|
+
* - *Field analysis* — human-readable breakdown of each SEO field
|
|
39
|
+
* (length, content, resolution tier) for quick editorial review.
|
|
40
|
+
* - *HTML head snippet* — the **complete `<head>` block** that the
|
|
41
|
+
* article generator will emit, produced by calling
|
|
42
|
+
* `wrapArticleHtml()` from `src/aggregator/html/shell.ts` with an
|
|
43
|
+
* empty body and slicing out `<head>...</head>`. This includes the
|
|
44
|
+
* `<title>`, `<meta name="description">`, `<meta name="keywords">`,
|
|
45
|
+
* all `<meta property="og:*">` and `<meta name="twitter:*">` tags,
|
|
46
|
+
* `<link rel="canonical">`, hreflang alternates, JSON-LD
|
|
47
|
+
* `NewsArticle` + `BreadcrumbList`, and every other tag the real
|
|
48
|
+
* renderer emits — because the snippet *is* the real renderer's
|
|
49
|
+
* output. Copy-paste these into a browser extension or SEO tool to
|
|
50
|
+
* preview how the article will appear in search results and social
|
|
51
|
+
* cards before committing to HTML generation.
|
|
52
|
+
*
|
|
53
|
+
* Invocation:
|
|
54
|
+
* node scripts/dump-article-seo.js \
|
|
55
|
+
* [--repo-root <path>] # defaults to process.cwd()
|
|
56
|
+
* [--lang en] # defaults to en
|
|
57
|
+
* [--out <path>] # also write the human-readable dump here
|
|
58
|
+
* [--json <path>] # also write a machine-readable JSONL dump
|
|
59
|
+
* [--limit <N>] # only process the first N runs (debug)
|
|
60
|
+
* [--quiet] # suppress per-run stdout (file output only)
|
|
61
|
+
*/
|
|
62
|
+
|
|
63
|
+
import fs from 'node:fs';
|
|
64
|
+
import path from 'node:path';
|
|
65
|
+
import process from 'node:process';
|
|
66
|
+
|
|
67
|
+
import { discoverAnalysisRuns } from './aggregator/generator/discovery.js';
|
|
68
|
+
import {
|
|
69
|
+
aggregateAnalysisRun,
|
|
70
|
+
resolveArticleTypeFromManifest,
|
|
71
|
+
} from './aggregator/analysis-aggregator.js';
|
|
72
|
+
import { resolveArticleMetadata } from './aggregator/article-metadata.js';
|
|
73
|
+
import { buildArticleSlug } from './aggregator/generator/slug.js';
|
|
74
|
+
import { getArticleFilename } from './aggregator/html/hreflang.js';
|
|
75
|
+
import { wrapArticleHtml } from './aggregator/html/shell.js';
|
|
76
|
+
import { ALL_LANGUAGES, isSupportedLanguage } from './constants/language-core.js';
|
|
77
|
+
|
|
78
|
+
const SUPPORTED_LANGS = new Set(ALL_LANGUAGES);
|
|
79
|
+
|
|
80
|
+
/**
 * Translate the raw command-line tokens into the option bag used by the
 * dumper. The parser is hand-rolled so the script needs nothing beyond
 * the helpers it already imports.
 *
 * Tokens are handled left to right. Flags that carry a value
 * (`--repo-root`, `--lang`, `--out`, `--json`, `--limit`) consume the
 * following token; `--quiet` is a plain switch; `-h` / `--help` prints
 * the usage text and exits. The language code is validated only after
 * every token has been consumed.
 *
 * @param {readonly string[]} argv - `process.argv.slice(2)`
 * @returns {{repoRoot: string, lang: string, outPath: string|null,
 *   jsonPath: string|null, limit: number, quiet: boolean}}
 * @throws {Error} on an unknown flag, a flag without a value, a
 *   `--limit` that is not a positive integer, or an unsupported `--lang`
 */
export function parseArgs(argv) {
  const options = {
    repoRoot: process.cwd(),
    lang: 'en',
    outPath: null,
    jsonPath: null,
    limit: Number.POSITIVE_INFINITY,
    quiet: false,
  };

  let cursor = 0;
  while (cursor < argv.length) {
    const flag = argv[cursor];
    // Value-carrying flags advance by two tokens, switches by one.
    let consumed = 1;

    if (flag === '--repo-root') {
      options.repoRoot = path.resolve(requireValue(argv, cursor, flag));
      consumed = 2;
    } else if (flag === '--lang') {
      options.lang = requireValue(argv, cursor, flag);
      consumed = 2;
    } else if (flag === '--out') {
      options.outPath = path.resolve(requireValue(argv, cursor, flag));
      consumed = 2;
    } else if (flag === '--json') {
      options.jsonPath = path.resolve(requireValue(argv, cursor, flag));
      consumed = 2;
    } else if (flag === '--limit') {
      const raw = requireValue(argv, cursor, flag);
      // Digits only; a digit string too long to stay finite, or zero,
      // is rejected by the second half of the check.
      const parsed = /^\d+$/u.test(raw) ? Number.parseInt(raw, 10) : Number.NaN;
      if (!Number.isFinite(parsed) || parsed < 1) {
        throw new Error(`--limit expects a positive integer, got "${raw}"`);
      }
      options.limit = parsed;
      consumed = 2;
    } else if (flag === '--quiet') {
      options.quiet = true;
    } else if (flag === '--help' || flag === '-h') {
      printHelpAndExit();
    } else {
      throw new Error(`Unknown argument: ${flag}`);
    }

    cursor += consumed;
  }

  if (!isSupportedLanguage(options.lang)) {
    throw new Error(
      `Unsupported --lang "${options.lang}". Expected one of: ${[...SUPPORTED_LANGS].join(', ')}`
    );
  }

  return options;
}
|
|
149
|
+
|
|
150
|
+
/**
 * Return the value token that follows a value-carrying flag.
 *
 * A flag is treated as missing its value when it is the last token or
 * when the next token is itself a long option (`--something`). Without
 * the second check, `--out --quiet` would silently take `--quiet` as the
 * output path and swallow the switch.
 *
 * @param {readonly string[]} argv - Full argument vector being parsed
 * @param {number} i - Index of `flag` within `argv`
 * @param {string} flag - The flag being parsed (used in the error text)
 * @returns {string} The token at `argv[i + 1]`
 * @throws {Error} when no usable value follows the flag
 */
function requireValue(argv, i, flag) {
  const value = argv[i + 1];
  const looksLikeOption = typeof value === 'string' && value.startsWith('--');
  if (value === undefined || looksLikeOption) {
    throw new Error(`${flag} requires a value`);
  }
  return value;
}
|
|
157
|
+
|
|
158
|
+
/**
 * Write the usage summary to stdout, then terminate the process with
 * exit status 0. Control never returns to the caller.
 */
function printHelpAndExit() {
  const usageLines = [
    'Usage: node scripts/dump-article-seo.js [options]',
    '',
    'Read-only preview of the SEO <head> metadata (title, description,',
    'keywords, og:*, twitter:*) that the article generator would produce',
    'from each executive brief — without generating any HTML files.',
    '',
    'Options:',
    ' --repo-root <path> Repository root (default: cwd)',
    ' --lang <code> Language to dump (default: en)',
    ' --out <path> Write the human-readable report here',
    ' --json <path> Also write a JSONL record per run',
    ' --limit <N> Process only the first N runs (debug)',
    ' --quiet Suppress per-run stdout',
    ' -h, --help Show this help',
    // Trailing empty entry so the joined text ends with a newline.
    '',
  ];
  const usageText = usageLines.join('\n');
  process.stdout.write(usageText);
  process.exit(0);
}
|
|
180
|
+
|
|
181
|
+
/**
 * Mirror of the private `readManifestMetadata` helper inside
 * `scripts/aggregator/generator/render-one.js`. It is re-implemented
 * here rather than exported from the renderer because the
 * metadata-relevant subset of a manifest is intentionally a *contract*,
 * not a public API. Only the six keys copied below (`articleType`,
 * `date`, `runId`, `title`, `description`, `committee`) are forwarded,
 * so unrelated manifest fields never leak into `resolveArticleMetadata`.
 * NOTE(review): an earlier revision of this comment said "seven keys";
 * confirm the list against the renderer's helper.
 *
 * Every unusable manifest yields an empty object instead of throwing:
 * a missing or unreadable file, malformed JSON, and JSON whose top-level
 * value is not an object (`null`, an array, or a primitive).
 *
 * @param {string} runDir - Absolute path to the analysis run
 * @returns {object} Metadata-relevant manifest fields (possibly empty)
 */
export function readManifestMetadata(runDir) {
  const manifestPath = path.join(runDir, 'manifest.json');

  let parsed;
  try {
    // Reading inside the try also covers the "file does not exist" case,
    // so no separate existence check (and no check-then-read race).
    parsed = JSON.parse(fs.readFileSync(manifestPath, 'utf8'));
  } catch {
    return {};
  }

  // JSON.parse can legitimately return null, an array, or a primitive;
  // property access on null would otherwise throw a TypeError below.
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return {};
  }

  const manifest = {};
  const resolvedType = resolveArticleTypeFromManifest(parsed);
  if (resolvedType && resolvedType !== 'unknown') {
    manifest.articleType = resolvedType;
  }
  if (typeof parsed.date === 'string') manifest.date = parsed.date;
  if (typeof parsed.runId === 'string') manifest.runId = parsed.runId;
  // `title` and `description` may be a plain string or a per-language map.
  if (typeof parsed.title === 'string' || isLanguageMapLike(parsed.title)) {
    manifest.title = parsed.title;
  }
  if (
    typeof parsed.description === 'string' ||
    isLanguageMapLike(parsed.description)
  ) {
    manifest.description = parsed.description;
  }
  if (typeof parsed.committee === 'string') {
    manifest.committee = parsed.committee;
  }
  return manifest;
}
|
|
226
|
+
|
|
227
|
+
/**
 * Report whether `value` has the shape of a per-language text map: a
 * non-null, non-array object whose own enumerable values are all
 * strings. An object with no entries also qualifies.
 *
 * @param {unknown} value - Candidate manifest field
 * @returns {boolean} `true` when every value in the object is a string
 */
function isLanguageMapLike(value) {
  const isObjectNotArray =
    Boolean(value) && typeof value === 'object' && !Array.isArray(value);
  return (
    isObjectNotArray &&
    Object.values(value).every((text) => typeof text === 'string')
  );
}
|
|
234
|
+
|
|
235
|
+
/**
 * Compute the SEO record for a single analysis run.
 *
 * The run is aggregated with `aggregateAnalysisRun`, its manifest subset
 * is read with `readManifestMetadata`, and both are handed to
 * `resolveArticleMetadata`; the entry for the requested language is then
 * combined with the article slug and output filename. This function
 * itself writes nothing to disk or stdout.
 *
 * @param {object} opts
 * @param {string} opts.runDir - Absolute path to the analysis run
 * @param {string} opts.repoRoot - Repository root (for relative paths)
 * @param {string} opts.lang - Language code to extract
 * @returns {{
 *   runDir: string,
 *   runDirRel: string,
 *   date: string,
 *   articleType: string,
 *   slug: string,
 *   filename: string,
 *   entry: {title: string, description: string,
 *     extendedDescription: string, keywords: readonly string[],
 *     source: string}
 * }}
 * @throws {Error} when the resolver returns no entry for `lang`
 */
export function resolveRunSeo({ runDir, repoRoot, lang }) {
  const aggregated = aggregateAnalysisRun({ runDir, repoRoot });
  const manifest = readManifestMetadata(runDir);
  const { articleType, date, markdown } = aggregated;

  const perLanguage = resolveArticleMetadata({
    articleType,
    date,
    markdown,
    manifest,
    runDir,
  });

  const localized = perLanguage[lang];
  if (!localized) {
    throw new Error(
      `resolveArticleMetadata returned no entry for lang="${lang}" in ${runDir}`
    );
  }

  const slug = buildArticleSlug(date, articleType);
  const filename = getArticleFilename(slug, lang);
  // Always report the run directory with forward slashes, whatever the
  // platform separator is.
  const relativeSegments = path.relative(repoRoot, runDir).split(path.sep);
  const runDirRel = relativeSegments.join('/');

  const { title, description, extendedDescription, source } = localized;
  const entry = {
    title,
    description,
    extendedDescription,
    keywords: localized.keywords ?? [],
    source,
  };

  return { runDir, runDirRel, date, articleType, slug, filename, entry };
}
|
|
294
|
+
|
|
295
|
+
/**
 * Produce the `<head>...</head>` block for a resolved SEO record.
 *
 * The block is not rendered here: `wrapArticleHtml()` is called with the
 * record's metadata and an empty body, and the first `<head>` element is
 * cut out of the returned document. There is therefore no second
 * head-rendering code path to keep in sync.
 *
 * Use this to preview how the article will appear in search results and
 * social-card previews **before** running the full HTML generator.
 *
 * @param {ReturnType<typeof resolveRunSeo>} record
 * @param {string} lang - Language code passed through to `wrapArticleHtml`
 * @returns {string} The `<head>...</head>` block taken from the
 *   `wrapArticleHtml()` output, ready to paste for review.
 * @throws {Error} when the rendered document has no `<head>...</head>` span
 */
export function buildHtmlHeadSnippet(record, lang) {
  const { entry, slug, date, articleType } = record;
  const { title, description, extendedDescription } = entry;

  const html = wrapArticleHtml({
    lang,
    articleSlug: slug,
    body: '',
    title,
    description,
    extendedDescription,
    keywords: entry.keywords ?? [],
    date,
    articleType,
  });

  // First literal `<head>` and the first `</head>` that follows it.
  const openTag = '<head>';
  const closeTag = '</head>';
  const start = html.indexOf(openTag);
  const end = start === -1 ? -1 : html.indexOf(closeTag, start + openTag.length);
  if (end === -1) {
    throw new Error(
      `buildHtmlHeadSnippet: could not locate <head> block in wrapArticleHtml output for ${record.slug}`
    );
  }
  return html.slice(start, end + closeTag.length);
}
|
|
334
|
+
|
|
335
|
+
/**
 * Render one resolved-SEO record as the text block used in the dump.
 * The block has a banner, a run summary, and two sections:
 *   1. *Field analysis* — each SEO field with its character or term
 *      count, plus the resolution tier reported by the resolver.
 *   2. *HTML head snippet* — the `<head>` block returned by
 *      `buildHtmlHeadSnippet` for the same record.
 *
 * @param {ReturnType<typeof resolveRunSeo>} record
 * @param {number} index - 1-based position within the dump
 * @param {number} total - Total number of records being dumped
 * @param {string} [lang] - Language code (used for the HTML snippet; defaults to 'en')
 * @returns {string} The block's lines joined with `\n`
 */
export function formatRecord(record, index, total, lang = 'en') {
  const { entry } = record;
  const rule = '='.repeat(80);
  const keywords = entry.keywords;
  const keywordText = keywords.length ? keywords.join(', ') : '(empty)';

  // Array elements are evaluated in order, so the head snippet is built
  // only after every field line has been produced.
  return [
    rule,
    `[${index}/${total}] ${record.slug}`,
    rule,
    `run-dir : ${record.runDirRel}`,
    `date : ${record.date}`,
    `article-type : ${record.articleType}`,
    `resolution-tier : ${entry.source}`,
    `html-file : news/${record.filename}`,
    '',
    '--- Field analysis (from executive-brief.md → resolveArticleMetadata) ---',
    `<title> (${entry.title.length} chars): ${formatInline(entry.title)}`,
    `<meta description> (${entry.description.length} chars): ${formatInline(entry.description)}`,
    `<meta description-extended> (${entry.extendedDescription.length} chars): ${formatInline(entry.extendedDescription)}`,
    `<meta keywords> (${keywords.length} terms): ${keywordText}`,
    '',
    '--- HTML <head> block (verbatim output of wrapArticleHtml — same code path as the article generator) ---',
    buildHtmlHeadSnippet(record, lang),
    '',
  ].join('\n');
}
|
|
380
|
+
|
|
381
|
+
/**
 * Flatten a metadata field to a single display line.
 *
 * Every run of whitespace (including newlines) is collapsed to one space and
 * the result is trimmed. Missing values and values that contain nothing but
 * whitespace are both rendered as the `(empty)` placeholder so a blank field
 * is never printed as an invisible empty string.
 *
 * @param {string | null | undefined} value - Raw field text
 * @returns {string} Single-line text, or `(empty)` when there is no content
 */
function formatInline(value) {
  if (!value) return '(empty)';
  // Collapse whitespace so each field stays on one line.
  const collapsed = String(value).replace(/\s+/g, ' ').trim();
  // A whitespace-only value is truthy but carries no content once collapsed.
  return collapsed === '' ? '(empty)' : collapsed;
}
|
|
386
|
+
|
|
387
|
+
/**
 * Produce the SEO preview dump for the discovered analysis runs.
 *
 * Each selected run is resolved with `resolveRunSeo`; a run whose resolution
 * throws is recorded as a failure and skipped, it does not abort the dump.
 * Text output is echoed to stdout (failure notices to stderr) unless `quiet`
 * is set, and the same text is written to `outPath` when provided. When
 * `jsonPath` is provided, one JSON object per resolved run is written to it,
 * one per line.
 *
 * @param {ReturnType<typeof parseArgs>} opts
 * @returns {{
 *   discovered: number,
 *   total: number,
 *   processed: number,
 *   resolutionTiers: Record<string, number>,
 *   emptyKeywordCount: number,
 *   shortDescriptionCount: number,
 *   records: ReadonlyArray<ReturnType<typeof resolveRunSeo>>
 * }}
 */
export function dumpArticleSeo(opts) {
  const { repoRoot, lang, outPath, jsonPath, limit, quiet } = opts;

  const allRuns = discoverAnalysisRuns(repoRoot).map(({ runDir }) => runDir);
  const discovered = allRuns.length;
  // A non-finite limit means "no limit".
  const targetRuns = Number.isFinite(limit) ? allRuns.slice(0, limit) : allRuns;
  const total = targetRuns.length;

  const records = [];
  const failures = [];
  // Prototype-free dictionary so tier labels can never collide with inherited keys.
  const resolutionTiers = Object.create(null);
  let emptyKeywordCount = 0;
  let shortDescriptionCount = 0;

  const textChunks = [];
  const jsonLines = [];

  // Echo a chunk to the given stream unless quiet, and always retain it for the text file.
  const emit = (chunk, stream = process.stdout) => {
    if (!quiet) stream.write(chunk);
    textChunks.push(chunk);
  };

  // Create the parent directory if needed, then write the file.
  const writeArtifact = (target, contents) => {
    fs.mkdirSync(path.dirname(target), { recursive: true });
    fs.writeFileSync(target, contents, 'utf8');
  };

  emit(
    [
      `# Executive Brief SEO Preview`,
      `# Source : executive-brief.md under analysis/daily/*/`,
      `# repo-root : ${repoRoot}`,
      `# language : ${lang}`,
      `# total runs : ${discovered}`,
      `# selected runs : ${total}`,
      `# generated by : scripts/dump-article-seo.js`,
      `# resolver : src/aggregator/article-metadata.ts → resolveArticleMetadata()`,
      `# rendered by : src/aggregator/html/shell.ts (same call path as npm run generate-article:all)`,
      `# purpose : review and improve SEO before generating HTML`,
      '',
      '',
    ].join('\n')
  );

  targetRuns.forEach((runDir, position) => {
    let record;
    try {
      record = resolveRunSeo({ runDir, repoRoot, lang });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      failures.push({ runDir, message });
      emit(`--- FAILED ${path.relative(repoRoot, runDir)}: ${message}\n\n`, process.stderr);
      return;
    }
    records.push(record);

    const { entry } = record;
    resolutionTiers[entry.source] = (resolutionTiers[entry.source] ?? 0) + 1;
    if (entry.keywords.length === 0) emptyKeywordCount += 1;
    if (entry.description.length < 70) shortDescriptionCount += 1;

    // Positions shown to the reader are 1-based.
    emit(`${formatRecord(record, position + 1, total, lang)}\n`);

    jsonLines.push(
      JSON.stringify({
        slug: record.slug,
        runDir: record.runDirRel,
        date: record.date,
        articleType: record.articleType,
        lang,
        filename: record.filename,
        source: entry.source,
        title: entry.title,
        description: entry.description,
        extendedDescription: entry.extendedDescription,
        keywords: entry.keywords,
        htmlHeadSnippet: buildHtmlHeadSnippet(record, lang),
      })
    );
  });

  emit(
    buildSummary({
      discovered,
      total,
      processed: records.length,
      failures,
      resolutionTiers,
      emptyKeywordCount,
      shortDescriptionCount,
    })
  );

  if (outPath) {
    writeArtifact(outPath, textChunks.join(''));
  }
  if (jsonPath) {
    writeArtifact(jsonPath, `${jsonLines.join('\n')}\n`);
  }

  return {
    discovered,
    total,
    processed: records.length,
    resolutionTiers,
    emptyKeywordCount,
    shortDescriptionCount,
    records,
  };
}
|
|
508
|
+
|
|
509
|
+
/**
 * Render the closing SUMMARY section of the dump as a single string.
 *
 * The section lists the run counters, a histogram of resolution tiers sorted
 * by tier label, the two quality-flag counters, and (only when at least one
 * run failed) the list of failures. The returned text ends with a newline.
 *
 * @param {object} stats
 * @param {number} stats.discovered - Number of runs found
 * @param {number} stats.total - Number of runs selected for the preview
 * @param {number} stats.processed - Number of runs resolved without error
 * @param {ReadonlyArray<{ runDir: string, message: string }>} stats.failures
 * @param {Record<string, number>} stats.resolutionTiers - Count per tier label
 * @param {number} stats.emptyKeywordCount
 * @param {number} stats.shortDescriptionCount
 * @returns {string}
 */
function buildSummary({
  discovered,
  total,
  processed,
  failures,
  resolutionTiers,
  emptyKeywordCount,
  shortDescriptionCount,
}) {
  const rule = '='.repeat(80);
  const tierRows = Object.entries(resolutionTiers)
    .sort(([left], [right]) => left.localeCompare(right))
    .map(([tier, count]) => ` ${tier.padEnd(20)} ${count}`);
  // The failure list is omitted entirely when every run resolved.
  const failureRows =
    failures.length > 0
      ? ['', 'Failures:', ...failures.map((fail) => ` - ${fail.runDir}: ${fail.message}`)]
      : [];

  return [
    rule,
    'SUMMARY',
    rule,
    `total runs discovered : ${discovered}`,
    `selected for preview : ${total}`,
    `successfully resolved : ${processed}`,
    `failed runs : ${failures.length}`,
    '',
    'Resolution-tier histogram (alphabetical by source label):',
    ...(tierRows.length === 0 ? [' (no runs resolved)'] : tierRows),
    '',
    'Quality flags:',
    ` runs with empty <meta keywords> : ${emptyKeywordCount}`,
    ` runs with <meta description> shorter than 70 chars : ${shortDescriptionCount}`,
    ...failureRows,
    // Trailing empty entry makes the joined text end with a newline.
    '',
  ].join('\n');
}
|
|
552
|
+
|
|
553
|
+
// Run as a script only when invoked directly (not when imported by tests).
// Either signal is enough: the module URL equals the entry script path, or
// the entry script path ends with this file's name.
// NOTE(review): the `file://` comparison is built by plain string
// concatenation, so it presumably only matches POSIX-style absolute paths —
// confirm whether the filename fallback is what covers other platforms.
const invokedDirectly =
  import.meta.url === `file://${process.argv[1]}` ||
  process.argv[1]?.endsWith('dump-article-seo.js');

if (invokedDirectly) {
  try {
    dumpArticleSeo(parseArgs(process.argv.slice(2)));
  } catch (error) {
    // Report a one-line reason and signal failure to the calling shell.
    const reason = error instanceof Error ? error.message : String(error);
    process.stderr.write(`dump-article-seo: ${reason}\n`);
    process.exit(1);
  }
}
|
|
@@ -50,10 +50,15 @@ export declare function healJsonLdDescriptionCorruption(filenames: readonly stri
|
|
|
50
50
|
* @param slug - Article slug (used to derive the category)
|
|
51
51
|
* @param lang - Article language (ISO 639-1 lower-case code)
|
|
52
52
|
* @param description - Candidate description (resolver output preferred)
|
|
53
|
+
* @param options - Backfill options
|
|
54
|
+
* @param options.forceContextPrefix - Force date/language/category prefix
|
|
55
|
+
* even when the description is already substantive
|
|
53
56
|
* @returns Page-specific description, prefix-free when description is
|
|
54
57
|
* already substantive and options.forceContextPrefix is not set
|
|
55
58
|
*/
|
|
56
|
-
export declare function buildLegacyBackfillDescription(date: string, slug: string, lang: string, description: string
|
|
59
|
+
export declare function buildLegacyBackfillDescription(date: string, slug: string, lang: string, description: string, options?: {
|
|
60
|
+
readonly forceContextPrefix?: boolean;
|
|
61
|
+
}): string;
|
|
57
62
|
/**
|
|
58
63
|
* Apply SEO meta tag replacements to a complete article HTML document.
|
|
59
64
|
*
|
|
@@ -17,6 +17,23 @@ import { formatSlug, parseArticleFilename, extractArticleMeta, escapeHTML, atomi
|
|
|
17
17
|
import { detectCategory } from '../../utils/article-category.js';
|
|
18
18
|
import { buildSeoKeywords, resolveArticleMetadata } from '../../aggregator/article-metadata.js';
|
|
19
19
|
const MIN_ARTICLE_DESCRIPTION_LENGTH = 120;
|
|
20
|
+
/**
 * Language labels used only in forced legacy backfill prefixes.
 *
 * Keys are lower-case language codes; values are each language's own name.
 * Frozen because this is a shared module-level lookup table: an accidental
 * write would otherwise silently change the label for every later caller.
 */
const LEGACY_LANGUAGE_LABELS = Object.freeze({
  en: 'English',
  sv: 'Svenska',
  da: 'Dansk',
  no: 'Norsk',
  fi: 'Suomi',
  de: 'Deutsch',
  fr: 'Français',
  es: 'Español',
  nl: 'Nederlands',
  ar: 'العربية',
  he: 'עברית',
  ja: '日本語',
  ko: '한국어',
  zh: '中文',
});
|
|
20
37
|
/**
|
|
21
38
|
* Regex pattern that flags internal artefact identifiers
|
|
22
39
|
* (`<slug>-run<N>-<unix-ts>`). Used by
|
|
@@ -127,7 +144,9 @@ function backfillOneLegacyArticleSeo(filename, descriptions) {
|
|
|
127
144
|
? resolverDescription
|
|
128
145
|
: safeDescription || formatSlug(parsed.slug);
|
|
129
146
|
const description = needsDescription
|
|
130
|
-
? buildLegacyBackfillDescription(parsed.date, parsed.slug, parsed.lang, baseDescription
|
|
147
|
+
? buildLegacyBackfillDescription(parsed.date, parsed.slug, parsed.lang, baseDescription, {
|
|
148
|
+
forceContextPrefix: true,
|
|
149
|
+
})
|
|
131
150
|
: meta.description;
|
|
132
151
|
const keywords = entry?.keywords ?? fallbackKeywords;
|
|
133
152
|
const nextHtml = applyArticleSeoBackfill(html, description, keywords);
|
|
@@ -160,23 +179,71 @@ function backfillOneLegacyArticleSeo(filename, descriptions) {
|
|
|
160
179
|
* @param slug - Article slug (used to derive the category)
|
|
161
180
|
* @param lang - Article language (ISO 639-1 lower-case code)
|
|
162
181
|
* @param description - Candidate description (resolver output preferred)
|
|
182
|
+
* @param options - Backfill options
|
|
183
|
+
* @param options.forceContextPrefix - Force date/language/category prefix
|
|
184
|
+
* even when the description is already substantive
|
|
163
185
|
* @returns Page-specific description, prefix-free when description is
|
|
164
186
|
* already substantive and options.forceContextPrefix is not set
|
|
165
187
|
*/
|
|
166
|
-
export function buildLegacyBackfillDescription(date, slug, lang, description, options = {}) {
  const candidate = description.trim();
  const alreadySubstantive = candidate.length >= MIN_ARTICLE_DESCRIPTION_LENGTH;
  // A long-enough description is used as-is unless the caller forces the prefix.
  if (alreadySubstantive && !options.forceContextPrefix) {
    return capDescriptionLength(candidate);
  }
  const category = detectCategory(slug);
  const langCode = (lang || 'en').toLowerCase();
  const localizedLabels = getLocalizedString(ARTICLE_TYPE_LABELS, langCode);
  // Fall back to the humanised slug when the category has no localized label.
  const label = localizedLabels[category] ?? formatSlug(slug);
  const qualifier = buildLegacySlugQualifier(slug, label);
  const languageLabel = legacyLanguageLabel(langCode);
  // Empty components (for example a qualifier that would repeat the label) are dropped.
  const prefixParts = [];
  for (const part of [date, languageLabel, label, qualifier]) {
    if (part.length > 0) {
      prefixParts.push(part);
    }
  }
  const prefix = prefixParts.join(' — ');
  const body = candidate || label;
  const contextual = `${prefix} — ${body}`.replace(/\s+/g, ' ').trim();
  return capDescriptionLength(contextual);
}
|
|
206
|
+
/**
 * Look up the display name for a language code in LEGACY_LANGUAGE_LABELS.
 *
 * Only the table's own properties are consulted (via a property descriptor),
 * so inherited names such as `toString` are never returned as labels.
 *
 * @param lang - Language code
 * @returns The table's label for the code, or the code itself when the table
 * has no string entry for it
 */
function legacyLanguageLabel(lang) {
    const { value } = Object.getOwnPropertyDescriptor(LEGACY_LANGUAGE_LABELS, lang) ?? {};
    if (typeof value === 'string') {
        return value;
    }
    return lang;
}
|
|
217
|
+
/**
 * Derive an extra prefix component from the slug, used to tell apart legacy
 * pages that share a date and article category (for example same-day
 * `*-run2` variants).
 *
 * The slug is humanised with formatSlug; it is returned only when its
 * normalized form is non-empty and differs from the normalized category
 * label, so the prefix never repeats the label.
 *
 * @param slug - Article slug without date/language suffix
 * @param localizedLabel - Localized category label already present in prefix
 * @returns Human-readable qualifier, or an empty string when it adds nothing
 */
function buildLegacySlugQualifier(slug, localizedLabel) {
    const readable = formatSlug(slug).trim();
    if (readable === '') {
        return '';
    }
    const slugKey = normalizeLegacyQualifier(readable);
    const labelKey = normalizeLegacyQualifier(localizedLabel);
    const addsInformation = slugKey !== '' && slugKey !== labelKey;
    return addsInformation ? readable : '';
}
|
|
235
|
+
/**
 * Reduce a prefix component to a comparison key for duplicate detection.
 *
 * The text is lower-cased and split on every run of characters that are
 * neither Unicode letters nor Unicode numbers; the remaining words are joined
 * with single spaces.
 *
 * @param value - Candidate text
 * @returns Lower-case words separated by single spaces (empty when the text
 * has no letters or numbers)
 */
function normalizeLegacyQualifier(value) {
    const words = value
        .toLowerCase()
        .split(/[^\p{L}\p{N}]+/u)
        // Leading or trailing separators leave empty fragments behind.
        .filter((word) => word !== '');
    return words.join(' ');
}
|
|
180
247
|
/**
|
|
181
248
|
* Clamp a description to the 180-character SERP-friendly cap with a
|
|
182
249
|
* trailing ellipsis when truncated. Extracted from
|