euparliamentmonitor 0.9.21 → 0.9.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -2
- package/scripts/aggregator/article-metadata.js +69 -14
- package/scripts/aggregator/editorial-brief-resolver.js +23 -0
- package/scripts/aggregator/html/headline.d.ts +41 -9
- package/scripts/aggregator/html/headline.js +69 -10
- package/scripts/aggregator/html/shell.js +73 -17
- package/scripts/aggregator/manifest/index.d.ts +1 -1
- package/scripts/aggregator/manifest/index.js +1 -1
- package/scripts/aggregator/manifest/resolver.d.ts +28 -1
- package/scripts/aggregator/manifest/resolver.js +61 -5
- package/scripts/aggregator/markdown-renderer.js +11 -0
- package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
- package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
- package/scripts/aggregator/metadata/artifact-walker.js +29 -10
- package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
- package/scripts/aggregator/metadata/brief-body.js +69 -0
- package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
- package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
- package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
- package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
- package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
- package/scripts/aggregator/metadata/heading-rules.js +78 -269
- package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
- package/scripts/aggregator/metadata/keyword-filters.js +156 -0
- package/scripts/aggregator/metadata/lede-extractor.js +11 -2
- package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
- package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
- package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
- package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
- package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
- package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
- package/scripts/aggregator/metadata/seo-budgets.js +202 -0
- package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
- package/scripts/aggregator/metadata/text-truncate.js +277 -0
- package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
- package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
- package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
- package/scripts/aggregator/metadata/text-utils.js +119 -439
- package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
- package/scripts/aggregator/metadata/title-rejection.js +179 -0
- package/scripts/copy-vendor.js +84 -112
- package/scripts/dump-article-seo.js +640 -0
- package/scripts/fix-mermaid-diagrams.js +931 -0
- package/scripts/generators/news-indexes/backfill.d.ts +6 -1
- package/scripts/generators/news-indexes/backfill.js +71 -4
- package/scripts/validate-article-seo.js +534 -0
- package/scripts/validate-mermaid-diagrams.js +306 -0
|
@@ -0,0 +1,640 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* @module scripts/dump-article-seo
|
|
7
|
+
* @description Read-only preview of the SEO `<head>` metadata that the
|
|
8
|
+
* deterministic article generator **would produce** for every executive
|
|
9
|
+
* brief committed under `analysis/daily/`. Use this before running
|
|
10
|
+
* `npm run generate-article:all` to audit and improve titles,
|
|
11
|
+
* descriptions, and keywords without touching any `news/*.html` file.
|
|
12
|
+
*
|
|
13
|
+
* **Source: executive briefs, not HTML.**
|
|
14
|
+
* The script reads each analysis run's `executive-brief.md` (and its
|
|
15
|
+
* translated siblings) via the same resolver chain that the real
|
|
16
|
+
* article generator uses. No HTML files are read or written; the script
|
|
17
|
+
* is purely additive and fully idempotent.
|
|
18
|
+
*
|
|
19
|
+
* **Identical code path to the real renderer.** The script intentionally
|
|
20
|
+
* imports the same helpers that `scripts/aggregator/article-generator.js`
|
|
21
|
+
* (the engine behind `npm run generate-article:all` and the
|
|
22
|
+
* `regenerate-articles.yml` workflow) uses:
|
|
23
|
+
*
|
|
24
|
+
* 1. `discoverAnalysisRuns(repoRoot)` — same run discovery as the batch
|
|
25
|
+
* renderer (`generator/render-batch.js`).
|
|
26
|
+
* 2. `aggregateAnalysisRun({ runDir, repoRoot })` — same Markdown
|
|
27
|
+
* aggregation that feeds `resolveArticleMetadata`, which in turn
|
|
28
|
+
* reads `executive-brief.md` and its translated siblings.
|
|
29
|
+
* 3. `resolveArticleMetadata({ articleType, date, markdown, manifest,
|
|
30
|
+
* runDir })` — the single source of truth for per-language `(title,
|
|
31
|
+
* description, extendedDescription, keywords, source)` documented in
|
|
32
|
+
* `src/aggregator/article-metadata.ts`. The entry returned here is
|
|
33
|
+
* *bit-for-bit identical* to the one passed into
|
|
34
|
+
* `src/aggregator/html/shell.ts` for the `<title>`,
|
|
35
|
+
* `<meta name="description">`, and `<meta name="keywords">` tags.
|
|
36
|
+
*
|
|
37
|
+
* **Two-part output per run.**
|
|
38
|
+
* - *Field analysis* — human-readable breakdown of each SEO field
|
|
39
|
+
* (length, content, resolution tier) for quick editorial review.
|
|
40
|
+
* - *HTML head snippet* — the **complete `<head>` block** that the
|
|
41
|
+
* article generator will emit, produced by calling
|
|
42
|
+
* `wrapArticleHtml()` from `src/aggregator/html/shell.ts` with an
|
|
43
|
+
* empty body and slicing out `<head>...</head>`. This includes the
|
|
44
|
+
* `<title>`, `<meta name="description">`, `<meta name="keywords">`,
|
|
45
|
+
* all `<meta property="og:*">` and `<meta name="twitter:*">` tags,
|
|
46
|
+
* `<link rel="canonical">`, hreflang alternates, JSON-LD
|
|
47
|
+
* `NewsArticle` + `BreadcrumbList`, and every other tag the real
|
|
48
|
+
* renderer emits — because the snippet *is* the real renderer's
|
|
49
|
+
* output. Copy-paste these into a browser extension or SEO tool to
|
|
50
|
+
* preview how the article will appear in search results and social
|
|
51
|
+
* cards before committing to HTML generation.
|
|
52
|
+
*
|
|
53
|
+
* Invocation:
|
|
54
|
+
* node scripts/dump-article-seo.js \
|
|
55
|
+
* [--repo-root <path>] # defaults to process.cwd()
|
|
56
|
+
* [--lang en] # defaults to en
|
|
57
|
+
* [--out <path>] # also write the human-readable dump here
|
|
58
|
+
* [--json <path>] # also write a machine-readable JSONL dump
|
|
59
|
+
* [--limit <N>] # only process the first N runs (debug)
|
|
60
|
+
 *     [--quiet]                # suppress all console output — header, per-run blocks, summary and failure notices (file output only)
|
|
61
|
+
*/
|
|
62
|
+
|
|
63
|
+
import fs from 'node:fs';
|
|
64
|
+
import path from 'node:path';
|
|
65
|
+
import process from 'node:process';
|
|
66
|
+
|
|
67
|
+
import { discoverAnalysisRuns } from './aggregator/generator/discovery.js';
|
|
68
|
+
import {
|
|
69
|
+
aggregateAnalysisRun,
|
|
70
|
+
resolveArticleTypeFromManifest,
|
|
71
|
+
} from './aggregator/analysis-aggregator.js';
|
|
72
|
+
import { resolveArticleMetadata } from './aggregator/article-metadata.js';
|
|
73
|
+
import { buildArticleSlug } from './aggregator/generator/slug.js';
|
|
74
|
+
import { getArticleFilename } from './aggregator/html/hreflang.js';
|
|
75
|
+
import { wrapArticleHtml } from './aggregator/html/shell.js';
|
|
76
|
+
import { ALL_LANGUAGES, isSupportedLanguage } from './constants/language-core.js';
|
|
77
|
+
|
|
78
|
+
const SUPPORTED_LANGS = new Set(ALL_LANGUAGES);
|
|
79
|
+
|
|
80
|
+
/**
 * Turn the raw CLI tokens into the option bag consumed by
 * `dumpArticleSeo`. The parser is hand-rolled so the script needs
 * nothing beyond the helpers it already imports.
 *
 * Value-taking flags are `--repo-root`, `--lang`, `--out`, `--json`
 * and `--limit`; `--quiet` is a boolean switch; `--help` / `-h` prints
 * the usage text via `printHelpAndExit()`. Path-valued flags are made
 * absolute with `path.resolve`.
 *
 * @param {readonly string[]} argv - `process.argv.slice(2)`
 * @returns {{repoRoot: string, lang: string, outPath: string|null,
 *   jsonPath: string|null, limit: number, quiet: boolean}}
 * @throws {Error} On an unknown flag, a flag without its value, a
 *   `--limit` that is not a positive integer, or a `--lang` rejected
 *   by `isSupportedLanguage`.
 */
export function parseArgs(argv) {
  const parsed = {
    repoRoot: process.cwd(),
    lang: 'en',
    outPath: null,
    jsonPath: null,
    limit: Number.POSITIVE_INFINITY,
    quiet: false,
  };

  // Digits-only check first, then a range check: rejects "", "-3",
  // "1.5", "0" and digit strings too large to stay finite.
  const toPositiveInteger = (text) => {
    const asNumber = /^\d+$/u.test(text) ? Number.parseInt(text, 10) : Number.NaN;
    if (!Number.isFinite(asNumber) || asNumber < 1) {
      throw new Error(`--limit expects a positive integer, got "${text}"`);
    }
    return asNumber;
  };

  // Flags that consume the following token, mapped to their setter.
  const valueFlags = new Map([
    ['--repo-root', (text) => { parsed.repoRoot = path.resolve(text); }],
    ['--lang', (text) => { parsed.lang = text; }],
    ['--out', (text) => { parsed.outPath = path.resolve(text); }],
    ['--json', (text) => { parsed.jsonPath = path.resolve(text); }],
    ['--limit', (text) => { parsed.limit = toPositiveInteger(text); }],
  ]);

  let cursor = 0;
  while (cursor < argv.length) {
    const flag = argv[cursor];
    const assign = valueFlags.get(flag);
    if (assign !== undefined) {
      assign(requireValue(argv, cursor, flag));
      cursor += 2; // skip the flag and the value it consumed
    } else if (flag === '--quiet') {
      parsed.quiet = true;
      cursor += 1;
    } else if (flag === '--help' || flag === '-h') {
      printHelpAndExit();
      cursor += 1;
    } else {
      throw new Error(`Unknown argument: ${flag}`);
    }
  }

  // Language is validated once, after all flags, so the last --lang wins.
  if (!isSupportedLanguage(parsed.lang)) {
    throw new Error(
      `Unsupported --lang "${parsed.lang}". Expected one of: ${[...SUPPORTED_LANGS].join(', ')}`
    );
  }

  return parsed;
}
|
|
149
|
+
|
|
150
|
+
/**
 * Fetch the token that follows a value-taking flag.
 *
 * @param {readonly string[]} argv - Full argument list being parsed
 * @param {number} i - Index of the flag within `argv`
 * @param {string} flag - Flag name, used only in the error message
 * @returns {string} The token at `argv[i + 1]`
 * @throws {Error} When there is no token after the flag
 */
function requireValue(argv, i, flag) {
  const next = argv[i + 1];
  if (next !== undefined) return next;
  throw new Error(`${flag} requires a value`);
}
|
|
157
|
+
|
|
158
|
+
/**
 * Write the usage text to stdout and terminate the process with exit
 * code 0. Never returns.
 */
function printHelpAndExit() {
  // The trailing empty entry makes the joined text end with a newline.
  const usageLines = [
    'Usage: node scripts/dump-article-seo.js [options]',
    '',
    'Read-only preview of the SEO <head> metadata (title, description,',
    'keywords, og:*, twitter:*) that the article generator would produce',
    'from each executive brief — without generating any HTML files.',
    '',
    'Options:',
    ' --repo-root <path> Repository root (default: cwd)',
    ' --lang <code> Language to dump (default: en)',
    ' --out <path> Write the human-readable report here',
    ' --json <path> Also write a JSONL record per run',
    ' --limit <N> Process only the first N runs (debug)',
    ' --quiet Suppress per-run stdout',
    ' -h, --help Show this help',
    '',
  ];
  const usageText = usageLines.join('\n');
  process.stdout.write(usageText);
  process.exit(0);
}
|
|
180
|
+
|
|
181
|
+
/**
 * Read `<runDir>/manifest.json` and return only the fields that this
 * script forwards to `resolveArticleMetadata`.
 *
 * The header comment of this file describes the function as a mirror of
 * the private `readManifestMetadata` helper in
 * `scripts/aggregator/generator/render-one.js`; it is re-implemented
 * here rather than imported so that unrelated manifest fields are not
 * leaked into the resolver. Only the keys copied below are forwarded:
 * `articleType`, `date`, `runId`, `title`, `description`, `committee`.
 *
 * The read is best-effort: a missing, unreadable or malformed manifest,
 * or one whose top-level JSON value is not a plain object (`null`, a
 * scalar, an array), yields an empty object instead of an exception.
 *
 * @param {string} runDir - Absolute path to the analysis run
 * @returns {object} Metadata-relevant manifest fields (possibly empty)
 */
export function readManifestMetadata(runDir) {
  const manifestPath = path.join(runDir, 'manifest.json');

  let parsed;
  try {
    // No existsSync pre-check: a missing file raises ENOENT, which is
    // handled by the same catch as a malformed document.
    parsed = JSON.parse(fs.readFileSync(manifestPath, 'utf8'));
  } catch {
    return {};
  }

  // JSON.parse legitimately returns null / scalars / arrays for valid
  // JSON text; none of those can carry manifest fields, and `null`
  // would otherwise throw on the property reads below.
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return {};
  }

  const manifest = {};

  // 'unknown' is treated as "no usable type" and is not forwarded.
  const resolvedType = resolveArticleTypeFromManifest(parsed);
  if (resolvedType && resolvedType !== 'unknown') {
    manifest.articleType = resolvedType;
  }

  if (typeof parsed.date === 'string') manifest.date = parsed.date;
  if (typeof parsed.runId === 'string') manifest.runId = parsed.runId;

  // title / description may be a single string or an object whose
  // values are all strings (see isLanguageMapLike).
  if (typeof parsed.title === 'string' || isLanguageMapLike(parsed.title)) {
    manifest.title = parsed.title;
  }
  if (
    typeof parsed.description === 'string' ||
    isLanguageMapLike(parsed.description)
  ) {
    manifest.description = parsed.description;
  }

  if (typeof parsed.committee === 'string') {
    manifest.committee = parsed.committee;
  }
  return manifest;
}
|
|
226
|
+
|
|
227
|
+
/**
 * Report whether `value` is a non-null, non-array object whose own
 * enumerable values are all strings. An object with no values counts
 * as a match.
 *
 * @param {unknown} value - Candidate manifest field
 * @returns {boolean}
 */
function isLanguageMapLike(value) {
  const isPlainObject =
    Boolean(value) && typeof value === 'object' && !Array.isArray(value);
  return (
    isPlainObject &&
    Object.values(value).every((text) => typeof text === 'string')
  );
}
|
|
234
|
+
|
|
235
|
+
/**
 * Compute the SEO metadata of a single analysis run for one language.
 *
 * The run is aggregated with `aggregateAnalysisRun`, its manifest
 * subset is read with `readManifestMetadata`, and both are handed to
 * `resolveArticleMetadata`; the entry for `lang` is then picked from
 * the result. The slug and HTML filename are derived with
 * `buildArticleSlug` / `getArticleFilename`. This function itself
 * writes no files and prints nothing.
 *
 * @param {object} opts
 * @param {string} opts.runDir - Absolute path to the analysis run
 * @param {string} opts.repoRoot - Repository root (for relative paths)
 * @param {string} opts.lang - Language code to extract
 * @returns {{
 *   runDir: string,
 *   runDirRel: string,
 *   date: string,
 *   articleType: string,
 *   slug: string,
 *   filename: string,
 *   entry: {title: string, description: string,
 *     extendedDescription: string, keywords: readonly string[],
 *     source: string}
 * }}
 * @throws {Error} When the resolver returns no entry for `lang`
 */
export function resolveRunSeo({ runDir, repoRoot, lang }) {
  const { articleType, date, markdown } = aggregateAnalysisRun({ runDir, repoRoot });

  const perLanguage = resolveArticleMetadata({
    articleType,
    date,
    markdown,
    manifest: readManifestMetadata(runDir),
    runDir,
  });

  const localized = perLanguage[lang];
  if (!localized) {
    throw new Error(
      `resolveArticleMetadata returned no entry for lang="${lang}" in ${runDir}`
    );
  }

  const slug = buildArticleSlug(date, articleType);
  const filename = getArticleFilename(slug, lang);
  // Report paths with forward slashes regardless of the host separator.
  const runDirRel = path.relative(repoRoot, runDir).split(path.sep).join('/');

  const { title, description, extendedDescription, keywords, source } = localized;
  return {
    runDir,
    runDirRel,
    date,
    articleType,
    slug,
    filename,
    entry: {
      title,
      description,
      extendedDescription,
      keywords: keywords ?? [], // downstream code reads keywords.length
      source,
    },
  };
}
|
|
294
|
+
|
|
295
|
+
/**
 * Produce the `<head>...</head>` block for a resolved run by calling
 * `wrapArticleHtml()` with an empty body and cutting the head element
 * out of the returned document. No head markup is assembled here, so
 * the snippet is whatever `wrapArticleHtml()` emits for these inputs.
 *
 * Intended for previewing search-result and social-card appearance
 * before running the full HTML generator.
 *
 * @param {ReturnType<typeof resolveRunSeo>} record
 * @param {string} lang - Language code passed through to `wrapArticleHtml`
 * @returns {string} The first `<head>...</head>` block of the rendered
 *   document.
 * @throws {Error} When the rendered document contains no such block
 */
export function buildHtmlHeadSnippet(record, lang) {
  const { entry, slug, date, articleType } = record;
  const { title, description, extendedDescription } = entry;

  const renderedDocument = wrapArticleHtml({
    lang,
    articleSlug: slug,
    body: '',
    title,
    description,
    extendedDescription,
    keywords: entry.keywords ?? [],
    date,
    articleType,
  });

  // Lazy match: stop at the first closing tag after the opening tag.
  const [headBlock] = renderedDocument.match(/<head>[\s\S]*?<\/head>/) ?? [];
  if (headBlock === undefined) {
    throw new Error(
      `buildHtmlHeadSnippet: could not locate <head> block in wrapArticleHtml output for ${record.slug}`
    );
  }
  return headBlock;
}
|
|
334
|
+
|
|
335
|
+
/**
 * Render one resolved-SEO record as a text block for the dump. The
 * block has a banner, the run's identifying fields, and two sections:
 *   1. *Field analysis* — title / description / extended description
 *      with their `.length`, plus the keyword list and its term count.
 *   2. *HTML head block* — the output of `buildHtmlHeadSnippet`.
 *
 * @param {ReturnType<typeof resolveRunSeo>} record
 * @param {number} index - 1-based position within the dump
 * @param {number} total - Total number of records being dumped
 * @param {string} [lang] - Language code (used for the HTML snippet; defaults to 'en')
 * @returns {string} The block, lines joined with `\n`, ending in a newline
 */
export function formatRecord(record, index, total, lang = 'en') {
  const { entry } = record;
  const { keywords } = entry;
  const banner = '='.repeat(80);

  // One "<label> (<n> chars): <single-line text>" row per text field.
  const textField = (label, text) =>
    `${label} (${text.length} chars): ${formatInline(text)}`;

  const keywordList = keywords.length ? keywords.join(', ') : '(empty)';

  return [
    banner,
    `[${index}/${total}] ${record.slug}`,
    banner,
    `run-dir : ${record.runDirRel}`,
    `date : ${record.date}`,
    `article-type : ${record.articleType}`,
    `resolution-tier : ${entry.source}`,
    `html-file : news/${record.filename}`,
    '',
    '--- Field analysis (from executive-brief.md → resolveArticleMetadata) ---',
    textField('<title>', entry.title),
    textField('<meta description>', entry.description),
    textField('<meta description-extended>', entry.extendedDescription),
    `<meta keywords> (${keywords.length} terms): ${keywordList}`,
    '',
    '--- HTML <head> block (verbatim output of wrapArticleHtml — same code path as the article generator) ---',
    buildHtmlHeadSnippet(record, lang),
    '',
  ].join('\n');
}
|
|
380
|
+
|
|
381
|
+
/**
 * Collapse a field value onto a single line for the field-analysis
 * section: every whitespace run (including newlines) becomes one space
 * and the ends are trimmed.
 *
 * @param {string|null|undefined} value - Field text to flatten
 * @returns {string} The flattened text, or the literal `(empty)` when
 *   the value is missing, empty, or consists only of whitespace
 */
function formatInline(value) {
  if (!value) return '(empty)';
  const flattened = value.replace(/\s+/g, ' ').trim();
  // A whitespace-only value flattens to '' — report it as empty rather
  // than printing a blank field.
  return flattened === '' ? '(empty)' : flattened;
}
|
|
386
|
+
|
|
387
|
+
/**
 * Run the full dump: discover analysis runs, resolve the SEO metadata
 * of each one for `opts.lang`, emit a per-run block (field analysis +
 * HTML head snippet) followed by a summary, and optionally write the
 * text report (`opts.outPath`) and a JSONL file (`opts.jsonPath`).
 *
 * A run whose metadata cannot be resolved is recorded in `failures`
 * and skipped; it does not abort the dump. Unless `opts.quiet` is set,
 * the header, per-run blocks and summary go to stdout and failure
 * notices go to stderr. Quality-flag counters are informational only.
 *
 * The statistics are returned so tests and tooling can assert on them
 * without re-parsing the printed report.
 *
 * @param {ReturnType<typeof parseArgs>} opts
 * @returns {{
 *   discovered: number,
 *   total: number,
 *   processed: number,
 *   failures: ReadonlyArray<{runDir: string, message: string}>,
 *   resolutionTiers: Record<string, number>,
 *   emptyKeywordCount: number,
 *   shortDescriptionCount: number,
 *   titleTooShortCount: number,
 *   titleTooLongCount: number,
 *   descTooShortCount: number,
 *   descTooLongCount: number,
 *   emptyExtendedDescriptionCount: number,
 *   pollutedArticleTypeCount: number,
 *   duplicateTitleGroups: ReadonlyArray<{title: string, count: number,
 *     slugs: string[]}>,
 *   records: ReadonlyArray<ReturnType<typeof resolveRunSeo>>
 * }}
 */
export function dumpArticleSeo(opts) {
  const { repoRoot, lang, outPath, jsonPath, limit, quiet } = opts;

  const allRuns = discoverAnalysisRuns(repoRoot).map((run) => run.runDir);
  const discovered = allRuns.length;
  const targetRuns = Number.isFinite(limit) ? allRuns.slice(0, limit) : allRuns;
  const total = targetRuns.length;

  const records = [];
  const failures = [];
  // Null-prototype dictionary: keys are resolver-provided tier labels.
  const resolutionTiers = Object.create(null);
  let emptyKeywordCount = 0;
  let shortDescriptionCount = 0;
  // Soft quality-flag counters. None of them abort the dump; they are
  // reported in the "Quality flags" section of the summary.
  let titleTooShortCount = 0; // < 30 chars
  let titleTooLongCount = 0; // > 65 chars
  let descTooShortCount = 0; // < 120 chars
  let descTooLongCount = 0; // > 160 chars
  let emptyExtendedDescriptionCount = 0;
  let pollutedArticleTypeCount = 0; // articleType ending in a "-run…<digits>" suffix
  const titleOccurrences = new Map();

  const textChunks = [];
  const jsonLines = [];

  // Every chunk is retained for --out; it is echoed to the console
  // unless --quiet was given.
  const emit = (chunk, stream = process.stdout) => {
    if (!quiet) stream.write(chunk);
    textChunks.push(chunk);
  };

  const header =
    `# Executive Brief SEO Preview\n` +
    `# Source : executive-brief.md under analysis/daily/*/\n` +
    `# repo-root : ${repoRoot}\n` +
    `# language : ${lang}\n` +
    `# total runs : ${discovered}\n` +
    `# selected runs : ${total}\n` +
    `# generated by : scripts/dump-article-seo.js\n` +
    `# resolver : src/aggregator/article-metadata.ts → resolveArticleMetadata()\n` +
    `# rendered by : src/aggregator/html/shell.ts (same call path as npm run generate-article:all)\n` +
    `# purpose : review and improve SEO before generating HTML\n\n`;
  emit(header);

  for (const [position, runDir] of targetRuns.entries()) {
    let record;
    try {
      record = resolveRunSeo({ runDir, repoRoot, lang });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      failures.push({ runDir, message });
      emit(`--- FAILED ${path.relative(repoRoot, runDir)}: ${message}\n\n`, process.stderr);
      continue;
    }
    records.push(record);

    const { entry } = record;
    const tier = entry.source;
    resolutionTiers[tier] = (resolutionTiers[tier] ?? 0) + 1;
    if (entry.keywords.length === 0) emptyKeywordCount += 1;
    if (entry.description.length < 70) shortDescriptionCount += 1;

    // Length-budget warnings. Empty fields are excluded from the
    // "too short" counters (note the > 0 guards).
    const titleLen = entry.title.length;
    if (titleLen > 0 && titleLen < 30) titleTooShortCount += 1;
    if (titleLen > 65) titleTooLongCount += 1;
    const descLen = entry.description.length;
    if (descLen > 0 && descLen < 120) descTooShortCount += 1;
    if (descLen > 160) descTooLongCount += 1;
    if (!entry.extendedDescription) emptyExtendedDescriptionCount += 1;
    if (
      typeof record.articleType === 'string' &&
      /-run[a-zA-Z0-9-]*\d+$/u.test(record.articleType)
    ) {
      pollutedArticleTypeCount += 1;
    }

    // Duplicate-title detection: group slugs by exact title match so
    // the summary can list runs that resolved to the same <title>.
    const titleKey = entry.title;
    if (titleKey) {
      const bucket = titleOccurrences.get(titleKey) ?? [];
      bucket.push(record.slug);
      titleOccurrences.set(titleKey, bucket);
    }

    emit(`${formatRecord(record, position + 1, total, lang)}\n`);

    // The JSON record re-renders the head block, so build it only when
    // a JSONL file was actually requested.
    if (jsonPath) {
      jsonLines.push(
        JSON.stringify({
          slug: record.slug,
          runDir: record.runDirRel,
          date: record.date,
          articleType: record.articleType,
          lang,
          filename: record.filename,
          source: entry.source,
          title: entry.title,
          description: entry.description,
          extendedDescription: entry.extendedDescription,
          keywords: entry.keywords,
          htmlHeadSnippet: buildHtmlHeadSnippet(record, lang),
        })
      );
    }
  }

  // Largest groups first; ties broken by title for a stable report.
  const duplicateTitleGroups = Array.from(titleOccurrences.entries())
    .filter(([, slugs]) => slugs.length > 1)
    .map(([title, slugs]) => ({ title, count: slugs.length, slugs: [...slugs] }))
    .sort((a, b) => b.count - a.count || a.title.localeCompare(b.title));

  const summary = buildSummary({
    discovered,
    total,
    processed: records.length,
    failures,
    resolutionTiers,
    emptyKeywordCount,
    shortDescriptionCount,
    titleTooShortCount,
    titleTooLongCount,
    descTooShortCount,
    descTooLongCount,
    emptyExtendedDescriptionCount,
    pollutedArticleTypeCount,
    duplicateTitleGroups,
  });
  emit(summary);

  if (outPath) {
    fs.mkdirSync(path.dirname(outPath), { recursive: true });
    fs.writeFileSync(outPath, textChunks.join(''), 'utf8');
  }
  if (jsonPath) {
    fs.mkdirSync(path.dirname(jsonPath), { recursive: true });
    // One newline-terminated record per run; with no records the file
    // is empty rather than a single blank (non-JSON) line.
    const jsonl = jsonLines.map((line) => `${line}\n`).join('');
    fs.writeFileSync(jsonPath, jsonl, 'utf8');
  }

  return {
    discovered,
    total,
    processed: records.length,
    failures,
    resolutionTiers,
    emptyKeywordCount,
    shortDescriptionCount,
    titleTooShortCount,
    titleTooLongCount,
    descTooShortCount,
    descTooLongCount,
    emptyExtendedDescriptionCount,
    pollutedArticleTypeCount,
    duplicateTitleGroups,
    records,
  };
}
|
|
559
|
+
|
|
560
|
+
/**
 * Render the closing SUMMARY section of the dump as text: run counts,
 * the resolution-tier histogram (sorted by tier label), the quality
 * flag counters, any duplicate-title groups with their slugs, and —
 * when present — the list of failed runs.
 *
 * @param {object} stats - Counters gathered by `dumpArticleSeo`
 * @param {number} stats.discovered - Runs found in the repository
 * @param {number} stats.total - Runs selected for this dump
 * @param {number} stats.processed - Runs resolved successfully
 * @param {ReadonlyArray<{runDir: string, message: string}>} stats.failures
 * @param {Record<string, number>} stats.resolutionTiers
 * @param {number} stats.emptyKeywordCount
 * @param {number} stats.shortDescriptionCount
 * @param {number} stats.titleTooShortCount
 * @param {number} stats.titleTooLongCount
 * @param {number} stats.descTooShortCount
 * @param {number} stats.descTooLongCount
 * @param {number} stats.emptyExtendedDescriptionCount
 * @param {number} stats.pollutedArticleTypeCount
 * @param {ReadonlyArray<{title: string, count: number, slugs: string[]}>} stats.duplicateTitleGroups
 * @returns {string} The summary, lines joined with `\n`, ending in a newline
 */
function buildSummary({
  discovered,
  total,
  processed,
  failures,
  resolutionTiers,
  emptyKeywordCount,
  shortDescriptionCount,
  titleTooShortCount,
  titleTooLongCount,
  descTooShortCount,
  descTooLongCount,
  emptyExtendedDescriptionCount,
  pollutedArticleTypeCount,
  duplicateTitleGroups,
}) {
  const banner = '='.repeat(80);

  const sortedTiers = Object.entries(resolutionTiers).sort(
    ([left], [right]) => left.localeCompare(right)
  );
  const histogramRows =
    sortedTiers.length === 0
      ? [' (no runs resolved)']
      : sortedTiers.map(([tier, count]) => ` ${tier.padEnd(20)} ${count}`);

  // Each group contributes a heading row followed by one row per slug.
  const duplicateRows = duplicateTitleGroups.flatMap((group) => [
    ` [${group.count}×] ${group.title}`,
    ...group.slugs.map((slug) => ` - ${slug}`),
  ]);

  // The Failures section, including its leading blank line, is omitted
  // entirely when every run resolved.
  const failureRows =
    failures.length > 0
      ? [
          '',
          'Failures:',
          ...failures.map((fail) => ` - ${fail.runDir}: ${fail.message}`),
        ]
      : [];

  return [
    banner,
    'SUMMARY',
    banner,
    `total runs discovered : ${discovered}`,
    `selected for preview : ${total}`,
    `successfully resolved : ${processed}`,
    `failed runs : ${failures.length}`,
    '',
    'Resolution-tier histogram (alphabetical by source label):',
    ...histogramRows,
    '',
    'Quality flags:',
    ` runs with empty <meta keywords> : ${emptyKeywordCount}`,
    ` runs with <meta description> shorter than 70 chars : ${shortDescriptionCount}`,
    ` runs with <title> shorter than 30 chars : ${titleTooShortCount}`,
    ` runs with <title> longer than 65 chars (SERP-clipped) : ${titleTooLongCount}`,
    ` runs with <meta description> shorter than 120 chars : ${descTooShortCount}`,
    ` runs with <meta description> longer than 160 chars : ${descTooLongCount}`,
    ` runs with empty og:description (extendedDescription) : ${emptyExtendedDescriptionCount}`,
    ` runs with -run<N>-polluted articleType : ${pollutedArticleTypeCount}`,
    ` duplicate title groups (SERP cannibalisation) : ${duplicateTitleGroups.length}`,
    ...duplicateRows,
    ...failureRows,
    '', // trailing newline
  ].join('\n');
}
|
|
625
|
+
|
|
626
|
+
// Execute the CLI only when this module is the process entry point, so
// importing it (e.g. from tests) has no side effects. Two checks are
// OR-ed: the module URL equals the entry-script path prefixed with
// `file://`, or the entry-script path ends with this file's name.
const entryScript = process.argv[1];
const invokedDirectly =
  import.meta.url === `file://${entryScript}` ||
  entryScript?.endsWith('dump-article-seo.js');

if (invokedDirectly) {
  try {
    dumpArticleSeo(parseArgs(process.argv.slice(2)));
  } catch (error) {
    // Any argument or runtime error: one-line message on stderr, exit 1.
    const reason = error instanceof Error ? error.message : String(error);
    process.stderr.write(`dump-article-seo: ${reason}\n`);
    process.exit(1);
  }
}
|