euparliamentmonitor 0.9.21 → 0.9.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -2
- package/scripts/aggregator/article-metadata.js +69 -14
- package/scripts/aggregator/editorial-brief-resolver.js +23 -0
- package/scripts/aggregator/html/headline.d.ts +41 -9
- package/scripts/aggregator/html/headline.js +69 -10
- package/scripts/aggregator/html/shell.js +73 -17
- package/scripts/aggregator/manifest/index.d.ts +1 -1
- package/scripts/aggregator/manifest/index.js +1 -1
- package/scripts/aggregator/manifest/resolver.d.ts +28 -1
- package/scripts/aggregator/manifest/resolver.js +61 -5
- package/scripts/aggregator/markdown-renderer.js +11 -0
- package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
- package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
- package/scripts/aggregator/metadata/artifact-walker.js +29 -10
- package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
- package/scripts/aggregator/metadata/brief-body.js +69 -0
- package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
- package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
- package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
- package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
- package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
- package/scripts/aggregator/metadata/heading-rules.js +78 -269
- package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
- package/scripts/aggregator/metadata/keyword-filters.js +156 -0
- package/scripts/aggregator/metadata/lede-extractor.js +11 -2
- package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
- package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
- package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
- package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
- package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
- package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
- package/scripts/aggregator/metadata/seo-budgets.js +202 -0
- package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
- package/scripts/aggregator/metadata/text-truncate.js +277 -0
- package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
- package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
- package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
- package/scripts/aggregator/metadata/text-utils.js +119 -439
- package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
- package/scripts/aggregator/metadata/title-rejection.js +179 -0
- package/scripts/copy-vendor.js +84 -112
- package/scripts/dump-article-seo.js +640 -0
- package/scripts/fix-mermaid-diagrams.js +931 -0
- package/scripts/generators/news-indexes/backfill.d.ts +6 -1
- package/scripts/generators/news-indexes/backfill.js +71 -4
- package/scripts/validate-article-seo.js +534 -0
- package/scripts/validate-mermaid-diagrams.js +306 -0
|
@@ -50,10 +50,15 @@ export declare function healJsonLdDescriptionCorruption(filenames: readonly stri
|
|
|
50
50
|
* @param slug - Article slug (used to derive the category)
|
|
51
51
|
* @param lang - Article language (ISO 639-1 lower-case code)
|
|
52
52
|
* @param description - Candidate description (resolver output preferred)
|
|
53
|
+
* @param options - Backfill options
|
|
54
|
+
* @param options.forceContextPrefix - Force date/language/category prefix
|
|
55
|
+
* even when the description is already substantive
|
|
53
56
|
* @returns Page-specific description, prefix-free when description is
|
|
54
57
|
 * already substantive and `options.forceContextPrefix` is not set
|
|
55
58
|
*/
|
|
56
|
-
export declare function buildLegacyBackfillDescription(date: string, slug: string, lang: string, description: string
|
|
59
|
+
export declare function buildLegacyBackfillDescription(date: string, slug: string, lang: string, description: string, options?: {
|
|
60
|
+
readonly forceContextPrefix?: boolean;
|
|
61
|
+
}): string;
|
|
57
62
|
/**
|
|
58
63
|
* Apply SEO meta tag replacements to a complete article HTML document.
|
|
59
64
|
*
|
|
@@ -17,6 +17,23 @@ import { formatSlug, parseArticleFilename, extractArticleMeta, escapeHTML, atomi
|
|
|
17
17
|
import { detectCategory } from '../../utils/article-category.js';
|
|
18
18
|
import { buildSeoKeywords, resolveArticleMetadata } from '../../aggregator/article-metadata.js';
|
|
19
19
|
const MIN_ARTICLE_DESCRIPTION_LENGTH = 120;
|
|
20
|
+
/**
 * Native-language display names keyed by lower-case language code.
 * Only consulted when building forced legacy backfill prefixes; codes
 * missing from this table fall back to the raw code at the lookup site.
 */
const LEGACY_LANGUAGE_LABELS = {
  // English and the Nordic languages
  en: 'English',
  sv: 'Svenska',
  da: 'Dansk',
  no: 'Norsk',
  fi: 'Suomi',
  // Other Latin-script European languages
  de: 'Deutsch',
  fr: 'Français',
  es: 'Español',
  nl: 'Nederlands',
  // Right-to-left scripts
  ar: 'العربية',
  he: 'עברית',
  // East Asian scripts
  ja: '日本語',
  ko: '한국어',
  zh: '中文',
};
|
|
20
37
|
/**
|
|
21
38
|
* Regex pattern that flags internal artefact identifiers
|
|
22
39
|
* (`<slug>-run<N>-<unix-ts>`). Used by
|
|
@@ -127,7 +144,9 @@ function backfillOneLegacyArticleSeo(filename, descriptions) {
|
|
|
127
144
|
? resolverDescription
|
|
128
145
|
: safeDescription || formatSlug(parsed.slug);
|
|
129
146
|
const description = needsDescription
|
|
130
|
-
? buildLegacyBackfillDescription(parsed.date, parsed.slug, parsed.lang, baseDescription
|
|
147
|
+
? buildLegacyBackfillDescription(parsed.date, parsed.slug, parsed.lang, baseDescription, {
|
|
148
|
+
forceContextPrefix: true,
|
|
149
|
+
})
|
|
131
150
|
: meta.description;
|
|
132
151
|
const keywords = entry?.keywords ?? fallbackKeywords;
|
|
133
152
|
const nextHtml = applyArticleSeoBackfill(html, description, keywords);
|
|
@@ -160,23 +179,71 @@ function backfillOneLegacyArticleSeo(filename, descriptions) {
|
|
|
160
179
|
* @param slug - Article slug (used to derive the category)
|
|
161
180
|
* @param lang - Article language (ISO 639-1 lower-case code)
|
|
162
181
|
* @param description - Candidate description (resolver output preferred)
|
|
182
|
+
* @param options - Backfill options
|
|
183
|
+
* @param options.forceContextPrefix - Force date/language/category prefix
|
|
184
|
+
* even when the description is already substantive
|
|
163
185
|
* @returns Page-specific description, prefix-free when description is
|
|
164
186
|
 * already substantive and `options.forceContextPrefix` is not set
|
|
165
187
|
*/
|
|
166
|
-
export function buildLegacyBackfillDescription(date, slug, lang, description) {
|
|
188
|
+
export function buildLegacyBackfillDescription(date, slug, lang, description, options = {}) {
  const trimmedDescription = description.trim();
  // A substantive description is returned as-is (only length-capped)
  // unless the caller explicitly forces the contextual prefix.
  const isSubstantive = trimmedDescription.length >= MIN_ARTICLE_DESCRIPTION_LENGTH;
  if (isSubstantive && !options.forceContextPrefix) {
    return capDescriptionLength(trimmedDescription);
  }

  const category = detectCategory(slug);
  const langCode = (lang || 'en').toLowerCase();
  const categoryLabels = getLocalizedString(ARTICLE_TYPE_LABELS, langCode);
  // Fall back to the humanised slug when the category has no localized label.
  const label = categoryLabels[category] ?? formatSlug(slug);
  const qualifier = buildLegacySlugQualifier(slug, label);
  const languageLabel = legacyLanguageLabel(langCode);

  // Assemble "date — language — category — qualifier", skipping empty parts.
  const prefixParts = [];
  for (const part of [date, languageLabel, label, qualifier]) {
    if (part.length > 0) prefixParts.push(part);
  }
  const prefix = prefixParts.join(' — ');

  // With no usable description the category label doubles as the body.
  const body = trimmedDescription || label;
  const contextual = `${prefix} — ${body}`.replace(/\s+/g, ' ').trim();
  return capDescriptionLength(contextual);
}
|
|
206
|
+
/**
 * Look up the display name for a language code in
 * `LEGACY_LANGUAGE_LABELS`. The label is what keeps otherwise-identical
 * legacy descriptions distinct across locales.
 *
 * Only own properties are consulted, so inherited keys such as
 * `constructor` or `toString` are never returned as labels.
 *
 * @param lang - Language code
 * @returns The table's label for `lang`, or `lang` itself when the code
 *   has no own string entry
 */
function legacyLanguageLabel(lang) {
  const own = Object.getOwnPropertyDescriptor(LEGACY_LANGUAGE_LABELS, lang);
  if (own === undefined) {
    return lang;
  }
  const { value } = own;
  return typeof value === 'string' ? value : lang;
}
|
|
217
|
+
/**
 * Derive an optional qualifier from the slug so that legacy pages with
 * the same date and article category (for example same-day `*-run2`
 * variants) still get distinct prefixes.
 *
 * @param slug - Article slug without date/language suffix
 * @param localizedLabel - Localized category label already present in prefix
 * @returns The formatted slug, or `''` when it is empty or normalizes to
 *   the same text as `localizedLabel`
 */
function buildLegacySlugQualifier(slug, localizedLabel) {
  const formatted = formatSlug(slug).trim();
  if (formatted.length === 0) {
    return '';
  }
  // Compare on normalized keys so punctuation/case differences do not
  // make a duplicate look like new information.
  const qualifierKey = normalizeLegacyQualifier(formatted);
  const labelKey = normalizeLegacyQualifier(localizedLabel);
  const addsInformation = qualifierKey.length > 0 && qualifierKey !== labelKey;
  return addsInformation ? formatted : '';
}
|
|
235
|
+
/**
 * Reduce a prefix component to a comparison key for duplicate detection:
 * lower-cased, with every run of characters that are neither Unicode
 * letters nor Unicode numbers collapsed to a single space, then trimmed.
 *
 * @param value - Candidate text
 * @returns Lower-case text containing only letters, numbers and single spaces
 */
function normalizeLegacyQualifier(value) {
  const lowered = value.toLowerCase();
  const collapsed = lowered.replace(/[^\p{L}\p{N}]+/gu, ' ');
  return collapsed.trim();
}
|
|
180
247
|
/**
|
|
181
248
|
* Clamp a description to the 180-character SERP-friendly cap with a
|
|
182
249
|
* trailing ellipsis when truncated. Extracted from
|
|
@@ -0,0 +1,534 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* @module scripts/validate-article-seo
|
|
7
|
+
* @description Hard CI gate for resolved `<title>` / `<meta description>`
|
|
8
|
+
* metadata that the deterministic article generator would emit for every
|
|
9
|
+
* executive brief under `analysis/daily/`. Companion to
|
|
10
|
+
* `validate-manifest-seo.js`: where that script audits the per-language
|
|
11
|
+
* `(title, description)` pairs committed *in manifest.json*, this one
|
|
12
|
+
* audits the **resolved output** of `resolveArticleMetadata()` — the same
|
|
13
|
+
* code path used by `npm run generate-article:all`.
|
|
14
|
+
*
|
|
15
|
+
* Why a separate gate? `manifest.json` is the Stage-B *input* contract.
|
|
16
|
+
* The Stage-D *output* contract is the resolved entry returned by
|
|
17
|
+
* `resolveArticleMetadata`, which threads through editorial-highlight
|
|
18
|
+
* extraction, BLUF-summary derivation, contextual title composition
|
|
19
|
+
* (including `— Run N` run-qualifier for same-date/same-articleType
|
|
20
|
+
* republishes), and CJK-aware length budgets. Bugs in any of those
|
|
21
|
+
* downstream layers can ship a degraded `<head>` even when manifest.json
|
|
22
|
+
* is clean.
|
|
23
|
+
*
|
|
24
|
+
* Gates applied per resolved entry (English only by default — the per-
|
|
25
|
+
* language outputs are validated by `validate-manifest-seo.js`):
|
|
26
|
+
*
|
|
27
|
+
* 1. **title-empty** — `entry.title` must be a non-empty string.
|
|
28
|
+
* 2. **title-length** — effective length in `[TITLE_MIN_LENGTH,
|
|
29
|
+
* TITLE_MAX_LENGTH]` after CJK 2× weighting.
|
|
30
|
+
* 3. **title-ellipsis** — must not end with `…` or `...` (mid-
|
|
31
|
+
* sentence truncation regression).
|
|
32
|
+
* 4. **description-empty** — `entry.description` must be a non-empty
|
|
33
|
+
* string.
|
|
34
|
+
* 5. **description-length** — effective length in `[DESCRIPTION_MIN_LENGTH,
|
|
35
|
+
* DESCRIPTION_MAX_LENGTH]` after CJK 2× weighting.
|
|
36
|
+
* 6. **description-ellipsis** — must not end with `…` / `...`.
|
|
37
|
+
* 7. **forbidden-prefix** — title/description must not start with a
|
|
38
|
+
* Stage-B preamble label (`Run:`, `Purpose:`, `BLUF:`, …).
|
|
39
|
+
* 8. **leaky-runid** — title/description must not contain
|
|
40
|
+
* internal run-id tokens or "analysis run" jargon.
|
|
41
|
+
* 9. **title-uniqueness** — when ≥2 runs share the same `(date,
|
|
42
|
+
* articleType)`, their resolved titles must differ (typically via
|
|
43
|
+
* the `— Run N` qualifier produced by `composeContextualTitle`).
|
|
44
|
+
* 10. **title-ellipsis-cut** — title rejected by resolver predicate
|
|
45
|
+
* `looksLikeEllipsisCut` (trailing `…` / `...`). Fires alongside
|
|
46
|
+
* `title-ellipsis` for backwards compatibility.
|
|
47
|
+
* 11. **title-doc-id** — title is a bare adopted-text doc-ID
|
|
48
|
+
* (`TA-NN-YYYY-NNNN`), never an editorial headline.
|
|
49
|
+
* 12. **title-section-header** — title is a bold-prose section header
|
|
50
|
+
* from `executive-brief.md` (`Strategic significance`,
|
|
51
|
+
* `Threat Level`, `Key Assumptions Check`, …).
|
|
52
|
+
* 13. **title-sentence-fragment** — title is a complete sentence
|
|
53
|
+
* (single trailing period, ≥4 words) leaked from a BLUF / lede
|
|
54
|
+
* paragraph rather than a noun-phrase headline.
|
|
55
|
+
* 14. **description-leaky-section-header** — description starts with
|
|
56
|
+
* a bold-prose section header label (lead phrase before `.:`).
|
|
57
|
+
*
|
|
58
|
+
* Gates 10–14 ship as `severity: 'advisory'` — they are surfaced in
|
|
59
|
+
* the JSON report's `totals.advisories` / `totals.byGateAdvisory` and
|
|
60
|
+
* printed with a ⚠️ prefix, but they do NOT count toward the
|
|
61
|
+
* fail-count or exit code. This lets the validator catch resolver
|
|
62
|
+
* regressions immediately without breaking CI on legacy articles that
|
|
63
|
+
* pre-date the resolver-tightening work. Promote to failure-class in a
|
|
64
|
+
* follow-up once the legacy data is regenerated.
|
|
65
|
+
*
|
|
66
|
+
* The process exits with code 1 if any failure-class violations exist
|
|
67
|
+
* (unless `--no-fail` is passed for advisory mode).
|
|
68
|
+
*
|
|
69
|
+
* Invocation:
|
|
70
|
+
* node scripts/validate-article-seo.js \
|
|
71
|
+
* [--repo-root <path>] \
|
|
72
|
+
* [--paths <runDir>...] # validate specific runDirs only
|
|
73
|
+
* [--lang en] # default 'en'
|
|
74
|
+
* [--report <path>] # write JSON report; default stdout
|
|
75
|
+
* [--no-fail] # exit 0 even when violations found
|
|
76
|
+
 *     [--quiet]                 # suppress per-run logging and the stdout JSON report
|
|
77
|
+
*/
|
|
78
|
+
|
|
79
|
+
import fs from 'node:fs';
import path from 'node:path';
import process from 'node:process';
import { pathToFileURL } from 'node:url';

import { ALL_LANGUAGES } from './constants/language-core.js';
import { discoverAnalysisRuns } from './aggregator/generator/discovery.js';
import { resolveRunSeo } from './dump-article-seo.js';
import {
  DESCRIPTION_MAX_LENGTH,
  DESCRIPTION_MIN_LENGTH,
  FORBIDDEN_PATTERNS,
  FORBIDDEN_PREFIXES,
  FORBIDDEN_SUBSTRINGS,
  TITLE_MAX_LENGTH,
  TITLE_MIN_LENGTH,
  detectForbiddenPrefix,
  detectLeakyRunIdOrJargon,
  effectiveTextLength,
} from './validate-manifest-seo.js';
import { aggregateByKey } from './validate-brief-translations.js';
import {
  findTitleRejectionReason,
  looksLikeSectionHeader,
} from './aggregator/metadata/title-rejection.js';
|
|
103
|
+
|
|
104
|
+
/**
 * Matches a value that ends in a literal `…` or in three ASCII periods,
 * optionally followed by trailing whitespace.
 */
export const TRAILING_ELLIPSIS_RE = /(?:…|\.{3})\s*$/u;

/**
 * Report the trailing ellipsis of a value, if it has one.
 *
 * @param {string} value
 * @returns {string | null} The matched tail (the ellipsis plus any
 *   trailing whitespace) for use in a violation message, or `null` when
 *   the value is not a string or does not end with an ellipsis.
 */
export function detectTrailingEllipsis(value) {
  if (typeof value === 'string') {
    const hit = value.match(TRAILING_ELLIPSIS_RE);
    if (hit !== null) {
      return hit[0];
    }
  }
  return null;
}
|
|
119
|
+
|
|
120
|
+
/**
 * Resolve one run via `resolveRunSeo` and apply the field gates to the
 * resolved entry's title and description. Violations are appended to
 * `ctx.violations`; nothing is thrown for a failed resolution.
 *
 * @param {object} ctx
 * @param {string} ctx.runDir - Run directory passed to `resolveRunSeo`
 * @param {string} ctx.repoRoot - Root used to relativize `runDir` in reports
 * @param {string} ctx.lang - Language passed to `resolveRunSeo`
 * @param {Array<object>} ctx.violations - Accumulator, mutated in place
 * @returns {ReturnType<typeof resolveRunSeo> | null} The resolved
 *   record, or `null` if resolution threw (in which case a `resolve`
 *   gate violation has been pushed).
 */
function validateOneRun(ctx) {
  const { runDir, repoRoot, lang, violations } = ctx;
  // Report paths repo-relative with forward slashes on every platform.
  const runDirRel = path.relative(repoRoot, runDir).split(path.sep).join('/');
  let record;
  try {
    record = resolveRunSeo({ runDir, repoRoot, lang });
  } catch (err) {
    // Anything can be thrown in JavaScript; avoid printing "undefined"
    // when the thrown value is not an Error instance.
    const reason = err instanceof Error ? err.message : String(err);
    violations.push({
      runDir: runDirRel,
      lang,
      gate: 'resolve',
      message: `resolveRunSeo failed: ${reason}`,
    });
    return null;
  }
  const { entry } = record;
  applyFieldGates({
    runDirRel,
    lang,
    kind: 'title',
    value: entry.title,
    minLen: TITLE_MIN_LENGTH,
    maxLen: TITLE_MAX_LENGTH,
    violations,
  });
  applyFieldGates({
    runDirRel,
    lang,
    kind: 'description',
    value: entry.description,
    minLen: DESCRIPTION_MIN_LENGTH,
    maxLen: DESCRIPTION_MAX_LENGTH,
    violations,
  });
  return record;
}
|
|
170
|
+
|
|
171
|
+
/**
 * Run the shared gates against one `(kind, value)` projection of a
 * resolved entry: empty, length, trailing ellipsis, forbidden prefix and
 * leaky run-id, followed by the kind-specific rejection gates. An empty
 * or non-string value records only the `-empty` violation and stops.
 *
 * @param {object} ctx
 * @param {string} ctx.runDirRel - Repo-relative run directory for reports
 * @param {string} ctx.lang
 * @param {'title' | 'description'} ctx.kind
 * @param {string} ctx.value
 * @param {number} ctx.minLen - Inclusive lower bound on effective length
 * @param {number} ctx.maxLen - Inclusive upper bound on effective length
 * @param {Array<object>} ctx.violations - Accumulator, mutated in place
 */
function applyFieldGates(ctx) {
  const { runDirRel, lang, kind, value, minLen, maxLen, violations } = ctx;
  const flag = (gate, message) => {
    violations.push({ runDir: runDirRel, lang, gate, message });
  };

  if (typeof value !== 'string' || value.trim().length === 0) {
    flag(`${kind}-empty`, `${kind} resolved to empty / non-string value`);
    return;
  }

  const effective = effectiveTextLength(value);
  if (effective < minLen || effective > maxLen) {
    flag(`${kind}-length`, `${kind} has effective length ${effective}; expected ${minLen}–${maxLen}`);
  }

  const tail = detectTrailingEllipsis(value);
  if (tail) {
    flag(`${kind}-ellipsis`, `${kind} ends with "${tail}" — mid-sentence truncation is forbidden`);
  }

  const reservedLabel = detectForbiddenPrefix(value);
  if (reservedLabel) {
    flag('forbidden-prefix', `${kind} begins with reserved Stage-B label "${reservedLabel}"`);
  }

  const internalToken = detectLeakyRunIdOrJargon(value);
  if (internalToken) {
    flag('leaky-runid', `${kind} contains internal token "${internalToken}"`);
  }

  applyKindSpecificRejectionGates(ctx);
}
|
|
233
|
+
|
|
234
|
+
/**
 * Evaluate the resolver-aligned rejection gates for a single
 * `(kind, value)` pair and return the gate names that fire. Does not
 * touch any violation accumulator; used by both
 * {@link applyKindSpecificRejectionGates} and the unit tests to keep
 * validator and resolver in lock-step.
 *
 * Titles are checked with `findTitleRejectionReason`; a returned reason
 * becomes the gate `title-<reason>`. Descriptions are checked for a
 * leading section-header label.
 *
 * @param {'title' | 'description'} kind
 * @param {string} value
 * @returns {string[]} Zero or more gate names (e.g. `title-doc-id`,
 *   `description-leaky-section-header`). Empty for a non-string or empty
 *   value and for any other `kind`.
 */
export function evaluateResolverRejectionGates(kind, value) {
  if (typeof value !== 'string' || value.length === 0) return [];
  if (kind === 'title') {
    const reason = findTitleRejectionReason(value);
    return reason ? [`title-${reason}`] : [];
  }
  if (kind === 'description' && detectDescriptionLeadSectionHeader(value)) {
    return ['description-leaky-section-header'];
  }
  return [];
}

/**
 * Apply the resolver-aligned title-rejection predicates from
 * `src/aggregator/metadata/title-rejection.ts`. Keeping validator and
 * resolver in lock-step ensures the same regression that the resolver
 * already rejects can never sneak through into a shipped `<title>` or
 * `<meta description>`.
 *
 * Emitted gates (deferred PR #2163 follow-up):
 *
 *   - `title-ellipsis-cut`        — trailing `…` / `...`
 *   - `title-doc-id`              — bare `TA-NN-YYYY-NNNN` doc-ID
 *   - `title-section-header`      — bold-prose section header
 *   - `title-sentence-fragment`   — leaked complete sentence
 *   - `description-leaky-section-header` — description starts with a
 *     bold-prose section header label (`Strategic significance: …`,
 *     `Threat Level: …`, …)
 *
 * Every violation pushed here carries `severity: 'advisory'`.
 *
 * @param {object} ctx
 * @param {string} ctx.runDirRel
 * @param {string} ctx.lang
 * @param {'title' | 'description'} ctx.kind
 * @param {string} ctx.value
 * @param {Array<object>} ctx.violations - Accumulator, mutated in place
 */
function applyKindSpecificRejectionGates(ctx) {
  const { runDirRel, lang, kind, value, violations } = ctx;
  const gates = evaluateResolverRejectionGates(kind, value);
  for (const gate of gates) {
    violations.push({
      runDir: runDirRel,
      lang,
      gate,
      // The 5 resolver-aligned gates ship as `advisory` so the validator
      // can surface them in the report (and unit tests can lock the
      // signal) without immediately failing CI on the 25 legacy
      // articles that pre-date the resolver-tightening work. Promote to
      // failure-class in a follow-up once those runs are regenerated.
      severity: 'advisory',
      message:
        gate === 'description-leaky-section-header'
          ? `description starts with a bold-prose section header: "${value}"`
          : `title rejected by resolver predicate (${gate.slice('title-'.length)}): "${value}"`,
    });
  }
}
|
|
302
|
+
|
|
303
|
+
/**
 * Decide whether a description opens with a section-header label.
 *
 * The lead phrase is everything before the first `.`, `:`, `!`, `?` or
 * `…`; when the trimmed text starts with one of those characters the
 * whole trimmed text is used instead. The lead is then handed to the
 * canonical {@link looksLikeSectionHeader} predicate.
 *
 * @param {string} value
 * @returns {boolean} `false` for blank input, otherwise the predicate's
 *   verdict on the lead phrase
 */
function detectDescriptionLeadSectionHeader(value) {
  const text = value.trim();
  if (text.length === 0) {
    return false;
  }
  const leadMatch = /^([^.:!?\u2026]+)/u.exec(text);
  const leadPhrase = leadMatch === null ? text : leadMatch[1];
  return looksLikeSectionHeader(leadPhrase.trim());
}
|
|
319
|
+
|
|
320
|
+
/**
|
|
321
|
+
* After resolving every run, detect duplicate titles within the same
|
|
322
|
+
* `(date, articleType)` collision group. The `— Run N` qualifier from
|
|
323
|
+
* `composeContextualTitle` is the contracted differentiator for
|
|
324
|
+
* same-date/same-articleType republishes; missing it is a
|
|
325
|
+
* uniqueness-gate failure.
|
|
326
|
+
*
|
|
327
|
+
* @param {Array<{record: ReturnType<typeof resolveRunSeo>, lang: string}>} resolvedList
|
|
328
|
+
* @param {Array<object>} violations
|
|
329
|
+
*/
|
|
330
|
+
function detectDuplicateTitles(resolvedList, violations) {
|
|
331
|
+
/** @type {Map<string, Array<{record: any, lang: string}>>} */
|
|
332
|
+
const groups = new Map();
|
|
333
|
+
for (const item of resolvedList) {
|
|
334
|
+
const key = `${item.record.date}|${item.record.articleType}`;
|
|
335
|
+
const bucket = groups.get(key);
|
|
336
|
+
if (bucket) bucket.push(item);
|
|
337
|
+
else groups.set(key, [item]);
|
|
338
|
+
}
|
|
339
|
+
for (const [key, items] of groups.entries()) {
|
|
340
|
+
if (items.length < 2) continue;
|
|
341
|
+
/** @type {Map<string, string[]>} */
|
|
342
|
+
const byTitle = new Map();
|
|
343
|
+
for (const it of items) {
|
|
344
|
+
const t = it.record.entry.title;
|
|
345
|
+
const bucket = byTitle.get(t);
|
|
346
|
+
if (bucket) bucket.push(it.record.runDirRel);
|
|
347
|
+
else byTitle.set(t, [it.record.runDirRel]);
|
|
348
|
+
}
|
|
349
|
+
for (const [title, dirs] of byTitle.entries()) {
|
|
350
|
+
if (dirs.length < 2) continue;
|
|
351
|
+
violations.push({
|
|
352
|
+
runDir: dirs.join(', '),
|
|
353
|
+
lang: items[0].lang,
|
|
354
|
+
gate: 'title-uniqueness',
|
|
355
|
+
affectedRuns: dirs,
|
|
356
|
+
message:
|
|
357
|
+
`${dirs.length} runs in collision group "${key}" share the ` +
|
|
358
|
+
`resolved title "${title}" — composeContextualTitle must ` +
|
|
359
|
+
`append a "— Run N" qualifier (or equivalent) so the per-run ` +
|
|
360
|
+
`articles are distinguishable in SERPs and social cards`,
|
|
361
|
+
});
|
|
362
|
+
}
|
|
363
|
+
}
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
/**
 * Validate every run directory in turn, then run the cross-run title
 * uniqueness check over the runs that resolved.
 *
 * Unless `quiet` is set, each violation is printed as it is found:
 * advisory entries go to stdout with a ⚠️ marker, all others to stderr
 * with a ❌ marker, and a run with no violations prints a ✅ line.
 *
 * @param {string[]} runDirs - Run directories to validate
 * @param {string} repoRoot - Root used to relativize paths in output
 * @param {{ quiet?: boolean, lang?: string }} options
 * @returns {Array<object>} Every violation collected, advisory and failing
 */
export function runValidation(runDirs, repoRoot, { quiet = false, lang = 'en' } = {}) {
  const allViolations = [];
  /** @type {Array<{record: ReturnType<typeof resolveRunSeo>, lang: string}>} */
  const resolved = [];

  // One formatter for both the per-run and the uniqueness output.
  const print = (label, violation) => {
    const isAdvisory = violation.severity === 'advisory';
    const marker = isAdvisory ? '⚠️ ' : '❌';
    const out = isAdvisory ? process.stdout : process.stderr;
    out.write(`${marker} ${label} [${violation.gate}] ${violation.message}\n`);
  };

  for (const runDir of runDirs) {
    const mark = allViolations.length;
    const record = validateOneRun({ runDir, repoRoot, lang, violations: allViolations });
    if (record) resolved.push({ record, lang });
    if (quiet) continue;

    const label = path.relative(repoRoot, runDir).split(path.sep).join('/');
    const fresh = allViolations.slice(mark);
    if (fresh.length === 0) {
      process.stdout.write(`✅ ${label}\n`);
      continue;
    }
    for (const violation of fresh) print(label, violation);
  }

  const uniquenessMark = allViolations.length;
  detectDuplicateTitles(resolved, allViolations);
  if (!quiet) {
    for (const violation of allViolations.slice(uniquenessMark)) {
      print(violation.runDir, violation);
    }
  }
  return allViolations;
}
|
|
406
|
+
|
|
407
|
+
/**
 * Parse CLI argv. Exported for unit tests.
 *
 * Recognised flags: `--repo-root <path>`, `--paths <runDir>... [--]`,
 * `--lang <code>`, `--report <path>`, `--no-fail`, `--quiet`, and
 * `--help` / `-h` (prints usage and exits with code 0). Arguments that
 * are not flags and not consumed by a flag are ignored.
 *
 * @param {string[]} argv - Arguments after the script path
 * @returns {{ repoRoot: string, paths: string[], lang: string,
 *   report: string | null, fail: boolean, quiet: boolean }}
 * @throws {Error} On an unknown `--flag`, on a value-taking flag whose
 *   value is missing (end of argv, or followed by another `--flag`),
 *   and on a `--lang` that is not in `ALL_LANGUAGES`.
 */
export function parseArgs(argv) {
  const opts = {
    repoRoot: process.cwd(),
    paths: [],
    lang: 'en',
    report: null,
    fail: true,
    quiet: false,
  };
  // Read the value following a flag. Without this check a trailing
  // `--report` stored `undefined`, and `--report --quiet` silently used
  // "--quiet" as the report path.
  const valueAfter = (index) => {
    const value = argv[index + 1];
    if (typeof value !== 'string' || value.startsWith('--')) {
      throw new Error(`Missing value for ${argv[index]}`);
    }
    return value;
  };
  for (let i = 0; i < argv.length; i += 1) {
    const arg = argv[i];
    switch (arg) {
      case '--repo-root':
        opts.repoRoot = valueAfter(i);
        i += 1;
        break;
      case '--paths':
        // Consume values up to the next flag; a bare `--` ends the list
        // and is itself consumed.
        while (i + 1 < argv.length) {
          const next = argv[i + 1];
          if (next === '--') {
            i += 1;
            break;
          }
          if (next.startsWith('--')) break;
          opts.paths.push(next);
          i += 1;
        }
        break;
      case '--lang':
        opts.lang = valueAfter(i);
        i += 1;
        break;
      case '--report':
        opts.report = valueAfter(i);
        i += 1;
        break;
      case '--no-fail':
        opts.fail = false;
        break;
      case '--quiet':
        opts.quiet = true;
        break;
      case '--help':
      case '-h':
        process.stdout.write(
          'Usage: validate-article-seo.js [--repo-root <path>] ' +
            '[--paths <runDir>... [--]] [--lang <code>] [--report <path>] [--no-fail] [--quiet]\n'
        );
        process.exit(0);
        break;
      default:
        if (arg.startsWith('--')) {
          throw new Error(`Unknown flag: ${arg}`);
        }
    }
  }
  if (!ALL_LANGUAGES.includes(opts.lang)) {
    throw new Error(`Unsupported --lang "${opts.lang}"`);
  }
  return opts;
}
|
|
471
|
+
|
|
472
|
+
/**
 * Main entry point: parse argv, validate the selected runs, emit the
 * JSON report and set the exit status.
 *
 * Runs come from `--paths` (resolved against the repo root) when given,
 * otherwise from `discoverAnalysisRuns`. The report is written to the
 * `--report` path when set; otherwise it is printed to stdout unless
 * `--quiet` is set. Advisory violations appear in the report but do not
 * count toward `totals.violations` or the exit code.
 *
 * Calls `process.exit(1)` when failure-class violations exist and
 * `--no-fail` was not passed.
 *
 * @param {string[]} argv - Arguments after the script path
 * @returns {object} The report object that was serialized
 */
export function main(argv) {
  const opts = parseArgs(argv);

  let runDirs;
  if (opts.paths.length > 0) {
    runDirs = opts.paths.map((p) => path.resolve(opts.repoRoot, p));
  } else {
    runDirs = discoverAnalysisRuns(opts.repoRoot).map((r) => r.runDir);
  }

  const violations = runValidation(runDirs, opts.repoRoot, {
    quiet: opts.quiet,
    lang: opts.lang,
  });

  // Split once into failure-class and advisory entries, preserving order.
  const failingViolations = [];
  const advisoryViolations = [];
  for (const violation of violations) {
    if (violation.severity === 'advisory') advisoryViolations.push(violation);
    else failingViolations.push(violation);
  }

  const report = {
    generatedAt: new Date().toISOString(),
    lang: opts.lang,
    totals: {
      runsChecked: runDirs.length,
      violations: failingViolations.length,
      advisories: advisoryViolations.length,
      byGate: aggregateByKey(failingViolations, 'gate'),
      byGateAdvisory: aggregateByKey(advisoryViolations, 'gate'),
    },
    violations,
  };

  const json = `${JSON.stringify(report, null, 2)}\n`;
  if (opts.report) {
    fs.mkdirSync(path.dirname(opts.report), { recursive: true });
    fs.writeFileSync(opts.report, json);
  } else if (!opts.quiet) {
    process.stdout.write(json);
  }

  const mustFail = failingViolations.length > 0 && opts.fail;
  if (mustFail) {
    process.exit(1);
  }
  return report;
}
|
|
513
|
+
|
|
514
|
+
/* c8 ignore start */
// Run `main` only when this module is the script Node was started with.
// `import.meta.url` is a percent-encoded `file:` URL, so the script path
// has to be converted with `pathToFileURL` before comparing; a plain
// `file://${path}` string never matches on Windows (drive letters,
// backslashes) or for paths containing spaces or non-ASCII characters.
// `process.argv[1]` can be absent (e.g. `node -e`), hence the guard.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  try {
    main(process.argv.slice(2));
  } catch (err) {
    // Thrown values are not guaranteed to be Error instances.
    const reason = err instanceof Error ? err.message : String(err);
    process.stderr.write(`validate-article-seo: ${reason}\n`);
    process.exit(1);
  }
}
/* c8 ignore stop */
|
|
524
|
+
|
|
525
|
+
// Re-export for tests
|
|
526
|
+
export {
|
|
527
|
+
FORBIDDEN_PATTERNS,
|
|
528
|
+
FORBIDDEN_PREFIXES,
|
|
529
|
+
FORBIDDEN_SUBSTRINGS,
|
|
530
|
+
TITLE_MAX_LENGTH,
|
|
531
|
+
TITLE_MIN_LENGTH,
|
|
532
|
+
DESCRIPTION_MAX_LENGTH,
|
|
533
|
+
DESCRIPTION_MIN_LENGTH,
|
|
534
|
+
};
|