euparliamentmonitor 0.9.21 → 0.9.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/package.json +6 -2
  2. package/scripts/aggregator/article-metadata.js +69 -14
  3. package/scripts/aggregator/editorial-brief-resolver.js +23 -0
  4. package/scripts/aggregator/html/headline.d.ts +41 -9
  5. package/scripts/aggregator/html/headline.js +69 -10
  6. package/scripts/aggregator/html/shell.js +73 -17
  7. package/scripts/aggregator/manifest/index.d.ts +1 -1
  8. package/scripts/aggregator/manifest/index.js +1 -1
  9. package/scripts/aggregator/manifest/resolver.d.ts +28 -1
  10. package/scripts/aggregator/manifest/resolver.js +61 -5
  11. package/scripts/aggregator/markdown-renderer.js +11 -0
  12. package/scripts/aggregator/metadata/artifact-category-heading.d.ts +81 -0
  13. package/scripts/aggregator/metadata/artifact-category-heading.js +353 -0
  14. package/scripts/aggregator/metadata/artifact-walker.js +29 -10
  15. package/scripts/aggregator/metadata/brief-body.d.ts +12 -0
  16. package/scripts/aggregator/metadata/brief-body.js +69 -0
  17. package/scripts/aggregator/metadata/briefing-highlight.d.ts +47 -0
  18. package/scripts/aggregator/metadata/briefing-highlight.js +469 -0
  19. package/scripts/aggregator/metadata/editorial-highlight.d.ts +18 -0
  20. package/scripts/aggregator/metadata/editorial-highlight.js +40 -1
  21. package/scripts/aggregator/metadata/heading-rules.d.ts +2 -81
  22. package/scripts/aggregator/metadata/heading-rules.js +78 -269
  23. package/scripts/aggregator/metadata/keyword-filters.d.ts +60 -0
  24. package/scripts/aggregator/metadata/keyword-filters.js +156 -0
  25. package/scripts/aggregator/metadata/lede-extractor.js +11 -2
  26. package/scripts/aggregator/metadata/priority-finding-cleaning.d.ts +22 -0
  27. package/scripts/aggregator/metadata/priority-finding-cleaning.js +181 -0
  28. package/scripts/aggregator/metadata/priority-finding-highlight.js +75 -159
  29. package/scripts/aggregator/metadata/resolve-helpers.d.ts +34 -0
  30. package/scripts/aggregator/metadata/resolve-helpers.js +202 -15
  31. package/scripts/aggregator/metadata/seo-budgets.d.ts +140 -0
  32. package/scripts/aggregator/metadata/seo-budgets.js +202 -0
  33. package/scripts/aggregator/metadata/text-truncate.d.ts +75 -0
  34. package/scripts/aggregator/metadata/text-truncate.js +277 -0
  35. package/scripts/aggregator/metadata/text-utils-constants.d.ts +96 -0
  36. package/scripts/aggregator/metadata/text-utils-constants.js +209 -0
  37. package/scripts/aggregator/metadata/text-utils.d.ts +32 -143
  38. package/scripts/aggregator/metadata/text-utils.js +119 -439
  39. package/scripts/aggregator/metadata/title-rejection.d.ts +37 -0
  40. package/scripts/aggregator/metadata/title-rejection.js +179 -0
  41. package/scripts/copy-vendor.js +84 -112
  42. package/scripts/dump-article-seo.js +640 -0
  43. package/scripts/fix-mermaid-diagrams.js +931 -0
  44. package/scripts/generators/news-indexes/backfill.d.ts +6 -1
  45. package/scripts/generators/news-indexes/backfill.js +71 -4
  46. package/scripts/validate-article-seo.js +534 -0
  47. package/scripts/validate-mermaid-diagrams.js +306 -0
@@ -50,10 +50,15 @@ export declare function healJsonLdDescriptionCorruption(filenames: readonly stri
50
50
  * @param slug - Article slug (used to derive the category)
51
51
  * @param lang - Article language (ISO 639-1 lower-case code)
52
52
  * @param description - Candidate description (resolver output preferred)
53
+ * @param options - Backfill options
54
+ * @param options.forceContextPrefix - Force date/language/category prefix
55
+ * even when the description is already substantive
53
56
  * @returns Page-specific description, prefix-free when description is
54
57
  * already substantive and `options.forceContextPrefix` is not set
55
58
  */
56
- export declare function buildLegacyBackfillDescription(date: string, slug: string, lang: string, description: string): string;
59
+ export declare function buildLegacyBackfillDescription(date: string, slug: string, lang: string, description: string, options?: {
60
+ readonly forceContextPrefix?: boolean;
61
+ }): string;
57
62
  /**
58
63
  * Apply SEO meta tag replacements to a complete article HTML document.
59
64
  *
@@ -17,6 +17,23 @@ import { formatSlug, parseArticleFilename, extractArticleMeta, escapeHTML, atomi
17
17
  import { detectCategory } from '../../utils/article-category.js';
18
18
  import { buildSeoKeywords, resolveArticleMetadata } from '../../aggregator/article-metadata.js';
19
19
  const MIN_ARTICLE_DESCRIPTION_LENGTH = 120;
20
+ /** Language labels used only in forced legacy backfill prefixes. */
21
+ const LEGACY_LANGUAGE_LABELS = {
22
+ en: 'English',
23
+ sv: 'Svenska',
24
+ da: 'Dansk',
25
+ no: 'Norsk',
26
+ fi: 'Suomi',
27
+ de: 'Deutsch',
28
+ fr: 'Français',
29
+ es: 'Español',
30
+ nl: 'Nederlands',
31
+ ar: 'العربية',
32
+ he: 'עברית',
33
+ ja: '日本語',
34
+ ko: '한국어',
35
+ zh: '中文',
36
+ };
20
37
  /**
21
38
  * Regex pattern that flags internal artefact identifiers
22
39
  * (`<slug>-run<N>-<unix-ts>`). Used by
@@ -127,7 +144,9 @@ function backfillOneLegacyArticleSeo(filename, descriptions) {
127
144
  ? resolverDescription
128
145
  : safeDescription || formatSlug(parsed.slug);
129
146
  const description = needsDescription
130
- ? buildLegacyBackfillDescription(parsed.date, parsed.slug, parsed.lang, baseDescription)
147
+ ? buildLegacyBackfillDescription(parsed.date, parsed.slug, parsed.lang, baseDescription, {
148
+ forceContextPrefix: true,
149
+ })
131
150
  : meta.description;
132
151
  const keywords = entry?.keywords ?? fallbackKeywords;
133
152
  const nextHtml = applyArticleSeoBackfill(html, description, keywords);
@@ -160,23 +179,71 @@ function backfillOneLegacyArticleSeo(filename, descriptions) {
160
179
  * @param slug - Article slug (used to derive the category)
161
180
  * @param lang - Article language (ISO 639-1 lower-case code)
162
181
  * @param description - Candidate description (resolver output preferred)
182
+ * @param options - Backfill options
183
+ * @param options.forceContextPrefix - Force date/language/category prefix
184
+ * even when the description is already substantive
163
185
  * @returns Page-specific description, prefix-free when description is
164
186
  * already substantive and `options.forceContextPrefix` is not set
165
187
  */
166
- export function buildLegacyBackfillDescription(date, slug, lang, description) {
188
+ export function buildLegacyBackfillDescription(date, slug, lang, description, options = {}) {
167
189
  const trimmedDescription = description.trim();
168
- if (trimmedDescription.length >= MIN_ARTICLE_DESCRIPTION_LENGTH) {
190
+ if (trimmedDescription.length >= MIN_ARTICLE_DESCRIPTION_LENGTH && !options.forceContextPrefix) {
169
191
  return capDescriptionLength(trimmedDescription);
170
192
  }
171
193
  const category = detectCategory(slug);
172
194
  const langCode = (lang || 'en').toLowerCase();
173
195
  const categoryLabels = getLocalizedString(ARTICLE_TYPE_LABELS, langCode);
174
196
  const label = categoryLabels[category] ?? formatSlug(slug);
175
- const prefix = `${date} — ${label}`;
197
+ const qualifier = buildLegacySlugQualifier(slug, label);
198
+ const languageLabel = legacyLanguageLabel(langCode);
199
+ const prefix = [date, languageLabel, label, qualifier]
200
+ .filter((part) => part.length > 0)
201
+ .join(' — ');
176
202
  const body = trimmedDescription || label;
177
203
  const contextual = `${prefix} — ${body}`.replace(/\s+/g, ' ').trim();
178
204
  return capDescriptionLength(contextual);
179
205
  }
206
+ /**
207
+ * Resolve the human language label used to make otherwise-identical
208
+ * cross-locale legacy descriptions unique.
209
+ *
210
+ * @param lang - Language code
211
+ * @returns Local language name, or the raw code if unknown
212
+ */
213
+ function legacyLanguageLabel(lang) {
214
+ const descriptor = Object.getOwnPropertyDescriptor(LEGACY_LANGUAGE_LABELS, lang);
215
+ return typeof descriptor?.value === 'string' ? descriptor.value : lang;
216
+ }
217
+ /**
218
+ * Build an optional slug-derived qualifier for legacy pages that share the
219
+ * same date and article category (for example same-day `*-run2` variants).
220
+ *
221
+ * @param slug - Article slug without date/language suffix
222
+ * @param localizedLabel - Localized category label already present in prefix
223
+ * @returns Human-readable qualifier, or empty when it would duplicate label
224
+ */
225
+ function buildLegacySlugQualifier(slug, localizedLabel) {
226
+ const formatted = formatSlug(slug).trim();
227
+ if (!formatted)
228
+ return '';
229
+ const normalizedFormatted = normalizeLegacyQualifier(formatted);
230
+ const normalizedLabel = normalizeLegacyQualifier(localizedLabel);
231
+ if (!normalizedFormatted || normalizedFormatted === normalizedLabel)
232
+ return '';
233
+ return formatted;
234
+ }
235
+ /**
236
+ * Normalize a prefix component for duplicate detection.
237
+ *
238
+ * @param value - Candidate text
239
+ * @returns Lower-case alphanumeric text
240
+ */
241
+ function normalizeLegacyQualifier(value) {
242
+ return value
243
+ .toLowerCase()
244
+ .replace(/[^\p{L}\p{N}]+/gu, ' ')
245
+ .trim();
246
+ }
180
247
  /**
181
248
  * Clamp a description to the 180-character SERP-friendly cap with a
182
249
  * trailing ellipsis when truncated. Extracted from
@@ -0,0 +1,534 @@
1
+ #!/usr/bin/env node
2
+ // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ /**
6
+ * @module scripts/validate-article-seo
7
+ * @description Hard CI gate for resolved `<title>` / `<meta description>`
8
+ * metadata that the deterministic article generator would emit for every
9
+ * executive brief under `analysis/daily/`. Companion to
10
+ * `validate-manifest-seo.js`: where that script audits the per-language
11
+ * `(title, description)` pairs committed *in manifest.json*, this one
12
+ * audits the **resolved output** of `resolveArticleMetadata()` — the same
13
+ * code path used by `npm run generate-article:all`.
14
+ *
15
+ * Why a separate gate? `manifest.json` is the Stage-B *input* contract.
16
+ * The Stage-D *output* contract is the resolved entry returned by
17
+ * `resolveArticleMetadata`, which threads through editorial-highlight
18
+ * extraction, BLUF-summary derivation, contextual title composition
19
+ * (including `— Run N` run-qualifier for same-date/same-articleType
20
+ * republishes), and CJK-aware length budgets. Bugs in any of those
21
+ * downstream layers can ship a degraded `<head>` even when manifest.json
22
+ * is clean.
23
+ *
24
+ * Gates applied per resolved entry (English only by default — the per-
25
+ * language outputs are validated by `validate-manifest-seo.js`):
26
+ *
27
+ * 1. **title-empty** — `entry.title` must be a non-empty string.
28
+ * 2. **title-length** — effective length in `[TITLE_MIN_LENGTH,
29
+ * TITLE_MAX_LENGTH]` after CJK 2× weighting.
30
+ * 3. **title-ellipsis** — must not end with `…` or `...` (mid-
31
+ * sentence truncation regression).
32
+ * 4. **description-empty** — `entry.description` must be a non-empty
33
+ * string.
34
+ * 5. **description-length** — effective length in `[DESCRIPTION_MIN_LENGTH,
35
+ * DESCRIPTION_MAX_LENGTH]` after CJK 2× weighting.
36
+ * 6. **description-ellipsis** — must not end with `…` / `...`.
37
+ * 7. **forbidden-prefix** — title/description must not start with a
38
+ * Stage-B preamble label (`Run:`, `Purpose:`, `BLUF:`, …).
39
+ * 8. **leaky-runid** — title/description must not contain
40
+ * internal run-id tokens or "analysis run" jargon.
41
+ * 9. **title-uniqueness** — when ≥2 runs share the same `(date,
42
+ * articleType)`, their resolved titles must differ (typically via
43
+ * the `— Run N` qualifier produced by `composeContextualTitle`).
44
+ * 10. **title-ellipsis-cut** — title rejected by resolver predicate
45
+ * `looksLikeEllipsisCut` (trailing `…` / `...`). Fires alongside
46
+ * `title-ellipsis` for backwards compatibility.
47
+ * 11. **title-doc-id** — title is a bare adopted-text doc-ID
48
+ * (`TA-NN-YYYY-NNNN`), never an editorial headline.
49
+ * 12. **title-section-header** — title is a bold-prose section header
50
+ * from `executive-brief.md` (`Strategic significance`,
51
+ * `Threat Level`, `Key Assumptions Check`, …).
52
+ * 13. **title-sentence-fragment** — title is a complete sentence
53
+ * (single trailing period, ≥4 words) leaked from a BLUF / lede
54
+ * paragraph rather than a noun-phrase headline.
55
+ * 14. **description-leaky-section-header** — description starts with
56
+ * a bold-prose section header label (lead phrase before `.:`).
57
+ *
58
+ * Gates 10–14 ship as `severity: 'advisory'` — they are surfaced in
59
+ * the JSON report's `totals.advisories` / `totals.byGateAdvisory` and
60
+ * printed with a ⚠️ prefix, but they do NOT count toward the
61
+ * fail-count or exit code. This lets the validator catch resolver
62
+ * regressions immediately without breaking CI on legacy articles that
63
+ * pre-date the resolver-tightening work. Promote to failure-class in a
64
+ * follow-up once the legacy data is regenerated.
65
+ *
66
+ * The process exits with code 1 if any failure-class violations exist
67
+ * (unless `--no-fail` is passed for advisory mode).
68
+ *
69
+ * Invocation:
70
+ * node scripts/validate-article-seo.js \
71
+ * [--repo-root <path>] \
72
+ * [--paths <runDir>...] # validate specific runDirs only
73
+ * [--lang en] # default 'en'
74
+ * [--report <path>] # write JSON report; default stdout
75
+ * [--no-fail] # exit 0 even when violations found
76
+ * [--quiet] # suppress per-run logging
77
+ */
78
+
79
import fs from 'node:fs';
import path from 'node:path';
import process from 'node:process';
import { pathToFileURL } from 'node:url';

import { ALL_LANGUAGES } from './constants/language-core.js';
import { discoverAnalysisRuns } from './aggregator/generator/discovery.js';
import { resolveRunSeo } from './dump-article-seo.js';
import {
  DESCRIPTION_MAX_LENGTH,
  DESCRIPTION_MIN_LENGTH,
  FORBIDDEN_PATTERNS,
  FORBIDDEN_PREFIXES,
  FORBIDDEN_SUBSTRINGS,
  TITLE_MAX_LENGTH,
  TITLE_MIN_LENGTH,
  detectForbiddenPrefix,
  detectLeakyRunIdOrJargon,
  effectiveTextLength,
} from './validate-manifest-seo.js';
import { aggregateByKey } from './validate-brief-translations.js';
import {
  findTitleRejectionReason,
  looksLikeSectionHeader,
} from './aggregator/metadata/title-rejection.js';
103
+
104
+ /** Trailing-ellipsis detector: literal `…` or three ASCII periods. */
105
+ export const TRAILING_ELLIPSIS_RE = /(?:…|\.{3})\s*$/u;
106
+
107
+ /**
108
+ * Detect whether a value ends with an ellipsis. Returns the matching
109
+ * fragment for the violation message, or null when the value is clean.
110
+ *
111
+ * @param {string} value
112
+ * @returns {string | null}
113
+ */
114
+ export function detectTrailingEllipsis(value) {
115
+ if (typeof value !== 'string') return null;
116
+ const m = TRAILING_ELLIPSIS_RE.exec(value);
117
+ return m ? m[0] : null;
118
+ }
119
+
120
/**
 * Resolve a single run via `resolveRunSeo` and apply the per-field SEO
 * gates to the resolved entry's `title` and `description`. Violations
 * are appended to `ctx.violations` in place.
 *
 * @param {object} ctx
 * @param {string} ctx.runDir - Run directory to resolve
 * @param {string} ctx.repoRoot - Repository root; used to derive the
 *   forward-slash relative run path reported in violations
 * @param {string} ctx.lang - Language code passed to the resolver
 * @param {Array<object>} ctx.violations - Accumulator, mutated in place
 * @returns {ReturnType<typeof resolveRunSeo> | null} The resolved
 *   record, or null if resolution threw — in which case a `resolve`
 *   gate violation has been pushed.
 */
function validateOneRun(ctx) {
  const { runDir, repoRoot, lang, violations } = ctx;
  // Normalise to forward slashes so reports are identical across platforms.
  const runDirRel = path.relative(repoRoot, runDir).split(path.sep).join('/');
  let record;
  try {
    record = resolveRunSeo({ runDir, repoRoot, lang });
  } catch (err) {
    // Any value can be thrown; only Error instances are guaranteed to
    // carry `.message`, so fall back to the string form otherwise.
    const reason = err instanceof Error ? err.message : String(err);
    violations.push({
      runDir: runDirRel,
      lang,
      gate: 'resolve',
      message: `resolveRunSeo failed: ${reason}`,
    });
    return null;
  }
  const { entry } = record;
  applyFieldGates({
    runDirRel,
    lang,
    kind: 'title',
    value: entry.title,
    minLen: TITLE_MIN_LENGTH,
    maxLen: TITLE_MAX_LENGTH,
    violations,
  });
  applyFieldGates({
    runDirRel,
    lang,
    kind: 'description',
    value: entry.description,
    minLen: DESCRIPTION_MIN_LENGTH,
    maxLen: DESCRIPTION_MAX_LENGTH,
    violations,
  });
  return record;
}
170
+
171
+ /**
172
+ * Apply the empty / length / ellipsis / forbidden-prefix / leaky-runid
173
+ * gates to a single (kind, value) projection.
174
+ *
175
+ * @param {object} ctx
176
+ * @param {string} ctx.runDirRel
177
+ * @param {string} ctx.lang
178
+ * @param {'title' | 'description'} ctx.kind
179
+ * @param {string} ctx.value
180
+ * @param {number} ctx.minLen
181
+ * @param {number} ctx.maxLen
182
+ * @param {Array<object>} ctx.violations
183
+ */
184
+ function applyFieldGates(ctx) {
185
+ const { runDirRel, lang, kind, value, minLen, maxLen, violations } = ctx;
186
+ if (typeof value !== 'string' || value.trim().length === 0) {
187
+ violations.push({
188
+ runDir: runDirRel,
189
+ lang,
190
+ gate: `${kind}-empty`,
191
+ message: `${kind} resolved to empty / non-string value`,
192
+ });
193
+ return;
194
+ }
195
+ const length = effectiveTextLength(value);
196
+ if (length < minLen || length > maxLen) {
197
+ violations.push({
198
+ runDir: runDirRel,
199
+ lang,
200
+ gate: `${kind}-length`,
201
+ message: `${kind} has effective length ${length}; expected ${minLen}–${maxLen}`,
202
+ });
203
+ }
204
+ const ellipsis = detectTrailingEllipsis(value);
205
+ if (ellipsis) {
206
+ violations.push({
207
+ runDir: runDirRel,
208
+ lang,
209
+ gate: `${kind}-ellipsis`,
210
+ message: `${kind} ends with "${ellipsis}" — mid-sentence truncation is forbidden`,
211
+ });
212
+ }
213
+ const prefix = detectForbiddenPrefix(value);
214
+ if (prefix) {
215
+ violations.push({
216
+ runDir: runDirRel,
217
+ lang,
218
+ gate: 'forbidden-prefix',
219
+ message: `${kind} begins with reserved Stage-B label "${prefix}"`,
220
+ });
221
+ }
222
+ const leaked = detectLeakyRunIdOrJargon(value);
223
+ if (leaked) {
224
+ violations.push({
225
+ runDir: runDirRel,
226
+ lang,
227
+ gate: 'leaky-runid',
228
+ message: `${kind} contains internal token "${leaked}"`,
229
+ });
230
+ }
231
+ applyKindSpecificRejectionGates(ctx);
232
+ }
233
+
234
/**
 * Evaluate the resolver-aligned rejection gates for a single
 * `(kind, value)` pair and return the gate names that fire. Does not
 * touch any violation accumulator, so it can be shared by
 * {@link applyKindSpecificRejectionGates} and the unit tests to keep
 * validator and resolver in lock-step.
 *
 * @param {'title' | 'description'} kind
 * @param {string} value
 * @returns {string[]} Zero or more gate names (e.g. `title-doc-id`,
 *   `description-leaky-section-header`). Empty for non-string or
 *   empty values.
 */
export function evaluateResolverRejectionGates(kind, value) {
  if (typeof value !== 'string' || value.length === 0) return [];
  if (kind === 'title') {
    // The gate name is the resolver's rejection reason with a `title-` prefix.
    const reason = findTitleRejectionReason(value);
    return reason ? [`title-${reason}`] : [];
  }
  if (kind === 'description' && detectDescriptionLeadSectionHeader(value)) {
    return ['description-leaky-section-header'];
  }
  return [];
}

/**
 * Apply the resolver-aligned title-rejection predicates from
 * `src/aggregator/metadata/title-rejection.ts`. Keeping validator and
 * resolver in lock-step ensures the same regression that the resolver
 * already rejects can never sneak through into a shipped `<title>` or
 * `<meta description>`.
 *
 * Emitted gates (deferred PR #2163 follow-up):
 *
 *   - `title-ellipsis-cut`      — trailing `…` / `...`
 *   - `title-doc-id`            — bare `TA-NN-YYYY-NNNN` doc-ID
 *   - `title-section-header`    — bold-prose section header
 *   - `title-sentence-fragment` — leaked complete sentence
 *   - `description-leaky-section-header` — description starts with a
 *     bold-prose section header label (`Strategic significance: …`,
 *     `Threat Level: …`, …)
 *
 * Every violation pushed here carries `severity: 'advisory'`.
 *
 * @param {object} ctx
 * @param {string} ctx.runDirRel
 * @param {string} ctx.lang
 * @param {'title' | 'description'} ctx.kind
 * @param {string} ctx.value
 * @param {Array<object>} ctx.violations - Accumulator, mutated in place
 */
function applyKindSpecificRejectionGates(ctx) {
  const { runDirRel, lang, kind, value, violations } = ctx;
  const gates = evaluateResolverRejectionGates(kind, value);
  for (const gate of gates) {
    violations.push({
      runDir: runDirRel,
      lang,
      gate,
      // The 5 resolver-aligned gates ship as `advisory` so the validator
      // can surface them in the report (and unit tests can lock the
      // signal) without immediately failing CI on the 25 legacy
      // articles that pre-date the resolver-tightening work. Promote to
      // failure-class in a follow-up once those runs are regenerated.
      severity: 'advisory',
      message:
        gate === 'description-leaky-section-header'
          ? `description starts with a bold-prose section header: "${value}"`
          : `title rejected by resolver predicate (${gate.slice('title-'.length)}): "${value}"`,
    });
  }
}
302
+
303
+ /**
304
+ * `true` when a description's lead phrase (first sentence or first
305
+ * bold-label segment) is a denylisted section header. Splits on the
306
+ * first sentence terminator (`.`, `:`, `…`, `?`, `!`) and asks the
307
+ * canonical {@link looksLikeSectionHeader} predicate.
308
+ *
309
+ * @param {string} value
310
+ * @returns {boolean}
311
+ */
312
+ function detectDescriptionLeadSectionHeader(value) {
313
+ const trimmed = value.trim();
314
+ if (!trimmed) return false;
315
+ const m = /^([^.:!?\u2026]+)/u.exec(trimmed);
316
+ const lead = (m ? m[1] : trimmed).trim();
317
+ return looksLikeSectionHeader(lead);
318
+ }
319
+
320
+ /**
321
+ * After resolving every run, detect duplicate titles within the same
322
+ * `(date, articleType)` collision group. The `— Run N` qualifier from
323
+ * `composeContextualTitle` is the contracted differentiator for
324
+ * same-date/same-articleType republishes; missing it is a
325
+ * uniqueness-gate failure.
326
+ *
327
+ * @param {Array<{record: ReturnType<typeof resolveRunSeo>, lang: string}>} resolvedList
328
+ * @param {Array<object>} violations
329
+ */
330
+ function detectDuplicateTitles(resolvedList, violations) {
331
+ /** @type {Map<string, Array<{record: any, lang: string}>>} */
332
+ const groups = new Map();
333
+ for (const item of resolvedList) {
334
+ const key = `${item.record.date}|${item.record.articleType}`;
335
+ const bucket = groups.get(key);
336
+ if (bucket) bucket.push(item);
337
+ else groups.set(key, [item]);
338
+ }
339
+ for (const [key, items] of groups.entries()) {
340
+ if (items.length < 2) continue;
341
+ /** @type {Map<string, string[]>} */
342
+ const byTitle = new Map();
343
+ for (const it of items) {
344
+ const t = it.record.entry.title;
345
+ const bucket = byTitle.get(t);
346
+ if (bucket) bucket.push(it.record.runDirRel);
347
+ else byTitle.set(t, [it.record.runDirRel]);
348
+ }
349
+ for (const [title, dirs] of byTitle.entries()) {
350
+ if (dirs.length < 2) continue;
351
+ violations.push({
352
+ runDir: dirs.join(', '),
353
+ lang: items[0].lang,
354
+ gate: 'title-uniqueness',
355
+ affectedRuns: dirs,
356
+ message:
357
+ `${dirs.length} runs in collision group "${key}" share the ` +
358
+ `resolved title "${title}" — composeContextualTitle must ` +
359
+ `append a "— Run N" qualifier (or equivalent) so the per-run ` +
360
+ `articles are distinguishable in SERPs and social cards`,
361
+ });
362
+ }
363
+ }
364
+ }
365
+
366
+ /**
367
+ * Run validation across a list of run directories.
368
+ *
369
+ * @param {string[]} runDirs
370
+ * @param {string} repoRoot
371
+ * @param {{ quiet?: boolean, lang?: string }} options
372
+ */
373
+ export function runValidation(runDirs, repoRoot, { quiet = false, lang = 'en' } = {}) {
374
+ const allViolations = [];
375
+ /** @type {Array<{record: ReturnType<typeof resolveRunSeo>, lang: string}>} */
376
+ const resolved = [];
377
+ for (const runDir of runDirs) {
378
+ const before = allViolations.length;
379
+ const record = validateOneRun({ runDir, repoRoot, lang, violations: allViolations });
380
+ const runDirRel = path.relative(repoRoot, runDir).split(path.sep).join('/');
381
+ if (record) resolved.push({ record, lang });
382
+ const added = allViolations.length - before;
383
+ if (!quiet) {
384
+ if (added > 0) {
385
+ for (const entry of allViolations.slice(before)) {
386
+ const prefix = entry.severity === 'advisory' ? '⚠️ ' : '❌';
387
+ const stream = entry.severity === 'advisory' ? process.stdout : process.stderr;
388
+ stream.write(`${prefix} ${runDirRel} [${entry.gate}] ${entry.message}\n`);
389
+ }
390
+ } else {
391
+ process.stdout.write(`✅ ${runDirRel}\n`);
392
+ }
393
+ }
394
+ }
395
+ const beforeUniq = allViolations.length;
396
+ detectDuplicateTitles(resolved, allViolations);
397
+ if (!quiet) {
398
+ for (const entry of allViolations.slice(beforeUniq)) {
399
+ const prefix = entry.severity === 'advisory' ? '⚠️ ' : '❌';
400
+ const stream = entry.severity === 'advisory' ? process.stdout : process.stderr;
401
+ stream.write(`${prefix} ${entry.runDir} [${entry.gate}] ${entry.message}\n`);
402
+ }
403
+ }
404
+ return allViolations;
405
+ }
406
+
407
+ /**
408
+ * Parse CLI argv. Exported for unit tests.
409
+ */
410
+ export function parseArgs(argv) {
411
+ const opts = {
412
+ repoRoot: process.cwd(),
413
+ paths: [],
414
+ lang: 'en',
415
+ report: null,
416
+ fail: true,
417
+ quiet: false,
418
+ };
419
+ for (let i = 0; i < argv.length; i += 1) {
420
+ const arg = argv[i];
421
+ switch (arg) {
422
+ case '--repo-root':
423
+ opts.repoRoot = argv[i + 1];
424
+ i += 1;
425
+ break;
426
+ case '--paths':
427
+ while (i + 1 < argv.length) {
428
+ const next = argv[i + 1];
429
+ if (next === '--') {
430
+ i += 1;
431
+ break;
432
+ }
433
+ if (next.startsWith('--')) break;
434
+ opts.paths.push(next);
435
+ i += 1;
436
+ }
437
+ break;
438
+ case '--lang':
439
+ opts.lang = argv[i + 1];
440
+ i += 1;
441
+ break;
442
+ case '--report':
443
+ opts.report = argv[i + 1];
444
+ i += 1;
445
+ break;
446
+ case '--no-fail':
447
+ opts.fail = false;
448
+ break;
449
+ case '--quiet':
450
+ opts.quiet = true;
451
+ break;
452
+ case '--help':
453
+ case '-h':
454
+ process.stdout.write(
455
+ 'Usage: validate-article-seo.js [--repo-root <path>] ' +
456
+ '[--paths <runDir>... [--]] [--lang <code>] [--report <path>] [--no-fail] [--quiet]\n'
457
+ );
458
+ process.exit(0);
459
+ break;
460
+ default:
461
+ if (arg.startsWith('--')) {
462
+ throw new Error(`Unknown flag: ${arg}`);
463
+ }
464
+ }
465
+ }
466
+ if (!ALL_LANGUAGES.includes(opts.lang)) {
467
+ throw new Error(`Unsupported --lang "${opts.lang}"`);
468
+ }
469
+ return opts;
470
+ }
471
+
472
/**
 * Main entry point: parse argv, validate the selected run directories,
 * then emit the JSON report (to `--report` if given, otherwise to
 * stdout unless `--quiet`).
 *
 * When failure-class violations exist and `--no-fail` was not passed,
 * the process exit code is set to 1. The code is set via
 * `process.exitCode` rather than `process.exit(1)` because writes to
 * `process.stdout` may be asynchronous depending on the platform and
 * on what stdout is attached to, so a forced exit could drop the
 * report that was just written.
 *
 * @param {string[]} argv - Arguments after the script path
 * @returns {object} The report object that was serialised
 * @throws {Error} Propagates argument errors from `parseArgs`
 */
export function main(argv) {
  const opts = parseArgs(argv);
  // Explicit --paths win; otherwise validate every discovered run.
  const runDirs =
    opts.paths.length > 0
      ? opts.paths.map((p) => path.resolve(opts.repoRoot, p))
      : discoverAnalysisRuns(opts.repoRoot).map((r) => r.runDir);

  const violations = runValidation(runDirs, opts.repoRoot, {
    quiet: opts.quiet,
    lang: opts.lang,
  });
  // Advisory entries are reported but never affect the exit code.
  const failingViolations = violations.filter((v) => v.severity !== 'advisory');
  const advisoryViolations = violations.filter((v) => v.severity === 'advisory');

  const report = {
    generatedAt: new Date().toISOString(),
    lang: opts.lang,
    totals: {
      runsChecked: runDirs.length,
      violations: failingViolations.length,
      advisories: advisoryViolations.length,
      byGate: aggregateByKey(failingViolations, 'gate'),
      byGateAdvisory: aggregateByKey(advisoryViolations, 'gate'),
    },
    violations,
  };

  const json = `${JSON.stringify(report, null, 2)}\n`;
  if (opts.report) {
    fs.mkdirSync(path.dirname(opts.report), { recursive: true });
    fs.writeFileSync(opts.report, json);
  } else if (!opts.quiet) {
    process.stdout.write(json);
  }

  if (failingViolations.length > 0 && opts.fail) {
    // Let the event loop drain so pending stdout output is not lost.
    process.exitCode = 1;
  }
  return report;
}
513
+
514
+ /* c8 ignore start */
515
// Run `main` only when this module is the script node was started with.
// `pathToFileURL` builds the comparison URL the same way node builds
// `import.meta.url`; hand-concatenating `file://` + path never matches on
// Windows (drive letters, backslashes) or for paths containing characters
// that are percent-encoded in URLs (spaces, `#`, `%`, …).
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  try {
    main(process.argv.slice(2));
  } catch (err) {
    // Any value can be thrown; only Error instances carry `.message`.
    const reason = err instanceof Error ? err.message : String(err);
    process.stderr.write(`validate-article-seo: ${reason}\n`);
    process.exit(1);
  }
}
523
+ /* c8 ignore stop */
524
+
525
+ // Re-export for tests
526
+ export {
527
+ FORBIDDEN_PATTERNS,
528
+ FORBIDDEN_PREFIXES,
529
+ FORBIDDEN_SUBSTRINGS,
530
+ TITLE_MAX_LENGTH,
531
+ TITLE_MIN_LENGTH,
532
+ DESCRIPTION_MAX_LENGTH,
533
+ DESCRIPTION_MIN_LENGTH,
534
+ };