euparliamentmonitor 0.9.28 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +3 -3
- package/scripts/aggregator/html/localize-body.d.ts +28 -4
- package/scripts/aggregator/html/localize-body.js +79 -21
- package/scripts/aggregator/html/shell.js +2 -1
- package/scripts/aggregator/metadata/artifact-category-heading.js +8 -1
- package/scripts/aggregator/metadata/heading-rules.js +11 -0
- package/scripts/aggregator/metadata/seo-budgets.js +12 -9
- package/scripts/aggregator/reader-friendly-transform.js +1 -1
- package/scripts/generators/news-indexes/backfill-hreflang.d.ts +13 -0
- package/scripts/generators/news-indexes/backfill-hreflang.js +112 -0
- package/scripts/generators/news-indexes/backfill-reader-label.d.ts +47 -0
- package/scripts/generators/news-indexes/backfill-reader-label.js +86 -0
- package/scripts/generators/news-indexes/backfill.d.ts +19 -18
- package/scripts/generators/news-indexes/backfill.js +118 -111
- package/scripts/generators/news-indexes/per-language.js +2 -1
- package/scripts/generators/political-intelligence/html.js +2 -1
- package/scripts/generators/sitemap/html.js +2 -1
- package/scripts/generators/sitemap/index.d.ts +1 -1
- package/scripts/generators/sitemap/index.js +1 -1
- package/scripts/generators/sitemap/rss.d.ts +38 -2
- package/scripts/generators/sitemap/rss.js +54 -10
- package/scripts/generators/sitemap/xml.js +21 -6
- package/scripts/generators/sitemap.js +42 -9
- package/scripts/mcp/ep/error-classifier.d.ts +38 -0
- package/scripts/mcp/ep/error-classifier.js +49 -0
- package/scripts/mcp/ep/tools-feeds.js +27 -2
- package/scripts/templates/sections/footer.js +3 -1
- package/scripts/templates/sections/rss-discovery.d.ts +22 -0
- package/scripts/templates/sections/rss-discovery.js +48 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "euparliamentmonitor",
|
|
3
|
-
"version": "0.9.28",
|
|
3
|
+
"version": "1.0.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "European Parliament Intelligence Platform - Monitor political activity with systematic transparency",
|
|
6
6
|
"main": "scripts/index.js",
|
|
@@ -167,7 +167,7 @@
|
|
|
167
167
|
"clean-css": "^5.3.3",
|
|
168
168
|
"d3": "7.9.0",
|
|
169
169
|
"esbuild": "0.28.0",
|
|
170
|
-
"eslint": "10.4.
|
|
170
|
+
"eslint": "10.4.1",
|
|
171
171
|
"eslint-config-prettier": "10.1.8",
|
|
172
172
|
"eslint-plugin-jsdoc": "63.0.0",
|
|
173
173
|
"eslint-plugin-security": "4.0.0",
|
|
@@ -179,7 +179,7 @@
|
|
|
179
179
|
"husky": "9.1.7",
|
|
180
180
|
"jscpd": "4.2.4",
|
|
181
181
|
"knip": "^6.7.0",
|
|
182
|
-
"lint-staged": "17.0.
|
|
182
|
+
"lint-staged": "17.0.6",
|
|
183
183
|
"mermaid": "11.15.0",
|
|
184
184
|
"papaparse": "5.5.3",
|
|
185
185
|
"prettier": "3.8.3",
|
|
@@ -22,11 +22,32 @@ export declare function localizeArticleBody(bodyHtml: string, lang: LanguageCode
|
|
|
22
22
|
* @returns Modified string, or `haystack` unchanged when `needle` is absent
|
|
23
23
|
*/
|
|
24
24
|
export declare function replaceFirstStringIn(haystack: string, needle: string, replacement: string): string;
|
|
25
|
+
/**
|
|
26
|
+
* Locate the cut point that ends the Executive Brief body — the start of
|
|
27
|
+
* the next top-level boundary heading after `afterHeading`. A boundary is
|
|
28
|
+
* any `<h2>` whose `id` either starts with the canonical `section-` prefix
|
|
29
|
+
* or exactly matches one of {@link EXECUTIVE_BRIEF_BOUNDARY_ID_MARKERS}
|
|
30
|
+
* (Reader Guide / Tradecraft / Analysis Index / Supplementary appendices).
|
|
31
|
+
*
|
|
32
|
+
* Critically, this only matches **top-level** section anchors — never the
|
|
33
|
+
* brief's own internal `<h2>` sub-headings (`## BLUF`, `## 60-Second Read`,
|
|
34
|
+
* …), which carry slugified ids without the `section-` prefix. That is why
|
|
35
|
+
* we cannot simply look for the next `<h2`.
|
|
36
|
+
*
|
|
37
|
+
* Uses `indexOf`/`lastIndexOf` exclusively (no regex) to stay within
|
|
38
|
+
* CodeQL's safe-regex envelope.
|
|
39
|
+
*
|
|
40
|
+
* @param html - Full article body HTML
|
|
41
|
+
* @param afterHeading - Index immediately after the Executive Brief `</h2>`
|
|
42
|
+
* @returns Index of the next boundary `<h2`, or `-1` when the Executive
|
|
43
|
+
* Brief is the last block in the body.
|
|
44
|
+
*/
|
|
45
|
+
export declare function findExecutiveBriefSectionCut(html: string, afterHeading: number): number;
|
|
25
46
|
/**
|
|
26
47
|
* Replace the **inner body** of the Executive Brief section (the
|
|
27
48
|
* `<h2 id="section-executive-brief">…</h2>` heading and everything that
|
|
28
|
-
* follows it up to — but not including — the next
|
|
29
|
-
*
|
|
49
|
+
* follows it up to — but not including — the next top-level boundary
|
|
50
|
+
* heading) with the supplied replacement HTML. The Executive Brief
|
|
30
51
|
* heading itself is preserved by emitting it inline ahead of the
|
|
31
52
|
* replacement, so the in-page anchor (`#section-executive-brief`) and
|
|
32
53
|
* the table-of-contents link continue to work.
|
|
@@ -39,8 +60,11 @@ export declare function replaceFirstStringIn(haystack: string, needle: string, r
|
|
|
39
60
|
* `render-one.writeLanguageVariant`.
|
|
40
61
|
*
|
|
41
62
|
* Implementation uses `indexOf`/slice exclusively to stay within
|
|
42
|
-
* CodeQL's safe-regex envelope.
|
|
43
|
-
*
|
|
63
|
+
* CodeQL's safe-regex envelope. The replacement spans from the heading to
|
|
64
|
+
* the next top-level boundary (see {@link findExecutiveBriefSectionCut});
|
|
65
|
+
* when the Executive Brief is the last block in the body the replacement
|
|
66
|
+
* extends to end-of-body. Returns `html` unchanged only when the Executive
|
|
67
|
+
* Brief heading is absent or malformed.
|
|
44
68
|
*
|
|
45
69
|
* @param html - Full article body HTML
|
|
46
70
|
* @param localizedHeading - Localized text for the Executive Brief H2
|
|
@@ -12,6 +12,23 @@ import { TRADECRAFT_HEADING_LABELS, TRADECRAFT_INTRO_LABELS, TRADECRAFT_METHODOL
|
|
|
12
12
|
import { escapeHTML } from '../../utils/file-utils.js';
|
|
13
13
|
import { TRADECRAFT_SECTION_ID, MANIFEST_SECTION_ID, SUPPLEMENTARY_SECTION_ID, } from '../artifact-order.js';
|
|
14
14
|
import { KEY_TAKEAWAYS_SECTION_ID } from '../key-takeaways.js';
|
|
15
|
+
import { READER_GUIDE_SECTION_ID } from '../reader-guide-constants.js';
|
|
16
|
+
/**
|
|
17
|
+
* Top-level section anchors that mark the **end** of the Executive Brief
|
|
18
|
+
* body. Canonical analysis sections are matched by the shared
|
|
19
|
+
* `id="section-…"` prefix (see {@link findExecutiveBriefSectionCut});
|
|
20
|
+
* the appendix and reader-guide sections below carry bespoke ids that do
|
|
21
|
+
* **not** share that prefix, so they are matched explicitly. Including
|
|
22
|
+
* them ensures the localized brief splice also fires on sparse runs where
|
|
23
|
+
* the Executive Brief is the last canonical section and only appendix
|
|
24
|
+
* blocks follow it.
|
|
25
|
+
*/
|
|
26
|
+
const EXECUTIVE_BRIEF_BOUNDARY_ID_MARKERS = [
|
|
27
|
+
`id="${READER_GUIDE_SECTION_ID}"`,
|
|
28
|
+
`id="${TRADECRAFT_SECTION_ID}"`,
|
|
29
|
+
`id="${MANIFEST_SECTION_ID}"`,
|
|
30
|
+
`id="${SUPPLEMENTARY_SECTION_ID}"`,
|
|
31
|
+
];
|
|
15
32
|
/**
|
|
16
33
|
* Localize the Tradecraft References and Analysis Index sections in the
|
|
17
34
|
* rendered article body HTML. Replaces English headings, introductions,
|
|
@@ -102,11 +119,48 @@ export function replaceFirstStringIn(haystack, needle, replacement) {
|
|
|
102
119
|
return haystack;
|
|
103
120
|
return haystack.slice(0, idx) + replacement + haystack.slice(idx + needle.length);
|
|
104
121
|
}
|
|
122
|
+
/**
|
|
123
|
+
* Locate the cut point that ends the Executive Brief body — the start of
|
|
124
|
+
* the next top-level boundary heading after `afterHeading`. A boundary is
|
|
125
|
+
* any `<h2>` whose `id` either starts with the canonical `section-` prefix
|
|
126
|
+
* or exactly matches one of {@link EXECUTIVE_BRIEF_BOUNDARY_ID_MARKERS}
|
|
127
|
+
* (Reader Guide / Tradecraft / Analysis Index / Supplementary appendices).
|
|
128
|
+
*
|
|
129
|
+
* Critically, this only matches **top-level** section anchors — never the
|
|
130
|
+
* brief's own internal `<h2>` sub-headings (`## BLUF`, `## 60-Second Read`,
|
|
131
|
+
* …), which carry slugified ids without the `section-` prefix. That is why
|
|
132
|
+
* we cannot simply look for the next `<h2`.
|
|
133
|
+
*
|
|
134
|
+
* Uses `indexOf`/`lastIndexOf` exclusively (no regex) to stay within
|
|
135
|
+
* CodeQL's safe-regex envelope.
|
|
136
|
+
*
|
|
137
|
+
* @param html - Full article body HTML
|
|
138
|
+
* @param afterHeading - Index immediately after the Executive Brief `</h2>`
|
|
139
|
+
* @returns Index of the next boundary `<h2`, or `-1` when the Executive
|
|
140
|
+
* Brief is the last block in the body.
|
|
141
|
+
*/
|
|
142
|
+
export function findExecutiveBriefSectionCut(html, afterHeading) {
|
|
143
|
+
let best = -1;
|
|
144
|
+
const consider = (markerIdx) => {
|
|
145
|
+
if (markerIdx === -1)
|
|
146
|
+
return;
|
|
147
|
+
const h2 = html.lastIndexOf('<h2', markerIdx);
|
|
148
|
+
if (h2 === -1 || h2 < afterHeading)
|
|
149
|
+
return;
|
|
150
|
+
if (best === -1 || h2 < best)
|
|
151
|
+
best = h2;
|
|
152
|
+
};
|
|
153
|
+
consider(html.indexOf('id="section-', afterHeading));
|
|
154
|
+
for (const marker of EXECUTIVE_BRIEF_BOUNDARY_ID_MARKERS) {
|
|
155
|
+
consider(html.indexOf(marker, afterHeading));
|
|
156
|
+
}
|
|
157
|
+
return best;
|
|
158
|
+
}
|
|
105
159
|
/**
|
|
106
160
|
* Replace the **inner body** of the Executive Brief section (the
|
|
107
161
|
* `<h2 id="section-executive-brief">…</h2>` heading and everything that
|
|
108
|
-
* follows it up to — but not including — the next
|
|
109
|
-
*
|
|
162
|
+
* follows it up to — but not including — the next top-level boundary
|
|
163
|
+
* heading) with the supplied replacement HTML. The Executive Brief
|
|
110
164
|
* heading itself is preserved by emitting it inline ahead of the
|
|
111
165
|
* replacement, so the in-page anchor (`#section-executive-brief`) and
|
|
112
166
|
* the table-of-contents link continue to work.
|
|
@@ -119,8 +173,11 @@ export function replaceFirstStringIn(haystack, needle, replacement) {
|
|
|
119
173
|
* `render-one.writeLanguageVariant`.
|
|
120
174
|
*
|
|
121
175
|
* Implementation uses `indexOf`/slice exclusively to stay within
|
|
122
|
-
* CodeQL's safe-regex envelope.
|
|
123
|
-
*
|
|
176
|
+
* CodeQL's safe-regex envelope. The replacement spans from the heading to
|
|
177
|
+
* the next top-level boundary (see {@link findExecutiveBriefSectionCut});
|
|
178
|
+
* when the Executive Brief is the last block in the body the replacement
|
|
179
|
+
* extends to end-of-body. Returns `html` unchanged only when the Executive
|
|
180
|
+
* Brief heading is absent or malformed.
|
|
124
181
|
*
|
|
125
182
|
* @param html - Full article body HTML
|
|
126
183
|
* @param localizedHeading - Localized text for the Executive Brief H2
|
|
@@ -147,23 +204,24 @@ export function replaceExecutiveBriefSection(html, localizedHeading, replacement
|
|
|
147
204
|
if (h2CloseTagIdx === -1)
|
|
148
205
|
return html;
|
|
149
206
|
const afterHeading = h2CloseTagIdx + '</h2>'.length;
|
|
150
|
-
// Find the next
|
|
151
|
-
//
|
|
152
|
-
//
|
|
153
|
-
//
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
207
|
+
// Find the next top-level boundary heading — the start of the following
|
|
208
|
+
// article section or appendix. When none exists the Executive Brief is
|
|
209
|
+
// the last block, so we replace through end-of-body. This guarantees the
|
|
210
|
+
// localized brief is spliced even on sparse runs (previously the splice
|
|
211
|
+
// bailed and non-English readers were stranded on the English brief).
|
|
212
|
+
const nextH2 = findExecutiveBriefSectionCut(html, afterHeading);
|
|
213
|
+
let cutEnd;
|
|
214
|
+
if (nextH2 === -1) {
|
|
215
|
+
cutEnd = html.length;
|
|
216
|
+
}
|
|
217
|
+
else {
|
|
218
|
+
// Start of the line containing the next `<h2` so we don't strip
|
|
219
|
+
// leading whitespace from the next section.
|
|
220
|
+
cutEnd = nextH2;
|
|
221
|
+
const prevNewline = html.lastIndexOf('\n', nextH2 - 1);
|
|
222
|
+
if (prevNewline !== -1 && prevNewline >= afterHeading) {
|
|
223
|
+
cutEnd = prevNewline + 1;
|
|
224
|
+
}
|
|
167
225
|
}
|
|
168
226
|
const newHeading = `<h2 id="section-executive-brief">${escapeHTML(localizedHeading)}</h2>\n`;
|
|
169
227
|
const trimmedReplacement = replacementBodyHtml.endsWith('\n')
|
|
@@ -19,6 +19,7 @@ import { escapeHTML } from '../../utils/file-utils.js';
|
|
|
19
19
|
import { buildResponsiveIconLinks, buildResponsiveSocialImageMeta, buildSiteFooter, buildSiteHeader, buildPageBanner, } from '../../templates/section-builders.js';
|
|
20
20
|
import { getPoliticalIntelligenceFilename } from '../../generators/political-intelligence.js';
|
|
21
21
|
import { getSitemapFilename } from '../../generators/sitemap/index.js';
|
|
22
|
+
import { buildRssAlternateLink } from '../../templates/sections/rss-discovery.js';
|
|
22
23
|
import { truncateHeadline, getTitleSeparator, buildPageTitle, getLocalizedArticleType, getLocalizedArticleTypePlain, } from './headline.js';
|
|
23
24
|
import { clampForBudget } from '../metadata/seo-budgets.js';
|
|
24
25
|
import { getArticleFilename, buildArticleHreflangLinks, buildLanguageSwitcher, } from './hreflang.js';
|
|
@@ -352,7 +353,7 @@ ${keywordsMeta} <meta name="robots" content="index, follow, max-snippet:-1, max
|
|
|
352
353
|
<meta property="article:publisher" content="https://hack23.com">
|
|
353
354
|
<link rel="canonical" href="${canonicalUrl}">
|
|
354
355
|
${hreflangLinks}
|
|
355
|
-
|
|
356
|
+
${buildRssAlternateLink(safeLang, `${BASE_URL}/`)}
|
|
356
357
|
<link rel="preconnect" href="https://hack23.com" crossorigin>
|
|
357
358
|
<meta property="og:type" content="article">
|
|
358
359
|
<meta property="og:title" content="${escapeHTML(ogTitleClamped)}">
|
|
@@ -153,6 +153,13 @@ export const ARTIFACT_CATEGORY_PREFIXES = [
|
|
|
153
153
|
'voting patterns',
|
|
154
154
|
'weekly outlook',
|
|
155
155
|
'wildcards blackswans',
|
|
156
|
+
// CJK localized category prefixes (translations of "executive briefing")
|
|
157
|
+
'エグゼクティブ・ブリーフィング',
|
|
158
|
+
'エグゼクティブブリーフィング',
|
|
159
|
+
'エグゼクティブ・ブリーフ',
|
|
160
|
+
'행정 브리핑',
|
|
161
|
+
'执行简报',
|
|
162
|
+
'執行簡報',
|
|
156
163
|
];
|
|
157
164
|
/**
|
|
158
165
|
* Match a single calendar month name (English) with optional `-uary` /
|
|
@@ -211,7 +218,7 @@ function normaliseCategoryHeading(raw) {
|
|
|
211
218
|
return stripInlineMarkdown(raw)
|
|
212
219
|
.trim()
|
|
213
220
|
.toLowerCase()
|
|
214
|
-
.replace(/^[^a-z0-9]
|
|
221
|
+
.replace(/^[^a-z0-9\p{L}]+/u, '')
|
|
215
222
|
.replace(/\s+/g, ' ');
|
|
216
223
|
}
|
|
217
224
|
/**
|
|
@@ -158,6 +158,17 @@ const BARE_INSTITUTIONAL_HEADINGS = [
|
|
|
158
158
|
'briefing',
|
|
159
159
|
'intelligence brief',
|
|
160
160
|
'intelligence briefing',
|
|
161
|
+
// CJK / localized translations of generic headings
|
|
162
|
+
'エグゼクティブ・ブリーフィング',
|
|
163
|
+
'エグゼクティブブリーフィング',
|
|
164
|
+
'エグゼクティブ・ブリーフ',
|
|
165
|
+
'ブリーフィング',
|
|
166
|
+
'행정 브리핑',
|
|
167
|
+
'브리핑',
|
|
168
|
+
'执行简报',
|
|
169
|
+
'简报',
|
|
170
|
+
'執行簡報',
|
|
171
|
+
'簡報',
|
|
161
172
|
];
|
|
162
173
|
/**
|
|
163
174
|
* Return `true` when the heading is one of {@link BARE_INSTITUTIONAL_HEADINGS}
|
|
@@ -160,15 +160,18 @@ export function clampForBudget(text, lang, surface) {
|
|
|
160
160
|
if (cleaned.length >= softMin)
|
|
161
161
|
return cleaned;
|
|
162
162
|
}
|
|
163
|
-
// Whitespace-aware fallback.
|
|
164
|
-
//
|
|
165
|
-
//
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
163
|
+
// Whitespace-aware fallback. Runs for every script: an ASCII space
|
|
164
|
+
// past the soft minimum is a safe break that drops a partial trailing
|
|
165
|
+
// segment whole rather than slicing it mid-token. Chinese and Japanese
|
|
166
|
+
// prose has no inter-word spaces, so `lastIndexOf(' ')` returns -1 and
|
|
167
|
+
// this is a no-op for them — but composed SEO snippets join clauses
|
|
168
|
+
// (body, dateline, reader label) with ASCII spaces, so honouring that
|
|
169
|
+
// boundary prevents hard-cutting the reader label mid-word. Korean
|
|
170
|
+
// uses inter-word spaces natively and benefits the same way.
|
|
171
|
+
const lastSpace = window.lastIndexOf(' ');
|
|
172
|
+
if (lastSpace >= softMin) {
|
|
173
|
+
const safe = trimTrailingSeparators(window.slice(0, lastSpace));
|
|
174
|
+
return `${safe}…`;
|
|
172
175
|
}
|
|
173
176
|
const hardCut = trimTrailingSeparators(window);
|
|
174
177
|
return `${hardCut}…`;
|
|
@@ -46,7 +46,7 @@ const ADMIRALTY_LABELS = {
|
|
|
46
46
|
export function applyReaderFriendlyTransform(html) {
|
|
47
47
|
const state = createInitialState(html);
|
|
48
48
|
const withGlossary = injectReaderGlossary(html);
|
|
49
|
-
const parts = withGlossary.split(/(<[
|
|
49
|
+
const parts = withGlossary.split(/(<[^<>]+>)/g);
|
|
50
50
|
for (let i = 0; i < parts.length; i++) {
|
|
51
51
|
const part = parts[i] ?? '';
|
|
52
52
|
if (part.startsWith('<')) {
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Backfill hreflang alternate links for all article HTML files.
|
|
3
|
+
*
|
|
4
|
+
* Handles three cases:
|
|
5
|
+
* 1. Articles with no hreflang links at all → inject the full block before `</head>`
|
|
6
|
+
* 2. Articles with relative hreflang URLs → replace with absolute URLs
|
|
7
|
+
* 3. Articles already correct → skip
|
|
8
|
+
*
|
|
9
|
+
* @param filenames - News article filenames
|
|
10
|
+
* @returns Number of HTML files updated
|
|
11
|
+
*/
|
|
12
|
+
export declare function backfillArticleHreflang(filenames: readonly string[]): number;
|
|
13
|
+
//# sourceMappingURL=backfill-hreflang.d.ts.map
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
/**
|
|
4
|
+
* @module Generators/NewsIndexes/BackfillHreflang
|
|
5
|
+
* @description Hreflang alternate-link backfill for article HTML files.
|
|
6
|
+
* Extracted from `backfill.ts` to keep source files ≤600 lines.
|
|
7
|
+
*/
|
|
8
|
+
import path from 'path';
|
|
9
|
+
import fs from 'fs';
|
|
10
|
+
import { NEWS_DIR, BASE_URL } from '../../constants/config.js';
|
|
11
|
+
import { ALL_LANGUAGES } from '../../constants/languages.js';
|
|
12
|
+
import { parseArticleFilename, atomicWrite } from '../../utils/file-utils.js';
|
|
13
|
+
/**
|
|
14
|
+
* Read an article HTML file, returning an empty string when unavailable.
|
|
15
|
+
*
|
|
16
|
+
* @param filepath - Absolute HTML file path
|
|
17
|
+
* @returns File content or empty string
|
|
18
|
+
*/
|
|
19
|
+
function readArticleHtml(filepath) {
|
|
20
|
+
try {
|
|
21
|
+
return path.isAbsolute(filepath) ? fs.readFileSync(filepath, 'utf8') : '';
|
|
22
|
+
}
|
|
23
|
+
catch {
|
|
24
|
+
return '';
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* Build hreflang `<link rel="alternate">` tags for an article slug.
|
|
29
|
+
* Produces one tag per supported language plus an `x-default` pointing at
|
|
30
|
+
* the English variant, all using absolute URLs.
|
|
31
|
+
*
|
|
32
|
+
* @param articleSlug - Slug without language suffix (e.g. `2026-02-24-propositions`)
|
|
33
|
+
* @returns Newline-joined `<link>` tags
|
|
34
|
+
*/
|
|
35
|
+
function buildArticleHreflang(articleSlug) {
|
|
36
|
+
const entries = ALL_LANGUAGES.map((code) => ` <link rel="alternate" hreflang="${code}" href="${BASE_URL}/news/${articleSlug}-${code}.html">`);
|
|
37
|
+
entries.push(` <link rel="alternate" hreflang="x-default" href="${BASE_URL}/news/${articleSlug}-en.html">`);
|
|
38
|
+
return entries.join('\n');
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* Inject hreflang links into an article that has none.
|
|
42
|
+
*
|
|
43
|
+
* @param html - Article HTML content
|
|
44
|
+
* @param hreflangBlock - Pre-built hreflang link block
|
|
45
|
+
* @returns Updated HTML, or original if no change needed
|
|
46
|
+
*/
|
|
47
|
+
function injectHreflangLinks(html, hreflangBlock) {
|
|
48
|
+
return html.replace(/(<\/head>)/u, `${hreflangBlock}\n$1`);
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Replace existing relative hreflang links with absolute URLs.
|
|
52
|
+
*
|
|
53
|
+
* @param html - Article HTML content
|
|
54
|
+
* @param hreflangBlock - Pre-built hreflang link block with absolute URLs
|
|
55
|
+
* @returns Updated HTML, or original if no change needed
|
|
56
|
+
*/
|
|
57
|
+
function fixRelativeHreflangLinks(html, hreflangBlock) {
|
|
58
|
+
const stripped = html.replace(/\s*<link\s+rel="alternate"\s+hreflang="[^"]*"\s+href="[^"]*">\n?/gu, '');
|
|
59
|
+
return stripped.replace(/(<\/head>)/u, `${hreflangBlock}\n$1`);
|
|
60
|
+
}
|
|
61
|
+
/**
|
|
62
|
+
* Backfill hreflang alternate links for all article HTML files.
|
|
63
|
+
*
|
|
64
|
+
* Handles three cases:
|
|
65
|
+
* 1. Articles with no hreflang links at all → inject the full block before `</head>`
|
|
66
|
+
* 2. Articles with relative hreflang URLs → replace with absolute URLs
|
|
67
|
+
* 3. Articles already correct → skip
|
|
68
|
+
*
|
|
69
|
+
* @param filenames - News article filenames
|
|
70
|
+
* @returns Number of HTML files updated
|
|
71
|
+
*/
|
|
72
|
+
export function backfillArticleHreflang(filenames) {
|
|
73
|
+
let updated = 0;
|
|
74
|
+
for (const filename of filenames) {
|
|
75
|
+
if (backfillOneArticleHreflang(filename))
|
|
76
|
+
updated++;
|
|
77
|
+
}
|
|
78
|
+
return updated;
|
|
79
|
+
}
|
|
80
|
+
/**
|
|
81
|
+
* Backfill hreflang for a single article file.
|
|
82
|
+
*
|
|
83
|
+
* @param filename - News article filename
|
|
84
|
+
* @returns True when the file was updated
|
|
85
|
+
*/
|
|
86
|
+
function backfillOneArticleHreflang(filename) {
|
|
87
|
+
const parsed = parseArticleFilename(filename);
|
|
88
|
+
if (!parsed)
|
|
89
|
+
return false;
|
|
90
|
+
const filepath = path.join(NEWS_DIR, filename);
|
|
91
|
+
const html = readArticleHtml(filepath);
|
|
92
|
+
if (!html)
|
|
93
|
+
return false;
|
|
94
|
+
const articleSlug = `${parsed.date}-${parsed.slug}`;
|
|
95
|
+
const hreflangBlock = buildArticleHreflang(articleSlug);
|
|
96
|
+
const hasHreflang = /<link\s+rel="alternate"\s+hreflang="/u.test(html);
|
|
97
|
+
let next;
|
|
98
|
+
if (!hasHreflang) {
|
|
99
|
+
next = injectHreflangLinks(html, hreflangBlock);
|
|
100
|
+
}
|
|
101
|
+
else {
|
|
102
|
+
const hasRelative = /<link\s+rel="alternate"\s+hreflang="[^"]*"\s+href="(?!https?:\/\/)/u.test(html);
|
|
103
|
+
if (!hasRelative)
|
|
104
|
+
return false;
|
|
105
|
+
next = fixRelativeHreflangLinks(html, hreflangBlock);
|
|
106
|
+
}
|
|
107
|
+
if (next === html)
|
|
108
|
+
return false;
|
|
109
|
+
atomicWrite(filepath, next);
|
|
110
|
+
return true;
|
|
111
|
+
}
|
|
112
|
+
//# sourceMappingURL=backfill-hreflang.js.map
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import type { LanguageCode } from '../../types/index.js';
|
|
2
|
+
/**
|
|
3
|
+
* Remove a trailing **truncated** copy of the localized reader label
|
|
4
|
+
* (`SEO_CONTEXT_LABELS[lang].reader`) from a candidate description.
|
|
5
|
+
*
|
|
6
|
+
* Earlier backfill passes appended the reader label and then clamped the
|
|
7
|
+
* whole buffer to the per-script `metaDescription` budget, hard-cutting
|
|
8
|
+
* the label mid-word (e.g. zh `…政策后果的读` instead of `…政策后果的读者`,
|
|
9
|
+
* ja `…追跡する読`, ko dangling `…추적하는.`). Those mangled fragments were
|
|
10
|
+
* persisted to `<meta description>` and survive a plain prefix/date-label
|
|
11
|
+
* strip, so re-feeding them to the resolver re-emits the broken tail.
|
|
12
|
+
*
|
|
13
|
+
* A trailing copy that matches the label **in full** is left intact — it
|
|
14
|
+
* is a complete, reader-facing clause we want to preserve. Only a partial
|
|
15
|
+
* (truncated) prefix of the label is dropped, leaving the clean body for
|
|
16
|
+
* the resolver to re-enrich with a budget-aware (whole-label-or-nothing)
|
|
17
|
+
* reader clause.
|
|
18
|
+
*
|
|
19
|
+
* @param description - Candidate description (prefix/date-label removed)
|
|
20
|
+
* @param langCode - Article language code
|
|
21
|
+
* @returns Description with any truncated trailing reader label removed
|
|
22
|
+
*/
|
|
23
|
+
export declare function stripTruncatedReaderLabel(description: string, langCode: LanguageCode): string;
|
|
24
|
+
/**
|
|
25
|
+
* Locate a trailing **truncated** copy of the localized reader label and
|
|
26
|
+
* return the index at which the description body ends (i.e. where the
|
|
27
|
+
* partial label begins). Returns -1 when no partial label is present or
|
|
28
|
+
* when the label is present in full (a complete clause we keep).
|
|
29
|
+
*
|
|
30
|
+
* @param text - Trimmed candidate description
|
|
31
|
+
* @param langCode - Article language code
|
|
32
|
+
* @returns Cut index for the partial label, or -1 when none applies
|
|
33
|
+
*/
|
|
34
|
+
export declare function findTruncatedReaderLabelCut(text: string, langCode: LanguageCode): number;
|
|
35
|
+
/**
|
|
36
|
+
* Detect whether a legacy `<meta description>` ends with a **truncated**
|
|
37
|
+
* reader label once its dateline prefix and redundant date-label clause
|
|
38
|
+
* are removed. Long, unique legacy descriptions otherwise bypass
|
|
39
|
+
* `shouldBackfillDescription`, leaving a persisted mid-word cut
|
|
40
|
+
* (e.g. zh `…政策后果的读`, ja `…追跡する読`, ko `…추적하는.`) in place.
|
|
41
|
+
*
|
|
42
|
+
* @param body - Stripped description body (prefix/date-label removed)
|
|
43
|
+
* @param langCode - Article language code
|
|
44
|
+
* @returns True when a truncated reader label remains in the body
|
|
45
|
+
*/
|
|
46
|
+
export declare function hasTruncatedReaderLabelInBody(body: string, langCode: LanguageCode): boolean;
|
|
47
|
+
//# sourceMappingURL=backfill-reader-label.d.ts.map
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
/**
|
|
4
|
+
* @module Generators/NewsIndexes/BackfillReaderLabel
|
|
5
|
+
* @description Truncated reader-label detection and stripping helpers,
|
|
6
|
+
* extracted from `backfill.ts` to keep source files ≤600 lines.
|
|
7
|
+
*/
|
|
8
|
+
import { getLocalizedString } from '../../constants/languages.js';
|
|
9
|
+
import { SEO_CONTEXT_LABELS } from '../../aggregator/metadata/template-fallback.js';
|
|
10
|
+
/**
|
|
11
|
+
* Remove a trailing **truncated** copy of the localized reader label
|
|
12
|
+
* (`SEO_CONTEXT_LABELS[lang].reader`) from a candidate description.
|
|
13
|
+
*
|
|
14
|
+
* Earlier backfill passes appended the reader label and then clamped the
|
|
15
|
+
* whole buffer to the per-script `metaDescription` budget, hard-cutting
|
|
16
|
+
* the label mid-word (e.g. zh `…政策后果的读` instead of `…政策后果的读者`,
|
|
17
|
+
* ja `…追跡する読`, ko dangling `…추적하는.`). Those mangled fragments were
|
|
18
|
+
* persisted to `<meta description>` and survive a plain prefix/date-label
|
|
19
|
+
* strip, so re-feeding them to the resolver re-emits the broken tail.
|
|
20
|
+
*
|
|
21
|
+
* A trailing copy that matches the label **in full** is left intact — it
|
|
22
|
+
* is a complete, reader-facing clause we want to preserve. Only a partial
|
|
23
|
+
* (truncated) prefix of the label is dropped, leaving the clean body for
|
|
24
|
+
* the resolver to re-enrich with a budget-aware (whole-label-or-nothing)
|
|
25
|
+
* reader clause.
|
|
26
|
+
*
|
|
27
|
+
* @param description - Candidate description (prefix/date-label removed)
|
|
28
|
+
* @param langCode - Article language code
|
|
29
|
+
* @returns Description with any truncated trailing reader label removed
|
|
30
|
+
*/
|
|
31
|
+
export function stripTruncatedReaderLabel(description, langCode) {
|
|
32
|
+
const text = description.trim();
|
|
33
|
+
const cut = findTruncatedReaderLabelCut(text, langCode);
|
|
34
|
+
if (cut < 0)
|
|
35
|
+
return text;
|
|
36
|
+
return text
|
|
37
|
+
.replace(/[.。!?!?…]+$/u, '')
|
|
38
|
+
.slice(0, cut)
|
|
39
|
+
.replace(/[\s,;:—\-–·。、]+$/u, '')
|
|
40
|
+
.trim();
|
|
41
|
+
}
|
|
42
|
+
/**
|
|
43
|
+
* Locate a trailing **truncated** copy of the localized reader label and
|
|
44
|
+
* return the index at which the description body ends (i.e. where the
|
|
45
|
+
* partial label begins). Returns -1 when no partial label is present or
|
|
46
|
+
* when the label is present in full (a complete clause we keep).
|
|
47
|
+
*
|
|
48
|
+
* @param text - Trimmed candidate description
|
|
49
|
+
* @param langCode - Article language code
|
|
50
|
+
* @returns Cut index for the partial label, or -1 when none applies
|
|
51
|
+
*/
|
|
52
|
+
export function findTruncatedReaderLabelCut(text, langCode) {
|
|
53
|
+
const labels = getLocalizedString(SEO_CONTEXT_LABELS, langCode);
|
|
54
|
+
const reader = (labels.reader ?? '').trim();
|
|
55
|
+
// Require a reasonably long label so we never strip on a coincidental
|
|
56
|
+
// short suffix match; real labels are 40+ chars (Latin) / 11+ (CJK).
|
|
57
|
+
if (reader.length < 8 || text.length < 8)
|
|
58
|
+
return -1;
|
|
59
|
+
// Tolerate a terminator the resolver/healer appended after the cut.
|
|
60
|
+
const core = text.replace(/[.。!?!?…]+$/u, '');
|
|
61
|
+
const maxK = Math.min(core.length, reader.length);
|
|
62
|
+
for (let k = maxK; k >= 8; k -= 1) {
|
|
63
|
+
if (core.slice(core.length - k) === reader.slice(0, k)) {
|
|
64
|
+
// Full label present at the tail — keep it (not a truncation).
|
|
65
|
+
if (k === reader.length)
|
|
66
|
+
return -1;
|
|
67
|
+
return core.length - k;
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
return -1;
|
|
71
|
+
}
|
|
72
|
+
/**
|
|
73
|
+
* Detect whether a legacy `<meta description>` ends with a **truncated**
|
|
74
|
+
* reader label once its dateline prefix and redundant date-label clause
|
|
75
|
+
* are removed. Long, unique legacy descriptions otherwise bypass
|
|
76
|
+
* `shouldBackfillDescription`, leaving a persisted mid-word cut
|
|
77
|
+
* (e.g. zh `…政策后果的读`, ja `…追跡する読`, ko `…추적하는.`) in place.
|
|
78
|
+
*
|
|
79
|
+
* @param body - Stripped description body (prefix/date-label removed)
|
|
80
|
+
* @param langCode - Article language code
|
|
81
|
+
* @returns True when a truncated reader label remains in the body
|
|
82
|
+
*/
|
|
83
|
+
export function hasTruncatedReaderLabelInBody(body, langCode) {
|
|
84
|
+
return findTruncatedReaderLabelCut(body, langCode) >= 0;
|
|
85
|
+
}
|
|
86
|
+
//# sourceMappingURL=backfill-reader-label.js.map
|
|
@@ -60,14 +60,26 @@ export declare function buildLegacyBackfillDescription(date: string, slug: strin
|
|
|
60
60
|
readonly forceContextPrefix?: boolean;
|
|
61
61
|
}): string;
|
|
62
62
|
/**
|
|
63
|
-
*
|
|
63
|
+
* Strip the legacy dateline prefix **and** the redundant localized
|
|
64
|
+
* date-label clause from a candidate description, returning the
|
|
65
|
+
* reader-facing body in isolation. Used to clean a previously-backfilled
|
|
66
|
+
* `<meta description>` before it is re-fed to the per-language SEO
|
|
67
|
+
* resolver — without this, the resolver re-clamps the prefixed buffer
|
|
68
|
+
* against the CJK metaDescription budget and truncates the reader label
|
|
69
|
+
* mid-clause (live regression in `news/2026-04-26-week-ahead-ko.html`,
|
|
70
|
+
* a dangling "추적하는." participle).
|
|
64
71
|
*
|
|
72
|
+
* @param date - Article date (ISO YYYY-MM-DD)
|
|
73
|
+
* @param slug - Article slug
|
|
74
|
+
* @param lang - Article language code
|
|
75
|
+
* @param description - Candidate description (possibly already prefixed)
|
|
76
|
+
* @returns Reader-facing body with prefix + date label removed
|
|
77
|
+
*/
|
|
78
|
+
export declare function stripLegacyBackfillContext(date: string, slug: string, lang: string, description: string): string;
|
|
79
|
+
/**
|
|
80
|
+
* Apply SEO meta tag replacements to a complete article HTML document.
|
|
65
81
|
* Exported for the regression test in
|
|
66
|
-
* `test/unit/news-indexes-jsonld-description-regex.test.js
|
|
67
|
-
* locks in the JSON-LD description regex against the duplicate-tail
|
|
68
|
-
* bug (the legacy `"description":"[^"]*"` pattern terminated at the
|
|
69
|
-
* first JSON-escaped quote `\"` and left the previous description's
|
|
70
|
-
* tail in place, accumulating duplicates on every prebuild run).
|
|
82
|
+
* `test/unit/news-indexes-jsonld-description-regex.test.js`.
|
|
71
83
|
*
|
|
72
84
|
* @param html - Existing article HTML
|
|
73
85
|
* @param description - Backfilled meta description
|
|
@@ -75,16 +87,5 @@ export declare function buildLegacyBackfillDescription(date: string, slug: strin
|
|
|
75
87
|
* @returns Updated HTML
|
|
76
88
|
*/
|
|
77
89
|
export declare function applyArticleSeoBackfill(html: string, description: string, keywords: readonly string[]): string;
|
|
78
|
-
|
|
79
|
-
* Backfill hreflang alternate links for all article HTML files.
|
|
80
|
-
*
|
|
81
|
-
* Handles three cases:
|
|
82
|
-
* 1. Articles with no hreflang links at all → inject the full block before `</head>`
|
|
83
|
-
* 2. Articles with relative hreflang URLs → replace with absolute URLs
|
|
84
|
-
* 3. Articles already correct → skip
|
|
85
|
-
*
|
|
86
|
-
* @param filenames - News article filenames
|
|
87
|
-
* @returns Number of HTML files updated
|
|
88
|
-
*/
|
|
89
|
-
export declare function backfillArticleHreflang(filenames: readonly string[]): number;
|
|
90
|
+
export { backfillArticleHreflang } from './backfill-hreflang.js';
|
|
90
91
|
//# sourceMappingURL=backfill.d.ts.map
|