euparliamentmonitor 0.9.28 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. package/package.json +4 -4
  2. package/scripts/aggregator/html/localize-body.d.ts +28 -4
  3. package/scripts/aggregator/html/localize-body.js +79 -21
  4. package/scripts/aggregator/html/shell.js +2 -1
  5. package/scripts/aggregator/metadata/artifact-category-heading.js +8 -1
  6. package/scripts/aggregator/metadata/heading-rules.js +11 -0
  7. package/scripts/aggregator/metadata/seo-budgets.js +12 -9
  8. package/scripts/aggregator/reader-friendly-transform.js +1 -1
  9. package/scripts/generators/news-indexes/backfill-hreflang.d.ts +13 -0
  10. package/scripts/generators/news-indexes/backfill-hreflang.js +112 -0
  11. package/scripts/generators/news-indexes/backfill-reader-label.d.ts +47 -0
  12. package/scripts/generators/news-indexes/backfill-reader-label.js +86 -0
  13. package/scripts/generators/news-indexes/backfill.d.ts +19 -18
  14. package/scripts/generators/news-indexes/backfill.js +118 -111
  15. package/scripts/generators/news-indexes/per-language.js +2 -1
  16. package/scripts/generators/political-intelligence/html.js +2 -1
  17. package/scripts/generators/sitemap/html.js +2 -1
  18. package/scripts/generators/sitemap/index.d.ts +1 -1
  19. package/scripts/generators/sitemap/index.js +1 -1
  20. package/scripts/generators/sitemap/rss.d.ts +38 -2
  21. package/scripts/generators/sitemap/rss.js +54 -10
  22. package/scripts/generators/sitemap/xml.js +21 -6
  23. package/scripts/generators/sitemap.js +42 -9
  24. package/scripts/mcp/ep/error-classifier.d.ts +38 -0
  25. package/scripts/mcp/ep/error-classifier.js +49 -0
  26. package/scripts/mcp/ep/tools-feeds.js +27 -2
  27. package/scripts/templates/sections/footer.js +3 -1
  28. package/scripts/templates/sections/rss-discovery.d.ts +22 -0
  29. package/scripts/templates/sections/rss-discovery.js +48 -0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "euparliamentmonitor",
3
- "version": "0.9.28",
3
+ "version": "1.0.1",
4
4
  "type": "module",
5
5
  "description": "European Parliament Intelligence Platform - Monitor political activity with systematic transparency",
6
6
  "main": "scripts/index.js",
@@ -167,7 +167,7 @@
167
167
  "clean-css": "^5.3.3",
168
168
  "d3": "7.9.0",
169
169
  "esbuild": "0.28.0",
170
- "eslint": "10.4.0",
170
+ "eslint": "10.4.1",
171
171
  "eslint-config-prettier": "10.1.8",
172
172
  "eslint-plugin-jsdoc": "63.0.0",
173
173
  "eslint-plugin-security": "4.0.0",
@@ -179,7 +179,7 @@
179
179
  "husky": "9.1.7",
180
180
  "jscpd": "4.2.4",
181
181
  "knip": "^6.7.0",
182
- "lint-staged": "17.0.5",
182
+ "lint-staged": "17.0.7",
183
183
  "mermaid": "11.15.0",
184
184
  "papaparse": "5.5.3",
185
185
  "prettier": "3.8.3",
@@ -187,7 +187,7 @@
187
187
  "sharp": "^0.34.5",
188
188
  "terser": "^5.47.1",
189
189
  "ts-api-utils": "2.5.0",
190
- "tsx": "4.22.3",
190
+ "tsx": "4.22.4",
191
191
  "typedoc": "0.28.19",
192
192
  "typescript": "6.0.3",
193
193
  "vitest": "4.1.7",
package/scripts/aggregator/html/localize-body.d.ts CHANGED
@@ -22,11 +22,32 @@ export declare function localizeArticleBody(bodyHtml: string, lang: LanguageCode
22
22
  * @returns Modified string, or `haystack` unchanged when `needle` is absent
23
23
  */
24
24
  export declare function replaceFirstStringIn(haystack: string, needle: string, replacement: string): string;
25
+ /**
26
+ * Locate the cut point that ends the Executive Brief body — the start of
27
+ * the next top-level boundary heading after `afterHeading`. A boundary is
28
+ * any `<h2>` whose `id` either starts with the canonical `section-` prefix
29
+ * or exactly matches one of {@link EXECUTIVE_BRIEF_BOUNDARY_ID_MARKERS}
30
+ * (Reader Guide / Tradecraft / Analysis Index / Supplementary appendices).
31
+ *
32
+ * Critically, this only matches **top-level** section anchors — never the
33
+ * brief's own internal `<h2>` sub-headings (`## BLUF`, `## 60-Second Read`,
34
+ * …), which carry slugified ids without the `section-` prefix. That is why
35
+ * we cannot simply look for the next `<h2`.
36
+ *
37
+ * Uses `indexOf`/`lastIndexOf` exclusively (no regex) to stay within
38
+ * CodeQL's safe-regex envelope.
39
+ *
40
+ * @param html - Full article body HTML
41
+ * @param afterHeading - Index immediately after the Executive Brief `</h2>`
42
+ * @returns Index of the next boundary `<h2`, or `-1` when the Executive
43
+ * Brief is the last block in the body.
44
+ */
45
+ export declare function findExecutiveBriefSectionCut(html: string, afterHeading: number): number;
25
46
  /**
26
47
  * Replace the **inner body** of the Executive Brief section (the
27
48
  * `<h2 id="section-executive-brief">…</h2>` heading and everything that
28
- * follows it up to — but not including — the next `<h2 id="section-…">`
29
- * sibling) with the supplied replacement HTML. The Executive Brief
49
+ * follows it up to — but not including — the next top-level boundary
50
+ * heading) with the supplied replacement HTML. The Executive Brief
30
51
  * heading itself is preserved by emitting it inline ahead of the
31
52
  * replacement, so the in-page anchor (`#section-executive-brief`) and
32
53
  * the table-of-contents link continue to work.
@@ -39,8 +60,11 @@ export declare function replaceFirstStringIn(haystack: string, needle: string, r
39
60
  * `render-one.writeLanguageVariant`.
40
61
  *
41
62
  * Implementation uses `indexOf`/slice exclusively to stay within
42
- * CodeQL's safe-regex envelope. Returns `html` unchanged when the
43
- * Executive Brief heading is absent or malformed.
63
+ * CodeQL's safe-regex envelope. The replacement spans from the heading to
64
+ * the next top-level boundary (see {@link findExecutiveBriefSectionCut});
65
+ * when the Executive Brief is the last block in the body the replacement
66
+ * extends to end-of-body. Returns `html` unchanged only when the Executive
67
+ * Brief heading is absent or malformed.
44
68
  *
45
69
  * @param html - Full article body HTML
46
70
  * @param localizedHeading - Localized text for the Executive Brief H2
package/scripts/aggregator/html/localize-body.js CHANGED
@@ -12,6 +12,23 @@ import { TRADECRAFT_HEADING_LABELS, TRADECRAFT_INTRO_LABELS, TRADECRAFT_METHODOL
12
12
  import { escapeHTML } from '../../utils/file-utils.js';
13
13
  import { TRADECRAFT_SECTION_ID, MANIFEST_SECTION_ID, SUPPLEMENTARY_SECTION_ID, } from '../artifact-order.js';
14
14
  import { KEY_TAKEAWAYS_SECTION_ID } from '../key-takeaways.js';
15
+ import { READER_GUIDE_SECTION_ID } from '../reader-guide-constants.js';
16
+ /**
17
+ * Top-level section anchors that mark the **end** of the Executive Brief
18
+ * body. Canonical analysis sections are matched by the shared
19
+ * `id="section-…"` prefix (see {@link findExecutiveBriefSectionCut});
20
+ * the appendix and reader-guide sections below carry bespoke ids that do
21
+ * **not** share that prefix, so they are matched explicitly. Including
22
+ * them ensures the localized brief splice also fires on sparse runs where
23
+ * the Executive Brief is the last canonical section and only appendix
24
+ * blocks follow it.
25
+ */
26
+ const EXECUTIVE_BRIEF_BOUNDARY_ID_MARKERS = [
27
+ `id="${READER_GUIDE_SECTION_ID}"`,
28
+ `id="${TRADECRAFT_SECTION_ID}"`,
29
+ `id="${MANIFEST_SECTION_ID}"`,
30
+ `id="${SUPPLEMENTARY_SECTION_ID}"`,
31
+ ];
15
32
  /**
16
33
  * Localize the Tradecraft References and Analysis Index sections in the
17
34
  * rendered article body HTML. Replaces English headings, introductions,
@@ -102,11 +119,48 @@ export function replaceFirstStringIn(haystack, needle, replacement) {
102
119
  return haystack;
103
120
  return haystack.slice(0, idx) + replacement + haystack.slice(idx + needle.length);
104
121
  }
122
+ /**
123
+ * Locate the cut point that ends the Executive Brief body — the start of
124
+ * the next top-level boundary heading after `afterHeading`. A boundary is
125
+ * any `<h2>` whose `id` either starts with the canonical `section-` prefix
126
+ * or exactly matches one of {@link EXECUTIVE_BRIEF_BOUNDARY_ID_MARKERS}
127
+ * (Reader Guide / Tradecraft / Analysis Index / Supplementary appendices).
128
+ *
129
+ * Critically, this only matches **top-level** section anchors — never the
130
+ * brief's own internal `<h2>` sub-headings (`## BLUF`, `## 60-Second Read`,
131
+ * …), which carry slugified ids without the `section-` prefix. That is why
132
+ * we cannot simply look for the next `<h2`.
133
+ *
134
+ * Uses `indexOf`/`lastIndexOf` exclusively (no regex) to stay within
135
+ * CodeQL's safe-regex envelope.
136
+ *
137
+ * @param html - Full article body HTML
138
+ * @param afterHeading - Index immediately after the Executive Brief `</h2>`
139
+ * @returns Index of the next boundary `<h2`, or `-1` when the Executive
140
+ * Brief is the last block in the body.
141
+ */
142
+ export function findExecutiveBriefSectionCut(html, afterHeading) {
143
+ let best = -1;
144
+ const consider = (markerIdx) => {
145
+ if (markerIdx === -1)
146
+ return;
147
+ const h2 = html.lastIndexOf('<h2', markerIdx);
148
+ if (h2 === -1 || h2 < afterHeading)
149
+ return;
150
+ if (best === -1 || h2 < best)
151
+ best = h2;
152
+ };
153
+ consider(html.indexOf('id="section-', afterHeading));
154
+ for (const marker of EXECUTIVE_BRIEF_BOUNDARY_ID_MARKERS) {
155
+ consider(html.indexOf(marker, afterHeading));
156
+ }
157
+ return best;
158
+ }
105
159
  /**
106
160
  * Replace the **inner body** of the Executive Brief section (the
107
161
  * `<h2 id="section-executive-brief">…</h2>` heading and everything that
108
- * follows it up to — but not including — the next `<h2 id="section-…">`
109
- * sibling) with the supplied replacement HTML. The Executive Brief
162
+ * follows it up to — but not including — the next top-level boundary
163
+ * heading) with the supplied replacement HTML. The Executive Brief
110
164
  * heading itself is preserved by emitting it inline ahead of the
111
165
  * replacement, so the in-page anchor (`#section-executive-brief`) and
112
166
  * the table-of-contents link continue to work.
@@ -119,8 +173,11 @@ export function replaceFirstStringIn(haystack, needle, replacement) {
119
173
  * `render-one.writeLanguageVariant`.
120
174
  *
121
175
  * Implementation uses `indexOf`/slice exclusively to stay within
122
- * CodeQL's safe-regex envelope. Returns `html` unchanged when the
123
- * Executive Brief heading is absent or malformed.
176
+ * CodeQL's safe-regex envelope. The replacement spans from the heading to
177
+ * the next top-level boundary (see {@link findExecutiveBriefSectionCut});
178
+ * when the Executive Brief is the last block in the body the replacement
179
+ * extends to end-of-body. Returns `html` unchanged only when the Executive
180
+ * Brief heading is absent or malformed.
124
181
  *
125
182
  * @param html - Full article body HTML
126
183
  * @param localizedHeading - Localized text for the Executive Brief H2
@@ -147,23 +204,24 @@ export function replaceExecutiveBriefSection(html, localizedHeading, replacement
147
204
  if (h2CloseTagIdx === -1)
148
205
  return html;
149
206
  const afterHeading = h2CloseTagIdx + '</h2>'.length;
150
- // Find the next `<h2 id="section-...">` boundary — the start of the
151
- // following article section. If there is no further section heading
152
- // we conservatively bail out (replacing through end-of-body would
153
- // also drop appendix content like Reader Guide / Key Takeaways).
154
- const nextSectionId = html.indexOf('id="section-', afterHeading);
155
- if (nextSectionId === -1)
156
- return html;
157
- const nextH2 = html.lastIndexOf('<h2', nextSectionId);
158
- if (nextH2 === -1 || nextH2 <= afterHeading)
159
- return html;
160
- // Find the start of the line containing the next `<h2` so we don't
161
- // strip leading whitespace from the next section. We look at most
162
- // one newline back.
163
- let cutEnd = nextH2;
164
- const prevNewline = html.lastIndexOf('\n', nextH2 - 1);
165
- if (prevNewline !== -1 && prevNewline >= afterHeading) {
166
- cutEnd = prevNewline + 1;
207
+ // Find the next top-level boundary heading — the start of the following
208
+ // article section or appendix. When none exists the Executive Brief is
209
+ // the last block, so we replace through end-of-body. This guarantees the
210
+ // localized brief is spliced even on sparse runs (previously the splice
211
+ // bailed and non-English readers were stranded on the English brief).
212
+ const nextH2 = findExecutiveBriefSectionCut(html, afterHeading);
213
+ let cutEnd;
214
+ if (nextH2 === -1) {
215
+ cutEnd = html.length;
216
+ }
217
+ else {
218
+ // Start of the line containing the next `<h2` so we don't strip
219
+ // leading whitespace from the next section.
220
+ cutEnd = nextH2;
221
+ const prevNewline = html.lastIndexOf('\n', nextH2 - 1);
222
+ if (prevNewline !== -1 && prevNewline >= afterHeading) {
223
+ cutEnd = prevNewline + 1;
224
+ }
167
225
  }
168
226
  const newHeading = `<h2 id="section-executive-brief">${escapeHTML(localizedHeading)}</h2>\n`;
169
227
  const trimmedReplacement = replacementBodyHtml.endsWith('\n')
package/scripts/aggregator/html/shell.js CHANGED
@@ -19,6 +19,7 @@ import { escapeHTML } from '../../utils/file-utils.js';
19
19
  import { buildResponsiveIconLinks, buildResponsiveSocialImageMeta, buildSiteFooter, buildSiteHeader, buildPageBanner, } from '../../templates/section-builders.js';
20
20
  import { getPoliticalIntelligenceFilename } from '../../generators/political-intelligence.js';
21
21
  import { getSitemapFilename } from '../../generators/sitemap/index.js';
22
+ import { buildRssAlternateLink } from '../../templates/sections/rss-discovery.js';
22
23
  import { truncateHeadline, getTitleSeparator, buildPageTitle, getLocalizedArticleType, getLocalizedArticleTypePlain, } from './headline.js';
23
24
  import { clampForBudget } from '../metadata/seo-budgets.js';
24
25
  import { getArticleFilename, buildArticleHreflangLinks, buildLanguageSwitcher, } from './hreflang.js';
@@ -352,7 +353,7 @@ ${keywordsMeta} <meta name="robots" content="index, follow, max-snippet:-1, max
352
353
  <meta property="article:publisher" content="https://hack23.com">
353
354
  <link rel="canonical" href="${canonicalUrl}">
354
355
  ${hreflangLinks}
355
- <link rel="alternate" type="application/rss+xml" title="EU Parliament Monitor RSS" href="${BASE_URL}/rss.xml">
356
+ ${buildRssAlternateLink(safeLang, `${BASE_URL}/`)}
356
357
  <link rel="preconnect" href="https://hack23.com" crossorigin>
357
358
  <meta property="og:type" content="article">
358
359
  <meta property="og:title" content="${escapeHTML(ogTitleClamped)}">
package/scripts/aggregator/metadata/artifact-category-heading.js CHANGED
@@ -153,6 +153,13 @@ export const ARTIFACT_CATEGORY_PREFIXES = [
153
153
  'voting patterns',
154
154
  'weekly outlook',
155
155
  'wildcards blackswans',
156
+ // CJK localized category prefixes (translations of "executive briefing")
157
+ 'エグゼクティブ・ブリーフィング',
158
+ 'エグゼクティブブリーフィング',
159
+ 'エグゼクティブ・ブリーフ',
160
+ '행정 브리핑',
161
+ '执行简报',
162
+ '執行簡報',
156
163
  ];
157
164
  /**
158
165
  * Match a single calendar month name (English) with optional `-uary` /
@@ -211,7 +218,7 @@ function normaliseCategoryHeading(raw) {
211
218
  return stripInlineMarkdown(raw)
212
219
  .trim()
213
220
  .toLowerCase()
214
- .replace(/^[^a-z0-9]+/, '')
221
+ .replace(/^[^a-z0-9\p{L}]+/u, '')
215
222
  .replace(/\s+/g, ' ');
216
223
  }
217
224
  /**
package/scripts/aggregator/metadata/heading-rules.js CHANGED
@@ -158,6 +158,17 @@ const BARE_INSTITUTIONAL_HEADINGS = [
158
158
  'briefing',
159
159
  'intelligence brief',
160
160
  'intelligence briefing',
161
+ // CJK / localized translations of generic headings
162
+ 'エグゼクティブ・ブリーフィング',
163
+ 'エグゼクティブブリーフィング',
164
+ 'エグゼクティブ・ブリーフ',
165
+ 'ブリーフィング',
166
+ '행정 브리핑',
167
+ '브리핑',
168
+ '执行简报',
169
+ '简报',
170
+ '執行簡報',
171
+ '簡報',
161
172
  ];
162
173
  /**
163
174
  * Return `true` when the heading is one of {@link BARE_INSTITUTIONAL_HEADINGS}
package/scripts/aggregator/metadata/seo-budgets.js CHANGED
@@ -160,15 +160,18 @@ export function clampForBudget(text, lang, surface) {
160
160
  if (cleaned.length >= softMin)
161
161
  return cleaned;
162
162
  }
163
- // Whitespace-aware fallback. Chinese and Japanese text often has no
164
- // ASCII spaces, so skip this step for them and fall straight through
165
- // to the hard cut. Korean is the exception it uses inter-word spaces.
166
- if (family !== 'cjk' || lang === 'ko') {
167
- const lastSpace = window.lastIndexOf(' ');
168
- if (lastSpace >= softMin) {
169
- const safe = trimTrailingSeparators(window.slice(0, lastSpace));
170
- return `${safe}…`;
171
- }
163
+ // Whitespace-aware fallback. Runs for every script: an ASCII space
164
+ // past the soft minimum is a safe break that drops a partial trailing
165
+ // segment whole rather than slicing it mid-token. Chinese and Japanese
166
+ // prose has no inter-word spaces, so `lastIndexOf(' ')` returns -1 and
167
+ // this is a no-op for them — but composed SEO snippets join clauses
168
+ // (body, dateline, reader label) with ASCII spaces, so honouring that
169
+ // boundary prevents hard-cutting the reader label mid-word. Korean
170
+ // uses inter-word spaces natively and benefits the same way.
171
+ const lastSpace = window.lastIndexOf(' ');
172
+ if (lastSpace >= softMin) {
173
+ const safe = trimTrailingSeparators(window.slice(0, lastSpace));
174
+ return `${safe}…`;
172
175
  }
173
176
  const hardCut = trimTrailingSeparators(window);
174
177
  return `${hardCut}…`;
package/scripts/aggregator/reader-friendly-transform.js CHANGED
@@ -46,7 +46,7 @@ const ADMIRALTY_LABELS = {
46
46
  export function applyReaderFriendlyTransform(html) {
47
47
  const state = createInitialState(html);
48
48
  const withGlossary = injectReaderGlossary(html);
49
- const parts = withGlossary.split(/(<[^>]+>)/g);
49
+ const parts = withGlossary.split(/(<[^<>]+>)/g);
50
50
  for (let i = 0; i < parts.length; i++) {
51
51
  const part = parts[i] ?? '';
52
52
  if (part.startsWith('<')) {
package/scripts/generators/news-indexes/backfill-hreflang.d.ts ADDED
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Backfill hreflang alternate links for all article HTML files.
3
+ *
4
+ * Handles three cases:
5
+ * 1. Articles with no hreflang links at all → inject the full block before `</head>`
6
+ * 2. Articles with relative hreflang URLs → replace with absolute URLs
7
+ * 3. Articles already correct → skip
8
+ *
9
+ * @param filenames - News article filenames
10
+ * @returns Number of HTML files updated
11
+ */
12
+ export declare function backfillArticleHreflang(filenames: readonly string[]): number;
13
+ //# sourceMappingURL=backfill-hreflang.d.ts.map
package/scripts/generators/news-indexes/backfill-hreflang.js ADDED
@@ -0,0 +1,112 @@
1
+ // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ /**
4
+ * @module Generators/NewsIndexes/BackfillHreflang
5
+ * @description Hreflang alternate-link backfill for article HTML files.
6
+ * Extracted from `backfill.ts` to keep source files ≤600 lines.
7
+ */
8
+ import path from 'path';
9
+ import fs from 'fs';
10
+ import { NEWS_DIR, BASE_URL } from '../../constants/config.js';
11
+ import { ALL_LANGUAGES } from '../../constants/languages.js';
12
+ import { parseArticleFilename, atomicWrite } from '../../utils/file-utils.js';
13
+ /**
14
+ * Read an article HTML file, returning an empty string when unavailable.
15
+ *
16
+ * @param filepath - Absolute HTML file path
17
+ * @returns File content or empty string
18
+ */
19
+ function readArticleHtml(filepath) {
20
+ try {
21
+ return path.isAbsolute(filepath) ? fs.readFileSync(filepath, 'utf8') : '';
22
+ }
23
+ catch {
24
+ return '';
25
+ }
26
+ }
27
+ /**
28
+ * Build hreflang `<link rel="alternate">` tags for an article slug.
29
+ * Produces one tag per supported language plus an `x-default` pointing at
30
+ * the English variant, all using absolute URLs.
31
+ *
32
+ * @param articleSlug - Slug without language suffix (e.g. `2026-02-24-propositions`)
33
+ * @returns Newline-joined `<link>` tags
34
+ */
35
+ function buildArticleHreflang(articleSlug) {
36
+ const entries = ALL_LANGUAGES.map((code) => ` <link rel="alternate" hreflang="${code}" href="${BASE_URL}/news/${articleSlug}-${code}.html">`);
37
+ entries.push(` <link rel="alternate" hreflang="x-default" href="${BASE_URL}/news/${articleSlug}-en.html">`);
38
+ return entries.join('\n');
39
+ }
40
+ /**
41
+ * Inject hreflang links into an article that has none.
42
+ *
43
+ * @param html - Article HTML content
44
+ * @param hreflangBlock - Pre-built hreflang link block
45
+ * @returns Updated HTML, or original if no change needed
46
+ */
47
+ function injectHreflangLinks(html, hreflangBlock) {
48
+ return html.replace(/(<\/head>)/u, `${hreflangBlock}\n$1`);
49
+ }
50
+ /**
51
+ * Replace existing relative hreflang links with absolute URLs.
52
+ *
53
+ * @param html - Article HTML content
54
+ * @param hreflangBlock - Pre-built hreflang link block with absolute URLs
55
+ * @returns Updated HTML, or original if no change needed
56
+ */
57
+ function fixRelativeHreflangLinks(html, hreflangBlock) {
58
+ const stripped = html.replace(/\s*<link\s+rel="alternate"\s+hreflang="[^"]*"\s+href="[^"]*">\n?/gu, '');
59
+ return stripped.replace(/(<\/head>)/u, `${hreflangBlock}\n$1`);
60
+ }
61
+ /**
62
+ * Backfill hreflang alternate links for all article HTML files.
63
+ *
64
+ * Handles three cases:
65
+ * 1. Articles with no hreflang links at all → inject the full block before `</head>`
66
+ * 2. Articles with relative hreflang URLs → replace with absolute URLs
67
+ * 3. Articles already correct → skip
68
+ *
69
+ * @param filenames - News article filenames
70
+ * @returns Number of HTML files updated
71
+ */
72
+ export function backfillArticleHreflang(filenames) {
73
+ let updated = 0;
74
+ for (const filename of filenames) {
75
+ if (backfillOneArticleHreflang(filename))
76
+ updated++;
77
+ }
78
+ return updated;
79
+ }
80
+ /**
81
+ * Backfill hreflang for a single article file.
82
+ *
83
+ * @param filename - News article filename
84
+ * @returns True when the file was updated
85
+ */
86
+ function backfillOneArticleHreflang(filename) {
87
+ const parsed = parseArticleFilename(filename);
88
+ if (!parsed)
89
+ return false;
90
+ const filepath = path.join(NEWS_DIR, filename);
91
+ const html = readArticleHtml(filepath);
92
+ if (!html)
93
+ return false;
94
+ const articleSlug = `${parsed.date}-${parsed.slug}`;
95
+ const hreflangBlock = buildArticleHreflang(articleSlug);
96
+ const hasHreflang = /<link\s+rel="alternate"\s+hreflang="/u.test(html);
97
+ let next;
98
+ if (!hasHreflang) {
99
+ next = injectHreflangLinks(html, hreflangBlock);
100
+ }
101
+ else {
102
+ const hasRelative = /<link\s+rel="alternate"\s+hreflang="[^"]*"\s+href="(?!https?:\/\/)/u.test(html);
103
+ if (!hasRelative)
104
+ return false;
105
+ next = fixRelativeHreflangLinks(html, hreflangBlock);
106
+ }
107
+ if (next === html)
108
+ return false;
109
+ atomicWrite(filepath, next);
110
+ return true;
111
+ }
112
+ //# sourceMappingURL=backfill-hreflang.js.map
package/scripts/generators/news-indexes/backfill-reader-label.d.ts ADDED
@@ -0,0 +1,47 @@
1
+ import type { LanguageCode } from '../../types/index.js';
2
+ /**
3
+ * Remove a trailing **truncated** copy of the localized reader label
4
+ * (`SEO_CONTEXT_LABELS[lang].reader`) from a candidate description.
5
+ *
6
+ * Earlier backfill passes appended the reader label and then clamped the
7
+ * whole buffer to the per-script `metaDescription` budget, hard-cutting
8
+ * the label mid-word (e.g. zh `…政策后果的读` instead of `…政策后果的读者`,
9
+ * ja `…追跡する読`, ko dangling `…추적하는.`). Those mangled fragments were
10
+ * persisted to `<meta description>` and survive a plain prefix/date-label
11
+ * strip, so re-feeding them to the resolver re-emits the broken tail.
12
+ *
13
+ * A trailing copy that matches the label **in full** is left intact — it
14
+ * is a complete, reader-facing clause we want to preserve. Only a partial
15
+ * (truncated) prefix of the label is dropped, leaving the clean body for
16
+ * the resolver to re-enrich with a budget-aware (whole-label-or-nothing)
17
+ * reader clause.
18
+ *
19
+ * @param description - Candidate description (prefix/date-label removed)
20
+ * @param langCode - Article language code
21
+ * @returns Description with any truncated trailing reader label removed
22
+ */
23
+ export declare function stripTruncatedReaderLabel(description: string, langCode: LanguageCode): string;
24
+ /**
25
+ * Locate a trailing **truncated** copy of the localized reader label and
26
+ * return the index at which the description body ends (i.e. where the
27
+ * partial label begins). Returns -1 when no partial label is present or
28
+ * when the label is present in full (a complete clause we keep).
29
+ *
30
+ * @param text - Trimmed candidate description
31
+ * @param langCode - Article language code
32
+ * @returns Cut index for the partial label, or -1 when none applies
33
+ */
34
+ export declare function findTruncatedReaderLabelCut(text: string, langCode: LanguageCode): number;
35
+ /**
36
+ * Detect whether a legacy `<meta description>` ends with a **truncated**
37
+ * reader label once its dateline prefix and redundant date-label clause
38
+ * are removed. Long, unique legacy descriptions otherwise bypass
39
+ * `shouldBackfillDescription`, leaving a persisted mid-word cut
40
+ * (e.g. zh `…政策后果的读`, ja `…追跡する読`, ko `…추적하는.`) in place.
41
+ *
42
+ * @param body - Stripped description body (prefix/date-label removed)
43
+ * @param langCode - Article language code
44
+ * @returns True when a truncated reader label remains in the body
45
+ */
46
+ export declare function hasTruncatedReaderLabelInBody(body: string, langCode: LanguageCode): boolean;
47
+ //# sourceMappingURL=backfill-reader-label.d.ts.map
package/scripts/generators/news-indexes/backfill-reader-label.js ADDED
@@ -0,0 +1,86 @@
1
+ // SPDX-FileCopyrightText: 2024-2026 Hack23 AB
2
+ // SPDX-License-Identifier: Apache-2.0
3
+ /**
4
+ * @module Generators/NewsIndexes/BackfillReaderLabel
5
+ * @description Truncated reader-label detection and stripping helpers,
6
+ * extracted from `backfill.ts` to keep source files ≤600 lines.
7
+ */
8
+ import { getLocalizedString } from '../../constants/languages.js';
9
+ import { SEO_CONTEXT_LABELS } from '../../aggregator/metadata/template-fallback.js';
10
+ /**
11
+ * Remove a trailing **truncated** copy of the localized reader label
12
+ * (`SEO_CONTEXT_LABELS[lang].reader`) from a candidate description.
13
+ *
14
+ * Earlier backfill passes appended the reader label and then clamped the
15
+ * whole buffer to the per-script `metaDescription` budget, hard-cutting
16
+ * the label mid-word (e.g. zh `…政策后果的读` instead of `…政策后果的读者`,
17
+ * ja `…追跡する読`, ko dangling `…추적하는.`). Those mangled fragments were
18
+ * persisted to `<meta description>` and survive a plain prefix/date-label
19
+ * strip, so re-feeding them to the resolver re-emits the broken tail.
20
+ *
21
+ * A trailing copy that matches the label **in full** is left intact — it
22
+ * is a complete, reader-facing clause we want to preserve. Only a partial
23
+ * (truncated) prefix of the label is dropped, leaving the clean body for
24
+ * the resolver to re-enrich with a budget-aware (whole-label-or-nothing)
25
+ * reader clause.
26
+ *
27
+ * @param description - Candidate description (prefix/date-label removed)
28
+ * @param langCode - Article language code
29
+ * @returns Description with any truncated trailing reader label removed
30
+ */
31
+ export function stripTruncatedReaderLabel(description, langCode) {
32
+ const text = description.trim();
33
+ const cut = findTruncatedReaderLabelCut(text, langCode);
34
+ if (cut < 0)
35
+ return text;
36
+ return text
37
+ .replace(/[.。!?!?…]+$/u, '')
38
+ .slice(0, cut)
39
+ .replace(/[\s,;:—\-–·。、]+$/u, '')
40
+ .trim();
41
+ }
42
+ /**
43
+ * Locate a trailing **truncated** copy of the localized reader label and
44
+ * return the index at which the description body ends (i.e. where the
45
+ * partial label begins). Returns -1 when no partial label is present or
46
+ * when the label is present in full (a complete clause we keep).
47
+ *
48
+ * @param text - Trimmed candidate description
49
+ * @param langCode - Article language code
50
+ * @returns Cut index for the partial label, or -1 when none applies
51
+ */
52
+ export function findTruncatedReaderLabelCut(text, langCode) {
53
+ const labels = getLocalizedString(SEO_CONTEXT_LABELS, langCode);
54
+ const reader = (labels.reader ?? '').trim();
55
+ // Require a reasonably long label so we never strip on a coincidental
56
+ // short suffix match; real labels are 40+ chars (Latin) / 11+ (CJK).
57
+ if (reader.length < 8 || text.length < 8)
58
+ return -1;
59
+ // Tolerate a terminator the resolver/healer appended after the cut.
60
+ const core = text.replace(/[.。!?!?…]+$/u, '');
61
+ const maxK = Math.min(core.length, reader.length);
62
+ for (let k = maxK; k >= 8; k -= 1) {
63
+ if (core.slice(core.length - k) === reader.slice(0, k)) {
64
+ // Full label present at the tail — keep it (not a truncation).
65
+ if (k === reader.length)
66
+ return -1;
67
+ return core.length - k;
68
+ }
69
+ }
70
+ return -1;
71
+ }
72
+ /**
73
+ * Detect whether a legacy `<meta description>` ends with a **truncated**
74
+ * reader label once its dateline prefix and redundant date-label clause
75
+ * are removed. Long, unique legacy descriptions otherwise bypass
76
+ * `shouldBackfillDescription`, leaving a persisted mid-word cut
77
+ * (e.g. zh `…政策后果的读`, ja `…追跡する読`, ko `…추적하는.`) in place.
78
+ *
79
+ * @param body - Stripped description body (prefix/date-label removed)
80
+ * @param langCode - Article language code
81
+ * @returns True when a truncated reader label remains in the body
82
+ */
83
+ export function hasTruncatedReaderLabelInBody(body, langCode) {
84
+ return findTruncatedReaderLabelCut(body, langCode) >= 0;
85
+ }
86
+ //# sourceMappingURL=backfill-reader-label.js.map
package/scripts/generators/news-indexes/backfill.d.ts CHANGED
@@ -60,14 +60,26 @@ export declare function buildLegacyBackfillDescription(date: string, slug: strin
60
60
  readonly forceContextPrefix?: boolean;
61
61
  }): string;
62
62
  /**
63
- * Apply SEO meta tag replacements to a complete article HTML document.
63
+ * Strip the legacy dateline prefix **and** the redundant localized
64
+ * date-label clause from a candidate description, returning the
65
+ * reader-facing body in isolation. Used to clean a previously-backfilled
66
+ * `<meta description>` before it is re-fed to the per-language SEO
67
+ * resolver — without this, the resolver re-clamps the prefixed buffer
68
+ * against the CJK metaDescription budget and truncates the reader label
69
+ * mid-clause (live regression in `news/2026-04-26-week-ahead-ko.html`,
70
+ * a dangling "추적하는." participle).
64
71
  *
72
+ * @param date - Article date (ISO YYYY-MM-DD)
73
+ * @param slug - Article slug
74
+ * @param lang - Article language code
75
+ * @param description - Candidate description (possibly already prefixed)
76
+ * @returns Reader-facing body with prefix + date label removed
77
+ */
78
+ export declare function stripLegacyBackfillContext(date: string, slug: string, lang: string, description: string): string;
79
+ /**
80
+ * Apply SEO meta tag replacements to a complete article HTML document.
65
81
  * Exported for the regression test in
66
- * `test/unit/news-indexes-jsonld-description-regex.test.js`, which
67
- * locks in the JSON-LD description regex against the duplicate-tail
68
- * bug (the legacy `"description":"[^"]*"` pattern terminated at the
69
- * first JSON-escaped quote `\"` and left the previous description's
70
- * tail in place, accumulating duplicates on every prebuild run).
82
+ * `test/unit/news-indexes-jsonld-description-regex.test.js`.
71
83
  *
72
84
  * @param html - Existing article HTML
73
85
  * @param description - Backfilled meta description
@@ -75,16 +87,5 @@ export declare function buildLegacyBackfillDescription(date: string, slug: strin
75
87
  * @returns Updated HTML
76
88
  */
77
89
  export declare function applyArticleSeoBackfill(html: string, description: string, keywords: readonly string[]): string;
78
- /**
79
- * Backfill hreflang alternate links for all article HTML files.
80
- *
81
- * Handles three cases:
82
- * 1. Articles with no hreflang links at all → inject the full block before `</head>`
83
- * 2. Articles with relative hreflang URLs → replace with absolute URLs
84
- * 3. Articles already correct → skip
85
- *
86
- * @param filenames - News article filenames
87
- * @returns Number of HTML files updated
88
- */
89
- export declare function backfillArticleHreflang(filenames: readonly string[]): number;
90
+ export { backfillArticleHreflang } from './backfill-hreflang.js';
90
91
  //# sourceMappingURL=backfill.d.ts.map