euparliamentmonitor 0.9.13 → 0.9.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -4
- package/scripts/aggregator/article-generator.js +2 -1
- package/scripts/aggregator/article-html.d.ts +9 -0
- package/scripts/aggregator/article-html.js +134 -13
- package/scripts/aggregator/article-metadata.d.ts +25 -161
- package/scripts/aggregator/article-metadata.js +71 -649
- package/scripts/aggregator/editorial-brief-resolver.d.ts +9 -0
- package/scripts/aggregator/editorial-brief-resolver.js +3 -1
- package/scripts/aggregator/metadata/date-labels.d.ts +122 -0
- package/scripts/aggregator/metadata/date-labels.js +209 -0
- package/scripts/aggregator/metadata/text-utils.d.ts +188 -0
- package/scripts/aggregator/metadata/text-utils.js +542 -0
- package/scripts/constants/og-locales.d.ts +15 -0
- package/scripts/constants/og-locales.js +17 -0
- package/scripts/constants/seo/index.d.ts +21 -0
- package/scripts/constants/seo/index.js +23 -0
- package/scripts/constants/seo/og-locales.d.ts +59 -0
- package/scripts/constants/seo/og-locales.js +59 -0
- package/scripts/constants/seo/social-handles.d.ts +50 -0
- package/scripts/constants/seo/social-handles.js +65 -0
- package/scripts/constants/social-handles.d.ts +11 -0
- package/scripts/constants/social-handles.js +13 -0
- package/scripts/discover-untranslated-briefs.js +224 -19
- package/scripts/generators/news-indexes.d.ts +35 -0
- package/scripts/generators/news-indexes.js +67 -6
- package/scripts/generators/political-intelligence/html.js +14 -6
- package/scripts/generators/seo-copy.js +42 -0
- package/scripts/generators/sitemap/html.js +13 -5
- package/scripts/lint-src-todos.js +124 -0
- package/scripts/utils/copy-test-reports.js +1 -1
- package/scripts/utils/generate-docs-index.js +1 -1
- package/scripts/validate-brief-translations.js +158 -18
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
import { ALL_LANGUAGES } from '../language-core.js';
|
|
4
|
+
/**
|
|
5
|
+
* OpenGraph locale code per supported language (not a hyphenated BCP-47 tag).
|
|
6
|
+
*
|
|
7
|
+
* The values follow `<language>_<TERRITORY>` (underscore-separated)
|
|
8
|
+
* as required by the OpenGraph protocol. Use the helpers below rather
|
|
9
|
+
* than reading the map directly so the locale logic stays in one
|
|
10
|
+
* place.
|
|
11
|
+
*/
|
|
12
|
+
export const OG_LOCALES = {
|
|
13
|
+
en: 'en_GB',
|
|
14
|
+
sv: 'sv_SE',
|
|
15
|
+
da: 'da_DK',
|
|
16
|
+
no: 'nb_NO',
|
|
17
|
+
fi: 'fi_FI',
|
|
18
|
+
de: 'de_DE',
|
|
19
|
+
fr: 'fr_FR',
|
|
20
|
+
es: 'es_ES',
|
|
21
|
+
nl: 'nl_NL',
|
|
22
|
+
ar: 'ar_SA',
|
|
23
|
+
he: 'he_IL',
|
|
24
|
+
ja: 'ja_JP',
|
|
25
|
+
ko: 'ko_KR',
|
|
26
|
+
zh: 'zh_CN',
|
|
27
|
+
};
|
|
28
|
+
/**
 * Resolve the OpenGraph locale for an ISO 639-1 language code.
 *
 * The result is the underscore-separated `language_TERRITORY` form stored
 * in `OG_LOCALES` (this is the OpenGraph form, not a hyphenated BCP-47
 * tag). Any code that is not an own key of `OG_LOCALES` resolves to the
 * English entry, `OG_LOCALES.en`.
 *
 * @param lang - ISO 639-1 language code (e.g., `"en"`, `"sv"`)
 * @returns OpenGraph `language_TERRITORY` locale (e.g., `"en_GB"`)
 */
export function getOgLocale(lang) {
    // Own-property test only: inherited names such as "toString" must not
    // resolve to a prototype member of the map.
    if (!Object.hasOwn(OG_LOCALES, lang)) {
        return OG_LOCALES.en;
    }
    const locale = OG_LOCALES[lang];
    // A nullish stored value still degrades to the English locale.
    return locale ?? OG_LOCALES.en;
}
|
|
41
|
+
/**
 * Build the OpenGraph locale meta tag block for one rendered language.
 *
 * The first line is the canonical `og:locale` tag for `currentLang`; it is
 * followed by one `og:locale:alternate` tag for every entry of
 * `ALL_LANGUAGES` other than `currentLang`, in `ALL_LANGUAGES` order.
 * Every locale value is resolved through `getOgLocale`.
 *
 * Each tag carries the leading indentation embedded in its template
 * literal so the fragment lines up inside `<head>`.
 * NOTE(review): earlier documentation described this indent as two
 * spaces; confirm the literal's leading whitespace against the
 * unrendered source.
 *
 * @param currentLang - Language being rendered (drives `og:locale`)
 * @returns Newline-joined HTML fragment (no trailing newline)
 */
export function buildOgLocaleTags(currentLang) {
    const tags = [` <meta property="og:locale" content="${getOgLocale(currentLang)}">`];
    for (const code of ALL_LANGUAGES) {
        // The rendered language is already covered by the canonical tag.
        if (code === currentLang) {
            continue;
        }
        tags.push(` <meta property="og:locale:alternate" content="${getOgLocale(code)}">`);
    }
    return tags.join('\n');
}
|
|
59
|
+
//# sourceMappingURL=og-locales.js.map
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module Constants/SocialHandles
|
|
3
|
+
* @description Verified social-media handles for `twitter:site` /
|
|
4
|
+
* `twitter:creator` and the canonical organization sameAs list emitted
|
|
5
|
+
* into JSON-LD on every generated page.
|
|
6
|
+
*
|
|
7
|
+
* Why this lives in one file: the OpenGraph crawler, the Twitter
|
|
8
|
+
* card validator, and Google's structured-data `NewsMediaOrganization`
|
|
9
|
+
* graph all expect the same handle and the same sameAs URLs. Keeping
|
|
10
|
+
* them in one constant avoids drift between the four generators.
|
|
11
|
+
*
|
|
12
|
+
* **Empty-string semantics**: when a handle is not yet provisioned
|
|
13
|
+
* the matching constant is `''` (empty string). The emit helpers
|
|
14
|
+
* skip emitting an empty tag rather than producing
|
|
15
|
+
* `<meta name="twitter:site" content="">`, which Twitter rejects as
|
|
16
|
+
* malformed.
|
|
17
|
+
*/
|
|
18
|
+
/**
|
|
19
|
+
* Twitter / X handle for the publishing organization, including the
|
|
20
|
+
* leading `@`. Used for `<meta name="twitter:site">`.
|
|
21
|
+
*
|
|
22
|
+
* Currently empty (no verified Twitter presence as of May 2026); set
|
|
23
|
+
* this to e.g. `'@hack23ab'` once the org account is verified, and
|
|
24
|
+
* every generated page will start emitting the tag automatically.
|
|
25
|
+
*/
|
|
26
|
+
export declare const TWITTER_SITE_HANDLE = "";
|
|
27
|
+
/**
|
|
28
|
+
* Twitter / X handle for the editorial team (defaults to the org
|
|
29
|
+
* handle when no separate creator account exists). Used for
|
|
30
|
+
* `<meta name="twitter:creator">`.
|
|
31
|
+
*/
|
|
32
|
+
export declare const TWITTER_CREATOR_HANDLE = "";
|
|
33
|
+
/**
|
|
34
|
+
* Canonical sameAs URLs for the publishing `NewsMediaOrganization`.
|
|
35
|
+
* Emitted into the JSON-LD graph on every article and landing page so
|
|
36
|
+
* Google and other crawlers can verify the publisher identity across
|
|
37
|
+
* its different surfaces.
|
|
38
|
+
*
|
|
39
|
+
* Order matters: most authoritative / most-trafficked surface first.
|
|
40
|
+
*/
|
|
41
|
+
export declare const ORG_SAME_AS: readonly string[];
|
|
42
|
+
/**
|
|
43
|
+
* Build the `twitter:site` / `twitter:creator` meta-tag block. Returns
|
|
44
|
+
* an empty string when neither handle is configured so the caller can
|
|
45
|
+
* safely interpolate the result without producing empty meta tags.
|
|
46
|
+
*
|
|
47
|
+
* @returns Newline-joined meta tags (no trailing newline) or `''`
|
|
48
|
+
*/
|
|
49
|
+
export declare function buildTwitterAttributionTags(): string;
|
|
50
|
+
//# sourceMappingURL=social-handles.d.ts.map
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
/**
|
|
4
|
+
* @module Constants/SocialHandles
|
|
5
|
+
* @description Verified social-media handles for `twitter:site` /
|
|
6
|
+
* `twitter:creator` and the canonical organization sameAs list emitted
|
|
7
|
+
* into JSON-LD on every generated page.
|
|
8
|
+
*
|
|
9
|
+
* Why this lives in one file: the OpenGraph crawler, the Twitter
|
|
10
|
+
* card validator, and Google's structured-data `NewsMediaOrganization`
|
|
11
|
+
* graph all expect the same handle and the same sameAs URLs. Keeping
|
|
12
|
+
* them in one constant avoids drift between the four generators.
|
|
13
|
+
*
|
|
14
|
+
* **Empty-string semantics**: when a handle is not yet provisioned
|
|
15
|
+
* the matching constant is `''` (empty string). The emit helpers
|
|
16
|
+
* skip emitting an empty tag rather than producing
|
|
17
|
+
* `<meta name="twitter:site" content="">`, which Twitter rejects as
|
|
18
|
+
* malformed.
|
|
19
|
+
*/
|
|
20
|
+
/**
|
|
21
|
+
* Twitter / X handle for the publishing organization, including the
|
|
22
|
+
* leading `@`. Used for `<meta name="twitter:site">`.
|
|
23
|
+
*
|
|
24
|
+
* Currently empty (no verified Twitter presence as of May 2026); set
|
|
25
|
+
* this to e.g. `'@hack23ab'` once the org account is verified, and
|
|
26
|
+
* every generated page will start emitting the tag automatically.
|
|
27
|
+
*/
|
|
28
|
+
export const TWITTER_SITE_HANDLE = '';
|
|
29
|
+
/**
|
|
30
|
+
* Twitter / X handle for the editorial team (defaults to the org
|
|
31
|
+
* handle when no separate creator account exists). Used for
|
|
32
|
+
* `<meta name="twitter:creator">`.
|
|
33
|
+
*/
|
|
34
|
+
export const TWITTER_CREATOR_HANDLE = '';
|
|
35
|
+
/**
|
|
36
|
+
* Canonical sameAs URLs for the publishing `NewsMediaOrganization`.
|
|
37
|
+
* Emitted into the JSON-LD graph on every article and landing page so
|
|
38
|
+
* Google and other crawlers can verify the publisher identity across
|
|
39
|
+
* its different surfaces.
|
|
40
|
+
*
|
|
41
|
+
* Order matters: most authoritative / most-trafficked surface first.
|
|
42
|
+
*/
|
|
43
|
+
export const ORG_SAME_AS = [
|
|
44
|
+
'https://github.com/Hack23',
|
|
45
|
+
'https://github.com/Hack23/euparliamentmonitor',
|
|
46
|
+
'https://hack23.com',
|
|
47
|
+
];
|
|
48
|
+
/**
 * Build the `twitter:site` / `twitter:creator` meta-tag block.
 *
 * - `twitter:site` is emitted when `TWITTER_SITE_HANDLE` is non-empty.
 * - `twitter:creator` uses `TWITTER_CREATOR_HANDLE` and, when that is
 *   empty, falls back to `TWITTER_SITE_HANDLE` — the "defaults to the org
 *   handle" behaviour documented on `TWITTER_CREATOR_HANDLE`.
 *
 * Returns an empty string when neither handle is configured, so the
 * caller can interpolate the result without producing empty meta tags.
 *
 * @returns Newline-joined meta tags (no trailing newline) or `''`
 */
export function buildTwitterAttributionTags() {
    const tags = [];
    if (TWITTER_SITE_HANDLE) {
        tags.push(` <meta name="twitter:site" content="${TWITTER_SITE_HANDLE}">`);
    }
    // No separate creator account configured → attribute to the org handle.
    const creatorHandle = TWITTER_CREATOR_HANDLE || TWITTER_SITE_HANDLE;
    if (creatorHandle) {
        tags.push(` <meta name="twitter:creator" content="${creatorHandle}">`);
    }
    return tags.join('\n');
}
|
|
65
|
+
//# sourceMappingURL=social-handles.js.map
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module Constants/SocialHandles
|
|
3
|
+
* @description Backward-compatible re-export shim. The canonical
|
|
4
|
+
* location is `src/constants/seo/social-handles.ts`; this file remains
|
|
5
|
+
* so existing imports `from '../constants/social-handles.js'` keep
|
|
6
|
+
* working through the May-2026 architecture refactor.
|
|
7
|
+
*
|
|
8
|
+
* New code SHOULD import from `src/constants/seo/index.js`.
|
|
9
|
+
*/
|
|
10
|
+
export { TWITTER_SITE_HANDLE, TWITTER_CREATOR_HANDLE, ORG_SAME_AS, buildTwitterAttributionTags, } from './seo/social-handles.js';
|
|
11
|
+
//# sourceMappingURL=social-handles.d.ts.map
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
/**
|
|
4
|
+
* @module Constants/SocialHandles
|
|
5
|
+
* @description Backward-compatible re-export shim. The canonical
|
|
6
|
+
* location is `src/constants/seo/social-handles.ts`; this file remains
|
|
7
|
+
* so existing imports `from '../constants/social-handles.js'` keep
|
|
8
|
+
* working through the May-2026 architecture refactor.
|
|
9
|
+
*
|
|
10
|
+
* New code SHOULD import from `src/constants/seo/index.js`.
|
|
11
|
+
*/
|
|
12
|
+
export { TWITTER_SITE_HANDLE, TWITTER_CREATOR_HANDLE, ORG_SAME_AS, buildTwitterAttributionTags, } from './seo/social-handles.js';
|
|
13
|
+
//# sourceMappingURL=social-handles.js.map
|
|
@@ -19,14 +19,34 @@
|
|
|
19
19
|
* comfortably inside the gh-aw safe-outputs 10 MB patch ceiling and the
|
|
20
20
|
* Claude Sonnet 4.6 60-minute wall-clock budget.
|
|
21
21
|
*
|
|
22
|
-
* Priority rules
|
|
22
|
+
* Priority rules — `fresh-then-backlog` mode (default):
|
|
23
23
|
*
|
|
24
|
-
*
|
|
25
|
-
*
|
|
26
|
-
*
|
|
27
|
-
*
|
|
28
|
-
*
|
|
29
|
-
*
|
|
24
|
+
* The queue is built from two pools so the day's newest brief still gets
|
|
25
|
+
* timely coverage on at least one of the three daily runs, while the
|
|
26
|
+
* long-tail backlog of older briefs (currently ~92 sources / ~1,196 missing
|
|
27
|
+
* translations) actually drains rather than being starved by today's wins.
|
|
28
|
+
*
|
|
29
|
+
* 1. **Fresh slice** (at most 1 entry per run): newest source with any
|
|
30
|
+
* missing language. Tie-breakers: more-missing first, slug asc,
|
|
31
|
+
* non-extended first.
|
|
32
|
+
* 2. **Backlog slice** (remaining `max-briefs - 1` slots): every other
|
|
33
|
+
* source with gaps, sorted by `<date>` ASC (oldest first), then
|
|
34
|
+
* `missingCount` ASC (finish half-done briefs before starting new
|
|
35
|
+
* ones), then `<slug>` ASC, then non-extended first.
|
|
36
|
+
*
|
|
37
|
+
* Final queue = `freshSlice.concat(backlogSlice)`.
|
|
38
|
+
*
|
|
39
|
+
* When `--max-briefs 1`, alternate fresh/backlog by run-number parity
|
|
40
|
+
* (`--run-number`, default 0, normally driven by `$GITHUB_RUN_NUMBER`) so
|
|
41
|
+
* the scheduled cadence still drains backlog while preserving freshness.
|
|
42
|
+
*
|
|
43
|
+
* Alternative modes (via `--mode`):
|
|
44
|
+
*
|
|
45
|
+
* - `backlog-only` — drop the fresh slot entirely; oldest-first across the
|
|
46
|
+
* entire backlog. Useful for catch-up batches.
|
|
47
|
+
* - `newest-first` — legacy behaviour: newest date first, more-missing
|
|
48
|
+
* first, slug asc. Retained for one-off operator dispatch where the
|
|
49
|
+
* operator explicitly wants today's brief covered first.
|
|
30
50
|
*
|
|
31
51
|
* Invocation:
|
|
32
52
|
*
|
|
@@ -34,6 +54,8 @@
|
|
|
34
54
|
* [--repo-root <path>] \
|
|
35
55
|
* [--max-briefs <n>] # default 2
|
|
36
56
|
* [--max-age-days <n>] # default 180; older briefs are skipped
|
|
57
|
+
* [--mode <name>] # fresh-then-backlog | backlog-only | newest-first
|
|
58
|
+
* [--run-number <n>] # parity selector when --max-briefs 1
|
|
37
59
|
* [--output <path>] # default stdout
|
|
38
60
|
* [--include-extended] # also scan extended/executive-brief.md
|
|
39
61
|
*
|
|
@@ -44,12 +66,15 @@
|
|
|
44
66
|
* Output JSON shape:
|
|
45
67
|
* {
|
|
46
68
|
* "generatedAt": "2026-05-16T08:24:16.909Z",
|
|
69
|
+
* "options": { "mode": "fresh-then-backlog", "runNumber": 207, ... },
|
|
47
70
|
* "totals": {
|
|
48
71
|
* "sourcesScanned": 92,
|
|
49
72
|
* "sourcesWithGaps": 92,
|
|
50
73
|
* "translationsMissing": 1196,
|
|
51
74
|
* "queued": 2,
|
|
52
75
|
* "queuedTranslations": 26,
|
|
76
|
+
* "freshNewestDate": "2026-05-16",
|
|
77
|
+
* "backlogOldestDate": "2025-11-19",
|
|
53
78
|
* "topMissingLangs": [
|
|
54
79
|
* { "lang": "ja", "count": 92 },
|
|
55
80
|
* { "lang": "ko", "count": 92 },
|
|
@@ -63,7 +88,14 @@
|
|
|
63
88
|
* "sourcePath": "analysis/daily/2026-05-15/breaking/executive-brief.md",
|
|
64
89
|
* "missingLangs": ["sv","da","no","fi","de","fr","es","nl","ar","he","ja","ko","zh"],
|
|
65
90
|
* "missingCount": 13,
|
|
66
|
-
* "isExtended": false
|
|
91
|
+
* "isExtended": false,
|
|
92
|
+
* "sourceH2Count": 8,
|
|
93
|
+
* "sourceH2Titles": [
|
|
94
|
+
* { "line": 7, "title": "Headline Intelligence" },
|
|
95
|
+
* { "line": 96, "title": "IMF Economic Context" },
|
|
96
|
+
* { "line": 146, "title": "IMF Economic Context — May 2026 Update" }
|
|
97
|
+
* ],
|
|
98
|
+
* "sourceFixedTokens": { "IMF": 17, "WEO": 2, "TA-id": 4 }
|
|
67
99
|
* },
|
|
68
100
|
* ...
|
|
69
101
|
* ]
|
|
@@ -82,6 +114,13 @@ export const TARGET_LANGS = Object.freeze(ALL_LANGUAGES.filter((lang) => lang !=
|
|
|
82
114
|
/** Manual-dispatch upper bound that keeps one 60-minute run inside budget. */
|
|
83
115
|
export const MAX_BRIEFS_LIMIT = 4;
|
|
84
116
|
|
|
117
|
+
/** Discovery prioritisation modes. */
|
|
118
|
+
export const DISCOVERY_MODES = Object.freeze([
|
|
119
|
+
'fresh-then-backlog',
|
|
120
|
+
'backlog-only',
|
|
121
|
+
'newest-first',
|
|
122
|
+
]);
|
|
123
|
+
|
|
85
124
|
/**
|
|
86
125
|
* Parse CLI argv into an options object. Exported for unit tests.
|
|
87
126
|
* @param {string[]} argv
|
|
@@ -93,6 +132,8 @@ export function parseArgs(argv) {
|
|
|
93
132
|
maxAgeDays: 180,
|
|
94
133
|
output: null,
|
|
95
134
|
includeExtended: false,
|
|
135
|
+
mode: 'fresh-then-backlog',
|
|
136
|
+
runNumber: 0,
|
|
96
137
|
};
|
|
97
138
|
for (let i = 0; i < argv.length; i += 1) {
|
|
98
139
|
const arg = argv[i];
|
|
@@ -116,6 +157,14 @@ export function parseArgs(argv) {
|
|
|
116
157
|
case '--include-extended':
|
|
117
158
|
opts.includeExtended = true;
|
|
118
159
|
break;
|
|
160
|
+
case '--mode':
|
|
161
|
+
opts.mode = argv[i + 1];
|
|
162
|
+
i += 1;
|
|
163
|
+
break;
|
|
164
|
+
case '--run-number':
|
|
165
|
+
opts.runNumber = Number(argv[i + 1]);
|
|
166
|
+
i += 1;
|
|
167
|
+
break;
|
|
119
168
|
case '--help':
|
|
120
169
|
case '-h':
|
|
121
170
|
printHelp();
|
|
@@ -137,14 +186,22 @@ export function parseArgs(argv) {
|
|
|
137
186
|
if (!Number.isFinite(opts.maxAgeDays) || opts.maxAgeDays < 1) {
|
|
138
187
|
throw new Error('--max-age-days must be a positive integer');
|
|
139
188
|
}
|
|
189
|
+
if (!DISCOVERY_MODES.includes(opts.mode)) {
|
|
190
|
+
throw new Error(
|
|
191
|
+
`--mode must be one of: ${DISCOVERY_MODES.join(', ')} (got "${opts.mode}")`,
|
|
192
|
+
);
|
|
193
|
+
}
|
|
194
|
+
if (!Number.isInteger(opts.runNumber) || opts.runNumber < 0) {
|
|
195
|
+
throw new Error('--run-number must be a non-negative integer');
|
|
196
|
+
}
|
|
140
197
|
return opts;
|
|
141
198
|
}
|
|
142
199
|
|
|
143
200
|
function printHelp() {
|
|
144
201
|
process.stdout.write(
|
|
145
202
|
'Usage: discover-untranslated-briefs.js [--repo-root <path>] ' +
|
|
146
|
-
'[--max-briefs <n>] [--max-age-days <n>] [--
|
|
147
|
-
'[--include-extended]\n'
|
|
203
|
+
'[--max-briefs <n>] [--max-age-days <n>] [--mode <name>] ' +
|
|
204
|
+
'[--run-number <n>] [--output <path>] [--include-extended]\n',
|
|
148
205
|
);
|
|
149
206
|
}
|
|
150
207
|
|
|
@@ -223,13 +280,94 @@ export function findMissingLangs(source) {
|
|
|
223
280
|
return missing;
|
|
224
281
|
}
|
|
225
282
|
|
|
283
|
+
/**
|
|
284
|
+
* Fixed-token classes the translator must preserve verbatim. Aligned with
|
|
285
|
+
* `scripts/validate-brief-translations.js` `FIXED_TOKEN_PATTERNS` so the
|
|
286
|
+
* discovery report and the validator surface the same shape of evidence.
|
|
287
|
+
* Keys are stable identifiers (used by the agent prompt); values are the
|
|
288
|
+
* global regex the count is computed against.
|
|
289
|
+
*
|
|
290
|
+
* @type {ReadonlyArray<{ key: string, pattern: RegExp }>}
|
|
291
|
+
*/
|
|
292
|
+
const FIXED_TOKEN_CLASSES = Object.freeze([
|
|
293
|
+
{ key: 'IMF', pattern: /\bIMF\b/g },
|
|
294
|
+
{ key: 'WEO', pattern: /\bWEO\b/g },
|
|
295
|
+
{ key: 'World Bank', pattern: /\bWorld Bank\b/g },
|
|
296
|
+
{ key: 'Fiscal Monitor', pattern: /\bFiscal Monitor\b/g },
|
|
297
|
+
{ key: 'data-vintage', pattern: /data-vintage="WEO-[A-Za-z]+-\d{4}"/g },
|
|
298
|
+
{ key: 'TA-id', pattern: /\bTA-\d{1,2}-\d{4}-\d{4}\b/g },
|
|
299
|
+
{ key: 'procedure-id', pattern: /\b\d{4}\/\d{4}\([A-Z]{3}\)/g },
|
|
300
|
+
]);
|
|
301
|
+
|
|
302
|
+
/**
 * Extract H2 section titles from a markdown source file.
 *
 * Each result carries the 1-based line number and the visible title (the
 * leading `## ` stripped, surrounding whitespace trimmed). The agent uses
 * this to spot near-duplicate section titles such as
 * `## IMF Economic Context` followed by
 * `## IMF Economic Context — May 2026 Update`, which were silently
 * collapsed across all 13 translations in run #25983007788.
 *
 * Both LF and CRLF line endings are accepted. Splitting on `\n` alone
 * would leave a trailing `\r` on every line of a CRLF file; `.` does not
 * match `\r` and `$` only matches at end of input, so no heading in such
 * a file would ever be reported.
 *
 * @param {string} absPath Absolute path of the markdown file to scan
 * @returns {Array<{ line: number, title: string }>} Titles in file order;
 *   empty when the file does not exist or has no H2 headings
 */
export function extractH2Titles(absPath) {
  if (!fs.existsSync(absPath)) return [];
  const text = fs.readFileSync(absPath, 'utf8');
  // Tolerate Windows line endings so the heading regex sees clean lines.
  const lines = text.split(/\r?\n/);
  // Exactly two hashes, whitespace, then a title starting with a
  // non-space character (`###` and deeper headings do not match).
  const h2Pattern = /^##\s+(\S.*)$/;
  const out = [];
  for (let i = 0; i < lines.length; i += 1) {
    const match = h2Pattern.exec(lines[i]);
    if (match) out.push({ line: i + 1, title: match[1].trim() });
  }
  return out;
}
|
|
326
|
+
|
|
327
|
+
/**
 * Tally how often each `FIXED_TOKEN_CLASSES` pattern occurs in a source
 * brief. Classes without any match are left out of the result, so the
 * queue entry stays compact for short briefs.
 *
 * @param {string} absPath Absolute path of the brief to scan
 * @returns {Record<string, number>} Match count keyed by token-class key;
 *   empty object when the file does not exist
 */
export function countFixedTokens(absPath) {
  const tally = {};
  if (!fs.existsSync(absPath)) return tally;
  const body = fs.readFileSync(absPath, 'utf8');
  FIXED_TOKEN_CLASSES.forEach(({ key, pattern }) => {
    // Work on a private copy so the shared pattern's `lastIndex` is never
    // touched. Every class pattern carries the `g` flag, so `match`
    // returns all occurrences (or null when there are none).
    const matcher = new RegExp(pattern.source, pattern.flags);
    const hits = body.match(matcher);
    if (hits !== null && hits.length > 0) tally[key] = hits.length;
  });
  return tally;
}
|
|
347
|
+
|
|
226
348
|
/**
|
|
227
349
|
* Build the prioritised queue. See module docstring for ordering rules.
|
|
228
350
|
*
|
|
229
351
|
* @param {ReturnType<typeof findExecutiveBriefSources>} sources
|
|
230
|
-
* @param {number}
|
|
352
|
+
* @param {number | { maxBriefs: number, mode?: string, runNumber?: number }} options
|
|
353
|
+
* Numeric form retained for signature compatibility (ordering follows the default mode, not the legacy `newest-first`) — equivalent to
|
|
354
|
+
* `{ maxBriefs, mode: 'fresh-then-backlog', runNumber: 0 }`.
|
|
231
355
|
*/
|
|
232
|
-
export function buildQueue(sources,
|
|
356
|
+
export function buildQueue(sources, options) {
|
|
357
|
+
const opts =
|
|
358
|
+
typeof options === 'number'
|
|
359
|
+
? { maxBriefs: options, mode: 'fresh-then-backlog', runNumber: 0 }
|
|
360
|
+
: {
|
|
361
|
+
maxBriefs: options.maxBriefs,
|
|
362
|
+
mode: options.mode || 'fresh-then-backlog',
|
|
363
|
+
runNumber: Number.isFinite(options.runNumber) ? options.runNumber : 0,
|
|
364
|
+
};
|
|
365
|
+
if (!DISCOVERY_MODES.includes(opts.mode)) {
|
|
366
|
+
throw new Error(
|
|
367
|
+
`buildQueue: invalid mode "${opts.mode}" (expected one of ${DISCOVERY_MODES.join(', ')})`,
|
|
368
|
+
);
|
|
369
|
+
}
|
|
370
|
+
|
|
233
371
|
const withGaps = [];
|
|
234
372
|
let totalMissing = 0;
|
|
235
373
|
const missingByLang = new Map();
|
|
@@ -240,6 +378,16 @@ export function buildQueue(sources, maxBriefs) {
|
|
|
240
378
|
for (const lang of missing) {
|
|
241
379
|
missingByLang.set(lang, (missingByLang.get(lang) || 0) + 1);
|
|
242
380
|
}
|
|
381
|
+
// Pre-compute structural targets for the translator agent so it has
|
|
382
|
+
// explicit visibility into duplicate-titled H2 sections and the
|
|
383
|
+
// verbatim-preserve token budget BEFORE any translation is written.
|
|
384
|
+
// Surfacing these here (rather than relying on the agent to discover
|
|
385
|
+
// them) prevents the regression observed in run #25983007788, where
|
|
386
|
+
// 13 sibling translations of a single brief silently collapsed a
|
|
387
|
+
// `## IMF Economic Context — May 2026 Update` section because the
|
|
388
|
+
// agent treated it as a duplicate of `## IMF Economic Context`.
|
|
389
|
+
const sourceH2Titles = extractH2Titles(source.absPath);
|
|
390
|
+
const sourceFixedTokens = countFixedTokens(source.absPath);
|
|
243
391
|
withGaps.push({
|
|
244
392
|
date: source.date,
|
|
245
393
|
slug: source.slug,
|
|
@@ -247,21 +395,59 @@ export function buildQueue(sources, maxBriefs) {
|
|
|
247
395
|
missingLangs: missing,
|
|
248
396
|
missingCount: missing.length,
|
|
249
397
|
isExtended: source.isExtended,
|
|
398
|
+
sourceH2Titles,
|
|
399
|
+
sourceH2Count: sourceH2Titles.length,
|
|
400
|
+
sourceFixedTokens,
|
|
250
401
|
});
|
|
251
402
|
}
|
|
252
403
|
|
|
253
|
-
//
|
|
254
|
-
|
|
255
|
-
withGaps.sort((a, b) => {
|
|
404
|
+
// Two canonical orderings drive the three modes.
|
|
405
|
+
const newestFirst = (a, b) => {
|
|
256
406
|
if (a.date !== b.date) return a.date < b.date ? 1 : -1;
|
|
257
407
|
if (a.missingCount !== b.missingCount) return b.missingCount - a.missingCount;
|
|
258
408
|
if (a.slug !== b.slug) return a.slug < b.slug ? -1 : 1;
|
|
259
|
-
// Prefer non-extended over extended when both exist for the same slug
|
|
260
409
|
if (a.isExtended !== b.isExtended) return a.isExtended ? 1 : -1;
|
|
261
410
|
return 0;
|
|
262
|
-
}
|
|
411
|
+
};
|
|
412
|
+
const oldestFirstFinishPartial = (a, b) => {
|
|
413
|
+
if (a.date !== b.date) return a.date < b.date ? -1 : 1;
|
|
414
|
+
// Within the same date, finish briefs that are closer to completion
|
|
415
|
+
// first (fewer missing languages → ascending).
|
|
416
|
+
if (a.missingCount !== b.missingCount) return a.missingCount - b.missingCount;
|
|
417
|
+
if (a.slug !== b.slug) return a.slug < b.slug ? -1 : 1;
|
|
418
|
+
if (a.isExtended !== b.isExtended) return a.isExtended ? 1 : -1;
|
|
419
|
+
return 0;
|
|
420
|
+
};
|
|
263
421
|
|
|
264
|
-
|
|
422
|
+
let queue;
|
|
423
|
+
if (opts.mode === 'newest-first') {
|
|
424
|
+
queue = [...withGaps].sort(newestFirst).slice(0, opts.maxBriefs);
|
|
425
|
+
} else if (opts.mode === 'backlog-only') {
|
|
426
|
+
queue = [...withGaps].sort(oldestFirstFinishPartial).slice(0, opts.maxBriefs);
|
|
427
|
+
} else {
|
|
428
|
+
// fresh-then-backlog
|
|
429
|
+
const newestSorted = [...withGaps].sort(newestFirst);
|
|
430
|
+
const oldestSorted = [...withGaps].sort(oldestFirstFinishPartial);
|
|
431
|
+
if (opts.maxBriefs === 1) {
|
|
432
|
+
// Alternate fresh/backlog by run-number parity so the scheduled
|
|
433
|
+
// cadence still drains backlog while preserving freshness on every
|
|
434
|
+
// other slot. Even run-numbers (0, 2, ...) take the fresh slot;
|
|
435
|
+
// odd run-numbers take the oldest backlog slot.
|
|
436
|
+
const pool = opts.runNumber % 2 === 0 ? newestSorted : oldestSorted;
|
|
437
|
+
queue = pool.slice(0, 1);
|
|
438
|
+
} else {
|
|
439
|
+
const freshSlice = newestSorted.slice(0, 1);
|
|
440
|
+
const freshKey = freshSlice[0]
|
|
441
|
+
? `${freshSlice[0].date}\u0000${freshSlice[0].slug}\u0000${freshSlice[0].isExtended}`
|
|
442
|
+
: null;
|
|
443
|
+
const backlogSlice = oldestSorted
|
|
444
|
+
.filter(
|
|
445
|
+
(entry) => `${entry.date}\u0000${entry.slug}\u0000${entry.isExtended}` !== freshKey,
|
|
446
|
+
)
|
|
447
|
+
.slice(0, Math.max(0, opts.maxBriefs - 1));
|
|
448
|
+
queue = [...freshSlice, ...backlogSlice];
|
|
449
|
+
}
|
|
450
|
+
}
|
|
265
451
|
const queuedTranslations = queue.reduce((sum, item) => sum + item.missingCount, 0);
|
|
266
452
|
|
|
267
453
|
// Top 3 most-blocked target languages across the entire backlog. Operators
|
|
@@ -273,6 +459,17 @@ export function buildQueue(sources, maxBriefs) {
|
|
|
273
459
|
.slice(0, 3)
|
|
274
460
|
.map(([lang, count]) => ({ lang, count }));
|
|
275
461
|
|
|
462
|
+
// Operator-visibility extents: newest source still carrying gaps (the
|
|
463
|
+
// candidate for the fresh slot) and oldest source still carrying gaps
|
|
464
|
+
// (the candidate for the backlog slot). Both fall back to null when the
|
|
465
|
+
// backlog is empty.
|
|
466
|
+
let freshNewestDate = null;
|
|
467
|
+
let backlogOldestDate = null;
|
|
468
|
+
for (const entry of withGaps) {
|
|
469
|
+
if (freshNewestDate === null || entry.date > freshNewestDate) freshNewestDate = entry.date;
|
|
470
|
+
if (backlogOldestDate === null || entry.date < backlogOldestDate) backlogOldestDate = entry.date;
|
|
471
|
+
}
|
|
472
|
+
|
|
276
473
|
return {
|
|
277
474
|
totals: {
|
|
278
475
|
sourcesScanned: sources.length,
|
|
@@ -280,6 +477,8 @@ export function buildQueue(sources, maxBriefs) {
|
|
|
280
477
|
translationsMissing: totalMissing,
|
|
281
478
|
queued: queue.length,
|
|
282
479
|
queuedTranslations,
|
|
480
|
+
freshNewestDate,
|
|
481
|
+
backlogOldestDate,
|
|
283
482
|
topMissingLangs,
|
|
284
483
|
},
|
|
285
484
|
queue,
|
|
@@ -296,13 +495,19 @@ export function main(argv) {
|
|
|
296
495
|
includeExtended: opts.includeExtended,
|
|
297
496
|
maxAgeDays: opts.maxAgeDays,
|
|
298
497
|
});
|
|
299
|
-
const { totals, queue } = buildQueue(sources,
|
|
498
|
+
const { totals, queue } = buildQueue(sources, {
|
|
499
|
+
maxBriefs: opts.maxBriefs,
|
|
500
|
+
mode: opts.mode,
|
|
501
|
+
runNumber: opts.runNumber,
|
|
502
|
+
});
|
|
300
503
|
const payload = {
|
|
301
504
|
generatedAt: new Date().toISOString(),
|
|
302
505
|
options: {
|
|
303
506
|
maxBriefs: opts.maxBriefs,
|
|
304
507
|
maxAgeDays: opts.maxAgeDays,
|
|
305
508
|
includeExtended: opts.includeExtended,
|
|
509
|
+
mode: opts.mode,
|
|
510
|
+
runNumber: opts.runNumber,
|
|
306
511
|
},
|
|
307
512
|
totals,
|
|
308
513
|
queue,
|
|
@@ -8,6 +8,25 @@ import type { ParsedArticle } from '../types/index.js';
|
|
|
8
8
|
* @returns Filename string
|
|
9
9
|
*/
|
|
10
10
|
export declare function getIndexFilename(lang: string): string;
|
|
11
|
+
/**
|
|
12
|
+
* Heal JSON-LD `description` field corruption left behind by a prior
|
|
13
|
+
* version of {@link applyArticleSeoBackfill}. The old regex
|
|
14
|
+
* `"description":"[^"]*"` terminated at the first JSON-escaped quote
|
|
15
|
+
* (`\"`), so every rebuild prepended a new value in front of the
|
|
16
|
+
* previous description's tail — producing an unparseable JSON-LD
|
|
17
|
+
* block whose `description` value was followed by a run of repeated
|
|
18
|
+
* fragments before `,"datePublished"`.
|
|
19
|
+
*
|
|
20
|
+
* This pass is idempotent: when the JSON-LD is already well-formed,
|
|
21
|
+
* the regex `[^,]*` matches the empty string and the file is left
|
|
22
|
+
* unchanged. It runs unconditionally because the original backfill
|
|
23
|
+
* path skips files whose `<meta name="description">` is already
|
|
24
|
+
* clean, even when their JSON-LD is corrupted.
|
|
25
|
+
*
|
|
26
|
+
* @param filenames - News article filenames to inspect
|
|
27
|
+
* @returns Number of HTML files updated
|
|
28
|
+
*/
|
|
29
|
+
export declare function healJsonLdDescriptionCorruption(filenames: readonly string[]): number;
|
|
11
30
|
/**
|
|
12
31
|
* Prefix legacy descriptions with date and **localized** category label
|
|
13
32
|
* so duplicate strings become page-specific before the 180-character
|
|
@@ -36,6 +55,22 @@ export declare function getIndexFilename(lang: string): string;
|
|
|
36
55
|
* already substantive
|
|
37
56
|
*/
|
|
38
57
|
export declare function buildLegacyBackfillDescription(date: string, slug: string, lang: string, description: string): string;
|
|
58
|
+
/**
|
|
59
|
+
* Apply SEO meta tag replacements to a complete article HTML document.
|
|
60
|
+
*
|
|
61
|
+
* Exported for the regression test in
|
|
62
|
+
* `test/unit/news-indexes-jsonld-description-regex.test.js`, which
|
|
63
|
+
* locks in the JSON-LD description regex against the duplicate-tail
|
|
64
|
+
* bug (the legacy `"description":"[^"]*"` pattern terminated at the
|
|
65
|
+
* first JSON-escaped quote `\"` and left the previous description's
|
|
66
|
+
* tail in place, accumulating duplicates on every prebuild run).
|
|
67
|
+
*
|
|
68
|
+
* @param html - Existing article HTML
|
|
69
|
+
* @param description - Backfilled meta description
|
|
70
|
+
* @param keywords - Backfilled keyword list
|
|
71
|
+
* @returns Updated HTML
|
|
72
|
+
*/
|
|
73
|
+
export declare function applyArticleSeoBackfill(html: string, description: string, keywords: readonly string[]): string;
|
|
39
74
|
/**
|
|
40
75
|
* Backfill hreflang alternate links for all article HTML files.
|
|
41
76
|
*
|