euparliamentmonitor 0.8.32 → 0.8.34
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/package.json +4 -4
- package/scripts/constants/analysis-constants.d.ts +1 -1
- package/scripts/constants/analysis-constants.js +1 -1
- package/scripts/constants/language-articles.js +1 -1
- package/scripts/generators/news-enhanced.d.ts +2 -2
- package/scripts/generators/news-enhanced.js +3 -3
- package/scripts/generators/pipeline/generate-stage.js +2 -2
- package/scripts/mcp/wb-mcp-client.d.ts +10 -0
- package/scripts/mcp/wb-mcp-client.js +18 -0
- package/scripts/templates/article-template.d.ts +4 -0
- package/scripts/templates/article-template.js +10 -44
- package/scripts/utils/content-validator.d.ts +67 -0
- package/scripts/utils/content-validator.js +394 -0
- package/scripts/utils/validate-articles.js +111 -2
package/README.md
CHANGED
|
@@ -984,7 +984,7 @@ Projected workflow counts below include all CI/CD workflow definitions, agentic
|
|
|
984
984
|
|
|
985
985
|
| Year | Projected Workflow Definitions | AI Model | Key Capability |
|
|
986
986
|
|------|-------------------------------|----------|----------------|
|
|
987
|
-
| **2026** | 44–50 | Opus 4.
|
|
987
|
+
| **2026** | 44–50 | Opus 4.7–4.9 | 🟢 Agentic news generation |
|
|
988
988
|
| **2027** | 50–55 | Opus 5.x | 🔵 Predictive analytics |
|
|
989
989
|
| **2028** | 55–65 | Opus 6.x | 🟣 Multi-modal content |
|
|
990
990
|
| **2029** | 65–75 | Opus 7.x | 🟠 Autonomous pipeline |
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "euparliamentmonitor",
|
|
3
|
-
"version": "0.8.
|
|
3
|
+
"version": "0.8.34",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "European Parliament Intelligence Platform - Monitor political activity with systematic transparency",
|
|
6
6
|
"main": "scripts/index.js",
|
|
@@ -135,7 +135,7 @@
|
|
|
135
135
|
},
|
|
136
136
|
"homepage": "https://euparliamentmonitor.com",
|
|
137
137
|
"devDependencies": {
|
|
138
|
-
"@axe-core/playwright": "4.11.
|
|
138
|
+
"@axe-core/playwright": "4.11.2",
|
|
139
139
|
"@eslint/js": "10.0.1",
|
|
140
140
|
"@playwright/test": "1.59.1",
|
|
141
141
|
"@types/d3": "7.4.3",
|
|
@@ -148,7 +148,7 @@
|
|
|
148
148
|
"chart.js": "4.5.1",
|
|
149
149
|
"chartjs-plugin-annotation": "3.1.0",
|
|
150
150
|
"d3": "7.9.0",
|
|
151
|
-
"eslint": "10.2.
|
|
151
|
+
"eslint": "10.2.1",
|
|
152
152
|
"eslint-config-prettier": "10.1.8",
|
|
153
153
|
"eslint-plugin-jsdoc": "62.9.0",
|
|
154
154
|
"eslint-plugin-security": "4.0.0",
|
|
@@ -163,7 +163,7 @@
|
|
|
163
163
|
"ts-api-utils": "2.5.0",
|
|
164
164
|
"tsx": "4.21.0",
|
|
165
165
|
"typedoc": "0.28.19",
|
|
166
|
-
"typescript": "6.0.
|
|
166
|
+
"typescript": "6.0.3",
|
|
167
167
|
"vitest": "4.1.4"
|
|
168
168
|
},
|
|
169
169
|
"engines": {
|
|
package/scripts/constants/analysis-constants.d.ts
CHANGED
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
* Any narrative or interpretive analysis text (for example: why, outlook,
|
|
16
16
|
* impact assessments, stakeholder reasoning, or mistake/consequence
|
|
17
17
|
* explanations) that is LEFT AS THIS MARKER is expected to be generated by
|
|
18
|
-
* the AI agent
|
|
18
|
+
* the AI agent in the agentic workflow, not by code.
|
|
19
19
|
*/
|
|
20
20
|
export declare const AI_MARKER = "[AI_ANALYSIS_REQUIRED]";
|
|
21
21
|
/**
|
|
package/scripts/constants/analysis-constants.js
CHANGED
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
* Any narrative or interpretive analysis text (for example: why, outlook,
|
|
18
18
|
* impact assessments, stakeholder reasoning, or mistake/consequence
|
|
19
19
|
* explanations) that is LEFT AS THIS MARKER is expected to be generated by
|
|
20
|
-
* the AI agent
|
|
20
|
+
* the AI agent in the agentic workflow, not by code.
|
|
21
21
|
*/
|
|
22
22
|
export const AI_MARKER = '[AI_ANALYSIS_REQUIRED]';
|
|
23
23
|
/**
|
|
package/scripts/constants/language-articles.js
CHANGED
|
@@ -3140,7 +3140,7 @@ export const WEEK_AHEAD_STAKEHOLDER_STRINGS = {
|
|
|
3140
3140
|
reasonInstitutionsCoordination: '需要机构间协调',
|
|
3141
3141
|
},
|
|
3142
3142
|
};
|
|
3143
|
-
// ─── AI analysis marker — all analysis text is produced by the AI agent
|
|
3143
|
+
// ─── AI analysis marker — all analysis text is produced by the AI agent ───
|
|
3144
3144
|
const AI_ANALYSIS_MARKER = '[AI_ANALYSIS_REQUIRED]';
|
|
3145
3145
|
const BRK_WHY_ANOMALIES = AI_ANALYSIS_MARKER;
|
|
3146
3146
|
const BRK_WHY_NORMAL = AI_ANALYSIS_MARKER;
|
|
package/scripts/generators/news-enhanced.d.ts
CHANGED
|
@@ -26,13 +26,13 @@ export declare const runId: string;
|
|
|
26
26
|
/**
|
|
27
27
|
* AI-generated article title passed by the agentic workflow.
|
|
28
28
|
* When provided, this OVERRIDES any script-generated title.
|
|
29
|
-
* The AI agent
|
|
29
|
+
* The AI agent must analyse the content and produce this.
|
|
30
30
|
*/
|
|
31
31
|
export declare const aiTitle: string;
|
|
32
32
|
/**
|
|
33
33
|
* AI-generated article description/subtitle passed by the agentic workflow.
|
|
34
34
|
* When provided, this OVERRIDES any script-generated description.
|
|
35
|
-
* The AI agent
|
|
35
|
+
* The AI agent must analyse the content and produce this.
|
|
36
36
|
*/
|
|
37
37
|
export declare const aiDescription: string;
|
|
38
38
|
/**
|
|
package/scripts/generators/news-enhanced.js
CHANGED
|
@@ -96,13 +96,13 @@ export const runId = (runIdArg?.slice('--run-id='.length).trim() ||
|
|
|
96
96
|
/**
|
|
97
97
|
* AI-generated article title passed by the agentic workflow.
|
|
98
98
|
* When provided, this OVERRIDES any script-generated title.
|
|
99
|
-
* The AI agent
|
|
99
|
+
* The AI agent must analyse the content and produce this.
|
|
100
100
|
*/
|
|
101
101
|
export const aiTitle = titleArg ? titleArg.slice('--title='.length).trim() : '';
|
|
102
102
|
/**
|
|
103
103
|
* AI-generated article description/subtitle passed by the agentic workflow.
|
|
104
104
|
* When provided, this OVERRIDES any script-generated description.
|
|
105
|
-
* The AI agent
|
|
105
|
+
* The AI agent must analyse the content and produce this.
|
|
106
106
|
*/
|
|
107
107
|
export const aiDescription = descriptionArg
|
|
108
108
|
? descriptionArg.slice('--description='.length).trim()
|
|
@@ -378,7 +378,7 @@ async function runAnalysisWithGuard(date, client) {
|
|
|
378
378
|
}
|
|
379
379
|
/**
|
|
380
380
|
* Wire AI-provided title/description from CLI `--title` and `--description` flags.
|
|
381
|
-
* The AI agent
|
|
381
|
+
* The AI agent passes these after analysing the content.
|
|
382
382
|
* They override ALL script-generated metadata for the English version.
|
|
383
383
|
*/
|
|
384
384
|
function wireAIMetadata() {
|
|
package/scripts/generators/pipeline/generate-stage.js
CHANGED
|
@@ -19,7 +19,7 @@ import { writeSingleArticle } from './output-stage.js';
|
|
|
19
19
|
/**
|
|
20
20
|
* AI-generated article title provided by the agentic workflow.
|
|
21
21
|
* When non-empty, this OVERRIDES any script-generated title for the
|
|
22
|
-
* English version. The AI agent
|
|
22
|
+
* English version. The AI agent must analyse the article
|
|
23
23
|
* content and produce this — titles must NEVER be generated by code.
|
|
24
24
|
*/
|
|
25
25
|
let _aiTitle = '';
|
|
@@ -130,7 +130,7 @@ function generateSingleLanguageArticle(strategy, data, lang, dateStr, slug, outp
|
|
|
130
130
|
// preserved, but title and description enrichment is now subordinate
|
|
131
131
|
// to AI-provided values from --title and --description CLI flags.
|
|
132
132
|
//
|
|
133
|
-
// Architecture: The AI agent
|
|
133
|
+
// Architecture: The AI agent analyses the content and
|
|
134
134
|
// provides titles/descriptions via CLI flags. Script code NEVER
|
|
135
135
|
// generates final titles or descriptions — it only provides fallbacks.
|
|
136
136
|
const enrichedMetadata = enrichMetadataFromContent(content, baseMetadata);
|
|
package/scripts/mcp/wb-mcp-client.d.ts
CHANGED
|
@@ -1,5 +1,15 @@
|
|
|
1
1
|
import { MCPConnection } from './mcp-connection.js';
|
|
2
2
|
import type { MCPToolResult, MCPClientOptions } from '../types/index.js';
|
|
3
|
+
/**
|
|
4
|
+
* Canonical list of tools exposed by the World Bank MCP gateway. The news
|
|
5
|
+
* workflows, probe script, and the integration test suite all reference this
|
|
6
|
+
* list so a regression that adds/removes a tool fails a single drift guard
|
|
7
|
+
* (`test/integration/mcp/worldbank-mcp.test.js`) instead of silently breaking
|
|
8
|
+
* prompt/validator/probe coverage.
|
|
9
|
+
*
|
|
10
|
+
* Kept in sync with `analysis/methodologies/worldbank-indicator-mapping.md`.
|
|
11
|
+
*/
|
|
12
|
+
export declare const WORLD_BANK_MCP_TOOLS: readonly string[];
|
|
3
13
|
/**
|
|
4
14
|
* MCP Client for World Bank economic data access.
|
|
5
15
|
* Extends {@link MCPConnection} with World Bank-specific tool wrapper methods.
|
|
package/scripts/mcp/wb-mcp-client.js
CHANGED
|
@@ -25,6 +25,24 @@ const WB_BINARY_FILE = process.platform === 'win32' ? `${WB_BINARY_NAME}.cmd` :
|
|
|
25
25
|
const WB_DEFAULT_SERVER = resolve(dirname(fileURLToPath(import.meta.url)), `../../node_modules/.bin/${WB_BINARY_FILE}`);
|
|
26
26
|
/** Fallback payload when indicator data is unavailable (empty CSV) */
|
|
27
27
|
const INDICATOR_FALLBACK = '';
|
|
28
|
+
/**
|
|
29
|
+
* Canonical list of tools exposed by the World Bank MCP gateway. The news
|
|
30
|
+
* workflows, probe script, and the integration test suite all reference this
|
|
31
|
+
* list so a regression that adds/removes a tool fails a single drift guard
|
|
32
|
+
* (`test/integration/mcp/worldbank-mcp.test.js`) instead of silently breaking
|
|
33
|
+
* prompt/validator/probe coverage.
|
|
34
|
+
*
|
|
35
|
+
* Kept in sync with `analysis/methodologies/worldbank-indicator-mapping.md`.
|
|
36
|
+
*/
|
|
37
|
+
export const WORLD_BANK_MCP_TOOLS = [
|
|
38
|
+
'search-indicators',
|
|
39
|
+
'get-countries',
|
|
40
|
+
'get-country-info',
|
|
41
|
+
'get-economic-data',
|
|
42
|
+
'get-social-data',
|
|
43
|
+
'get-education-data',
|
|
44
|
+
'get-health-data',
|
|
45
|
+
];
|
|
28
46
|
/**
|
|
29
47
|
* MCP Client for World Bank economic data access.
|
|
30
48
|
* Extends {@link MCPConnection} with World Bank-specific tool wrapper methods.
|
|
package/scripts/templates/article-template.d.ts
CHANGED
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module Templates/ArticleTemplate
|
|
3
|
+
* @description Generates HTML templates for news articles with proper structure and metadata
|
|
4
|
+
*/
|
|
1
5
|
import type { ArticleOptions, LanguageCode, AnalysisFileEntry } from '../types/index.js';
|
|
2
6
|
/**
|
|
3
7
|
* Generate complete HTML for a news article
|
|
package/scripts/templates/article-template.js
CHANGED
|
@@ -1,14 +1,9 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2024-2026 Hack23 AB
|
|
2
2
|
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
/**
|
|
4
|
-
* @module Templates/ArticleTemplate
|
|
5
|
-
* @description Generates HTML templates for news articles with proper structure and metadata
|
|
6
|
-
*/
|
|
7
|
-
import { createHash } from 'crypto';
|
|
8
3
|
import { ALL_LANGUAGES, LANGUAGE_FLAGS, LANGUAGE_NAMES, ARTICLE_TYPE_LABELS, READ_TIME_LABELS, BACK_TO_NEWS_LABELS, ARTICLE_NAV_LABELS, RELATED_ARTICLES_NAV_LABELS, BREADCRUMB_HOME_LABELS, BREADCRUMB_NEWS_LABELS, SKIP_LINK_TEXTS, SOURCES_HEADING_LABELS, HEADER_SUBTITLE_LABELS, THEME_TOGGLE_LABELS, FOOTER_ABOUT_HEADING_LABELS, FOOTER_ABOUT_TEXT_LABELS, FOOTER_QUICK_LINKS_LABELS, FOOTER_BUILT_BY_LABELS, FOOTER_LANGUAGES_LABELS, ANALYSIS_TRANSPARENCY_LABELS, ANALYSIS_SUMMARY_LABELS, METHODOLOGY_LABELS, TRANSPARENCY_DISCLOSURE_LABELS, CLASSIFICATION_ANALYSIS_LABELS, THREAT_ASSESSMENT_LABELS, RISK_SCORING_LABELS, DEEP_ANALYSIS_LABELS, VIEW_SOURCE_LABELS, OPEN_SOURCE_NOTE_LABELS, AI_ANALYSIS_GUIDE_LABELS, SWOT_FRAMEWORK_LABELS, RISK_METHODOLOGY_LABELS, THREAT_FRAMEWORK_LABELS, CLASSIFICATION_GUIDE_LABELS, STYLE_GUIDE_LABELS, SIGNIFICANCE_CLASSIFICATION_LABELS, ACTOR_MAPPING_LABELS, FORCES_ANALYSIS_LABELS, IMPACT_MATRIX_LABELS, POLITICAL_THREAT_LANDSCAPE_LABELS, ACTOR_THREAT_PROFILING_LABELS, CONSEQUENCE_TREES_LABELS, LEGISLATIVE_DISRUPTION_LABELS, RISK_MATRIX_LABELS, QUANTITATIVE_SWOT_LABELS, POLITICAL_CAPITAL_RISK_LABELS, LEGISLATIVE_VELOCITY_RISK_LABELS, AGENT_RISK_WORKFLOW_LABELS, STAKEHOLDER_IMPACT_LABELS, COALITION_DYNAMICS_LABELS, VOTING_PATTERNS_LABELS, CROSS_SESSION_INTELLIGENCE_LABELS, SYNTHESIS_SUMMARY_LABELS, DOCUMENT_ANALYSIS_LABELS, SIGNIFICANCE_SCORING_LABELS, getLocalizedString, getTextDirection, } from '../constants/languages.js';
|
|
9
4
|
import { escapeHTML, isSafeURL } from '../utils/file-utils.js';
|
|
10
5
|
import { stripHtmlTags } from '../utils/html-sanitize.js';
|
|
11
|
-
import { APP_VERSION, createThemeToggleButton, THEME_TOGGLE_SCRIPT_CONTENT } from '../constants/config.js';
|
|
6
|
+
import { APP_VERSION, createThemeToggleButton } from '../constants/config.js';
|
|
12
7
|
/** Pattern for valid article dates (YYYY-MM-DD) */
|
|
13
8
|
const DATE_PATTERN = /^\d{4}-\d{2}-\d{2}$/u;
|
|
14
9
|
/** Pattern for valid article slugs (lowercase letters, digits, hyphens) */
|
|
@@ -288,22 +283,12 @@ export function generateArticleHTML(options) {
|
|
|
288
283
|
const safeSriAttrs = stylesHash && SRI_HASH_PATTERN.test(stylesHash)
|
|
289
284
|
? ` integrity="${escapeHTML(stylesHash)}" crossorigin="anonymous"`
|
|
290
285
|
: '';
|
|
291
|
-
// Compute SHA-256
|
|
292
|
-
//
|
|
293
|
-
//
|
|
294
|
-
//
|
|
295
|
-
//
|
|
296
|
-
//
|
|
297
|
-
const jsonLdScriptContent = `\n ${jsonLd}\n `;
|
|
298
|
-
const jsonLdHash = `sha256-${createHash('sha256').update(jsonLdScriptContent).digest('base64')}`;
|
|
299
|
-
// Compute CSP hash for BreadcrumbList JSON-LD script
|
|
300
|
-
const breadcrumbLdScriptContent = `\n ${breadcrumbLd}\n `;
|
|
301
|
-
const breadcrumbLdHash = `sha256-${createHash('sha256').update(breadcrumbLdScriptContent).digest('base64')}`;
|
|
302
|
-
// Reading-progress script hash — content must exactly match the <script> block.
|
|
303
|
-
const readingProgressScript = `\n (function(){\n var bar=document.querySelector('.reading-progress');\n if(!bar)return;\n bar.style.display='block';\n var ticking=false;\n window.addEventListener('scroll',function(){\n if(!ticking){\n window.requestAnimationFrame(function(){\n var h=document.documentElement;\n var scrollTop=h.scrollTop||document.body.scrollTop;\n var scrollHeight=h.scrollHeight-h.clientHeight;\n bar.style.width=scrollHeight>0?((scrollTop/scrollHeight)*100)+'%':'0%';\n ticking=false;\n });\n ticking=true;\n }\n },{passive:true});\n })();\n `;
|
|
304
|
-
const readingProgressHash = `sha256-${createHash('sha256').update(readingProgressScript).digest('base64')}`;
|
|
305
|
-
// Theme toggle CSP hash — derived from the shared THEME_TOGGLE_SCRIPT_CONTENT constant
|
|
306
|
-
const themeToggleHash = `sha256-${createHash('sha256').update(THEME_TOGGLE_SCRIPT_CONTENT).digest('base64')}`;
|
|
286
|
+
// Compute SHA-256 hashes were previously required for inline <script>
|
|
287
|
+
// blocks (JSON-LD, reading progress, theme toggle). All executable inline
|
|
288
|
+
// scripts have been externalised to `js/article-runtime.js`, so the CSP
|
|
289
|
+
// reduces to `script-src 'self'`. JSON-LD blocks use
|
|
290
|
+
// `type="application/ld+json"` which is non-executable and not governed
|
|
291
|
+
// by `script-src`.
|
|
307
292
|
// Localized theme toggle button
|
|
308
293
|
const themeToggleLabel = escapeHTML(getLocalizedString(THEME_TOGGLE_LABELS, lang));
|
|
309
294
|
// Related articles navigation HTML (optional)
|
|
@@ -315,7 +300,7 @@ export function generateArticleHTML(options) {
|
|
|
315
300
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
316
301
|
<meta http-equiv="X-Content-Type-Options" content="nosniff">
|
|
317
302
|
<meta name="referrer" content="no-referrer">
|
|
318
|
-
<meta http-equiv="Content-Security-Policy" content="default-src 'self'; script-src 'self'
|
|
303
|
+
<meta http-equiv="Content-Security-Policy" content="default-src 'self'; script-src 'self'; style-src 'self' 'unsafe-inline'; img-src 'self' https: data:; font-src 'self'; connect-src 'self'; frame-src 'none'; base-uri 'self'; form-action 'none'">
|
|
319
304
|
<title>${safeTitle} | EU Parliament Monitor</title>
|
|
320
305
|
<meta name="description" content="${safeSubtitle}">
|
|
321
306
|
<meta name="keywords" content="${safeKeywords}">
|
|
@@ -453,26 +438,7 @@ export function generateArticleHTML(options) {
|
|
|
453
438
|
</div>
|
|
454
439
|
</footer>
|
|
455
440
|
|
|
456
|
-
<script>
|
|
457
|
-
(function(){
|
|
458
|
-
var bar=document.querySelector('.reading-progress');
|
|
459
|
-
if(!bar)return;
|
|
460
|
-
bar.style.display='block';
|
|
461
|
-
var ticking=false;
|
|
462
|
-
window.addEventListener('scroll',function(){
|
|
463
|
-
if(!ticking){
|
|
464
|
-
window.requestAnimationFrame(function(){
|
|
465
|
-
var h=document.documentElement;
|
|
466
|
-
var scrollTop=h.scrollTop||document.body.scrollTop;
|
|
467
|
-
var scrollHeight=h.scrollHeight-h.clientHeight;
|
|
468
|
-
bar.style.width=scrollHeight>0?((scrollTop/scrollHeight)*100)+'%':'0%';
|
|
469
|
-
ticking=false;
|
|
470
|
-
});
|
|
471
|
-
ticking=true;
|
|
472
|
-
}
|
|
473
|
-
},{passive:true});
|
|
474
|
-
})();
|
|
475
|
-
</script>${content.includes('data-chart-config')
|
|
441
|
+
<script src="../js/article-runtime.js" defer></script>${content.includes('data-chart-config')
|
|
476
442
|
? `
|
|
477
443
|
<script src="../js/vendor/chart.umd.min.js" defer></script>
|
|
478
444
|
<script src="../js/vendor/chartjs-plugin-annotation.min.js" defer></script>
|
|
@@ -481,7 +447,7 @@ export function generateArticleHTML(options) {
|
|
|
481
447
|
? `
|
|
482
448
|
<script src="../js/vendor/d3.min.js" defer></script>
|
|
483
449
|
<script src="../js/d3-init.js" defer></script>`
|
|
484
|
-
: ''}
|
|
450
|
+
: ''}
|
|
485
451
|
</body>
|
|
486
452
|
</html>`;
|
|
487
453
|
}
|
|
package/scripts/utils/content-validator.d.ts
CHANGED
|
@@ -56,6 +56,73 @@ export interface TranslationValidationResult {
|
|
|
56
56
|
/** Collected translation quality metrics */
|
|
57
57
|
metrics: TranslationValidationMetrics;
|
|
58
58
|
}
|
|
59
|
+
/**
|
|
60
|
+
* Detect whether the article contains at least one Chart.js canvas with a
|
|
61
|
+
* well-formed `data-chart-config` JSON payload.
|
|
62
|
+
*
|
|
63
|
+
* A valid chart must:
|
|
64
|
+
* - be rendered via `<canvas data-chart-config="…">` (the declarative
|
|
65
|
+
* CSP-safe pattern hydrated by `js/chart-init.js`)
|
|
66
|
+
* - declare a supported Chart.js `type`
|
|
67
|
+
* - carry at least 3 data points in the first dataset (single-point charts
|
|
68
|
+
* are rejected by `SHARED_PROMPT_PATTERNS.md` anti-patterns)
|
|
69
|
+
*
|
|
70
|
+
* @param html - Raw article HTML
|
|
71
|
+
* @returns `true` when ≥1 chart meeting the rules is present
|
|
72
|
+
*/
|
|
73
|
+
export declare function articleHasChart(html: string): boolean;
|
|
74
|
+
/**
|
|
75
|
+
* Strong World Bank evidence tokens — plain substring match is enough to
|
|
76
|
+
* satisfy the gate because each is specific (the literal attribution phrase
|
|
77
|
+
* or an MCP tool name). Kept aligned with
|
|
78
|
+
* `analysis/methodologies/worldbank-indicator-mapping.md`.
|
|
79
|
+
*/
|
|
80
|
+
export declare const WORLD_BANK_STRONG_FINGERPRINTS: readonly string[];
|
|
81
|
+
/**
|
|
82
|
+
* Short indicator codes published by the World Bank MCP server. These are
|
|
83
|
+
* matched with a word boundary (`[^A-Z0-9_]` look-arounds) so that prose like
|
|
84
|
+
* "GDP growth slowed" does NOT count as World Bank evidence, but an analysis
|
|
85
|
+
* file line like `INDICATOR: GDP` does. All codes are uppercase, so the match
|
|
86
|
+
* is case-sensitive — case-insensitive mentions in English prose are intentionally
|
|
87
|
+
* rejected.
|
|
88
|
+
*/
|
|
89
|
+
export declare const WORLD_BANK_INDICATOR_CODES: readonly string[];
|
|
90
|
+
/**
|
|
91
|
+
* Backwards-compatible union of strong + short fingerprints. Kept exported so
|
|
92
|
+
* callers that only need a flat list (e.g. existing consumers that shipped
|
|
93
|
+
* before the strong/short split) continue to compile. New code SHOULD prefer
|
|
94
|
+
* {@link hasWorldBankEvidence}, which enforces the stricter word-boundary rule
|
|
95
|
+
* for short codes.
|
|
96
|
+
*/
|
|
97
|
+
export declare const WORLD_BANK_FINGERPRINTS: readonly string[];
|
|
98
|
+
/**
|
|
99
|
+
* Detect World Bank sourcing in any piece of text (article body OR analysis
|
|
100
|
+
* markdown). Returns `true` when the text contains either a strong fingerprint
|
|
101
|
+
* (the phrase "World Bank", an MCP tool name, etc.) or an indicator code with
|
|
102
|
+
* clean word boundaries.
|
|
103
|
+
*
|
|
104
|
+
* This is the single source of truth for the policy quality gate — both the
|
|
105
|
+
* content validator and the CLI validator's filesystem fallback use it so a
|
|
106
|
+
* legitimate evidence trail on either side satisfies the rule, and generic
|
|
107
|
+
* prose mentions of economic terms do not.
|
|
108
|
+
*
|
|
109
|
+
* @param text - Text to scan
|
|
110
|
+
* @returns `true` when at least one strong or word-bounded fingerprint matches
|
|
111
|
+
*/
|
|
112
|
+
export declare function hasWorldBankEvidence(text: string): boolean;
|
|
113
|
+
/**
|
|
114
|
+
* Verify that a policy article (or the linked analysis artifacts) contains at
|
|
115
|
+
* least one World Bank fingerprint — indicator code (word-bounded), MCP
|
|
116
|
+
* tool-trace token, or the phrase "World Bank" itself. Returns `true` if the
|
|
117
|
+
* gate is satisfied OR the article type is not on the mandatory list.
|
|
118
|
+
*
|
|
119
|
+
* @param html - Article HTML
|
|
120
|
+
* @param articleType - Slug of the article category (e.g. `"committee-reports"`)
|
|
121
|
+
* @param _analysisDir - Reserved for API symmetry; filesystem recursion is
|
|
122
|
+
* performed by the caller in `validate-articles.ts` to keep this module pure.
|
|
123
|
+
* @returns `true` when the World Bank evidence requirement is met or not applicable
|
|
124
|
+
*/
|
|
125
|
+
export declare function articlePolicyHasWorldBank(html: string, articleType: string, _analysisDir?: string): boolean;
|
|
59
126
|
/**
|
|
60
127
|
* Validate the quality of a generated article.
|
|
61
128
|
*
|
|
package/scripts/utils/content-validator.js
CHANGED
|
@@ -697,6 +697,400 @@ function collectQualityGateWarnings(html, warnings) {
|
|
|
697
697
|
if (emptySectionCount > 0) {
|
|
698
698
|
warnings.push(`Article contains ${emptySectionCount} empty or near-empty <section> element(s) that should be removed`);
|
|
699
699
|
}
|
|
700
|
+
// Chart presence gate
|
|
701
|
+
if (!articleHasChart(html)) {
|
|
702
|
+
warnings.push('Missing required Chart.js visualization: no <canvas data-chart-config="…"> element with a valid type found (≥1 required, see ai-first-quality.md quality gates)');
|
|
703
|
+
}
|
|
704
|
+
// Structural integrity gates — catch hand-written HTML bypassing the template
|
|
705
|
+
const langSwitcherCount = countLanguageSwitcherLinks(html);
|
|
706
|
+
if (langSwitcherCount < MIN_LANG_SWITCHER_LINKS) {
|
|
707
|
+
warnings.push(`Language switcher has only ${langSwitcherCount} link(s); the template always emits ${MIN_LANG_SWITCHER_LINKS} — this article may have been hand-written and skipped the template`);
|
|
708
|
+
}
|
|
709
|
+
if (!hasStandardFooterContent(html)) {
|
|
710
|
+
warnings.push('Footer is missing the standard `.footer-content` + `.footer-bottom` blocks — the template always emits these; article may have been hand-written');
|
|
711
|
+
}
|
|
712
|
+
}
|
|
713
|
+
/** Minimum number of language switcher links the template always emits (14 languages). */
|
|
714
|
+
const MIN_LANG_SWITCHER_LINKS = 14;
|
|
715
|
+
/** Chart.js types accepted by the `data-chart-config` declarative pattern. */
|
|
716
|
+
const CHART_JS_TYPES = /"type"\s*:\s*"(bar|line|pie|doughnut|radar|polarArea|scatter|bubble)"/u;
|
|
717
|
+
/**
|
|
718
|
+
* Check whether a character is HTML whitespace per the WHATWG spec
|
|
719
|
+
* (space, tab, LF, CR, FF).
|
|
720
|
+
*
|
|
721
|
+
* @param ch - Single character to test (may be empty string)
|
|
722
|
+
* @returns `true` when `ch` is one of the recognised whitespace chars
|
|
723
|
+
*/
|
|
724
|
+
function isHtmlWhitespace(ch) {
|
|
725
|
+
return ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r' || ch === '\f';
|
|
726
|
+
}
|
|
727
|
+
/**
|
|
728
|
+
* Decode the five entity escapes that `escapeHTML` emits into literal chars.
|
|
729
|
+
*
|
|
730
|
+
* @param raw - Entity-encoded substring extracted from an attribute value
|
|
731
|
+
* @returns Decoded literal string
|
|
732
|
+
*/
|
|
733
|
+
function decodeHtmlEntities(raw) {
|
|
734
|
+
return raw
|
|
735
|
+
.replace(/&quot;/gu, '"')
|
|
736
|
+
.replace(/&#39;/gu, "'")
|
|
737
|
+
.replace(/&gt;/gu, '>')
|
|
738
|
+
.replace(/&lt;/gu, '<')
|
|
739
|
+
.replace(/&amp;/gu, '&');
|
|
740
|
+
}
|
|
741
|
+
/**
|
|
742
|
+
* Check that the positions immediately before and after an attribute name
|
|
743
|
+
* form valid HTML word-boundary characters. Prevents `xdata-chart-config`
|
|
744
|
+
* from being treated as the `data-chart-config` attribute.
|
|
745
|
+
*
|
|
746
|
+
* @param tag - Full opening-tag text (without trailing `>`)
|
|
747
|
+
* @param attrIdx - Index where the attribute name was found
|
|
748
|
+
* @param attrLen - Length of the attribute name
|
|
749
|
+
* @returns `true` when both boundaries are whitespace / `<` / `=` / start-of-tag
|
|
750
|
+
*/
|
|
751
|
+
function hasAttributeBoundaries(tag, attrIdx, attrLen) {
|
|
752
|
+
const before = attrIdx === 0 ? '' : (tag[attrIdx - 1] ?? '');
|
|
753
|
+
const afterIdx = attrIdx + attrLen;
|
|
754
|
+
const after = afterIdx < tag.length ? (tag[afterIdx] ?? '') : '';
|
|
755
|
+
const leadOk = before === '' || isHtmlWhitespace(before) || before === '<';
|
|
756
|
+
const trailOk = after === '' || isHtmlWhitespace(after) || after === '=';
|
|
757
|
+
return leadOk && trailOk;
|
|
758
|
+
}
|
|
759
|
+
/**
|
|
760
|
+
* Starting just after an attribute name, locate the opening quote character
|
|
761
|
+
* (either `"` or `'`) that begins the attribute value, tolerating optional
|
|
762
|
+
* HTML whitespace on either side of the `=`.
|
|
763
|
+
*
|
|
764
|
+
* @param tag - Full opening-tag text
|
|
765
|
+
* @param from - Index immediately after the attribute name
|
|
766
|
+
* @returns `{quote, valueStart}` when a proper `=<whitespace?><quote>` run is
|
|
767
|
+
* present; `null` when the attribute is malformed or unquoted
|
|
768
|
+
*/
|
|
769
|
+
function findAttributeValueStart(tag, from) {
|
|
770
|
+
let i = from;
|
|
771
|
+
while (i < tag.length && isHtmlWhitespace(tag[i] ?? ''))
|
|
772
|
+
i++;
|
|
773
|
+
if (i >= tag.length || tag[i] !== '=')
|
|
774
|
+
return null;
|
|
775
|
+
i++;
|
|
776
|
+
while (i < tag.length && isHtmlWhitespace(tag[i] ?? ''))
|
|
777
|
+
i++;
|
|
778
|
+
if (i >= tag.length)
|
|
779
|
+
return null;
|
|
780
|
+
const quote = tag[i] ?? '';
|
|
781
|
+
if (quote !== '"' && quote !== "'")
|
|
782
|
+
return null;
|
|
783
|
+
return { quote, valueStart: i + 1 };
|
|
784
|
+
}
|
|
785
|
+
/**
|
|
786
|
+
* Scan an HTML attribute value in a single `<canvas>` tag starting at
|
|
787
|
+
* `tagStart`. Returns the decoded value of `attr` or `null` if not present.
|
|
788
|
+
* Uses only `indexOf` + single-character look-arounds so runtime is strictly
|
|
789
|
+
* linear in input length — this avoids the polynomial-ReDoS class of regex
|
|
790
|
+
* that CodeQL flags when nested character classes match the same tag prefix.
|
|
791
|
+
*
|
|
792
|
+
* Tolerates all HTML-compliant attribute forms:
|
|
793
|
+
* - double-quoted: `data-chart-config="..."`
|
|
794
|
+
* - single-quoted: `data-chart-config='...'`
|
|
795
|
+
* - optional whitespace around `=`: `data-chart-config = "..."`
|
|
796
|
+
*
|
|
797
|
+
* @param html - Full article HTML
|
|
798
|
+
* @param tagStart - Byte offset of the `<` that opens the canvas tag
|
|
799
|
+
* @param attr - Attribute name (e.g. `data-chart-config`)
|
|
800
|
+
* @returns Decoded attribute value, or `null` when the attribute is missing
|
|
801
|
+
*/
|
|
802
|
+
function extractCanvasAttribute(html, tagStart, attr) {
|
|
803
|
+
const tagEnd = html.indexOf('>', tagStart);
|
|
804
|
+
if (tagEnd === -1)
|
|
805
|
+
return null;
|
|
806
|
+
const tag = html.slice(tagStart, tagEnd);
|
|
807
|
+
let searchFrom = 0;
|
|
808
|
+
while (searchFrom < tag.length) {
|
|
809
|
+
const attrIdx = tag.indexOf(attr, searchFrom);
|
|
810
|
+
if (attrIdx === -1)
|
|
811
|
+
return null;
|
|
812
|
+
// Keep scanning past false matches with bad boundaries or without a
|
|
813
|
+
// proper `=<quote>` run; this keeps the function linear in tag length.
|
|
814
|
+
if (!hasAttributeBoundaries(tag, attrIdx, attr.length)) {
|
|
815
|
+
searchFrom = attrIdx + attr.length;
|
|
816
|
+
continue;
|
|
817
|
+
}
|
|
818
|
+
const valueHead = findAttributeValueStart(tag, attrIdx + attr.length);
|
|
819
|
+
if (!valueHead) {
|
|
820
|
+
searchFrom = attrIdx + attr.length;
|
|
821
|
+
continue;
|
|
822
|
+
}
|
|
823
|
+
const valueEnd = tag.indexOf(valueHead.quote, valueHead.valueStart);
|
|
824
|
+
if (valueEnd === -1)
|
|
825
|
+
return null;
|
|
826
|
+
return decodeHtmlEntities(tag.slice(valueHead.valueStart, valueEnd));
|
|
827
|
+
}
|
|
828
|
+
return null;
|
|
829
|
+
}
|
|
830
|
+
/**
|
|
831
|
+
* Detect whether the article contains at least one Chart.js canvas with a
|
|
832
|
+
* well-formed `data-chart-config` JSON payload.
|
|
833
|
+
*
|
|
834
|
+
* A valid chart must:
|
|
835
|
+
* - be rendered via `<canvas data-chart-config="…">` (the declarative
|
|
836
|
+
* CSP-safe pattern hydrated by `js/chart-init.js`)
|
|
837
|
+
* - declare a supported Chart.js `type`
|
|
838
|
+
* - carry at least 3 data points in the first dataset (single-point charts
|
|
839
|
+
* are rejected by `SHARED_PROMPT_PATTERNS.md` anti-patterns)
|
|
840
|
+
*
|
|
841
|
+
* @param html - Raw article HTML
|
|
842
|
+
* @returns `true` when ≥1 chart meeting the rules is present
|
|
843
|
+
*/
|
|
844
|
+
export function articleHasChart(html) {
|
|
845
|
+
let cursor = 0;
|
|
846
|
+
while (cursor < html.length) {
|
|
847
|
+
const tagStart = html.indexOf('<canvas', cursor);
|
|
848
|
+
if (tagStart === -1)
|
|
849
|
+
return false;
|
|
850
|
+
const decoded = extractCanvasAttribute(html, tagStart, 'data-chart-config');
|
|
851
|
+
if (decoded !== null && CHART_JS_TYPES.test(decoded) && countFirstDatasetPoints(decoded) >= 3) {
|
|
852
|
+
return true;
|
|
853
|
+
}
|
|
854
|
+
// Advance past `<canvas` so overlapping matches cannot occur.
|
|
855
|
+
cursor = tagStart + '<canvas'.length;
|
|
856
|
+
}
|
|
857
|
+
return false;
|
|
858
|
+
}
|
|
859
|
+
/**
|
|
860
|
+
* Count data points in the first dataset of a Chart.js config JSON payload.
|
|
861
|
+
*
|
|
862
|
+
* Parses the decoded `data-chart-config` as JSON and returns the length of
|
|
863
|
+
* `config.data.datasets[0].data`. Handles both numeric-array datasets
|
|
864
|
+
* (`[1, 2, 3]`) and object-point datasets (`[{x:0,y:1}, …]`) correctly —
|
|
865
|
+
* the previous indexOf-based implementation miscounted scatter/bubble
|
|
866
|
+
* configs and accidentally looked at `data.labels` for typical layouts.
|
|
867
|
+
*
|
|
868
|
+
* @param json - Decoded Chart.js config JSON string
|
|
869
|
+
* @returns Number of data points in `data.datasets[0].data`, or 0 when absent/invalid
|
|
870
|
+
*/
|
|
871
|
+
function countFirstDatasetPoints(json) {
|
|
872
|
+
try {
|
|
873
|
+
const config = JSON.parse(json);
|
|
874
|
+
const firstDataset = config.data?.datasets?.[0];
|
|
875
|
+
return Array.isArray(firstDataset?.data) ? firstDataset.data.length : 0;
|
|
876
|
+
}
|
|
877
|
+
catch {
|
|
878
|
+
return 0;
|
|
879
|
+
}
|
|
880
|
+
}
|
|
881
|
+
/**
 * Count `lang-link` class tokens in the header language switcher.
 *
 * The search scope is the `<nav …>` element that carries the
 * `site-header__langs` marker: from the last `<nav` before the marker up to
 * the first `</nav>` after it. If the marker, the closing tag, or the opening
 * tag cannot be located, the whole document is scanned instead.
 *
 * Only `indexOf` / `lastIndexOf` and one flat regex are used, so there is no
 * nested-quantifier pattern for static analysers to flag as ReDoS-prone.
 *
 * @param html - Complete article HTML
 * @returns Number of `lang-link` tokens found in the selected scope
 */
function countLanguageSwitcherLinks(html) {
    const navScope = (() => {
        const anchor = html.indexOf('site-header__langs');
        if (anchor === -1) {
            return html;
        }
        // Assumption: the first `</nav>` after the marker closes the switcher.
        const closeAt = html.indexOf('</nav>', anchor);
        if (closeAt === -1) {
            return html;
        }
        const openAt = html.lastIndexOf('<nav', anchor);
        return openAt === -1 ? html : html.slice(openAt, closeAt);
    })();
    return (navScope.match(/\blang-link\b/gu) ?? []).length;
}
|
|
912
|
+
/**
 * Check for the two standard footer blocks emitted by `article-template.ts`.
 *
 * Both checks look for the exact attribute text (`class="footer-content"` and
 * `class="footer-bottom"`), so an element carrying additional classes in the
 * same attribute does not match.
 *
 * @param html - Complete article HTML
 * @returns `true` only when both footer class attributes are present
 */
function hasStandardFooterContent(html) {
    const requiredMarkers = [/class="footer-content"/u, /class="footer-bottom"/u];
    return requiredMarkers.every((marker) => marker.test(html));
}
|
|
921
|
+
/**
 * Slugs for article types that MUST include World Bank economic context.
 *
 * `articlePolicyHasWorldBank` treats any article type that is NOT in this set
 * as automatically satisfying the gate.
 */
const POLICY_SLUGS_REQUIRING_WORLD_BANK = new Set([
    'committee-reports',
    'propositions',
    'motions',
    'weekly-review',
    'monthly-review',
    'week-in-review',
    'month-in-review',
    'month-ahead',
]);
|
|
932
|
+
/**
 * Strong World Bank evidence tokens — plain substring match is enough to
 * satisfy the gate because each is specific (the literal attribution phrase
 * or an MCP tool name). Kept aligned with
 * `analysis/methodologies/worldbank-indicator-mapping.md`.
 *
 * `hasWorldBankEvidence` tests these with `String.prototype.includes`, which
 * is case-sensitive, so only the exact casings listed here match.
 */
export const WORLD_BANK_STRONG_FINGERPRINTS = [
    'World Bank',
    'world bank',
    'worldbank',
    'get-economic-data',
    'get-social-data',
    'get-education-data',
    'get-health-data',
    'get-country-info',
    'get-countries',
    'search-indicators',
];
|
|
950
|
+
/**
 * Short indicator codes published by the World Bank MCP server.
 *
 * These are matched case-sensitively and only when neither neighbouring
 * character is in `[A-Z0-9_]`. Lowercase prose ("gdp") and longer identifiers
 * that merely contain a code (`XGDP`, `GDP2`) are therefore rejected, while an
 * analysis file line like `INDICATOR: GDP` is accepted.
 *
 * NOTE(review): an uppercase mention in ordinary prose — e.g. "GDP growth
 * slowed" — is delimited by whitespace and therefore DOES match under the
 * current boundary rule. Confirm whether that is intended for the gate.
 */
export const WORLD_BANK_INDICATOR_CODES = [
    'GDP',
    'GDP_GROWTH',
    'GDP_PER_CAPITA',
    'GNI',
    'GNI_PER_CAPITA',
    'UNEMPLOYMENT',
    'INFLATION',
    'EXPORTS',
    'EXPORTS_GDP',
    'FDI',
    'FDI_NET',
    'POPULATION',
    'LIFE_EXPECTANCY',
    'BIRTH_RATE',
    'DEATH_RATE',
    'INTERNET_USERS',
    'LITERACY_RATE',
    'SCHOOL_ENROLLMENT',
    'SCHOOL_COMPLETION',
    'TEACHERS_PRIMARY',
    'EDUCATION_EXPENDITURE',
    'HEALTH_EXPENDITURE',
    'PHYSICIANS',
    'HOSPITAL_BEDS',
    'IMMUNIZATION',
    'HIV_PREVALENCE',
    'MALNUTRITION',
    'TUBERCULOSIS',
];
|
|
988
|
+
/**
 * Flat, backwards-compatible list: every strong fingerprint followed by every
 * indicator code. It stays exported so callers written before the
 * strong/short split keep working. New code SHOULD prefer
 * {@link hasWorldBankEvidence}, which applies the stricter identifier-boundary
 * rule to the short codes instead of a plain substring test.
 */
export const WORLD_BANK_FINGERPRINTS = WORLD_BANK_STRONG_FINGERPRINTS.concat(WORLD_BANK_INDICATOR_CODES);
|
|
999
|
+
/**
 * Characters treated as part of an identifier-style token. A neighbour that
 * matches this class means the indicator code is embedded in a longer token
 * (for example the `GDP` inside `GDP_GROWTH`). Lowercase letters are
 * deliberately not in the class.
 */
const WORD_BOUNDARY_PATTERN = /[A-Z0-9_]/u;
/**
 * Decide whether `ch` can sit directly next to an indicator code without
 * being part of the same token.
 *
 * @param ch - Single character, or the empty string for start/end of text
 * @returns `true` when `ch` is empty or is not an `[A-Z0-9_]` character
 */
function isIdentifierBoundary(ch) {
    if (ch === '') {
        return true;
    }
    return !WORD_BOUNDARY_PATTERN.test(ch);
}
/**
 * Look for an occurrence of `code` in `text` whose neighbours on both sides
 * are identifier boundaries. Every occurrence is tried in order, so an early
 * embedded hit does not hide a later standalone one.
 *
 * @param text - Text to scan
 * @param code - Indicator code to look for (all uppercase)
 * @returns `true` when at least one boundary-isolated occurrence exists
 */
function textContainsIndicatorCode(text, code) {
    const span = code.length;
    for (let cursor = 0; cursor < text.length;) {
        const hit = text.indexOf(code, cursor);
        if (hit < 0) {
            break;
        }
        // `charAt` yields '' past either end, which counts as a boundary.
        const preceding = hit > 0 ? text.charAt(hit - 1) : '';
        const following = text.charAt(hit + span);
        if (isIdentifierBoundary(preceding) && isIdentifierBoundary(following)) {
            return true;
        }
        cursor = hit + 1;
    }
    return false;
}
|
|
1040
|
+
/**
 * Test whether any entry of `WORLD_BANK_INDICATOR_CODES` occurs in `text` as
 * a standalone token. `[A-Z0-9_]` neighbours disqualify an occurrence, so the
 * `GDP` inside `GDP_GROWTH` is not counted by the `GDP` scan, and lowercase
 * "gdp" never matches because the comparison is case-sensitive.
 *
 * @param text - Article body or analysis markdown to scan
 * @returns `true` when at least one canonical indicator code is present
 */
function hasIndicatorCodeWithBoundary(text) {
    return WORLD_BANK_INDICATOR_CODES.some((code) => textContainsIndicatorCode(text, code));
}
|
|
1057
|
+
/**
 * Decide whether a piece of text (article body OR analysis markdown) shows
 * World Bank sourcing.
 *
 * Two kinds of evidence are accepted:
 *  - a strong fingerprint (the phrase "World Bank", an MCP tool name, …),
 *    tested as a plain case-sensitive substring; or
 *  - an indicator code that appears as a standalone, boundary-isolated token.
 *
 * Both the content validator and the CLI validator's filesystem fallback call
 * this function, so the same rule applies to article bodies and to analysis
 * files.
 *
 * @param text - Text to scan
 * @returns `true` when at least one strong or boundary-isolated fingerprint matches
 */
export function hasWorldBankEvidence(text) {
    const hasStrongFingerprint = WORLD_BANK_STRONG_FINGERPRINTS.some((fingerprint) => text.includes(fingerprint));
    // The indicator scan only runs when no strong fingerprint was found.
    return hasStrongFingerprint || hasIndicatorCodeWithBoundary(text);
}
|
|
1078
|
+
/**
 * Apply the World Bank evidence gate to a single article.
 *
 * Article types outside `POLICY_SLUGS_REQUIRING_WORLD_BANK` pass
 * unconditionally. For the mandatory types, the article HTML must contain at
 * least one fingerprint accepted by `hasWorldBankEvidence` (the phrase
 * "World Bank", an MCP tool-trace token, or a boundary-isolated indicator code).
 *
 * @param html - Article HTML
 * @param articleType - Slug of the article category (e.g. `"committee-reports"`)
 * @param _analysisDir - Unused here; kept for API symmetry. Scanning analysis
 *   files on disk is left to the caller in `validate-articles.ts` so this
 *   module stays free of filesystem access.
 * @returns `true` when the requirement is met or does not apply
 */
export function articlePolicyHasWorldBank(html, articleType, _analysisDir) {
    const gateApplies = POLICY_SLUGS_REQUIRING_WORLD_BANK.has(articleType);
    return !gateApplies || hasWorldBankEvidence(html);
}
|
|
701
1095
|
/**
|
|
702
1096
|
* Validate the quality of a generated article.
|
|
@@ -18,8 +18,8 @@
|
|
|
18
18
|
*/
|
|
19
19
|
import fs from 'node:fs';
|
|
20
20
|
import path from 'node:path';
|
|
21
|
-
import { NEWS_DIR, ARTICLE_FILENAME_PATTERN } from '../constants/config.js';
|
|
22
|
-
import { validateArticleContent } from './content-validator.js';
|
|
21
|
+
import { NEWS_DIR, ARTICLE_FILENAME_PATTERN, PROJECT_ROOT } from '../constants/config.js';
|
|
22
|
+
import { validateArticleContent, articlePolicyHasWorldBank, hasWorldBankEvidence, } from './content-validator.js';
|
|
23
23
|
import { scoreArticleQuality } from './article-quality-scorer.js';
|
|
24
24
|
// ─── CLI argument parsing ─────────────────────────────────────────────────────
|
|
25
25
|
const args = process.argv.slice(2);
|
|
@@ -63,6 +63,110 @@ function slugToArticleType(slug) {
|
|
|
63
63
|
return mapping[slug] ?? slug;
|
|
64
64
|
}
|
|
65
65
|
// ─── Main validation logic ────────────────────────────────────────────────────
|
|
66
|
+
/**
 * Run the World Bank gate for one article, widening the search from the
 * article body to the analysis files on disk.
 *
 * The gate passes when the article type is not a policy type, when the body
 * itself carries evidence, or when any `.md` file under a matching
 * `analysis/daily/{date}/{slug}*` entry carries evidence.
 *
 * @param html - Full HTML of the article being validated
 * @param articleType - Article category slug (e.g. `"committee-reports"`)
 * @param date - Article publication date (`YYYY-MM-DD`)
 * @param slug - Article slug used to locate the matching analysis directory
 * @returns Warning string when the gate fails, or `null` when satisfied.
 */
function checkWorldBankEvidence(html, articleType, date, slug) {
    // Nothing more to do when the type is exempt or the body already has evidence.
    if (articlePolicyHasWorldBank(html, articleType)) {
        return null;
    }
    const analysisRoot = path.join(PROJECT_ROOT, 'analysis', 'daily', date);
    if (!fs.existsSync(analysisRoot)) {
        return `Missing required World Bank economic context for "${articleType}" article; analysis directory ${analysisRoot} does not exist`;
    }
    // An entry belongs to this article when it is named exactly `slug` or
    // starts with `slug-` / `slug_`.
    const belongsToArticle = (entry) => entry === slug || [`${slug}-`, `${slug}_`].some((prefix) => entry.startsWith(prefix));
    const evidenceFound = safeReaddir(analysisRoot)
        .filter(belongsToArticle)
        .some((dirName) => directoryContainsWorldBankFingerprint(path.join(analysisRoot, dirName)));
    if (evidenceFound) {
        return null;
    }
    return `Missing required World Bank economic context for "${articleType}" article; neither article body nor analysis files under ${analysisRoot} reference any World Bank indicator`;
}
|
|
94
|
+
/**
 * Read the names of the entries in a directory without ever throwing.
 *
 * @param dir - Directory to list
 * @returns Entry names, or `[]` when the directory is missing or unreadable
 */
function safeReaddir(dir) {
    let names = [];
    try {
        names = fs.readdirSync(dir);
    }
    catch {
        // Best-effort listing: any failure is reported as "no entries".
    }
    return names;
}
|
|
108
|
+
/**
 * Maximum recursion depth when searching an analysis directory for World Bank
 * fingerprints. The starting directory is depth 0; the guard
 * `depth > ANALYSIS_SEARCH_MAX_DEPTH` in `directoryContainsWorldBankFingerprint`
 * stops recursion once it would exceed this depth, so the limit is inclusive.
 * With `ANALYSIS_SEARCH_MAX_DEPTH = 3` the scanner reads directories at
 * depths 0, 1, 2 and 3 — enough to cover the expected layout
 * `analysis/daily/{date}/{slug}/<subdir>/<file>.md` with tolerance for deeper
 * run artefacts. Trees deeper than this are truncated to guarantee bounded
 * I/O during validator runs.
 */
const ANALYSIS_SEARCH_MAX_DEPTH = 3;
|
|
119
|
+
/**
 * Depth-limited search of a directory tree for `.md` files that contain a
 * World Bank fingerprint. Each file is judged by {@link hasWorldBankEvidence}
 * (via `entryContainsWorldBankFingerprint`), so analysis files are held to
 * the same rule as article bodies.
 *
 * @param dir - Directory to scan
 * @param depth - Current recursion depth (callers should omit; directories
 *   deeper than {@link ANALYSIS_SEARCH_MAX_DEPTH} are not read)
 * @returns `true` when at least one `.md` file contains a World Bank
 *   fingerprint; `false` when none does, the depth cap is exceeded, or the
 *   directory cannot be read
 */
function directoryContainsWorldBankFingerprint(dir, depth = 0) {
    if (depth > ANALYSIS_SEARCH_MAX_DEPTH) {
        return false;
    }
    let listing;
    try {
        listing = fs.readdirSync(dir, { withFileTypes: true });
    }
    catch {
        // Unreadable directory (or a plain file passed in): no evidence here.
        return false;
    }
    return listing.some((entry) => entryContainsWorldBankFingerprint(dir, entry, depth));
}
|
|
145
|
+
/**
 * Evaluate one directory entry for World Bank fingerprints.
 *
 * Subdirectories are searched recursively at `depth + 1`. Regular files are
 * read only when their name ends in `.md`. Every other entry, and any file
 * that cannot be read, counts as "no evidence".
 *
 * @param dir - Parent directory of `entry`
 * @param entry - Directory entry to test
 * @param depth - Current recursion depth of the caller
 * @returns `true` when this entry (or any descendant) matches a fingerprint
 */
function entryContainsWorldBankFingerprint(dir, entry, depth) {
    const target = path.join(dir, entry.name);
    if (entry.isDirectory()) {
        return directoryContainsWorldBankFingerprint(target, depth + 1);
    }
    const isMarkdownFile = entry.isFile() && entry.name.endsWith('.md');
    if (!isMarkdownFile) {
        return false;
    }
    let markdown;
    try {
        markdown = fs.readFileSync(target, 'utf-8');
    }
    catch {
        return false;
    }
    return hasWorldBankEvidence(markdown);
}
|
|
66
170
|
/**
|
|
67
171
|
* Validate a single article file and return a summary.
|
|
68
172
|
*
|
|
@@ -80,6 +184,11 @@ function validateSingleFile(filename) {
|
|
|
80
184
|
const html = fs.readFileSync(filePath, 'utf-8');
|
|
81
185
|
const articleType = slugToArticleType(slug);
|
|
82
186
|
const result = validateArticleContent(html, lang, articleType);
|
|
187
|
+
// World Bank gate — extend search to linked analysis markdown files
|
|
188
|
+
const wbWarning = checkWorldBankEvidence(html, articleType, date, slug);
|
|
189
|
+
if (wbWarning) {
|
|
190
|
+
result.warnings.push(wbWarning);
|
|
191
|
+
}
|
|
83
192
|
const summary = {
|
|
84
193
|
filename,
|
|
85
194
|
lang,
|