@ansvar/eu-regulations-mcp 1.0.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +60 -22
- package/data/regulations.db +0 -0
- package/dist/database/sqlite-adapter.d.ts +2 -2
- package/dist/database/sqlite-adapter.d.ts.map +1 -1
- package/dist/database/sqlite-adapter.js.map +1 -1
- package/dist/http-server.js +27 -5
- package/dist/http-server.js.map +1 -1
- package/dist/index.js +27 -4
- package/dist/index.js.map +1 -1
- package/dist/tools/about.d.ts +40 -0
- package/dist/tools/about.d.ts.map +1 -0
- package/dist/tools/about.js +61 -0
- package/dist/tools/about.js.map +1 -0
- package/dist/tools/list.d.ts +7 -0
- package/dist/tools/list.d.ts.map +1 -1
- package/dist/tools/list.js +73 -8
- package/dist/tools/list.js.map +1 -1
- package/dist/tools/registry.d.ts +11 -1
- package/dist/tools/registry.d.ts.map +1 -1
- package/dist/tools/registry.js +56 -4
- package/dist/tools/registry.js.map +1 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +17 -5
- package/dist/worker.js.map +1 -1
- package/package.json +6 -5
- package/scripts/add-cross-references.sql +0 -200
- package/scripts/analyze-survey-responses.ts +0 -285
- package/scripts/build-db.ts +0 -421
- package/scripts/bulk-reingest-all.ts +0 -331
- package/scripts/check-updates.ts +0 -294
- package/scripts/extract-eprivacy-recitals.ts +0 -98
- package/scripts/ingest-eurlex-browser.ts +0 -113
- package/scripts/ingest-eurlex.ts +0 -349
- package/scripts/ingest-unece.ts +0 -382
- package/scripts/migrate-postgres.ts +0 -445
- package/scripts/migrate-to-postgres.ts +0 -353
- package/scripts/reingest-all-with-recitals.sh +0 -81
- package/scripts/sync-versions.ts +0 -206
- package/scripts/test-cross-refs.js +0 -26
- package/scripts/test-postgres-adapter.ts +0 -146
- package/scripts/update-dora-rts-metadata.ts +0 -112
- package/src/database/postgres-adapter.ts +0 -84
- package/src/database/sqlite-adapter.ts +0 -44
- package/src/database/types.ts +0 -10
- package/src/http-server.ts +0 -149
- package/src/index.ts +0 -61
- package/src/middleware/rate-limit.ts +0 -104
- package/src/tools/applicability.ts +0 -167
- package/src/tools/article.ts +0 -81
- package/src/tools/compare.ts +0 -217
- package/src/tools/definitions.ts +0 -49
- package/src/tools/evidence.ts +0 -84
- package/src/tools/list.ts +0 -124
- package/src/tools/map.ts +0 -86
- package/src/tools/recital.ts +0 -60
- package/src/tools/registry.ts +0 -311
- package/src/tools/search.ts +0 -297
- package/src/worker.ts +0 -708
|
@@ -1,331 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env npx tsx
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* Bulk re-ingestion script for all 37 EU regulations.
|
|
5
|
-
*
|
|
6
|
-
* Uses Puppeteer-based browser ingestion to bypass EUR-Lex WAF.
|
|
7
|
-
* Processes regulations in batches of 3 parallel browser instances
|
|
8
|
-
* with 2s delays between batches for rate limiting.
|
|
9
|
-
*
|
|
10
|
-
* Usage:
|
|
11
|
-
* npx tsx scripts/bulk-reingest-all.ts
|
|
12
|
-
* npx tsx scripts/bulk-reingest-all.ts --dry-run # Preview without executing
|
|
13
|
-
*
|
|
14
|
-
* SECURITY NOTE: Uses execFile (not exec) to prevent command injection.
|
|
15
|
-
*/
|
|
16
|
-
|
|
17
|
-
import { execFile } from 'child_process';
|
|
18
|
-
import { promisify } from 'util';
|
|
19
|
-
import { readdir } from 'fs/promises';
|
|
20
|
-
import { join, basename } from 'path';
|
|
21
|
-
|
|
22
|
-
// Promise-returning form of child_process.execFile, so child processes can be awaited.
const execFileAsync = promisify(execFile);
|
|
23
|
-
|
|
24
|
-
/**
 * A regulation seed file found on disk, paired with the CELEX ID it declares.
 */
interface Regulation {
  /** CELEX identifier taken from the seed file's `celex_id` field. */
  celexId: string;
  /** Seed file name with the `.json` extension removed. */
  filename: string;
  /** Path of the seed JSON file. */
  filepath: string;
}
|
|
29
|
-
|
|
30
|
-
/**
 * Outcome of ingesting a single regulation.
 */
interface IngestionResult {
  /** Name of the regulation (its seed file name without extension). */
  regulation: string;
  /** Whether the ingestion child process completed without error. */
  success: boolean;
  /** Failure description; only set when `success` is false. */
  error?: string;
  /** Wall-clock time spent on the ingestion, in milliseconds. */
  duration?: number;
}
|
|
36
|
-
|
|
37
|
-
// Tuning knobs for the bulk run

/** How many regulations are ingested concurrently (parallel browser instances). */
const BATCH_SIZE = 3;

/** Pause between consecutive batches, in milliseconds (rate limiting). */
const BATCH_DELAY_MS = 2_000;

/** Time limit for one regulation's ingestion, in milliseconds (2 minutes). */
const TIMEOUT_MS = 2 * 60 * 1_000;
|
|
41
|
-
|
|
42
|
-
/**
 * Discover the regulation seed files in a directory.
 *
 * Every `*.json` entry is read and parsed, and its `celex_id` field is
 * extracted. Files that cannot be read or parsed, or whose `celex_id` is
 * missing or is not a non-empty string, are skipped with a console warning
 * instead of aborting discovery.
 *
 * @param seedDir - Directory to scan. Defaults to `data/seed` under the
 *   current working directory.
 * @returns The discovered regulations, sorted by file name.
 * @throws If `seedDir` itself cannot be listed.
 */
async function discoverRegulations(
  seedDir: string = join(process.cwd(), 'data', 'seed')
): Promise<Regulation[]> {
  // Loaded once, outside the loop; only `readdir` is imported statically.
  const { readFile } = await import('fs/promises');
  const files = await readdir(seedDir);

  const regulations: Regulation[] = [];

  for (const file of files) {
    if (!file.endsWith('.json')) continue;

    const filepath = join(seedDir, file);
    const filename = basename(file, '.json');

    try {
      const data: unknown = JSON.parse(await readFile(filepath, 'utf-8'));
      const celexId =
        typeof data === 'object' && data !== null
          ? (data as { celex_id?: unknown }).celex_id
          : undefined;

      // Later steps call string methods on the CELEX ID, so anything other
      // than a non-empty string is treated the same as a missing field.
      if (typeof celexId === 'string' && celexId !== '') {
        regulations.push({ celexId, filename, filepath });
      } else {
        console.warn(`⚠️ Warning: ${file} has no celex_id field`);
      }
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(`⚠️ Warning: Failed to parse ${file}:`, reason);
    }
  }

  return regulations.sort((a, b) => a.filename.localeCompare(b.filename));
}
|
|
79
|
-
|
|
80
|
-
/**
 * Ingest one regulation by running the matching ingestion script in a child
 * process.
 *
 * CELEX IDs starting with `42021X` are routed to `ingest-unece.ts`; all others
 * go to `ingest-eurlex.ts` with the `--browser` flag appended. The child is
 * started through `execFile` with an argument array, so the CELEX ID and file
 * path are passed as discrete arguments rather than spliced into a command
 * string.
 *
 * @param regulation - The seed file and CELEX ID to ingest.
 * @returns The outcome, including elapsed milliseconds. A failing child
 *   process is reported through `success: false` and `error` rather than by
 *   rejecting.
 */
async function ingestRegulation(regulation: Regulation): Promise<IngestionResult> {
  const startTime = Date.now();

  try {
    // Determine which ingestion script to use
    const isUNECE = regulation.celexId.startsWith('42021X');
    const scriptName = isUNECE ? 'ingest-unece.ts' : 'ingest-eurlex.ts';
    const scriptPath = join(process.cwd(), 'scripts', scriptName);

    // SECURITY: Using execFile (not exec) to prevent command injection
    const args = ['tsx', scriptPath, regulation.celexId, regulation.filepath];

    // EUR-Lex regulations need the browser-based path to get past the WAF
    if (!isUNECE) {
      args.push('--browser');
    }

    const { stderr } = await execFileAsync('npx', args, {
      timeout: TIMEOUT_MS,
      cwd: process.cwd(),
      maxBuffer: 10 * 1024 * 1024, // 10MB buffer for large outputs
    });

    // The child succeeded; anything on stderr is surfaced as a warning only
    if (stderr) {
      console.log(` [stderr] ${stderr.trim()}`);
    }

    return {
      regulation: regulation.filename,
      success: true,
      duration: Date.now() - startTime,
    };
  } catch (err: unknown) {
    // Narrow before reading `.message`: a thrown value is not guaranteed to be an Error
    const reason = err instanceof Error && err.message ? err.message : String(err);

    return {
      regulation: regulation.filename,
      success: false,
      error: reason,
      duration: Date.now() - startTime,
    };
  }
}
|
|
133
|
-
|
|
134
|
-
/**
 * Run ingestion over all regulations in groups of `BATCH_SIZE`.
 *
 * Members of a group are ingested concurrently; groups run one after another
 * with a pause of `BATCH_DELAY_MS` between consecutive groups (none after the
 * last one). Progress is logged per regulation.
 *
 * @param regulations - Regulations to ingest, in processing order.
 * @returns One result per regulation, in input order.
 */
async function processBatches(regulations: Regulation[]): Promise<IngestionResult[]> {
  const total = regulations.length;
  const totalBatches = Math.ceil(total / BATCH_SIZE);
  const results: IngestionResult[] = [];

  for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) {
    const offset = batchIndex * BATCH_SIZE;
    const batch = regulations.slice(offset, offset + BATCH_SIZE);

    console.log(`\n📦 Batch ${batchIndex + 1}/${totalBatches} (${batch.length} regulations)`);
    console.log('─'.repeat(60));

    // Ingest one member of the batch, logging its start and its outcome
    const runOne = async (reg: Regulation, position: number): Promise<IngestionResult> => {
      const label = `[${offset + position + 1}/${total}]`;
      console.log(`${label} Starting: ${reg.filename} (${reg.celexId})`);

      const outcome = await ingestRegulation(reg);

      console.log(
        outcome.success
          ? `${label} ✅ ${reg.filename} (${outcome.duration}ms)`
          : `${label} ❌ ${reg.filename}: ${outcome.error}`
      );

      return outcome;
    };

    // All members of the batch run in parallel
    const batchResults = await Promise.all(batch.map((reg, position) => runOne(reg, position)));
    results.push(...batchResults);

    // Rate limiting: pause unless this was the final batch
    const isLastBatch = offset + BATCH_SIZE >= total;
    if (!isLastBatch) {
      console.log(`\n⏸️ Waiting ${BATCH_DELAY_MS}ms before next batch...`);
      await new Promise(resolve => setTimeout(resolve, BATCH_DELAY_MS));
    }
  }

  return results;
}
|
|
176
|
-
|
|
177
|
-
/**
 * Log a summary of an ingestion run: success and failure counts, the mean
 * duration of the successful ingestions, and the error of each failure.
 *
 * @param results - Results to summarise.
 */
function printSummary(results: IngestionResult[]) {
  const rule = '═'.repeat(60);

  // Split the results into the two outcome groups in a single pass
  const successful: IngestionResult[] = [];
  const failed: IngestionResult[] = [];
  for (const entry of results) {
    (entry.success ? successful : failed).push(entry);
  }

  console.log('\n' + rule);
  console.log('📊 INGESTION SUMMARY');
  console.log(rule);

  console.log(`\n✅ Successful: ${successful.length}/${results.length}`);
  console.log(`❌ Failed: ${failed.length}/${results.length}`);

  if (successful.length > 0) {
    // Only successful runs contribute to the average
    let totalDuration = 0;
    for (const entry of successful) {
      totalDuration += entry.duration || 0;
    }
    console.log(`⏱️ Average duration: ${Math.round(totalDuration / successful.length)}ms`);
  }

  if (failed.length > 0) {
    console.log('\n❌ Failed regulations:');
    failed.forEach(entry => {
      console.log(` - ${entry.regulation}: ${entry.error}`);
    });
  }

  console.log('\n' + rule);
}
|
|
206
|
-
|
|
207
|
-
/**
 * Rebuild the database by running `npm run build:db` in the current working
 * directory, with a one-minute time limit.
 *
 * @throws Re-throws the child-process error after logging it.
 */
async function rebuildDatabase() {
  console.log('\n🔨 Rebuilding database...');

  try {
    const build = await execFileAsync('npm', ['run', 'build:db'], {
      timeout: 60000, // 1 minute timeout
      cwd: process.cwd(),
    });

    console.log(build.stdout);
    console.log('✅ Database rebuilt successfully');
  } catch (err: any) {
    console.error('❌ Database rebuild failed:', err.message);
    throw err;
  }
}
|
|
226
|
-
|
|
227
|
-
/**
 * Query the rebuilt database through the `sqlite3` command-line tool and log
 * recital statistics: the ten regulations with the most recitals, and the
 * total recital count.
 *
 * A warning is logged when the total is below 2000, or when the tool's output
 * cannot be read as a number. Errors from the child process are logged and
 * swallowed: verification is best-effort and never rejects.
 */
async function verifyRecitals() {
  console.log('\n📊 Verifying recital counts...');

  const dbFile = 'data/regulations.db';
  const sqliteOptions = { cwd: process.cwd() };

  try {
    const { stdout } = await execFileAsync(
      'sqlite3',
      [
        dbFile,
        'SELECT regulation, COUNT(*) as recital_count FROM recitals GROUP BY regulation ORDER BY recital_count DESC LIMIT 10;',
      ],
      sqliteOptions
    );

    console.log('\nTop 10 regulations by recital count:');
    console.log(stdout);

    // Get total count
    const { stdout: totalStdout } = await execFileAsync(
      'sqlite3',
      [dbFile, 'SELECT COUNT(*) FROM recitals;'],
      sqliteOptions
    );

    const totalRecitals = Number.parseInt(totalStdout.trim(), 10);

    // NaN compares false against any threshold, so it would otherwise be
    // reported as a healthy count.
    if (Number.isNaN(totalRecitals)) {
      console.warn(
        `⚠️ Warning: Could not read recital count from sqlite3 output: ${JSON.stringify(totalStdout.trim())}`
      );
      return;
    }

    console.log(`\n📈 Total recitals in database: ${totalRecitals}`);

    if (totalRecitals < 2000) {
      console.warn(`⚠️ Warning: Expected ~2,500+ recitals, got ${totalRecitals}`);
    } else {
      console.log('✅ Recital count looks good!');
    }
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.error('❌ Verification failed:', reason);
  }
}
|
|
265
|
-
|
|
266
|
-
/**
 * Script entry point.
 *
 * Discovers the seed regulations and lists them. With `--dry-run` it stops
 * there. Otherwise it ingests every regulation in batches, prints a summary,
 * and, if at least one ingestion succeeded, rebuilds the database and
 * verifies the recital counts.
 *
 * The process exits with status 1 when no regulations are found or when any
 * ingestion failed, and with status 0 when all of them succeeded.
 */
async function main() {
  const isDryRun = process.argv.includes('--dry-run');
  const rule = '═'.repeat(60);

  console.log('🚀 EU Regulations Bulk Re-Ingestion');
  console.log(rule);
  console.log(`Mode: ${isDryRun ? 'DRY RUN' : 'LIVE'}`);
  console.log(`Batch size: ${BATCH_SIZE} parallel instances`);
  console.log(`Batch delay: ${BATCH_DELAY_MS}ms`);
  console.log(`Timeout per regulation: ${TIMEOUT_MS}ms`);
  console.log(rule);

  // Step 1: Discover regulations
  console.log('\n🔍 Discovering regulations...');
  const regulations = await discoverRegulations();
  console.log(`Found ${regulations.length} regulations in data/seed/`);

  if (regulations.length === 0) {
    console.error('❌ No regulations found. Exiting.');
    process.exit(1);
  }

  // List regulations
  console.log('\nRegulations to process:');
  for (const reg of regulations) {
    const type = reg.celexId.startsWith('42021X') ? '[UNECE]' : '[EUR-Lex]';
    console.log(` ${type} ${reg.filename} (${reg.celexId})`);
  }

  if (isDryRun) {
    console.log('\n✅ Dry run complete. No ingestion performed.');
    return;
  }

  // Announce what is about to happen (there is no interactive confirmation)
  console.log('\n⚠️ This will re-ingest all regulations using browser automation.');

  // Delays only occur between batches, so there is one fewer delay than
  // there are batches.
  const batchCount = Math.ceil(regulations.length / BATCH_SIZE);
  const delaySeconds = Math.ceil(((batchCount - 1) * BATCH_DELAY_MS) / 1000);
  console.log(`⏱️ Estimated time: ${delaySeconds} seconds + ingestion time`);

  // Step 2: Process batches
  const results = await processBatches(regulations);

  // Step 3: Print summary
  printSummary(results);

  // Step 4: Rebuild database
  if (results.some(r => r.success)) {
    await rebuildDatabase();

    // Step 5: Verify recitals
    await verifyRecitals();
  } else {
    console.error('\n❌ No successful ingestions. Skipping database rebuild.');
  }

  // Exit with appropriate code
  const hasFailures = results.some(r => !r.success);
  process.exit(hasFailures ? 1 : 0);
}
|
|
326
|
-
|
|
327
|
-
// Start the script; anything that escapes main() is logged and turned into exit status 1
main().catch((fatal) => {
  console.error('💥 Fatal error:', fatal);
  process.exit(1);
});
|
package/scripts/check-updates.ts
DELETED
|
@@ -1,294 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env npx tsx
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* Check for updates to EU regulations from EUR-Lex.
|
|
5
|
-
* Compares current database versions against EUR-Lex metadata.
|
|
6
|
-
*
|
|
7
|
-
* Usage: npx tsx scripts/check-updates.ts
|
|
8
|
-
*/
|
|
9
|
-
|
|
10
|
-
import Database from 'better-sqlite3';
|
|
11
|
-
import { existsSync } from 'fs';
|
|
12
|
-
import { join, dirname } from 'path';
|
|
13
|
-
import { fileURLToPath } from 'url';
|
|
14
|
-
|
|
15
|
-
// Derive this module's file and directory paths from its URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

/** Path of the SQLite database: `data/regulations.db`, one level above this script's directory. */
const DB_PATH = join(__dirname, '..', 'data', 'regulations.db');
|
|
19
|
-
|
|
20
|
-
/**
 * A row of the `source_registry` table.
 */
interface SourceRecord {
  /** Regulation identifier; used as the key when updating a row. */
  regulation: string;
  /** CELEX ID used to look the regulation up on EUR-Lex. */
  celex_id: string;
  /** Version date recorded for this regulation, or null when none has been recorded. */
  eur_lex_version: string | null;
  /** ISO timestamp of the last fetch, or null when never fetched. */
  last_fetched: string | null;
  /** Number of articles expected, when known. */
  articles_expected: number | null;
  /** Number of articles parsed, when known. */
  articles_parsed: number | null;
  /** Ingestion quality marker; `'complete'` means nothing is outstanding. */
  quality_status: string;
}
|
|
29
|
-
|
|
30
|
-
/**
 * Metadata scraped from a regulation's EUR-Lex page.
 */
interface EurLexMetadata {
  /** CELEX ID the page was requested for. */
  celexId: string;
  /** Best available version date as `YYYY-MM-DD`, or `'unknown'` when none was found. */
  lastModified: string;
  /** Contents of the page's `<title>` element, or `'Unknown'`. */
  title: string;
  /** ELI document date as `YYYY-MM-DD`, or `'unknown'`. */
  dateDocument: string;
  /** Optional list of consolidated versions. */
  consolidatedVersions?: string[];
}
|
|
37
|
-
|
|
38
|
-
// No hardcoded list - source_registry table IS the source of truth
|
|
39
|
-
// To add a new regulation: ingest it, and it's automatically monitored
|
|
40
|
-
|
|
41
|
-
/**
 * Fetch a regulation's EUR-Lex page and extract its version metadata.
 *
 * The version date (`lastModified`) is the first of these that is present:
 *   1. the ELI publication date `<meta>` element,
 *   2. the ELI document date `<meta>` element,
 *   3. the visible "Date of document: DD/MM/YYYY" text,
 *   4. any ISO date following the letters "eli" inside the same tag or text
 *      run (loose last-resort fallback),
 * and `'unknown'` otherwise.
 *
 * @param celexId - CELEX ID of the regulation to look up.
 * @returns The extracted metadata, or `null` when the request fails or the
 *   server answers with a non-OK status (the failure is logged).
 */
async function fetchEurLexMetadata(celexId: string): Promise<EurLexMetadata | null> {
  const infoUrl = `https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:${celexId}`;

  try {
    const response = await fetch(infoUrl, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; EU-Compliance-MCP/1.0)',
        'Accept': 'text/html',
      },
    });

    if (!response.ok) {
      console.error(`Failed to fetch metadata for ${celexId}: ${response.status}`);
      return null;
    }

    const html = await response.text();

    // ELI <meta> elements may list `property` and `content` in either order
    const isoDate = '(\\d{4}-\\d{2}-\\d{2})';
    const eliDate = (property: string): string | undefined =>
      html.match(new RegExp(`property="${property}"[^>]*content="${isoDate}"`))?.[1] ??
      html.match(new RegExp(`content="${isoDate}"[^>]*property="${property}"`))?.[1];

    const publicationDate = eliDate('eli:date_publication');
    const documentDate = eliDate('eli:date_document');

    // Visible text on the page, converted from DD/MM/YYYY
    const visibleMatch = html.match(/Date of document:\s*(\d{2}\/\d{2}\/\d{4})/i);
    const visibleDate = visibleMatch?.[1] ? convertDateFormat(visibleMatch[1]) : undefined;

    // Loose pattern: "eli" can occur inside ordinary words, so this is only
    // consulted after every explicit source has come up empty.
    const genericEliDate = html.match(/eli[^>]*(\d{4}-\d{2}-\d{2})/i)?.[1];

    const titleMatch = html.match(/<title>([^<]+)<\/title>/i);

    // Prefer the publication date for tracking updates
    const lastModified =
      publicationDate || documentDate || visibleDate || genericEliDate || 'unknown';

    return {
      celexId,
      lastModified,
      title: titleMatch?.[1]?.trim() || 'Unknown',
      dateDocument: documentDate || 'unknown',
    };
  } catch (error) {
    console.error(`Error fetching metadata for ${celexId}:`, error);
    return null;
  }
}
|
|
95
|
-
|
|
96
|
-
/**
 * Convert a `DD/MM/YYYY` date to ISO `YYYY-MM-DD`.
 *
 * Surrounding whitespace is ignored and single-digit days and months are
 * zero-padded, so the result is always a well-formed ISO date string.
 *
 * @param date - Date text in day/month/four-digit-year order.
 * @returns The ISO form, or `date` unchanged when it is not in that format.
 */
function convertDateFormat(date: string): string {
  const match = /^\s*(\d{1,2})\/(\d{1,2})\/(\d{4})\s*$/.exec(date);
  if (!match) {
    return date;
  }

  const [, day = '', month = '', year = ''] = match;
  return `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
}
|
|
104
|
-
|
|
105
|
-
/**
 * Sync mode: record the current EUR-Lex version of every regulation in the
 * database.
 *
 * For each `source_registry` row with a CELEX ID, the EUR-Lex metadata is
 * fetched and, when a version date could be determined, written to
 * `eur_lex_version` together with the current timestamp in `last_fetched`.
 * Rows whose version cannot be determined are left untouched.
 *
 * Exits the process with status 1 when the database file does not exist.
 */
async function syncVersions(): Promise<void> {
  console.log('Syncing EUR-Lex versions to database...\n');

  if (!existsSync(DB_PATH)) {
    console.log('Database not found. Run `npm run build:db` first.');
    process.exit(1);
  }

  const db = new Database(DB_PATH);
  let updated = 0;

  // Close the handle even when a fetch or an update throws part-way through
  try {
    // Only these two columns are selected, so only they are typed
    const sources = db.prepare(`
      SELECT regulation, celex_id FROM source_registry
      WHERE celex_id IS NOT NULL AND celex_id != ''
    `).all() as Pick<SourceRecord, 'regulation' | 'celex_id'>[];

    const updateStmt = db.prepare(`
      UPDATE source_registry SET eur_lex_version = ?, last_fetched = ?
      WHERE regulation = ?
    `);

    // One timestamp is shared by every row updated in this run
    const now = new Date().toISOString();

    for (const source of sources) {
      process.stdout.write(`${source.regulation}: `);
      const metadata = await fetchEurLexMetadata(source.celex_id);

      if (metadata && metadata.lastModified !== 'unknown') {
        updateStmt.run(metadata.lastModified, now, source.regulation);
        console.log(`synced to ${metadata.lastModified}`);
        updated++;
      } else {
        console.log('skipped (unknown version)');
      }
    }
  } finally {
    db.close();
  }

  console.log(`\n✓ Synced ${updated} regulation(s)`);
}
|
|
145
|
-
|
|
146
|
-
/**
 * Check mode: compare each monitored regulation against EUR-Lex and print a
 * status report.
 *
 * Every `source_registry` row with a CELEX ID is checked and given one status:
 *   - FETCH FAILED: the EUR-Lex page could not be retrieved
 *   - MANUAL CHECK REQUIRED: no version date could be extracted from the page
 *   - VERSION NOT TRACKED: the database has no recorded version to compare with
 *   - UPDATE AVAILABLE: the EUR-Lex date is later than the recorded one
 *   - INCOMPLETE: the row's `quality_status` is not `'complete'`
 *   - UP TO DATE: none of the above
 * Rows in the UPDATE AVAILABLE and INCOMPLETE states are listed in the
 * summary together with the commands that re-ingest them.
 *
 * Finally the CELEX IDs, joined with `|`, are published as the
 * `celex_pattern` output: appended to the file named by `GITHUB_OUTPUT` when
 * that variable is set, otherwise printed with the legacy `::set-output`
 * syntax.
 *
 * Exits the process with status 1 when the database file does not exist and
 * with status 0 when the registry has no rows to check.
 */
async function checkForUpdates(): Promise<void> {
  console.log('Checking EUR-Lex for regulation updates...\n');

  // Check if database exists
  if (!existsSync(DB_PATH)) {
    console.log('Database not found. Run `npm run build:db` first.');
    process.exit(1);
  }

  // Get all regulations from source_registry - this IS the source of truth.
  // The rows are loaded up front, so the handle is closed before any network
  // request is made and cannot leak if a later step throws.
  const db = new Database(DB_PATH, { readonly: true });
  let sources: SourceRecord[];
  try {
    sources = db.prepare(`
      SELECT regulation, celex_id, eur_lex_version, last_fetched, quality_status
      FROM source_registry
      WHERE celex_id IS NOT NULL AND celex_id != ''
      ORDER BY regulation
    `).all() as SourceRecord[];
  } finally {
    db.close();
  }

  if (sources.length === 0) {
    console.log('No regulations found in source_registry.');
    console.log('Ingest regulations first with: npx tsx scripts/ingest-eurlex.ts <CELEX_ID> <output.json>');
    process.exit(0);
  }

  console.log(`Found ${sources.length} regulation(s) to check\n`);
  console.log('Status Report');
  console.log('='.repeat(80));

  // True when the EUR-Lex date is later than the local one. Unparseable dates
  // yield NaN, which compares false, so they never count as newer.
  const isNewer = (eurLex: string, local: string): boolean => {
    if (eurLex === 'unknown' || !eurLex) return false;
    return Date.parse(eurLex) > Date.parse(local);
  };

  const updates: Array<{ id: string; celex_id: string; reason: string }> = [];

  for (const source of sources) {
    process.stdout.write(`\n${source.regulation.padEnd(20)} (${source.celex_id}): `);

    // Fetch current EUR-Lex metadata
    const metadata = await fetchEurLexMetadata(source.celex_id);

    if (!metadata) {
      console.log('FETCH FAILED');
      continue;
    }

    const lastFetched = source.last_fetched || 'never';
    const eurLexVersion = metadata.lastModified;

    if (eurLexVersion === 'unknown') {
      // UNECE or non-standard documents - can't auto-check
      console.log('MANUAL CHECK REQUIRED');
      console.log(` Source type: Non-standard (UNECE/consolidated)`);
      console.log(` Last fetched: ${lastFetched}`);
    } else if (!source.eur_lex_version) {
      // Nothing recorded locally to compare against - not flagged as an update
      console.log('VERSION NOT TRACKED');
      console.log(` EUR-Lex version: ${eurLexVersion}`);
      console.log(` Run ingest again to record version`);
    } else if (isNewer(eurLexVersion, source.eur_lex_version)) {
      // EUR-Lex has a newer version
      console.log('UPDATE AVAILABLE');
      console.log(` Local version: ${source.eur_lex_version}`);
      console.log(` EUR-Lex version: ${eurLexVersion}`);
      updates.push({
        id: source.regulation,
        celex_id: source.celex_id,
        reason: `Newer version: ${source.eur_lex_version} -> ${eurLexVersion}`
      });
    } else if (source.quality_status !== 'complete') {
      console.log(`INCOMPLETE (${source.quality_status})`);
      updates.push({
        id: source.regulation,
        celex_id: source.celex_id,
        reason: `Quality status: ${source.quality_status}`
      });
    } else {
      console.log('UP TO DATE');
      console.log(` EUR-Lex version: ${eurLexVersion}`);
      console.log(` Last fetched: ${lastFetched}`);
    }
  }

  // Summary
  console.log('\n' + '='.repeat(80));
  console.log('Summary');
  console.log('='.repeat(80));

  if (updates.length === 0) {
    console.log('\n✓ All monitored regulations are up to date.');
  } else {
    console.log(`\n⚠ ${updates.length} regulation(s) need attention:\n`);
    for (const u of updates) {
      console.log(` - ${u.id}: ${u.reason}`);
    }

    console.log('\nTo update, run:');
    for (const u of updates) {
      console.log(` npx tsx scripts/ingest-eurlex.ts ${u.celex_id} data/seed/${u.id.toLowerCase()}.json`);
    }
    console.log('\nThen: npm run build:db');
  }

  // Output for CI. GitHub Actions deprecated the `::set-output` workflow
  // command in favour of appending `name=value` lines to the file named by
  // GITHUB_OUTPUT, so that file is used whenever the runner provides it.
  const celexList = sources.map(s => s.celex_id).join('|');
  const outputFile = process.env.GITHUB_OUTPUT;
  if (outputFile) {
    const { appendFileSync } = await import('fs');
    appendFileSync(outputFile, `celex_pattern=${celexList}\n`);
  } else {
    console.log(`\n::set-output name=celex_pattern::${celexList}`);
  }
}
|
|
264
|
-
|
|
265
|
-
/**
 * Record a completed ingestion in the `source_registry` table.
 *
 * Inserts the row for `regulation`, or replaces it if one already exists,
 * marking it `'complete'`. Today's date (`YYYY-MM-DD`) is stored as the
 * version and the current ISO timestamp as the fetch time; `articleCount` is
 * stored as both the expected and the parsed article count.
 *
 * @param db - Open database handle with write access.
 * @param regulation - Regulation identifier (row key).
 * @param celexId - CELEX ID of the regulation.
 * @param articleCount - Number of articles ingested.
 */
export async function updateSourceRegistry(
  db: Database.Database,
  regulation: string,
  celexId: string,
  articleCount: number
): Promise<void> {
  const fetchedAt = new Date().toISOString();
  // Date part of the ISO timestamp
  const versionDate = fetchedAt.split('T')[0];

  const upsert = db.prepare(`
    INSERT OR REPLACE INTO source_registry
    (regulation, celex_id, eur_lex_version, last_fetched, articles_expected, articles_parsed, quality_status)
    VALUES (?, ?, ?, ?, ?, ?, 'complete')
  `);

  upsert.run(regulation, celexId, versionDate, fetchedAt, articleCount, articleCount);
}
|
|
280
|
-
|
|
281
|
-
// Command-line dispatch: `--sync` runs syncVersions, anything else runs
// checkForUpdates. A rejection from either is logged and exits with status 1.
const args = process.argv.slice(2);

const runSelectedMode = args.includes('--sync') ? syncVersions : checkForUpdates;

runSelectedMode().catch(err => {
  console.error('Error:', err);
  process.exit(1);
});
|