@ansvar/eu-regulations-mcp 0.8.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +76 -29
- package/data/regulations.db +0 -0
- package/data/seed/applicability/chips-act.json +67 -0
- package/data/seed/applicability/crma.json +85 -0
- package/data/seed/chips-act.json +714 -0
- package/data/seed/crma.json +877 -0
- package/data/seed/mappings/iso27001-chips-act.json +50 -0
- package/data/seed/mappings/iso27001-crma.json +50 -0
- package/data/seed/mappings/nist-csf-chips-act.json +56 -0
- package/data/seed/mappings/nist-csf-crma.json +56 -0
- package/dist/database/sqlite-adapter.d.ts +2 -2
- package/dist/database/sqlite-adapter.d.ts.map +1 -1
- package/dist/database/sqlite-adapter.js.map +1 -1
- package/dist/http-server.js +27 -5
- package/dist/http-server.js.map +1 -1
- package/dist/index.js +27 -4
- package/dist/index.js.map +1 -1
- package/dist/tools/about.d.ts +40 -0
- package/dist/tools/about.d.ts.map +1 -0
- package/dist/tools/about.js +61 -0
- package/dist/tools/about.js.map +1 -0
- package/dist/tools/list.d.ts +7 -0
- package/dist/tools/list.d.ts.map +1 -1
- package/dist/tools/list.js +73 -8
- package/dist/tools/list.js.map +1 -1
- package/dist/tools/registry.d.ts +11 -1
- package/dist/tools/registry.d.ts.map +1 -1
- package/dist/tools/registry.js +56 -4
- package/dist/tools/registry.js.map +1 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +17 -5
- package/dist/worker.js.map +1 -1
- package/package.json +8 -7
- package/scripts/add-cross-references.sql +0 -200
- package/scripts/analyze-survey-responses.ts +0 -285
- package/scripts/build-db.ts +0 -421
- package/scripts/bulk-reingest-all.ts +0 -331
- package/scripts/check-updates.ts +0 -294
- package/scripts/extract-eprivacy-recitals.ts +0 -98
- package/scripts/ingest-eurlex-browser.ts +0 -113
- package/scripts/ingest-eurlex.ts +0 -346
- package/scripts/ingest-unece.ts +0 -382
- package/scripts/migrate-postgres.ts +0 -445
- package/scripts/migrate-to-postgres.ts +0 -353
- package/scripts/reingest-all-with-recitals.sh +0 -81
- package/scripts/sync-versions.ts +0 -206
- package/scripts/test-cross-refs.js +0 -26
- package/scripts/test-postgres-adapter.ts +0 -146
- package/scripts/update-dora-rts-metadata.ts +0 -112
- package/src/database/postgres-adapter.ts +0 -84
- package/src/database/sqlite-adapter.ts +0 -44
- package/src/database/types.ts +0 -10
- package/src/http-server.ts +0 -149
- package/src/index.ts +0 -61
- package/src/middleware/rate-limit.ts +0 -104
- package/src/tools/applicability.ts +0 -167
- package/src/tools/article.ts +0 -81
- package/src/tools/compare.ts +0 -217
- package/src/tools/definitions.ts +0 -49
- package/src/tools/evidence.ts +0 -84
- package/src/tools/list.ts +0 -124
- package/src/tools/map.ts +0 -86
- package/src/tools/recital.ts +0 -60
- package/src/tools/registry.ts +0 -311
- package/src/tools/search.ts +0 -297
- package/src/worker.ts +0 -708
|
@@ -1,98 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env npx tsx
|
|
2
|
-
|
|
3
|
-
import { writeFileSync, readFileSync } from 'fs';
|
|
4
|
-
import { JSDOM } from 'jsdom';
|
|
5
|
-
|
|
6
|
-
/**
 * Download the ePrivacy Directive (CELEX 32002L0058) from EUR-Lex, extract its
 * numbered recitals from the page's <p> elements and store them under the
 * `recitals` key of data/seed/eprivacy.json.
 *
 * Recital parsing starts after a paragraph that reads exactly "Whereas:" and
 * stops at the first paragraph starting with "HAVE ADOPTED" or "Article 1".
 * A paragraph starting with "(n)" opens recital n; following paragraphs are
 * appended to it, joined by blank lines.
 *
 * @throws Error when the HTTP response status is not OK; errors from reading,
 *         parsing or writing the seed file also propagate.
 */
async function fetchEPrivacyRecitals() {
  const url = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32002L0058';
  console.log(`Fetching: ${url}`);

  const response = await fetch(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (compatible)',
      'Accept': 'text/html',
    },
  });

  if (!response.ok) {
    throw new Error(`Failed: ${response.status}`);
  }

  const html = await response.text();
  const dom = new JSDOM(html);
  const doc = dom.window.document;

  // Extract all paragraphs
  const paragraphs = Array.from(doc.querySelectorAll('p'));

  const recitals: Array<{ recital_number: number; text: string }> = [];
  let inRecitals = false;
  let currentNumber: number | null = null;
  let currentText: string[] = [];

  // Store the recital being accumulated (if it has any text) and reset the
  // accumulator. Resetting is what prevents the final recital from being
  // stored twice: once when the terminator is seen and again after the loop.
  const flushCurrent = (): void => {
    if (currentNumber !== null && currentText.length > 0) {
      recitals.push({
        recital_number: currentNumber,
        text: currentText.join('\n\n'),
      });
    }
    currentNumber = null;
    currentText = [];
  };

  for (const p of paragraphs) {
    const text = p.textContent?.trim() || '';

    // Check if we've entered the recitals section
    if (text === 'Whereas:') {
      inRecitals = true;
      continue;
    }

    // Check if we've left the recitals section
    if (/^HAVE ADOPTED/i.test(text) || /^Article\s+1/i.test(text)) {
      flushCurrent();
      break;
    }

    if (!inRecitals) continue;

    // Check for recital number at start: "(1)", "(2)", etc.
    const recitalMatch = text.match(/^\((\d+)\)/);

    if (recitalMatch) {
      // Save previous recital, then start the new one
      flushCurrent();
      currentNumber = parseInt(recitalMatch[1], 10);
      const remainingText = text.substring(recitalMatch[0].length).trim();
      currentText = remainingText ? [remainingText] : [];
    } else if (currentNumber !== null && text.length > 0) {
      // Continuation paragraph of the current recital
      currentText.push(text);
    }
  }

  // Covers documents that end without a terminator paragraph; a no-op when
  // the terminator branch above already flushed.
  flushCurrent();

  console.log(`Extracted ${recitals.length} recitals`);

  // Load existing ePrivacy JSON
  const existingData = JSON.parse(readFileSync('data/seed/eprivacy.json', 'utf-8'));
  existingData.recitals = recitals;

  // Save updated file
  writeFileSync('data/seed/eprivacy.json', JSON.stringify(existingData, null, 2));
  console.log(`Saved to: data/seed/eprivacy.json`);
  console.log(`Articles: ${existingData.articles.length}`);
  console.log(`Recitals: ${recitals.length}`);
}
|
|
97
|
-
|
|
98
|
-
// Script entry point: run the extraction. On failure, log the error and set a
// non-zero exit code so a failed run is not reported as success.
fetchEPrivacyRecitals().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
|
|
@@ -1,113 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env npx tsx
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* Browser-based EUR-Lex fetcher to bypass AWS WAF challenges.
|
|
5
|
-
*
|
|
6
|
-
* EUR-Lex deployed AWS WAF that returns 2036-byte JavaScript challenge pages
|
|
7
|
-
* instead of actual HTML when using fetch(). This script launches a headless
|
|
8
|
-
* browser to wait for the challenge to complete and retrieve the real content.
|
|
9
|
-
*
|
|
10
|
-
* Usage:
|
|
11
|
-
* npx tsx scripts/ingest-eurlex-browser.ts <celex_id>
|
|
12
|
-
* npx tsx scripts/ingest-eurlex-browser.ts 32016R0679
|
|
13
|
-
*
|
|
14
|
-
* Or import as a function:
|
|
15
|
-
* import { fetchEurLexWithBrowser } from './scripts/ingest-eurlex-browser';
|
|
16
|
-
* const html = await fetchEurLexWithBrowser('32016R0679');
|
|
17
|
-
*/
|
|
18
|
-
|
|
19
|
-
import puppeteer from 'puppeteer';
|
|
20
|
-
|
|
21
|
-
/**
 * Load the English HTML rendition of a EUR-Lex document in a headless browser
 * (Puppeteer) and return the rendered page source.
 *
 * After navigation settles the function waits a further five seconds before
 * reading the page, to give a WAF JavaScript challenge time to finish.
 *
 * @param celexId - CELEX identifier (e.g., '32016R0679' for GDPR)
 * @returns the serialized HTML of the loaded page
 * @throws Error if the HTML is shorter than 10000 characters or still contains
 *         the `window.gokuProps` challenge marker; Puppeteer launch/navigation
 *         errors propagate as well. The browser is closed in every case.
 */
export async function fetchEurLexWithBrowser(celexId: string): Promise<string> {
  const url = `https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:${celexId}`;
  console.log(`[Browser] Launching headless browser...`);
  console.log(`[Browser] Fetching: ${url}`);

  const launchFlags = [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-accelerated-2d-canvas',
    '--disable-gpu',
  ];
  const browser = await puppeteer.launch({ headless: true, args: launchFlags });

  try {
    const page = await browser.newPage();

    // Present a desktop Chrome User-Agent and a full-size viewport.
    await page.setUserAgent(
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );
    await page.setViewport({ width: 1920, height: 1080 });

    console.log(`[Browser] Navigating to URL...`);
    // 'networkidle0': wait until the network is idle, 30 s at most.
    await page.goto(url, { waitUntil: 'networkidle0', timeout: 30000 });

    console.log(`[Browser] Page loaded. Waiting for WAF challenge to complete...`);

    // Extra grace period so the challenge script can finish running.
    await new Promise(done => setTimeout(done, 5000));

    const html = await page.content();
    const size = html.length;

    console.log(`[Browser] Fetched ${size} bytes`);

    // Reject results that look like a challenge page rather than a document.
    if (size < 10000) {
      throw new Error(`Fetched content is suspiciously small (${size} bytes). Possible WAF block.`);
    }
    if (html.includes('window.gokuProps')) {
      throw new Error('Received AWS WAF challenge page instead of regulation content.');
    }

    // Soft check only: warn, but still return the content.
    const mentionsArticle = html.includes('Article');
    if (!mentionsArticle) {
      console.warn('[Browser] Warning: HTML does not contain "Article" - may not be valid regulation content');
    }

    return html;
  } finally {
    await browser.close();
    console.log(`[Browser] Browser closed`);
  }
}
|
|
90
|
-
|
|
91
|
-
// CLI interface: runs only when this module is the script passed to node/tsx
// (its URL equals file://<argv[1]>), not when it is imported.
if (import.meta.url === `file://${process.argv[1]}`) {
  const celexId = process.argv[2];

  if (!celexId) {
    const usageLines = [
      'Usage: npx tsx scripts/ingest-eurlex-browser.ts <celex_id>',
      'Example: npx tsx scripts/ingest-eurlex-browser.ts 32016R0679',
      '\nThis will fetch the HTML and print it to stdout.',
      'Pipe to a file: npx tsx scripts/ingest-eurlex-browser.ts 32016R0679 > output.html',
    ];
    for (const usageLine of usageLines) {
      console.log(usageLine);
    }
    process.exit(1);
  }

  // Print a banner followed by the HTML on stdout so it can be piped.
  const printHtml = (html: string): void => {
    console.log('\n--- HTML Content ---\n');
    console.log(html);
  };

  // Report the failure message and exit non-zero.
  const reportFailure = (err: { message: unknown }): void => {
    console.error('Error:', err.message);
    process.exit(1);
  };

  fetchEurLexWithBrowser(celexId).then(printHtml).catch(reportFailure);
}
|
package/scripts/ingest-eurlex.ts
DELETED
|
@@ -1,346 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env npx tsx
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* Ingest EU regulations from EUR-Lex.
|
|
5
|
-
*
|
|
6
|
-
* Usage: npx tsx scripts/ingest-eurlex.ts <celex_id> <output_file> [--browser]
|
|
7
|
-
* Example: npx tsx scripts/ingest-eurlex.ts 32016R0679 data/seed/gdpr.json
|
|
8
|
-
* Example (with browser): npx tsx scripts/ingest-eurlex.ts 32016R0679 data/seed/gdpr.json --browser
|
|
9
|
-
*/
|
|
10
|
-
|
|
11
|
-
import { writeFileSync } from 'fs';
|
|
12
|
-
import { JSDOM } from 'jsdom';
|
|
13
|
-
import { fetchEurLexWithBrowser } from './ingest-eurlex-browser.js';
|
|
14
|
-
|
|
15
|
-
/** One article of a regulation, as written to the seed JSON. */
interface Article {
  /** Article number taken from the "Article N" heading, e.g. "5" or "5a". */
  number: string;
  /** Heading line that follows the article marker, when one was detected. */
  title?: string;
  /** Article body; its lines are joined with blank lines. */
  text: string;
  /** Roman numeral of the chapter heading the article appears under, if any. */
  chapter?: string;
}
|
|
21
|
-
|
|
22
|
-
/** A defined term extracted from a regulation's definitions article. */
interface Definition {
  /** The defined term (stored lower-cased by the parser in this file). */
  term: string;
  /** Text that follows "means" for this term. */
  definition: string;
  /** Number of the article the definition was taken from. */
  article: string;
}
|
|
27
|
-
|
|
28
|
-
/** One numbered recital from a regulation's preamble. */
interface Recital {
  /** Number parsed from the leading "(n)" marker. */
  recital_number: number;
  /** Recital text; its lines are joined with blank lines. */
  text: string;
  /** Not populated by the parser in this file — presumably filled in elsewhere; verify against consumers. */
  related_articles?: string;
}
|
|
33
|
-
|
|
34
|
-
/** Shape of the seed JSON file written for one ingested regulation. */
interface RegulationData {
  /** Short identifier such as "GDPR"; falls back to the CELEX id when unknown. */
  id: string;
  /** Human-readable name of the regulation. */
  full_name: string;
  /** CELEX identifier the document was fetched with. */
  celex_id: string;
  /** Date in YYYY-MM-DD form, when known from the metadata table. */
  effective_date?: string;
  /** Link to the document on EUR-Lex. */
  eur_lex_url: string;
  /** Parsed articles, ordered by article number. */
  articles: Article[];
  /** Terms extracted from the definitions article (may be empty). */
  definitions: Definition[];
  /** Parsed recitals, when available. */
  recitals?: Recital[];
}
|
|
44
|
-
|
|
45
|
-
/**
 * Known documents keyed by CELEX id: short identifier, display name and
 * effective date. Two ids appear under more than one CELEX key (EIDAS2 and
 * UN_R155), so different CELEX ids can resolve to the same identifier.
 */
const REGULATION_METADATA: Record<string, { id: string; full_name: string; effective_date?: string }> = {
  '32016R0679': {
    id: 'GDPR',
    full_name: 'General Data Protection Regulation',
    effective_date: '2018-05-25',
  },
  '32022L2555': {
    id: 'NIS2',
    full_name: 'Directive on measures for a high common level of cybersecurity across the Union',
    effective_date: '2024-10-17',
  },
  '32022R2554': {
    id: 'DORA',
    full_name: 'Digital Operational Resilience Act',
    effective_date: '2025-01-17',
  },
  '32024R1689': {
    id: 'AI_ACT',
    full_name: 'Artificial Intelligence Act',
    effective_date: '2024-08-01',
  },
  '32024R2847': {
    id: 'CRA',
    full_name: 'Cyber Resilience Act',
    effective_date: '2024-12-10',
  },
  '32019R0881': {
    id: 'CYBERSECURITY_ACT',
    full_name: 'EU Cybersecurity Act',
    effective_date: '2019-06-27',
  },
  '32024R1183': {
    id: 'EIDAS2',
    full_name: 'European Digital Identity Framework (eIDAS 2.0)',
    effective_date: '2024-05-20',
  },
  '02014R0910-20241018': {
    id: 'EIDAS2',
    full_name: 'European Digital Identity Framework (eIDAS 2.0)',
    effective_date: '2024-05-20',
  },

  // Digital Single Market regulations
  '32023R2854': {
    id: 'DATA_ACT',
    full_name: 'Data Act',
    effective_date: '2025-09-12',
  },
  '32022R2065': {
    id: 'DSA',
    full_name: 'Digital Services Act',
    effective_date: '2024-02-17',
  },
  '32022R1925': {
    id: 'DMA',
    full_name: 'Digital Markets Act',
    effective_date: '2023-05-02',
  },

  // UN Regulations (adopted by EU)
  '42021X0387': {
    id: 'UN_R155',
    full_name: 'UN Regulation No. 155 - Cyber security and cyber security management system',
    effective_date: '2021-01-22',
  },
  '42025X0005': {
    id: 'UN_R155',
    full_name: 'UN Regulation No. 155 - Cyber security and cyber security management system (Supplement 3)',
    effective_date: '2025-01-10',
  },
};
|
|
62
|
-
|
|
63
|
-
/**
 * Fetch the English HTML rendition of a EUR-Lex document.
 *
 * @param celexId - CELEX identifier of the document
 * @param useBrowser - when true, delegate to the Puppeteer-based fetcher;
 *                     otherwise issue a plain HTTP GET
 * @returns the response body as text
 * @throws Error when the plain HTTP response status is not OK
 */
async function fetchEurLexHtml(celexId: string, useBrowser = false): Promise<string> {
  if (!useBrowser) {
    // Direct fetch (will fail with WAF)
    const url = `https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:${celexId}`;
    console.log(`Fetching: ${url}`);

    const requestHeaders = {
      'User-Agent': 'Mozilla/5.0 (compatible; EU-Compliance-MCP/1.0; +https://github.com/Ansvar-Systems/EU_compliance_MCP)',
      'Accept': 'text/html',
    };
    const response = await fetch(url, { headers: requestHeaders });

    if (response.ok) {
      return response.text();
    }
    throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
  }

  console.log('Using Puppeteer to bypass WAF...');
  return fetchEurLexWithBrowser(celexId);
}
|
|
86
|
-
|
|
87
|
-
/**
 * Extract the numbered recitals from a EUR-Lex HTML page.
 *
 * The body text is split into trimmed, non-empty lines. Parsing is switched on
 * by a line starting with "Having regard to" or "Whereas:" and stops at the
 * first line starting with "HAVE ADOPTED" or equal to "Article 1". A line
 * starting with "(n)" opens recital n; following lines are appended to it and
 * joined with blank lines. A recital marker with no text at all is dropped.
 *
 * @param html - full HTML source of the document
 * @returns recitals in document order, each stored exactly once
 */
function parseRecitals(html: string): Recital[] {
  const dom = new JSDOM(html);
  const doc = dom.window.document;

  const recitals: Recital[] = [];
  const allText = doc.body?.textContent || '';
  const lines = allText.split('\n').map(l => l.trim()).filter(l => l);

  let inRecitalsSection = false;
  let currentRecital: { number: number; lines: string[] } | null = null;

  // Store the recital being accumulated (if it has text) and clear it.
  // Clearing prevents the last recital from being stored twice: once when the
  // terminator line is reached and again by the flush after the loop.
  const flushCurrent = (): void => {
    if (currentRecital && currentRecital.lines.length > 0) {
      recitals.push({
        recital_number: currentRecital.number,
        text: currentRecital.lines.join('\n\n'),
      });
    }
    currentRecital = null;
  };

  for (const line of lines) {
    // Detect start of recitals section
    if (/^Having regard to/i.test(line) || /^Whereas:/i.test(line)) {
      inRecitalsSection = true;
      continue;
    }

    // Detect end of recitals (usually "HAVE ADOPTED" or "Article 1")
    if (/^HAVE ADOPTED/i.test(line) || /^Article\s+1$/i.test(line)) {
      break;
    }

    if (!inRecitalsSection) continue;

    // Match recital number: "(1)", "(123)", etc.
    const recitalMatch = line.match(/^\((\d+)\)/);
    if (recitalMatch) {
      // Save previous recital, then start the new one
      flushCurrent();
      const textAfterNumber = line.substring(recitalMatch[0].length).trim();
      currentRecital = {
        number: parseInt(recitalMatch[1], 10),
        lines: textAfterNumber ? [textAfterNumber] : [],
      };
      continue;
    }

    // Continuation line of the current recital
    if (currentRecital) {
      currentRecital.lines.push(line);
    }
  }

  // Single flush for both exits: terminator reached or end of document.
  flushCurrent();

  return recitals;
}
|
|
160
|
-
|
|
161
|
-
/**
 * Split a EUR-Lex HTML page into articles and extract numbered definitions.
 *
 * The body text is split into trimmed, non-empty lines. A line of the form
 * "Article N" (N = digits plus an optional letter) opens a new article; a line
 * starting with "CHAPTER <roman numeral>" updates the current chapter. The
 * first short line (< 100 chars, no trailing period) after an article heading
 * is taken as its title; all other lines become body text joined by blank
 * lines. Articles without body text are dropped. When a number occurs more
 * than once, the entry with the longest text wins. Results are sorted by
 * number, then letter.
 *
 * Each article is labelled with the chapter in effect when its heading was
 * seen, so a "CHAPTER" heading that precedes the next article does not
 * relabel the article before it.
 *
 * Definitions are taken from the first article whose title contains
 * "definition", matching entries of the form "(n) 'term' ... means ...".
 *
 * @param html - full HTML source of the document
 * @param celexId - CELEX identifier; currently unused by the parser
 * @returns the de-duplicated, sorted articles and the extracted definitions
 */
function parseArticles(html: string, celexId: string): { articles: Article[]; definitions: Definition[] } {
  type PendingArticle = { number: string; title?: string; chapter: string; lines: string[] };

  const dom = new JSDOM(html);
  const doc = dom.window.document;

  const articles: Article[] = [];
  const definitions: Definition[] = [];
  let currentChapter = '';

  // Get all text content and split by article markers
  const allText = doc.body?.textContent || '';
  const lines = allText.split('\n').map(l => l.trim()).filter(l => l);

  // Append a finished article, skipping headings that collected no body text.
  const pushArticle = (pending: PendingArticle): void => {
    if (pending.lines.length === 0) return;
    articles.push({
      number: pending.number,
      title: pending.title,
      text: pending.lines.join('\n\n'),
      chapter: pending.chapter || undefined,
    });
  };

  let currentArticle: PendingArticle | null = null;

  for (const line of lines) {
    const articleStart = line.match(/^Article\s+(\d+[a-z]?)$/i);
    if (articleStart) {
      if (currentArticle) pushArticle(currentArticle);
      // Capture the chapter now; it may change before this article is pushed.
      currentArticle = { number: articleStart[1], chapter: currentChapter, lines: [] };
      continue;
    }

    const chapterStart = line.match(/^CHAPTER\s+([IVXLC]+)/i);
    if (chapterStart) {
      currentChapter = chapterStart[1];
      continue;
    }

    if (currentArticle) {
      // Check if this is a title line (short, no period at end)
      const looksLikeTitle =
        !currentArticle.title &&
        currentArticle.lines.length === 0 &&
        line.length < 100 &&
        !line.endsWith('.');
      if (looksLikeTitle) {
        currentArticle.title = line;
      } else {
        currentArticle.lines.push(line);
      }
    }
  }

  // Don't forget the last article
  if (currentArticle) pushArticle(currentArticle);

  // Deduplicate articles - keep the one with the most content for each number
  const articleMap = new Map<string, Article>();
  for (const article of articles) {
    const existing = articleMap.get(article.number);
    if (!existing || article.text.length > existing.text.length) {
      articleMap.set(article.number, article);
    }
  }

  // The heading regex is case-insensitive, so the sort regex must accept an
  // upper-case letter suffix too; otherwise such numbers would compare as
  // "equal" to everything and make the ordering inconsistent.
  const numberParts = /^(\d+)([a-z]?)$/i;
  const deduplicatedArticles = Array.from(articleMap.values()).sort((a, b) => {
    // Extract numeric and letter parts (e.g., "5a" -> [5, "a"])
    const matchA = a.number.match(numberParts);
    const matchB = b.number.match(numberParts);
    if (!matchA || !matchB) return 0;

    const numA = parseInt(matchA[1], 10);
    const numB = parseInt(matchB[1], 10);

    // Sort by number first
    if (numA !== numB) return numA - numB;

    // Then by letter (empty string sorts before letters)
    return (matchA[2] || '').localeCompare(matchB[2] || '');
  });

  // Find the definitions article (e.g. Article 4) in the deduplicated list
  const defsArticle = deduplicatedArticles.find(a =>
    a.title?.toLowerCase().includes('definition')
  );

  if (defsArticle && defsArticle.text.includes('means')) {
    // Normalize text: collapse whitespace and normalize quotes
    const normalizedText = defsArticle.text
      .replace(/\s+/g, ' ')
      .replace(/[\u2018\u2019]/g, "'"); // Curly quotes to straight

    // Parse definitions by extracting content between consecutive numbered entries
    // This handles:
    // - Complex definitions with internal periods/semicolons
    // - 'term' or 'alternate' means... patterns (NIS2 Art 6)
    // - 'term1', 'term2' and 'term3' mean... patterns (CRA Art 3)
    // - 'term' of the something means... patterns (GDPR Art 4)
    // - mean, respectively... patterns (CRA Art 3)
    // - means: (a) ... patterns (complex definitions with sub-parts)
    const defRegex = /\((\d+)\)\s*'([^']+)'(?:[^(]*?)means?[,:;]?\s+(.+?)(?=\(\d+\)\s*'|$)/g;
    let defMatch: RegExpExecArray | null;
    while ((defMatch = defRegex.exec(normalizedText)) !== null) {
      const term = defMatch[2].trim().toLowerCase();
      const definition = defMatch[3].trim();
      // Only add if we got meaningful content
      if (term.length > 0 && definition.length > 10) {
        definitions.push({
          term,
          definition,
          article: defsArticle.number,
        });
      }
    }
  }

  return { articles: deduplicatedArticles, definitions };
}
|
|
279
|
-
|
|
280
|
-
/**
 * Fetch one document from EUR-Lex, parse its recitals, articles and
 * definitions, and write the result as pretty-printed JSON.
 *
 * When no articles can be parsed, nothing is written to `outputPath`; the raw
 * HTML is saved next to it instead (same path with a trailing ".json" replaced
 * by ".html", or ".html" appended when the path has no ".json" suffix) so the
 * page can be inspected.
 *
 * @param celexId - CELEX identifier; ids missing from the metadata table are
 *                  accepted and get generic metadata
 * @param outputPath - destination of the JSON seed file
 * @param useBrowser - fetch through the Puppeteer-based fetcher instead of a
 *                     plain HTTP request
 * @throws errors from fetching or from writing files propagate to the caller
 */
async function ingestRegulation(celexId: string, outputPath: string, useBrowser = false): Promise<void> {
  const metadata = REGULATION_METADATA[celexId];
  if (!metadata) {
    console.warn(`Unknown CELEX ID: ${celexId}. Using generic metadata.`);
  }

  const html = await fetchEurLexHtml(celexId, useBrowser);
  console.log(`Fetched ${html.length} bytes`);

  // Parse recitals BEFORE articles
  const recitals = parseRecitals(html);
  console.log(`Parsed ${recitals.length} recitals`);

  const { articles, definitions } = parseArticles(html, celexId);
  console.log(`Parsed ${articles.length} articles, ${definitions.length} definitions`);

  if (articles.length === 0) {
    console.error('No articles found! The HTML structure may have changed.');
    console.log('Saving raw HTML for debugging...');
    // Swap only a trailing ".json"; a plain string replace would hit the first
    // ".json" anywhere in the path and, with no match at all, would dump the
    // HTML into the JSON output file itself.
    const debugPath = `${outputPath.replace(/\.json$/i, '')}.html`;
    writeFileSync(debugPath, html);
    return;
  }

  const regulation: RegulationData = {
    id: metadata?.id || celexId,
    full_name: metadata?.full_name || `Regulation ${celexId}`,
    celex_id: celexId,
    effective_date: metadata?.effective_date,
    eur_lex_url: `https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:${celexId}`,
    articles,
    definitions,
    recitals,
  };

  writeFileSync(outputPath, JSON.stringify(regulation, null, 2));
  console.log(`\nSaved to: ${outputPath}`);
  console.log(`Articles: ${articles.length}`);
  console.log(`Definitions: ${definitions.length}`);
  console.log(`Recitals: ${recitals.length}`);
}
|
|
320
|
-
|
|
321
|
-
// Main: read the CLI arguments, print usage and exit 1 when the CELEX id or
// the output path is missing, otherwise run the ingestion.
const args = process.argv.slice(2);
const useBrowser = args.includes('--browser');
const [celexId, outputPath] = args.filter(arg => arg !== '--browser');

if (!celexId || !outputPath) {
  const usageLines = [
    'Usage: npx tsx scripts/ingest-eurlex.ts <celex_id> <output_file> [--browser]',
    'Example: npx tsx scripts/ingest-eurlex.ts 32016R0679 data/seed/gdpr.json',
    'Example (with browser): npx tsx scripts/ingest-eurlex.ts 32016R0679 data/seed/gdpr.json --browser',
    '\nOptions:',
    ' --browser Use Puppeteer to bypass EUR-Lex WAF challenges',
    '\nKnown CELEX IDs:',
  ];
  for (const usageLine of usageLines) {
    console.log(usageLine);
  }
  // List every CELEX id the metadata table knows about.
  for (const [id, meta] of Object.entries(REGULATION_METADATA)) {
    console.log(` ${id} - ${meta.id} (${meta.full_name})`);
  }
  process.exit(1);
}

if (useBrowser) {
  console.log('Browser mode enabled - using Puppeteer to fetch content\n');
}

// Any failure is logged in full and turns into exit code 1.
ingestRegulation(celexId, outputPath, useBrowser).catch(err => {
  console.error('Error:', err);
  process.exit(1);
});
|