@ansvar/eu-regulations-mcp 0.8.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. package/README.md +76 -29
  2. package/data/regulations.db +0 -0
  3. package/data/seed/applicability/chips-act.json +67 -0
  4. package/data/seed/applicability/crma.json +85 -0
  5. package/data/seed/chips-act.json +714 -0
  6. package/data/seed/crma.json +877 -0
  7. package/data/seed/mappings/iso27001-chips-act.json +50 -0
  8. package/data/seed/mappings/iso27001-crma.json +50 -0
  9. package/data/seed/mappings/nist-csf-chips-act.json +56 -0
  10. package/data/seed/mappings/nist-csf-crma.json +56 -0
  11. package/dist/database/sqlite-adapter.d.ts +2 -2
  12. package/dist/database/sqlite-adapter.d.ts.map +1 -1
  13. package/dist/database/sqlite-adapter.js.map +1 -1
  14. package/dist/http-server.js +27 -5
  15. package/dist/http-server.js.map +1 -1
  16. package/dist/index.js +27 -4
  17. package/dist/index.js.map +1 -1
  18. package/dist/tools/about.d.ts +40 -0
  19. package/dist/tools/about.d.ts.map +1 -0
  20. package/dist/tools/about.js +61 -0
  21. package/dist/tools/about.js.map +1 -0
  22. package/dist/tools/list.d.ts +7 -0
  23. package/dist/tools/list.d.ts.map +1 -1
  24. package/dist/tools/list.js +73 -8
  25. package/dist/tools/list.js.map +1 -1
  26. package/dist/tools/registry.d.ts +11 -1
  27. package/dist/tools/registry.d.ts.map +1 -1
  28. package/dist/tools/registry.js +56 -4
  29. package/dist/tools/registry.js.map +1 -1
  30. package/dist/worker.d.ts.map +1 -1
  31. package/dist/worker.js +17 -5
  32. package/dist/worker.js.map +1 -1
  33. package/package.json +8 -7
  34. package/scripts/add-cross-references.sql +0 -200
  35. package/scripts/analyze-survey-responses.ts +0 -285
  36. package/scripts/build-db.ts +0 -421
  37. package/scripts/bulk-reingest-all.ts +0 -331
  38. package/scripts/check-updates.ts +0 -294
  39. package/scripts/extract-eprivacy-recitals.ts +0 -98
  40. package/scripts/ingest-eurlex-browser.ts +0 -113
  41. package/scripts/ingest-eurlex.ts +0 -346
  42. package/scripts/ingest-unece.ts +0 -382
  43. package/scripts/migrate-postgres.ts +0 -445
  44. package/scripts/migrate-to-postgres.ts +0 -353
  45. package/scripts/reingest-all-with-recitals.sh +0 -81
  46. package/scripts/sync-versions.ts +0 -206
  47. package/scripts/test-cross-refs.js +0 -26
  48. package/scripts/test-postgres-adapter.ts +0 -146
  49. package/scripts/update-dora-rts-metadata.ts +0 -112
  50. package/src/database/postgres-adapter.ts +0 -84
  51. package/src/database/sqlite-adapter.ts +0 -44
  52. package/src/database/types.ts +0 -10
  53. package/src/http-server.ts +0 -149
  54. package/src/index.ts +0 -61
  55. package/src/middleware/rate-limit.ts +0 -104
  56. package/src/tools/applicability.ts +0 -167
  57. package/src/tools/article.ts +0 -81
  58. package/src/tools/compare.ts +0 -217
  59. package/src/tools/definitions.ts +0 -49
  60. package/src/tools/evidence.ts +0 -84
  61. package/src/tools/list.ts +0 -124
  62. package/src/tools/map.ts +0 -86
  63. package/src/tools/recital.ts +0 -60
  64. package/src/tools/registry.ts +0 -311
  65. package/src/tools/search.ts +0 -297
  66. package/src/worker.ts +0 -708
@@ -1,98 +0,0 @@
1
- #!/usr/bin/env npx tsx
2
-
3
- import { writeFileSync, readFileSync } from 'fs';
4
- import { JSDOM } from 'jsdom';
5
-
6
- async function fetchEPrivacyRecitals() {
7
- const url = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32002L0058';
8
- console.log(`Fetching: ${url}`);
9
-
10
- const response = await fetch(url, {
11
- headers: {
12
- 'User-Agent': 'Mozilla/5.0 (compatible)',
13
- 'Accept': 'text/html',
14
- },
15
- });
16
-
17
- if (!response.ok) {
18
- throw new Error(`Failed: ${response.status}`);
19
- }
20
-
21
- const html = await response.text();
22
- const dom = new JSDOM(html);
23
- const doc = dom.window.document;
24
-
25
- // Extract all paragraphs
26
- const paragraphs = Array.from(doc.querySelectorAll('p'));
27
-
28
- const recitals: Array<{ recital_number: number; text: string }> = [];
29
- let inRecitals = false;
30
- let currentNumber: number | null = null;
31
- let currentText: string[] = [];
32
-
33
- for (const p of paragraphs) {
34
- const text = p.textContent?.trim() || '';
35
-
36
- // Check if we've entered the recitals section
37
- if (text === 'Whereas:') {
38
- inRecitals = true;
39
- continue;
40
- }
41
-
42
- // Check if we've left the recitals section
43
- if (text.match(/^HAVE ADOPTED/i) || text.match(/^Article\s+1/i)) {
44
- if (currentNumber && currentText.length > 0) {
45
- recitals.push({
46
- recital_number: currentNumber,
47
- text: currentText.join('\n\n'),
48
- });
49
- }
50
- break;
51
- }
52
-
53
- if (!inRecitals) continue;
54
-
55
- // Check for recital number at start: "(1)", "(2)", etc.
56
- const recitalMatch = text.match(/^\((\d+)\)/);
57
-
58
- if (recitalMatch) {
59
- // Save previous recital
60
- if (currentNumber && currentText.length > 0) {
61
- recitals.push({
62
- recital_number: currentNumber,
63
- text: currentText.join('\n\n'),
64
- });
65
- }
66
-
67
- // Start new recital
68
- currentNumber = parseInt(recitalMatch[1]);
69
- const remainingText = text.substring(recitalMatch[0].length).trim();
70
- currentText = remainingText ? [remainingText] : [];
71
- } else if (currentNumber && text.length > 0) {
72
- // Add to current recital
73
- currentText.push(text);
74
- }
75
- }
76
-
77
- // Don't forget the last one
78
- if (currentNumber && currentText.length > 0) {
79
- recitals.push({
80
- recital_number: currentNumber,
81
- text: currentText.join('\n\n'),
82
- });
83
- }
84
-
85
- console.log(`Extracted ${recitals.length} recitals`);
86
-
87
- // Load existing ePrivacy JSON
88
- const existingData = JSON.parse(readFileSync('data/seed/eprivacy.json', 'utf-8'));
89
- existingData.recitals = recitals;
90
-
91
- // Save updated file
92
- writeFileSync('data/seed/eprivacy.json', JSON.stringify(existingData, null, 2));
93
- console.log(`Saved to: data/seed/eprivacy.json`);
94
- console.log(`Articles: ${existingData.articles.length}`);
95
- console.log(`Recitals: ${recitals.length}`);
96
- }
97
-
98
- fetchEPrivacyRecitals().catch(console.error);
@@ -1,113 +0,0 @@
1
- #!/usr/bin/env npx tsx
2
-
3
- /**
4
- * Browser-based EUR-Lex fetcher to bypass AWS WAF challenges.
5
- *
6
- * EUR-Lex deployed AWS WAF that returns 2036-byte JavaScript challenge pages
7
- * instead of actual HTML when using fetch(). This script launches a headless
8
- * browser to wait for the challenge to complete and retrieve the real content.
9
- *
10
- * Usage:
11
- * npx tsx scripts/ingest-eurlex-browser.ts <celex_id>
12
- * npx tsx scripts/ingest-eurlex-browser.ts 32016R0679
13
- *
14
- * Or import as a function:
15
- * import { fetchEurLexWithBrowser } from './scripts/ingest-eurlex-browser';
16
- * const html = await fetchEurLexWithBrowser('32016R0679');
17
- */
18
-
19
- import puppeteer from 'puppeteer';
20
-
21
- /**
22
- * Fetches EUR-Lex regulation HTML using Puppeteer to bypass WAF challenges.
23
- *
24
- * @param celexId - CELEX identifier (e.g., '32016R0679' for GDPR)
25
- * @returns Full HTML content of the regulation
26
- * @throws Error if fetching fails or content is invalid
27
- */
28
- export async function fetchEurLexWithBrowser(celexId: string): Promise<string> {
29
- const url = `https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:${celexId}`;
30
- console.log(`[Browser] Launching headless browser...`);
31
- console.log(`[Browser] Fetching: ${url}`);
32
-
33
- const browser = await puppeteer.launch({
34
- headless: true,
35
- args: [
36
- '--no-sandbox',
37
- '--disable-setuid-sandbox',
38
- '--disable-dev-shm-usage',
39
- '--disable-accelerated-2d-canvas',
40
- '--disable-gpu',
41
- ],
42
- });
43
-
44
- try {
45
- const page = await browser.newPage();
46
-
47
- // Set realistic User-Agent to appear as a normal browser
48
- await page.setUserAgent(
49
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
50
- );
51
-
52
- // Set viewport
53
- await page.setViewport({ width: 1920, height: 1080 });
54
-
55
- console.log(`[Browser] Navigating to URL...`);
56
- await page.goto(url, {
57
- waitUntil: 'networkidle0', // Wait until network is idle
58
- timeout: 30000,
59
- });
60
-
61
- console.log(`[Browser] Page loaded. Waiting for WAF challenge to complete...`);
62
-
63
- // Wait a bit longer to ensure WAF challenge JavaScript has executed
64
- await new Promise(resolve => setTimeout(resolve, 5000));
65
-
66
- // Get the full HTML content
67
- const html = await page.content();
68
-
69
- console.log(`[Browser] Fetched ${html.length} bytes`);
70
-
71
- // Validate that we got real content, not a WAF challenge page
72
- if (html.length < 10000) {
73
- throw new Error(`Fetched content is suspiciously small (${html.length} bytes). Possible WAF block.`);
74
- }
75
-
76
- if (html.includes('window.gokuProps')) {
77
- throw new Error('Received AWS WAF challenge page instead of regulation content.');
78
- }
79
-
80
- if (!html.includes('Article')) {
81
- console.warn('[Browser] Warning: HTML does not contain "Article" - may not be valid regulation content');
82
- }
83
-
84
- return html;
85
- } finally {
86
- await browser.close();
87
- console.log(`[Browser] Browser closed`);
88
- }
89
- }
90
-
91
- // CLI interface
92
- if (import.meta.url === `file://${process.argv[1]}`) {
93
- const [,, celexId] = process.argv;
94
-
95
- if (!celexId) {
96
- console.log('Usage: npx tsx scripts/ingest-eurlex-browser.ts <celex_id>');
97
- console.log('Example: npx tsx scripts/ingest-eurlex-browser.ts 32016R0679');
98
- console.log('\nThis will fetch the HTML and print it to stdout.');
99
- console.log('Pipe to a file: npx tsx scripts/ingest-eurlex-browser.ts 32016R0679 > output.html');
100
- process.exit(1);
101
- }
102
-
103
- fetchEurLexWithBrowser(celexId)
104
- .then(html => {
105
- // Output HTML to stdout for piping
106
- console.log('\n--- HTML Content ---\n');
107
- console.log(html);
108
- })
109
- .catch(err => {
110
- console.error('Error:', err.message);
111
- process.exit(1);
112
- });
113
- }
@@ -1,346 +0,0 @@
1
- #!/usr/bin/env npx tsx
2
-
3
- /**
4
- * Ingest EU regulations from EUR-Lex.
5
- *
6
- * Usage: npx tsx scripts/ingest-eurlex.ts <celex_id> <output_file> [--browser]
7
- * Example: npx tsx scripts/ingest-eurlex.ts 32016R0679 data/seed/gdpr.json
8
- * Example (with browser): npx tsx scripts/ingest-eurlex.ts 32016R0679 data/seed/gdpr.json --browser
9
- */
10
-
11
- import { writeFileSync } from 'fs';
12
- import { JSDOM } from 'jsdom';
13
- import { fetchEurLexWithBrowser } from './ingest-eurlex-browser.js';
14
-
15
- interface Article {
16
- number: string;
17
- title?: string;
18
- text: string;
19
- chapter?: string;
20
- }
21
-
22
- interface Definition {
23
- term: string;
24
- definition: string;
25
- article: string;
26
- }
27
-
28
- interface Recital {
29
- recital_number: number;
30
- text: string;
31
- related_articles?: string;
32
- }
33
-
34
- interface RegulationData {
35
- id: string;
36
- full_name: string;
37
- celex_id: string;
38
- effective_date?: string;
39
- eur_lex_url: string;
40
- articles: Article[];
41
- definitions: Definition[];
42
- recitals?: Recital[];
43
- }
44
-
45
- const REGULATION_METADATA: Record<string, { id: string; full_name: string; effective_date?: string }> = {
46
- '32016R0679': { id: 'GDPR', full_name: 'General Data Protection Regulation', effective_date: '2018-05-25' },
47
- '32022L2555': { id: 'NIS2', full_name: 'Directive on measures for a high common level of cybersecurity across the Union', effective_date: '2024-10-17' },
48
- '32022R2554': { id: 'DORA', full_name: 'Digital Operational Resilience Act', effective_date: '2025-01-17' },
49
- '32024R1689': { id: 'AI_ACT', full_name: 'Artificial Intelligence Act', effective_date: '2024-08-01' },
50
- '32024R2847': { id: 'CRA', full_name: 'Cyber Resilience Act', effective_date: '2024-12-10' },
51
- '32019R0881': { id: 'CYBERSECURITY_ACT', full_name: 'EU Cybersecurity Act', effective_date: '2019-06-27' },
52
- '32024R1183': { id: 'EIDAS2', full_name: 'European Digital Identity Framework (eIDAS 2.0)', effective_date: '2024-05-20' },
53
- '02014R0910-20241018': { id: 'EIDAS2', full_name: 'European Digital Identity Framework (eIDAS 2.0)', effective_date: '2024-05-20' },
54
- // Digital Single Market regulations
55
- '32023R2854': { id: 'DATA_ACT', full_name: 'Data Act', effective_date: '2025-09-12' },
56
- '32022R2065': { id: 'DSA', full_name: 'Digital Services Act', effective_date: '2024-02-17' },
57
- '32022R1925': { id: 'DMA', full_name: 'Digital Markets Act', effective_date: '2023-05-02' },
58
- // UN Regulations (adopted by EU)
59
- '42021X0387': { id: 'UN_R155', full_name: 'UN Regulation No. 155 - Cyber security and cyber security management system', effective_date: '2021-01-22' },
60
- '42025X0005': { id: 'UN_R155', full_name: 'UN Regulation No. 155 - Cyber security and cyber security management system (Supplement 3)', effective_date: '2025-01-10' },
61
- };
62
-
63
- async function fetchEurLexHtml(celexId: string, useBrowser = false): Promise<string> {
64
- if (useBrowser) {
65
- console.log('Using Puppeteer to bypass WAF...');
66
- return fetchEurLexWithBrowser(celexId);
67
- }
68
-
69
- // Fallback to direct fetch (will fail with WAF)
70
- const url = `https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:${celexId}`;
71
- console.log(`Fetching: ${url}`);
72
-
73
- const response = await fetch(url, {
74
- headers: {
75
- 'User-Agent': 'Mozilla/5.0 (compatible; EU-Compliance-MCP/1.0; +https://github.com/Ansvar-Systems/EU_compliance_MCP)',
76
- 'Accept': 'text/html',
77
- },
78
- });
79
-
80
- if (!response.ok) {
81
- throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
82
- }
83
-
84
- return response.text();
85
- }
86
-
87
- function parseRecitals(html: string): Recital[] {
88
- const dom = new JSDOM(html);
89
- const doc = dom.window.document;
90
-
91
- const recitals: Recital[] = [];
92
- const allText = doc.body?.textContent || '';
93
- const lines = allText.split('\n').map(l => l.trim()).filter(l => l);
94
-
95
- let inRecitalsSection = false;
96
- let currentRecital: { number: number; lines: string[] } | null = null;
97
-
98
- for (const line of lines) {
99
- // Detect start of recitals section
100
- if (line.match(/^Having regard to/i) || line.match(/^Whereas:/i)) {
101
- inRecitalsSection = true;
102
- continue;
103
- }
104
-
105
- // Detect end of recitals (usually "HAVE ADOPTED" or "Article 1")
106
- if (line.match(/^HAVE ADOPTED/i) || line.match(/^Article\s+1$/i)) {
107
- inRecitalsSection = false;
108
- if (currentRecital && currentRecital.lines.length > 0) {
109
- recitals.push({
110
- recital_number: currentRecital.number,
111
- text: currentRecital.lines.join('\n\n'),
112
- });
113
- }
114
- break;
115
- }
116
-
117
- if (!inRecitalsSection) continue;
118
-
119
- // Match recital number: "(1)", "(123)", etc.
120
- const recitalMatch = line.match(/^\((\d+)\)/);
121
- if (recitalMatch) {
122
- // Save previous recital
123
- if (currentRecital && currentRecital.lines.length > 0) {
124
- recitals.push({
125
- recital_number: currentRecital.number,
126
- text: currentRecital.lines.join('\n\n'),
127
- });
128
- }
129
-
130
- // Start new recital
131
- currentRecital = {
132
- number: parseInt(recitalMatch[1]),
133
- lines: [],
134
- };
135
-
136
- // Add remaining text after number
137
- const textAfterNumber = line.substring(recitalMatch[0].length).trim();
138
- if (textAfterNumber) {
139
- currentRecital.lines.push(textAfterNumber);
140
- }
141
- continue;
142
- }
143
-
144
- // Add line to current recital
145
- if (currentRecital && line.length > 0) {
146
- currentRecital.lines.push(line);
147
- }
148
- }
149
-
150
- // Don't forget the last recital
151
- if (currentRecital && currentRecital.lines.length > 0) {
152
- recitals.push({
153
- recital_number: currentRecital.number,
154
- text: currentRecital.lines.join('\n\n'),
155
- });
156
- }
157
-
158
- return recitals;
159
- }
160
-
161
- function parseArticles(html: string, celexId: string): { articles: Article[]; definitions: Definition[] } {
162
- const dom = new JSDOM(html);
163
- const doc = dom.window.document;
164
-
165
- const articles: Article[] = [];
166
- const definitions: Definition[] = [];
167
- let currentChapter = '';
168
-
169
- // Get all text content and split by article markers
170
- const allText = doc.body?.textContent || '';
171
- const lines = allText.split('\n').map(l => l.trim()).filter(l => l);
172
-
173
- let currentArticle: { number: string; title?: string; lines: string[] } | null = null;
174
-
175
- for (const line of lines) {
176
- const articleStart = line.match(/^Article\s+(\d+[a-z]?)$/i);
177
- if (articleStart) {
178
- if (currentArticle && currentArticle.lines.length > 0) {
179
- articles.push({
180
- number: currentArticle.number,
181
- title: currentArticle.title,
182
- text: currentArticle.lines.join('\n\n'),
183
- chapter: currentChapter || undefined,
184
- });
185
- }
186
- currentArticle = { number: articleStart[1], lines: [] };
187
- continue;
188
- }
189
-
190
- const chapterStart = line.match(/^CHAPTER\s+([IVXLC]+)/i);
191
- if (chapterStart) {
192
- currentChapter = chapterStart[1];
193
- continue;
194
- }
195
-
196
- if (currentArticle) {
197
- // Check if this is a title line (short, no period at end)
198
- if (!currentArticle.title && currentArticle.lines.length === 0 && line.length < 100 && !line.endsWith('.')) {
199
- currentArticle.title = line;
200
- } else if (line.length > 0) {
201
- currentArticle.lines.push(line);
202
- }
203
- }
204
- }
205
-
206
- // Don't forget the last article
207
- if (currentArticle && currentArticle.lines.length > 0) {
208
- articles.push({
209
- number: currentArticle.number,
210
- title: currentArticle.title,
211
- text: currentArticle.lines.join('\n\n'),
212
- chapter: currentChapter || undefined,
213
- });
214
- }
215
-
216
- // Deduplicate articles - keep the one with the most content for each number
217
- const articleMap = new Map<string, Article>();
218
- for (const article of articles) {
219
- const existing = articleMap.get(article.number);
220
- if (!existing || article.text.length > existing.text.length) {
221
- articleMap.set(article.number, article);
222
- }
223
- }
224
- const deduplicatedArticles = Array.from(articleMap.values())
225
- .sort((a, b) => {
226
- // Extract numeric and letter parts (e.g., "5a" -> [5, "a"])
227
- const matchA = a.number.match(/^(\d+)([a-z]?)$/);
228
- const matchB = b.number.match(/^(\d+)([a-z]?)$/);
229
- if (!matchA || !matchB) return 0;
230
-
231
- const numA = parseInt(matchA[1]);
232
- const numB = parseInt(matchB[1]);
233
-
234
- // Sort by number first
235
- if (numA !== numB) return numA - numB;
236
-
237
- // Then by letter (empty string sorts before letters)
238
- return (matchA[2] || '').localeCompare(matchB[2] || '');
239
- });
240
-
241
- // Extract definitions from Article 4 (or similar definitions article)
242
- // Find definitions article from deduplicated list
243
- const defsArticle = deduplicatedArticles.find(a =>
244
- a.title?.toLowerCase().includes('definition')
245
- );
246
-
247
- if (defsArticle && defsArticle.text.includes('means')) {
248
- // Normalize text: collapse whitespace and normalize quotes
249
- const normalizedText = defsArticle.text
250
- .replace(/\s+/g, ' ')
251
- .replace(/[\u2018\u2019]/g, "'"); // Curly quotes to straight
252
-
253
- // Parse definitions by extracting content between consecutive numbered entries
254
- // This handles:
255
- // - Complex definitions with internal periods/semicolons
256
- // - 'term' or 'alternate' means... patterns (NIS2 Art 6)
257
- // - 'term1', 'term2' and 'term3' mean... patterns (CRA Art 3)
258
- // - 'term' of the something means... patterns (GDPR Art 4)
259
- // - mean, respectively... patterns (CRA Art 3)
260
- // - means: (a) ... patterns (complex definitions with sub-parts)
261
- const defRegex = /\((\d+)\)\s*'([^']+)'(?:[^(]*?)means?[,:;]?\s+(.+?)(?=\(\d+\)\s*'|$)/g;
262
- let defMatch;
263
- while ((defMatch = defRegex.exec(normalizedText)) !== null) {
264
- const term = defMatch[2].trim().toLowerCase();
265
- const definition = defMatch[3].trim();
266
- // Only add if we got meaningful content
267
- if (term.length > 0 && definition.length > 10) {
268
- definitions.push({
269
- term,
270
- definition,
271
- article: defsArticle.number,
272
- });
273
- }
274
- }
275
- }
276
-
277
- return { articles: deduplicatedArticles, definitions };
278
- }
279
-
280
- async function ingestRegulation(celexId: string, outputPath: string, useBrowser = false): Promise<void> {
281
- const metadata = REGULATION_METADATA[celexId];
282
- if (!metadata) {
283
- console.warn(`Unknown CELEX ID: ${celexId}. Using generic metadata.`);
284
- }
285
-
286
- const html = await fetchEurLexHtml(celexId, useBrowser);
287
- console.log(`Fetched ${html.length} bytes`);
288
-
289
- // Parse recitals BEFORE articles
290
- const recitals = parseRecitals(html);
291
- console.log(`Parsed ${recitals.length} recitals`);
292
-
293
- const { articles, definitions } = parseArticles(html, celexId);
294
- console.log(`Parsed ${articles.length} articles, ${definitions.length} definitions`);
295
-
296
- if (articles.length === 0) {
297
- console.error('No articles found! The HTML structure may have changed.');
298
- console.log('Saving raw HTML for debugging...');
299
- writeFileSync(outputPath.replace('.json', '.html'), html);
300
- return;
301
- }
302
-
303
- const regulation: RegulationData = {
304
- id: metadata?.id || celexId,
305
- full_name: metadata?.full_name || `Regulation ${celexId}`,
306
- celex_id: celexId,
307
- effective_date: metadata?.effective_date,
308
- eur_lex_url: `https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:${celexId}`,
309
- articles,
310
- definitions,
311
- recitals,
312
- };
313
-
314
- writeFileSync(outputPath, JSON.stringify(regulation, null, 2));
315
- console.log(`\nSaved to: ${outputPath}`);
316
- console.log(`Articles: ${articles.length}`);
317
- console.log(`Definitions: ${definitions.length}`);
318
- console.log(`Recitals: ${recitals.length}`);
319
- }
320
-
321
- // Main
322
- const args = process.argv.slice(2);
323
- const useBrowser = args.includes('--browser');
324
- const [celexId, outputPath] = args.filter(arg => arg !== '--browser');
325
-
326
- if (!celexId || !outputPath) {
327
- console.log('Usage: npx tsx scripts/ingest-eurlex.ts <celex_id> <output_file> [--browser]');
328
- console.log('Example: npx tsx scripts/ingest-eurlex.ts 32016R0679 data/seed/gdpr.json');
329
- console.log('Example (with browser): npx tsx scripts/ingest-eurlex.ts 32016R0679 data/seed/gdpr.json --browser');
330
- console.log('\nOptions:');
331
- console.log(' --browser Use Puppeteer to bypass EUR-Lex WAF challenges');
332
- console.log('\nKnown CELEX IDs:');
333
- Object.entries(REGULATION_METADATA).forEach(([id, meta]) => {
334
- console.log(` ${id} - ${meta.id} (${meta.full_name})`);
335
- });
336
- process.exit(1);
337
- }
338
-
339
- if (useBrowser) {
340
- console.log('Browser mode enabled - using Puppeteer to fetch content\n');
341
- }
342
-
343
- ingestRegulation(celexId, outputPath, useBrowser).catch(err => {
344
- console.error('Error:', err);
345
- process.exit(1);
346
- });