@ansvar/eu-regulations-mcp 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/README.md +60 -22
  2. package/data/regulations.db +0 -0
  3. package/dist/database/sqlite-adapter.d.ts +2 -2
  4. package/dist/database/sqlite-adapter.d.ts.map +1 -1
  5. package/dist/database/sqlite-adapter.js.map +1 -1
  6. package/dist/http-server.js +27 -5
  7. package/dist/http-server.js.map +1 -1
  8. package/dist/index.js +27 -4
  9. package/dist/index.js.map +1 -1
  10. package/dist/tools/about.d.ts +40 -0
  11. package/dist/tools/about.d.ts.map +1 -0
  12. package/dist/tools/about.js +61 -0
  13. package/dist/tools/about.js.map +1 -0
  14. package/dist/tools/list.d.ts +7 -0
  15. package/dist/tools/list.d.ts.map +1 -1
  16. package/dist/tools/list.js +73 -8
  17. package/dist/tools/list.js.map +1 -1
  18. package/dist/tools/registry.d.ts +11 -1
  19. package/dist/tools/registry.d.ts.map +1 -1
  20. package/dist/tools/registry.js +56 -4
  21. package/dist/tools/registry.js.map +1 -1
  22. package/dist/worker.d.ts.map +1 -1
  23. package/dist/worker.js +17 -5
  24. package/dist/worker.js.map +1 -1
  25. package/package.json +6 -5
  26. package/scripts/add-cross-references.sql +0 -200
  27. package/scripts/analyze-survey-responses.ts +0 -285
  28. package/scripts/build-db.ts +0 -421
  29. package/scripts/bulk-reingest-all.ts +0 -331
  30. package/scripts/check-updates.ts +0 -294
  31. package/scripts/extract-eprivacy-recitals.ts +0 -98
  32. package/scripts/ingest-eurlex-browser.ts +0 -113
  33. package/scripts/ingest-eurlex.ts +0 -349
  34. package/scripts/ingest-unece.ts +0 -382
  35. package/scripts/migrate-postgres.ts +0 -445
  36. package/scripts/migrate-to-postgres.ts +0 -353
  37. package/scripts/reingest-all-with-recitals.sh +0 -81
  38. package/scripts/sync-versions.ts +0 -206
  39. package/scripts/test-cross-refs.js +0 -26
  40. package/scripts/test-postgres-adapter.ts +0 -146
  41. package/scripts/update-dora-rts-metadata.ts +0 -112
  42. package/src/database/postgres-adapter.ts +0 -84
  43. package/src/database/sqlite-adapter.ts +0 -44
  44. package/src/database/types.ts +0 -10
  45. package/src/http-server.ts +0 -149
  46. package/src/index.ts +0 -61
  47. package/src/middleware/rate-limit.ts +0 -104
  48. package/src/tools/applicability.ts +0 -167
  49. package/src/tools/article.ts +0 -81
  50. package/src/tools/compare.ts +0 -217
  51. package/src/tools/definitions.ts +0 -49
  52. package/src/tools/evidence.ts +0 -84
  53. package/src/tools/list.ts +0 -124
  54. package/src/tools/map.ts +0 -86
  55. package/src/tools/recital.ts +0 -60
  56. package/src/tools/registry.ts +0 -311
  57. package/src/tools/search.ts +0 -297
  58. package/src/worker.ts +0 -708
@@ -1,98 +0,0 @@
1
- #!/usr/bin/env npx tsx
2
-
3
- import { writeFileSync, readFileSync } from 'fs';
4
- import { JSDOM } from 'jsdom';
5
-
6
/**
 * Groups paragraph texts of the ePrivacy Directive page into numbered recitals.
 *
 * Collection starts after the paragraph that is exactly `Whereas:` and stops at
 * the first later paragraph starting with `HAVE ADOPTED` or `Article 1`.
 * A paragraph starting with `(N)` opens recital N; every other non-empty
 * paragraph is appended to the recital currently being collected.
 *
 * @param paragraphTexts - Trimmed text of each `<p>` element, in document order.
 * @returns Recitals in the order they were encountered; paragraphs of one
 *          recital are joined with a blank line.
 */
function collectEPrivacyRecitals(
  paragraphTexts: readonly string[],
): Array<{ recital_number: number; text: string }> {
  const recitals: Array<{ recital_number: number; text: string }> = [];
  let inRecitals = false;
  let currentNumber: number | null = null;
  let currentText: string[] = [];

  // Emit the recital being collected (if it has any text) and reset the
  // accumulator, so the same recital can never be emitted twice.
  const flush = (): void => {
    if (currentNumber !== null && currentText.length > 0) {
      recitals.push({
        recital_number: currentNumber,
        text: currentText.join('\n\n'),
      });
    }
    currentNumber = null;
    currentText = [];
  };

  for (const text of paragraphTexts) {
    // Check if we've entered the recitals section
    if (text === 'Whereas:') {
      inRecitals = true;
      continue;
    }

    // Nothing before "Whereas:" is relevant, including end markers.
    if (!inRecitals) continue;

    // Check if we've left the recitals section
    if (/^HAVE ADOPTED/i.test(text) || /^Article\s+1/i.test(text)) {
      break;
    }

    // Check for recital number at start: "(1)", "(2)", etc.
    const recitalMatch = /^\((\d+)\)/.exec(text);

    if (recitalMatch) {
      flush();
      currentNumber = Number.parseInt(recitalMatch[1], 10);
      const remainingText = text.slice(recitalMatch[0].length).trim();
      currentText = remainingText ? [remainingText] : [];
    } else if (currentNumber !== null && text.length > 0) {
      // Continuation paragraph of the current recital
      currentText.push(text);
    }
  }

  // Covers both "loop ended at the terminator" and "ran out of paragraphs".
  flush();
  return recitals;
}

/**
 * Downloads the ePrivacy Directive (CELEX 32002L0058) from EUR-Lex, extracts its
 * recitals and stores them in the `recitals` field of `data/seed/eprivacy.json`.
 *
 * @throws Error when the HTTP response is not OK, or when no recital could be
 *         extracted (the seed file is left untouched in that case).
 */
async function fetchEPrivacyRecitals(): Promise<void> {
  const url = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32002L0058';
  console.log(`Fetching: ${url}`);

  const response = await fetch(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (compatible)',
      'Accept': 'text/html',
    },
  });

  if (!response.ok) {
    throw new Error(`Failed: ${response.status}`);
  }

  const html = await response.text();
  const doc = new JSDOM(html).window.document;

  // Extract all paragraphs
  const paragraphTexts = Array.from(
    doc.querySelectorAll('p'),
    p => p.textContent?.trim() || '',
  );

  const recitals = collectEPrivacyRecitals(paragraphTexts);
  console.log(`Extracted ${recitals.length} recitals`);

  // An empty result means the page was not the expected document (for example a
  // challenge page); do not wipe the recitals already stored in the seed file.
  if (recitals.length === 0) {
    throw new Error('No recitals extracted; data/seed/eprivacy.json was not modified');
  }

  // Load existing ePrivacy JSON
  const existingData = JSON.parse(readFileSync('data/seed/eprivacy.json', 'utf-8'));
  existingData.recitals = recitals;

  // Save updated file
  writeFileSync('data/seed/eprivacy.json', JSON.stringify(existingData, null, 2));
  console.log(`Saved to: data/seed/eprivacy.json`);
  console.log(`Articles: ${existingData.articles?.length ?? 0}`);
  console.log(`Recitals: ${recitals.length}`);
}
97
-
98
// Run immediately. On failure, print the error and make the process exit with a
// non-zero status so shell scripts and CI can detect it.
fetchEPrivacyRecitals().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
@@ -1,113 +0,0 @@
1
- #!/usr/bin/env npx tsx
2
-
3
- /**
4
- * Browser-based EUR-Lex fetcher to bypass AWS WAF challenges.
5
- *
6
- * EUR-Lex deployed AWS WAF that returns 2036-byte JavaScript challenge pages
7
- * instead of actual HTML when using fetch(). This script launches a headless
8
- * browser to wait for the challenge to complete and retrieve the real content.
9
- *
10
- * Usage:
11
- * npx tsx scripts/ingest-eurlex-browser.ts <celex_id>
12
- * npx tsx scripts/ingest-eurlex-browser.ts 32016R0679
13
- *
14
- * Or import as a function:
15
- * import { fetchEurLexWithBrowser } from './scripts/ingest-eurlex-browser';
16
- * const html = await fetchEurLexWithBrowser('32016R0679');
17
- */
18
-
19
import { pathToFileURL } from 'node:url';

import puppeteer from 'puppeteer';
20
-
21
/**
 * Fetches EUR-Lex regulation HTML using Puppeteer to bypass WAF challenges.
 *
 * The page is loaded in a headless browser, the function waits for the network
 * to go idle plus a fixed five seconds so the challenge script can finish, and
 * then returns the rendered HTML. Progress messages are written to stderr so
 * that stdout stays free for the HTML when the script is used from the CLI.
 *
 * @param celexId - CELEX identifier (e.g., '32016R0679' for GDPR)
 * @returns Full HTML content of the regulation
 * @throws Error if the content is shorter than 10000 characters or still
 *         contains the WAF challenge marker `window.gokuProps`; navigation
 *         errors from Puppeteer propagate unchanged
 */
export async function fetchEurLexWithBrowser(celexId: string): Promise<string> {
  // Encode the identifier so it cannot alter the query string.
  const url = `https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:${encodeURIComponent(celexId)}`;
  console.error(`[Browser] Launching headless browser...`);
  console.error(`[Browser] Fetching: ${url}`);

  const browser = await puppeteer.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-accelerated-2d-canvas',
      '--disable-gpu',
    ],
  });

  try {
    const page = await browser.newPage();

    // Set realistic User-Agent to appear as a normal browser
    await page.setUserAgent(
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    // Set viewport
    await page.setViewport({ width: 1920, height: 1080 });

    console.error(`[Browser] Navigating to URL...`);
    await page.goto(url, {
      waitUntil: 'networkidle0', // Wait until network is idle
      timeout: 30000,
    });

    console.error(`[Browser] Page loaded. Waiting for WAF challenge to complete...`);

    // Wait a bit longer to ensure WAF challenge JavaScript has executed
    await new Promise(resolve => setTimeout(resolve, 5000));

    // Get the full HTML content
    const html = await page.content();

    console.error(`[Browser] Fetched ${html.length} bytes`);

    // Validate that we got real content, not a WAF challenge page
    if (html.length < 10000) {
      throw new Error(`Fetched content is suspiciously small (${html.length} bytes). Possible WAF block.`);
    }

    if (html.includes('window.gokuProps')) {
      throw new Error('Received AWS WAF challenge page instead of regulation content.');
    }

    if (!html.includes('Article')) {
      console.warn('[Browser] Warning: HTML does not contain "Article" - may not be valid regulation content');
    }

    return html;
  } finally {
    // Always release the browser process, including on validation failures.
    await browser.close();
    console.error(`[Browser] Browser closed`);
  }
}
90
-
91
// CLI interface: runs only when this file is executed directly, not when it is
// imported. pathToFileURL builds the URL the same way import.meta.url is built
// (percent-encoding, drive letters), so the comparison also holds for paths
// containing spaces or special characters and on Windows.
if (process.argv[1] !== undefined && import.meta.url === pathToFileURL(process.argv[1]).href) {
  const [,, celexId] = process.argv;

  if (!celexId) {
    // Usage goes to stderr: stdout is reserved for the HTML and may be redirected.
    console.error('Usage: npx tsx scripts/ingest-eurlex-browser.ts <celex_id>');
    console.error('Example: npx tsx scripts/ingest-eurlex-browser.ts 32016R0679');
    console.error('\nThis will fetch the HTML and print it to stdout.');
    console.error('Pipe to a file: npx tsx scripts/ingest-eurlex-browser.ts 32016R0679 > output.html');
    process.exit(1);
  }

  fetchEurLexWithBrowser(celexId)
    .then(html => {
      // Output only the HTML so that `> output.html` produces a clean file.
      process.stdout.write(`${html}\n`);
    })
    .catch((err: unknown) => {
      console.error('Error:', err instanceof Error ? err.message : err);
      process.exit(1);
    });
}
@@ -1,349 +0,0 @@
1
- #!/usr/bin/env npx tsx
2
-
3
- /**
4
- * Ingest EU regulations from EUR-Lex.
5
- *
6
- * Usage: npx tsx scripts/ingest-eurlex.ts <celex_id> <output_file> [--browser]
7
- * Example: npx tsx scripts/ingest-eurlex.ts 32016R0679 data/seed/gdpr.json
8
- * Example (with browser): npx tsx scripts/ingest-eurlex.ts 32016R0679 data/seed/gdpr.json --browser
9
- */
10
-
11
- import { writeFileSync } from 'fs';
12
- import { JSDOM } from 'jsdom';
13
- import { fetchEurLexWithBrowser } from './ingest-eurlex-browser.js';
14
-
15
/** A single article extracted from the text of a regulation. */
interface Article {
  /** Number taken from the "Article N" heading, optionally with a letter suffix (e.g. "5" or "5a"). */
  number: string;
  /** Article body; the collected lines are joined with a blank line between them. */
  text: string;
  /** Heading line that follows the article number, when one was detected. */
  title?: string;
  /** Roman numeral captured from a "CHAPTER ..." heading, when one was recorded for the article. */
  chapter?: string;
}
21
-
22
/** A defined term extracted from a regulation's definitions article. */
interface Definition {
  /** The defined term; the parser in this script stores it lower-cased. */
  term: string;
  /** Text that follows "means" for this term. */
  definition: string;
  /** Number of the article the definition was taken from. */
  article: string;
}
27
-
28
/** A numbered recital from the preamble of a regulation. */
interface Recital {
  /** The number N parsed from the leading "(N)" marker. */
  recital_number: number;
  /** Recital text; the collected lines are joined with a blank line between them. */
  text: string;
  /**
   * Articles this recital relates to.
   * NOTE(review): not set by the recital parser in this script; presumably
   * filled in by a later step - confirm the expected format.
   */
  related_articles?: string;
}
33
-
34
/** Shape of the JSON document written for one ingested regulation. */
interface RegulationData {
  /** Short identifier such as "GDPR"; the ingest step falls back to the CELEX id for unknown documents. */
  id: string;
  /** Human-readable name of the regulation. */
  full_name: string;
  /** CELEX identifier the content was fetched with. */
  celex_id: string;
  /** EUR-Lex URL of the document. */
  eur_lex_url: string;
  /** Parsed articles. */
  articles: Article[];
  /** Definitions extracted from the definitions article (may be empty). */
  definitions: Definition[];
  /** Date string from the metadata table (YYYY-MM-DD there), when the CELEX id is known. */
  effective_date?: string;
  /** Parsed recitals, when available. */
  recitals?: Recital[];
}
44
-
45
/**
 * Known documents, keyed by CELEX identifier.
 *
 * Each entry supplies the short id, display name and effective date used when
 * writing the output JSON. Two CELEX ids may map to the same short id (for
 * example a base act and a consolidated or supplemented version).
 */
const REGULATION_METADATA: Record<string, { id: string; full_name: string; effective_date?: string }> = {
  '32016R0679': {
    id: 'GDPR',
    full_name: 'General Data Protection Regulation',
    effective_date: '2018-05-25',
  },
  '32022L2555': {
    id: 'NIS2',
    full_name: 'Directive on measures for a high common level of cybersecurity across the Union',
    effective_date: '2024-10-17',
  },
  '32022R2554': {
    id: 'DORA',
    full_name: 'Digital Operational Resilience Act',
    effective_date: '2025-01-17',
  },
  '32024R1689': {
    id: 'AI_ACT',
    full_name: 'Artificial Intelligence Act',
    effective_date: '2024-08-01',
  },
  '32024R2847': {
    id: 'CRA',
    full_name: 'Cyber Resilience Act',
    effective_date: '2024-12-10',
  },
  '32019R0881': {
    id: 'CYBERSECURITY_ACT',
    full_name: 'EU Cybersecurity Act',
    effective_date: '2019-06-27',
  },
  '32024R1183': {
    id: 'EIDAS2',
    full_name: 'European Digital Identity Framework (eIDAS 2.0)',
    effective_date: '2024-05-20',
  },
  '02014R0910-20241018': {
    id: 'EIDAS2',
    full_name: 'European Digital Identity Framework (eIDAS 2.0)',
    effective_date: '2024-05-20',
  },

  // Digital Single Market regulations
  '32023R2854': {
    id: 'DATA_ACT',
    full_name: 'Data Act',
    effective_date: '2025-09-12',
  },
  '32022R2065': {
    id: 'DSA',
    full_name: 'Digital Services Act',
    effective_date: '2024-02-17',
  },
  '32022R1925': {
    id: 'DMA',
    full_name: 'Digital Markets Act',
    effective_date: '2023-05-02',
  },

  // Product & Supply Chain regulations
  '32023R1781': {
    id: 'CHIPS_ACT',
    full_name: 'European Chips Act',
    effective_date: '2023-09-18',
  },
  '32024R1252': {
    id: 'CRMA',
    full_name: 'Critical Raw Materials Act',
    effective_date: '2024-05-23',
  },

  // UN Regulations (adopted by EU)
  '42021X0387': {
    id: 'UN_R155',
    full_name: 'UN Regulation No. 155 - Cyber security and cyber security management system',
    effective_date: '2021-01-22',
  },
  '42025X0005': {
    id: 'UN_R155',
    full_name: 'UN Regulation No. 155 - Cyber security and cyber security management system (Supplement 3)',
    effective_date: '2025-01-10',
  },
};
65
-
66
/**
 * Retrieves the English HTML version of a EUR-Lex document.
 *
 * @param celexId - CELEX identifier of the document (e.g. '32016R0679').
 * @param useBrowser - When true, delegates to the Puppeteer-based fetcher;
 *                     otherwise performs a plain HTTP request.
 * @returns The HTML of the document.
 * @throws Error when the HTTP response is not OK, or when the response body is
 *         a WAF challenge page rather than the document.
 */
async function fetchEurLexHtml(celexId: string, useBrowser = false): Promise<string> {
  if (useBrowser) {
    console.log('Using Puppeteer to bypass WAF...');
    return fetchEurLexWithBrowser(celexId);
  }

  // Fallback to direct fetch (will fail with WAF).
  // Encode the identifier so it cannot alter the query string.
  const url = `https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:${encodeURIComponent(celexId)}`;
  console.log(`Fetching: ${url}`);

  const response = await fetch(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (compatible; EU-Compliance-MCP/1.0; +https://github.com/Ansvar-Systems/EU_compliance_MCP)',
      'Accept': 'text/html',
    },
  });

  if (!response.ok) {
    throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
  }

  const html = await response.text();

  // Same marker the browser-based fetcher checks for: a challenge page must
  // not be handed to the parsers as if it were the regulation.
  if (html.includes('window.gokuProps')) {
    throw new Error('Received AWS WAF challenge page instead of regulation content. Retry with --browser.');
  }

  return html;
}
89
-
90
/**
 * Extracts the numbered recitals from the preamble of a EUR-Lex HTML document.
 *
 * Works on the text content of `<body>`, split into trimmed, non-empty lines.
 * The recital section starts at the first line beginning with
 * "Having regard to" or "Whereas:" and ends at the first later line that starts
 * with "HAVE ADOPTED" or is exactly "Article 1". Within the section a line
 * starting with "(N)" opens recital N and following lines are appended to it.
 *
 * @param html - Full HTML of the document.
 * @returns Recitals in document order, each emitted exactly once; the lines of
 *          one recital are joined with a blank line.
 */
function parseRecitals(html: string): Recital[] {
  const doc = new JSDOM(html).window.document;
  const lines = (doc.body?.textContent || '')
    .split('\n')
    .map(l => l.trim())
    .filter(l => l);

  const recitals: Recital[] = [];
  let inRecitalsSection = false;
  let currentRecital: { number: number; lines: string[] } | null = null;

  // Emit the recital being collected (if it has any text) and clear it, so the
  // end-of-section and end-of-input paths cannot both emit the same recital.
  const flush = (): void => {
    if (currentRecital && currentRecital.lines.length > 0) {
      recitals.push({
        recital_number: currentRecital.number,
        text: currentRecital.lines.join('\n\n'),
      });
    }
    currentRecital = null;
  };

  for (const line of lines) {
    // Detect start of recitals section
    if (/^Having regard to/i.test(line) || /^Whereas:/i.test(line)) {
      inRecitalsSection = true;
      continue;
    }

    // Lines before the preamble (including a stray "Article 1") are irrelevant.
    if (!inRecitalsSection) continue;

    // Detect end of recitals (usually "HAVE ADOPTED" or "Article 1")
    if (/^HAVE ADOPTED/i.test(line) || /^Article\s+1$/i.test(line)) {
      break;
    }

    // Match recital number: "(1)", "(123)", etc.
    const recitalMatch = /^\((\d+)\)/.exec(line);
    if (recitalMatch) {
      flush();
      currentRecital = {
        number: Number.parseInt(recitalMatch[1], 10),
        lines: [],
      };

      // Keep any text that shares the line with the number.
      const textAfterNumber = line.slice(recitalMatch[0].length).trim();
      if (textAfterNumber) {
        currentRecital.lines.push(textAfterNumber);
      }
      continue;
    }

    // Continuation line of the current recital
    if (currentRecital) {
      currentRecital.lines.push(line);
    }
  }

  // Emit the recital still open when the section or the input ended.
  flush();

  return recitals;
}
163
-
164
/**
 * Orders article numbers numerically, then by letter suffix
 * ("5" < "5a" < "5b" < "6"). The letter comparison ignores case, matching the
 * case-insensitive heading pattern used to capture the numbers. Values that are
 * not digits plus an optional letter compare as equal.
 */
function compareArticleNumbers(left: string, right: string): number {
  const partsLeft = /^(\d+)([a-z]?)$/i.exec(left);
  const partsRight = /^(\d+)([a-z]?)$/i.exec(right);
  if (!partsLeft || !partsRight) return 0;

  // Sort by number first
  const numericDiff = Number.parseInt(partsLeft[1], 10) - Number.parseInt(partsRight[1], 10);
  if (numericDiff !== 0) return numericDiff;

  // Then by letter (empty string sorts before letters)
  return (partsLeft[2] ?? '').toLowerCase().localeCompare((partsRight[2] ?? '').toLowerCase());
}

/**
 * Extracts `(n) 'term' means ...` entries from a definitions article.
 * Returns an empty list when the article text does not contain "means".
 */
function extractDefinitions(defsArticle: Article): Definition[] {
  if (!defsArticle.text.includes('means')) return [];

  // Normalize text: collapse whitespace and normalize quotes
  const normalizedText = defsArticle.text
    .replace(/\s+/g, ' ')
    .replace(/[\u2018\u2019]/g, "'"); // Curly quotes to straight

  // Parse definitions by extracting content between consecutive numbered entries
  // This handles:
  // - Complex definitions with internal periods/semicolons
  // - 'term' or 'alternate' means... patterns (NIS2 Art 6)
  // - 'term1', 'term2' and 'term3' mean... patterns (CRA Art 3)
  // - 'term' of the something means... patterns (GDPR Art 4)
  // - mean, respectively... patterns (CRA Art 3)
  // - means: (a) ... patterns (complex definitions with sub-parts)
  const defRegex = /\((\d+)\)\s*'([^']+)'(?:[^(]*?)means?[,:;]?\s+(.+?)(?=\(\d+\)\s*'|$)/g;

  const definitions: Definition[] = [];
  for (const defMatch of normalizedText.matchAll(defRegex)) {
    const term = defMatch[2].trim().toLowerCase();
    const definition = defMatch[3].trim();
    // Only add if we got meaningful content
    if (term.length > 0 && definition.length > 10) {
      definitions.push({ term, definition, article: defsArticle.number });
    }
  }
  return definitions;
}

/**
 * Splits the text of a EUR-Lex HTML document into articles and extracts the
 * definitions from the first article whose title contains "definition".
 *
 * Works on the text content of `<body>`, split into trimmed, non-empty lines.
 * A line that is exactly "Article N" (N optionally followed by one letter)
 * starts an article. The first following line is taken as the title when it is
 * shorter than 100 characters and does not end with a period; the remaining
 * lines form the body. Articles without body lines are dropped. When an article
 * number occurs more than once, the occurrence with the longest text is kept.
 *
 * Each article is tagged with the chapter that was current when its heading
 * was read, so a "CHAPTER" heading that follows an article does not re-label it.
 *
 * @param html - Full HTML of the document.
 * @param celexId - CELEX identifier of the document (currently unused).
 * @returns Articles sorted by number, and the extracted definitions.
 */
function parseArticles(html: string, celexId: string): { articles: Article[]; definitions: Definition[] } {
  const doc = new JSDOM(html).window.document;

  // Get all text content and split by article markers
  const lines = (doc.body?.textContent || '')
    .split('\n')
    .map(l => l.trim())
    .filter(l => l);

  const articles: Article[] = [];
  let currentChapter = '';
  let currentArticle: { number: string; title?: string; chapter: string; lines: string[] } | null = null;

  // Emit the article being collected (if it has body text) and clear it.
  const flush = (): void => {
    if (currentArticle && currentArticle.lines.length > 0) {
      articles.push({
        number: currentArticle.number,
        title: currentArticle.title,
        text: currentArticle.lines.join('\n\n'),
        chapter: currentArticle.chapter || undefined,
      });
    }
    currentArticle = null;
  };

  for (const line of lines) {
    const articleStart = /^Article\s+(\d+[a-z]?)$/i.exec(line);
    if (articleStart) {
      flush();
      // Remember the chapter in force at the heading, not at flush time.
      currentArticle = { number: articleStart[1], chapter: currentChapter, lines: [] };
      continue;
    }

    const chapterStart = /^CHAPTER\s+([IVXLC]+)/i.exec(line);
    if (chapterStart) {
      currentChapter = chapterStart[1];
      continue;
    }

    if (currentArticle) {
      // Check if this is a title line (short, no period at end)
      const looksLikeTitle =
        !currentArticle.title &&
        currentArticle.lines.length === 0 &&
        line.length < 100 &&
        !line.endsWith('.');
      if (looksLikeTitle) {
        currentArticle.title = line;
      } else {
        currentArticle.lines.push(line);
      }
    }
  }

  // Don't forget the last article
  flush();

  // Deduplicate articles - keep the one with the most content for each number
  const articleMap = new Map<string, Article>();
  for (const article of articles) {
    const existing = articleMap.get(article.number);
    if (!existing || article.text.length > existing.text.length) {
      articleMap.set(article.number, article);
    }
  }
  const deduplicatedArticles = Array.from(articleMap.values())
    .sort((a, b) => compareArticleNumbers(a.number, b.number));

  // Extract definitions from Article 4 (or similar definitions article)
  const defsArticle = deduplicatedArticles.find(a =>
    a.title?.toLowerCase().includes('definition')
  );
  const definitions = defsArticle ? extractDefinitions(defsArticle) : [];

  return { articles: deduplicatedArticles, definitions };
}
282
-
283
/**
 * Fetches one EUR-Lex document, parses it and writes the result as JSON.
 *
 * When no article can be parsed, the raw HTML is written next to the intended
 * output (same path with ".json" replaced by ".html", or ".html" appended when
 * the path has no ".json" extension), no JSON is written, and the process exit
 * code is set to 1 so the failure is visible to callers of the script.
 *
 * @param celexId - CELEX identifier of the document.
 * @param outputPath - Path of the JSON file to write.
 * @param useBrowser - Fetch through the Puppeteer-based fetcher instead of a plain HTTP request.
 * @throws Error when fetching fails or a file cannot be written.
 */
async function ingestRegulation(celexId: string, outputPath: string, useBrowser = false): Promise<void> {
  const metadata = REGULATION_METADATA[celexId];
  if (!metadata) {
    console.warn(`Unknown CELEX ID: ${celexId}. Using generic metadata.`);
  }

  const html = await fetchEurLexHtml(celexId, useBrowser);
  console.log(`Fetched ${html.length} bytes`);

  // Parse recitals BEFORE articles
  const recitals = parseRecitals(html);
  console.log(`Parsed ${recitals.length} recitals`);

  const { articles, definitions } = parseArticles(html, celexId);
  console.log(`Parsed ${articles.length} articles, ${definitions.length} definitions`);

  if (articles.length === 0) {
    console.error('No articles found! The HTML structure may have changed.');
    console.log('Saving raw HTML for debugging...');
    // Only swap a trailing ".json"; a plain string replace could rewrite a
    // directory name or, with no ".json" at all, target outputPath itself.
    const debugPath = /\.json$/i.test(outputPath)
      ? outputPath.replace(/\.json$/i, '.html')
      : `${outputPath}.html`;
    writeFileSync(debugPath, html);
    process.exitCode = 1;
    return;
  }

  const regulation: RegulationData = {
    id: metadata?.id ?? celexId,
    full_name: metadata?.full_name ?? `Regulation ${celexId}`,
    celex_id: celexId,
    effective_date: metadata?.effective_date,
    eur_lex_url: `https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:${celexId}`,
    articles,
    definitions,
    recitals,
  };

  writeFileSync(outputPath, JSON.stringify(regulation, null, 2));
  console.log(`\nSaved to: ${outputPath}`);
  console.log(`Articles: ${articles.length}`);
  console.log(`Definitions: ${definitions.length}`);
  console.log(`Recitals: ${recitals.length}`);
}
323
-
324
// Entry point: read the command line, print usage when arguments are missing,
// otherwise run the ingestion and exit with status 1 on any error.
const cliArgs = process.argv.slice(2);
const useBrowser = cliArgs.includes('--browser');
// Everything that is not the --browser flag is positional: <celex_id> <output_file>.
const [celexId, outputPath] = cliArgs.filter(arg => arg !== '--browser');

if (!(celexId && outputPath)) {
  const usageLines = [
    'Usage: npx tsx scripts/ingest-eurlex.ts <celex_id> <output_file> [--browser]',
    'Example: npx tsx scripts/ingest-eurlex.ts 32016R0679 data/seed/gdpr.json',
    'Example (with browser): npx tsx scripts/ingest-eurlex.ts 32016R0679 data/seed/gdpr.json --browser',
    '\nOptions:',
    ' --browser Use Puppeteer to bypass EUR-Lex WAF challenges',
    '\nKnown CELEX IDs:',
  ];
  for (const [id, meta] of Object.entries(REGULATION_METADATA)) {
    usageLines.push(` ${id} - ${meta.id} (${meta.full_name})`);
  }
  for (const usageLine of usageLines) {
    console.log(usageLine);
  }
  process.exit(1);
}

if (useBrowser) {
  console.log('Browser mode enabled - using Puppeteer to fetch content\n');
}

// Report the failure and terminate with a non-zero status.
const reportFailure = (err: unknown): never => {
  console.error('Error:', err);
  process.exit(1);
};

ingestRegulation(celexId, outputPath, useBrowser).catch(reportFailure);