@ansvar/eu-regulations-mcp 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +190 -21
- package/README.md +159 -26
- package/data/seed/aifmd.json +432 -0
- package/data/seed/applicability/ai-act.json +87 -0
- package/data/seed/applicability/aifmd.json +74 -0
- package/data/seed/applicability/cbam.json +74 -0
- package/data/seed/applicability/cer.json +74 -0
- package/data/seed/applicability/cra.json +77 -0
- package/data/seed/applicability/csddd.json +74 -0
- package/data/seed/applicability/csrd.json +74 -0
- package/data/seed/applicability/cyber_solidarity.json +74 -0
- package/data/seed/applicability/cybersecurity-act.json +69 -0
- package/data/seed/applicability/data-act.json +71 -0
- package/data/seed/applicability/dga.json +74 -0
- package/data/seed/applicability/dma.json +77 -0
- package/data/seed/applicability/dsa.json +71 -0
- package/data/seed/applicability/eecc.json +74 -0
- package/data/seed/applicability/ehds.json +74 -0
- package/data/seed/applicability/eidas2.json +86 -0
- package/data/seed/applicability/eprivacy.json +74 -0
- package/data/seed/applicability/eu_taxonomy.json +74 -0
- package/data/seed/applicability/eucc.json +74 -0
- package/data/seed/applicability/eudr.json +74 -0
- package/data/seed/applicability/gpsr.json +74 -0
- package/data/seed/applicability/ivdr.json +74 -0
- package/data/seed/applicability/led.json +74 -0
- package/data/seed/applicability/machinery.json +74 -0
- package/data/seed/applicability/mdr.json +74 -0
- package/data/seed/applicability/mica.json +74 -0
- package/data/seed/applicability/mifid2.json +74 -0
- package/data/seed/applicability/mifir.json +74 -0
- package/data/seed/applicability/pld.json +74 -0
- package/data/seed/applicability/psd2.json +74 -0
- package/data/seed/applicability/red.json +74 -0
- package/data/seed/applicability/sfdr.json +74 -0
- package/data/seed/applicability/un-r155.json +68 -0
- package/data/seed/applicability/un-r156.json +68 -0
- package/data/seed/cbam.json +397 -0
- package/data/seed/cer.json +233 -0
- package/data/seed/csddd.json +205 -0
- package/data/seed/csrd.json +50 -0
- package/data/seed/cyber_solidarity.json +252 -0
- package/data/seed/data-act.json +517 -0
- package/data/seed/dga.json +342 -0
- package/data/seed/dma.json +499 -0
- package/data/seed/dsa.json +686 -0
- package/data/seed/eecc.json +981 -0
- package/data/seed/ehds.json +638 -0
- package/data/seed/eidas2.json +590 -0
- package/data/seed/eprivacy.json +115 -0
- package/data/seed/eu_taxonomy.json +285 -0
- package/data/seed/eucc.json +386 -0
- package/data/seed/eudr.json +401 -0
- package/data/seed/gpsr.json +462 -0
- package/data/seed/ivdr.json +1036 -0
- package/data/seed/led.json +480 -0
- package/data/seed/machinery.json +513 -0
- package/data/seed/mappings/iso27001-ai-act.json +114 -0
- package/data/seed/mappings/iso27001-aifmd.json +50 -0
- package/data/seed/mappings/iso27001-cbam.json +26 -0
- package/data/seed/mappings/iso27001-cer.json +74 -0
- package/data/seed/mappings/iso27001-cra.json +130 -0
- package/data/seed/mappings/iso27001-csddd.json +50 -0
- package/data/seed/mappings/iso27001-csrd.json +26 -0
- package/data/seed/mappings/iso27001-cyber_solidarity.json +82 -0
- package/data/seed/mappings/iso27001-cybersecurity-act.json +90 -0
- package/data/seed/mappings/iso27001-data-act.json +66 -0
- package/data/seed/mappings/iso27001-dga.json +50 -0
- package/data/seed/mappings/iso27001-dma.json +50 -0
- package/data/seed/mappings/iso27001-dsa.json +58 -0
- package/data/seed/mappings/iso27001-eecc.json +74 -0
- package/data/seed/mappings/iso27001-ehds.json +90 -0
- package/data/seed/mappings/iso27001-eidas2.json +106 -0
- package/data/seed/mappings/iso27001-eprivacy.json +66 -0
- package/data/seed/mappings/iso27001-eu_taxonomy.json +34 -0
- package/data/seed/mappings/iso27001-eucc.json +66 -0
- package/data/seed/mappings/iso27001-eudr.json +34 -0
- package/data/seed/mappings/iso27001-gpsr.json +42 -0
- package/data/seed/mappings/iso27001-ivdr.json +66 -0
- package/data/seed/mappings/iso27001-led.json +74 -0
- package/data/seed/mappings/iso27001-machinery.json +50 -0
- package/data/seed/mappings/iso27001-mdr.json +82 -0
- package/data/seed/mappings/iso27001-mica.json +66 -0
- package/data/seed/mappings/iso27001-mifid2.json +66 -0
- package/data/seed/mappings/iso27001-mifir.json +42 -0
- package/data/seed/mappings/iso27001-pld.json +26 -0
- package/data/seed/mappings/iso27001-psd2.json +82 -0
- package/data/seed/mappings/iso27001-red.json +42 -0
- package/data/seed/mappings/iso27001-sfdr.json +50 -0
- package/data/seed/mappings/iso27001-un-r155.json +130 -0
- package/data/seed/mappings/iso27001-un-r156.json +106 -0
- package/data/seed/mappings/nist-csf-ai-act.json +138 -0
- package/data/seed/mappings/nist-csf-aifmd.json +58 -0
- package/data/seed/mappings/nist-csf-cbam.json +42 -0
- package/data/seed/mappings/nist-csf-cer.json +90 -0
- package/data/seed/mappings/nist-csf-cra.json +130 -0
- package/data/seed/mappings/nist-csf-csddd.json +50 -0
- package/data/seed/mappings/nist-csf-csrd.json +34 -0
- package/data/seed/mappings/nist-csf-cyber_solidarity.json +90 -0
- package/data/seed/mappings/nist-csf-cybersecurity-act.json +90 -0
- package/data/seed/mappings/nist-csf-data-act.json +50 -0
- package/data/seed/mappings/nist-csf-dga.json +58 -0
- package/data/seed/mappings/nist-csf-dma.json +42 -0
- package/data/seed/mappings/nist-csf-dora.json +210 -0
- package/data/seed/mappings/nist-csf-dsa.json +82 -0
- package/data/seed/mappings/nist-csf-eecc.json +90 -0
- package/data/seed/mappings/nist-csf-ehds.json +98 -0
- package/data/seed/mappings/nist-csf-eidas2.json +114 -0
- package/data/seed/mappings/nist-csf-eprivacy.json +58 -0
- package/data/seed/mappings/nist-csf-eu_taxonomy.json +34 -0
- package/data/seed/mappings/nist-csf-eucc.json +66 -0
- package/data/seed/mappings/nist-csf-eudr.json +58 -0
- package/data/seed/mappings/nist-csf-gdpr.json +178 -0
- package/data/seed/mappings/nist-csf-gpsr.json +58 -0
- package/data/seed/mappings/nist-csf-ivdr.json +66 -0
- package/data/seed/mappings/nist-csf-led.json +74 -0
- package/data/seed/mappings/nist-csf-machinery.json +58 -0
- package/data/seed/mappings/nist-csf-mdr.json +66 -0
- package/data/seed/mappings/nist-csf-mica.json +98 -0
- package/data/seed/mappings/nist-csf-mifid2.json +74 -0
- package/data/seed/mappings/nist-csf-mifir.json +50 -0
- package/data/seed/mappings/nist-csf-nis2.json +194 -0
- package/data/seed/mappings/nist-csf-pld.json +34 -0
- package/data/seed/mappings/nist-csf-psd2.json +98 -0
- package/data/seed/mappings/nist-csf-red.json +58 -0
- package/data/seed/mappings/nist-csf-sfdr.json +42 -0
- package/data/seed/mappings/nist-csf-un-r155.json +130 -0
- package/data/seed/mappings/nist-csf-un-r156.json +98 -0
- package/data/seed/mdr.json +1066 -0
- package/data/seed/mica.json +1003 -0
- package/data/seed/mifid2.json +906 -0
- package/data/seed/mifir.json +512 -0
- package/data/seed/pld.json +244 -0
- package/data/seed/psd2.json +827 -0
- package/data/seed/red.json +452 -0
- package/data/seed/sfdr.json +228 -0
- package/data/seed/un-r155.json +166 -0
- package/data/seed/un-r156.json +150 -0
- package/dist/http-server.d.ts +9 -0
- package/dist/http-server.d.ts.map +1 -0
- package/dist/http-server.js +342 -0
- package/dist/http-server.js.map +1 -0
- package/dist/index.js +4 -4
- package/dist/index.js.map +1 -1
- package/dist/tools/map.d.ts +1 -1
- package/dist/tools/map.d.ts.map +1 -1
- package/dist/tools/map.js +3 -3
- package/dist/tools/map.js.map +1 -1
- package/package.json +8 -3
- package/scripts/build-db.ts +20 -8
- package/scripts/check-updates.ts +141 -39
- package/scripts/ingest-eurlex.ts +9 -1
- package/scripts/ingest-unece.ts +368 -0
- package/src/http-server.ts +380 -0
- package/src/index.ts +4 -4
- package/src/tools/map.ts +4 -4
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
#!/usr/bin/env npx tsx
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Ingest UN/ECE regulations from EUR-Lex.
|
|
5
|
+
* UN regulations use numbered sections (1., 2., etc.) instead of "Article X".
|
|
6
|
+
*
|
|
7
|
+
* Usage: npx tsx scripts/ingest-unece.ts <celex_id> <output_file>
|
|
8
|
+
* Example: npx tsx scripts/ingest-unece.ts 42021X0387 data/seed/un-r155.json
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { writeFileSync } from 'fs';
|
|
12
|
+
import { JSDOM } from 'jsdom';
|
|
13
|
+
|
|
14
|
+
/**
 * One unit of regulation text in the generated seed file: either a numbered
 * main section or an annex.
 */
interface Article {
  /** Section number as a string (e.g. `"7"`), or `"Annex N"` for an annex. */
  number: string;
  /** Body text; the parser joins the collected paragraphs with blank lines. */
  text: string;
  /** Heading of the section or annex, when one is available. */
  title?: string;
  /** Grouping label; the parser sets it to `"Annexes"` on annex entries. */
  chapter?: string;
}
|
|
20
|
+
|
|
21
|
+
/** A defined term extracted from a regulation's definitions section. */
interface Definition {
  /** The defined term; the parser stores it lower-cased. */
  term: string;
  /** Number of the section the definition was taken from. */
  article: string;
  /** Explanatory text that follows "means" / "refers to". */
  definition: string;
}
|
|
26
|
+
|
|
27
|
+
/** Shape of the JSON document this script writes for one regulation. */
interface RegulationData {
  /** Short regulation identifier (e.g. `UN_R155`); the CELEX id is used when no metadata is known. */
  id: string;
  /** Human-readable name of the regulation. */
  full_name: string;
  /** EUR-Lex CELEX identifier the text was fetched with. */
  celex_id: string;
  /** Link to the document on EUR-Lex. */
  eur_lex_url: string;
  /** Parsed main sections and annexes. */
  articles: Article[];
  /** Terms extracted from the definitions section. */
  definitions: Definition[];
  /** Date string such as `2021-01-22`, when known from the metadata table. */
  effective_date?: string;
}
|
|
36
|
+
|
|
37
|
+
/** Descriptive fields recorded for a UN regulation this script knows about. */
type UnRegulationMetadata = { id: string; full_name: string; effective_date?: string };

/**
 * Known UN/ECE documents, keyed by their EUR-Lex CELEX identifier.
 * Two CELEX ids map to `UN_R155`: the base text and its Supplement 3.
 * Entry order is kept stable because the usage message lists the entries in
 * this order.
 */
const UN_REGULATION_METADATA: Record<string, UnRegulationMetadata> = {
  '42021X0387': {
    id: 'UN_R155',
    full_name: 'UN Regulation No. 155 - Cyber security and cyber security management system',
    effective_date: '2021-01-22',
  },
  '42025X0005': {
    id: 'UN_R155',
    full_name: 'UN Regulation No. 155 - Cyber security and cyber security management system (Supplement 3)',
    effective_date: '2025-01-10',
  },
  '42021X0388': {
    id: 'UN_R156',
    full_name: 'UN Regulation No. 156 - Software update and software update management system',
    effective_date: '2021-01-22',
  },
};
|
|
54
|
+
|
|
55
|
+
/**
 * Titles of the main sections that the supported UN regulations share,
 * keyed by section number. Section 6 is absent on purpose: its title differs
 * per regulation and lives in the regulation-specific table.
 */
const COMMON_SECTION_TITLES: Record<string, string> = {
  1: 'Scope',
  2: 'Definitions',
  3: 'Application for approval',
  4: 'Markings',
  5: 'Approval',
  7: 'Specifications',
  8: 'Modification of vehicle type and extension of type approval',
  9: 'Conformity of production',
  10: 'Penalties for non-conformity of production',
  11: 'Production definitively discontinued',
  12: 'Names and addresses of Technical Services responsible for conducting approval tests, and of Type Approval Authorities',
};
|
|
69
|
+
|
|
70
|
+
/**
 * Section titles that differ between regulations, keyed first by regulation
 * id and then by section number. Currently only section 6 is listed.
 */
const REGULATION_SECTION_TITLES: Record<string, Record<string, string>> = {
  UN_R155: { 6: 'Certificate of Compliance for Cybersecurity Management System' },
  UN_R156: { 6: 'Certificate of Compliance for Software Update Management System' },
};
|
|
79
|
+
|
|
80
|
+
/**
 * Resolve the display title of a main section.
 *
 * Lookup order: the regulation-specific table, then the shared table, then
 * the generic label `Section <n>`. An empty-string title counts as missing.
 *
 * @param sectionNum - Section number as a string, e.g. `"6"`.
 * @param regulationId - Regulation identifier, e.g. `"UN_R155"`.
 * @returns A non-empty title for the section.
 */
function getSectionTitle(sectionNum: string, regulationId: string): string {
  return (
    REGULATION_SECTION_TITLES[regulationId]?.[sectionNum] ||
    COMMON_SECTION_TITLES[sectionNum] ||
    `Section ${sectionNum}`
  );
}
|
|
85
|
+
|
|
86
|
+
/**
 * Download the English HTML rendering of a document from EUR-Lex.
 *
 * @param celexId - CELEX identifier; interpolated as-is into the request URL.
 * @returns The response body as text.
 * @throws Error when the HTTP response status is not OK.
 */
async function fetchEurLexHtml(celexId: string): Promise<string> {
  const requestUrl = `https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:${celexId}`;
  console.log(`Fetching: ${requestUrl}`);

  // Identify the client and ask for HTML explicitly.
  const requestHeaders = {
    'User-Agent': 'Mozilla/5.0 (compatible; EU-Compliance-MCP/1.0; +https://github.com/Ansvar-Systems/EU_compliance_MCP)',
    Accept: 'text/html',
  };

  const res = await fetch(requestUrl, { headers: requestHeaders });
  if (res.ok) {
    return res.text();
  }

  throw new Error(`Failed to fetch: ${res.status} ${res.statusText}`);
}
|
|
103
|
+
|
|
104
|
+
/** A section or annex whose paragraphs are still being collected. */
interface PendingEntry {
  number: string;
  title: string;
  lines: string[];
}

/** Main section heading such as "10. PENALTIES FOR NON-CONFORMITY OF PRODUCTION". */
const SECTION_HEADER_RE = /^(\d{1,2})\.\s+([A-Z][A-Z\s,\-]+)$/;

/** Text that opens with "Annex N" (any letter case). */
const ANNEX_HEADER_RE = /^Annex\s+(\d+)/i;

/** Append `entry` to `articles` unless it is empty or its key was already emitted. */
function flushPendingEntry(
  entry: PendingEntry | null,
  kind: 'section' | 'annex',
  articles: Article[],
  seen: Set<string>,
): void {
  if (!entry || entry.lines.length === 0) return;

  const key = kind === 'annex' ? `Annex ${entry.number}` : entry.number;
  if (seen.has(key)) return;

  const article: Article = { number: key, title: entry.title, text: entry.lines.join('\n\n') };
  if (kind === 'annex') article.chapter = 'Annexes';
  articles.push(article);
  seen.add(key);
}

/** True for page furniture (journal headers, navigation labels) that is not regulation text. */
function isBoilerplateText(text: string): boolean {
  return (
    text.includes('Official Journal') ||
    text.includes('EUR-Lex') ||
    text.includes('CONTENTS') ||
    /^[A-Z]+$/.test(text) ||
    /^L\s+\d+\/\d+$/.test(text) ||
    /^\d+\.\d+\.\d+\s+EN$/.test(text)
  );
}

/** Pull `N.M. 'term' means/refers to ...` definitions out of the definitions section text. */
function extractUnDefinitions(sectionText: string): Definition[] {
  // Collapse whitespace and convert curly quotes (U+2018/U+2019) to straight
  // quotes so a single pattern covers both quoting styles.
  const normalized = sectionText
    .replace(/\n+/g, ' ')
    .replace(/\s+/g, ' ')
    .replace(/[\u2018\u2019]/g, "'");

  // The lookahead ends each definition at the next "N.M. 'term'" or at the end of the text.
  const pattern = /(\d+\.\d+\.)\s*'([^']+)'\s+(?:means|refers to)\s+(.+?)(?=\d+\.\d+\.\s*'|$)/g;

  const found: Definition[] = [];
  for (const match of normalized.matchAll(pattern)) {
    const [, , rawTerm = '', rawBody = ''] = match;
    const term = rawTerm.trim().toLowerCase();
    // Strip a trailing sub-section number and one closing ';' or '.'.
    const definition = rawBody
      .trim()
      .replace(/\s*\d+\.\d+\.\s*$/, '')
      .replace(/[;.]$/, '')
      .trim();
    if (term && definition.length > 10) {
      found.push({ term, definition, article: '2' });
    }
  }
  return found;
}

/** Order numbered sections before annexes, each group by ascending number. */
function compareArticleOrder(a: Article, b: Article): number {
  const aIsAnnex = a.number.startsWith('Annex');
  const bIsAnnex = b.number.startsWith('Annex');
  if (aIsAnnex !== bIsAnnex) return aIsAnnex ? 1 : -1;

  const toInt = (value: string): number => parseInt(value.replace('Annex ', ''), 10);
  return toInt(a.number) - toInt(b.number);
}

/**
 * Parse the EUR-Lex HTML of a UN regulation into sections, annexes and definitions.
 *
 * Every `p`, `span` and `td` element is visited in document order. Elements
 * before the first main section heading are treated as table of contents and
 * skipped. After that, text is appended to the section or annex opened by the
 * most recent heading. The first non-empty occurrence of each section/annex
 * number wins; later ones with the same number are ignored.
 *
 * @param html - Raw HTML of the EUR-Lex document.
 * @param celexId - CELEX id of the document; selects the regulation-specific
 *   section titles. Unknown ids fall back to the shared titles.
 * @returns Sections (numeric order) followed by annexes (numeric order), plus
 *   the definitions extracted from section 2.
 */
function parseUnRegulation(html: string, celexId: string): { articles: Article[]; definitions: Definition[] } {
  const doc = new JSDOM(html).window.document;

  const regulationId = UN_REGULATION_METADATA[celexId]?.id ?? celexId;
  const regulationTitles = REGULATION_SECTION_TITLES[regulationId];

  const articles: Article[] = [];
  const seenKeys = new Set<string>();
  let currentSection: PendingEntry | null = null;
  let currentAnnex: PendingEntry | null = null;
  let inTableOfContents = true; // Skip TOC at start

  for (const el of Array.from(doc.querySelectorAll('p, span, td'))) {
    const text = el.textContent?.trim() || '';
    if (text.length < 2) continue;

    const isHeadingElement = el.classList?.contains('oj-ti-grseq-1') ?? false;
    const sectionHeaderMatch = isHeadingElement ? text.match(SECTION_HEADER_RE) : null;

    if (sectionHeaderMatch) {
      // The first real section heading marks the end of the table of contents.
      inTableOfContents = false;

      flushPendingEntry(currentSection, 'section', articles, seenKeys);

      const [, sectionNum = '', headerText = ''] = sectionHeaderMatch;
      currentSection = {
        number: sectionNum,
        // Prefer curated titles; fall back to the heading text from the document.
        title: regulationTitles?.[sectionNum] || COMMON_SECTION_TITLES[sectionNum] || headerText.trim(),
        lines: [],
      };
      // NOTE(review): an annex still being collected at this point is discarded,
      // not flushed (unchanged behavior) — confirm this is intended.
      currentAnnex = null;
      continue;
    }

    // Still in the table of contents: ignore everything except heading-class elements.
    if (inTableOfContents && !isHeadingElement) continue;

    const annexMatch = text.match(ANNEX_HEADER_RE);
    if (annexMatch || (el.classList?.contains('oj-doc-ti') && text.includes('Annex'))) {
      flushPendingEntry(currentSection, 'section', articles, seenKeys);
      flushPendingEntry(currentAnnex, 'annex', articles, seenKeys);

      const annexNum = annexMatch?.[1] || text.match(/Annex\s+(\d+)/i)?.[1];
      if (annexNum) {
        currentAnnex = { number: annexNum, title: extractAnnexTitle(text), lines: [] };
        currentSection = null;
      }
      continue;
    }

    if (isBoilerplateText(text)) continue;

    // An open annex takes precedence; otherwise the text belongs to the open section.
    (currentAnnex ?? currentSection)?.lines.push(text);
  }

  // Emit whatever was still being collected when the document ended.
  flushPendingEntry(currentSection, 'section', articles, seenKeys);
  flushPendingEntry(currentAnnex, 'annex', articles, seenKeys);

  const definitionsSection = articles.find((a) => a.number === '2');
  const definitions = definitionsSection ? extractUnDefinitions(definitionsSection.text) : [];

  // Article numbers are already unique (guarded by seenKeys), so only ordering remains.
  const sortedArticles = [...articles].sort(compareArticleOrder);

  return { articles: sortedArticles, definitions };
}
|
|
283
|
+
|
|
284
|
+
/**
 * Derive an annex title from its heading text.
 *
 * Text following "Annex N" (after an optional dash) is used when present.
 * Otherwise a built-in table of R155 annex titles is consulted.
 *
 * @param text - Heading text that mentions "Annex N".
 * @returns The title, or an empty string when none can be determined.
 */
function extractAnnexTitle(text: string): string {
  // Title written on the same line as the annex number.
  const inlineTitle = /Annex\s+\d+\s*[–—-]?\s*(.*)/i.exec(text)?.[1];
  if (inlineTitle) {
    return inlineTitle.trim();
  }

  // Common annex titles for R155
  switch (/Annex\s+(\d+)/i.exec(text)?.[1]) {
    case '1':
      return 'Information document';
    case '2':
      return 'Communication';
    case '3':
      return 'Arrangements of the approval mark';
    case '4':
      return 'Certificate of Compliance for CSMS';
    case '5':
      return 'List of threats and corresponding mitigations';
    default:
      return '';
  }
}
|
|
307
|
+
|
|
308
|
+
/**
 * Fetch a UN regulation from EUR-Lex, parse it and write it out as seed JSON.
 *
 * Side effects: writes the raw HTML next to the output file for debugging,
 * writes the JSON to `outputPath`, and logs a summary to the console. When no
 * sections can be parsed, no JSON is written and the process exit code is set
 * to 1 so callers and CI notice the failure.
 *
 * @param celexId - CELEX id of the document; unknown ids get generic metadata.
 * @param outputPath - Destination of the JSON file.
 * @returns A promise that resolves once all files have been written.
 */
async function ingestUnRegulation(celexId: string, outputPath: string): Promise<void> {
  const metadata = UN_REGULATION_METADATA[celexId];
  if (!metadata) {
    console.warn(`Unknown CELEX ID: ${celexId}. Using generic metadata.`);
  }

  const html = await fetchEurLexHtml(celexId);
  console.log(`Fetched ${html.length} bytes`);

  // Save HTML for debugging. Only a trailing ".json" is swapped; any other
  // output name gets ".html" appended, so the dump never lands on the output
  // path itself.
  const jsonExtension = /\.json$/i;
  const debugHtmlPath = jsonExtension.test(outputPath)
    ? outputPath.replace(jsonExtension, '.html')
    : `${outputPath}.html`;
  writeFileSync(debugHtmlPath, html);

  const { articles, definitions } = parseUnRegulation(html, celexId);
  console.log(`Parsed ${articles.length} articles/sections, ${definitions.length} definitions`);

  if (articles.length === 0) {
    console.error('No sections found! The HTML structure may have changed.');
    // Signal failure without throwing, so the promise still resolves as before.
    process.exitCode = 1;
    return;
  }

  const regulation: RegulationData = {
    id: metadata?.id || celexId,
    full_name: metadata?.full_name || `UN Regulation ${celexId}`,
    celex_id: celexId,
    effective_date: metadata?.effective_date,
    eur_lex_url: `https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:${celexId}`,
    articles,
    definitions,
  };

  writeFileSync(outputPath, JSON.stringify(regulation, null, 2));

  const annexCount = articles.filter((a) => a.number.startsWith('Annex')).length;
  console.log(`\nSaved to: ${outputPath}`);
  console.log(`Sections: ${articles.length - annexCount}`);
  console.log(`Annexes: ${annexCount}`);
  console.log(`Definitions: ${definitions.length}`);

  // Print summary
  console.log('\nSections found:');
  for (const article of articles) {
    const preview = article.text.substring(0, 60).replace(/\n/g, ' ');
    console.log(` ${article.number}: ${article.title || '(no title)'} - ${preview}...`);
  }
}
|
|
351
|
+
|
|
352
|
+
// Script entry point: expects <celex_id> and <output_file> on the command line.
const [celexId, outputPath] = process.argv.slice(2);

if (!celexId || !outputPath) {
  // Missing arguments: print usage plus the CELEX ids this script knows, then fail.
  console.log('Usage: npx tsx scripts/ingest-unece.ts <celex_id> <output_file>');
  console.log('Example: npx tsx scripts/ingest-unece.ts 42021X0387 data/seed/un-r155.json');
  console.log('\nKnown UN/ECE CELEX IDs:');
  for (const [id, meta] of Object.entries(UN_REGULATION_METADATA)) {
    console.log(` ${id} - ${meta.id} (${meta.full_name})`);
  }
  process.exit(1);
}

ingestUnRegulation(celexId, outputPath).catch((failure) => {
  console.error('Error:', failure);
  process.exit(1);
});
|