quiz-gen 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,805 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ EUR-Lex Regulation HTML Parser - Simplified version
4
+ Builds flexible TOC (3-4 levels) and chunks only recitals and articles
5
+ Supports: Preamble > Chapters > Sections (optional) > Articles
6
+ """
7
+
8
import json
import os
import re
from collections import Counter
from dataclasses import asdict, dataclass, field
from enum import Enum
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup
15
+
16
+
17
class SectionType(Enum):
    """Types of regulation sections found in a EUR-Lex HTML document.

    The grouping comments below mirror the document hierarchy the parser
    builds (title > preamble/enacting terms > chapters/sections > articles).
    Members marked "CHUNK THIS" always produce RegulationChunk objects;
    concluding formulas, annexes and appendices are also chunked when their
    content is non-empty.
    """
    # Level 0 - Document title
    TITLE = "title"  # CHUNK THIS

    # Level 1 - Major sections
    PREAMBLE = "preamble"
    ENACTING_TERMS = "enacting_terms"
    CONCLUDING_FORMULAS = "concluding_formulas"
    ANNEX = "annex"
    APPENDIX = "appendix"

    # Level 2 - Preamble elements
    CITATION = "citation"  # CHUNK THIS
    RECITAL = "recital"  # CHUNK THIS

    # Level 2/3 - Structural
    CHAPTER = "chapter"
    SECTION = "section"

    # Level 3/4 - Content
    ARTICLE = "article"  # CHUNK THIS
39
+
40
+
41
@dataclass
class RegulationChunk:
    """A parsed chunk of the regulation (title, citation, recital, article,
    concluding formulas, or an annex/appendix part).

    Attributes:
        section_type: Kind of section this chunk represents.
        number: Section number (article/recital/annex number), if any.
        title: Human-readable title, if any.
        content: Plain-text content of the chunk.
        hierarchy_path: Titles from the document root down to this chunk.
        metadata: Free-form extras (e.g. the source HTML element id).
    """
    section_type: SectionType
    number: Optional[str]
    title: Optional[str]
    content: str
    # default_factory gives each instance its own container instead of the
    # None-sentinel workaround (mutable defaults must not be shared).
    hierarchy_path: Optional[List[str]] = field(default_factory=list)
    metadata: Optional[Dict] = field(default_factory=dict)

    def __post_init__(self):
        # Backward compatibility: callers that explicitly pass None still get
        # an empty container, exactly as before.
        if self.metadata is None:
            self.metadata = {}
        if self.hierarchy_path is None:
            self.hierarchy_path = []

    def to_dict(self) -> Dict:
        """Return a JSON-serializable dict with the enum flattened to its value."""
        data = asdict(self)
        data['section_type'] = self.section_type.value
        return data
61
+
62
+
63
+ class EURLexParser:
64
+ """Parse EUR-Lex HTML and build TOC + chunk recitals/articles"""
65
+
66
+ def __init__(self, url: str = None, html_content: str = None):
67
+ self.url = url
68
+ self.html_content = html_content
69
+ self.soup = None
70
+ self.chunks: List[RegulationChunk] = []
71
+ self.toc: Dict = {'title': '', 'sections': []}
72
+ self.current_hierarchy: List[str] = []
73
+ self.regulation_title: str = ''
74
+
75
+ def fetch(self) -> str:
76
+ """Fetch HTML content from URL"""
77
+ if not self.url:
78
+ raise ValueError("No URL provided")
79
+
80
+ headers = {'User-Agent': 'Mozilla/5.0'}
81
+ response = requests.get(self.url, headers=headers, timeout=30)
82
+ response.raise_for_status()
83
+ self.html_content = response.text
84
+ return self.html_content
85
+
86
+ def parse(self) -> tuple[List[RegulationChunk], Dict]:
87
+ """Parse and return (chunks, toc)"""
88
+ if not self.html_content:
89
+ if self.url:
90
+ self.fetch()
91
+ else:
92
+ raise ValueError("No HTML content or URL provided")
93
+
94
+ self.soup = BeautifulSoup(self.html_content, 'lxml-xml')
95
+
96
+ # Parse document structure
97
+ self._parse_title()
98
+ self._parse_preamble()
99
+ self._parse_enacting_terms()
100
+ self._parse_concluding_formulas()
101
+ self._parse_annexes()
102
+
103
+ print(f"\n✓ TOC built with {len(self.toc['sections'])} major sections (flexible 2-4 levels)")
104
+ print(f"✓ Created {len(self.chunks)} chunks (recitals + articles)")
105
+
106
+ return self.chunks, self.toc
107
+
108
+ def _parse_title(self):
109
+ """Parse main title and create a chunk for it"""
110
+ title_div = self.soup.find('div', class_='eli-main-title')
111
+ if title_div:
112
+ # Get all paragraphs from the title
113
+ title_paragraphs = title_div.find_all('p', class_='oj-doc-ti')
114
+ title_parts = [self._clean_text(p.get_text()) for p in title_paragraphs if p]
115
+
116
+ # First paragraph is typically the main title
117
+ if title_parts:
118
+ main_title = title_parts[0]
119
+ full_content = '\n\n'.join(title_parts)
120
+
121
+ # Store for use in hierarchy
122
+ self.regulation_title = main_title
123
+ self.toc['title'] = main_title
124
+
125
+ # CHUNK IT
126
+ chunk = RegulationChunk(
127
+ section_type=SectionType.TITLE,
128
+ number=None,
129
+ title=main_title,
130
+ content=full_content,
131
+ hierarchy_path=[main_title],
132
+ metadata={'id': title_div.get('id', '')}
133
+ )
134
+ self.chunks.append(chunk)
135
+
136
+ print(f"Parsed title: {main_title[:80]}...")
137
+
138
    def _parse_preamble(self):
        """Parse the preamble: one combined "Citation" chunk plus one chunk per recital.

        Side effects: appends a 'preamble' section (with citation/recital
        children) to ``self.toc`` and the corresponding RegulationChunk
        objects to ``self.chunks``.
        """
        preamble_section = {'type': 'preamble', 'title': 'Preamble', 'children': []}

        # Parse citations - combine all into one chunk
        citations = self.soup.find_all('div', class_='eli-subdivision', id=re.compile(r'^cit_\d+'))
        if citations:
            # Collect all citation text
            citation_parts = []
            citation_ids = []
            for cit in citations:
                para = cit.find('p', class_='oj-normal')
                if para:
                    text = self._clean_text(para.get_text())
                    if text:
                        citation_parts.append(text)
                        citation_ids.append(cit.get('id', ''))

            if citation_parts:
                # Add single citation entry to TOC
                preamble_section['children'].append({
                    'type': 'citation',
                    'title': 'Citation'
                })

                # CHUNK IT - single chunk with all citations
                # Use first citation ID for navigation (cit_1)
                hierarchy = [self.regulation_title, "Preamble", "Citation"] if self.regulation_title else ["Preamble", "Citation"]
                chunk = RegulationChunk(
                    section_type=SectionType.CITATION,
                    number=None,
                    title="Citation",
                    content='\n\n'.join(citation_parts),
                    hierarchy_path=hierarchy,
                    metadata={'id': citation_ids[0] if citation_ids else 'cit_1', 'citation_ids': citation_ids}
                )
                self.chunks.append(chunk)

        # Parse recitals. Each recital is laid out as a two-column table:
        # left cell holds the "(N)" number marker, right cell the text.
        recitals = self.soup.find_all('div', class_='eli-subdivision', id=re.compile(r'^rct_\d+'))
        for rct in recitals:
            table = rct.find('table')
            if table:
                rows = table.find_all('tr')
                for row in rows:
                    cells = row.find_all('td')
                    if len(cells) == 2:
                        num_text = self._clean_text(cells[0].get_text())
                        content = self._clean_text(cells[1].get_text())

                        # Only rows whose first cell is exactly "(N)" are recitals.
                        match = re.match(r'^\((\d+)\)$', num_text)
                        if match:
                            num = match.group(1)

                            # Add to TOC
                            preamble_section['children'].append({
                                'type': 'recital',
                                'number': num,
                                'title': f"Recital {num}"
                            })

                            # CHUNK IT
                            hierarchy = [self.regulation_title, "Preamble", f"Recital {num}"] if self.regulation_title else ["Preamble", f"Recital {num}"]
                            chunk = RegulationChunk(
                                section_type=SectionType.RECITAL,
                                number=num,
                                title=f"Recital {num}",
                                content=content,
                                hierarchy_path=hierarchy,
                                metadata={'id': rct.get('id', '')}
                            )
                            self.chunks.append(chunk)

        self.toc['sections'].append(preamble_section)
        print(f"Parsed preamble: {len(citations)} citations, {len(recitals)} recitals")
213
+
214
    def _parse_enacting_terms(self):
        """Parse chapters/sections/articles: all go to the TOC; only articles are chunked.

        Handles both layouts seen in EUR-Lex documents: articles nested
        under CHAPTER (and optionally SECTION) divs, or articles placed
        directly under the enacting terms with no chapters at all.
        """
        enacting_section = {'type': 'enacting_terms', 'title': 'Enacting Terms', 'children': []}

        # Find all chapters
        chapters = self.soup.find_all('div', id=re.compile(r'^cpt_'))

        # Check if there are chapters
        if not chapters:
            # No chapters found - articles might be directly under enacting terms
            # Look for articles at the top level
            articles = self.soup.find_all('div', class_='eli-subdivision', id=re.compile(r'^art_\d+'))

            if articles:
                # Articles without chapters - parse them directly
                hierarchy = [self.regulation_title, "Enacting Terms"] if self.regulation_title else ["Enacting Terms"]
                for art_div in articles:
                    self._parse_article(art_div, enacting_section, hierarchy)

            self.toc['sections'].append(enacting_section)
            print(f"Parsed enacting terms: {len(articles)} articles (no chapters)")
            return

        for chapter_div in chapters:
            # Get chapter number and title
            chapter_p = chapter_div.find('p', class_='oj-ti-section-1')
            if not chapter_p:
                continue

            chapter_text = self._clean_text(chapter_p.get_text())
            # Chapter numbers may be roman numerals or digits.
            chapter_match = re.match(r'CHAPTER\s+([IVXLCDM]+|\d+)', chapter_text, re.I)

            if chapter_match:
                chapter_num = chapter_match.group(1)

                # Get subtitle (might be in oj-ti-section-2 or in eli-title)
                subtitle_p = chapter_div.find('p', class_='oj-ti-section-2')
                if not subtitle_p:
                    # Try to find title in eli-title container
                    title_div = chapter_div.find('div', class_='eli-title')
                    if title_div:
                        subtitle_p = title_div.find('p', class_='oj-ti-section-2')

                chapter_title = self._clean_text(subtitle_p.get_text()) if subtitle_p else ''

                full_title = f"CHAPTER {chapter_num}" + (f" - {chapter_title}" if chapter_title else "")

                chapter_toc = {
                    'type': 'chapter',
                    'number': chapter_num,
                    'title': full_title,
                    'children': []
                }

                # Update hierarchy (shared state read by _parse_article calls below)
                self.current_hierarchy = [self.regulation_title, full_title] if self.regulation_title else [full_title]

                # Check if this chapter has sections (ids like "cpt_II.sct_1")
                sections = chapter_div.find_all('div', id=re.compile(r'^cpt_[^.]+\.sct_'), recursive=False)

                if sections:
                    # Chapter has sections - parse them
                    for section_div in sections:
                        section_id = section_div.get('id', '')

                        # Get section title - look for SECTION I, SECTION II, etc.
                        section_title_p = section_div.find('p', class_='oj-ti-section-1')
                        if section_title_p:
                            section_text = self._clean_text(section_title_p.get_text())
                            section_match = re.match(r'SECTION\s+([IVXLCDM]+|\d+)', section_text, re.I)

                            if section_match:
                                section_num = section_match.group(1)

                                # Get section subtitle
                                section_subtitle_p = section_div.find('p', class_='oj-ti-section-2')
                                if not section_subtitle_p:
                                    # Try in eli-title
                                    title_div = section_div.find('div', class_='eli-title')
                                    if title_div:
                                        section_subtitle_p = title_div.find('p', class_='oj-ti-section-2')

                                section_title = self._clean_text(section_subtitle_p.get_text()) if section_subtitle_p else ''

                                section_full_title = f"SECTION {section_num}" + (f" - {section_title}" if section_title else "")

                                section_toc = {
                                    'type': 'section',
                                    'number': section_num,
                                    'title': section_full_title,
                                    'children': []
                                }

                                # Update hierarchy to include section
                                section_hierarchy = self.current_hierarchy + [section_full_title]

                                # Find articles within this section
                                articles = section_div.find_all('div', class_='eli-subdivision', id=re.compile(r'^art_\d+'))

                                for art_div in articles:
                                    self._parse_article(art_div, section_toc, section_hierarchy)

                                chapter_toc['children'].append(section_toc)
                else:
                    # No sections - articles directly under chapter
                    articles = chapter_div.find_all('div', class_='eli-subdivision', id=re.compile(r'^art_\d+'))

                    for art_div in articles:
                        self._parse_article(art_div, chapter_toc, self.current_hierarchy)

                enacting_section['children'].append(chapter_toc)

        self.toc['sections'].append(enacting_section)
        print(f"Parsed enacting terms: {len(chapters)} chapters")
328
+
329
    def _parse_article(self, art_div, parent_toc, hierarchy_path):
        """Parse a single article: add a TOC entry under *parent_toc* and chunk it.

        Args:
            art_div: BeautifulSoup element for the article container div.
            parent_toc: TOC node whose 'children' list receives the entry.
            hierarchy_path: Titles leading to this article (copied, not mutated).

        NOTE(review): methods 1-3 below each scan overlapping parts of the
        article markup, so some text may appear more than once in the chunk
        content — confirm against real documents before changing the order.
        """
        art_p = art_div.find('p', class_='oj-ti-art')
        if not art_p:
            return

        art_text = self._clean_text(art_p.get_text())
        # Article numbers may carry a letter suffix (e.g. "Article 14a").
        art_match = re.search(r'Article\s+(\d+[a-z]*)', art_text, re.I)

        if art_match:
            art_num = art_match.group(1)

            # Get article subtitle
            art_subtitle_p = art_div.find('p', class_='oj-sti-art')
            art_subtitle = self._clean_text(art_subtitle_p.get_text()) if art_subtitle_p else ''

            art_full_title = f"Article {art_num}" + (f" - {art_subtitle}" if art_subtitle else "")

            # Collect article content
            content_parts = []

            # Method 1: Look for content divs (numbered paragraphs like 1.1, 1.2)
            content_divs = art_div.find_all('div', id=re.compile(r'^\d+\.\d+'))
            for content_div in content_divs:
                paras = content_div.find_all('p', class_='oj-normal')
                for para in paras:
                    text = self._clean_text(para.get_text())
                    if text:
                        content_parts.append(text)

            # Method 2: Look for direct paragraphs (intro text before tables)
            direct_paras = art_div.find_all('p', class_='oj-normal', recursive=False)
            for para in direct_paras:
                text = self._clean_text(para.get_text())
                if text:
                    content_parts.append(text)

            # Method 3: Look for tables (list items like (a), (b), (c))
            tables = art_div.find_all('table')
            for table in tables:
                rows = table.find_all('tr')
                for row in rows:
                    cells = row.find_all('td')
                    if len(cells) == 2:
                        # First cell is typically the list marker (a), (b), etc.
                        marker = self._clean_text(cells[0].get_text())
                        text = self._clean_text(cells[1].get_text())
                        if text:
                            # Combine marker and text
                            content_parts.append(f"{marker} {text}")

            full_content = '\n\n'.join(content_parts)
            # Apply cleaning to the combined content to fix list formatting
            full_content = self._clean_combined_text(full_content)

            # Add to TOC
            parent_toc['children'].append({
                'type': 'article',
                'number': art_num,
                'title': art_full_title
            })

            # CHUNK IT (fall back to the subtitle when no body text was found)
            chunk = RegulationChunk(
                section_type=SectionType.ARTICLE,
                number=art_num,
                title=art_full_title,
                content=full_content or art_subtitle,
                hierarchy_path=hierarchy_path + [art_full_title],
                metadata={'id': art_div.get('id', ''), 'subtitle': art_subtitle}
            )
            self.chunks.append(chunk)
401
+
402
    def _parse_concluding_formulas(self):
        """Parse the concluding formulas (final provisions + signatories) into one chunk.

        Looks for the first ``fnp_N`` subdivision; if it contains an
        ``oj-final`` div with any text, adds a TOC entry and a chunk.
        """
        concluding_div = self.soup.find('div', class_='eli-subdivision', id=re.compile(r'^fnp_\d+'))
        if concluding_div:
            # Get all content from the concluding formulas
            final_div = concluding_div.find('div', class_='oj-final')
            if final_div:
                content_parts = []
                paras = final_div.find_all('p', class_='oj-normal')
                for para in paras:
                    text = self._clean_text(para.get_text())
                    if text:
                        content_parts.append(text)

                # Get signatory information (names/titles joined line by line)
                signatories = final_div.find_all('div', class_='oj-signatory')
                for sig in signatories:
                    sig_paras = sig.find_all('p', class_='oj-signatory')
                    sig_parts = [self._clean_text(p.get_text()) for p in sig_paras if p]
                    if sig_parts:
                        content_parts.append('\n'.join(sig_parts))

                full_content = '\n\n'.join(content_parts)

                if full_content:
                    # Add to TOC
                    concluding_section = {
                        'type': 'concluding_formulas',
                        'title': 'Concluding formulas'
                    }
                    self.toc['sections'].append(concluding_section)

                    # CHUNK IT
                    hierarchy = [self.regulation_title, "Concluding formulas"] if self.regulation_title else ["Concluding formulas"]
                    chunk = RegulationChunk(
                        section_type=SectionType.CONCLUDING_FORMULAS,
                        number=None,
                        title="Concluding formulas",
                        content=full_content,
                        hierarchy_path=hierarchy,
                        metadata={'id': concluding_div.get('id', '')}
                    )
                    self.chunks.append(chunk)
                    print(f"Parsed concluding formulas")
446
+
447
    def _parse_annexes(self):
        """Parse annexes and appendices, creating one chunk per annex or per PART.

        Appendices are identified by ``.app_`` in the element id
        (e.g. ``anx_1.app_1``). When an annex contains PART headers
        (``oj-ti-grseq-1`` paragraphs matching "PART N"), each part becomes
        its own chunk; otherwise the whole annex is one chunk.

        NOTE(review): the part-content walk compares elements with ``==``;
        BeautifulSoup Tag equality is structural, so two identical PART
        headers could compare equal — confirm ids are unique in practice.
        """
        # Find all annexes and appendices
        annexes = self.soup.find_all('div', class_='eli-container', id=re.compile(r'^anx_'))

        if not annexes:
            return

        for annex_div in annexes:
            # Get annex/appendix ID
            annex_id = annex_div.get('id', '')

            # Check if this is an appendix (contains .app_)
            is_appendix = '.app_' in annex_id

            if is_appendix:
                # Parse as appendix: anx_1.app_1
                app_match = re.match(r'^anx_(\d+)\.app_(\d+)', annex_id)
                if app_match:
                    annex_num = app_match.group(1)
                    app_num = app_match.group(2)
                    section_type = SectionType.APPENDIX
                    identifier = f"{annex_num}.{app_num}"
                else:
                    # Fallback: derive a dotted identifier from the raw id
                    identifier = annex_id.replace('anx_', '').replace('.app_', '.')
                    section_type = SectionType.APPENDIX
            else:
                # Parse as annex: anx_1, anx_I, etc.
                annex_match = re.match(r'^anx_([IVXLCDM]+|\d+[A-Z]?)', annex_id, re.I)
                identifier = annex_match.group(1) if annex_match else annex_id.replace('anx_', '')
                section_type = SectionType.ANNEX

            # Get title from first oj-doc-ti paragraph
            title_paragraphs = annex_div.find_all('p', class_='oj-doc-ti')
            if title_paragraphs:
                # Get first paragraph as main title
                title_text = self._clean_text(title_paragraphs[0].get_text())
                # If there are multiple oj-doc-ti paragraphs, combine them as subtitle
                # (but NOT oj-ti-grseq-1 which typically contains PART numbers within the annex)
                if len(title_paragraphs) > 1:
                    subtitle_parts = [self._clean_text(p.get_text()) for p in title_paragraphs[1:]]
                    subtitle = ' '.join(subtitle_parts)
                else:
                    subtitle = ''
            else:
                # Fallback title
                if is_appendix:
                    title_text = f"APPENDIX {identifier}"
                else:
                    title_text = f"ANNEX {identifier}"
                subtitle = ''

            full_title = title_text + (f" - {subtitle}" if subtitle else "")

            # Create a base title with identifier for use in parts
            # This ensures parts are titled like "ANNEX 1 - PART 1" not just "ANNEX - PART 1"
            if is_appendix:
                base_title_with_id = f"APPENDIX {identifier}"
            else:
                # Check if title_text already contains the identifier
                if identifier and identifier.upper() not in title_text.upper():
                    base_title_with_id = f"ANNEX {identifier}"
                else:
                    base_title_with_id = title_text

            # Check if annex contains parts (oj-ti-grseq-1 with PART X pattern)
            part_headers = annex_div.find_all('p', class_='oj-ti-grseq-1')
            parts_detected = []
            for part_header in part_headers:
                part_text = self._clean_text(part_header.get_text())
                # Match "PART 1", "PART I", "Part 1", etc.
                part_match = re.match(r'^PART\s+([IVXLCDM]+|\d+)', part_text, re.I)
                if part_match:
                    parts_detected.append({
                        'element': part_header,
                        'number': part_match.group(1),
                        'title': part_text
                    })

            # If parts detected, create separate chunks for each part
            if parts_detected:
                # Add annex to TOC with parts as children
                toc_entry = {
                    'type': 'appendix' if is_appendix else 'annex',
                    'number': identifier,
                    'title': base_title_with_id,  # Use base_title_with_id to show "ANNEX 1" not "ANNEX"
                    'children': []
                }

                hierarchy_base = [self.regulation_title, base_title_with_id] if self.regulation_title else [base_title_with_id]

                # Process each part
                for i, part_info in enumerate(parts_detected):
                    part_elem = part_info['element']
                    part_num = part_info['number']
                    part_title = part_info['title']

                    # Collect content from this part until the next part or end of annex
                    content_parts = []

                    # Get the next part element to know where to stop
                    next_part_elem = parts_detected[i + 1]['element'] if i + 1 < len(parts_detected) else None

                    # Find all elements in the annex_div, and collect those between current and next part
                    collecting = False
                    for elem in annex_div.descendants:
                        # Start collecting after we find the part header
                        if elem == part_elem:
                            collecting = True
                            continue

                        # Stop if we hit the next part header
                        if next_part_elem and elem == next_part_elem:
                            break

                        if not collecting:
                            continue

                        # Collect normal paragraphs (len > 10 filters stray fragments)
                        if elem.name == 'p' and 'oj-normal' in elem.get('class', []):
                            text = self._clean_text(elem.get_text())
                            if text and len(text) > 10:
                                content_parts.append(text)

                        # Collect tables, rendered as "cell | cell" rows
                        elif elem.name == 'table' and 'oj-table' in elem.get('class', []):
                            # Get table headers
                            headers = []
                            header_cells = elem.find_all('p', class_='oj-tbl-hdr')
                            for hdr in header_cells:
                                text = self._clean_text(hdr.get_text())
                                if text:
                                    headers.append(text)

                            if headers:
                                content_parts.append(' | '.join(headers))
                                content_parts.append('-' * 40)

                            # Get table rows
                            rows = elem.find_all('tr', class_='oj-table')
                            for row in rows:
                                cells = row.find_all('td', class_='oj-table')
                                cell_texts = []
                                for cell in cells:
                                    cell_para = cell.find('p')
                                    if cell_para:
                                        text = self._clean_text(cell_para.get_text())
                                        if text and 'oj-tbl-hdr' not in cell_para.get('class', []):
                                            cell_texts.append(text)

                                if cell_texts:
                                    content_parts.append(' | '.join(cell_texts))

                    part_content = '\n\n'.join(content_parts)
                    # Use base_title_with_id to include annex/appendix number in part titles
                    part_full_title = f"{base_title_with_id} - {part_title}"

                    # Add to TOC
                    toc_entry['children'].append({
                        'type': 'part',
                        'number': part_num,
                        'title': part_title
                    })

                    # CHUNK IT
                    chunk = RegulationChunk(
                        section_type=section_type,
                        number=f"{identifier}.{part_num}",
                        title=part_full_title,
                        content=part_content,
                        hierarchy_path=hierarchy_base + [part_title],
                        metadata={'id': annex_id, 'part': part_num}
                    )
                    self.chunks.append(chunk)

                self.toc['sections'].append(toc_entry)

            else:
                # No parts - treat as single chunk (original behavior)
                # Collect annex content (all text from paragraphs and tables)
                content_parts = []

                # Get all normal paragraphs
                paras = annex_div.find_all('p', class_='oj-normal')
                for para in paras:
                    text = self._clean_text(para.get_text())
                    if text and len(text) > 10:  # Filter out very short text
                        content_parts.append(text)

                # Check if this is a table-based annex (like correlation tables)
                tables = annex_div.find_all('table', class_='oj-table')
                for table in tables:
                    # Get table headers
                    headers = []
                    header_cells = table.find_all('p', class_='oj-tbl-hdr')
                    for hdr in header_cells:
                        text = self._clean_text(hdr.get_text())
                        if text:
                            headers.append(text)

                    if headers:
                        content_parts.append(' | '.join(headers))
                        content_parts.append('-' * 40)

                    # Get table rows
                    rows = table.find_all('tr', class_='oj-table')
                    for row in rows:
                        cells = row.find_all('td', class_='oj-table')
                        cell_texts = []
                        for cell in cells:
                            cell_para = cell.find('p')
                            if cell_para:
                                text = self._clean_text(cell_para.get_text())
                                if text and 'oj-tbl-hdr' not in cell_para.get('class', []):
                                    cell_texts.append(text)

                        if cell_texts:
                            content_parts.append(' | '.join(cell_texts))

                # Join all content parts - no truncation
                full_content = '\n\n'.join(content_parts)

                # Create chunk even if content is just title/subtitle (for index purposes)
                if not full_content and subtitle:
                    full_content = subtitle

                if full_content or subtitle:
                    # Add to TOC
                    toc_entry = {
                        'type': 'appendix' if is_appendix else 'annex',
                        'number': identifier,
                        'title': full_title
                    }
                    self.toc['sections'].append(toc_entry)

                    # CHUNK IT
                    hierarchy = [self.regulation_title, full_title] if self.regulation_title else [full_title]
                    chunk = RegulationChunk(
                        section_type=section_type,
                        number=identifier,
                        title=full_title,
                        content=full_content,
                        hierarchy_path=hierarchy,
                        metadata={'id': annex_id, 'subtitle': subtitle}
                    )
                    self.chunks.append(chunk)

        # Count annexes vs appendices for reporting
        annex_count = sum(1 for a in annexes if '.app_' not in a.get('id', ''))
        appendix_count = sum(1 for a in annexes if '.app_' in a.get('id', ''))
        if annex_count and appendix_count:
            print(f"Parsed {annex_count} annexes and {appendix_count} appendices")
        elif annex_count:
            print(f"Parsed {annex_count} annexes")
        elif appendix_count:
            print(f"Parsed {appendix_count} appendices")
704
+
705
+ @staticmethod
706
+ def _clean_text(text: str) -> str:
707
+ """Clean and normalize text for individual paragraphs"""
708
+ text = re.sub(r'\s+', ' ', text)
709
+ return text.strip()
710
+
711
+ @staticmethod
712
+ def _clean_combined_text(text: str) -> str:
713
+ """Clean combined text content, fixing list and paragraph formatting"""
714
+ # Fix list items: (a)\n\n should become (a) with text on same line
715
+ text = re.sub(r'\(([a-z]+|[ivx]+)\)\n\n', r'(\1) ', text)
716
+
717
+ # Fix numbered list items within content
718
+ text = re.sub(r'\n\n(\d+\.)\n\n', r'\n\n\1 ', text)
719
+
720
+ return text
721
+
722
+ def save_chunks(self, filepath: str):
723
+ """Save chunks to JSON"""
724
+ data = [chunk.to_dict() for chunk in self.chunks]
725
+ with open(filepath, 'w', encoding='utf-8') as f:
726
+ json.dump(data, f, indent=2, ensure_ascii=False)
727
+ print(f"Saved {len(data)} chunks to {filepath}")
728
+
729
+ def save_toc(self, filepath: str):
730
+ """Save TOC to JSON"""
731
+ with open(filepath, 'w', encoding='utf-8') as f:
732
+ json.dump(self.toc, f, indent=2, ensure_ascii=False)
733
+ print(f"Saved TOC to {filepath}")
734
+
735
+ def print_toc(self):
736
+ """Print formatted TOC showing flexible hierarchy (2-4 levels)"""
737
+ print("\n" + "="*70)
738
+ print("TABLE OF CONTENTS")
739
+ print("="*70)
740
+
741
+ # Print regulation title
742
+ if self.toc.get('title'):
743
+ print(f"\n{self.toc['title']}")
744
+
745
+ for section in self.toc['sections']:
746
+ # Level 1: Major sections
747
+ print(f"\n{section['title'].upper()}")
748
+
749
+ # Handle sections without children (like concluding formulas, annexes)
750
+ if 'children' not in section:
751
+ continue
752
+
753
+ for child in section.get('children', []):
754
+ if child['type'] in ['citation', 'recital']:
755
+ # Level 2: Citations and Recitals
756
+ print(f" {child['title']}")
757
+ elif child['type'] == 'chapter':
758
+ # Level 2: Chapters
759
+ print(f" {child['title']}")
760
+ # Level 3: Sections or Articles
761
+ for item in child.get('children', []):
762
+ if item['type'] == 'section':
763
+ # Level 3: Sections
764
+ print(f" {item['title']}")
765
+ # Level 4: Articles within sections
766
+ for art in item.get('children', []):
767
+ print(f" {art['title']}")
768
+ elif item['type'] == 'article':
769
+ # Level 3: Articles (when no sections)
770
+ print(f" {item['title']}")
771
+
772
+
773
def main():
    """Fetch, parse and save a sample EUR-Lex regulation (EASA UAS rules).

    Downloads CELEX:32019R0947, prints the TOC and a per-type chunk
    summary, then writes the chunks and TOC under ``data/processed/``.
    """
    url = "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32019R0947"

    print(f"Parsing: {url}\n")

    parser = EURLexParser(url=url)
    chunks, toc = parser.parse()

    # Print TOC
    parser.print_toc()

    # Print summary of chunk counts per section type.
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Total chunks: {len(chunks)}")
    by_type = Counter(chunk.section_type.value for chunk in chunks)
    for t, count in sorted(by_type.items()):
        print(f"  {t}: {count}")

    # Ensure the output directory exists before writing — previously this
    # crashed with FileNotFoundError on a fresh checkout.
    os.makedirs('data/processed', exist_ok=True)
    parser.save_chunks('data/processed/easa_chunks.json')
    parser.save_toc('data/processed/easa_toc.json')

    print("\n✓ Done!")
802
+
803
+
804
# Script entry point: run the demo parse when executed directly.
if __name__ == "__main__":
    main()