quiz-gen 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quiz_gen/__init__.py +23 -0
- quiz_gen/__version__.py +13 -0
- quiz_gen/agents/__init__.py +0 -0
- quiz_gen/agents/answer_generator.py +0 -0
- quiz_gen/agents/base_agent.py +0 -0
- quiz_gen/agents/orchestrator.py +0 -0
- quiz_gen/agents/question_generator.py +0 -0
- quiz_gen/agents/reviewer.py +0 -0
- quiz_gen/agents/validator.py +0 -0
- quiz_gen/cli.py +209 -0
- quiz_gen/config.py +0 -0
- quiz_gen/models/__init__.py +0 -0
- quiz_gen/models/chunk.py +0 -0
- quiz_gen/models/document.py +0 -0
- quiz_gen/models/question.py +0 -0
- quiz_gen/models/quiz.py +0 -0
- quiz_gen/parsers/__init__.py +13 -0
- quiz_gen/parsers/base.py +0 -0
- quiz_gen/parsers/html/eu_lex_parser.py +805 -0
- quiz_gen/parsers/pdf_parser.py +0 -0
- quiz_gen/parsers/utils.py +0 -0
- quiz_gen/storage/__init__.py +0 -0
- quiz_gen/storage/base.py +0 -0
- quiz_gen/storage/database.py +0 -0
- quiz_gen/storage/json_storage.py +0 -0
- quiz_gen/utils/__init__.py +0 -0
- quiz_gen/utils/helpers.py +0 -0
- quiz_gen/utils/logging.py +0 -0
- quiz_gen/validation/__init__.py +0 -0
- quiz_gen/validation/human_feedback.py +0 -0
- quiz_gen/validation/quality_checker.py +0 -0
- quiz_gen-0.1.5.dist-info/METADATA +395 -0
- quiz_gen-0.1.5.dist-info/RECORD +37 -0
- quiz_gen-0.1.5.dist-info/WHEEL +5 -0
- quiz_gen-0.1.5.dist-info/entry_points.txt +2 -0
- quiz_gen-0.1.5.dist-info/licenses/LICENSE +21 -0
- quiz_gen-0.1.5.dist-info/top_level.txt +1 -0
quiz_gen/parsers/html/eu_lex_parser.py
@@ -0,0 +1,805 @@
#!/usr/bin/env python3
"""
EUR-Lex Regulation HTML Parser - Simplified version
Builds a flexible TOC (3-4 levels) and chunks recitals, articles, and other key sections
Supports: Preamble > Chapters > Sections (optional) > Articles
"""

import re
import requests
from typing import List, Dict, Optional
from bs4 import BeautifulSoup
from dataclasses import dataclass, asdict
from enum import Enum
import json


class SectionType(Enum):
    """Types of regulation sections"""
    # Level 0 - Document title
    TITLE = "title"  # CHUNK THIS

    # Level 1 - Major sections
    PREAMBLE = "preamble"
    ENACTING_TERMS = "enacting_terms"
    CONCLUDING_FORMULAS = "concluding_formulas"
    ANNEX = "annex"
    APPENDIX = "appendix"

    # Level 2 - Preamble elements
    CITATION = "citation"  # CHUNK THIS
    RECITAL = "recital"  # CHUNK THIS

    # Level 2/3 - Structural
    CHAPTER = "chapter"
    SECTION = "section"

    # Level 3/4 - Content
    ARTICLE = "article"  # CHUNK THIS


@dataclass
class RegulationChunk:
    """Represents a parsed chunk of the regulation (title, citation, recital, article, or annex)"""
    section_type: SectionType
    number: Optional[str]
    title: Optional[str]
    content: str
    hierarchy_path: Optional[List[str]] = None
    metadata: Dict = None

    def __post_init__(self):
        if self.metadata is None:
            self.metadata = {}
        if self.hierarchy_path is None:
            self.hierarchy_path = []

    def to_dict(self) -> Dict:
        data = asdict(self)
        data['section_type'] = self.section_type.value
        return data


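# Illustrative only (not part of the parser): a recital chunk serialized via
# to_dict() comes out roughly like this; field values depend on the document:
#
#   {
#       "section_type": "recital",
#       "number": "1",
#       "title": "Recital 1",
#       "content": "...",
#       "hierarchy_path": ["<regulation title>", "Preamble", "Recital 1"],
#       "metadata": {"id": "rct_1"}
#   }
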
class EURLexParser:
    """Parse EUR-Lex HTML and build TOC + chunk recitals/articles"""

    def __init__(self, url: str = None, html_content: str = None):
        self.url = url
        self.html_content = html_content
        self.soup = None
        self.chunks: List[RegulationChunk] = []
        self.toc: Dict = {'title': '', 'sections': []}
        self.current_hierarchy: List[str] = []
        self.regulation_title: str = ''

    def fetch(self) -> str:
        """Fetch HTML content from URL"""
        if not self.url:
            raise ValueError("No URL provided")

        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(self.url, headers=headers, timeout=30)
        response.raise_for_status()
        self.html_content = response.text
        return self.html_content

    def parse(self) -> tuple[List[RegulationChunk], Dict]:
        """Parse and return (chunks, toc)"""
        if not self.html_content:
            if self.url:
                self.fetch()
            else:
                raise ValueError("No HTML content or URL provided")

        self.soup = BeautifulSoup(self.html_content, 'lxml-xml')

        # Parse document structure
        self._parse_title()
        self._parse_preamble()
        self._parse_enacting_terms()
        self._parse_concluding_formulas()
        self._parse_annexes()

        print(f"\n✓ TOC built with {len(self.toc['sections'])} major sections (flexible 2-4 levels)")
        print(f"✓ Created {len(self.chunks)} chunks (recitals + articles)")

        return self.chunks, self.toc

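    # Rough sketch of the TOC dict that parse() returns alongside the chunks
    # (illustrative shape only; titles and nesting depend on the document):
    #
    #   {
    #       "title": "<regulation title>",
    #       "sections": [
    #           {"type": "preamble", "title": "Preamble", "children": [...]},
    #           {"type": "enacting_terms", "title": "Enacting Terms",
    #            "children": [{"type": "chapter", ..., "children": [...]}]},
    #           ...
    #       ]
    #   }
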
    def _parse_title(self):
        """Parse main title and create a chunk for it"""
        title_div = self.soup.find('div', class_='eli-main-title')
        if title_div:
            # Get all paragraphs from the title
            title_paragraphs = title_div.find_all('p', class_='oj-doc-ti')
            title_parts = [self._clean_text(p.get_text()) for p in title_paragraphs if p]

            # First paragraph is typically the main title
            if title_parts:
                main_title = title_parts[0]
                full_content = '\n\n'.join(title_parts)

                # Store for use in hierarchy
                self.regulation_title = main_title
                self.toc['title'] = main_title

                # CHUNK IT
                chunk = RegulationChunk(
                    section_type=SectionType.TITLE,
                    number=None,
                    title=main_title,
                    content=full_content,
                    hierarchy_path=[main_title],
                    metadata={'id': title_div.get('id', '')}
                )
                self.chunks.append(chunk)

                print(f"Parsed title: {main_title[:80]}...")

    def _parse_preamble(self):
        """Parse the preamble: add a single citation entry and all recitals to the TOC; chunk the combined citations and each recital"""
        preamble_section = {'type': 'preamble', 'title': 'Preamble', 'children': []}

        # Parse citations - combine all into one chunk
        citations = self.soup.find_all('div', class_='eli-subdivision', id=re.compile(r'^cit_\d+'))
        if citations:
            # Collect all citation text
            citation_parts = []
            citation_ids = []
            for cit in citations:
                para = cit.find('p', class_='oj-normal')
                if para:
                    text = self._clean_text(para.get_text())
                    if text:
                        citation_parts.append(text)
                        citation_ids.append(cit.get('id', ''))

            if citation_parts:
                # Add single citation entry to TOC
                preamble_section['children'].append({
                    'type': 'citation',
                    'title': 'Citation'
                })

                # CHUNK IT - single chunk with all citations
                # Use first citation ID for navigation (cit_1)
                hierarchy = [self.regulation_title, "Preamble", "Citation"] if self.regulation_title else ["Preamble", "Citation"]
                chunk = RegulationChunk(
                    section_type=SectionType.CITATION,
                    number=None,
                    title="Citation",
                    content='\n\n'.join(citation_parts),
                    hierarchy_path=hierarchy,
                    metadata={'id': citation_ids[0] if citation_ids else 'cit_1', 'citation_ids': citation_ids}
                )
                self.chunks.append(chunk)

        # Parse recitals
        recitals = self.soup.find_all('div', class_='eli-subdivision', id=re.compile(r'^rct_\d+'))
        for rct in recitals:
            table = rct.find('table')
            if table:
                rows = table.find_all('tr')
                for row in rows:
                    cells = row.find_all('td')
                    if len(cells) == 2:
                        num_text = self._clean_text(cells[0].get_text())
                        content = self._clean_text(cells[1].get_text())

                        match = re.match(r'^\((\d+)\)$', num_text)
                        if match:
                            num = match.group(1)

                            # Add to TOC
                            preamble_section['children'].append({
                                'type': 'recital',
                                'number': num,
                                'title': f"Recital {num}"
                            })

                            # CHUNK IT
                            hierarchy = [self.regulation_title, "Preamble", f"Recital {num}"] if self.regulation_title else ["Preamble", f"Recital {num}"]
                            chunk = RegulationChunk(
                                section_type=SectionType.RECITAL,
                                number=num,
                                title=f"Recital {num}",
                                content=content,
                                hierarchy_path=hierarchy,
                                metadata={'id': rct.get('id', '')}
                            )
                            self.chunks.append(chunk)

        self.toc['sections'].append(preamble_section)
        print(f"Parsed preamble: {len(citations)} citations, {len(recitals)} recitals")

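    # The recital loop above assumes EUR-Lex markup shaped roughly like this
    # (simplified sketch, not verbatim EUR-Lex output):
    #
    #   <div class="eli-subdivision" id="rct_1">
    #     <table><tr>
    #       <td><p>(1)</p></td>
    #       <td><p>Text of the recital ...</p></td>
    #     </tr></table>
    #   </div>
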
    def _parse_enacting_terms(self):
        """Parse chapters/sections/articles: add everything to the TOC, but chunk articles only"""
        enacting_section = {'type': 'enacting_terms', 'title': 'Enacting Terms', 'children': []}

        # Find all chapters
        chapters = self.soup.find_all('div', id=re.compile(r'^cpt_'))

        # Check if there are chapters
        if not chapters:
            # No chapters found - articles might be directly under enacting terms
            # Look for articles at the top level
            articles = self.soup.find_all('div', class_='eli-subdivision', id=re.compile(r'^art_\d+'))

            if articles:
                # Articles without chapters - parse them directly
                hierarchy = [self.regulation_title, "Enacting Terms"] if self.regulation_title else ["Enacting Terms"]
                for art_div in articles:
                    self._parse_article(art_div, enacting_section, hierarchy)

                self.toc['sections'].append(enacting_section)
                print(f"Parsed enacting terms: {len(articles)} articles (no chapters)")
            return

        for chapter_div in chapters:
            # Get chapter number and title
            chapter_p = chapter_div.find('p', class_='oj-ti-section-1')
            if not chapter_p:
                continue

            chapter_text = self._clean_text(chapter_p.get_text())
            chapter_match = re.match(r'CHAPTER\s+([IVXLCDM]+|\d+)', chapter_text, re.I)

            if chapter_match:
                chapter_num = chapter_match.group(1)

                # Get subtitle (might be in oj-ti-section-2 or in eli-title)
                subtitle_p = chapter_div.find('p', class_='oj-ti-section-2')
                if not subtitle_p:
                    # Try to find title in eli-title container
                    title_div = chapter_div.find('div', class_='eli-title')
                    if title_div:
                        subtitle_p = title_div.find('p', class_='oj-ti-section-2')

                chapter_title = self._clean_text(subtitle_p.get_text()) if subtitle_p else ''

                full_title = f"CHAPTER {chapter_num}" + (f" - {chapter_title}" if chapter_title else "")

                chapter_toc = {
                    'type': 'chapter',
                    'number': chapter_num,
                    'title': full_title,
                    'children': []
                }

                # Update hierarchy
                self.current_hierarchy = [self.regulation_title, full_title] if self.regulation_title else [full_title]

                # Check if this chapter has sections
                sections = chapter_div.find_all('div', id=re.compile(r'^cpt_[^.]+\.sct_'), recursive=False)

                if sections:
                    # Chapter has sections - parse them
                    for section_div in sections:
                        section_id = section_div.get('id', '')

                        # Get section title - look for SECTION I, SECTION II, etc.
                        section_title_p = section_div.find('p', class_='oj-ti-section-1')
                        if section_title_p:
                            section_text = self._clean_text(section_title_p.get_text())
                            section_match = re.match(r'SECTION\s+([IVXLCDM]+|\d+)', section_text, re.I)

                            if section_match:
                                section_num = section_match.group(1)

                                # Get section subtitle
                                section_subtitle_p = section_div.find('p', class_='oj-ti-section-2')
                                if not section_subtitle_p:
                                    # Try in eli-title
                                    title_div = section_div.find('div', class_='eli-title')
                                    if title_div:
                                        section_subtitle_p = title_div.find('p', class_='oj-ti-section-2')

                                section_title = self._clean_text(section_subtitle_p.get_text()) if section_subtitle_p else ''

                                section_full_title = f"SECTION {section_num}" + (f" - {section_title}" if section_title else "")

                                section_toc = {
                                    'type': 'section',
                                    'number': section_num,
                                    'title': section_full_title,
                                    'children': []
                                }

                                # Update hierarchy to include section
                                section_hierarchy = self.current_hierarchy + [section_full_title]

                                # Find articles within this section
                                articles = section_div.find_all('div', class_='eli-subdivision', id=re.compile(r'^art_\d+'))

                                for art_div in articles:
                                    self._parse_article(art_div, section_toc, section_hierarchy)

                                chapter_toc['children'].append(section_toc)
                else:
                    # No sections - articles directly under chapter
                    articles = chapter_div.find_all('div', class_='eli-subdivision', id=re.compile(r'^art_\d+'))

                    for art_div in articles:
                        self._parse_article(art_div, chapter_toc, self.current_hierarchy)

                enacting_section['children'].append(chapter_toc)

        self.toc['sections'].append(enacting_section)
        print(f"Parsed enacting terms: {len(chapters)} chapters")

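    # Example of the hierarchy_path handed down to _parse_article (illustrative):
    #
    #   ["<regulation title>", "CHAPTER I - <chapter title>", "SECTION 1 - <section title>"]
    #
    # _parse_article appends the article's own title, so a chunk's
    # hierarchy_path always ends with e.g. "Article 3 - <subtitle>".
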
    def _parse_article(self, art_div, parent_toc, hierarchy_path):
        """Parse a single article and add it to the TOC and chunks"""
        art_p = art_div.find('p', class_='oj-ti-art')
        if not art_p:
            return

        art_text = self._clean_text(art_p.get_text())
        art_match = re.search(r'Article\s+(\d+[a-z]*)', art_text, re.I)

        if art_match:
            art_num = art_match.group(1)

            # Get article subtitle
            art_subtitle_p = art_div.find('p', class_='oj-sti-art')
            art_subtitle = self._clean_text(art_subtitle_p.get_text()) if art_subtitle_p else ''

            art_full_title = f"Article {art_num}" + (f" - {art_subtitle}" if art_subtitle else "")

            # Collect article content
            content_parts = []

            # Method 1: Look for content divs (numbered paragraphs like 1.1, 1.2)
            content_divs = art_div.find_all('div', id=re.compile(r'^\d+\.\d+'))
            for content_div in content_divs:
                paras = content_div.find_all('p', class_='oj-normal')
                for para in paras:
                    text = self._clean_text(para.get_text())
                    if text:
                        content_parts.append(text)

            # Method 2: Look for direct paragraphs (intro text before tables)
            direct_paras = art_div.find_all('p', class_='oj-normal', recursive=False)
            for para in direct_paras:
                text = self._clean_text(para.get_text())
                if text:
                    content_parts.append(text)

            # Method 3: Look for tables (list items like (a), (b), (c))
            tables = art_div.find_all('table')
            for table in tables:
                rows = table.find_all('tr')
                for row in rows:
                    cells = row.find_all('td')
                    if len(cells) == 2:
                        # First cell is typically the list marker (a), (b), etc.
                        marker = self._clean_text(cells[0].get_text())
                        text = self._clean_text(cells[1].get_text())
                        if text:
                            # Combine marker and text
                            content_parts.append(f"{marker} {text}")

            full_content = '\n\n'.join(content_parts)
            # Apply cleaning to the combined content to fix list formatting
            full_content = self._clean_combined_text(full_content)

            # Add to TOC
            parent_toc['children'].append({
                'type': 'article',
                'number': art_num,
                'title': art_full_title
            })

            # CHUNK IT
            chunk = RegulationChunk(
                section_type=SectionType.ARTICLE,
                number=art_num,
                title=art_full_title,
                content=full_content or art_subtitle,
                hierarchy_path=hierarchy_path + [art_full_title],
                metadata={'id': art_div.get('id', ''), 'subtitle': art_subtitle}
            )
            self.chunks.append(chunk)

    def _parse_concluding_formulas(self):
        """Parse concluding formulas and create a chunk"""
        concluding_div = self.soup.find('div', class_='eli-subdivision', id=re.compile(r'^fnp_\d+'))
        if concluding_div:
            # Get all content from the concluding formulas
            final_div = concluding_div.find('div', class_='oj-final')
            if final_div:
                content_parts = []
                paras = final_div.find_all('p', class_='oj-normal')
                for para in paras:
                    text = self._clean_text(para.get_text())
                    if text:
                        content_parts.append(text)

                # Get signatory information
                signatories = final_div.find_all('div', class_='oj-signatory')
                for sig in signatories:
                    sig_paras = sig.find_all('p', class_='oj-signatory')
                    sig_parts = [self._clean_text(p.get_text()) for p in sig_paras if p]
                    if sig_parts:
                        content_parts.append('\n'.join(sig_parts))

                full_content = '\n\n'.join(content_parts)

                if full_content:
                    # Add to TOC
                    concluding_section = {
                        'type': 'concluding_formulas',
                        'title': 'Concluding formulas'
                    }
                    self.toc['sections'].append(concluding_section)

                    # CHUNK IT
                    hierarchy = [self.regulation_title, "Concluding formulas"] if self.regulation_title else ["Concluding formulas"]
                    chunk = RegulationChunk(
                        section_type=SectionType.CONCLUDING_FORMULAS,
                        number=None,
                        title="Concluding formulas",
                        content=full_content,
                        hierarchy_path=hierarchy,
                        metadata={'id': concluding_div.get('id', '')}
                    )
                    self.chunks.append(chunk)
                    print("Parsed concluding formulas")

    def _parse_annexes(self):
        """Parse annexes and appendices and create chunks for each"""
        # Find all annexes and appendices
        annexes = self.soup.find_all('div', class_='eli-container', id=re.compile(r'^anx_'))

        if not annexes:
            return

        for annex_div in annexes:
            # Get annex/appendix ID
            annex_id = annex_div.get('id', '')

            # Check if this is an appendix (contains .app_)
            is_appendix = '.app_' in annex_id

            if is_appendix:
                # Parse as appendix: anx_1.app_1
                app_match = re.match(r'^anx_(\d+)\.app_(\d+)', annex_id)
                if app_match:
                    annex_num = app_match.group(1)
                    app_num = app_match.group(2)
                    section_type = SectionType.APPENDIX
                    identifier = f"{annex_num}.{app_num}"
                else:
                    # Fallback
                    identifier = annex_id.replace('anx_', '').replace('.app_', '.')
                    section_type = SectionType.APPENDIX
            else:
                # Parse as annex: anx_1, anx_I, etc.
                annex_match = re.match(r'^anx_([IVXLCDM]+|\d+[A-Z]?)', annex_id, re.I)
                identifier = annex_match.group(1) if annex_match else annex_id.replace('anx_', '')
                section_type = SectionType.ANNEX

            # Get title from first oj-doc-ti paragraph
            title_paragraphs = annex_div.find_all('p', class_='oj-doc-ti')
            if title_paragraphs:
                # Get first paragraph as main title
                title_text = self._clean_text(title_paragraphs[0].get_text())
                # If there are multiple oj-doc-ti paragraphs, combine them as subtitle
                # (but NOT oj-ti-grseq-1 which typically contains PART numbers within the annex)
                if len(title_paragraphs) > 1:
                    subtitle_parts = [self._clean_text(p.get_text()) for p in title_paragraphs[1:]]
                    subtitle = ' '.join(subtitle_parts)
                else:
                    subtitle = ''
            else:
                # Fallback title
                if is_appendix:
                    title_text = f"APPENDIX {identifier}"
                else:
                    title_text = f"ANNEX {identifier}"
                subtitle = ''

            full_title = title_text + (f" - {subtitle}" if subtitle else "")

            # Create a base title with identifier for use in parts
            # This ensures parts are titled like "ANNEX 1 - PART 1" not just "ANNEX - PART 1"
            if is_appendix:
                base_title_with_id = f"APPENDIX {identifier}"
            else:
                # Check if title_text already contains the identifier
                if identifier and identifier.upper() not in title_text.upper():
                    base_title_with_id = f"ANNEX {identifier}"
                else:
                    base_title_with_id = title_text

            # Check if annex contains parts (oj-ti-grseq-1 with PART X pattern)
            part_headers = annex_div.find_all('p', class_='oj-ti-grseq-1')
            parts_detected = []
            for part_header in part_headers:
                part_text = self._clean_text(part_header.get_text())
                # Match "PART 1", "PART I", "Part 1", etc.
                part_match = re.match(r'^PART\s+([IVXLCDM]+|\d+)', part_text, re.I)
                if part_match:
                    parts_detected.append({
                        'element': part_header,
                        'number': part_match.group(1),
                        'title': part_text
                    })

            # If parts detected, create separate chunks for each part
            if parts_detected:
                # Add annex to TOC with parts as children
                toc_entry = {
                    'type': 'appendix' if is_appendix else 'annex',
                    'number': identifier,
                    'title': base_title_with_id,  # Use base_title_with_id to show "ANNEX 1" not "ANNEX"
                    'children': []
                }

                hierarchy_base = [self.regulation_title, base_title_with_id] if self.regulation_title else [base_title_with_id]

                # Process each part
                for i, part_info in enumerate(parts_detected):
                    part_elem = part_info['element']
                    part_num = part_info['number']
                    part_title = part_info['title']

                    # Collect content from this part until the next part or end of annex
                    content_parts = []

                    # Get the next part element to know where to stop
                    next_part_elem = parts_detected[i + 1]['element'] if i + 1 < len(parts_detected) else None

                    # Find all elements in the annex_div, and collect those between current and next part
                    collecting = False
                    for elem in annex_div.descendants:
                        # Start collecting after we find the part header
                        if elem == part_elem:
                            collecting = True
                            continue

                        # Stop if we hit the next part header
                        if next_part_elem and elem == next_part_elem:
                            break

                        if not collecting:
                            continue

                        # Collect normal paragraphs
                        if elem.name == 'p' and 'oj-normal' in elem.get('class', []):
                            text = self._clean_text(elem.get_text())
                            if text and len(text) > 10:
                                content_parts.append(text)

                        # Collect tables
                        elif elem.name == 'table' and 'oj-table' in elem.get('class', []):
                            # Get table headers
                            headers = []
                            header_cells = elem.find_all('p', class_='oj-tbl-hdr')
                            for hdr in header_cells:
                                text = self._clean_text(hdr.get_text())
                                if text:
                                    headers.append(text)

                            if headers:
                                content_parts.append(' | '.join(headers))
                                content_parts.append('-' * 40)

                            # Get table rows
                            rows = elem.find_all('tr', class_='oj-table')
                            for row in rows:
                                cells = row.find_all('td', class_='oj-table')
                                cell_texts = []
                                for cell in cells:
                                    cell_para = cell.find('p')
                                    if cell_para:
                                        text = self._clean_text(cell_para.get_text())
                                        if text and 'oj-tbl-hdr' not in cell_para.get('class', []):
                                            cell_texts.append(text)

                                if cell_texts:
                                    content_parts.append(' | '.join(cell_texts))

                    part_content = '\n\n'.join(content_parts)
                    # Use base_title_with_id to include annex/appendix number in part titles
                    part_full_title = f"{base_title_with_id} - {part_title}"

                    # Add to TOC
                    toc_entry['children'].append({
                        'type': 'part',
                        'number': part_num,
                        'title': part_title
                    })

                    # CHUNK IT
                    chunk = RegulationChunk(
                        section_type=section_type,
                        number=f"{identifier}.{part_num}",
                        title=part_full_title,
                        content=part_content,
                        hierarchy_path=hierarchy_base + [part_title],
                        metadata={'id': annex_id, 'part': part_num}
                    )
                    self.chunks.append(chunk)

                self.toc['sections'].append(toc_entry)

            else:
                # No parts - treat as single chunk (original behavior)
                # Collect annex content (all text from paragraphs and tables)
                content_parts = []

                # Get all normal paragraphs
                paras = annex_div.find_all('p', class_='oj-normal')
                for para in paras:
                    text = self._clean_text(para.get_text())
                    if text and len(text) > 10:  # Filter out very short text
                        content_parts.append(text)

                # Check if this is a table-based annex (like correlation tables)
                tables = annex_div.find_all('table', class_='oj-table')
                for table in tables:
                    # Get table headers
                    headers = []
                    header_cells = table.find_all('p', class_='oj-tbl-hdr')
                    for hdr in header_cells:
                        text = self._clean_text(hdr.get_text())
                        if text:
                            headers.append(text)

                    if headers:
                        content_parts.append(' | '.join(headers))
                        content_parts.append('-' * 40)

                    # Get table rows
                    rows = table.find_all('tr', class_='oj-table')
                    for row in rows:
                        cells = row.find_all('td', class_='oj-table')
                        cell_texts = []
                        for cell in cells:
                            cell_para = cell.find('p')
                            if cell_para:
                                text = self._clean_text(cell_para.get_text())
                                if text and 'oj-tbl-hdr' not in cell_para.get('class', []):
                                    cell_texts.append(text)

                        if cell_texts:
                            content_parts.append(' | '.join(cell_texts))

                # Join all content parts - no truncation
                full_content = '\n\n'.join(content_parts)

                # Create chunk even if content is just title/subtitle (for index purposes)
                if not full_content and subtitle:
                    full_content = subtitle

                if full_content or subtitle:
                    # Add to TOC
                    toc_entry = {
                        'type': 'appendix' if is_appendix else 'annex',
                        'number': identifier,
                        'title': full_title
                    }
                    self.toc['sections'].append(toc_entry)

                    # CHUNK IT
                    hierarchy = [self.regulation_title, full_title] if self.regulation_title else [full_title]
                    chunk = RegulationChunk(
                        section_type=section_type,
                        number=identifier,
                        title=full_title,
                        content=full_content,
                        hierarchy_path=hierarchy,
                        metadata={'id': annex_id, 'subtitle': subtitle}
                    )
                    self.chunks.append(chunk)

        # Count annexes vs appendices for reporting
        annex_count = sum(1 for a in annexes if '.app_' not in a.get('id', ''))
        appendix_count = sum(1 for a in annexes if '.app_' in a.get('id', ''))
        if annex_count and appendix_count:
            print(f"Parsed {annex_count} annexes and {appendix_count} appendices")
        elif annex_count:
            print(f"Parsed {annex_count} annexes")
        elif appendix_count:
            print(f"Parsed {appendix_count} appendices")

    @staticmethod
    def _clean_text(text: str) -> str:
        """Clean and normalize text for individual paragraphs"""
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    @staticmethod
    def _clean_combined_text(text: str) -> str:
        """Clean combined text content, fixing list and paragraph formatting"""
        # Fix list items: (a)\n\n should become (a) with text on same line
        text = re.sub(r'\(([a-z]+|[ivx]+)\)\n\n', r'(\1) ', text)

        # Fix numbered list items within content
        text = re.sub(r'\n\n(\d+\.)\n\n', r'\n\n\1 ', text)

        return text

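    # Doctest-style illustration of _clean_combined_text on joined list items:
    #
    #   >>> EURLexParser._clean_combined_text("(a)\n\nfirst item\n\n(b)\n\nsecond item")
    #   '(a) first item\n\n(b) second item'
    #
    # The "(a)" marker is rejoined with the text that followed it, so list items
    # split apart by the '\n\n'.join(...) step collapse back onto a single line.
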
    def save_chunks(self, filepath: str):
        """Save chunks to JSON"""
        data = [chunk.to_dict() for chunk in self.chunks]
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(data)} chunks to {filepath}")

    def save_toc(self, filepath: str):
        """Save TOC to JSON"""
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.toc, f, indent=2, ensure_ascii=False)
        print(f"Saved TOC to {filepath}")

    def print_toc(self):
        """Print formatted TOC showing flexible hierarchy (2-4 levels)"""
        print("\n" + "="*70)
        print("TABLE OF CONTENTS")
        print("="*70)

        # Print regulation title
        if self.toc.get('title'):
            print(f"\n{self.toc['title']}")

        for section in self.toc['sections']:
            # Level 1: Major sections
            print(f"\n{section['title'].upper()}")

            # Handle sections without children (like concluding formulas, annexes)
            if 'children' not in section:
                continue

            for child in section.get('children', []):
                if child['type'] in ['citation', 'recital']:
                    # Level 2: Citations and Recitals
                    print(f"  {child['title']}")
                elif child['type'] == 'chapter':
                    # Level 2: Chapters
                    print(f"  {child['title']}")
                    # Level 3: Sections or Articles
                    for item in child.get('children', []):
                        if item['type'] == 'section':
                            # Level 3: Sections
                            print(f"    {item['title']}")
                            # Level 4: Articles within sections
                            for art in item.get('children', []):
                                print(f"      {art['title']}")
                        elif item['type'] == 'article':
                            # Level 3: Articles (when no sections)
                            print(f"    {item['title']}")


def main():
    """Test the parser"""
    url = "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32019R0947"

    print(f"Parsing: {url}\n")

    parser = EURLexParser(url=url)
    chunks, toc = parser.parse()

    # Print TOC
    parser.print_toc()

    # Print summary
    print("\n" + "="*70)
    print("SUMMARY")
    print("="*70)
    print(f"Total chunks: {len(chunks)}")
    by_type = {}
    for chunk in chunks:
        t = chunk.section_type.value
        by_type[t] = by_type.get(t, 0) + 1
    for t, count in sorted(by_type.items()):
        print(f"  {t}: {count}")

    # Save
    parser.save_chunks('data/processed/easa_chunks.json')
    parser.save_toc('data/processed/easa_toc.json')

    print("\n✓ Done!")


if __name__ == "__main__":
    main()