datamule 0.416-cp38-cp38-macosx_11_0_universal2.whl → 0.418-cp38-cp38-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datamule might be problematic.

@@ -0,0 +1,332 @@
+ import os
+ import asyncio
+ import aiohttp
+ from pathlib import Path
+ from tqdm import tqdm
+ import time
+ import shutil
+ import ssl
+ import zstandard as zstd
+ import io
+ import json
+ from concurrent.futures import ThreadPoolExecutor
+ from functools import partial
+ from queue import Queue, Empty
+ from threading import Thread
+ from datamule.parser.sgml_parsing.sgml_parser_cy import parse_sgml_submission
+ import urllib.parse
+ from ..helper import identifier_to_cik
+
+ class InsufficientBalanceError(Exception):
+     def __init__(self, required_cost, current_balance, total_urls):
+         self.required_cost = required_cost
+         self.current_balance = current_balance
+         self.total_urls = total_urls
+         message = (f"Insufficient balance. Required: ${required_cost:.4f}, "
+                    f"Current balance: ${current_balance:.4f}, "
+                    f"Total URLs: {total_urls}")
+         super().__init__(message)
+
+ class PremiumDownloader:
+     def __init__(self, api_key=None):
+         self.BASE_URL = "https://library.datamule.xyz/original/nc/"
+         self.API_BASE_URL = "https://sec-library.jgfriedman99.workers.dev/"
+         # Download/processing tuning knobs
+         self.CHUNK_SIZE = 2 * 1024 * 1024
+         self.MAX_CONCURRENT_DOWNLOADS = 100
+         self.MAX_DECOMPRESSION_WORKERS = 16
+         self.MAX_PROCESSING_WORKERS = 16
+         self.QUEUE_SIZE = 10
+         if api_key is not None:
+             self._api_key = api_key
+
+     @property
+     def api_key(self):
+         return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')
+
+     @api_key.setter
+     def api_key(self, value):
+         if not value:
+             raise ValueError("API key cannot be empty")
+         self._api_key = value
+
+     def _log_error(self, output_dir, filename, error_msg):
+         error_file = os.path.join(output_dir, 'errors.json')
+         try:
+             if os.path.exists(error_file):
+                 with open(error_file, 'r') as f:
+                     errors = json.load(f)
+             else:
+                 errors = {}
+
+             errors[filename] = str(error_msg)
+
+             with open(error_file, 'w') as f:
+                 json.dump(errors, f, indent=2)
+         except Exception as e:
+             print(f"Failed to log error to {error_file}: {str(e)}")
+
+     async def _fetch_submissions(self, session, submission_type=None, cik=None, filing_date=None, page=1):
+         params = {
+             'api_key': self.api_key,
+             'page': page
+         }
+
+         if submission_type:
+             if isinstance(submission_type, list):
+                 params['submission_type'] = ','.join(str(x) for x in submission_type)
+             else:
+                 params['submission_type'] = str(submission_type)
+
+         if cik:
+             if isinstance(cik, list):
+                 params['cik'] = ','.join(str(x) for x in cik)
+             else:
+                 params['cik'] = str(cik)
+
+         if filing_date:
+             if isinstance(filing_date, tuple):
+                 params['startdt'] = str(filing_date[0])
+                 params['enddt'] = str(filing_date[1])
+             else:
+                 if isinstance(filing_date, list):
+                     params['filing_date'] = ','.join(str(x) for x in filing_date)
+                 else:
+                     params['filing_date'] = str(filing_date)
+
+         url = f"{self.API_BASE_URL}?{urllib.parse.urlencode(params)}"
+
+         async with session.get(url) as response:
+             data = await response.json()
+             if not data.get('success'):
+                 raise ValueError(f"API request failed: {data.get('error')}")
+
+             charges = data['metadata']['billing']['charges']
+             print(f"\nCost: ${charges['results']:.12f} downloads + ${charges['rows_read']:.12f} row reads = ${charges['total']:.12f}")
+             print(f"Balance: ${data['metadata']['billing']['remaining_balance']:.12f}")
+
+             urls = [f"{self.BASE_URL}{str(sub['accession_number']).zfill(18)}.sgml{'.zst' if sub.get('compressed', '').lower() == 'true' else ''}" for sub in data['data']]
+             return urls, data['metadata']['pagination']
+
+     # Threaded consumer: parses downloaded submissions pulled off a bounded queue
+     class FileProcessor:
+         def __init__(self, output_dir, max_workers, queue_size, pbar, downloader):
+             self.processing_queue = Queue(maxsize=queue_size)
+             self.should_stop = False
+             self.processing_workers = []
+             self.output_dir = output_dir
+             self.max_workers = max_workers
+             self.batch_size = 10
+             self.pbar = pbar
+             self.downloader = downloader
+
+         def start_processing_workers(self):
+             for _ in range(self.max_workers):
+                 worker = Thread(target=self._processing_worker)
+                 worker.daemon = True
+                 worker.start()
+                 self.processing_workers.append(worker)
+
+         def _process_file(self, item):
+             filename, content = item
+             clean_name = filename[:-4] if filename.endswith('.zst') else filename
+             output_path = os.path.join(self.output_dir, Path(clean_name).stem)
+             try:
+                 parse_sgml_submission(None, output_dir=output_path, content=content)
+                 self.pbar.update(1)
+             except Exception as e:
+                 self.downloader._log_error(self.output_dir, filename, str(e))
+
+         def _processing_worker(self):
+             batch = []
+             while not self.should_stop:
+                 try:
+                     item = self.processing_queue.get(timeout=1)
+                     if item is None:
+                         break
+
+                     batch.append(item)
+
+                     if len(batch) >= self.batch_size or self.processing_queue.empty():
+                         for item in batch:
+                             self._process_file(item)
+                             self.processing_queue.task_done()
+                         batch = []
+
+                 except Empty:
+                     if batch:
+                         for item in batch:
+                             self._process_file(item)
+                             self.processing_queue.task_done()
+                         batch = []
+
+         def stop_workers(self):
+             self.should_stop = True
+             for _ in self.processing_workers:
+                 self.processing_queue.put(None)
+             for worker in self.processing_workers:
+                 worker.join()
+
+     # Runs in the decompression thread pool; streams zstd content into memory and queues it for parsing
+     def decompress_stream(self, compressed_chunks, filename, output_dir, processor):
+         dctx = zstd.ZstdDecompressor()
+         try:
+             input_buffer = io.BytesIO(b''.join(compressed_chunks))
+             decompressed_content = io.BytesIO()
+
+             with dctx.stream_reader(input_buffer) as reader:
+                 shutil.copyfileobj(reader, decompressed_content)
+
+             content = decompressed_content.getvalue().decode('utf-8')
+             processor.processing_queue.put((filename, content))
+             return True
+
+         except Exception as e:
+             self._log_error(output_dir, filename, f"Decompression error: {str(e)}")
+             return False
+         finally:
+             try:
+                 input_buffer.close()
+                 decompressed_content.close()
+             except Exception:
+                 pass
+
+     def save_regular_file(self, chunks, filename, output_dir, processor):
+         try:
+             content = b''.join(chunks).decode('utf-8')
+             processor.processing_queue.put((filename, content))
+             return True
+
+         except Exception as e:
+             self._log_error(output_dir, filename, f"Error saving file: {str(e)}")
+             return False
+
+     async def download_and_process(self, session, url, semaphore, decompression_pool, output_dir, processor):
+         async with semaphore:
+             chunks = []
+             filename = url.split('/')[-1]
+
+             api_key = self.api_key
+             if not api_key:
+                 raise ValueError("No API key found. Please set the DATAMULE_API_KEY environment variable or provide api_key in the constructor.")
+
+             try:
+                 headers = {
+                     'Connection': 'keep-alive',
+                     'Accept-Encoding': 'gzip, deflate, br',
+                     'Authorization': f'Bearer {api_key}'
+                 }
+
+                 async with session.get(url, headers=headers) as response:
+                     if response.status == 200:
+                         async for chunk in response.content.iter_chunked(self.CHUNK_SIZE):
+                             chunks.append(chunk)
+
+                         loop = asyncio.get_running_loop()
+                         if filename.endswith('.zst'):
+                             success = await loop.run_in_executor(
+                                 decompression_pool,
+                                 partial(self.decompress_stream, chunks, filename, output_dir, processor)
+                             )
+                         else:
+                             success = await loop.run_in_executor(
+                                 decompression_pool,
+                                 partial(self.save_regular_file, chunks, filename, output_dir, processor)
+                             )
+
+                         if not success:
+                             self._log_error(output_dir, filename, "Failed to process file")
+                     elif response.status == 401:
+                         self._log_error(output_dir, filename, "Authentication failed: Invalid API key")
+                         raise ValueError("Invalid API key")
+                     else:
+                         self._log_error(output_dir, filename, f"Download failed: Status {response.status}")
+             except Exception as e:
+                 self._log_error(output_dir, filename, str(e))
+
+     async def process_batch(self, urls, output_dir):
+         os.makedirs(output_dir, exist_ok=True)
+
+         with tqdm(total=len(urls), desc="Processing files") as pbar:
+             processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self)
+             processor.start_processing_workers()
+
+             semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
+             decompression_pool = ThreadPoolExecutor(max_workers=self.MAX_DECOMPRESSION_WORKERS)
+
+             connector = aiohttp.TCPConnector(
+                 limit=self.MAX_CONCURRENT_DOWNLOADS,
+                 force_close=False,
+                 ssl=ssl.create_default_context(),
+                 ttl_dns_cache=300,
+                 keepalive_timeout=60
+             )
+
+             async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=3600)) as session:
+                 tasks = [self.download_and_process(session, url, semaphore, decompression_pool, output_dir, processor) for url in urls]
+                 await asyncio.gather(*tasks, return_exceptions=True)
+
+             processor.processing_queue.join()
+             processor.stop_workers()
+             decompression_pool.shutdown()
+
+     async def download_all_pages(self, submission_type=None, cik=None, filing_date=None, output_dir="download"):
+         connector = aiohttp.TCPConnector(ssl=ssl.create_default_context())
+         async with aiohttp.ClientSession(connector=connector) as session:
+             try:
+                 urls, pagination = await self._fetch_submissions(session, submission_type=submission_type, cik=cik, filing_date=filing_date, page=1)
+                 total_urls = urls.copy()
+                 current_page = 1
+
+                 while pagination.get('hasMore', False):
+                     current_page += 1
+                     more_urls, pagination = await self._fetch_submissions(session, submission_type=submission_type, cik=cik, filing_date=filing_date, page=current_page)
+                     total_urls.extend(more_urls)
+
+                 if total_urls:
+                     start_time = time.time()
+                     await self.process_batch(total_urls, output_dir)
+                     elapsed_time = time.time() - start_time
+                     print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
+                 else:
+                     print("No submissions found matching the criteria")
+
+             except InsufficientBalanceError as e:
+                 error_msg = {
+                     "error": "insufficient_balance",
+                     "required_cost": e.required_cost,
+                     "current_balance": e.current_balance,
+                     "total_urls": e.total_urls,
+                     "additional_funds_needed": e.required_cost - e.current_balance
+                 }
+                 self._log_error(output_dir, "balance_check", error_msg)
+                 return
+
+     def download_submissions(self, submission_type=None, cik=None, ticker=None, filing_date=None, output_dir="download"):
+         if self.api_key is None:
+             raise ValueError("No API key found. Please set the DATAMULE_API_KEY environment variable or provide api_key in the constructor.")
+
+         # Normalize YYYY-MM-DD date strings to YYYYMMDD integers
+         if filing_date is not None:
+             if isinstance(filing_date, str):
+                 filing_date = int(filing_date.replace('-', ''))
+             elif isinstance(filing_date, list):
+                 filing_date = [int(x.replace('-', '')) for x in filing_date]
+             elif isinstance(filing_date, tuple):
+                 filing_date = (int(filing_date[0].replace('-', '')), int(filing_date[1].replace('-', '')))
+
+         # Resolve tickers to CIK numbers
+         if ticker is not None:
+             cik = identifier_to_cik(ticker)
+
+         if cik is not None:
+             if isinstance(cik, str):
+                 cik = [int(cik)]
+             elif isinstance(cik, int):
+                 cik = [cik]
+             elif isinstance(cik, list):
+                 cik = [int(x) for x in cik]
+
+         async def _download():
+             try:
+                 await self.download_all_pages(submission_type=submission_type, cik=cik, filing_date=filing_date, output_dir=output_dir)
+             except Exception as e:
+                 if not isinstance(e, InsufficientBalanceError):
+                     self._log_error(output_dir, "download_error", str(e))
+
+         asyncio.run(_download())
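
For orientation, a minimal usage sketch of the downloader added above. Only the class, method, and parameter names come from this diff; the import path and file names are assumptions for illustration.

    # Hypothetical usage sketch -- the module path below is assumed, not confirmed by this diff.
    from datamule.downloader.premium_downloader import PremiumDownloader  # assumed path

    downloader = PremiumDownloader(api_key="your-api-key")  # or set DATAMULE_API_KEY

    # Blocking call: pages through the query, downloads each .sgml/.sgml.zst submission,
    # parses it into output_dir, and appends any failures to output_dir/errors.json.
    downloader.download_submissions(
        submission_type="10-K",
        ticker="AAPL",
        filing_date=("2023-01-01", "2023-12-31"),  # tuple becomes a startdt/enddt range
        output_dir="filings",
    )
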
@@ -0,0 +1,82 @@
+ from pathlib import Path
+ import re
+ from .helper import load_file_content, clean_title
+
+ PART_PATTERN = re.compile(r'\n\s*part[.:)?\s]+([IVX]+|\d+)', re.I)
+ ITEM_PATTERN = re.compile(r'\n\s*item[.:)?\s]+(\d+[A-Z]?)', re.I)
+ IS_10K_PATTERN = re.compile(r'item[.:)?\s]+14', re.I)
+ TOC_END_PATTERN = re.compile(r'(?:item[.:)?\s]+14).*?(?=\n\s*item[.:)?\s]+1\b)', re.I | re.DOTALL)
+
+ ROMAN_TO_NUM = {'I': '1', 'II': '2', 'III': '3', 'IV': '4'}
+
+ ITEM_TO_PART = {
+     '1': 'I', '1A': 'I', '1B': 'I', '1C': 'I', '2': 'I', '3': 'I', '4': 'I',
+     '5': 'II', '6': 'II', '7': 'II', '7A': 'II', '8': 'II', '9': 'II', '9A': 'II', '9B': 'II', '9C': 'II',
+     '10': 'III', '11': 'III', '12': 'III', '13': 'III', '14': 'III',
+     '15': 'IV', '16': 'IV', '16A': 'IV'
+ }
+
+ def find_content_start(content):
+     toc_match = TOC_END_PATTERN.search(content)
+     if toc_match:
+         item_1_pattern = re.compile(r'\n\s*item\s*1\b', re.I)
+         item_1_match = item_1_pattern.search(content, toc_match.end())
+         if item_1_match:
+             return item_1_match.start()
+     return 0
+
+ def find_anchors(content):
+     start_pos = find_content_start(content)
+     content = '\n' + content[start_pos:]
+
+     anchors = []
+     for part_match in PART_PATTERN.finditer(content):
+         anchors.append(('part', part_match.group(1), part_match.start() + start_pos, part_match.group()))
+
+     for item_match in ITEM_PATTERN.finditer(content):
+         anchors.append(('item', item_match.group(1), item_match.start() + start_pos, item_match.group()))
+
+     return sorted(anchors, key=lambda x: x[2])
+
+ def extract_sections(content, anchors, filename):
+     if not anchors:
+         return {}
+
+     result = {
+         "metadata": {"document_name": Path(filename).stem},
+         "document": {
+             "part1": {}, "part2": {}, "part3": {}, "part4": {}
+         }
+     }
+
+     last_item = None
+     current_text = None
+
+     for i, current in enumerate(anchors):
+         if current[0] == 'item':
+             next_pos = anchors[i+1][2] if i < len(anchors)-1 else len(content)
+             text = content[current[2]:next_pos].strip()
+
+             if current[1] == last_item:
+                 current_text += "\n\n" + text
+             else:
+                 if last_item and last_item in ITEM_TO_PART:
+                     part_num = ROMAN_TO_NUM[ITEM_TO_PART[last_item]]
+                     result["document"][f"part{part_num}"][f"item{last_item.lower()}"] = current_text
+                 current_text = text
+                 last_item = current[1]
+
+     if last_item and last_item in ITEM_TO_PART:
+         part_num = ROMAN_TO_NUM[ITEM_TO_PART[last_item]]
+         result["document"][f"part{part_num}"][f"item{last_item.lower()}"] = current_text
+
+     # Only keep non-empty parts
+     result["document"] = {k: v for k, v in result["document"].items() if v}
+     return result
+
+ def parse_10k(filename):
+     content = load_file_content(filename)
+     if not IS_10K_PATTERN.search(content):
+         return {}
+     anchors = find_anchors(content)
+     return extract_sections(content, anchors, filename)
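
A short sketch of what the 10-K parser above returns; the import path and filename are illustrative assumptions only.

    # Hypothetical usage -- adjust the import to wherever this module lives in the package.
    from parse_10k import parse_10k  # assumed import path

    sections = parse_10k("example-10k.htm")

    # Items are grouped into part1..part4 via ITEM_TO_PART, e.g.
    #   sections["document"]["part1"]["item1a"] -> risk factors
    #   sections["document"]["part2"]["item7"]  -> MD&A
    for part, items in sections.get("document", {}).items():
        for item, text in items.items():
            print(part, item, len(text))
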
@@ -0,0 +1,73 @@
+ from pathlib import Path
+ from .helper import load_file_content, clean_title
+ import re
+
+ PART_II_PATTERN = re.compile(r'\n\s*part\s+II\.?(?:[:\s\.]|$)', re.I)
+ ITEM_PATTERN = re.compile(r'\n\s*item\s+(\d+[A-Z]?)\.?(?:[:\s\.]|$)', re.I)
+ TOC_END_PATTERN = re.compile(r'(?:item\s*6\.?).*?(?=\n\s*item\s*1\.?\b)', re.I | re.DOTALL)
+
+ def find_content_start(content):
+     toc_match = TOC_END_PATTERN.search(content)
+     if toc_match:
+         item_1_pattern = re.compile(r'\n\s*item\s*1\b', re.I)
+         item_1_match = item_1_pattern.search(content, toc_match.end())
+         if item_1_match:
+             return item_1_match.start()
+     return 0
+
+ def find_anchors(content):
+     start_pos = find_content_start(content)
+     content = '\n' + content[start_pos:]
+
+     part_ii_match = PART_II_PATTERN.search(content)
+     part_ii_pos = part_ii_match.start() + start_pos if part_ii_match else None
+
+     anchors = []
+     for item_match in ITEM_PATTERN.finditer(content):
+         anchors.append(('item', item_match.group(1), item_match.start() + start_pos, item_match.group()))
+
+     return sorted(anchors, key=lambda x: x[2]), part_ii_pos
+
+ def extract_sections(content, anchors_and_part2, filename):
+     anchors, part2_pos = anchors_and_part2
+     if not anchors:
+         return {}
+
+     result = {
+         "metadata": {"document_name": Path(filename).stem},
+         "document": {
+             "part1": {},
+             "part2": {}
+         }
+     }
+
+     last_item = None
+     current_text = None
+     last_pos = None
+
+     for i, current in enumerate(anchors):
+         next_pos = anchors[i+1][2] if i < len(anchors)-1 else len(content)
+
+         if current[1] == last_item:
+             current_text += "\n\n" + content[current[2]:next_pos].strip()
+         else:
+             if last_item is not None:
+                 part_key = "part2" if (part2_pos and last_pos >= part2_pos) else "part1"
+                 result["document"][part_key][f"item{last_item.lower()}"] = current_text
+
+             current_text = content[current[2]:next_pos].strip()
+             last_item = current[1]
+             last_pos = current[2]
+
+     if last_item is not None:
+         part_key = "part2" if (part2_pos and last_pos >= part2_pos) else "part1"
+         result["document"][part_key][f"item{last_item.lower()}"] = current_text
+
+     # Clean empty parts
+     result["document"] = {k: v for k, v in result["document"].items() if v}
+     return result
+
+ def parse_10q(filename):
+     content = load_file_content(filename)
+     anchors_and_part2 = find_anchors(content)
+     return extract_sections(content, anchors_and_part2, filename)
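
Likewise for the 10-Q parser: items land in part1 or part2 depending on whether they start before or after the first "PART II" heading. A sketch with assumed import path and filename:

    # Hypothetical usage -- adjust the import to the real module path.
    from parse_10q import parse_10q  # assumed import path

    sections = parse_10q("example-10q.htm")

    # e.g. sections["document"]["part1"]["item2"]  -> MD&A
    #      sections["document"]["part2"]["item1a"] -> risk factor updates
    print(sorted(sections["document"].get("part1", {})))
    print(sorted(sections["document"].get("part2", {})))
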
@@ -0,0 +1,58 @@
+ import re
+ from pathlib import Path
+ from .helper import load_file_content, clean_title
+
+ WHITESPACE_PATTERN = re.compile(r'\s+')
+
+ ITEM_PATTERN = re.compile(
+     r"(?:^[ \t]*)"
+     r"(?:"
+     r"(?:Item|ITEM)\s*"
+     r"(?:"
+     r"1|"
+     r"2|"
+     r"3|"
+     r"4|"
+     r"5|"
+     r"6|"
+     r"7|"
+     r"8|"
+     r"9"
+     r")"
+     r"|"
+     r"SIGNATURES?"
+     r")",
+     re.IGNORECASE | re.MULTILINE
+ )
+
+ def parse_13d(filename: Path) -> dict:
+     text = load_file_content(filename)
+     matches = [(clean_title(m.group().strip()), m.start()) for m in ITEM_PATTERN.finditer(text)]
+
+     result = {
+         "metadata": {"document_name": Path(filename).stem},
+         "document": {}
+     }
+
+     if not matches:
+         return result
+
+     for i, (current_match, start_pos) in enumerate(matches[:-1]):
+         section_text = WHITESPACE_PATTERN.sub(' ', text[start_pos:matches[i + 1][1]]).strip()
+         if section_text:
+             if "signature" in current_match.lower():
+                 key = "signatures"
+             else:
+                 key = f"item{current_match.lower().replace('item', '').strip()}"
+             result["document"][key] = section_text
+
+     last_match, last_pos = matches[-1]
+     section_text = WHITESPACE_PATTERN.sub(' ', text[last_pos:len(text)]).strip()
+     if section_text:
+         if "signature" in last_match.lower():
+             key = "signatures"
+         else:
+             key = f"item{last_match.lower().replace('item', '').strip()}"
+         result["document"][key] = section_text
+
+     return result
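
The 13D parser keys sections as "item1" through "item7" plus "signatures", with each value keeping the heading and collapsing whitespace. A sketch with an assumed import path and filename:

    # Hypothetical usage -- import path and filename are assumptions.
    from parse_13d import parse_13d  # assumed import path

    result = parse_13d("example-sc13d.txt")

    # e.g. result["document"]["item4"] starts with the "Item 4 ..." heading text
    for key, text in result["document"].items():
        print(key, text[:60])
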
@@ -0,0 +1,61 @@
+ import re
+ from pathlib import Path
+ from .helper import load_file_content, clean_title
+
+ WHITESPACE_PATTERN = re.compile(r'\s+')
+
+ ITEM_PATTERN_13G = re.compile(
+     r"(?:^[ \t]*)"
+     r"(?:"
+     r"(?:Item|ITEM)\s*"
+     r"(?:"
+     r"10|"  # Move 10 to the start so it's matched before 1
+     r"11|"  # Similarly with 11 and 12
+     r"12|"
+     r"1|"
+     r"2|"
+     r"3|"
+     r"4|"
+     r"5|"
+     r"6|"
+     r"7|"
+     r"8|"
+     r"9"
+     r")"
+     r"|"
+     r"SIGNATURES?"
+     r")",
+     re.IGNORECASE | re.MULTILINE
+ )
+
+ def parse_13g(filename: Path) -> dict:
+     text = load_file_content(filename)
+     matches = [(clean_title(m.group().strip()), m.start()) for m in ITEM_PATTERN_13G.finditer(text)]
+
+     result = {
+         "metadata": {"document_name": Path(filename).stem},
+         "document": {}
+     }
+
+     if not matches:
+         return result
+
+     for i, (current_match, start_pos) in enumerate(matches[:-1]):
+         section_text = WHITESPACE_PATTERN.sub(' ', text[start_pos:matches[i + 1][1]]).strip()
+         if section_text:
+             if "signature" in current_match.lower():
+                 key = "signatures"
+             else:
+                 key = f"item{current_match.lower().replace('item', '').strip()}"
+             result["document"][key] = section_text
+
+     last_match, last_pos = matches[-1]
+     section_text = WHITESPACE_PATTERN.sub(' ', text[last_pos:len(text)]).strip()
+     if section_text:
+         if "signature" in last_match.lower():
+             key = "signatures"
+         else:
+             key = f"item{last_match.lower().replace('item', '').strip()}"
+         result["document"][key] = section_text
+
+     return result
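
The inline comment about listing 10/11/12 before 1 matters because Python's re alternation takes the leftmost alternative that matches. A small standalone illustration:

    import re

    # Multi-digit items must precede their single-digit prefixes,
    # otherwise "Item 12" only captures "Item 1".
    bad = re.compile(r"Item\s*(?:1|2|10|11|12)")
    good = re.compile(r"Item\s*(?:10|11|12|1|2)")

    print(bad.match("Item 12").group())   # Item 1  (truncated)
    print(good.match("Item 12").group())  # Item 12
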
@@ -0,0 +1,84 @@
+ import re
+ from pathlib import Path
+ from .helper import load_file_content, clean_title
+
+ class DuplicateSectionError(Exception):
+     """Raised when a section appears multiple times before a different section."""
+     pass
+
+ ITEM_PATTERN = re.compile(
+     r"(?:^[ \t]*)"
+     r"(?:"
+     r"(?:Item|ITEM)\s*"
+     r"(?:"
+     r"1\.0[1-4]|"
+     r"2\.0[1-6]|"
+     r"3\.0[1-3]|"
+     r"4\.0[1-2]|"
+     r"5\.0[1-8]|"
+     r"6\.0[1-5]|"
+     r"7\.01|"
+     r"8\.01|"
+     r"9\.01"
+     r")"
+     r"|"
+     r"SIGNATURES?"
+     r")",
+     re.IGNORECASE | re.MULTILINE
+ )
+
+ WHITESPACE_PATTERN = re.compile(r'\s+')
+
+ def parse_section(text: str, start: int, end: int) -> str:
+     return WHITESPACE_PATTERN.sub(' ', text[start:end].strip())
+
+ def validate_section_sequence(matches: list) -> None:
+     current_base = None
+
+     for match, _ in matches:
+         base_section = re.match(r'(?:Item|ITEM)\s*(?:\d+\.\d+|\bSIGNATURES?\b)', match)
+         if base_section:
+             base_section = base_section.group().upper()
+
+             if current_base is None:
+                 current_base = base_section
+             elif base_section != current_base:
+                 current_base = base_section
+             else:
+                 raise DuplicateSectionError(f"Section {base_section} appears multiple times before a different section")
+
+ def parse_8k(filename: Path) -> dict:
+     text = load_file_content(filename)
+     matches = [(clean_title(m.group().strip()), m.start()) for m in ITEM_PATTERN.finditer(text)]
+
+     result = {
+         "metadata": {"document_name": Path(filename).stem},
+         "document": {}
+     }
+
+     if not matches:
+         return result
+
+     validate_section_sequence(matches)
+
+     # Process all sections except last
+     for i, (current_match, start_pos) in enumerate(matches[:-1]):
+         section_text = parse_section(text, start_pos, matches[i + 1][1])
+         if section_text:
+             if "signature" in current_match.lower():
+                 key = "signatures"
+             else:
+                 key = f"item{current_match.lower().replace('item', '').strip()}"
+             result["document"][key] = section_text
+
+     # Process last section
+     last_match, last_pos = matches[-1]
+     section_text = parse_section(text, last_pos, len(text))
+     if section_text:
+         if "signature" in last_match.lower():
+             key = "signatures"
+         else:
+             key = f"item{last_match.lower().replace('item', '').strip()}"
+         result["document"][key] = section_text
+
+     return result
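
Finally, a sketch of calling the 8-K parser and handling its duplicate-section guard; the import path and filename are placeholders, not taken from this diff:

    # Hypothetical usage -- adjust the import to the real module path.
    from parse_8k import parse_8k, DuplicateSectionError  # assumed import path

    try:
        sections = parse_8k("example-8k.htm")
    except DuplicateSectionError as e:
        # Raised by validate_section_sequence when the same Item x.yy heading
        # repeats back-to-back before a different section appears.
        print(f"Skipping malformed filing: {e}")
    else:
        # Keys look like "item2.02", "item9.01", "signatures".
        print(sorted(sections["document"]))
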