datamule 0.422__cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. datamule/__init__.py +71 -0
  2. datamule/data/company_former_names.csv +8148 -0
  3. datamule/data/company_metadata.csv +10049 -0
  4. datamule/data/company_tickers.csv +9999 -0
  5. datamule/data/sec-glossary.csv +728 -0
  6. datamule/data/xbrl_descriptions.csv +10024 -0
  7. datamule/dataset_builder/dataset_builder.py +259 -0
  8. datamule/document.py +130 -0
  9. datamule/downloader/downloader.py +364 -0
  10. datamule/downloader/premiumdownloader.py +332 -0
  11. datamule/helper.py +123 -0
  12. datamule/monitor.py +236 -0
  13. datamule/mulebot/__init__.py +1 -0
  14. datamule/mulebot/helper.py +35 -0
  15. datamule/mulebot/mulebot.py +130 -0
  16. datamule/mulebot/mulebot_server/__init__.py +1 -0
  17. datamule/mulebot/mulebot_server/server.py +87 -0
  18. datamule/mulebot/mulebot_server/static/css/minimalist.css +174 -0
  19. datamule/mulebot/mulebot_server/static/scripts/artifacts.js +68 -0
  20. datamule/mulebot/mulebot_server/static/scripts/chat.js +92 -0
  21. datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +56 -0
  22. datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +15 -0
  23. datamule/mulebot/mulebot_server/static/scripts/main.js +57 -0
  24. datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +27 -0
  25. datamule/mulebot/mulebot_server/static/scripts/suggestions.js +47 -0
  26. datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +129 -0
  27. datamule/mulebot/mulebot_server/static/scripts/utils.js +28 -0
  28. datamule/mulebot/mulebot_server/templates/chat-minimalist.html +91 -0
  29. datamule/mulebot/search.py +52 -0
  30. datamule/mulebot/tools.py +82 -0
  31. datamule/packageupdater.py +207 -0
  32. datamule/parser/document_parsing/basic_10k_parser.py +82 -0
  33. datamule/parser/document_parsing/basic_10q_parser.py +73 -0
  34. datamule/parser/document_parsing/basic_13d_parser.py +58 -0
  35. datamule/parser/document_parsing/basic_13g_parser.py +61 -0
  36. datamule/parser/document_parsing/basic_8k_parser.py +84 -0
  37. datamule/parser/document_parsing/form_d_parser.py +70 -0
  38. datamule/parser/document_parsing/generalized_item_parser.py +78 -0
  39. datamule/parser/document_parsing/generalized_xml_parser.py +0 -0
  40. datamule/parser/document_parsing/helper.py +75 -0
  41. datamule/parser/document_parsing/information_table_parser_13fhr.py +41 -0
  42. datamule/parser/document_parsing/insider_trading_parser.py +158 -0
  43. datamule/parser/document_parsing/mappings.py +95 -0
  44. datamule/parser/document_parsing/n_port_p_parser.py +70 -0
  45. datamule/parser/document_parsing/sec_parser.py +73 -0
  46. datamule/parser/document_parsing/sgml_parser.py +94 -0
  47. datamule/parser/sgml_parsing/sgml_parser_cy.c +19082 -0
  48. datamule/parser/sgml_parsing/sgml_parser_cy.cpython-312-x86_64-linux-gnu.so +0 -0
  49. datamule/portfolio.py +21 -0
  50. datamule/submission.py +67 -0
  51. datamule-0.422.dist-info/METADATA +31 -0
  52. datamule-0.422.dist-info/RECORD +54 -0
  53. datamule-0.422.dist-info/WHEEL +6 -0
  54. datamule-0.422.dist-info/top_level.txt +1 -0
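
Once installed, the module paths in this list resolve directly as import paths. A quick sanity check (a sketch only, assuming the package is installed via pip install datamule==0.422 or from this wheel file directly):

# Verify that the wheel's key modules import, including the compiled Cython
# extension sgml_parser_cy (built for CPython 3.12 on manylinux x86_64).
import importlib

for module in (
    "datamule",
    "datamule.downloader.downloader",
    "datamule.downloader.premiumdownloader",
    "datamule.parser.sgml_parsing.sgml_parser_cy",
):
    importlib.import_module(module)
    print(f"ok: {module}")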
datamule/downloader/downloader.py
@@ -0,0 +1,364 @@
+ import asyncio
+ import aiohttp
+ import os
+ from tqdm import tqdm
+ from datetime import datetime
+ from urllib.parse import urlencode
+ import aiofiles
+ import json
+ import time
+ from collections import deque
+
+ from ..helper import identifier_to_cik, load_package_csv, fix_filing_url, headers
+ from ..parser.sgml_parsing.sgml_parser_cy import parse_sgml_submission
+
+ class RetryException(Exception):
+     def __init__(self, url, retry_after=601):
+         self.url = url
+         self.retry_after = retry_after
+
+ class PreciseRateLimiter:
+     def __init__(self, rate, interval=1.0):
+         self.rate = rate  # requests per interval
+         self.interval = interval  # in seconds
+         self.token_time = self.interval / self.rate  # time per token
+         self.last_time = time.time()
+         self.lock = asyncio.Lock()
+
+     async def acquire(self):
+         async with self.lock:
+             now = time.time()
+             wait_time = self.last_time + self.token_time - now
+             if wait_time > 0:
+                 await asyncio.sleep(wait_time)
+             self.last_time = time.time()
+             return True
+
+     async def __aenter__(self):
+         await self.acquire()
+         return self
+
+     async def __aexit__(self, exc_type, exc, tb):
+         pass
+
+ class RateMonitor:
+     def __init__(self, window_size=1.0):
+         self.window_size = window_size
+         self.requests = deque()
+         self._lock = asyncio.Lock()
+
+     async def add_request(self, size_bytes):
+         async with self._lock:
+             now = time.time()
+             self.requests.append((now, size_bytes))
+             while self.requests and self.requests[0][0] < now - self.window_size:
+                 self.requests.popleft()
+
+     def get_current_rates(self):
+         now = time.time()
+         while self.requests and self.requests[0][0] < now - self.window_size:
+             self.requests.popleft()
+
+         if not self.requests:
+             return 0, 0
+
+         request_count = len(self.requests)
+         byte_count = sum(size for _, size in self.requests)
+
+         requests_per_second = request_count / self.window_size
+         mb_per_second = (byte_count / 1024 / 1024) / self.window_size
+
+         return round(requests_per_second, 1), round(mb_per_second, 2)
+
+ class Downloader:
+     def __init__(self):
+         self.headers = headers
+         self.limiter = PreciseRateLimiter(5)  # 5 requests per second
+         self.session = None
+         self.parse_filings = True
+         self.download_queue = asyncio.Queue()
+         self.rate_monitor = RateMonitor()
+         self.current_pbar = None
+         self.connection_semaphore = asyncio.Semaphore(5)
+
+     def update_progress_description(self):
+         if self.current_pbar:
+             reqs_per_sec, mb_per_sec = self.rate_monitor.get_current_rates()
+             self.current_pbar.set_description(
+                 f"Progress [Rate: {reqs_per_sec}/s | {mb_per_sec} MB/s]"
+             )
+
+     async def __aenter__(self):
+         await self._init_session()
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         await self._close()
+
+     async def _init_session(self):
+         if not self.session:
+             self.session = aiohttp.ClientSession(headers=self.headers)
+
+     async def _close(self):
+         if self.session:
+             await self.session.close()
+             self.session = None
+
+     async def _fetch_json(self, url):
+         """Fetch JSON with rate monitoring."""
+         async with self.limiter:
+             try:
+                 url = fix_filing_url(url)
+                 async with self.session.get(url) as response:
+                     if response.status == 429:
+                         raise RetryException(url)
+                     response.raise_for_status()
+                     content = await response.read()
+                     await self.rate_monitor.add_request(len(content))
+                     self.update_progress_description()
+                     return await response.json()
+             except aiohttp.ClientResponseError as e:
+                 if e.status == 429:
+                     raise RetryException(url)
+                 raise
+
+     async def _get_filing_urls_from_efts(self, base_url):
+         """Fetch filing URLs from EFTS in batches."""
+         start = 0
+         page_size = 100
+         urls = []
+
+         data = await self._fetch_json(f"{base_url}&from=0&size=1")
+         if not data or 'hits' not in data:
+             return []
+
+         total_hits = data['hits']['total']['value']
+         if not total_hits:
+             return []
+
+         pbar = tqdm(total=total_hits, desc="Fetching URLs [Rate: 0/s | 0 MB/s]")
+         self.current_pbar = pbar
+
+         while start < total_hits:
+             try:
+                 tasks = [
+                     self._fetch_json(f"{base_url}&from={start + i * page_size}&size={page_size}")
+                     for i in range(10)
+                 ]
+
+                 results = await asyncio.gather(*tasks)
+
+                 for data in results:
+                     if data and 'hits' in data:
+                         hits = data['hits']['hits']
+                         if hits:
+                             batch_urls = [
+                                 f"https://www.sec.gov/Archives/edgar/data/{hit['_source']['ciks'][0]}/{hit['_id'].split(':')[0]}.txt"
+                                 for hit in hits
+                             ]
+                             urls.extend(batch_urls)
+                             pbar.update(len(hits))
+                             self.update_progress_description()
+
+                 start += 10 * page_size
+
+             except RetryException as e:
+                 print(f"\nRate limited. Sleeping for {e.retry_after} seconds...")
+                 await asyncio.sleep(e.retry_after)
+                 continue
+             except Exception as e:
+                 print(f"\nError fetching URLs batch at {start}: {str(e)}")
+                 break
+
+         pbar.close()
+         self.current_pbar = None
+         return urls
+
+     async def _download_file(self, url, filepath):
+         """Download single file with precise rate limiting."""
+         async with self.connection_semaphore:
+             async with self.limiter:
+                 try:
+                     url = fix_filing_url(url)
+                     async with self.session.get(url) as response:
+                         if response.status == 429:
+                             raise RetryException(url)
+                         response.raise_for_status()
+                         content = await response.read()
+                         await self.rate_monitor.add_request(len(content))
+                         self.update_progress_description()
+
+                     parsed_data = None
+                     if self.parse_filings:
+                         try:
+                             os.makedirs(os.path.dirname(filepath), exist_ok=True)
+                             async with aiofiles.open(filepath, 'wb') as f:
+                                 await f.write(content)
+
+                             parsed_data = parse_sgml_submission(
+                                 content=content.decode(),
+                                 output_dir=os.path.dirname(filepath) + f'/{url.split("/")[-1].split(".")[0].replace("-", "")}'
+                             )
+
+                             try:
+                                 os.remove(filepath)
+                             except Exception as e:
+                                 print(f"\nError deleting original file {filepath}: {str(e)}")
+
+                         except Exception as e:
+                             print(f"\nError parsing {url}: {str(e)}")
+                             try:
+                                 os.remove(filepath)
+                                 parsed_dir = os.path.dirname(filepath) + f'/{url.split("/")[-1].split(".")[0].replace("-", "")}'
+                                 if os.path.exists(parsed_dir):
+                                     import shutil
+                                     shutil.rmtree(parsed_dir)
+                             except Exception as e:
+                                 print(f"\nError cleaning up files for {url}: {str(e)}")
+                     else:
+                         os.makedirs(os.path.dirname(filepath), exist_ok=True)
+                         async with aiofiles.open(filepath, 'wb') as f:
+                             await f.write(content)
+
+                     return filepath, parsed_data
+
+                 except Exception as e:
+                     print(f"\nError downloading {url}: {str(e)}")
+                     return None
+
+     async def _download_worker(self, pbar):
+         """Worker to process download queue."""
+         while True:
+             try:
+                 url, filepath = await self.download_queue.get()
+                 result = await self._download_file(url, filepath)
+                 if result:
+                     pbar.update(1)
+                 self.download_queue.task_done()
+             except asyncio.CancelledError:
+                 break
+             except Exception as e:
+                 print(f"\nWorker error processing {url}: {str(e)}")
+                 self.download_queue.task_done()
+
+     async def _download_and_process(self, urls, output_dir):
+         """Queue-based download processing."""
+         results = []
+         parsed_results = []
+
+         pbar = tqdm(total=len(urls), desc="Downloading files [Rate: 0/s | 0 MB/s]")
+         self.current_pbar = pbar
+
+         for url in urls:
+             filename = url.split('/')[-1]
+             filepath = os.path.join(output_dir, filename)
+             await self.download_queue.put((url, filepath))
+
+         workers = [asyncio.create_task(self._download_worker(pbar))
+                    for _ in range(5)]  # Match number of workers to semaphore
+
+         await self.download_queue.join()
+
+         for worker in workers:
+             worker.cancel()
+
+         await asyncio.gather(*workers, return_exceptions=True)
+
+         pbar.close()
+         self.current_pbar = None
+         return results, parsed_results
+
+     def download_submissions(self, output_dir='filings', cik=None, ticker=None, submission_type=None, date=None, parse=True):
+         """Main method to download SEC filings."""
+         self.parse_filings = parse
+
+         async def _download():
+             async with self as downloader:
+                 if ticker is not None:
+                     cik_value = identifier_to_cik(ticker)
+                 else:
+                     cik_value = cik
+
+                 params = {}
+                 if cik_value:
+                     if isinstance(cik_value, list):
+                         params['ciks'] = ','.join(str(c).zfill(10) for c in cik_value)
+                     else:
+                         params['ciks'] = str(cik_value).zfill(10)
+
+                 params['forms'] = ','.join(submission_type) if isinstance(submission_type, list) else submission_type if submission_type else "-0"
+
+                 if isinstance(date, list):
+                     dates = [(d, d) for d in date]
+                 elif isinstance(date, tuple):
+                     dates = [date]
+                 else:
+                     date_str = date if date else f"2001-01-01,{datetime.now().strftime('%Y-%m-%d')}"
+                     start, end = date_str.split(',')
+                     dates = [(start, end)]
+
+                 all_filepaths = []
+                 all_parsed_data = []
+
+                 for start_date, end_date in dates:
+                     params['startdt'] = start_date
+                     params['enddt'] = end_date
+                     base_url = "https://efts.sec.gov/LATEST/search-index"
+                     efts_url = f"{base_url}?{urlencode(params, doseq=True)}"
+
+                     urls = await self._get_filing_urls_from_efts(efts_url)
+                     if urls:
+                         filepaths, parsed_data = await self._download_and_process(urls, output_dir)
+                         all_filepaths.extend(filepaths)
+                         all_parsed_data.extend(parsed_data)
+
+                 return all_filepaths, all_parsed_data
+
+         return asyncio.run(_download())
+
+     def download_company_concepts(self, output_dir='company_concepts', cik=None, ticker=None):
+         """Download company concept data."""
+         async def _download_concepts():
+             async with self as downloader:
+                 if ticker is not None:
+                     ciks = identifier_to_cik(ticker)
+                 elif cik:
+                     ciks = [cik] if not isinstance(cik, list) else cik
+                 else:
+                     company_tickers = load_package_csv('company_tickers')
+                     ciks = [company['cik'] for company in company_tickers]
+
+                 os.makedirs(output_dir, exist_ok=True)
+                 urls = [f'https://data.sec.gov/api/xbrl/companyfacts/CIK{str(cik).zfill(10)}.json' for cik in ciks]
+
+                 pbar = tqdm(total=len(urls), desc="Downloading concepts [Rate: 0/s | 0 MB/s]")
+                 self.current_pbar = pbar
+
+                 for url in urls:
+                     filename = url.split('/')[-1]
+                     filepath = os.path.join(output_dir, filename)
+                     await self.download_queue.put((url, filepath))
+
+                 workers = [asyncio.create_task(self._download_worker(pbar))
+                            for _ in range(5)]
+
+                 await self.download_queue.join()
+
+                 for worker in workers:
+                     worker.cancel()
+
+                 await asyncio.gather(*workers, return_exceptions=True)
+
+                 pbar.close()
+                 self.current_pbar = None
+
+                 results = []
+                 for url in urls:
+                     filename = url.split('/')[-1]
+                     filepath = os.path.join(output_dir, filename)
+                     if os.path.exists(filepath):
+                         results.append(filepath)
+
+                 return results
+
+         return asyncio.run(_download_concepts())
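
For orientation, here is a minimal usage sketch of the free EDGAR downloader added above. The import path follows the file list (datamule/downloader/downloader.py); the ticker, form type, and dates are illustrative placeholders rather than values taken from the package, and datamule's own __init__ may expose a shorter import path.

# Sketch only: assumes the wheel is installed and outbound access to SEC EDGAR.
from datamule.downloader.downloader import Downloader

downloader = Downloader()

# download_submissions() builds an EFTS full-text-search query from the
# ticker/CIK, form type, and date range, then downloads each matching SGML
# submission and, with parse=True, parses it into a per-submission folder.
downloader.download_submissions(
    output_dir='filings',
    ticker='AAPL',                        # or cik=320193
    submission_type='10-K',               # a single form or a list of forms
    date=('2020-01-01', '2023-12-31'),    # tuple -> one start/end range
    parse=True,
)

# download_company_concepts() fetches the XBRL company-facts JSON for each CIK.
downloader.download_company_concepts(output_dir='company_concepts', ticker='AAPL')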
datamule/downloader/premiumdownloader.py
@@ -0,0 +1,332 @@
+ import os
+ import asyncio
+ import aiohttp
+ from pathlib import Path
+ from tqdm import tqdm
+ import time
+ import shutil
+ import ssl
+ import zstandard as zstd
+ import io
+ import json
+ from concurrent.futures import ThreadPoolExecutor
+ from functools import partial
+ from queue import Queue, Empty
+ from threading import Thread
+ from datamule.parser.sgml_parsing.sgml_parser_cy import parse_sgml_submission
+ import urllib.parse
+ from ..helper import identifier_to_cik
+
+ class InsufficientBalanceError(Exception):
+     def __init__(self, required_cost, current_balance, total_urls):
+         self.required_cost = required_cost
+         self.current_balance = current_balance
+         self.total_urls = total_urls
+         message = (f"Insufficient balance. Required: ${required_cost:.4f}, "
+                    f"Current balance: ${current_balance:.4f}, "
+                    f"Total URLs: {total_urls}")
+         super().__init__(message)
+
+ class PremiumDownloader:
+     def __init__(self, api_key=None):
+         self.BASE_URL = "https://library.datamule.xyz/original/nc/"
+         self.API_BASE_URL = "https://sec-library.jgfriedman99.workers.dev/"
+         self.CHUNK_SIZE = 2 * 1024 * 1024
+         self.MAX_CONCURRENT_DOWNLOADS = 100
+         self.MAX_DECOMPRESSION_WORKERS = 16
+         self.MAX_PROCESSING_WORKERS = 16
+         self.QUEUE_SIZE = 10
+         if api_key is not None:
+             self._api_key = api_key
+
+     @property
+     def api_key(self):
+         return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')
+
+     @api_key.setter
+     def api_key(self, value):
+         if not value:
+             raise ValueError("API key cannot be empty")
+         self._api_key = value
+
+     def _log_error(self, output_dir, filename, error_msg):
+         error_file = os.path.join(output_dir, 'errors.json')
+         try:
+             if os.path.exists(error_file):
+                 with open(error_file, 'r') as f:
+                     errors = json.load(f)
+             else:
+                 errors = {}
+
+             errors[filename] = str(error_msg)
+
+             with open(error_file, 'w') as f:
+                 json.dump(errors, f, indent=2)
+         except Exception as e:
+             print(f"Failed to log error to {error_file}: {str(e)}")
+
+     async def _fetch_submissions(self, session, submission_type=None, cik=None, filing_date=None, page=1):
+         params = {
+             'api_key': self.api_key,
+             'page': page
+         }
+
+         if submission_type:
+             if isinstance(submission_type, list):
+                 params['submission_type'] = ','.join(str(x) for x in submission_type)
+             else:
+                 params['submission_type'] = str(submission_type)
+
+         if cik:
+             if isinstance(cik, list):
+                 params['cik'] = ','.join(str(x) for x in cik)
+             else:
+                 params['cik'] = str(cik)
+
+         if filing_date:
+             if isinstance(filing_date, tuple):
+                 params['startdt'] = str(filing_date[0])
+                 params['enddt'] = str(filing_date[1])
+             else:
+                 if isinstance(filing_date, list):
+                     params['filing_date'] = ','.join(str(x) for x in filing_date)
+                 else:
+                     params['filing_date'] = str(filing_date)
+
+         url = f"{self.API_BASE_URL}?{urllib.parse.urlencode(params)}"
+
+         async with session.get(url) as response:
+             data = await response.json()
+             if not data.get('success'):
+                 raise ValueError(f"API request failed: {data.get('error')}")
+
+             charges = data['metadata']['billing']['charges']
+             print(f"\nCost: ${charges['results']:.12f} downloads + ${charges['rows_read']:.12f} row reads = ${charges['total']:.12f}")
+             print(f"Balance: ${data['metadata']['billing']['remaining_balance']:.12f}")
+
+             urls = [f"{self.BASE_URL}{str(sub['accession_number']).zfill(18)}.sgml{'.zst' if sub.get('compressed', '').lower() == 'true' else ''}" for sub in data['data']]
+             return urls, data['metadata']['pagination']
+
+     class FileProcessor:
+         def __init__(self, output_dir, max_workers, queue_size, pbar, downloader):
+             self.processing_queue = Queue(maxsize=queue_size)
+             self.should_stop = False
+             self.processing_workers = []
+             self.output_dir = output_dir
+             self.max_workers = max_workers
+             self.batch_size = 10
+             self.pbar = pbar
+             self.downloader = downloader
+
+         def start_processing_workers(self):
+             for _ in range(self.max_workers):
+                 worker = Thread(target=self._processing_worker)
+                 worker.daemon = True
+                 worker.start()
+                 self.processing_workers.append(worker)
+
+         def _process_file(self, item):
+             filename, content = item
+             clean_name = filename[:-4] if filename.endswith('.zst') else filename
+             output_path = os.path.join(self.output_dir, Path(clean_name).stem)
+             try:
+                 parse_sgml_submission(None, output_dir=output_path, content=content)
+                 self.pbar.update(1)
+             except Exception as e:
+                 self.downloader._log_error(self.output_dir, filename, str(e))
+
+         def _processing_worker(self):
+             batch = []
+             while not self.should_stop:
+                 try:
+                     item = self.processing_queue.get(timeout=1)
+                     if item is None:
+                         break
+
+                     batch.append(item)
+
+                     if len(batch) >= self.batch_size or self.processing_queue.empty():
+                         for item in batch:
+                             self._process_file(item)
+                             self.processing_queue.task_done()
+                         batch = []
+
+                 except Empty:
+                     if batch:
+                         for item in batch:
+                             self._process_file(item)
+                             self.processing_queue.task_done()
+                         batch = []
+
+         def stop_workers(self):
+             self.should_stop = True
+             for _ in self.processing_workers:
+                 self.processing_queue.put(None)
+             for worker in self.processing_workers:
+                 worker.join()
+
+     def decompress_stream(self, compressed_chunks, filename, output_dir, processor):
+         dctx = zstd.ZstdDecompressor()
+         try:
+             input_buffer = io.BytesIO(b''.join(compressed_chunks))
+             decompressed_content = io.BytesIO()
+
+             with dctx.stream_reader(input_buffer) as reader:
+                 shutil.copyfileobj(reader, decompressed_content)
+
+             content = decompressed_content.getvalue().decode('utf-8')
+             processor.processing_queue.put((filename, content))
+             return True
+
+         except Exception as e:
+             self._log_error(output_dir, filename, f"Decompression error: {str(e)}")
+             return False
+         finally:
+             try:
+                 input_buffer.close()
+                 decompressed_content.close()
+             except:
+                 pass
+
+     def save_regular_file(self, chunks, filename, output_dir, processor):
+         try:
+             content = b''.join(chunks).decode('utf-8')
+             processor.processing_queue.put((filename, content))
+             return True
+
+         except Exception as e:
+             self._log_error(output_dir, filename, f"Error saving file: {str(e)}")
+             return False
+
+     async def download_and_process(self, session, url, semaphore, decompression_pool, output_dir, processor):
+         async with semaphore:
+             chunks = []
+             filename = url.split('/')[-1]
+
+             api_key = self.api_key
+             if not api_key:
+                 raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+             try:
+                 headers = {
+                     'Connection': 'keep-alive',
+                     'Accept-Encoding': 'gzip, deflate, br',
+                     'Authorization': f'Bearer {api_key}'
+                 }
+
+                 async with session.get(url, headers=headers) as response:
+                     if response.status == 200:
+                         async for chunk in response.content.iter_chunked(self.CHUNK_SIZE):
+                             chunks.append(chunk)
+
+                         loop = asyncio.get_running_loop()
+                         if filename.endswith('.zst'):
+                             success = await loop.run_in_executor(
+                                 decompression_pool,
+                                 partial(self.decompress_stream, chunks, filename, output_dir, processor)
+                             )
+                         else:
+                             success = await loop.run_in_executor(
+                                 decompression_pool,
+                                 partial(self.save_regular_file, chunks, filename, output_dir, processor)
+                             )
+
+                         if not success:
+                             self._log_error(output_dir, filename, "Failed to process file")
+                     elif response.status == 401:
+                         self._log_error(output_dir, filename, "Authentication failed: Invalid API key")
+                         raise ValueError("Invalid API key")
+                     else:
+                         self._log_error(output_dir, filename, f"Download failed: Status {response.status}")
+             except Exception as e:
+                 self._log_error(output_dir, filename, str(e))
+
+     async def process_batch(self, urls, output_dir):
+         os.makedirs(output_dir, exist_ok=True)
+
+         with tqdm(total=len(urls), desc="Processing files") as pbar:
+             processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self)
+             processor.start_processing_workers()
+
+             semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
+             decompression_pool = ThreadPoolExecutor(max_workers=self.MAX_DECOMPRESSION_WORKERS)
+
+             connector = aiohttp.TCPConnector(
+                 limit=self.MAX_CONCURRENT_DOWNLOADS,
+                 force_close=False,
+                 ssl=ssl.create_default_context(),
+                 ttl_dns_cache=300,
+                 keepalive_timeout=60
+             )
+
+             async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=3600)) as session:
+                 tasks = [self.download_and_process(session, url, semaphore, decompression_pool, output_dir, processor) for url in urls]
+                 await asyncio.gather(*tasks, return_exceptions=True)
+
+             processor.processing_queue.join()
+             processor.stop_workers()
+             decompression_pool.shutdown()
+
+     async def download_all_pages(self, submission_type=None, cik=None, filing_date=None, output_dir="download"):
+         connector = aiohttp.TCPConnector(ssl=ssl.create_default_context())
+         async with aiohttp.ClientSession(connector=connector) as session:
+             try:
+                 urls, pagination = await self._fetch_submissions(session, submission_type=submission_type, cik=cik, filing_date=filing_date, page=1)
+                 total_urls = urls.copy()
+                 current_page = 1
+
+                 while pagination.get('hasMore', False):
+                     current_page += 1
+                     more_urls, pagination = await self._fetch_submissions(session, submission_type=submission_type, cik=cik, filing_date=filing_date, page=current_page)
+                     total_urls.extend(more_urls)
+
+                 if total_urls:
+                     start_time = time.time()
+                     await self.process_batch(total_urls, output_dir)
+                     elapsed_time = time.time() - start_time
+                     print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
+                 else:
+                     print("No submissions found matching the criteria")
+
+             except InsufficientBalanceError as e:
+                 error_msg = {
+                     "error": "insufficient_balance",
+                     "required_cost": e.required_cost,
+                     "current_balance": e.current_balance,
+                     "total_urls": e.total_urls,
+                     "additional_funds_needed": e.required_cost - e.current_balance
+                 }
+                 self._log_error(output_dir, "balance_check", error_msg)
+                 return
+
+     def download_submissions(self, submission_type=None, cik=None, ticker=None, filing_date=None, output_dir="download"):
+         if self.api_key is None:
+             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+         if filing_date is not None:
+             if isinstance(filing_date, str):
+                 filing_date = int(filing_date.replace('-', ''))
+             elif isinstance(filing_date, list):
+                 filing_date = [int(x.replace('-', '')) for x in filing_date]
+             elif isinstance(filing_date, tuple):
+                 filing_date = (int(filing_date[0].replace('-', '')), int(filing_date[1].replace('-', '')))
+
+         if ticker is not None:
+             cik = identifier_to_cik(ticker)
+
+         if cik is not None:
+             if isinstance(cik, str):
+                 cik = [int(cik)]
+             elif isinstance(cik, int):
+                 cik = [cik]
+             elif isinstance(cik, list):
+                 cik = [int(x) for x in cik]
+
+         async def _download():
+             try:
+                 await self.download_all_pages(submission_type=submission_type, cik=cik, filing_date=filing_date, output_dir=output_dir)
+             except Exception as e:
+                 if not isinstance(e, InsufficientBalanceError):
+                     self._log_error(output_dir, "download_error", str(e))
+
+         asyncio.run(_download())
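
Likewise, a minimal usage sketch of the premium downloader added above. It assumes a valid datamule API key (here read from DATAMULE_API_KEY) and sufficient account balance; the filter values are illustrative placeholders, not package defaults.

# Sketch only: the key handling and filters below are assumptions for illustration.
import os
from datamule.downloader.premiumdownloader import PremiumDownloader

downloader = PremiumDownloader(api_key=os.getenv('DATAMULE_API_KEY'))

# download_submissions() pages through the hosted submission index, streams each
# (optionally zstd-compressed) SGML file, and hands it to parse_sgml_submission.
# filing_date accepts a 'YYYY-MM-DD' string, a list of dates, or a (start, end)
# tuple; ticker/cik inputs are normalized to CIK integers before the request.
downloader.download_submissions(
    submission_type=['10-K', '10-Q'],
    ticker='AAPL',
    filing_date=('2023-01-01', '2023-12-31'),
    output_dir='download',
)

# Per-file failures (bad key, decompression or parse errors) are recorded in
# download/errors.json rather than raised to the caller.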