datamule 1.6.1__py3-none-any.whl → 1.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/portfolio.py +102 -18
- datamule/portfolio_compression_utils.py +291 -0
- datamule/seclibrary/downloader.py +163 -161
- datamule/submission.py +82 -186
- datamule/utils/construct_submissions_data.py +4 -4
- {datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/METADATA +1 -1
- {datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/RECORD +9 -8
- {datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/WHEEL +0 -0
- {datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/top_level.txt +0 -0
datamule/seclibrary/downloader.py

@@ -8,13 +8,15 @@ import ssl
 import zstandard as zstd
 import io
 import json
+import tarfile
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from queue import Queue, Empty
-from threading import Thread
+from threading import Thread, Lock
 from .query import query
 from os import cpu_count
-from secsgml import
+from secsgml import parse_sgml_content_into_memory
+from secsgml.utils import bytes_to_str



@@ -24,25 +26,19 @@ class Downloader:
         self.CHUNK_SIZE = 2 * 1024 * 1024
         self.MAX_CONCURRENT_DOWNLOADS = 100
         self.MAX_DECOMPRESSION_WORKERS = cpu_count()
-        self.
-        self.QUEUE_SIZE = 10
+        self.MAX_TAR_WORKERS = cpu_count()
         if api_key is not None:
             self._api_key = api_key
-        # Create a shared event loop for async operations
         self.loop = asyncio.new_event_loop()
-        # Create a thread to run the event loop
         self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
         self.loop_thread.start()
-        # Create a queue for async tasks
         self.async_queue = Queue()

     def _run_event_loop(self):
-        """Run the event loop in a separate thread"""
         asyncio.set_event_loop(self.loop)
         self.loop.run_forever()

     def _run_coroutine(self, coro):
-        """Run a coroutine in the event loop and return its result"""
         future = asyncio.run_coroutine_threadsafe(coro, self.loop)
         return future.result()

@@ -72,65 +68,94 @@ class Downloader:
         except Exception as e:
             print(f"Failed to log error to {error_file}: {str(e)}")

-    class
-        def __init__(self, output_dir,
-            self.processing_queue = Queue(maxsize=queue_size)
-            self.should_stop = False
-            self.processing_workers = []
+    class TarManager:
+        def __init__(self, output_dir, num_tar_files, max_batch_size=1024*1024*1024):
             self.output_dir = output_dir
-            self.
-            self.
-            self.
-            self.
-            self.
-            self.
-            self.
-
-
-
-
-
-
-            self.
-
-
-
-
-
-
-            self.
-
-
-
-
+            self.num_tar_files = num_tar_files
+            self.max_batch_size = max_batch_size
+            self.tar_files = {}
+            self.tar_locks = {}
+            self.file_counters = {}
+            self.tar_sizes = {}
+            self.tar_sequences = {}
+
+            for i in range(num_tar_files):
+                tar_path = os.path.join(output_dir, f'batch_{i:03d}_001.tar')
+                self.tar_files[i] = tarfile.open(tar_path, 'w')
+                self.tar_locks[i] = Lock()
+                self.file_counters[i] = 0
+                self.tar_sizes[i] = 0
+                self.tar_sequences[i] = 1
+
+        def get_tar_index(self, filename):
+            return hash(filename) % self.num_tar_files
+
+        def write_submission(self, filename, metadata, documents, standardize_metadata):
+            tar_index = self.get_tar_index(filename)
+            accession_num = filename.split('.')[0]
+
+            metadata_str = bytes_to_str(metadata, lower=False)
+            metadata_json = json.dumps(metadata_str).encode('utf-8')
+            submission_size = len(metadata_json) + sum(len(doc) for doc in documents)
+
+            with self.tar_locks[tar_index]:
+                if self.tar_sizes[tar_index] > 0 and self.tar_sizes[tar_index] + submission_size > self.max_batch_size:
+                    tar = self.tar_files[tar_index]
+                    tar.close()
+
+                    self.tar_sequences[tar_index] += 1
+                    new_tar_path = os.path.join(self.output_dir, f'batch_{tar_index:03d}_{self.tar_sequences[tar_index]:03d}.tar')
+                    self.tar_files[tar_index] = tarfile.open(new_tar_path, 'w')
+                    self.file_counters[tar_index] = 0
+                    self.tar_sizes[tar_index] = 0
+
+                tar = self.tar_files[tar_index]
+
                 try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    tarinfo = tarfile.TarInfo(name=f'{accession_num}/metadata.json')
+                    tarinfo.size = len(metadata_json)
+                    tar.addfile(tarinfo, io.BytesIO(metadata_json))
+
+                    for file_num, content in enumerate(documents):
+                        doc_name = self._get_document_name(metadata, file_num, standardize_metadata)
+                        tarinfo = tarfile.TarInfo(name=f'{accession_num}/{doc_name}')
+                        tarinfo.size = len(content)
+                        tar.addfile(tarinfo, io.BytesIO(content))
+
+                    self.file_counters[tar_index] += 1
+                    self.tar_sizes[tar_index] += submission_size
+                    return True
+
+                except Exception as e:
+                    print(f"Error writing {filename} to tar {tar_index}: {str(e)}")
+                    return False
+
+        def _get_document_name(self, metadata, file_num, standardize_metadata):
+            if standardize_metadata:
+                documents_key = b'documents'
+                filename_key = b'filename'
+                sequence_key = b'sequence'
+            else:
+                documents_key = b'DOCUMENTS'
+                filename_key = b'FILENAME'
+                sequence_key = b'SEQUENCE'
+
+            doc_metadata = metadata[documents_key][file_num]
+            filename = doc_metadata.get(filename_key)
+            if filename:
+                return filename.decode('utf-8')
+            else:
+                sequence = doc_metadata.get(sequence_key, b'document')
+                return sequence.decode('utf-8') + '.txt'
+
+        def close_all(self):
+            for i, tar in self.tar_files.items():
+                try:
+                    tar.close()
+                except Exception as e:
+                    print(f"Error closing tar {i}: {str(e)}")

-    def
+    def decompress_and_parse_and_write(self, compressed_chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir):
         dctx = zstd.ZstdDecompressor()
         try:
             input_buffer = io.BytesIO(b''.join(compressed_chunks))
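The new TarManager replaces the old queue-based writer: each submission is routed to one of N open tar archives via hash(filename) % num_tar_files, every shard has its own Lock, and a shard rolls over to the next batch_{shard:03d}_{sequence:03d}.tar once writing the next submission would push it past max_batch_size. Inside each archive a submission is laid out as accession_number/metadata.json plus its documents. Below is a minimal sketch (not part of the diff) of reading one resulting archive back; the batch file path is a placeholder, while the member layout follows write_submission above.

import json
import tarfile
from collections import defaultdict

# Placeholder path; TarManager names shards batch_<shard>_<sequence>.tar inside output_dir.
tar_path = "downloads/batch_000_001.tar"

submissions = defaultdict(dict)
with tarfile.open(tar_path, "r") as tar:
    for member in tar.getmembers():
        # Members are named "<accession_number>/<document name>".
        accession, name = member.name.split("/", 1)
        data = tar.extractfile(member).read()
        if name == "metadata.json":
            submissions[accession]["metadata"] = json.loads(data)
        else:
            submissions[accession].setdefault("documents", {})[name] = data

for accession, parts in submissions.items():
    print(accession, sorted(parts.get("documents", {})))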
@@ -140,11 +165,19 @@ class Downloader:
                 shutil.copyfileobj(reader, decompressed_content)

             content = decompressed_content.getvalue()
-
-
+
+            metadata, documents = parse_sgml_content_into_memory(
+                bytes_content=content,
+                filter_document_types=keep_document_types,
+                keep_filtered_metadata=keep_filtered_metadata,
+                standardize_metadata=standardize_metadata
+            )
+
+            success = tar_manager.write_submission(filename, metadata, documents, standardize_metadata)
+            return success

         except Exception as e:
-            self._log_error(output_dir, filename, f"Decompression error: {str(e)}")
+            self._log_error(output_dir, filename, f"Decompression/parsing error: {str(e)}")
             return False
         finally:
             try:
@@ -153,17 +186,25 @@ class Downloader:
             except:
                 pass

-    def
+    def parse_and_write_regular_file(self, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir):
         try:
             content = b''.join(chunks)
-
-
+
+            metadata, documents = parse_sgml_content_into_memory(
+                bytes_content=content,
+                filter_document_types=keep_document_types,
+                keep_filtered_metadata=keep_filtered_metadata,
+                standardize_metadata=standardize_metadata
+            )
+
+            success = tar_manager.write_submission(filename, metadata, documents, standardize_metadata)
+            return success

         except Exception as e:
-            self._log_error(output_dir, filename, f"
+            self._log_error(output_dir, filename, f"Parsing error: {str(e)}")
             return False

-    async def download_and_process(self, session, url, semaphore, decompression_pool, output_dir,
+    async def download_and_process(self, session, url, semaphore, decompression_pool, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir, pbar):
         async with semaphore:
             chunks = []
             filename = url.split('/')[-1]
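Both new helpers follow the same shape: obtain the raw submission bytes (stream-decompressing with zstandard for .zst downloads), parse them in memory with parse_sgml_content_into_memory, and hand the result to TarManager.write_submission instead of writing loose files. A standalone sketch of just the decompression step, mirroring the calls visible in decompress_and_parse_and_write (chunk joining, stream_reader, copyfileobj); the function name is illustrative.

import io
import shutil
import zstandard as zstd

def decompress_chunks(compressed_chunks):
    # Join the downloaded chunks and stream-decompress them fully into memory.
    dctx = zstd.ZstdDecompressor()
    input_buffer = io.BytesIO(b"".join(compressed_chunks))
    decompressed = io.BytesIO()
    with dctx.stream_reader(input_buffer) as reader:
        shutil.copyfileobj(reader, decompressed)
    return decompressed.getvalue()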
@@ -188,70 +229,70 @@ class Downloader:
                         if filename.endswith('.zst'):
                             success = await loop.run_in_executor(
                                 decompression_pool,
-                                partial(self.
+                                partial(self.decompress_and_parse_and_write, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
                             )
                         else:
                             success = await loop.run_in_executor(
                                 decompression_pool,
-                                partial(self.
+                                partial(self.parse_and_write_regular_file, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
                             )

                         if not success:
                             self._log_error(output_dir, filename, "Failed to process file")
+
                     elif response.status == 401:
                         self._log_error(output_dir, filename, "Authentication failed: Invalid API key")
                         raise ValueError("Invalid API key")
                     else:
                         self._log_error(output_dir, filename, f"Download failed: Status {response.status}")
+
+                    pbar.update(1)
+
             except Exception as e:
                 self._log_error(output_dir, filename, str(e))
+                pbar.update(1)

-    async def process_batch(self, urls, output_dir, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
+    async def process_batch(self, urls, output_dir, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True, max_batch_size=1024*1024*1024):
         os.makedirs(output_dir, exist_ok=True)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        num_tar_files = min(self.MAX_TAR_WORKERS, len(urls))
+
+        tar_manager = self.TarManager(output_dir, num_tar_files, max_batch_size)
+
+        try:
+            with tqdm(total=len(urls), desc="Processing files") as pbar:
+                semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
+                decompression_pool = ThreadPoolExecutor(max_workers=self.MAX_DECOMPRESSION_WORKERS)
+
+                connector = aiohttp.TCPConnector(
+                    limit=self.MAX_CONCURRENT_DOWNLOADS,
+                    force_close=False,
+                    ssl=ssl.create_default_context(),
+                    ttl_dns_cache=300,
+                    keepalive_timeout=60
+                )
+
+                async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=30)) as session:
+                    tasks = [
+                        self.download_and_process(
+                            session, url, semaphore, decompression_pool,
+                            keep_document_types, keep_filtered_metadata, standardize_metadata,
+                            tar_manager, output_dir, pbar
+                        )
+                        for url in urls
+                    ]
+                    await asyncio.gather(*tasks, return_exceptions=True)
+
+                decompression_pool.shutdown()
+
+        finally:
+            tar_manager.close_all()

     def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True,
-        skip_accession_numbers=[]):
-        """
-        Query SEC filings and download/process them.
-
-        Parameters:
-        - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
-        - cik: Company CIK number(s), string, int, or list
-        - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
-        - output_dir: Directory to save downloaded files
-        - accession_numbers: List of specific accession numbers to download
-        - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
-        - keep_filtered_metadata: Whether to keep metadata for filtered documents
-        """
+        skip_accession_numbers=[], max_batch_size=1024*1024*1024):
         if self.api_key is None:
             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")

-        # Query the SEC filings first - before starting any async operations
         print("Querying SEC filings...")
         filings = query(
             submission_type=submission_type,
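process_batch now sizes the shard pool as min(MAX_TAR_WORKERS, len(urls)), builds one TarManager per batch, and always closes every shard in the finally block; the new max_batch_size keyword flows down from the public methods. A hedged usage sketch of the updated Downloader.download signature follows; the import path is assumed from the wheel layout (datamule/seclibrary/downloader.py), and the filter values are placeholders.

from datamule.seclibrary.downloader import Downloader  # assumed import path

downloader = Downloader(api_key="YOUR_DATAMULE_API_KEY")  # or set DATAMULE_API_KEY

downloader.download(
    submission_type="10-K",                        # placeholder filters
    filing_date=("2023-01-01", "2023-12-31"),
    output_dir="downloads",
    keep_document_types=["10-K"],
    max_batch_size=512 * 1024 * 1024,              # new in 1.6.3: roll a shard before it exceeds ~512 MB
)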
@@ -260,19 +301,14 @@ class Downloader:
             api_key=self.api_key
         )

-
-        # After querying but before generating URLs
         if accession_numbers:
             accession_numbers = [str(int(item.replace('-',''))) for item in accession_numbers]
             filings = [filing for filing in filings if filing['accession_number'] in accession_numbers]

-
         if skip_accession_numbers:
             skip_accession_numbers = [int(item.replace('-','')) for item in skip_accession_numbers]
             filings = [filing for filing in filings if filing['accession_number'] not in skip_accession_numbers]

-        # Generate URLs from the query results
-
         print(f"Generating URLs for {len(filings)} filings...")
         urls = []
         for item in filings:
@@ -285,38 +321,21 @@ class Downloader:
             print("No submissions found matching the criteria")
             return

-        # Remove duplicates
         urls = list(set(urls))

-        # Now start the async processing
         start_time = time.time()

-
-        asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata))
+        asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata, max_batch_size=max_batch_size))

-        # Calculate and display performance metrics
         elapsed_time = time.time() - start_time
         print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
         print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")

     def __del__(self):
-        """Cleanup when the downloader is garbage collected"""
         if hasattr(self, 'loop') and self.loop.is_running():
             self.loop.call_soon_threadsafe(self.loop.stop)

-
-
-    def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
-        """
-        Download and process SEC filings using specific filenames.
-
-        Parameters:
-        - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
-        - output_dir: Directory to save downloaded files
-        - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
-        - keep_filtered_metadata: Whether to keep metadata for filtered documents
-        - standardize_metadata: Whether to standardize metadata format
-        """
+    def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True, max_batch_size=1024*1024*1024):
         if self.api_key is None:
             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")

@@ -326,27 +345,23 @@ class Downloader:
         if not isinstance(filenames, (list, tuple)):
             filenames = [filenames]

-        # Validate filenames format
         for filename in filenames:
             if not isinstance(filename, str):
                 raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
             if not (filename.endswith('.sgml') or filename.endswith('.sgml.zst')):
                 raise ValueError(f"Invalid filename format: {filename}. Expected .sgml or .sgml.zst extension.")

-        # Generate URLs directly from filenames
         print(f"Generating URLs for {len(filenames)} files...")
         urls = []
         for filename in filenames:
             url = f"{self.BASE_URL}{filename}"
             urls.append(url)

-        # Remove duplicates while preserving order
         seen = set()
         urls = [url for url in urls if not (url in seen or seen.add(url))]

         print(f"Downloading {len(urls)} files...")

-        # Process the batch asynchronously using existing infrastructure
         start_time = time.time()

         asyncio.run(self.process_batch(
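download_files_using_filename keeps its .sgml / .sgml.zst validation and now forwards max_batch_size to process_batch as well. Continuing the sketch above, the example filenames are the ones cited in the removed docstring:

downloader.download_files_using_filename(
    filenames=[
        "000091205797006494.sgml",      # plain SGML submission
        "000100704297000007.sgml.zst",  # zstd-compressed submission
    ],
    output_dir="downloads",
    max_batch_size=1024 * 1024 * 1024,  # default: 1 GiB per batch tar
)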
@@ -354,33 +369,19 @@ class Downloader:
             output_dir,
             keep_document_types=keep_document_types,
             keep_filtered_metadata=keep_filtered_metadata,
-            standardize_metadata=standardize_metadata
+            standardize_metadata=standardize_metadata,
+            max_batch_size=max_batch_size
         ))

-        # Calculate and display performance metrics
         elapsed_time = time.time() - start_time
         print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
         print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")


 def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
-             skip_accession_numbers=[]):
-    """
-    Query SEC filings and download/process them.
-
-    Parameters:
-    - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
-    - cik: Company CIK number(s), string, int, or list
-    - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
-    - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
-    - output_dir: Directory to save downloaded files
-    - accession_numbers: List of specific accession numbers to download
-    - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
-    - keep_filtered_metadata: Whether to keep metadata for filtered documents
-    """
+             skip_accession_numbers=[], max_batch_size=1024*1024*1024):
     if accession_numbers:
         accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
-    # check if acc no is empty list
     elif accession_numbers == []:
         raise ValueError("Applied filter resulted in empty accession numbers list")
     downloader = Downloader(api_key=api_key)
@@ -393,5 +394,6 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
         keep_document_types=keep_document_types,
         keep_filtered_metadata=keep_filtered_metadata,
         standardize_metadata=standardize_metadata,
-        skip_accession_numbers=skip_accession_numbers
-
+        skip_accession_numbers=skip_accession_numbers,
+        max_batch_size=max_batch_size
+    )
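The module-level download() convenience function gains the same max_batch_size keyword and now passes it through to Downloader.download along with skip_accession_numbers. A short sketch under the same assumed import path; the CIK and dates are placeholders:

from datamule.seclibrary.downloader import download  # assumed import path

download(
    submission_type=["10-K", "10-Q"],
    cik=320193,                                  # placeholder CIK
    filing_date=("2024-01-01", "2024-06-30"),
    api_key="YOUR_DATAMULE_API_KEY",
    output_dir="downloads",
    max_batch_size=2 * 1024 * 1024 * 1024,       # 2 GiB batch tars
)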