datamule 1.6.1__py3-none-any.whl → 1.6.3__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -8,13 +8,15 @@ import ssl
  import zstandard as zstd
  import io
  import json
+ import tarfile
  from concurrent.futures import ThreadPoolExecutor
  from functools import partial
  from queue import Queue, Empty
- from threading import Thread
+ from threading import Thread, Lock
  from .query import query
  from os import cpu_count
- from secsgml import write_sgml_file_to_tar
+ from secsgml import parse_sgml_content_into_memory
+ from secsgml.utils import bytes_to_str



@@ -24,25 +26,19 @@ class Downloader:
  self.CHUNK_SIZE = 2 * 1024 * 1024
  self.MAX_CONCURRENT_DOWNLOADS = 100
  self.MAX_DECOMPRESSION_WORKERS = cpu_count()
- self.MAX_PROCESSING_WORKERS = cpu_count()
- self.QUEUE_SIZE = 10
+ self.MAX_TAR_WORKERS = cpu_count()
  if api_key is not None:
  self._api_key = api_key
- # Create a shared event loop for async operations
  self.loop = asyncio.new_event_loop()
- # Create a thread to run the event loop
  self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
  self.loop_thread.start()
- # Create a queue for async tasks
  self.async_queue = Queue()

  def _run_event_loop(self):
- """Run the event loop in a separate thread"""
  asyncio.set_event_loop(self.loop)
  self.loop.run_forever()

  def _run_coroutine(self, coro):
- """Run a coroutine in the event loop and return its result"""
  future = asyncio.run_coroutine_threadsafe(coro, self.loop)
  return future.result()

@@ -72,65 +68,94 @@ class Downloader:
  except Exception as e:
  print(f"Failed to log error to {error_file}: {str(e)}")

- class FileProcessor:
- def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=[], keep_filtered_metadata=False,standardize_metadata=True):
- self.processing_queue = Queue(maxsize=queue_size)
- self.should_stop = False
- self.processing_workers = []
+ class TarManager:
+ def __init__(self, output_dir, num_tar_files, max_batch_size=1024*1024*1024):
  self.output_dir = output_dir
- self.max_workers = max_workers
- self.batch_size = 50
- self.pbar = pbar
- self.downloader = downloader
- self.keep_document_types = keep_document_types
- self.keep_filtered_metadata = keep_filtered_metadata
- self.standardize_metadata = standardize_metadata
-
- def start_processing_workers(self):
- for _ in range(self.max_workers):
- worker = Thread(target=self._processing_worker)
- worker.daemon = True
- worker.start()
- self.processing_workers.append(worker)
-
- def _process_file(self, item):
- filename, content = item
- output_path = os.path.join(self.output_dir, filename.split('.')[0] + '.tar')
- write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types, keep_filtered_metadata=self.keep_filtered_metadata,standardize_metadata=self.standardize_metadata)
-
- self.pbar.update(1)
-
- def _processing_worker(self):
- batch = []
- while not self.should_stop:
+ self.num_tar_files = num_tar_files
+ self.max_batch_size = max_batch_size
+ self.tar_files = {}
+ self.tar_locks = {}
+ self.file_counters = {}
+ self.tar_sizes = {}
+ self.tar_sequences = {}
+
+ for i in range(num_tar_files):
+ tar_path = os.path.join(output_dir, f'batch_{i:03d}_001.tar')
+ self.tar_files[i] = tarfile.open(tar_path, 'w')
+ self.tar_locks[i] = Lock()
+ self.file_counters[i] = 0
+ self.tar_sizes[i] = 0
+ self.tar_sequences[i] = 1
+
+ def get_tar_index(self, filename):
+ return hash(filename) % self.num_tar_files
+
+ def write_submission(self, filename, metadata, documents, standardize_metadata):
+ tar_index = self.get_tar_index(filename)
+ accession_num = filename.split('.')[0]
+
+ metadata_str = bytes_to_str(metadata, lower=False)
+ metadata_json = json.dumps(metadata_str).encode('utf-8')
+ submission_size = len(metadata_json) + sum(len(doc) for doc in documents)
+
+ with self.tar_locks[tar_index]:
+ if self.tar_sizes[tar_index] > 0 and self.tar_sizes[tar_index] + submission_size > self.max_batch_size:
+ tar = self.tar_files[tar_index]
+ tar.close()
+
+ self.tar_sequences[tar_index] += 1
+ new_tar_path = os.path.join(self.output_dir, f'batch_{tar_index:03d}_{self.tar_sequences[tar_index]:03d}.tar')
+ self.tar_files[tar_index] = tarfile.open(new_tar_path, 'w')
+ self.file_counters[tar_index] = 0
+ self.tar_sizes[tar_index] = 0
+
+ tar = self.tar_files[tar_index]
+
  try:
- item = self.processing_queue.get(timeout=1)
- if item is None:
- break
-
- batch.append(item)
-
- if len(batch) >= self.batch_size or self.processing_queue.empty():
- for item in batch:
- self._process_file(item)
- self.processing_queue.task_done()
- batch = []
-
- except Empty:
- if batch:
- for item in batch:
- self._process_file(item)
- self.processing_queue.task_done()
- batch = []
-
- def stop_workers(self):
- self.should_stop = True
- for _ in self.processing_workers:
- self.processing_queue.put(None)
- for worker in self.processing_workers:
- worker.join()
+ tarinfo = tarfile.TarInfo(name=f'{accession_num}/metadata.json')
+ tarinfo.size = len(metadata_json)
+ tar.addfile(tarinfo, io.BytesIO(metadata_json))
+
+ for file_num, content in enumerate(documents):
+ doc_name = self._get_document_name(metadata, file_num, standardize_metadata)
+ tarinfo = tarfile.TarInfo(name=f'{accession_num}/{doc_name}')
+ tarinfo.size = len(content)
+ tar.addfile(tarinfo, io.BytesIO(content))
+
+ self.file_counters[tar_index] += 1
+ self.tar_sizes[tar_index] += submission_size
+ return True
+
+ except Exception as e:
+ print(f"Error writing {filename} to tar {tar_index}: {str(e)}")
+ return False
+
+ def _get_document_name(self, metadata, file_num, standardize_metadata):
+ if standardize_metadata:
+ documents_key = b'documents'
+ filename_key = b'filename'
+ sequence_key = b'sequence'
+ else:
+ documents_key = b'DOCUMENTS'
+ filename_key = b'FILENAME'
+ sequence_key = b'SEQUENCE'
+
+ doc_metadata = metadata[documents_key][file_num]
+ filename = doc_metadata.get(filename_key)
+ if filename:
+ return filename.decode('utf-8')
+ else:
+ sequence = doc_metadata.get(sequence_key, b'document')
+ return sequence.decode('utf-8') + '.txt'
+
+ def close_all(self):
+ for i, tar in self.tar_files.items():
+ try:
+ tar.close()
+ except Exception as e:
+ print(f"Error closing tar {i}: {str(e)}")

- def decompress_stream(self, compressed_chunks, filename, output_dir, processor):
+ def decompress_and_parse_and_write(self, compressed_chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir):
  dctx = zstd.ZstdDecompressor()
  try:
  input_buffer = io.BytesIO(b''.join(compressed_chunks))
@@ -140,11 +165,19 @@ class Downloader:
  shutil.copyfileobj(reader, decompressed_content)

  content = decompressed_content.getvalue()
- processor.processing_queue.put((filename, content))
- return True
+
+ metadata, documents = parse_sgml_content_into_memory(
+ bytes_content=content,
+ filter_document_types=keep_document_types,
+ keep_filtered_metadata=keep_filtered_metadata,
+ standardize_metadata=standardize_metadata
+ )
+
+ success = tar_manager.write_submission(filename, metadata, documents, standardize_metadata)
+ return success

  except Exception as e:
- self._log_error(output_dir, filename, f"Decompression error: {str(e)}")
+ self._log_error(output_dir, filename, f"Decompression/parsing error: {str(e)}")
  return False
  finally:
  try:
@@ -153,17 +186,25 @@ class Downloader:
  except:
  pass

- def save_regular_file(self, chunks, filename, output_dir, processor):
+ def parse_and_write_regular_file(self, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir):
  try:
  content = b''.join(chunks)
- processor.processing_queue.put((filename, content))
- return True
+
+ metadata, documents = parse_sgml_content_into_memory(
+ bytes_content=content,
+ filter_document_types=keep_document_types,
+ keep_filtered_metadata=keep_filtered_metadata,
+ standardize_metadata=standardize_metadata
+ )
+
+ success = tar_manager.write_submission(filename, metadata, documents, standardize_metadata)
+ return success

  except Exception as e:
- self._log_error(output_dir, filename, f"Error saving file: {str(e)}")
+ self._log_error(output_dir, filename, f"Parsing error: {str(e)}")
  return False

- async def download_and_process(self, session, url, semaphore, decompression_pool, output_dir, processor):
+ async def download_and_process(self, session, url, semaphore, decompression_pool, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir, pbar):
  async with semaphore:
  chunks = []
  filename = url.split('/')[-1]
@@ -188,70 +229,70 @@ class Downloader:
  if filename.endswith('.zst'):
  success = await loop.run_in_executor(
  decompression_pool,
- partial(self.decompress_stream, chunks, filename, output_dir, processor)
+ partial(self.decompress_and_parse_and_write, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
  )
  else:
  success = await loop.run_in_executor(
  decompression_pool,
- partial(self.save_regular_file, chunks, filename, output_dir, processor)
+ partial(self.parse_and_write_regular_file, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
  )

  if not success:
  self._log_error(output_dir, filename, "Failed to process file")
+
  elif response.status == 401:
  self._log_error(output_dir, filename, "Authentication failed: Invalid API key")
  raise ValueError("Invalid API key")
  else:
  self._log_error(output_dir, filename, f"Download failed: Status {response.status}")
+
+ pbar.update(1)
+
  except Exception as e:
  self._log_error(output_dir, filename, str(e))
+ pbar.update(1)

- async def process_batch(self, urls, output_dir, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
+ async def process_batch(self, urls, output_dir, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True, max_batch_size=1024*1024*1024):
  os.makedirs(output_dir, exist_ok=True)

- with tqdm(total=len(urls), desc="Processing files") as pbar:
- processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self, keep_document_types=keep_document_types,
- keep_filtered_metadata=keep_filtered_metadata,standardize_metadata=standardize_metadata)
- processor.start_processing_workers()
-
- semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
- decompression_pool = ThreadPoolExecutor(max_workers=self.MAX_DECOMPRESSION_WORKERS)
-
- connector = aiohttp.TCPConnector(
- limit=self.MAX_CONCURRENT_DOWNLOADS,
- force_close=False,
- ssl=ssl.create_default_context(),
- ttl_dns_cache=300,
- keepalive_timeout=60
- )
-
- # timeout should be max 30s.
- async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=30)) as session:
- tasks = [self.download_and_process(session, url, semaphore, decompression_pool, output_dir, processor) for url in urls]
- await asyncio.gather(*tasks, return_exceptions=True)
-
- processor.processing_queue.join()
- processor.stop_workers()
- decompression_pool.shutdown()
+ num_tar_files = min(self.MAX_TAR_WORKERS, len(urls))
+
+ tar_manager = self.TarManager(output_dir, num_tar_files, max_batch_size)
+
+ try:
+ with tqdm(total=len(urls), desc="Processing files") as pbar:
+ semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
+ decompression_pool = ThreadPoolExecutor(max_workers=self.MAX_DECOMPRESSION_WORKERS)
+
+ connector = aiohttp.TCPConnector(
+ limit=self.MAX_CONCURRENT_DOWNLOADS,
+ force_close=False,
+ ssl=ssl.create_default_context(),
+ ttl_dns_cache=300,
+ keepalive_timeout=60
+ )
+
+ async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=30)) as session:
+ tasks = [
+ self.download_and_process(
+ session, url, semaphore, decompression_pool,
+ keep_document_types, keep_filtered_metadata, standardize_metadata,
+ tar_manager, output_dir, pbar
+ )
+ for url in urls
+ ]
+ await asyncio.gather(*tasks, return_exceptions=True)
+
+ decompression_pool.shutdown()
+
+ finally:
+ tar_manager.close_all()

  def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True,
- skip_accession_numbers=[]):
- """
- Query SEC filings and download/process them.
-
- Parameters:
- - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
- - cik: Company CIK number(s), string, int, or list
- - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
- - output_dir: Directory to save downloaded files
- - accession_numbers: List of specific accession numbers to download
- - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
- - keep_filtered_metadata: Whether to keep metadata for filtered documents
- """
+ skip_accession_numbers=[], max_batch_size=1024*1024*1024):
  if self.api_key is None:
  raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")

- # Query the SEC filings first - before starting any async operations
  print("Querying SEC filings...")
  filings = query(
  submission_type=submission_type,
@@ -260,19 +301,14 @@ class Downloader:
  api_key=self.api_key
  )

-
- # After querying but before generating URLs
  if accession_numbers:
  accession_numbers = [str(int(item.replace('-',''))) for item in accession_numbers]
  filings = [filing for filing in filings if filing['accession_number'] in accession_numbers]

-
  if skip_accession_numbers:
  skip_accession_numbers = [int(item.replace('-','')) for item in skip_accession_numbers]
  filings = [filing for filing in filings if filing['accession_number'] not in skip_accession_numbers]

- # Generate URLs from the query results
-
  print(f"Generating URLs for {len(filings)} filings...")
  urls = []
  for item in filings:
@@ -285,38 +321,21 @@ class Downloader:
  print("No submissions found matching the criteria")
  return

- # Remove duplicates
  urls = list(set(urls))

- # Now start the async processing
  start_time = time.time()

- # Process the batch asynchronously
- asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata))
+ asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata, max_batch_size=max_batch_size))

- # Calculate and display performance metrics
  elapsed_time = time.time() - start_time
  print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
  print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")

  def __del__(self):
- """Cleanup when the downloader is garbage collected"""
  if hasattr(self, 'loop') and self.loop.is_running():
  self.loop.call_soon_threadsafe(self.loop.stop)

-
-
- def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
- """
- Download and process SEC filings using specific filenames.
-
- Parameters:
- - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
- - output_dir: Directory to save downloaded files
- - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
- - keep_filtered_metadata: Whether to keep metadata for filtered documents
- - standardize_metadata: Whether to standardize metadata format
- """
+ def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True, max_batch_size=1024*1024*1024):
  if self.api_key is None:
  raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")

@@ -326,27 +345,23 @@ class Downloader:
  if not isinstance(filenames, (list, tuple)):
  filenames = [filenames]

- # Validate filenames format
  for filename in filenames:
  if not isinstance(filename, str):
  raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
  if not (filename.endswith('.sgml') or filename.endswith('.sgml.zst')):
  raise ValueError(f"Invalid filename format: {filename}. Expected .sgml or .sgml.zst extension.")

- # Generate URLs directly from filenames
  print(f"Generating URLs for {len(filenames)} files...")
  urls = []
  for filename in filenames:
  url = f"{self.BASE_URL}{filename}"
  urls.append(url)

- # Remove duplicates while preserving order
  seen = set()
  urls = [url for url in urls if not (url in seen or seen.add(url))]

  print(f"Downloading {len(urls)} files...")

- # Process the batch asynchronously using existing infrastructure
  start_time = time.time()

  asyncio.run(self.process_batch(
@@ -354,33 +369,19 @@ class Downloader:
  output_dir,
  keep_document_types=keep_document_types,
  keep_filtered_metadata=keep_filtered_metadata,
- standardize_metadata=standardize_metadata
+ standardize_metadata=standardize_metadata,
+ max_batch_size=max_batch_size
  ))

- # Calculate and display performance metrics
  elapsed_time = time.time() - start_time
  print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
  print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")


  def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
- skip_accession_numbers=[]):
- """
- Query SEC filings and download/process them.
-
- Parameters:
- - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
- - cik: Company CIK number(s), string, int, or list
- - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
- - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
- - output_dir: Directory to save downloaded files
- - accession_numbers: List of specific accession numbers to download
- - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
- - keep_filtered_metadata: Whether to keep metadata for filtered documents
- """
+ skip_accession_numbers=[], max_batch_size=1024*1024*1024):
  if accession_numbers:
  accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
- # check if acc no is empty list
  elif accession_numbers == []:
  raise ValueError("Applied filter resulted in empty accession numbers list")
  downloader = Downloader(api_key=api_key)
@@ -393,5 +394,6 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
  keep_document_types=keep_document_types,
  keep_filtered_metadata=keep_filtered_metadata,
  standardize_metadata=standardize_metadata,
- skip_accession_numbers=skip_accession_numbers
- )
+ skip_accession_numbers=skip_accession_numbers,
+ max_batch_size=max_batch_size
+ )
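
From the caller's side, the visible change is a new max_batch_size parameter (default 1024*1024*1024 bytes) on download(), download_files_using_filename(), and process_batch(), with output now written as batch_{shard:03d}_{sequence:03d}.tar archives whose members are named {accession}/metadata.json plus one entry per filed document. The sketch below is illustrative only: the import path, the example filters, and the shard filename chosen for reading are assumptions; only the download() signature and the tar member layout are taken from the diff above.

import json
import tarfile

from datamule import download  # assumed import path; the diff does not show where download() is exported

# Roll each shard's archive after ~256 MB instead of the 1 GiB default.
# api_key is omitted here, so the DATAMULE_API_KEY environment variable must be set.
download(
    submission_type='10-K',
    filing_date=('2024-01-01', '2024-03-31'),
    output_dir='downloads',
    max_batch_size=256 * 1024 * 1024,
)

# Read back one shard: each submission is stored under its accession number,
# with a metadata.json entry followed by the filed documents.
with tarfile.open('downloads/batch_000_001.tar') as tar:
    for member in tar.getmembers():
        if member.name.endswith('/metadata.json'):
            metadata = json.load(tar.extractfile(member))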