datamule 2.3.9__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datamule might be problematic.

datamule/book/book.py CHANGED
@@ -10,4 +10,9 @@ class Book:
      s3_transfer(datamule_bucket=datamule_bucket, s3_credentials=s3_credentials, max_workers=max_workers,
          errors_json_filename=errors_json_filename, retry_errors=retry_errors,
          force_daily=force_daily, cik=cik, submission_type=submission_type,
-         filing_date=filing_date, datamule_api_key=datamule_api_key,accession_number=accession)
+         filing_date=filing_date, datamule_api_key=datamule_api_key,accession_number=accession)
+
+
+ def download_filings_processed_r2():
+     pass
+

datamule/datamule/downloader.py CHANGED
@@ -19,8 +19,9 @@ from secsgml import parse_sgml_content_into_memory
  from secsgml.utils import bytes_to_str
  from .datamule_lookup import datamule_lookup
  from ..utils.format_accession import format_accession
+ from ..providers.providers import SEC_FILINGS_SGML_BUCKET_ENDPOINT

- # could be cleaned up
+ # TODO could be cleaned up

  # Set up logging
  logging.basicConfig(
@@ -33,7 +34,7 @@ logger = logging.getLogger(__name__)

  class Downloader:
      def __init__(self, api_key=None):
-         self.BASE_URL = "https://sec-library.datamule.xyz/"
+         self.BASE_URL = SEC_FILINGS_SGML_BUCKET_ENDPOINT
          self.CHUNK_SIZE = 2 * 1024 * 1024
          self.MAX_CONCURRENT_DOWNLOADS = 100
          self.MAX_DECOMPRESSION_WORKERS = cpu_count()
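
Both downloaders now read their bucket URLs from the shared providers module instead of hard-coding them. A minimal sketch of what that module exposes after this release (import path taken from the wheel's RECORD; the values are the ones shown in the providers.py hunk further down):

    from datamule.providers.providers import (
        SEC_FILINGS_SGML_BUCKET_ENDPOINT,
        SEC_FILINGS_TAR_BUCKET_ENDPOINT,
    )

    # Per this diff, the SGML bucket is unchanged while the tar bucket moved
    # from a /tar/ path to its own subdomain.
    print(SEC_FILINGS_SGML_BUCKET_ENDPOINT)  # https://sec-library.datamule.xyz/
    print(SEC_FILINGS_TAR_BUCKET_ENDPOINT)   # https://sec-library.tar.datamule.xyz/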

datamule/datamule/tar_downloader.py ADDED
@@ -0,0 +1,664 @@
+ import os
+ import asyncio
+ import aiohttp
+ from tqdm import tqdm
+ import time
+ import ssl
+ import zstandard as zstd
+ import io
+ import json
+ import tarfile
+ import logging
+ from concurrent.futures import ThreadPoolExecutor
+ from functools import partial
+ from threading import Lock
+ from os import cpu_count
+ from .datamule_lookup import datamule_lookup
+ from ..utils.format_accession import format_accession
+ from ..providers.providers import SEC_FILINGS_TAR_BUCKET_ENDPOINT
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s',
+     handlers=logging.getLogger().handlers,
+ )
+ logger = logging.getLogger(__name__)
+
+
+ class TarDownloader:
+     def __init__(self, api_key=None):
+         self.BASE_URL = SEC_FILINGS_TAR_BUCKET_ENDPOINT
+         self.CHUNK_SIZE = 2 * 1024 * 1024
+         self.MAX_CONCURRENT_DOWNLOADS = 100
+         self.MAX_EXTRACTION_WORKERS = cpu_count()
+         self.MAX_TAR_WORKERS = cpu_count()
+         self.RANGE_MERGE_THRESHOLD = 1024 # Merge ranges if gap <= 1024 bytes
+         if api_key is not None:
+             self._api_key = api_key
+         self.error_log_lock = Lock()
+
+     @property
+     def api_key(self):
+         return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')
+
+     @api_key.setter
+     def api_key(self, value):
+         if not value:
+             raise ValueError("API key cannot be empty")
+         self._api_key = value
+
+     def _log_error(self, output_dir, filename, error_msg):
+         error_file = os.path.join(output_dir, 'errors.json')
+         with self.error_log_lock:
+             try:
+                 if os.path.exists(error_file):
+                     with open(error_file, 'r') as f:
+                         errors = json.load(f)
+                 else:
+                     errors = {}
+
+                 errors[filename] = str(error_msg)
+
+                 with open(error_file, 'w') as f:
+                     json.dump(errors, f, indent=2)
+             except Exception as e:
+                 logger.error(f"Failed to log error to {error_file}: {str(e)}")
+
+     def _get_document_ranges(self, accession_num, keep_document_types, range_lookup_db=None):
+         """
+         Get byte ranges for requested document types.
+
+         Args:
+             accession_num: The accession number
+             keep_document_types: List of document types to retrieve
+             range_lookup_db: Future database connection for looking up ranges
+
+         Returns:
+             dict mapping document_type to (start_byte, end_byte)
+         """
+         if range_lookup_db is not None:
+             # Future: Query database for ranges
+             # return range_lookup_db.get_ranges(accession_num, keep_document_types)
+             pass
+
+         # Hardcoded ranges for now
+         ranges = {}
+         if 'metadata' in keep_document_types:
+             # Metadata is always first 128KB
+             ranges['metadata'] = (0, 131071)
+
+         return ranges
+
+     def _merge_ranges(self, ranges):
+         """
+         Merge overlapping or close ranges.
+
+         Args:
+             ranges: dict mapping document_type to (start_byte, end_byte)
+
+         Returns:
+             list of merged (start_byte, end_byte) tuples, sorted
+         """
+         if not ranges:
+             return []
+
+         # Extract and sort ranges by start byte
+         range_list = sorted(ranges.values(), key=lambda x: x[0])
+
+         merged = []
+         current_start, current_end = range_list[0]
+
+         for start, end in range_list[1:]:
+             # Check if ranges overlap or are within merge threshold
+             if start <= current_end + self.RANGE_MERGE_THRESHOLD:
+                 # Merge: extend current range
+                 current_end = max(current_end, end)
+             else:
+                 # No merge: save current range and start new one
+                 merged.append((current_start, current_end))
+                 current_start, current_end = start, end
+
+         # Add the last range
+         merged.append((current_start, current_end))
+
+         return merged
+
+     def _build_range_header(self, merged_ranges):
+         """
+         Build HTTP Range header from merged ranges.
+
+         Args:
+             merged_ranges: list of (start_byte, end_byte) tuples
+
+         Returns:
+             Range header string, e.g., "bytes=0-131071,200000-300000"
+         """
+         if not merged_ranges:
+             return None
+
+         range_specs = [f"{start}-{end}" for start, end in merged_ranges]
+         return f"bytes={','.join(range_specs)}"
+
+     def _parse_tar_header(self, header_bytes):
+         """
+         Parse a 512-byte tar header.
+
+         Returns:
+             dict with 'name', 'size', or None if invalid header
+         """
+         if len(header_bytes) < 512:
+             return None
+
+         # Check if it's a zero block (end of archive)
+         if header_bytes == b'\x00' * 512:
+             return None
+
+         try:
+             # Tar header format (POSIX ustar)
+             name = header_bytes[0:100].split(b'\x00')[0].decode('utf-8')
+             size_str = header_bytes[124:136].split(b'\x00')[0].decode('utf-8').strip()
+
+             if not size_str:
+                 return None
+
+             # Size is in octal
+             size = int(size_str, 8)
+
+             return {
+                 'name': name,
+                 'size': size
+             }
+         except:
+             return None
+
+     def _extract_files_from_partial_tar(self, tar_bytes):
+         """
+         Extract files from partial tar data by manually parsing headers.
+
+         Args:
+             tar_bytes: Raw bytes from partial tar download
+
+         Returns:
+             list of dicts with 'name' and 'content'
+         """
+         files = []
+         offset = 0
+
+         while offset + 512 <= len(tar_bytes):
+             # Read header
+             header = self._parse_tar_header(tar_bytes[offset:offset+512])
+
+             if header is None:
+                 # End of archive or invalid header
+                 break
+
+             offset += 512 # Move past header
+
+             # Calculate file content end and padding
+             file_size = header['size']
+             content_end = offset + file_size
+
+             # Check if we have the full file content
+             if content_end > len(tar_bytes):
+                 # File is truncated, skip it
+                 break
+
+             # Extract file content
+             content = tar_bytes[offset:content_end]
+
+             files.append({
+                 'name': os.path.basename(header['name']),
+                 'content': content
+             })
+
+             # Move to next 512-byte boundary
+             padding = (512 - (file_size % 512)) % 512
+             offset = content_end + padding
+
+         return files
+
+     def _build_filename_to_type_map(self, metadata_content):
+         """
+         Parse metadata and build a mapping of filename to document type.
+
+         Args:
+             metadata_content: The metadata.json content as bytes
+
+         Returns:
+             dict mapping filename to document type
+         """
+         try:
+             metadata = json.loads(metadata_content)
+             filename_map = {}
+
+             if 'documents' in metadata:
+                 for doc in metadata['documents']:
+                     filename = doc.get('filename')
+                     doc_type = doc.get('type')
+                     if filename and doc_type:
+                         filename_map[filename] = doc_type
+
+             return filename_map
+         except:
+             return {}
+
+     def _filter_documents_by_type(self, documents, filename_map, keep_document_types):
+         """
+         Filter documents based on their type from metadata.
+
+         Args:
+             documents: List of dicts with 'name' and 'content'
+             filename_map: Dict mapping filename to document type
+             keep_document_types: List of document types to keep
+
+         Returns:
+             Filtered list of documents
+         """
+         if not keep_document_types or not filename_map:
+             return documents
+
+         # 'metadata' is special - it's already handled separately
+         # Filter out 'metadata' from keep_document_types for document filtering
+         doc_types_to_keep = [dt for dt in keep_document_types if dt != 'metadata']
+
+         if not doc_types_to_keep:
+             # Only metadata requested, no other documents
+             return []
+
+         filtered = []
+         for doc in documents:
+             doc_type = filename_map.get(doc['name'])
+             if doc_type and doc_type in doc_types_to_keep:
+                 filtered.append(doc)
+
+         return filtered
+
+     def _decompress_zstd(self, compressed_content):
+         """Decompress zstd content"""
+         dctx = zstd.ZstdDecompressor()
+         return dctx.decompress(compressed_content)
+
+     class TarManager:
+         def __init__(self, output_dir, num_tar_files, max_batch_size=1024*1024*1024):
+             self.output_dir = output_dir
+             self.num_tar_files = num_tar_files
+             self.max_batch_size = max_batch_size
+             self.tar_files = {}
+             self.tar_locks = {}
+             self.file_counters = {}
+             self.tar_sizes = {}
+             self.tar_sequences = {}
+
+             for i in range(num_tar_files):
+                 tar_path = os.path.join(output_dir, f'batch_{i:03d}_001.tar')
+                 self.tar_files[i] = tarfile.open(tar_path, 'a')
+                 self.tar_locks[i] = Lock()
+                 self.file_counters[i] = 0
+                 self.tar_sizes[i] = 0
+                 self.tar_sequences[i] = 1
+
+         def get_tar_index(self, accession_num):
+             return hash(accession_num) % self.num_tar_files
+
+         def write_submission(self, accession_num, metadata_content, documents):
+             tar_index = self.get_tar_index(accession_num)
+
+             submission_size = len(metadata_content) + sum(len(doc['content']) for doc in documents)
+
+             with self.tar_locks[tar_index]:
+                 if self.tar_sizes[tar_index] > 0 and self.tar_sizes[tar_index] + submission_size > self.max_batch_size:
+                     tar = self.tar_files[tar_index]
+                     tar.close()
+
+                     self.tar_sequences[tar_index] += 1
+                     new_tar_path = os.path.join(self.output_dir, f'batch_{tar_index:03d}_{self.tar_sequences[tar_index]:03d}.tar')
+                     self.tar_files[tar_index] = tarfile.open(new_tar_path, 'a')
+                     self.file_counters[tar_index] = 0
+                     self.tar_sizes[tar_index] = 0
+
+                 tar = self.tar_files[tar_index]
+
+                 try:
+                     # Write metadata
+                     tarinfo = tarfile.TarInfo(name=f'{accession_num}/metadata.json')
+                     tarinfo.size = len(metadata_content)
+                     tar.addfile(tarinfo, io.BytesIO(metadata_content))
+
+                     # Write documents
+                     for doc in documents:
+                         tarinfo = tarfile.TarInfo(name=f'{accession_num}/{doc["name"]}')
+                         tarinfo.size = len(doc['content'])
+                         tar.addfile(tarinfo, io.BytesIO(doc['content']))
+
+                     self.file_counters[tar_index] += 1
+                     self.tar_sizes[tar_index] += submission_size
+                     return True
+
+                 except Exception as e:
+                     logger.error(f"Error writing {accession_num} to tar {tar_index}: {str(e)}")
+                     return False
+
+         def close_all(self):
+             for i, tar in self.tar_files.items():
+                 try:
+                     tar.close()
+                 except Exception as e:
+                     logger.error(f"Error closing tar {i}: {str(e)}")
+
+     def _parse_multipart_byteranges(self, content, content_type):
+         """
+         Parse multipart/byteranges response.
+         Currently simplified for single-range responses.
+         Future: implement full multipart parsing when using database with multiple ranges.
+
+         Args:
+             content: Response body bytes
+             content_type: Content-Type header value
+
+         Returns:
+             list of (start_byte, end_byte, data) tuples
+         """
+         # For now, handle single range responses only
+         if 'boundary=' not in content_type:
+             return [(None, None, content)]
+
+         # TODO: Implement full multipart parsing when database returns multiple discontinuous ranges
+         return [(None, None, content)]
+
+     def extract_and_process_tar(self, tar_content, filename, tar_manager, output_dir, keep_document_types, is_partial=False):
+         """Extract tar file and process its contents"""
+         try:
+             accession_num = filename.replace('.tar', '').split('/')[-1]
+
+             # If partial download (range request), manually parse tar headers
+             if is_partial:
+                 files = self._extract_files_from_partial_tar(tar_content)
+
+                 if not files:
+                     self._log_error(output_dir, filename, "No files found in partial tar")
+                     return False
+
+                 # First file is metadata (never compressed)
+                 metadata_content = files[0]['content']
+
+                 # Remaining files are documents (always compressed)
+                 documents = []
+                 for file in files[1:]:
+                     file['content'] = self._decompress_zstd(file['content'])
+                     documents.append(file)
+
+                 # Build filename to type mapping from metadata
+                 filename_map = self._build_filename_to_type_map(metadata_content)
+
+                 # Filter documents based on keep_document_types
+                 documents = self._filter_documents_by_type(documents, filename_map, keep_document_types)
+
+             else:
+                 # Full download, use tarfile library
+                 tar_buffer = io.BytesIO(tar_content)
+
+                 with tarfile.open(fileobj=tar_buffer, mode='r') as tar:
+                     members = tar.getmembers()
+
+                     if not members:
+                         self._log_error(output_dir, filename, "Empty tar file")
+                         return False
+
+                     # Read all files
+                     metadata_content = None
+                     documents = []
+
+                     for idx, member in enumerate(members):
+                         if member.isfile():
+                             file_content = tar.extractfile(member).read()
+
+                             if idx == 0:
+                                 # First file is metadata (never compressed)
+                                 metadata_content = file_content
+                             else:
+                                 # All other files are documents (always compressed)
+                                 file_content = self._decompress_zstd(file_content)
+
+                                 documents.append({
+                                     'name': os.path.basename(member.name),
+                                     'content': file_content
+                                 })
+
+                     if metadata_content is None:
+                         self._log_error(output_dir, filename, "No metadata found in tar")
+                         return False
+
+                     # Build filename to type mapping and filter
+                     if keep_document_types:
+                         filename_map = self._build_filename_to_type_map(metadata_content)
+                         documents = self._filter_documents_by_type(documents, filename_map, keep_document_types)
+
+                 tar_buffer.close()
+
+             # Write to output tar
+             success = tar_manager.write_submission(accession_num, metadata_content, documents)
+
+             if not success:
+                 self._log_error(output_dir, filename, "Failed to write to output tar")
+
+             return success
+
+         except Exception as e:
+             self._log_error(output_dir, filename, f"Tar extraction error: {str(e)}")
+             return False
+
+     async def download_and_process(self, session, url, semaphore, extraction_pool, tar_manager, output_dir, pbar, keep_document_types, range_lookup_db=None):
+         async with semaphore:
+             filename = url.split('/')[-1]
+             accession_num = filename.replace('.tar', '').split('/')[-1]
+
+             api_key = self.api_key
+             if not api_key:
+                 raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+             try:
+                 headers = {
+                     'Connection': 'keep-alive',
+                     'Accept-Encoding': 'gzip, deflate, br',
+                     'Authorization': f'Bearer {api_key}'
+                 }
+
+                 # Determine if we need partial download
+                 range_header = None
+                 is_partial = False
+                 if keep_document_types:
+                     # Get ranges for requested document types
+                     doc_ranges = self._get_document_ranges(accession_num, keep_document_types, range_lookup_db)
+
+                     if doc_ranges:
+                         # Merge ranges
+                         merged_ranges = self._merge_ranges(doc_ranges)
+
+                         # Build range header
+                         range_header = self._build_range_header(merged_ranges)
+
+                         if range_header:
+                             headers['Range'] = range_header
+                             is_partial = True
+
+                 async with session.get(url, headers=headers) as response:
+                     if response.status in (200, 206): # 200 = full, 206 = partial
+                         content_type = response.headers.get('Content-Type', '')
+
+                         # Read all chunks
+                         chunks = []
+                         async for chunk in response.content.iter_chunked(self.CHUNK_SIZE):
+                             chunks.append(chunk)
+
+                         content = b''.join(chunks)
+
+                         # Handle multipart response if needed
+                         if response.status == 206 and 'multipart/byteranges' in content_type:
+                             # Parse multipart response
+                             parts = self._parse_multipart_byteranges(content, content_type)
+
+                             # Reconstruct tar content from parts
+                             tar_content = b''.join(part[2] for part in parts)
+                         else:
+                             tar_content = content
+
+                         # Process in thread pool
+                         loop = asyncio.get_running_loop()
+                         success = await loop.run_in_executor(
+                             extraction_pool,
+                             partial(self.extract_and_process_tar, tar_content, filename, tar_manager, output_dir, keep_document_types, is_partial)
+                         )
+
+                         if not success:
+                             self._log_error(output_dir, filename, "Failed to process tar file")
+
+                     elif response.status == 401:
+                         self._log_error(output_dir, filename, "Authentication failed: Invalid API key")
+                         raise ValueError("Invalid API key")
+                     else:
+                         self._log_error(output_dir, filename, f"Download failed: Status {response.status}")
+
+                 pbar.update(1)
+
+             except Exception as e:
+                 self._log_error(output_dir, filename, str(e))
+                 pbar.update(1)
+
+     async def process_batch(self, urls, output_dir, max_batch_size=1024*1024*1024, keep_document_types=[], range_lookup_db=None):
+         os.makedirs(output_dir, exist_ok=True)
+
+         num_tar_files = min(self.MAX_TAR_WORKERS, len(urls))
+
+         tar_manager = self.TarManager(output_dir, num_tar_files, max_batch_size)
+
+         try:
+             with tqdm(total=len(urls), desc="Downloading tar files") as pbar:
+                 semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
+                 extraction_pool = ThreadPoolExecutor(max_workers=self.MAX_EXTRACTION_WORKERS)
+
+                 connector = aiohttp.TCPConnector(
+                     limit=self.MAX_CONCURRENT_DOWNLOADS,
+                     force_close=False,
+                     ssl=ssl.create_default_context(),
+                     ttl_dns_cache=300,
+                     keepalive_timeout=60
+                 )
+
+                 async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=600)) as session:
+                     tasks = [
+                         self.download_and_process(
+                             session, url, semaphore, extraction_pool,
+                             tar_manager, output_dir, pbar, keep_document_types, range_lookup_db
+                         )
+                         for url in urls
+                     ]
+                     await asyncio.gather(*tasks, return_exceptions=True)
+
+                 extraction_pool.shutdown()
+
+         finally:
+             tar_manager.close_all()
+
+     def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads",
+                  filtered_accession_numbers=None, skip_accession_numbers=[],
+                  max_batch_size=1024*1024*1024, accession_numbers=None, keep_document_types=[], range_lookup_db=None):
+         if self.api_key is None:
+             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+         logger.debug("Querying SEC filings...")
+
+         if not accession_numbers:
+             filings = datamule_lookup(cik=cik, submission_type=submission_type, filing_date=filing_date,
+                 columns=['accessionNumber'], distinct=True, page_size=25000, quiet=False, api_key=self.api_key)
+
+             if filtered_accession_numbers:
+                 filtered_accession_numbers = [format_accession(item, 'int') for item in filtered_accession_numbers]
+                 filings = [filing for filing in filings if filing['accessionNumber'] in filtered_accession_numbers]
+
+             if skip_accession_numbers:
+                 skip_accession_numbers = [format_accession(item, 'int') for item in skip_accession_numbers]
+                 filings = [filing for filing in filings if filing['accessionNumber'] not in skip_accession_numbers]
+
+             logger.debug(f"Generating URLs for {len(filings)} filings...")
+             urls = []
+             for item in filings:
+                 url = f"{self.BASE_URL}{str(item['accessionNumber']).zfill(18)}.tar"
+                 urls.append(url)
+         else:
+             urls = []
+             for accession in accession_numbers:
+                 url = f"{self.BASE_URL}{format_accession(accession, 'no-dash').zfill(18)}.tar"
+                 urls.append(url)
+
+         if not urls:
+             logger.warning("No submissions found matching the criteria")
+             return
+
+         urls = list(set(urls))
+
+         start_time = time.time()
+
+         asyncio.run(self.process_batch(urls, output_dir, max_batch_size=max_batch_size, keep_document_types=keep_document_types, range_lookup_db=range_lookup_db))
+
+         elapsed_time = time.time() - start_time
+         logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
+         logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
+
+     def download_files_using_filename(self, filenames, output_dir="downloads", max_batch_size=1024*1024*1024, keep_document_types=[], range_lookup_db=None):
+         if self.api_key is None:
+             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+         if not filenames:
+             raise ValueError("No filenames provided")
+
+         if not isinstance(filenames, (list, tuple)):
+             filenames = [filenames]
+
+         for filename in filenames:
+             if not isinstance(filename, str):
+                 raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
+             if not filename.endswith('.tar'):
+                 raise ValueError(f"Invalid filename format: {filename}. Expected .tar extension.")
+
+         logger.debug(f"Generating URLs for {len(filenames)} files...")
+         urls = []
+         for filename in filenames:
+             url = f"{self.BASE_URL}{filename}"
+             urls.append(url)
+
+         urls = list(set(urls))
+
+         logger.debug(f"Downloading {len(urls)} tar files...")
+
+         start_time = time.time()
+
+         asyncio.run(self.process_batch(urls, output_dir, max_batch_size=max_batch_size, keep_document_types=keep_document_types, range_lookup_db=range_lookup_db))
+
+         elapsed_time = time.time() - start_time
+         logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
+         logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
+
+
+ def download_tar(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads",
+                  filtered_accession_numbers=None, skip_accession_numbers=[],
+                  max_batch_size=1024*1024*1024, accession_numbers=None, keep_document_types=[], range_lookup_db=None):
+
+     if filtered_accession_numbers:
+         filtered_accession_numbers = [format_accession(x, 'int') for x in filtered_accession_numbers]
+     elif filtered_accession_numbers == []:
+         raise ValueError("Applied filter resulted in empty accession numbers list")
+
+     downloader = TarDownloader(api_key=api_key)
+     downloader.download(
+         submission_type=submission_type,
+         cik=cik,
+         filing_date=filing_date,
+         output_dir=output_dir,
+         filtered_accession_numbers=filtered_accession_numbers,
+         skip_accession_numbers=skip_accession_numbers,
+         max_batch_size=max_batch_size,
+         accession_numbers=accession_numbers,
+         keep_document_types=keep_document_types,
+         range_lookup_db=range_lookup_db
+     )
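
The new module above is what the 'datamule-tar' provider in the next file calls into. A hedged usage sketch with purely illustrative argument values; per _get_document_ranges above, keep_document_types=['metadata'] is the only range currently wired up and turns each request into a ranged download of the first 128 KB of the tar:

    from datamule.datamule.tar_downloader import download_tar

    download_tar(
        submission_type='10-K',           # illustrative filter values
        filing_date='2024-01-02',
        output_dir='10k_tars',
        keep_document_types=['metadata'],
        api_key='your-datamule-api-key',  # or set DATAMULE_API_KEY
    )

Downloaded submissions are repacked into batch_NNN_MMM.tar files under output_dir, and any failures are appended to errors.json in the same directory.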

datamule/portfolio/portfolio.py CHANGED
@@ -14,6 +14,7 @@ from ..sec.xbrl.filter_xbrl import filter_xbrl
  from ..sec.submissions.monitor import Monitor
  from .portfolio_compression_utils_legacy import CompressionManager
  from ..datamule.sec_connector import SecConnector
+ from ..datamule.tar_downloader import download_tar
  import shutil


@@ -220,8 +221,12 @@ class Portfolio:
          skip_accession_numbers = []
          if skip_existing:
              skip_accession_numbers = [sub.accession for sub in self]
-
+
+         # map legacy provider
          if provider == 'datamule':
+             provider = 'datamule-sgml'
+
+         if provider == 'datamule-sgml':
              seclibrary_download(
                  output_dir=self.path,
                  cik=cik,
@@ -235,6 +240,18 @@ class Portfolio:
                  skip_accession_numbers=skip_accession_numbers,
                  accession_numbers = accession_numbers
              )
+         elif provider == 'datamule-tar':
+             download_tar(
+                 output_dir=self.path,
+                 cik=cik,
+                 api_key=self.api_key,
+                 submission_type=submission_type,
+                 filing_date=filing_date,
+                 filtered_accession_numbers=filtered_accession_numbers,
+                 skip_accession_numbers=skip_accession_numbers,
+                 accession_numbers = accession_numbers,
+                 keep_document_types=document_type
+             )
          else:
              # will later add accession_numbers arg in the free update.
              sec_download(
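
For orientation, a sketch of how this switch is reached from the public API. The method name download_submissions and the argument values below are illustrative assumptions, not confirmed by this diff:

    from datamule import Portfolio

    portfolio = Portfolio('apple_filings')
    # provider='datamule' is now silently mapped to 'datamule-sgml';
    # 'datamule-tar' routes through the new download_tar() shown above,
    # with document_type forwarded as keep_document_types.
    portfolio.download_submissions(
        cik='320193',
        submission_type='10-K',
        provider='datamule-tar',
        document_type=['metadata'],
    )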

datamule/providers/providers.py CHANGED
@@ -3,4 +3,4 @@

  SEC_LOOKUP_DB_ENDPOINT = ""
  SEC_FILINGS_SGML_BUCKET_ENDPOINT = "https://sec-library.datamule.xyz/"
- SEC_FILINGS_TAR_BUCKET_ENDPOINT = "https://sec-library.datamule.xyz/tar/"
+ SEC_FILINGS_TAR_BUCKET_ENDPOINT = "https://sec-library.tar.datamule.xyz/"

datamule/submission/submission.py CHANGED
@@ -83,6 +83,7 @@ class Submission:
          self._tar = None
          self._tar_compression_type = 'zstd'
          self._tar_compression_level = 3
+         self._tar_compression_threshold = None
          self._accession_year_2d = None
          self._documents = None

@@ -109,7 +110,7 @@ class Submission:
              content_type = response.headers.get('Content-Type', '')
              if content_type == 'application/zstd':
                  dctx = zstd.ZstdDecompressor()
-                 sgml_content = dctx.decompress(sgml_content)
+                 sgml_content = dctx.decompressobj().decompress(sgml_content)
              else:
                  raise ValueError(f"URL: {url}, Error: {response.getcode()}")

@@ -121,7 +122,6 @@ class Submission:
          metadata = transform_metadata_string(metadata)

          self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
-
          self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"

          self.documents_obj_list = []
@@ -340,6 +340,8 @@ class Submission:
                  context = xbrl_record['_context']
                  period_start_date = context.get('period_instant') or context.get('period_startdate')
                  period_end_date = context.get('period_enddate')
+             else:
+                 context = None

              # Create record in the format expected by construct_fundamentals
              record = {
@@ -347,7 +349,8 @@ class Submission:
                  'name': name,
                  'value': value,
                  'period_start_date': period_start_date,
-                 'period_end_date': period_end_date
+                 'period_end_date': period_end_date,
+                 'context' : context
              }

              xbrl.append(record)
@@ -380,9 +383,10 @@ class Submission:
      def tar(self):
          return self._tar_submission().getvalue()

-     def set_tar_compression(self,compression_type='zstd',level=3):
+     def set_tar_compression(self,compression_type='zstd',level=3,threshold=None):
          self._tar_compression_type = compression_type
          self._tar_compression_level = level
+         self._tar_compression_threshold = threshold

      def _tar_submission(self):
          if self._tar is not None:
@@ -393,7 +397,8 @@ class Submission:
              documents_obj_list=documents_obj_list,
              metadata=self.metadata.content,
              compression_type=self._tar_compression_type,
-             level=self._tar_compression_level
+             level=self._tar_compression_level,
+             threshold=self._tar_compression_threshold
          )
          return self._tar


datamule/submission/tar_submission.py CHANGED
@@ -4,43 +4,42 @@ import tarfile
  import io
  import json

- # Note: we don't actually need accession at this level. TODO

- def compress_content(content, compression_type, level):
+ def compress_content(content, compression_type, level, threshold):
      if compression_type == 'zstd':
-         # Create compressor with specified level
-         compressor = zstd.ZstdCompressor(level=level)
-
          # Handle string content
-         # This should never be called
          if isinstance(content, str):
              content_bytes = content.encode('utf-8')
         else:
              content_bytes = content
-
-         # Compress and return
+
+         # If content smaller than threshold, return uncompressed
+         if threshold is not None and len(content_bytes) < threshold:
+             return content_bytes
+
+         # Compress with specified level
+         compressor = zstd.ZstdCompressor(level=level)
          return compressor.compress(content_bytes)
-
+
      # Return uncompressed if not zstd
      return content

- def compress_content_list(document_tuple_list, compression_type, level):
+
+ def compress_content_list(document_tuple_list, compression_type, level, threshold):
      if compression_type is None:
          return document_tuple_list

      if level is None:
          level = 3

-     # Create new list to avoid modifying original
      compressed_list = []
-     for document_tuple in document_tuple_list:
-         content = document_tuple[0]
-         accession = document_tuple[1]
-         compressed_content = compress_content(content, compression_type, level)
+     for content, accession in document_tuple_list:
+         compressed_content = compress_content(content, compression_type, level, threshold)
          compressed_list.append((compressed_content, accession))

      return compressed_list

+
  def tar_content_list(metadata, document_tuple_list_compressed):
      # Update metadata with compressed sizes
      for i, (content, accession) in enumerate(document_tuple_list_compressed):
@@ -65,15 +64,18 @@ def tar_content_list(metadata, document_tuple_list_compressed):
          tarinfo.size = len(content)
          tar.addfile(tarinfo, io.BytesIO(content))

-     # Return the tar buffer
-     tar_buffer.seek(0) # Reset buffer position to beginning
+     tar_buffer.seek(0) # Reset buffer position
      return tar_buffer

- def tar_submission(metadata, documents_obj_list, compression_type=None, level=None):
-     """Takes a list of documents, compresses them, then tars them."""
+
+ def tar_submission(metadata, documents_obj_list, compression_type=None, level=None, threshold=None):
+     """Takes a list of documents, compresses them (if above threshold), then tars them."""
      document_tuple_list = [(doc.content, doc.accession) for doc in documents_obj_list]
-     document_tuple_list_compressed = compress_content_list(document_tuple_list, # Fixed: correct parameter name
-                                                            compression_type=compression_type,
-                                                            level=level)
+     document_tuple_list_compressed = compress_content_list(
+         document_tuple_list,
+         compression_type=compression_type,
+         level=level,
+         threshold=threshold
+     )

-     return tar_content_list(metadata, document_tuple_list_compressed)
+     return tar_content_list(metadata, document_tuple_list_compressed)
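
The threshold here is the one set per submission via set_tar_compression(..., threshold=...) in the previous file. A minimal check of the revised compress_content() behaviour, assuming the module is importable as datamule.submission.tar_submission (path taken from the wheel's RECORD):

    import zstandard as zstd
    from datamule.submission.tar_submission import compress_content

    small = b'x' * 100
    large = b'x' * 100_000

    # Below the threshold: bytes are returned as-is, with no zstd frame.
    assert compress_content(small, 'zstd', 3, threshold=1_000) == small

    # At or above the threshold: zstd-compressed, and round-trips cleanly.
    packed = compress_content(large, 'zstd', 3, threshold=1_000)
    assert zstd.ZstdDecompressor().decompress(packed) == large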

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 2.3.9
+ Version: 2.4.1
  Summary: Work with SEC submissions at scale.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
@@ -5,15 +5,16 @@ datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
  datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
  datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
  datamule/book/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datamule/book/book.py,sha256=Vw33JHhmulNDWRN2AQpUQrf8wgVqqUYg5QJgbKhBNak,773
+ datamule/book/book.py,sha256=AwQUKpd3iAUbUGs2SzODIiK7aBrG2YdqwjqMp8-Fvtg,839
  datamule/book/s3transfer.py,sha256=arftLhYThLSGvmBSNnU2rNpkqiyvwAL32OVAKP4HOAQ,12596
  datamule/cloud/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
  datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/datamule/datamule_lookup.py,sha256=e8djAg-ctSyHiKk7BjbtgugZ3p8roUjzsym5z3AihUg,9468
  datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3SSKzTITB3o,12317
- datamule/datamule/downloader.py,sha256=Ss9mz0Jf5UAd-CZJ6oO96o9hN04xMQIF3-e1wahokdM,18581
+ datamule/datamule/downloader.py,sha256=v0cG8eHZs9fttM55_ymHUWtPnCsK1aGiFTuM3jmLiCY,18650
  datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
+ datamule/datamule/tar_downloader.py,sha256=w_HePdFJ-SjiFNLpQrFW-zn0qYjABZNRZSCO118FIgM,27326
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/document/document.py,sha256=Oj_7OMIldWB9HxlBca2gqr5E8ykDQZkPuUlcZjGuzqw,23016
  datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -21,10 +22,10 @@ datamule/mapping_dicts/html_mapping_dicts.py,sha256=pba3utMr2KldPeEGnMRkHyVw7D2W
  datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
  datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
  datamule/portfolio/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datamule/portfolio/portfolio.py,sha256=PQWROW5eDunOgu668n_RArQKBd8QFNjWCbdZabLE7AY,11677
+ datamule/portfolio/portfolio.py,sha256=UK27CoKntclIGgRhyiQjARMl5NNPCqTmBu4FtdXr4S4,12349
  datamule/portfolio/portfolio_compression_utils_legacy.py,sha256=1nlbz7JfBDrI0pwTyFiBF856xqGXvQRYBulLUpk7G1A,12695
  datamule/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datamule/providers/providers.py,sha256=mxU_YokPuUc87tdbsfLz0ReqcDc3XGkaiHnqfI_VUac,232
+ datamule/providers/providers.py,sha256=pfCjoWzDPRK46gh0RR5U0crBJGnSHJKIw6OVn9OpjXc,232
  datamule/sec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/sec/utils.py,sha256=96bavyG2Kq1t8L1YA2vwYnAHKIKdRSoVXxBO5QH1HWo,2196
  datamule/sec/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -45,8 +46,8 @@ datamule/seclibrary/bq.py,sha256=TOP0WA6agDKu4vE1eHd62NDpAc02LDDrOP-g1bJpxbw,180
  datamule/sheet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/sheet/sheet.py,sha256=Dw979JGygS566N0Iwsvqk0h1s26GfbrIHDWiBaS2oH8,10711
  datamule/submission/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datamule/submission/submission.py,sha256=I_7F658vTW1C_dsogIXdxXnV3W1Gbfj_6uzse1YHgY0,17343
- datamule/submission/tar_submission.py,sha256=lkm1neVLW2_-G26VylL6Rzx98Cavvml0Qd2wlJHD0bw,3075
+ datamule/submission/submission.py,sha256=cd1SKi3fzNmvXmgbtxA7j2zc2KnFE2f68Qbta9Bnlu8,17629
+ datamule/submission/tar_submission.py,sha256=uJHyTY5G8OVqmXzb0zaBEsLNthppGqYXbW-xFM4XMok,2901
  datamule/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/tables/tables.py,sha256=Z3Eu6bdjiaNx4pgXlTMwk2Q-DhpMpEAygF2kJdp-Pu8,5722
  datamule/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
@@ -67,7 +68,7 @@ datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
  datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
  datamule/utils/pdf.py,sha256=Z9xrdVhKex2YdvjYsaPaygRE_J6P_JNiUGkwflz2Hw0,735
- datamule-2.3.9.dist-info/METADATA,sha256=X9sKhS_wPqUynhpANtzNQntQUtzrVn6CIoJBmg4Rs7c,609
- datamule-2.3.9.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
- datamule-2.3.9.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
- datamule-2.3.9.dist-info/RECORD,,
+ datamule-2.4.1.dist-info/METADATA,sha256=61-fgRGZb-L2yINFRhsGU_ITyPxrh7RmwC_VCVIITE4,609
+ datamule-2.4.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+ datamule-2.4.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+ datamule-2.4.1.dist-info/RECORD,,