datamule 2.3.8__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.

Potentially problematic release.

This version of datamule might be problematic.

@@ -19,8 +19,9 @@ from secsgml import parse_sgml_content_into_memory
19
19
  from secsgml.utils import bytes_to_str
20
20
  from .datamule_lookup import datamule_lookup
21
21
  from ..utils.format_accession import format_accession
22
+ from ..providers.providers import SEC_FILINGS_SGML_BUCKET_ENDPOINT
22
23
 
23
- # could be cleaned up
24
+ # TODO could be cleaned up
24
25
 
25
26
  # Set up logging
26
27
  logging.basicConfig(
@@ -33,7 +34,7 @@ logger = logging.getLogger(__name__)
33
34
 
34
35
  class Downloader:
35
36
  def __init__(self, api_key=None):
36
- self.BASE_URL = "https://sec-library.datamule.xyz/"
37
+ self.BASE_URL = SEC_FILINGS_SGML_BUCKET_ENDPOINT
37
38
  self.CHUNK_SIZE = 2 * 1024 * 1024
38
39
  self.MAX_CONCURRENT_DOWNLOADS = 100
39
40
  self.MAX_DECOMPRESSION_WORKERS = cpu_count()
@@ -0,0 +1,719 @@
1
+ import os
2
+ import asyncio
3
+ import aiohttp
4
+ from tqdm import tqdm
5
+ import time
6
+ import ssl
7
+ import zstandard as zstd
8
+ import io
9
+ import json
10
+ import tarfile
11
+ import logging
12
+ from concurrent.futures import ThreadPoolExecutor
13
+ from functools import partial
14
+ from queue import Queue
15
+ from threading import Thread, Lock
16
+ from os import cpu_count
17
+ from .datamule_lookup import datamule_lookup
18
+ from ..utils.format_accession import format_accession
19
+ from ..providers.providers import SEC_FILINGS_TAR_BUCKET_ENDPOINT
20
+
21
+ # Set up logging
22
+ logging.basicConfig(
23
+ level=logging.INFO,
24
+ format='%(asctime)s - %(levelname)s - %(message)s',
25
+ handlers=logging.getLogger().handlers,
26
+ )
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ class TarDownloader:
31
+ def __init__(self, api_key=None):
32
+ self.BASE_URL = SEC_FILINGS_TAR_BUCKET_ENDPOINT
33
+ self.CHUNK_SIZE = 2 * 1024 * 1024
34
+ self.MAX_CONCURRENT_DOWNLOADS = 100
35
+ self.MAX_EXTRACTION_WORKERS = cpu_count()
36
+ self.MAX_TAR_WORKERS = cpu_count()
37
+ self.RANGE_MERGE_THRESHOLD = 1024 # Merge ranges if gap <= 1024 bytes
38
+ if api_key is not None:
39
+ self._api_key = api_key
40
+ self.loop = asyncio.new_event_loop()
41
+ self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
42
+ self.loop_thread.start()
43
+ self.async_queue = Queue()
44
+ self.error_log_lock = Lock()
45
+
46
+ def _run_event_loop(self):
47
+ asyncio.set_event_loop(self.loop)
48
+ self.loop.run_forever()
49
+
50
+ def _run_coroutine(self, coro):
51
+ future = asyncio.run_coroutine_threadsafe(coro, self.loop)
52
+ return future.result()
53
+
54
+ @property
55
+ def api_key(self):
56
+ return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')
57
+
58
+ @api_key.setter
59
+ def api_key(self, value):
60
+ if not value:
61
+ raise ValueError("API key cannot be empty")
62
+ self._api_key = value
63
+
64
+ def _log_error(self, output_dir, filename, error_msg):
65
+ error_file = os.path.join(output_dir, 'errors.json')
66
+ with self.error_log_lock:
67
+ try:
68
+ if os.path.exists(error_file):
69
+ with open(error_file, 'r') as f:
70
+ errors = json.load(f)
71
+ else:
72
+ errors = {}
73
+
74
+ errors[filename] = str(error_msg)
75
+
76
+ with open(error_file, 'w') as f:
77
+ json.dump(errors, f, indent=2)
78
+ except Exception as e:
79
+ logger.error(f"Failed to log error to {error_file}: {str(e)}")
80
+
81
+ def _get_document_ranges(self, accession_num, keep_document_types, range_lookup_db=None):
82
+ """
83
+ Get byte ranges for requested document types.
84
+
85
+ Args:
86
+ accession_num: The accession number
87
+ keep_document_types: List of document types to retrieve
88
+ range_lookup_db: Future database connection for looking up ranges
89
+
90
+ Returns:
91
+ dict mapping document_type to (start_byte, end_byte)
92
+ """
93
+ if range_lookup_db is not None:
94
+ # Future: Query database for ranges
95
+ # return range_lookup_db.get_ranges(accession_num, keep_document_types)
96
+ pass
97
+
98
+ # Hardcoded ranges for now
99
+ ranges = {}
100
+ if 'metadata' in keep_document_types:
101
+ # Metadata is always first 128KB
102
+ ranges['metadata'] = (0, 131071)
103
+
104
+ return ranges
105
+
106
+ def _merge_ranges(self, ranges):
107
+ """
108
+ Merge overlapping or close ranges.
109
+
110
+ Args:
111
+ ranges: dict mapping document_type to (start_byte, end_byte)
112
+
113
+ Returns:
114
+ list of merged (start_byte, end_byte) tuples, sorted
115
+ """
116
+ if not ranges:
117
+ return []
118
+
119
+ # Extract and sort ranges by start byte
120
+ range_list = sorted(ranges.values(), key=lambda x: x[0])
121
+
122
+ merged = []
123
+ current_start, current_end = range_list[0]
124
+
125
+ for start, end in range_list[1:]:
126
+ # Check if ranges overlap or are within merge threshold
127
+ if start <= current_end + self.RANGE_MERGE_THRESHOLD:
128
+ # Merge: extend current range
129
+ current_end = max(current_end, end)
130
+ else:
131
+ # No merge: save current range and start new one
132
+ merged.append((current_start, current_end))
133
+ current_start, current_end = start, end
134
+
135
+ # Add the last range
136
+ merged.append((current_start, current_end))
137
+
138
+ return merged
139
+
140
+ def _build_range_header(self, merged_ranges):
141
+ """
142
+ Build HTTP Range header from merged ranges.
143
+
144
+ Args:
145
+ merged_ranges: list of (start_byte, end_byte) tuples
146
+
147
+ Returns:
148
+ Range header string, e.g., "bytes=0-131071,200000-300000"
149
+ """
150
+ if not merged_ranges:
151
+ return None
152
+
153
+ range_specs = [f"{start}-{end}" for start, end in merged_ranges]
154
+ return f"bytes={','.join(range_specs)}"
155
+
156
+ def _parse_tar_header(self, header_bytes):
157
+ """
158
+ Parse a 512-byte tar header.
159
+
160
+ Returns:
161
+ dict with 'name', 'size', or None if invalid header
162
+ """
163
+ if len(header_bytes) < 512:
164
+ return None
165
+
166
+ # Check if it's a zero block (end of archive)
167
+ if header_bytes == b'\x00' * 512:
168
+ return None
169
+
170
+ try:
171
+ # Tar header format (POSIX ustar)
172
+ name = header_bytes[0:100].split(b'\x00')[0].decode('utf-8')
173
+ size_str = header_bytes[124:136].split(b'\x00')[0].decode('utf-8').strip()
174
+
175
+ if not size_str:
176
+ return None
177
+
178
+ # Size is in octal
179
+ size = int(size_str, 8)
180
+
181
+ return {
182
+ 'name': name,
183
+ 'size': size
184
+ }
185
+ except:
186
+ return None
187
+
188
+ def _extract_files_from_partial_tar(self, tar_bytes):
189
+ """
190
+ Extract files from partial tar data by manually parsing headers.
191
+
192
+ Args:
193
+ tar_bytes: Raw bytes from partial tar download
194
+
195
+ Returns:
196
+ list of dicts with 'name' and 'content'
197
+ """
198
+ files = []
199
+ offset = 0
200
+
201
+ while offset + 512 <= len(tar_bytes):
202
+ # Read header
203
+ header = self._parse_tar_header(tar_bytes[offset:offset+512])
204
+
205
+ if header is None:
206
+ # End of archive or invalid header
207
+ break
208
+
209
+ offset += 512 # Move past header
210
+
211
+ # Calculate file content end and padding
212
+ file_size = header['size']
213
+ content_end = offset + file_size
214
+
215
+ # Check if we have the full file content
216
+ if content_end > len(tar_bytes):
217
+ # File is truncated, skip it
218
+ break
219
+
220
+ # Extract file content
221
+ content = tar_bytes[offset:content_end]
222
+
223
+ files.append({
224
+ 'name': os.path.basename(header['name']),
225
+ 'content': content
226
+ })
227
+
228
+ # Move to next 512-byte boundary
229
+ padding = (512 - (file_size % 512)) % 512
230
+ offset = content_end + padding
231
+
232
+ return files
233
+
234
+ def _build_filename_to_type_map(self, metadata_content):
235
+ """
236
+ Parse metadata and build a mapping of filename to document type.
237
+
238
+ Args:
239
+ metadata_content: The metadata.json content as bytes
240
+
241
+ Returns:
242
+ dict mapping filename to document type
243
+ """
244
+ try:
245
+ metadata = json.loads(metadata_content)
246
+ filename_map = {}
247
+
248
+ if 'documents' in metadata:
249
+ for doc in metadata['documents']:
250
+ filename = doc.get('filename')
251
+ doc_type = doc.get('type')
252
+ if filename and doc_type:
253
+ filename_map[filename] = doc_type
254
+
255
+ return filename_map
256
+ except:
257
+ return {}
258
+
259
+ def _filter_documents_by_type(self, documents, filename_map, keep_document_types):
260
+ """
261
+ Filter documents based on their type from metadata.
262
+
263
+ Args:
264
+ documents: List of dicts with 'name' and 'content'
265
+ filename_map: Dict mapping filename to document type
266
+ keep_document_types: List of document types to keep
267
+
268
+ Returns:
269
+ Filtered list of documents
270
+ """
271
+ if not keep_document_types or not filename_map:
272
+ return documents
273
+
274
+ # 'metadata' is special - it's already handled separately
275
+ # Filter out 'metadata' from keep_document_types for document filtering
276
+ doc_types_to_keep = [dt for dt in keep_document_types if dt != 'metadata']
277
+
278
+ if not doc_types_to_keep:
279
+ # Only metadata requested, no other documents
280
+ return []
281
+
282
+ filtered = []
283
+ for doc in documents:
284
+ doc_type = filename_map.get(doc['name'])
285
+ if doc_type and doc_type in doc_types_to_keep:
286
+ filtered.append(doc)
287
+
288
+ return filtered
289
+
290
+ class TarManager:
291
+ def __init__(self, output_dir, num_tar_files, max_batch_size=1024*1024*1024):
292
+ self.output_dir = output_dir
293
+ self.num_tar_files = num_tar_files
294
+ self.max_batch_size = max_batch_size
295
+ self.tar_files = {}
296
+ self.tar_locks = {}
297
+ self.file_counters = {}
298
+ self.tar_sizes = {}
299
+ self.tar_sequences = {}
300
+
301
+ for i in range(num_tar_files):
302
+ tar_path = os.path.join(output_dir, f'batch_{i:03d}_001.tar')
303
+ self.tar_files[i] = tarfile.open(tar_path, 'a')
304
+ self.tar_locks[i] = Lock()
305
+ self.file_counters[i] = 0
306
+ self.tar_sizes[i] = 0
307
+ self.tar_sequences[i] = 1
308
+
309
+ def get_tar_index(self, accession_num):
310
+ return hash(accession_num) % self.num_tar_files
311
+
312
+ def write_submission(self, accession_num, metadata_content, documents):
313
+ tar_index = self.get_tar_index(accession_num)
314
+
315
+ submission_size = len(metadata_content) + sum(len(doc['content']) for doc in documents)
316
+
317
+ with self.tar_locks[tar_index]:
318
+ if self.tar_sizes[tar_index] > 0 and self.tar_sizes[tar_index] + submission_size > self.max_batch_size:
319
+ tar = self.tar_files[tar_index]
320
+ tar.close()
321
+
322
+ self.tar_sequences[tar_index] += 1
323
+ new_tar_path = os.path.join(self.output_dir, f'batch_{tar_index:03d}_{self.tar_sequences[tar_index]:03d}.tar')
324
+ self.tar_files[tar_index] = tarfile.open(new_tar_path, 'a')
325
+ self.file_counters[tar_index] = 0
326
+ self.tar_sizes[tar_index] = 0
327
+
328
+ tar = self.tar_files[tar_index]
329
+
330
+ try:
331
+ # Write metadata
332
+ tarinfo = tarfile.TarInfo(name=f'{accession_num}/metadata.json')
333
+ tarinfo.size = len(metadata_content)
334
+ tar.addfile(tarinfo, io.BytesIO(metadata_content))
335
+
336
+ # Write documents
337
+ for doc in documents:
338
+ tarinfo = tarfile.TarInfo(name=f'{accession_num}/{doc["name"]}')
339
+ tarinfo.size = len(doc['content'])
340
+ tar.addfile(tarinfo, io.BytesIO(doc['content']))
341
+
342
+ self.file_counters[tar_index] += 1
343
+ self.tar_sizes[tar_index] += submission_size
344
+ return True
345
+
346
+ except Exception as e:
347
+ logger.error(f"Error writing {accession_num} to tar {tar_index}: {str(e)}")
348
+ return False
349
+
350
+ def close_all(self):
351
+ for i, tar in self.tar_files.items():
352
+ try:
353
+ tar.close()
354
+ except Exception as e:
355
+ logger.error(f"Error closing tar {i}: {str(e)}")
356
+
357
+ def _parse_multipart_byteranges(self, content, content_type):
358
+ """
359
+ Parse multipart/byteranges response.
360
+
361
+ Args:
362
+ content: Response body bytes
363
+ content_type: Content-Type header value
364
+
365
+ Returns:
366
+ list of (start_byte, end_byte, data) tuples
367
+ """
368
+ # Extract boundary from content type
369
+ if 'boundary=' not in content_type:
370
+ # Single range response, not multipart
371
+ return [(None, None, content)]
372
+
373
+ boundary = content_type.split('boundary=')[1].strip()
374
+ boundary_bytes = f'--{boundary}'.encode('utf-8')
375
+ end_boundary_bytes = f'--{boundary}--'.encode('utf-8')
376
+
377
+ parts = []
378
+ sections = content.split(boundary_bytes)
379
+
380
+ for section in sections[1:]: # Skip first empty section
381
+ if section.startswith(end_boundary_bytes) or not section.strip():
382
+ continue
383
+
384
+ # Split headers from body
385
+ header_end = section.find(b'\r\n\r\n')
386
+ if header_end == -1:
387
+ header_end = section.find(b'\n\n')
388
+ if header_end == -1:
389
+ continue
390
+ body_start = header_end + 2
391
+ else:
392
+ body_start = header_end + 4
393
+
394
+ headers = section[:header_end].decode('utf-8', errors='ignore')
395
+ body = section[body_start:].rstrip(b'\r\n')
396
+
397
+ # Parse Content-Range header
398
+ start_byte = None
399
+ end_byte = None
400
+ for line in headers.split('\n'):
401
+ if line.lower().startswith('content-range:'):
402
+ # Format: "Content-Range: bytes START-END/TOTAL"
403
+ range_part = line.split(':')[1].strip()
404
+ if 'bytes ' in range_part:
405
+ byte_range = range_part.split('bytes ')[1].split('/')[0]
406
+ start_byte, end_byte = map(int, byte_range.split('-'))
407
+
408
+ parts.append((start_byte, end_byte, body))
409
+
410
+ return parts
411
+
412
+ def extract_and_process_tar(self, tar_content, filename, tar_manager, output_dir, keep_document_types, is_partial=False):
413
+ """Extract tar file and process its contents"""
414
+ try:
415
+ accession_num = filename.replace('.tar', '').split('/')[-1]
416
+
417
+ # If partial download (range request), manually parse tar headers
418
+ if is_partial:
419
+ files = self._extract_files_from_partial_tar(tar_content)
420
+
421
+ if not files:
422
+ self._log_error(output_dir, filename, "No files found in partial tar")
423
+ return False
424
+
425
+ # First file should be metadata
426
+ metadata_content = files[0]['content']
427
+ documents = files[1:] if len(files) > 1 else []
428
+
429
+ # Build filename to type mapping from metadata
430
+ filename_map = self._build_filename_to_type_map(metadata_content)
431
+
432
+ # Filter documents based on keep_document_types
433
+ documents = self._filter_documents_by_type(documents, filename_map, keep_document_types)
434
+
435
+ else:
436
+ # Full download, use tarfile library
437
+ tar_buffer = io.BytesIO(tar_content)
438
+
439
+ with tarfile.open(fileobj=tar_buffer, mode='r') as tar:
440
+ members = tar.getmembers()
441
+
442
+ if not members:
443
+ self._log_error(output_dir, filename, "Empty tar file")
444
+ return False
445
+
446
+ # Read all files
447
+ metadata_content = None
448
+ documents = []
449
+
450
+ for idx, member in enumerate(members):
451
+ if member.isfile():
452
+ file_content = tar.extractfile(member).read()
453
+
454
+ if idx == 0:
455
+ # First file is always metadata (never compressed)
456
+ metadata_content = file_content
457
+ else:
458
+ member_name = os.path.basename(member.name)
459
+
460
+ # Check if file is zstd compressed
461
+ if self._is_zstd_compressed(file_content):
462
+ file_content = self._decompress_zstd(file_content)
463
+
464
+ documents.append({
465
+ 'name': member_name,
466
+ 'content': file_content
467
+ })
468
+
469
+ if metadata_content is None:
470
+ self._log_error(output_dir, filename, "No metadata found in tar")
471
+ return False
472
+
473
+ # Build filename to type mapping and filter
474
+ if keep_document_types:
475
+ filename_map = self._build_filename_to_type_map(metadata_content)
476
+ documents = self._filter_documents_by_type(documents, filename_map, keep_document_types)
477
+
478
+ tar_buffer.close()
479
+
480
+ # Write to output tar
481
+ success = tar_manager.write_submission(accession_num, metadata_content, documents)
482
+
483
+ if not success:
484
+ self._log_error(output_dir, filename, "Failed to write to output tar")
485
+
486
+ return success
487
+
488
+ except Exception as e:
489
+ self._log_error(output_dir, filename, f"Tar extraction error: {str(e)}")
490
+ return False
491
+
492
+ def _is_zstd_compressed(self, content):
493
+ """Check if content is zstd compressed by magic number"""
494
+ return len(content) >= 4 and content[:4] == b'\x28\xb5\x2f\xfd'
495
+
496
+ def _decompress_zstd(self, compressed_content):
497
+ """Decompress zstd content"""
498
+ dctx = zstd.ZstdDecompressor()
499
+ return dctx.decompress(compressed_content)
500
+
501
+ async def download_and_process(self, session, url, semaphore, extraction_pool, tar_manager, output_dir, pbar, keep_document_types, range_lookup_db=None):
502
+ async with semaphore:
503
+ filename = url.split('/')[-1]
504
+ accession_num = filename.replace('.tar', '').split('/')[-1]
505
+
506
+ api_key = self.api_key
507
+ if not api_key:
508
+ raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
509
+
510
+ try:
511
+ headers = {
512
+ 'Connection': 'keep-alive',
513
+ 'Accept-Encoding': 'gzip, deflate, br',
514
+ 'Authorization': f'Bearer {api_key}'
515
+ }
516
+
517
+ # Determine if we need partial download
518
+ range_header = None
519
+ is_partial = False
520
+ if keep_document_types:
521
+ # Get ranges for requested document types
522
+ doc_ranges = self._get_document_ranges(accession_num, keep_document_types, range_lookup_db)
523
+
524
+ if doc_ranges:
525
+ # Merge ranges
526
+ merged_ranges = self._merge_ranges(doc_ranges)
527
+
528
+ # Build range header
529
+ range_header = self._build_range_header(merged_ranges)
530
+
531
+ if range_header:
532
+ headers['Range'] = range_header
533
+ is_partial = True
534
+
535
+ async with session.get(url, headers=headers) as response:
536
+ if response.status in (200, 206): # 200 = full, 206 = partial
537
+ content_type = response.headers.get('Content-Type', '')
538
+
539
+ # Read all chunks
540
+ chunks = []
541
+ async for chunk in response.content.iter_chunked(self.CHUNK_SIZE):
542
+ chunks.append(chunk)
543
+
544
+ content = b''.join(chunks)
545
+
546
+ # Handle multipart response if needed
547
+ if response.status == 206 and 'multipart/byteranges' in content_type:
548
+ # Parse multipart response
549
+ parts = self._parse_multipart_byteranges(content, content_type)
550
+
551
+ # Reconstruct tar content from parts
552
+ tar_content = b''.join(part[2] for part in parts)
553
+ else:
554
+ tar_content = content
555
+
556
+ # Process in thread pool
557
+ loop = asyncio.get_running_loop()
558
+ success = await loop.run_in_executor(
559
+ extraction_pool,
560
+ partial(self.extract_and_process_tar, tar_content, filename, tar_manager, output_dir, keep_document_types, is_partial)
561
+ )
562
+
563
+ if not success:
564
+ self._log_error(output_dir, filename, "Failed to process tar file")
565
+
566
+ elif response.status == 401:
567
+ self._log_error(output_dir, filename, "Authentication failed: Invalid API key")
568
+ raise ValueError("Invalid API key")
569
+ else:
570
+ self._log_error(output_dir, filename, f"Download failed: Status {response.status}")
571
+
572
+ pbar.update(1)
573
+
574
+ except Exception as e:
575
+ self._log_error(output_dir, filename, str(e))
576
+ pbar.update(1)
577
+
578
+ async def process_batch(self, urls, output_dir, max_batch_size=1024*1024*1024, keep_document_types=[], range_lookup_db=None):
579
+ os.makedirs(output_dir, exist_ok=True)
580
+
581
+ num_tar_files = min(self.MAX_TAR_WORKERS, len(urls))
582
+
583
+ tar_manager = self.TarManager(output_dir, num_tar_files, max_batch_size)
584
+
585
+ try:
586
+ with tqdm(total=len(urls), desc="Downloading tar files") as pbar:
587
+ semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
588
+ extraction_pool = ThreadPoolExecutor(max_workers=self.MAX_EXTRACTION_WORKERS)
589
+
590
+ connector = aiohttp.TCPConnector(
591
+ limit=self.MAX_CONCURRENT_DOWNLOADS,
592
+ force_close=False,
593
+ ssl=ssl.create_default_context(),
594
+ ttl_dns_cache=300,
595
+ keepalive_timeout=60
596
+ )
597
+
598
+ async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=600)) as session:
599
+ tasks = [
600
+ self.download_and_process(
601
+ session, url, semaphore, extraction_pool,
602
+ tar_manager, output_dir, pbar, keep_document_types, range_lookup_db
603
+ )
604
+ for url in urls
605
+ ]
606
+ await asyncio.gather(*tasks, return_exceptions=True)
607
+
608
+ extraction_pool.shutdown()
609
+
610
+ finally:
611
+ tar_manager.close_all()
612
+
613
+ def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads",
614
+ filtered_accession_numbers=None, skip_accession_numbers=[],
615
+ max_batch_size=1024*1024*1024, accession_numbers=None, keep_document_types=[], range_lookup_db=None):
616
+ if self.api_key is None:
617
+ raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
618
+
619
+ logger.debug("Querying SEC filings...")
620
+
621
+ if not accession_numbers:
622
+ filings = datamule_lookup(cik=cik, submission_type=submission_type, filing_date=filing_date,
623
+ columns=['accessionNumber'], distinct=True, page_size=25000, quiet=False, api_key=self.api_key)
624
+
625
+ if filtered_accession_numbers:
626
+ filtered_accession_numbers = [format_accession(item, 'int') for item in filtered_accession_numbers]
627
+ filings = [filing for filing in filings if filing['accessionNumber'] in filtered_accession_numbers]
628
+
629
+ if skip_accession_numbers:
630
+ skip_accession_numbers = [format_accession(item, 'int') for item in skip_accession_numbers]
631
+ filings = [filing for filing in filings if filing['accessionNumber'] not in skip_accession_numbers]
632
+
633
+ logger.debug(f"Generating URLs for {len(filings)} filings...")
634
+ urls = []
635
+ for item in filings:
636
+ url = f"{self.BASE_URL}{str(item['accessionNumber']).zfill(18)}.tar"
637
+ urls.append(url)
638
+ else:
639
+ urls = []
640
+ for accession in accession_numbers:
641
+ url = f"{self.BASE_URL}{format_accession(accession, 'no-dash').zfill(18)}.tar"
642
+ urls.append(url)
643
+
644
+ if not urls:
645
+ logger.warning("No submissions found matching the criteria")
646
+ return
647
+
648
+ urls = list(set(urls))
649
+
650
+ start_time = time.time()
651
+
652
+ asyncio.run(self.process_batch(urls, output_dir, max_batch_size=max_batch_size, keep_document_types=keep_document_types, range_lookup_db=range_lookup_db))
653
+
654
+ elapsed_time = time.time() - start_time
655
+ logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
656
+ logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
657
+
658
+ def __del__(self):
659
+ if hasattr(self, 'loop') and self.loop.is_running():
660
+ self.loop.call_soon_threadsafe(self.loop.stop)
661
+
662
+ def download_files_using_filename(self, filenames, output_dir="downloads", max_batch_size=1024*1024*1024, keep_document_types=[], range_lookup_db=None):
663
+ if self.api_key is None:
664
+ raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
665
+
666
+ if not filenames:
667
+ raise ValueError("No filenames provided")
668
+
669
+ if not isinstance(filenames, (list, tuple)):
670
+ filenames = [filenames]
671
+
672
+ for filename in filenames:
673
+ if not isinstance(filename, str):
674
+ raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
675
+ if not filename.endswith('.tar'):
676
+ raise ValueError(f"Invalid filename format: {filename}. Expected .tar extension.")
677
+
678
+ logger.debug(f"Generating URLs for {len(filenames)} files...")
679
+ urls = []
680
+ for filename in filenames:
681
+ url = f"{self.BASE_URL}{filename}"
682
+ urls.append(url)
683
+
684
+ seen = set()
685
+ urls = [url for url in urls if not (url in seen or seen.add(url))]
686
+
687
+ logger.debug(f"Downloading {len(urls)} tar files...")
688
+
689
+ start_time = time.time()
690
+
691
+ asyncio.run(self.process_batch(urls, output_dir, max_batch_size=max_batch_size, keep_document_types=keep_document_types, range_lookup_db=range_lookup_db))
692
+
693
+ elapsed_time = time.time() - start_time
694
+ logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
695
+ logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
696
+
697
+
698
+ def download_tar(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads",
699
+ filtered_accession_numbers=None, skip_accession_numbers=[],
700
+ max_batch_size=1024*1024*1024, accession_numbers=None, keep_document_types=[], range_lookup_db=None):
701
+
702
+ if filtered_accession_numbers:
703
+ filtered_accession_numbers = [format_accession(x, 'int') for x in filtered_accession_numbers]
704
+ elif filtered_accession_numbers == []:
705
+ raise ValueError("Applied filter resulted in empty accession numbers list")
706
+
707
+ downloader = TarDownloader(api_key=api_key)
708
+ downloader.download(
709
+ submission_type=submission_type,
710
+ cik=cik,
711
+ filing_date=filing_date,
712
+ output_dir=output_dir,
713
+ filtered_accession_numbers=filtered_accession_numbers,
714
+ skip_accession_numbers=skip_accession_numbers,
715
+ max_batch_size=max_batch_size,
716
+ accession_numbers=accession_numbers,
717
+ keep_document_types=keep_document_types,
718
+ range_lookup_db=range_lookup_db
719
+ )
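
To see how the new TarDownloader turns requested document types into a single HTTP request, here is a minimal standalone sketch of the range-merge and Range-header logic above, re-implemented outside the class for illustration. The 131072-byte metadata window and the 1024-byte merge threshold are the hard-coded values from the diff; the second document range is a hypothetical example.

    RANGE_MERGE_THRESHOLD = 1024

    def merge_ranges(ranges):
        # ranges: dict of document_type -> (start_byte, end_byte)
        range_list = sorted(ranges.values(), key=lambda x: x[0])
        merged = []
        current_start, current_end = range_list[0]
        for start, end in range_list[1:]:
            if start <= current_end + RANGE_MERGE_THRESHOLD:
                current_end = max(current_end, end)  # gap small enough: merge
            else:
                merged.append((current_start, current_end))
                current_start, current_end = start, end
        merged.append((current_start, current_end))
        return merged

    # metadata is always the first 128 KiB; the second range is made up
    ranges = {'metadata': (0, 131071), 'primary_document': (131500, 250000)}
    merged = merge_ranges(ranges)  # gap of 429 bytes <= 1024, so one merged range
    header = "bytes=" + ",".join(f"{start}-{end}" for start, end in merged)
    print(merged)  # [(0, 250000)]
    print(header)  # bytes=0-250000

A server that honors the header answers with 206 and either a single body or a multipart/byteranges payload, which is why download_and_process accepts both status 200 and 206 and reassembles multipart parts before parsing the partial tar.
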
@@ -14,6 +14,7 @@ from ..sec.xbrl.filter_xbrl import filter_xbrl
14
14
  from ..sec.submissions.monitor import Monitor
15
15
  from .portfolio_compression_utils_legacy import CompressionManager
16
16
  from ..datamule.sec_connector import SecConnector
17
+ from ..datamule.tar_downloader import download_tar
17
18
  import shutil
18
19
 
19
20
 
@@ -34,7 +35,6 @@ class Portfolio:
34
35
 
35
36
  if self.path.exists():
36
37
  self._load_submissions()
37
- self.submissions_loaded = True
38
38
  else:
39
39
  self.path.mkdir(parents=True, exist_ok=True)
40
40
 
@@ -81,6 +81,8 @@ class Portfolio:
81
81
  self.submissions = [s for s in (regular_submissions + batch_submissions) if s is not None]
82
82
  print(f"Successfully loaded {len(self.submissions)} submissions")
83
83
 
84
+ self.submissions_loaded = True
85
+
84
86
  def _load_batch_submissions_worker(self, batch_tar_path, pbar):
85
87
  """Worker function to load submissions from one batch tar with progress updates"""
86
88
  # Open tar handle and store it
@@ -219,8 +221,12 @@ class Portfolio:
219
221
  skip_accession_numbers = []
220
222
  if skip_existing:
221
223
  skip_accession_numbers = [sub.accession for sub in self]
222
-
224
+
225
+ # map legacy provider
223
226
  if provider == 'datamule':
227
+ provider = 'datamule-sgml'
228
+
229
+ if provider == 'datamule-sgml':
224
230
  seclibrary_download(
225
231
  output_dir=self.path,
226
232
  cik=cik,
@@ -234,6 +240,18 @@ class Portfolio:
234
240
  skip_accession_numbers=skip_accession_numbers,
235
241
  accession_numbers = accession_numbers
236
242
  )
243
+ elif provider == 'datamule-tar':
244
+ download_tar(
245
+ output_dir=self.path,
246
+ cik=cik,
247
+ api_key=self.api_key,
248
+ submission_type=submission_type,
249
+ filing_date=filing_date,
250
+ filtered_accession_numbers=filtered_accession_numbers,
251
+ skip_accession_numbers=skip_accession_numbers,
252
+ accession_numbers = accession_numbers,
253
+ keep_document_types=document_type
254
+ )
237
255
  else:
238
256
  # will later add accession_numbers arg in the free update.
239
257
  sec_download(
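
The new 'datamule-tar' provider branch above routes to download_tar in datamule/datamule/tar_downloader.py. A minimal direct call might look like the sketch below; the CIK, submission type, date range, and output directory are placeholder values, the filing_date format is an assumption, and the API key falls back to the DATAMULE_API_KEY environment variable when not passed.

    from datamule.datamule.tar_downloader import download_tar

    # keep_document_types=['metadata'] requests only the first 128 KiB of each
    # tar via an HTTP Range header instead of downloading the full archive.
    download_tar(
        submission_type='10-K',                    # placeholder filter
        cik='320193',                              # hypothetical example CIK
        filing_date=('2024-01-01', '2024-12-31'),  # assumed date-range format
        output_dir='downloads',
        keep_document_types=['metadata'],
    )
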
@@ -0,0 +1,6 @@
1
+ # TODO
2
+ # make it easy for people to bring their own cloud
3
+
4
+ SEC_LOOKUP_DB_ENDPOINT = ""
5
+ SEC_FILINGS_SGML_BUCKET_ENDPOINT = "https://sec-library.datamule.xyz/"
6
+ SEC_FILINGS_TAR_BUCKET_ENDPOINT = "https://sec-library.tar.datamule.xyz/"
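
Both downloaders now resolve their base URL from this module instead of a hard-coded string, which is the groundwork for the "bring your own cloud" TODO. The constants are consumed as shown in the hunks above, e.g.:

    # as imported by downloader.py and tar_downloader.py respectively
    from datamule.providers.providers import (
        SEC_FILINGS_SGML_BUCKET_ENDPOINT,  # "https://sec-library.datamule.xyz/"
        SEC_FILINGS_TAR_BUCKET_ENDPOINT,   # "https://sec-library.tar.datamule.xyz/"
    )
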
@@ -82,7 +82,7 @@ class Streamer(EFTSQuery):
82
82
  if self.accession_numbers is not None and accno_w_dash not in self.accession_numbers:
83
83
  return None, None, None
84
84
 
85
- if self.skip_accession_numbers is not None and accno_w_dash in self.skip_accession_numbers:
85
+ if self.skip_accession_numbers is not None and accno_no_dash in self.skip_accession_numbers:
86
86
  return None, None, None
87
87
 
88
88
  # Construct the URL
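
The one-character fix above switches the skip check from the dashed to the dash-less accession spelling. A hedged reading, inferred from the variable names rather than stated in the diff: skip_accession_numbers is normalized without dashes, so the old comparison could never match. For example:

    accno_w_dash = "0001234567-25-000001"           # hypothetical accession number
    accno_no_dash = accno_w_dash.replace("-", "")   # "000123456725000001"
    skip = {"000123456725000001"}
    accno_w_dash in skip    # False -- old check, filings were never skipped
    accno_no_dash in skip   # True  -- new check
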
@@ -83,6 +83,7 @@ class Submission:
83
83
  self._tar = None
84
84
  self._tar_compression_type = 'zstd'
85
85
  self._tar_compression_level = 3
86
+ self._tar_compression_threshold = None
86
87
  self._accession_year_2d = None
87
88
  self._documents = None
88
89
 
@@ -380,9 +381,10 @@ class Submission:
380
381
  def tar(self):
381
382
  return self._tar_submission().getvalue()
382
383
 
383
- def set_tar_compression(self,compression_type='zstd',level=3):
384
+ def set_tar_compression(self,compression_type='zstd',level=3,threshold=None):
384
385
  self._tar_compression_type = compression_type
385
386
  self._tar_compression_level = level
387
+ self._tar_compression_threshold = threshold
386
388
 
387
389
  def _tar_submission(self):
388
390
  if self._tar is not None:
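
The new threshold argument is stored on the Submission and passed through to tar_submission in the next hunk, so small documents skip zstd compression entirely. A minimal sketch, assuming an existing Submission instance named submission and an arbitrary 64 KiB cutoff:

    # documents smaller than 64 KiB are tarred uncompressed;
    # larger ones are zstd-compressed at level 3
    submission.set_tar_compression(compression_type='zstd', level=3, threshold=64 * 1024)
    tar_bytes = submission.tar()   # builds the archive via _tar_submission()
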
@@ -393,7 +395,8 @@ class Submission:
393
395
  documents_obj_list=documents_obj_list,
394
396
  metadata=self.metadata.content,
395
397
  compression_type=self._tar_compression_type,
396
- level=self._tar_compression_level
398
+ level=self._tar_compression_level,
399
+ threshold=self._tar_compression_threshold
397
400
  )
398
401
  return self._tar
399
402
 
@@ -4,43 +4,42 @@ import tarfile
4
4
  import io
5
5
  import json
6
6
 
7
- # Note: we don't actually need accession at this level. TODO
8
7
 
9
- def compress_content(content, compression_type, level):
8
+ def compress_content(content, compression_type, level, threshold):
10
9
  if compression_type == 'zstd':
11
- # Create compressor with specified level
12
- compressor = zstd.ZstdCompressor(level=level)
13
-
14
10
  # Handle string content
15
- # This should never be called
16
11
  if isinstance(content, str):
17
12
  content_bytes = content.encode('utf-8')
18
13
  else:
19
14
  content_bytes = content
20
-
21
- # Compress and return
15
+
16
+ # If content smaller than threshold, return uncompressed
17
+ if threshold is not None and len(content_bytes) < threshold:
18
+ return content_bytes
19
+
20
+ # Compress with specified level
21
+ compressor = zstd.ZstdCompressor(level=level)
22
22
  return compressor.compress(content_bytes)
23
-
23
+
24
24
  # Return uncompressed if not zstd
25
25
  return content
26
26
 
27
- def compress_content_list(document_tuple_list, compression_type, level):
27
+
28
+ def compress_content_list(document_tuple_list, compression_type, level, threshold):
28
29
  if compression_type is None:
29
30
  return document_tuple_list
30
31
 
31
32
  if level is None:
32
33
  level = 3
33
34
 
34
- # Create new list to avoid modifying original
35
35
  compressed_list = []
36
- for document_tuple in document_tuple_list:
37
- content = document_tuple[0]
38
- accession = document_tuple[1]
39
- compressed_content = compress_content(content, compression_type, level)
36
+ for content, accession in document_tuple_list:
37
+ compressed_content = compress_content(content, compression_type, level, threshold)
40
38
  compressed_list.append((compressed_content, accession))
41
39
 
42
40
  return compressed_list
43
41
 
42
+
44
43
  def tar_content_list(metadata, document_tuple_list_compressed):
45
44
  # Update metadata with compressed sizes
46
45
  for i, (content, accession) in enumerate(document_tuple_list_compressed):
@@ -65,15 +64,18 @@ def tar_content_list(metadata, document_tuple_list_compressed):
65
64
  tarinfo.size = len(content)
66
65
  tar.addfile(tarinfo, io.BytesIO(content))
67
66
 
68
- # Return the tar buffer
69
- tar_buffer.seek(0) # Reset buffer position to beginning
67
+ tar_buffer.seek(0) # Reset buffer position
70
68
  return tar_buffer
71
69
 
72
- def tar_submission(metadata, documents_obj_list, compression_type=None, level=None):
73
- """Takes a list of documents, compresses them, then tars them."""
70
+
71
+ def tar_submission(metadata, documents_obj_list, compression_type=None, level=None, threshold=None):
72
+ """Takes a list of documents, compresses them (if above threshold), then tars them."""
74
73
  document_tuple_list = [(doc.content, doc.accession) for doc in documents_obj_list]
75
- document_tuple_list_compressed = compress_content_list(document_tuple_list, # Fixed: correct parameter name
76
- compression_type=compression_type,
77
- level=level)
74
+ document_tuple_list_compressed = compress_content_list(
75
+ document_tuple_list,
76
+ compression_type=compression_type,
77
+ level=level,
78
+ threshold=threshold
79
+ )
78
80
 
79
- return tar_content_list(metadata, document_tuple_list_compressed)
81
+ return tar_content_list(metadata, document_tuple_list_compressed)
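
The threshold behavior of compress_content can be exercised on its own. Below is a small sketch mirroring the helper above; zstandard is the same dependency the module already imports, and the magic-number check matches the one TarDownloader._is_zstd_compressed performs on the other end.

    import zstandard as zstd

    def compress_content(content, compression_type, level, threshold):
        if compression_type == 'zstd':
            content_bytes = content.encode('utf-8') if isinstance(content, str) else content
            # below the threshold the payload is stored uncompressed
            if threshold is not None and len(content_bytes) < threshold:
                return content_bytes
            return zstd.ZstdCompressor(level=level).compress(content_bytes)
        return content

    small = compress_content(b"tiny", 'zstd', 3, threshold=1024)       # returned as-is
    large = compress_content(b"x" * 4096, 'zstd', 3, threshold=1024)   # zstd frame
    assert large[:4] == b'\x28\xb5\x2f\xfd'   # zstd magic number
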
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.3.8
3
+ Version: 2.4.0
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -12,8 +12,9 @@ datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_t
12
12
  datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
13
  datamule/datamule/datamule_lookup.py,sha256=e8djAg-ctSyHiKk7BjbtgugZ3p8roUjzsym5z3AihUg,9468
14
14
  datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3SSKzTITB3o,12317
15
- datamule/datamule/downloader.py,sha256=Ss9mz0Jf5UAd-CZJ6oO96o9hN04xMQIF3-e1wahokdM,18581
15
+ datamule/datamule/downloader.py,sha256=v0cG8eHZs9fttM55_ymHUWtPnCsK1aGiFTuM3jmLiCY,18650
16
16
  datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
17
+ datamule/datamule/tar_downloader.py,sha256=5lHbk96MxtNVeuY1_uSAWj3tt5RqgOgvAr_7qQqbJmc,29483
17
18
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
19
  datamule/document/document.py,sha256=Oj_7OMIldWB9HxlBca2gqr5E8ykDQZkPuUlcZjGuzqw,23016
19
20
  datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -21,8 +22,10 @@ datamule/mapping_dicts/html_mapping_dicts.py,sha256=pba3utMr2KldPeEGnMRkHyVw7D2W
21
22
  datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
22
23
  datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
23
24
  datamule/portfolio/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
- datamule/portfolio/portfolio.py,sha256=YPIvS4KKuEtm8A1XvNqDF39f4LJHhAFWmtpJzjbGDhY,11680
25
+ datamule/portfolio/portfolio.py,sha256=UK27CoKntclIGgRhyiQjARMl5NNPCqTmBu4FtdXr4S4,12349
25
26
  datamule/portfolio/portfolio_compression_utils_legacy.py,sha256=1nlbz7JfBDrI0pwTyFiBF856xqGXvQRYBulLUpk7G1A,12695
27
+ datamule/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
+ datamule/providers/providers.py,sha256=pfCjoWzDPRK46gh0RR5U0crBJGnSHJKIw6OVn9OpjXc,232
26
29
  datamule/sec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
30
  datamule/sec/utils.py,sha256=96bavyG2Kq1t8L1YA2vwYnAHKIKdRSoVXxBO5QH1HWo,2196
28
31
  datamule/sec/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -31,7 +34,7 @@ datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZ
31
34
  datamule/sec/submissions/downloader.py,sha256=9Po1eQ6YEj3Yo9Qw_M5PjQM-OR8iocTNjPIyO3O8GMs,1513
32
35
  datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
33
36
  datamule/sec/submissions/monitor.py,sha256=6mE0NZFdPId69t4V53GwBb9sqtRN7HE54sU3WpU0bnY,11900
34
- datamule/sec/submissions/streamer.py,sha256=A6hunG_mOuBVqA9bBCXhNMcsPaZlhslA3WhopyUwdS4,11611
37
+ datamule/sec/submissions/streamer.py,sha256=AVawZ9pzjuqS5dxmZTvGtpDtHDSKp3r6XjJaF1W19Rs,11612
35
38
  datamule/sec/submissions/textsearch.py,sha256=MKDXEz_VI_0ljl73_aw2lx4MVzJW5uDt8KxjvJBwPwM,5794
36
39
  datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
40
  datamule/sec/xbrl/downloadcompanyfacts.py,sha256=rMWRiCF9ci_gNZMJ9MC2c_PGEd-yEthawQ0CtVwWTjM,3323
@@ -43,8 +46,8 @@ datamule/seclibrary/bq.py,sha256=TOP0WA6agDKu4vE1eHd62NDpAc02LDDrOP-g1bJpxbw,180
43
46
  datamule/sheet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
47
  datamule/sheet/sheet.py,sha256=Dw979JGygS566N0Iwsvqk0h1s26GfbrIHDWiBaS2oH8,10711
45
48
  datamule/submission/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
46
- datamule/submission/submission.py,sha256=I_7F658vTW1C_dsogIXdxXnV3W1Gbfj_6uzse1YHgY0,17343
47
- datamule/submission/tar_submission.py,sha256=lkm1neVLW2_-G26VylL6Rzx98Cavvml0Qd2wlJHD0bw,3075
49
+ datamule/submission/submission.py,sha256=JCGyfEVqaf8ct6h9h8WjK2zBnhg0lx9kKLud3nvJ2Eg,17516
50
+ datamule/submission/tar_submission.py,sha256=uJHyTY5G8OVqmXzb0zaBEsLNthppGqYXbW-xFM4XMok,2901
48
51
  datamule/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
49
52
  datamule/tables/tables.py,sha256=Z3Eu6bdjiaNx4pgXlTMwk2Q-DhpMpEAygF2kJdp-Pu8,5722
50
53
  datamule/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
@@ -65,7 +68,7 @@ datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
65
68
  datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
66
69
  datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
67
70
  datamule/utils/pdf.py,sha256=Z9xrdVhKex2YdvjYsaPaygRE_J6P_JNiUGkwflz2Hw0,735
68
- datamule-2.3.8.dist-info/METADATA,sha256=wJ1iQL5mMQ6hyK9Wqh27ohWjZmCoZmi3XfXC5PwCwL8,609
69
- datamule-2.3.8.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
70
- datamule-2.3.8.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
71
- datamule-2.3.8.dist-info/RECORD,,
71
+ datamule-2.4.0.dist-info/METADATA,sha256=RSPqBwCagQnA41rQezMptrqFwnD0o65Fs74uGu12OlA,609
72
+ datamule-2.4.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
73
+ datamule-2.4.0.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
74
+ datamule-2.4.0.dist-info/RECORD,,