datamule 2.3.8__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.

Potentially problematic release.

This version of datamule might be problematic.

@@ -19,8 +19,9 @@ from secsgml import parse_sgml_content_into_memory
19
19
  from secsgml.utils import bytes_to_str
20
20
  from .datamule_lookup import datamule_lookup
21
21
  from ..utils.format_accession import format_accession
22
+ from ..providers.providers import SEC_FILINGS_SGML_BUCKET_ENDPOINT
22
23
 
23
- # could be cleaned up
24
+ # TODO could be cleaned up
24
25
 
25
26
  # Set up logging
26
27
  logging.basicConfig(
@@ -33,7 +34,7 @@ logger = logging.getLogger(__name__)
33
34
 
34
35
  class Downloader:
35
36
  def __init__(self, api_key=None):
36
- self.BASE_URL = "https://sec-library.datamule.xyz/"
37
+ self.BASE_URL = SEC_FILINGS_SGML_BUCKET_ENDPOINT
37
38
  self.CHUNK_SIZE = 2 * 1024 * 1024
38
39
  self.MAX_CONCURRENT_DOWNLOADS = 100
39
40
  self.MAX_DECOMPRESSION_WORKERS = cpu_count()
@@ -0,0 +1,719 @@
1
+ import os
2
+ import asyncio
3
+ import aiohttp
4
+ from tqdm import tqdm
5
+ import time
6
+ import ssl
7
+ import zstandard as zstd
8
+ import io
9
+ import json
10
+ import tarfile
11
+ import logging
12
+ from concurrent.futures import ThreadPoolExecutor
13
+ from functools import partial
14
+ from queue import Queue
15
+ from threading import Thread, Lock
16
+ from os import cpu_count
17
+ from .datamule_lookup import datamule_lookup
18
+ from ..utils.format_accession import format_accession
19
+ from ..providers.providers import SEC_FILINGS_TAR_BUCKET_ENDPOINT
20
+
21
+ # Set up logging
22
+ logging.basicConfig(
23
+ level=logging.INFO,
24
+ format='%(asctime)s - %(levelname)s - %(message)s',
25
+ handlers=logging.getLogger().handlers,
26
+ )
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ class TarDownloader:
31
+ def __init__(self, api_key=None):
32
+ self.BASE_URL = SEC_FILINGS_TAR_BUCKET_ENDPOINT
33
+ self.CHUNK_SIZE = 2 * 1024 * 1024
34
+ self.MAX_CONCURRENT_DOWNLOADS = 100
35
+ self.MAX_EXTRACTION_WORKERS = cpu_count()
36
+ self.MAX_TAR_WORKERS = cpu_count()
37
+ self.RANGE_MERGE_THRESHOLD = 1024 # Merge ranges if gap <= 1024 bytes
38
+ if api_key is not None:
39
+ self._api_key = api_key
40
+ self.loop = asyncio.new_event_loop()
41
+ self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
42
+ self.loop_thread.start()
43
+ self.async_queue = Queue()
44
+ self.error_log_lock = Lock()
45
+
46
+ def _run_event_loop(self):
47
+ asyncio.set_event_loop(self.loop)
48
+ self.loop.run_forever()
49
+
50
+ def _run_coroutine(self, coro):
51
+ future = asyncio.run_coroutine_threadsafe(coro, self.loop)
52
+ return future.result()
53
+
54
+ @property
55
+ def api_key(self):
56
+ return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')
57
+
58
+ @api_key.setter
59
+ def api_key(self, value):
60
+ if not value:
61
+ raise ValueError("API key cannot be empty")
62
+ self._api_key = value
63
+
64
+ def _log_error(self, output_dir, filename, error_msg):
65
+ error_file = os.path.join(output_dir, 'errors.json')
66
+ with self.error_log_lock:
67
+ try:
68
+ if os.path.exists(error_file):
69
+ with open(error_file, 'r') as f:
70
+ errors = json.load(f)
71
+ else:
72
+ errors = {}
73
+
74
+ errors[filename] = str(error_msg)
75
+
76
+ with open(error_file, 'w') as f:
77
+ json.dump(errors, f, indent=2)
78
+ except Exception as e:
79
+ logger.error(f"Failed to log error to {error_file}: {str(e)}")
80
+
81
+ def _get_document_ranges(self, accession_num, keep_document_types, range_lookup_db=None):
82
+ """
83
+ Get byte ranges for requested document types.
84
+
85
+ Args:
86
+ accession_num: The accession number
87
+ keep_document_types: List of document types to retrieve
88
+ range_lookup_db: Future database connection for looking up ranges
89
+
90
+ Returns:
91
+ dict mapping document_type to (start_byte, end_byte)
92
+ """
93
+ if range_lookup_db is not None:
94
+ # Future: Query database for ranges
95
+ # return range_lookup_db.get_ranges(accession_num, keep_document_types)
96
+ pass
97
+
98
+ # Hardcoded ranges for now
99
+ ranges = {}
100
+ if 'metadata' in keep_document_types:
101
+ # Metadata is always first 128KB
102
+ ranges['metadata'] = (0, 131071)
103
+
104
+ return ranges
105
+
106
+ def _merge_ranges(self, ranges):
107
+ """
108
+ Merge overlapping or close ranges.
109
+
110
+ Args:
111
+ ranges: dict mapping document_type to (start_byte, end_byte)
112
+
113
+ Returns:
114
+ list of merged (start_byte, end_byte) tuples, sorted
115
+ """
116
+ if not ranges:
117
+ return []
118
+
119
+ # Extract and sort ranges by start byte
120
+ range_list = sorted(ranges.values(), key=lambda x: x[0])
121
+
122
+ merged = []
123
+ current_start, current_end = range_list[0]
124
+
125
+ for start, end in range_list[1:]:
126
+ # Check if ranges overlap or are within merge threshold
127
+ if start <= current_end + self.RANGE_MERGE_THRESHOLD:
128
+ # Merge: extend current range
129
+ current_end = max(current_end, end)
130
+ else:
131
+ # No merge: save current range and start new one
132
+ merged.append((current_start, current_end))
133
+ current_start, current_end = start, end
134
+
135
+ # Add the last range
136
+ merged.append((current_start, current_end))
137
+
138
+ return merged
139
+
140
+ def _build_range_header(self, merged_ranges):
141
+ """
142
+ Build HTTP Range header from merged ranges.
143
+
144
+ Args:
145
+ merged_ranges: list of (start_byte, end_byte) tuples
146
+
147
+ Returns:
148
+ Range header string, e.g., "bytes=0-131071,200000-300000"
149
+ """
150
+ if not merged_ranges:
151
+ return None
152
+
153
+ range_specs = [f"{start}-{end}" for start, end in merged_ranges]
154
+ return f"bytes={','.join(range_specs)}"
155
+
156
+ def _parse_tar_header(self, header_bytes):
157
+ """
158
+ Parse a 512-byte tar header.
159
+
160
+ Returns:
161
+ dict with 'name', 'size', or None if invalid header
162
+ """
163
+ if len(header_bytes) < 512:
164
+ return None
165
+
166
+ # Check if it's a zero block (end of archive)
167
+ if header_bytes == b'\x00' * 512:
168
+ return None
169
+
170
+ try:
171
+ # Tar header format (POSIX ustar)
172
+ name = header_bytes[0:100].split(b'\x00')[0].decode('utf-8')
173
+ size_str = header_bytes[124:136].split(b'\x00')[0].decode('utf-8').strip()
174
+
175
+ if not size_str:
176
+ return None
177
+
178
+ # Size is in octal
179
+ size = int(size_str, 8)
180
+
181
+ return {
182
+ 'name': name,
183
+ 'size': size
184
+ }
185
+ except:
186
+ return None
187
+
188
+ def _extract_files_from_partial_tar(self, tar_bytes):
189
+ """
190
+ Extract files from partial tar data by manually parsing headers.
191
+
192
+ Args:
193
+ tar_bytes: Raw bytes from partial tar download
194
+
195
+ Returns:
196
+ list of dicts with 'name' and 'content'
197
+ """
198
+ files = []
199
+ offset = 0
200
+
201
+ while offset + 512 <= len(tar_bytes):
202
+ # Read header
203
+ header = self._parse_tar_header(tar_bytes[offset:offset+512])
204
+
205
+ if header is None:
206
+ # End of archive or invalid header
207
+ break
208
+
209
+ offset += 512 # Move past header
210
+
211
+ # Calculate file content end and padding
212
+ file_size = header['size']
213
+ content_end = offset + file_size
214
+
215
+ # Check if we have the full file content
216
+ if content_end > len(tar_bytes):
217
+ # File is truncated, skip it
218
+ break
219
+
220
+ # Extract file content
221
+ content = tar_bytes[offset:content_end]
222
+
223
+ files.append({
224
+ 'name': os.path.basename(header['name']),
225
+ 'content': content
226
+ })
227
+
228
+ # Move to next 512-byte boundary
229
+ padding = (512 - (file_size % 512)) % 512
230
+ offset = content_end + padding
231
+
232
+ return files
233
+
234
+ def _build_filename_to_type_map(self, metadata_content):
235
+ """
236
+ Parse metadata and build a mapping of filename to document type.
237
+
238
+ Args:
239
+ metadata_content: The metadata.json content as bytes
240
+
241
+ Returns:
242
+ dict mapping filename to document type
243
+ """
244
+ try:
245
+ metadata = json.loads(metadata_content)
246
+ filename_map = {}
247
+
248
+ if 'documents' in metadata:
249
+ for doc in metadata['documents']:
250
+ filename = doc.get('filename')
251
+ doc_type = doc.get('type')
252
+ if filename and doc_type:
253
+ filename_map[filename] = doc_type
254
+
255
+ return filename_map
256
+ except:
257
+ return {}
258
+
259
+ def _filter_documents_by_type(self, documents, filename_map, keep_document_types):
260
+ """
261
+ Filter documents based on their type from metadata.
262
+
263
+ Args:
264
+ documents: List of dicts with 'name' and 'content'
265
+ filename_map: Dict mapping filename to document type
266
+ keep_document_types: List of document types to keep
267
+
268
+ Returns:
269
+ Filtered list of documents
270
+ """
271
+ if not keep_document_types or not filename_map:
272
+ return documents
273
+
274
+ # 'metadata' is special - it's already handled separately
275
+ # Filter out 'metadata' from keep_document_types for document filtering
276
+ doc_types_to_keep = [dt for dt in keep_document_types if dt != 'metadata']
277
+
278
+ if not doc_types_to_keep:
279
+ # Only metadata requested, no other documents
280
+ return []
281
+
282
+ filtered = []
283
+ for doc in documents:
284
+ doc_type = filename_map.get(doc['name'])
285
+ if doc_type and doc_type in doc_types_to_keep:
286
+ filtered.append(doc)
287
+
288
+ return filtered
289
+
290
+ class TarManager:
291
+ def __init__(self, output_dir, num_tar_files, max_batch_size=1024*1024*1024):
292
+ self.output_dir = output_dir
293
+ self.num_tar_files = num_tar_files
294
+ self.max_batch_size = max_batch_size
295
+ self.tar_files = {}
296
+ self.tar_locks = {}
297
+ self.file_counters = {}
298
+ self.tar_sizes = {}
299
+ self.tar_sequences = {}
300
+
301
+ for i in range(num_tar_files):
302
+ tar_path = os.path.join(output_dir, f'batch_{i:03d}_001.tar')
303
+ self.tar_files[i] = tarfile.open(tar_path, 'a')
304
+ self.tar_locks[i] = Lock()
305
+ self.file_counters[i] = 0
306
+ self.tar_sizes[i] = 0
307
+ self.tar_sequences[i] = 1
308
+
309
+ def get_tar_index(self, accession_num):
310
+ return hash(accession_num) % self.num_tar_files
311
+
312
+ def write_submission(self, accession_num, metadata_content, documents):
313
+ tar_index = self.get_tar_index(accession_num)
314
+
315
+ submission_size = len(metadata_content) + sum(len(doc['content']) for doc in documents)
316
+
317
+ with self.tar_locks[tar_index]:
318
+ if self.tar_sizes[tar_index] > 0 and self.tar_sizes[tar_index] + submission_size > self.max_batch_size:
319
+ tar = self.tar_files[tar_index]
320
+ tar.close()
321
+
322
+ self.tar_sequences[tar_index] += 1
323
+ new_tar_path = os.path.join(self.output_dir, f'batch_{tar_index:03d}_{self.tar_sequences[tar_index]:03d}.tar')
324
+ self.tar_files[tar_index] = tarfile.open(new_tar_path, 'a')
325
+ self.file_counters[tar_index] = 0
326
+ self.tar_sizes[tar_index] = 0
327
+
328
+ tar = self.tar_files[tar_index]
329
+
330
+ try:
331
+ # Write metadata
332
+ tarinfo = tarfile.TarInfo(name=f'{accession_num}/metadata.json')
333
+ tarinfo.size = len(metadata_content)
334
+ tar.addfile(tarinfo, io.BytesIO(metadata_content))
335
+
336
+ # Write documents
337
+ for doc in documents:
338
+ tarinfo = tarfile.TarInfo(name=f'{accession_num}/{doc["name"]}')
339
+ tarinfo.size = len(doc['content'])
340
+ tar.addfile(tarinfo, io.BytesIO(doc['content']))
341
+
342
+ self.file_counters[tar_index] += 1
343
+ self.tar_sizes[tar_index] += submission_size
344
+ return True
345
+
346
+ except Exception as e:
347
+ logger.error(f"Error writing {accession_num} to tar {tar_index}: {str(e)}")
348
+ return False
349
+
350
+ def close_all(self):
351
+ for i, tar in self.tar_files.items():
352
+ try:
353
+ tar.close()
354
+ except Exception as e:
355
+ logger.error(f"Error closing tar {i}: {str(e)}")
356
+
357
+ def _parse_multipart_byteranges(self, content, content_type):
358
+ """
359
+ Parse multipart/byteranges response.
360
+
361
+ Args:
362
+ content: Response body bytes
363
+ content_type: Content-Type header value
364
+
365
+ Returns:
366
+ list of (start_byte, end_byte, data) tuples
367
+ """
368
+ # Extract boundary from content type
369
+ if 'boundary=' not in content_type:
370
+ # Single range response, not multipart
371
+ return [(None, None, content)]
372
+
373
+ boundary = content_type.split('boundary=')[1].strip()
374
+ boundary_bytes = f'--{boundary}'.encode('utf-8')
375
+ end_boundary_bytes = f'--{boundary}--'.encode('utf-8')
376
+
377
+ parts = []
378
+ sections = content.split(boundary_bytes)
379
+
380
+ for section in sections[1:]: # Skip first empty section
381
+ if section.startswith(end_boundary_bytes) or not section.strip():
382
+ continue
383
+
384
+ # Split headers from body
385
+ header_end = section.find(b'\r\n\r\n')
386
+ if header_end == -1:
387
+ header_end = section.find(b'\n\n')
388
+ if header_end == -1:
389
+ continue
390
+ body_start = header_end + 2
391
+ else:
392
+ body_start = header_end + 4
393
+
394
+ headers = section[:header_end].decode('utf-8', errors='ignore')
395
+ body = section[body_start:].rstrip(b'\r\n')
396
+
397
+ # Parse Content-Range header
398
+ start_byte = None
399
+ end_byte = None
400
+ for line in headers.split('\n'):
401
+ if line.lower().startswith('content-range:'):
402
+ # Format: "Content-Range: bytes START-END/TOTAL"
403
+ range_part = line.split(':')[1].strip()
404
+ if 'bytes ' in range_part:
405
+ byte_range = range_part.split('bytes ')[1].split('/')[0]
406
+ start_byte, end_byte = map(int, byte_range.split('-'))
407
+
408
+ parts.append((start_byte, end_byte, body))
409
+
410
+ return parts
411
+
412
+ def extract_and_process_tar(self, tar_content, filename, tar_manager, output_dir, keep_document_types, is_partial=False):
413
+ """Extract tar file and process its contents"""
414
+ try:
415
+ accession_num = filename.replace('.tar', '').split('/')[-1]
416
+
417
+ # If partial download (range request), manually parse tar headers
418
+ if is_partial:
419
+ files = self._extract_files_from_partial_tar(tar_content)
420
+
421
+ if not files:
422
+ self._log_error(output_dir, filename, "No files found in partial tar")
423
+ return False
424
+
425
+ # First file should be metadata
426
+ metadata_content = files[0]['content']
427
+ documents = files[1:] if len(files) > 1 else []
428
+
429
+ # Build filename to type mapping from metadata
430
+ filename_map = self._build_filename_to_type_map(metadata_content)
431
+
432
+ # Filter documents based on keep_document_types
433
+ documents = self._filter_documents_by_type(documents, filename_map, keep_document_types)
434
+
435
+ else:
436
+ # Full download, use tarfile library
437
+ tar_buffer = io.BytesIO(tar_content)
438
+
439
+ with tarfile.open(fileobj=tar_buffer, mode='r') as tar:
440
+ members = tar.getmembers()
441
+
442
+ if not members:
443
+ self._log_error(output_dir, filename, "Empty tar file")
444
+ return False
445
+
446
+ # Read all files
447
+ metadata_content = None
448
+ documents = []
449
+
450
+ for idx, member in enumerate(members):
451
+ if member.isfile():
452
+ file_content = tar.extractfile(member).read()
453
+
454
+ if idx == 0:
455
+ # First file is always metadata (never compressed)
456
+ metadata_content = file_content
457
+ else:
458
+ member_name = os.path.basename(member.name)
459
+
460
+ # Check if file is zstd compressed
461
+ if self._is_zstd_compressed(file_content):
462
+ file_content = self._decompress_zstd(file_content)
463
+
464
+ documents.append({
465
+ 'name': member_name,
466
+ 'content': file_content
467
+ })
468
+
469
+ if metadata_content is None:
470
+ self._log_error(output_dir, filename, "No metadata found in tar")
471
+ return False
472
+
473
+ # Build filename to type mapping and filter
474
+ if keep_document_types:
475
+ filename_map = self._build_filename_to_type_map(metadata_content)
476
+ documents = self._filter_documents_by_type(documents, filename_map, keep_document_types)
477
+
478
+ tar_buffer.close()
479
+
480
+ # Write to output tar
481
+ success = tar_manager.write_submission(accession_num, metadata_content, documents)
482
+
483
+ if not success:
484
+ self._log_error(output_dir, filename, "Failed to write to output tar")
485
+
486
+ return success
487
+
488
+ except Exception as e:
489
+ self._log_error(output_dir, filename, f"Tar extraction error: {str(e)}")
490
+ return False
491
+
492
+ def _is_zstd_compressed(self, content):
493
+ """Check if content is zstd compressed by magic number"""
494
+ return len(content) >= 4 and content[:4] == b'\x28\xb5\x2f\xfd'
495
+
496
+ def _decompress_zstd(self, compressed_content):
497
+ """Decompress zstd content"""
498
+ dctx = zstd.ZstdDecompressor()
499
+ return dctx.decompress(compressed_content)
500
+
501
+ async def download_and_process(self, session, url, semaphore, extraction_pool, tar_manager, output_dir, pbar, keep_document_types, range_lookup_db=None):
502
+ async with semaphore:
503
+ filename = url.split('/')[-1]
504
+ accession_num = filename.replace('.tar', '').split('/')[-1]
505
+
506
+ api_key = self.api_key
507
+ if not api_key:
508
+ raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
509
+
510
+ try:
511
+ headers = {
512
+ 'Connection': 'keep-alive',
513
+ 'Accept-Encoding': 'gzip, deflate, br',
514
+ 'Authorization': f'Bearer {api_key}'
515
+ }
516
+
517
+ # Determine if we need partial download
518
+ range_header = None
519
+ is_partial = False
520
+ if keep_document_types:
521
+ # Get ranges for requested document types
522
+ doc_ranges = self._get_document_ranges(accession_num, keep_document_types, range_lookup_db)
523
+
524
+ if doc_ranges:
525
+ # Merge ranges
526
+ merged_ranges = self._merge_ranges(doc_ranges)
527
+
528
+ # Build range header
529
+ range_header = self._build_range_header(merged_ranges)
530
+
531
+ if range_header:
532
+ headers['Range'] = range_header
533
+ is_partial = True
534
+
535
+ async with session.get(url, headers=headers) as response:
536
+ if response.status in (200, 206): # 200 = full, 206 = partial
537
+ content_type = response.headers.get('Content-Type', '')
538
+
539
+ # Read all chunks
540
+ chunks = []
541
+ async for chunk in response.content.iter_chunked(self.CHUNK_SIZE):
542
+ chunks.append(chunk)
543
+
544
+ content = b''.join(chunks)
545
+
546
+ # Handle multipart response if needed
547
+ if response.status == 206 and 'multipart/byteranges' in content_type:
548
+ # Parse multipart response
549
+ parts = self._parse_multipart_byteranges(content, content_type)
550
+
551
+ # Reconstruct tar content from parts
552
+ tar_content = b''.join(part[2] for part in parts)
553
+ else:
554
+ tar_content = content
555
+
556
+ # Process in thread pool
557
+ loop = asyncio.get_running_loop()
558
+ success = await loop.run_in_executor(
559
+ extraction_pool,
560
+ partial(self.extract_and_process_tar, tar_content, filename, tar_manager, output_dir, keep_document_types, is_partial)
561
+ )
562
+
563
+ if not success:
564
+ self._log_error(output_dir, filename, "Failed to process tar file")
565
+
566
+ elif response.status == 401:
567
+ self._log_error(output_dir, filename, "Authentication failed: Invalid API key")
568
+ raise ValueError("Invalid API key")
569
+ else:
570
+ self._log_error(output_dir, filename, f"Download failed: Status {response.status}")
571
+
572
+ pbar.update(1)
573
+
574
+ except Exception as e:
575
+ self._log_error(output_dir, filename, str(e))
576
+ pbar.update(1)
577
+
578
+ async def process_batch(self, urls, output_dir, max_batch_size=1024*1024*1024, keep_document_types=[], range_lookup_db=None):
579
+ os.makedirs(output_dir, exist_ok=True)
580
+
581
+ num_tar_files = min(self.MAX_TAR_WORKERS, len(urls))
582
+
583
+ tar_manager = self.TarManager(output_dir, num_tar_files, max_batch_size)
584
+
585
+ try:
586
+ with tqdm(total=len(urls), desc="Downloading tar files") as pbar:
587
+ semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
588
+ extraction_pool = ThreadPoolExecutor(max_workers=self.MAX_EXTRACTION_WORKERS)
589
+
590
+ connector = aiohttp.TCPConnector(
591
+ limit=self.MAX_CONCURRENT_DOWNLOADS,
592
+ force_close=False,
593
+ ssl=ssl.create_default_context(),
594
+ ttl_dns_cache=300,
595
+ keepalive_timeout=60
596
+ )
597
+
598
+ async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=600)) as session:
599
+ tasks = [
600
+ self.download_and_process(
601
+ session, url, semaphore, extraction_pool,
602
+ tar_manager, output_dir, pbar, keep_document_types, range_lookup_db
603
+ )
604
+ for url in urls
605
+ ]
606
+ await asyncio.gather(*tasks, return_exceptions=True)
607
+
608
+ extraction_pool.shutdown()
609
+
610
+ finally:
611
+ tar_manager.close_all()
612
+
613
+ def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads",
614
+ filtered_accession_numbers=None, skip_accession_numbers=[],
615
+ max_batch_size=1024*1024*1024, accession_numbers=None, keep_document_types=[], range_lookup_db=None):
616
+ if self.api_key is None:
617
+ raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
618
+
619
+ logger.debug("Querying SEC filings...")
620
+
621
+ if not accession_numbers:
622
+ filings = datamule_lookup(cik=cik, submission_type=submission_type, filing_date=filing_date,
623
+ columns=['accessionNumber'], distinct=True, page_size=25000, quiet=False, api_key=self.api_key)
624
+
625
+ if filtered_accession_numbers:
626
+ filtered_accession_numbers = [format_accession(item, 'int') for item in filtered_accession_numbers]
627
+ filings = [filing for filing in filings if filing['accessionNumber'] in filtered_accession_numbers]
628
+
629
+ if skip_accession_numbers:
630
+ skip_accession_numbers = [format_accession(item, 'int') for item in skip_accession_numbers]
631
+ filings = [filing for filing in filings if filing['accessionNumber'] not in skip_accession_numbers]
632
+
633
+ logger.debug(f"Generating URLs for {len(filings)} filings...")
634
+ urls = []
635
+ for item in filings:
636
+ url = f"{self.BASE_URL}{str(item['accessionNumber']).zfill(18)}.tar"
637
+ urls.append(url)
638
+ else:
639
+ urls = []
640
+ for accession in accession_numbers:
641
+ url = f"{self.BASE_URL}{format_accession(accession, 'no-dash').zfill(18)}.tar"
642
+ urls.append(url)
643
+
644
+ if not urls:
645
+ logger.warning("No submissions found matching the criteria")
646
+ return
647
+
648
+ urls = list(set(urls))
649
+
650
+ start_time = time.time()
651
+
652
+ asyncio.run(self.process_batch(urls, output_dir, max_batch_size=max_batch_size, keep_document_types=keep_document_types, range_lookup_db=range_lookup_db))
653
+
654
+ elapsed_time = time.time() - start_time
655
+ logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
656
+ logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
657
+
658
+ def __del__(self):
659
+ if hasattr(self, 'loop') and self.loop.is_running():
660
+ self.loop.call_soon_threadsafe(self.loop.stop)
661
+
662
+ def download_files_using_filename(self, filenames, output_dir="downloads", max_batch_size=1024*1024*1024, keep_document_types=[], range_lookup_db=None):
663
+ if self.api_key is None:
664
+ raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
665
+
666
+ if not filenames:
667
+ raise ValueError("No filenames provided")
668
+
669
+ if not isinstance(filenames, (list, tuple)):
670
+ filenames = [filenames]
671
+
672
+ for filename in filenames:
673
+ if not isinstance(filename, str):
674
+ raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
675
+ if not filename.endswith('.tar'):
676
+ raise ValueError(f"Invalid filename format: {filename}. Expected .tar extension.")
677
+
678
+ logger.debug(f"Generating URLs for {len(filenames)} files...")
679
+ urls = []
680
+ for filename in filenames:
681
+ url = f"{self.BASE_URL}{filename}"
682
+ urls.append(url)
683
+
684
+ seen = set()
685
+ urls = [url for url in urls if not (url in seen or seen.add(url))]
686
+
687
+ logger.debug(f"Downloading {len(urls)} tar files...")
688
+
689
+ start_time = time.time()
690
+
691
+ asyncio.run(self.process_batch(urls, output_dir, max_batch_size=max_batch_size, keep_document_types=keep_document_types, range_lookup_db=range_lookup_db))
692
+
693
+ elapsed_time = time.time() - start_time
694
+ logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
695
+ logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
696
+
697
+
698
+ def download_tar(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads",
699
+ filtered_accession_numbers=None, skip_accession_numbers=[],
700
+ max_batch_size=1024*1024*1024, accession_numbers=None, keep_document_types=[], range_lookup_db=None):
701
+
702
+ if filtered_accession_numbers:
703
+ filtered_accession_numbers = [format_accession(x, 'int') for x in filtered_accession_numbers]
704
+ elif filtered_accession_numbers == []:
705
+ raise ValueError("Applied filter resulted in empty accession numbers list")
706
+
707
+ downloader = TarDownloader(api_key=api_key)
708
+ downloader.download(
709
+ submission_type=submission_type,
710
+ cik=cik,
711
+ filing_date=filing_date,
712
+ output_dir=output_dir,
713
+ filtered_accession_numbers=filtered_accession_numbers,
714
+ skip_accession_numbers=skip_accession_numbers,
715
+ max_batch_size=max_batch_size,
716
+ accession_numbers=accession_numbers,
717
+ keep_document_types=keep_document_types,
718
+ range_lookup_db=range_lookup_db
719
+ )
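
To see how the new TarDownloader turns requested document types into a single HTTP request, here is a minimal standalone sketch of the range-merge and Range-header logic above, re-implemented outside the class for illustration. The 131072-byte metadata window and the 1024-byte merge threshold are the hard-coded values from the diff; the second document range is a hypothetical example.

    RANGE_MERGE_THRESHOLD = 1024

    def merge_ranges(ranges):
        # ranges: dict of document_type -> (start_byte, end_byte)
        range_list = sorted(ranges.values(), key=lambda x: x[0])
        merged = []
        current_start, current_end = range_list[0]
        for start, end in range_list[1:]:
            if start <= current_end + RANGE_MERGE_THRESHOLD:
                current_end = max(current_end, end)  # gap small enough: merge
            else:
                merged.append((current_start, current_end))
                current_start, current_end = start, end
        merged.append((current_start, current_end))
        return merged

    # metadata is always the first 128 KiB; the second range is made up
    ranges = {'metadata': (0, 131071), 'primary_document': (131500, 250000)}
    merged = merge_ranges(ranges)  # gap of 429 bytes <= 1024, so one merged range
    header = "bytes=" + ",".join(f"{start}-{end}" for start, end in merged)
    print(merged)  # [(0, 250000)]
    print(header)  # bytes=0-250000

A server that honors the header answers with 206 and either a single body or a multipart/byteranges payload, which is why download_and_process accepts both status 200 and 206 and reassembles multipart parts before parsing the partial tar.
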
@@ -14,6 +14,7 @@ from ..sec.xbrl.filter_xbrl import filter_xbrl
14
14
  from ..sec.submissions.monitor import Monitor
15
15
  from .portfolio_compression_utils_legacy import CompressionManager
16
16
  from ..datamule.sec_connector import SecConnector
17
+ from ..datamule.tar_downloader import download_tar
17
18
  import shutil
18
19
 
19
20
 
@@ -34,7 +35,6 @@ class Portfolio:
34
35
 
35
36
  if self.path.exists():
36
37
  self._load_submissions()
37
- self.submissions_loaded = True
38
38
  else:
39
39
  self.path.mkdir(parents=True, exist_ok=True)
40
40
 
@@ -81,6 +81,8 @@ class Portfolio:
81
81
  self.submissions = [s for s in (regular_submissions + batch_submissions) if s is not None]
82
82
  print(f"Successfully loaded {len(self.submissions)} submissions")
83
83
 
84
+ self.submissions_loaded = True
85
+
84
86
  def _load_batch_submissions_worker(self, batch_tar_path, pbar):
85
87
  """Worker function to load submissions from one batch tar with progress updates"""
86
88
  # Open tar handle and store it
@@ -219,8 +221,12 @@ class Portfolio:
219
221
  skip_accession_numbers = []
220
222
  if skip_existing:
221
223
  skip_accession_numbers = [sub.accession for sub in self]
222
-
224
+
225
+ # map legacy provider
223
226
  if provider == 'datamule':
227
+ provider = 'datamule-sgml'
228
+
229
+ if provider == 'datamule-sgml':
224
230
  seclibrary_download(
225
231
  output_dir=self.path,
226
232
  cik=cik,
@@ -234,6 +240,18 @@ class Portfolio:
234
240
  skip_accession_numbers=skip_accession_numbers,
235
241
  accession_numbers = accession_numbers
236
242
  )
243
+ elif provider == 'datamule-tar':
244
+ download_tar(
245
+ output_dir=self.path,
246
+ cik=cik,
247
+ api_key=self.api_key,
248
+ submission_type=submission_type,
249
+ filing_date=filing_date,
250
+ filtered_accession_numbers=filtered_accession_numbers,
251
+ skip_accession_numbers=skip_accession_numbers,
252
+ accession_numbers = accession_numbers,
253
+ keep_document_types=document_type
254
+ )
237
255
  else:
238
256
  # will later add accession_numbers arg in the free update.
239
257
  sec_download(
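
The new 'datamule-tar' provider branch above routes to download_tar in datamule/datamule/tar_downloader.py. A minimal direct call might look like the sketch below; the CIK, submission type, date range, and output directory are placeholder values, the filing_date format is an assumption, and the API key falls back to the DATAMULE_API_KEY environment variable when not passed.

    from datamule.datamule.tar_downloader import download_tar

    # keep_document_types=['metadata'] requests only the first 128 KiB of each
    # tar via an HTTP Range header instead of downloading the full archive.
    download_tar(
        submission_type='10-K',                    # placeholder filter
        cik='320193',                              # hypothetical example CIK
        filing_date=('2024-01-01', '2024-12-31'),  # assumed date-range format
        output_dir='downloads',
        keep_document_types=['metadata'],
    )
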
@@ -0,0 +1,6 @@
1
+ # TODO
2
+ # make it easy for people to bring their own cloud
3
+
4
+ SEC_LOOKUP_DB_ENDPOINT = ""
5
+ SEC_FILINGS_SGML_BUCKET_ENDPOINT = "https://sec-library.datamule.xyz/"
6
+ SEC_FILINGS_TAR_BUCKET_ENDPOINT = "https://sec-library.tar.datamule.xyz/"
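
Both downloaders now resolve their base URL from this module instead of a hard-coded string, which is the groundwork for the "bring your own cloud" TODO. The constants are consumed as shown in the hunks above, e.g.:

    # as imported by downloader.py and tar_downloader.py respectively
    from datamule.providers.providers import (
        SEC_FILINGS_SGML_BUCKET_ENDPOINT,  # "https://sec-library.datamule.xyz/"
        SEC_FILINGS_TAR_BUCKET_ENDPOINT,   # "https://sec-library.tar.datamule.xyz/"
    )
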
@@ -82,7 +82,7 @@ class Streamer(EFTSQuery):
82
82
  if self.accession_numbers is not None and accno_w_dash not in self.accession_numbers:
83
83
  return None, None, None
84
84
 
85
- if self.skip_accession_numbers is not None and accno_w_dash in self.skip_accession_numbers:
85
+ if self.skip_accession_numbers is not None and accno_no_dash in self.skip_accession_numbers:
86
86
  return None, None, None
87
87
 
88
88
  # Construct the URL
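
The one-character fix above switches the skip check from the dashed to the dash-less accession spelling. A hedged reading, inferred from the variable names rather than stated in the diff: skip_accession_numbers is normalized without dashes, so the old comparison could never match. For example:

    accno_w_dash = "0001234567-25-000001"           # hypothetical accession number
    accno_no_dash = accno_w_dash.replace("-", "")   # "000123456725000001"
    skip = {"000123456725000001"}
    accno_w_dash in skip    # False -- old check, filings were never skipped
    accno_no_dash in skip   # True  -- new check
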
@@ -83,6 +83,7 @@ class Submission:
83
83
  self._tar = None
84
84
  self._tar_compression_type = 'zstd'
85
85
  self._tar_compression_level = 3
86
+ self._tar_compression_threshold = None
86
87
  self._accession_year_2d = None
87
88
  self._documents = None
88
89
 
@@ -380,9 +381,10 @@ class Submission:
380
381
  def tar(self):
381
382
  return self._tar_submission().getvalue()
382
383
 
383
- def set_tar_compression(self,compression_type='zstd',level=3):
384
+ def set_tar_compression(self,compression_type='zstd',level=3,threshold=None):
384
385
  self._tar_compression_type = compression_type
385
386
  self._tar_compression_level = level
387
+ self._tar_compression_threshold = threshold
386
388
 
387
389
  def _tar_submission(self):
388
390
  if self._tar is not None:
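
The new threshold argument is stored on the Submission and passed through to tar_submission in the next hunk, so small documents skip zstd compression entirely. A minimal sketch, assuming an existing Submission instance named submission and an arbitrary 64 KiB cutoff:

    # documents smaller than 64 KiB are tarred uncompressed;
    # larger ones are zstd-compressed at level 3
    submission.set_tar_compression(compression_type='zstd', level=3, threshold=64 * 1024)
    tar_bytes = submission.tar()   # builds the archive via _tar_submission()
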
@@ -393,7 +395,8 @@ class Submission:
393
395
  documents_obj_list=documents_obj_list,
394
396
  metadata=self.metadata.content,
395
397
  compression_type=self._tar_compression_type,
396
- level=self._tar_compression_level
398
+ level=self._tar_compression_level,
399
+ threshold=self._tar_compression_threshold
397
400
  )
398
401
  return self._tar
399
402
 
@@ -4,43 +4,42 @@ import tarfile
4
4
  import io
5
5
  import json
6
6
 
7
- # Note: we don't actually need accession at this level. TODO
8
7
 
9
- def compress_content(content, compression_type, level):
8
+ def compress_content(content, compression_type, level, threshold):
10
9
  if compression_type == 'zstd':
11
- # Create compressor with specified level
12
- compressor = zstd.ZstdCompressor(level=level)
13
-
14
10
  # Handle string content
15
- # This should never be called
16
11
  if isinstance(content, str):
17
12
  content_bytes = content.encode('utf-8')
18
13
  else:
19
14
  content_bytes = content
20
-
21
- # Compress and return
15
+
16
+ # If content smaller than threshold, return uncompressed
17
+ if threshold is not None and len(content_bytes) < threshold:
18
+ return content_bytes
19
+
20
+ # Compress with specified level
21
+ compressor = zstd.ZstdCompressor(level=level)
22
22
  return compressor.compress(content_bytes)
23
-
23
+
24
24
  # Return uncompressed if not zstd
25
25
  return content
26
26
 
27
- def compress_content_list(document_tuple_list, compression_type, level):
27
+
28
+ def compress_content_list(document_tuple_list, compression_type, level, threshold):
28
29
  if compression_type is None:
29
30
  return document_tuple_list
30
31
 
31
32
  if level is None:
32
33
  level = 3
33
34
 
34
- # Create new list to avoid modifying original
35
35
  compressed_list = []
36
- for document_tuple in document_tuple_list:
37
- content = document_tuple[0]
38
- accession = document_tuple[1]
39
- compressed_content = compress_content(content, compression_type, level)
36
+ for content, accession in document_tuple_list:
37
+ compressed_content = compress_content(content, compression_type, level, threshold)
40
38
  compressed_list.append((compressed_content, accession))
41
39
 
42
40
  return compressed_list
43
41
 
42
+
44
43
  def tar_content_list(metadata, document_tuple_list_compressed):
45
44
  # Update metadata with compressed sizes
46
45
  for i, (content, accession) in enumerate(document_tuple_list_compressed):
@@ -65,15 +64,18 @@ def tar_content_list(metadata, document_tuple_list_compressed):
65
64
  tarinfo.size = len(content)
66
65
  tar.addfile(tarinfo, io.BytesIO(content))
67
66
 
68
- # Return the tar buffer
69
- tar_buffer.seek(0) # Reset buffer position to beginning
67
+ tar_buffer.seek(0) # Reset buffer position
70
68
  return tar_buffer
71
69
 
72
- def tar_submission(metadata, documents_obj_list, compression_type=None, level=None):
73
- """Takes a list of documents, compresses them, then tars them."""
70
+
71
+ def tar_submission(metadata, documents_obj_list, compression_type=None, level=None, threshold=None):
72
+ """Takes a list of documents, compresses them (if above threshold), then tars them."""
74
73
  document_tuple_list = [(doc.content, doc.accession) for doc in documents_obj_list]
75
- document_tuple_list_compressed = compress_content_list(document_tuple_list, # Fixed: correct parameter name
76
- compression_type=compression_type,
77
- level=level)
74
+ document_tuple_list_compressed = compress_content_list(
75
+ document_tuple_list,
76
+ compression_type=compression_type,
77
+ level=level,
78
+ threshold=threshold
79
+ )
78
80
 
79
- return tar_content_list(metadata, document_tuple_list_compressed)
81
+ return tar_content_list(metadata, document_tuple_list_compressed)
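
The threshold behavior of compress_content can be exercised on its own. Below is a small sketch mirroring the helper above; zstandard is the same dependency the module already imports, and the magic-number check matches the one TarDownloader._is_zstd_compressed performs on the other end.

    import zstandard as zstd

    def compress_content(content, compression_type, level, threshold):
        if compression_type == 'zstd':
            content_bytes = content.encode('utf-8') if isinstance(content, str) else content
            # below the threshold the payload is stored uncompressed
            if threshold is not None and len(content_bytes) < threshold:
                return content_bytes
            return zstd.ZstdCompressor(level=level).compress(content_bytes)
        return content

    small = compress_content(b"tiny", 'zstd', 3, threshold=1024)       # returned as-is
    large = compress_content(b"x" * 4096, 'zstd', 3, threshold=1024)   # zstd frame
    assert large[:4] == b'\x28\xb5\x2f\xfd'   # zstd magic number
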
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamule
3
- Version: 2.3.8
3
+ Version: 2.4.0
4
4
  Summary: Work with SEC submissions at scale.
5
5
  Home-page: https://github.com/john-friedman/datamule-python
6
6
  Author: John Friedman
@@ -12,8 +12,9 @@ datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_t
12
12
  datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
13
  datamule/datamule/datamule_lookup.py,sha256=e8djAg-ctSyHiKk7BjbtgugZ3p8roUjzsym5z3AihUg,9468
14
14
  datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3SSKzTITB3o,12317
15
- datamule/datamule/downloader.py,sha256=Ss9mz0Jf5UAd-CZJ6oO96o9hN04xMQIF3-e1wahokdM,18581
15
+ datamule/datamule/downloader.py,sha256=v0cG8eHZs9fttM55_ymHUWtPnCsK1aGiFTuM3jmLiCY,18650
16
16
  datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
17
+ datamule/datamule/tar_downloader.py,sha256=5lHbk96MxtNVeuY1_uSAWj3tt5RqgOgvAr_7qQqbJmc,29483
17
18
  datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
19
  datamule/document/document.py,sha256=Oj_7OMIldWB9HxlBca2gqr5E8ykDQZkPuUlcZjGuzqw,23016
19
20
  datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -21,8 +22,10 @@ datamule/mapping_dicts/html_mapping_dicts.py,sha256=pba3utMr2KldPeEGnMRkHyVw7D2W
21
22
  datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
22
23
  datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
23
24
  datamule/portfolio/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
- datamule/portfolio/portfolio.py,sha256=YPIvS4KKuEtm8A1XvNqDF39f4LJHhAFWmtpJzjbGDhY,11680
25
+ datamule/portfolio/portfolio.py,sha256=UK27CoKntclIGgRhyiQjARMl5NNPCqTmBu4FtdXr4S4,12349
25
26
  datamule/portfolio/portfolio_compression_utils_legacy.py,sha256=1nlbz7JfBDrI0pwTyFiBF856xqGXvQRYBulLUpk7G1A,12695
27
+ datamule/providers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
+ datamule/providers/providers.py,sha256=pfCjoWzDPRK46gh0RR5U0crBJGnSHJKIw6OVn9OpjXc,232
26
29
  datamule/sec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
30
  datamule/sec/utils.py,sha256=96bavyG2Kq1t8L1YA2vwYnAHKIKdRSoVXxBO5QH1HWo,2196
28
31
  datamule/sec/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -31,7 +34,7 @@ datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZ
31
34
  datamule/sec/submissions/downloader.py,sha256=9Po1eQ6YEj3Yo9Qw_M5PjQM-OR8iocTNjPIyO3O8GMs,1513
32
35
  datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
33
36
  datamule/sec/submissions/monitor.py,sha256=6mE0NZFdPId69t4V53GwBb9sqtRN7HE54sU3WpU0bnY,11900
34
- datamule/sec/submissions/streamer.py,sha256=A6hunG_mOuBVqA9bBCXhNMcsPaZlhslA3WhopyUwdS4,11611
37
+ datamule/sec/submissions/streamer.py,sha256=AVawZ9pzjuqS5dxmZTvGtpDtHDSKp3r6XjJaF1W19Rs,11612
35
38
  datamule/sec/submissions/textsearch.py,sha256=MKDXEz_VI_0ljl73_aw2lx4MVzJW5uDt8KxjvJBwPwM,5794
36
39
  datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
40
  datamule/sec/xbrl/downloadcompanyfacts.py,sha256=rMWRiCF9ci_gNZMJ9MC2c_PGEd-yEthawQ0CtVwWTjM,3323
@@ -43,8 +46,8 @@ datamule/seclibrary/bq.py,sha256=TOP0WA6agDKu4vE1eHd62NDpAc02LDDrOP-g1bJpxbw,180
43
46
  datamule/sheet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
47
  datamule/sheet/sheet.py,sha256=Dw979JGygS566N0Iwsvqk0h1s26GfbrIHDWiBaS2oH8,10711
45
48
  datamule/submission/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
46
- datamule/submission/submission.py,sha256=I_7F658vTW1C_dsogIXdxXnV3W1Gbfj_6uzse1YHgY0,17343
47
- datamule/submission/tar_submission.py,sha256=lkm1neVLW2_-G26VylL6Rzx98Cavvml0Qd2wlJHD0bw,3075
49
+ datamule/submission/submission.py,sha256=JCGyfEVqaf8ct6h9h8WjK2zBnhg0lx9kKLud3nvJ2Eg,17516
50
+ datamule/submission/tar_submission.py,sha256=uJHyTY5G8OVqmXzb0zaBEsLNthppGqYXbW-xFM4XMok,2901
48
51
  datamule/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
49
52
  datamule/tables/tables.py,sha256=Z3Eu6bdjiaNx4pgXlTMwk2Q-DhpMpEAygF2kJdp-Pu8,5722
50
53
  datamule/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
@@ -65,7 +68,7 @@ datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
65
68
  datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
66
69
  datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
67
70
  datamule/utils/pdf.py,sha256=Z9xrdVhKex2YdvjYsaPaygRE_J6P_JNiUGkwflz2Hw0,735
68
- datamule-2.3.8.dist-info/METADATA,sha256=wJ1iQL5mMQ6hyK9Wqh27ohWjZmCoZmi3XfXC5PwCwL8,609
69
- datamule-2.3.8.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
70
- datamule-2.3.8.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
71
- datamule-2.3.8.dist-info/RECORD,,
71
+ datamule-2.4.0.dist-info/METADATA,sha256=RSPqBwCagQnA41rQezMptrqFwnD0o65Fs74uGu12OlA,609
72
+ datamule-2.4.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
73
+ datamule-2.4.0.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
74
+ datamule-2.4.0.dist-info/RECORD,,