datamule 1.6.1__py3-none-any.whl → 1.6.3__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -8,13 +8,15 @@ import ssl
  import zstandard as zstd
  import io
  import json
+ import tarfile
  from concurrent.futures import ThreadPoolExecutor
  from functools import partial
  from queue import Queue, Empty
- from threading import Thread
+ from threading import Thread, Lock
  from .query import query
  from os import cpu_count
- from secsgml import write_sgml_file_to_tar
+ from secsgml import parse_sgml_content_into_memory
+ from secsgml.utils import bytes_to_str



@@ -24,25 +26,19 @@ class Downloader:
  self.CHUNK_SIZE = 2 * 1024 * 1024
  self.MAX_CONCURRENT_DOWNLOADS = 100
  self.MAX_DECOMPRESSION_WORKERS = cpu_count()
- self.MAX_PROCESSING_WORKERS = cpu_count()
- self.QUEUE_SIZE = 10
+ self.MAX_TAR_WORKERS = cpu_count()
  if api_key is not None:
  self._api_key = api_key
- # Create a shared event loop for async operations
  self.loop = asyncio.new_event_loop()
- # Create a thread to run the event loop
  self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
  self.loop_thread.start()
- # Create a queue for async tasks
  self.async_queue = Queue()

  def _run_event_loop(self):
- """Run the event loop in a separate thread"""
  asyncio.set_event_loop(self.loop)
  self.loop.run_forever()

  def _run_coroutine(self, coro):
- """Run a coroutine in the event loop and return its result"""
  future = asyncio.run_coroutine_threadsafe(coro, self.loop)
  return future.result()

@@ -72,65 +68,94 @@ class Downloader:
  except Exception as e:
  print(f"Failed to log error to {error_file}: {str(e)}")

- class FileProcessor:
- def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=[], keep_filtered_metadata=False,standardize_metadata=True):
- self.processing_queue = Queue(maxsize=queue_size)
- self.should_stop = False
- self.processing_workers = []
+ class TarManager:
+ def __init__(self, output_dir, num_tar_files, max_batch_size=1024*1024*1024):
  self.output_dir = output_dir
- self.max_workers = max_workers
- self.batch_size = 50
- self.pbar = pbar
- self.downloader = downloader
- self.keep_document_types = keep_document_types
- self.keep_filtered_metadata = keep_filtered_metadata
- self.standardize_metadata = standardize_metadata
-
- def start_processing_workers(self):
- for _ in range(self.max_workers):
- worker = Thread(target=self._processing_worker)
- worker.daemon = True
- worker.start()
- self.processing_workers.append(worker)
-
- def _process_file(self, item):
- filename, content = item
- output_path = os.path.join(self.output_dir, filename.split('.')[0] + '.tar')
- write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types, keep_filtered_metadata=self.keep_filtered_metadata,standardize_metadata=self.standardize_metadata)
-
- self.pbar.update(1)
-
- def _processing_worker(self):
- batch = []
- while not self.should_stop:
+ self.num_tar_files = num_tar_files
+ self.max_batch_size = max_batch_size
+ self.tar_files = {}
+ self.tar_locks = {}
+ self.file_counters = {}
+ self.tar_sizes = {}
+ self.tar_sequences = {}
+
+ for i in range(num_tar_files):
+ tar_path = os.path.join(output_dir, f'batch_{i:03d}_001.tar')
+ self.tar_files[i] = tarfile.open(tar_path, 'w')
+ self.tar_locks[i] = Lock()
+ self.file_counters[i] = 0
+ self.tar_sizes[i] = 0
+ self.tar_sequences[i] = 1
+
+ def get_tar_index(self, filename):
+ return hash(filename) % self.num_tar_files
+
+ def write_submission(self, filename, metadata, documents, standardize_metadata):
+ tar_index = self.get_tar_index(filename)
+ accession_num = filename.split('.')[0]
+
+ metadata_str = bytes_to_str(metadata, lower=False)
+ metadata_json = json.dumps(metadata_str).encode('utf-8')
+ submission_size = len(metadata_json) + sum(len(doc) for doc in documents)
+
+ with self.tar_locks[tar_index]:
+ if self.tar_sizes[tar_index] > 0 and self.tar_sizes[tar_index] + submission_size > self.max_batch_size:
+ tar = self.tar_files[tar_index]
+ tar.close()
+
+ self.tar_sequences[tar_index] += 1
+ new_tar_path = os.path.join(self.output_dir, f'batch_{tar_index:03d}_{self.tar_sequences[tar_index]:03d}.tar')
+ self.tar_files[tar_index] = tarfile.open(new_tar_path, 'w')
+ self.file_counters[tar_index] = 0
+ self.tar_sizes[tar_index] = 0
+
+ tar = self.tar_files[tar_index]
+
  try:
- item = self.processing_queue.get(timeout=1)
- if item is None:
- break
-
- batch.append(item)
-
- if len(batch) >= self.batch_size or self.processing_queue.empty():
- for item in batch:
- self._process_file(item)
- self.processing_queue.task_done()
- batch = []
-
- except Empty:
- if batch:
- for item in batch:
- self._process_file(item)
- self.processing_queue.task_done()
- batch = []
-
- def stop_workers(self):
- self.should_stop = True
- for _ in self.processing_workers:
- self.processing_queue.put(None)
- for worker in self.processing_workers:
- worker.join()
+ tarinfo = tarfile.TarInfo(name=f'{accession_num}/metadata.json')
+ tarinfo.size = len(metadata_json)
+ tar.addfile(tarinfo, io.BytesIO(metadata_json))
+
+ for file_num, content in enumerate(documents):
+ doc_name = self._get_document_name(metadata, file_num, standardize_metadata)
+ tarinfo = tarfile.TarInfo(name=f'{accession_num}/{doc_name}')
+ tarinfo.size = len(content)
+ tar.addfile(tarinfo, io.BytesIO(content))
+
+ self.file_counters[tar_index] += 1
+ self.tar_sizes[tar_index] += submission_size
+ return True
+
+ except Exception as e:
+ print(f"Error writing {filename} to tar {tar_index}: {str(e)}")
+ return False
+
+ def _get_document_name(self, metadata, file_num, standardize_metadata):
+ if standardize_metadata:
+ documents_key = b'documents'
+ filename_key = b'filename'
+ sequence_key = b'sequence'
+ else:
+ documents_key = b'DOCUMENTS'
+ filename_key = b'FILENAME'
+ sequence_key = b'SEQUENCE'
+
+ doc_metadata = metadata[documents_key][file_num]
+ filename = doc_metadata.get(filename_key)
+ if filename:
+ return filename.decode('utf-8')
+ else:
+ sequence = doc_metadata.get(sequence_key, b'document')
+ return sequence.decode('utf-8') + '.txt'
+
+ def close_all(self):
+ for i, tar in self.tar_files.items():
+ try:
+ tar.close()
+ except Exception as e:
+ print(f"Error closing tar {i}: {str(e)}")

- def decompress_stream(self, compressed_chunks, filename, output_dir, processor):
+ def decompress_and_parse_and_write(self, compressed_chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir):
  dctx = zstd.ZstdDecompressor()
  try:
  input_buffer = io.BytesIO(b''.join(compressed_chunks))
@@ -140,11 +165,19 @@ class Downloader:
  shutil.copyfileobj(reader, decompressed_content)

  content = decompressed_content.getvalue()
- processor.processing_queue.put((filename, content))
- return True
+
+ metadata, documents = parse_sgml_content_into_memory(
+ bytes_content=content,
+ filter_document_types=keep_document_types,
+ keep_filtered_metadata=keep_filtered_metadata,
+ standardize_metadata=standardize_metadata
+ )
+
+ success = tar_manager.write_submission(filename, metadata, documents, standardize_metadata)
+ return success

  except Exception as e:
- self._log_error(output_dir, filename, f"Decompression error: {str(e)}")
+ self._log_error(output_dir, filename, f"Decompression/parsing error: {str(e)}")
  return False
  finally:
  try:
@@ -153,17 +186,25 @@ class Downloader:
  except:
  pass

- def save_regular_file(self, chunks, filename, output_dir, processor):
+ def parse_and_write_regular_file(self, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir):
  try:
  content = b''.join(chunks)
- processor.processing_queue.put((filename, content))
- return True
+
+ metadata, documents = parse_sgml_content_into_memory(
+ bytes_content=content,
+ filter_document_types=keep_document_types,
+ keep_filtered_metadata=keep_filtered_metadata,
+ standardize_metadata=standardize_metadata
+ )
+
+ success = tar_manager.write_submission(filename, metadata, documents, standardize_metadata)
+ return success

  except Exception as e:
- self._log_error(output_dir, filename, f"Error saving file: {str(e)}")
+ self._log_error(output_dir, filename, f"Parsing error: {str(e)}")
  return False

- async def download_and_process(self, session, url, semaphore, decompression_pool, output_dir, processor):
+ async def download_and_process(self, session, url, semaphore, decompression_pool, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir, pbar):
  async with semaphore:
  chunks = []
  filename = url.split('/')[-1]
@@ -188,70 +229,70 @@ class Downloader:
  if filename.endswith('.zst'):
  success = await loop.run_in_executor(
  decompression_pool,
- partial(self.decompress_stream, chunks, filename, output_dir, processor)
+ partial(self.decompress_and_parse_and_write, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
  )
  else:
  success = await loop.run_in_executor(
  decompression_pool,
- partial(self.save_regular_file, chunks, filename, output_dir, processor)
+ partial(self.parse_and_write_regular_file, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
  )

  if not success:
  self._log_error(output_dir, filename, "Failed to process file")
+
  elif response.status == 401:
  self._log_error(output_dir, filename, "Authentication failed: Invalid API key")
  raise ValueError("Invalid API key")
  else:
  self._log_error(output_dir, filename, f"Download failed: Status {response.status}")
+
+ pbar.update(1)
+
  except Exception as e:
  self._log_error(output_dir, filename, str(e))
+ pbar.update(1)

- async def process_batch(self, urls, output_dir, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
+ async def process_batch(self, urls, output_dir, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True, max_batch_size=1024*1024*1024):
  os.makedirs(output_dir, exist_ok=True)

- with tqdm(total=len(urls), desc="Processing files") as pbar:
- processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self, keep_document_types=keep_document_types,
- keep_filtered_metadata=keep_filtered_metadata,standardize_metadata=standardize_metadata)
- processor.start_processing_workers()
-
- semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
- decompression_pool = ThreadPoolExecutor(max_workers=self.MAX_DECOMPRESSION_WORKERS)
-
- connector = aiohttp.TCPConnector(
- limit=self.MAX_CONCURRENT_DOWNLOADS,
- force_close=False,
- ssl=ssl.create_default_context(),
- ttl_dns_cache=300,
- keepalive_timeout=60
- )
-
- # timeout should be max 30s.
- async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=30)) as session:
- tasks = [self.download_and_process(session, url, semaphore, decompression_pool, output_dir, processor) for url in urls]
- await asyncio.gather(*tasks, return_exceptions=True)
-
- processor.processing_queue.join()
- processor.stop_workers()
- decompression_pool.shutdown()
+ num_tar_files = min(self.MAX_TAR_WORKERS, len(urls))
+
+ tar_manager = self.TarManager(output_dir, num_tar_files, max_batch_size)
+
+ try:
+ with tqdm(total=len(urls), desc="Processing files") as pbar:
+ semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
+ decompression_pool = ThreadPoolExecutor(max_workers=self.MAX_DECOMPRESSION_WORKERS)
+
+ connector = aiohttp.TCPConnector(
+ limit=self.MAX_CONCURRENT_DOWNLOADS,
+ force_close=False,
+ ssl=ssl.create_default_context(),
+ ttl_dns_cache=300,
+ keepalive_timeout=60
+ )
+
+ async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=30)) as session:
+ tasks = [
+ self.download_and_process(
+ session, url, semaphore, decompression_pool,
+ keep_document_types, keep_filtered_metadata, standardize_metadata,
+ tar_manager, output_dir, pbar
+ )
+ for url in urls
+ ]
+ await asyncio.gather(*tasks, return_exceptions=True)
+
+ decompression_pool.shutdown()
+
+ finally:
+ tar_manager.close_all()

  def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True,
- skip_accession_numbers=[]):
- """
- Query SEC filings and download/process them.
-
- Parameters:
- - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
- - cik: Company CIK number(s), string, int, or list
- - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
- - output_dir: Directory to save downloaded files
- - accession_numbers: List of specific accession numbers to download
- - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
- - keep_filtered_metadata: Whether to keep metadata for filtered documents
- """
+ skip_accession_numbers=[], max_batch_size=1024*1024*1024):
  if self.api_key is None:
  raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")

- # Query the SEC filings first - before starting any async operations
  print("Querying SEC filings...")
  filings = query(
  submission_type=submission_type,
@@ -260,19 +301,14 @@ class Downloader:
  api_key=self.api_key
  )

-
- # After querying but before generating URLs
  if accession_numbers:
  accession_numbers = [str(int(item.replace('-',''))) for item in accession_numbers]
  filings = [filing for filing in filings if filing['accession_number'] in accession_numbers]

-
  if skip_accession_numbers:
  skip_accession_numbers = [int(item.replace('-','')) for item in skip_accession_numbers]
  filings = [filing for filing in filings if filing['accession_number'] not in skip_accession_numbers]

- # Generate URLs from the query results
-
  print(f"Generating URLs for {len(filings)} filings...")
  urls = []
  for item in filings:
@@ -285,38 +321,21 @@ class Downloader:
  print("No submissions found matching the criteria")
  return

- # Remove duplicates
  urls = list(set(urls))

- # Now start the async processing
  start_time = time.time()

- # Process the batch asynchronously
- asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata))
+ asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata, max_batch_size=max_batch_size))

- # Calculate and display performance metrics
  elapsed_time = time.time() - start_time
  print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
  print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")

  def __del__(self):
- """Cleanup when the downloader is garbage collected"""
  if hasattr(self, 'loop') and self.loop.is_running():
  self.loop.call_soon_threadsafe(self.loop.stop)

-
-
- def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
- """
- Download and process SEC filings using specific filenames.
-
- Parameters:
- - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
- - output_dir: Directory to save downloaded files
- - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
- - keep_filtered_metadata: Whether to keep metadata for filtered documents
- - standardize_metadata: Whether to standardize metadata format
- """
+ def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True, max_batch_size=1024*1024*1024):
  if self.api_key is None:
  raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")

@@ -326,27 +345,23 @@ class Downloader:
  if not isinstance(filenames, (list, tuple)):
  filenames = [filenames]

- # Validate filenames format
  for filename in filenames:
  if not isinstance(filename, str):
  raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
  if not (filename.endswith('.sgml') or filename.endswith('.sgml.zst')):
  raise ValueError(f"Invalid filename format: {filename}. Expected .sgml or .sgml.zst extension.")

- # Generate URLs directly from filenames
  print(f"Generating URLs for {len(filenames)} files...")
  urls = []
  for filename in filenames:
  url = f"{self.BASE_URL}{filename}"
  urls.append(url)

- # Remove duplicates while preserving order
  seen = set()
  urls = [url for url in urls if not (url in seen or seen.add(url))]

  print(f"Downloading {len(urls)} files...")

- # Process the batch asynchronously using existing infrastructure
  start_time = time.time()

  asyncio.run(self.process_batch(
@@ -354,33 +369,19 @@ class Downloader:
  output_dir,
  keep_document_types=keep_document_types,
  keep_filtered_metadata=keep_filtered_metadata,
- standardize_metadata=standardize_metadata
+ standardize_metadata=standardize_metadata,
+ max_batch_size=max_batch_size
  ))

- # Calculate and display performance metrics
  elapsed_time = time.time() - start_time
  print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
  print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")


  def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
- skip_accession_numbers=[]):
- """
- Query SEC filings and download/process them.
-
- Parameters:
- - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
- - cik: Company CIK number(s), string, int, or list
- - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
- - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
- - output_dir: Directory to save downloaded files
- - accession_numbers: List of specific accession numbers to download
- - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
- - keep_filtered_metadata: Whether to keep metadata for filtered documents
- """
+ skip_accession_numbers=[], max_batch_size=1024*1024*1024):
  if accession_numbers:
  accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
- # check if acc no is empty list
  elif accession_numbers == []:
  raise ValueError("Applied filter resulted in empty accession numbers list")
  downloader = Downloader(api_key=api_key)
@@ -393,5 +394,6 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
  keep_document_types=keep_document_types,
  keep_filtered_metadata=keep_filtered_metadata,
  standardize_metadata=standardize_metadata,
- skip_accession_numbers=skip_accession_numbers
- )
+ skip_accession_numbers=skip_accession_numbers,
+ max_batch_size=max_batch_size
+ )
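
From the caller's side, the visible change is a new max_batch_size parameter (default 1024*1024*1024 bytes) on download(), download_files_using_filename(), and process_batch(), with output now written as batch_{shard:03d}_{sequence:03d}.tar archives whose members are named {accession}/metadata.json plus one entry per filed document. The sketch below is illustrative only: the import path, the example filters, and the shard filename chosen for reading are assumptions; only the download() signature and the tar member layout are taken from the diff above.

import json
import tarfile

from datamule import download  # assumed import path; the diff does not show where download() is exported

# Roll each shard's archive after ~256 MB instead of the 1 GiB default.
# api_key is omitted here, so the DATAMULE_API_KEY environment variable must be set.
download(
    submission_type='10-K',
    filing_date=('2024-01-01', '2024-03-31'),
    output_dir='downloads',
    max_batch_size=256 * 1024 * 1024,
)

# Read back one shard: each submission is stored under its accession number,
# with a metadata.json entry followed by the filed documents.
with tarfile.open('downloads/batch_000_001.tar') as tar:
    for member in tar.getmembers():
        if member.name.endswith('/metadata.json'):
            metadata = json.load(tar.extractfile(member))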