datamule 1.6.0__py3-none-any.whl → 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/portfolio.py +92 -12
- datamule/sec/submissions/monitor.py +115 -75
- datamule/seclibrary/downloader.py +163 -161
- datamule/submission.py +102 -66
- datamule/utils/__init__.py +0 -0
- datamule/utils/construct_submissions_data.py +150 -0
- {datamule-1.6.0.dist-info → datamule-1.6.2.dist-info}/METADATA +1 -1
- {datamule-1.6.0.dist-info → datamule-1.6.2.dist-info}/RECORD +10 -8
- {datamule-1.6.0.dist-info → datamule-1.6.2.dist-info}/WHEEL +0 -0
- {datamule-1.6.0.dist-info → datamule-1.6.2.dist-info}/top_level.txt +0 -0
datamule/portfolio.py
CHANGED
@@ -1,11 +1,13 @@
 from pathlib import Path
 from tqdm import tqdm
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from .submission import Submission
 from .sec.submissions.downloader import download as sec_download
 from .sec.submissions.textsearch import filter_text
 from .config import Config
 import os
+import tarfile
+from threading import Lock
 from .helper import _process_cik_and_metadata_filters
 from .seclibrary.downloader import download as seclibrary_download
 from .sec.xbrl.filter_xbrl import filter_xbrl
@@ -21,6 +23,10 @@ class Portfolio:
         self.submissions = []
         self.submissions_loaded = False
         self.MAX_WORKERS = os.cpu_count() - 1
+
+        # Batch tar support
+        self.batch_tar_handles = {}  # {batch_tar_path: tarfile_handle}
+        self.batch_tar_locks = {}  # {batch_tar_path: threading.Lock}
 
         self.monitor = Monitor()
 
@@ -34,9 +40,13 @@ class Portfolio:
         self.api_key = api_key
 
     def _load_submissions(self):
-
-
+        print(f"Loading submissions")
+
+        # Separate regular and batch items
+        regular_items = [f for f in self.path.iterdir() if (f.is_dir() or f.suffix=='.tar') and 'batch' not in f.name]
+        batch_tars = [f for f in self.path.iterdir() if f.is_file() and 'batch' in f.name and f.suffix == '.tar']
 
+        # Load regular submissions (existing logic)
         def load_submission(folder):
             try:
                 return Submission(folder)
@@ -44,17 +54,86 @@ class Portfolio:
                 print(f"Error loading submission from {folder}: {str(e)}")
                 return None
 
-
-
-
-
-
-
-
-
-
+        regular_submissions = []
+        if regular_items:
+            with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+                regular_submissions = list(tqdm(
+                    executor.map(load_submission, regular_items),
+                    total=len(regular_items),
+                    desc="Loading regular submissions"
+                ))
+
+        # Load batch submissions with parallel processing + progress
+        batch_submissions = []
+        if batch_tars:
+            with tqdm(desc="Loading batch submissions", unit="submissions") as pbar:
+                with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+                    # Submit all batch tar jobs
+                    futures = [
+                        executor.submit(self._load_batch_submissions_worker, batch_tar, pbar)
+                        for batch_tar in batch_tars
+                    ]
+
+                    # Collect results as they complete
+                    for future in as_completed(futures):
+                        try:
+                            batch_submissions.extend(future.result())
+                        except Exception as e:
+                            print(f"Error in batch processing: {str(e)}")
+
+        # Combine and filter None values
+        self.submissions = [s for s in (regular_submissions + batch_submissions) if s is not None]
         print(f"Successfully loaded {len(self.submissions)} submissions")
 
+    def _load_batch_submissions_worker(self, batch_tar_path, pbar):
+        """Worker function to load submissions from one batch tar with progress updates"""
+        try:
+            # Open tar handle and store it
+            tar_handle = tarfile.open(batch_tar_path, 'r')
+            self.batch_tar_handles[batch_tar_path] = tar_handle
+            self.batch_tar_locks[batch_tar_path] = Lock()
+
+            # Find all accession directories
+            accession_prefixes = set()
+            for member in tar_handle.getmembers():
+                if '/' in member.name and member.name.endswith('metadata.json'):
+                    accession_prefix = member.name.split('/')[0]
+                    accession_prefixes.add(accession_prefix)
+
+            # Create submissions for each accession
+            submissions = []
+            for accession_prefix in accession_prefixes:
+                try:
+                    submission = Submission(
+                        batch_tar_path=batch_tar_path,
+                        accession_prefix=accession_prefix,
+                        portfolio_ref=self
+                    )
+                    submissions.append(submission)
+                    pbar.update(1)  # Update progress for each successful submission
+                except Exception as e:
+                    print(f"Error loading batch submission {accession_prefix} from {batch_tar_path.name}: {str(e)}")
+
+            return submissions
+
+        except Exception as e:
+            print(f"Error loading batch tar {batch_tar_path}: {str(e)}")
+            return []
+
+    def _close_batch_handles(self):
+        """Close all open batch tar handles to free resources"""
+        for handle in self.batch_tar_handles.values():
+            try:
+                handle.close()
+            except Exception as e:
+                print(f"Error closing batch tar handle: {str(e)}")
+        self.batch_tar_handles.clear()
+        self.batch_tar_locks.clear()
+
+    def __del__(self):
+        """Cleanup batch tar handles on destruction"""
+        self._close_batch_handles()
+
     def process_submissions(self, callback):
         """Process all submissions using a thread pool."""
         if not self.submissions_loaded:
@@ -169,6 +248,7 @@ class Portfolio:
         )
 
         self.submissions_loaded = False
+
     def monitor_submissions(self, data_callback=None, interval_callback=None,
                             polling_interval=1000, quiet=True, start_date=None,
                             validation_interval=600000):
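The portfolio changes above add batch-tar support: `_load_submissions` now treats `batch_*.tar` archives in the portfolio directory as containers holding many submissions, keeping one open `tarfile` handle and one `threading.Lock` per archive so documents can be read lazily and thread-safely. A minimal usage sketch follows; the directory name is hypothetical and it assumes the usual top-level `Portfolio` export rather than anything specific to this diff.

```python
# Illustrative sketch, not from the package docs: point a Portfolio at a
# directory that contains batch_*.tar archives written by datamule 1.6.2.
from datamule import Portfolio

portfolio = Portfolio("downloads")  # hypothetical local path

def count_documents(submission):
    # Each Submission exposes its parsed metadata.json via submission.metadata.content
    return len(submission.metadata.content['documents'])

# Per the diff above, process_submissions loads submissions on first use and
# runs the callback across them in a thread pool.
portfolio.process_submissions(count_documents)
```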
datamule/sec/submissions/monitor.py
CHANGED
@@ -9,16 +9,14 @@ from .eftsquery import EFTSQuery
 import aiohttp
 from zoneinfo import ZoneInfo
 
-async def poll_rss(limiter):
+async def poll_rss(limiter, session):
     base_url = 'https://www.sec.gov/cgi-bin/browse-edgar?count=100&action=getcurrent&output=rss'
 
-    #
-    async with
-    # Use the
-    async with
-
-            async with session.get(base_url) as response:
-                content = await response.read()
+    # Use the rate limiter before making the request
+    async with limiter:
+        # Use the provided session instead of creating a new one
+        async with session.get(base_url) as response:
+            content = await response.read()
 
     # Process the content
     content_str = content.decode('utf-8')
@@ -70,12 +68,31 @@ class Monitor():
         self.ratelimiters = {'sec.gov': PreciseRateLimiter(rate=5)}
         self.efts_query = EFTSQuery(quiet=True)
         self.efts_query.limiter = self.ratelimiters['sec.gov']
+        self.session = None
+        self.session_created_at = 0
+        self.session_lifetime = 300  # 5 minutes in seconds
 
     def set_domain_rate_limit(self, domain, rate):
         self.ratelimiters[domain] = PreciseRateLimiter(rate=rate)
         if domain == 'sec.gov':
             self.efts_query.limiter = self.ratelimiters[domain]
 
+    async def _ensure_fresh_session(self):
+        """Ensure we have a fresh session, recreating if expired or missing"""
+        current_time = time.time()
+
+        # Check if we need a new session
+        if (self.session is None or
+            current_time - self.session_created_at > self.session_lifetime):
+
+            # Close old session if it exists
+            if self.session:
+                await self.session.close()
+
+            # Create new session
+            self.session = aiohttp.ClientSession(headers=headers)
+            self.session_created_at = current_time
+
     async def _async_run_efts_query(self, **kwargs):
         """Async helper method to run EFTS query without creating a new event loop"""
         # Make sure to set quiet parameter if provided in kwargs
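The `_ensure_fresh_session` helper above gives the monitor one shared `aiohttp.ClientSession` that is closed and recreated once it is older than `session_lifetime` seconds. The same expiring-session pattern in isolation, with illustrative names that are not part of the package:

```python
import time
import aiohttp

class ExpiringSession:
    """Recreate an aiohttp ClientSession after a fixed lifetime (illustrative sketch)."""

    def __init__(self, lifetime=300, headers=None):
        self.lifetime = lifetime
        self.headers = headers or {}
        self._session = None
        self._created_at = 0.0

    async def get(self):
        # Recreate the session if it is missing or has expired.
        if self._session is None or time.time() - self._created_at > self.lifetime:
            if self._session is not None:
                await self._session.close()
            self._session = aiohttp.ClientSession(headers=self.headers)
            self._created_at = time.time()
        return self._session

    async def close(self):
        if self._session is not None:
            await self._session.close()
            self._session = None
```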
@@ -103,83 +120,106 @@ class Monitor():
         if polling_interval is None and validation_interval is None:
             raise ValueError("At least one of polling_interval or validation_interval must be specified")
 
-        #
-
-            today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
-            if not quiet:
-                print(f"Backfilling from {start_date} to {today_date}")
-
-            hits = clean_efts_hits(await self._async_run_efts_query(
-                filing_date=(start_date, today_date),
-                quiet=quiet
-            ))
-
-            new_hits = self._filter_new_accessions(hits)
-            if not quiet:
-                print(f"New submissions found: {len(new_hits)}")
-            if new_hits and data_callback:
-                data_callback(new_hits)
-
-        # Initialize timing variables
-        current_time = time.time()
-        last_polling_time = current_time
-        last_validation_time = current_time
-
-        # Determine which operations to perform
-        do_polling = polling_interval is not None
-        do_validation = validation_interval is not None
+        # Ensure we have a fresh session
+        await self._ensure_fresh_session()
 
-
-
-
-            # RSS polling (if enabled)
-            if do_polling and (current_time - last_polling_time) >= polling_interval/1000:
-                if not quiet:
-                    print(f"Polling RSS feed")
-                results = await poll_rss(self.ratelimiters['sec.gov'])
-                new_results = self._filter_new_accessions(results)
-                if new_results:
-                    if not quiet:
-                        print(f"Found {len(new_results)} new submissions via RSS")
-                    if data_callback:
-                        data_callback(new_results)
-                last_polling_time = current_time
-
-            # EFTS validation (if enabled)
-            if do_validation and (current_time - last_validation_time) >= validation_interval/1000:
-                # Get submissions from the last 24 hours for validation
+        try:
+            # Backfill if start_date is provided
+            if start_date is not None:
                 today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
                 if not quiet:
-                    print(f"
+                    print(f"Backfilling from {start_date} to {today_date}")
 
                 hits = clean_efts_hits(await self._async_run_efts_query(
-                    filing_date=(
+                    filing_date=(start_date, today_date),
                     quiet=quiet
                 ))
-
+
                 new_hits = self._filter_new_accessions(hits)
-                if
-
-
-
-
-
+                if not quiet:
+                    print(f"New submissions found: {len(new_hits)}")
+                if new_hits and data_callback:
+                    data_callback(new_hits)
+
+            # Initialize timing variables
+            current_time = time.time()
+            last_polling_time = current_time
+            last_validation_time = current_time
 
-            #
-
-
-
-            # Calculate next wake-up time
-            next_times = []
-            if do_polling:
-                next_times.append(last_polling_time + (polling_interval / 1000))
-            if do_validation:
-                next_times.append(last_validation_time + (validation_interval / 1000))
+            # Determine which operations to perform
+            do_polling = polling_interval is not None
+            do_validation = validation_interval is not None
 
-
-
-
-
+            while True:
+                current_time = time.time()
+
+                # RSS polling (if enabled)
+                if do_polling and (current_time - last_polling_time) >= polling_interval/1000:
+                    if not quiet:
+                        print(f"Polling RSS feed")
+
+                    # Ensure session is fresh before polling
+                    await self._ensure_fresh_session()
+
+                    try:
+                        results = await poll_rss(self.ratelimiters['sec.gov'], self.session)
+                        new_results = self._filter_new_accessions(results)
+                        if new_results:
+                            if not quiet:
+                                print(f"Found {len(new_results)} new submissions via RSS")
+                            if data_callback:
+                                data_callback(new_results)
+                    except Exception as e:
+                        if not quiet:
+                            print(f"RSS polling error: {e}, will recreate session on next poll")
+                        # Force session recreation on next poll
+                        if self.session:
+                            await self.session.close()
+                        self.session = None
+
+                    last_polling_time = current_time
+
+                # EFTS validation (if enabled)
+                if do_validation and (current_time - last_validation_time) >= validation_interval/1000:
+                    # Get submissions from the last 24 hours for validation
+                    today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
+                    if not quiet:
+                        print(f"Validating submissions from {today_date}")
+
+                    hits = clean_efts_hits(await self._async_run_efts_query(
+                        filing_date=(today_date, today_date),
+                        quiet=quiet
+                    ))
+
+                    new_hits = self._filter_new_accessions(hits)
+                    if new_hits:
+                        if not quiet:
+                            print(f"Found {len(new_hits)} new submissions via EFTS validation")
+                        if data_callback:
+                            data_callback(new_hits)
+                    last_validation_time = current_time
+
+                # Interval callback
+                if interval_callback:
+                    interval_callback()
+
+                # Calculate next wake-up time
+                next_times = []
+                if do_polling:
+                    next_times.append(last_polling_time + (polling_interval / 1000))
+                if do_validation:
+                    next_times.append(last_validation_time + (validation_interval / 1000))
+
+                next_wake_time = min(next_times)
+                current_time = time.time()
+                time_to_sleep = max(0, next_wake_time - current_time)
+                await asyncio.sleep(time_to_sleep)
+
+        finally:
+            # Clean up the session when done
+            if self.session:
+                await self.session.close()
+                self.session = None
 
     def monitor_submissions(self, data_callback=None, interval_callback=None,
                             polling_interval=1000, quiet=True, start_date=None,
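The rewritten monitoring loop above drives both RSS polling and EFTS validation from one `while True` loop: after each pass it computes the next due time for whichever checks are enabled and sleeps only until the earliest one. A worked example of that timing arithmetic, with illustrative numbers:

```python
# Intervals are given in milliseconds, timestamps in seconds (time.time()).
polling_interval = 1000        # poll RSS every second
validation_interval = 600000   # validate via EFTS every 10 minutes

last_polling_time = 1_000.0    # hypothetical timestamps
last_validation_time = 1_000.0
current_time = 1_000.4

next_times = [
    last_polling_time + polling_interval / 1000,         # 1001.0
    last_validation_time + validation_interval / 1000,   # 1600.0
]
time_to_sleep = max(0, min(next_times) - current_time)   # 0.6 s until the next poll
print(time_to_sleep)
```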
datamule/seclibrary/downloader.py
CHANGED
@@ -8,13 +8,15 @@ import ssl
 import zstandard as zstd
 import io
 import json
+import tarfile
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from queue import Queue, Empty
-from threading import Thread
+from threading import Thread, Lock
 from .query import query
 from os import cpu_count
-from secsgml import
+from secsgml import parse_sgml_content_into_memory
+from secsgml.utils import bytes_to_str
 
 
 
@@ -24,25 +26,19 @@ class Downloader:
         self.CHUNK_SIZE = 2 * 1024 * 1024
         self.MAX_CONCURRENT_DOWNLOADS = 100
         self.MAX_DECOMPRESSION_WORKERS = cpu_count()
-        self.
-        self.QUEUE_SIZE = 10
+        self.MAX_TAR_WORKERS = cpu_count()
         if api_key is not None:
            self._api_key = api_key
-        # Create a shared event loop for async operations
         self.loop = asyncio.new_event_loop()
-        # Create a thread to run the event loop
         self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
         self.loop_thread.start()
-        # Create a queue for async tasks
         self.async_queue = Queue()
 
     def _run_event_loop(self):
-        """Run the event loop in a separate thread"""
         asyncio.set_event_loop(self.loop)
         self.loop.run_forever()
 
     def _run_coroutine(self, coro):
-        """Run a coroutine in the event loop and return its result"""
         future = asyncio.run_coroutine_threadsafe(coro, self.loop)
         return future.result()
 
@@ -72,65 +68,94 @@ class Downloader:
         except Exception as e:
             print(f"Failed to log error to {error_file}: {str(e)}")
 
-    class
-        def __init__(self, output_dir,
-            self.processing_queue = Queue(maxsize=queue_size)
-            self.should_stop = False
-            self.processing_workers = []
+    class TarManager:
+        def __init__(self, output_dir, num_tar_files, max_batch_size=1024*1024*1024):
             self.output_dir = output_dir
-            self.
-            self.
-            self.
-            self.
-            self.
-            self.
-            self.
-
-
-
-
-
-
-            self.
-
-
-
-
-
-
-            self.
-
-
-
-
+            self.num_tar_files = num_tar_files
+            self.max_batch_size = max_batch_size
+            self.tar_files = {}
+            self.tar_locks = {}
+            self.file_counters = {}
+            self.tar_sizes = {}
+            self.tar_sequences = {}
+
+            for i in range(num_tar_files):
+                tar_path = os.path.join(output_dir, f'batch_{i:03d}_001.tar')
+                self.tar_files[i] = tarfile.open(tar_path, 'w')
+                self.tar_locks[i] = Lock()
+                self.file_counters[i] = 0
+                self.tar_sizes[i] = 0
+                self.tar_sequences[i] = 1
+
+        def get_tar_index(self, filename):
+            return hash(filename) % self.num_tar_files
+
+        def write_submission(self, filename, metadata, documents, standardize_metadata):
+            tar_index = self.get_tar_index(filename)
+            accession_num = filename.split('.')[0]
+
+            metadata_str = bytes_to_str(metadata, lower=False)
+            metadata_json = json.dumps(metadata_str).encode('utf-8')
+            submission_size = len(metadata_json) + sum(len(doc) for doc in documents)
+
+            with self.tar_locks[tar_index]:
+                if self.tar_sizes[tar_index] > 0 and self.tar_sizes[tar_index] + submission_size > self.max_batch_size:
+                    tar = self.tar_files[tar_index]
+                    tar.close()
+
+                    self.tar_sequences[tar_index] += 1
+                    new_tar_path = os.path.join(self.output_dir, f'batch_{tar_index:03d}_{self.tar_sequences[tar_index]:03d}.tar')
+                    self.tar_files[tar_index] = tarfile.open(new_tar_path, 'w')
+                    self.file_counters[tar_index] = 0
+                    self.tar_sizes[tar_index] = 0
+
+                tar = self.tar_files[tar_index]
+
                 try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    tarinfo = tarfile.TarInfo(name=f'{accession_num}/metadata.json')
+                    tarinfo.size = len(metadata_json)
+                    tar.addfile(tarinfo, io.BytesIO(metadata_json))
+
+                    for file_num, content in enumerate(documents):
+                        doc_name = self._get_document_name(metadata, file_num, standardize_metadata)
+                        tarinfo = tarfile.TarInfo(name=f'{accession_num}/{doc_name}')
+                        tarinfo.size = len(content)
+                        tar.addfile(tarinfo, io.BytesIO(content))
+
+                    self.file_counters[tar_index] += 1
+                    self.tar_sizes[tar_index] += submission_size
+                    return True
+
+                except Exception as e:
+                    print(f"Error writing {filename} to tar {tar_index}: {str(e)}")
+                    return False
+
+        def _get_document_name(self, metadata, file_num, standardize_metadata):
+            if standardize_metadata:
+                documents_key = b'documents'
+                filename_key = b'filename'
+                sequence_key = b'sequence'
+            else:
+                documents_key = b'DOCUMENTS'
+                filename_key = b'FILENAME'
+                sequence_key = b'SEQUENCE'
+
+            doc_metadata = metadata[documents_key][file_num]
+            filename = doc_metadata.get(filename_key)
+            if filename:
+                return filename.decode('utf-8')
+            else:
+                sequence = doc_metadata.get(sequence_key, b'document')
+                return sequence.decode('utf-8') + '.txt'
+
+        def close_all(self):
+            for i, tar in self.tar_files.items():
+                try:
+                    tar.close()
+                except Exception as e:
+                    print(f"Error closing tar {i}: {str(e)}")
 
-    def
+    def decompress_and_parse_and_write(self, compressed_chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir):
         dctx = zstd.ZstdDecompressor()
         try:
             input_buffer = io.BytesIO(b''.join(compressed_chunks))
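The `TarManager` above replaces per-submission output files with a fixed pool of tar archives: each submission is routed to a shard with `hash(filename) % num_tar_files`, and a shard rolls over to the next `batch_<index>_<sequence>.tar` once writing the next submission would push it past `max_batch_size`. A small standalone sketch of that sharding and naming scheme (illustrative, not the package's code):

```python
import os

def tar_index(filename, num_tar_files):
    # Same rule as TarManager.get_tar_index. Note that Python's hash() for
    # strings is salted per process (PYTHONHASHSEED), so the shard assignment
    # is stable within a run but not across runs.
    return hash(filename) % num_tar_files

def tar_path(output_dir, index, sequence):
    # batch_000_001.tar, batch_000_002.tar, ... as produced by the diff above.
    return os.path.join(output_dir, f'batch_{index:03d}_{sequence:03d}.tar')

idx = tar_index('000123456725000001.sgml.zst', 8)   # hypothetical filename
print(tar_path('downloads', idx, 1))
```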
@@ -140,11 +165,19 @@ class Downloader:
             shutil.copyfileobj(reader, decompressed_content)
 
             content = decompressed_content.getvalue()
-
-
+
+            metadata, documents = parse_sgml_content_into_memory(
+                bytes_content=content,
+                filter_document_types=keep_document_types,
+                keep_filtered_metadata=keep_filtered_metadata,
+                standardize_metadata=standardize_metadata
+            )
+
+            success = tar_manager.write_submission(filename, metadata, documents, standardize_metadata)
+            return success
 
         except Exception as e:
-            self._log_error(output_dir, filename, f"Decompression error: {str(e)}")
+            self._log_error(output_dir, filename, f"Decompression/parsing error: {str(e)}")
             return False
         finally:
             try:
@@ -153,17 +186,25 @@ class Downloader:
             except:
                 pass
 
-    def
+    def parse_and_write_regular_file(self, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir):
         try:
             content = b''.join(chunks)
-
-
+
+            metadata, documents = parse_sgml_content_into_memory(
+                bytes_content=content,
+                filter_document_types=keep_document_types,
+                keep_filtered_metadata=keep_filtered_metadata,
+                standardize_metadata=standardize_metadata
+            )
+
+            success = tar_manager.write_submission(filename, metadata, documents, standardize_metadata)
+            return success
 
         except Exception as e:
-            self._log_error(output_dir, filename, f"
+            self._log_error(output_dir, filename, f"Parsing error: {str(e)}")
             return False
 
-    async def download_and_process(self, session, url, semaphore, decompression_pool, output_dir,
+    async def download_and_process(self, session, url, semaphore, decompression_pool, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir, pbar):
         async with semaphore:
             chunks = []
             filename = url.split('/')[-1]
@@ -188,70 +229,70 @@ class Downloader:
                         if filename.endswith('.zst'):
                             success = await loop.run_in_executor(
                                 decompression_pool,
-                                partial(self.
+                                partial(self.decompress_and_parse_and_write, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
                             )
                         else:
                             success = await loop.run_in_executor(
                                 decompression_pool,
-                                partial(self.
+                                partial(self.parse_and_write_regular_file, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
                             )
 
                         if not success:
                             self._log_error(output_dir, filename, "Failed to process file")
+
                     elif response.status == 401:
                         self._log_error(output_dir, filename, "Authentication failed: Invalid API key")
                         raise ValueError("Invalid API key")
                     else:
                         self._log_error(output_dir, filename, f"Download failed: Status {response.status}")
+
+                    pbar.update(1)
+
             except Exception as e:
                 self._log_error(output_dir, filename, str(e))
+                pbar.update(1)
 
-    async def process_batch(self, urls, output_dir, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
+    async def process_batch(self, urls, output_dir, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True, max_batch_size=1024*1024*1024):
         os.makedirs(output_dir, exist_ok=True)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        num_tar_files = min(self.MAX_TAR_WORKERS, len(urls))
+
+        tar_manager = self.TarManager(output_dir, num_tar_files, max_batch_size)
+
+        try:
+            with tqdm(total=len(urls), desc="Processing files") as pbar:
+                semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
+                decompression_pool = ThreadPoolExecutor(max_workers=self.MAX_DECOMPRESSION_WORKERS)
+
+                connector = aiohttp.TCPConnector(
+                    limit=self.MAX_CONCURRENT_DOWNLOADS,
+                    force_close=False,
+                    ssl=ssl.create_default_context(),
+                    ttl_dns_cache=300,
+                    keepalive_timeout=60
+                )
+
+                async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=30)) as session:
+                    tasks = [
+                        self.download_and_process(
+                            session, url, semaphore, decompression_pool,
+                            keep_document_types, keep_filtered_metadata, standardize_metadata,
+                            tar_manager, output_dir, pbar
+                        )
+                        for url in urls
+                    ]
+                    await asyncio.gather(*tasks, return_exceptions=True)
+
+                decompression_pool.shutdown()
+
+        finally:
+            tar_manager.close_all()
 
     def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True,
-                skip_accession_numbers=[]):
-        """
-        Query SEC filings and download/process them.
-
-        Parameters:
-        - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
-        - cik: Company CIK number(s), string, int, or list
-        - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
-        - output_dir: Directory to save downloaded files
-        - accession_numbers: List of specific accession numbers to download
-        - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
-        - keep_filtered_metadata: Whether to keep metadata for filtered documents
-        """
+                skip_accession_numbers=[], max_batch_size=1024*1024*1024):
         if self.api_key is None:
             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
 
-        # Query the SEC filings first - before starting any async operations
         print("Querying SEC filings...")
         filings = query(
             submission_type=submission_type,
@@ -260,19 +301,14 @@ class Downloader:
             api_key=self.api_key
         )
 
-
-        # After querying but before generating URLs
         if accession_numbers:
             accession_numbers = [str(int(item.replace('-',''))) for item in accession_numbers]
             filings = [filing for filing in filings if filing['accession_number'] in accession_numbers]
 
-
         if skip_accession_numbers:
             skip_accession_numbers = [int(item.replace('-','')) for item in skip_accession_numbers]
             filings = [filing for filing in filings if filing['accession_number'] not in skip_accession_numbers]
 
-        # Generate URLs from the query results
-
         print(f"Generating URLs for {len(filings)} filings...")
         urls = []
         for item in filings:
@@ -285,38 +321,21 @@ class Downloader:
             print("No submissions found matching the criteria")
             return
 
-        # Remove duplicates
         urls = list(set(urls))
 
-        # Now start the async processing
         start_time = time.time()
 
-
-        asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata))
+        asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata, max_batch_size=max_batch_size))
 
-        # Calculate and display performance metrics
         elapsed_time = time.time() - start_time
         print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
         print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
 
     def __del__(self):
-        """Cleanup when the downloader is garbage collected"""
         if hasattr(self, 'loop') and self.loop.is_running():
             self.loop.call_soon_threadsafe(self.loop.stop)
 
-
-
-    def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
-        """
-        Download and process SEC filings using specific filenames.
-
-        Parameters:
-        - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
-        - output_dir: Directory to save downloaded files
-        - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
-        - keep_filtered_metadata: Whether to keep metadata for filtered documents
-        - standardize_metadata: Whether to standardize metadata format
-        """
+    def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True, max_batch_size=1024*1024*1024):
         if self.api_key is None:
             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
 
@@ -326,27 +345,23 @@ class Downloader:
         if not isinstance(filenames, (list, tuple)):
             filenames = [filenames]
 
-        # Validate filenames format
         for filename in filenames:
            if not isinstance(filename, str):
                 raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
            if not (filename.endswith('.sgml') or filename.endswith('.sgml.zst')):
                 raise ValueError(f"Invalid filename format: {filename}. Expected .sgml or .sgml.zst extension.")
 
-        # Generate URLs directly from filenames
         print(f"Generating URLs for {len(filenames)} files...")
         urls = []
         for filename in filenames:
             url = f"{self.BASE_URL}{filename}"
             urls.append(url)
 
-        # Remove duplicates while preserving order
         seen = set()
         urls = [url for url in urls if not (url in seen or seen.add(url))]
 
         print(f"Downloading {len(urls)} files...")
 
-        # Process the batch asynchronously using existing infrastructure
         start_time = time.time()
 
         asyncio.run(self.process_batch(
@@ -354,33 +369,19 @@ class Downloader:
             output_dir,
             keep_document_types=keep_document_types,
             keep_filtered_metadata=keep_filtered_metadata,
-            standardize_metadata=standardize_metadata
+            standardize_metadata=standardize_metadata,
+            max_batch_size=max_batch_size
         ))
 
-        # Calculate and display performance metrics
         elapsed_time = time.time() - start_time
         print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
         print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
 
 
 def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
-            skip_accession_numbers=[]):
-    """
-    Query SEC filings and download/process them.
-
-    Parameters:
-    - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
-    - cik: Company CIK number(s), string, int, or list
-    - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
-    - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
-    - output_dir: Directory to save downloaded files
-    - accession_numbers: List of specific accession numbers to download
-    - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
-    - keep_filtered_metadata: Whether to keep metadata for filtered documents
-    """
+            skip_accession_numbers=[], max_batch_size=1024*1024*1024):
     if accession_numbers:
         accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
-    # check if acc no is empty list
     elif accession_numbers == []:
         raise ValueError("Applied filter resulted in empty accession numbers list")
     downloader = Downloader(api_key=api_key)
@@ -393,5 +394,6 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
         keep_document_types=keep_document_types,
         keep_filtered_metadata=keep_filtered_metadata,
         standardize_metadata=standardize_metadata,
-        skip_accession_numbers=skip_accession_numbers
-
+        skip_accession_numbers=skip_accession_numbers,
+        max_batch_size=max_batch_size
+    )
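With these changes the seclibrary downloader writes everything it fetches into batch tars, and the new `max_batch_size` parameter (default 1 GiB, i.e. 1024*1024*1024 bytes) caps each archive before it rolls over. A hedged usage sketch; the filter values, output directory, and cap below are illustrative, and `DATAMULE_API_KEY` is assumed to be set in the environment:

```python
# Illustrative sketch: fetch 10-K filings for Q1 2024 into batch tars
# capped at roughly 512 MB each.
from datamule.seclibrary.downloader import download

download(
    submission_type='10-K',
    filing_date=('2024-01-01', '2024-03-31'),
    output_dir='downloads',
    max_batch_size=512 * 1024 * 1024,
)
```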
datamule/submission.py
CHANGED
@@ -2,50 +2,15 @@ from pathlib import Path
 import json
 from .document.document import Document
 from secsgml import parse_sgml_content_into_memory
-from secsgml.utils import bytes_to_str
+from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
 from secsgml.parse_sgml import transform_metadata_string
 import tarfile
 import shutil
 import zstandard as zstd
 import gzip
 import io
-import copy
 
 
-def calculate_documents_locations_in_tar(metadata, documents):
-    # Step 1: Add placeholder byte positions to get accurate size (10-digit padded)
-    placeholder_metadata = copy.deepcopy(metadata)
-
-    for file_num in range(len(documents)):
-        if 'documents' in placeholder_metadata:
-            placeholder_metadata['documents'][file_num]['secsgml_start_byte'] = "9999999999" # 10 digits
-            placeholder_metadata['documents'][file_num]['secsgml_end_byte'] = "9999999999" # 10 digits
-
-    # Step 2: Calculate size with placeholders
-    placeholder_str = bytes_to_str(placeholder_metadata, lower=False)
-    placeholder_json = json.dumps(placeholder_str).encode('utf-8')
-    metadata_size = len(placeholder_json)
-
-    # Step 3: Now calculate actual positions using this size
-    current_pos = 512 + metadata_size
-    current_pos += (512 - (current_pos % 512)) % 512
-
-    # Step 4: Calculate real positions and update original metadata (10-digit padded)
-    for file_num, content in enumerate(documents):
-        start_byte = current_pos + 512
-        end_byte = start_byte + len(content)
-
-        if 'documents' in metadata:
-            metadata['documents'][file_num]['secsgml_start_byte'] = f"{start_byte:010d}" # 10-digit padding
-            metadata['documents'][file_num]['secsgml_end_byte'] = f"{end_byte:010d}" # 10-digit padding
-
-
-        file_total_size = 512 + len(content)
-        padded_size = file_total_size + (512 - (file_total_size % 512)) % 512
-        current_pos += padded_size
-
-    return metadata
-
 
 def write_submission_to_tar(output_path,metadata,documents,standardize_metadata,compression_list):
     # Write tar directly to disk
@@ -78,11 +43,21 @@ def write_submission_to_tar(output_path,metadata,documents,standardize_metadata,
         tar.addfile(tarinfo, io.BytesIO(content))
 
 class Submission:
-    def __init__(self, path=None,sgml_content=None,keep_document_types=None
-
-
-
-
+    def __init__(self, path=None, sgml_content=None, keep_document_types=None,
+                 batch_tar_path=None, accession_prefix=None, portfolio_ref=None):
+
+        # Validate parameters
+        param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path])
+        if param_count != 1:
+            raise ValueError("Exactly one of path, sgml_content, or batch_tar_path must be provided")
+
+        if batch_tar_path is not None and (accession_prefix is None or portfolio_ref is None):
+            raise ValueError("batch_tar_path requires both accession_prefix and portfolio_ref")
+
+        # Initialize batch tar attributes
+        self.batch_tar_path = batch_tar_path
+        self.accession_prefix = accession_prefix
+        self.portfolio_ref = portfolio_ref
 
         if sgml_content is not None:
             self.path = None
@@ -100,7 +75,7 @@ class Submission:
             filtered_metadata_documents = []
 
             for idx,doc in enumerate(self.metadata.content['documents']):
-                type = doc.get('type')
+                type = doc.get('type')
 
                 # Keep only specified types
                 if keep_document_types is not None and type not in keep_document_types:
@@ -115,7 +90,26 @@ class Submission:
 
             self.metadata.content['documents'] = filtered_metadata_documents
 
-
+        elif batch_tar_path is not None:
+            # Batch tar case
+            self.path = None
+
+            # Load metadata from batch tar
+            with self.portfolio_ref.batch_tar_locks[batch_tar_path]:
+                tar_handle = self.portfolio_ref.batch_tar_handles[batch_tar_path]
+                metadata_obj = tar_handle.extractfile(f'{accession_prefix}/metadata.json')
+                metadata = json.loads(metadata_obj.read().decode('utf-8'))
+
+            # Set metadata path using :: notation
+            metadata_path = f"{batch_tar_path}::{accession_prefix}/metadata.json"
+
+            # standardize metadata
+            metadata = transform_metadata_string(metadata)
+            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
+            self.accession = self.metadata.content['accession-number']
+            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
+
+        elif path is not None:
             self.path = Path(path)
             if self.path.suffix == '.tar':
                 with tarfile.open(self.path,'r') as tar:
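The constructor above now accepts three mutually exclusive sources, `path`, `sgml_content`, or `batch_tar_path`, and the batch form additionally requires `accession_prefix` plus a `portfolio_ref` that owns the shared tar handle and lock. The validation itself can be exercised without any files on disk; the tar path below is hypothetical:

```python
# Illustrative: the constructor enforces that exactly one source is supplied.
from datamule.submission import Submission

try:
    Submission()  # no path, sgml_content, or batch_tar_path at all
except ValueError as e:
    print(e)  # "Exactly one of path, sgml_content, or batch_tar_path must be provided"

try:
    Submission(batch_tar_path="downloads/batch_000_001.tar")  # missing the batch extras
except ValueError as e:
    print(e)  # "batch_tar_path requires both accession_prefix and portfolio_ref"
```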
@@ -228,44 +222,86 @@ class Submission:
         doc = self.metadata.content['documents'][idx]
 
         # If loaded from sgml_content, return pre-loaded document
-        if self.path is None:
+        if self.path is None and self.batch_tar_path is None:
             return self.documents[idx]
 
-        #
+        # Get filename
         filename = doc.get('filename')
         if filename is None:
             filename = doc['sequence'] + '.txt'
 
-
-        extension = document_path.suffix
+        extension = Path(filename).suffix
 
-
-
-
-
-
-
+        # Handle batch tar case
+        if self.batch_tar_path is not None:
+            with self.portfolio_ref.batch_tar_locks[self.batch_tar_path]:
+                tar_handle = self.portfolio_ref.batch_tar_handles[self.batch_tar_path]
+
+                # Try different filename variations for compressed files
+                possible_filenames = [
+                    f'{self.accession_prefix}/{filename}',
+                    f'{self.accession_prefix}/{filename}.gz',
+                    f'{self.accession_prefix}/{filename}.zst'
+                ]
+
+                content = None
+                actual_filename = None
+                for attempt_filename in possible_filenames:
                     try:
-                        content =
+                        content = tar_handle.extractfile(attempt_filename).read()
+                        actual_filename = attempt_filename
+                        break
                     except:
-
-
-
-
-
+                        continue
+
+                if content is None:
+                    raise ValueError(f"Could not find document in batch tar: {self.batch_tar_path}, accession: {self.accession_prefix}, filename: {filename}")
+
             # Decompress if compressed
-            if
+            if actual_filename.endswith('.gz'):
                 content = gzip.decompress(content)
-            elif
+            elif actual_filename.endswith('.zst'):
                 dctx = zstd.ZstdDecompressor()
                 content = dctx.decompress(content)
+
+            # Decode text files
+            if extension in ['.htm', '.html', '.txt', '.xml']:
+                content = content.decode('utf-8', errors='replace')
+
+            document_path = f"{self.batch_tar_path}::{self.accession_prefix}/{filename}"
+
+        # Handle regular path case (existing logic)
         else:
-
-
+            document_path = self.path / filename
+
+            if self.path.suffix == '.tar':
+                with tarfile.open(self.path, 'r') as tar:
+                    # so here is where we should use bytes instead with byte offset.
+                    # bandaid fix TODO
+                    try:
+                        content = tar.extractfile(filename).read()
+                    except:
+                        try:
+                            content = tar.extractfile(filename+'.gz').read()
+                        except:
+                            try:
+                                content = tar.extractfile(filename+'.zst').read()
+                            except:
+                                # some of these issues are on SEC data end, will fix when I setup cloud.
+                                raise ValueError(f"Something went wrong with tar: {self.path}")
+                # Decompress if compressed
+                if filename.endswith('.gz'):
+                    content = gzip.decompress(content)
+                elif filename.endswith('.zst'):
+                    dctx = zstd.ZstdDecompressor()
+                    content = dctx.decompress(content)
+            else:
+                with document_path.open('rb') as f:
+                    content = f.read()
 
-
-
-
+        # Decode text files
+        if extension in ['.htm', '.html', '.txt', '.xml']:
+            content = content.decode('utf-8', errors='replace')
 
         return Document(
             type=doc['type'],
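When a document is read back out of a batch tar, the code above tries the stored filename first and then `.gz` and `.zst` variants, decompresses as needed, decodes text extensions, and records the source as `<batch tar path>::<accession>/<filename>`. A self-contained sketch of that fallback-and-decompress idea, using illustrative names rather than the package's own helpers:

```python
import gzip
import tarfile
import zstandard as zstd

def read_member_with_fallback(tar_path, member):
    """Try member, member.gz, member.zst inside a tar and decompress (illustrative)."""
    with tarfile.open(tar_path, 'r') as tar:
        for candidate in (member, member + '.gz', member + '.zst'):
            try:
                data = tar.extractfile(candidate).read()
            except KeyError:
                continue  # member not present under this name
            if candidate.endswith('.gz'):
                return gzip.decompress(data)
            if candidate.endswith('.zst'):
                return zstd.ZstdDecompressor().decompress(data)
            return data
    raise FileNotFoundError(f"{member} not found in {tar_path}")
```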
datamule/utils/__init__.py
File without changes
datamule/utils/construct_submissions_data.py
ADDED
@@ -0,0 +1,150 @@
+import zipfile
+import json
+import csv
+import os
+import tempfile
+from concurrent.futures import ThreadPoolExecutor
+import threading
+from tqdm import tqdm
+import urllib.request
+
+headers = {'User-Agent': 'John Smith johnsmith@gmail.com'}
+
+def process_file_batch(zip_file, filenames_batch):
+    """Process a batch of files from the zip archive"""
+    batch_filings = []
+
+    for filename in filenames_batch:
+        if not filename.startswith('CIK'):
+            continue
+
+        try:
+            # Extract CIK from filename
+            cik = int(filename.split('.')[0].split('-')[0][3:])
+
+            # Read raw bytes and parse JSON
+            with zip_file.open(filename) as file:
+                raw_data = file.read()
+                submissions_dct = json.loads(raw_data)
+
+            # Handle different file types
+            if 'submissions' in filename:
+                filings_data = submissions_dct
+            else:
+                filings_data = submissions_dct['filings']['recent']
+
+            # Extract required data
+            accession_numbers = filings_data['accessionNumber']
+            filing_dates = filings_data['filingDate']
+            forms = filings_data['form']
+
+            # Create filing records for this file
+            for j in range(len(accession_numbers)):
+                filing_record = {
+                    'accessionNumber': int(accession_numbers[j].replace('-','')),
+                    'filingDate': filing_dates[j],
+                    'submissionType': forms[j],
+                    'cik': cik
+                }
+                batch_filings.append(filing_record)
+
+        except Exception as e:
+            print(f"Error processing {filename}: {e}")
+            continue
+
+    return batch_filings
+
+def write_csv_chunk(output_path, filings_data, is_first_write, write_lock):
+    """Thread-safe CSV writing with lock"""
+    with write_lock:
+        if is_first_write:
+            with open(output_path, 'w', newline='') as csvfile:
+                fieldnames = ['accessionNumber', 'filingDate', 'submissionType', 'cik']
+                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                writer.writeheader()
+                writer.writerows(filings_data)
+        else:
+            with open(output_path, 'a', newline='') as csvfile:
+                fieldnames = ['accessionNumber', 'filingDate', 'submissionType', 'cik']
+                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                writer.writerows(filings_data)
+
+def construct_submissions_data(output_path, submissions_zip_path=None, max_workers=4, batch_size=100):
+    """Creates a list of dicts of every accession number, with filing date, submission type, and ciks"""
+
+    if submissions_zip_path is None:
+        url = "https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip"
+
+        temp_dir = tempfile.mkdtemp()
+        zip_path = os.path.join(temp_dir, 'submissions.zip')
+
+        req = urllib.request.Request(url, headers=headers)
+
+        with urllib.request.urlopen(req) as response:
+            total_size = int(response.headers.get('Content-Length', 0))
+
+            with open(zip_path, 'wb') as f, tqdm(
+                desc="Downloading",
+                total=total_size,
+                unit='B',
+                unit_scale=True,
+                unit_divisor=1024,
+            ) as pbar:
+                while True:
+                    chunk = response.read(8192)
+                    if not chunk:
+                        break
+                    f.write(chunk)
+                    pbar.update(len(chunk))
+
+        submissions_zip_path = zip_path
+
+    # Keep zip file open throughout processing
+    with zipfile.ZipFile(submissions_zip_path, 'r') as zip_file:
+        # Get all CIK filenames
+        all_filenames = [f for f in zip_file.namelist() if f.startswith('CIK')]
+
+        print(f"Processing {len(all_filenames)} files with {max_workers} workers...")
+
+        # Create batches of filenames
+        filename_batches = []
+        for i in range(0, len(all_filenames), batch_size):
+            batch = all_filenames[i:i + batch_size]
+            filename_batches.append(batch)
+
+        # Setup for threading
+        write_lock = threading.Lock()
+        total_filings = 0
+        is_first_write = True
+
+        # Process batches with thread pool
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all batch jobs
+            future_to_batch = {
+                executor.submit(process_file_batch, zip_file, batch): i
+                for i, batch in enumerate(filename_batches)
+            }
+
+            # Process results with progress bar
+            with tqdm(total=len(filename_batches), desc="Processing batches", unit="batch") as pbar:
+                for future in future_to_batch:
+                    try:
+                        batch_filings = future.result()
+
+                        if batch_filings:  # Only write if we have data
+                            write_csv_chunk(output_path, batch_filings, is_first_write, write_lock)
+                            is_first_write = False
+                            total_filings += len(batch_filings)
+
+                        pbar.update(1)
+                        pbar.set_postfix({
+                            'filings': total_filings,
+                            'files': len(filename_batches[future_to_batch[future]])
+                        })
+
+                    except Exception as e:
+                        print(f"Error processing batch: {e}")
+                        pbar.update(1)
+
+    print(f"Complete! Processed {total_filings} total filings")
+    print(f"Data saved to {output_path}")
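The new `construct_submissions_data` utility downloads the SEC's bulk `submissions.zip` (or reads a local copy), flattens every company's filing history, and streams the result into a CSV with accessionNumber, filingDate, submissionType, and cik columns. A hedged usage sketch; the output path, local archive path, and worker counts below are illustrative:

```python
# Illustrative sketch: build the flat filings CSV from a local submissions.zip;
# omit submissions_zip_path to let the helper download the bulk file itself.
from datamule.utils.construct_submissions_data import construct_submissions_data

construct_submissions_data(
    output_path="submissions_index.csv",          # hypothetical output file
    submissions_zip_path="data/submissions.zip",  # hypothetical local archive
    max_workers=8,
    batch_size=200,
)
```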
{datamule-1.6.0.dist-info → datamule-1.6.2.dist-info}/RECORD
CHANGED
@@ -3,9 +3,9 @@ datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
 datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
 datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
 datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
-datamule/portfolio.py,sha256=
+datamule/portfolio.py,sha256=360kfXmmnVFrmpz16KF2es6Mq94lnVqzie2DIgnMB9Y,11641
 datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
-datamule/submission.py,sha256=
+datamule/submission.py,sha256=f2pecbuhK0VmN1w0beNUiK4n_4Ma_GGQ5JIGilmZPZE,15127
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
 datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/datamule/sec_connector.py,sha256=T3edE7I-d4oHysqj7zYlIOxH3Fuauj9tfw39UdFWvB8,2393
@@ -51,7 +51,7 @@ datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNs
 datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/submissions/downloader.py,sha256=zGS0oJJI8tVF_GnVpZm20MymdYxnjrEjQioSVggw7Ck,1486
 datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
-datamule/sec/submissions/monitor.py,sha256=
+datamule/sec/submissions/monitor.py,sha256=1JUMRYsTqtd31hX3UrUA_aXFUmZN6n-V7h0i1gavNOs,11395
 datamule/sec/submissions/streamer.py,sha256=Qydj40CmWB_wsPv2dibefRohmCokegG2pR7iZ9C3xLQ,11584
 datamule/sec/submissions/textsearch.py,sha256=MKDXEz_VI_0ljl73_aw2lx4MVzJW5uDt8KxjvJBwPwM,5794
 datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -61,9 +61,11 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H
 datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
-datamule/seclibrary/downloader.py,sha256=
+datamule/seclibrary/downloader.py,sha256=3jEy67oiEg8BF20KcKCx2KC0UjHzhiepdu29TOaHWXs,17564
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
-datamule
-datamule
-datamule-1.6.
-datamule-1.6.
+datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/utils/construct_submissions_data.py,sha256=aX7ZaAp3zXHLcv4TFk_rGwjb8r7yNDQDFVg4nPf60kM,5934
+datamule-1.6.2.dist-info/METADATA,sha256=VOeuSq7t_-D7dKJjWrHuEg9zwqNvLWU08dGL7W2T0ow,524
+datamule-1.6.2.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-1.6.2.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.6.2.dist-info/RECORD,,
{datamule-1.6.0.dist-info → datamule-1.6.2.dist-info}/WHEEL
File without changes
{datamule-1.6.0.dist-info → datamule-1.6.2.dist-info}/top_level.txt
File without changes