datamule 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/__init__.py +3 -1
- datamule/document/__init__.py +0 -0
- datamule/document/document.py +255 -0
- datamule/document/processing.py +604 -0
- datamule/document/table.py +260 -0
- datamule/package_updater.py +31 -0
- datamule/portfolio.py +5 -3
- datamule/sec/submissions/downloader.py +14 -37
- datamule/seclibrary/downloader.py +50 -9
- datamule/submission.py +102 -7
- {datamule-1.2.0.dist-info → datamule-1.2.1.dist-info}/METADATA +1 -1
- {datamule-1.2.0.dist-info → datamule-1.2.1.dist-info}/RECORD +14 -10
- datamule/document.py +0 -465
- {datamule-1.2.0.dist-info → datamule-1.2.1.dist-info}/WHEEL +0 -0
- {datamule-1.2.0.dist-info → datamule-1.2.1.dist-info}/top_level.txt +0 -0
datamule/document/table.py ADDED
@@ -0,0 +1,260 @@
+from .mappings.atsn import *
+from .mappings.cfportal import *
+from .mappings.ex99a_sdr import *
+from .mappings.ex99c_sdr import *
+from .mappings.ex99g_sdr import *
+from .mappings.ex99i_sdr import *
+from .mappings.nmfp import *
+from .mappings.npx import *
+from .mappings.onefourtyfour import *
+from .mappings.ownership import *
+from .mappings.proxy_voting_record import *
+from .mappings.sbs import *
+from .mappings.sbsef import *
+from .mappings.schedule13 import *
+from .mappings.sdr import *
+from .mappings.ta import *
+from .mappings.thirteenfhr import *
+from .mappings.twentyfivense import *
+from .mappings.twentyfourf2nt import *
+from .mappings.information_table import *
+
+# need to check if mappings correctly create new columns
+class Table():
+    def __init__(self, data, type,accession):
+        if isinstance(data,dict):
+            data = [data]
+        self.type = type
+        self.data = data
+        self.accession = accession
+        self.columns = self.determine_columns()
+
+    def determine_columns(self):
+        if len(self.data) == 0:
+            return []
+        return self.data[0].keys()
+
+    def add_column(self,column_name,value):
+        for row in self.data:
+            row[column_name] = value
+
+    def map_data(self):
+        # Add the accession column to all rows first, ensuring it will be first
+        self.add_column('accession', self.accession)
+
+
+        # ATS-N, types: metadata_ats,cover_ats,part_one_ats,part_two_ats,part_three_ats,part_four_ats
+        if self.type == 'metadata_ats':
+            mapping_dict = metadata_ats_dict
+        elif self.type == 'cover_ats':
+            mapping_dict = cover_ats_dict
+        elif self.type == 'part_one_ats':
+            mapping_dict = part_one_ats_dict
+        elif self.type == 'part_two_ats':
+            mapping_dict = part_two_ats_dict
+        elif self.type == 'part_three_ats':
+            mapping_dict = part_three_ats_dict
+        elif self.type == 'part_four_ats':
+            mapping_dict = part_four_ats_dict
+        # CFPORTAL
+        elif self.type == 'metadata_cfportal':
+            mapping_dict = metadata_cfportal_dict
+        elif self.type == 'identifying_information_cfportal':
+            mapping_dict = identifying_information_cfportal_dict
+        elif self.type == 'form_of_organization_cfportal':
+            mapping_dict = form_of_organization_cfportal_dict
+        elif self.type == 'successions_cfportal':
+            mapping_dict = successions_cfportal_dict
+        elif self.type == 'control_relationships_cfportal':
+            mapping_dict = control_relationships_cfportal_dict
+        elif self.type == 'disclosure_answers_cfportal':
+            mapping_dict = disclosure_answers_cfportal_dict
+        elif self.type == 'non_securities_related_business_cfportal':
+            mapping_dict = non_securities_related_business_cfportal_dict
+        elif self.type == 'escrow_arrangements_cfportal':
+            mapping_dict = escrow_arrangements_cfportal_dict
+        elif self.type == 'execution_cfportal':
+            mapping_dict = execution_cfportal_dict
+        elif self.type == 'schedule_a_cfportal':
+            mapping_dict = schedule_a_cfportal_dict
+        elif self.type == 'schedule_b_cfportal':
+            mapping_dict = schedule_b_cfportal_dict
+        elif self.type == 'schedule_c_cfportal':
+            mapping_dict = schedule_c_cfportal_dict
+        elif self.type == 'schedule_d_cfportal':
+            mapping_dict = schedule_d_cfportal_dict
+        elif self.type == 'criminal_drip_info_cfportal':
+            mapping_dict = criminal_drip_info_cfportal_dict
+        elif self.type == 'regulatory_drip_info_cfportal':
+            mapping_dict = regulatory_drip_info_cfportal_dict
+        elif self.type == 'civil_judicial_drip_info_cfportal':
+            mapping_dict = civil_judicial_drip_info_cfportal_dict
+        elif self.type == 'bankruptcy_sipc_drip_info_cfportal':
+            mapping_dict = bankruptcy_sipc_drip_info_cfportal_dict
+        elif self.type == 'bond_drip_info_cfportal':
+            mapping_dict = bond_drip_info_cfportal_dict
+        elif self.type == 'judgement_drip_info_cfportal':
+            mapping_dict = judgement_drip_info_cfportal_dict
+
+        # SDR
+
+        # Information Table
+        elif self.type == 'information_table':
+            mapping_dict = information_table_dict
+
+        # NFMP
+        elif self.type == 'metadata_nmfp':
+            mapping_dict = metadata_nmfp_dict
+        elif self.type == 'general_information_nmfp':
+            mapping_dict = general_information_nmfp_dict
+        elif self.type == 'series_level_info_nmfp':
+            mapping_dict = series_level_info_nmfp_dict
+        elif self.type == 'class_level_info_nmfp':
+            mapping_dict = class_level_info_nmfp_dict
+        elif self.type == 'schedule_of_portfolio_securities_info_nmfp':
+            mapping_dict = schedule_of_portfolio_securities_info_nmfp_dict
+        elif self.type == 'signature_nmfp':
+            mapping_dict = signature_nmfp_dict
+
+        # NPX
+        elif self.type == 'npx':
+            mapping_dict = npx_dict
+
+        # 144
+        elif self.type == 'signatures_144':
+            mapping_dict = signatures_144_dict
+        elif self.type == 'securities_sold_in_past_3_months_144':
+            mapping_dict = securities_sold_in_past_3_months_144_dict
+        elif self.type == 'securities_to_be_sold_144':
+            mapping_dict = securities_to_be_sold_144_dict
+        elif self.type == 'securities_information_144':
+            mapping_dict = securities_information_144_dict
+        elif self.type == 'issuer_information_144':
+            mapping_dict = issuer_information_144_dict
+        elif self.type == 'metadata_144':
+            mapping_dict = metadata_144_dict
+
+        # Ownership
+        elif self.type == 'non_derivative_holding_ownership':
+            mapping_dict = non_derivative_holding_ownership_dict
+        elif self.type == 'non_derivative_transaction_ownership':
+            mapping_dict = non_derivative_transaction_ownership_dict
+        elif self.type == 'derivative_transaction_ownership':
+            mapping_dict = derivative_transaction_ownership_dict
+        elif self.type == 'derivative_holding_ownership':
+            mapping_dict = derivative_holding_ownership_dict
+        elif self.type == 'reporting_owner_ownership':
+            mapping_dict = reporting_owner_ownership_dict
+        elif self.type == 'metadata_ownership':
+            mapping_dict = metadata_ownership_dict
+        elif self.type == 'owner_signature_ownership':
+            mapping_dict = owner_signature_ownership_dict
+
+        # Proxy Voting Record
+        elif self.type == 'proxy_voting_record':
+            mapping_dict = proxy_voting_record_dict
+
+        # SBS
+
+        # SBSEF
+        elif self.type == 'sbsef':
+            mapping_dict = sbsef_dict
+
+        # Schedule 13
+        elif self.type == 'metadata_schedule_13':
+            mapping_dict = metadata_schedule_13_dict
+        elif self.type == 'cover_schedule_13':
+            mapping_dict = cover_schedule_13_dict
+        elif self.type == 'reporting_person_details_schedule_13':
+            mapping_dict = reporting_person_details_schedule_13_dict
+        elif self.type == 'item_1_schedule_13':
+            mapping_dict = item_1_schedule_13_dict
+        elif self.type == 'item_2_schedule_13':
+            mapping_dict = item_2_schedule_13_dict
+        elif self.type == 'item_3_schedule_13':
+            mapping_dict = item_3_schedule_13_dict
+        elif self.type == 'item_4_schedule_13':
+            mapping_dict = item_4_schedule_13_dict
+        elif self.type == 'item_5_schedule_13':
+            mapping_dict = item_5_schedule_13_dict
+        elif self.type == 'item_6_schedule_13':
+            mapping_dict = item_6_schedule_13_dict
+        elif self.type == 'item_7_schedule_13':
+            mapping_dict = item_7_schedule_13_dict
+        elif self.type == 'item_8_schedule_13':
+            mapping_dict = item_8_schedule_13_dict
+        elif self.type == 'item_9_schedule_13':
+            mapping_dict = item_9_schedule_13_dict
+        elif self.type == 'item_10_schedule_13':
+            mapping_dict = item_10_schedule_13_dict
+        elif self.type == 'signature_schedule_13':
+            mapping_dict = signature_schedule_13_dict
+
+        # SDR
+        elif self.type == 'sdr':
+            mapping_dict = sdr_dict
+
+        # TA
+
+        # 13F-HR
+        elif self.type == '13fhr':
+            mapping_dict = thirteenfhr_dict
+
+        # 25-NSE
+        elif self.type == '25nse':
+            mapping_dict = twentyfive_nse_dict
+
+        # 24F-2NT
+        elif self.type == 'metadata_24f_2nt':
+            mapping_dict = metadata_24f_2nt_dict
+        elif self.type == 'item_1_24f2nt':
+            mapping_dict = item_1_24f2nt_dict
+        elif self.type == 'item_2_24f2nt':
+            mapping_dict = item_2_24f2nt_dict
+        elif self.type == 'item_3_24f2nt':
+            mapping_dict = item_3_24f2nt_dict
+        elif self.type == 'item_4_24f2nt':
+            mapping_dict = item_4_24f2nt_dict
+        elif self.type == 'item_5_24f2nt':
+            mapping_dict = item_5_24f2nt_dict
+        elif self.type == 'item_6_24f2nt':
+            mapping_dict = item_6_24f2nt_dict
+        elif self.type == 'item_7_24f2nt':
+            mapping_dict = item_7_24f2nt_dict
+        elif self.type == 'item_8_24f2nt':
+            mapping_dict = item_8_24f2nt_dict
+        elif self.type == 'item_9_24f2nt':
+            mapping_dict = item_9_24f2nt_dict
+        elif self.type == 'signature_info_schedule_a':
+            mapping_dict = signature_24f2nt_dict
+
+        else:
+            mapping_dict = {}
+
+        # Update mapping dictionary to include accession at the beginning
+        # Create a new mapping with accession as the first key
+        new_mapping = {'accession': 'accession'}
+        # Add the rest of the mapping
+        new_mapping.update(mapping_dict)
+        mapping_dict = new_mapping
+
+        # apply the mapping to the data
+        for row in self.data:
+            ordered_row = {}
+            # First add all keys from the mapping dict in order
+            for old_key, new_key in mapping_dict.items():
+                if old_key in row:
+                    ordered_row[new_key] = row.pop(old_key)
+                else:
+                    # if the old key is not present, set the new key to None
+                    ordered_row[new_key] = None
+
+            # Then add any remaining keys that weren't in the mapping
+            for key, value in row.items():
+                ordered_row[key] = value
+
+            # Replace the original row with the ordered row
+            row.clear()
+            row.update(ordered_row)
+
+        self.determine_columns()
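
For orientation, here is a minimal usage sketch of the new Table class. It is illustrative only: the import path follows the module layout shown in this diff, and the row keys below are hypothetical placeholders rather than fields from a real filing.

    from datamule.document.table import Table  # module path as added in 1.2.1

    # hypothetical rows as parsed from a Form 3/4/5 ownership document
    rows = [{'transactionDate': '2025-01-02', 'transactionShares': '100'}]
    table = Table(rows, type='non_derivative_transaction_ownership', accession='0001234567-25-000001')
    table.map_data()  # prepends an 'accession' column, then renames/reorders keys via the ownership mapping dict

Passing a single dict instead of a list also works; the constructor wraps it in a list before processing.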
datamule/package_updater.py ADDED
@@ -0,0 +1,31 @@
+
+from pathlib import Path
+import urllib.request
+import gzip
+import shutil
+import os
+
+class PackageUpdater():
+    def __init__(self):
+        pass
+
+    def update_package_data():
+        # Create data directory in user's home
+        data_dir = Path.home() / ".datamule"
+        data_dir.mkdir(exist_ok=True)
+
+        # Download data file
+        file_url = "https://github.com/john-friedman/datamule-data/raw/master/data/filer_metadata/listed_filer_metadata.csv.gz"
+        file_path = data_dir / "listed_filer_metadata.csv"
+        temp_gz_path = data_dir / "listed_filer_metadata.csv.gz"
+
+        if not file_path.exists():
+            print(f"Downloading data to {data_dir}")
+            urllib.request.urlretrieve(file_url, temp_gz_path)
+
+            with gzip.open(temp_gz_path, 'rb') as f_in:
+                with open(file_path, 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+
+            os.remove(temp_gz_path)
+            print(f"Data downloaded to {file_path}")
datamule/portfolio.py CHANGED
@@ -119,7 +119,7 @@ class Portfolio:
         # First query, just set the accession numbers
         self.accession_numbers = new_accession_numbers
 
-    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,requests_per_second=5, **kwargs):
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=None,requests_per_second=5, **kwargs):
         if provider is None:
             config = Config()
             provider = config.get_default_source()
@@ -134,7 +134,8 @@ class Portfolio:
                 cik=cik,
                 submission_type=submission_type,
                 filing_date=filing_date,
-                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
+                keep_document_types=document_type
             )
         else:
             sec_download(
@@ -143,7 +144,8 @@ class Portfolio:
                 submission_type=submission_type,
                 filing_date=filing_date,
                 requests_per_second=requests_per_second,
-                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
+                keep_document_types=document_type
             )
 
         self.submissions_loaded = False
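
A hypothetical call pattern for the new document_type filter (a sketch only; the Portfolio constructor argument and the top-level export are assumed from earlier datamule releases, not from this diff):

    from datamule import Portfolio

    portfolio = Portfolio('apple_filings')  # hypothetical output directory
    portfolio.download_submissions(
        ticker='AAPL',
        submission_type='10-K',
        document_type=['10-K'],  # new in 1.2.1; forwarded to the downloaders as keep_document_types
    )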
datamule/sec/submissions/downloader.py CHANGED
@@ -1,35 +1,19 @@
 import os
 import json
 from .streamer import stream
-from secsgml import parse_sgml_submission_into_memory
 import aiofiles
+from ...submission import Submission
 
-async def download_callback(hit, content, cik, accno, url, output_dir="filings"):
+async def download_callback(hit, content, cik, accno, url, output_dir="filings", keep_document_types=None):
     """Save downloaded SEC submission to disk."""
     try:
-        #
-
+        # Create a Submission object directly from the content
+        # Note: the content needs to be decoded from bytes to string for the parser
+        submission = Submission(sgml_content=content.decode('utf-8', errors='replace'),
+                                keep_document_types=keep_document_types)
 
-        #
-        file_dir =
-        os.makedirs(file_dir, exist_ok=True)
-
-        # Save metadata
-        metadata_path = os.path.join(file_dir, "metadata.json")
-        async with aiofiles.open(metadata_path, 'w') as f:
-            await f.write(json.dumps(metadata, indent=4))
-
-        # Save all documents
-        for idx, _ in enumerate(metadata['documents']):
-            try:
-                filename = metadata['documents'][idx]['filename']
-            except (KeyError, IndexError):
-                filename = f"{metadata['documents'][idx].get('sequence', idx)}.txt"
-
-            # Use async file writing
-            doc_path = os.path.join(file_dir, filename)
-            async with aiofiles.open(doc_path, 'wb') as f:
-                await f.write(documents[idx])
+        # Use the async save method to write the submission to disk
+        file_dir = await submission.save_async(output_dir=output_dir)
 
         return file_dir
     except Exception as e:
@@ -37,7 +21,8 @@ async def download_callback(hit, content, cik, accno, url, output_dir="filings")
         return None
 
 def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
-             requests_per_second=5, output_dir="filings", accession_numbers=None,
+             requests_per_second=5, output_dir="filings", accession_numbers=None,
+             quiet=False, keep_document_types=None):
     """
     Download SEC EDGAR filings and extract their documents.
 
@@ -51,27 +36,19 @@ def download(cik=None, submission_type=None, filing_date=None, location=None, na
     - output_dir: Directory to save documents
     - accession_numbers: Optional list of accession numbers to filter by
    - quiet: Whether to suppress progress output
+    - keep_document_types: Optional list of document types to keep (e.g. ['10-K', 'EX-10.1'])
 
     Returns:
     - List of all document paths processed
-
-    Examples:
-    # Download filings by CIK
-    download(cik="1318605", submission_type="10-K")
-
-    # Download filings by company name
-    download(name="Tesla", submission_type="10-K")
-
-    # Download filings with location filter
-    download(name="Apple", location="CA", submission_type="10-K")
     """
-
     # Make sure output directory exists
     os.makedirs(output_dir, exist_ok=True)
 
     # Create a wrapper for the download_callback that includes the output_dir
     async def callback_wrapper(hit, content, cik, accno, url):
-        return await download_callback(hit, content, cik, accno, url,
+        return await download_callback(hit, content, cik, accno, url,
+                                       output_dir=output_dir,
+                                       keep_document_types=keep_document_types)
 
     # Call the stream function with our callback
     return stream(
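
A sketch of the updated EDGAR download entry point with the new keep_document_types filter (illustrative; the module path is taken from the file list above, and the CIK is the Tesla example removed from the old docstring):

    from datamule.sec.submissions.downloader import download

    # Keep only the primary 10-K document and a specific exhibit from each submission.
    download(
        cik='1318605',
        submission_type='10-K',
        output_dir='filings',
        keep_document_types=['10-K', 'EX-10.1'],
    )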
datamule/seclibrary/downloader.py CHANGED
@@ -16,17 +16,35 @@ from threading import Thread
 from secsgml import parse_sgml_submission
 from .query import query
 from os import cpu_count
+from ..submission import Submission
 
 class Downloader:
     def __init__(self, api_key=None):
         self.BASE_URL = "https://library.datamule.xyz/original/nc/"
         self.CHUNK_SIZE = 2 * 1024 * 1024
-        self.MAX_CONCURRENT_DOWNLOADS =
+        self.MAX_CONCURRENT_DOWNLOADS = 100
         self.MAX_DECOMPRESSION_WORKERS = cpu_count()
         self.MAX_PROCESSING_WORKERS = cpu_count()
         self.QUEUE_SIZE = 10
         if api_key is not None:
             self._api_key = api_key
+        # Create a shared event loop for async operations
+        self.loop = asyncio.new_event_loop()
+        # Create a thread to run the event loop
+        self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
+        self.loop_thread.start()
+        # Create a queue for async tasks
+        self.async_queue = Queue()
+
+    def _run_event_loop(self):
+        """Run the event loop in a separate thread"""
+        asyncio.set_event_loop(self.loop)
+        self.loop.run_forever()
+
+    def _run_coroutine(self, coro):
+        """Run a coroutine in the event loop and return its result"""
+        future = asyncio.run_coroutine_threadsafe(coro, self.loop)
+        return future.result()
 
     @property
     def api_key(self):
@@ -55,7 +73,7 @@ class Downloader:
             print(f"Failed to log error to {error_file}: {str(e)}")
 
     class FileProcessor:
-        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader):
+        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=None):
             self.processing_queue = Queue(maxsize=queue_size)
             self.should_stop = False
             self.processing_workers = []
@@ -64,6 +82,7 @@ class Downloader:
             self.batch_size = 50
             self.pbar = pbar
             self.downloader = downloader
+            self.keep_document_types = keep_document_types
 
         def start_processing_workers(self):
             for _ in range(self.max_workers):
@@ -75,7 +94,9 @@ class Downloader:
         def _process_file(self, item):
             filename, content = item
             try:
-
+                submission = Submission(sgml_content=content, keep_document_types=self.keep_document_types)
+                # Use the shared event loop to run save_async
+                self.downloader._run_coroutine(submission.save_async(output_dir=self.output_dir))
                 self.pbar.update(1)
             except Exception as e:
                 accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
@@ -189,11 +210,11 @@ class Downloader:
             except Exception as e:
                 self._log_error(output_dir, filename, str(e))
 
-    async def process_batch(self, urls, output_dir):
+    async def process_batch(self, urls, output_dir, keep_document_types=None):
         os.makedirs(output_dir, exist_ok=True)
 
         with tqdm(total=len(urls), desc="Processing files") as pbar:
-            processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self)
+            processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self, keep_document_types=keep_document_types)
             processor.start_processing_workers()
 
             semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
@@ -216,7 +237,7 @@ class Downloader:
             processor.stop_workers()
             decompression_pool.shutdown()
 
-    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None):
+    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=None):
         """
         Query SEC filings and download/process them.
 
@@ -225,6 +246,8 @@ class Downloader:
         - cik: Company CIK number(s), string, int, or list
         - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
         - output_dir: Directory to save downloaded files
+        - accession_numbers: List of specific accession numbers to download
+        - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
         """
         if self.api_key is None:
             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
@@ -262,15 +285,32 @@ class Downloader:
         start_time = time.time()
 
         # Process the batch asynchronously
-        asyncio.run(self.process_batch(urls, output_dir))
+        asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types))
 
         # Calculate and display performance metrics
         elapsed_time = time.time() - start_time
         print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
         print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
+
+    def __del__(self):
+        """Cleanup when the downloader is garbage collected"""
+        if hasattr(self, 'loop') and self.loop.is_running():
+            self.loop.call_soon_threadsafe(self.loop.stop)
 
 
-def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None):
+def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=None):
+    """
+    Query SEC filings and download/process them.
+
+    Parameters:
+    - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
+    - cik: Company CIK number(s), string, int, or list
+    - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
+    - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
+    - output_dir: Directory to save downloaded files
+    - accession_numbers: List of specific accession numbers to download
+    - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+    """
    if accession_numbers:
        accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
        # check if acc no is empty list
@@ -282,5 +322,6 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
         cik=cik,
         filing_date=filing_date,
         output_dir=output_dir,
-        accession_numbers=accession_numbers
+        accession_numbers=accession_numbers,
+        keep_document_types=keep_document_types
     )