datamule 1.1.8__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- datamule/__init__.py +3 -1
- datamule/document/__init__.py +0 -0
- datamule/document/document.py +255 -0
- datamule/document/processing.py +604 -0
- datamule/document/table.py +260 -0
- datamule/package_updater.py +31 -0
- datamule/portfolio.py +5 -3
- datamule/sec/submissions/downloader.py +14 -37
- datamule/seclibrary/bq.py +349 -12
- datamule/seclibrary/downloader.py +50 -9
- datamule/sheet.py +458 -34
- datamule/submission.py +102 -7
- {datamule-1.1.8.dist-info → datamule-1.2.1.dist-info}/METADATA +1 -1
- {datamule-1.1.8.dist-info → datamule-1.2.1.dist-info}/RECORD +16 -12
- {datamule-1.1.8.dist-info → datamule-1.2.1.dist-info}/WHEEL +1 -1
- datamule/document.py +0 -472
- {datamule-1.1.8.dist-info → datamule-1.2.1.dist-info}/top_level.txt +0 -0
datamule/document/table.py
ADDED
@@ -0,0 +1,260 @@
+from .mappings.atsn import *
+from .mappings.cfportal import *
+from .mappings.ex99a_sdr import *
+from .mappings.ex99c_sdr import *
+from .mappings.ex99g_sdr import *
+from .mappings.ex99i_sdr import *
+from .mappings.nmfp import *
+from .mappings.npx import *
+from .mappings.onefourtyfour import *
+from .mappings.ownership import *
+from .mappings.proxy_voting_record import *
+from .mappings.sbs import *
+from .mappings.sbsef import *
+from .mappings.schedule13 import *
+from .mappings.sdr import *
+from .mappings.ta import *
+from .mappings.thirteenfhr import *
+from .mappings.twentyfivense import *
+from .mappings.twentyfourf2nt import *
+from .mappings.information_table import *
+
+# need to check if mappings correctly create new columns
+class Table():
+    def __init__(self, data, type, accession):
+        if isinstance(data, dict):
+            data = [data]
+        self.type = type
+        self.data = data
+        self.accession = accession
+        self.columns = self.determine_columns()
+
+    def determine_columns(self):
+        if len(self.data) == 0:
+            return []
+        return self.data[0].keys()
+
+    def add_column(self, column_name, value):
+        for row in self.data:
+            row[column_name] = value
+
+    def map_data(self):
+        # Add the accession column to all rows first, ensuring it will be first
+        self.add_column('accession', self.accession)
+
+
+        # ATS-N, types: metadata_ats, cover_ats, part_one_ats, part_two_ats, part_three_ats, part_four_ats
+        if self.type == 'metadata_ats':
+            mapping_dict = metadata_ats_dict
+        elif self.type == 'cover_ats':
+            mapping_dict = cover_ats_dict
+        elif self.type == 'part_one_ats':
+            mapping_dict = part_one_ats_dict
+        elif self.type == 'part_two_ats':
+            mapping_dict = part_two_ats_dict
+        elif self.type == 'part_three_ats':
+            mapping_dict = part_three_ats_dict
+        elif self.type == 'part_four_ats':
+            mapping_dict = part_four_ats_dict
+        # CFPORTAL
+        elif self.type == 'metadata_cfportal':
+            mapping_dict = metadata_cfportal_dict
+        elif self.type == 'identifying_information_cfportal':
+            mapping_dict = identifying_information_cfportal_dict
+        elif self.type == 'form_of_organization_cfportal':
+            mapping_dict = form_of_organization_cfportal_dict
+        elif self.type == 'successions_cfportal':
+            mapping_dict = successions_cfportal_dict
+        elif self.type == 'control_relationships_cfportal':
+            mapping_dict = control_relationships_cfportal_dict
+        elif self.type == 'disclosure_answers_cfportal':
+            mapping_dict = disclosure_answers_cfportal_dict
+        elif self.type == 'non_securities_related_business_cfportal':
+            mapping_dict = non_securities_related_business_cfportal_dict
+        elif self.type == 'escrow_arrangements_cfportal':
+            mapping_dict = escrow_arrangements_cfportal_dict
+        elif self.type == 'execution_cfportal':
+            mapping_dict = execution_cfportal_dict
+        elif self.type == 'schedule_a_cfportal':
+            mapping_dict = schedule_a_cfportal_dict
+        elif self.type == 'schedule_b_cfportal':
+            mapping_dict = schedule_b_cfportal_dict
+        elif self.type == 'schedule_c_cfportal':
+            mapping_dict = schedule_c_cfportal_dict
+        elif self.type == 'schedule_d_cfportal':
+            mapping_dict = schedule_d_cfportal_dict
+        elif self.type == 'criminal_drip_info_cfportal':
+            mapping_dict = criminal_drip_info_cfportal_dict
+        elif self.type == 'regulatory_drip_info_cfportal':
+            mapping_dict = regulatory_drip_info_cfportal_dict
+        elif self.type == 'civil_judicial_drip_info_cfportal':
+            mapping_dict = civil_judicial_drip_info_cfportal_dict
+        elif self.type == 'bankruptcy_sipc_drip_info_cfportal':
+            mapping_dict = bankruptcy_sipc_drip_info_cfportal_dict
+        elif self.type == 'bond_drip_info_cfportal':
+            mapping_dict = bond_drip_info_cfportal_dict
+        elif self.type == 'judgement_drip_info_cfportal':
+            mapping_dict = judgement_drip_info_cfportal_dict
+
+        # SDR
+
+        # Information Table
+        elif self.type == 'information_table':
+            mapping_dict = information_table_dict
+
+        # NFMP
+        elif self.type == 'metadata_nmfp':
+            mapping_dict = metadata_nmfp_dict
+        elif self.type == 'general_information_nmfp':
+            mapping_dict = general_information_nmfp_dict
+        elif self.type == 'series_level_info_nmfp':
+            mapping_dict = series_level_info_nmfp_dict
+        elif self.type == 'class_level_info_nmfp':
+            mapping_dict = class_level_info_nmfp_dict
+        elif self.type == 'schedule_of_portfolio_securities_info_nmfp':
+            mapping_dict = schedule_of_portfolio_securities_info_nmfp_dict
+        elif self.type == 'signature_nmfp':
+            mapping_dict = signature_nmfp_dict
+
+        # NPX
+        elif self.type == 'npx':
+            mapping_dict = npx_dict
+
+        # 144
+        elif self.type == 'signatures_144':
+            mapping_dict = signatures_144_dict
+        elif self.type == 'securities_sold_in_past_3_months_144':
+            mapping_dict = securities_sold_in_past_3_months_144_dict
+        elif self.type == 'securities_to_be_sold_144':
+            mapping_dict = securities_to_be_sold_144_dict
+        elif self.type == 'securities_information_144':
+            mapping_dict = securities_information_144_dict
+        elif self.type == 'issuer_information_144':
+            mapping_dict = issuer_information_144_dict
+        elif self.type == 'metadata_144':
+            mapping_dict = metadata_144_dict
+
+        # Ownership
+        elif self.type == 'non_derivative_holding_ownership':
+            mapping_dict = non_derivative_holding_ownership_dict
+        elif self.type == 'non_derivative_transaction_ownership':
+            mapping_dict = non_derivative_transaction_ownership_dict
+        elif self.type == 'derivative_transaction_ownership':
+            mapping_dict = derivative_transaction_ownership_dict
+        elif self.type == 'derivative_holding_ownership':
+            mapping_dict = derivative_holding_ownership_dict
+        elif self.type == 'reporting_owner_ownership':
+            mapping_dict = reporting_owner_ownership_dict
+        elif self.type == 'metadata_ownership':
+            mapping_dict = metadata_ownership_dict
+        elif self.type == 'owner_signature_ownership':
+            mapping_dict = owner_signature_ownership_dict
+
+        # Proxy Voting Record
+        elif self.type == 'proxy_voting_record':
+            mapping_dict = proxy_voting_record_dict
+
+        # SBS
+
+        # SBSEF
+        elif self.type == 'sbsef':
+            mapping_dict = sbsef_dict
+
+        # Schedule 13
+        elif self.type == 'metadata_schedule_13':
+            mapping_dict = metadata_schedule_13_dict
+        elif self.type == 'cover_schedule_13':
+            mapping_dict = cover_schedule_13_dict
+        elif self.type == 'reporting_person_details_schedule_13':
+            mapping_dict = reporting_person_details_schedule_13_dict
+        elif self.type == 'item_1_schedule_13':
+            mapping_dict = item_1_schedule_13_dict
+        elif self.type == 'item_2_schedule_13':
+            mapping_dict = item_2_schedule_13_dict
+        elif self.type == 'item_3_schedule_13':
+            mapping_dict = item_3_schedule_13_dict
+        elif self.type == 'item_4_schedule_13':
+            mapping_dict = item_4_schedule_13_dict
+        elif self.type == 'item_5_schedule_13':
+            mapping_dict = item_5_schedule_13_dict
+        elif self.type == 'item_6_schedule_13':
+            mapping_dict = item_6_schedule_13_dict
+        elif self.type == 'item_7_schedule_13':
+            mapping_dict = item_7_schedule_13_dict
+        elif self.type == 'item_8_schedule_13':
+            mapping_dict = item_8_schedule_13_dict
+        elif self.type == 'item_9_schedule_13':
+            mapping_dict = item_9_schedule_13_dict
+        elif self.type == 'item_10_schedule_13':
+            mapping_dict = item_10_schedule_13_dict
+        elif self.type == 'signature_schedule_13':
+            mapping_dict = signature_schedule_13_dict
+
+        # SDR
+        elif self.type == 'sdr':
+            mapping_dict = sdr_dict
+
+        # TA
+
+        # 13F-HR
+        elif self.type == '13fhr':
+            mapping_dict = thirteenfhr_dict
+
+        # 25-NSE
+        elif self.type == '25nse':
+            mapping_dict = twentyfive_nse_dict
+
+        # 24F-2NT
+        elif self.type == 'metadata_24f_2nt':
+            mapping_dict = metadata_24f_2nt_dict
+        elif self.type == 'item_1_24f2nt':
+            mapping_dict = item_1_24f2nt_dict
+        elif self.type == 'item_2_24f2nt':
+            mapping_dict = item_2_24f2nt_dict
+        elif self.type == 'item_3_24f2nt':
+            mapping_dict = item_3_24f2nt_dict
+        elif self.type == 'item_4_24f2nt':
+            mapping_dict = item_4_24f2nt_dict
+        elif self.type == 'item_5_24f2nt':
+            mapping_dict = item_5_24f2nt_dict
+        elif self.type == 'item_6_24f2nt':
+            mapping_dict = item_6_24f2nt_dict
+        elif self.type == 'item_7_24f2nt':
+            mapping_dict = item_7_24f2nt_dict
+        elif self.type == 'item_8_24f2nt':
+            mapping_dict = item_8_24f2nt_dict
+        elif self.type == 'item_9_24f2nt':
+            mapping_dict = item_9_24f2nt_dict
+        elif self.type == 'signature_info_schedule_a':
+            mapping_dict = signature_24f2nt_dict
+
+        else:
+            mapping_dict = {}
+
+        # Update mapping dictionary to include accession at the beginning
+        # Create a new mapping with accession as the first key
+        new_mapping = {'accession': 'accession'}
+        # Add the rest of the mapping
+        new_mapping.update(mapping_dict)
+        mapping_dict = new_mapping
+
+        # apply the mapping to the data
+        for row in self.data:
+            ordered_row = {}
+            # First add all keys from the mapping dict in order
+            for old_key, new_key in mapping_dict.items():
+                if old_key in row:
+                    ordered_row[new_key] = row.pop(old_key)
+                else:
+                    # if the old key is not present, set the new key to None
+                    ordered_row[new_key] = None
+
+            # Then add any remaining keys that weren't in the mapping
+            for key, value in row.items():
+                ordered_row[key] = value
+
+            # Replace the original row with the ordered row
+            row.clear()
+            row.update(ordered_row)
+
+        self.determine_columns()
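For orientation, here is a minimal usage sketch of the new `Table` class, based only on the code above. The row keys and accession number are invented; the mapping dicts (e.g. `information_table_dict`) come from the wildcard `.mappings` imports.

```python
# Hypothetical usage sketch of the Table class added above; the row data
# and accession number are invented for illustration.
from datamule.document.table import Table

rows = [{'nameOfIssuer': 'EXAMPLE CORP', 'value': '1000'}]
table = Table(data=rows, type='information_table', accession='0000000000-24-000001')

# map_data() prepends an 'accession' key to every row, renames keys found in
# the matching mapping dict (mapped keys missing from a row become None), and
# appends any unmapped keys after the mapped ones.
table.map_data()
print(table.data[0])
```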
datamule/package_updater.py
ADDED
@@ -0,0 +1,31 @@
+
+from pathlib import Path
+import urllib.request
+import gzip
+import shutil
+import os
+
+class PackageUpdater():
+    def __init__(self):
+        pass
+
+    def update_package_data():
+        # Create data directory in user's home
+        data_dir = Path.home() / ".datamule"
+        data_dir.mkdir(exist_ok=True)
+
+        # Download data file
+        file_url = "https://github.com/john-friedman/datamule-data/raw/master/data/filer_metadata/listed_filer_metadata.csv.gz"
+        file_path = data_dir / "listed_filer_metadata.csv"
+        temp_gz_path = data_dir / "listed_filer_metadata.csv.gz"
+
+        if not file_path.exists():
+            print(f"Downloading data to {data_dir}")
+            urllib.request.urlretrieve(file_url, temp_gz_path)
+
+            with gzip.open(temp_gz_path, 'rb') as f_in:
+                with open(file_path, 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+
+            os.remove(temp_gz_path)
+            print(f"Data downloaded to {file_path}")
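One quirk worth flagging in the new `PackageUpdater`: `update_package_data` is defined without a `self` parameter, so it can only be invoked on the class itself, not on an instance:

```python
from datamule.package_updater import PackageUpdater

# Works: the function takes no parameters, so it must be called on the class.
PackageUpdater.update_package_data()

# Would raise TypeError, because the instance is passed as an unexpected
# positional argument to a zero-argument function:
# PackageUpdater().update_package_data()
```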
datamule/portfolio.py
CHANGED
@@ -119,7 +119,7 @@ class Portfolio:
         # First query, just set the accession numbers
         self.accession_numbers = new_accession_numbers
 
-    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,requests_per_second=5, **kwargs):
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=None,requests_per_second=5, **kwargs):
         if provider is None:
             config = Config()
             provider = config.get_default_source()
@@ -134,7 +134,8 @@ class Portfolio:
                 cik=cik,
                 submission_type=submission_type,
                 filing_date=filing_date,
-                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
+                keep_document_types=document_type
             )
         else:
             sec_download(
@@ -143,7 +144,8 @@ class Portfolio:
                 submission_type=submission_type,
                 filing_date=filing_date,
                 requests_per_second=requests_per_second,
-                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
+                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
+                keep_document_types=document_type
             )
 
         self.submissions_loaded = False
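The change above threads a new `document_type` argument through `Portfolio.download_submissions` to both providers as `keep_document_types`. A hedged sketch of the new call shape; the portfolio path, ticker, and form types are invented for illustration:

```python
from datamule import Portfolio

portfolio = Portfolio('my_filings')  # hypothetical portfolio directory

# document_type is forwarded as keep_document_types to the downloader, so
# only documents of the listed types are kept from each submission.
portfolio.download_submissions(
    ticker='TSLA',
    submission_type='10-K',
    document_type=['10-K'],
)
```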
datamule/sec/submissions/downloader.py
CHANGED
@@ -1,35 +1,19 @@
 import os
 import json
 from .streamer import stream
-from secsgml import parse_sgml_submission_into_memory
 import aiofiles
+from ...submission import Submission
 
-async def download_callback(hit, content, cik, accno, url, output_dir="filings"):
+async def download_callback(hit, content, cik, accno, url, output_dir="filings", keep_document_types=None):
     """Save downloaded SEC submission to disk."""
     try:
-        #
-
+        # Create a Submission object directly from the content
+        # Note: the content needs to be decoded from bytes to string for the parser
+        submission = Submission(sgml_content=content.decode('utf-8', errors='replace'),
+                                keep_document_types=keep_document_types)
 
-        #
-        file_dir =
-        os.makedirs(file_dir, exist_ok=True)
-
-        # Save metadata
-        metadata_path = os.path.join(file_dir, "metadata.json")
-        async with aiofiles.open(metadata_path, 'w') as f:
-            await f.write(json.dumps(metadata, indent=4))
-
-        # Save all documents
-        for idx, _ in enumerate(metadata['documents']):
-            try:
-                filename = metadata['documents'][idx]['filename']
-            except (KeyError, IndexError):
-                filename = f"{metadata['documents'][idx].get('sequence', idx)}.txt"
-
-            # Use async file writing
-            doc_path = os.path.join(file_dir, filename)
-            async with aiofiles.open(doc_path, 'wb') as f:
-                await f.write(documents[idx])
+        # Use the async save method to write the submission to disk
+        file_dir = await submission.save_async(output_dir=output_dir)
 
         return file_dir
     except Exception as e:
@@ -37,7 +21,8 @@ async def download_callback(hit, content, cik, accno, url, output_dir="filings")
         return None
 
 def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
-             requests_per_second=5, output_dir="filings", accession_numbers=None, quiet=False):
+             requests_per_second=5, output_dir="filings", accession_numbers=None,
+             quiet=False, keep_document_types=None):
     """
     Download SEC EDGAR filings and extract their documents.
 
@@ -51,27 +36,19 @@ def download(cik=None, submission_type=None, filing_date=None, location=None, na
     - output_dir: Directory to save documents
     - accession_numbers: Optional list of accession numbers to filter by
     - quiet: Whether to suppress progress output
+    - keep_document_types: Optional list of document types to keep (e.g. ['10-K', 'EX-10.1'])
 
     Returns:
     - List of all document paths processed
-
-    Examples:
-    # Download filings by CIK
-    download(cik="1318605", submission_type="10-K")
-
-    # Download filings by company name
-    download(name="Tesla", submission_type="10-K")
-
-    # Download filings with location filter
-    download(name="Apple", location="CA", submission_type="10-K")
     """
-
     # Make sure output directory exists
     os.makedirs(output_dir, exist_ok=True)
 
     # Create a wrapper for the download_callback that includes the output_dir
     async def callback_wrapper(hit, content, cik, accno, url):
-        return await download_callback(hit, content, cik, accno, url, output_dir=output_dir)
+        return await download_callback(hit, content, cik, accno, url,
+                                       output_dir=output_dir,
+                                       keep_document_types=keep_document_types)
 
     # Call the stream function with our callback
     return stream(
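End to end, `download` now passes `keep_document_types` through `callback_wrapper` into `download_callback`, where it reaches the `Submission` constructor, so filtered-out documents are never written to disk. A sketch of the new call shape, reusing the CIK from the removed docstring examples and the types from the new docstring line:

```python
from datamule.sec.submissions.downloader import download

# Illustrative values: the CIK comes from the old docstring examples, the
# document types from the new keep_document_types docstring line.
download(
    cik="1318605",
    submission_type="10-K",
    output_dir="filings",
    keep_document_types=["10-K", "EX-10.1"],
)
```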