PyPI - datamule - Versions diffs - 2.1.5__tar.gz → 2.2.0__tar.gz - Mend

datamule 2.1.5tar.gz → 2.2.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

{datamule-2.1.5 → datamule-2.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.1.5
+Version: 2.2.0
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -19,3 +19,4 @@ Requires-Dist: secxbrl
 Requires-Dist: secsgml
 Requires-Dist: websocket-client
 Requires-Dist: company_fundamentals
+Requires-Dist: flashtext

{datamule-2.1.5 → datamule-2.2.0}/datamule/document/document.py RENAMED Viewed

@@ -13,9 +13,133 @@ from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
 import tempfile
+import warnings
 from .tables.tables import Tables
+from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup
+class Tickers:
+    def __init__(self, document):
+        self.document = document
+        self._tickers_data = None
+    def _get_tickers_data(self):
+        """Get all tickers data once and cache it"""
+        if self._tickers_data is None:
+            # Check if document extension is supported
+            if self.document.extension not in ['.htm', '.html', '.txt']:
+                self._tickers_data = {}
+            else:
+                self._tickers_data = get_all_tickers(self.document.text)
+        return self._tickers_data
+    def __getattr__(self, exchange_name):
+        data = self._get_tickers_data()
+        if exchange_name in data:
+            return data[exchange_name]
+        return []
+    def __bool__(self):
+        """Return True if any tickers were found"""
+        data = self._get_tickers_data()
+        return bool(data.get('all', []))
+    def __repr__(self):
+        """Show the full ticker data when printed or accessed directly"""
+        data = self._get_tickers_data()
+        return str(data)
+    def __str__(self):
+        """Show the full ticker data when printed"""
+        data = self._get_tickers_data()
+        return str(data)
+class Tags:
+    def __init__(self, document):
+        from ..tags.config import _active_dictionaries,_loaded_dictionaries
+        self.not_supported = document.extension not in ['.htm', '.html', '.txt']
+        self.document = document
+        self._tickers = None
+        self.dictionaries = {}
+        # Load global dictionaries with their data
+        active_dicts = _active_dictionaries
+        for dict_name in active_dicts:
+            self.dictionaries[dict_name] = _loaded_dictionaries[dict_name]
+    def _check_support(self):
+        if self.not_supported:
+            warnings.warn(f"Document extension '{self.document.extension}' is not supported. Supported formats: .htm, .html, .txt")
+            return False
+        return True
+    @property
+    def cusips(self):
+        if not self._check_support():
+            return None
+        if not hasattr(self, '_cusip'):
+            if 'sc13dg_cusips' in self.dictionaries:
+                keywords = self.dictionaries['sc13dg_cusips']
+                self._cusip = get_cusip_using_regex(self.document.text, keywords)
+            else:
+                self._cusip = get_cusip_using_regex(self.document.text)
+        return self._cusip
+    @property
+    def isins(self):
+        if not self._check_support():
+            return None
+        if not hasattr(self, '_isin'):
+            if 'npx_isins' in self.dictionaries:
+                keywords = self.dictionaries['npx_isins']
+                self._isin = get_isin_using_regex(self.document.text, keywords)
+            else:
+                self._isin = get_isin_using_regex(self.document.text)
+        return self._isin
+    @property
+    def figis(self):
+        if not self._check_support():
+            return None
+        if not hasattr(self, '_figi'):
+            if 'npx_figis' in self.dictionaries:
+                keywords = self.dictionaries['npx_figis']
+                self._figi = get_figi_using_regex(self.document.text, keywords)
+            else:
+                self._figi = get_figi_using_regex(self.document.text)
+        return self._figi
+    @property
+    def tickers(self):
+        if self._tickers is None:
+            self._tickers = Tickers(self.document)
+        return self._tickers
+    @property
+    def persons(self):
+        if not self._check_support():
+            return None
+        if not hasattr(self, '_persons'):
+            if '8k_2024_persons' in self.dictionaries:
+                # Use FlashText dictionary lookup for 8K persons
+                self._persons = get_full_names_dictionary_lookup(self.document.text, self.dictionaries['8k_2024_persons'])
+            elif 'ssa_baby_first_names' in self.dictionaries:
+                # Use regex with SSA names for validation
+                self._persons = get_full_names(self.document.text, self.dictionaries['ssa_baby_first_names'])
+            else:
+                # Fallback to regex without validation
+                self._persons = get_full_names(self.document.text)
+        return self._persons
 class Document:
     def __init__(self, type, content, extension,accession,filing_date,path=None):
@@ -34,10 +158,13 @@ class Document:
             self.path = path
         self.extension = extension
         # this will be filled by parsed
         self._data = None
         self._tables = None
         self._text = None
+        self.tags = Tags(self)
@@ -119,93 +246,92 @@ class Document:
         if self.extension == '.txt':
             content = self.text
-            if self.type == '10-Q':
+            if self.type in ['10-Q', '10-Q/A']:
                 mapping_dict = dict_10q
-            elif self.type == '10-K':
+            elif self.type in ['10-K','10-K/A']:
                 mapping_dict = dict_10k
-            elif self.type == '8-K':
+            elif self.type in ['8-K', '8-K/A']:
                 mapping_dict = dict_8k
-            elif self.type == 'SC 13D':
+            elif self.type in ['SC 13D', 'SC 13D/A']:
                 mapping_dict = dict_13d
-            elif self.type == 'SC 13G':
+            elif self.type in ['SC 13G', 'SC 13G/A']:
                 mapping_dict = dict_13g
             self._data = {}
             self._data['document'] = dict2dict(txt2dict(content=content, mapping_dict=mapping_dict))
         elif self.extension in ['.htm', '.html']:
-            if self.type == '1-K':
+            if self.type in ['1-K', '1-K/A']:
                 mapping_dict = dict_1kpartii_html
-            elif self.type == '1-SA':
+            elif self.type in ['1-SA', '1-SA/A']:
                 mapping_dict = dict_1sa_html
-            elif self.type == '1-U':
+            elif self.type in ['1-U', '1-U/A']:
                 mapping_dict = dict_1u_html
-            elif self.type == '10-12B':
+            elif self.type in ['10-12B', '10-12B/A']:
                 mapping_dict = dict_1012b_html
-            elif self.type == '10-D':
+            elif self.type in ['10-D', '10-D/A']:
                 mapping_dict = dict_10d_html
-            elif self.type == '10-K':
+            elif self.type in ['10-K', '10-K/A']:
                 mapping_dict = dict_10k_html
-            elif self.type == '10-Q':
+            elif self.type in ['10-Q', '10-Q/A']:
                 mapping_dict = dict_10q_html
-            elif self.type == '20-F':
+            elif self.type in ['20-F', '20-F/A']:
                 mapping_dict = dict_20f_html
-            elif self.type == '8-A12B':
+            elif self.type in ['8-A12B', '8-A12B/A']:
                 mapping_dict = dict_8a12b_html
-            elif self.type == '8-A12G':
+            elif self.type in ['8-A12G', '8-A12G/A']:
                 mapping_dict = dict_8a12g_html
-            elif self.type == '8-K':
+            elif self.type in ['8-K', '8-K/A']:
                 mapping_dict = dict_8k_html
-            elif self.type == '8-K12B':
+            elif self.type in ['8-K12B', '8-K12B/A']:
                 mapping_dict = dict_8k12b_html
-            elif self.type == '8-K12G3':
+            elif self.type in ['8-K12G3', '8-K12G3/A']:
                 mapping_dict = dict_8k12g3_html
-            elif self.type == '8-K15D5':
+            elif self.type in ['8-K15D5', '8-K15D5/A']:
                 mapping_dict = dict_8k15d5_html
-            elif self.type == 'ABS-15G':
+            elif self.type in ['ABS-15G', 'ABS-15G/A']:
                 mapping_dict = dict_abs15g_html
-            elif self.type == 'ABS-EE':
+            elif self.type in ['ABS-EE', 'ABS-EE/A']:
                 mapping_dict = dict_absee_html
-            elif self.type == 'APP NTC':
-                dict_appntc_html
-            elif self.type == 'CB':
+            elif self.type in ['APP NTC', 'APP NTC/A']:
+                mapping_dict = dict_appntc_html
+            elif self.type in ['CB', 'CB/A']:
                 mapping_dict = dict_cb_html
-            elif self.type == 'DSTRBRPT':
+            elif self.type in ['DSTRBRPT', 'DSTRBRPT/A']:
                 mapping_dict = dict_dstrbrpt_html
-            elif self.type == 'N-18F1':
+            elif self.type in ['N-18F1', 'N-18F1/A']:
                 mapping_dict = dict_n18f1_html
-            elif self.type == 'N-CSRS':
+            elif self.type in ['N-CSRS', 'N-CSRS/A']:
                 mapping_dict = dict_ncsrs_html
-            elif self.type == 'NT-10K':
+            elif self.type in ['NT-10K', 'NT-10K/A']:
                 mapping_dict = dict_nt10k_html
-            elif self.type == 'NT-10Q':
+            elif self.type in ['NT-10Q', 'NT-10Q/A']:
                 mapping_dict = dict_nt10q_html
-            elif self.type == 'NT 20-F':
+            elif self.type in ['NT 20-F', 'NT 20-F/A']:
                 mapping_dict = dict_nt20f_html
-            elif self.type == 'NT-NCEN':
+            elif self.type in ['NT-NCEN', 'NT-NCEN/A']:
                 mapping_dict = dict_ntncen_html
-            elif self.type == 'NT-NCSR':
+            elif self.type in ['NT-NCSR', 'NT-NCSR/A']:
                 mapping_dict = dict_ntncsr_html
-            elif self.type == 'NTFNCEN':
+            elif self.type in ['NTFNCEN', 'NTFNCEN/A']:
                 mapping_dict = dict_ntfcen_html
-            elif self.type == 'NTFNCSR':
+            elif self.type in ['NTFNCSR', 'NTFNCSR/A']:
                 mapping_dict = dict_ntfncsr_html
-            elif self.type == 'EX-99.CERT':
+            elif self.type in ['EX-99.CERT', 'EX-99.CERT/A']:
                 mapping_dict = dict_ex99cert_html
-            elif self.type == 'SC 13E3':
+            elif self.type in ['SC 13E3', 'SC 13E3/A']:
                 mapping_dict = dict_sc13e3_html
-            elif self.type == 'SC 14D9':
+            elif self.type in ['SC 14D9', 'SC 14D9/A']:
                 mapping_dict = dict_sc14d9_html
-            elif self.type == 'SP 15D2':
+            elif self.type in ['SP 15D2', 'SP 15D2/A']:
                 mapping_dict = dict_sp15d2_html
-            elif self.type == 'SD':
+            elif self.type in ['SD', 'SD/A']:
                 mapping_dict = dict_sd_html
-            elif self.type == 'S-1':
+            elif self.type in ['S-1', 'S-1/A']:
                 mapping_dict = dict_s1_html
-            elif self.type == 'T-3':
+            elif self.type in ['T-3', 'T-3/A']:
                 mapping_dict = dict_t3_html
-            elif self.type in ['NT 10-K', 'NT 10-Q','NT 20-F']:
+            elif self.type in ['NT 10-K', 'NT 10-K/A', 'NT 10-Q', 'NT 10-Q/A', 'NT 20-F', 'NT 20-F/A']:
                 mapping_dict = dict_nt10k_html
             dct = html2dict(content=self.content, mapping_dict=mapping_dict)
@@ -233,7 +359,7 @@ class Document:
                 self._preprocess_html_content()
             elif self.extension == '.txt':
                 self._preprocess_txt_content()
-            return self._text
+        return self._text
     def write_json(self, output_filename=None):
         if not self.data:

datamule-2.2.0/datamule/sheet.py ADDED Viewed

@@ -0,0 +1,306 @@
+from pathlib import Path
+import csv
+import os
+from .helper import _process_cik_and_metadata_filters, load_package_dataset
+from .sec.xbrl.downloadcompanyfacts import download_company_facts
+from .datamule.datamule_lookup import datamule_lookup
+from .datamule.datamule_mysql_rds import query_mysql_rds
+from company_fundamentals.utils import get_fundamental_mappings
+from company_fundamentals import construct_fundamentals
+class Sheet:
+    def __init__(self, path):
+        self.path = Path(path)
+    # Keep
+    def get_submissions(self,cik=None, accession_number=None, submission_type=None, filing_date=None,
+                   columns=None, distinct=False, page_size=25000, quiet=False, api_key=None):
+        return datamule_lookup(cik, accession_number, submission_type, filing_date,
+                   columns, distinct, page_size, quiet, api_key)
+    def get_table(self,table,cik=None,ticker=None,**kwargs):
+        cik = _process_cik_and_metadata_filters(cik, ticker)
+        if table == 'fundamentals':
+            fundamentals = kwargs.pop('fundamentals', None)
+            if fundamentals is None:
+                raise ValueError("fundamentals parameter required for fundamentals table")
+            categories = kwargs.pop('categories',None)
+            mappings = get_fundamental_mappings(fundamentals=fundamentals)
+            #print(mappings)
+            taxonomies = [item[0] for item in mappings]
+            names = [item[1] for item in mappings]
+            xbrl = query_mysql_rds(table='simple_xbrl',cik=cik,taxonomy=taxonomies,name=names,**kwargs)
+            #print(xbrl)
+            return construct_fundamentals(xbrl, 'taxonomy', 'name', 'period_start_date', 'period_end_date', categories=categories,fundamentals=fundamentals)
+        else:
+            return query_mysql_rds(table=table,cik=cik,**kwargs)
+    def download_xbrl(
+        self,
+        cik=None,
+        ticker=None,
+        **kwargs
+    ):
+        # If no CIK or ticker specified, get all companies with tickers
+        if cik is None and ticker is None:
+            cik = [row['cik'] for row in load_package_dataset('company_tickers')]
+        # Normalize cik to list format
+        if isinstance(cik, (str, int)):
+            cik = [cik]
+        # Process CIK and metadata filters
+        cik_list = _process_cik_and_metadata_filters(cik, ticker, **kwargs)
+        # Download facts for all CIKs in parallel
+        download_company_facts(cik=cik_list, output_dir=self.path)
+    def get_information_table(
+        self,
+        # Optional filtering parameters
+        columns=None,
+        name_of_issuer=None,
+        title_of_class=None,
+        cusip=None,
+        value=None,
+        ssh_prnamt=None,
+        ssh_prnamt_type=None,
+        investment_discretion=None,
+        voting_authority_sole=None,
+        voting_authority_shared=None,
+        voting_authority_none=None,
+        reporting_owner_cik=None,
+        put_call=None,
+        other_manager=None,
+        figi=None,
+        accession=None,
+        filing_date=None,
+        # API key handling
+        api_key=None,
+        # Additional options
+        print_cost=True,
+        verbose=False
+    ):
+        """
+        Query the SEC BigQuery API for 13F-HR information table data.
+        Parameters:
+        -----------
+        columns : List[str], optional
+            Specific columns to return. If None, all columns are returned.
+        # Filter parameters
+        name_of_issuer, title_of_class, etc. : Various filters that can be:
+            - str: Exact match
+            - List[str]: Match any in list
+            - tuple: (min, max) range for numeric/date fields
+        api_key : str, optional
+            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
+        print_cost : bool
+            Whether to print the query cost information
+        verbose : bool
+            Whether to print additional information about the query
+        Returns:
+        --------
+        List[Dict]
+            A list of dictionaries containing the query results
+        Raises:
+        -------
+        ValueError
+            If API key is missing or invalid
+        Exception
+            For API errors or other issues
+        """
+        return get_information_table(
+            columns=columns,
+            name_of_issuer=name_of_issuer,
+            title_of_class=title_of_class,
+            cusip=cusip,
+            value=value,
+            ssh_prnamt=ssh_prnamt,
+            ssh_prnamt_type=ssh_prnamt_type,
+            investment_discretion=investment_discretion,
+            voting_authority_sole=voting_authority_sole,
+            voting_authority_shared=voting_authority_shared,
+            voting_authority_none=voting_authority_none,
+            reporting_owner_cik=reporting_owner_cik,
+            put_call=put_call,
+            other_manager=other_manager,
+            figi=figi,
+            accession=accession,
+            filing_date=filing_date,
+            # API key handling
+            api_key=api_key,
+            # Additional options
+            print_cost=print_cost,
+            verbose=verbose
+        )
+    def get_345(
+        self,
+        # Optional filtering parameters
+        columns=None,
+        is_derivative=None,
+        is_non_derivative=None,
+        security_title=None,
+        transaction_date=None,
+        document_type=None,
+        transaction_code=None,
+        equity_swap_involved=None,
+        transaction_timeliness=None,
+        transaction_shares=None,
+        transaction_price_per_share=None,
+        shares_owned_following_transaction=None,
+        ownership_type=None,
+        deemed_execution_date=None,
+        conversion_or_exercise_price=None,
+        exercise_date=None,
+        expiration_date=None,
+        underlying_security_title=None,
+        underlying_security_shares=None,
+        underlying_security_value=None,
+        accession=None,
+        reporting_owner_cik=None,
+        issuer_cik=None,
+        filing_date=None,
+        # API key handling
+        api_key=None,
+        # Additional options
+        print_cost=True,
+        verbose=False
+    ):
+        """
+        Query the SEC BigQuery API for Form 345 insider transaction data.
+        Parameters:
+        -----------
+        columns : List[str], optional
+            Specific columns to return. If None, all columns are returned.
+        # Filter parameters
+        is_derivative, security_title, etc. : Various filters that can be:
+            - str/bool: Exact match
+            - List[str]: Match any in list
+            - tuple: (min, max) range for numeric/date fields
+        reporting_owner_cik : str or List[str]
+            CIK(s) of the reporting insider(s). This is matched against an array in BigQuery.
+            Any match within the array will return the record.
+        issuer_cik : str or List[str]
+            CIK(s) of the company/companies
+        api_key : str, optional
+            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
+        print_cost : bool
+            Whether to print the query cost information
+        verbose : bool
+            Whether to print additional information about the query
+        Returns:
+        --------
+        List[Dict]
+            A list of dictionaries containing the query results
+        Raises:
+        -------
+        ValueError
+            If API key is missing or invalid
+        Exception
+            For API errors or other issues
+        """
+        return get_345(
+            columns=columns,
+            is_derivative=is_derivative,
+            is_non_derivative=is_non_derivative,
+            security_title=security_title,
+            transaction_date=transaction_date,
+            document_type=document_type,
+            transaction_code=transaction_code,
+            equity_swap_involved=equity_swap_involved,
+            transaction_timeliness=transaction_timeliness,
+            transaction_shares=transaction_shares,
+            transaction_price_per_share=transaction_price_per_share,
+            shares_owned_following_transaction=shares_owned_following_transaction,
+            ownership_type=ownership_type,
+            deemed_execution_date=deemed_execution_date,
+            conversion_or_exercise_price=conversion_or_exercise_price,
+            exercise_date=exercise_date,
+            expiration_date=expiration_date,
+            underlying_security_title=underlying_security_title,
+            underlying_security_shares=underlying_security_shares,
+            underlying_security_value=underlying_security_value,
+            accession=accession,
+            reporting_owner_cik=reporting_owner_cik,
+            issuer_cik=issuer_cik,
+            filing_date=filing_date,
+            # API key handling
+            api_key=api_key,
+            # Additional options
+            print_cost=print_cost,
+            verbose=verbose
+        )
+    def _download_to_csv(self, data, filepath, verbose=False):
+        """
+        Helper method to download data to a CSV file.
+        Parameters:
+        -----------
+        data : List[Dict]
+            The data to save
+        filepath : str or Path
+            Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
+        verbose : bool
+            Whether to print additional information
+        Returns:
+        --------
+        List[Dict]
+            The input data (for method chaining)
+        """
+        # If no data returned, nothing to save
+        if not data:
+            if verbose:
+                print("No data returned from API. No file was created.")
+            return data
+        # Resolve filepath - if it's not absolute, make it relative to self.path
+        filepath_obj = Path(filepath)
+        if not filepath_obj.is_absolute():
+            filepath_obj = self.path / filepath_obj
+        # Create directory if it doesn't exist
+        os.makedirs(filepath_obj.parent, exist_ok=True)
+        # Get fieldnames from the first record
+        fieldnames = data[0].keys()
+        # Write to CSV
+        with open(filepath_obj, 'w', newline='') as csvfile:
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(data)
+        if verbose:
+            print(f"Saved {len(data)} records to {filepath_obj}")

datamule-2.2.0/datamule/tags/config.py ADDED Viewed

@@ -0,0 +1,16 @@
+from ..utils.dictionaries import download_dictionary, load_dictionary
+_active_dictionaries = []
+_loaded_dictionaries = {}
+def set_dictionaries(dictionaries, overwrite=False):
+    """Set active dictionaries and load them into memory"""
+    global _active_dictionaries, _loaded_dictionaries
+    _active_dictionaries = dictionaries
+    _loaded_dictionaries = {}
+    for dict_name in dictionaries:
+        # Download if needed
+        download_dictionary(dict_name, overwrite=overwrite)
+        # Load into memory
+        _loaded_dictionaries[dict_name] = load_dictionary(dict_name)

datamule 2.1.5__tar.gz → 2.2.0__tar.gz

datamule 2.1.5tar.gz → 2.2.0tar.gz