datamule 2.1.5__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/document/document.py +171 -45
- datamule/sheet.py +1 -401
- datamule/tags/__init__.py +0 -0
- datamule/tags/config.py +16 -0
- datamule/tags/regex.py +105 -0
- datamule/tags/utils.py +149 -0
- datamule/utils/dictionaries.py +76 -0
- {datamule-2.1.5.dist-info → datamule-2.2.0.dist-info}/METADATA +2 -1
- {datamule-2.1.5.dist-info → datamule-2.2.0.dist-info}/RECORD +11 -6
- {datamule-2.1.5.dist-info → datamule-2.2.0.dist-info}/WHEEL +0 -0
- {datamule-2.1.5.dist-info → datamule-2.2.0.dist-info}/top_level.txt +0 -0
datamule/document/document.py
CHANGED
@@ -13,9 +13,133 @@ from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
 import tempfile
-
+import warnings
 from .tables.tables import Tables
 
+from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup
+
+
+class Tickers:
+    def __init__(self, document):
+        self.document = document
+        self._tickers_data = None
+
+    def _get_tickers_data(self):
+        """Get all tickers data once and cache it"""
+        if self._tickers_data is None:
+            # Check if document extension is supported
+            if self.document.extension not in ['.htm', '.html', '.txt']:
+                self._tickers_data = {}
+            else:
+                self._tickers_data = get_all_tickers(self.document.text)
+        return self._tickers_data
+
+    def __getattr__(self, exchange_name):
+        data = self._get_tickers_data()
+
+        if exchange_name in data:
+            return data[exchange_name]
+
+        return []
+
+    def __bool__(self):
+        """Return True if any tickers were found"""
+        data = self._get_tickers_data()
+        return bool(data.get('all', []))
+
+    def __repr__(self):
+        """Show the full ticker data when printed or accessed directly"""
+        data = self._get_tickers_data()
+        return str(data)
+
+    def __str__(self):
+        """Show the full ticker data when printed"""
+        data = self._get_tickers_data()
+        return str(data)
+
+class Tags:
+    def __init__(self, document):
+        from ..tags.config import _active_dictionaries,_loaded_dictionaries
+        self.not_supported = document.extension not in ['.htm', '.html', '.txt']
+        self.document = document
+        self._tickers = None
+        self.dictionaries = {}
+
+        # Load global dictionaries with their data
+        active_dicts = _active_dictionaries
+        for dict_name in active_dicts:
+            self.dictionaries[dict_name] = _loaded_dictionaries[dict_name]
+
+
+    def _check_support(self):
+        if self.not_supported:
+            warnings.warn(f"Document extension '{self.document.extension}' is not supported. Supported formats: .htm, .html, .txt")
+            return False
+        return True
+
+    @property
+    def cusips(self):
+        if not self._check_support():
+            return None
+
+        if not hasattr(self, '_cusip'):
+            if 'sc13dg_cusips' in self.dictionaries:
+                keywords = self.dictionaries['sc13dg_cusips']
+                self._cusip = get_cusip_using_regex(self.document.text, keywords)
+            else:
+                self._cusip = get_cusip_using_regex(self.document.text)
+        return self._cusip
+
+    @property
+    def isins(self):
+        if not self._check_support():
+            return None
+
+        if not hasattr(self, '_isin'):
+            if 'npx_isins' in self.dictionaries:
+                keywords = self.dictionaries['npx_isins']
+                self._isin = get_isin_using_regex(self.document.text, keywords)
+            else:
+                self._isin = get_isin_using_regex(self.document.text)
+        return self._isin
+
+    @property
+    def figis(self):
+        if not self._check_support():
+            return None
+
+        if not hasattr(self, '_figi'):
+            if 'npx_figis' in self.dictionaries:
+                keywords = self.dictionaries['npx_figis']
+                self._figi = get_figi_using_regex(self.document.text, keywords)
+            else:
+                self._figi = get_figi_using_regex(self.document.text)
+        return self._figi
+
+    @property
+    def tickers(self):
+        if self._tickers is None:
+            self._tickers = Tickers(self.document)
+        return self._tickers
+
+    @property
+    def persons(self):
+        if not self._check_support():
+            return None
+
+        if not hasattr(self, '_persons'):
+            if '8k_2024_persons' in self.dictionaries:
+                # Use FlashText dictionary lookup for 8K persons
+                self._persons = get_full_names_dictionary_lookup(self.document.text, self.dictionaries['8k_2024_persons'])
+            elif 'ssa_baby_first_names' in self.dictionaries:
+                # Use regex with SSA names for validation
+                self._persons = get_full_names(self.document.text, self.dictionaries['ssa_baby_first_names'])
+            else:
+                # Fallback to regex without validation
+                self._persons = get_full_names(self.document.text)
+        return self._persons
+
+
 class Document:
     def __init__(self, type, content, extension,accession,filing_date,path=None):
 
@@ -34,10 +158,13 @@ class Document:
         self.path = path
 
         self.extension = extension
+
         # this will be filled by parsed
         self._data = None
         self._tables = None
         self._text = None
+
+        self.tags = Tags(self)
 
 
 
@@ -119,93 +246,92 @@ class Document:
 
         if self.extension == '.txt':
             content = self.text
-            if self.type
+            if self.type in ['10-Q', '10-Q/A']:
                 mapping_dict = dict_10q
-            elif self.type
+            elif self.type in ['10-K','10-K/A']:
                 mapping_dict = dict_10k
-            elif self.type
+            elif self.type in ['8-K', '8-K/A']:
                 mapping_dict = dict_8k
-            elif self.type
+            elif self.type in ['SC 13D', 'SC 13D/A']:
                 mapping_dict = dict_13d
-            elif self.type
+            elif self.type in ['SC 13G', 'SC 13G/A']:
                 mapping_dict = dict_13g
 
            self._data = {}
            self._data['document'] = dict2dict(txt2dict(content=content, mapping_dict=mapping_dict))
        elif self.extension in ['.htm', '.html']:
 
-            if self.type
+            if self.type in ['1-K', '1-K/A']:
                 mapping_dict = dict_1kpartii_html
-            elif self.type
+            elif self.type in ['1-SA', '1-SA/A']:
                 mapping_dict = dict_1sa_html
-            elif self.type
+            elif self.type in ['1-U', '1-U/A']:
                 mapping_dict = dict_1u_html
-            elif self.type
+            elif self.type in ['10-12B', '10-12B/A']:
                 mapping_dict = dict_1012b_html
-            elif self.type
+            elif self.type in ['10-D', '10-D/A']:
                 mapping_dict = dict_10d_html
-            elif self.type
+            elif self.type in ['10-K', '10-K/A']:
                 mapping_dict = dict_10k_html
-            elif self.type
+            elif self.type in ['10-Q', '10-Q/A']:
                 mapping_dict = dict_10q_html
-            elif self.type
+            elif self.type in ['20-F', '20-F/A']:
                 mapping_dict = dict_20f_html
-            elif self.type
+            elif self.type in ['8-A12B', '8-A12B/A']:
                 mapping_dict = dict_8a12b_html
-            elif self.type
+            elif self.type in ['8-A12G', '8-A12G/A']:
                 mapping_dict = dict_8a12g_html
-            elif self.type
+            elif self.type in ['8-K', '8-K/A']:
                 mapping_dict = dict_8k_html
-            elif self.type
+            elif self.type in ['8-K12B', '8-K12B/A']:
                 mapping_dict = dict_8k12b_html
-            elif self.type
+            elif self.type in ['8-K12G3', '8-K12G3/A']:
                 mapping_dict = dict_8k12g3_html
-            elif self.type
+            elif self.type in ['8-K15D5', '8-K15D5/A']:
                 mapping_dict = dict_8k15d5_html
-            elif self.type
+            elif self.type in ['ABS-15G', 'ABS-15G/A']:
                 mapping_dict = dict_abs15g_html
-            elif self.type
+            elif self.type in ['ABS-EE', 'ABS-EE/A']:
                 mapping_dict = dict_absee_html
-            elif self.type
-                dict_appntc_html
-            elif self.type
+            elif self.type in ['APP NTC', 'APP NTC/A']:
+                mapping_dict = dict_appntc_html
+            elif self.type in ['CB', 'CB/A']:
                 mapping_dict = dict_cb_html
-            elif self.type
+            elif self.type in ['DSTRBRPT', 'DSTRBRPT/A']:
                 mapping_dict = dict_dstrbrpt_html
-            elif self.type
+            elif self.type in ['N-18F1', 'N-18F1/A']:
                 mapping_dict = dict_n18f1_html
-            elif self.type
+            elif self.type in ['N-CSRS', 'N-CSRS/A']:
                 mapping_dict = dict_ncsrs_html
-            elif self.type
+            elif self.type in ['NT-10K', 'NT-10K/A']:
                 mapping_dict = dict_nt10k_html
-            elif self.type
+            elif self.type in ['NT-10Q', 'NT-10Q/A']:
                 mapping_dict = dict_nt10q_html
-            elif self.type
+            elif self.type in ['NT 20-F', 'NT 20-F/A']:
                 mapping_dict = dict_nt20f_html
-            elif self.type
+            elif self.type in ['NT-NCEN', 'NT-NCEN/A']:
                 mapping_dict = dict_ntncen_html
-            elif self.type
+            elif self.type in ['NT-NCSR', 'NT-NCSR/A']:
                 mapping_dict = dict_ntncsr_html
-            elif self.type
+            elif self.type in ['NTFNCEN', 'NTFNCEN/A']:
                 mapping_dict = dict_ntfcen_html
-            elif self.type
+            elif self.type in ['NTFNCSR', 'NTFNCSR/A']:
                 mapping_dict = dict_ntfncsr_html
-            elif self.type
+            elif self.type in ['EX-99.CERT', 'EX-99.CERT/A']:
                 mapping_dict = dict_ex99cert_html
-            elif self.type
+            elif self.type in ['SC 13E3', 'SC 13E3/A']:
                 mapping_dict = dict_sc13e3_html
-            elif self.type
+            elif self.type in ['SC 14D9', 'SC 14D9/A']:
                 mapping_dict = dict_sc14d9_html
-            elif self.type
+            elif self.type in ['SP 15D2', 'SP 15D2/A']:
                 mapping_dict = dict_sp15d2_html
-
-            elif self.type == 'SD':
+            elif self.type in ['SD', 'SD/A']:
                 mapping_dict = dict_sd_html
-            elif self.type
+            elif self.type in ['S-1', 'S-1/A']:
                 mapping_dict = dict_s1_html
-            elif self.type
+            elif self.type in ['T-3', 'T-3/A']:
                 mapping_dict = dict_t3_html
-            elif self.type in ['NT 10-K', 'NT 10-Q','NT 20-F']:
+            elif self.type in ['NT 10-K', 'NT 10-K/A', 'NT 10-Q', 'NT 10-Q/A', 'NT 20-F', 'NT 20-F/A']:
                 mapping_dict = dict_nt10k_html
 
            dct = html2dict(content=self.content, mapping_dict=mapping_dict)
@@ -233,7 +359,7 @@ class Document:
            self._preprocess_html_content()
        elif self.extension == '.txt':
            self._preprocess_txt_content()
-
+        return self._text
 
     def write_json(self, output_filename=None):
         if not self.data:
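Note: the diff above adds a tagging surface to every Document as `doc.tags`. A minimal usage sketch, assuming a hand-built Document (real code would normally obtain documents from a Portfolio or Submission; the content and accession values here are placeholders, and the content type must match what the text preprocessor expects):

    from datamule.document.document import Document

    # Hypothetical standalone document; all constructor values are placeholders.
    doc = Document(type='8-K', content='<html>...</html>', extension='.htm',
                   accession='0000000000-24-000000', filing_date='2024-01-02')

    print(doc.tags.cusips)           # [(cusip, start, end), ...], or None if unsupported
    print(doc.tags.tickers.nasdaq)   # per-exchange lists via Tickers.__getattr__
    print(bool(doc.tags.tickers))    # True if any exchange pattern matched

Results come from regex scans of `doc.text`, so they depend on how the document content is preprocessed.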
datamule/sheet.py
CHANGED
@@ -7,9 +7,6 @@ from .datamule.datamule_lookup import datamule_lookup
 from .datamule.datamule_mysql_rds import query_mysql_rds
 from company_fundamentals.utils import get_fundamental_mappings
 from company_fundamentals import construct_fundamentals
-# slated for deprecation?
-from .seclibrary.bq import get_information_table, get_345, get_proxy_voting_record
-
 class Sheet:
     def __init__(self, path):
         self.path = Path(path)
@@ -306,401 +303,4 @@ class Sheet:
         if verbose:
             print(f"Saved {len(data)} records to {filepath_obj}")
 
-
-    def download_information_table(
-        self,
-        filepath,
-        # Optional filtering parameters
-        columns=None,
-        name_of_issuer=None,
-        title_of_class=None,
-        cusip=None,
-        value=None,
-        ssh_prnamt=None,
-        ssh_prnamt_type=None,
-        investment_discretion=None,
-        voting_authority_sole=None,
-        voting_authority_shared=None,
-        voting_authority_none=None,
-        reporting_owner_cik=None,
-        put_call=None,
-        other_manager=None,
-        figi=None,
-        accession=None,
-        filing_date=None,
-
-        # API key handling
-        api_key=None,
-
-        # Additional options
-        print_cost=True,
-        verbose=False
-    ):
-        """
-        Query the SEC BigQuery API for 13F-HR information table data and save to CSV.
-
-        Parameters:
-        -----------
-        filepath : str
-            Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
-
-        columns : List[str], optional
-            Specific columns to return. If None, all columns are returned.
-
-        # Filter parameters
-        name_of_issuer, title_of_class, etc. : Various filters that can be:
-            - str: Exact match
-            - List[str]: Match any in list
-            - tuple: (min, max) range for numeric/date fields
-
-        api_key : str, optional
-            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
-        print_cost : bool
-            Whether to print the query cost information
-        verbose : bool
-            Whether to print additional information about the query
-
-        Returns:
-        --------
-        List[Dict]
-            A list of dictionaries containing the query results
-
-        Raises:
-        -------
-        ValueError
-            If API key is missing or invalid
-        Exception
-            For API errors or other issues
-        """
-        # Get the data from the API
-        data = self.get_information_table(
-            columns=columns,
-            name_of_issuer=name_of_issuer,
-            title_of_class=title_of_class,
-            cusip=cusip,
-            value=value,
-            ssh_prnamt=ssh_prnamt,
-            ssh_prnamt_type=ssh_prnamt_type,
-            investment_discretion=investment_discretion,
-            voting_authority_sole=voting_authority_sole,
-            voting_authority_shared=voting_authority_shared,
-            voting_authority_none=voting_authority_none,
-            reporting_owner_cik=reporting_owner_cik,
-            put_call=put_call,
-            other_manager=other_manager,
-            figi=figi,
-            accession=accession,
-            filing_date=filing_date,
-            api_key=api_key,
-            print_cost=print_cost,
-            verbose=verbose
-        )
-
-        # Save to CSV using the helper method
-        return self._download_to_csv(data, filepath, verbose)
-
-    def download_345(
-        self,
-        filepath,
-        # Optional filtering parameters
-        columns=None,
-        is_derivative=None,
-        is_non_derivative=None,
-        security_title=None,
-        transaction_date=None,
-        document_type=None,
-        transaction_code=None,
-        equity_swap_involved=None,
-        transaction_timeliness=None,
-        transaction_shares=None,
-        transaction_price_per_share=None,
-        shares_owned_following_transaction=None,
-        ownership_type=None,
-        deemed_execution_date=None,
-        conversion_or_exercise_price=None,
-        exercise_date=None,
-        expiration_date=None,
-        underlying_security_title=None,
-        underlying_security_shares=None,
-        underlying_security_value=None,
-        accession=None,
-        reporting_owner_cik=None,
-        issuer_cik=None,
-        filing_date=None,
-
-        # API key handling
-        api_key=None,
-
-        # Additional options
-        print_cost=True,
-        verbose=False
-    ):
-        """
-        Query the SEC BigQuery API for Form 345 insider transaction data and save to CSV.
-
-        Parameters:
-        -----------
-        filepath : str
-            Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
-
-        columns : List[str], optional
-            Specific columns to return. If None, all columns are returned.
-
-        # Filter parameters
-        is_derivative, security_title, etc. : Various filters that can be:
-            - str/bool: Exact match
-            - List[str]: Match any in list
-            - tuple: (min, max) range for numeric/date fields
-
-        reporting_owner_cik : str or List[str]
-            CIK(s) of the reporting insider(s). This is matched against an array in BigQuery.
-            Any match within the array will return the record.
-
-        issuer_cik : str or List[str]
-            CIK(s) of the company/companies
-
-        api_key : str, optional
-            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
-        print_cost : bool
-            Whether to print the query cost information
-        verbose : bool
-            Whether to print additional information about the query
-
-        Returns:
-        --------
-        List[Dict]
-            A list of dictionaries containing the query results
-
-        Raises:
-        -------
-        ValueError
-            If API key is missing or invalid
-        Exception
-            For API errors or other issues
-        """
-        # Get the data from the API
-        data = self.get_345(
-            columns=columns,
-            is_derivative=is_derivative,
-            is_non_derivative=is_non_derivative,
-            security_title=security_title,
-            transaction_date=transaction_date,
-            document_type=document_type,
-            transaction_code=transaction_code,
-            equity_swap_involved=equity_swap_involved,
-            transaction_timeliness=transaction_timeliness,
-            transaction_shares=transaction_shares,
-            transaction_price_per_share=transaction_price_per_share,
-            shares_owned_following_transaction=shares_owned_following_transaction,
-            ownership_type=ownership_type,
-            deemed_execution_date=deemed_execution_date,
-            conversion_or_exercise_price=conversion_or_exercise_price,
-            exercise_date=exercise_date,
-            expiration_date=expiration_date,
-            underlying_security_title=underlying_security_title,
-            underlying_security_shares=underlying_security_shares,
-            underlying_security_value=underlying_security_value,
-            accession=accession,
-            reporting_owner_cik=reporting_owner_cik,
-            issuer_cik=issuer_cik,
-            filing_date=filing_date,
-            api_key=api_key,
-            print_cost=print_cost,
-            verbose=verbose
-        )
-
-        # Save to CSV using the helper method
-        return self._download_to_csv(data, filepath, verbose)
-
-    def get_proxy_voting_record(
-        self,
-        # Optional filtering parameters
-        columns=None,
-        meeting_date=None,
-        isin=None,
-        cusip=None,
-        issuer_name=None,
-        vote_description=None,
-        shares_on_loan=None,
-        shares_voted=None,
-        vote_category=None,
-        vote_record=None,
-        vote_source=None,
-        how_voted=None,
-        figi=None,
-        management_recommendation=None,
-        accession=None,
-        reporting_owner_cik=None,
-        filing_date=None,
-
-        # API key handling
-        api_key=None,
-
-        # Additional options
-        print_cost=True,
-        verbose=False
-    ):
-        """
-        Query the SEC BigQuery API for NPX proxy voting record data.
-
-        Parameters:
-        -----------
-        columns : List[str], optional
-            Specific columns to return. If None, all columns are returned.
-
-        # Filter parameters
-        meeting_date, isin, cusip, etc. : Various filters that can be:
-            - str: Exact match
-            - List[str]: Match any in list
-            - tuple: (min, max) range for numeric/date fields
-
-        shares_on_loan, shares_voted : int/float or tuple
-            Numeric values or (min, max) range
-
-        filing_date : str or tuple
-            Date string in 'YYYY-MM-DD' format or (start_date, end_date) tuple
-
-        api_key : str, optional
-            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
-        print_cost : bool
-            Whether to print the query cost information
-        verbose : bool
-            Whether to print additional information about the query
-
-        Returns:
-        --------
-        List[Dict]
-            A list of dictionaries containing the query results
-
-        Raises:
-        -------
-        ValueError
-            If API key is missing or invalid
-        Exception
-            For API errors or other issues
-        """
-
-        return get_proxy_voting_record(
-            columns=columns,
-            meeting_date=meeting_date,
-            isin=isin,
-            cusip=cusip,
-            issuer_name=issuer_name,
-            vote_description=vote_description,
-            shares_on_loan=shares_on_loan,
-            shares_voted=shares_voted,
-            vote_category=vote_category,
-            vote_record=vote_record,
-            vote_source=vote_source,
-            how_voted=how_voted,
-            figi=figi,
-            management_recommendation=management_recommendation,
-            accession=accession,
-            reporting_owner_cik=reporting_owner_cik,
-            filing_date=filing_date,
-
-            # API key handling
-            api_key=api_key,
-
-            # Additional options
-            print_cost=print_cost,
-            verbose=verbose
-        )
-
-    def download_proxy_voting_record(
-        self,
-        filepath,
-        # Optional filtering parameters
-        columns=None,
-        meeting_date=None,
-        isin=None,
-        cusip=None,
-        issuer_name=None,
-        vote_description=None,
-        shares_on_loan=None,
-        shares_voted=None,
-        vote_category=None,
-        vote_record=None,
-        vote_source=None,
-        how_voted=None,
-        figi=None,
-        management_recommendation=None,
-        accession=None,
-        reporting_owner_cik=None,
-        filing_date=None,
-
-        # API key handling
-        api_key=None,
-
-        # Additional options
-        print_cost=True,
-        verbose=False
-    ):
-        """
-        Query the SEC BigQuery API for NPX proxy voting record data and save to CSV.
-
-        Parameters:
-        -----------
-        filepath : str
-            Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
-
-        columns : List[str], optional
-            Specific columns to return. If None, all columns are returned.
-
-        # Filter parameters
-        meeting_date, isin, cusip, etc. : Various filters that can be:
-            - str: Exact match
-            - List[str]: Match any in list
-            - tuple: (min, max) range for numeric/date fields
-
-        shares_on_loan, shares_voted : int/float or tuple
-            Numeric values or (min, max) range
-
-        filing_date : str or tuple
-            Date string in 'YYYY-MM-DD' format or (start_date, end_date) tuple
-
-        api_key : str, optional
-            SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
-        print_cost : bool
-            Whether to print the query cost information
-        verbose : bool
-            Whether to print additional information about the query
-
-        Returns:
-        --------
-        List[Dict]
-            A list of dictionaries containing the query results
-
-        Raises:
-        -------
-        ValueError
-            If API key is missing or invalid
-        Exception
-            For API errors or other issues
-        """
-        # Get the data from the API
-        data = self.get_proxy_voting_record(
-            columns=columns,
-            meeting_date=meeting_date,
-            isin=isin,
-            cusip=cusip,
-            issuer_name=issuer_name,
-            vote_description=vote_description,
-            shares_on_loan=shares_on_loan,
-            shares_voted=shares_voted,
-            vote_category=vote_category,
-            vote_record=vote_record,
-            vote_source=vote_source,
-            how_voted=how_voted,
-            figi=figi,
-            management_recommendation=management_recommendation,
-            accession=accession,
-            reporting_owner_cik=reporting_owner_cik,
-            filing_date=filing_date,
-            api_key=api_key,
-            print_cost=print_cost,
-            verbose=verbose
-        )
-
-        # Save to CSV using the helper method
-        return self._download_to_csv(data, filepath, verbose)
+
datamule/tags/__init__.py
File without changes
datamule/tags/config.py
ADDED
@@ -0,0 +1,16 @@
+from ..utils.dictionaries import download_dictionary, load_dictionary
+
+_active_dictionaries = []
+_loaded_dictionaries = {}
+
+def set_dictionaries(dictionaries, overwrite=False):
+    """Set active dictionaries and load them into memory"""
+    global _active_dictionaries, _loaded_dictionaries
+    _active_dictionaries = dictionaries
+    _loaded_dictionaries = {}
+
+    for dict_name in dictionaries:
+        # Download if needed
+        download_dictionary(dict_name, overwrite=overwrite)
+        # Load into memory
+        _loaded_dictionaries[dict_name] = load_dictionary(dict_name)
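Usage sketch: `set_dictionaries` is the module's entry point, and the names must be keys of the `urls` mapping in datamule/utils/dictionaries.py (added below). Tags objects created afterwards pick up the loaded data:

    from datamule.tags.config import set_dictionaries

    # Downloads any missing dictionary files, then loads each into the
    # module-level _loaded_dictionaries registry.
    set_dictionaries(['ssa_baby_first_names', 'sc13dg_cusips'])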
datamule/tags/regex.py
ADDED
@@ -0,0 +1,105 @@
+# Exchange ticker regexes with word boundaries
+nyse_regex = r"\b([A-Z]{1,4})(\.[A-Z]+)?\b"
+nasdaq_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+nyse_american_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+london_stock_exchange_regex = r"\b([A-Z]{3,4})(\.[A-Z]+)?\b"
+toronto_stock_exchange_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+euronext_paris_regex = r"\b([A-Z]{2,12})(\.[A-Z]+)?\b"
+euronext_amsterdam_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+euronext_brussels_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+euronext_lisbon_regex = r"\b([A-Z]{3,5})(\.[A-Z]+)?\b"
+euronext_milan_regex = r"\b([A-Z]{2,5})(\.[A-Z]+)?\b"
+deutsche_borse_xetra_regex = r"\b([A-Z0-9]{3,6})(\.[A-Z]+)?\b"
+six_swiss_exchange_regex = r"\b([A-Z]{2,6})(\.[A-Z]+)?\b"
+tokyo_stock_exchange_regex = r"\b(\d{4})\b"
+hong_kong_stock_exchange_regex = r"\b(\d{4,5})\b"
+shanghai_stock_exchange_regex = r"\b(6\d{5})\b"
+shenzhen_stock_exchange_regex = r"\b([03]\d{5})\b"
+australian_securities_exchange_regex = r"\b([A-Z]{3})(\.[A-Z]+)?\b"
+singapore_exchange_regex = r"\b([A-Z]\d{2}[A-Z]?)(\.[A-Z]+)?\b"
+nse_bse_regex = r"\b([A-Z&]{1,10})(\.[A-Z]+)?\b"
+sao_paulo_b3_regex = r"\b([A-Z]{4}\d{1,2})(\.[A-Z]+)?\b"
+mexico_bmv_regex = r"\b([A-Z*]{1,7})(\.[A-Z]+)?\b"
+korea_exchange_regex = r"\b(\d{6})\b"
+taiwan_stock_exchange_regex = r"\b(\d{4})\b"
+johannesburg_stock_exchange_regex = r"\b([A-Z]{3})(\.[A-Z]+)?\b"
+tel_aviv_stock_exchange_regex = r"\b([A-Z]{4})(\.[A-Z]+)?\b"
+moscow_exchange_regex = r"\b([A-Z]{4})(\.[A-Z]+)?\b"
+istanbul_stock_exchange_regex = r"\b([A-Z]{5})(\.[A-Z]+)?\b"
+nasdaq_stockholm_regex = r"\b([A-Z]{3,4})( [A-Z])?(\.[A-Z]+)?\b"
+oslo_bors_regex = r"\b([A-Z]{3,5})(\.[A-Z]+)?\b"
+otc_markets_us_regex = r"\b([A-Z]{4,5})[FY]?(\.[A-Z]+)?\b"
+pink_sheets_regex = r"\b([A-Z]{4,5})(\.[A-Z]+)?\b"
+
+ticker_regex_list = [
+    ("nyse", r"\b([A-Z]{1,4})(\.[A-Z]+)?\b"),
+    ("nasdaq", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("nyse_american", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("london_stock_exchange", r"\b([A-Z]{3,4})(\.[A-Z]+)?\b"),
+    ("toronto_stock_exchange", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("euronext_paris", r"\b([A-Z]{2,12})(\.[A-Z]+)?\b"),
+    ("euronext_amsterdam", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("euronext_brussels", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("euronext_lisbon", r"\b([A-Z]{3,5})(\.[A-Z]+)?\b"),
+    ("euronext_milan", r"\b([A-Z]{2,5})(\.[A-Z]+)?\b"),
+    ("deutsche_borse_xetra", r"\b([A-Z0-9]{3,6})(\.[A-Z]+)?\b"),
+    ("six_swiss_exchange", r"\b([A-Z]{2,6})(\.[A-Z]+)?\b"),
+    ("tokyo_stock_exchange", r"\b(\d{4})\b"),
+    ("hong_kong_stock_exchange", r"\b(\d{4,5})\b"),
+    ("shanghai_stock_exchange", r"\b(6\d{5})\b"),
+    ("shenzhen_stock_exchange", r"\b([03]\d{5})\b"),
+    ("australian_securities_exchange", r"\b([A-Z]{3})(\.[A-Z]+)?\b"),
+    ("singapore_exchange", r"\b([A-Z]\d{2}[A-Z]?)(\.[A-Z]+)?\b"),
+    ("nse_bse", r"\b([A-Z&]{1,10})(\.[A-Z]+)?\b"),
+    ("sao_paulo_b3", r"\b([A-Z]{4}\d{1,2})(\.[A-Z]+)?\b"),
+    ("mexico_bmv", r"\b([A-Z*]{1,7})(\.[A-Z]+)?\b"),
+    ("korea_exchange", r"\b(\d{6})\b"),
+    ("taiwan_stock_exchange", r"\b(\d{4})\b"),
+    ("johannesburg_stock_exchange", r"\b([A-Z]{3})(\.[A-Z]+)?\b"),
+    ("tel_aviv_stock_exchange", r"\b([A-Z]{4})(\.[A-Z]+)?\b"),
+    ("moscow_exchange", r"\b([A-Z]{4})(\.[A-Z]+)?\b"),
+    ("istanbul_stock_exchange", r"\b([A-Z]{5})(\.[A-Z]+)?\b"),
+    ("nasdaq_stockholm", r"\b([A-Z]{3,4})( [A-Z])?(\.[A-Z]+)?\b"),
+    ("oslo_bors", r"\b([A-Z]{3,5})(\.[A-Z]+)?\b"),
+    ("otc_markets_us", r"\b([A-Z]{4,5})[FY]?(\.[A-Z]+)?\b"),
+    ("pink_sheets", r"\b([A-Z]{4,5})(\.[A-Z]+)?\b"),
+]
+# Security identifier regexes with word boundaries
+cusip_regex = r"\b[0-9A-Z]{8}[0-9]\b"
+isin_regex = r"\b[A-Z]{2}[0-9A-Z]{9}[0-9]\b"
+figi_regex = r"\b[A-Z]{2}G[A-Z0-9]{8}[0-9]\b"
+
+particles = {
+    # Dutch - single words only
+    'van', 'der', 'den', 'de',
+
+    # German - single words only
+    'von', 'zu', 'vom', 'zur', 'zum',
+
+    # Spanish - single words only
+    'de', 'del', 'y',
+
+    # Portuguese - single words only
+    'da', 'das', 'do', 'dos', 'e',
+
+    # French - single words only
+    'de', 'du', 'des', 'le', 'la', 'les', "d'",
+
+    # Italian - single words only
+    'da', 'di', 'del', 'della', 'delle', 'dei', 'degli', 'dello',
+
+    # Irish/Scottish
+    'mac', 'mc', 'o',
+
+    # Arabic
+    'al', 'el', 'ibn', 'bin', 'bint', 'abu',
+
+    # Other European
+    'af', 'av',  # Scandinavian
+    'ter',  # Dutch/Flemish
+    'op',  # Dutch
+    'aan',  # Dutch
+    'ten',  # Dutch
+    'het',  # Dutch
+    'in',  # Dutch
+}
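A quick self-contained check of the identifier patterns (Apple's published CUSIP and ISIN used as sample values):

    import re
    from datamule.tags.regex import cusip_regex, isin_regex

    text = "Apple Inc. common stock, CUSIP 037833100, ISIN US0378331005."
    print(re.findall(cusip_regex, text))  # ['037833100']
    print(re.findall(isin_regex, text))   # ['US0378331005']

Note the exchange ticker patterns are deliberately loose (e.g. any run of 1-5 capital letters for Nasdaq), so they will also match ordinary uppercase words; the Tickers class reports matches per exchange rather than validating them.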
datamule/tags/utils.py
ADDED
@@ -0,0 +1,149 @@
+import re
+from .regex import cusip_regex, isin_regex, figi_regex, ticker_regex_list
+from .regex import particles
+from flashtext import KeywordProcessor
+
+def get_cusip_using_regex(text,keywords=None):
+    matches = []
+    for match in re.finditer(cusip_regex, text):
+        if keywords is not None:
+            if match.group() in keywords:
+                matches.append((match.group(), match.start(), match.end()))
+        else:
+            matches.append((match.group(), match.start(), match.end()))
+    return matches
+
+def get_isin_using_regex(text,keywords=None):
+    matches = []
+    for match in re.finditer(isin_regex, text):
+        if keywords is not None:
+            if match.group() in keywords:
+                matches.append((match.group(), match.start(), match.end()))
+        else:
+            matches.append((match.group(), match.start(), match.end()))
+    return matches
+
+def get_figi_using_regex(text,keywords=None):
+    matches = []
+    for match in re.finditer(figi_regex, text):
+        if keywords is not None:
+            if match.group() in keywords:
+                matches.append((match.group(), match.start(), match.end()))
+        else:
+            matches.append((match.group(), match.start(), match.end()))
+    return matches
+
+def get_tickers_using_regex(text, regex_pattern):
+    """Extract tickers using the given regex pattern with position information"""
+    matches = []
+    for match in re.finditer(regex_pattern, text):
+        # Handle tuples from regex groups - take the first capture group
+        if match.groups():
+            ticker = match.group(1) if match.group(1) else match.group(0)
+        else:
+            ticker = match.group(0)
+        matches.append((ticker, match.start(), match.end()))
+    return matches
+
+def get_all_tickers(text):
+    """Get all tickers from all exchanges organized by exchange with position info"""
+    result = {}
+    all_tickers = []
+
+    for exchange_name, regex_pattern in ticker_regex_list:
+        tickers = get_tickers_using_regex(text, regex_pattern)
+        result[exchange_name] = tickers
+        all_tickers.extend(tickers)
+
+    # Remove duplicates while preserving order for 'all'
+    # Keep track of seen ticker values (first element of tuple)
+    seen = set()
+    result['all'] = [x for x in all_tickers if not (x[0] in seen or seen.add(x[0]))]
+
+    return result
+
+def get_ticker_regex_dict():
+    """Return ticker regex list as a dictionary for easy lookup"""
+    return dict(ticker_regex_list)
+
+# will change in future to accomodate other datasets
+def validate_full_name(full_name,keywords):
+    if len(full_name) == 1:
+        return False
+    # check all is upper
+    if all(word.isupper() for word in full_name):
+        return False
+    # check if any number in word
+    if any(any(char.isdigit() for char in word) for word in full_name):
+        return False
+    if any(any(char in ".,;:!?()[]" for char in word) for word in full_name):
+        return False
+
+    # add optional set lookups
+    if keywords is not None:
+        # return false if first word is not in keywords set
+        if full_name[0] not in keywords:
+            return False
+
+
+    return True
+
+def get_full_names(text,keywords=None):
+    words = text.split()
+    full_names = []
+    current_pos = None
+    word_start_positions = []
+
+    # Calculate word positions in the original text
+    pos = 0
+    for word in words:
+        start = text.find(word, pos)
+        word_start_positions.append(start)
+        pos = start + len(word)
+
+    for idx, word in enumerate(words):
+        if current_pos is None:
+            if word[0].isupper():
+                current_pos = idx
+        else:
+            if word[0].isupper() or word.lower() in particles:
+                continue
+            else:
+                full_name = words[current_pos:idx]
+                if validate_full_name(full_name,keywords):
+                    name_text = ' '.join(full_name)
+                    start_pos = word_start_positions[current_pos]
+                    # Calculate end position of the last word in the name
+                    last_word_idx = idx - 1
+                    end_pos = word_start_positions[last_word_idx] + len(words[last_word_idx])
+                    full_names.append((name_text, start_pos, end_pos))
+
+                current_pos = None
+
+    # handle last case - if we're still tracking a name when we reach the end
+    if current_pos is not None:
+        full_name = words[current_pos:]
+        if validate_full_name(full_name,keywords):
+            name_text = ' '.join(full_name)
+            start_pos = word_start_positions[current_pos]
+            # Calculate end position of the last word
+            last_word_idx = len(words) - 1
+            end_pos = word_start_positions[last_word_idx] + len(words[last_word_idx])
+            full_names.append((name_text, start_pos, end_pos))
+
+    return full_names
+
+# add dictionary lookup based on precomputed lists
+def get_full_names_dictionary_lookup(text, dictionary):
+    keyword_processor = KeywordProcessor(case_sensitive=True)
+
+    for key in dictionary.keys():
+        keyword_processor.add_keyword(key, key)
+
+    matches = []
+    keywords_found = keyword_processor.extract_keywords(text, span_info=True)
+
+    for keyword, start_pos, end_pos in keywords_found:
+        matches.append((keyword, start_pos, end_pos))
+
+    return matches
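Behavior sketch for the name extractor: it scans for runs of capitalized words, allows lowercase particles ('van', 'der', ...) inside a run, and returns (name, start, end) tuples with character offsets:

    from datamule.tags.utils import get_full_names

    text = "Directors John van der Berg and Mary O'Connor attended."
    # Without a keywords set, any capitalized run that passes
    # validate_full_name is returned, offsets included.
    print(get_full_names(text))

Note the first hit here would include the preceding capitalized word ("Directors John van der Berg"); supplying the SSA first-name set as keywords rejects such runs, since full_name[0] must then be a known first name. That is why the precomputed 8-K person list or SSA dictionary is preferred when loaded.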
datamule/utils/dictionaries.py
ADDED
@@ -0,0 +1,76 @@
+from pathlib import Path
+import urllib.request
+import json
+urls = {
+    "ssa_baby_first_names": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/ssa_baby_first_names.txt",
+    "npx_figis" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_figis.txt",
+    "npx_isins" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_isins.txt",
+    "sc13dg_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/sc13dg_cusips.txt",
+    "8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json"
+}
+
+
+def download_dictionary(name,overwrite=False):
+    url = urls[name]
+
+    # Create dictionaries directory in datamule folder
+    dict_dir = Path.home() / ".datamule" / "dictionaries"
+    dict_dir.mkdir(parents=True, exist_ok=True)
+
+    # check if file exists first
+    if not overwrite:
+        filename = url.split('/')[-1]
+        file_path = dict_dir / filename
+        if file_path.exists():
+            return
+
+    # Extract filename from URL
+    filename = url.split('/')[-1]
+    file_path = dict_dir / filename
+
+    print(f"Downloading {name} dictionary to {file_path}")
+    urllib.request.urlretrieve(url, file_path)
+    return
+
+def load_dictionary(name):
+    # Get or download the dictionary file
+    dict_dir = Path.home() / ".datamule" / "dictionaries"
+    filename = urls[name].split('/')[-1]
+    file_path = dict_dir / filename
+
+    # Download if doesn't exist
+    if not file_path.exists():
+        download_dictionary(name)
+
+    # Load the dictionary based on name
+    if name == "ssa_baby_first_names":
+        names_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                names_set.add(line.strip())
+        return names_set
+    elif name == "npx_figis":
+        figi_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                figi_set.add(line.strip())
+        return figi_set
+    elif name == "npx_isins":
+        isin_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                isin_set.add(line.strip())
+        return isin_set
+    elif name == "sc13dg_cusips":
+        cusip_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                cusip_set.add(line.strip())
+        return cusip_set
+    elif name == "8k_2024_persons":
+
+        with open(file_path, 'r', encoding='utf-8') as f:
+            persons_list = json.load(f)
+        return persons_list
+    else:
+        raise ValueError("dictionary not found")
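Usage sketch: the first call downloads into ~/.datamule/dictionaries, later calls read the cached file (network access needed only once):

    from datamule.utils.dictionaries import load_dictionary

    # Returns a set of CUSIP strings for "sc13dg_cusips";
    # "8k_2024_persons" returns the parsed JSON instead.
    cusips = load_dictionary("sc13dg_cusips")
    print(len(cusips))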
{datamule-2.1.5.dist-info → datamule-2.2.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.1.5
+Version: 2.2.0
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -19,4 +19,5 @@ Requires-Dist: secxbrl
 Requires-Dist: secsgml
 Requires-Dist: websocket-client
 Requires-Dist: company-fundamentals
+Requires-Dist: flashtext
 
{datamule-2.1.5.dist-info → datamule-2.2.0.dist-info}/RECORD
CHANGED
@@ -6,7 +6,7 @@ datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
 datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
 datamule/portfolio.py,sha256=0-E1ZSEjJ8hba7HxF8oCrRneNuF_KKISOY6K4dRg0Cg,12282
 datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
-datamule/sheet.py,sha256=
+datamule/sheet.py,sha256=KD7yAgSB8BE-Z4GDuH58IV-2DJ673nMcEsrCyJbeYp8,10707
 datamule/submission.py,sha256=TdQDfFjOKXy2qAZcD6hc9kjDSxmuZLqk8WRhtMjjC-g,15822
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
 datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -15,7 +15,7 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
 datamule/datamule/downloader.py,sha256=mVg1SApfij_9-dTpcm_YB26Bxc_Yq1FR8xv2k50MHqU,18579
 datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
 datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/document/document.py,sha256=
+datamule/document/document.py,sha256=QjncYOIdf0Zf_0AONEOXu2KlPxMksGZzvwmHOpbM5N8,20450
 datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
 datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
@@ -48,10 +48,15 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H
 datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
+datamule/tags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/tags/config.py,sha256=JVdIkqu9rBEAadNLP-FiIbZ35TRORGIDCJvqDh0CuqE,585
+datamule/tags/regex.py,sha256=Zr1dlnb8OfecDkI2DFCI8DUBr9LI50fapQyBAYNEZrg,4487
+datamule/tags/utils.py,sha256=k5fyjMjJNh6gZjj491sw_9rnMqYIlHHDBathkDcHD0A,5423
 datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
+datamule/utils/dictionaries.py,sha256=VImvQWlP8IohB76rDd83bZcT184LBOpOaXPOH46fA6Y,2795
 datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
-datamule-2.1.5.dist-info/METADATA,sha256=
-datamule-2.1.5.dist-info/WHEEL,sha256=
-datamule-2.1.5.dist-info/top_level.txt,sha256=
-datamule-2.1.5.dist-info/RECORD,,
+datamule-2.2.0.dist-info/METADATA,sha256=fuT_ABK8D6LhEi1_TjtVnIKobXdafBPiMSGy3XCWyRo,585
+datamule-2.2.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-2.2.0.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-2.2.0.dist-info/RECORD,,
{datamule-2.1.5.dist-info → datamule-2.2.0.dist-info}/WHEEL
File without changes
{datamule-2.1.5.dist-info → datamule-2.2.0.dist-info}/top_level.txt
File without changes