PyPI - datamule - Versions diffs - 2.2.1__py3-none-any.whl → 2.2.2__py3-none-any.whl - Mend

datamule 2.2.1__py3-none-any.whl → 2.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

datamule/document/document.py CHANGED Viewed

@@ -13,11 +13,35 @@ from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
 import tempfile
-import warnings
 from .tables.tables import Tables
 from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup
+class DataWithTags(dict):
+    def __init__(self, data, document):
+        super().__init__(data)
+        self._document = document
+        self._tags = None
+    @property
+    def tags(self):
+        if self._tags is None:
+            self._tags = Tags(self._document, mode='data')  # New fragment-based behavior
+        return self._tags
+class TextWithTags(str):
+    def __new__(cls, content, document):
+        instance = str.__new__(cls, content)
+        instance._document = document
+        instance._tags = None
+        return instance
+    @property
+    def tags(self):
+        if self._tags is None:
+            self._tags = Tags(self._document, mode='text')  # Original behavior
+        return self._tags
 class Tickers:
     def __init__(self, document):
@@ -27,11 +51,7 @@ class Tickers:
     def _get_tickers_data(self):
         """Get all tickers data once and cache it"""
         if self._tickers_data is None:
-            # Check if document extension is supported
-            if self.document.extension not in ['.htm', '.html', '.txt']:
-                self._tickers_data = {}
-            else:
-                self._tickers_data = get_all_tickers(self.document.text)
+           self._tickers_data = get_all_tickers(self.document.text)
         return self._tickers_data
     def __getattr__(self, exchange_name):
@@ -58,13 +78,14 @@ class Tickers:
         return str(data)
 class Tags:
-    def __init__(self, document):
+    def __init__(self, document, mode='text'):
         from ..tags.config import _active_dictionaries,_loaded_dictionaries
-        self.not_supported = document.extension not in ['.htm', '.html', '.txt']
         self.document = document
+        self.mode = mode  # 'text' or 'data'
         self._tickers = None
         self.dictionaries = {}
         self.processors = {}
+        self._text_sources = None
         # Load global dictionaries with their data and processors
         active_dicts = _active_dictionaries
@@ -73,77 +94,131 @@ class Tags:
             self.dictionaries[dict_name] = dict_info['data']
             if dict_info['processor'] is not None:
                 self.processors[dict_name] = dict_info['processor']
-    def _check_support(self):
-        if self.not_supported:
-            warnings.warn(f"Document extension '{self.document.extension}' is not supported. Supported formats: .htm, .html, .txt")
-            return False
-        return True
+    def _get_text_sources(self):
+        """Get text sources based on mode - either single text or multiple fragments"""
+        if self._text_sources is None:
+            if self.mode == 'text':
+                # Original behavior - single text source
+                self._text_sources = [{'id': None, 'text': str(self.document.text)}]
+            else:  # mode == 'data'
+                # New behavior - multiple text fragments
+                self._text_sources = []
+                self._extract_text_fragments(self.document.data, '')
+        return self._text_sources
+    def _extract_text_fragments(self, data, parent_id=''):
+        """Extract all text fragments with their document IDs from parsed data"""
+        if isinstance(data, dict):
+            for key, value in data.items():
+                if key in ["text", "title"] and isinstance(value, str):
+                    # Use the current dictionary's parent key as the fragment ID
+                    self._text_sources.append({
+                        'id': parent_id,
+                        'text': value
+                    })
+                elif isinstance(value, (dict, list)):
+                    # Pass the current key as the parent_id for the next level
+                    self._extract_text_fragments(value, key)
+        elif isinstance(data, list):
+            for i, item in enumerate(data):
+                if isinstance(item, (dict, list)):
+                    self._extract_text_fragments(item, parent_id)
+    def _format_results(self, results, fragment_id):
+        """Format results based on mode"""
+        if self.mode == 'text':
+            # Original format: (match, start, end)
+            return results
+        else:
+            # New format: (match, fragment_id, start, end)
+            return [(match, fragment_id, start, end) for match, start, end in results]
     @property
     def cusips(self):
-        if not self._check_support():
-            return None
+        if not hasattr(self, '_cusips'):
+            self._cusips = []
+            sources = self._get_text_sources()
-        if not hasattr(self, '_cusip'):
-            if 'sc13dg_cusips' in self.dictionaries:
-                keywords = self.dictionaries['sc13dg_cusips']
-                self._cusip = get_cusip_using_regex(self.document.text, keywords)
-            else:
-                self._cusip = get_cusip_using_regex(self.document.text)
-        return self._cusip
+            for source in sources:
+                if 'sc13dg_cusips' in self.dictionaries:
+                    keywords = self.dictionaries['sc13dg_cusips']
+                    results = get_cusip_using_regex(source['text'], keywords)
+                elif "13fhr_information_table_cusips" in self.dictionaries:
+                    keywords = self.dictionaries['13fhr_information_table_cusips']
+                    results = get_cusip_using_regex(source['text'], keywords)
+                else:
+                    results = get_cusip_using_regex(source['text'])
+                # Format results based on mode
+                formatted_results = self._format_results(results, source['id'])
+                self._cusips.extend(formatted_results)
+        return self._cusips
     @property
     def isins(self):
-        if not self._check_support():
-            return None
+        if not hasattr(self, '_isins'):
+            self._isins = []
+            sources = self._get_text_sources()
-        if not hasattr(self, '_isin'):
-            if 'npx_isins' in self.dictionaries:
-                keywords = self.dictionaries['npx_isins']
-                self._isin = get_isin_using_regex(self.document.text, keywords)
-            else:
-                self._isin = get_isin_using_regex(self.document.text)
-        return self._isin
+            for source in sources:
+                if 'npx_isins' in self.dictionaries:
+                    keywords = self.dictionaries['npx_isins']
+                    results = get_isin_using_regex(source['text'], keywords)
+                else:
+                    results = get_isin_using_regex(source['text'])
+                formatted_results = self._format_results(results, source['id'])
+                self._isins.extend(formatted_results)
+        return self._isins
     @property
     def figis(self):
-        if not self._check_support():
-            return None
+        if not hasattr(self, '_figis'):
+            self._figis = []
+            sources = self._get_text_sources()
-        if not hasattr(self, '_figi'):
-            if 'npx_figis' in self.dictionaries:
-                keywords = self.dictionaries['npx_figis']
-                self._figi = get_figi_using_regex(self.document.text, keywords)
-            else:
-                self._figi = get_figi_using_regex(self.document.text)
-        return self._figi
+            for source in sources:
+                if 'npx_figis' in self.dictionaries:
+                    keywords = self.dictionaries['npx_figis']
+                    results = get_figi_using_regex(source['text'], keywords)
+                else:
+                    results = get_figi_using_regex(source['text'])
+                formatted_results = self._format_results(results, source['id'])
+                self._figis.extend(formatted_results)
+        return self._figis
     @property
     def tickers(self):
+        # Tickers work differently - they need the full document context
+        # Keep original behavior for now
         if self._tickers is None:
             self._tickers = Tickers(self.document)
         return self._tickers
     @property
     def persons(self):
-        if not self._check_support():
-            return None
         if not hasattr(self, '_persons'):
-            if '8k_2024_persons' in self.processors:
-                # Use pre-built processor
-                self._persons = get_full_names_dictionary_lookup(self.document.text, self.processors['8k_2024_persons'])
-            elif 'ssa_baby_first_names' in self.dictionaries:
-                # Use regex with SSA names for validation
-                self._persons = get_full_names(self.document.text, self.dictionaries['ssa_baby_first_names'])
-            else:
-                # Fallback to regex without validation
-                self._persons = get_full_names(self.document.text)
+            self._persons = []
+            sources = self._get_text_sources()
+            for source in sources:
+                if '8k_2024_persons' in self.processors:
+                    results = get_full_names_dictionary_lookup(source['text'], self.processors['8k_2024_persons'])
+                elif 'ssa_baby_first_names' in self.dictionaries:
+                    results = get_full_names(source['text'], self.dictionaries['ssa_baby_first_names'])
+                else:
+                    results = get_full_names(source['text'])
+                formatted_results = self._format_results(results, source['id'])
+                self._persons.extend(formatted_results)
         return self._persons
 class Document:
     def __init__(self, type, content, extension,accession,filing_date,path=None):
@@ -168,8 +243,6 @@ class Document:
         self._tables = None
         self._text = None
-        self.tags = Tags(self)
     #_load_text_content
@@ -354,15 +427,26 @@ class Document:
     def data(self):
         if self._data is None:
             self.parse()
+        if self._data is None:
+            self._data = {}
+        if not isinstance(self._data, DataWithTags):
+            self._data = DataWithTags(self._data, self)
         return self._data
     @property
     def text(self):
         if self._text is None:
             if self.extension in ['.htm','.html']:
-                self._preprocess_html_content()
+                self._preprocess_html_content()  # Still sets self._text to plain string
             elif self.extension == '.txt':
-                self._preprocess_txt_content()
+                self._preprocess_txt_content()   # Still sets self._text to plain string
+            # Convert the plain string to TextWithTags
+            plain_text = self._text
+            self._text = TextWithTags(plain_text, self)
         return self._text
     def write_json(self, output_filename=None):

datamule/utils/dictionaries.py CHANGED Viewed

@@ -6,7 +6,8 @@ urls = {
     "npx_figis" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_figis.txt",
     "npx_isins" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_isins.txt",
     "sc13dg_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/sc13dg_cusips.txt",
-    "8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json"
+    "8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json",
+    "13fhr_information_table_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/refs/heads/master/data/dictionaries/13fhr_information_table_cusips.txt"
 }
@@ -67,6 +68,12 @@ def load_dictionary(name):
             for line in f:
                 cusip_set.add(line.strip())
         return cusip_set
+    elif name == "13fhr_information_table_cusips":
+        cusip_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                cusip_set.add(line.strip())
+        return cusip_set
     elif name == "8k_2024_persons":
         with open(file_path, 'r', encoding='utf-8') as f:

{datamule-2.2.1.dist-info → datamule-2.2.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.1
+Version: 2.2.2
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman

{datamule-2.2.1.dist-info → datamule-2.2.2.dist-info}/RECORD RENAMED Viewed

@@ -15,7 +15,7 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
 datamule/datamule/downloader.py,sha256=mVg1SApfij_9-dTpcm_YB26Bxc_Yq1FR8xv2k50MHqU,18579
 datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
 datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/document/document.py,sha256=yiev4AYewjp8bPjWn9cuL43N2O11s9WUo4X2e7WUgiY,20628
+datamule/document/document.py,sha256=mpoWmK8K7B92ukXj4WZzFhYOwpoVop5DZYfj2Q-6FE8,24332
 datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
 datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
@@ -54,9 +54,9 @@ datamule/tags/regex.py,sha256=Zr1dlnb8OfecDkI2DFCI8DUBr9LI50fapQyBAYNEZrg,4487
 datamule/tags/utils.py,sha256=hQpQBVAJPmys1UKVS2mqc8Z5-qO_zma5ecFXvW9DXoo,5329
 datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
-datamule/utils/dictionaries.py,sha256=VImvQWlP8IohB76rDd83bZcT184LBOpOaXPOH46fA6Y,2795
+datamule/utils/dictionaries.py,sha256=1VwzuyDausEsvMIJRa2UD7SvtmlMRHmT_tFeaCY6eXo,3201
 datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
-datamule-2.2.1.dist-info/METADATA,sha256=aINGZMWV34SclEt-2Ij2d2848PJA7cLF6ZoBL2LwpfY,585
-datamule-2.2.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-datamule-2.2.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
-datamule-2.2.1.dist-info/RECORD,,
+datamule-2.2.2.dist-info/METADATA,sha256=pVMWNBGvR-KNKCYOvfvcFa95srRzS3j_t-zuW6QiXQk,585
+datamule-2.2.2.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-2.2.2.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-2.2.2.dist-info/RECORD,,

{datamule-2.2.1.dist-info → datamule-2.2.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{datamule-2.2.1.dist-info → datamule-2.2.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

datamule 2.2.1__py3-none-any.whl → 2.2.2__py3-none-any.whl

datamule 2.2.1__py3-none-any.whl → 2.2.2__py3-none-any.whl