PyPI - datamule - Versions diffs - 2.2.0__tar.gz → 2.2.2__tar.gz - Mend

datamule 2.2.0.tar.gz → 2.2.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

{datamule-2.2.0 → datamule-2.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.0
+Version: 2.2.2
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman

{datamule-2.2.0 → datamule-2.2.2}/datamule/document/document.py RENAMED Viewed

@@ -13,11 +13,35 @@ from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
 import tempfile
-import warnings
 from .tables.tables import Tables
 from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup
+class DataWithTags(dict):
+    def __init__(self, data, document):
+        super().__init__(data)
+        self._document = document
+        self._tags = None
+    @property
+    def tags(self):
+        if self._tags is None:
+            self._tags = Tags(self._document, mode='data')  # New fragment-based behavior
+        return self._tags
+class TextWithTags(str):
+    def __new__(cls, content, document):
+        instance = str.__new__(cls, content)
+        instance._document = document
+        instance._tags = None
+        return instance
+    @property
+    def tags(self):
+        if self._tags is None:
+            self._tags = Tags(self._document, mode='text')  # Original behavior
+        return self._tags
 class Tickers:
     def __init__(self, document):
@@ -27,11 +51,7 @@ class Tickers:
     def _get_tickers_data(self):
         """Get all tickers data once and cache it"""
         if self._tickers_data is None:
-            # Check if document extension is supported
-            if self.document.extension not in ['.htm', '.html', '.txt']:
-                self._tickers_data = {}
-            else:
-                self._tickers_data = get_all_tickers(self.document.text)
+           self._tickers_data = get_all_tickers(self.document.text)
         return self._tickers_data
     def __getattr__(self, exchange_name):
@@ -58,88 +78,147 @@ class Tickers:
         return str(data)
 class Tags:
-    def __init__(self, document):
+    def __init__(self, document, mode='text'):
         from ..tags.config import _active_dictionaries,_loaded_dictionaries
-        self.not_supported = document.extension not in ['.htm', '.html', '.txt']
         self.document = document
+        self.mode = mode  # 'text' or 'data'
         self._tickers = None
         self.dictionaries = {}
+        self.processors = {}
+        self._text_sources = None
-        # Load global dictionaries with their data
+        # Load global dictionaries with their data and processors
         active_dicts = _active_dictionaries
         for dict_name in active_dicts:
-            self.dictionaries[dict_name] = _loaded_dictionaries[dict_name]
+            dict_info = _loaded_dictionaries[dict_name]
+            self.dictionaries[dict_name] = dict_info['data']
+            if dict_info['processor'] is not None:
+                self.processors[dict_name] = dict_info['processor']
-    def _check_support(self):
-        if self.not_supported:
-            warnings.warn(f"Document extension '{self.document.extension}' is not supported. Supported formats: .htm, .html, .txt")
-            return False
-        return True
+    def _get_text_sources(self):
+        """Get text sources based on mode - either single text or multiple fragments"""
+        if self._text_sources is None:
+            if self.mode == 'text':
+                # Original behavior - single text source
+                self._text_sources = [{'id': None, 'text': str(self.document.text)}]
+            else:  # mode == 'data'
+                # New behavior - multiple text fragments
+                self._text_sources = []
+                self._extract_text_fragments(self.document.data, '')
+        return self._text_sources
+    def _extract_text_fragments(self, data, parent_id=''):
+        """Extract all text fragments with their document IDs from parsed data"""
+        if isinstance(data, dict):
+            for key, value in data.items():
+                if key in ["text", "title"] and isinstance(value, str):
+                    # Use the current dictionary's parent key as the fragment ID
+                    self._text_sources.append({
+                        'id': parent_id,
+                        'text': value
+                    })
+                elif isinstance(value, (dict, list)):
+                    # Pass the current key as the parent_id for the next level
+                    self._extract_text_fragments(value, key)
+        elif isinstance(data, list):
+            for i, item in enumerate(data):
+                if isinstance(item, (dict, list)):
+                    self._extract_text_fragments(item, parent_id)
+    def _format_results(self, results, fragment_id):
+        """Format results based on mode"""
+        if self.mode == 'text':
+            # Original format: (match, start, end)
+            return results
+        else:
+            # New format: (match, fragment_id, start, end)
+            return [(match, fragment_id, start, end) for match, start, end in results]
     @property
     def cusips(self):
-        if not self._check_support():
-            return None
+        if not hasattr(self, '_cusips'):
+            self._cusips = []
+            sources = self._get_text_sources()
-        if not hasattr(self, '_cusip'):
-            if 'sc13dg_cusips' in self.dictionaries:
-                keywords = self.dictionaries['sc13dg_cusips']
-                self._cusip = get_cusip_using_regex(self.document.text, keywords)
-            else:
-                self._cusip = get_cusip_using_regex(self.document.text)
-        return self._cusip
+            for source in sources:
+                if 'sc13dg_cusips' in self.dictionaries:
+                    keywords = self.dictionaries['sc13dg_cusips']
+                    results = get_cusip_using_regex(source['text'], keywords)
+                elif "13fhr_information_table_cusips" in self.dictionaries:
+                    keywords = self.dictionaries['13fhr_information_table_cusips']
+                    results = get_cusip_using_regex(source['text'], keywords)
+                else:
+                    results = get_cusip_using_regex(source['text'])
+                # Format results based on mode
+                formatted_results = self._format_results(results, source['id'])
+                self._cusips.extend(formatted_results)
+        return self._cusips
     @property
     def isins(self):
-        if not self._check_support():
-            return None
+        if not hasattr(self, '_isins'):
+            self._isins = []
+            sources = self._get_text_sources()
-        if not hasattr(self, '_isin'):
-            if 'npx_isins' in self.dictionaries:
-                keywords = self.dictionaries['npx_isins']
-                self._isin = get_isin_using_regex(self.document.text, keywords)
-            else:
-                self._isin = get_isin_using_regex(self.document.text)
-        return self._isin
+            for source in sources:
+                if 'npx_isins' in self.dictionaries:
+                    keywords = self.dictionaries['npx_isins']
+                    results = get_isin_using_regex(source['text'], keywords)
+                else:
+                    results = get_isin_using_regex(source['text'])
+                formatted_results = self._format_results(results, source['id'])
+                self._isins.extend(formatted_results)
+        return self._isins
     @property
     def figis(self):
-        if not self._check_support():
-            return None
+        if not hasattr(self, '_figis'):
+            self._figis = []
+            sources = self._get_text_sources()
-        if not hasattr(self, '_figi'):
-            if 'npx_figis' in self.dictionaries:
-                keywords = self.dictionaries['npx_figis']
-                self._figi = get_figi_using_regex(self.document.text, keywords)
-            else:
-                self._figi = get_figi_using_regex(self.document.text)
-        return self._figi
+            for source in sources:
+                if 'npx_figis' in self.dictionaries:
+                    keywords = self.dictionaries['npx_figis']
+                    results = get_figi_using_regex(source['text'], keywords)
+                else:
+                    results = get_figi_using_regex(source['text'])
+                formatted_results = self._format_results(results, source['id'])
+                self._figis.extend(formatted_results)
+        return self._figis
     @property
     def tickers(self):
+        # Tickers work differently - they need the full document context
+        # Keep original behavior for now
         if self._tickers is None:
             self._tickers = Tickers(self.document)
         return self._tickers
     @property
     def persons(self):
-        if not self._check_support():
-            return None
         if not hasattr(self, '_persons'):
-            if '8k_2024_persons' in self.dictionaries:
-                # Use FlashText dictionary lookup for 8K persons
-                self._persons = get_full_names_dictionary_lookup(self.document.text, self.dictionaries['8k_2024_persons'])
-            elif 'ssa_baby_first_names' in self.dictionaries:
-                # Use regex with SSA names for validation
-                self._persons = get_full_names(self.document.text, self.dictionaries['ssa_baby_first_names'])
-            else:
-                # Fallback to regex without validation
-                self._persons = get_full_names(self.document.text)
+            self._persons = []
+            sources = self._get_text_sources()
+            for source in sources:
+                if '8k_2024_persons' in self.processors:
+                    results = get_full_names_dictionary_lookup(source['text'], self.processors['8k_2024_persons'])
+                elif 'ssa_baby_first_names' in self.dictionaries:
+                    results = get_full_names(source['text'], self.dictionaries['ssa_baby_first_names'])
+                else:
+                    results = get_full_names(source['text'])
+                formatted_results = self._format_results(results, source['id'])
+                self._persons.extend(formatted_results)
         return self._persons
 class Document:
     def __init__(self, type, content, extension,accession,filing_date,path=None):
@@ -164,8 +243,6 @@ class Document:
         self._tables = None
         self._text = None
-        self.tags = Tags(self)
     #_load_text_content
@@ -350,15 +427,26 @@ class Document:
     def data(self):
         if self._data is None:
             self.parse()
+        if self._data is None:
+            self._data = {}
+        if not isinstance(self._data, DataWithTags):
+            self._data = DataWithTags(self._data, self)
         return self._data
     @property
     def text(self):
         if self._text is None:
             if self.extension in ['.htm','.html']:
-                self._preprocess_html_content()
+                self._preprocess_html_content()  # Still sets self._text to plain string
             elif self.extension == '.txt':
-                self._preprocess_txt_content()
+                self._preprocess_txt_content()   # Still sets self._text to plain string
+            # Convert the plain string to TextWithTags
+            plain_text = self._text
+            self._text = TextWithTags(plain_text, self)
         return self._text
     def write_json(self, output_filename=None):

datamule-2.2.2/datamule/tags/config.py ADDED Viewed

@@ -0,0 +1,33 @@
+from ..utils.dictionaries import download_dictionary, load_dictionary
+_active_dictionaries = []
+_loaded_dictionaries = {}
+def set_dictionaries(dictionaries, overwrite=False):
+    """Set active dictionaries and load them into memory"""
+    global _active_dictionaries, _loaded_dictionaries
+    _active_dictionaries = dictionaries
+    _loaded_dictionaries = {}
+    for dict_name in dictionaries:
+        # Download if needed
+        download_dictionary(dict_name, overwrite=overwrite)
+        # Load raw data
+        raw_data = load_dictionary(dict_name)
+        # Create processor for dictionary lookup methods
+        if dict_name in ['8k_2024_persons']:  # Add other dict names as needed
+            from flashtext import KeywordProcessor
+            processor = KeywordProcessor(case_sensitive=True)
+            for key in raw_data.keys():
+                processor.add_keyword(key, key)
+            _loaded_dictionaries[dict_name] = {
+                'data': raw_data,
+                'processor': processor
+            }
+        else:
+            _loaded_dictionaries[dict_name] = {
+                'data': raw_data,
+                'processor': None
+            }

{datamule-2.2.0 → datamule-2.2.2}/datamule/tags/utils.py RENAMED Viewed

@@ -134,14 +134,10 @@ def get_full_names(text,keywords=None):
     return full_names
 # add dictionary lookup based on precomputed lists
-def get_full_names_dictionary_lookup(text, dictionary):
-    keyword_processor = KeywordProcessor(case_sensitive=True)
-    for key in dictionary.keys():
-        keyword_processor.add_keyword(key, key)
+def get_full_names_dictionary_lookup(text, processor):
+    """Use pre-built KeywordProcessor instead of creating new one"""
     matches = []
-    keywords_found = keyword_processor.extract_keywords(text, span_info=True)
+    keywords_found = processor.extract_keywords(text, span_info=True)
     for keyword, start_pos, end_pos in keywords_found:
         matches.append((keyword, start_pos, end_pos))

{datamule-2.2.0 → datamule-2.2.2}/datamule/utils/dictionaries.py RENAMED Viewed

@@ -6,7 +6,8 @@ urls = {
     "npx_figis" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_figis.txt",
     "npx_isins" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_isins.txt",
     "sc13dg_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/sc13dg_cusips.txt",
-    "8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json"
+    "8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json",
+    "13fhr_information_table_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/refs/heads/master/data/dictionaries/13fhr_information_table_cusips.txt"
 }
@@ -67,6 +68,12 @@ def load_dictionary(name):
             for line in f:
                 cusip_set.add(line.strip())
         return cusip_set
+    elif name == "13fhr_information_table_cusips":
+        cusip_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                cusip_set.add(line.strip())
+        return cusip_set
     elif name == "8k_2024_persons":
         with open(file_path, 'r', encoding='utf-8') as f:

{datamule-2.2.0 → datamule-2.2.2}/datamule.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.0
+Version: 2.2.2
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman

{datamule-2.2.0 → datamule-2.2.2}/setup.py RENAMED Viewed

@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.2.0",
+    version="2.2.2",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",

datamule-2.2.0/datamule/tags/config.py DELETED Viewed

@@ -1,16 +0,0 @@
-from ..utils.dictionaries import download_dictionary, load_dictionary
-_active_dictionaries = []
-_loaded_dictionaries = {}
-def set_dictionaries(dictionaries, overwrite=False):
-    """Set active dictionaries and load them into memory"""
-    global _active_dictionaries, _loaded_dictionaries
-    _active_dictionaries = dictionaries
-    _loaded_dictionaries = {}
-    for dict_name in dictionaries:
-        # Download if needed
-        download_dictionary(dict_name, overwrite=overwrite)
-        # Load into memory
-        _loaded_dictionaries[dict_name] = load_dictionary(dict_name)