datamule 2.2.4__tar.gz → 2.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-2.2.4 → datamule-2.2.6}/PKG-INFO +1 -1
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/document.py +33 -73
- {datamule-2.2.4 → datamule-2.2.6}/datamule/submission.py +10 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/tags/config.py +6 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/tags/utils.py +14 -8
- {datamule-2.2.4 → datamule-2.2.6}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-2.2.4 → datamule-2.2.6}/setup.py +1 -1
- {datamule-2.2.4 → datamule-2.2.6}/datamule/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/config.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/datamule/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/datamule/datamule_lookup.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/datamule/datamule_mysql_rds.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/datamule/downloader.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/datamule/sec_connector.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/datasets.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables_13fhr.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables_25nse.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables_informationtable.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables_npx.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables_ownership.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables_sbsef.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/tables_sdr.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/document/tables/utils.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/helper.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/index.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/package_updater.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/portfolio.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/portfolio_compression_utils.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/utils.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/seclibrary/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/seclibrary/bq.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sentiment/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/sheet.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/tags/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/tags/dictionaries.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/tags/regex.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/utils/__init__.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/utils/construct_submissions_data.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule/utils/format_accession.py +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule.egg-info/SOURCES.txt +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule.egg-info/requires.txt +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/datamule.egg-info/top_level.txt +0 -0
- {datamule-2.2.4 → datamule-2.2.6}/setup.cfg +0 -0

datamule/document/document.py
@@ -3,7 +3,7 @@ import csv
 import re
 from doc2dict import xml2dict, txt2dict, dict2dict
 from doc2dict.mapping import flatten_hierarchy
-from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
+from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict, flatten_dict
 from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
 from ..mapping_dicts.xml_mapping_dicts import dict_345
 from ..mapping_dicts.html_mapping_dicts import *

datamule/document/document.py
@@ -221,7 +221,6 @@ class Tags(TextAnalysisBase):
         if not hasattr(self, '_persons'):
             self._persons = []
             sources = self._get_text_sources()
-
             for source in sources:
                 if '8k_2024_persons' in self.processors:
                     results = get_full_names_dictionary_lookup(source['text'], self.processors['8k_2024_persons'])

datamule/document/document.py
@@ -289,71 +288,17 @@ class Document:
         self._data = None
         self._tables = None
         self._text = None
+        self._markdown = None
+
+        # booleans
+        self._data_bool = self.extension in ('.htm', '.html','.txt')
+        self._text_bool = self._data_bool
+        self._markdown_bool = self._data_bool
+        self._visualize_bool = self._data_bool
+        self._tables_bool = self.extension in ('.xml')



-    #_load_text_content
-    def _preprocess_txt_content(self):
-        self._text = self.content.decode().translate(str.maketrans({
-            '\xa0': ' ', '\u2003': ' ',
-            '\u2018': "'", '\u2019': "'",
-            '\u201c': '"', '\u201d': '"'
-        }))
-
-    # needs work
-    def _preprocess_html_content(self):
-        parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
-
-        # Remove hidden elements first
-        hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
-        for node in hidden_nodes:
-            node.decompose()
-
-        blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
-        lines = []
-        current_line = []
-
-        def flush_line():
-            if current_line:
-                # Don't add spaces between adjacent spans
-                lines.append(''.join(current_line))
-                current_line.clear()
-
-        for node in parser.root.traverse(include_text=True):
-            if node.tag in ('script', 'style', 'css'):
-                continue
-
-            if node.tag in blocks:
-                flush_line()
-                lines.append('')
-
-            if node.text_content:
-                text = node.text_content.strip()
-                if text:
-                    if node.tag in blocks:
-                        flush_line()
-                        lines.append(text)
-                        lines.append('')
-                    else:
-                        # Only add space if nodes aren't directly adjacent
-                        if current_line and not current_line[-1].endswith(' '):
-                            if node.prev and node.prev.text_content:
-                                if node.parent != node.prev.parent or node.prev.next != node:
-                                    current_line.append(' ')
-                        current_line.append(text)
-
-        flush_line()
-
-        text = '\n'.join(lines)
-        while '\n\n\n' in text:
-            text = text.replace('\n\n\n', '\n\n')
-
-        self._text = text.translate(str.maketrans({
-            '\xa0': ' ', '\u2003': ' ',
-            '\u2018': "'", '\u2019': "'",
-            '\u201c': '"', '\u201d': '"'
-        }))
-
     def contains_string(self, pattern):
         """Works for select files"""
         if self.extension in ['.htm', '.html', '.txt','.xml']:
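
The new boolean flags gate which derived views a Document exposes for a given file type. Below is a minimal stand-alone sketch of that gating; the helper is illustrative and not part of datamule. (Note that the original writes ('.xml') without a trailing comma, which is a plain string, so that membership check is technically a substring test; the sketch uses a one-element tuple.)

def capabilities(extension):
    # Mirrors the flags set in Document.__init__ as of 2.2.6
    data_like = extension in ('.htm', '.html', '.txt')
    return {
        'data': data_like,
        'text': data_like,
        'markdown': data_like,
        'visualize': data_like,
        'tables': extension in ('.xml',),
    }

print(capabilities('.htm'))  # data/text/markdown/visualize True, tables False
print(capabilities('.xml'))  # tables True, everything else False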

datamule/document/document.py
@@ -485,17 +430,21 @@ class Document:

     @property
     def text(self):
-        if self.
-        if self.
-
-
-            self._preprocess_txt_content() # Still sets self._text to plain string
-
-            # Convert the plain string to TextWithTags
-            plain_text = self._text
-            self._text = TextWithTags(plain_text, self)
+        if self._text_bool:
+            if self._text is None:
+                text = flatten_dict(self.data,'text')
+                self._text = TextWithTags(text, self)
         return self._text

+    @property
+    def markdown(self):
+        if self._markdown_bool:
+            if self._markdown is None:
+                self._markdown = flatten_dict(self.data,'markdown')
+
+        return self._markdown
+
+
     def write_json(self, output_filename=None):
         if not self.data:
             self.parse()
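
Both properties now follow the same gate-then-cache pattern and flatten the already-parsed self.data via doc2dict's flatten_dict instead of re-decoding raw bytes. A minimal stand-in illustrating the pattern (this toy class is not the datamule Document; the string join stands in for flatten_dict):

class LazyDoc:
    """Toy stand-in for the caching pattern used by Document.text / Document.markdown."""
    def __init__(self, extension, data):
        self.extension = extension
        self.data = data
        self._markdown = None
        self._markdown_bool = extension in ('.htm', '.html', '.txt')

    @property
    def markdown(self):
        if self._markdown_bool:
            if self._markdown is None:
                # the real property delegates to flatten_dict(self.data, 'markdown') here
                self._markdown = '\n'.join(str(v) for v in self.data.values())
        return self._markdown  # stays None for unsupported extensions, as in the hunk above

doc = LazyDoc('.htm', {'item1': 'Business', 'item1a': 'Risk Factors'})
print(doc.markdown)                   # computed on first access, cached afterwards
print(LazyDoc('.xml', {}).markdown)   # None: markdown is not generated for XML documents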

datamule/document/document.py
@@ -544,6 +493,17 @@ class Document:
             writer.writeheader()
             writer.writerows(table.data)

+    def reset_nlp(self):
+        """Reset all NLP analysis by creating fresh wrapper objects"""
+        # Reset data wrapper
+        if hasattr(self, '_data') and self._data is not None:
+            raw_data = dict(self._data)  # Extract the underlying dict
+            self._data = DataWithTags(raw_data, self)
+
+        # Reset text wrapper
+        if hasattr(self, '_text') and self._text is not None:
+            raw_text = str(self._text)  # Extract the underlying string
+            self._text = TextWithTags(raw_text, self)

     def _document_to_section_text(self, document_data, parent_key=''):
         items = []
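
reset_nlp() rebuilds the DataWithTags/TextWithTags wrappers from their underlying dict/str values, discarding any cached NLP annotations. The toy below assumes TextWithTags subclasses str (which is what str(self._text) in the hunk implies); it is an illustration, not the datamule class, and the tags attribute is hypothetical.

class TextWithTags(str):
    """Illustrative stand-in: a str subclass carrying per-run NLP annotations."""
    def __new__(cls, text, document=None):
        obj = super().__new__(cls, text)
        obj.document = document
        obj.tags = []  # hypothetical annotation store
        return obj

wrapped = TextWithTags("Item 5.02 Departure of Directors or Certain Officers")
wrapped.tags.append('stale-person-tag')

# Same move as reset_nlp(): extract the raw string, re-wrap it fresh
fresh = TextWithTags(str(wrapped), wrapped.document)
print(fresh == wrapped, fresh.tags)  # True [] -> identical text, annotations cleared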

datamule/submission.py
@@ -121,6 +121,16 @@ class Submission:
         self.accession = self.metadata.content['accession-number']
         self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"

+
+        # booleans
+        self._has_xbrl = any(
+            doc['type'] in ('EX-100.INS', 'EX-101.INS') or
+            doc.get('filename', '').endswith('_htm.xml')
+            for doc in self.metadata.content['documents']
+        )
+
+        self._has_fundamentals = self._has_xbrl
+
     def _load_document_by_index(self, idx):
         """Load a document by its index in the metadata documents list."""
         doc = self.metadata.content['documents'][idx]
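
The new _has_xbrl flag marks a submission as XBRL-bearing if any document is an EX-100.INS/EX-101.INS instance or has a filename ending in _htm.xml (the suffix used for inline-XBRL instance extracts). A standalone sketch of the same check, with hypothetical document metadata:

def has_xbrl(documents):
    # Same expression as the any(...) in Submission.__init__ above
    return any(
        doc['type'] in ('EX-100.INS', 'EX-101.INS') or
        doc.get('filename', '').endswith('_htm.xml')
        for doc in documents
    )

sample = [
    {'type': '10-K', 'filename': 'form10k.htm'},
    {'type': 'EX-101.INS', 'filename': 'abc-20231231.xml'},
]
print(has_xbrl(sample))                                        # True
print(has_xbrl([{'type': '8-K', 'filename': 'form8k.htm'}]))   # False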

datamule/tags/config.py
@@ -3,6 +3,12 @@ from .dictionaries import download_dictionary, load_dictionary
 _active_dictionaries = []
 _loaded_dictionaries = {}

+def clear_dictionaries():
+    """Remove all active dictionaries"""
+    global _active_dictionaries, _loaded_dictionaries
+    _active_dictionaries = []
+    _loaded_dictionaries = {}
+
 def set_dictionaries(dictionaries, overwrite=False):
     """Set active dictionaries and load them into memory"""
     global _active_dictionaries, _loaded_dictionaries
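
clear_dictionaries() complements the existing set_dictionaries() by resetting both module-level registries. A hedged usage sketch (the dictionary name is hypothetical; set_dictionaries takes a list per its signature in this file, and the direct module import path mirrors the file location shown above):

from datamule.tags.config import set_dictionaries, clear_dictionaries

set_dictionaries(['8k_2024_persons'])  # hypothetical dictionary name; loads it into module state
# ... run tagging that relies on the active dictionaries ...
clear_dictionaries()                   # new in 2.2.6: empties _active_dictionaries and _loaded_dictionaries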

datamule/tags/utils.py
@@ -67,25 +67,31 @@ def get_ticker_regex_dict():
     return dict(ticker_regex_list)

 # will change in future to accomodate other datasets
-def validate_full_name(full_name,keywords):
+def validate_full_name(full_name, keywords):
     if len(full_name) == 1:
         return False
-
-
+
+    # Clean punctuation before validation
+    cleaned_name = [word.rstrip(".,;:!?()[]") for word in full_name]
+
+    # Skip validation if cleaning removed everything
+    if not all(cleaned_name):
         return False
-
-
+
+    # Apply existing checks to cleaned words
+    if all(word.isupper() for word in cleaned_name):
         return False
-
+
+    # check if any number in word
+    if any(any(char.isdigit() for char in word) for word in cleaned_name):
         return False

     # add optional set lookups
     if keywords is not None:
         # return false if first word is not in keywords set
-        if
+        if cleaned_name[0] not in keywords:
             return False

-
     return True

 def get_full_names(text,keywords=None):
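
Reassembled from the new side of the hunk, the updated validator strips trailing punctuation before applying the existing all-caps, digit, and keyword checks. The example calls are illustrative and the keyword set is hypothetical:

def validate_full_name(full_name, keywords):
    if len(full_name) == 1:
        return False

    # Clean punctuation before validation
    cleaned_name = [word.rstrip(".,;:!?()[]") for word in full_name]

    # Skip validation if cleaning removed everything
    if not all(cleaned_name):
        return False

    if all(word.isupper() for word in cleaned_name):
        return False

    if any(any(char.isdigit() for char in word) for word in cleaned_name):
        return False

    if keywords is not None:
        if cleaned_name[0] not in keywords:
            return False

    return True

print(validate_full_name(['Mr.', 'Tim', 'Cook,'], keywords={'Mr'}))  # True: trailing punctuation no longer blocks the keyword check
print(validate_full_name(['IBM', 'CORP'], None))                     # False: all-caps tokens are rejected
print(validate_full_name(['John', '3rd'], None))                     # False: tokens containing digits are rejected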

setup.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.2.4",
+    version="2.2.6",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",