PyPI - datamule - Versions diffs - 2.2.4__py3-none-any.whl → 2.2.6__py3-none-any.whl - Mend

datamule 2.2.4__py3-none-any.whl → 2.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

datamule/document/document.py +33 -73
datamule/submission.py +10 -0
datamule/tags/config.py +6 -0
datamule/tags/utils.py +14 -8
{datamule-2.2.4.dist-info → datamule-2.2.6.dist-info}/METADATA +1 -1
{datamule-2.2.4.dist-info → datamule-2.2.6.dist-info}/RECORD +8 -8
{datamule-2.2.4.dist-info → datamule-2.2.6.dist-info}/WHEEL +0 -0
{datamule-2.2.4.dist-info → datamule-2.2.6.dist-info}/top_level.txt +0 -0

datamule/document/document.py CHANGED Viewed

@@ -3,7 +3,7 @@ import csv
 import re
 from doc2dict import xml2dict, txt2dict, dict2dict
 from doc2dict.mapping import flatten_hierarchy
-from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
+from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict, flatten_dict
 from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
 from ..mapping_dicts.xml_mapping_dicts import dict_345
 from ..mapping_dicts.html_mapping_dicts import *
@@ -221,7 +221,6 @@ class Tags(TextAnalysisBase):
         if not hasattr(self, '_persons'):
             self._persons = []
             sources = self._get_text_sources()
             for source in sources:
                 if '8k_2024_persons' in self.processors:
                     results = get_full_names_dictionary_lookup(source['text'], self.processors['8k_2024_persons'])
@@ -289,71 +288,17 @@ class Document:
         self._data = None
         self._tables = None
         self._text = None
+        self._markdown = None
+        # booleans
+        self._data_bool = self.extension in ('.htm', '.html','.txt')
+        self._text_bool = self._data_bool
+        self._markdown_bool = self._data_bool
+        self._visualize_bool = self._data_bool
+        self._tables_bool = self.extension in ('.xml')
-    #_load_text_content
-    def _preprocess_txt_content(self):
-            self._text = self.content.decode().translate(str.maketrans({
-                '\xa0': ' ', '\u2003': ' ',
-                '\u2018': "'", '\u2019': "'",
-                '\u201c': '"', '\u201d': '"'
-            }))
-    # needs work
-    def _preprocess_html_content(self):
-        parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
-        # Remove hidden elements first
-        hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
-        for node in hidden_nodes:
-            node.decompose()
-        blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
-        lines = []
-        current_line = []
-        def flush_line():
-            if current_line:
-                # Don't add spaces between adjacent spans
-                lines.append(''.join(current_line))
-                current_line.clear()
-        for node in parser.root.traverse(include_text=True):
-            if node.tag in ('script', 'style', 'css'):
-                continue
-            if node.tag in blocks:
-                flush_line()
-                lines.append('')
-            if node.text_content:
-                text = node.text_content.strip()
-                if text:
-                    if node.tag in blocks:
-                        flush_line()
-                        lines.append(text)
-                        lines.append('')
-                    else:
-                        # Only add space if nodes aren't directly adjacent
-                        if current_line and not current_line[-1].endswith(' '):
-                            if node.prev and node.prev.text_content:
-                                if node.parent != node.prev.parent or node.prev.next != node:
-                                    current_line.append(' ')
-                        current_line.append(text)
-        flush_line()
-        text = '\n'.join(lines)
-        while '\n\n\n' in text:
-            text = text.replace('\n\n\n', '\n\n')
-        self._text = text.translate(str.maketrans({
-            '\xa0': ' ', '\u2003': ' ',
-            '\u2018': "'", '\u2019': "'",
-            '\u201c': '"', '\u201d': '"'
-        }))
     def contains_string(self, pattern):
         """Works for select files"""
         if self.extension in ['.htm', '.html', '.txt','.xml']:
@@ -485,17 +430,21 @@ class Document:
     @property
     def text(self):
-        if self._text is None:
-            if self.extension in ['.htm','.html']:
-                self._preprocess_html_content()  # Still sets self._text to plain string
-            elif self.extension == '.txt':
-                self._preprocess_txt_content()   # Still sets self._text to plain string
-            # Convert the plain string to TextWithTags
-            plain_text = self._text
-            self._text = TextWithTags(plain_text, self)
+        if self._text_bool:
+            if self._text is None:
+                text = flatten_dict(self.data,'text')
+                self._text = TextWithTags(text, self)
         return self._text
+    @property
+    def markdown(self):
+        if self._markdown_bool:
+            if self._markdown is None:
+                self._markdown = flatten_dict(self.data,'markdown')
+        return self._markdown
     def write_json(self, output_filename=None):
         if not self.data:
             self.parse()
@@ -544,6 +493,17 @@ class Document:
                     writer.writeheader()
                     writer.writerows(table.data)
+    def reset_nlp(self):
+        """Reset all NLP analysis by creating fresh wrapper objects"""
+        # Reset data wrapper
+        if hasattr(self, '_data') and self._data is not None:
+            raw_data = dict(self._data)  # Extract the underlying dict
+            self._data = DataWithTags(raw_data, self)
+        # Reset text wrapper
+        if hasattr(self, '_text') and self._text is not None:
+            raw_text = str(self._text)  # Extract the underlying string
+            self._text = TextWithTags(raw_text, self)
     def _document_to_section_text(self, document_data, parent_key=''):
         items = []

datamule/submission.py CHANGED Viewed

@@ -121,6 +121,16 @@ class Submission:
             self.accession = self.metadata.content['accession-number']
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
+        # booleans
+        self._has_xbrl = any(
+                doc['type'] in ('EX-100.INS', 'EX-101.INS') or
+                doc.get('filename', '').endswith('_htm.xml')
+                for doc in self.metadata.content['documents']
+            )
+        self._has_fundamentals = self._has_xbrl
     def _load_document_by_index(self, idx):
         """Load a document by its index in the metadata documents list."""
         doc = self.metadata.content['documents'][idx]

datamule/tags/config.py CHANGED Viewed

@@ -3,6 +3,12 @@ from .dictionaries import download_dictionary, load_dictionary
 _active_dictionaries = []
 _loaded_dictionaries = {}
+def clear_dictionaries():
+    """Remove all active dictionaries"""
+    global _active_dictionaries, _loaded_dictionaries
+    _active_dictionaries = []
+    _loaded_dictionaries = {}
 def set_dictionaries(dictionaries, overwrite=False):
     """Set active dictionaries and load them into memory"""
     global _active_dictionaries, _loaded_dictionaries

datamule/tags/utils.py CHANGED Viewed

@@ -67,25 +67,31 @@ def get_ticker_regex_dict():
     return dict(ticker_regex_list)
 # will change in future to accomodate other datasets
-def validate_full_name(full_name,keywords):
+def validate_full_name(full_name, keywords):
     if len(full_name) == 1:
         return False
-    # check all is upper
-    if all(word.isupper() for word in full_name):
+    # Clean punctuation before validation
+    cleaned_name = [word.rstrip(".,;:!?()[]") for word in full_name]
+    # Skip validation if cleaning removed everything
+    if not all(cleaned_name):
         return False
-    # check if any number in word
-    if any(any(char.isdigit() for char in word) for word in full_name):
+    # Apply existing checks to cleaned words
+    if all(word.isupper() for word in cleaned_name):
         return False
-    if any(any(char in ".,;:!?()[]" for char in word) for word in full_name):
+    # check if any number in word
+    if any(any(char.isdigit() for char in word) for word in cleaned_name):
         return False
     # add optional set lookups
     if keywords is not None:
         # return false if first word is not in keywords set
-        if full_name[0] not in keywords:
+        if cleaned_name[0] not in keywords:
             return False
     return True
 def get_full_names(text,keywords=None):

{datamule-2.2.4.dist-info → datamule-2.2.6.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.4
+Version: 2.2.6
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman

{datamule-2.2.4.dist-info → datamule-2.2.6.dist-info}/RECORD RENAMED Viewed

@@ -7,7 +7,7 @@ datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,9
 datamule/portfolio.py,sha256=0-E1ZSEjJ8hba7HxF8oCrRneNuF_KKISOY6K4dRg0Cg,12282
 datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
 datamule/sheet.py,sha256=KD7yAgSB8BE-Z4GDuH58IV-2DJ673nMcEsrCyJbeYp8,10707
-datamule/submission.py,sha256=TdQDfFjOKXy2qAZcD6hc9kjDSxmuZLqk8WRhtMjjC-g,15822
+datamule/submission.py,sha256=phHmi9ScjWHtVLjEoEdAO7RieUSKN5gPr0onfg5R8wE,16139
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
 datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/datamule/datamule_lookup.py,sha256=e8djAg-ctSyHiKk7BjbtgugZ3p8roUjzsym5z3AihUg,9468
@@ -15,7 +15,7 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
 datamule/datamule/downloader.py,sha256=B22ULAuYzclxxVCH4DsLWUIyFUC5Iep-Hl1W3RgCfeg,18580
 datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
 datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/document/document.py,sha256=oOib-bFPZ0rsIk8WBgBVY73CwuU18MZDmXnAQ8fTVD8,26124
+datamule/document/document.py,sha256=AuF5JSVjFHA2w5JoLq8zG1UOq906PvJNcp50Qia--fE,24521
 datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
 datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
@@ -50,14 +50,14 @@ datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
 datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
 datamule/sentiment/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/tags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/tags/config.py,sha256=w7386pyvnWYPNwgMVT_Nw5ivXibOeFuSuMEI7lRsGrk,1495
+datamule/tags/config.py,sha256=rxawvOBDT2v72Aw-VkmnUOLsKSAIrZBrjz_E0hPU7MY,1677
 datamule/tags/dictionaries.py,sha256=1v2OoN1KnM3HbFHxATxe7LhVRoXe64ecRRgA3oak210,4587
 datamule/tags/regex.py,sha256=Zr1dlnb8OfecDkI2DFCI8DUBr9LI50fapQyBAYNEZrg,4487
-datamule/tags/utils.py,sha256=hexmz_3YnoPrC98A5DTz1xa8o58xZ1yKbzQYP1XiQts,6100
+datamule/tags/utils.py,sha256=6B0jtwiFMQAU5mmdqWX_ZRa76uREY-DUBdM_ttt9cXk,6261
 datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
 datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
-datamule-2.2.4.dist-info/METADATA,sha256=SD47CDv1rjDKzI0GukLS7HEAEPN45RlQ5rqZauG1YJE,585
-datamule-2.2.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-datamule-2.2.4.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
-datamule-2.2.4.dist-info/RECORD,,
+datamule-2.2.6.dist-info/METADATA,sha256=lY7IAgOEQ9TUlWaKRhypyBfRIXS3jmr5q9sEHOgaYfg,585
+datamule-2.2.6.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-2.2.6.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-2.2.6.dist-info/RECORD,,

{datamule-2.2.4.dist-info → datamule-2.2.6.dist-info}/WHEEL RENAMED Viewed

File without changes

{datamule-2.2.4.dist-info → datamule-2.2.6.dist-info}/top_level.txt RENAMED Viewed

File without changes

datamule 2.2.4__py3-none-any.whl → 2.2.6__py3-none-any.whl

datamule 2.2.4__py3-none-any.whl → 2.2.6__py3-none-any.whl