PyPI - datamule - Versions diffs - 2.2.5__py3-none-any.whl → 2.2.7__py3-none-any.whl - Mend

datamule 2.2.5__py3-none-any.whl → 2.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

datamule/document/document.py CHANGED Viewed

@@ -3,7 +3,7 @@ import csv
 import re
 from doc2dict import xml2dict, txt2dict, dict2dict
 from doc2dict.mapping import flatten_hierarchy
-from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict
+from doc2dict import html2dict, visualize_dict, get_title, unnest_dict, pdf2dict, flatten_dict
 from ..mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
 from ..mapping_dicts.xml_mapping_dicts import dict_345
 from ..mapping_dicts.html_mapping_dicts import *
@@ -288,77 +288,17 @@ class Document:
         self._data = None
         self._tables = None
         self._text = None
+        self._markdown = None
         # booleans
-        self._text_bool = self.extension in ('.htm', '.html','.txt')
         self._data_bool = self.extension in ('.htm', '.html','.txt')
+        self._text_bool = self._data_bool
+        self._markdown_bool = self._data_bool
         self._visualize_bool = self._data_bool
         self._tables_bool = self.extension in ('.xml')
-    #_load_text_content
-    def _preprocess_txt_content(self):
-            self._text = self.content.decode().translate(str.maketrans({
-                '\xa0': ' ', '\u2003': ' ',
-                '\u2018': "'", '\u2019': "'",
-                '\u201c': '"', '\u201d': '"'
-            }))
-    # needs work
-    def _preprocess_html_content(self):
-        parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
-        # Remove hidden elements first
-        hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
-        for node in hidden_nodes:
-            node.decompose()
-        blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
-        lines = []
-        current_line = []
-        def flush_line():
-            if current_line:
-                # Don't add spaces between adjacent spans
-                lines.append(''.join(current_line))
-                current_line.clear()
-        for node in parser.root.traverse(include_text=True):
-            if node.tag in ('script', 'style', 'css'):
-                continue
-            if node.tag in blocks:
-                flush_line()
-                lines.append('')
-            if node.text_content:
-                text = node.text_content.strip()
-                if text:
-                    if node.tag in blocks:
-                        flush_line()
-                        lines.append(text)
-                        lines.append('')
-                    else:
-                        # Only add space if nodes aren't directly adjacent
-                        if current_line and not current_line[-1].endswith(' '):
-                            if node.prev and node.prev.text_content:
-                                if node.parent != node.prev.parent or node.prev.next != node:
-                                    current_line.append(' ')
-                        current_line.append(text)
-        flush_line()
-        text = '\n'.join(lines)
-        while '\n\n\n' in text:
-            text = text.replace('\n\n\n', '\n\n')
-        self._text = text.translate(str.maketrans({
-            '\xa0': ' ', '\u2003': ' ',
-            '\u2018': "'", '\u2019': "'",
-            '\u201c': '"', '\u201d': '"'
-        }))
     def contains_string(self, pattern):
         """Works for select files"""
         if self.extension in ['.htm', '.html', '.txt','.xml']:
@@ -477,30 +417,35 @@ class Document:
     @property
     def data(self):
-        if self._data is None:
-            self.parse()
+        if self._data_bool:
+            if self._data is None:
+                self.parse()
-        if self._data is None:
-            self._data = {}
-        if not isinstance(self._data, DataWithTags):
-            self._data = DataWithTags(self._data, self)
+            if self._data is None:
+                self._data = {}
+            if not isinstance(self._data, DataWithTags):
+                self._data = DataWithTags(self._data, self)
         return self._data
     @property
     def text(self):
-        if self._text is None:
-            if self.extension in ['.htm','.html']:
-                self._preprocess_html_content()  # Still sets self._text to plain string
-            elif self.extension == '.txt':
-                self._preprocess_txt_content()   # Still sets self._text to plain string
-            # Convert the plain string to TextWithTags
-            plain_text = self._text
-            self._text = TextWithTags(plain_text, self)
+        if self._text_bool:
+            if self._text is None:
+                text = flatten_dict(self.data,'text')
+                self._text = TextWithTags(text, self)
         return self._text
+    @property
+    def markdown(self):
+        if self._markdown_bool:
+            if self._markdown is None:
+                self._markdown = flatten_dict(self.data,'markdown')
+        return self._markdown
     def write_json(self, output_filename=None):
         if not self.data:
             self.parse()
@@ -612,18 +557,16 @@ class Document:
             webbrowser.open('file://' + temp_path)
         else:
             print(f"Cannot open files with extension {self.extension}")
     def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
-        if not self.data:
-            self.parse()
+        if self._data_bool:
+            if not self.data:
+                self.parse()
-        result = get_title(self.data,title=title,title_regex=title_regex,title_class=title_class)
-        if format == 'text':
-            result = [item[1] for item in result]
-            result = [unnest_dict(item) for item in result]
-        return result
+            result = get_title(self.data,title=title,title_regex=title_regex,title_class=title_class)
+            if format == 'dict':
+                return [item[1] for item in result]
+            else:
+                return [flatten_dict(item[1],format) for item in result]
    # TODO CHANGE THIS

{datamule-2.2.5.dist-info → datamule-2.2.7.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.5
+Version: 2.2.7
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman

{datamule-2.2.5.dist-info → datamule-2.2.7.dist-info}/RECORD RENAMED Viewed

@@ -15,7 +15,7 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
 datamule/datamule/downloader.py,sha256=B22ULAuYzclxxVCH4DsLWUIyFUC5Iep-Hl1W3RgCfeg,18580
 datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
 datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/document/document.py,sha256=msIMoLdxjcwdMv4ijwCMLutySk2-5BvGU266nWQkzg4,26909
+datamule/document/document.py,sha256=tFsNUMVeBvx_3Td5bKPMlEJGjyzQtac4tui8jk2PusE,24629
 datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
 datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
@@ -57,7 +57,7 @@ datamule/tags/utils.py,sha256=6B0jtwiFMQAU5mmdqWX_ZRa76uREY-DUBdM_ttt9cXk,6261
 datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
 datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
-datamule-2.2.5.dist-info/METADATA,sha256=Mm0hhgixEljkpYk__oV2nIUe9ceglvbdhJr0lgEZ_b0,585
-datamule-2.2.5.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-datamule-2.2.5.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
-datamule-2.2.5.dist-info/RECORD,,
+datamule-2.2.7.dist-info/METADATA,sha256=WMBfQuS6vgKcVlP04FGpD0BWDMU2nRaMVU_lsFQd9T4,585
+datamule-2.2.7.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-2.2.7.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-2.2.7.dist-info/RECORD,,

{datamule-2.2.5.dist-info → datamule-2.2.7.dist-info}/WHEEL RENAMED Viewed

File without changes

{datamule-2.2.5.dist-info → datamule-2.2.7.dist-info}/top_level.txt RENAMED Viewed

File without changes

datamule 2.2.5__py3-none-any.whl → 2.2.7__py3-none-any.whl

datamule 2.2.5__py3-none-any.whl → 2.2.7__py3-none-any.whl