datamule-2.1.1.tar.gz → datamule-2.1.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-2.1.1 → datamule-2.1.3}/PKG-INFO +1 -1
- {datamule-2.1.1 → datamule-2.1.3}/datamule/datamule/downloader.py +5 -1
- {datamule-2.1.1 → datamule-2.1.3}/datamule/document/document.py +31 -10
- {datamule-2.1.1 → datamule-2.1.3}/datamule/mapping_dicts/html_mapping_dicts.py +2 -2
- {datamule-2.1.1 → datamule-2.1.3}/datamule/submission.py +6 -6
- {datamule-2.1.1 → datamule-2.1.3}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-2.1.1 → datamule-2.1.3}/setup.py +1 -1
- {datamule-2.1.1 → datamule-2.1.3}/datamule/__init__.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/config.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/datamule/__init__.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/datamule/datamule_lookup.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/datamule/datamule_mysql_rds.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/datamule/sec_connector.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/document/__init__.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/document/tables/__init__.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/document/tables/tables.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/document/tables/tables_13fhr.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/document/tables/tables_25nse.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/document/tables/tables_informationtable.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/document/tables/tables_npx.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/document/tables/tables_ownership.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/document/tables/tables_sbsef.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/document/tables/tables_sdr.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/document/tables/utils.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/helper.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/index.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/package_updater.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/portfolio.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/portfolio_compression_utils.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/sec/__init__.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/sec/utils.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/seclibrary/__init__.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/seclibrary/bq.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/sheet.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/utils/__init__.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/utils/construct_submissions_data.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule/utils/format_accession.py +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule.egg-info/SOURCES.txt +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule.egg-info/requires.txt +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/datamule.egg-info/top_level.txt +0 -0
- {datamule-2.1.1 → datamule-2.1.3}/setup.cfg +0 -0
datamule/datamule/downloader.py

```diff
@@ -23,7 +23,11 @@ from ..utils.format_accession import format_accession
 # could be cleaned up
 
 # Set up logging
-logging.basicConfig(
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=logging.getLogger().handlers,
+)
 logger = logging.getLogger(__name__)
 
 
```
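This rework leans on standard-library behavior: `logging.basicConfig` only touches the root logger when it has no handlers yet, so an application that configures logging before importing the downloader keeps its own setup. A small standalone sketch of that behavior (the logger name and messages are illustrative, not from the package):

```python
import logging

# Application-side configuration, done first
logging.basicConfig(level=logging.WARNING, format='%(levelname)s %(message)s')

# A later call like the one in the diff is a no-op here,
# because the root logger already has a handler
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

logger = logging.getLogger('datamule.downloader')  # name assumed for illustration
logger.warning('rate limited')     # emitted with the application's format
logger.info('starting download')   # suppressed: root level is still WARNING
```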
datamule/document/document.py

```diff
@@ -12,6 +12,7 @@ from selectolax.parser import HTMLParser
 from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
+import tempfile
 
 from .tables.tables import Tables
 
@@ -36,18 +37,19 @@ class Document:
         # this will be filled by parsed
         self._data = None
         self._tables = None
+        self._text = None
 
 
 
     #_load_text_content
     def _preprocess_txt_content(self):
-
+        self._text = self.content.decode().translate(str.maketrans({
             '\xa0': ' ', '\u2003': ' ',
             '\u2018': "'", '\u2019': "'",
             '\u201c': '"', '\u201d': '"'
         }))
 
-    #
+    # needs work
     def _preprocess_html_content(self):
         parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
 
@@ -95,7 +97,7 @@ class Document:
         while '\n\n\n' in text:
             text = text.replace('\n\n\n', '\n\n')
 
-
+        self._text = text.translate(str.maketrans({
             '\xa0': ' ', '\u2003': ' ',
             '\u2018': "'", '\u2019': "'",
             '\u201c': '"', '\u201d': '"'
@@ -116,7 +118,7 @@ class Document:
         mapping_dict = None
 
         if self.extension == '.txt':
-            content = self.
+            content = self.text
             if self.type == '10-Q':
                 mapping_dict = dict_10q
             elif self.type == '10-K':
@@ -224,6 +226,15 @@ class Document:
             self.parse()
         return self._data
 
+    @property
+    def text(self):
+        if self._text is None:
+            if self.extension in ['.htm','.html']:
+                self._preprocess_html_content()
+            elif self.extension == '.txt':
+                self._preprocess_txt_content()
+        return self._text
+
     def write_json(self, output_filename=None):
         if not self.data:
             self.parse()
```
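The new `text` property decodes and normalizes the raw content lazily on first access and caches the result in `_text`. For readers unfamiliar with the pattern, here is a self-contained sketch of the same idea; the class and sample string are illustrative stand-ins, not datamule's API:

```python
class LazyText:
    """Illustrative stand-in for the new Document.text behavior (not datamule's class)."""
    def __init__(self, content: bytes, extension: str):
        self.content = content
        self.extension = extension
        self._text = None

    @property
    def text(self):
        # decode once, cache, and normalize the same characters the diff targets
        if self._text is None and self.extension == '.txt':
            self._text = self.content.decode().translate(str.maketrans({
                '\xa0': ' ', '\u2003': ' ',
                '\u2018': "'", '\u2019': "'",
                '\u201c': '"', '\u201d': '"'
            }))
        return self._text


doc = LazyText('\u201cRisk Factors\u201d overview'.encode('utf-8'), '.txt')
print(doc.text)  # "Risk Factors" overview  -- curly quotes replaced with ASCII quotes
```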
```diff
@@ -308,18 +319,28 @@ class Document:
             self.parse()
 
         if not self.data:
-
-            webbrowser.open('file://' + str(self.path))
-        else:
-            pass
+            pass
         else:
             visualize_dict(self.data)
 
-
+    # alpha feature
+    def open(self):
+        """Open the document. Experimental. Creates copy in temp, rather than use tar path for now."""
+        if self.extension in ['.htm', '.html','.txt','.jpg','.png', '.pdf']:
+            # Create a temporary file with the content and open it
+
+            with tempfile.NamedTemporaryFile(mode='wb', suffix=self.extension, delete=False) as f:
+                f.write(self.content)
+                temp_path = f.name
+            webbrowser.open('file://' + temp_path)
+        else:
+            print(f"Cannot open files with extension {self.extension}")
+
+    def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
         if not self.data:
             self.parse()
 
-        result = get_title(self.data,title)
+        result = get_title(self.data,title=title,title_regex=title_regex,title_class=title_class)
 
         if format == 'text':
             result = [item[1] for item in result]
```
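The experimental `open()` writes the raw bytes to a named temporary file and points the default browser at it; because of `delete=False`, those copies are left behind for the OS temp cleaner. A minimal standalone illustration of the approach (not the package's code; `Path.as_uri()` is shown only as a portable way to build the `file://` URL):

```python
import tempfile
import webbrowser
from pathlib import Path

content = b"<html><body><p>hello</p></body></html>"  # stand-in for Document.content

with tempfile.NamedTemporaryFile(mode='wb', suffix='.html', delete=False) as f:
    f.write(content)
    temp_path = f.name

# Path.as_uri() produces a well-formed file:// URL on every platform
webbrowser.open(Path(temp_path).as_uri())
```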
datamule/mapping_dicts/html_mapping_dicts.py

```diff
@@ -1,7 +1,7 @@
 dict_10k_html = {
     ('part',r'^part\s*([ivx]+)$') : 0,
     ('signatures',r'^signatures?\.*$') : 0,
-    ('item',r'^item\s*(\d+)\.?([a-z])?') : 1,
+    ('item',r'^item\s*(\d+)\.?([a-z])?(?![a-z])') : 1,
 }
 dict_10q_html = dict_10k_html
 
```
```diff
@@ -48,7 +48,7 @@ dict_10d_html = dict_10k_html
 
 dict_20f_html = {
     ('part',r'^part\s*([ivx]+)') : 0,
-    ('item',r'^item\s*(\d+)\.?([a-z])?') : 1,
+    ('item',r'^item\s*(\d+)\.?([a-z])?(?![a-z])') : 1,
     ('letter',r'\d*\.?([a-z])') : 2,
     ('signatures',r'^signatures?\.*$') : 0,
 }
```
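In both dictionaries the added `(?![a-z])` negative lookahead keeps the optional sub-item letter from swallowing the first character of a word that runs directly into the item number. A quick check with hypothetical heading strings:

```python
import re

old = re.compile(r'^item\s*(\d+)\.?([a-z])?')
new = re.compile(r'^item\s*(\d+)\.?([a-z])?(?![a-z])')

print(old.match('item 1a. risk factors').groups())  # ('1', 'a') -- both versions agree
print(new.match('item 1a. risk factors').groups())  # ('1', 'a')

# Heading with no space after the number (hypothetical input):
print(old.match('item 7management discussion').groups())  # ('7', 'm') -- 'm' wrongly taken as a sub-item letter
print(new.match('item 7management discussion'))           # None -- lookahead rejects the stray letter
```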
datamule/submission.py

```diff
@@ -163,8 +163,8 @@ class Submission:
             content = zstd.ZstdDecompressor().decompress(content)
 
         # Decode text files
-        if extension in ['.htm', '.html', '.txt', '.xml']:
-            content = content.decode('utf-8', errors='replace')
+        # if extension in ['.htm', '.html', '.txt', '.xml']:
+        #     content = content.decode('utf-8', errors='replace')
 
         document_path = f"{self.batch_tar_path}::{self.accession_prefix}/{filename}"
 
@@ -197,8 +197,8 @@ class Submission:
             content = zstd.ZstdDecompressor().decompress(content)
 
         # Decode text files
-        if extension in ['.htm', '.html', '.txt', '.xml']:
-            content = content.decode('utf-8', errors='replace')
+        # if extension in ['.htm', '.html', '.txt', '.xml']:
+        #     content = content.decode('utf-8', errors='replace')
 
         document_path = f"{self.path}::{actual_filename}"
 
@@ -219,8 +219,8 @@ class Submission:
             content = zstd.ZstdDecompressor().decompress(content)
 
         # Decode text files
-        if extension in ['.htm', '.html', '.txt', '.xml']:
-            content = content.decode('utf-8', errors='replace')
+        # if extension in ['.htm', '.html', '.txt', '.xml']:
+        #     content = content.decode('utf-8', errors='replace')
 
         return Document(
             type=doc['type'],
```
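With the eager decode commented out, `Submission` now passes raw bytes to `Document` for every extension and leaves decoding to `Document.text` or to the caller. A hedged sketch of what downstream code may need to account for (the helper below is illustrative, not part of the package):

```python
def as_text(content) -> str:
    """Return str for document content that may now arrive as raw bytes (sketch, not datamule API)."""
    if isinstance(content, bytes):
        # mirrors the decode the old Submission code applied eagerly
        return content.decode('utf-8', errors='replace')
    return content


print(as_text(b'<html><body>10-K</body></html>'))
print(as_text('already decoded'))
```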
setup.py

```diff
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.1.1",
+    version="2.1.3",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",
```