datamule 2.1.2__tar.gz → 2.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-2.1.2 → datamule-2.1.4}/PKG-INFO +1 -1
- datamule-2.1.4/datamule/datasets.py +49 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/document.py +29 -8
- {datamule-2.1.2 → datamule-2.1.4}/datamule/portfolio.py +10 -6
- {datamule-2.1.2 → datamule-2.1.4}/datamule/submission.py +4 -4
- {datamule-2.1.2 → datamule-2.1.4}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-2.1.2 → datamule-2.1.4}/datamule.egg-info/SOURCES.txt +1 -0
- {datamule-2.1.2 → datamule-2.1.4}/setup.py +1 -1
- {datamule-2.1.2 → datamule-2.1.4}/datamule/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/config.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/datamule/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/datamule/datamule_lookup.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/datamule/datamule_mysql_rds.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/datamule/downloader.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/datamule/sec_connector.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_13fhr.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_25nse.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_informationtable.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_npx.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_ownership.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_sbsef.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/tables_sdr.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/document/tables/utils.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/helper.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/index.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/package_updater.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/portfolio_compression_utils.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/utils.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/seclibrary/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/seclibrary/bq.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/sheet.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/utils/__init__.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/utils/construct_submissions_data.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule/utils/format_accession.py +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule.egg-info/requires.txt +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/datamule.egg-info/top_level.txt +0 -0
- {datamule-2.1.2 → datamule-2.1.4}/setup.cfg +0 -0
```diff
--- /dev/null
+++ datamule-2.1.4/datamule/datasets.py
@@ -0,0 +1,49 @@
+# datamule/datasets.py
+from pathlib import Path
+import requests
+import gzip
+import shutil
+import csv
+
+# Dataset URLs
+DATASET_URLS = {
+    "cik_cusip_crosswalk": "https://github.com/john-friedman/datamule-data/raw/refs/heads/master/data/datasets/cik_cusip_crosswalk.csv.gz"
+}
+
+def update_dataset(name):
+    """Force update a dataset by re-downloading it."""
+    return _get_dataset(name, update=True)
+
+def _get_dataset(name, update=False):
+    """Internal function to get dataset as list of dicts, downloading if necessary."""
+    if name not in DATASET_URLS:
+        raise ValueError(f"Unknown dataset: {name}")
+
+    url = DATASET_URLS[name]
+    data_dir = Path.home() / ".datamule" / "datasets"
+    file_path = data_dir / f"{name}.csv"
+
+    if not file_path.exists() or update:
+        print(f"Downloading {name}...")
+        data_dir.mkdir(parents=True, exist_ok=True)
+
+        response = requests.get(url, stream=True)
+        response.raise_for_status()
+
+        gz_path = file_path.with_suffix('.csv.gz')
+        with open(gz_path, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+
+        with gzip.open(gz_path, 'rb') as f_in:
+            with open(file_path, 'wb') as f_out:
+                shutil.copyfileobj(f_in, f_out)
+
+        gz_path.unlink()
+
+    # Read CSV and return as list of dicts
+    with open(file_path, 'r') as f:
+        return list(csv.DictReader(f))
+
+# Dataset available as list of dicts on import
+cik_cusip_crosswalk = _get_dataset("cik_cusip_crosswalk")
```
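A minimal usage sketch for the new module (the crosswalk's column names depend on the CSV header, which is not shown in this diff):

```python
from datamule.datasets import cik_cusip_crosswalk, update_dataset

# Downloaded to ~/.datamule/datasets/cik_cusip_crosswalk.csv on first import,
# then served from the local cache. Each row is a dict keyed by the CSV header.
print(len(cik_cusip_crosswalk))
print(cik_cusip_crosswalk[0])

# Force a fresh download if the upstream file has been updated.
cik_cusip_crosswalk = update_dataset("cik_cusip_crosswalk")
```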
```diff
--- datamule-2.1.2/datamule/document/document.py
+++ datamule-2.1.4/datamule/document/document.py
@@ -12,6 +12,7 @@ from selectolax.parser import HTMLParser
 from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
+import tempfile
 
 from .tables.tables import Tables
 
```
```diff
@@ -36,18 +37,19 @@ class Document:
         # this will be filled by parsed
         self._data = None
         self._tables = None
+        self._text = None
 
 
 
     #_load_text_content
     def _preprocess_txt_content(self):
-
+        self._text = self.content.decode().translate(str.maketrans({
             '\xa0': ' ', '\u2003': ' ',
             '\u2018': "'", '\u2019': "'",
             '\u201c': '"', '\u201d': '"'
         }))
 
-    #
+    # needs work
     def _preprocess_html_content(self):
         parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
 
```
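Both preprocessors share the same str.maketrans table, which maps non-breaking/em spaces and curly quotes to ASCII equivalents. Shown standalone:

```python
# Identical mapping to the one in _preprocess_txt_content / _preprocess_html_content.
table = str.maketrans({
    '\xa0': ' ', '\u2003': ' ',    # non-breaking space, em space -> space
    '\u2018': "'", '\u2019': "'",  # curly single quotes -> apostrophe
    '\u201c': '"', '\u201d': '"',  # curly double quotes -> straight quotes
})
print('\u201cIt\u2019s\u00a0done\u201d'.translate(table))  # "It's done"
```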
```diff
@@ -95,7 +97,7 @@ class Document:
         while '\n\n\n' in text:
             text = text.replace('\n\n\n', '\n\n')
 
-
+        self._text = text.translate(str.maketrans({
             '\xa0': ' ', '\u2003': ' ',
             '\u2018': "'", '\u2019': "'",
             '\u201c': '"', '\u201d': '"'
```
```diff
@@ -116,7 +118,7 @@ class Document:
         mapping_dict = None
 
         if self.extension == '.txt':
-            content = self.
+            content = self.text
             if self.type == '10-Q':
                 mapping_dict = dict_10q
             elif self.type == '10-K':
```
```diff
@@ -224,6 +226,15 @@ class Document:
             self.parse()
         return self._data
 
+    @property
+    def text(self):
+        if self._text is None:
+            if self.extension in ['.htm','.html']:
+                self._preprocess_html_content()
+            elif self.extension == '.txt':
+                self._preprocess_txt_content()
+        return self._text
+
     def write_json(self, output_filename=None):
         if not self.data:
             self.parse()
```
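The new text property makes extraction lazy and cached: the first access runs the HTML or TXT preprocessor and stores the result in self._text, and later accesses reuse it. A sketch of the call pattern (the constructor arguments mirror the Document(...) call visible in submission.py below and are illustrative only):

```python
# Illustrative: Documents are normally constructed by Submission/Portfolio.
doc = Document(type='10-K', content=html_bytes, extension='.htm',
               filing_date='2024-01-02', accession='0000000000-24-000001', path=None)

plain = doc.text   # first access: parses the HTML, caches into self._text
plain = doc.text   # subsequent accesses: returned from the cache
```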
```diff
@@ -308,13 +319,23 @@ class Document:
             self.parse()
 
         if not self.data:
-
-            webbrowser.open('file://' + str(self.path))
-        else:
-            pass
+            pass
         else:
             visualize_dict(self.data)
 
+    # alpha feature
+    def open(self):
+        """Open the document. Experimental. Creates copy in temp, rather than use tar path for now."""
+        if self.extension in ['.htm', '.html','.txt','.jpg','.png', '.pdf']:
+            # Create a temporary file with the content and open it
+
+            with tempfile.NamedTemporaryFile(mode='wb', suffix=self.extension, delete=False) as f:
+                f.write(self.content)
+                temp_path = f.name
+            webbrowser.open('file://' + temp_path)
+        else:
+            print(f"Cannot open files with extension {self.extension}")
+
     def get_section(self, title=None, title_regex=None,title_class=None, format='dict'):
         if not self.data:
             self.parse()
```
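The open() helper's temp-file pattern in isolation, as a standalone sketch. delete=False is what lets the browser read the file after the with block closes it; the trade-off is that the temporary copy is left on disk:

```python
import tempfile
import webbrowser

content = b"<html><body><h1>Hello</h1></body></html>"
with tempfile.NamedTemporaryFile(mode='wb', suffix='.html', delete=False) as f:
    f.write(content)
    temp_path = f.name  # keep the path; the file survives because delete=False
webbrowser.open('file://' + temp_path)
```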
```diff
--- datamule-2.1.2/datamule/portfolio.py
+++ datamule-2.1.4/datamule/portfolio.py
@@ -96,12 +96,16 @@ class Portfolio:
             # Create submissions for each accession
             submissions = []
             for accession_prefix in accession_prefixes:
-
-
-
-
-
-
+                try:
+                    submission = Submission(
+                        batch_tar_path=batch_tar_path,
+                        accession_prefix=accession_prefix,
+                        portfolio_ref=self
+                    )
+                    submissions.append(submission)
+                except Exception as e:
+                    pass
+                    #print(f"Path: {batch_tar_path}. Exception: {e}")
                 pbar.update(1) # Update progress for each successful submission
 
             return submissions
```
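A minimal, self-contained sketch of the skip-on-error pattern introduced here (names are illustrative): entries that raise are dropped silently, so the returned list can be shorter than the input. The commented-out print suggests the exception detail is meant to be re-enabled for debugging rather than surfaced by default.

```python
def parse(record):
    if record == "bad":
        raise ValueError("malformed submission")
    return record.upper()

records = ["good-1", "bad", "good-2"]
parsed = []
for record in records:
    try:
        parsed.append(parse(record))
    except Exception:
        pass  # skipped silently, as in Portfolio above
print(parsed)  # ['GOOD-1', 'GOOD-2']
```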
```diff
--- datamule-2.1.2/datamule/submission.py
+++ datamule-2.1.4/datamule/submission.py
@@ -12,6 +12,7 @@ import urllib.request
 from secxbrl import parse_inline_xbrl
 from company_fundamentals import construct_fundamentals
 from decimal import Decimal
+from .utils.format_accession import format_accession
 
 
 class Submission:
```
```diff
@@ -93,11 +94,10 @@ class Submission:
             # standardize metadata
             metadata = transform_metadata_string(metadata)
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
-
+
+            # lets just use accesion-prefix, to get around malformed metadata files (1995 has a lot!)
+            self.accession = format_accession(self.accession_prefix,'dash')
 
-            # Band-aid fix: some SGML files in the SEC are bad lol, so they have TWO header sections. Will fix post w/ my cleaned archive
-            if isinstance(self.accession,list):
-                self.accession = self.accession[0]
             #print(f"s: {self.metadata.content['accession-number']} : {batch_tar_path}")
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
 
```
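format_accession itself is not shown in this diff (datamule/utils/format_accession.py is unchanged, +0 -0 above). Based on its use with 'dash' here, a plausible, hypothetical sketch of what it computes, which may differ from the real helper:

```python
def format_accession(accession, style):
    # Hypothetical reimplementation: keep digits, zero-pad to 18,
    # then lay out as the dashed 10-2-6 SEC accession format.
    digits = ''.join(ch for ch in str(accession) if ch.isdigit()).zfill(18)
    if style == 'dash':
        return f"{digits[:10]}-{digits[10:12]}-{digits[12:]}"
    return digits

print(format_accession('000091205797006494', 'dash'))  # 0000912057-97-006494
```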
```diff
--- datamule-2.1.2/setup.py
+++ datamule-2.1.4/setup.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.1.2",
+    version="2.1.4",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",
```