datamule 0.427.tar.gz → 0.428.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {datamule-0.427 → datamule-0.428}/PKG-INFO +1 -1
  2. {datamule-0.427 → datamule-0.428}/datamule/__init__.py +3 -0
  3. datamule-0.428/datamule/config.py +29 -0
  4. {datamule-0.427 → datamule-0.428}/datamule/document.py +16 -4
  5. datamule-0.428/datamule/portfolio.py +78 -0
  6. {datamule-0.427 → datamule-0.428}/datamule/submission.py +10 -1
  7. {datamule-0.427 → datamule-0.428}/datamule.egg-info/PKG-INFO +1 -1
  8. {datamule-0.427 → datamule-0.428}/datamule.egg-info/SOURCES.txt +1 -0
  9. {datamule-0.427 → datamule-0.428}/setup.py +1 -1
  10. datamule-0.427/datamule/portfolio.py +0 -31
  11. {datamule-0.427 → datamule-0.428}/datamule/data/company_former_names.csv +0 -0
  12. {datamule-0.427 → datamule-0.428}/datamule/data/company_metadata.csv +0 -0
  13. {datamule-0.427 → datamule-0.428}/datamule/data/company_tickers.csv +0 -0
  14. {datamule-0.427 → datamule-0.428}/datamule/data/sec-glossary.csv +0 -0
  15. {datamule-0.427 → datamule-0.428}/datamule/data/xbrl_descriptions.csv +0 -0
  16. {datamule-0.427 → datamule-0.428}/datamule/dataset_builder/dataset_builder.py +0 -0
  17. {datamule-0.427 → datamule-0.428}/datamule/downloader/downloader.py +0 -0
  18. {datamule-0.427 → datamule-0.428}/datamule/downloader/premiumdownloader.py +0 -0
  19. {datamule-0.427 → datamule-0.428}/datamule/helper.py +0 -0
  20. {datamule-0.427 → datamule-0.428}/datamule/monitor.py +0 -0
  21. {datamule-0.427 → datamule-0.428}/datamule/mulebot/__init__.py +0 -0
  22. {datamule-0.427 → datamule-0.428}/datamule/mulebot/helper.py +0 -0
  23. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot.py +0 -0
  24. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/__init__.py +0 -0
  25. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/server.py +0 -0
  26. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/css/minimalist.css +0 -0
  27. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/artifacts.js +0 -0
  28. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/chat.js +0 -0
  29. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +0 -0
  30. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +0 -0
  31. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/main.js +0 -0
  32. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +0 -0
  33. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/suggestions.js +0 -0
  34. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +0 -0
  35. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/static/scripts/utils.js +0 -0
  36. {datamule-0.427 → datamule-0.428}/datamule/mulebot/mulebot_server/templates/chat-minimalist.html +0 -0
  37. {datamule-0.427 → datamule-0.428}/datamule/mulebot/search.py +0 -0
  38. {datamule-0.427 → datamule-0.428}/datamule/mulebot/tools.py +0 -0
  39. {datamule-0.427 → datamule-0.428}/datamule/packageupdater.py +0 -0
  40. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/basic_10k_parser.py +0 -0
  41. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/basic_10q_parser.py +0 -0
  42. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/basic_13d_parser.py +0 -0
  43. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/basic_13g_parser.py +0 -0
  44. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/basic_8k_parser.py +0 -0
  45. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/form_d_parser.py +0 -0
  46. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/generalized_item_parser.py +0 -0
  47. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/helper.py +0 -0
  48. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/information_table_parser_13fhr.py +0 -0
  49. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/insider_trading_parser.py +0 -0
  50. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/mappings.py +0 -0
  51. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/n_port_p_parser.py +0 -0
  52. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/sec_parser.py +0 -0
  53. {datamule-0.427 → datamule-0.428}/datamule/parser/document_parsing/sgml_parser.py +0 -0
  54. {datamule-0.427 → datamule-0.428}/datamule/parser/sgml_parsing/sgml_parser_cy.c +0 -0
  55. {datamule-0.427 → datamule-0.428}/datamule.egg-info/dependency_links.txt +0 -0
  56. {datamule-0.427 → datamule-0.428}/datamule.egg-info/requires.txt +1 -1
  57. {datamule-0.427 → datamule-0.428}/datamule.egg-info/top_level.txt +0 -0
  58. {datamule-0.427 → datamule-0.428}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 0.427
+Version: 0.428
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -39,6 +39,9 @@ def __getattr__(name):
     elif name == 'load_package_dataset':
         from .helper import load_package_dataset
         return load_package_dataset
+    elif name == 'Config':
+        from .config import Config
+        return Config
     raise AttributeError(f"module 'datamule' has no attribute '{name}'")
 
 # Lazy load nest_asyncio only when needed
@@ -0,0 +1,29 @@
+import json
+import os
+
+class Config:
+    def __init__(self):
+        self.config_path = os.path.expanduser("~/.datamule/config.json")
+        self._ensure_config_exists()
+
+    def _ensure_config_exists(self):
+        os.makedirs(os.path.dirname(self.config_path), exist_ok=True)
+        if not os.path.exists(self.config_path):
+            self._save_config({"default_source": None})
+
+    def _save_config(self, config):
+        with open(self.config_path, 'w') as f:
+            json.dump(config, f)
+
+    def set_default_source(self, source):
+        config = self._load_config()
+        config["default_source"] = source
+        self._save_config(config)
+
+    def get_default_source(self):
+        config = self._load_config()
+        return config.get("default_source")
+
+    def _load_config(self):
+        with open(self.config_path) as f:
+            return json.load(f)
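
The new Config class is a small persistent settings store backed by ~/.datamule/config.json. A minimal usage sketch, assuming datamule 0.428 is installed; the top-level import is confirmed by the __init__.py hunk above, and 'sec' / 'datamule' are the provider names that portfolio.py (below) checks:

    from datamule import Config

    config = Config()                      # creates ~/.datamule/config.json on first use
    config.set_default_source('datamule')  # persisted default provider for downloads
    print(config.get_default_source())     # -> 'datamule'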
@@ -1,7 +1,9 @@
 import json
 import csv
 from .parser.document_parsing.sec_parser import Parser
+from .parser.document_parsing.helper import load_file_content
 from .helper import convert_to_dashed_accession
+import re
 
 # we need to modify parse filing to take option in memory
 
@@ -10,12 +12,22 @@ parser = Parser()
 class Document:
     def __init__(self, type, filename):
         self.type = type
-        self.filename = filename
+        self.path = filename
 
         self.data = None
+        self.content = None
 
+    def contains_string(self, pattern):
+        """Currently only works for .htm, .html, and .txt files"""
+        if self.path.suffix in ['.htm', '.html', '.txt']:
+            if self.content is None:
+                self.content = load_file_content(self.path)
+            return bool(re.search(pattern, self.content))
+        return False
+
+    # Note: this method will be heavily modified in the future
     def parse(self):
-        self.data = parser.parse_filing(self.filename, self.type)
+        self.data = parser.parse_filing(self.path, self.type)
         return self.data
 
     def write_json(self, output_filename=None):
@@ -23,7 +35,7 @@ class Document:
             raise ValueError("No data to write. Parse filing first.")
 
         if output_filename is None:
-            output_filename = f"{self.filename.rsplit('.', 1)[0]}.json"
+            output_filename = f"{self.path.rsplit('.', 1)[0]}.json"
 
         with open(output_filename, 'w') as f:
             json.dump(self.data, f, indent=2)
@@ -33,7 +45,7 @@ class Document:
             raise ValueError("No data available. Please call parse_filing() first.")
 
         if output_filename is None:
-            output_filename = f"{self.filename.rsplit('.', 1)[0]}.csv"
+            output_filename = f"{self.path.rsplit('.', 1)[0]}.csv"
 
         with open(output_filename, 'w', newline='') as csvfile:
             if not self.data:
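
With these changes a Document caches its file content on first search and matches regular expressions against it. A hedged sketch of the new contains_string method; the filing path is hypothetical, and it must be a pathlib.Path because the method inspects path.suffix:

    from pathlib import Path
    from datamule.document import Document

    # Only .htm, .html, and .txt files are searched; anything else returns False
    doc = Document('8-K', Path('filings/0001234567-24-000001/exhibit99.htm'))
    if doc.contains_string(r'(?i)\bmerger\b'):  # pattern is passed to re.search
        print(doc.path)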
@@ -0,0 +1,78 @@
+from pathlib import Path
+from tqdm import tqdm
+from concurrent.futures import ProcessPoolExecutor
+from .submission import Submission
+from .downloader.premiumdownloader import PremiumDownloader
+from .downloader.downloader import Downloader
+from .config import Config
+
+class Portfolio:
+    def create(cls, path):
+        # This method handles the process pool lifecycle
+        with ProcessPoolExecutor() as executor:
+            portfolio = cls(path, executor)
+        return portfolio
+
+    def __init__(self, path, executor=None):
+        self.path = Path(path)
+        # check if path exists
+        if self.path.exists():
+            folders = [f for f in self.path.iterdir() if f.is_dir()]
+            print(f"Loading {len(folders)} submissions")
+
+            if executor is None:
+                # Fall back to sequential loading if no executor
+                self.submissions = [Submission(f) for f in tqdm(folders, desc="Loading submissions")]
+            else:
+                # Use provided executor for parallel loading
+                self.submissions = list(tqdm(
+                    executor.map(Submission, folders),
+                    total=len(folders),
+                    desc="Loading submissions"
+                ))
+
+        else:
+            pass
+
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None):
+        if provider is None:
+            config = Config()
+            provider = config.get_default_source()
+
+        if provider == 'sec':
+            downloader = Downloader()
+        elif provider == 'datamule':
+            downloader = PremiumDownloader()
+
+        downloader.download_submissions(output_dir=self.path, cik=cik, ticker=ticker, submission_type=submission_type, filing_date=filing_date
+        )
+    def __iter__(self):
+        return iter(self.submissions)
+
+    def document_type(self, document_types):
+        # Convert single document type to list for consistent handling
+        if isinstance(document_types, str):
+            document_types = [document_types]
+
+        for submission in self.submissions:
+            yield from submission.document_type(document_types)
+
+    def contains_string(self, pattern, document_types=None, executor=None):
+        def check_document(document):
+            return document if document.contains_string(pattern) else None
+
+        documents = list(self.document_type(document_types) if document_types else (
+            doc for sub in tqdm(self.submissions, desc="Collecting documents") for doc in sub
+        ))
+
+        if executor:
+            results = list(tqdm(
+                executor.map(check_document, documents),
+                total=len(documents),
+                desc=f"Searching for '{pattern}'"
+            ))
+            yield from (doc for doc in results if doc is not None)
+        else:
+            for document in tqdm(documents, desc=f"Searching for '{pattern}'"):
+                if document.contains_string(pattern):
+                    yield document
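
The new portfolio.py wires Config into downloads and adds corpus-wide regex search. A hedged usage sketch: 'filings' is a hypothetical folder that already holds downloaded submissions, and provider is passed explicitly because, as written, neither branch assigns a downloader when no default source is configured. Note also that create() lost the @classmethod decorator it had in 0.427 (see the deleted file below), so the plain constructor is used:

    from datamule.portfolio import Portfolio

    # Hypothetical folder of already-downloaded submissions
    portfolio = Portfolio('filings')

    # provider='sec' selects Downloader, 'datamule' selects PremiumDownloader
    portfolio.download_submissions(ticker='AAPL', submission_type='8-K', provider='sec')

    # Stream every 8-K document whose text matches a regex
    for document in portfolio.contains_string(r'(?i)acquisition', document_types='8-K'):
        print(document.path)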
@@ -64,4 +64,13 @@ class Submission:
                 continue
 
             document_path = self.path / filename
-            yield Document(doc['TYPE'], document_path)
+            yield Document(doc['TYPE'], document_path)
+
+    def __iter__(self):
+        for doc in self.metadata['documents']:
+            filename = doc.get('FILENAME')
+            if filename is None:
+                continue
+
+            document_path = self.path / filename
+            yield Document(doc['TYPE'], document_path)
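
The new Submission.__iter__ duplicates the tail of document_type but yields every document that has a FILENAME, which is what lets Portfolio.contains_string walk whole submissions when no document type is given. A short sketch, again with a hypothetical pre-populated folder:

    from datamule.portfolio import Portfolio

    portfolio = Portfolio('filings')    # hypothetical, already-populated folder
    for submission in portfolio:        # Portfolio.__iter__
        for document in submission:     # new Submission.__iter__; entries without FILENAME are skipped
            print(document.type, document.path)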
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 0.427
+Version: 0.428
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -1,5 +1,6 @@
 setup.py
 datamule/__init__.py
+datamule/config.py
 datamule/document.py
 datamule/helper.py
 datamule/monitor.py
@@ -55,7 +55,7 @@ extras["all"] = list(all_dependencies)
 setup(
     name="datamule",
     author="John Friedman",
-    version="0.427",
+    version="0.428",
     description="Making it easier to use SEC filings.",
     packages=find_namespace_packages(include=['datamule*']),
     url="https://github.com/john-friedman/datamule-python",
@@ -1,31 +0,0 @@
-from pathlib import Path
-from tqdm import tqdm
-from concurrent.futures import ProcessPoolExecutor
-from .submission import Submission
-
-class Portfolio:
-    @classmethod
-    def create(cls, path):
-        # This method handles the process pool lifecycle
-        with ProcessPoolExecutor() as executor:
-            portfolio = cls(path, executor)
-        return portfolio
-
-    def __init__(self, path, executor=None):
-        self.path = Path(path)
-        folders = [f for f in self.path.iterdir() if f.is_dir()]
-        print(f"Loading {len(folders)} submissions")
-
-        if executor is None:
-            # Fall back to sequential loading if no executor
-            self.submissions = [Submission(f) for f in tqdm(folders, desc="Loading submissions")]
-        else:
-            # Use provided executor for parallel loading
-            self.submissions = list(tqdm(
-                executor.map(Submission, folders),
-                total=len(folders),
-                desc="Loading submissions"
-            ))
-
-    def __iter__(self):
-        return iter(self.submissions)
@@ -11,10 +11,10 @@ pytz
 zstandard
 
 [all]
+openai
 flask
 pandas
 psutil
-openai
 google-generativeai
 
 [dataset_builder]