PyPI - datamule - Versions diffs - 0.427__cp313-cp313-win_amd64.whl → 0.428__cp313-cp313-win_amd64.whl - Mend

datamule 0.427__cp313-cp313-win_amd64.whl → 0.428__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

datamule/__init__.py +3 -0
datamule/config.py +29 -0
datamule/document.py +16 -4
datamule/parser/sgml_parsing/sgml_parser_cy.cp313-win_amd64.pyd +0 -0
datamule/portfolio.py +61 -14
datamule/submission.py +10 -1
{datamule-0.427.dist-info → datamule-0.428.dist-info}/METADATA +3 -3
{datamule-0.427.dist-info → datamule-0.428.dist-info}/RECORD +10 -9
{datamule-0.427.dist-info → datamule-0.428.dist-info}/WHEEL +0 -0
{datamule-0.427.dist-info → datamule-0.428.dist-info}/top_level.txt +0 -0

datamule/__init__.py CHANGED Viewed

@@ -39,6 +39,9 @@ def __getattr__(name):
     elif name == 'load_package_dataset':
         from .helper import load_package_dataset
         return load_package_dataset
+    elif name == 'Config':
+        from .config import Config
+        return Config
     raise AttributeError(f"module 'datamule' has no attribute '{name}'")
 # Lazy load nest_asyncio only when needed

datamule/config.py ADDED Viewed

@@ -0,0 +1,29 @@
+import json
+import os
+class Config:
+    def __init__(self):
+        self.config_path = os.path.expanduser("~/.datamule/config.json")
+        self._ensure_config_exists()
+    def _ensure_config_exists(self):
+        os.makedirs(os.path.dirname(self.config_path), exist_ok=True)
+        if not os.path.exists(self.config_path):
+            self._save_config({"default_source": None})
+    def _save_config(self, config):
+        with open(self.config_path, 'w') as f:
+            json.dump(config, f)
+    def set_default_source(self, source):
+        config = self._load_config()
+        config["default_source"] = source
+        self._save_config(config)
+    def get_default_source(self):
+        config = self._load_config()
+        return config.get("default_source")
+    def _load_config(self):
+        with open(self.config_path) as f:
+            return json.load(f)

datamule/document.py CHANGED Viewed

@@ -1,7 +1,9 @@
 import json
 import csv
 from .parser.document_parsing.sec_parser import Parser
+from .parser.document_parsing.helper import load_file_content
 from .helper import convert_to_dashed_accession
+import re
 # we need to modify parse filing to take option in memory
@@ -10,12 +12,22 @@ parser = Parser()
 class Document:
     def __init__(self, type, filename):
         self.type = type
-        self.filename = filename
+        self.path = filename
         self.data = None
+        self.content = None
+    def contains_string(self, pattern):
+        """Currently only works for .htm, .html, and .txt files"""
+        if self.path.suffix in ['.htm', '.html', '.txt']:
+            if self.content is None:
+                self.content = load_file_content(self.path)
+            return bool(re.search(pattern, self.content))
+        return False
+    # Note: this method will be heavily modified in the future
     def parse(self):
-        self.data = parser.parse_filing(self.filename, self.type)
+        self.data = parser.parse_filing(self.path, self.type)
         return self.data
     def write_json(self, output_filename=None):
@@ -23,7 +35,7 @@ class Document:
             raise ValueError("No data to write. Parse filing first.")
         if output_filename is None:
-            output_filename = f"{self.filename.rsplit('.', 1)[0]}.json"
+            output_filename = f"{self.path.rsplit('.', 1)[0]}.json"
         with open(output_filename, 'w') as f:
             json.dump(self.data, f, indent=2)
@@ -33,7 +45,7 @@ class Document:
             raise ValueError("No data available. Please call parse_filing() first.")
         if output_filename is None:
-            output_filename = f"{self.filename.rsplit('.', 1)[0]}.csv"
+            output_filename = f"{self.path.rsplit('.', 1)[0]}.csv"
         with open(output_filename, 'w', newline='') as csvfile:
             if not self.data:

datamule/parser/sgml_parsing/sgml_parser_cy.cp313-win_amd64.pyd CHANGED Viewed

Binary file

datamule/portfolio.py CHANGED Viewed

@@ -2,9 +2,11 @@ from pathlib import Path
 from tqdm import tqdm
 from concurrent.futures import ProcessPoolExecutor
 from .submission import Submission
+from .downloader.premiumdownloader import PremiumDownloader
+from .downloader.downloader import Downloader
+from .config import Config
 class Portfolio:
-    @classmethod
     def create(cls, path):
         # This method handles the process pool lifecycle
         with ProcessPoolExecutor() as executor:
@@ -13,19 +15,64 @@ class Portfolio:
     def __init__(self, path, executor=None):
         self.path = Path(path)
-        folders = [f for f in self.path.iterdir() if f.is_dir()]
-        print(f"Loading {len(folders)} submissions")
-        if executor is None:
-            # Fall back to sequential loading if no executor
-            self.submissions = [Submission(f) for f in tqdm(folders, desc="Loading submissions")]
+        # check if path exists
+        if self.path.exists():
+            folders = [f for f in self.path.iterdir() if f.is_dir()]
+            print(f"Loading {len(folders)} submissions")
+            if executor is None:
+                # Fall back to sequential loading if no executor
+                self.submissions = [Submission(f) for f in tqdm(folders, desc="Loading submissions")]
+            else:
+                # Use provided executor for parallel loading
+                self.submissions = list(tqdm(
+                    executor.map(Submission, folders),
+                    total=len(folders),
+                    desc="Loading submissions"
+                ))
         else:
-            # Use provided executor for parallel loading
-            self.submissions = list(tqdm(
-                executor.map(Submission, folders),
-                total=len(folders),
-                desc="Loading submissions"
-            ))
+            pass
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None):
+        if provider is None:
+            config = Config()
+            provider = config.get_default_source()
+        if provider == 'sec':
+            downloader = Downloader()
+        elif provider == 'datamule':
+            downloader = PremiumDownloader()
+        downloader.download_submissions(output_dir=self.path, cik=cik, ticker=ticker, submission_type=submission_type, filing_date=filing_date
+                                        )
     def __iter__(self):
-        return iter(self.submissions)
+        return iter(self.submissions)
+    def document_type(self, document_types):
+        # Convert single document type to list for consistent handling
+        if isinstance(document_types, str):
+            document_types = [document_types]
+        for submission in self.submissions:
+            yield from submission.document_type(document_types)
+    def contains_string(self, pattern, document_types=None, executor=None):
+        def check_document(document):
+            return document if document.contains_string(pattern) else None
+        documents = list(self.document_type(document_types) if document_types else (
+            doc for sub in tqdm(self.submissions, desc="Collecting documents") for doc in sub
+        ))
+        if executor:
+            results = list(tqdm(
+                executor.map(check_document, documents),
+                total=len(documents),
+                desc=f"Searching for '{pattern}'"
+            ))
+            yield from (doc for doc in results if doc is not None)
+        else:
+            for document in tqdm(documents, desc=f"Searching for '{pattern}'"):
+                if document.contains_string(pattern):
+                    yield document

datamule/submission.py CHANGED Viewed

@@ -64,4 +64,13 @@ class Submission:
                     continue
                 document_path = self.path / filename
-                yield Document(doc['TYPE'], document_path)
+                yield Document(doc['TYPE'], document_path)
+    def __iter__(self):
+        for doc in self.metadata['documents']:
+            filename = doc.get('FILENAME')
+            if filename is None:
+                continue
+            document_path = self.path / filename
+            yield Document(doc['TYPE'], document_path)

{datamule-0.427.dist-info → datamule-0.428.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 0.427
+Version: 0.428
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -24,8 +24,8 @@ Requires-Dist: pandas; extra == "dataset-builder"
 Requires-Dist: google-generativeai; extra == "dataset-builder"
 Requires-Dist: psutil; extra == "dataset-builder"
 Provides-Extra: all
+Requires-Dist: flask; extra == "all"
 Requires-Dist: google-generativeai; extra == "all"
+Requires-Dist: pandas; extra == "all"
 Requires-Dist: psutil; extra == "all"
-Requires-Dist: flask; extra == "all"
 Requires-Dist: openai; extra == "all"
-Requires-Dist: pandas; extra == "all"

{datamule-0.427.dist-info → datamule-0.428.dist-info}/RECORD RENAMED Viewed

@@ -1,10 +1,11 @@
-datamule/__init__.py,sha256=Li3iau_u87wQQhoPliSTTpGaf3OMf5jIvqtHFJmCvnw,2338
-datamule/document.py,sha256=6xEaI-32AQiBxX3gZcX4Qr49bgvcvLviFAwUGpTwtr0,5273
+datamule/__init__.py,sha256=ghCMkcNrtQ2dYz9ulnlZ0JSe-aJF0YSVa3da2g0eIWk,2425
+datamule/config.py,sha256=lrXwhbWFAF3eTa6B4OQgvexSYvaFa-EkWofpLn6AKZM,911
+datamule/document.py,sha256=NoJS6Q9f7Z2bZzKkaoxooYHdDwqs-TCw_BkVH3krtIU,5774
 datamule/helper.py,sha256=8HOjB3Y7svw_zjEY-AY5JKOJ-LrBiuQMPyok3MH6CCg,4716
 datamule/monitor.py,sha256=WVds1HGV_ojYgWmo0b4Dsiv9mzZ85HHnCucH-7XoUw8,9350
 datamule/packageupdater.py,sha256=X73XlXs9bYSPiwtceaSOEq6wSCqKoG8lyhNyuha6aG0,9801
-datamule/portfolio.py,sha256=OOlu_05S88uwg5Me_EpY3C7BQ93Yq9eE0_tMY_Gqyrw,1117
-datamule/submission.py,sha256=sB6tidsAdaqP5VIQEFPq6PjLTgmD-crgdNviaOpiqlU,2558
+datamule/portfolio.py,sha256=gKnudwDS-_Qtm0j9YtfGg2mhk_-TVg2h7Ax-faIYqsY,3193
+datamule/submission.py,sha256=cfX7fvHQBObff0N1jzikCGTuAUE1bGIyqenLRxch9eg,2865
 datamule/data/company_former_names.csv,sha256=zTBWdV12_JE3aROFOMrFNTHLPW_M4TDruxtl15-XfA0,714528
 datamule/data/company_metadata.csv,sha256=X7uSIwConqC0sz-moIhXIISg6FI7GLGSlvAfDDf8Sd0,3078648
 datamule/data/company_tickers.csv,sha256=ihU6aNFriN0lADloCO85Op04deFk3qVcLZ0EJhi5QVo,410362
@@ -46,8 +47,8 @@ datamule/parser/document_parsing/n_port_p_parser.py,sha256=T6GliMm-TETPsFM-hDKt1
 datamule/parser/document_parsing/sec_parser.py,sha256=YewOdOsi0P25teQuxS5DNEND9ZCyxE2ewK1DoP9mPto,2788
 datamule/parser/document_parsing/sgml_parser.py,sha256=ASpe1SzgPj4qk0VOmmuMiEQeatjcwZzsuO3MvsYCHhc,3410
 datamule/parser/sgml_parsing/sgml_parser_cy.c,sha256=66QwBAmhxkKdhCgCjOkg29umbIgQoK4T5_mmMy3NkkM,841089
-datamule/parser/sgml_parsing/sgml_parser_cy.cp313-win_amd64.pyd,sha256=S_2QAcI8mDzM4hijIuz-O17QwDRXY6vFJdq6mO_ymx0,132096
-datamule-0.427.dist-info/METADATA,sha256=T40XFqzuBmDa2cjcwXumRdPV-H_T95ODY1h6QSJAE_E,1037
-datamule-0.427.dist-info/WHEEL,sha256=4-iQBlRoDdX1wfPofc7KLWa5Cys4eZSgXs6GVU8fKlQ,101
-datamule-0.427.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
-datamule-0.427.dist-info/RECORD,,
+datamule/parser/sgml_parsing/sgml_parser_cy.cp313-win_amd64.pyd,sha256=_vSFnvAIOWLmrNXyHnQ9q65Flm0uv52J0xw2hGkhsOc,132096
+datamule-0.428.dist-info/METADATA,sha256=uTujTeBT6sheV6gRv0X6WJ-x1xdoM0zAJj2tVvOVNDU,1037
+datamule-0.428.dist-info/WHEEL,sha256=4-iQBlRoDdX1wfPofc7KLWa5Cys4eZSgXs6GVU8fKlQ,101
+datamule-0.428.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-0.428.dist-info/RECORD,,

{datamule-0.427.dist-info → datamule-0.428.dist-info}/WHEEL RENAMED Viewed

File without changes

{datamule-0.427.dist-info → datamule-0.428.dist-info}/top_level.txt RENAMED Viewed

File without changes