datamule-1.5.3.tar.gz → datamule-1.5.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {datamule-1.5.3 → datamule-1.5.5}/PKG-INFO +1 -1
  2. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/submissions/monitor.py +45 -25
  3. {datamule-1.5.3 → datamule-1.5.5}/datamule/seclibrary/downloader.py +82 -0
  4. {datamule-1.5.3 → datamule-1.5.5}/datamule/submission.py +8 -0
  5. {datamule-1.5.3 → datamule-1.5.5}/datamule.egg-info/PKG-INFO +1 -1
  6. {datamule-1.5.3 → datamule-1.5.5}/setup.py +1 -1
  7. {datamule-1.5.3 → datamule-1.5.5}/datamule/__init__.py +0 -0
  8. {datamule-1.5.3 → datamule-1.5.5}/datamule/config.py +0 -0
  9. {datamule-1.5.3 → datamule-1.5.5}/datamule/data/listed_filer_metadata.csv +0 -0
  10. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/__init__.py +0 -0
  11. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/document.py +0 -0
  12. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/__init__.py +0 -0
  13. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/atsn.py +0 -0
  14. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/cfportal.py +0 -0
  15. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/d.py +0 -0
  16. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/ex102_abs.py +0 -0
  17. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/ex99a_sdr.py +0 -0
  18. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/ex99c_sdr.py +0 -0
  19. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/ex99g_sdr.py +0 -0
  20. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/ex99i_sdr.py +0 -0
  21. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/information_table.py +0 -0
  22. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/nmfp.py +0 -0
  23. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/npx.py +0 -0
  24. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/onefourtyfour.py +0 -0
  25. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/ownership.py +0 -0
  26. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/proxy_voting_record.py +0 -0
  27. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/sbs.py +0 -0
  28. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/sbsef.py +0 -0
  29. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/schedule13.py +0 -0
  30. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/sdr.py +0 -0
  31. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/submission_metadata.py +0 -0
  32. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/ta.py +0 -0
  33. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/thirteenfhr.py +0 -0
  34. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/twentyfivense.py +0 -0
  35. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/mappings/twentyfourf2nt.py +0 -0
  36. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/processing.py +0 -0
  37. {datamule-1.5.3 → datamule-1.5.5}/datamule/document/table.py +0 -0
  38. {datamule-1.5.3 → datamule-1.5.5}/datamule/helper.py +0 -0
  39. {datamule-1.5.3 → datamule-1.5.5}/datamule/index.py +0 -0
  40. {datamule-1.5.3 → datamule-1.5.5}/datamule/mapping_dicts/__init__.py +0 -0
  41. {datamule-1.5.3 → datamule-1.5.5}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  42. {datamule-1.5.3 → datamule-1.5.5}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  43. {datamule-1.5.3 → datamule-1.5.5}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  44. {datamule-1.5.3 → datamule-1.5.5}/datamule/package_updater.py +0 -0
  45. {datamule-1.5.3 → datamule-1.5.5}/datamule/portfolio.py +0 -0
  46. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/__init__.py +0 -0
  47. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/infrastructure/__init__.py +0 -0
  48. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  49. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/submissions/__init__.py +0 -0
  50. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/submissions/downloader.py +0 -0
  51. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/submissions/eftsquery.py +0 -0
  52. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/submissions/streamer.py +0 -0
  53. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/submissions/textsearch.py +0 -0
  54. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/utils.py +0 -0
  55. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/xbrl/__init__.py +0 -0
  56. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  57. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  58. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  59. {datamule-1.5.3 → datamule-1.5.5}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  60. {datamule-1.5.3 → datamule-1.5.5}/datamule/seclibrary/__init__.py +0 -0
  61. {datamule-1.5.3 → datamule-1.5.5}/datamule/seclibrary/bq.py +0 -0
  62. {datamule-1.5.3 → datamule-1.5.5}/datamule/seclibrary/query.py +0 -0
  63. {datamule-1.5.3 → datamule-1.5.5}/datamule/sheet.py +0 -0
  64. {datamule-1.5.3 → datamule-1.5.5}/datamule.egg-info/SOURCES.txt +0 -0
  65. {datamule-1.5.3 → datamule-1.5.5}/datamule.egg-info/dependency_links.txt +0 -0
  66. {datamule-1.5.3 → datamule-1.5.5}/datamule.egg-info/requires.txt +0 -0
  67. {datamule-1.5.3 → datamule-1.5.5}/datamule.egg-info/top_level.txt +0 -0
  68. {datamule-1.5.3 → datamule-1.5.5}/setup.cfg +0 -0
--- datamule-1.5.3/PKG-INFO
+++ datamule-1.5.5/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.5.3
+Version: 1.5.5
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
--- datamule-1.5.3/datamule/sec/submissions/monitor.py
+++ datamule-1.5.5/datamule/sec/submissions/monitor.py
@@ -7,7 +7,7 @@ import asyncio
 from ..utils import headers, PreciseRateLimiter
 from .eftsquery import EFTSQuery
 import aiohttp
-
+from zoneinfo import ZoneInfo
 
 async def poll_rss(limiter):
     base_url = 'https://www.sec.gov/cgi-bin/browse-edgar?count=100&action=getcurrent&output=rss'
@@ -77,15 +77,22 @@ class Monitor():
         )
 
     async def _async_monitor_submissions(self, data_callback=None, interval_callback=None,
-                                polling_interval=1000, quiet=True, start_date=None,
-                                validation_interval=60000):
+                                         polling_interval=1000, quiet=True, start_date=None,
+                                         validation_interval=60000):
         """
         Async implementation of monitor_submissions.
+        Either polling_interval or validation_interval (or both) must be specified.
+        If polling_interval is None, only EFTS validation will be performed.
+        If validation_interval is None, only RSS polling will be performed.
         """
 
+        # Validate that at least one interval is specified
+        if polling_interval is None and validation_interval is None:
+            raise ValueError("At least one of polling_interval or validation_interval must be specified")
+
         # Backfill if start_date is provided
         if start_date is not None:
-            today_date = datetime.now().date().strftime('%Y-%m-%d')
+            today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
             if not quiet:
                 print(f"Backfilling from {start_date} to {today_date}")
 
@@ -100,26 +107,35 @@ class Monitor():
             if new_hits and data_callback:
                 data_callback(new_hits)
 
-        last_polling_time = time.time()
-        last_validation_time = last_polling_time
-        current_time = last_polling_time
-
+        # Initialize timing variables
+        current_time = time.time()
+        last_polling_time = current_time
+        last_validation_time = current_time
+
+        # Determine which operations to perform
+        do_polling = polling_interval is not None
+        do_validation = validation_interval is not None
+
         while True:
-            # RSS polling
-            if not quiet:
-                print(f"Polling RSS feed")
-            results = await poll_rss(self.ratelimiters['sec.gov'])
-            new_results = self._filter_new_accessions(results)
-            if new_results:
+            current_time = time.time()
+
+            # RSS polling (if enabled)
+            if do_polling and (current_time - last_polling_time) >= polling_interval/1000:
                 if not quiet:
-                    print(f"Found {len(new_results)} new submissions via RSS")
-                if data_callback:
-                    data_callback(new_results)
+                    print(f"Polling RSS feed")
+                results = await poll_rss(self.ratelimiters['sec.gov'])
+                new_results = self._filter_new_accessions(results)
+                if new_results:
+                    if not quiet:
+                        print(f"Found {len(new_results)} new submissions via RSS")
+                    if data_callback:
+                        data_callback(new_results)
+                last_polling_time = current_time
 
-            # EFTS validation
-            if validation_interval and (current_time - last_validation_time) >= validation_interval/1000:
+            # EFTS validation (if enabled)
+            if do_validation and (current_time - last_validation_time) >= validation_interval/1000:
                 # Get submissions from the last 24 hours for validation
-                today_date = datetime.now().strftime('%Y-%m-%d')
+                today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
                 if not quiet:
                     print(f"Validating submissions from {today_date}")
 
@@ -134,19 +150,23 @@ class Monitor():
                     print(f"Found {len(new_hits)} new submissions via EFTS validation")
                     if data_callback:
                         data_callback(new_hits)
-                last_polling_time = time.time()
                 last_validation_time = current_time
 
             # Interval callback
             if interval_callback:
                 interval_callback()
 
-            next_poll_time = last_polling_time + (polling_interval / 1000)
+            # Calculate next wake-up time
+            next_times = []
+            if do_polling:
+                next_times.append(last_polling_time + (polling_interval / 1000))
+            if do_validation:
+                next_times.append(last_validation_time + (validation_interval / 1000))
+
+            next_wake_time = min(next_times)
             current_time = time.time()
-            time_to_sleep = max(0, next_poll_time - current_time)
+            time_to_sleep = max(0, next_wake_time - current_time)
             await asyncio.sleep(time_to_sleep)
-            last_polling_time = next_poll_time
-
 
     def monitor_submissions(self, data_callback=None, interval_callback=None,
                             polling_interval=1000, quiet=True, start_date=None,
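Taken together, the monitor.py changes make RSS polling and EFTS validation independently optional (pass None to disable one; disabling both raises ValueError), compute dates in EDGAR's timezone via ZoneInfo("America/New_York") instead of host-local time, and sleep until whichever enabled task is due next rather than waking on a fixed poll cadence. A minimal usage sketch, assuming Monitor can be constructed without arguments (its constructor is not shown in this diff) and that the callback shape matches what data_callback receives:

    from datamule.sec.submissions.monitor import Monitor

    def on_new_submissions(hits):
        # Receives the list the monitor passes to data_callback.
        for hit in hits:
            print(hit)

    monitor = Monitor()
    # Poll RSS every second and cross-check via EFTS every minute.
    # polling_interval=None would run EFTS validation only;
    # validation_interval=None would run RSS polling only.
    monitor.monitor_submissions(
        data_callback=on_new_submissions,
        polling_interval=1000,       # milliseconds
        validation_interval=60000,   # milliseconds
        quiet=False,
    )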
--- datamule-1.5.3/datamule/seclibrary/downloader.py
+++ datamule-1.5.5/datamule/seclibrary/downloader.py
@@ -98,6 +98,7 @@ class Downloader:
                 filename, content = item
                 output_path = os.path.join(self.output_dir, filename.split('.')[0] + '.tar')
                 write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types, keep_filtered_metadata=self.keep_filtered_metadata,standardize_metadata=self.standardize_metadata)
+
                 self.pbar.update(1)
 
     def _processing_worker(self):
@@ -296,6 +297,64 @@ class Downloader:
             self.loop.call_soon_threadsafe(self.loop.stop)
 
 
+
+    def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
+        """
+        Download and process SEC filings using specific filenames.
+
+        Parameters:
+        - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
+        - output_dir: Directory to save downloaded files
+        - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+        - keep_filtered_metadata: Whether to keep metadata for filtered documents
+        - standardize_metadata: Whether to standardize metadata format
+        """
+        if self.api_key is None:
+            raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+        if not filenames:
+            raise ValueError("No filenames provided")
+
+        if not isinstance(filenames, (list, tuple)):
+            filenames = [filenames]
+
+        # Validate filenames format
+        for filename in filenames:
+            if not isinstance(filename, str):
+                raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
+            if not (filename.endswith('.sgml') or filename.endswith('.sgml.zst')):
+                raise ValueError(f"Invalid filename format: {filename}. Expected .sgml or .sgml.zst extension.")
+
+        # Generate URLs directly from filenames
+        print(f"Generating URLs for {len(filenames)} files...")
+        urls = []
+        for filename in filenames:
+            url = f"{self.BASE_URL}{filename}"
+            urls.append(url)
+
+        # Remove duplicates while preserving order
+        seen = set()
+        urls = [url for url in urls if not (url in seen or seen.add(url))]
+
+        print(f"Downloading {len(urls)} files...")
+
+        # Process the batch asynchronously using existing infrastructure
+        start_time = time.time()
+
+        asyncio.run(self.process_batch(
+            urls,
+            output_dir,
+            keep_document_types=keep_document_types,
+            keep_filtered_metadata=keep_filtered_metadata,
+            standardize_metadata=standardize_metadata
+        ))
+
+        # Calculate and display performance metrics
+        elapsed_time = time.time() - start_time
+        print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
+        print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
+
+
 def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
     """
     Query SEC filings and download/process them.
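A note on the de-duplication idiom in the new method: set.add() returns None, so the expression "url in seen or seen.add(url)" is falsy exactly once per unique URL, which yields an order-preserving unique list in a single pass. A standalone illustration:

    # Order-preserving de-duplication, as used in download_files_using_filename.
    urls = ['a.sgml', 'b.sgml', 'a.sgml']
    seen = set()
    unique = [u for u in urls if not (u in seen or seen.add(u))]
    print(unique)  # ['a.sgml', 'b.sgml']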
@@ -325,4 +384,27 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
         keep_document_types=keep_document_types,
         keep_filtered_metadata=keep_filtered_metadata,
         standardize_metadata=standardize_metadata
+    )
+
+def download_files_using_filename(filenames, api_key=None, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
+    """
+    Download and process SEC filings using specific filenames.
+
+    Parameters:
+    - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
+    - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
+    - output_dir: Directory to save downloaded files
+    - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+    - keep_filtered_metadata: Whether to keep metadata for filtered documents
+    - standardize_metadata: Whether to standardize metadata format
+    """
+    downloader = Downloader(api_key=api_key)
+    downloader.QUEUE_SIZE = 1
+    downloader.MAX_CONCURRENT_DOWNLOADS = 1
+    downloader.download_files_using_filename(
+        filenames=filenames,
+        output_dir=output_dir,
+        keep_document_types=keep_document_types,
+        keep_filtered_metadata=keep_filtered_metadata,
+        standardize_metadata=standardize_metadata
     )
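The module-level wrapper mirrors the existing download() helper but takes exact archive filenames instead of query parameters, and it pins QUEUE_SIZE and MAX_CONCURRENT_DOWNLOADS to 1, which appears to serialize fetching. A hedged sketch of calling it (the filenames are the examples from the docstring; assumes DATAMULE_API_KEY is set, otherwise pass api_key explicitly):

    from datamule.seclibrary.downloader import download_files_using_filename

    download_files_using_filename(
        ['000091205797006494.sgml', '000100704297000007.sgml.zst'],
        output_dir='downloads',
        keep_document_types=['10-K'],  # illustrative filter
    )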
--- datamule-1.5.3/datamule/submission.py
+++ datamule-1.5.5/datamule/submission.py
@@ -3,6 +3,7 @@ import json
 from .document.document import Document
 from secsgml import parse_sgml_content_into_memory
 from secsgml.utils import bytes_to_str
+from secsgml.parse_sgml import transform_metadata_string
 import tarfile
 import shutil
 import zstandard as zstd
@@ -86,6 +87,10 @@ class Submission:
         if sgml_content is not None:
             self.path = None
             metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
+
+            # standardize metadata
+            metadata = transform_metadata_string(metadata)
+
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
             # code dupe
             self.accession = self.metadata.content['accession-number']
@@ -123,6 +128,9 @@ class Submission:
             metadata_path = self.path / 'metadata.json'
             with metadata_path.open('r') as f:
                 metadata = json.load(f)
+
+            # standardize metadata
+            metadata = transform_metadata_string(metadata)
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
             self.accession = self.metadata.content['accession-number']
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
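Both Submission construction paths (parsing raw SGML in memory, and loading metadata.json from disk) now run the parsed metadata through secsgml's transform_metadata_string before wrapping it in a Document, so fields like 'accession-number' and 'filing-date' arrive in a consistent shape either way. A sketch of the in-memory path, assuming the sgml_content name visible in the branch above is a constructor keyword:

    from datamule.submission import Submission

    # Raw SGML bytes for a single filing, fetched previously.
    with open('000091205797006494.sgml', 'rb') as f:
        sgml_bytes = f.read()

    sub = Submission(sgml_content=sgml_bytes)
    # Metadata is standardized before these fields are read:
    print(sub.accession, sub.filing_date)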
--- datamule-1.5.3/datamule.egg-info/PKG-INFO
+++ datamule-1.5.5/datamule.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.5.3
+Version: 1.5.5
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
--- datamule-1.5.3/setup.py
+++ datamule-1.5.5/setup.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="1.5.3",
+    version="1.5.5",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",