datamule 1.4.9__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-1.4.9 → datamule-1.5.0}/PKG-INFO +1 -1
- {datamule-1.4.9 → datamule-1.5.0}/datamule/portfolio.py +1 -1
- datamule-1.5.0/datamule/sec/submissions/downloader.py +32 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/seclibrary/downloader.py +8 -15
- datamule-1.5.0/datamule/submission.py +215 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-1.4.9 → datamule-1.5.0}/setup.py +1 -1
- datamule-1.4.9/datamule/sec/submissions/downloader.py +0 -64
- datamule-1.4.9/datamule/submission.py +0 -197
- {datamule-1.4.9 → datamule-1.5.0}/datamule/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/config.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/document.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/atsn.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/cfportal.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/d.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/ex102_abs.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/ex99a_sdr.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/ex99c_sdr.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/ex99g_sdr.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/ex99i_sdr.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/information_table.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/nmfp.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/npx.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/onefourtyfour.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/ownership.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/proxy_voting_record.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/sbs.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/sbsef.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/schedule13.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/sdr.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/submission_metadata.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/ta.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/thirteenfhr.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/twentyfivense.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/mappings/twentyfourf2nt.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/processing.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/document/table.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/helper.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/index.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/package_updater.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/utils.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/seclibrary/__init__.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/seclibrary/bq.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/seclibrary/query.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule/sheet.py +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule.egg-info/SOURCES.txt +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule.egg-info/requires.txt +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/datamule.egg-info/top_level.txt +0 -0
- {datamule-1.4.9 → datamule-1.5.0}/setup.cfg +0 -0
{datamule-1.4.9 → datamule-1.5.0}/datamule/portfolio.py

@@ -125,7 +125,7 @@ class Portfolio:
             # First query, just set the accession numbers
             self.accession_numbers = new_accession_numbers
 
-    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=[],requests_per_second=5, **kwargs):
         if provider is None:
             config = Config()
             provider = config.get_default_source()
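For context, a hedged usage sketch of the updated method follows; the Portfolio constructor argument and the date-range form of filing_date are assumptions, not shown in this diff:

# Hypothetical usage of Portfolio.download_submissions as of 1.5.0.
# The constructor argument and the filing_date format below are assumptions.
from datamule import Portfolio

portfolio = Portfolio("apple_filings")            # assumed: an output directory
portfolio.download_submissions(
    ticker="AAPL",                                # or cik=...
    submission_type="10-K",
    filing_date=("2023-01-01", "2024-01-01"),     # assumed date-range form
    document_type=["10-K"],                       # default is now [] (keep all documents)
    requests_per_second=5,                        # new rate-limit parameter in 1.5.0
)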
datamule-1.5.0/datamule/sec/submissions/downloader.py (new file)

@@ -0,0 +1,32 @@
+import os
+from .streamer import stream
+from secsgml import write_sgml_file_to_tar
+from tqdm import tqdm
+
+def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
+             requests_per_second=5, output_dir="filings", accession_numbers=None,
+             quiet=False, keep_document_types=[]):
+    # Make sure output directory exists
+    os.makedirs(output_dir, exist_ok=True)
+
+    pbar = tqdm(desc="Writing", unit=" submissions", disable=quiet,position=2)
+
+    # Create a wrapper for the download_callback that includes the output_dir
+    async def callback_wrapper(hit, content, cik, accno, url):
+        output_path = os.path.join(output_dir, accno.replace('-','') + '.tar')
+        write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=keep_document_types)
+        pbar.update(1)
+
+
+    # Call the stream function with our callback
+    return stream(
+        cik=cik,
+        name=name,
+        submission_type=submission_type,
+        filing_date=filing_date,
+        location=location,
+        requests_per_second=requests_per_second,
+        document_callback=callback_wrapper,
+        accession_numbers=accession_numbers,
+        quiet=quiet
+    )
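A minimal sketch of how this new function might be called; the import path simply mirrors the file location above, and the date-range form of filing_date is an assumption:

# Hypothetical call to the rewritten SEC downloader (datamule 1.5.0).
from datamule.sec.submissions.downloader import download  # path mirrors the new file above

download(
    cik="320193",
    submission_type="10-K",
    filing_date=("2023-01-01", "2024-01-01"),   # assumed date-range form
    output_dir="filings",
    requests_per_second=5,
    keep_document_types=["10-K"],               # forwarded to write_sgml_file_to_tar as filter_document_types
)
# Each hit is now written as a single <accession-number-without-dashes>.tar
# in output_dir instead of an extracted folder per submission.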
{datamule-1.4.9 → datamule-1.5.0}/datamule/seclibrary/downloader.py

@@ -15,6 +15,7 @@ from threading import Thread
 from .query import query
 from os import cpu_count
 from ..submission import Submission
+from secsgml import write_sgml_file_to_tar
 
 
 
@@ -73,7 +74,7 @@ class Downloader:
             print(f"Failed to log error to {error_file}: {str(e)}")
 
 class FileProcessor:
-    def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=
+    def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=[]):
         self.processing_queue = Queue(maxsize=queue_size)
         self.should_stop = False
         self.processing_workers = []
@@ -93,17 +94,9 @@ class Downloader:
 
     def _process_file(self, item):
         filename, content = item
-
-
-
-        self.downloader._run_coroutine(submission.save_async(output_dir=self.output_dir))
-        self.pbar.update(1)
-        except Exception as e:
-            print(f"Exception {e} in {filename}")
-            accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
-            if os.path.exists(accession_dir):
-                shutil.rmtree(accession_dir)
-            self.downloader._log_error(self.output_dir, filename, str(e))
+        output_path = os.path.join(self.output_dir, filename.split('.')[0] + '.tar')
+        write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types)
+        self.pbar.update(1)
 
     def _processing_worker(self):
         batch = []
@@ -211,7 +204,7 @@ class Downloader:
             except Exception as e:
                 self._log_error(output_dir, filename, str(e))
 
-    async def process_batch(self, urls, output_dir, keep_document_types=
+    async def process_batch(self, urls, output_dir, keep_document_types=[]):
         os.makedirs(output_dir, exist_ok=True)
 
         with tqdm(total=len(urls), desc="Processing files") as pbar:
@@ -238,7 +231,7 @@ class Downloader:
         processor.stop_workers()
         decompression_pool.shutdown()
 
-    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=
+    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[]):
         """
         Query SEC filings and download/process them.
 
@@ -299,7 +292,7 @@ class Downloader:
        self.loop.call_soon_threadsafe(self.loop.stop)
 
 
-def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=
+def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[]):
    """
    Query SEC filings and download/process them.
 
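The net effect of these hunks is that downloaded submissions are written straight to .tar archives via secsgml.write_sgml_file_to_tar rather than saved as per-accession folders. A hedged sketch of the module-level entry point, using the signature shown above with an assumed import path and a placeholder API key:

# Hypothetical call to the seclibrary downloader (signature taken from the diff above).
from datamule.seclibrary.downloader import download  # assumed import path

download(
    submission_type="8-K",
    cik="320193",
    filing_date=("2024-01-01", "2024-06-30"),   # assumed date-range form
    api_key="YOUR_API_KEY",                     # placeholder
    output_dir="downloads",
    keep_document_types=[],                     # new default: keep every document type
)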
datamule-1.5.0/datamule/submission.py (new file)

@@ -0,0 +1,215 @@
+from pathlib import Path
+import json
+from .document.document import Document
+from secsgml import parse_sgml_content_into_memory
+import tarfile
+import shutil
+import zstandard as zstd
+from io import BytesIO
+import gzip
+
+class Submission:
+    def __init__(self, path=None,sgml_content=None,keep_document_types=None):
+        if path is None and sgml_content is None:
+            raise ValueError("Either path or sgml_content must be provided")
+        if path is not None and sgml_content is not None:
+            raise ValueError("Only one of path or sgml_content must be provided")
+
+        if sgml_content is not None:
+            self.path = None
+            metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
+            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
+            # code dupe
+            self.accession = self.metadata.content['accession-number']
+            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
+
+            self.documents = []
+            filtered_metadata_documents = []
+
+            for idx,doc in enumerate(self.metadata.content['documents']):
+                type = doc.get('type')()
+
+                # Keep only specified types
+                if keep_document_types is not None and type not in keep_document_types:
+                    continue
+
+                # write as txt if not declared
+                filename = doc.get('filename','.txt')
+                extension = Path(filename).suffix
+                self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
+
+                filtered_metadata_documents.append(doc)
+
+            self.metadata.content['documents'] = filtered_metadata_documents
+
+        if path is not None:
+            self.path = Path(path)
+            if self.path.suffix == '.tar':
+                with tarfile.open(self.path,'r') as tar:
+                    metadata_obj = tar.extractfile('metadata.json')
+                    metadata = json.loads(metadata_obj.read().decode('utf-8'))
+
+                # tarpath
+                metadata_path = f"{self.path}::metadata.json"
+            else:
+                metadata_path = self.path / 'metadata.json'
+                with metadata_path.open('r') as f:
+                    metadata = json.load(f)
+            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
+            self.accession = self.metadata.content['accession-number']
+            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
+
+
+
+    def compress(self, compression=None, level=None, threshold=1048576):
+        if self.path is None:
+            raise ValueError("Compress requires path")
+
+        if compression is not None and compression not in ['gzip', 'zstd']:
+            raise ValueError("compression must be 'gzip' or 'zstd'")
+
+        # Create tar file (replace directory with .tar file)
+        tar_path = self.path.with_suffix('.tar')
+
+        with tarfile.open(tar_path, 'w') as tar:
+            # Add metadata.json first
+            metadata_path = self.path / 'metadata.json'
+            if metadata_path.exists():
+                tar.add(metadata_path, arcname='metadata.json')
+
+            # Add documents in order
+            for doc in self.metadata.content['documents']:
+                filename = doc.get('filename')
+                if filename is None:
+                    filename = doc['sequence'] + '.txt'
+
+                file_path = self.path / filename
+                if file_path.exists():
+                    file_size = file_path.stat().st_size
+
+
+                    # Compress if compression specified and over threshold
+                    if compression is not None and file_size >= threshold:
+                        content = file_path.read_bytes()
+
+                        if compression == 'gzip':
+                            compressed_content = gzip.compress(content, compresslevel=level or 6)
+                            compressed_filename = filename + '.gz'
+                        else: # zstd
+                            cctx = zstd.ZstdCompressor(level=level or 3)
+                            compressed_content = cctx.compress(content)
+                            compressed_filename = filename + '.zst'
+
+                        # Add compressed file to tar
+                        tarinfo = tarfile.TarInfo(name=compressed_filename)
+                        tarinfo.size = len(compressed_content)
+                        tar.addfile(tarinfo, BytesIO(compressed_content))
+                    else:
+                        # Add uncompressed file
+                        tar.add(file_path, arcname=filename)
+
+        # Delete original folder
+        shutil.rmtree(self.path)
+
+        # Update path to point to new tar file
+        self.path = tar_path
+
+    def decompress(self):
+        if self.path is None:
+            raise ValueError("Decompress requires path")
+        elif self.path.suffix != '.tar':
+            raise ValueError("Can only decompress tar")
+
+        # Create output directory (path without .tar extension)
+        output_dir = self.path.with_suffix('')
+        output_dir.mkdir(exist_ok=True)
+
+        with tarfile.open(self.path, 'r') as tar:
+            for member in tar.getmembers():
+                if member.isfile():
+                    content = tar.extractfile(member).read()
+
+                    # Decompress if gzipped
+                    if member.name.endswith('.gz'):
+                        content = gzip.decompress(content)
+                        output_path = output_dir / member.name[:-3]  # Remove .gz extension
+                    else:
+                        output_path = output_dir / member.name
+
+                    # Write to output directory
+                    output_path.parent.mkdir(parents=True, exist_ok=True)
+                    with output_path.open('wb') as f:
+                        f.write(content)
+
+        # delete original file
+        self.path.unlink()
+        self.path = output_dir
+
+    def _load_document_by_index(self, idx):
+        """Load a document by its index in the metadata documents list."""
+        doc = self.metadata.content['documents'][idx]
+
+        # If loaded from sgml_content, return pre-loaded document
+        if self.path is None:
+            return self.documents[idx]
+
+        # If loaded from path, load document on-demand
+        filename = doc.get('filename')
+        if filename is None:
+            filename = doc['sequence'] + '.txt'
+
+        document_path = self.path / filename
+        extension = document_path.suffix
+
+        if self.path.suffix == '.tar':
+            with tarfile.open(self.path, 'r') as tar:
+                # bandaid fix TODO
+                try:
+                    content = tar.extractfile(filename).read()
+                except:
+                    try:
+                        content = tar.extractfile(filename+'.gz').read()
+                    except:
+                        try:
+                            content = tar.extractfile(filename+'.zst').read()
+                        except:
+                            raise ValueError("Something went wrong with tar")
+                # Decompress if compressed
+                if filename.endswith('.gz'):
+                    content = gzip.decompress(content)
+                elif filename.endswith('.zst'):
+                    dctx = zstd.ZstdDecompressor()
+                    content = dctx.decompress(content)
+        else:
+            with document_path.open('rb') as f:
+                content = f.read()
+
+        # Decode text files
+        if extension in ['.htm', '.html', '.txt', '.xml']:
+            content = content.decode('utf-8', errors='replace')
+
+        return Document(
+            type=doc['type'],
+            content=content,
+            extension=extension,
+            filing_date=self.filing_date,
+            accession=self.accession,
+            path=document_path
+        )
+
+    def __iter__(self):
+        """Make Submission iterable by yielding all documents."""
+        for idx in range(len(self.metadata.content['documents'])):
+            yield self._load_document_by_index(idx)
+
+    def document_type(self, document_type):
+        """Yield documents matching the specified type(s)."""
+        # Convert single document type to list for consistent handling
+        if isinstance(document_type, str):
+            document_types = [document_type]
+        else:
+            document_types = [item for item in document_type]
+
+        for idx, doc in enumerate(self.metadata.content['documents']):
+            if doc['type'] in document_types:
+                yield self._load_document_by_index(idx)
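A short, hedged sketch of the new Submission API based on the code above; the example paths are hypothetical and the Document attribute names (type, extension, accession) are assumed from the constructor calls shown in the diff:

# Hypothetical usage of datamule 1.5.0's Submission class.
from datamule.submission import Submission

# Load from a tar archive (or a directory, or raw SGML bytes via sgml_content=...).
sub = Submission(path="filings/000032019324000123.tar")   # hypothetical path

# Iterate every document, or only selected types.
for doc in sub:
    print(doc.type, doc.extension)            # attribute names assumed from Document(...)

for doc in sub.document_type(["10-K", "EX-10.1"]):
    print(doc.accession)

# Directory-backed submissions can be packed into a tar, compressing members
# at or above the 1 MiB default threshold, and unpacked again later.
sub_dir = Submission(path="filings/000032019324000123")   # hypothetical directory
sub_dir.compress(compression="zstd", level=3)             # folder is replaced by a .tar
sub_dir.decompress()                                       # .tar is expanded back into a folder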
{datamule-1.4.9 → datamule-1.5.0}/setup.py

@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="1.4.9",
+    version="1.5.0",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",
datamule-1.4.9/datamule/sec/submissions/downloader.py (deleted)

@@ -1,64 +0,0 @@
-import os
-import json
-from .streamer import stream
-import aiofiles
-from ...submission import Submission
-
-async def download_callback(hit, content, cik, accno, url, output_dir="filings", keep_document_types=None):
-    """Save downloaded SEC submission to disk."""
-    try:
-        # Create a Submission object directly from the content
-        # Note: the content needs to be decoded from bytes to string for the parser
-        submission = Submission(sgml_content=content,
-                                keep_document_types=keep_document_types)
-
-        # Use the async save method to write the submission to disk
-        file_dir = await submission.save_async(output_dir=output_dir)
-
-        return file_dir
-    except Exception as e:
-        print(f"Error processing {accno}: {e}")
-        return None
-
-def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
-             requests_per_second=5, output_dir="filings", accession_numbers=None,
-             quiet=False, keep_document_types=None):
-    """
-    Download SEC EDGAR filings and extract their documents.
-
-    Parameters:
-    - cik: CIK number(s) to query for
-    - submission_type: Filing type(s) to query for (default: 10-K)
-    - filing_date: Date or date range to query for
-    - location: Location code to filter by (e.g., 'CA' for California)
-    - name: Company name to search for (alternative to providing CIK)
-    - requests_per_second: Rate limit for SEC requests
-    - output_dir: Directory to save documents
-    - accession_numbers: Optional list of accession numbers to filter by
-    - quiet: Whether to suppress progress output
-    - keep_document_types: Optional list of document types to keep (e.g. ['10-K', 'EX-10.1'])
-
-    Returns:
-    - List of all document paths processed
-    """
-    # Make sure output directory exists
-    os.makedirs(output_dir, exist_ok=True)
-
-    # Create a wrapper for the download_callback that includes the output_dir
-    async def callback_wrapper(hit, content, cik, accno, url):
-        return await download_callback(hit, content, cik, accno, url,
-                                       output_dir=output_dir,
-                                       keep_document_types=keep_document_types)
-
-    # Call the stream function with our callback
-    return stream(
-        cik=cik,
-        name=name,
-        submission_type=submission_type,
-        filing_date=filing_date,
-        location=location,
-        requests_per_second=requests_per_second,
-        document_callback=callback_wrapper,
-        accession_numbers=accession_numbers,
-        quiet=quiet
-    )
datamule-1.4.9/datamule/submission.py (deleted)

@@ -1,197 +0,0 @@
-from pathlib import Path
-import json
-from .document.document import Document
-from secsgml import parse_sgml_content_into_memory
-import os
-import aiofiles
-
-# TODO add .tar path
-class Submission:
-    def __init__(self, path=None,sgml_content=None,keep_document_types=None):
-        if path is None and sgml_content is None:
-            raise ValueError("Either path or sgml_content must be provided")
-        if path is not None and sgml_content is not None:
-            raise ValueError("Only one of path or sgml_content must be provided")
-
-        if sgml_content is not None:
-            self.path = None
-            metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
-            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
-            # code dupe
-            self.accession = self.metadata.content['accession-number']
-            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
-
-            self.documents = []
-            filtered_metadata_documents = []
-
-            for idx,doc in enumerate(self.metadata.content['documents']):
-                type = doc.get('type').upper()
-
-                # Keep only specified types
-                if keep_document_types is not None and type not in keep_document_types:
-                    continue
-
-                # write as txt if not declared
-                filename = doc.get('filename','.txt')
-                extension = Path(filename).suffix
-                self.documents.append(Document(type=type.upper(), content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
-
-                filtered_metadata_documents.append(doc)
-
-            self.metadata.content['documents'] = filtered_metadata_documents
-
-        if path is not None:
-            self.path = Path(path)
-            metadata_path = self.path / 'metadata.json'
-            with metadata_path.open('r') as f:
-                metadata = json.load(f)
-            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
-
-            # Code dupe
-            self.accession = self.metadata.content['accession-number']
-            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
-
-
-
-
-    def document_type(self, document_type):
-        # Convert single document type to list for consistent handling
-        if isinstance(document_type, str):
-            document_types = [document_type.lower()]
-        else:
-            document_types = [item.lower() for item in document_type]
-
-        for idx,doc in enumerate(self.metadata.content['documents']):
-            if doc['type'] in document_types:
-
-                # if loaded from path
-                if self.path is not None:
-                    filename = doc.get('filename')
-                    # oh we need handling here for sequences case
-                    if filename is None:
-                        filename = doc['sequence'] + '.txt'
-
-                    document_path = self.path / filename
-                    extension = document_path.suffix
-
-                    with document_path.open('rb') as f:
-                        content = f.read()
-
-                    if extension in ['.htm','.html','.txt','.xml']:
-                        content = content.decode('utf-8', errors='replace')
-
-                    yield Document(type=doc['type'].upper(), content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
-                # if loaded from sgml_content
-                else:
-                    yield self.documents[idx]
-
-
-    def __iter__(self):
-        for idx,doc in enumerate(self.metadata.content['documents']):
-            # if loaded from path
-            if self.path is not None:
-                filename = doc.get('filename')
-
-                # oh we need handling here for sequences case
-                if filename is None:
-                    filename = doc['sequence'] + '.txt'
-
-                document_path = self.path / filename
-                extension = document_path.suffix
-
-                # check if the file exists
-                if document_path.exists():
-                    with document_path.open('rb') as f:
-                        content = f.read()
-
-                    if extension in ['.htm','.html','.txt','.xml']:
-                        content = content.decode('utf-8', errors='replace')
-
-                    yield Document(type=doc['type'].upper(), content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
-                else:
-                    print(f"Warning: File {document_path} does not exist likely due to keep types in downloading.")
-
-            # if loaded from sgml_content
-            else:
-                yield self.documents[idx]
-
-
-
-
-    def save(self, output_dir="filings"):
-        file_dir = Path(output_dir) / str(self.accession)
-        file_dir.mkdir(parents=True, exist_ok=True)
-
-        metadata_path = file_dir / "metadata.json"
-        with open(metadata_path, 'w') as f:
-            json.dump(self.metadata.content, f, indent=4)
-
-        for idx, doc in enumerate(self.metadata.content['documents']):
-            filename = doc.get('filename')
-            if filename is None:
-                filename = f"{doc.get('sequence')}.txt"
-
-            doc_path = file_dir / filename
-
-            if self.path is not None:
-                if hasattr(self, 'documents') and self.documents:
-                    content = self.documents[idx].content
-                else:
-                    orig_doc_path = self.path / filename
-                    if orig_doc_path.exists():
-                        with open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
-                            content = f.read()
-                    else:
-                        print(f"Warning: File {orig_doc_path} does not exist, skipping.")
-                        continue
-            else:
-                content = self.documents[idx].content
-
-            if isinstance(content, bytes):
-                with open(doc_path, 'wb') as f:
-                    f.write(content)
-            else:
-                with open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
-                    f.write(content)
-
-        return file_dir
-
-    async def save_async(self, output_dir="filings"):
-        file_dir = Path(output_dir) / str(self.accession)
-        os.makedirs(file_dir, exist_ok=True)
-
-        metadata_path = file_dir / "metadata.json"
-        async with aiofiles.open(metadata_path, 'w') as f:
-            await f.write(json.dumps(self.metadata.content, indent=4))
-
-        for idx, doc in enumerate(self.metadata.content['documents']):
-            filename = doc.get('filename')
-            # oh we need handling here for sequences case
-            if filename is None:
-                filename = doc['sequence'] + '.txt'
-
-
-            doc_path = file_dir / filename
-
-            if self.path is not None:
-                if hasattr(self, 'documents') and self.documents:
-                    content = self.documents[idx].content
-                else:
-                    orig_doc_path = self.path / filename
-                    if orig_doc_path.exists():
-                        async with aiofiles.open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
-                            content = await f.read()
-                    else:
-                        print(f"Warning: File {orig_doc_path} does not exist, skipping.")
-                        continue
-            else:
-                content = self.documents[idx].content
-
-            if isinstance(content, bytes):
-                async with aiofiles.open(doc_path, 'wb') as f:
-                    await f.write(content)
-            else:
-                async with aiofiles.open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
-                    await f.write(content)
-
-        return file_dir