datamule 1.6.2.tar.gz → 1.6.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. {datamule-1.6.2 → datamule-1.6.4}/PKG-INFO +1 -1
  2. {datamule-1.6.2 → datamule-1.6.4}/datamule/portfolio.py +49 -45
  3. datamule-1.6.4/datamule/portfolio_compression_utils.py +291 -0
  4. {datamule-1.6.2 → datamule-1.6.4}/datamule/submission.py +56 -164
  5. {datamule-1.6.2 → datamule-1.6.4}/datamule.egg-info/PKG-INFO +1 -1
  6. {datamule-1.6.2 → datamule-1.6.4}/datamule.egg-info/SOURCES.txt +1 -0
  7. {datamule-1.6.2 → datamule-1.6.4}/setup.py +1 -1
  8. {datamule-1.6.2 → datamule-1.6.4}/datamule/__init__.py +0 -0
  9. {datamule-1.6.2 → datamule-1.6.4}/datamule/config.py +0 -0
  10. {datamule-1.6.2 → datamule-1.6.4}/datamule/data/listed_filer_metadata.csv +0 -0
  11. {datamule-1.6.2 → datamule-1.6.4}/datamule/datamule/__init__.py +0 -0
  12. {datamule-1.6.2 → datamule-1.6.4}/datamule/datamule/sec_connector.py +0 -0
  13. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/__init__.py +0 -0
  14. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/document.py +0 -0
  15. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/__init__.py +0 -0
  16. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/atsn.py +0 -0
  17. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/cfportal.py +0 -0
  18. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/d.py +0 -0
  19. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/ex102_abs.py +0 -0
  20. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/ex99a_sdr.py +0 -0
  21. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/ex99c_sdr.py +0 -0
  22. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/ex99g_sdr.py +0 -0
  23. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/ex99i_sdr.py +0 -0
  24. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/information_table.py +0 -0
  25. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/nmfp.py +0 -0
  26. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/npx.py +0 -0
  27. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/onefourtyfour.py +0 -0
  28. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/ownership.py +0 -0
  29. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/proxy_voting_record.py +0 -0
  30. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/sbs.py +0 -0
  31. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/sbsef.py +0 -0
  32. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/schedule13.py +0 -0
  33. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/sdr.py +0 -0
  34. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/submission_metadata.py +0 -0
  35. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/ta.py +0 -0
  36. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/thirteenfhr.py +0 -0
  37. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/twentyfivense.py +0 -0
  38. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings/twentyfourf2nt.py +0 -0
  39. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings_new/__init__.py +0 -0
  40. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings_new/mappings.py +0 -0
  41. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/mappings_new/ownership.py +0 -0
  42. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/processing.py +0 -0
  43. {datamule-1.6.2 → datamule-1.6.4}/datamule/document/table.py +0 -0
  44. {datamule-1.6.2 → datamule-1.6.4}/datamule/helper.py +0 -0
  45. {datamule-1.6.2 → datamule-1.6.4}/datamule/index.py +0 -0
  46. {datamule-1.6.2 → datamule-1.6.4}/datamule/mapping_dicts/__init__.py +0 -0
  47. {datamule-1.6.2 → datamule-1.6.4}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  48. {datamule-1.6.2 → datamule-1.6.4}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  49. {datamule-1.6.2 → datamule-1.6.4}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  50. {datamule-1.6.2 → datamule-1.6.4}/datamule/package_updater.py +0 -0
  51. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/__init__.py +0 -0
  52. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/infrastructure/__init__.py +0 -0
  53. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  54. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/submissions/__init__.py +0 -0
  55. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/submissions/downloader.py +0 -0
  56. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/submissions/eftsquery.py +0 -0
  57. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/submissions/monitor.py +0 -0
  58. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/submissions/streamer.py +0 -0
  59. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/submissions/textsearch.py +0 -0
  60. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/utils.py +0 -0
  61. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/xbrl/__init__.py +0 -0
  62. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  63. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  64. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  65. {datamule-1.6.2 → datamule-1.6.4}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  66. {datamule-1.6.2 → datamule-1.6.4}/datamule/seclibrary/__init__.py +0 -0
  67. {datamule-1.6.2 → datamule-1.6.4}/datamule/seclibrary/bq.py +0 -0
  68. {datamule-1.6.2 → datamule-1.6.4}/datamule/seclibrary/downloader.py +0 -0
  69. {datamule-1.6.2 → datamule-1.6.4}/datamule/seclibrary/query.py +0 -0
  70. {datamule-1.6.2 → datamule-1.6.4}/datamule/sheet.py +0 -0
  71. {datamule-1.6.2 → datamule-1.6.4}/datamule/utils/__init__.py +0 -0
  72. {datamule-1.6.2 → datamule-1.6.4}/datamule/utils/construct_submissions_data.py +0 -0
  73. {datamule-1.6.2 → datamule-1.6.4}/datamule.egg-info/dependency_links.txt +0 -0
  74. {datamule-1.6.2 → datamule-1.6.4}/datamule.egg-info/requires.txt +0 -0
  75. {datamule-1.6.2 → datamule-1.6.4}/datamule.egg-info/top_level.txt +0 -0
  76. {datamule-1.6.2 → datamule-1.6.4}/setup.cfg +0 -0
{datamule-1.6.2 → datamule-1.6.4}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 1.6.2
+ Version: 1.6.4
  Summary: Work with SEC submissions at scale.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
{datamule-1.6.2 → datamule-1.6.4}/datamule/portfolio.py
@@ -12,9 +12,12 @@ from .helper import _process_cik_and_metadata_filters
  from .seclibrary.downloader import download as seclibrary_download
  from .sec.xbrl.filter_xbrl import filter_xbrl
  from .sec.submissions.monitor import Monitor
+ from .portfolio_compression_utils import CompressionManager
  #from .sec.xbrl.xbrlmonitor import XBRLMonitor
  from .datamule.sec_connector import SecConnector
-
+ from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
+ import json
+ import io
 
  class Portfolio:
      def __init__(self, path):
@@ -48,11 +51,7 @@ class Portfolio:
 
          # Load regular submissions (existing logic)
          def load_submission(folder):
-             try:
-                 return Submission(folder)
-             except Exception as e:
-                 print(f"Error loading submission from {folder}: {str(e)}")
-                 return None
+             return Submission(folder)
 
          regular_submissions = []
          if regular_items:
@@ -76,10 +75,7 @@ class Portfolio:
 
          # Collect results as they complete
          for future in as_completed(futures):
-             try:
-                 batch_submissions.extend(future.result())
-             except Exception as e:
-                 print(f"Error in batch processing: {str(e)}")
+             batch_submissions.extend(future.result())
 
          # Combine and filter None values
          self.submissions = [s for s in (regular_submissions + batch_submissions) if s is not None]
@@ -87,46 +83,54 @@ class Portfolio:
 
      def _load_batch_submissions_worker(self, batch_tar_path, pbar):
          """Worker function to load submissions from one batch tar with progress updates"""
-         try:
-             # Open tar handle and store it
-             tar_handle = tarfile.open(batch_tar_path, 'r')
-             self.batch_tar_handles[batch_tar_path] = tar_handle
-             self.batch_tar_locks[batch_tar_path] = Lock()
-
-             # Find all accession directories
-             accession_prefixes = set()
-             for member in tar_handle.getmembers():
-                 if '/' in member.name and member.name.endswith('metadata.json'):
-                     accession_prefix = member.name.split('/')[0]
-                     accession_prefixes.add(accession_prefix)
-
-             # Create submissions for each accession
-             submissions = []
-             for accession_prefix in accession_prefixes:
-                 try:
-                     submission = Submission(
-                         batch_tar_path=batch_tar_path,
-                         accession_prefix=accession_prefix,
-                         portfolio_ref=self
-                     )
-                     submissions.append(submission)
-                     pbar.update(1) # Update progress for each successful submission
-                 except Exception as e:
-                     print(f"Error loading batch submission {accession_prefix} from {batch_tar_path.name}: {str(e)}")
-
-             return submissions
+         # Open tar handle and store it
+         tar_handle = tarfile.open(batch_tar_path, 'r')
+         self.batch_tar_handles[batch_tar_path] = tar_handle
+         self.batch_tar_locks[batch_tar_path] = Lock()
+
+         # Find all accession directories
+         accession_prefixes = set()
+         for member in tar_handle.getmembers():
+             if '/' in member.name and member.name.endswith('metadata.json'):
+                 accession_prefix = member.name.split('/')[0]
+                 accession_prefixes.add(accession_prefix)
+
+         # Create submissions for each accession
+         submissions = []
+         for accession_prefix in accession_prefixes:
+             submission = Submission(
+                 batch_tar_path=batch_tar_path,
+                 accession_prefix=accession_prefix,
+                 portfolio_ref=self
+             )
+             submissions.append(submission)
+             pbar.update(1) # Update progress for each successful submission
+
+         return submissions
 
-         except Exception as e:
-             print(f"Error loading batch tar {batch_tar_path}: {str(e)}")
-             return []
+
+     def compress(self, compression=None, compression_level=None, threshold=1048576, max_batch_size=1024*1024*1024):
+         """
+         Compress all individual submissions into batch tar files.
+
+         Args:
+             compression: None, 'gzip', or 'zstd' for document compression (default: None)
+             compression_level: Compression level, if None uses defaults (gzip=6, zstd=3)
+             threshold: Size threshold for compressing individual documents (default: 1MB)
+             max_batch_size: Maximum size per batch tar file (default: 1GB)
+         """
+         CompressionManager().compress_portfolio(self, compression, compression_level, threshold, max_batch_size, self.MAX_WORKERS)
+
+     def decompress(self):
+         """
+         Decompress all batch tar files back to individual submission directories.
+         """
+         CompressionManager().decompress_portfolio(self, self.MAX_WORKERS)
 
      def _close_batch_handles(self):
          """Close all open batch tar handles to free resources"""
          for handle in self.batch_tar_handles.values():
-             try:
-                 handle.close()
-             except Exception as e:
-                 print(f"Error closing batch tar handle: {str(e)}")
+             handle.close()
          self.batch_tar_handles.clear()
         self.batch_tar_locks.clear()
 
datamule-1.6.4/datamule/portfolio_compression_utils.py (new file)
@@ -0,0 +1,291 @@
+ import json
+ import io
+ import gzip
+ import zstandard as zstd
+ import tarfile
+ import shutil
+ from tqdm import tqdm
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
+
+
+ class CompressionManager:
+
+     def compress_portfolio(self, portfolio, compression=None, compression_level=None, threshold=1048576, max_batch_size=1024*1024*1024, max_workers=None):
+         """
+         Compress all individual submissions into batch tar files.
+
+         Args:
+             portfolio: Portfolio instance
+             compression: None, 'gzip', or 'zstd' for document compression (default: None)
+             compression_level: Compression level, if None uses defaults (gzip=6, zstd=3)
+             threshold: Size threshold for compressing individual documents (default: 1MB)
+             max_batch_size: Maximum size per batch tar file (default: 1GB)
+             max_workers: Number of threads for parallel document processing (default: portfolio.MAX_WORKERS)
+         """
+         if max_workers is None:
+             max_workers = portfolio.MAX_WORKERS
+
+         portfolio._close_batch_handles()
+
+         if not portfolio.submissions_loaded:
+             portfolio._load_submissions()
+
+         # Only compress non-batch submissions
+         submissions = [s for s in portfolio.submissions if s.batch_tar_path is None]
+
+         if not submissions:
+             print("No submissions to compress")
+             return
+
+         print(f"Compressing {len(submissions)} submissions...")
+
+         # Set default compression level if not specified
+         if compression_level is None:
+             compression_level = 6 if compression == 'gzip' else 3
+
+         # Group submissions into batches
+         current_batch = 0
+         current_size = 0
+         sequence = 1
+         current_tar = None
+
+         with tqdm(total=len(submissions), desc="Compressing submissions") as pbar:
+             for submission in submissions:
+                 # Parallel document processing
+                 with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                     doc_futures = [
+                         executor.submit(self._process_document, doc, compression, threshold, compression_level)
+                         for doc in submission
+                     ]
+
+                 # Collect results maintaining order
+                 documents = []
+                 compression_list = []
+                 for future in doc_futures:
+                     content, compression_type = future.result()
+                     documents.append(content)
+                     compression_list.append(compression_type)
+
+                 # Calculate submission size
+                 metadata_str = bytes_to_str(submission.metadata.content, lower=False)
+                 metadata_json = json.dumps(metadata_str).encode('utf-8')
+                 submission_size = len(metadata_json) + sum(len(doc) for doc in documents)
+
+                 # Check if we need a new batch tar
+                 if current_size > 0 and current_size + submission_size > max_batch_size:
+                     if current_tar:
+                         current_tar.close()
+                     sequence += 1
+                     current_size = 0
+                     current_tar = None
+
+                 # Create tar if needed
+                 if current_tar is None:
+                     batch_path = portfolio.path / f'batch_{current_batch:03d}_{sequence:03d}.tar'
+                     current_tar = tarfile.open(batch_path, 'w')
+
+                 # Write submission to tar
+                 self._write_submission_to_tar(
+                     current_tar,
+                     submission,
+                     documents,
+                     compression_list,
+                     submission.accession
+                 )
+
+                 current_size += submission_size
+
+                 # Remove original submission directory/tar
+                 if submission.path:
+                     if submission.path.is_dir():
+                         shutil.rmtree(submission.path)
+                     elif submission.path.suffix == '.tar':
+                         submission.path.unlink()
+
+                 pbar.update(1)
+
+         # Close final tar
+         if current_tar:
+             current_tar.close()
+
+         # Reload submissions to reflect new batch structure
+         portfolio.submissions_loaded = False
+         portfolio._load_submissions()
+
+         print("Compression complete.")
+
+     def decompress_portfolio(self, portfolio, max_workers=None):
+         """
+         Decompress all batch tar files back to individual submission directories.
+
+         Args:
+             portfolio: Portfolio instance
+             max_workers: Number of threads for parallel file processing (default: portfolio.MAX_WORKERS)
+         """
+         if max_workers is None:
+             max_workers = portfolio.MAX_WORKERS
+
+         if not portfolio.submissions_loaded:
+             portfolio._load_submissions()
+
+         # Find all batch tar files
+         batch_tars = [f for f in portfolio.path.iterdir() if f.is_file() and 'batch' in f.name and f.suffix == '.tar']
+
+         if not batch_tars:
+             print("No batch tar files found to decompress")
+             return
+
+         print(f"Decompressing {len(batch_tars)} batch tar files...")
+
+         # FIRST: Close all batch tar handles to free the files
+         portfolio._close_batch_handles()
+
+         total_extracted = 0
+
+         with tqdm(desc="Decompressing submissions", unit="submissions") as pbar:
+             for batch_tar in batch_tars:
+                 with tarfile.open(batch_tar, 'r') as tar:
+                     # Find all accession directories in this tar
+                     accession_dirs = set()
+                     for member in tar.getmembers():
+                         if '/' in member.name:
+                             accession_dir = member.name.split('/')[0]
+                             accession_dirs.add(accession_dir)
+
+                     # Extract each submission
+                     for accession_dir in accession_dirs:
+                         output_dir = portfolio.path / accession_dir
+                         output_dir.mkdir(exist_ok=True)
+
+                         # Get all files for this accession
+                         accession_files = [m for m in tar.getmembers()
+                                            if m.name.startswith(f'{accession_dir}/') and m.isfile()]
+
+                         # Parallel file extraction
+                         with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                             file_futures = [
+                                 executor.submit(self._extract_file, member, tar, accession_dir, output_dir)
+                                 for member in accession_files
+                             ]
+
+                         # Wait for all files to be processed
+                         for future in as_completed(file_futures):
+                             future.result()
+
+                         total_extracted += 1
+                         pbar.update(1)
+
+
+         # NOW delete the batch tar files after everything is extracted
+         for batch_tar in batch_tars:
+             batch_tar.unlink()
+
+
+         # Reload submissions to reflect new directory structure
+         portfolio.submissions_loaded = False
+         portfolio._load_submissions()
+
+         print(f"Decompression complete. Extracted {total_extracted} submissions.")
+
+     def _process_document(self, doc, compression, threshold, compression_level):
+         """Process a single document: load content and apply compression if needed."""
+         content = doc.content
+         if isinstance(content, str):
+             content = content.encode('utf-8')
+
+         # Apply document-level compression if threshold met AND compression is specified
+         if compression and len(content) >= threshold:
+             if compression == 'gzip':
+                 content = gzip.compress(content, compresslevel=compression_level)
+                 compression_type = 'gzip'
+             elif compression == 'zstd':
+                 content = zstd.ZstdCompressor(level=compression_level).compress(content)
+                 compression_type = 'zstd'
+             else:
+                 compression_type = ''
+         else:
+             compression_type = ''
+
+         return content, compression_type
+
+     def _extract_file(self, member, tar, accession_dir, output_dir):
+         """Extract and decompress a single file from tar."""
+         relative_path = member.name[len(accession_dir)+1:] # Remove accession prefix
+         output_path = output_dir / relative_path
+
+         content = tar.extractfile(member).read()
+
+         # Handle decompression based on filename
+         if relative_path.endswith('.gz'):
+             # File MUST be gzipped if it has .gz extension
+             content = gzip.decompress(content)
+             output_path = output_path.with_suffix('') # Remove .gz
+
+         elif relative_path.endswith('.zst'):
+             # File MUST be zstd compressed if it has .zst extension
+             content = zstd.ZstdDecompressor().decompress(content)
+             output_path = output_path.with_suffix('') # Remove .zst
+
+         # Special handling for metadata.json
+         if output_path.name == 'metadata.json':
+             metadata = json.loads(content.decode('utf-8'))
+             # Remove tar-specific metadata
+             for doc in metadata['documents']:
+                 doc.pop('secsgml_start_byte', None)
+                 doc.pop('secsgml_end_byte', None)
+
+                 # Update filenames to match decompressed files
+                 filename = doc.get('filename', '')
+                 if filename.endswith('.gz'):
+                     doc['filename'] = filename[:-3] # Remove .gz
+                 elif filename.endswith('.zst'):
+                     doc['filename'] = filename[:-4] # Remove .zst
+
+             with output_path.open('w', encoding='utf-8') as f:
+                 json.dump(metadata, f, indent=2)
+         else:
+             # Write document file
+             output_path.parent.mkdir(parents=True, exist_ok=True)
+             with output_path.open('wb') as f:
+                 f.write(content)
+
+
+     def _write_submission_to_tar(self, tar_handle, submission, documents, compression_list, accession_prefix):
+         """Write a submission to a tar file with optional document compression."""
+         # Prepare metadata
+         metadata = submission.metadata.content.copy()
+
+         # Update filenames for compressed documents BEFORE size calculation
+         for i, compression in enumerate(compression_list):
+             if compression:
+                 doc = metadata['documents'][i]
+                 filename = doc.get('filename', doc['sequence'] + '.txt')
+                 if compression == 'gzip' and not filename.endswith('.gz'):
+                     doc['filename'] = filename + '.gz'
+                 elif compression == 'zstd' and not filename.endswith('.zst'):
+                     doc['filename'] = filename + '.zst'
+
+         # Add document sizes to metadata for calculate_documents_locations_in_tar
+         for i, content in enumerate(documents):
+             metadata['documents'][i]['secsgml_size_bytes'] = len(content)
+
+         # NOW calculate document positions with the correct filenames
+         metadata = calculate_documents_locations_in_tar(metadata)
+
+         # Write metadata
+         metadata_str = bytes_to_str(metadata, lower=False)
+         metadata_json = json.dumps(metadata_str).encode('utf-8')
+
+         tarinfo = tarfile.TarInfo(name=f'{accession_prefix}/metadata.json')
+         tarinfo.size = len(metadata_json)
+         tar_handle.addfile(tarinfo, io.BytesIO(metadata_json))
+
+         # Write documents
+         for i, content in enumerate(documents):
+             doc = metadata['documents'][i]
+             filename = doc.get('filename', doc['sequence'] + '.txt')
+
+             tarinfo = tarfile.TarInfo(name=f'{accession_prefix}/{filename}')
+             tarinfo.size = len(content)
+             tar_handle.addfile(tarinfo, io.BytesIO(content))
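Taken together, the portfolio.py and portfolio_compression_utils.py changes replace the per-submission compress()/decompress() methods (removed from submission.py below) with portfolio-level batching: Portfolio.compress() packs every loose submission into batch_NNN_NNN.tar files, optionally gzip- or zstd-compressing documents at or above the size threshold, and Portfolio.decompress() restores one directory per accession. A minimal usage sketch, assuming a directory of already-downloaded submissions ("filings" below is an illustrative path, and Portfolio is imported from the top-level package as in the project README):

    from datamule import Portfolio

    # Sketch of the 1.6.4 batch-compression workflow; "filings" stands in for a
    # directory of previously downloaded submissions.
    portfolio = Portfolio("filings")

    # Pack individual submissions into batch_*.tar files; documents of 1 MB or
    # more are zstd-compressed and stored with a .zst suffix in the tar.
    portfolio.compress(compression='zstd')

    # Later: unpack everything back into one directory per accession number.
    portfolio.decompress()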
{datamule-1.6.2 → datamule-1.6.4}/datamule/submission.py
@@ -2,46 +2,12 @@ from pathlib import Path
  import json
  from .document.document import Document
  from secsgml import parse_sgml_content_into_memory
- from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
  from secsgml.parse_sgml import transform_metadata_string
  import tarfile
- import shutil
  import zstandard as zstd
  import gzip
- import io
 
 
-
- def write_submission_to_tar(output_path,metadata,documents,standardize_metadata,compression_list):
-     # Write tar directly to disk
-     with tarfile.open(output_path, 'w') as tar:
-
-         # calculate document locations in tar
-         metadata = calculate_documents_locations_in_tar(metadata, documents)
-
-         # serialize metadata
-         metadata_str = bytes_to_str(metadata,lower=False)
-         metadata_json = json.dumps(metadata_str).encode('utf-8')
-         # save metadata
-         tarinfo = tarfile.TarInfo(name='metadata.json')
-         tarinfo.size = len(metadata_json)
-         tar.addfile(tarinfo, io.BytesIO(metadata_json))
-
-         for file_num, content in enumerate(documents, 0):
-             if standardize_metadata:
-                 document_name = metadata['documents'][file_num]['filename'] if metadata['documents'][file_num].get('filename') else metadata['documents'][file_num]['sequence'] + '.txt'
-
-             compression = compression_list[file_num]
-             if compression == 'gzip':
-                 document_name = f'{document_name}.gz'
-             elif compression == 'zstd':
-                 document_name = f'{document_name}.zst'
-
-
-             tarinfo = tarfile.TarInfo(name=f'{document_name}')
-             tarinfo.size = len(content)
-             tar.addfile(tarinfo, io.BytesIO(content))
-
  class Submission:
      def __init__(self, path=None, sgml_content=None, keep_document_types=None,
                   batch_tar_path=None, accession_prefix=None, portfolio_ref=None):
@@ -128,94 +94,6 @@ class Submission:
          self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
          self.accession = self.metadata.content['accession-number']
          self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
-
-
-
-     def compress(self, compression=None, level=None, threshold=1048576):
-         if self.path is None:
-             raise ValueError("Compress requires path")
-
-         if compression is not None and compression not in ['gzip', 'zstd']:
-             raise ValueError("compression must be 'gzip' or 'zstd'")
-
-         # check if we're loading from a dir or a tar file
-         is_dir_not_tar = True
-         if self.path.suffix == '.tar':
-             is_dir_not_tar = False
-         elif not self.path.is_dir():
-             raise ValueError("Path must be a directory to compress")
-         # Create tar file (replace directory with .tar file)
-         tar_path = self.path.with_suffix('.tar')
-
-         # load all files in the directory or tar file
-         documents = [doc.content.encode('utf-8') if isinstance(doc.content, str) else doc.content for doc in self]
-
-
-         # we should compress everything here first.
-         compression_list = [compression if len(doc) >= threshold else '' for doc in documents]
-         documents = [gzip.compress(doc, compresslevel=level or 6) if compression == 'gzip' and
-                      len(doc) >= threshold else zstd.ZstdCompressor(level=level or 3).compress(doc) if compression == 'zstd' and
-                      len(doc) >= threshold else doc for doc in documents]
-
-         metadata = self.metadata.content.copy()
-         write_submission_to_tar(tar_path,metadata,documents,compression_list=compression_list,standardize_metadata=True)
-
-         # Delete original folder
-         if is_dir_not_tar:
-             shutil.rmtree(self.path)
-         # otherwise, we already replaced the tar file
-         # Update path to point to new tar file
-         self.path = tar_path
-
-     def decompress(self):
-         if self.path is None:
-             raise ValueError("Decompress requires path")
-         elif self.path.suffix != '.tar':
-             raise ValueError("Can only decompress tar")
-
-         # Create output directory (path without .tar extension)
-         output_dir = self.path.with_suffix('')
-         output_dir.mkdir(exist_ok=True)
-
-         with tarfile.open(self.path, 'r') as tar:
-             for member in tar.getmembers():
-                 if member.isfile():
-                     content = tar.extractfile(member).read()
-
-                     # Decompress based on file extension
-                     if member.name.endswith('.gz'):
-                         content = gzip.decompress(content)
-                         output_path = output_dir / member.name[:-3] # Remove .gz extension
-                     elif member.name.endswith('.zst'):
-                         dctx = zstd.ZstdDecompressor()
-                         content = dctx.decompress(content)
-                         output_path = output_dir / member.name[:-4] # Remove .zst extension
-                     else:
-                         output_path = output_dir / member.name
-
-                     # check if it is metadata.json
-                     if output_path.name == 'metadata.json':
-                         # load as json
-                         metadata = json.loads(content.decode('utf-8'))
-                         # remove SECSGML_START_BYTE and SECSGML_END_BYTE from documents
-                         for doc in metadata['documents']:
-                             if 'secsgml_start_byte' in doc:
-                                 del doc['secsgml_start_byte']
-
-                             if 'secsgml_end_byte' in doc:
-                                 del doc['secsgml_end_byte']
-
-                         with output_path.open('w', encoding='utf-8') as f:
-                             json.dump(metadata, f)
-                     else:
-                         # Write to output directory
-                         output_path.parent.mkdir(parents=True, exist_ok=True)
-                         with output_path.open('wb') as f:
-                             f.write(content)
-
-         # delete original file
-         self.path.unlink()
-         self.path = output_dir
 
      def _load_document_by_index(self, idx):
          """Load a document by its index in the metadata documents list."""
@@ -225,44 +103,38 @@ class Submission:
          if self.path is None and self.batch_tar_path is None:
              return self.documents[idx]
 
-         # Get filename
+         # Get filename from metadata - this is the source of truth
          filename = doc.get('filename')
          if filename is None:
              filename = doc['sequence'] + '.txt'
 
-         extension = Path(filename).suffix
+         # Get the base extension (before any compression extension)
+         # If filename ends with .gz or .zst, the real extension is before that
+         if filename.endswith('.gz'):
+             extension = Path(filename[:-3]).suffix
+             is_compressed = 'gzip'
+         elif filename.endswith('.zst'):
+             extension = Path(filename[:-4]).suffix
+             is_compressed = 'zstd'
+         else:
+             extension = Path(filename).suffix
+             is_compressed = False
 
          # Handle batch tar case
          if self.batch_tar_path is not None:
              with self.portfolio_ref.batch_tar_locks[self.batch_tar_path]:
                  tar_handle = self.portfolio_ref.batch_tar_handles[self.batch_tar_path]
 
-                 # Try different filename variations for compressed files
-                 possible_filenames = [
-                     f'{self.accession_prefix}/{filename}',
-                     f'{self.accession_prefix}/{filename}.gz',
-                     f'{self.accession_prefix}/{filename}.zst'
-                 ]
-
-                 content = None
-                 actual_filename = None
-                 for attempt_filename in possible_filenames:
-                     try:
-                         content = tar_handle.extractfile(attempt_filename).read()
-                         actual_filename = attempt_filename
-                         break
-                     except:
-                         continue
-
-                 if content is None:
-                     raise ValueError(f"Could not find document in batch tar: {self.batch_tar_path}, accession: {self.accession_prefix}, filename: {filename}")
+                 # Use exact filename from metadata
+                 tar_path = f'{self.accession_prefix}/{filename}'
+                 content = tar_handle.extractfile(tar_path).read()
+
 
-             # Decompress if compressed
-             if actual_filename.endswith('.gz'):
+             # Decompress if needed based on filename extension
+             if is_compressed == 'gzip':
                  content = gzip.decompress(content)
-             elif actual_filename.endswith('.zst'):
-                 dctx = zstd.ZstdDecompressor()
-                 content = dctx.decompress(content)
+             elif is_compressed == 'zstd':
+                 content = zstd.ZstdDecompressor().decompress(content)
 
              # Decode text files
              if extension in ['.htm', '.html', '.txt', '.xml']:
@@ -270,35 +142,56 @@ class Submission:
 
              document_path = f"{self.batch_tar_path}::{self.accession_prefix}/{filename}"
 
-         # Handle regular path case (existing logic)
+         # Handle regular path case
          else:
-             document_path = self.path / filename
-
+             # Check if path is a tar file (old format)
              if self.path.suffix == '.tar':
                  with tarfile.open(self.path, 'r') as tar:
-                     # so here is where we should use bytes instead with byte offset.
-                     # bandaid fix TODO
+                     # Try to extract the file, handling compression
                      try:
                          content = tar.extractfile(filename).read()
+                         actual_filename = filename
                      except:
                          try:
-                             content = tar.extractfile(filename+'.gz').read()
+                             content = tar.extractfile(filename + '.gz').read()
+                             actual_filename = filename + '.gz'
+                             is_compressed = 'gzip'
                          except:
-                             try:
-                                 content = tar.extractfile(filename+'.zst').read()
+                             try:
+                                 content = tar.extractfile(filename + '.zst').read()
+                                 actual_filename = filename + '.zst'
+                                 is_compressed = 'zstd'
                              except:
-                                 # some of these issues are on SEC data end, will fix when I setup cloud.
-                                 raise ValueError(f"Something went wrong with tar: {self.path}")
+                                 raise FileNotFoundError(f"Document file not found in tar: {filename}")
+
                  # Decompress if compressed
-                 if filename.endswith('.gz'):
+                 if is_compressed == 'gzip':
                      content = gzip.decompress(content)
-                 elif filename.endswith('.zst'):
-                     dctx = zstd.ZstdDecompressor()
-                     content = dctx.decompress(content)
+                 elif is_compressed == 'zstd':
+                     content = zstd.ZstdDecompressor().decompress(content)
+
+                 # Decode text files
+                 if extension in ['.htm', '.html', '.txt', '.xml']:
+                     content = content.decode('utf-8', errors='replace')
+
+                 document_path = f"{self.path}::{actual_filename}"
+
              else:
+                 # Regular directory case
+                 document_path = self.path / filename
+
+                 if not document_path.exists():
+                     raise FileNotFoundError(f"Document file not found: {document_path}")
+
                  with document_path.open('rb') as f:
                      content = f.read()
-
+
+                 # Decompress if needed based on filename extension
+                 if is_compressed == 'gzip':
+                     content = gzip.decompress(content)
+                 elif is_compressed == 'zstd':
+                     content = zstd.ZstdDecompressor().decompress(content)
+
                  # Decode text files
                  if extension in ['.htm', '.html', '.txt', '.xml']:
                      content = content.decode('utf-8', errors='replace')
@@ -311,7 +204,6 @@ class Submission:
              accession=self.accession,
              path=document_path
          )
-
      def __iter__(self):
          """Make Submission iterable by yielding all documents."""
          for idx in range(len(self.metadata.content['documents'])):
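The rewritten document loader above treats the filename recorded in metadata as the source of truth: a trailing .gz or .zst suffix indicates how the stored bytes are compressed, and the document's real extension sits immediately before that suffix. A standalone sketch of the same inference, using a helper name of our own rather than anything shipped in the package:

    import gzip
    from pathlib import Path

    import zstandard as zstd

    def load_stored_document(raw: bytes, filename: str) -> tuple[bytes, str]:
        # Mirror _load_document_by_index: undo the storage compression implied
        # by the metadata filename and return (content, real_extension).
        if filename.endswith('.gz'):
            return gzip.decompress(raw), Path(filename[:-3]).suffix
        if filename.endswith('.zst'):
            return zstd.ZstdDecompressor().decompress(raw), Path(filename[:-4]).suffix
        return raw, Path(filename).suffix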
{datamule-1.6.2 → datamule-1.6.4}/datamule.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 1.6.2
+ Version: 1.6.4
  Summary: Work with SEC submissions at scale.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
{datamule-1.6.2 → datamule-1.6.4}/datamule.egg-info/SOURCES.txt
@@ -5,6 +5,7 @@ datamule/helper.py
  datamule/index.py
  datamule/package_updater.py
  datamule/portfolio.py
+ datamule/portfolio_compression_utils.py
  datamule/sheet.py
  datamule/submission.py
  datamule.egg-info/PKG-INFO
{datamule-1.6.2 → datamule-1.6.4}/setup.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
  setup(
      name="datamule",
      author="John Friedman",
-     version="1.6.2",
+     version="1.6.4",
      description="Work with SEC submissions at scale.",
      packages=find_packages(include=['datamule', 'datamule.*']),
      url="https://github.com/john-friedman/datamule-python",