datamule 1.5.2__tar.gz → 1.5.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {datamule-1.5.2 → datamule-1.5.3}/PKG-INFO +1 -1
  2. {datamule-1.5.2 → datamule-1.5.3}/datamule/portfolio.py +8 -4
  3. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/submissions/downloader.py +3 -2
  4. {datamule-1.5.2 → datamule-1.5.3}/datamule/seclibrary/downloader.py +15 -8
  5. {datamule-1.5.2 → datamule-1.5.3}/datamule/submission.py +115 -45
  6. {datamule-1.5.2 → datamule-1.5.3}/datamule.egg-info/PKG-INFO +1 -1
  7. {datamule-1.5.2 → datamule-1.5.3}/setup.py +1 -1
  8. {datamule-1.5.2 → datamule-1.5.3}/datamule/__init__.py +0 -0
  9. {datamule-1.5.2 → datamule-1.5.3}/datamule/config.py +0 -0
  10. {datamule-1.5.2 → datamule-1.5.3}/datamule/data/listed_filer_metadata.csv +0 -0
  11. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/__init__.py +0 -0
  12. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/document.py +0 -0
  13. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/__init__.py +0 -0
  14. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/atsn.py +0 -0
  15. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/cfportal.py +0 -0
  16. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/d.py +0 -0
  17. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/ex102_abs.py +0 -0
  18. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/ex99a_sdr.py +0 -0
  19. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/ex99c_sdr.py +0 -0
  20. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/ex99g_sdr.py +0 -0
  21. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/ex99i_sdr.py +0 -0
  22. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/information_table.py +0 -0
  23. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/nmfp.py +0 -0
  24. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/npx.py +0 -0
  25. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/onefourtyfour.py +0 -0
  26. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/ownership.py +0 -0
  27. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/proxy_voting_record.py +0 -0
  28. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/sbs.py +0 -0
  29. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/sbsef.py +0 -0
  30. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/schedule13.py +0 -0
  31. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/sdr.py +0 -0
  32. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/submission_metadata.py +0 -0
  33. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/ta.py +0 -0
  34. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/thirteenfhr.py +0 -0
  35. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/twentyfivense.py +0 -0
  36. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/mappings/twentyfourf2nt.py +0 -0
  37. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/processing.py +0 -0
  38. {datamule-1.5.2 → datamule-1.5.3}/datamule/document/table.py +0 -0
  39. {datamule-1.5.2 → datamule-1.5.3}/datamule/helper.py +0 -0
  40. {datamule-1.5.2 → datamule-1.5.3}/datamule/index.py +0 -0
  41. {datamule-1.5.2 → datamule-1.5.3}/datamule/mapping_dicts/__init__.py +0 -0
  42. {datamule-1.5.2 → datamule-1.5.3}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  43. {datamule-1.5.2 → datamule-1.5.3}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  44. {datamule-1.5.2 → datamule-1.5.3}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  45. {datamule-1.5.2 → datamule-1.5.3}/datamule/package_updater.py +0 -0
  46. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/__init__.py +0 -0
  47. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/infrastructure/__init__.py +0 -0
  48. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  49. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/submissions/__init__.py +0 -0
  50. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/submissions/eftsquery.py +0 -0
  51. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/submissions/monitor.py +0 -0
  52. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/submissions/streamer.py +0 -0
  53. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/submissions/textsearch.py +0 -0
  54. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/utils.py +0 -0
  55. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/xbrl/__init__.py +0 -0
  56. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  57. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  58. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  59. {datamule-1.5.2 → datamule-1.5.3}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  60. {datamule-1.5.2 → datamule-1.5.3}/datamule/seclibrary/__init__.py +0 -0
  61. {datamule-1.5.2 → datamule-1.5.3}/datamule/seclibrary/bq.py +0 -0
  62. {datamule-1.5.2 → datamule-1.5.3}/datamule/seclibrary/query.py +0 -0
  63. {datamule-1.5.2 → datamule-1.5.3}/datamule/sheet.py +0 -0
  64. {datamule-1.5.2 → datamule-1.5.3}/datamule.egg-info/SOURCES.txt +0 -0
  65. {datamule-1.5.2 → datamule-1.5.3}/datamule.egg-info/dependency_links.txt +0 -0
  66. {datamule-1.5.2 → datamule-1.5.3}/datamule.egg-info/requires.txt +0 -0
  67. {datamule-1.5.2 → datamule-1.5.3}/datamule.egg-info/top_level.txt +0 -0
  68. {datamule-1.5.2 → datamule-1.5.3}/setup.cfg +0 -0
{datamule-1.5.2 → datamule-1.5.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.5.2
+Version: 1.5.3
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
{datamule-1.5.2 → datamule-1.5.3}/datamule/portfolio.py
@@ -34,7 +34,6 @@ class Portfolio:
 
     def _load_submissions(self):
         folders = [f for f in self.path.iterdir() if f.is_dir() or f.suffix=='.tar']
-        print(folders)
         print(f"Loading {len(folders)} submissions")
 
         def load_submission(folder):
@@ -126,7 +125,8 @@ class Portfolio:
             # First query, just set the accession numbers
             self.accession_numbers = new_accession_numbers
 
-    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=[],requests_per_second=5, **kwargs):
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=[],
+                             requests_per_second=5,keep_filtered_metadata=False,standardize_metadata=True, **kwargs):
         if provider is None:
             config = Config()
             provider = config.get_default_source()
@@ -143,7 +143,9 @@ class Portfolio:
                 submission_type=submission_type,
                 filing_date=filing_date,
                 accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
-                keep_document_types=document_type
+                keep_document_types=document_type,
+                keep_filtered_metadata=keep_filtered_metadata,
+                standardize_metadata=standardize_metadata,
             )
         else:
             sec_download(
@@ -153,7 +155,9 @@ class Portfolio:
                 filing_date=filing_date,
                 requests_per_second=requests_per_second,
                 accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
-                keep_document_types=document_type
+                keep_document_types=document_type,
+                keep_filtered_metadata=keep_filtered_metadata,
+                standardize_metadata=standardize_metadata,
             )
 
         self.submissions_loaded = False
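
The two new flags on Portfolio.download_submissions are simply forwarded, through whichever provider's downloader is selected, to write_sgml_file_to_tar. A minimal usage sketch, assuming the package-level Portfolio export; the output directory, ticker, and form type below are illustrative and not taken from this diff:

    from datamule import Portfolio

    portfolio = Portfolio("filings")              # illustrative output directory
    portfolio.download_submissions(
        ticker="AAPL",                            # illustrative filer
        submission_type="10-K",                   # illustrative form type
        document_type=["10-K"],                   # keep only these document types
        keep_filtered_metadata=False,             # new in 1.5.3: drop metadata for filtered-out documents
        standardize_metadata=True,                # new in 1.5.3: passed through to the SGML-to-tar writer
    )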
{datamule-1.5.2 → datamule-1.5.3}/datamule/sec/submissions/downloader.py
@@ -5,7 +5,7 @@ from tqdm import tqdm
 
 def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
              requests_per_second=5, output_dir="filings", accession_numbers=None,
-             quiet=False, keep_document_types=[]):
+             quiet=False, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
     # Make sure output directory exists
     os.makedirs(output_dir, exist_ok=True)
 
@@ -14,7 +14,8 @@ def download(cik=None, submission_type=None, filing_date=None, location=None, na
     # Create a wrapper for the download_callback that includes the output_dir
     async def callback_wrapper(hit, content, cik, accno, url):
         output_path = os.path.join(output_dir, accno.replace('-','') + '.tar')
-        write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=keep_document_types)
+        write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=keep_document_types,keep_filtered_metadata=keep_filtered_metadata,
+                               standardize_metadata=standardize_metadata)
         pbar.update(1)
 
 
{datamule-1.5.2 → datamule-1.5.3}/datamule/seclibrary/downloader.py
@@ -74,7 +74,7 @@ class Downloader:
             print(f"Failed to log error to {error_file}: {str(e)}")
 
     class FileProcessor:
-        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=[]):
+        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=[], keep_filtered_metadata=False,standardize_metadata=True):
            self.processing_queue = Queue(maxsize=queue_size)
            self.should_stop = False
            self.processing_workers = []
@@ -84,6 +84,8 @@ class Downloader:
            self.pbar = pbar
            self.downloader = downloader
            self.keep_document_types = keep_document_types
+           self.keep_filtered_metadata = keep_filtered_metadata
+           self.standardize_metadata = standardize_metadata
 
        def start_processing_workers(self):
            for _ in range(self.max_workers):
@@ -95,7 +97,7 @@
        def _process_file(self, item):
            filename, content = item
            output_path = os.path.join(self.output_dir, filename.split('.')[0] + '.tar')
-           write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types)
+           write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types, keep_filtered_metadata=self.keep_filtered_metadata,standardize_metadata=self.standardize_metadata)
            self.pbar.update(1)
 
        def _processing_worker(self):
@@ -204,11 +206,12 @@
            except Exception as e:
                self._log_error(output_dir, filename, str(e))
 
-    async def process_batch(self, urls, output_dir, keep_document_types=[]):
+    async def process_batch(self, urls, output_dir, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
        os.makedirs(output_dir, exist_ok=True)
 
        with tqdm(total=len(urls), desc="Processing files") as pbar:
-           processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self, keep_document_types=keep_document_types)
+           processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self, keep_document_types=keep_document_types,
+                                          keep_filtered_metadata=keep_filtered_metadata,standardize_metadata=standardize_metadata)
            processor.start_processing_workers()
 
            semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
@@ -231,7 +234,7 @@
            processor.stop_workers()
            decompression_pool.shutdown()
 
-    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[]):
+    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
        """
        Query SEC filings and download/process them.
 
@@ -242,6 +245,7 @@
        - output_dir: Directory to save downloaded files
        - accession_numbers: List of specific accession numbers to download
        - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+       - keep_filtered_metadata: Whether to keep metadata for filtered documents
        """
        if self.api_key is None:
            raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
@@ -279,7 +283,7 @@
        start_time = time.time()
 
        # Process the batch asynchronously
-       asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types))
+       asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata))
 
        # Calculate and display performance metrics
        elapsed_time = time.time() - start_time
@@ -292,7 +296,7 @@
        self.loop.call_soon_threadsafe(self.loop.stop)
 
 
-def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[]):
+def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
    """
    Query SEC filings and download/process them.
 
@@ -304,6 +308,7 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
    - output_dir: Directory to save downloaded files
    - accession_numbers: List of specific accession numbers to download
    - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+   - keep_filtered_metadata: Whether to keep metadata for filtered documents
    """
    if accession_numbers:
        accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
@@ -317,5 +322,7 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
        filing_date=filing_date,
        output_dir=output_dir,
        accession_numbers=accession_numbers,
-       keep_document_types=keep_document_types
+       keep_document_types=keep_document_types,
+       keep_filtered_metadata=keep_filtered_metadata,
+       standardize_metadata=standardize_metadata
    )
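
In the seclibrary path the same two flags are threaded end to end: the module-level download() passes them to Downloader.download(), which forwards them through process_batch() and FileProcessor into write_sgml_file_to_tar(). A hedged sketch of a direct call; the CIK, form type, date range, and API key are placeholders, and the accepted filing_date formats are not shown in this diff:

    from datamule.seclibrary.downloader import download

    download(
        cik="0000320193",                          # placeholder CIK
        submission_type="10-K",                    # placeholder form type
        filing_date=("2023-01-01", "2023-12-31"),  # placeholder date range
        api_key="YOUR_DATAMULE_API_KEY",           # or set the DATAMULE_API_KEY environment variable
        output_dir="downloads",
        keep_document_types=["10-K"],
        keep_filtered_metadata=False,
        standardize_metadata=True,
    )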
{datamule-1.5.2 → datamule-1.5.3}/datamule/submission.py
@@ -2,11 +2,79 @@ from pathlib import Path
 import json
 from .document.document import Document
 from secsgml import parse_sgml_content_into_memory
+from secsgml.utils import bytes_to_str
 import tarfile
 import shutil
 import zstandard as zstd
-from io import BytesIO
 import gzip
+import io
+import copy
+
+
+def calculate_documents_locations_in_tar(metadata, documents):
+    # Step 1: Add placeholder byte positions to get accurate size (10-digit padded)
+    placeholder_metadata = copy.deepcopy(metadata)
+
+    for file_num in range(len(documents)):
+        if 'documents' in placeholder_metadata:
+            placeholder_metadata['documents'][file_num]['secsgml_start_byte'] = "9999999999" # 10 digits
+            placeholder_metadata['documents'][file_num]['secsgml_end_byte'] = "9999999999" # 10 digits
+
+    # Step 2: Calculate size with placeholders
+    placeholder_str = bytes_to_str(placeholder_metadata, lower=False)
+    placeholder_json = json.dumps(placeholder_str).encode('utf-8')
+    metadata_size = len(placeholder_json)
+
+    # Step 3: Now calculate actual positions using this size
+    current_pos = 512 + metadata_size
+    current_pos += (512 - (current_pos % 512)) % 512
+
+    # Step 4: Calculate real positions and update original metadata (10-digit padded)
+    for file_num, content in enumerate(documents):
+        start_byte = current_pos + 512
+        end_byte = start_byte + len(content)
+
+        if 'documents' in metadata:
+            metadata['documents'][file_num]['secsgml_start_byte'] = f"{start_byte:010d}" # 10-digit padding
+            metadata['documents'][file_num]['secsgml_end_byte'] = f"{end_byte:010d}" # 10-digit padding
+
+
+        file_total_size = 512 + len(content)
+        padded_size = file_total_size + (512 - (file_total_size % 512)) % 512
+        current_pos += padded_size
+
+    return metadata
+
+
+def write_submission_to_tar(output_path,metadata,documents,standardize_metadata,compression_list):
+    # Write tar directly to disk
+    with tarfile.open(output_path, 'w') as tar:
+
+        # calculate document locations in tar
+        metadata = calculate_documents_locations_in_tar(metadata, documents)
+
+        # serialize metadata
+        metadata_str = bytes_to_str(metadata,lower=False)
+        metadata_json = json.dumps(metadata_str).encode('utf-8')
+        # save metadata
+        tarinfo = tarfile.TarInfo(name='metadata.json')
+        tarinfo.size = len(metadata_json)
+        tar.addfile(tarinfo, io.BytesIO(metadata_json))
+
+        for file_num, content in enumerate(documents, 0):
+            if standardize_metadata:
+                document_name = metadata['documents'][file_num]['filename'] if metadata['documents'][file_num].get('filename') else metadata['documents'][file_num]['sequence'] + '.txt'
+
+            compression = compression_list[file_num]
+            if compression == 'gzip':
+                document_name = f'{document_name}.gz'
+            elif compression == 'zstd':
+                document_name = f'{document_name}.zst'
+
+
+            tarinfo = tarfile.TarInfo(name=f'{document_name}')
+            tarinfo.size = len(content)
+            tar.addfile(tarinfo, io.BytesIO(content))
 
 class Submission:
     def __init__(self, path=None,sgml_content=None,keep_document_types=None):
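
The offsets recorded by calculate_documents_locations_in_tar follow the POSIX tar layout: each member is a 512-byte header followed by its content, padded up to the next 512-byte boundary, with metadata.json written first. The placeholder pass exists because the offsets are themselves stored inside metadata.json, so its serialized size must be pinned (hence the fixed 10-digit padding) before the real positions can be computed. A standalone sketch of the same arithmetic with invented payload sizes:

    # Toy re-derivation of the tar offsets; the payload sizes are made up.
    metadata_json = b"{}" * 300                        # stand-in for the serialized metadata.json (600 bytes)
    documents = [b"x" * 700, b"y" * 100]               # stand-in document payloads

    pos = 512 + len(metadata_json)                     # metadata.json header + content
    pos += (512 - (pos % 512)) % 512                   # pad to the next 512-byte block

    for content in documents:
        start = pos + 512                              # content begins after this member's header
        end = start + len(content)
        print(f"{start:010d}-{end:010d}")              # 10-digit zero padding, as stored in the metadata
        member = 512 + len(content)
        pos += member + (512 - (member % 512)) % 512   # advance past the padded member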
@@ -68,51 +136,34 @@ class Submission:
         if compression is not None and compression not in ['gzip', 'zstd']:
             raise ValueError("compression must be 'gzip' or 'zstd'")
 
+        # check if we're loading from a dir or a tar file
+        is_dir_not_tar = True
+        if self.path.suffix == '.tar':
+            is_dir_not_tar = False
+        elif not self.path.is_dir():
+            raise ValueError("Path must be a directory to compress")
         # Create tar file (replace directory with .tar file)
         tar_path = self.path.with_suffix('.tar')
+
+        # load all files in the directory or tar file
+        documents = [doc.content.encode('utf-8') if isinstance(doc.content, str) else doc.content for doc in self]
 
-        with tarfile.open(tar_path, 'w') as tar:
-            # Add metadata.json first
-            metadata_path = self.path / 'metadata.json'
-            if metadata_path.exists():
-                tar.add(metadata_path, arcname='metadata.json')
-
-            # Add documents in order
-            for doc in self.metadata.content['documents']:
-                filename = doc.get('filename')
-                if filename is None:
-                    filename = doc['sequence'] + '.txt'
-
-                file_path = self.path / filename
-                if file_path.exists():
-                    file_size = file_path.stat().st_size
 
-
-                    # Compress if compression specified and over threshold
-                    if compression is not None and file_size >= threshold:
-                        content = file_path.read_bytes()
-
-                        if compression == 'gzip':
-                            compressed_content = gzip.compress(content, compresslevel=level or 6)
-                            compressed_filename = filename + '.gz'
-                        else: # zstd
-                            cctx = zstd.ZstdCompressor(level=level or 3)
-                            compressed_content = cctx.compress(content)
-                            compressed_filename = filename + '.zst'
-
-                        # Add compressed file to tar
-                        tarinfo = tarfile.TarInfo(name=compressed_filename)
-                        tarinfo.size = len(compressed_content)
-                        tar.addfile(tarinfo, BytesIO(compressed_content))
-                    else:
-                        # Add uncompressed file
-                        tar.add(file_path, arcname=filename)
+        # we should compress everything here first.
+        compression_list = [compression if len(doc) >= threshold else '' for doc in documents]
+        documents = [gzip.compress(doc, compresslevel=level or 6) if compression == 'gzip' and
+                     len(doc) >= threshold else zstd.ZstdCompressor(level=level or 3).compress(doc) if compression == 'zstd' and
+                     len(doc) >= threshold else doc for doc in documents]
 
+        metadata = self.metadata.content.copy()
+        write_submission_to_tar(tar_path,metadata,documents,compression_list=compression_list,standardize_metadata=True)
+
         # Delete original folder
-        shutil.rmtree(self.path)
-
-        # Update path to point to new tar file
-        self.path = tar_path
+        if is_dir_not_tar:
+            shutil.rmtree(self.path)
+        # otherwise, we already replaced the tar file
+        # Update path to point to new tar file
+        self.path = tar_path
 
     def decompress(self):
         if self.path is None:
@@ -129,17 +180,36 @@
                 if member.isfile():
                     content = tar.extractfile(member).read()
 
-                    # Decompress if gzipped
+                    # Decompress based on file extension
                     if member.name.endswith('.gz'):
                         content = gzip.decompress(content)
                         output_path = output_dir / member.name[:-3] # Remove .gz extension
+                    elif member.name.endswith('.zst'):
+                        dctx = zstd.ZstdDecompressor()
+                        content = dctx.decompress(content)
+                        output_path = output_dir / member.name[:-4] # Remove .zst extension
                     else:
                         output_path = output_dir / member.name
 
-                    # Write to output directory
-                    output_path.parent.mkdir(parents=True, exist_ok=True)
-                    with output_path.open('wb') as f:
-                        f.write(content)
+                    # check if it is metadata.json
+                    if output_path.name == 'metadata.json':
+                        # load as json
+                        metadata = json.loads(content.decode('utf-8'))
+                        # remove SECSGML_START_BYTE and SECSGML_END_BYTE from documents
+                        for doc in metadata['documents']:
+                            if 'secsgml_start_byte' in doc:
+                                del doc['secsgml_start_byte']
+
+                            if 'secsgml_end_byte' in doc:
+                                del doc['secsgml_end_byte']
+
+                        with output_path.open('w', encoding='utf-8') as f:
+                            json.dump(metadata, f)
+                    else:
+                        # Write to output directory
+                        output_path.parent.mkdir(parents=True, exist_ok=True)
+                        with output_path.open('wb') as f:
+                            f.write(content)
 
         # delete original file
         self.path.unlink()
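
Taken together, compress() now rewrites a submission directory (or an existing tar) into a single tar whose metadata.json carries the secsgml_start_byte/secsgml_end_byte offsets, and decompress() reverses this and strips those keys again. A rough round-trip sketch, assuming the package-level Submission export and that compression, level, and threshold are keyword arguments of compress() as the hunk's variable names suggest; the path and numeric values are illustrative:

    from datamule import Submission

    sub = Submission(path="filings/000032019324000123")   # illustrative path to a downloaded submission
    sub.compress(compression="zstd", level=3, threshold=1_048_576)
    # sub.path now points at filings/000032019324000123.tar; documents at or above
    # the threshold are stored with a .zst suffix inside the tar.

    sub.decompress()   # re-expands the tar and drops secsgml_start_byte / secsgml_end_byte from metadata.json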
{datamule-1.5.2 → datamule-1.5.3}/datamule.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.5.2
+Version: 1.5.3
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
{datamule-1.5.2 → datamule-1.5.3}/setup.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="1.5.2",
+    version="1.5.3",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",