datamule 1.6.1__py3-none-any.whl → 1.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/portfolio.py +102 -18
- datamule/portfolio_compression_utils.py +291 -0
- datamule/seclibrary/downloader.py +163 -161
- datamule/submission.py +82 -186
- datamule/utils/construct_submissions_data.py +4 -4
- {datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/METADATA +1 -1
- {datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/RECORD +9 -8
- {datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/WHEEL +0 -0
- {datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/top_level.txt +0 -0
datamule/submission.py
CHANGED
@@ -2,87 +2,28 @@ from pathlib import Path
 import json
 from .document.document import Document
 from secsgml import parse_sgml_content_into_memory
-from secsgml.utils import bytes_to_str
 from secsgml.parse_sgml import transform_metadata_string
 import tarfile
-import shutil
 import zstandard as zstd
 import gzip
-import io
-import copy
 
 
-
-
-
-
-    for file_num in range(len(documents)):
-        if 'documents' in placeholder_metadata:
-            placeholder_metadata['documents'][file_num]['secsgml_start_byte'] = "9999999999"  # 10 digits
-            placeholder_metadata['documents'][file_num]['secsgml_end_byte'] = "9999999999"  # 10 digits
-
-    # Step 2: Calculate size with placeholders
-    placeholder_str = bytes_to_str(placeholder_metadata, lower=False)
-    placeholder_json = json.dumps(placeholder_str).encode('utf-8')
-    metadata_size = len(placeholder_json)
-
-    # Step 3: Now calculate actual positions using this size
-    current_pos = 512 + metadata_size
-    current_pos += (512 - (current_pos % 512)) % 512
-
-    # Step 4: Calculate real positions and update original metadata (10-digit padded)
-    for file_num, content in enumerate(documents):
-        start_byte = current_pos + 512
-        end_byte = start_byte + len(content)
+class Submission:
+    def __init__(self, path=None, sgml_content=None, keep_document_types=None,
+                 batch_tar_path=None, accession_prefix=None, portfolio_ref=None):
 
-
-
-
-
+        # Validate parameters
+        param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path])
+        if param_count != 1:
+            raise ValueError("Exactly one of path, sgml_content, or batch_tar_path must be provided")
 
-
-
-        current_pos += padded_size
-
-    return metadata
-
-
-def write_submission_to_tar(output_path, metadata, documents, standardize_metadata, compression_list):
-    # Write tar directly to disk
-    with tarfile.open(output_path, 'w') as tar:
-
-        # calculate document locations in tar
-        metadata = calculate_documents_locations_in_tar(metadata, documents)
+        if batch_tar_path is not None and (accession_prefix is None or portfolio_ref is None):
+            raise ValueError("batch_tar_path requires both accession_prefix and portfolio_ref")
 
-        #
-
-
-
-        tarinfo = tarfile.TarInfo(name='metadata.json')
-        tarinfo.size = len(metadata_json)
-        tar.addfile(tarinfo, io.BytesIO(metadata_json))
-
-        for file_num, content in enumerate(documents, 0):
-            if standardize_metadata:
-                document_name = metadata['documents'][file_num]['filename'] if metadata['documents'][file_num].get('filename') else metadata['documents'][file_num]['sequence'] + '.txt'
-
-            compression = compression_list[file_num]
-            if compression == 'gzip':
-                document_name = f'{document_name}.gz'
-            elif compression == 'zstd':
-                document_name = f'{document_name}.zst'
-
-
-            tarinfo = tarfile.TarInfo(name=f'{document_name}')
-            tarinfo.size = len(content)
-            tar.addfile(tarinfo, io.BytesIO(content))
-
-class Submission:
-    def __init__(self, path=None, sgml_content=None, keep_document_types=None):
-        if path is None and sgml_content is None:
-            raise ValueError("Either path or sgml_content must be provided")
-        if path is not None and sgml_content is not None:
-            raise ValueError("Only one of path or sgml_content must be provided")
+        # Initialize batch tar attributes
+        self.batch_tar_path = batch_tar_path
+        self.accession_prefix = accession_prefix
+        self.portfolio_ref = portfolio_ref
 
         if sgml_content is not None:
             self.path = None
@@ -100,7 +41,7 @@ class Submission:
             filtered_metadata_documents = []
 
             for idx,doc in enumerate(self.metadata.content['documents']):
-                type = doc.get('type')
+                type = doc.get('type')
 
                 # Keep only specified types
                 if keep_document_types is not None and type not in keep_document_types:
@@ -115,7 +56,26 @@ class Submission:
 
             self.metadata.content['documents'] = filtered_metadata_documents
 
-
+        elif batch_tar_path is not None:
+            # Batch tar case
+            self.path = None
+
+            # Load metadata from batch tar
+            with self.portfolio_ref.batch_tar_locks[batch_tar_path]:
+                tar_handle = self.portfolio_ref.batch_tar_handles[batch_tar_path]
+                metadata_obj = tar_handle.extractfile(f'{accession_prefix}/metadata.json')
+                metadata = json.loads(metadata_obj.read().decode('utf-8'))
+
+            # Set metadata path using :: notation
+            metadata_path = f"{batch_tar_path}::{accession_prefix}/metadata.json"
+
+            # standardize metadata
+            metadata = transform_metadata_string(metadata)
+            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
+            self.accession = self.metadata.content['accession-number']
+            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
+
+        elif path is not None:
             self.path = Path(path)
             if self.path.suffix == '.tar':
                 with tarfile.open(self.path,'r') as tar:
@@ -134,135 +94,71 @@ class Submission:
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
             self.accession = self.metadata.content['accession-number']
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
-
-
-
-    def compress(self, compression=None, level=None, threshold=1048576):
-        if self.path is None:
-            raise ValueError("Compress requires path")
-
-        if compression is not None and compression not in ['gzip', 'zstd']:
-            raise ValueError("compression must be 'gzip' or 'zstd'")
-
-        # check if we're loading from a dir or a tar file
-        is_dir_not_tar = True
-        if self.path.suffix == '.tar':
-            is_dir_not_tar = False
-        elif not self.path.is_dir():
-            raise ValueError("Path must be a directory to compress")
-        # Create tar file (replace directory with .tar file)
-        tar_path = self.path.with_suffix('.tar')
-
-        # load all files in the directory or tar file
-        documents = [doc.content.encode('utf-8') if isinstance(doc.content, str) else doc.content for doc in self]
-
-
-        # we should compress everything here first.
-        compression_list = [compression if len(doc) >= threshold else '' for doc in documents]
-        documents = [gzip.compress(doc, compresslevel=level or 6) if compression == 'gzip' and
-                     len(doc) >= threshold else zstd.ZstdCompressor(level=level or 3).compress(doc) if compression == 'zstd' and
-                     len(doc) >= threshold else doc for doc in documents]
-
-        metadata = self.metadata.content.copy()
-        write_submission_to_tar(tar_path,metadata,documents,compression_list=compression_list,standardize_metadata=True)
-
-        # Delete original folder
-        if is_dir_not_tar:
-            shutil.rmtree(self.path)
-        # otherwise, we already replaced the tar file
-        # Update path to point to new tar file
-        self.path = tar_path
-
-    def decompress(self):
-        if self.path is None:
-            raise ValueError("Decompress requires path")
-        elif self.path.suffix != '.tar':
-            raise ValueError("Can only decompress tar")
-
-        # Create output directory (path without .tar extension)
-        output_dir = self.path.with_suffix('')
-        output_dir.mkdir(exist_ok=True)
-
-        with tarfile.open(self.path, 'r') as tar:
-            for member in tar.getmembers():
-                if member.isfile():
-                    content = tar.extractfile(member).read()
-
-                    # Decompress based on file extension
-                    if member.name.endswith('.gz'):
-                        content = gzip.decompress(content)
-                        output_path = output_dir / member.name[:-3]  # Remove .gz extension
-                    elif member.name.endswith('.zst'):
-                        dctx = zstd.ZstdDecompressor()
-                        content = dctx.decompress(content)
-                        output_path = output_dir / member.name[:-4]  # Remove .zst extension
-                    else:
-                        output_path = output_dir / member.name
-
-                    # check if it is metadata.json
-                    if output_path.name == 'metadata.json':
-                        # load as json
-                        metadata = json.loads(content.decode('utf-8'))
-                        # remove SECSGML_START_BYTE and SECSGML_END_BYTE from documents
-                        for doc in metadata['documents']:
-                            if 'secsgml_start_byte' in doc:
-                                del doc['secsgml_start_byte']
-
-                            if 'secsgml_end_byte' in doc:
-                                del doc['secsgml_end_byte']
-
-                        with output_path.open('w', encoding='utf-8') as f:
-                            json.dump(metadata, f)
-                    else:
-                        # Write to output directory
-                        output_path.parent.mkdir(parents=True, exist_ok=True)
-                        with output_path.open('wb') as f:
-                            f.write(content)
-
-        # delete original file
-        self.path.unlink()
-        self.path = output_dir
 
     def _load_document_by_index(self, idx):
         """Load a document by its index in the metadata documents list."""
         doc = self.metadata.content['documents'][idx]
 
         # If loaded from sgml_content, return pre-loaded document
-        if self.path is None:
+        if self.path is None and self.batch_tar_path is None:
            return self.documents[idx]
 
-        #
+        # Get filename from metadata - this is the source of truth
        filename = doc.get('filename')
        if filename is None:
            filename = doc['sequence'] + '.txt'
 
-
-        extension
+        # Get the base extension (before any compression extension)
+        # If filename ends with .gz or .zst, the real extension is before that
+        if filename.endswith('.gz'):
+            extension = Path(filename[:-3]).suffix
+            is_compressed = 'gzip'
+        elif filename.endswith('.zst'):
+            extension = Path(filename[:-4]).suffix
+            is_compressed = 'zstd'
+        else:
+            extension = Path(filename).suffix
+            is_compressed = False
 
-
-
-
-
-
-
-
-
-
-
-
-        # some of these issues are on SEC data end, will fix when I setup cloud.
-        raise ValueError(f"Something went wrong with tar: {self.path}")
-        # Decompress if compressed
-        if filename.endswith('.gz'):
+        # Handle batch tar case
+        if self.batch_tar_path is not None:
+            with self.portfolio_ref.batch_tar_locks[self.batch_tar_path]:
+                tar_handle = self.portfolio_ref.batch_tar_handles[self.batch_tar_path]
+
+                # Use exact filename from metadata
+                tar_path = f'{self.accession_prefix}/{filename}'
+                content = tar_handle.extractfile(tar_path).read()
+
+
+            # Decompress if needed based on filename extension
+            if is_compressed == 'gzip':
                 content = gzip.decompress(content)
-            elif
-
-
+            elif is_compressed == 'zstd':
+                content = zstd.ZstdDecompressor().decompress(content)
+
+            # Decode text files
+            if extension in ['.htm', '.html', '.txt', '.xml']:
+                content = content.decode('utf-8', errors='replace')
+
+            document_path = f"{self.batch_tar_path}::{self.accession_prefix}/{filename}"
+
+        # Handle regular path case
         else:
+            # Use exact filename from metadata
+            document_path = self.path / filename
+
+            if not document_path.exists():
+                raise FileNotFoundError(f"Document file not found: {document_path}")
+
             with document_path.open('rb') as f:
                 content = f.read()
-
+
+            # Decompress if needed based on filename extension
+            if is_compressed == 'gzip':
+                content = gzip.decompress(content)
+            elif is_compressed == 'zstd':
+                content = zstd.ZstdDecompressor().decompress(content)
+
             # Decode text files
             if extension in ['.htm', '.html', '.txt', '.xml']:
                 content = content.decode('utf-8', errors='replace')
datamule/utils/construct_submissions_data.py
CHANGED
@@ -41,9 +41,9 @@ def process_file_batch(zip_file, filenames_batch):
         # Create filing records for this file
         for j in range(len(accession_numbers)):
             filing_record = {
-                'accessionNumber': accession_numbers[j],
+                'accessionNumber': int(accession_numbers[j].replace('-','')),
                 'filingDate': filing_dates[j],
-                '
+                'submissionType': forms[j],
                 'cik': cik
             }
             batch_filings.append(filing_record)
@@ -59,13 +59,13 @@ def write_csv_chunk(output_path, filings_data, is_first_write, write_lock):
     with write_lock:
         if is_first_write:
             with open(output_path, 'w', newline='') as csvfile:
-                fieldnames = ['accessionNumber', 'filingDate', '
+                fieldnames = ['accessionNumber', 'filingDate', 'submissionType', 'cik']
                 writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                 writer.writeheader()
                 writer.writerows(filings_data)
         else:
             with open(output_path, 'a', newline='') as csvfile:
-                fieldnames = ['accessionNumber', 'filingDate', '
+                fieldnames = ['accessionNumber', 'filingDate', 'submissionType', 'cik']
                 writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                 writer.writerows(filings_data)
 
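
write_csv_chunk keeps the same shape in both branches: the first writer opens in 'w' mode and emits the header, later writers append rows only, and a shared lock serializes the threads feeding the file. A condensed sketch of the pattern (hypothetical names, same fieldnames as the diff):

import csv, threading

FIELDNAMES = ['accessionNumber', 'filingDate', 'submissionType', 'cik']
write_lock = threading.Lock()

def write_chunk(output_path, rows, is_first_write):
    with write_lock:
        mode = 'w' if is_first_write else 'a'
        with open(output_path, mode, newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
            if is_first_write:
                writer.writeheader()  # header exactly once, on the first write
            writer.writerows(rows)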
{datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/RECORD
CHANGED
@@ -3,9 +3,10 @@ datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
 datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
 datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
 datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
-datamule/portfolio.py,sha256=
+datamule/portfolio.py,sha256=tADqQMkFaFyjanbJ0QcaOHGdJJB254rOg29FW7a13l0,11835
+datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
 datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
-datamule/submission.py,sha256=
+datamule/submission.py,sha256=yDPglaFJ65nXn7Lxh-JFTQGKVVmBJDHBVWTf4UEUm2M,8610
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
 datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/datamule/sec_connector.py,sha256=T3edE7I-d4oHysqj7zYlIOxH3Fuauj9tfw39UdFWvB8,2393
@@ -61,11 +62,11 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H
 datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
-datamule/seclibrary/downloader.py,sha256=
+datamule/seclibrary/downloader.py,sha256=3jEy67oiEg8BF20KcKCx2KC0UjHzhiepdu29TOaHWXs,17564
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
 datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/utils/construct_submissions_data.py,sha256=
-datamule-1.6.
-datamule-1.6.
-datamule-1.6.
-datamule-1.6.
+datamule/utils/construct_submissions_data.py,sha256=aX7ZaAp3zXHLcv4TFk_rGwjb8r7yNDQDFVg4nPf60kM,5934
+datamule-1.6.3.dist-info/METADATA,sha256=9tb_ecnMVFHYq-Jcj_O0xAYUtM6v2PEZRxdEtPnorD4,524
+datamule-1.6.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-1.6.3.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.6.3.dist-info/RECORD,,
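
Each RECORD entry has the form "path,sha256=<hash>,size", where the hash is the urlsafe-base64 SHA-256 digest of the file with trailing '=' padding stripped, per the wheel spec. A sketch of how an entry can be recomputed when verifying an unpacked wheel (hypothetical helper name):

import base64, hashlib

def record_entry(path, data):
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b'=')
    return f"{path},sha256={digest.decode('ascii')},{len(data)}"

# e.g. record_entry('datamule/submission.py', open('datamule/submission.py','rb').read())
# should reproduce the submission.py line above, ending in ,8610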
{datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/WHEEL
File without changes
{datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/top_level.txt
File without changes