datamule 1.6.1__tar.gz → 1.6.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. {datamule-1.6.1 → datamule-1.6.3}/PKG-INFO +1 -1
  2. {datamule-1.6.1 → datamule-1.6.3}/datamule/portfolio.py +102 -18
  3. datamule-1.6.3/datamule/portfolio_compression_utils.py +291 -0
  4. {datamule-1.6.1 → datamule-1.6.3}/datamule/seclibrary/downloader.py +163 -161
  5. datamule-1.6.3/datamule/submission.py +190 -0
  6. {datamule-1.6.1 → datamule-1.6.3}/datamule/utils/construct_submissions_data.py +4 -4
  7. {datamule-1.6.1 → datamule-1.6.3}/datamule.egg-info/PKG-INFO +1 -1
  8. {datamule-1.6.1 → datamule-1.6.3}/datamule.egg-info/SOURCES.txt +1 -0
  9. {datamule-1.6.1 → datamule-1.6.3}/setup.py +1 -1
  10. datamule-1.6.1/datamule/submission.py +0 -294
  11. {datamule-1.6.1 → datamule-1.6.3}/datamule/__init__.py +0 -0
  12. {datamule-1.6.1 → datamule-1.6.3}/datamule/config.py +0 -0
  13. {datamule-1.6.1 → datamule-1.6.3}/datamule/data/listed_filer_metadata.csv +0 -0
  14. {datamule-1.6.1 → datamule-1.6.3}/datamule/datamule/__init__.py +0 -0
  15. {datamule-1.6.1 → datamule-1.6.3}/datamule/datamule/sec_connector.py +0 -0
  16. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/__init__.py +0 -0
  17. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/document.py +0 -0
  18. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/__init__.py +0 -0
  19. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/atsn.py +0 -0
  20. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/cfportal.py +0 -0
  21. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/d.py +0 -0
  22. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/ex102_abs.py +0 -0
  23. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/ex99a_sdr.py +0 -0
  24. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/ex99c_sdr.py +0 -0
  25. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/ex99g_sdr.py +0 -0
  26. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/ex99i_sdr.py +0 -0
  27. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/information_table.py +0 -0
  28. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/nmfp.py +0 -0
  29. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/npx.py +0 -0
  30. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/onefourtyfour.py +0 -0
  31. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/ownership.py +0 -0
  32. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/proxy_voting_record.py +0 -0
  33. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/sbs.py +0 -0
  34. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/sbsef.py +0 -0
  35. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/schedule13.py +0 -0
  36. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/sdr.py +0 -0
  37. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/submission_metadata.py +0 -0
  38. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/ta.py +0 -0
  39. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/thirteenfhr.py +0 -0
  40. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/twentyfivense.py +0 -0
  41. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings/twentyfourf2nt.py +0 -0
  42. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings_new/__init__.py +0 -0
  43. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings_new/mappings.py +0 -0
  44. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/mappings_new/ownership.py +0 -0
  45. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/processing.py +0 -0
  46. {datamule-1.6.1 → datamule-1.6.3}/datamule/document/table.py +0 -0
  47. {datamule-1.6.1 → datamule-1.6.3}/datamule/helper.py +0 -0
  48. {datamule-1.6.1 → datamule-1.6.3}/datamule/index.py +0 -0
  49. {datamule-1.6.1 → datamule-1.6.3}/datamule/mapping_dicts/__init__.py +0 -0
  50. {datamule-1.6.1 → datamule-1.6.3}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
  51. {datamule-1.6.1 → datamule-1.6.3}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  52. {datamule-1.6.1 → datamule-1.6.3}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  53. {datamule-1.6.1 → datamule-1.6.3}/datamule/package_updater.py +0 -0
  54. {datamule-1.6.1 → datamule-1.6.3}/datamule/sec/__init__.py +0 -0
  55. {datamule-1.6.1 → datamule-1.6.3}/datamule/sec/infrastructure/__init__.py +0 -0
  56. {datamule-1.6.1 → datamule-1.6.3}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  57. {datamule-1.6.1 → datamule-1.6.3}/datamule/sec/submissions/__init__.py +0 -0
  58. {datamule-1.6.1 → datamule-1.6.3}/datamule/sec/submissions/downloader.py +0 -0
  59. {datamule-1.6.1 → datamule-1.6.3}/datamule/sec/submissions/eftsquery.py +0 -0
  60. {datamule-1.6.1 → datamule-1.6.3}/datamule/sec/submissions/monitor.py +0 -0
  61. {datamule-1.6.1 → datamule-1.6.3}/datamule/sec/submissions/streamer.py +0 -0
  62. {datamule-1.6.1 → datamule-1.6.3}/datamule/sec/submissions/textsearch.py +0 -0
  63. {datamule-1.6.1 → datamule-1.6.3}/datamule/sec/utils.py +0 -0
  64. {datamule-1.6.1 → datamule-1.6.3}/datamule/sec/xbrl/__init__.py +0 -0
  65. {datamule-1.6.1 → datamule-1.6.3}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  66. {datamule-1.6.1 → datamule-1.6.3}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  67. {datamule-1.6.1 → datamule-1.6.3}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  68. {datamule-1.6.1 → datamule-1.6.3}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  69. {datamule-1.6.1 → datamule-1.6.3}/datamule/seclibrary/__init__.py +0 -0
  70. {datamule-1.6.1 → datamule-1.6.3}/datamule/seclibrary/bq.py +0 -0
  71. {datamule-1.6.1 → datamule-1.6.3}/datamule/seclibrary/query.py +0 -0
  72. {datamule-1.6.1 → datamule-1.6.3}/datamule/sheet.py +0 -0
  73. {datamule-1.6.1 → datamule-1.6.3}/datamule/utils/__init__.py +0 -0
  74. {datamule-1.6.1 → datamule-1.6.3}/datamule.egg-info/dependency_links.txt +0 -0
  75. {datamule-1.6.1 → datamule-1.6.3}/datamule.egg-info/requires.txt +0 -0
  76. {datamule-1.6.1 → datamule-1.6.3}/datamule.egg-info/top_level.txt +0 -0
  77. {datamule-1.6.1 → datamule-1.6.3}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: datamule
- Version: 1.6.1
+ Version: 1.6.3
  Summary: Work with SEC submissions at scale.
  Home-page: https://github.com/john-friedman/datamule-python
  Author: John Friedman
@@ -1,18 +1,23 @@
  from pathlib import Path
  from tqdm import tqdm
- from concurrent.futures import ThreadPoolExecutor
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  from .submission import Submission
  from .sec.submissions.downloader import download as sec_download
  from .sec.submissions.textsearch import filter_text
  from .config import Config
  import os
+ import tarfile
+ from threading import Lock
  from .helper import _process_cik_and_metadata_filters
  from .seclibrary.downloader import download as seclibrary_download
  from .sec.xbrl.filter_xbrl import filter_xbrl
  from .sec.submissions.monitor import Monitor
+ from .portfolio_compression_utils import CompressionManager
  #from .sec.xbrl.xbrlmonitor import XBRLMonitor
  from .datamule.sec_connector import SecConnector
-
+ from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
+ import json
+ import io

  class Portfolio:
      def __init__(self, path):
@@ -21,6 +26,10 @@ class Portfolio:
          self.submissions = []
          self.submissions_loaded = False
          self.MAX_WORKERS = os.cpu_count() - 1
+
+         # Batch tar support
+         self.batch_tar_handles = {}  # {batch_tar_path: tarfile_handle}
+         self.batch_tar_locks = {}  # {batch_tar_path: threading.Lock}

          self.monitor = Monitor()

@@ -34,27 +43,101 @@ class Portfolio:
          self.api_key = api_key

      def _load_submissions(self):
-         folders = [f for f in self.path.iterdir() if f.is_dir() or f.suffix=='.tar']
-         print(f"Loading {len(folders)} submissions")
+         print(f"Loading submissions")
+
+         # Separate regular and batch items
+         regular_items = [f for f in self.path.iterdir() if (f.is_dir() or f.suffix=='.tar') and 'batch' not in f.name]
+         batch_tars = [f for f in self.path.iterdir() if f.is_file() and 'batch' in f.name and f.suffix == '.tar']

+         # Load regular submissions (existing logic)
          def load_submission(folder):
-             try:
-                 return Submission(folder)
-             except Exception as e:
-                 print(f"Error loading submission from {folder}: {str(e)}")
-                 return None
+             return Submission(folder)

-         with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
-             self.submissions = list(tqdm(
-                 executor.map(load_submission, folders),
-                 total=len(folders),
-                 desc="Loading submissions"
-             ))
-
-         # Filter out None values from failed submissions
-         self.submissions = [s for s in self.submissions if s is not None]
+         regular_submissions = []
+         if regular_items:
+             with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+                 regular_submissions = list(tqdm(
+                     executor.map(load_submission, regular_items),
+                     total=len(regular_items),
+                     desc="Loading regular submissions"
+                 ))
+
+         # Load batch submissions with parallel processing + progress
+         batch_submissions = []
+         if batch_tars:
+             with tqdm(desc="Loading batch submissions", unit="submissions") as pbar:
+                 with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+                     # Submit all batch tar jobs
+                     futures = [
+                         executor.submit(self._load_batch_submissions_worker, batch_tar, pbar)
+                         for batch_tar in batch_tars
+                     ]
+
+                     # Collect results as they complete
+                     for future in as_completed(futures):
+                         batch_submissions.extend(future.result())
+
+         # Combine and filter None values
+         self.submissions = [s for s in (regular_submissions + batch_submissions) if s is not None]
          print(f"Successfully loaded {len(self.submissions)} submissions")

+     def _load_batch_submissions_worker(self, batch_tar_path, pbar):
+         """Worker function to load submissions from one batch tar with progress updates"""
+         # Open tar handle and store it
+         tar_handle = tarfile.open(batch_tar_path, 'r')
+         self.batch_tar_handles[batch_tar_path] = tar_handle
+         self.batch_tar_locks[batch_tar_path] = Lock()
+
+         # Find all accession directories
+         accession_prefixes = set()
+         for member in tar_handle.getmembers():
+             if '/' in member.name and member.name.endswith('metadata.json'):
+                 accession_prefix = member.name.split('/')[0]
+                 accession_prefixes.add(accession_prefix)
+
+         # Create submissions for each accession
+         submissions = []
+         for accession_prefix in accession_prefixes:
+             submission = Submission(
+                 batch_tar_path=batch_tar_path,
+                 accession_prefix=accession_prefix,
+                 portfolio_ref=self
+             )
+             submissions.append(submission)
+             pbar.update(1)  # Update progress for each successful submission
+
+         return submissions
+
+
+     def compress(self, compression=None, compression_level=None, threshold=1048576, max_batch_size=1024*1024*1024):
+         """
+         Compress all individual submissions into batch tar files.
+
+         Args:
+             compression: None, 'gzip', or 'zstd' for document compression (default: None)
+             compression_level: Compression level, if None uses defaults (gzip=6, zstd=3)
+             threshold: Size threshold for compressing individual documents (default: 1MB)
+             max_batch_size: Maximum size per batch tar file (default: 1GB)
+         """
+         CompressionManager().compress_portfolio(self, compression, compression_level, threshold, max_batch_size, self.MAX_WORKERS)
+
+     def decompress(self):
+         """
+         Decompress all batch tar files back to individual submission directories.
+         """
+         CompressionManager().decompress_portfolio(self, self.MAX_WORKERS)
+
+     def _close_batch_handles(self):
+         """Close all open batch tar handles to free resources"""
+         for handle in self.batch_tar_handles.values():
+             handle.close()
+         self.batch_tar_handles.clear()
+         self.batch_tar_locks.clear()
+
+     def __del__(self):
+         """Cleanup batch tar handles on destruction"""
+         self._close_batch_handles()
+
      def process_submissions(self, callback):
          """Process all submissions using a thread pool."""
          if not self.submissions_loaded:
@@ -169,6 +252,7 @@ class Portfolio:
          )

          self.submissions_loaded = False
+
      def monitor_submissions(self, data_callback=None, interval_callback=None,
                              polling_interval=1000, quiet=True, start_date=None,
                              validation_interval=600000):
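
The portfolio.py changes above expose a public compress()/decompress() pair on Portfolio that delegates to the new CompressionManager. A minimal usage sketch, assuming Portfolio is importable from the package top level as in prior releases, and using an illustrative 'filings' directory that already holds downloaded submissions:

    from datamule import Portfolio

    portfolio = Portfolio('filings')  # 'filings' is an illustrative path
    # Pack per-submission folders/tars into batch_*.tar files,
    # gzip-compressing any individual document of 1 MB or larger
    portfolio.compress(compression='gzip', threshold=1048576)
    # Restore the original one-directory-per-submission layout
    portfolio.decompress()
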
@@ -0,0 +1,291 @@
+ import json
+ import io
+ import gzip
+ import zstandard as zstd
+ import tarfile
+ import shutil
+ from tqdm import tqdm
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
+
+
+ class CompressionManager:
+
+     def compress_portfolio(self, portfolio, compression=None, compression_level=None, threshold=1048576, max_batch_size=1024*1024*1024, max_workers=None):
+         """
+         Compress all individual submissions into batch tar files.
+
+         Args:
+             portfolio: Portfolio instance
+             compression: None, 'gzip', or 'zstd' for document compression (default: None)
+             compression_level: Compression level, if None uses defaults (gzip=6, zstd=3)
+             threshold: Size threshold for compressing individual documents (default: 1MB)
+             max_batch_size: Maximum size per batch tar file (default: 1GB)
+             max_workers: Number of threads for parallel document processing (default: portfolio.MAX_WORKERS)
+         """
+         if max_workers is None:
+             max_workers = portfolio.MAX_WORKERS
+
+         portfolio._close_batch_handles()
+
+         if not portfolio.submissions_loaded:
+             portfolio._load_submissions()
+
+         # Only compress non-batch submissions
+         submissions = [s for s in portfolio.submissions if s.batch_tar_path is None]
+
+         if not submissions:
+             print("No submissions to compress")
+             return
+
+         print(f"Compressing {len(submissions)} submissions...")
+
+         # Set default compression level if not specified
+         if compression_level is None:
+             compression_level = 6 if compression == 'gzip' else 3
+
+         # Group submissions into batches
+         current_batch = 0
+         current_size = 0
+         sequence = 1
+         current_tar = None
+
+         with tqdm(total=len(submissions), desc="Compressing submissions") as pbar:
+             for submission in submissions:
+                 # Parallel document processing
+                 with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                     doc_futures = [
+                         executor.submit(self._process_document, doc, compression, threshold, compression_level)
+                         for doc in submission
+                     ]
+
+                     # Collect results maintaining order
+                     documents = []
+                     compression_list = []
+                     for future in doc_futures:
+                         content, compression_type = future.result()
+                         documents.append(content)
+                         compression_list.append(compression_type)
+
+                 # Calculate submission size
+                 metadata_str = bytes_to_str(submission.metadata.content, lower=False)
+                 metadata_json = json.dumps(metadata_str).encode('utf-8')
+                 submission_size = len(metadata_json) + sum(len(doc) for doc in documents)
+
+                 # Check if we need a new batch tar
+                 if current_size > 0 and current_size + submission_size > max_batch_size:
+                     if current_tar:
+                         current_tar.close()
+                     sequence += 1
+                     current_size = 0
+                     current_tar = None
+
+                 # Create tar if needed
+                 if current_tar is None:
+                     batch_path = portfolio.path / f'batch_{current_batch:03d}_{sequence:03d}.tar'
+                     current_tar = tarfile.open(batch_path, 'w')
+
+                 # Write submission to tar
+                 self._write_submission_to_tar(
+                     current_tar,
+                     submission,
+                     documents,
+                     compression_list,
+                     submission.accession
+                 )
+
+                 current_size += submission_size
+
+                 # Remove original submission directory/tar
+                 if submission.path:
+                     if submission.path.is_dir():
+                         shutil.rmtree(submission.path)
+                     elif submission.path.suffix == '.tar':
+                         submission.path.unlink()
+
+                 pbar.update(1)
+
+         # Close final tar
+         if current_tar:
+             current_tar.close()
+
+         # Reload submissions to reflect new batch structure
+         portfolio.submissions_loaded = False
+         portfolio._load_submissions()
+
+         print("Compression complete.")
+
+     def decompress_portfolio(self, portfolio, max_workers=None):
+         """
+         Decompress all batch tar files back to individual submission directories.
+
+         Args:
+             portfolio: Portfolio instance
+             max_workers: Number of threads for parallel file processing (default: portfolio.MAX_WORKERS)
+         """
+         if max_workers is None:
+             max_workers = portfolio.MAX_WORKERS
+
+         if not portfolio.submissions_loaded:
+             portfolio._load_submissions()
+
+         # Find all batch tar files
+         batch_tars = [f for f in portfolio.path.iterdir() if f.is_file() and 'batch' in f.name and f.suffix == '.tar']
+
+         if not batch_tars:
+             print("No batch tar files found to decompress")
+             return
+
+         print(f"Decompressing {len(batch_tars)} batch tar files...")
+
+         # FIRST: Close all batch tar handles to free the files
+         portfolio._close_batch_handles()
+
+         total_extracted = 0
+
+         with tqdm(desc="Decompressing submissions", unit="submissions") as pbar:
+             for batch_tar in batch_tars:
+                 with tarfile.open(batch_tar, 'r') as tar:
+                     # Find all accession directories in this tar
+                     accession_dirs = set()
+                     for member in tar.getmembers():
+                         if '/' in member.name:
+                             accession_dir = member.name.split('/')[0]
+                             accession_dirs.add(accession_dir)
+
+                     # Extract each submission
+                     for accession_dir in accession_dirs:
+                         output_dir = portfolio.path / accession_dir
+                         output_dir.mkdir(exist_ok=True)
+
+                         # Get all files for this accession
+                         accession_files = [m for m in tar.getmembers()
+                                            if m.name.startswith(f'{accession_dir}/') and m.isfile()]
+
+                         # Parallel file extraction
+                         with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                             file_futures = [
+                                 executor.submit(self._extract_file, member, tar, accession_dir, output_dir)
+                                 for member in accession_files
+                             ]
+
+                             # Wait for all files to be processed
+                             for future in as_completed(file_futures):
+                                 future.result()
+
+                         total_extracted += 1
+                         pbar.update(1)
+
+
+         # NOW delete the batch tar files after everything is extracted
+         for batch_tar in batch_tars:
+             batch_tar.unlink()
+
+
+         # Reload submissions to reflect new directory structure
+         portfolio.submissions_loaded = False
+         portfolio._load_submissions()
+
+         print(f"Decompression complete. Extracted {total_extracted} submissions.")
+
+     def _process_document(self, doc, compression, threshold, compression_level):
+         """Process a single document: load content and apply compression if needed."""
+         content = doc.content
+         if isinstance(content, str):
+             content = content.encode('utf-8')
+
+         # Apply document-level compression if threshold met AND compression is specified
+         if compression and len(content) >= threshold:
+             if compression == 'gzip':
+                 content = gzip.compress(content, compresslevel=compression_level)
+                 compression_type = 'gzip'
+             elif compression == 'zstd':
+                 content = zstd.ZstdCompressor(level=compression_level).compress(content)
+                 compression_type = 'zstd'
+             else:
+                 compression_type = ''
+         else:
+             compression_type = ''
+
+         return content, compression_type
+
+     def _extract_file(self, member, tar, accession_dir, output_dir):
+         """Extract and decompress a single file from tar."""
+         relative_path = member.name[len(accession_dir)+1:]  # Remove accession prefix
+         output_path = output_dir / relative_path
+
+         content = tar.extractfile(member).read()
+
+         # Handle decompression based on filename
+         if relative_path.endswith('.gz'):
+             # File MUST be gzipped if it has .gz extension
+             content = gzip.decompress(content)
+             output_path = output_path.with_suffix('')  # Remove .gz
+
+         elif relative_path.endswith('.zst'):
+             # File MUST be zstd compressed if it has .zst extension
+             content = zstd.ZstdDecompressor().decompress(content)
+             output_path = output_path.with_suffix('')  # Remove .zst
+
+         # Special handling for metadata.json
+         if output_path.name == 'metadata.json':
+             metadata = json.loads(content.decode('utf-8'))
+             # Remove tar-specific metadata
+             for doc in metadata['documents']:
+                 doc.pop('secsgml_start_byte', None)
+                 doc.pop('secsgml_end_byte', None)
+
+                 # Update filenames to match decompressed files
+                 filename = doc.get('filename', '')
+                 if filename.endswith('.gz'):
+                     doc['filename'] = filename[:-3]  # Remove .gz
+                 elif filename.endswith('.zst'):
+                     doc['filename'] = filename[:-4]  # Remove .zst
+
+             with output_path.open('w', encoding='utf-8') as f:
+                 json.dump(metadata, f, indent=2)
+         else:
+             # Write document file
+             output_path.parent.mkdir(parents=True, exist_ok=True)
+             with output_path.open('wb') as f:
+                 f.write(content)
+
+
+     def _write_submission_to_tar(self, tar_handle, submission, documents, compression_list, accession_prefix):
+         """Write a submission to a tar file with optional document compression."""
+         # Prepare metadata
+         metadata = submission.metadata.content.copy()
+
+         # Update filenames for compressed documents BEFORE size calculation
+         for i, compression in enumerate(compression_list):
+             if compression:
+                 doc = metadata['documents'][i]
+                 filename = doc.get('filename', doc['sequence'] + '.txt')
+                 if compression == 'gzip' and not filename.endswith('.gz'):
+                     doc['filename'] = filename + '.gz'
+                 elif compression == 'zstd' and not filename.endswith('.zst'):
+                     doc['filename'] = filename + '.zst'
+
+         # Add document sizes to metadata for calculate_documents_locations_in_tar
+         for i, content in enumerate(documents):
+             metadata['documents'][i]['secsgml_size_bytes'] = len(content)
+
+         # NOW calculate document positions with the correct filenames
+         metadata = calculate_documents_locations_in_tar(metadata)
+
+         # Write metadata
+         metadata_str = bytes_to_str(metadata, lower=False)
+         metadata_json = json.dumps(metadata_str).encode('utf-8')
+
+         tarinfo = tarfile.TarInfo(name=f'{accession_prefix}/metadata.json')
+         tarinfo.size = len(metadata_json)
+         tar_handle.addfile(tarinfo, io.BytesIO(metadata_json))
+
+         # Write documents
+         for i, content in enumerate(documents):
+             doc = metadata['documents'][i]
+             filename = doc.get('filename', doc['sequence'] + '.txt')
+
+             tarinfo = tarfile.TarInfo(name=f'{accession_prefix}/{filename}')
+             tarinfo.size = len(content)
+             tar_handle.addfile(tarinfo, io.BytesIO(content))
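
As _write_submission_to_tar shows, each submission in a batch tar is stored under its accession prefix: '<accession>/metadata.json' plus one entry per document, with compressed documents carrying a .gz or .zst suffix. A small sketch of inspecting that layout with only the standard library; the tar path and accession value are illustrative, and the field names ('documents', 'filename', 'secsgml_size_bytes') are those written by the code above:

    import json
    import tarfile

    batch_path = 'filings/batch_000_001.tar'   # illustrative path to a batch tar
    accession = '000000000-00-000000'          # illustrative accession prefix

    with tarfile.open(batch_path, 'r') as tar:
        # metadata.json lives under the accession prefix
        meta = json.loads(tar.extractfile(f'{accession}/metadata.json').read())
        for doc in meta['documents']:
            # 'filename' may end in .gz/.zst when document-level compression was applied
            print(doc.get('filename'), doc.get('secsgml_size_bytes'))
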