datamule 1.6.2__py3-none-any.whl → 1.6.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/portfolio.py +49 -45
- datamule/portfolio_compression_utils.py +291 -0
- datamule/submission.py +56 -164
- {datamule-1.6.2.dist-info → datamule-1.6.4.dist-info}/METADATA +1 -1
- {datamule-1.6.2.dist-info → datamule-1.6.4.dist-info}/RECORD +7 -6
- {datamule-1.6.2.dist-info → datamule-1.6.4.dist-info}/WHEEL +0 -0
- {datamule-1.6.2.dist-info → datamule-1.6.4.dist-info}/top_level.txt +0 -0
datamule/portfolio.py
CHANGED
@@ -12,9 +12,12 @@ from .helper import _process_cik_and_metadata_filters
 from .seclibrary.downloader import download as seclibrary_download
 from .sec.xbrl.filter_xbrl import filter_xbrl
 from .sec.submissions.monitor import Monitor
+from .portfolio_compression_utils import CompressionManager
 #from .sec.xbrl.xbrlmonitor import XBRLMonitor
 from .datamule.sec_connector import SecConnector
-
+from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
+import json
+import io
 
 class Portfolio:
     def __init__(self, path):
@@ -48,11 +51,7 @@ class Portfolio:
 
         # Load regular submissions (existing logic)
         def load_submission(folder):
-            try:
-                return Submission(folder)
-            except Exception as e:
-                print(f"Error loading submission from {folder}: {str(e)}")
-                return None
+            return Submission(folder)
 
         regular_submissions = []
         if regular_items:
@@ -76,10 +75,7 @@ class Portfolio:
 
             # Collect results as they complete
             for future in as_completed(futures):
-                try:
-                    batch_submissions.extend(future.result())
-                except Exception as e:
-                    print(f"Error in batch processing: {str(e)}")
+                batch_submissions.extend(future.result())
 
         # Combine and filter None values
         self.submissions = [s for s in (regular_submissions + batch_submissions) if s is not None]
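With the try/except gone, a failure in any batch worker now propagates out of `future.result()` instead of being printed and swallowed. A small stdlib-only illustration of that propagation behavior:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def load_batch():
    raise ValueError("corrupt batch tar")  # stand-in for a worker failure

with ThreadPoolExecutor(max_workers=2) as executor:
    futures = [executor.submit(load_batch)]
    for future in as_completed(futures):
        try:
            future.result()  # the worker's exception is re-raised here
        except ValueError as e:
            print(f"propagated to the caller: {e}")
```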
@@ -87,46 +83,54 @@ class Portfolio:
 
     def _load_batch_submissions_worker(self, batch_tar_path, pbar):
         """Worker function to load submissions from one batch tar with progress updates"""
-        … (body of the previous implementation; not preserved in this diff view)
-            except Exception as e:
-                print(f"Error loading batch submission {accession_prefix} from {batch_tar_path.name}: {str(e)}")
-
-        return submissions
+        # Open tar handle and store it
+        tar_handle = tarfile.open(batch_tar_path, 'r')
+        self.batch_tar_handles[batch_tar_path] = tar_handle
+        self.batch_tar_locks[batch_tar_path] = Lock()
+
+        # Find all accession directories
+        accession_prefixes = set()
+        for member in tar_handle.getmembers():
+            if '/' in member.name and member.name.endswith('metadata.json'):
+                accession_prefix = member.name.split('/')[0]
+                accession_prefixes.add(accession_prefix)
+
+        # Create submissions for each accession
+        submissions = []
+        for accession_prefix in accession_prefixes:
+            submission = Submission(
+                batch_tar_path=batch_tar_path,
+                accession_prefix=accession_prefix,
+                portfolio_ref=self
+            )
+            submissions.append(submission)
+            pbar.update(1)  # Update progress for each successful submission
+
+        return submissions
 
-
-
-
+
+    def compress(self, compression=None, compression_level=None, threshold=1048576, max_batch_size=1024*1024*1024):
+        """
+        Compress all individual submissions into batch tar files.
+
+        Args:
+            compression: None, 'gzip', or 'zstd' for document compression (default: None)
+            compression_level: Compression level, if None uses defaults (gzip=6, zstd=3)
+            threshold: Size threshold for compressing individual documents (default: 1MB)
+            max_batch_size: Maximum size per batch tar file (default: 1GB)
+        """
+        CompressionManager().compress_portfolio(self, compression, compression_level, threshold, max_batch_size, self.MAX_WORKERS)
+
+    def decompress(self):
+        """
+        Decompress all batch tar files back to individual submission directories.
+        """
+        CompressionManager().decompress_portfolio(self, self.MAX_WORKERS)
 
     def _close_batch_handles(self):
         """Close all open batch tar handles to free resources"""
         for handle in self.batch_tar_handles.values():
-            try:
-                handle.close()
-            except Exception as e:
-                print(f"Error closing batch tar handle: {str(e)}")
+            handle.close()
         self.batch_tar_handles.clear()
         self.batch_tar_locks.clear()
 
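Taken together, the new `compress()`/`decompress()` methods replace the per-submission versions removed from `submission.py` below and delegate to the `CompressionManager` added in the next file. A minimal usage sketch (assuming a portfolio directory of already-downloaded submissions at `./filings`, and the package's usual top-level `Portfolio` export):

```python
from datamule import Portfolio

portfolio = Portfolio('./filings')

# Pack every loose submission into batch_NNN_NNN.tar files; documents of
# 1 MB or more are individually zstd-compressed inside the tar.
portfolio.compress(compression='zstd', threshold=1048576)

# Restore the original one-directory-per-submission layout.
portfolio.decompress()
```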
datamule/portfolio_compression_utils.py
ADDED
@@ -0,0 +1,291 @@
+import json
+import io
+import gzip
+import zstandard as zstd
+import tarfile
+import shutil
+from tqdm import tqdm
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
+
+
+class CompressionManager:
+
+    def compress_portfolio(self, portfolio, compression=None, compression_level=None, threshold=1048576, max_batch_size=1024*1024*1024, max_workers=None):
+        """
+        Compress all individual submissions into batch tar files.
+
+        Args:
+            portfolio: Portfolio instance
+            compression: None, 'gzip', or 'zstd' for document compression (default: None)
+            compression_level: Compression level, if None uses defaults (gzip=6, zstd=3)
+            threshold: Size threshold for compressing individual documents (default: 1MB)
+            max_batch_size: Maximum size per batch tar file (default: 1GB)
+            max_workers: Number of threads for parallel document processing (default: portfolio.MAX_WORKERS)
+        """
+        if max_workers is None:
+            max_workers = portfolio.MAX_WORKERS
+
+        portfolio._close_batch_handles()
+
+        if not portfolio.submissions_loaded:
+            portfolio._load_submissions()
+
+        # Only compress non-batch submissions
+        submissions = [s for s in portfolio.submissions if s.batch_tar_path is None]
+
+        if not submissions:
+            print("No submissions to compress")
+            return
+
+        print(f"Compressing {len(submissions)} submissions...")
+
+        # Set default compression level if not specified
+        if compression_level is None:
+            compression_level = 6 if compression == 'gzip' else 3
+
+        # Group submissions into batches
+        current_batch = 0
+        current_size = 0
+        sequence = 1
+        current_tar = None
+
+        with tqdm(total=len(submissions), desc="Compressing submissions") as pbar:
+            for submission in submissions:
+                # Parallel document processing
+                with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                    doc_futures = [
+                        executor.submit(self._process_document, doc, compression, threshold, compression_level)
+                        for doc in submission
+                    ]
+
+                # Collect results maintaining order
+                documents = []
+                compression_list = []
+                for future in doc_futures:
+                    content, compression_type = future.result()
+                    documents.append(content)
+                    compression_list.append(compression_type)
+
+                # Calculate submission size
+                metadata_str = bytes_to_str(submission.metadata.content, lower=False)
+                metadata_json = json.dumps(metadata_str).encode('utf-8')
+                submission_size = len(metadata_json) + sum(len(doc) for doc in documents)
+
+                # Check if we need a new batch tar
+                if current_size > 0 and current_size + submission_size > max_batch_size:
+                    if current_tar:
+                        current_tar.close()
+                    sequence += 1
+                    current_size = 0
+                    current_tar = None
+
+                # Create tar if needed
+                if current_tar is None:
+                    batch_path = portfolio.path / f'batch_{current_batch:03d}_{sequence:03d}.tar'
+                    current_tar = tarfile.open(batch_path, 'w')
+
+                # Write submission to tar
+                self._write_submission_to_tar(
+                    current_tar,
+                    submission,
+                    documents,
+                    compression_list,
+                    submission.accession
+                )
+
+                current_size += submission_size
+
+                # Remove original submission directory/tar
+                if submission.path:
+                    if submission.path.is_dir():
+                        shutil.rmtree(submission.path)
+                    elif submission.path.suffix == '.tar':
+                        submission.path.unlink()
+
+                pbar.update(1)
+
+        # Close final tar
+        if current_tar:
+            current_tar.close()
+
+        # Reload submissions to reflect new batch structure
+        portfolio.submissions_loaded = False
+        portfolio._load_submissions()
+
+        print("Compression complete.")
+
+    def decompress_portfolio(self, portfolio, max_workers=None):
+        """
+        Decompress all batch tar files back to individual submission directories.
+
+        Args:
+            portfolio: Portfolio instance
+            max_workers: Number of threads for parallel file processing (default: portfolio.MAX_WORKERS)
+        """
+        if max_workers is None:
+            max_workers = portfolio.MAX_WORKERS
+
+        if not portfolio.submissions_loaded:
+            portfolio._load_submissions()
+
+        # Find all batch tar files
+        batch_tars = [f for f in portfolio.path.iterdir() if f.is_file() and 'batch' in f.name and f.suffix == '.tar']
+
+        if not batch_tars:
+            print("No batch tar files found to decompress")
+            return
+
+        print(f"Decompressing {len(batch_tars)} batch tar files...")
+
+        # FIRST: Close all batch tar handles to free the files
+        portfolio._close_batch_handles()
+
+        total_extracted = 0
+
+        with tqdm(desc="Decompressing submissions", unit="submissions") as pbar:
+            for batch_tar in batch_tars:
+                with tarfile.open(batch_tar, 'r') as tar:
+                    # Find all accession directories in this tar
+                    accession_dirs = set()
+                    for member in tar.getmembers():
+                        if '/' in member.name:
+                            accession_dir = member.name.split('/')[0]
+                            accession_dirs.add(accession_dir)
+
+                    # Extract each submission
+                    for accession_dir in accession_dirs:
+                        output_dir = portfolio.path / accession_dir
+                        output_dir.mkdir(exist_ok=True)
+
+                        # Get all files for this accession
+                        accession_files = [m for m in tar.getmembers()
+                                           if m.name.startswith(f'{accession_dir}/') and m.isfile()]
+
+                        # Parallel file extraction
+                        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                            file_futures = [
+                                executor.submit(self._extract_file, member, tar, accession_dir, output_dir)
+                                for member in accession_files
+                            ]
+
+                            # Wait for all files to be processed
+                            for future in as_completed(file_futures):
+                                future.result()
+
+                        total_extracted += 1
+                        pbar.update(1)
+
+
+        # NOW delete the batch tar files after everything is extracted
+        for batch_tar in batch_tars:
+            batch_tar.unlink()
+
+
+        # Reload submissions to reflect new directory structure
+        portfolio.submissions_loaded = False
+        portfolio._load_submissions()
+
+        print(f"Decompression complete. Extracted {total_extracted} submissions.")
+
+    def _process_document(self, doc, compression, threshold, compression_level):
+        """Process a single document: load content and apply compression if needed."""
+        content = doc.content
+        if isinstance(content, str):
+            content = content.encode('utf-8')
+
+        # Apply document-level compression if threshold met AND compression is specified
+        if compression and len(content) >= threshold:
+            if compression == 'gzip':
+                content = gzip.compress(content, compresslevel=compression_level)
+                compression_type = 'gzip'
+            elif compression == 'zstd':
+                content = zstd.ZstdCompressor(level=compression_level).compress(content)
+                compression_type = 'zstd'
+            else:
+                compression_type = ''
+        else:
+            compression_type = ''
+
+        return content, compression_type
+
+    def _extract_file(self, member, tar, accession_dir, output_dir):
+        """Extract and decompress a single file from tar."""
+        relative_path = member.name[len(accession_dir)+1:]  # Remove accession prefix
+        output_path = output_dir / relative_path
+
+        content = tar.extractfile(member).read()
+
+        # Handle decompression based on filename
+        if relative_path.endswith('.gz'):
+            # File MUST be gzipped if it has .gz extension
+            content = gzip.decompress(content)
+            output_path = output_path.with_suffix('')  # Remove .gz
+
+        elif relative_path.endswith('.zst'):
+            # File MUST be zstd compressed if it has .zst extension
+            content = zstd.ZstdDecompressor().decompress(content)
+            output_path = output_path.with_suffix('')  # Remove .zst
+
+        # Special handling for metadata.json
+        if output_path.name == 'metadata.json':
+            metadata = json.loads(content.decode('utf-8'))
+            # Remove tar-specific metadata
+            for doc in metadata['documents']:
+                doc.pop('secsgml_start_byte', None)
+                doc.pop('secsgml_end_byte', None)
+
+                # Update filenames to match decompressed files
+                filename = doc.get('filename', '')
+                if filename.endswith('.gz'):
+                    doc['filename'] = filename[:-3]  # Remove .gz
+                elif filename.endswith('.zst'):
+                    doc['filename'] = filename[:-4]  # Remove .zst
+
+            with output_path.open('w', encoding='utf-8') as f:
+                json.dump(metadata, f, indent=2)
+        else:
+            # Write document file
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+            with output_path.open('wb') as f:
+                f.write(content)
+
+
+    def _write_submission_to_tar(self, tar_handle, submission, documents, compression_list, accession_prefix):
+        """Write a submission to a tar file with optional document compression."""
+        # Prepare metadata
+        metadata = submission.metadata.content.copy()
+
+        # Update filenames for compressed documents BEFORE size calculation
+        for i, compression in enumerate(compression_list):
+            if compression:
+                doc = metadata['documents'][i]
+                filename = doc.get('filename', doc['sequence'] + '.txt')
+                if compression == 'gzip' and not filename.endswith('.gz'):
+                    doc['filename'] = filename + '.gz'
+                elif compression == 'zstd' and not filename.endswith('.zst'):
+                    doc['filename'] = filename + '.zst'
+
+        # Add document sizes to metadata for calculate_documents_locations_in_tar
+        for i, content in enumerate(documents):
+            metadata['documents'][i]['secsgml_size_bytes'] = len(content)
+
+        # NOW calculate document positions with the correct filenames
+        metadata = calculate_documents_locations_in_tar(metadata)
+
+        # Write metadata
+        metadata_str = bytes_to_str(metadata, lower=False)
+        metadata_json = json.dumps(metadata_str).encode('utf-8')
+
+        tarinfo = tarfile.TarInfo(name=f'{accession_prefix}/metadata.json')
+        tarinfo.size = len(metadata_json)
+        tar_handle.addfile(tarinfo, io.BytesIO(metadata_json))
+
+        # Write documents
+        for i, content in enumerate(documents):
+            doc = metadata['documents'][i]
+            filename = doc.get('filename', doc['sequence'] + '.txt')
+
+            tarinfo = tarfile.TarInfo(name=f'{accession_prefix}/{filename}')
+            tarinfo.size = len(content)
+            tar_handle.addfile(tarinfo, io.BytesIO(content))
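The core rule in `_process_document` is that a document is compressed only when a codec is requested and the document meets the size threshold; the returned tag is what later drives the `.gz`/`.zst` filename rewrite in `_write_submission_to_tar`. A self-contained sketch of that rule (a standalone re-implementation for illustration, not the method itself):

```python
import gzip
import zstandard as zstd

def process_document(content: bytes, compression, threshold=1048576, level=None):
    # Compress only when a codec is requested AND the document is large enough;
    # return the (possibly compressed) bytes plus a tag used to rename the file.
    if compression and len(content) >= threshold:
        if compression == 'gzip':
            return gzip.compress(content, compresslevel=level or 6), 'gzip'
        if compression == 'zstd':
            return zstd.ZstdCompressor(level=level or 3).compress(content), 'zstd'
    return content, ''

small, large = b'x' * 100, b'x' * 2_000_000
assert process_document(small, 'zstd') == (small, '')  # below threshold: passed through
assert process_document(large, 'zstd')[1] == 'zstd'    # tagged, so it gains a .zst suffix
```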
datamule/submission.py
CHANGED
@@ -2,46 +2,12 @@ from pathlib import Path
 import json
 from .document.document import Document
 from secsgml import parse_sgml_content_into_memory
-from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
 from secsgml.parse_sgml import transform_metadata_string
 import tarfile
-import shutil
 import zstandard as zstd
 import gzip
-import io
 
 
-
-def write_submission_to_tar(output_path,metadata,documents,standardize_metadata,compression_list):
-    # Write tar directly to disk
-    with tarfile.open(output_path, 'w') as tar:
-
-        # calculate document locations in tar
-        metadata = calculate_documents_locations_in_tar(metadata, documents)
-
-        # serialize metadata
-        metadata_str = bytes_to_str(metadata,lower=False)
-        metadata_json = json.dumps(metadata_str).encode('utf-8')
-        # save metadata
-        tarinfo = tarfile.TarInfo(name='metadata.json')
-        tarinfo.size = len(metadata_json)
-        tar.addfile(tarinfo, io.BytesIO(metadata_json))
-
-        for file_num, content in enumerate(documents, 0):
-            if standardize_metadata:
-                document_name = metadata['documents'][file_num]['filename'] if metadata['documents'][file_num].get('filename') else metadata['documents'][file_num]['sequence'] + '.txt'
-
-            compression = compression_list[file_num]
-            if compression == 'gzip':
-                document_name = f'{document_name}.gz'
-            elif compression == 'zstd':
-                document_name = f'{document_name}.zst'
-
-
-            tarinfo = tarfile.TarInfo(name=f'{document_name}')
-            tarinfo.size = len(content)
-            tar.addfile(tarinfo, io.BytesIO(content))
-
 class Submission:
     def __init__(self, path=None, sgml_content=None, keep_document_types=None,
                  batch_tar_path=None, accession_prefix=None, portfolio_ref=None):
@@ -128,94 +94,6 @@ class Submission:
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
             self.accession = self.metadata.content['accession-number']
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
-
-
-
-    def compress(self, compression=None, level=None, threshold=1048576):
-        if self.path is None:
-            raise ValueError("Compress requires path")
-
-        if compression is not None and compression not in ['gzip', 'zstd']:
-            raise ValueError("compression must be 'gzip' or 'zstd'")
-
-        # check if we're loading from a dir or a tar file
-        is_dir_not_tar = True
-        if self.path.suffix == '.tar':
-            is_dir_not_tar = False
-        elif not self.path.is_dir():
-            raise ValueError("Path must be a directory to compress")
-        # Create tar file (replace directory with .tar file)
-        tar_path = self.path.with_suffix('.tar')
-
-        # load all files in the directory or tar file
-        documents = [doc.content.encode('utf-8') if isinstance(doc.content, str) else doc.content for doc in self]
-
-
-        # we should compress everything here first.
-        compression_list = [compression if len(doc) >= threshold else '' for doc in documents]
-        documents = [gzip.compress(doc, compresslevel=level or 6) if compression == 'gzip' and
-                     len(doc) >= threshold else zstd.ZstdCompressor(level=level or 3).compress(doc) if compression == 'zstd' and
-                     len(doc) >= threshold else doc for doc in documents]
-
-        metadata = self.metadata.content.copy()
-        write_submission_to_tar(tar_path,metadata,documents,compression_list=compression_list,standardize_metadata=True)
-
-        # Delete original folder
-        if is_dir_not_tar:
-            shutil.rmtree(self.path)
-        # otherwise, we already replaced the tar file
-        # Update path to point to new tar file
-        self.path = tar_path
-
-    def decompress(self):
-        if self.path is None:
-            raise ValueError("Decompress requires path")
-        elif self.path.suffix != '.tar':
-            raise ValueError("Can only decompress tar")
-
-        # Create output directory (path without .tar extension)
-        output_dir = self.path.with_suffix('')
-        output_dir.mkdir(exist_ok=True)
-
-        with tarfile.open(self.path, 'r') as tar:
-            for member in tar.getmembers():
-                if member.isfile():
-                    content = tar.extractfile(member).read()
-
-                    # Decompress based on file extension
-                    if member.name.endswith('.gz'):
-                        content = gzip.decompress(content)
-                        output_path = output_dir / member.name[:-3]  # Remove .gz extension
-                    elif member.name.endswith('.zst'):
-                        dctx = zstd.ZstdDecompressor()
-                        content = dctx.decompress(content)
-                        output_path = output_dir / member.name[:-4]  # Remove .zst extension
-                    else:
-                        output_path = output_dir / member.name
-
-                    # check if it is metadata.json
-                    if output_path.name == 'metadata.json':
-                        # load as json
-                        metadata = json.loads(content.decode('utf-8'))
-                        # remove SECSGML_START_BYTE and SECSGML_END_BYTE from documents
-                        for doc in metadata['documents']:
-                            if 'secsgml_start_byte' in doc:
-                                del doc['secsgml_start_byte']
-
-                            if 'secsgml_end_byte' in doc:
-                                del doc['secsgml_end_byte']
-
-                        with output_path.open('w', encoding='utf-8') as f:
-                            json.dump(metadata, f)
-                    else:
-                        # Write to output directory
-                        output_path.parent.mkdir(parents=True, exist_ok=True)
-                        with output_path.open('wb') as f:
-                            f.write(content)
-
-        # delete original file
-        self.path.unlink()
-        self.path = output_dir
 
     def _load_document_by_index(self, idx):
         """Load a document by its index in the metadata documents list."""
@@ -225,44 +103,38 @@ class Submission:
         if self.path is None and self.batch_tar_path is None:
             return self.documents[idx]
 
-        # Get filename
+        # Get filename from metadata - this is the source of truth
        filename = doc.get('filename')
         if filename is None:
             filename = doc['sequence'] + '.txt'
 
-        extension = Path(filename).suffix
+        # Get the base extension (before any compression extension)
+        # If filename ends with .gz or .zst, the real extension is before that
+        if filename.endswith('.gz'):
+            extension = Path(filename[:-3]).suffix
+            is_compressed = 'gzip'
+        elif filename.endswith('.zst'):
+            extension = Path(filename[:-4]).suffix
+            is_compressed = 'zstd'
+        else:
+            extension = Path(filename).suffix
+            is_compressed = False
 
         # Handle batch tar case
         if self.batch_tar_path is not None:
             with self.portfolio_ref.batch_tar_locks[self.batch_tar_path]:
                 tar_handle = self.portfolio_ref.batch_tar_handles[self.batch_tar_path]
 
-                #
-                possible_filenames = [
-                    f'{self.accession_prefix}/{filename}',
-                    f'{self.accession_prefix}/{filename}.gz',
-                    f'{self.accession_prefix}/{filename}.zst'
-                ]
-
-                content = None
-                actual_filename = None
-                for attempt_filename in possible_filenames:
-                    try:
-                        content = tar_handle.extractfile(attempt_filename).read()
-                        actual_filename = attempt_filename
-                        break
-                    except:
-                        continue
-
-                if content is None:
-                    raise ValueError(f"Could not find document in batch tar: {self.batch_tar_path}, accession: {self.accession_prefix}, filename: {filename}")
+                # Use exact filename from metadata
+                tar_path = f'{self.accession_prefix}/{filename}'
+                content = tar_handle.extractfile(tar_path).read()
+
 
-            # Decompress if compressed
-            if actual_filename.endswith('.gz'):
+            # Decompress if needed based on filename extension
+            if is_compressed == 'gzip':
                 content = gzip.decompress(content)
-            elif actual_filename.endswith('.zst'):
-                dctx = zstd.ZstdDecompressor()
-                content = dctx.decompress(content)
+            elif is_compressed == 'zstd':
+                content = zstd.ZstdDecompressor().decompress(content)
 
             # Decode text files
             if extension in ['.htm', '.html', '.txt', '.xml']:
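The batch-tar branch above reads through a tar handle that the `Portfolio` keeps open and shares across threads, serialized by a per-tar lock. A minimal runnable sketch of that access pattern (the tar contents and accession number are illustrative):

```python
import io
import tarfile
from threading import Lock

# Build a tiny in-memory batch tar so the sketch runs end to end.
buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode='w') as tar:
    payload = b'{"accession-number": "0001-25-000001"}'
    info = tarfile.TarInfo('0001-25-000001/metadata.json')
    info.size = len(payload)
    tar.addfile(info, io.BytesIO(payload))
buf.seek(0)

# One long-lived handle plus one lock per batch tar, as Portfolio stores them.
handle = tarfile.open(fileobj=buf, mode='r')
lock = Lock()

def read_member(member_name: str) -> bytes:
    with lock:  # tarfile handles are not safe for concurrent reads
        return handle.extractfile(member_name).read()

print(read_member('0001-25-000001/metadata.json'))
```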
@@ -270,35 +142,56 @@ class Submission:
 
             document_path = f"{self.batch_tar_path}::{self.accession_prefix}/{filename}"
 
-        # Handle regular path case
+        # Handle regular path case
         else:
-
-
+            # Check if path is a tar file (old format)
             if self.path.suffix == '.tar':
                 with tarfile.open(self.path, 'r') as tar:
-                    #
-                    # bandaid fix TODO
+                    # Try to extract the file, handling compression
                     try:
                         content = tar.extractfile(filename).read()
+                        actual_filename = filename
                     except:
                         try:
-                            content = tar.extractfile(filename+'.gz').read()
+                            content = tar.extractfile(filename + '.gz').read()
+                            actual_filename = filename + '.gz'
+                            is_compressed = 'gzip'
                         except:
-                            try:
-                                content = tar.extractfile(filename+'.zst').read()
+                            try:
+                                content = tar.extractfile(filename + '.zst').read()
+                                actual_filename = filename + '.zst'
+                                is_compressed = 'zstd'
                             except:
-                                …
+                                raise FileNotFoundError(f"Document file not found in tar: {filename}")
+
                     # Decompress if compressed
-                    if …
+                    if is_compressed == 'gzip':
                         content = gzip.decompress(content)
-                    elif …
-
-
+                    elif is_compressed == 'zstd':
+                        content = zstd.ZstdDecompressor().decompress(content)
+
+                    # Decode text files
+                    if extension in ['.htm', '.html', '.txt', '.xml']:
+                        content = content.decode('utf-8', errors='replace')
+
+                    document_path = f"{self.path}::{actual_filename}"
+
             else:
+                # Regular directory case
+                document_path = self.path / filename
+
+                if not document_path.exists():
+                    raise FileNotFoundError(f"Document file not found: {document_path}")
+
                 with document_path.open('rb') as f:
                     content = f.read()
-
+
+                # Decompress if needed based on filename extension
+                if is_compressed == 'gzip':
+                    content = gzip.decompress(content)
+                elif is_compressed == 'zstd':
+                    content = zstd.ZstdDecompressor().decompress(content)
+
         # Decode text files
         if extension in ['.htm', '.html', '.txt', '.xml']:
             content = content.decode('utf-8', errors='replace')
@@ -311,7 +204,6 @@ class Submission:
             accession=self.accession,
             path=document_path
         )
-
     def __iter__(self):
         """Make Submission iterable by yielding all documents."""
         for idx in range(len(self.metadata.content['documents'])):
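Because the metadata filename is now the source of truth, the loader derives both the storage compression and the real document extension from the name alone. A sketch of that resolution rule (mirroring the `.gz`/`.zst` handling above):

```python
from pathlib import Path

def resolve(filename: str):
    # A trailing .gz/.zst says how the bytes are stored; the document's real
    # extension (which decides whether to decode as text) sits underneath it.
    if filename.endswith('.gz'):
        return Path(filename[:-3]).suffix, 'gzip'
    if filename.endswith('.zst'):
        return Path(filename[:-4]).suffix, 'zstd'
    return Path(filename).suffix, False

print(resolve('primary-document.htm.zst'))  # ('.htm', 'zstd')
print(resolve('0002.txt'))                  # ('.txt', False)
```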
{datamule-1.6.2.dist-info → datamule-1.6.4.dist-info}/RECORD
CHANGED
@@ -3,9 +3,10 @@ datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
 datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
 datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
 datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
-datamule/portfolio.py,sha256=…
+datamule/portfolio.py,sha256=tADqQMkFaFyjanbJ0QcaOHGdJJB254rOg29FW7a13l0,11835
+datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
 datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
-datamule/submission.py,sha256=…
+datamule/submission.py,sha256=ooLsesZ5HkgSWyEFID4u08CobTxdo35eAUHSCB6fw2k,10332
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
 datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/datamule/sec_connector.py,sha256=T3edE7I-d4oHysqj7zYlIOxH3Fuauj9tfw39UdFWvB8,2393
@@ -65,7 +66,7 @@ datamule/seclibrary/downloader.py,sha256=3jEy67oiEg8BF20KcKCx2KC0UjHzhiepdu29TOa
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
 datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/utils/construct_submissions_data.py,sha256=aX7ZaAp3zXHLcv4TFk_rGwjb8r7yNDQDFVg4nPf60kM,5934
-datamule-1.6.2.dist-info/METADATA,sha256=…
-datamule-1.6.2.dist-info/WHEEL,sha256=…
-datamule-1.6.2.dist-info/top_level.txt,sha256=…
-datamule-1.6.2.dist-info/RECORD,,
+datamule-1.6.4.dist-info/METADATA,sha256=4v85X90MyUpvQC37rMyiEA_3UA-2scIHz4tTF1xH2e4,524
+datamule-1.6.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-1.6.4.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.6.4.dist-info/RECORD,,
{datamule-1.6.2.dist-info → datamule-1.6.4.dist-info}/WHEEL
File without changes

{datamule-1.6.2.dist-info → datamule-1.6.4.dist-info}/top_level.txt
File without changes