datamule 1.6.1__py3-none-any.whl → 1.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/portfolio.py +102 -18
- datamule/portfolio_compression_utils.py +291 -0
- datamule/seclibrary/downloader.py +163 -161
- datamule/submission.py +82 -186
- datamule/utils/construct_submissions_data.py +4 -4
- {datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/METADATA +1 -1
- {datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/RECORD +9 -8
- {datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/WHEEL +0 -0
- {datamule-1.6.1.dist-info → datamule-1.6.3.dist-info}/top_level.txt +0 -0
datamule/portfolio.py
CHANGED
```diff
@@ -1,18 +1,23 @@
 from pathlib import Path
 from tqdm import tqdm
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from .submission import Submission
 from .sec.submissions.downloader import download as sec_download
 from .sec.submissions.textsearch import filter_text
 from .config import Config
 import os
+import tarfile
+from threading import Lock
 from .helper import _process_cik_and_metadata_filters
 from .seclibrary.downloader import download as seclibrary_download
 from .sec.xbrl.filter_xbrl import filter_xbrl
 from .sec.submissions.monitor import Monitor
+from .portfolio_compression_utils import CompressionManager
 #from .sec.xbrl.xbrlmonitor import XBRLMonitor
 from .datamule.sec_connector import SecConnector
-
+from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
+import json
+import io
 
 class Portfolio:
     def __init__(self, path):
@@ -21,6 +26,10 @@ class Portfolio:
         self.submissions = []
         self.submissions_loaded = False
         self.MAX_WORKERS = os.cpu_count() - 1
+
+        # Batch tar support
+        self.batch_tar_handles = {}  # {batch_tar_path: tarfile_handle}
+        self.batch_tar_locks = {}  # {batch_tar_path: threading.Lock}
 
         self.monitor = Monitor()
 
@@ -34,27 +43,101 @@ class Portfolio:
         self.api_key = api_key
 
     def _load_submissions(self):
-
-
+        print(f"Loading submissions")
+
+        # Separate regular and batch items
+        regular_items = [f for f in self.path.iterdir() if (f.is_dir() or f.suffix=='.tar') and 'batch' not in f.name]
+        batch_tars = [f for f in self.path.iterdir() if f.is_file() and 'batch' in f.name and f.suffix == '.tar']
 
+        # Load regular submissions (existing logic)
         def load_submission(folder):
-
-                return Submission(folder)
-            except Exception as e:
-                print(f"Error loading submission from {folder}: {str(e)}")
-                return None
+            return Submission(folder)
 
-
-
-
-
-
-
-
-
-
+        regular_submissions = []
+        if regular_items:
+            with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+                regular_submissions = list(tqdm(
+                    executor.map(load_submission, regular_items),
+                    total=len(regular_items),
+                    desc="Loading regular submissions"
+                ))
+
+        # Load batch submissions with parallel processing + progress
+        batch_submissions = []
+        if batch_tars:
+            with tqdm(desc="Loading batch submissions", unit="submissions") as pbar:
+                with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+                    # Submit all batch tar jobs
+                    futures = [
+                        executor.submit(self._load_batch_submissions_worker, batch_tar, pbar)
+                        for batch_tar in batch_tars
+                    ]
+
+                    # Collect results as they complete
+                    for future in as_completed(futures):
+                        batch_submissions.extend(future.result())
+
+        # Combine and filter None values
+        self.submissions = [s for s in (regular_submissions + batch_submissions) if s is not None]
         print(f"Successfully loaded {len(self.submissions)} submissions")
 
+    def _load_batch_submissions_worker(self, batch_tar_path, pbar):
+        """Worker function to load submissions from one batch tar with progress updates"""
+        # Open tar handle and store it
+        tar_handle = tarfile.open(batch_tar_path, 'r')
+        self.batch_tar_handles[batch_tar_path] = tar_handle
+        self.batch_tar_locks[batch_tar_path] = Lock()
+
+        # Find all accession directories
+        accession_prefixes = set()
+        for member in tar_handle.getmembers():
+            if '/' in member.name and member.name.endswith('metadata.json'):
+                accession_prefix = member.name.split('/')[0]
+                accession_prefixes.add(accession_prefix)
+
+        # Create submissions for each accession
+        submissions = []
+        for accession_prefix in accession_prefixes:
+            submission = Submission(
+                batch_tar_path=batch_tar_path,
+                accession_prefix=accession_prefix,
+                portfolio_ref=self
+            )
+            submissions.append(submission)
+            pbar.update(1)  # Update progress for each successful submission
+
+        return submissions
+
+
+    def compress(self, compression=None, compression_level=None, threshold=1048576, max_batch_size=1024*1024*1024):
+        """
+        Compress all individual submissions into batch tar files.
+
+        Args:
+            compression: None, 'gzip', or 'zstd' for document compression (default: None)
+            compression_level: Compression level, if None uses defaults (gzip=6, zstd=3)
+            threshold: Size threshold for compressing individual documents (default: 1MB)
+            max_batch_size: Maximum size per batch tar file (default: 1GB)
+        """
+        CompressionManager().compress_portfolio(self, compression, compression_level, threshold, max_batch_size, self.MAX_WORKERS)
+
+    def decompress(self):
+        """
+        Decompress all batch tar files back to individual submission directories.
+        """
+        CompressionManager().decompress_portfolio(self, self.MAX_WORKERS)
+
+    def _close_batch_handles(self):
+        """Close all open batch tar handles to free resources"""
+        for handle in self.batch_tar_handles.values():
+            handle.close()
+        self.batch_tar_handles.clear()
+        self.batch_tar_locks.clear()
+
+    def __del__(self):
+        """Cleanup batch tar handles on destruction"""
+        self._close_batch_handles()
+
     def process_submissions(self, callback):
         """Process all submissions using a thread pool."""
         if not self.submissions_loaded:
@@ -169,6 +252,7 @@ class Portfolio:
             )
 
         self.submissions_loaded = False
+
     def monitor_submissions(self, data_callback=None, interval_callback=None,
                             polling_interval=1000, quiet=True, start_date=None,
                             validation_interval=600000):
```
datamule/portfolio_compression_utils.py
ADDED
```diff
@@ -0,0 +1,291 @@
+import json
+import io
+import gzip
+import zstandard as zstd
+import tarfile
+import shutil
+from tqdm import tqdm
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
+
+
+class CompressionManager:
+
+    def compress_portfolio(self, portfolio, compression=None, compression_level=None, threshold=1048576, max_batch_size=1024*1024*1024, max_workers=None):
+        """
+        Compress all individual submissions into batch tar files.
+
+        Args:
+            portfolio: Portfolio instance
+            compression: None, 'gzip', or 'zstd' for document compression (default: None)
+            compression_level: Compression level, if None uses defaults (gzip=6, zstd=3)
+            threshold: Size threshold for compressing individual documents (default: 1MB)
+            max_batch_size: Maximum size per batch tar file (default: 1GB)
+            max_workers: Number of threads for parallel document processing (default: portfolio.MAX_WORKERS)
+        """
+        if max_workers is None:
+            max_workers = portfolio.MAX_WORKERS
+
+        portfolio._close_batch_handles()
+
+        if not portfolio.submissions_loaded:
+            portfolio._load_submissions()
+
+        # Only compress non-batch submissions
+        submissions = [s for s in portfolio.submissions if s.batch_tar_path is None]
+
+        if not submissions:
+            print("No submissions to compress")
+            return
+
+        print(f"Compressing {len(submissions)} submissions...")
+
+        # Set default compression level if not specified
+        if compression_level is None:
+            compression_level = 6 if compression == 'gzip' else 3
+
+        # Group submissions into batches
+        current_batch = 0
+        current_size = 0
+        sequence = 1
+        current_tar = None
+
+        with tqdm(total=len(submissions), desc="Compressing submissions") as pbar:
+            for submission in submissions:
+                # Parallel document processing
+                with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                    doc_futures = [
+                        executor.submit(self._process_document, doc, compression, threshold, compression_level)
+                        for doc in submission
+                    ]
+
+                    # Collect results maintaining order
+                    documents = []
+                    compression_list = []
+                    for future in doc_futures:
+                        content, compression_type = future.result()
+                        documents.append(content)
+                        compression_list.append(compression_type)
+
+                # Calculate submission size
+                metadata_str = bytes_to_str(submission.metadata.content, lower=False)
+                metadata_json = json.dumps(metadata_str).encode('utf-8')
+                submission_size = len(metadata_json) + sum(len(doc) for doc in documents)
+
+                # Check if we need a new batch tar
+                if current_size > 0 and current_size + submission_size > max_batch_size:
+                    if current_tar:
+                        current_tar.close()
+                    sequence += 1
+                    current_size = 0
+                    current_tar = None
+
+                # Create tar if needed
+                if current_tar is None:
+                    batch_path = portfolio.path / f'batch_{current_batch:03d}_{sequence:03d}.tar'
+                    current_tar = tarfile.open(batch_path, 'w')
+
+                # Write submission to tar
+                self._write_submission_to_tar(
+                    current_tar,
+                    submission,
+                    documents,
+                    compression_list,
+                    submission.accession
+                )
+
+                current_size += submission_size
+
+                # Remove original submission directory/tar
+                if submission.path:
+                    if submission.path.is_dir():
+                        shutil.rmtree(submission.path)
+                    elif submission.path.suffix == '.tar':
+                        submission.path.unlink()
+
+                pbar.update(1)
+
+        # Close final tar
+        if current_tar:
+            current_tar.close()
+
+        # Reload submissions to reflect new batch structure
+        portfolio.submissions_loaded = False
+        portfolio._load_submissions()
+
+        print("Compression complete.")
+
+    def decompress_portfolio(self, portfolio, max_workers=None):
+        """
+        Decompress all batch tar files back to individual submission directories.
+
+        Args:
+            portfolio: Portfolio instance
+            max_workers: Number of threads for parallel file processing (default: portfolio.MAX_WORKERS)
+        """
+        if max_workers is None:
+            max_workers = portfolio.MAX_WORKERS
+
+        if not portfolio.submissions_loaded:
+            portfolio._load_submissions()
+
+        # Find all batch tar files
+        batch_tars = [f for f in portfolio.path.iterdir() if f.is_file() and 'batch' in f.name and f.suffix == '.tar']
+
+        if not batch_tars:
+            print("No batch tar files found to decompress")
+            return
+
+        print(f"Decompressing {len(batch_tars)} batch tar files...")
+
+        # FIRST: Close all batch tar handles to free the files
+        portfolio._close_batch_handles()
+
+        total_extracted = 0
+
+        with tqdm(desc="Decompressing submissions", unit="submissions") as pbar:
+            for batch_tar in batch_tars:
+                with tarfile.open(batch_tar, 'r') as tar:
+                    # Find all accession directories in this tar
+                    accession_dirs = set()
+                    for member in tar.getmembers():
+                        if '/' in member.name:
+                            accession_dir = member.name.split('/')[0]
+                            accession_dirs.add(accession_dir)
+
+                    # Extract each submission
+                    for accession_dir in accession_dirs:
+                        output_dir = portfolio.path / accession_dir
+                        output_dir.mkdir(exist_ok=True)
+
+                        # Get all files for this accession
+                        accession_files = [m for m in tar.getmembers()
+                                           if m.name.startswith(f'{accession_dir}/') and m.isfile()]
+
+                        # Parallel file extraction
+                        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                            file_futures = [
+                                executor.submit(self._extract_file, member, tar, accession_dir, output_dir)
+                                for member in accession_files
+                            ]
+
+                            # Wait for all files to be processed
+                            for future in as_completed(file_futures):
+                                future.result()
+
+                        total_extracted += 1
+                        pbar.update(1)
+
+
+        # NOW delete the batch tar files after everything is extracted
+        for batch_tar in batch_tars:
+            batch_tar.unlink()
+
+
+        # Reload submissions to reflect new directory structure
+        portfolio.submissions_loaded = False
+        portfolio._load_submissions()
+
+        print(f"Decompression complete. Extracted {total_extracted} submissions.")
+
+    def _process_document(self, doc, compression, threshold, compression_level):
+        """Process a single document: load content and apply compression if needed."""
+        content = doc.content
+        if isinstance(content, str):
+            content = content.encode('utf-8')
+
+        # Apply document-level compression if threshold met AND compression is specified
+        if compression and len(content) >= threshold:
+            if compression == 'gzip':
+                content = gzip.compress(content, compresslevel=compression_level)
+                compression_type = 'gzip'
+            elif compression == 'zstd':
+                content = zstd.ZstdCompressor(level=compression_level).compress(content)
+                compression_type = 'zstd'
+            else:
+                compression_type = ''
+        else:
+            compression_type = ''
+
+        return content, compression_type
+
+    def _extract_file(self, member, tar, accession_dir, output_dir):
+        """Extract and decompress a single file from tar."""
+        relative_path = member.name[len(accession_dir)+1:]  # Remove accession prefix
+        output_path = output_dir / relative_path
+
+        content = tar.extractfile(member).read()
+
+        # Handle decompression based on filename
+        if relative_path.endswith('.gz'):
+            # File MUST be gzipped if it has .gz extension
+            content = gzip.decompress(content)
+            output_path = output_path.with_suffix('')  # Remove .gz
+
+        elif relative_path.endswith('.zst'):
+            # File MUST be zstd compressed if it has .zst extension
+            content = zstd.ZstdDecompressor().decompress(content)
+            output_path = output_path.with_suffix('')  # Remove .zst
+
+        # Special handling for metadata.json
+        if output_path.name == 'metadata.json':
+            metadata = json.loads(content.decode('utf-8'))
+            # Remove tar-specific metadata
+            for doc in metadata['documents']:
+                doc.pop('secsgml_start_byte', None)
+                doc.pop('secsgml_end_byte', None)
+
+                # Update filenames to match decompressed files
+                filename = doc.get('filename', '')
+                if filename.endswith('.gz'):
+                    doc['filename'] = filename[:-3]  # Remove .gz
+                elif filename.endswith('.zst'):
+                    doc['filename'] = filename[:-4]  # Remove .zst
+
+            with output_path.open('w', encoding='utf-8') as f:
+                json.dump(metadata, f, indent=2)
+        else:
+            # Write document file
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+            with output_path.open('wb') as f:
+                f.write(content)
+
+
+    def _write_submission_to_tar(self, tar_handle, submission, documents, compression_list, accession_prefix):
+        """Write a submission to a tar file with optional document compression."""
+        # Prepare metadata
+        metadata = submission.metadata.content.copy()
+
+        # Update filenames for compressed documents BEFORE size calculation
+        for i, compression in enumerate(compression_list):
+            if compression:
+                doc = metadata['documents'][i]
+                filename = doc.get('filename', doc['sequence'] + '.txt')
+                if compression == 'gzip' and not filename.endswith('.gz'):
+                    doc['filename'] = filename + '.gz'
+                elif compression == 'zstd' and not filename.endswith('.zst'):
+                    doc['filename'] = filename + '.zst'
+
+        # Add document sizes to metadata for calculate_documents_locations_in_tar
+        for i, content in enumerate(documents):
+            metadata['documents'][i]['secsgml_size_bytes'] = len(content)
+
+        # NOW calculate document positions with the correct filenames
+        metadata = calculate_documents_locations_in_tar(metadata)
+
+        # Write metadata
+        metadata_str = bytes_to_str(metadata, lower=False)
+        metadata_json = json.dumps(metadata_str).encode('utf-8')
+
+        tarinfo = tarfile.TarInfo(name=f'{accession_prefix}/metadata.json')
+        tarinfo.size = len(metadata_json)
+        tar_handle.addfile(tarinfo, io.BytesIO(metadata_json))
+
+        # Write documents
+        for i, content in enumerate(documents):
+            doc = metadata['documents'][i]
+            filename = doc.get('filename', doc['sequence'] + '.txt')
+
+            tarinfo = tarfile.TarInfo(name=f'{accession_prefix}/{filename}')
+            tarinfo.size = len(content)
+            tar_handle.addfile(tarinfo, io.BytesIO(content))
```
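As `_write_submission_to_tar` shows, each batch tar stores one `metadata.json` plus the documents for every submission under that submission's accession prefix, and compressed documents are renamed to end in `.gz` or `.zst`. A self-contained sketch of reading one submission's documents back out of a batch tar; the tar name and accession prefix below are hypothetical examples, not values from the package:

```python
# Illustrative only: 'batch_000_001.tar' and ACCESSION are made-up names;
# real names come from Portfolio.compress() and the submission metadata.
import gzip
import json
import tarfile
import zstandard as zstd

ACCESSION = '0001234567-25-000001'  # hypothetical accession prefix

with tarfile.open('batch_000_001.tar', 'r') as tar:
    metadata = json.loads(tar.extractfile(f'{ACCESSION}/metadata.json').read())
    for doc in metadata['documents']:
        filename = doc.get('filename', doc['sequence'] + '.txt')
        raw = tar.extractfile(f'{ACCESSION}/{filename}').read()
        # Compression is signalled by the filename extension, as in _extract_file
        if filename.endswith('.gz'):
            raw = gzip.decompress(raw)
        elif filename.endswith('.zst'):
            raw = zstd.ZstdDecompressor().decompress(raw)
        # 'raw' now holds the original document bytes
```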