datamule 1.5.2__py3-none-any.whl → 1.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/portfolio.py +8 -4
- datamule/sec/submissions/downloader.py +3 -2
- datamule/sec/submissions/monitor.py +42 -22
- datamule/seclibrary/downloader.py +97 -8
- datamule/submission.py +123 -45
- {datamule-1.5.2.dist-info → datamule-1.5.4.dist-info}/METADATA +1 -1
- {datamule-1.5.2.dist-info → datamule-1.5.4.dist-info}/RECORD +9 -9
- {datamule-1.5.2.dist-info → datamule-1.5.4.dist-info}/WHEEL +0 -0
- {datamule-1.5.2.dist-info → datamule-1.5.4.dist-info}/top_level.txt +0 -0
datamule/portfolio.py
CHANGED
@@ -34,7 +34,6 @@ class Portfolio:
 
     def _load_submissions(self):
         folders = [f for f in self.path.iterdir() if f.is_dir() or f.suffix=='.tar']
-        print(folders)
         print(f"Loading {len(folders)} submissions")
 
         def load_submission(folder):
@@ -126,7 +125,8 @@ class Portfolio:
             # First query, just set the accession numbers
             self.accession_numbers = new_accession_numbers
 
-    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=[],
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=[],
+                             requests_per_second=5,keep_filtered_metadata=False,standardize_metadata=True, **kwargs):
         if provider is None:
             config = Config()
             provider = config.get_default_source()
@@ -143,7 +143,9 @@ class Portfolio:
                 submission_type=submission_type,
                 filing_date=filing_date,
                 accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
-                keep_document_types=document_type
+                keep_document_types=document_type,
+                keep_filtered_metadata=keep_filtered_metadata,
+                standardize_metadata=standardize_metadata,
             )
         else:
             sec_download(
@@ -153,7 +155,9 @@ class Portfolio:
                 filing_date=filing_date,
                 requests_per_second=requests_per_second,
                 accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None,
-                keep_document_types=document_type
+                keep_document_types=document_type,
+                keep_filtered_metadata=keep_filtered_metadata,
+                standardize_metadata=standardize_metadata,
             )
 
         self.submissions_loaded = False
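For context, a minimal usage sketch of the new keyword arguments on Portfolio.download_submissions. The folder path, CIK, and form type are illustrative, and the import assumes Portfolio is exported from the package root:

from datamule import Portfolio

portfolio = Portfolio("filings")  # hypothetical local output folder

portfolio.download_submissions(
    cik="0000320193",              # example CIK
    submission_type="10-K",
    document_type=["10-K"],
    keep_filtered_metadata=False,  # per the docstring below: keep metadata for filtered documents
    standardize_metadata=True,     # per the docstring below: standardize metadata format
)

Both flags default to the values shown, so existing 1.5.2-style calls keep working; the flags are simply forwarded to whichever provider (seclibrary or SEC) performs the download.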
datamule/sec/submissions/downloader.py
CHANGED
@@ -5,7 +5,7 @@ from tqdm import tqdm
 
 def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
              requests_per_second=5, output_dir="filings", accession_numbers=None,
-             quiet=False, keep_document_types=[]):
+             quiet=False, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
     # Make sure output directory exists
     os.makedirs(output_dir, exist_ok=True)
 
@@ -14,7 +14,8 @@ def download(cik=None, submission_type=None, filing_date=None, location=None, na
     # Create a wrapper for the download_callback that includes the output_dir
     async def callback_wrapper(hit, content, cik, accno, url):
         output_path = os.path.join(output_dir, accno.replace('-','') + '.tar')
-        write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=keep_document_types)
+        write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=keep_document_types,keep_filtered_metadata=keep_filtered_metadata,
+                               standardize_metadata=standardize_metadata)
         pbar.update(1)
 
 
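The same pair of flags is accepted by this lower-level SEC downloader and forwarded to write_sgml_file_to_tar inside the download callback. A hedged sketch of a direct call, with the import path taken from this file's location and placeholder query values:

from datamule.sec.submissions.downloader import download

download(
    cik="0000320193",              # example CIK
    submission_type="8-K",
    output_dir="filings",
    keep_document_types=["8-K"],   # keep only documents of type 8-K in each tar
    keep_filtered_metadata=False,
    standardize_metadata=True,
)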
datamule/sec/submissions/monitor.py
CHANGED
@@ -77,12 +77,19 @@ class Monitor():
         )
 
     async def _async_monitor_submissions(self, data_callback=None, interval_callback=None,
-
-
+                                         polling_interval=1000, quiet=True, start_date=None,
+                                         validation_interval=60000):
         """
         Async implementation of monitor_submissions.
+        Either polling_interval or validation_interval (or both) must be specified.
+        If polling_interval is None, only EFTS validation will be performed.
+        If validation_interval is None, only RSS polling will be performed.
         """
 
+        # Validate that at least one interval is specified
+        if polling_interval is None and validation_interval is None:
+            raise ValueError("At least one of polling_interval or validation_interval must be specified")
+
         # Backfill if start_date is provided
         if start_date is not None:
             today_date = datetime.now().date().strftime('%Y-%m-%d')
@@ -100,24 +107,33 @@ class Monitor():
             if new_hits and data_callback:
                 data_callback(new_hits)
 
-
-
-
-
+        # Initialize timing variables
+        current_time = time.time()
+        last_polling_time = current_time
+        last_validation_time = current_time
+
+        # Determine which operations to perform
+        do_polling = polling_interval is not None
+        do_validation = validation_interval is not None
+
         while True:
-
-
-
-
-            new_results = self._filter_new_accessions(results)
-            if new_results:
+            current_time = time.time()
+
+            # RSS polling (if enabled)
+            if do_polling and (current_time - last_polling_time) >= polling_interval/1000:
                 if not quiet:
-                    print(f"
-
-
+                    print(f"Polling RSS feed")
+                results = await poll_rss(self.ratelimiters['sec.gov'])
+                new_results = self._filter_new_accessions(results)
+                if new_results:
+                    if not quiet:
+                        print(f"Found {len(new_results)} new submissions via RSS")
+                    if data_callback:
+                        data_callback(new_results)
+                last_polling_time = current_time
 
-            # EFTS validation
-            if
+            # EFTS validation (if enabled)
+            if do_validation and (current_time - last_validation_time) >= validation_interval/1000:
                 # Get submissions from the last 24 hours for validation
                 today_date = datetime.now().strftime('%Y-%m-%d')
                 if not quiet:
@@ -134,19 +150,23 @@ class Monitor():
                         print(f"Found {len(new_hits)} new submissions via EFTS validation")
                     if data_callback:
                         data_callback(new_hits)
-                last_polling_time = time.time()
                 last_validation_time = current_time
 
             # Interval callback
             if interval_callback:
                 interval_callback()
 
-
+            # Calculate next wake-up time
+            next_times = []
+            if do_polling:
+                next_times.append(last_polling_time + (polling_interval / 1000))
+            if do_validation:
+                next_times.append(last_validation_time + (validation_interval / 1000))
+
+            next_wake_time = min(next_times)
             current_time = time.time()
-            time_to_sleep = max(0,
+            time_to_sleep = max(0, next_wake_time - current_time)
             await asyncio.sleep(time_to_sleep)
-            last_polling_time = next_poll_time
-
 
     def monitor_submissions(self, data_callback=None, interval_callback=None,
                             polling_interval=1000, quiet=True, start_date=None,
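The rewritten loop decouples RSS polling from EFTS validation: each source keeps its own last-run timestamp, and the sleep is sized to the earliest of the two next due times instead of a fixed poll interval. A self-contained sketch of that wake-up arithmetic (the function name is illustrative, not part of the package):

import time

def next_sleep(last_polling_time, last_validation_time,
               polling_interval=1000, validation_interval=60000):
    # Intervals are in milliseconds, matching the monitor's signature;
    # either may be None to disable that source entirely.
    next_times = []
    if polling_interval is not None:
        next_times.append(last_polling_time + polling_interval / 1000)
    if validation_interval is not None:
        next_times.append(last_validation_time + validation_interval / 1000)
    return max(0, min(next_times) - time.time())

now = time.time()
print(next_sleep(now, now))  # ~1.0: the next RSS poll is due before the next validation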
datamule/seclibrary/downloader.py
CHANGED
@@ -74,7 +74,7 @@ class Downloader:
                 print(f"Failed to log error to {error_file}: {str(e)}")
 
     class FileProcessor:
-        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=[]):
+        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=[], keep_filtered_metadata=False,standardize_metadata=True):
             self.processing_queue = Queue(maxsize=queue_size)
             self.should_stop = False
             self.processing_workers = []
@@ -84,6 +84,8 @@ class Downloader:
             self.pbar = pbar
             self.downloader = downloader
             self.keep_document_types = keep_document_types
+            self.keep_filtered_metadata = keep_filtered_metadata
+            self.standardize_metadata = standardize_metadata
 
         def start_processing_workers(self):
             for _ in range(self.max_workers):
@@ -95,7 +97,8 @@ class Downloader:
         def _process_file(self, item):
             filename, content = item
             output_path = os.path.join(self.output_dir, filename.split('.')[0] + '.tar')
-            write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types)
+            write_sgml_file_to_tar(output_path, bytes_content=content, filter_document_types=self.keep_document_types, keep_filtered_metadata=self.keep_filtered_metadata,standardize_metadata=self.standardize_metadata)
+
             self.pbar.update(1)
 
         def _processing_worker(self):
@@ -204,11 +207,12 @@ class Downloader:
             except Exception as e:
                 self._log_error(output_dir, filename, str(e))
 
-    async def process_batch(self, urls, output_dir, keep_document_types=[]):
+    async def process_batch(self, urls, output_dir, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
         os.makedirs(output_dir, exist_ok=True)
 
         with tqdm(total=len(urls), desc="Processing files") as pbar:
-            processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self, keep_document_types=keep_document_types)
+            processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self, keep_document_types=keep_document_types,
+                                           keep_filtered_metadata=keep_filtered_metadata,standardize_metadata=standardize_metadata)
             processor.start_processing_workers()
 
             semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
@@ -231,7 +235,7 @@ class Downloader:
             processor.stop_workers()
             decompression_pool.shutdown()
 
-    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[]):
+    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
         """
         Query SEC filings and download/process them.
 
@@ -242,6 +246,7 @@ class Downloader:
         - output_dir: Directory to save downloaded files
         - accession_numbers: List of specific accession numbers to download
         - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+        - keep_filtered_metadata: Whether to keep metadata for filtered documents
         """
         if self.api_key is None:
             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
@@ -279,7 +284,7 @@ class Downloader:
         start_time = time.time()
 
         # Process the batch asynchronously
-        asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types))
+        asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata))
 
         # Calculate and display performance metrics
         elapsed_time = time.time() - start_time
@@ -292,7 +297,65 @@ class Downloader:
         self.loop.call_soon_threadsafe(self.loop.stop)
 
 
-
+
+    def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
+        """
+        Download and process SEC filings using specific filenames.
+
+        Parameters:
+        - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
+        - output_dir: Directory to save downloaded files
+        - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+        - keep_filtered_metadata: Whether to keep metadata for filtered documents
+        - standardize_metadata: Whether to standardize metadata format
+        """
+        if self.api_key is None:
+            raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+        if not filenames:
+            raise ValueError("No filenames provided")
+
+        if not isinstance(filenames, (list, tuple)):
+            filenames = [filenames]
+
+        # Validate filenames format
+        for filename in filenames:
+            if not isinstance(filename, str):
+                raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
+            if not (filename.endswith('.sgml') or filename.endswith('.sgml.zst')):
+                raise ValueError(f"Invalid filename format: {filename}. Expected .sgml or .sgml.zst extension.")
+
+        # Generate URLs directly from filenames
+        print(f"Generating URLs for {len(filenames)} files...")
+        urls = []
+        for filename in filenames:
+            url = f"{self.BASE_URL}{filename}"
+            urls.append(url)
+
+        # Remove duplicates while preserving order
+        seen = set()
+        urls = [url for url in urls if not (url in seen or seen.add(url))]
+
+        print(f"Downloading {len(urls)} files...")
+
+        # Process the batch asynchronously using existing infrastructure
+        start_time = time.time()
+
+        asyncio.run(self.process_batch(
+            urls,
+            output_dir,
+            keep_document_types=keep_document_types,
+            keep_filtered_metadata=keep_filtered_metadata,
+            standardize_metadata=standardize_metadata
+        ))
+
+        # Calculate and display performance metrics
+        elapsed_time = time.time() - start_time
+        print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
+        print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
+
+
+def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True):
     """
     Query SEC filings and download/process them.
 
@@ -304,6 +367,7 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
     - output_dir: Directory to save downloaded files
     - accession_numbers: List of specific accession numbers to download
     - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+    - keep_filtered_metadata: Whether to keep metadata for filtered documents
     """
     if accession_numbers:
         accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
@@ -317,5 +381,30 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
         filing_date=filing_date,
         output_dir=output_dir,
         accession_numbers=accession_numbers,
-        keep_document_types=keep_document_types
+        keep_document_types=keep_document_types,
+        keep_filtered_metadata=keep_filtered_metadata,
+        standardize_metadata=standardize_metadata
+    )
+
+def download_files_using_filename(filenames, api_key=None, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
+    """
+    Download and process SEC filings using specific filenames.
+
+    Parameters:
+    - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
+    - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
+    - output_dir: Directory to save downloaded files
+    - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
+    - keep_filtered_metadata: Whether to keep metadata for filtered documents
+    - standardize_metadata: Whether to standardize metadata format
+    """
+    downloader = Downloader(api_key=api_key)
+    downloader.QUEUE_SIZE = 1
+    downloader.MAX_CONCURRENT_DOWNLOADS = 1
+    downloader.download_files_using_filename(
+        filenames=filenames,
+        output_dir=output_dir,
+        keep_document_types=keep_document_types,
+        keep_filtered_metadata=keep_filtered_metadata,
+        standardize_metadata=standardize_metadata
+    )
datamule/submission.py
CHANGED
@@ -2,11 +2,80 @@ from pathlib import Path
 import json
 from .document.document import Document
 from secsgml import parse_sgml_content_into_memory
+from secsgml.utils import bytes_to_str
+from secsgml.parse_sgml import transform_metadata_string
 import tarfile
 import shutil
 import zstandard as zstd
-from io import BytesIO
 import gzip
+import io
+import copy
+
+
+def calculate_documents_locations_in_tar(metadata, documents):
+    # Step 1: Add placeholder byte positions to get accurate size (10-digit padded)
+    placeholder_metadata = copy.deepcopy(metadata)
+
+    for file_num in range(len(documents)):
+        if 'documents' in placeholder_metadata:
+            placeholder_metadata['documents'][file_num]['secsgml_start_byte'] = "9999999999"  # 10 digits
+            placeholder_metadata['documents'][file_num]['secsgml_end_byte'] = "9999999999"  # 10 digits
+
+    # Step 2: Calculate size with placeholders
+    placeholder_str = bytes_to_str(placeholder_metadata, lower=False)
+    placeholder_json = json.dumps(placeholder_str).encode('utf-8')
+    metadata_size = len(placeholder_json)
+
+    # Step 3: Now calculate actual positions using this size
+    current_pos = 512 + metadata_size
+    current_pos += (512 - (current_pos % 512)) % 512
+
+    # Step 4: Calculate real positions and update original metadata (10-digit padded)
+    for file_num, content in enumerate(documents):
+        start_byte = current_pos + 512
+        end_byte = start_byte + len(content)
+
+        if 'documents' in metadata:
+            metadata['documents'][file_num]['secsgml_start_byte'] = f"{start_byte:010d}"  # 10-digit padding
+            metadata['documents'][file_num]['secsgml_end_byte'] = f"{end_byte:010d}"  # 10-digit padding
+
+
+        file_total_size = 512 + len(content)
+        padded_size = file_total_size + (512 - (file_total_size % 512)) % 512
+        current_pos += padded_size
+
+    return metadata
+
+
+def write_submission_to_tar(output_path,metadata,documents,standardize_metadata,compression_list):
+    # Write tar directly to disk
+    with tarfile.open(output_path, 'w') as tar:
+
+        # calculate document locations in tar
+        metadata = calculate_documents_locations_in_tar(metadata, documents)
+
+        # serialize metadata
+        metadata_str = bytes_to_str(metadata,lower=False)
+        metadata_json = json.dumps(metadata_str).encode('utf-8')
+        # save metadata
+        tarinfo = tarfile.TarInfo(name='metadata.json')
+        tarinfo.size = len(metadata_json)
+        tar.addfile(tarinfo, io.BytesIO(metadata_json))
+
+        for file_num, content in enumerate(documents, 0):
+            if standardize_metadata:
+                document_name = metadata['documents'][file_num]['filename'] if metadata['documents'][file_num].get('filename') else metadata['documents'][file_num]['sequence'] + '.txt'
+
+            compression = compression_list[file_num]
+            if compression == 'gzip':
+                document_name = f'{document_name}.gz'
+            elif compression == 'zstd':
+                document_name = f'{document_name}.zst'
+
+
+            tarinfo = tarfile.TarInfo(name=f'{document_name}')
+            tarinfo.size = len(content)
+            tar.addfile(tarinfo, io.BytesIO(content))
 
 class Submission:
     def __init__(self, path=None,sgml_content=None,keep_document_types=None):
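calculate_documents_locations_in_tar can predict offsets because tar's on-disk layout is fixed: each member is a 512-byte header followed by its content, padded out to the next 512-byte boundary, and metadata.json is always written first. A standalone sketch of that arithmetic (the helper name and sample sizes are illustrative):

def tar_entry_span(start, content_len):
    # Return (data_start, data_end, next_entry_start) for one tar member
    # beginning at byte offset `start`.
    data_start = start + 512                      # skip the 512-byte member header
    data_end = data_start + content_len
    total = 512 + content_len
    padded = total + (512 - (total % 512)) % 512  # round up to a full 512-byte block
    return data_start, data_end, start + padded

# A 10-byte metadata.json written at offset 0 occupies bytes 512-522,
# and the next member's header begins at offset 1024.
print(tar_entry_span(0, 10))  # (512, 522, 1024)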
@@ -18,6 +87,10 @@ class Submission:
         if sgml_content is not None:
             self.path = None
             metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
+
+            # standardize metadata
+            metadata = transform_metadata_string(metadata)
+
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
             # code dupe
             self.accession = self.metadata.content['accession-number']
@@ -55,6 +128,9 @@ class Submission:
             metadata_path = self.path / 'metadata.json'
             with metadata_path.open('r') as f:
                 metadata = json.load(f)
+
+            # standardize metadata
+            metadata = transform_metadata_string(metadata)
             self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
             self.accession = self.metadata.content['accession-number']
             self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
@@ -68,51 +144,34 @@ class Submission:
         if compression is not None and compression not in ['gzip', 'zstd']:
             raise ValueError("compression must be 'gzip' or 'zstd'")
 
+        # check if we're loading from a dir or a tar file
+        is_dir_not_tar = True
+        if self.path.suffix == '.tar':
+            is_dir_not_tar = False
+        elif not self.path.is_dir():
+            raise ValueError("Path must be a directory to compress")
         # Create tar file (replace directory with .tar file)
         tar_path = self.path.with_suffix('.tar')
+
+        # load all files in the directory or tar file
+        documents = [doc.content.encode('utf-8') if isinstance(doc.content, str) else doc.content for doc in self]
 
-        with tarfile.open(tar_path, 'w') as tar:
-            # Add metadata.json first
-            metadata_path = self.path / 'metadata.json'
-            if metadata_path.exists():
-                tar.add(metadata_path, arcname='metadata.json')
-
-            # Add documents in order
-            for doc in self.metadata.content['documents']:
-                filename = doc.get('filename')
-                if filename is None:
-                    filename = doc['sequence'] + '.txt'
-
-                file_path = self.path / filename
-                if file_path.exists():
-                    file_size = file_path.stat().st_size
 
-
-
-
-
-
-                        if compression == 'gzip':
-                            compressed_content = gzip.compress(content, compresslevel=level or 6)
-                            compressed_filename = filename + '.gz'
-                        else: # zstd
-                            cctx = zstd.ZstdCompressor(level=level or 3)
-                            compressed_content = cctx.compress(content)
-                            compressed_filename = filename + '.zst'
-
-                        # Add compressed file to tar
-                        tarinfo = tarfile.TarInfo(name=compressed_filename)
-                        tarinfo.size = len(compressed_content)
-                        tar.addfile(tarinfo, BytesIO(compressed_content))
-                    else:
-                        # Add uncompressed file
-                        tar.add(file_path, arcname=filename)
+        # we should compress everything here first.
+        compression_list = [compression if len(doc) >= threshold else '' for doc in documents]
+        documents = [gzip.compress(doc, compresslevel=level or 6) if compression == 'gzip' and
+                     len(doc) >= threshold else zstd.ZstdCompressor(level=level or 3).compress(doc) if compression == 'zstd' and
+                     len(doc) >= threshold else doc for doc in documents]
 
+        metadata = self.metadata.content.copy()
+        write_submission_to_tar(tar_path,metadata,documents,compression_list=compression_list,standardize_metadata=True)
+
         # Delete original folder
-
-
-
-
+        if is_dir_not_tar:
+            shutil.rmtree(self.path)
+        # otherwise, we already replaced the tar file
+        # Update path to point to new tar file
+        self.path = tar_path
 
     def decompress(self):
         if self.path is None:
@@ -129,17 +188,36 @@ class Submission:
                 if member.isfile():
                     content = tar.extractfile(member).read()
 
-                    # Decompress
+                    # Decompress based on file extension
                     if member.name.endswith('.gz'):
                         content = gzip.decompress(content)
                         output_path = output_dir / member.name[:-3]  # Remove .gz extension
+                    elif member.name.endswith('.zst'):
+                        dctx = zstd.ZstdDecompressor()
+                        content = dctx.decompress(content)
+                        output_path = output_dir / member.name[:-4]  # Remove .zst extension
                     else:
                         output_path = output_dir / member.name
 
-                    #
-                    output_path.
-
-
+                    # check if it is metadata.json
+                    if output_path.name == 'metadata.json':
+                        # load as json
+                        metadata = json.loads(content.decode('utf-8'))
+                        # remove SECSGML_START_BYTE and SECSGML_END_BYTE from documents
+                        for doc in metadata['documents']:
+                            if 'secsgml_start_byte' in doc:
+                                del doc['secsgml_start_byte']
+
+                            if 'secsgml_end_byte' in doc:
+                                del doc['secsgml_end_byte']
+
+                        with output_path.open('w', encoding='utf-8') as f:
+                            json.dump(metadata, f)
+                    else:
+                        # Write to output directory
+                        output_path.parent.mkdir(parents=True, exist_ok=True)
+                        with output_path.open('wb') as f:
+                            f.write(content)
 
         # delete original file
         self.path.unlink()
{datamule-1.5.2.dist-info → datamule-1.5.4.dist-info}/RECORD
CHANGED
@@ -3,9 +3,9 @@ datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
 datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
 datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
 datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
-datamule/portfolio.py,sha256=
+datamule/portfolio.py,sha256=iW54frGfoCQb-6aYfocDqQQPe0gc_22voedv0It_1q0,7517
 datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
-datamule/submission.py,sha256=
+datamule/submission.py,sha256=6JIi-ayLL-jENVj6Q4IhmrYlAreJI7xBAHP_NYaDB6k,12918
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
 datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/document/document.py,sha256=04Rivdphq0D1HEGIBjtl1LelJr-IyQU1qCMi8yNJajw,14038
@@ -44,9 +44,9 @@ datamule/sec/utils.py,sha256=JUxwijJiqRMnRJNQzVUamyF5h9ZGc7RnO_zsLOIM73g,2079
 datamule/sec/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNsBw5Jv0Tx5aljiGUJkk7DRk,18745
 datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/sec/submissions/downloader.py,sha256=
+datamule/sec/submissions/downloader.py,sha256=tDWn8bsK9XabQo2pBGYSiqTw37MmqM8rEma8Ph7zp-o,1391
 datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
-datamule/sec/submissions/monitor.py,sha256=
+datamule/sec/submissions/monitor.py,sha256=CvpHywnrn4Lwk_3rWRE5K5UNYrdJ9Gyon97Uo0Ocq-4,8985
 datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
 datamule/sec/submissions/textsearch.py,sha256=MKDXEz_VI_0ljl73_aw2lx4MVzJW5uDt8KxjvJBwPwM,5794
 datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -56,9 +56,9 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H
 datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
-datamule/seclibrary/downloader.py,sha256=
+datamule/seclibrary/downloader.py,sha256=wNRURTGb3eqg12Ltt4578L0WcAm7DmCWg0Rm0Om6Z4U,17959
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
-datamule-1.5.
-datamule-1.5.
-datamule-1.5.
-datamule-1.5.
+datamule-1.5.4.dist-info/METADATA,sha256=jl-zXUtvVrWz4Etn1BW8zsZ2AQ7CaE-zDF18sS0Lf7E,469
+datamule-1.5.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-1.5.4.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.5.4.dist-info/RECORD,,
{datamule-1.5.2.dist-info → datamule-1.5.4.dist-info}/WHEEL
File without changes

{datamule-1.5.2.dist-info → datamule-1.5.4.dist-info}/top_level.txt
File without changes