datamule 2.3.8.tar.gz → 2.4.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of datamule might be problematic.
- {datamule-2.3.8 → datamule-2.4.0}/PKG-INFO +1 -1
- {datamule-2.3.8 → datamule-2.4.0}/datamule/datamule/downloader.py +3 -2
- datamule-2.4.0/datamule/datamule/tar_downloader.py +719 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/portfolio/portfolio.py +20 -2
- datamule-2.4.0/datamule/providers/providers.py +6 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/sec/submissions/streamer.py +1 -1
- {datamule-2.3.8 → datamule-2.4.0}/datamule/submission/submission.py +5 -2
- {datamule-2.3.8 → datamule-2.4.0}/datamule/submission/tar_submission.py +25 -23
- datamule-2.4.0/datamule/utils/__init__.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-2.3.8 → datamule-2.4.0}/datamule.egg-info/SOURCES.txt +3 -0
- {datamule-2.3.8 → datamule-2.4.0}/setup.py +1 -1
- {datamule-2.3.8 → datamule-2.4.0}/datamule/__init__.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/book/__init__.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/book/book.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/book/s3transfer.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/cloud/__init__.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/config.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/datamule/__init__.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/datamule/datamule_lookup.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/datamule/datamule_mysql_rds.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/datamule/sec_connector.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/datasets.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/document/__init__.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/document/document.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/helper.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/index.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/package_updater.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/portfolio/__init__.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/portfolio/portfolio_compression_utils_legacy.py +0 -0
- {datamule-2.3.8/datamule/sec → datamule-2.4.0/datamule/providers}/__init__.py +0 -0
- {datamule-2.3.8/datamule/sec/infrastructure → datamule-2.4.0/datamule/sec}/__init__.py +0 -0
- {datamule-2.3.8/datamule/sec/submissions → datamule-2.4.0/datamule/sec/infrastructure}/__init__.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-2.3.8/datamule/sec/xbrl → datamule-2.4.0/datamule/sec/submissions}/__init__.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/sec/utils.py +0 -0
- {datamule-2.3.8/datamule/seclibrary → datamule-2.4.0/datamule/sec/xbrl}/__init__.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-2.3.8/datamule/sheet → datamule-2.4.0/datamule/seclibrary}/__init__.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/seclibrary/bq.py +0 -0
- {datamule-2.3.8/datamule/submission → datamule-2.4.0/datamule/sheet}/__init__.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/sheet/sheet.py +0 -0
- {datamule-2.3.8/datamule/tables → datamule-2.4.0/datamule/submission}/__init__.py +0 -0
- {datamule-2.3.8/datamule/tags → datamule-2.4.0/datamule/tables}/__init__.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/tables/tables.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/tables/tables_13fhr.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/tables/tables_25nse.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/tables/tables_informationtable.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/tables/tables_npx.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/tables/tables_ownership.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/tables/tables_proxyvotingrecord.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/tables/tables_sbsef.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/tables/tables_sdr.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/tables/utils.py +0 -0
- {datamule-2.3.8/datamule/utils → datamule-2.4.0/datamule/tags}/__init__.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/tags/config.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/tags/dictionaries.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/tags/regex.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/tags/utils.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/utils/construct_submissions_data.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/utils/format_accession.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule/utils/pdf.py +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule.egg-info/requires.txt +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/datamule.egg-info/top_level.txt +0 -0
- {datamule-2.3.8 → datamule-2.4.0}/setup.cfg +0 -0
{datamule-2.3.8 → datamule-2.4.0}/datamule/datamule/downloader.py
@@ -19,8 +19,9 @@ from secsgml import parse_sgml_content_into_memory
 from secsgml.utils import bytes_to_str
 from .datamule_lookup import datamule_lookup
 from ..utils.format_accession import format_accession
+from ..providers.providers import SEC_FILINGS_SGML_BUCKET_ENDPOINT

-# could be cleaned up
+# TODO could be cleaned up

 # Set up logging
 logging.basicConfig(
@@ -33,7 +34,7 @@ logger = logging.getLogger(__name__)

 class Downloader:
     def __init__(self, api_key=None):
-        self.BASE_URL =
+        self.BASE_URL = SEC_FILINGS_SGML_BUCKET_ENDPOINT
         self.CHUNK_SIZE = 2 * 1024 * 1024
         self.MAX_CONCURRENT_DOWNLOADS = 100
         self.MAX_DECOMPRESSION_WORKERS = cpu_count()
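The import above pulls SEC_FILINGS_SGML_BUCKET_ENDPOINT from the new datamule/providers/providers.py (+6 lines, not expanded in this view). Presumably it just centralizes the bucket endpoint constants used by the SGML and tar downloaders; a minimal sketch under that assumption, with placeholder URLs since the real endpoints are not shown in this diff:

# datamule/providers/providers.py -- hypothetical sketch, not the actual file contents
SEC_FILINGS_SGML_BUCKET_ENDPOINT = "https://sgml-bucket.example/"  # placeholder URL
SEC_FILINGS_TAR_BUCKET_ENDPOINT = "https://tar-bucket.example/"    # placeholder URL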
datamule-2.4.0/datamule/datamule/tar_downloader.py (new file)
@@ -0,0 +1,719 @@
+import os
+import asyncio
+import aiohttp
+from tqdm import tqdm
+import time
+import ssl
+import zstandard as zstd
+import io
+import json
+import tarfile
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from queue import Queue
+from threading import Thread, Lock
+from os import cpu_count
+from .datamule_lookup import datamule_lookup
+from ..utils.format_accession import format_accession
+from ..providers.providers import SEC_FILINGS_TAR_BUCKET_ENDPOINT
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=logging.getLogger().handlers,
+)
+logger = logging.getLogger(__name__)
+
+
+class TarDownloader:
+    def __init__(self, api_key=None):
+        self.BASE_URL = SEC_FILINGS_TAR_BUCKET_ENDPOINT
+        self.CHUNK_SIZE = 2 * 1024 * 1024
+        self.MAX_CONCURRENT_DOWNLOADS = 100
+        self.MAX_EXTRACTION_WORKERS = cpu_count()
+        self.MAX_TAR_WORKERS = cpu_count()
+        self.RANGE_MERGE_THRESHOLD = 1024  # Merge ranges if gap <= 1024 bytes
+        if api_key is not None:
+            self._api_key = api_key
+        self.loop = asyncio.new_event_loop()
+        self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
+        self.loop_thread.start()
+        self.async_queue = Queue()
+        self.error_log_lock = Lock()
+
+    def _run_event_loop(self):
+        asyncio.set_event_loop(self.loop)
+        self.loop.run_forever()
+
+    def _run_coroutine(self, coro):
+        future = asyncio.run_coroutine_threadsafe(coro, self.loop)
+        return future.result()
+
+    @property
+    def api_key(self):
+        return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')
+
+    @api_key.setter
+    def api_key(self, value):
+        if not value:
+            raise ValueError("API key cannot be empty")
+        self._api_key = value
+
+    def _log_error(self, output_dir, filename, error_msg):
+        error_file = os.path.join(output_dir, 'errors.json')
+        with self.error_log_lock:
+            try:
+                if os.path.exists(error_file):
+                    with open(error_file, 'r') as f:
+                        errors = json.load(f)
+                else:
+                    errors = {}
+
+                errors[filename] = str(error_msg)
+
+                with open(error_file, 'w') as f:
+                    json.dump(errors, f, indent=2)
+            except Exception as e:
+                logger.error(f"Failed to log error to {error_file}: {str(e)}")
+
+    def _get_document_ranges(self, accession_num, keep_document_types, range_lookup_db=None):
+        """
+        Get byte ranges for requested document types.
+
+        Args:
+            accession_num: The accession number
+            keep_document_types: List of document types to retrieve
+            range_lookup_db: Future database connection for looking up ranges
+
+        Returns:
+            dict mapping document_type to (start_byte, end_byte)
+        """
+        if range_lookup_db is not None:
+            # Future: Query database for ranges
+            # return range_lookup_db.get_ranges(accession_num, keep_document_types)
+            pass
+
+        # Hardcoded ranges for now
+        ranges = {}
+        if 'metadata' in keep_document_types:
+            # Metadata is always first 128KB
+            ranges['metadata'] = (0, 131071)
+
+        return ranges
+
+    def _merge_ranges(self, ranges):
+        """
+        Merge overlapping or close ranges.
+
+        Args:
+            ranges: dict mapping document_type to (start_byte, end_byte)
+
+        Returns:
+            list of merged (start_byte, end_byte) tuples, sorted
+        """
+        if not ranges:
+            return []
+
+        # Extract and sort ranges by start byte
+        range_list = sorted(ranges.values(), key=lambda x: x[0])
+
+        merged = []
+        current_start, current_end = range_list[0]
+
+        for start, end in range_list[1:]:
+            # Check if ranges overlap or are within merge threshold
+            if start <= current_end + self.RANGE_MERGE_THRESHOLD:
+                # Merge: extend current range
+                current_end = max(current_end, end)
+            else:
+                # No merge: save current range and start new one
+                merged.append((current_start, current_end))
+                current_start, current_end = start, end
+
+        # Add the last range
+        merged.append((current_start, current_end))
+
+        return merged
+
+    def _build_range_header(self, merged_ranges):
+        """
+        Build HTTP Range header from merged ranges.
+
+        Args:
+            merged_ranges: list of (start_byte, end_byte) tuples
+
+        Returns:
+            Range header string, e.g., "bytes=0-131071,200000-300000"
+        """
+        if not merged_ranges:
+            return None
+
+        range_specs = [f"{start}-{end}" for start, end in merged_ranges]
+        return f"bytes={','.join(range_specs)}"
+
+    def _parse_tar_header(self, header_bytes):
+        """
+        Parse a 512-byte tar header.
+
+        Returns:
+            dict with 'name', 'size', or None if invalid header
+        """
+        if len(header_bytes) < 512:
+            return None
+
+        # Check if it's a zero block (end of archive)
+        if header_bytes == b'\x00' * 512:
+            return None
+
+        try:
+            # Tar header format (POSIX ustar)
+            name = header_bytes[0:100].split(b'\x00')[0].decode('utf-8')
+            size_str = header_bytes[124:136].split(b'\x00')[0].decode('utf-8').strip()
+
+            if not size_str:
+                return None
+
+            # Size is in octal
+            size = int(size_str, 8)
+
+            return {
+                'name': name,
+                'size': size
+            }
+        except:
+            return None
+
+    def _extract_files_from_partial_tar(self, tar_bytes):
+        """
+        Extract files from partial tar data by manually parsing headers.
+
+        Args:
+            tar_bytes: Raw bytes from partial tar download
+
+        Returns:
+            list of dicts with 'name' and 'content'
+        """
+        files = []
+        offset = 0
+
+        while offset + 512 <= len(tar_bytes):
+            # Read header
+            header = self._parse_tar_header(tar_bytes[offset:offset+512])
+
+            if header is None:
+                # End of archive or invalid header
+                break
+
+            offset += 512  # Move past header
+
+            # Calculate file content end and padding
+            file_size = header['size']
+            content_end = offset + file_size
+
+            # Check if we have the full file content
+            if content_end > len(tar_bytes):
+                # File is truncated, skip it
+                break
+
+            # Extract file content
+            content = tar_bytes[offset:content_end]
+
+            files.append({
+                'name': os.path.basename(header['name']),
+                'content': content
+            })
+
+            # Move to next 512-byte boundary
+            padding = (512 - (file_size % 512)) % 512
+            offset = content_end + padding
+
+        return files
+
+    def _build_filename_to_type_map(self, metadata_content):
+        """
+        Parse metadata and build a mapping of filename to document type.
+
+        Args:
+            metadata_content: The metadata.json content as bytes
+
+        Returns:
+            dict mapping filename to document type
+        """
+        try:
+            metadata = json.loads(metadata_content)
+            filename_map = {}
+
+            if 'documents' in metadata:
+                for doc in metadata['documents']:
+                    filename = doc.get('filename')
+                    doc_type = doc.get('type')
+                    if filename and doc_type:
+                        filename_map[filename] = doc_type
+
+            return filename_map
+        except:
+            return {}
+
+    def _filter_documents_by_type(self, documents, filename_map, keep_document_types):
+        """
+        Filter documents based on their type from metadata.
+
+        Args:
+            documents: List of dicts with 'name' and 'content'
+            filename_map: Dict mapping filename to document type
+            keep_document_types: List of document types to keep
+
+        Returns:
+            Filtered list of documents
+        """
+        if not keep_document_types or not filename_map:
+            return documents
+
+        # 'metadata' is special - it's already handled separately
+        # Filter out 'metadata' from keep_document_types for document filtering
+        doc_types_to_keep = [dt for dt in keep_document_types if dt != 'metadata']
+
+        if not doc_types_to_keep:
+            # Only metadata requested, no other documents
+            return []
+
+        filtered = []
+        for doc in documents:
+            doc_type = filename_map.get(doc['name'])
+            if doc_type and doc_type in doc_types_to_keep:
+                filtered.append(doc)
+
+        return filtered
+
+    class TarManager:
+        def __init__(self, output_dir, num_tar_files, max_batch_size=1024*1024*1024):
+            self.output_dir = output_dir
+            self.num_tar_files = num_tar_files
+            self.max_batch_size = max_batch_size
+            self.tar_files = {}
+            self.tar_locks = {}
+            self.file_counters = {}
+            self.tar_sizes = {}
+            self.tar_sequences = {}
+
+            for i in range(num_tar_files):
+                tar_path = os.path.join(output_dir, f'batch_{i:03d}_001.tar')
+                self.tar_files[i] = tarfile.open(tar_path, 'a')
+                self.tar_locks[i] = Lock()
+                self.file_counters[i] = 0
+                self.tar_sizes[i] = 0
+                self.tar_sequences[i] = 1
+
+        def get_tar_index(self, accession_num):
+            return hash(accession_num) % self.num_tar_files
+
+        def write_submission(self, accession_num, metadata_content, documents):
+            tar_index = self.get_tar_index(accession_num)
+
+            submission_size = len(metadata_content) + sum(len(doc['content']) for doc in documents)
+
+            with self.tar_locks[tar_index]:
+                if self.tar_sizes[tar_index] > 0 and self.tar_sizes[tar_index] + submission_size > self.max_batch_size:
+                    tar = self.tar_files[tar_index]
+                    tar.close()
+
+                    self.tar_sequences[tar_index] += 1
+                    new_tar_path = os.path.join(self.output_dir, f'batch_{tar_index:03d}_{self.tar_sequences[tar_index]:03d}.tar')
+                    self.tar_files[tar_index] = tarfile.open(new_tar_path, 'a')
+                    self.file_counters[tar_index] = 0
+                    self.tar_sizes[tar_index] = 0
+
+                tar = self.tar_files[tar_index]
+
+                try:
+                    # Write metadata
+                    tarinfo = tarfile.TarInfo(name=f'{accession_num}/metadata.json')
+                    tarinfo.size = len(metadata_content)
+                    tar.addfile(tarinfo, io.BytesIO(metadata_content))
+
+                    # Write documents
+                    for doc in documents:
+                        tarinfo = tarfile.TarInfo(name=f'{accession_num}/{doc["name"]}')
+                        tarinfo.size = len(doc['content'])
+                        tar.addfile(tarinfo, io.BytesIO(doc['content']))
+
+                    self.file_counters[tar_index] += 1
+                    self.tar_sizes[tar_index] += submission_size
+                    return True
+
+                except Exception as e:
+                    logger.error(f"Error writing {accession_num} to tar {tar_index}: {str(e)}")
+                    return False
+
+        def close_all(self):
+            for i, tar in self.tar_files.items():
+                try:
+                    tar.close()
+                except Exception as e:
+                    logger.error(f"Error closing tar {i}: {str(e)}")
+
+    def _parse_multipart_byteranges(self, content, content_type):
+        """
+        Parse multipart/byteranges response.
+
+        Args:
+            content: Response body bytes
+            content_type: Content-Type header value
+
+        Returns:
+            list of (start_byte, end_byte, data) tuples
+        """
+        # Extract boundary from content type
+        if 'boundary=' not in content_type:
+            # Single range response, not multipart
+            return [(None, None, content)]
+
+        boundary = content_type.split('boundary=')[1].strip()
+        boundary_bytes = f'--{boundary}'.encode('utf-8')
+        end_boundary_bytes = f'--{boundary}--'.encode('utf-8')
+
+        parts = []
+        sections = content.split(boundary_bytes)
+
+        for section in sections[1:]:  # Skip first empty section
+            if section.startswith(end_boundary_bytes) or not section.strip():
+                continue
+
+            # Split headers from body
+            header_end = section.find(b'\r\n\r\n')
+            if header_end == -1:
+                header_end = section.find(b'\n\n')
+                if header_end == -1:
+                    continue
+                body_start = header_end + 2
+            else:
+                body_start = header_end + 4
+
+            headers = section[:header_end].decode('utf-8', errors='ignore')
+            body = section[body_start:].rstrip(b'\r\n')
+
+            # Parse Content-Range header
+            start_byte = None
+            end_byte = None
+            for line in headers.split('\n'):
+                if line.lower().startswith('content-range:'):
+                    # Format: "Content-Range: bytes START-END/TOTAL"
+                    range_part = line.split(':')[1].strip()
+                    if 'bytes ' in range_part:
+                        byte_range = range_part.split('bytes ')[1].split('/')[0]
+                        start_byte, end_byte = map(int, byte_range.split('-'))
+
+            parts.append((start_byte, end_byte, body))
+
+        return parts
+
+    def extract_and_process_tar(self, tar_content, filename, tar_manager, output_dir, keep_document_types, is_partial=False):
+        """Extract tar file and process its contents"""
+        try:
+            accession_num = filename.replace('.tar', '').split('/')[-1]
+
+            # If partial download (range request), manually parse tar headers
+            if is_partial:
+                files = self._extract_files_from_partial_tar(tar_content)
+
+                if not files:
+                    self._log_error(output_dir, filename, "No files found in partial tar")
+                    return False
+
+                # First file should be metadata
+                metadata_content = files[0]['content']
+                documents = files[1:] if len(files) > 1 else []
+
+                # Build filename to type mapping from metadata
+                filename_map = self._build_filename_to_type_map(metadata_content)
+
+                # Filter documents based on keep_document_types
+                documents = self._filter_documents_by_type(documents, filename_map, keep_document_types)
+
+            else:
+                # Full download, use tarfile library
+                tar_buffer = io.BytesIO(tar_content)
+
+                with tarfile.open(fileobj=tar_buffer, mode='r') as tar:
+                    members = tar.getmembers()
+
+                    if not members:
+                        self._log_error(output_dir, filename, "Empty tar file")
+                        return False
+
+                    # Read all files
+                    metadata_content = None
+                    documents = []
+
+                    for idx, member in enumerate(members):
+                        if member.isfile():
+                            file_content = tar.extractfile(member).read()
+
+                            if idx == 0:
+                                # First file is always metadata (never compressed)
+                                metadata_content = file_content
+                            else:
+                                member_name = os.path.basename(member.name)
+
+                                # Check if file is zstd compressed
+                                if self._is_zstd_compressed(file_content):
+                                    file_content = self._decompress_zstd(file_content)
+
+                                documents.append({
+                                    'name': member_name,
+                                    'content': file_content
+                                })
+
+                if metadata_content is None:
+                    self._log_error(output_dir, filename, "No metadata found in tar")
+                    return False
+
+                # Build filename to type mapping and filter
+                if keep_document_types:
+                    filename_map = self._build_filename_to_type_map(metadata_content)
+                    documents = self._filter_documents_by_type(documents, filename_map, keep_document_types)
+
+                tar_buffer.close()
+
+            # Write to output tar
+            success = tar_manager.write_submission(accession_num, metadata_content, documents)
+
+            if not success:
+                self._log_error(output_dir, filename, "Failed to write to output tar")
+
+            return success
+
+        except Exception as e:
+            self._log_error(output_dir, filename, f"Tar extraction error: {str(e)}")
+            return False
+
+    def _is_zstd_compressed(self, content):
+        """Check if content is zstd compressed by magic number"""
+        return len(content) >= 4 and content[:4] == b'\x28\xb5\x2f\xfd'
+
+    def _decompress_zstd(self, compressed_content):
+        """Decompress zstd content"""
+        dctx = zstd.ZstdDecompressor()
+        return dctx.decompress(compressed_content)
+
+    async def download_and_process(self, session, url, semaphore, extraction_pool, tar_manager, output_dir, pbar, keep_document_types, range_lookup_db=None):
+        async with semaphore:
+            filename = url.split('/')[-1]
+            accession_num = filename.replace('.tar', '').split('/')[-1]
+
+            api_key = self.api_key
+            if not api_key:
+                raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+            try:
+                headers = {
+                    'Connection': 'keep-alive',
+                    'Accept-Encoding': 'gzip, deflate, br',
+                    'Authorization': f'Bearer {api_key}'
+                }
+
+                # Determine if we need partial download
+                range_header = None
+                is_partial = False
+                if keep_document_types:
+                    # Get ranges for requested document types
+                    doc_ranges = self._get_document_ranges(accession_num, keep_document_types, range_lookup_db)
+
+                    if doc_ranges:
+                        # Merge ranges
+                        merged_ranges = self._merge_ranges(doc_ranges)
+
+                        # Build range header
+                        range_header = self._build_range_header(merged_ranges)
+
+                        if range_header:
+                            headers['Range'] = range_header
+                            is_partial = True
+
+                async with session.get(url, headers=headers) as response:
+                    if response.status in (200, 206):  # 200 = full, 206 = partial
+                        content_type = response.headers.get('Content-Type', '')
+
+                        # Read all chunks
+                        chunks = []
+                        async for chunk in response.content.iter_chunked(self.CHUNK_SIZE):
+                            chunks.append(chunk)
+
+                        content = b''.join(chunks)
+
+                        # Handle multipart response if needed
+                        if response.status == 206 and 'multipart/byteranges' in content_type:
+                            # Parse multipart response
+                            parts = self._parse_multipart_byteranges(content, content_type)
+
+                            # Reconstruct tar content from parts
+                            tar_content = b''.join(part[2] for part in parts)
+                        else:
+                            tar_content = content
+
+                        # Process in thread pool
+                        loop = asyncio.get_running_loop()
+                        success = await loop.run_in_executor(
+                            extraction_pool,
+                            partial(self.extract_and_process_tar, tar_content, filename, tar_manager, output_dir, keep_document_types, is_partial)
+                        )
+
+                        if not success:
+                            self._log_error(output_dir, filename, "Failed to process tar file")
+
+                    elif response.status == 401:
+                        self._log_error(output_dir, filename, "Authentication failed: Invalid API key")
+                        raise ValueError("Invalid API key")
+                    else:
+                        self._log_error(output_dir, filename, f"Download failed: Status {response.status}")
+
+                pbar.update(1)
+
+            except Exception as e:
+                self._log_error(output_dir, filename, str(e))
+                pbar.update(1)
+
+    async def process_batch(self, urls, output_dir, max_batch_size=1024*1024*1024, keep_document_types=[], range_lookup_db=None):
+        os.makedirs(output_dir, exist_ok=True)
+
+        num_tar_files = min(self.MAX_TAR_WORKERS, len(urls))
+
+        tar_manager = self.TarManager(output_dir, num_tar_files, max_batch_size)
+
+        try:
+            with tqdm(total=len(urls), desc="Downloading tar files") as pbar:
+                semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
+                extraction_pool = ThreadPoolExecutor(max_workers=self.MAX_EXTRACTION_WORKERS)
+
+                connector = aiohttp.TCPConnector(
+                    limit=self.MAX_CONCURRENT_DOWNLOADS,
+                    force_close=False,
+                    ssl=ssl.create_default_context(),
+                    ttl_dns_cache=300,
+                    keepalive_timeout=60
+                )
+
+                async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=600)) as session:
+                    tasks = [
+                        self.download_and_process(
+                            session, url, semaphore, extraction_pool,
+                            tar_manager, output_dir, pbar, keep_document_types, range_lookup_db
+                        )
+                        for url in urls
+                    ]
+                    await asyncio.gather(*tasks, return_exceptions=True)
+
+                extraction_pool.shutdown()
+
+        finally:
+            tar_manager.close_all()
+
+    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads",
+                 filtered_accession_numbers=None, skip_accession_numbers=[],
+                 max_batch_size=1024*1024*1024, accession_numbers=None, keep_document_types=[], range_lookup_db=None):
+        if self.api_key is None:
+            raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+        logger.debug("Querying SEC filings...")
+
+        if not accession_numbers:
+            filings = datamule_lookup(cik=cik, submission_type=submission_type, filing_date=filing_date,
+                                      columns=['accessionNumber'], distinct=True, page_size=25000, quiet=False, api_key=self.api_key)
+
+            if filtered_accession_numbers:
+                filtered_accession_numbers = [format_accession(item, 'int') for item in filtered_accession_numbers]
+                filings = [filing for filing in filings if filing['accessionNumber'] in filtered_accession_numbers]
+
+            if skip_accession_numbers:
+                skip_accession_numbers = [format_accession(item, 'int') for item in skip_accession_numbers]
+                filings = [filing for filing in filings if filing['accessionNumber'] not in skip_accession_numbers]
+
+            logger.debug(f"Generating URLs for {len(filings)} filings...")
+            urls = []
+            for item in filings:
+                url = f"{self.BASE_URL}{str(item['accessionNumber']).zfill(18)}.tar"
+                urls.append(url)
+        else:
+            urls = []
+            for accession in accession_numbers:
+                url = f"{self.BASE_URL}{format_accession(accession, 'no-dash').zfill(18)}.tar"
+                urls.append(url)
+
+        if not urls:
+            logger.warning("No submissions found matching the criteria")
+            return
+
+        urls = list(set(urls))
+
+        start_time = time.time()
+
+        asyncio.run(self.process_batch(urls, output_dir, max_batch_size=max_batch_size, keep_document_types=keep_document_types, range_lookup_db=range_lookup_db))
+
+        elapsed_time = time.time() - start_time
+        logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
+        logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
+
+    def __del__(self):
+        if hasattr(self, 'loop') and self.loop.is_running():
+            self.loop.call_soon_threadsafe(self.loop.stop)
+
+    def download_files_using_filename(self, filenames, output_dir="downloads", max_batch_size=1024*1024*1024, keep_document_types=[], range_lookup_db=None):
+        if self.api_key is None:
+            raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+        if not filenames:
+            raise ValueError("No filenames provided")
+
+        if not isinstance(filenames, (list, tuple)):
+            filenames = [filenames]
+
+        for filename in filenames:
+            if not isinstance(filename, str):
+                raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
+            if not filename.endswith('.tar'):
+                raise ValueError(f"Invalid filename format: {filename}. Expected .tar extension.")
+
+        logger.debug(f"Generating URLs for {len(filenames)} files...")
+        urls = []
+        for filename in filenames:
+            url = f"{self.BASE_URL}{filename}"
+            urls.append(url)
+
+        seen = set()
+        urls = [url for url in urls if not (url in seen or seen.add(url))]
+
+        logger.debug(f"Downloading {len(urls)} tar files...")
+
+        start_time = time.time()
+
+        asyncio.run(self.process_batch(urls, output_dir, max_batch_size=max_batch_size, keep_document_types=keep_document_types, range_lookup_db=range_lookup_db))
+
+        elapsed_time = time.time() - start_time
+        logger.debug(f"Processing completed in {elapsed_time:.2f} seconds")
+        logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
+
+
+def download_tar(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads",
+                 filtered_accession_numbers=None, skip_accession_numbers=[],
+                 max_batch_size=1024*1024*1024, accession_numbers=None, keep_document_types=[], range_lookup_db=None):
+
+    if filtered_accession_numbers:
+        filtered_accession_numbers = [format_accession(x, 'int') for x in filtered_accession_numbers]
+    elif filtered_accession_numbers == []:
+        raise ValueError("Applied filter resulted in empty accession numbers list")
+
+    downloader = TarDownloader(api_key=api_key)
+    downloader.download(
+        submission_type=submission_type,
+        cik=cik,
+        filing_date=filing_date,
+        output_dir=output_dir,
+        filtered_accession_numbers=filtered_accession_numbers,
+        skip_accession_numbers=skip_accession_numbers,
+        max_batch_size=max_batch_size,
+        accession_numbers=accession_numbers,
+        keep_document_types=keep_document_types,
+        range_lookup_db=range_lookup_db
+    )
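For orientation, a minimal sketch of how the new module might be called directly (the module path follows the SOURCES.txt entry below; the API key, date, and form type are illustrative, and only 'metadata' currently has a hardcoded byte range for partial downloads):

from datamule.datamule.tar_downloader import download_tar

download_tar(
    submission_type="10-K",            # illustrative form type
    filing_date="2024-01-15",          # illustrative date; accepted format assumed
    api_key="YOUR_DATAMULE_API_KEY",   # or set the DATAMULE_API_KEY environment variable
    output_dir="downloads",
    keep_document_types=["metadata"],  # triggers a ranged (partial) download of the first 128KB
)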
{datamule-2.3.8 → datamule-2.4.0}/datamule/portfolio/portfolio.py
@@ -14,6 +14,7 @@ from ..sec.xbrl.filter_xbrl import filter_xbrl
 from ..sec.submissions.monitor import Monitor
 from .portfolio_compression_utils_legacy import CompressionManager
 from ..datamule.sec_connector import SecConnector
+from ..datamule.tar_downloader import download_tar
 import shutil


@@ -34,7 +35,6 @@ class Portfolio:

         if self.path.exists():
             self._load_submissions()
-            self.submissions_loaded = True
         else:
             self.path.mkdir(parents=True, exist_ok=True)

@@ -81,6 +81,8 @@ class Portfolio:
         self.submissions = [s for s in (regular_submissions + batch_submissions) if s is not None]
         print(f"Successfully loaded {len(self.submissions)} submissions")

+        self.submissions_loaded = True
+
     def _load_batch_submissions_worker(self, batch_tar_path, pbar):
         """Worker function to load submissions from one batch tar with progress updates"""
         # Open tar handle and store it
@@ -219,8 +221,12 @@ class Portfolio:
         skip_accession_numbers = []
         if skip_existing:
             skip_accession_numbers = [sub.accession for sub in self]
-
+
+        # map legacy provider
         if provider == 'datamule':
+            provider = 'datamule-sgml'
+
+        if provider == 'datamule-sgml':
             seclibrary_download(
                 output_dir=self.path,
                 cik=cik,
@@ -234,6 +240,18 @@ class Portfolio:
                 skip_accession_numbers=skip_accession_numbers,
                 accession_numbers = accession_numbers
             )
+        elif provider == 'datamule-tar':
+            download_tar(
+                output_dir=self.path,
+                cik=cik,
+                api_key=self.api_key,
+                submission_type=submission_type,
+                filing_date=filing_date,
+                filtered_accession_numbers=filtered_accession_numbers,
+                skip_accession_numbers=skip_accession_numbers,
+                accession_numbers = accession_numbers,
+                keep_document_types=document_type
+            )
         else:
             # will later add accession_numbers arg in the free update.
             sec_download(
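As the hunk above shows, provider='datamule' is now remapped to 'datamule-sgml', and 'datamule-tar' routes to the new download_tar. A hedged sketch of driving this from a Portfolio; the method name download_submissions and the constructor argument are assumptions, since the public signature is not part of this diff:

from datamule import Portfolio

portfolio = Portfolio("filings")           # output directory; argument name assumed
portfolio.download_submissions(            # method name assumed, not shown in this hunk
    provider="datamule-tar",
    submission_type="10-K",
    filing_date="2024-01-15",
    document_type=["metadata"],            # forwarded to download_tar as keep_document_types
)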
{datamule-2.3.8 → datamule-2.4.0}/datamule/sec/submissions/streamer.py
@@ -82,7 +82,7 @@ class Streamer(EFTSQuery):
         if self.accession_numbers is not None and accno_w_dash not in self.accession_numbers:
             return None, None, None

-        if self.skip_accession_numbers is not None and
+        if self.skip_accession_numbers is not None and accno_no_dash in self.skip_accession_numbers:
             return None, None, None

         # Construct the URL
{datamule-2.3.8 → datamule-2.4.0}/datamule/submission/submission.py
@@ -83,6 +83,7 @@ class Submission:
         self._tar = None
         self._tar_compression_type = 'zstd'
         self._tar_compression_level = 3
+        self._tar_compression_threshold = None
         self._accession_year_2d = None
         self._documents = None

@@ -380,9 +381,10 @@ class Submission:
     def tar(self):
         return self._tar_submission().getvalue()

-    def set_tar_compression(self,compression_type='zstd',level=3):
+    def set_tar_compression(self,compression_type='zstd',level=3,threshold=None):
         self._tar_compression_type = compression_type
         self._tar_compression_level = level
+        self._tar_compression_threshold = threshold

     def _tar_submission(self):
         if self._tar is not None:
@@ -393,7 +395,8 @@ class Submission:
             documents_obj_list=documents_obj_list,
             metadata=self.metadata.content,
             compression_type=self._tar_compression_type,
-            level=self._tar_compression_level
+            level=self._tar_compression_level,
+            threshold=self._tar_compression_threshold
         )
         return self._tar

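The new threshold argument lets small documents skip zstd compression when a submission is tarred. A brief sketch, assuming submission is an already-loaded Submission object:

# Documents under 64 KB are stored uncompressed; larger ones use zstd level 3.
submission.set_tar_compression(compression_type='zstd', level=3, threshold=64 * 1024)
tar_bytes = submission.tar()  # per the tar() method above; returns the archive bytes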
{datamule-2.3.8 → datamule-2.4.0}/datamule/submission/tar_submission.py
@@ -4,43 +4,42 @@ import tarfile
 import io
 import json

-# Note: we don't actually need accession at this level. TODO

-def compress_content(content, compression_type, level):
+def compress_content(content, compression_type, level, threshold):
     if compression_type == 'zstd':
-        # Create compressor with specified level
-        compressor = zstd.ZstdCompressor(level=level)
-
         # Handle string content
-        # This should never be called
         if isinstance(content, str):
             content_bytes = content.encode('utf-8')
         else:
             content_bytes = content
-
-        #
+
+        # If content smaller than threshold, return uncompressed
+        if threshold is not None and len(content_bytes) < threshold:
+            return content_bytes
+
+        # Compress with specified level
+        compressor = zstd.ZstdCompressor(level=level)
         return compressor.compress(content_bytes)
-
+
     # Return uncompressed if not zstd
     return content

-
+
+def compress_content_list(document_tuple_list, compression_type, level, threshold):
     if compression_type is None:
         return document_tuple_list

     if level is None:
         level = 3

-    # Create new list to avoid modifying original
     compressed_list = []
-    for
-
-        accession = document_tuple[1]
-        compressed_content = compress_content(content, compression_type, level)
+    for content, accession in document_tuple_list:
+        compressed_content = compress_content(content, compression_type, level, threshold)
         compressed_list.append((compressed_content, accession))

     return compressed_list

+
 def tar_content_list(metadata, document_tuple_list_compressed):
     # Update metadata with compressed sizes
     for i, (content, accession) in enumerate(document_tuple_list_compressed):
@@ -65,15 +64,18 @@ def tar_content_list(metadata, document_tuple_list_compressed):
         tarinfo.size = len(content)
         tar.addfile(tarinfo, io.BytesIO(content))

-    #
-    tar_buffer.seek(0) # Reset buffer position to beginning
+    tar_buffer.seek(0) # Reset buffer position
     return tar_buffer

-
-
+
+def tar_submission(metadata, documents_obj_list, compression_type=None, level=None, threshold=None):
+    """Takes a list of documents, compresses them (if above threshold), then tars them."""
     document_tuple_list = [(doc.content, doc.accession) for doc in documents_obj_list]
-    document_tuple_list_compressed = compress_content_list(
-
-
+    document_tuple_list_compressed = compress_content_list(
+        document_tuple_list,
+        compression_type=compression_type,
+        level=level,
+        threshold=threshold
+    )

-    return tar_content_list(metadata, document_tuple_list_compressed)
+    return tar_content_list(metadata, document_tuple_list_compressed)
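To make the threshold semantics concrete, a small illustration of the rewritten helper (zstandard installed; the 1024-byte threshold is arbitrary):

from datamule.submission.tar_submission import compress_content

small = compress_content(b"tiny", compression_type='zstd', level=3, threshold=1024)
large = compress_content(b"x" * 4096, compression_type='zstd', level=3, threshold=1024)

assert small == b"tiny"                  # below threshold: returned as-is
assert large[:4] == b'\x28\xb5\x2f\xfd'  # above threshold: starts with the zstd frame magic number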
{datamule-2.3.8 → datamule-2.4.0}/datamule.egg-info/SOURCES.txt
@@ -20,6 +20,7 @@ datamule/datamule/datamule_lookup.py
 datamule/datamule/datamule_mysql_rds.py
 datamule/datamule/downloader.py
 datamule/datamule/sec_connector.py
+datamule/datamule/tar_downloader.py
 datamule/document/__init__.py
 datamule/document/document.py
 datamule/mapping_dicts/__init__.py
@@ -29,6 +30,8 @@ datamule/mapping_dicts/xml_mapping_dicts.py
 datamule/portfolio/__init__.py
 datamule/portfolio/portfolio.py
 datamule/portfolio/portfolio_compression_utils_legacy.py
+datamule/providers/__init__.py
+datamule/providers/providers.py
 datamule/sec/__init__.py
 datamule/sec/utils.py
 datamule/sec/infrastructure/__init__.py
{datamule-2.3.8 → datamule-2.4.0}/setup.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.3.8",
+    version="2.4.0",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",