datamule 1.6.0__py3-none-any.whl → 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/portfolio.py +92 -12
- datamule/sec/submissions/monitor.py +115 -75
- datamule/seclibrary/downloader.py +163 -161
- datamule/submission.py +102 -66
- datamule/utils/__init__.py +0 -0
- datamule/utils/construct_submissions_data.py +150 -0
- {datamule-1.6.0.dist-info → datamule-1.6.2.dist-info}/METADATA +1 -1
- {datamule-1.6.0.dist-info → datamule-1.6.2.dist-info}/RECORD +10 -8
- {datamule-1.6.0.dist-info → datamule-1.6.2.dist-info}/WHEEL +0 -0
- {datamule-1.6.0.dist-info → datamule-1.6.2.dist-info}/top_level.txt +0 -0
datamule/portfolio.py
CHANGED
@@ -1,11 +1,13 @@
 from pathlib import Path
 from tqdm import tqdm
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from .submission import Submission
 from .sec.submissions.downloader import download as sec_download
 from .sec.submissions.textsearch import filter_text
 from .config import Config
 import os
+import tarfile
+from threading import Lock
 from .helper import _process_cik_and_metadata_filters
 from .seclibrary.downloader import download as seclibrary_download
 from .sec.xbrl.filter_xbrl import filter_xbrl
@@ -21,6 +23,10 @@ class Portfolio:
         self.submissions = []
         self.submissions_loaded = False
         self.MAX_WORKERS = os.cpu_count() - 1
+
+        # Batch tar support
+        self.batch_tar_handles = {}  # {batch_tar_path: tarfile_handle}
+        self.batch_tar_locks = {}  # {batch_tar_path: threading.Lock}
 
         self.monitor = Monitor()
 
@@ -34,9 +40,13 @@ class Portfolio:
         self.api_key = api_key
 
     def _load_submissions(self):
-
-
+        print(f"Loading submissions")
+
+        # Separate regular and batch items
+        regular_items = [f for f in self.path.iterdir() if (f.is_dir() or f.suffix=='.tar') and 'batch' not in f.name]
+        batch_tars = [f for f in self.path.iterdir() if f.is_file() and 'batch' in f.name and f.suffix == '.tar']
 
+        # Load regular submissions (existing logic)
         def load_submission(folder):
             try:
                 return Submission(folder)
@@ -44,17 +54,86 @@ class Portfolio:
                 print(f"Error loading submission from {folder}: {str(e)}")
                 return None
 
-
-
-
-
-
-
-
-
-
+        regular_submissions = []
+        if regular_items:
+            with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+                regular_submissions = list(tqdm(
+                    executor.map(load_submission, regular_items),
+                    total=len(regular_items),
+                    desc="Loading regular submissions"
+                ))
+
+        # Load batch submissions with parallel processing + progress
+        batch_submissions = []
+        if batch_tars:
+            with tqdm(desc="Loading batch submissions", unit="submissions") as pbar:
+                with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+                    # Submit all batch tar jobs
+                    futures = [
+                        executor.submit(self._load_batch_submissions_worker, batch_tar, pbar)
+                        for batch_tar in batch_tars
+                    ]
+
+                    # Collect results as they complete
+                    for future in as_completed(futures):
+                        try:
+                            batch_submissions.extend(future.result())
+                        except Exception as e:
+                            print(f"Error in batch processing: {str(e)}")
+
+        # Combine and filter None values
+        self.submissions = [s for s in (regular_submissions + batch_submissions) if s is not None]
         print(f"Successfully loaded {len(self.submissions)} submissions")
 
+    def _load_batch_submissions_worker(self, batch_tar_path, pbar):
+        """Worker function to load submissions from one batch tar with progress updates"""
+        try:
+            # Open tar handle and store it
+            tar_handle = tarfile.open(batch_tar_path, 'r')
+            self.batch_tar_handles[batch_tar_path] = tar_handle
+            self.batch_tar_locks[batch_tar_path] = Lock()
+
+            # Find all accession directories
+            accession_prefixes = set()
+            for member in tar_handle.getmembers():
+                if '/' in member.name and member.name.endswith('metadata.json'):
+                    accession_prefix = member.name.split('/')[0]
+                    accession_prefixes.add(accession_prefix)
+
+            # Create submissions for each accession
+            submissions = []
+            for accession_prefix in accession_prefixes:
+                try:
+                    submission = Submission(
+                        batch_tar_path=batch_tar_path,
+                        accession_prefix=accession_prefix,
+                        portfolio_ref=self
+                    )
+                    submissions.append(submission)
+                    pbar.update(1)  # Update progress for each successful submission
+                except Exception as e:
+                    print(f"Error loading batch submission {accession_prefix} from {batch_tar_path.name}: {str(e)}")
+
+            return submissions
+
+        except Exception as e:
+            print(f"Error loading batch tar {batch_tar_path}: {str(e)}")
+            return []
+
+    def _close_batch_handles(self):
+        """Close all open batch tar handles to free resources"""
+        for handle in self.batch_tar_handles.values():
+            try:
+                handle.close()
+            except Exception as e:
+                print(f"Error closing batch tar handle: {str(e)}")
+        self.batch_tar_handles.clear()
+        self.batch_tar_locks.clear()
+
+    def __del__(self):
+        """Cleanup batch tar handles on destruction"""
+        self._close_batch_handles()
+
     def process_submissions(self, callback):
         """Process all submissions using a thread pool."""
         if not self.submissions_loaded:
@@ -169,6 +248,7 @@ class Portfolio:
         )
 
         self.submissions_loaded = False
+
     def monitor_submissions(self, data_callback=None, interval_callback=None,
                             polling_interval=1000, quiet=True, start_date=None,
                             validation_interval=600000):
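The portfolio changes above add batch-tar support: `_load_submissions` now treats `batch_*.tar` archives in the portfolio directory as containers holding many submissions, keeping one open `tarfile` handle and one `threading.Lock` per archive so documents can be read lazily and thread-safely. A minimal usage sketch follows; the directory name is hypothetical and it assumes the usual top-level `Portfolio` export rather than anything specific to this diff.

```python
# Illustrative sketch, not from the package docs: point a Portfolio at a
# directory that contains batch_*.tar archives written by datamule 1.6.2.
from datamule import Portfolio

portfolio = Portfolio("downloads")  # hypothetical local path

def count_documents(submission):
    # Each Submission exposes its parsed metadata.json via submission.metadata.content
    return len(submission.metadata.content['documents'])

# Per the diff above, process_submissions loads submissions on first use and
# runs the callback across them in a thread pool.
portfolio.process_submissions(count_documents)
```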
datamule/sec/submissions/monitor.py
CHANGED
@@ -9,16 +9,14 @@ from .eftsquery import EFTSQuery
 import aiohttp
 from zoneinfo import ZoneInfo
 
-async def poll_rss(limiter):
+async def poll_rss(limiter, session):
     base_url = 'https://www.sec.gov/cgi-bin/browse-edgar?count=100&action=getcurrent&output=rss'
 
-    #
-    async with
-    # Use the
-    async with
-
-            async with session.get(base_url) as response:
-                content = await response.read()
+    # Use the rate limiter before making the request
+    async with limiter:
+        # Use the provided session instead of creating a new one
+        async with session.get(base_url) as response:
+            content = await response.read()
 
     # Process the content
     content_str = content.decode('utf-8')
@@ -70,12 +68,31 @@ class Monitor():
         self.ratelimiters = {'sec.gov': PreciseRateLimiter(rate=5)}
         self.efts_query = EFTSQuery(quiet=True)
         self.efts_query.limiter = self.ratelimiters['sec.gov']
+        self.session = None
+        self.session_created_at = 0
+        self.session_lifetime = 300  # 5 minutes in seconds
 
     def set_domain_rate_limit(self, domain, rate):
         self.ratelimiters[domain] = PreciseRateLimiter(rate=rate)
         if domain == 'sec.gov':
             self.efts_query.limiter = self.ratelimiters[domain]
 
+    async def _ensure_fresh_session(self):
+        """Ensure we have a fresh session, recreating if expired or missing"""
+        current_time = time.time()
+
+        # Check if we need a new session
+        if (self.session is None or
+            current_time - self.session_created_at > self.session_lifetime):
+
+            # Close old session if it exists
+            if self.session:
+                await self.session.close()
+
+            # Create new session
+            self.session = aiohttp.ClientSession(headers=headers)
+            self.session_created_at = current_time
+
     async def _async_run_efts_query(self, **kwargs):
         """Async helper method to run EFTS query without creating a new event loop"""
         # Make sure to set quiet parameter if provided in kwargs
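The `_ensure_fresh_session` helper above gives the monitor one shared `aiohttp.ClientSession` that is closed and recreated once it is older than `session_lifetime` seconds. The same expiring-session pattern in isolation, with illustrative names that are not part of the package:

```python
import time
import aiohttp

class ExpiringSession:
    """Recreate an aiohttp ClientSession after a fixed lifetime (illustrative sketch)."""

    def __init__(self, lifetime=300, headers=None):
        self.lifetime = lifetime
        self.headers = headers or {}
        self._session = None
        self._created_at = 0.0

    async def get(self):
        # Recreate the session if it is missing or has expired.
        if self._session is None or time.time() - self._created_at > self.lifetime:
            if self._session is not None:
                await self._session.close()
            self._session = aiohttp.ClientSession(headers=self.headers)
            self._created_at = time.time()
        return self._session

    async def close(self):
        if self._session is not None:
            await self._session.close()
            self._session = None
```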
@@ -103,83 +120,106 @@ class Monitor():
         if polling_interval is None and validation_interval is None:
             raise ValueError("At least one of polling_interval or validation_interval must be specified")
 
-        #
-
-            today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
-            if not quiet:
-                print(f"Backfilling from {start_date} to {today_date}")
-
-            hits = clean_efts_hits(await self._async_run_efts_query(
-                filing_date=(start_date, today_date),
-                quiet=quiet
-            ))
-
-            new_hits = self._filter_new_accessions(hits)
-            if not quiet:
-                print(f"New submissions found: {len(new_hits)}")
-            if new_hits and data_callback:
-                data_callback(new_hits)
-
-        # Initialize timing variables
-        current_time = time.time()
-        last_polling_time = current_time
-        last_validation_time = current_time
-
-        # Determine which operations to perform
-        do_polling = polling_interval is not None
-        do_validation = validation_interval is not None
+        # Ensure we have a fresh session
+        await self._ensure_fresh_session()
 
-
-
-
-            # RSS polling (if enabled)
-            if do_polling and (current_time - last_polling_time) >= polling_interval/1000:
-                if not quiet:
-                    print(f"Polling RSS feed")
-                results = await poll_rss(self.ratelimiters['sec.gov'])
-                new_results = self._filter_new_accessions(results)
-                if new_results:
-                    if not quiet:
-                        print(f"Found {len(new_results)} new submissions via RSS")
-                    if data_callback:
-                        data_callback(new_results)
-                last_polling_time = current_time
-
-            # EFTS validation (if enabled)
-            if do_validation and (current_time - last_validation_time) >= validation_interval/1000:
-                # Get submissions from the last 24 hours for validation
+        try:
+            # Backfill if start_date is provided
+            if start_date is not None:
                 today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
                 if not quiet:
-                    print(f"
+                    print(f"Backfilling from {start_date} to {today_date}")
 
                 hits = clean_efts_hits(await self._async_run_efts_query(
-                    filing_date=(
+                    filing_date=(start_date, today_date),
                     quiet=quiet
                 ))
-
+
                 new_hits = self._filter_new_accessions(hits)
-                if
-
-
-
-
-
+                if not quiet:
+                    print(f"New submissions found: {len(new_hits)}")
+                if new_hits and data_callback:
+                    data_callback(new_hits)
+
+            # Initialize timing variables
+            current_time = time.time()
+            last_polling_time = current_time
+            last_validation_time = current_time
 
-            #
-
-
-
-            # Calculate next wake-up time
-            next_times = []
-            if do_polling:
-                next_times.append(last_polling_time + (polling_interval / 1000))
-            if do_validation:
-                next_times.append(last_validation_time + (validation_interval / 1000))
+            # Determine which operations to perform
+            do_polling = polling_interval is not None
+            do_validation = validation_interval is not None
 
-
-
-
-
+            while True:
+                current_time = time.time()
+
+                # RSS polling (if enabled)
+                if do_polling and (current_time - last_polling_time) >= polling_interval/1000:
+                    if not quiet:
+                        print(f"Polling RSS feed")
+
+                    # Ensure session is fresh before polling
+                    await self._ensure_fresh_session()
+
+                    try:
+                        results = await poll_rss(self.ratelimiters['sec.gov'], self.session)
+                        new_results = self._filter_new_accessions(results)
+                        if new_results:
+                            if not quiet:
+                                print(f"Found {len(new_results)} new submissions via RSS")
+                            if data_callback:
+                                data_callback(new_results)
+                    except Exception as e:
+                        if not quiet:
+                            print(f"RSS polling error: {e}, will recreate session on next poll")
+                        # Force session recreation on next poll
+                        if self.session:
+                            await self.session.close()
+                        self.session = None
+
+                    last_polling_time = current_time
+
+                # EFTS validation (if enabled)
+                if do_validation and (current_time - last_validation_time) >= validation_interval/1000:
+                    # Get submissions from the last 24 hours for validation
+                    today_date = datetime.now(ZoneInfo("America/New_York")).strftime('%Y-%m-%d')
+                    if not quiet:
+                        print(f"Validating submissions from {today_date}")
+
+                    hits = clean_efts_hits(await self._async_run_efts_query(
+                        filing_date=(today_date, today_date),
+                        quiet=quiet
+                    ))
+
+                    new_hits = self._filter_new_accessions(hits)
+                    if new_hits:
+                        if not quiet:
+                            print(f"Found {len(new_hits)} new submissions via EFTS validation")
+                        if data_callback:
+                            data_callback(new_hits)
+                    last_validation_time = current_time
+
+                # Interval callback
+                if interval_callback:
+                    interval_callback()
+
+                # Calculate next wake-up time
+                next_times = []
+                if do_polling:
+                    next_times.append(last_polling_time + (polling_interval / 1000))
+                if do_validation:
+                    next_times.append(last_validation_time + (validation_interval / 1000))
+
+                next_wake_time = min(next_times)
+                current_time = time.time()
+                time_to_sleep = max(0, next_wake_time - current_time)
+                await asyncio.sleep(time_to_sleep)
+
+        finally:
+            # Clean up the session when done
+            if self.session:
+                await self.session.close()
+                self.session = None
 
     def monitor_submissions(self, data_callback=None, interval_callback=None,
                             polling_interval=1000, quiet=True, start_date=None,
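The rewritten monitoring loop above drives both RSS polling and EFTS validation from one `while True` loop: after each pass it computes the next due time for whichever checks are enabled and sleeps only until the earliest one. A worked example of that timing arithmetic, with illustrative numbers:

```python
# Intervals are given in milliseconds, timestamps in seconds (time.time()).
polling_interval = 1000        # poll RSS every second
validation_interval = 600000   # validate via EFTS every 10 minutes

last_polling_time = 1_000.0    # hypothetical timestamps
last_validation_time = 1_000.0
current_time = 1_000.4

next_times = [
    last_polling_time + polling_interval / 1000,         # 1001.0
    last_validation_time + validation_interval / 1000,   # 1600.0
]
time_to_sleep = max(0, min(next_times) - current_time)   # 0.6 s until the next poll
print(time_to_sleep)
```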
datamule/seclibrary/downloader.py
CHANGED
@@ -8,13 +8,15 @@ import ssl
 import zstandard as zstd
 import io
 import json
+import tarfile
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from queue import Queue, Empty
-from threading import Thread
+from threading import Thread, Lock
 from .query import query
 from os import cpu_count
-from secsgml import
+from secsgml import parse_sgml_content_into_memory
+from secsgml.utils import bytes_to_str
 
 
 
@@ -24,25 +26,19 @@ class Downloader:
         self.CHUNK_SIZE = 2 * 1024 * 1024
         self.MAX_CONCURRENT_DOWNLOADS = 100
         self.MAX_DECOMPRESSION_WORKERS = cpu_count()
-        self.
-        self.QUEUE_SIZE = 10
+        self.MAX_TAR_WORKERS = cpu_count()
         if api_key is not None:
            self._api_key = api_key
-        # Create a shared event loop for async operations
         self.loop = asyncio.new_event_loop()
-        # Create a thread to run the event loop
         self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
         self.loop_thread.start()
-        # Create a queue for async tasks
         self.async_queue = Queue()
 
     def _run_event_loop(self):
-        """Run the event loop in a separate thread"""
         asyncio.set_event_loop(self.loop)
         self.loop.run_forever()
 
     def _run_coroutine(self, coro):
-        """Run a coroutine in the event loop and return its result"""
         future = asyncio.run_coroutine_threadsafe(coro, self.loop)
         return future.result()
 
@@ -72,65 +68,94 @@ class Downloader:
         except Exception as e:
             print(f"Failed to log error to {error_file}: {str(e)}")
 
-    class
-        def __init__(self, output_dir,
-            self.processing_queue = Queue(maxsize=queue_size)
-            self.should_stop = False
-            self.processing_workers = []
+    class TarManager:
+        def __init__(self, output_dir, num_tar_files, max_batch_size=1024*1024*1024):
             self.output_dir = output_dir
-            self.
-            self.
-            self.
-            self.
-            self.
-            self.
-            self.
-
-
-
-
-
-
-            self.
-
-
-
-
-
-
-            self.
-
-
-
-
+            self.num_tar_files = num_tar_files
+            self.max_batch_size = max_batch_size
+            self.tar_files = {}
+            self.tar_locks = {}
+            self.file_counters = {}
+            self.tar_sizes = {}
+            self.tar_sequences = {}
+
+            for i in range(num_tar_files):
+                tar_path = os.path.join(output_dir, f'batch_{i:03d}_001.tar')
+                self.tar_files[i] = tarfile.open(tar_path, 'w')
+                self.tar_locks[i] = Lock()
+                self.file_counters[i] = 0
+                self.tar_sizes[i] = 0
+                self.tar_sequences[i] = 1
+
+        def get_tar_index(self, filename):
+            return hash(filename) % self.num_tar_files
+
+        def write_submission(self, filename, metadata, documents, standardize_metadata):
+            tar_index = self.get_tar_index(filename)
+            accession_num = filename.split('.')[0]
+
+            metadata_str = bytes_to_str(metadata, lower=False)
+            metadata_json = json.dumps(metadata_str).encode('utf-8')
+            submission_size = len(metadata_json) + sum(len(doc) for doc in documents)
+
+            with self.tar_locks[tar_index]:
+                if self.tar_sizes[tar_index] > 0 and self.tar_sizes[tar_index] + submission_size > self.max_batch_size:
+                    tar = self.tar_files[tar_index]
+                    tar.close()
+
+                    self.tar_sequences[tar_index] += 1
+                    new_tar_path = os.path.join(self.output_dir, f'batch_{tar_index:03d}_{self.tar_sequences[tar_index]:03d}.tar')
+                    self.tar_files[tar_index] = tarfile.open(new_tar_path, 'w')
+                    self.file_counters[tar_index] = 0
+                    self.tar_sizes[tar_index] = 0
+
+                tar = self.tar_files[tar_index]
+
                 try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    tarinfo = tarfile.TarInfo(name=f'{accession_num}/metadata.json')
+                    tarinfo.size = len(metadata_json)
+                    tar.addfile(tarinfo, io.BytesIO(metadata_json))
+
+                    for file_num, content in enumerate(documents):
+                        doc_name = self._get_document_name(metadata, file_num, standardize_metadata)
+                        tarinfo = tarfile.TarInfo(name=f'{accession_num}/{doc_name}')
+                        tarinfo.size = len(content)
+                        tar.addfile(tarinfo, io.BytesIO(content))
+
+                    self.file_counters[tar_index] += 1
+                    self.tar_sizes[tar_index] += submission_size
+                    return True
+
+                except Exception as e:
+                    print(f"Error writing {filename} to tar {tar_index}: {str(e)}")
+                    return False
+
+        def _get_document_name(self, metadata, file_num, standardize_metadata):
+            if standardize_metadata:
+                documents_key = b'documents'
+                filename_key = b'filename'
+                sequence_key = b'sequence'
+            else:
+                documents_key = b'DOCUMENTS'
+                filename_key = b'FILENAME'
+                sequence_key = b'SEQUENCE'
+
+            doc_metadata = metadata[documents_key][file_num]
+            filename = doc_metadata.get(filename_key)
+            if filename:
+                return filename.decode('utf-8')
+            else:
+                sequence = doc_metadata.get(sequence_key, b'document')
+                return sequence.decode('utf-8') + '.txt'
+
+        def close_all(self):
+            for i, tar in self.tar_files.items():
+                try:
+                    tar.close()
+                except Exception as e:
+                    print(f"Error closing tar {i}: {str(e)}")
 
-    def
+    def decompress_and_parse_and_write(self, compressed_chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir):
         dctx = zstd.ZstdDecompressor()
         try:
             input_buffer = io.BytesIO(b''.join(compressed_chunks))
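The `TarManager` above replaces per-submission output files with a fixed pool of tar archives: each submission is routed to a shard with `hash(filename) % num_tar_files`, and a shard rolls over to the next `batch_<index>_<sequence>.tar` once writing the next submission would push it past `max_batch_size`. A small standalone sketch of that sharding and naming scheme (illustrative, not the package's code):

```python
import os

def tar_index(filename, num_tar_files):
    # Same rule as TarManager.get_tar_index. Note that Python's hash() for
    # strings is salted per process (PYTHONHASHSEED), so the shard assignment
    # is stable within a run but not across runs.
    return hash(filename) % num_tar_files

def tar_path(output_dir, index, sequence):
    # batch_000_001.tar, batch_000_002.tar, ... as produced by the diff above.
    return os.path.join(output_dir, f'batch_{index:03d}_{sequence:03d}.tar')

idx = tar_index('000123456725000001.sgml.zst', 8)   # hypothetical filename
print(tar_path('downloads', idx, 1))
```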
@@ -140,11 +165,19 @@ class Downloader:
             shutil.copyfileobj(reader, decompressed_content)
 
             content = decompressed_content.getvalue()
-
-
+
+            metadata, documents = parse_sgml_content_into_memory(
+                bytes_content=content,
+                filter_document_types=keep_document_types,
+                keep_filtered_metadata=keep_filtered_metadata,
+                standardize_metadata=standardize_metadata
+            )
+
+            success = tar_manager.write_submission(filename, metadata, documents, standardize_metadata)
+            return success
 
         except Exception as e:
-            self._log_error(output_dir, filename, f"Decompression error: {str(e)}")
+            self._log_error(output_dir, filename, f"Decompression/parsing error: {str(e)}")
             return False
         finally:
             try:
@@ -153,17 +186,25 @@ class Downloader:
             except:
                 pass
 
-    def
+    def parse_and_write_regular_file(self, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir):
         try:
             content = b''.join(chunks)
-
-
+
+            metadata, documents = parse_sgml_content_into_memory(
+                bytes_content=content,
+                filter_document_types=keep_document_types,
+                keep_filtered_metadata=keep_filtered_metadata,
+                standardize_metadata=standardize_metadata
+            )
+
+            success = tar_manager.write_submission(filename, metadata, documents, standardize_metadata)
+            return success
 
         except Exception as e:
-            self._log_error(output_dir, filename, f"
+            self._log_error(output_dir, filename, f"Parsing error: {str(e)}")
             return False
 
-    async def download_and_process(self, session, url, semaphore, decompression_pool, output_dir,
+    async def download_and_process(self, session, url, semaphore, decompression_pool, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir, pbar):
         async with semaphore:
             chunks = []
             filename = url.split('/')[-1]
@@ -188,70 +229,70 @@ class Downloader:
                         if filename.endswith('.zst'):
                             success = await loop.run_in_executor(
                                 decompression_pool,
-                                partial(self.
+                                partial(self.decompress_and_parse_and_write, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
                             )
                         else:
                             success = await loop.run_in_executor(
                                 decompression_pool,
-                                partial(self.
+                                partial(self.parse_and_write_regular_file, chunks, filename, keep_document_types, keep_filtered_metadata, standardize_metadata, tar_manager, output_dir)
                             )
 
                         if not success:
                             self._log_error(output_dir, filename, "Failed to process file")
+
                     elif response.status == 401:
                         self._log_error(output_dir, filename, "Authentication failed: Invalid API key")
                         raise ValueError("Invalid API key")
                     else:
                         self._log_error(output_dir, filename, f"Download failed: Status {response.status}")
+
+                    pbar.update(1)
+
             except Exception as e:
                 self._log_error(output_dir, filename, str(e))
+                pbar.update(1)
 
-    async def process_batch(self, urls, output_dir, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
+    async def process_batch(self, urls, output_dir, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True, max_batch_size=1024*1024*1024):
         os.makedirs(output_dir, exist_ok=True)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        num_tar_files = min(self.MAX_TAR_WORKERS, len(urls))
+
+        tar_manager = self.TarManager(output_dir, num_tar_files, max_batch_size)
+
+        try:
+            with tqdm(total=len(urls), desc="Processing files") as pbar:
+                semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
+                decompression_pool = ThreadPoolExecutor(max_workers=self.MAX_DECOMPRESSION_WORKERS)
+
+                connector = aiohttp.TCPConnector(
+                    limit=self.MAX_CONCURRENT_DOWNLOADS,
+                    force_close=False,
+                    ssl=ssl.create_default_context(),
+                    ttl_dns_cache=300,
+                    keepalive_timeout=60
+                )
+
+                async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=30)) as session:
+                    tasks = [
+                        self.download_and_process(
+                            session, url, semaphore, decompression_pool,
+                            keep_document_types, keep_filtered_metadata, standardize_metadata,
+                            tar_manager, output_dir, pbar
+                        )
+                        for url in urls
+                    ]
+                    await asyncio.gather(*tasks, return_exceptions=True)
+
+                decompression_pool.shutdown()
+
+        finally:
+            tar_manager.close_all()
 
     def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True,
-                skip_accession_numbers=[]):
-        """
-        Query SEC filings and download/process them.
-
-        Parameters:
-        - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
-        - cik: Company CIK number(s), string, int, or list
-        - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
-        - output_dir: Directory to save downloaded files
-        - accession_numbers: List of specific accession numbers to download
-        - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
-        - keep_filtered_metadata: Whether to keep metadata for filtered documents
-        """
+                skip_accession_numbers=[], max_batch_size=1024*1024*1024):
         if self.api_key is None:
             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
 
-        # Query the SEC filings first - before starting any async operations
         print("Querying SEC filings...")
         filings = query(
             submission_type=submission_type,
@@ -260,19 +301,14 @@ class Downloader:
             api_key=self.api_key
         )
 
-
-        # After querying but before generating URLs
         if accession_numbers:
             accession_numbers = [str(int(item.replace('-',''))) for item in accession_numbers]
             filings = [filing for filing in filings if filing['accession_number'] in accession_numbers]
 
-
         if skip_accession_numbers:
             skip_accession_numbers = [int(item.replace('-','')) for item in skip_accession_numbers]
             filings = [filing for filing in filings if filing['accession_number'] not in skip_accession_numbers]
 
-        # Generate URLs from the query results
-
         print(f"Generating URLs for {len(filings)} filings...")
         urls = []
         for item in filings:
@@ -285,38 +321,21 @@ class Downloader:
             print("No submissions found matching the criteria")
             return
 
-        # Remove duplicates
         urls = list(set(urls))
 
-        # Now start the async processing
         start_time = time.time()
 
-
-        asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata))
+        asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types, keep_filtered_metadata=keep_filtered_metadata, standardize_metadata=standardize_metadata, max_batch_size=max_batch_size))
 
-        # Calculate and display performance metrics
         elapsed_time = time.time() - start_time
         print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
         print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
 
     def __del__(self):
-        """Cleanup when the downloader is garbage collected"""
         if hasattr(self, 'loop') and self.loop.is_running():
             self.loop.call_soon_threadsafe(self.loop.stop)
 
-
-
-    def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True):
-        """
-        Download and process SEC filings using specific filenames.
-
-        Parameters:
-        - filenames: List of specific filenames to download (e.g., ['000091205797006494.sgml', '000100704297000007.sgml.zst'])
-        - output_dir: Directory to save downloaded files
-        - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
-        - keep_filtered_metadata: Whether to keep metadata for filtered documents
-        - standardize_metadata: Whether to standardize metadata format
-        """
+    def download_files_using_filename(self, filenames, output_dir="downloads", keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True, max_batch_size=1024*1024*1024):
         if self.api_key is None:
             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
 
@@ -326,27 +345,23 @@ class Downloader:
         if not isinstance(filenames, (list, tuple)):
             filenames = [filenames]
 
-        # Validate filenames format
         for filename in filenames:
            if not isinstance(filename, str):
                 raise ValueError(f"Invalid filename type: {type(filename)}. Expected string.")
            if not (filename.endswith('.sgml') or filename.endswith('.sgml.zst')):
                 raise ValueError(f"Invalid filename format: {filename}. Expected .sgml or .sgml.zst extension.")
 
-        # Generate URLs directly from filenames
         print(f"Generating URLs for {len(filenames)} files...")
         urls = []
         for filename in filenames:
             url = f"{self.BASE_URL}{filename}"
             urls.append(url)
 
-        # Remove duplicates while preserving order
         seen = set()
         urls = [url for url in urls if not (url in seen or seen.add(url))]
 
         print(f"Downloading {len(urls)} files...")
 
-        # Process the batch asynchronously using existing infrastructure
         start_time = time.time()
 
         asyncio.run(self.process_batch(
@@ -354,33 +369,19 @@ class Downloader:
             output_dir,
             keep_document_types=keep_document_types,
             keep_filtered_metadata=keep_filtered_metadata,
-            standardize_metadata=standardize_metadata
+            standardize_metadata=standardize_metadata,
+            max_batch_size=max_batch_size
         ))
 
-        # Calculate and display performance metrics
         elapsed_time = time.time() - start_time
         print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
         print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
 
 
 def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
-            skip_accession_numbers=[]):
-    """
-    Query SEC filings and download/process them.
-
-    Parameters:
-    - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
-    - cik: Company CIK number(s), string, int, or list
-    - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
-    - api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
-    - output_dir: Directory to save downloaded files
-    - accession_numbers: List of specific accession numbers to download
-    - keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
-    - keep_filtered_metadata: Whether to keep metadata for filtered documents
-    """
+            skip_accession_numbers=[], max_batch_size=1024*1024*1024):
     if accession_numbers:
         accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
-    # check if acc no is empty list
     elif accession_numbers == []:
         raise ValueError("Applied filter resulted in empty accession numbers list")
     downloader = Downloader(api_key=api_key)
@@ -393,5 +394,6 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
         keep_document_types=keep_document_types,
         keep_filtered_metadata=keep_filtered_metadata,
         standardize_metadata=standardize_metadata,
-        skip_accession_numbers=skip_accession_numbers
-
+        skip_accession_numbers=skip_accession_numbers,
+        max_batch_size=max_batch_size
+    )
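With these changes the seclibrary downloader writes everything it fetches into batch tars, and the new `max_batch_size` parameter (default 1 GiB, i.e. 1024*1024*1024 bytes) caps each archive before it rolls over. A hedged usage sketch; the filter values, output directory, and cap below are illustrative, and `DATAMULE_API_KEY` is assumed to be set in the environment:

```python
# Illustrative sketch: fetch 10-K filings for Q1 2024 into batch tars
# capped at roughly 512 MB each.
from datamule.seclibrary.downloader import download

download(
    submission_type='10-K',
    filing_date=('2024-01-01', '2024-03-31'),
    output_dir='downloads',
    max_batch_size=512 * 1024 * 1024,
)
```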
datamule/submission.py
CHANGED
@@ -2,50 +2,15 @@ from pathlib import Path
 import json
 from .document.document import Document
 from secsgml import parse_sgml_content_into_memory
-from secsgml.utils import bytes_to_str
+from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
 from secsgml.parse_sgml import transform_metadata_string
 import tarfile
 import shutil
 import zstandard as zstd
 import gzip
 import io
-import copy
 
 
-def calculate_documents_locations_in_tar(metadata, documents):
-    # Step 1: Add placeholder byte positions to get accurate size (10-digit padded)
-    placeholder_metadata = copy.deepcopy(metadata)
-
-    for file_num in range(len(documents)):
-        if 'documents' in placeholder_metadata:
-            placeholder_metadata['documents'][file_num]['secsgml_start_byte'] = "9999999999" # 10 digits
-            placeholder_metadata['documents'][file_num]['secsgml_end_byte'] = "9999999999" # 10 digits
-
-    # Step 2: Calculate size with placeholders
-    placeholder_str = bytes_to_str(placeholder_metadata, lower=False)
-    placeholder_json = json.dumps(placeholder_str).encode('utf-8')
-    metadata_size = len(placeholder_json)
-
-    # Step 3: Now calculate actual positions using this size
-    current_pos = 512 + metadata_size
-    current_pos += (512 - (current_pos % 512)) % 512
-
-    # Step 4: Calculate real positions and update original metadata (10-digit padded)
-    for file_num, content in enumerate(documents):
-        start_byte = current_pos + 512
-        end_byte = start_byte + len(content)
-
-        if 'documents' in metadata:
-            metadata['documents'][file_num]['secsgml_start_byte'] = f"{start_byte:010d}" # 10-digit padding
-            metadata['documents'][file_num]['secsgml_end_byte'] = f"{end_byte:010d}" # 10-digit padding
-
-
-        file_total_size = 512 + len(content)
-        padded_size = file_total_size + (512 - (file_total_size % 512)) % 512
-        current_pos += padded_size
-
-    return metadata
-
 
 def write_submission_to_tar(output_path,metadata,documents,standardize_metadata,compression_list):
     # Write tar directly to disk
@@ -78,11 +43,21 @@ def write_submission_to_tar(output_path,metadata,documents,standardize_metadata,
         tar.addfile(tarinfo, io.BytesIO(content))
 
 class Submission:
-    def __init__(self, path=None,sgml_content=None,keep_document_types=None
-
-
-
-
+    def __init__(self, path=None, sgml_content=None, keep_document_types=None,
+                 batch_tar_path=None, accession_prefix=None, portfolio_ref=None):
+
+        # Validate parameters
+        param_count = sum(x is not None for x in [path, sgml_content, batch_tar_path])
+        if param_count != 1:
+            raise ValueError("Exactly one of path, sgml_content, or batch_tar_path must be provided")
+
+        if batch_tar_path is not None and (accession_prefix is None or portfolio_ref is None):
+            raise ValueError("batch_tar_path requires both accession_prefix and portfolio_ref")
+
+        # Initialize batch tar attributes
+        self.batch_tar_path = batch_tar_path
+        self.accession_prefix = accession_prefix
+        self.portfolio_ref = portfolio_ref
 
         if sgml_content is not None:
             self.path = None
@@ -100,7 +75,7 @@ class Submission:
             filtered_metadata_documents = []
 
             for idx,doc in enumerate(self.metadata.content['documents']):
-                type = doc.get('type')
+                type = doc.get('type')
 
                 # Keep only specified types
                 if keep_document_types is not None and type not in keep_document_types:
@@ -115,7 +90,26 @@ class Submission:
 
             self.metadata.content['documents'] = filtered_metadata_documents
 
-
+        elif batch_tar_path is not None:
+            # Batch tar case
+            self.path = None
+
+            # Load metadata from batch tar
+            with self.portfolio_ref.batch_tar_locks[batch_tar_path]:
+                tar_handle = self.portfolio_ref.batch_tar_handles[batch_tar_path]
+                metadata_obj = tar_handle.extractfile(f'{accession_prefix}/metadata.json')
+                metadata = json.loads(metadata_obj.read().decode('utf-8'))
+
+            # Set metadata path using :: notation
+            metadata_path = f"{batch_tar_path}::{accession_prefix}/metadata.json"
+
+            # standardize metadata
+            metadata = transform_metadata_string(metadata)
+            self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
+            self.accession = self.metadata.content['accession-number']
+            self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
+
+        elif path is not None:
             self.path = Path(path)
             if self.path.suffix == '.tar':
                 with tarfile.open(self.path,'r') as tar:
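The constructor above now accepts three mutually exclusive sources, `path`, `sgml_content`, or `batch_tar_path`, and the batch form additionally requires `accession_prefix` plus a `portfolio_ref` that owns the shared tar handle and lock. The validation itself can be exercised without any files on disk; the tar path below is hypothetical:

```python
# Illustrative: the constructor enforces that exactly one source is supplied.
from datamule.submission import Submission

try:
    Submission()  # no path, sgml_content, or batch_tar_path at all
except ValueError as e:
    print(e)  # "Exactly one of path, sgml_content, or batch_tar_path must be provided"

try:
    Submission(batch_tar_path="downloads/batch_000_001.tar")  # missing the batch extras
except ValueError as e:
    print(e)  # "batch_tar_path requires both accession_prefix and portfolio_ref"
```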
@@ -228,44 +222,86 @@ class Submission:
         doc = self.metadata.content['documents'][idx]
 
         # If loaded from sgml_content, return pre-loaded document
-        if self.path is None:
+        if self.path is None and self.batch_tar_path is None:
             return self.documents[idx]
 
-        #
+        # Get filename
         filename = doc.get('filename')
         if filename is None:
             filename = doc['sequence'] + '.txt'
 
-
-        extension = document_path.suffix
+        extension = Path(filename).suffix
 
-
-
-
-
-
-
+        # Handle batch tar case
+        if self.batch_tar_path is not None:
+            with self.portfolio_ref.batch_tar_locks[self.batch_tar_path]:
+                tar_handle = self.portfolio_ref.batch_tar_handles[self.batch_tar_path]
+
+                # Try different filename variations for compressed files
+                possible_filenames = [
+                    f'{self.accession_prefix}/{filename}',
+                    f'{self.accession_prefix}/{filename}.gz',
+                    f'{self.accession_prefix}/{filename}.zst'
+                ]
+
+                content = None
+                actual_filename = None
+                for attempt_filename in possible_filenames:
                     try:
-                        content =
+                        content = tar_handle.extractfile(attempt_filename).read()
+                        actual_filename = attempt_filename
+                        break
                     except:
-
-
-
-
-
+                        continue
+
+                if content is None:
+                    raise ValueError(f"Could not find document in batch tar: {self.batch_tar_path}, accession: {self.accession_prefix}, filename: {filename}")
+
             # Decompress if compressed
-            if
+            if actual_filename.endswith('.gz'):
                 content = gzip.decompress(content)
-            elif
+            elif actual_filename.endswith('.zst'):
                 dctx = zstd.ZstdDecompressor()
                 content = dctx.decompress(content)
+
+            # Decode text files
+            if extension in ['.htm', '.html', '.txt', '.xml']:
+                content = content.decode('utf-8', errors='replace')
+
+            document_path = f"{self.batch_tar_path}::{self.accession_prefix}/{filename}"
+
+        # Handle regular path case (existing logic)
         else:
-
-
+            document_path = self.path / filename
+
+            if self.path.suffix == '.tar':
+                with tarfile.open(self.path, 'r') as tar:
+                    # so here is where we should use bytes instead with byte offset.
+                    # bandaid fix TODO
+                    try:
+                        content = tar.extractfile(filename).read()
+                    except:
+                        try:
+                            content = tar.extractfile(filename+'.gz').read()
+                        except:
+                            try:
+                                content = tar.extractfile(filename+'.zst').read()
+                            except:
+                                # some of these issues are on SEC data end, will fix when I setup cloud.
+                                raise ValueError(f"Something went wrong with tar: {self.path}")
+                # Decompress if compressed
+                if filename.endswith('.gz'):
+                    content = gzip.decompress(content)
+                elif filename.endswith('.zst'):
+                    dctx = zstd.ZstdDecompressor()
+                    content = dctx.decompress(content)
+            else:
+                with document_path.open('rb') as f:
+                    content = f.read()
 
-
-
-
+        # Decode text files
+        if extension in ['.htm', '.html', '.txt', '.xml']:
+            content = content.decode('utf-8', errors='replace')
 
         return Document(
             type=doc['type'],
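When a document is read back out of a batch tar, the code above tries the stored filename first and then `.gz` and `.zst` variants, decompresses as needed, decodes text extensions, and records the source as `<batch tar path>::<accession>/<filename>`. A self-contained sketch of that fallback-and-decompress idea, using illustrative names rather than the package's own helpers:

```python
import gzip
import tarfile
import zstandard as zstd

def read_member_with_fallback(tar_path, member):
    """Try member, member.gz, member.zst inside a tar and decompress (illustrative)."""
    with tarfile.open(tar_path, 'r') as tar:
        for candidate in (member, member + '.gz', member + '.zst'):
            try:
                data = tar.extractfile(candidate).read()
            except KeyError:
                continue  # member not present under this name
            if candidate.endswith('.gz'):
                return gzip.decompress(data)
            if candidate.endswith('.zst'):
                return zstd.ZstdDecompressor().decompress(data)
            return data
    raise FileNotFoundError(f"{member} not found in {tar_path}")
```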
datamule/utils/__init__.py
File without changes
datamule/utils/construct_submissions_data.py
ADDED
@@ -0,0 +1,150 @@
+import zipfile
+import json
+import csv
+import os
+import tempfile
+from concurrent.futures import ThreadPoolExecutor
+import threading
+from tqdm import tqdm
+import urllib.request
+
+headers = {'User-Agent': 'John Smith johnsmith@gmail.com'}
+
+def process_file_batch(zip_file, filenames_batch):
+    """Process a batch of files from the zip archive"""
+    batch_filings = []
+
+    for filename in filenames_batch:
+        if not filename.startswith('CIK'):
+            continue
+
+        try:
+            # Extract CIK from filename
+            cik = int(filename.split('.')[0].split('-')[0][3:])
+
+            # Read raw bytes and parse JSON
+            with zip_file.open(filename) as file:
+                raw_data = file.read()
+                submissions_dct = json.loads(raw_data)
+
+            # Handle different file types
+            if 'submissions' in filename:
+                filings_data = submissions_dct
+            else:
+                filings_data = submissions_dct['filings']['recent']
+
+            # Extract required data
+            accession_numbers = filings_data['accessionNumber']
+            filing_dates = filings_data['filingDate']
+            forms = filings_data['form']
+
+            # Create filing records for this file
+            for j in range(len(accession_numbers)):
+                filing_record = {
+                    'accessionNumber': int(accession_numbers[j].replace('-','')),
+                    'filingDate': filing_dates[j],
+                    'submissionType': forms[j],
+                    'cik': cik
+                }
+                batch_filings.append(filing_record)
+
+        except Exception as e:
+            print(f"Error processing {filename}: {e}")
+            continue
+
+    return batch_filings
+
+def write_csv_chunk(output_path, filings_data, is_first_write, write_lock):
+    """Thread-safe CSV writing with lock"""
+    with write_lock:
+        if is_first_write:
+            with open(output_path, 'w', newline='') as csvfile:
+                fieldnames = ['accessionNumber', 'filingDate', 'submissionType', 'cik']
+                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                writer.writeheader()
+                writer.writerows(filings_data)
+        else:
+            with open(output_path, 'a', newline='') as csvfile:
+                fieldnames = ['accessionNumber', 'filingDate', 'submissionType', 'cik']
+                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                writer.writerows(filings_data)
+
+def construct_submissions_data(output_path, submissions_zip_path=None, max_workers=4, batch_size=100):
+    """Creates a list of dicts of every accession number, with filing date, submission type, and ciks"""
+
+    if submissions_zip_path is None:
+        url = "https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip"
+
+        temp_dir = tempfile.mkdtemp()
+        zip_path = os.path.join(temp_dir, 'submissions.zip')
+
+        req = urllib.request.Request(url, headers=headers)
+
+        with urllib.request.urlopen(req) as response:
+            total_size = int(response.headers.get('Content-Length', 0))
+
+            with open(zip_path, 'wb') as f, tqdm(
+                desc="Downloading",
+                total=total_size,
+                unit='B',
+                unit_scale=True,
+                unit_divisor=1024,
+            ) as pbar:
+                while True:
+                    chunk = response.read(8192)
+                    if not chunk:
+                        break
+                    f.write(chunk)
+                    pbar.update(len(chunk))
+
+        submissions_zip_path = zip_path
+
+    # Keep zip file open throughout processing
+    with zipfile.ZipFile(submissions_zip_path, 'r') as zip_file:
+        # Get all CIK filenames
+        all_filenames = [f for f in zip_file.namelist() if f.startswith('CIK')]
+
+        print(f"Processing {len(all_filenames)} files with {max_workers} workers...")
+
+        # Create batches of filenames
+        filename_batches = []
+        for i in range(0, len(all_filenames), batch_size):
+            batch = all_filenames[i:i + batch_size]
+            filename_batches.append(batch)
+
+        # Setup for threading
+        write_lock = threading.Lock()
+        total_filings = 0
+        is_first_write = True
+
+        # Process batches with thread pool
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all batch jobs
+            future_to_batch = {
+                executor.submit(process_file_batch, zip_file, batch): i
+                for i, batch in enumerate(filename_batches)
+            }
+
+            # Process results with progress bar
+            with tqdm(total=len(filename_batches), desc="Processing batches", unit="batch") as pbar:
+                for future in future_to_batch:
+                    try:
+                        batch_filings = future.result()
+
+                        if batch_filings:  # Only write if we have data
+                            write_csv_chunk(output_path, batch_filings, is_first_write, write_lock)
+                            is_first_write = False
+                            total_filings += len(batch_filings)
+
+                        pbar.update(1)
+                        pbar.set_postfix({
+                            'filings': total_filings,
+                            'files': len(filename_batches[future_to_batch[future]])
+                        })
+
+                    except Exception as e:
+                        print(f"Error processing batch: {e}")
+                        pbar.update(1)
+
+    print(f"Complete! Processed {total_filings} total filings")
+    print(f"Data saved to {output_path}")
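The new `construct_submissions_data` utility downloads the SEC's bulk `submissions.zip` (or reads a local copy), flattens every company's filing history, and streams the result into a CSV with accessionNumber, filingDate, submissionType, and cik columns. A hedged usage sketch; the output path, local archive path, and worker counts below are illustrative:

```python
# Illustrative sketch: build the flat filings CSV from a local submissions.zip;
# omit submissions_zip_path to let the helper download the bulk file itself.
from datamule.utils.construct_submissions_data import construct_submissions_data

construct_submissions_data(
    output_path="submissions_index.csv",          # hypothetical output file
    submissions_zip_path="data/submissions.zip",  # hypothetical local archive
    max_workers=8,
    batch_size=200,
)
```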
{datamule-1.6.0.dist-info → datamule-1.6.2.dist-info}/RECORD
CHANGED
@@ -3,9 +3,9 @@ datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
 datamule/helper.py,sha256=KqhAmTMdvATEh3I-O4xLcAcrHB9zXQERBuwzue7zyQw,3674
 datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
 datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
-datamule/portfolio.py,sha256=
+datamule/portfolio.py,sha256=360kfXmmnVFrmpz16KF2es6Mq94lnVqzie2DIgnMB9Y,11641
 datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
-datamule/submission.py,sha256=
+datamule/submission.py,sha256=f2pecbuhK0VmN1w0beNUiK4n_4Ma_GGQ5JIGilmZPZE,15127
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
 datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/datamule/sec_connector.py,sha256=T3edE7I-d4oHysqj7zYlIOxH3Fuauj9tfw39UdFWvB8,2393
@@ -51,7 +51,7 @@ datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNs
 datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/submissions/downloader.py,sha256=zGS0oJJI8tVF_GnVpZm20MymdYxnjrEjQioSVggw7Ck,1486
 datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
-datamule/sec/submissions/monitor.py,sha256=
+datamule/sec/submissions/monitor.py,sha256=1JUMRYsTqtd31hX3UrUA_aXFUmZN6n-V7h0i1gavNOs,11395
 datamule/sec/submissions/streamer.py,sha256=Qydj40CmWB_wsPv2dibefRohmCokegG2pR7iZ9C3xLQ,11584
 datamule/sec/submissions/textsearch.py,sha256=MKDXEz_VI_0ljl73_aw2lx4MVzJW5uDt8KxjvJBwPwM,5794
 datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -61,9 +61,11 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H
 datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
-datamule/seclibrary/downloader.py,sha256=
+datamule/seclibrary/downloader.py,sha256=3jEy67oiEg8BF20KcKCx2KC0UjHzhiepdu29TOaHWXs,17564
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
-datamule
-datamule
-datamule-1.6.
-datamule-1.6.
+datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/utils/construct_submissions_data.py,sha256=aX7ZaAp3zXHLcv4TFk_rGwjb8r7yNDQDFVg4nPf60kM,5934
+datamule-1.6.2.dist-info/METADATA,sha256=VOeuSq7t_-D7dKJjWrHuEg9zwqNvLWU08dGL7W2T0ow,524
+datamule-1.6.2.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-1.6.2.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.6.2.dist-info/RECORD,,
{datamule-1.6.0.dist-info → datamule-1.6.2.dist-info}/WHEEL
File without changes
{datamule-1.6.0.dist-info → datamule-1.6.2.dist-info}/top_level.txt
File without changes