datamule 1.8.3__tar.gz → 1.8.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-1.8.3 → datamule-1.8.5}/PKG-INFO +1 -1
- {datamule-1.8.3 → datamule-1.8.5}/datamule/datamule/sec_connector.py +29 -9
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/document.py +0 -1
- {datamule-1.8.3 → datamule-1.8.5}/datamule/portfolio.py +21 -8
- {datamule-1.8.3 → datamule-1.8.5}/datamule/sec/submissions/downloader.py +2 -2
- {datamule-1.8.3 → datamule-1.8.5}/datamule/sec/submissions/streamer.py +3 -3
- {datamule-1.8.3 → datamule-1.8.5}/datamule/sec/utils.py +7 -2
- {datamule-1.8.3 → datamule-1.8.5}/datamule/seclibrary/downloader.py +34 -24
- {datamule-1.8.3 → datamule-1.8.5}/datamule/submission.py +3 -1
- {datamule-1.8.3 → datamule-1.8.5}/datamule/utils/construct_submissions_data.py +21 -22
- {datamule-1.8.3 → datamule-1.8.5}/datamule.egg-info/PKG-INFO +1 -1
- {datamule-1.8.3 → datamule-1.8.5}/setup.py +1 -1
- {datamule-1.8.3 → datamule-1.8.5}/datamule/__init__.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/config.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/datamule/__init__.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/__init__.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/__init__.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/atsn.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/cfportal.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/d.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/ex102_abs.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/ex99a_sdr.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/ex99c_sdr.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/ex99g_sdr.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/ex99i_sdr.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/information_table.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/nmfp.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/npx.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/onefourtyfour.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/ownership.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/proxy_voting_record.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/sbs.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/sbsef.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/schedule13.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/sdr.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/submission_metadata.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/ta.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/thirteenfhr.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/twentyfivense.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/mappings/twentyfourf2nt.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/processing.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/document/table.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/helper.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/index.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/package_updater.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/portfolio_compression_utils.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/sec/__init__.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/seclibrary/__init__.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/seclibrary/bq.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/seclibrary/datamule_lookup.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/seclibrary/query.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/sheet.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/utils/__init__.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule/utils/format_accession.py +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule.egg-info/SOURCES.txt +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule.egg-info/requires.txt +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/datamule.egg-info/top_level.txt +0 -0
- {datamule-1.8.3 → datamule-1.8.5}/setup.cfg +0 -0
datamule/datamule/sec_connector.py

@@ -2,7 +2,7 @@ import os
 import json
 import urllib.request
 import websocket
-
+import re

 class SecConnector:
     def __init__(self, api_key=None, quiet=False):
@@ -17,11 +17,12 @@ class SecConnector:
         if not self.quiet:
             print("Getting JWT token...")

-        url =
+        url = self.auth_url

-
-        req.
+        # Send API key in Authorization header instead of POST body
+        req = urllib.request.Request(url, method='GET')
         req.add_header('Accept', 'application/json')
+        req.add_header('Authorization', f'Bearer {self.api_key}')  # API key in header

         with urllib.request.urlopen(req) as response:
             data = json.loads(response.read().decode())
@@ -35,8 +36,8 @@ class SecConnector:
         return data['token'], data['websocket_ip']

     def connect(self, data_callback=None):
-        token,websocket_ip = self._get_jwt_token_and_ip()
-        ws_url = f"ws://{websocket_ip}/ws
+        token, websocket_ip = self._get_jwt_token_and_ip()
+        ws_url = f"ws://{websocket_ip}/ws"

         if not self.quiet:
             print("Connecting to WebSocket...")
@@ -51,22 +52,41 @@ class SecConnector:
             if not self.quiet:
                 print(f"Received data: {len(data)} items")
             if data_callback:
-                data_callback(data)
+                data_callback(data)

         def on_error(ws, error):
             if not self.quiet:
-
+                sanitized_error = self._sanitize_error_message(str(error))
+                print(f"WebSocket error: {sanitized_error}")

         def on_close(ws, close_status_code, close_msg):
             if not self.quiet:
                 print("WebSocket closed")

+        # Use Authorization header for WebSocket connection
+        headers = {'Authorization': f'Bearer {token}'}
+
         ws = websocket.WebSocketApp(
             ws_url,
+            header=headers,
             on_open=on_open,
             on_message=on_message,
             on_error=on_error,
             on_close=on_close
         )

-        ws.run_forever()
+        ws.run_forever()
+
+    def _sanitize_error_message(self, error_msg):
+        sensitive_patterns = [
+            r'Bearer\s+[A-Za-z0-9\-_\.]+',      # Bearer tokens
+            r'api_key[=:]\s*[A-Za-z0-9\-_]+',   # API key patterns
+            r'token[=:]\s*[A-Za-z0-9\-_\.]+',   # Token patterns
+            r'jwt[=:]\s*[A-Za-z0-9\-_\.]+',     # JWT patterns
+        ]
+
+        sanitized = error_msg
+        for pattern in sensitive_patterns:
+            sanitized = re.sub(pattern, '[REDACTED]', sanitized, flags=re.IGNORECASE)
+
+        return sanitized
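In practice, the 1.8.5 connector sends the API key and the returned JWT as Bearer Authorization headers and redacts credentials from error output. A minimal usage sketch, assuming the import path shown in the file layout above and an illustrative API key:

```python
# Minimal sketch of driving the revised SecConnector (1.8.5 behavior):
# the API key is sent as a Bearer Authorization header, and the JWT returned
# by the auth endpoint is attached to the WebSocket connection the same way.
from datamule.datamule.sec_connector import SecConnector  # path per the package layout above

def handle_data(items):
    # called from on_message with the decoded payload
    print(f"received {len(items)} items")

connector = SecConnector(api_key="YOUR_DATAMULE_API_KEY", quiet=False)
connector.connect(data_callback=handle_data)  # blocks inside ws.run_forever()
```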
datamule/portfolio.py

@@ -13,11 +13,12 @@ from .seclibrary.downloader import download as seclibrary_download
 from .sec.xbrl.filter_xbrl import filter_xbrl
 from .sec.submissions.monitor import Monitor
 from .portfolio_compression_utils import CompressionManager
-#from .sec.xbrl.xbrlmonitor import XBRLMonitor
 from .datamule.sec_connector import SecConnector
 from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
 import json
 import io
+import shutil
+

 class Portfolio:
     def __init__(self, path):
@@ -210,7 +211,10 @@ class Portfolio:
         self.accession_numbers = new_accession_numbers

     def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,document_type=[],
-                             requests_per_second=5,keep_filtered_metadata=False,standardize_metadata=True,skip_existing=True,
+                             requests_per_second=5,keep_filtered_metadata=False,standardize_metadata=True,skip_existing=True,
+                             accession_numbers=None, **kwargs):
+
+
         if provider is None:
             config = Config()
             provider = config.get_default_source()
@@ -218,33 +222,35 @@
         # Process CIK and metadata filters
         cik = _process_cik_and_metadata_filters(cik, ticker, **kwargs)

-
+        filtered_accession_numbers = self.accession_numbers if hasattr(self, 'accession_numbers') else None
+
         skip_accession_numbers = []
         if skip_existing:
             skip_accession_numbers = [sub.accession for sub in self]

         if provider == 'datamule':
-
             seclibrary_download(
                 output_dir=self.path,
                 cik=cik,
                 api_key=self.api_key,
                 submission_type=submission_type,
                 filing_date=filing_date,
-
+                filtered_accession_numbers=filtered_accession_numbers,
                 keep_document_types=document_type,
                 keep_filtered_metadata=keep_filtered_metadata,
                 standardize_metadata=standardize_metadata,
-                skip_accession_numbers=skip_accession_numbers
+                skip_accession_numbers=skip_accession_numbers,
+                accession_numbers = accession_numbers
             )
         else:
+            # will later add accession_numbers arg in the free update.
             sec_download(
                 output_dir=self.path,
                 cik=cik,
                 submission_type=submission_type,
                 filing_date=filing_date,
                 requests_per_second=requests_per_second,
-
+                filtered_accession_numbers=filtered_accession_numbers,
                 keep_document_types=document_type,
                 keep_filtered_metadata=keep_filtered_metadata,
                 standardize_metadata=standardize_metadata,
@@ -286,4 +292,11 @@ class Portfolio:
             document_types = [document_types]

         for submission in self.submissions:
-            yield from submission.document_type(document_types)
+            yield from submission.document_type(document_types)
+
+    def delete(self):
+        self._close_batch_handles()
+        shutil.rmtree(self.path)
+
+        # reinit
+        self.__dict__.update(Portfolio(self.path).__dict__)
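Taken together, 1.8.5 lets a Portfolio download an explicit list of accession numbers (currently honored by the 'datamule' provider; the comment notes the SEC path will follow) and adds a delete() helper that closes batch handles, removes the portfolio directory, and reinitializes the object in place. A minimal sketch, assuming Portfolio is exported from the top-level package and using an illustrative path and accession number:

```python
# Sketch of the new Portfolio surface in 1.8.5 (path and accession number are illustrative).
from datamule import Portfolio  # assumed top-level export

portfolio = Portfolio("apple_filings")

# New accession_numbers argument: fetch specific filings directly
# (honored by the 'datamule' provider in this release).
portfolio.download_submissions(
    provider="datamule",
    accession_numbers=["0000320193-23-000106"],  # hypothetical accession number
)

# New delete() helper: remove the portfolio directory and re-initialize the object.
portfolio.delete()
```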
datamule/sec/submissions/downloader.py

@@ -4,7 +4,7 @@ from secsgml import write_sgml_file_to_tar
 from tqdm import tqdm

 def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
-             requests_per_second=5, output_dir="filings",
+             requests_per_second=5, output_dir="filings", filtered_accession_numbers=None,
              quiet=False, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
              skip_accession_numbers=[]):
     # Make sure output directory exists
@@ -29,7 +29,7 @@ def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
         location=location,
         requests_per_second=requests_per_second,
         document_callback=callback_wrapper,
-
+        filtered_accession_numbers=filtered_accession_numbers,
         skip_accession_numbers=skip_accession_numbers,
         quiet=quiet
     )
datamule/sec/submissions/streamer.py

@@ -222,7 +222,7 @@ class Streamer(EFTSQuery):
         return results

 def stream(cik=None, submission_type=None, filing_date=None, location=None,
-           requests_per_second=5.0, document_callback=None,
+           requests_per_second=5.0, document_callback=None, filtered_accession_numbers=None,skip_accession_numbers=[],
            quiet=False, name=None):
     """
     Stream EFTS results and download documents into memory.
@@ -253,14 +253,14 @@ def stream(cik=None, submission_type=None, filing_date=None, location=None,
     """

     # Check if acc no is empty list
-    if
+    if filtered_accession_numbers == []:
         raise ValueError("Applied filter resulted in empty accession numbers list")

     async def run_stream():
         streamer = Streamer(
             requests_per_second=requests_per_second,
             document_callback=document_callback,
-            accession_numbers=
+            accession_numbers=filtered_accession_numbers,
             skip_accession_numbers=skip_accession_numbers,
             quiet=quiet
         )
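Both files thread the new filtered_accession_numbers argument from the SEC download entry point down into stream(); an empty list still raises ValueError. A hedged sketch of calling the SEC path directly with the renamed parameter (CIK, form type, and accession number are illustrative):

```python
# Sketch: calling the SEC submissions downloader with the new
# filtered_accession_numbers parameter (values are illustrative).
from datamule.sec.submissions.downloader import download

download(
    cik="320193",
    submission_type="10-K",
    filtered_accession_numbers=["0000320193-23-000106"],  # hypothetical
    output_dir="filings",
)
```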
datamule/sec/utils.py

@@ -1,7 +1,7 @@
 import asyncio
 import time
 from collections import deque
-
+import os

 class RetryException(Exception):
     def __init__(self, url, retry_after=601): # SEC Rate limit is typically 10 minutes.
@@ -61,4 +61,9 @@ class RateMonitor:

         return round(requests_per_second, 1), round(mb_per_second, 2)

-
+
+user_agent = os.environ.get('DATAMULE_SEC_USER_AGENT')
+if user_agent is None:
+    user_agent = 'John Smith johnsmith@gmail.com'
+
+headers = {'User-Agent': user_agent}
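Because the headers dict is built at module import time, the environment variable has to be set before datamule's SEC modules are imported; otherwise the placeholder User-Agent is used. A small sketch:

```python
# Sketch: supply your own SEC User-Agent via the new environment variable.
# It must be set before datamule.sec.utils is imported, since the headers
# dict is evaluated at import time.
import os
os.environ["DATAMULE_SEC_USER_AGENT"] = "Jane Doe jane.doe@example.com"  # illustrative identity

import datamule  # SEC requests made through datamule now carry this User-Agent
```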
datamule/seclibrary/downloader.py

@@ -18,6 +18,9 @@ from os import cpu_count
 from secsgml import parse_sgml_content_into_memory
 from secsgml.utils import bytes_to_str
 from .datamule_lookup import datamule_lookup
+from ..utils.format_accession import format_accession
+
+# could be cleaned up

 # Set up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -268,7 +271,7 @@ class Downloader:
         tar_manager = self.TarManager(output_dir, num_tar_files, max_batch_size)

         try:
-            with tqdm(total=len(urls), desc="
+            with tqdm(total=len(urls), desc="Downloading files") as pbar:
                 semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
                 decompression_pool = ThreadPoolExecutor(max_workers=self.MAX_DECOMPRESSION_WORKERS)
@@ -296,29 +299,35 @@
         finally:
             tar_manager.close_all()

-    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads",
-                 skip_accession_numbers=[], max_batch_size=1024*1024*1024):
+    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", filtered_accession_numbers=None, keep_document_types=[], keep_filtered_metadata=False, standardize_metadata=True,
+                 skip_accession_numbers=[], max_batch_size=1024*1024*1024,accession_numbers=None):
         if self.api_key is None:
             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")

         logger.debug("Querying SEC filings...")

-
-
+        if not accession_numbers:
+            filings = datamule_lookup(cik=cik, submission_type=submission_type, filing_date=filing_date,
+                columns=['accessionNumber'], distinct=True, page_size=25000, quiet=False,api_key=self.api_key)

-
-
-
-
-
-
-
+            if filtered_accession_numbers:
+                filtered_accession_numbers = [str(int(item.replace('-',''))) for item in filtered_accession_numbers]
+                filings = [filing for filing in filings if filing['accessionNumber'] in filtered_accession_numbers]
+
+            if skip_accession_numbers:
+                skip_accession_numbers = [int(item.replace('-','')) for item in skip_accession_numbers]
+                filings = [filing for filing in filings if filing['accessionNumber'] not in skip_accession_numbers]

-
-
-
-
-
+            logger.debug(f"Generating URLs for {len(filings)} filings...")
+            urls = []
+            for item in filings:
+                url = f"{self.BASE_URL}{str(item['accessionNumber']).zfill(18)}.sgml"
+                urls.append(url)
+        else:
+            urls = []
+            for accession in accession_numbers:
+                url = f"{self.BASE_URL}{format_accession(accession,'no-dash').zfill(18)}.sgml"
+                urls.append(url)

         if not urls:
             logger.warning("No submissions found matching the criteria")
@@ -381,12 +390,12 @@
         logger.debug(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")


-def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads",
-             skip_accession_numbers=[], max_batch_size=1024*1024*1024):
+def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", filtered_accession_numbers=None, keep_document_types=[],keep_filtered_metadata=False,standardize_metadata=True,
+             skip_accession_numbers=[], max_batch_size=1024*1024*1024,accession_numbers=None):

-    if
-
-    elif
+    if filtered_accession_numbers:
+        filtered_accession_numbers = [int(str(x).replace('-', '')) for x in filtered_accession_numbers]
+    elif filtered_accession_numbers == []:
         raise ValueError("Applied filter resulted in empty accession numbers list")
     downloader = Downloader(api_key=api_key)
     downloader.download(
@@ -394,10 +403,11 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads",
         cik=cik,
         filing_date=filing_date,
         output_dir=output_dir,
-
+        filtered_accession_numbers=filtered_accession_numbers,
         keep_document_types=keep_document_types,
         keep_filtered_metadata=keep_filtered_metadata,
         standardize_metadata=standardize_metadata,
         skip_accession_numbers=skip_accession_numbers,
-        max_batch_size=max_batch_size
+        max_batch_size=max_batch_size,
+        accession_numbers=accession_numbers
     )
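The net effect is two entry paths: the existing lookup-driven path (optionally narrowed by filtered_accession_numbers and skip_accession_numbers) and a new direct path that takes accession_numbers and builds SGML URLs via format_accession, skipping datamule_lookup entirely. A sketch of the direct path (API key and accession number are illustrative):

```python
# Sketch: download specific submissions by accession number through the
# datamule provider, bypassing the lookup query (values are illustrative).
from datamule.seclibrary.downloader import download

download(
    api_key="YOUR_DATAMULE_API_KEY",
    accession_numbers=["0000320193-24-000123"],  # hypothetical
    output_dir="downloads",
)
```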
datamule/utils/construct_submissions_data.py

@@ -7,10 +7,9 @@ from concurrent.futures import ThreadPoolExecutor
 import threading
 from tqdm import tqdm
 import urllib.request
+from ..sec.utils import headers

-
-
-def process_file_batch(zip_file, filenames_batch):
+def process_file_batch(zip_file, filenames_batch, columns, mapping):
     """Process a batch of files from the zip archive"""
     batch_filings = []

@@ -33,19 +32,17 @@
             else:
                 filings_data = submissions_dct['filings']['recent']

-            # Extract required data
-
-
-            forms = filings_data['form']
-
+            # Extract required data using mapping
+            lst_lst = [filings_data[column] for column in columns]
+
             # Create filing records for this file
-            for j in range(len(
-                filing_record = {
-
-
-
-
-
+            for j in range(len(filings_data['accessionNumber'])):
+                filing_record = {'cik': cik}
+
+                for i, column in enumerate(columns):
+                    mapped_key = mapping.get(column, column)
+                    filing_record[mapped_key] = lst_lst[i][j]
+
                 batch_filings.append(filing_record)

     except Exception as e:
@@ -54,24 +51,26 @@

     return batch_filings

-def write_csv_chunk(output_path, filings_data, is_first_write, write_lock):
+def write_csv_chunk(output_path, filings_data, is_first_write, write_lock, fieldnames):
     """Thread-safe CSV writing with lock"""
     with write_lock:
         if is_first_write:
             with open(output_path, 'w', newline='') as csvfile:
-                fieldnames = ['accessionNumber', 'filingDate', 'submissionType', 'cik']
                 writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                 writer.writeheader()
                 writer.writerows(filings_data)
         else:
             with open(output_path, 'a', newline='') as csvfile:
-                fieldnames = ['accessionNumber', 'filingDate', 'submissionType', 'cik']
                 writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                 writer.writerows(filings_data)

-def construct_submissions_data(output_path, submissions_zip_path=None, max_workers=4, batch_size=100
+def construct_submissions_data(output_path, submissions_zip_path=None, max_workers=4, batch_size=100,
+                               columns = ['accessionNumber', 'filingDate', 'form'], mapping = {'form': 'submissionType'}):
     """Creates a list of dicts of every accession number, with filing date, submission type, and ciks"""
-
+
+    # declare fieldnames
+    fieldnames = ['cik'] + [mapping.get(col, col) for col in columns]
+
     if submissions_zip_path is None:
         url = "https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip"

@@ -121,7 +120,7 @@ def construct_submissions_data(output_path, submissions_zip_path=None, max_workers=4,
         with ThreadPoolExecutor(max_workers=max_workers) as executor:
             # Submit all batch jobs
             future_to_batch = {
-                executor.submit(process_file_batch, zip_file, batch): i
+                executor.submit(process_file_batch, zip_file, batch, columns, mapping): i
                 for i, batch in enumerate(filename_batches)
             }

@@ -132,7 +131,7 @@ def construct_submissions_data(output_path, submissions_zip_path=None, max_workers=4,
                 batch_filings = future.result()

                 if batch_filings: # Only write if we have data
-                    write_csv_chunk(output_path, batch_filings, is_first_write, write_lock)
+                    write_csv_chunk(output_path, batch_filings, is_first_write, write_lock, fieldnames)
                 is_first_write = False
                 total_filings += len(batch_filings)

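With columns and mapping exposed, the CSV schema is no longer hard-coded: extra fields from the bulk submissions JSON can be pulled through, and mapping renames them in the header (the defaults reproduce the old accessionNumber/filingDate/submissionType/cik layout). A sketch with an extra, illustrative column:

```python
# Sketch: build the submissions CSV with an additional column from the
# bulk submissions JSON ('primaryDocument' is illustrative) and keep the
# default form -> submissionType rename.
from datamule.utils.construct_submissions_data import construct_submissions_data

construct_submissions_data(
    output_path="submissions.csv",
    columns=["accessionNumber", "filingDate", "form", "primaryDocument"],
    mapping={"form": "submissionType"},
)
```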
setup.py

@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="1.8.3",
+    version="1.8.5",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",