datamule 1.0.7__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/book/__init__.py +0 -0
- datamule/book/book.py +34 -0
- datamule/mapping_dicts/__init__.py +0 -0
- datamule/mapping_dicts/txt_mapping_dicts.py +234 -0
- datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
- datamule/sec/__init__.py +0 -0
- datamule/sec/infrastructure/__init__.py +0 -0
- datamule/sec/infrastructure/submissions_metadata.py +386 -0
- datamule/sec/rss/__init__.py +0 -0
- datamule/sec/rss/monitor.py +416 -0
- datamule/sec/submissions/__init__.py +0 -0
- datamule/sec/submissions/downloader.py +70 -0
- datamule/sec/submissions/eftsquery.py +502 -0
- datamule/sec/submissions/monitor.py +126 -0
- datamule/sec/submissions/streamer.py +228 -0
- datamule/sec/submissions/textsearch.py +122 -0
- datamule/sec/utils.py +64 -0
- datamule/sec/xbrl/__init__.py +0 -0
- datamule/sec/xbrl/downloadcompanyfacts.py +83 -0
- datamule/sec/xbrl/filter_xbrl.py +39 -0
- datamule/sec/xbrl/streamcompanyfacts.py +93 -0
- datamule/sec/xbrl/xbrlmonitor.py +143 -0
- datamule/seclibrary/__init__.py +0 -0
- datamule/seclibrary/downloader.py +286 -0
- datamule/seclibrary/query.py +181 -0
- {datamule-1.0.7.dist-info → datamule-1.0.9.dist-info}/METADATA +1 -1
- datamule-1.0.9.dist-info/RECORD +35 -0
- datamule-1.0.7.dist-info/RECORD +0 -10
- {datamule-1.0.7.dist-info → datamule-1.0.9.dist-info}/WHEEL +0 -0
- {datamule-1.0.7.dist-info → datamule-1.0.9.dist-info}/top_level.txt +0 -0
datamule/sec/xbrl/xbrlmonitor.py
@@ -0,0 +1,143 @@
+import asyncio
+import aiohttp
+import time
+from collections import deque
+from lxml import etree
+from ..utils import PreciseRateLimiter, headers
+
+class XBRLMonitor:
+    """
+    Monitor for SEC XBRL RSS feed.
+    Polls https://www.sec.gov/Archives/edgar/xbrlrss.all.xml for new XBRL filings.
+    """
+
+    def __init__(self, requests_per_second=2.0):
+        """Initialize the XBRL Monitor."""
+        self.url = "https://www.sec.gov/Archives/edgar/xbrlrss.all.xml"
+        self.seen_accessions = deque(maxlen=2000)  # Store up to 2000 accession numbers
+        self.limiter = PreciseRateLimiter(requests_per_second)
+        self.headers = headers
+        self.running = False
+
+    async def _fetch_rss(self, session):
+        """Fetch the XBRL RSS feed from SEC."""
+        async with self.limiter:
+            try:
+                async with session.get(self.url) as response:
+                    response.raise_for_status()
+                    return await response.text()
+            except Exception as e:
+                print(f"Error fetching RSS feed: {str(e)}")
+                return None
+
+    def _parse_rss(self, xml_content):
+        """Parse the XBRL RSS feed XML content using lxml."""
+        # Parse XML using lxml
+        parser = etree.XMLParser(recover=True)
+        root = etree.fromstring(xml_content.encode('utf-8'), parser)
+
+        # Define namespaces
+        namespaces = {
+            'edgar': 'https://www.sec.gov/Archives/edgar'
+        }
+
+        entries = []
+        for item in root.findall('.//item'):
+            # Get basic information
+            title = item.find('title').text if item.find('title') is not None else ""
+            link = item.find('link').text if item.find('link') is not None else ""
+
+            # Get EDGAR-specific information
+            edgar_filing = item.find('.//edgar:xbrlFiling', namespaces)
+            if edgar_filing is not None:
+                cik = edgar_filing.find('./edgar:cikNumber', namespaces).text if edgar_filing.find('./edgar:cikNumber', namespaces) is not None else ""
+                acc_number = edgar_filing.find('./edgar:accessionNumber', namespaces).text if edgar_filing.find('./edgar:accessionNumber', namespaces) is not None else ""
+                form_type = edgar_filing.find('./edgar:formType', namespaces).text if edgar_filing.find('./edgar:formType', namespaces) is not None else ""
+
+                # Keep accession number with dashes
+                if acc_number:
+                    entries.append({
+                        'accession_number': acc_number,
+                        'cik': cik,
+                        'submission_type': form_type,
+                        'link': link
+                    })
+
+        return entries
+
+
+    async def _poll_once(self, data_callback=None, quiet=True):
+        """Internal async implementation of poll_once."""
+        if not quiet:
+            print(f"Polling {self.url}")
+
+        async with aiohttp.ClientSession(headers=self.headers) as session:
+            xml_content = await self._fetch_rss(session)
+            if not xml_content:
+                return []
+
+            all_entries = self._parse_rss(xml_content)
+            new_entries = []
+
+            # Filter out entries we've already seen
+            for entry in all_entries:
+                if entry['accession_number'] not in self.seen_accessions:
+                    self.seen_accessions.appendleft(entry['accession_number'])
+                    new_entries.append(entry)
+
+            if new_entries and not quiet:
+                print(f"Found {len(new_entries)} new XBRL filings")
+
+            # Call the callback if provided and if we have new entries
+            if new_entries and data_callback:
+                await data_callback(new_entries)
+
+            return new_entries
+
+    def poll_once(self, data_callback=None, quiet=True):
+        """
+        Poll the XBRL RSS feed once and process new filings.
+        Synchronous wrapper around async implementation.
+        """
+        return asyncio.run(self._poll_once(data_callback, quiet))
+
+    async def _monitor(self, data_callback=None, poll_callback=None, polling_interval=600000, quiet=True):
+        """Internal async implementation of monitor."""
+        self.running = True
+        while self.running:
+            try:
+                # Poll once for new filings
+                await self._poll_once(data_callback, quiet)
+
+                # Execute polling callback if provided
+                start_wait = time.time()
+                if poll_callback:
+                    try:
+                        await poll_callback()
+                    except Exception as e:
+                        print(f"Error in poll callback: {str(e)}")
+
+                # Sleep for the remaining interval time
+                elapsed = (time.time() - start_wait) * 1000
+                if elapsed < polling_interval:
+                    await asyncio.sleep((polling_interval - elapsed) / 1000)
+
+            except Exception as e:
+                print(f"Error in monitoring: {str(e)}")
+                await asyncio.sleep(polling_interval / 1000)
+
+    def monitor(self, data_callback=None, poll_callback=None, polling_interval=600000, quiet=True):
+        """
+        Continuously poll the XBRL RSS feed at the specified interval.
+        Synchronous wrapper around async implementation.
+        """
+        return asyncio.run(self._monitor(
+            data_callback=data_callback,
+            poll_callback=poll_callback,
+            polling_interval=polling_interval,
+            quiet=quiet
+        ))
+
+    def stop(self):
+        """Stop the continuous polling."""
+        self.running = False
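For orientation, a minimal usage sketch of the new monitor class. The import path is assumed from the file's location in the wheel (datamule/sec/xbrl/xbrlmonitor.py), and the entry keys are the ones built in _parse_rss above; treat it as an illustration rather than documented API.

    from datamule.sec.xbrl.xbrlmonitor import XBRLMonitor

    # The data callback must be async: _poll_once awaits it with the list of new entries.
    async def on_new_filings(entries):
        for entry in entries:
            # Keys per _parse_rss: 'accession_number', 'cik', 'submission_type', 'link'
            print(entry['accession_number'], entry['submission_type'], entry['link'])

    monitor = XBRLMonitor(requests_per_second=2.0)

    # Single poll (synchronous wrapper around the async implementation)
    new_filings = monitor.poll_once(data_callback=on_new_filings, quiet=False)

    # Continuous polling every 10 minutes; polling_interval is in milliseconds.
    # monitor.monitor(data_callback=on_new_filings, polling_interval=600000, quiet=False)

Note that monitor() blocks inside asyncio.run() until self.running is flipped to False, so stop() has to be invoked from one of the callbacks (or another thread) to end the polling loop.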
File without changes
datamule/seclibrary/downloader.py
@@ -0,0 +1,286 @@
+import os
+import asyncio
+import aiohttp
+from pathlib import Path
+from tqdm import tqdm
+import time
+import shutil
+import ssl
+import zstandard as zstd
+import io
+import json
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from queue import Queue, Empty
+from threading import Thread
+from secsgml import parse_sgml_submission
+from .query import query
+from os import cpu_count
+
+class Downloader:
+    def __init__(self, api_key=None):
+        self.BASE_URL = "https://library.datamule.xyz/original/nc/"
+        self.CHUNK_SIZE = 2 * 1024 * 1024
+        self.MAX_CONCURRENT_DOWNLOADS = 250
+        self.MAX_DECOMPRESSION_WORKERS = cpu_count()
+        self.MAX_PROCESSING_WORKERS = cpu_count()
+        self.QUEUE_SIZE = 10
+        if api_key is not None:
+            self._api_key = api_key
+
+    @property
+    def api_key(self):
+        return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')
+
+    @api_key.setter
+    def api_key(self, value):
+        if not value:
+            raise ValueError("API key cannot be empty")
+        self._api_key = value
+
+    def _log_error(self, output_dir, filename, error_msg):
+        error_file = os.path.join(output_dir, 'errors.json')
+        try:
+            if os.path.exists(error_file):
+                with open(error_file, 'r') as f:
+                    errors = json.load(f)
+            else:
+                errors = {}
+
+            errors[filename] = str(error_msg)
+
+            with open(error_file, 'w') as f:
+                json.dump(errors, f, indent=2)
+        except Exception as e:
+            print(f"Failed to log error to {error_file}: {str(e)}")
+
+    class FileProcessor:
+        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader):
+            self.processing_queue = Queue(maxsize=queue_size)
+            self.should_stop = False
+            self.processing_workers = []
+            self.output_dir = output_dir
+            self.max_workers = max_workers
+            self.batch_size = 50
+            self.pbar = pbar
+            self.downloader = downloader
+
+        def start_processing_workers(self):
+            for _ in range(self.max_workers):
+                worker = Thread(target=self._processing_worker)
+                worker.daemon = True
+                worker.start()
+                self.processing_workers.append(worker)
+
+        def _process_file(self, item):
+            filename, content = item
+            try:
+                parse_sgml_submission(output_dir=self.output_dir, content=content)
+                self.pbar.update(1)
+            except Exception as e:
+                accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
+                if os.path.exists(accession_dir):
+                    shutil.rmtree(accession_dir)
+                self.downloader._log_error(self.output_dir, filename, str(e))
+
+        def _processing_worker(self):
+            batch = []
+            while not self.should_stop:
+                try:
+                    item = self.processing_queue.get(timeout=1)
+                    if item is None:
+                        break
+
+                    batch.append(item)
+
+                    if len(batch) >= self.batch_size or self.processing_queue.empty():
+                        for item in batch:
+                            self._process_file(item)
+                            self.processing_queue.task_done()
+                        batch = []
+
+                except Empty:
+                    if batch:
+                        for item in batch:
+                            self._process_file(item)
+                            self.processing_queue.task_done()
+                        batch = []
+
+        def stop_workers(self):
+            self.should_stop = True
+            for _ in self.processing_workers:
+                self.processing_queue.put(None)
+            for worker in self.processing_workers:
+                worker.join()
+
+    def decompress_stream(self, compressed_chunks, filename, output_dir, processor):
+        dctx = zstd.ZstdDecompressor()
+        try:
+            input_buffer = io.BytesIO(b''.join(compressed_chunks))
+            decompressed_content = io.BytesIO()
+
+            with dctx.stream_reader(input_buffer) as reader:
+                shutil.copyfileobj(reader, decompressed_content)
+
+            content = decompressed_content.getvalue().decode('utf-8')
+            processor.processing_queue.put((filename, content))
+            return True
+
+        except Exception as e:
+            self._log_error(output_dir, filename, f"Decompression error: {str(e)}")
+            return False
+        finally:
+            try:
+                input_buffer.close()
+                decompressed_content.close()
+            except:
+                pass
+
+    def save_regular_file(self, chunks, filename, output_dir, processor):
+        try:
+            content = b''.join(chunks).decode('utf-8')
+            processor.processing_queue.put((filename, content))
+            return True
+
+        except Exception as e:
+            self._log_error(output_dir, filename, f"Error saving file: {str(e)}")
+            return False
+
+    async def download_and_process(self, session, url, semaphore, decompression_pool, output_dir, processor):
+        async with semaphore:
+            chunks = []
+            filename = url.split('/')[-1]
+
+            api_key = self.api_key
+            if not api_key:
+                raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+            try:
+                headers = {
+                    'Connection': 'keep-alive',
+                    'Accept-Encoding': 'gzip, deflate, br',
+                    'Authorization': f'Bearer {api_key}'
+                }
+
+                async with session.get(url, headers=headers) as response:
+                    if response.status == 200:
+                        async for chunk in response.content.iter_chunked(self.CHUNK_SIZE):
+                            chunks.append(chunk)
+
+                        loop = asyncio.get_running_loop()
+                        if filename.endswith('.zst'):
+                            success = await loop.run_in_executor(
+                                decompression_pool,
+                                partial(self.decompress_stream, chunks, filename, output_dir, processor)
+                            )
+                        else:
+                            success = await loop.run_in_executor(
+                                decompression_pool,
+                                partial(self.save_regular_file, chunks, filename, output_dir, processor)
+                            )
+
+                        if not success:
+                            self._log_error(output_dir, filename, "Failed to process file")
+                    elif response.status == 401:
+                        self._log_error(output_dir, filename, "Authentication failed: Invalid API key")
+                        raise ValueError("Invalid API key")
+                    else:
+                        self._log_error(output_dir, filename, f"Download failed: Status {response.status}")
+            except Exception as e:
+                self._log_error(output_dir, filename, str(e))
+
+    async def process_batch(self, urls, output_dir):
+        os.makedirs(output_dir, exist_ok=True)
+
+        with tqdm(total=len(urls), desc="Processing files") as pbar:
+            processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self)
+            processor.start_processing_workers()
+
+            semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
+            decompression_pool = ThreadPoolExecutor(max_workers=self.MAX_DECOMPRESSION_WORKERS)
+
+            connector = aiohttp.TCPConnector(
+                limit=self.MAX_CONCURRENT_DOWNLOADS,
+                force_close=False,
+                ssl=ssl.create_default_context(),
+                ttl_dns_cache=300,
+                keepalive_timeout=60
+            )
+
+            # timeout should be max 30s.
+            async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=30)) as session:
+                tasks = [self.download_and_process(session, url, semaphore, decompression_pool, output_dir, processor) for url in urls]
+                await asyncio.gather(*tasks, return_exceptions=True)
+
+            processor.processing_queue.join()
+            processor.stop_workers()
+            decompression_pool.shutdown()
+
+    def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None):
+        """
+        Query SEC filings and download/process them.
+
+        Parameters:
+        - submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
+        - cik: Company CIK number(s), string, int, or list
+        - filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
+        - output_dir: Directory to save downloaded files
+        """
+        if self.api_key is None:
+            raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+        # Query the SEC filings first - before starting any async operations
+        print("Querying SEC filings...")
+        filings = query(
+            submission_type=submission_type,
+            cik=cik,
+            filing_date=filing_date,
+            api_key=self.api_key
+        )
+        # After querying but before generating URLs
+        if accession_numbers:
+            filings = [filing for filing in filings if filing['accession_number'] in accession_numbers]
+
+        # Generate URLs from the query results
+        print(f"Generating URLs for {len(filings)} filings...")
+        urls = []
+        for item in filings:
+            url = f"{self.BASE_URL}{str(item['accession_number']).zfill(18)}.sgml"
+            if item['compressed'] == True or item['compressed'] == 'true' or item['compressed'] == 'True':
+                url += '.zst'
+            urls.append(url)
+
+        if not urls:
+            print("No submissions found matching the criteria")
+            return
+
+        # Remove duplicates
+        urls = list(set(urls))
+
+        # Now start the async processing
+        start_time = time.time()
+
+        # Process the batch asynchronously
+        asyncio.run(self.process_batch(urls, output_dir))
+
+        # Calculate and display performance metrics
+        elapsed_time = time.time() - start_time
+        print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
+        print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
+
+
+def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None):
+    if accession_numbers:
+        accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
+    # check if acc no is empty list
+    elif accession_numbers == []:
+        raise ValueError("Applied filter resulted in empty accession numbers list")
+    downloader = Downloader(api_key=api_key)
+    downloader.download(
+        submission_type=submission_type,
+        cik=cik,
+        filing_date=filing_date,
+        output_dir=output_dir,
+        accession_numbers=accession_numbers
+    )
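For orientation, a minimal usage sketch of the new bulk downloader. The import path is assumed from the file's location in the wheel (datamule/seclibrary/downloader.py), the CIK and dates are placeholders, and a valid key must be available via DATAMULE_API_KEY or the api_key argument; this is illustrative, not documented API.

    from datamule.seclibrary.downloader import download

    # Queries the datamule SEC library, then downloads and parses the matching
    # submissions into output_dir. filing_date accepts a string, a list, or a
    # (start_date, end_date) tuple; dashes are stripped by the query() helper
    # shown in the next hunk.
    download(
        submission_type='10-K',
        cik=320193,                                # placeholder CIK
        filing_date=('2023-01-01', '2023-12-31'),
        api_key=None,                              # falls back to DATAMULE_API_KEY
        output_dir='filings',
    )

Each downloaded .sgml (or .sgml.zst) payload is pushed through the FileProcessor queue and parsed with parse_sgml_submission; failures are recorded in errors.json inside output_dir.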
datamule/seclibrary/query.py
@@ -0,0 +1,181 @@
+import os
+import asyncio
+import aiohttp
+import urllib.parse
+import ssl
+import json
+import time
+from tqdm import tqdm
+
+class Query:
+    def __init__(self, api_key=None):
+        self.API_BASE_URL = "https://sec-library.jgfriedman99.workers.dev/"
+        self._api_key = api_key
+        self.total_cost = 0
+        self.remaining_balance = None
+        self.start_time = None
+
+    @property
+    def api_key(self):
+        return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')
+
+    @api_key.setter
+    def api_key(self, value):
+        if not value:
+            raise ValueError("API key cannot be empty")
+        self._api_key = value
+
+    async def _fetch_page(self, session, submission_type=None, cik=None, filing_date=None, page=1):
+        params = {
+            'api_key': self.api_key,
+            'page': page
+        }
+
+        # Handle submission_type parameter
+        if submission_type:
+            if isinstance(submission_type, list):
+                params['submission_type'] = ','.join(str(x) for x in submission_type)
+            else:
+                params['submission_type'] = str(submission_type)
+
+        # Handle CIK parameter
+        if cik:
+            if isinstance(cik, list):
+                params['cik'] = ','.join(str(x) for x in cik)
+            else:
+                params['cik'] = str(cik)
+
+        # Handle filing_date parameter
+        if filing_date:
+            if isinstance(filing_date, tuple):
+                params['startdt'] = str(filing_date[0])
+                params['enddt'] = str(filing_date[1])
+            else:
+                if isinstance(filing_date, list):
+                    params['filing_date'] = ','.join(str(x) for x in filing_date)
+                else:
+                    params['filing_date'] = str(filing_date)
+
+        url = f"{self.API_BASE_URL}?{urllib.parse.urlencode(params)}"
+
+        async with session.get(url) as response:
+            data = await response.json()
+            if not data.get('success'):
+                raise ValueError(f"API request failed: {data.get('error')}")
+
+            # Track costs and balance
+            charges = data['metadata']['billing']['charges']
+            page_cost = charges['total']
+            self.total_cost += page_cost
+            self.remaining_balance = data['metadata']['billing']['remaining_balance']
+
+            return data['data'], data['metadata']['pagination'], page_cost
+
+    async def execute_query(self, submission_type=None, cik=None, filing_date=None):
+        if self.api_key is None:
+            raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+        self.start_time = time.time()
+        total_items = 0
+        pages_processed = 0
+
+        # Display query parameters
+        query_desc = []
+        if cik:
+            query_desc.append(f"CIK={cik}")
+        if submission_type:
+            query_desc.append(f"Type={submission_type}")
+        if filing_date:
+            if isinstance(filing_date, tuple):
+                query_desc.append(f"Date={filing_date[0]} to {filing_date[1]}")
+            else:
+                query_desc.append(f"Date={filing_date}")
+
+        if query_desc:
+            print(f"QUERY: {', '.join(query_desc)}")
+
+        connector = aiohttp.TCPConnector(ssl=ssl.create_default_context())
+        async with aiohttp.ClientSession(connector=connector) as session:
+            # Initialize progress bar with a custom format to avoid extra colons
+            pbar = tqdm(unit="page", bar_format="{desc}: {n_fmt} {unit} [{elapsed}<{remaining}, {rate_fmt}{postfix}]")
+            pbar.set_description("Fetching data")
+
+            current_page = 1
+            has_more = True
+            results = []
+
+            while has_more:
+                # Fetch page
+                page_results, pagination, page_cost = await self._fetch_page(session,
+                                                                             submission_type=submission_type,
+                                                                             cik=cik,
+                                                                             filing_date=filing_date,
+                                                                             page=current_page)
+
+                # Accumulate results
+                results.extend(page_results)
+
+                pages_processed += 1
+                total_items += len(page_results)
+
+                # Update progress bar with cleaner format
+                pbar.set_description(f"Fetching data (page {current_page})")
+                pbar.set_postfix_str(f"cost=${self.total_cost:.2f} | balance=${self.remaining_balance:.2f}")
+                pbar.update(1)
+
+                # Check if we need to fetch more pages
+                has_more = pagination.get('hasMore', False)
+                current_page += 1
+
+                # For the first page, display record info using pbar.write instead of print
+                if pages_processed == 1:
+                    records_per_page = pagination.get('currentPageRecords', len(page_results))
+                    total_records = pagination.get('totalRecords', None)
+                    if total_records:
+                        pbar.write(f"Retrieved {records_per_page} records (page 1) of {total_records} total - Fetching additional pages...")
+                    else:
+                        pbar.write(f"Retrieved {records_per_page} records (page 1) - Fetching additional pages...")
+
+            pbar.close()
+
+            # Final summary
+            elapsed_time = time.time() - self.start_time
+            print("\nQuery complete:")
+            print(f"- Retrieved {total_items} filings across {pages_processed} pages")
+            print(f"- Total cost: ${self.total_cost:.2f}")
+            print(f"- Remaining balance: ${self.remaining_balance:.2f}")
+            print(f"- Time: {elapsed_time:.1f} seconds")
+
+            return results
+
+
+def query(cik=None, submission_type=None, filing_date=None, api_key=None):
+    """
+    Query SEC filings data with optional filtering
+
+    Parameters:
+    - cik: Company CIK number(s), can be string, int, or list
+    - submission_type: Filing type(s), can be string or list (e.g., '10-K', ['10-K', '10-Q'])
+    - filing_date: Filing date(s), can be string, list, or tuple of (start_date, end_date)
+    - api_key: Optional API key (can also use DATAMULE_API_KEY environment variable)
+
+    Returns:
+    - List of all matching submission data
+    """
+    # Create a Query instance for this request
+    q = Query(api_key=api_key)
+    # remove dash from filing_date
+    if isinstance(filing_date, tuple):
+        filing_date = (filing_date[0].replace('-', ''), filing_date[1].replace('-', ''))
+    elif isinstance(filing_date, str):
+        filing_date = filing_date.replace('-', '')
+    elif isinstance(filing_date, list):
+        filing_date = [x.replace('-', '') for x in filing_date]
+
+    print(filing_date)
+    # Run the query and return results
+    return asyncio.run(q.execute_query(
+        submission_type=submission_type,
+        cik=cik,
+        filing_date=filing_date
+    ))
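Likewise, a minimal usage sketch of the query helper on its own. The import path is assumed from the file's location in the wheel (datamule/seclibrary/query.py); the CIK and dates are placeholders, and an API key must be available via DATAMULE_API_KEY or the api_key argument.

    from datamule.seclibrary.query import query

    # Returns the accumulated list of submission records across all result pages,
    # printing per-page cost and remaining balance as it goes.
    results = query(
        cik=320193,                                # placeholder CIK
        submission_type=['10-K', '10-Q'],
        filing_date=('2023-01-01', '2023-12-31'),
    )
    print(len(results))
    # Records carry at least 'accession_number' and 'compressed', which the
    # downloader in the previous hunk uses when building URLs.

Billing metadata from each page is accumulated on the Query instance (total_cost, remaining_balance) and summarized after the final page.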
datamule-1.0.9.dist-info/RECORD
@@ -0,0 +1,35 @@
+datamule/__init__.py,sha256=0npnB3i2F7YB7etG315oDiCd-eMo-A6MP5LX2gQclHY,914
+datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
+datamule/document.py,sha256=BC8jdVy9pMOA9ghIqV5N2XJidmVNThqbBohsuSAnVoY,10813
+datamule/helper.py,sha256=xgOVnea-lUlQ5I-U0vYUp0VeKPNZehNhqjJvegA3lYE,3342
+datamule/portfolio.py,sha256=JmZlTrom_g7FXKXxWp_CiQTyC7p6_cDP08G0kFUja48,6982
+datamule/submission.py,sha256=JsxYlEz1Ywu6eC32OS15p4p-p8qB6SWd_rXuf2p5UfY,1247
+datamule/book/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/book/book.py,sha256=QWiowVNqb84o-JcVo0fpKumxnIbBge2ZeKwHxqkVMqw,1023
+datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
+datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
+datamule/sec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/sec/utils.py,sha256=JUxwijJiqRMnRJNQzVUamyF5h9ZGc7RnO_zsLOIM73g,2079
+datamule/sec/infrastructure/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/sec/infrastructure/submissions_metadata.py,sha256=zsSYmvYLZ7KS_MVDsg-j9Y4qeOyDOaHOQ6ZR6MpiET8,17520
+datamule/sec/rss/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/sec/rss/monitor.py,sha256=6r4EYaSlGu6VYErlj9zXJsIMLVie1cfacSZU-ESfuBI,18231
+datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/sec/submissions/downloader.py,sha256=HxbSkNotLLW6ROmU30rnXPlCo9gY3SoB1Z4ZWvj9FIY,2669
+datamule/sec/submissions/eftsquery.py,sha256=h3MEkYWTrr_Dy76HzQnUvV-nQzi9b-B2CrW-5ah9WQ8,21892
+datamule/sec/submissions/monitor.py,sha256=XkwH5nvzr_dNttmFRQ52m7344IKbOtWDfOZIEdie4H8,5234
+datamule/sec/submissions/streamer.py,sha256=hc61le7gGIIWp1KEaOv_PhriUxf7YYFkQrSKELlZ3pg,9748
+datamule/sec/submissions/textsearch.py,sha256=4TTw-ceEu3_A4GktBDbsqo5vCUnrjdRnKiaWtfQSV7A,4340
+datamule/sec/xbrl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/sec/xbrl/downloadcompanyfacts.py,sha256=rMWRiCF9ci_gNZMJ9MC2c_PGEd-yEthawQ0CtVwWTjM,3323
+datamule/sec/xbrl/filter_xbrl.py,sha256=g9OT4zrNS0tiUJeBIwbCs_zMisOBkpFnMR3tV4Tr39Q,1316
+datamule/sec/xbrl/streamcompanyfacts.py,sha256=WyJIwuy5mNMXWpx_IkhFzDMe9MOfQ-vNkWl_JzBzFmc,3323
+datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
+datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/seclibrary/downloader.py,sha256=Zb1TxsIz887tO3MJVP66siYVtNus89ti-g9oZ6VywrM,11500
+datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
+datamule-1.0.9.dist-info/METADATA,sha256=QhoFw_l9Rc-VaXuBG_JpgWkB-02wxg0C2MTCgWNU3uA,512
+datamule-1.0.9.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+datamule-1.0.9.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.0.9.dist-info/RECORD,,
datamule-1.0.7.dist-info/RECORD DELETED
@@ -1,10 +0,0 @@
-datamule/__init__.py,sha256=0npnB3i2F7YB7etG315oDiCd-eMo-A6MP5LX2gQclHY,914
-datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
-datamule/document.py,sha256=BC8jdVy9pMOA9ghIqV5N2XJidmVNThqbBohsuSAnVoY,10813
-datamule/helper.py,sha256=xgOVnea-lUlQ5I-U0vYUp0VeKPNZehNhqjJvegA3lYE,3342
-datamule/portfolio.py,sha256=JmZlTrom_g7FXKXxWp_CiQTyC7p6_cDP08G0kFUja48,6982
-datamule/submission.py,sha256=JsxYlEz1Ywu6eC32OS15p4p-p8qB6SWd_rXuf2p5UfY,1247
-datamule-1.0.7.dist-info/METADATA,sha256=HY7kDLSjl9RuZoJkpgCBA2ugL0EpzzXMc0S7-4qjcNk,512
-datamule-1.0.7.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-datamule-1.0.7.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
-datamule-1.0.7.dist-info/RECORD,,
File without changes
File without changes