datamule 0.380-py3-none-any.whl → 1.0.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- datamule/__init__.py +46 -86
- datamule/book.py +16 -0
- datamule/config.py +29 -0
- datamule/data/company_former_names.csv +8148 -8148
- datamule/data/company_metadata.csv +10049 -10049
- datamule/data/company_tickers.csv +9999 -10168
- datamule/data/sec-glossary.csv +728 -728
- datamule/data/xbrl_descriptions.csv +10024 -10024
- datamule/document.py +278 -0
- datamule/downloader/downloader.py +374 -0
- datamule/downloader/premiumdownloader.py +335 -0
- datamule/helper.py +123 -136
- datamule/mapping_dicts/txt_mapping_dicts.py +232 -0
- datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
- datamule/monitor.py +238 -0
- datamule/mulebot/__init__.py +1 -1
- datamule/mulebot/helper.py +34 -34
- datamule/mulebot/mulebot.py +129 -129
- datamule/mulebot/mulebot_server/server.py +86 -86
- datamule/mulebot/mulebot_server/static/css/minimalist.css +173 -173
- datamule/mulebot/mulebot_server/static/scripts/artifacts.js +67 -67
- datamule/mulebot/mulebot_server/static/scripts/chat.js +91 -91
- datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +55 -55
- datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +14 -14
- datamule/mulebot/mulebot_server/static/scripts/main.js +56 -56
- datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +26 -26
- datamule/mulebot/mulebot_server/static/scripts/suggestions.js +46 -46
- datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +128 -128
- datamule/mulebot/mulebot_server/static/scripts/utils.js +27 -27
- datamule/mulebot/mulebot_server/templates/chat-minimalist.html +90 -90
- datamule/mulebot/search.py +51 -51
- datamule/mulebot/tools.py +82 -82
- datamule/packageupdater.py +207 -0
- datamule/portfolio.py +106 -0
- datamule/submission.py +76 -0
- datamule-1.0.0.dist-info/METADATA +27 -0
- datamule-1.0.0.dist-info/RECORD +40 -0
- {datamule-0.380.dist-info → datamule-1.0.0.dist-info}/WHEEL +1 -1
- datamule/data/filing_types.csv +0 -485
- datamule/data/ftd_locations.csv +0 -388
- datamule/datamule_api.py +0 -21
- datamule/dataset_builder/_init.py +0 -1
- datamule/dataset_builder/dataset_builder.py +0 -260
- datamule/downloader/__init__.py +0 -0
- datamule/downloader/dropbox_downloader.py +0 -225
- datamule/downloader/ftd.py +0 -216
- datamule/downloader/information_table_13f.py +0 -231
- datamule/downloader/sec_downloader.py +0 -635
- datamule/filing_viewer/__init__.py +0 -1
- datamule/filing_viewer/filing_viewer.py +0 -256
- datamule/global_vars.py +0 -202
- datamule/parser/__init__.py +0 -1
- datamule/parser/basic_10k_parser.py +0 -82
- datamule/parser/basic_10q_parser.py +0 -73
- datamule/parser/basic_13d_parser.py +0 -58
- datamule/parser/basic_13g_parser.py +0 -61
- datamule/parser/basic_8k_parser.py +0 -84
- datamule/parser/company_concepts_parser.py +0 -0
- datamule/parser/form_d_parser.py +0 -70
- datamule/parser/generalized_item_parser.py +0 -78
- datamule/parser/generalized_xml_parser.py +0 -0
- datamule/parser/helper.py +0 -75
- datamule/parser/information_table_parser_13fhr.py +0 -41
- datamule/parser/insider_trading_parser.py +0 -158
- datamule/parser/mappings.py +0 -95
- datamule/parser/n_port_p_parser.py +0 -70
- datamule/parser/sec_parser.py +0 -79
- datamule/parser/sgml_parser.py +0 -180
- datamule/sec_filing.py +0 -126
- datamule/sec_search.py +0 -20
- datamule-0.380.dist-info/METADATA +0 -110
- datamule-0.380.dist-info/RECORD +0 -61
- {datamule-0.380.dist-info → datamule-1.0.0.dist-info}/top_level.txt +0 -0
datamule/document.py
ADDED
@@ -0,0 +1,278 @@
import json
import csv
from .helper import convert_to_dashed_accession
import re
from doc2dict import xml2dict, txt2dict
from doc2dict.mapping import flatten_hierarchy
from .mapping_dicts import txt_mapping_dicts
from .mapping_dicts import xml_mapping_dicts
from selectolax.parser import HTMLParser

class Document:
    def __init__(self, type, filename):
        self.type = type
        self.path = filename

        self.data = None
        self.content = None

    def load_content(self, encoding='utf-8'):
        with open(self.path, 'r', encoding=encoding) as f:
            self.content = f.read()

    def _load_text_content(self):
        with open(self.path) as f:
            return f.read().translate(str.maketrans({
                '\xa0': ' ', '\u2003': ' ',
                '\u2018': "'", '\u2019': "'",
                '\u201c': '"', '\u201d': '"'
            }))

    # will deprecate this when we add html2dict
    def _load_html_content(self):
        with open(self.path, 'rb') as f:
            parser = HTMLParser(f.read(), detect_encoding=True, decode_errors='ignore')

        # Remove hidden elements first
        hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
        for node in hidden_nodes:
            node.decompose()

        blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
        lines = []
        current_line = []

        def flush_line():
            if current_line:
                # Don't add spaces between adjacent spans
                lines.append(''.join(current_line))
                current_line.clear()

        for node in parser.root.traverse(include_text=True):
            if node.tag in ('script', 'style', 'css'):
                continue

            if node.tag in blocks:
                flush_line()
                lines.append('')

            if node.text_content:
                text = node.text_content.strip()
                if text:
                    if node.tag in blocks:
                        flush_line()
                        lines.append(text)
                        lines.append('')
                    else:
                        # Only add space if nodes aren't directly adjacent
                        if current_line and not current_line[-1].endswith(' '):
                            if node.prev and node.prev.text_content:
                                if node.parent != node.prev.parent or node.prev.next != node:
                                    current_line.append(' ')
                        current_line.append(text)

        flush_line()

        text = '\n'.join(lines)
        while '\n\n\n' in text:
            text = text.replace('\n\n\n', '\n\n')

        return text.translate(str.maketrans({
            '\xa0': ' ', '\u2003': ' ',
            '\u2018': "'", '\u2019': "'",
            '\u201c': '"', '\u201d': '"'
        }))

    def _load_file_content(self):
        if self.path.suffix == '.txt':
            self.content = self._load_text_content()
        elif self.path.suffix in ['.html', '.htm']:
            self.content = self._load_html_content()
        else:
            raise ValueError(f"Unsupported file type: {self.path.suffix}")

    def contains_string(self, pattern):
        """Currently only works for .htm, .html, and .txt files"""
        if self.path.suffix in ['.htm', '.html', '.txt']:
            if self.content is None:
                self._load_file_content()
            return bool(re.search(pattern, self.content))
        return False

    # Note: this method will be heavily modified in the future
    def parse(self):
        mapping_dict = None

        if self.path.suffix == '.xml':
            if self.type in ['3', '4', '5']:
                mapping_dict = xml_mapping_dicts.dict_345

            self.load_content()
            self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
        # will deprecate this when we add html2dict
        elif self.path.suffix in ['.htm', '.html', '.txt']:
            self._load_file_content()

            if self.type == '10-K':
                mapping_dict = txt_mapping_dicts.dict_10k
            elif self.type == '10-Q':
                mapping_dict = txt_mapping_dicts.dict_10q
            elif self.type == '8-K':
                mapping_dict = txt_mapping_dicts.dict_8k
            elif self.type == 'SC 13D':
                mapping_dict = txt_mapping_dicts.dict_13d
            elif self.type == 'SC 13G':
                mapping_dict = txt_mapping_dicts.dict_13g

            self.data = txt2dict(content=self.content, mapping_dict=mapping_dict)
        return self.data

    def write_json(self, output_filename=None):
        if not self.data:
            self.parse()

        if output_filename is None:
            output_filename = f"{self.path.rsplit('.', 1)[0]}.json"

        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(self.data, f, indent=2)

    def write_csv(self, output_filename=None, accession_number=None):
        self.parse()

        if output_filename is None:
            output_filename = f"{self.path.rsplit('.', 1)[0]}.csv"

        with open(output_filename, 'w', newline='') as csvfile:
            if not self.data:
                return output_filename

            has_document = any('document' in item for item in self.data)

            if has_document and 'document' in self.data:
                writer = csv.DictWriter(csvfile, ['section', 'text'], quoting=csv.QUOTE_ALL)
                writer.writeheader()
                flattened = self._flatten_dict(self.data['document'])
                for section, text in flattened.items():
                    writer.writerow({'section': section, 'text': text})
            else:
                fieldnames = list(self.data[0].keys())
                if accession_number:
                    fieldnames.append('Accession Number')
                writer = csv.DictWriter(csvfile, fieldnames, quoting=csv.QUOTE_ALL)
                writer.writeheader()
                for row in self.data:
                    if accession_number:
                        row['Accession Number'] = convert_to_dashed_accession(accession_number)
                    writer.writerow(row)

        return output_filename

    def _document_to_section_text(self, document_data, parent_key=''):
        items = []

        if isinstance(document_data, dict):
            for key, value in document_data.items():
                # Build the section name
                section = f"{parent_key}_{key}" if parent_key else key

                # If the value is a dict, recurse
                if isinstance(value, dict):
                    items.extend(self._document_to_section_text(value, section))
                # If it's a list, handle each item
                elif isinstance(value, list):
                    for i, item in enumerate(value):
                        if isinstance(item, dict):
                            items.extend(self._document_to_section_text(item, f"{section}_{i+1}"))
                        else:
                            items.append({
                                'section': f"{section}_{i+1}",
                                'text': str(item)
                            })
                # Base case - add the item
                else:
                    items.append({
                        'section': section,
                        'text': str(value)
                    })

        return items

    # we'll modify this for every dict
    def _flatten_dict(self, d, parent_key=''):
        items = {}

        if isinstance(d, list):
            return [self._flatten_dict(item) for item in d]

        for k, v in d.items():
            new_key = f"{parent_key}_{k}" if parent_key else k

            if isinstance(v, dict):
                items.update(self._flatten_dict(v, new_key))
            else:
                items[new_key] = str(v)

        return items

    # this will all have to be changed. default will be to flatten everything
    def __iter__(self):
        if not self.data:
            self.parse()

        # Let's remove XML iterable for now

        # Handle text-based documents
        if self.path.suffix in ['.txt', '.htm', '.html']:
            document_data = self.data
            if not document_data:
                return iter([])

            # Find highest hierarchy level from mapping dict
            highest_hierarchy = float('inf')
            section_type = None

            if self.type in ['10-K', '10-Q']:
                mapping_dict = txt_mapping_dicts.dict_10k if self.type == '10-K' else txt_mapping_dicts.dict_10q
            elif self.type == '8-K':
                mapping_dict = txt_mapping_dicts.dict_8k
            elif self.type == 'SC 13D':
                mapping_dict = txt_mapping_dicts.dict_13d
            elif self.type == 'SC 13G':
                mapping_dict = txt_mapping_dicts.dict_13g
            else:
                return iter([])

            # Find section type with highest hierarchy number
            highest_hierarchy = -1  # Start at -1 to find highest
            for mapping in mapping_dict['rules']['mappings']:
                if mapping.get('hierarchy') is not None:
                    if mapping['hierarchy'] > highest_hierarchy:
                        highest_hierarchy = mapping['hierarchy']
                        section_type = mapping['name']

            if not section_type:
                return iter([])

            # Extract sections of the identified type
            def find_sections(data, target_type):
                sections = []
                if isinstance(data, dict):
                    if data.get('type') == target_type:
                        sections.append({
                            'item': data.get('text', ''),
                            'text': flatten_hierarchy(data.get('content', []))
                        })
                    for value in data.values():
                        if isinstance(value, (dict, list)):
                            sections.extend(find_sections(value, target_type))
                elif isinstance(data, list):
                    for item in data:
                        sections.extend(find_sections(item, target_type))
                return sections

            return iter(find_sections(document_data, section_type))

        return iter([])
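
A minimal usage sketch for the Document class above, not taken from the package's own documentation: the filing path is hypothetical, the import path simply follows the file's location in this diff, and a pathlib.Path is assumed because the class reads self.path.suffix throughout.

    from pathlib import Path
    from datamule.document import Document

    # Hypothetical local 10-K saved as HTML; `type` selects the txt mapping dict in parse().
    doc = Document(type='10-K', filename=Path('filings/aapl-10k.htm'))

    data = doc.parse()        # dict produced by txt2dict with dict_10k
    for section in doc:       # yields {'item': ..., 'text': ...} for each top-level item
        print(section['item'])
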
datamule/downloader/downloader.py
ADDED
@@ -0,0 +1,374 @@
import asyncio
import aiohttp
import os
from tqdm import tqdm
from datetime import datetime
from urllib.parse import urlencode
import aiofiles
import json
import time
from collections import deque

from ..helper import identifier_to_cik, load_package_csv, fix_filing_url, headers
from secsgml import parse_sgml_submission

class RetryException(Exception):
    def __init__(self, url, retry_after=601):
        self.url = url
        self.retry_after = retry_after

class PreciseRateLimiter:
    def __init__(self, rate, interval=1.0):
        self.rate = rate  # requests per interval
        self.interval = interval  # in seconds
        self.token_time = self.interval / self.rate  # time per token
        self.last_time = time.time()
        self.lock = asyncio.Lock()

    async def acquire(self):
        async with self.lock:
            now = time.time()
            wait_time = self.last_time + self.token_time - now
            if wait_time > 0:
                await asyncio.sleep(wait_time)
            self.last_time = time.time()
            return True

    async def __aenter__(self):
        await self.acquire()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        pass

class RateMonitor:
    def __init__(self, window_size=1.0):
        self.window_size = window_size
        self.requests = deque()
        self._lock = asyncio.Lock()

    async def add_request(self, size_bytes):
        async with self._lock:
            now = time.time()
            self.requests.append((now, size_bytes))
            while self.requests and self.requests[0][0] < now - self.window_size:
                self.requests.popleft()

    def get_current_rates(self):
        now = time.time()
        while self.requests and self.requests[0][0] < now - self.window_size:
            self.requests.popleft()

        if not self.requests:
            return 0, 0

        request_count = len(self.requests)
        byte_count = sum(size for _, size in self.requests)

        requests_per_second = request_count / self.window_size
        mb_per_second = (byte_count / 1024 / 1024) / self.window_size

        return round(requests_per_second, 1), round(mb_per_second, 2)

class Downloader:
    def __init__(self):
        self.headers = headers
        self.limiter = PreciseRateLimiter(5)  # 5 requests per second
        self.session = None
        self.parse_filings = True
        self.download_queue = asyncio.Queue()
        self.rate_monitor = RateMonitor()
        self.current_pbar = None
        self.connection_semaphore = asyncio.Semaphore(5)

    def update_progress_description(self):
        if self.current_pbar:
            reqs_per_sec, mb_per_sec = self.rate_monitor.get_current_rates()
            self.current_pbar.set_description(
                f"Progress [Rate: {reqs_per_sec}/s | {mb_per_sec} MB/s]"
            )

    async def __aenter__(self):
        await self._init_session()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self._close()

    async def _init_session(self):
        if not self.session:
            self.session = aiohttp.ClientSession(headers=self.headers)

    async def _close(self):
        if self.session:
            await self.session.close()
            self.session = None

    async def _fetch_json(self, url):
        """Fetch JSON with rate monitoring."""
        async with self.limiter:
            try:
                url = fix_filing_url(url)
                async with self.session.get(url) as response:
                    if response.status == 429:
                        raise RetryException(url)
                    response.raise_for_status()
                    content = await response.read()
                    await self.rate_monitor.add_request(len(content))
                    self.update_progress_description()
                    return await response.json()
            except aiohttp.ClientResponseError as e:
                if e.status == 429:
                    raise RetryException(url)
                raise

    async def _get_filing_urls_from_efts(self, base_url, submission_type=None):
        """Fetch filing URLs from EFTS in batches with form type filtering."""
        start = 0
        page_size = 100
        urls = []

        data = await self._fetch_json(f"{base_url}&from=0&size=1")
        if not data or 'hits' not in data:
            return []

        total_hits = data['hits']['total']['value']
        if not total_hits:
            return []

        pbar = tqdm(total=total_hits, desc="Fetching URLs [Rate: 0/s | 0 MB/s]")
        self.current_pbar = pbar

        while start < total_hits:
            try:
                tasks = [
                    self._fetch_json(f"{base_url}&from={start + i * page_size}&size={page_size}")
                    for i in range(10)
                ]

                results = await asyncio.gather(*tasks)

                for data in results:
                    if data and 'hits' in data:
                        hits = data['hits']['hits']
                        if hits:
                            # Filter hits based on exact form match
                            if not submission_type or submission_type == "-0":
                                filtered_hits = hits
                            else:
                                requested_forms = [submission_type] if isinstance(submission_type, str) else submission_type
                                filtered_hits = [
                                    hit for hit in hits
                                    if hit['_source'].get('form', '') in requested_forms
                                ]

                            batch_urls = [
                                f"https://www.sec.gov/Archives/edgar/data/{hit['_source']['ciks'][0]}/{hit['_id'].split(':')[0]}.txt"
                                for hit in filtered_hits
                            ]
                            urls.extend(batch_urls)
                            pbar.update(len(hits))  # Update progress based on total hits processed
                            self.update_progress_description()

                start += 10 * page_size

            except RetryException as e:
                print(f"\nRate limited. Sleeping for {e.retry_after} seconds...")
                await asyncio.sleep(e.retry_after)
                continue
            except Exception as e:
                print(f"\nError fetching URLs batch at {start}: {str(e)}")
                break

        pbar.close()
        self.current_pbar = None
        return urls

    async def _download_file(self, url, filepath):
        """Download single file with precise rate limiting."""
        async with self.connection_semaphore:
            async with self.limiter:
                try:
                    url = fix_filing_url(url)
                    async with self.session.get(url) as response:
                        if response.status == 429:
                            raise RetryException(url)
                        response.raise_for_status()
                        content = await response.read()
                        await self.rate_monitor.add_request(len(content))
                        self.update_progress_description()

                    parsed_data = None
                    if self.parse_filings:
                        try:
                            os.makedirs(os.path.dirname(filepath), exist_ok=True)
                            async with aiofiles.open(filepath, 'wb') as f:
                                await f.write(content)

                            parsed_data = parse_sgml_submission(
                                content=content.decode(),
                                output_dir=os.path.dirname(filepath)
                            )

                            try:
                                os.remove(filepath)
                            except Exception as e:
                                print(f"\nError deleting original file {filepath}: {str(e)}")

                        except Exception as e:
                            print(f"\nError parsing {url}: {str(e)}")
                            try:
                                os.remove(filepath)
                                parsed_dir = os.path.dirname(filepath) + f'/{url.split("/")[-1].split(".")[0].replace("-", "")}'
                                if os.path.exists(parsed_dir):
                                    import shutil
                                    shutil.rmtree(parsed_dir)
                            except Exception as e:
                                print(f"\nError cleaning up files for {url}: {str(e)}")
                    else:
                        os.makedirs(os.path.dirname(filepath), exist_ok=True)
                        async with aiofiles.open(filepath, 'wb') as f:
                            await f.write(content)

                    return filepath, parsed_data

                except Exception as e:
                    print(f"\nError downloading {url}: {str(e)}")
                    return None

    async def _download_worker(self, pbar):
        """Worker to process download queue."""
        while True:
            try:
                url, filepath = await self.download_queue.get()
                result = await self._download_file(url, filepath)
                if result:
                    pbar.update(1)
                self.download_queue.task_done()
            except asyncio.CancelledError:
                break
            except Exception as e:
                print(f"\nWorker error processing {url}: {str(e)}")
                self.download_queue.task_done()

    async def _download_and_process(self, urls, output_dir):
        """Queue-based download processing."""
        results = []
        parsed_results = []

        pbar = tqdm(total=len(urls), desc="Downloading files [Rate: 0/s | 0 MB/s]")
        self.current_pbar = pbar

        for url in urls:
            filename = url.split('/')[-1]
            filepath = os.path.join(output_dir, filename)
            await self.download_queue.put((url, filepath))

        workers = [asyncio.create_task(self._download_worker(pbar))
                   for _ in range(5)]  # Match number of workers to semaphore

        await self.download_queue.join()

        for worker in workers:
            worker.cancel()

        await asyncio.gather(*workers, return_exceptions=True)

        pbar.close()
        self.current_pbar = None
        return results, parsed_results

    def download_submissions(self, output_dir='filings', cik=None, ticker=None, submission_type=None, filing_date=None, parse=True):
        """Main method to download SEC filings."""
        self.parse_filings = parse

        async def _download():
            async with self as downloader:
                if ticker is not None:
                    cik_value = identifier_to_cik(ticker)
                else:
                    cik_value = cik

                params = {}
                if cik_value:
                    if isinstance(cik_value, list):
                        params['ciks'] = ','.join(str(c).zfill(10) for c in cik_value)
                    else:
                        params['ciks'] = str(cik_value).zfill(10)

                params['forms'] = ','.join(submission_type) if isinstance(submission_type, list) else submission_type if submission_type else "-0"

                if isinstance(filing_date, list):
                    dates = [(d, d) for d in filing_date]
                elif isinstance(filing_date, tuple):
                    dates = [filing_date]
                else:
                    date_str = filing_date if filing_date else f"2001-01-01,{datetime.now().strftime('%Y-%m-%d')}"
                    start, end = date_str.split(',')
                    dates = [(start, end)]

                all_filepaths = []
                all_parsed_data = []

                for start_date, end_date in dates:
                    params['startdt'] = start_date
                    params['enddt'] = end_date
                    base_url = "https://efts.sec.gov/LATEST/search-index"
                    efts_url = f"{base_url}?{urlencode(params, doseq=True)}"

                    urls = await self._get_filing_urls_from_efts(efts_url, submission_type)
                    if urls:
                        filepaths, parsed_data = await self._download_and_process(urls, output_dir)
                        all_filepaths.extend(filepaths)
                        all_parsed_data.extend(parsed_data)

                return all_filepaths, all_parsed_data

        return asyncio.run(_download())

    def download_company_concepts(self, output_dir='company_concepts', cik=None, ticker=None):
        """Download company concept data."""
        async def _download_concepts():
            async with self as downloader:
                if ticker is not None:
                    ciks = identifier_to_cik(ticker)
                elif cik:
                    ciks = [cik] if not isinstance(cik, list) else cik
                else:
                    company_tickers = load_package_csv('company_tickers')
                    ciks = [company['cik'] for company in company_tickers]

                os.makedirs(output_dir, exist_ok=True)
                urls = [f'https://data.sec.gov/api/xbrl/companyfacts/CIK{str(cik).zfill(10)}.json' for cik in ciks]

                pbar = tqdm(total=len(urls), desc="Downloading concepts [Rate: 0/s | 0 MB/s]")
                self.current_pbar = pbar

                for url in urls:
                    filename = url.split('/')[-1]
                    filepath = os.path.join(output_dir, filename)
                    await self.download_queue.put((url, filepath))

                workers = [asyncio.create_task(self._download_worker(pbar))
                           for _ in range(5)]

                await self.download_queue.join()

                for worker in workers:
                    worker.cancel()

                await asyncio.gather(*workers, return_exceptions=True)

                pbar.close()
                self.current_pbar = None

                results = []
                for url in urls:
                    filename = url.split('/')[-1]
                    filepath = os.path.join(output_dir, filename)
                    if os.path.exists(filepath):
                        results.append(filepath)

                return results

        return asyncio.run(_download_concepts())
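
A similarly hedged usage sketch for the Downloader above. The import path follows the file's location in this diff, the ticker, form type, and date range are illustrative values only, and the unpacked return mirrors the (all_filepaths, all_parsed_data) pair the code constructs.

    from datamule.downloader.downloader import Downloader

    downloader = Downloader()

    # All 8-K submissions filed by a ticker during 2023, parsed via secsgml.
    filepaths, parsed = downloader.download_submissions(
        output_dir='filings',
        ticker='TSLA',
        submission_type='8-K',
        filing_date=('2023-01-01', '2023-12-31'),
        parse=True,
    )

    # XBRL company facts for a single CIK.
    concept_files = downloader.download_company_concepts(
        output_dir='company_concepts',
        cik=1318605,
    )
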