datamule-0.380-py3-none-any.whl → datamule-1.0.0-py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- datamule/__init__.py +46 -86
- datamule/book.py +16 -0
- datamule/config.py +29 -0
- datamule/data/company_former_names.csv +8148 -8148
- datamule/data/company_metadata.csv +10049 -10049
- datamule/data/company_tickers.csv +9999 -10168
- datamule/data/sec-glossary.csv +728 -728
- datamule/data/xbrl_descriptions.csv +10024 -10024
- datamule/document.py +278 -0
- datamule/downloader/downloader.py +374 -0
- datamule/downloader/premiumdownloader.py +335 -0
- datamule/helper.py +123 -136
- datamule/mapping_dicts/txt_mapping_dicts.py +232 -0
- datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
- datamule/monitor.py +238 -0
- datamule/mulebot/__init__.py +1 -1
- datamule/mulebot/helper.py +34 -34
- datamule/mulebot/mulebot.py +129 -129
- datamule/mulebot/mulebot_server/server.py +86 -86
- datamule/mulebot/mulebot_server/static/css/minimalist.css +173 -173
- datamule/mulebot/mulebot_server/static/scripts/artifacts.js +67 -67
- datamule/mulebot/mulebot_server/static/scripts/chat.js +91 -91
- datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +55 -55
- datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +14 -14
- datamule/mulebot/mulebot_server/static/scripts/main.js +56 -56
- datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +26 -26
- datamule/mulebot/mulebot_server/static/scripts/suggestions.js +46 -46
- datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +128 -128
- datamule/mulebot/mulebot_server/static/scripts/utils.js +27 -27
- datamule/mulebot/mulebot_server/templates/chat-minimalist.html +90 -90
- datamule/mulebot/search.py +51 -51
- datamule/mulebot/tools.py +82 -82
- datamule/packageupdater.py +207 -0
- datamule/portfolio.py +106 -0
- datamule/submission.py +76 -0
- datamule-1.0.0.dist-info/METADATA +27 -0
- datamule-1.0.0.dist-info/RECORD +40 -0
- {datamule-0.380.dist-info → datamule-1.0.0.dist-info}/WHEEL +1 -1
- datamule/data/filing_types.csv +0 -485
- datamule/data/ftd_locations.csv +0 -388
- datamule/datamule_api.py +0 -21
- datamule/dataset_builder/_init.py +0 -1
- datamule/dataset_builder/dataset_builder.py +0 -260
- datamule/downloader/__init__.py +0 -0
- datamule/downloader/dropbox_downloader.py +0 -225
- datamule/downloader/ftd.py +0 -216
- datamule/downloader/information_table_13f.py +0 -231
- datamule/downloader/sec_downloader.py +0 -635
- datamule/filing_viewer/__init__.py +0 -1
- datamule/filing_viewer/filing_viewer.py +0 -256
- datamule/global_vars.py +0 -202
- datamule/parser/__init__.py +0 -1
- datamule/parser/basic_10k_parser.py +0 -82
- datamule/parser/basic_10q_parser.py +0 -73
- datamule/parser/basic_13d_parser.py +0 -58
- datamule/parser/basic_13g_parser.py +0 -61
- datamule/parser/basic_8k_parser.py +0 -84
- datamule/parser/company_concepts_parser.py +0 -0
- datamule/parser/form_d_parser.py +0 -70
- datamule/parser/generalized_item_parser.py +0 -78
- datamule/parser/generalized_xml_parser.py +0 -0
- datamule/parser/helper.py +0 -75
- datamule/parser/information_table_parser_13fhr.py +0 -41
- datamule/parser/insider_trading_parser.py +0 -158
- datamule/parser/mappings.py +0 -95
- datamule/parser/n_port_p_parser.py +0 -70
- datamule/parser/sec_parser.py +0 -79
- datamule/parser/sgml_parser.py +0 -180
- datamule/sec_filing.py +0 -126
- datamule/sec_search.py +0 -20
- datamule-0.380.dist-info/METADATA +0 -110
- datamule-0.380.dist-info/RECORD +0 -61
- {datamule-0.380.dist-info → datamule-1.0.0.dist-info}/top_level.txt +0 -0
datamule/downloader/sec_downloader.py (removed in 1.0.0)
@@ -1,635 +0,0 @@
-import asyncio
-import aiohttp
-from aiolimiter import AsyncLimiter
-import os
-from tqdm import tqdm
-from tqdm.asyncio import tqdm as atqdm
-from datetime import datetime, timedelta
-from urllib.parse import urlparse, parse_qs, urlencode
-import math
-import re
-import aiofiles
-import json
-import csv
-from pkg_resources import resource_filename
-
-
-from ..global_vars import headers, dataset_10q_url_list,dataset_10k_url_list
-from ..helper import identifier_to_cik, load_package_csv, fix_filing_url
-from .ftd import get_all_ftd_urls, process_all_ftd_zips
-from .dropbox_downloader import DropboxDownloader
-from .information_table_13f import download_and_process_13f_data
-class RetryException(Exception):
-    def __init__(self, url, retry_after=601):
-        self.url = url
-        self.retry_after = retry_after
-
-class Downloader:
-    def __init__(self):
-        self.headers = headers
-        self.dataset_path = 'datasets'
-        self.domain_limiters = {
-            'www.sec.gov': AsyncLimiter(10, 1),
-            'efts.sec.gov': AsyncLimiter(10, 1),
-            'data.sec.gov': AsyncLimiter(10, 1),
-            'default': AsyncLimiter(10, 1)
-        }
-        self.metadata = None
-
-    def set_limiter(self, domain, rate_limit):
-        self.domain_limiters[domain] = AsyncLimiter(rate_limit, 1)
-
-    def set_headers(self, user_agent):
-        self.headers = {'User-Agent': user_agent}
-
-
-    def get_limiter(self, url):
-        domain = urlparse(url).netloc
-        return self.domain_limiters.get(domain, self.domain_limiters['default'])
-
-    async def _fetch_content_from_url(self, session, url):
-        limiter = self.get_limiter(url)
-        async with limiter:
-            try:
-                async with session.get(url, headers=self.headers) as response:
-                    if response.status == 429:
-                        raise RetryException(url)
-
-                    response.raise_for_status()
-                    return await response.read()
-
-            except aiohttp.ClientResponseError as e:
-                if e.status == 429:
-                    raise RetryException(url)
-                raise
-
-            except Exception as e:
-                print(f"Error downloading {url}: {str(e)}")
-                raise
-
-    async def write_content_to_file(self, content, filepath):
-        """Write content to a file asynchronously."""
-        os.makedirs(os.path.dirname(filepath), exist_ok=True)
-        async with aiofiles.open(filepath, 'wb') as f:
-            await f.write(content)
-
-    def generate_url(self, base_url, params):
-        return f"{base_url}?{urlencode(params)}"
-
-    async def download_file(self, session, url, output_path):
-        """Download a file from a URL and save it to the specified path."""
-        content = await self._fetch_content_from_url(session, url)
-        await self.write_content_to_file(content, output_path)
-        return output_path
-
-    async def _fetch_json_from_url(self, session, url):
-        """Asynchronously fetch JSON data from a URL."""
-        content = await self._fetch_content_from_url(session, url)
-        return json.loads(content)
-
-    async def _download_urls(self, urls, filenames, output_dir):
-        os.makedirs(output_dir, exist_ok=True)
-        async with aiohttp.ClientSession() as session:
-            total_files = len(urls)
-            completed_files = 0
-
-            with tqdm(total=total_files, desc="Downloading files") as pbar:
-                while urls and completed_files < total_files:
-                    tasks = [asyncio.create_task(self.download_file(session, url, os.path.join(output_dir, filename)))
-                             for url, filename in zip(urls, filenames) if filename]
-
-                    rate_limited = False
-                    retry_after = 0
-
-                    pending = tasks
-                    while pending:
-                        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
-
-                        for task in done:
-                            try:
-                                result = await task
-                                completed_files += 1
-                                pbar.update(1)
-                            except RetryException as e:
-                                print(f"\nRate limited for {e.url}. Will retry after {e.retry_after} seconds.")
-                                rate_limited = True
-                                retry_after = max(retry_after, e.retry_after)
-                                break
-                            except Exception as e:
-                                print(f"\nFailed to download: {str(e)}")
-                                completed_files += 1
-                                pbar.update(1)
-
-                        if rate_limited:
-                            break
-
-                    if rate_limited:
-                        for task in pending:
-                            task.cancel()
-
-                        print(f"\nRate limit hit. Sleeping for {retry_after} seconds before retrying.")
-                        await asyncio.sleep(retry_after)
-
-                        # Recreate the list of URLs and filenames that haven't been processed
-                        urls = [task.get_coro().cr_frame.f_locals['url'] for task in pending]
-                        filenames = [filename for url, filename in zip(urls, filenames) if url in urls]
-                    else:
-                        break  # All tasks completed successfully
-
-        print(f"\nSuccessfully downloaded {completed_files} out of {total_files} URLs")
-        return completed_files
-
-    def run_download_urls(self, urls, filenames, output_dir='filings'):
-        """Download a list of URLs to a specified directory"""
-        return asyncio.run(self._download_urls(urls, filenames, output_dir))
-
-
-    async def _number_of_efts_filings(self, session, url):
-        """Get the number of filings from a given EFTS URL asynchronously."""
-        limiter = self.get_limiter(url)
-        async with limiter:
-            try:
-                async with session.get(url, headers=self.headers) as response:
-                    if response.status == 429:
-                        raise RetryException(url)
-                    response.raise_for_status()
-                    data = await response.json()
-                    return sum(bucket['doc_count'] for bucket in data['aggregations']['form_filter']['buckets'])
-            except aiohttp.ClientResponseError as e:
-                if e.status == 429:
-                    raise RetryException(url)
-                raise
-            except Exception as e:
-                print(f"Error fetching number of filings from {url}: {str(e)}")
-                raise
-
-    def _subset_urls(self, full_url, total_filings, target_filings_per_range=1000):
-        """Split an EFTS URL into multiple URLs based on the number of filings."""
-        parsed_url = urlparse(full_url)
-        params = parse_qs(parsed_url.query)
-        start = datetime.strptime(params.get('startdt', [None])[0], "%Y-%m-%d")
-        end = datetime.strptime(params.get('enddt', [None])[0], "%Y-%m-%d")
-
-        if start == end:
-            forms = params.get('forms', [None])[0]
-            if forms == '-0':
-                urls = []
-                for form in ['SC 13G', 'SC 13G/A']:
-                    new_params = params.copy()
-                    new_params['forms'] = [form]
-                    urls.append(parsed_url._replace(query=urlencode(new_params, doseq=True)).geturl())
-                return urls
-            else:
-                return [full_url]
-
-        num_ranges = math.ceil(total_filings / target_filings_per_range)
-        days_per_range = math.ceil((end - start).days / num_ranges)
-
-        urls = []
-        current_start = start
-        for _ in range(num_ranges):
-            current_end = min(current_start + timedelta(days=days_per_range), end)
-            new_params = params.copy()
-            new_params['startdt'] = [current_start.strftime('%Y-%m-%d')]
-            new_params['enddt'] = [current_end.strftime('%Y-%m-%d')]
-            urls.append(parsed_url._replace(query=urlencode(new_params, doseq=True)).geturl())
-            if current_end == end:
-                break
-            current_start = current_end + timedelta(days=1)
-
-        return urls[::-1]
-
-    async def _get_filing_urls_from_efts(self, base_url, sics=None, items=None, file_types=None, save_metadata=False, output_dir=None):
-        """Asynchronously fetch all filing URLs from a given EFTS URL."""
-        urls = []
-        start, page_size = 0, 100
-
-        if save_metadata:
-            metadata_file = os.path.join(output_dir, 'metadata.jsonl')
-            os.makedirs(output_dir, exist_ok=True)
-
-        async with aiohttp.ClientSession() as session:
-            while True:
-                tasks = [self._fetch_json_from_url(session, f"{base_url}&from={start + i * page_size}") for i in range(10)]
-                results = await atqdm.gather(*tasks, desc="Fetching URLs")
-                for data in results:
-                    if data and 'hits' in data:
-                        hits = data['hits']['hits']
-                        if not hits:
-                            return urls
-
-                        for hit in hits:
-                            # Check SIC filter
-                            sic_match = sics is None or any(int(sic) in sics for sic in hit['_source'].get('sics', []))
-
-                            # Check item filter
-                            item_match = items is None or any(item in items for item in hit['_source'].get('items', []))
-
-                            # Check file type filter
-                            file_type_match = file_types is None or hit['_source'].get('file_type') in (file_types if isinstance(file_types, list) else [file_types])
-
-                            if sic_match and item_match and file_type_match:
-                                url = f"https://www.sec.gov/Archives/edgar/data/{hit['_source']['ciks'][0]}/{hit['_id'].split(':')[0].replace('-', '')}/{hit['_id'].split(':')[1]}"
-                                urls.append(url)
-
-                                if save_metadata:
-                                    accession_num = hit['_id'].split(':')[0].replace('-', '')
-                                    metadata = {accession_num: hit}
-                                    async with aiofiles.open(metadata_file, 'a') as f:
-                                        await f.write(json.dumps(metadata) + '\n')
-
-                if start + page_size > data['hits']['total']['value']:
-                    return urls
-                start += 10 * page_size
-        return urls
-
-    async def _conductor(self, efts_url, output_dir, sics, items, file_types, save_metadata=False):
-        """Conduct the download process based on the number of filings."""
-        async with aiohttp.ClientSession() as session:
-            try:
-                total_filings = await self._number_of_efts_filings(session, efts_url)
-            except RetryException as e:
-                print(f"Rate limited when fetching number of filings. Retrying after {e.retry_after} seconds.")
-                await asyncio.sleep(e.retry_after)
-                return await self._conductor(efts_url, output_dir, sics, items, file_types, save_metadata)
-
-        if total_filings < 10000:
-            urls = await self._get_filing_urls_from_efts(efts_url, sics=sics, items=items, file_types=file_types, save_metadata=save_metadata, output_dir=output_dir)
-            print(f"{efts_url}\nTotal filings: {len(urls)}")
-            filenames = [f"{url.split('/')[7]}_{url.split('/')[-1]}" for url in urls]
-            await self._download_urls(urls=urls, filenames=filenames, output_dir=output_dir)
-        else:
-            for subset_url in self._subset_urls(efts_url, total_filings):
-                await self._conductor(efts_url=subset_url, output_dir=output_dir, sics=sics, items=items, file_types=file_types, save_metadata=save_metadata)
-
-
-    def download(self, output_dir='filings', cik=None, ticker=None, form=None,
-                 date=None, sics=None, items=None, file_types=None, save_metadata=False):
-        base_url = "https://efts.sec.gov/LATEST/search-index"
-        params = {}
-
-        if sum(x is not None for x in [cik, ticker]) > 1:
-            raise ValueError('Please provide no more than one identifier: cik or ticker')
-
-        if ticker is not None:
-            cik = identifier_to_cik(ticker)
-
-        if cik:
-            if isinstance(cik, list):
-                formatted_ciks = ','.join(str(c).zfill(10) for c in cik)
-            else:
-                formatted_ciks = str(cik).zfill(10)
-            params['ciks'] = formatted_ciks
-
-        params['forms'] = ','.join(form) if isinstance(form, list) else form if form else "-0"
-
-        if file_types:
-            params['q'] = '-'
-            if isinstance(file_types, list):
-                params['file_type'] = ','.join(file_types)
-            else:
-                params['file_type'] = file_types
-
-        if isinstance(date, list):
-            efts_url_list = [self.generate_url(base_url, {**params, 'startdt': d, 'enddt': d}) for d in date]
-        elif isinstance(date, tuple):
-            efts_url_list = [self.generate_url(base_url, {**params, 'startdt': date[0], 'enddt': date[1]})]
-        else:
-            date_str = date if date else f"2001-01-01,{datetime.now().strftime('%Y-%m-%d')}"
-            efts_url_list = [self.generate_url(base_url, {**params, 'startdt': date_str.split(',')[0], 'enddt': date_str.split(',')[1]})]
-
-        for efts_url in efts_url_list:
-            asyncio.run(self._conductor(efts_url=efts_url, output_dir=output_dir, sics=sics, items=items, file_types=file_types, save_metadata=save_metadata))
-
-    def download_company_concepts(self, output_dir='company_concepts', cik=None, ticker=None):
-        if sum(x is not None for x in [cik, ticker]) > 1:
-            raise ValueError('Please provide no more than one identifier: cik or ticker')
-
-        ciks = None
-        if cik:
-            if isinstance(cik, list):
-                ciks = cik
-            else:
-                ciks = [cik]
-
-        if ticker is not None:
-            ciks = identifier_to_cik(ticker)
-
-        if ciks is None:
-            company_tickers = load_package_csv('company_tickers')
-            ciks = [company['cik'] for company in company_tickers]
-
-        os.makedirs(output_dir, exist_ok=True)
-
-        urls = [f'https://data.sec.gov/api/xbrl/companyfacts/CIK{str(cik).zfill(10)}.json' for cik in ciks]
-        filenames = [f"CIK{str(cik).zfill(10)}.json" for cik in ciks]
-        self.run_download_urls(urls=urls, filenames=filenames, output_dir=output_dir)
-
-
-    def download_dataset(self, dataset, dataset_path='datasets'):
-        if not os.path.exists(dataset_path):
-            os.makedirs(dataset_path)
-
-        if re.match(r"10k_(\d{4})$", dataset):
-            dropbox_downloader = DropboxDownloader()
-            year = int(dataset.split('_')[-1])
-            year_data = next((data for data in dataset_10k_url_list if data['year'] == year), None)
-
-            if year_data:
-                output_dir = os.path.join(dataset_path, f'10K_{year}')
-                os.makedirs(output_dir, exist_ok=True)
-
-                dropbox_downloader.download(urls=year_data['urls'], output_dir=output_dir)
-            else:
-                print(f"No data found for 10Q_{year}")
-        elif dataset == 'ftd':
-            output_dir = os.path.join(dataset_path, 'ftd')
-            urls = get_all_ftd_urls()
-            self.run_download_urls(urls, filenames=[url.split('/')[-1] for url in urls], output_dir=output_dir)
-            process_all_ftd_zips(output_dir)
-        elif dataset == '13f_information_table':
-            output_dir = os.path.join(dataset_path, '13f_information_table')
-            download_and_process_13f_data(self, output_dir)
-
-        elif re.match(r"10q_(\d{4})$", dataset):
-            dropbox_downloader = DropboxDownloader()
-            year = int(dataset.split('_')[-1])
-            year_data = next((data for data in dataset_10q_url_list if data['year'] == year), None)
-
-            if year_data:
-                output_dir = os.path.join(dataset_path, f'10Q_{year}')
-                os.makedirs(output_dir, exist_ok=True)
-
-                dropbox_downloader.download(urls=year_data['urls'], output_dir=output_dir)
-            else:
-                print(f"No data found for 10Q_{year}")
-
-        elif re.match(r"10k_(\d{4})$", dataset):
-            dropbox_downloader = DropboxDownloader()
-            year = int(dataset.split('_')[-1])
-            year_data = next((data for data in dataset_10k_url_list if data['year'] == year), None)
-
-            if year_data:
-                output_dir = os.path.join(dataset_path, f'10K_{year}')
-                os.makedirs(output_dir, exist_ok=True)
-
-                dropbox_downloader.download(urls=year_data['urls'], output_dir=output_dir)
-            else:
-                print(f"No data found for 10K_{year}")
-
-
-
-    async def _watch_efts(self, form=None, cik=None, interval=1, silent=False, callback=None):
-        """Watch the EFTS API for changes in the number of filings."""
-        params = {
-            "startdt": datetime.now().strftime("%Y-%m-%d"),
-            "enddt": datetime.now().strftime("%Y-%m-%d")
-        }
-
-        if form:
-            params["forms"] = ",".join(form) if isinstance(form, list) else form
-        else:
-            params["forms"] = "-0"
-
-        if cik:
-            if isinstance(cik, list):
-                params['ciks'] = ','.join(str(c).zfill(10) for c in cik)
-            else:
-                params['ciks'] = str(cik).zfill(10)
-
-        watch_url = self.generate_url("https://efts.sec.gov/LATEST/search-index", params)
-
-        previous_value = None
-        async with aiohttp.ClientSession() as session:
-            while True:
-                data = await self._fetch_json_from_url(session, watch_url)
-
-                if data:
-                    if not silent:
-                        print(f"URL: {watch_url}")
-
-                    current_value = data['hits']['total']['value']
-
-                    if previous_value is not None and current_value != previous_value:
-                        if not silent:
-                            print(f"Value changed from {previous_value} to {current_value}")
-                        if callback:
-                            callback(data)
-
-                    previous_value = current_value
-                    if not silent:
-                        print(f"Current value: {current_value}. Checking again in {interval} seconds.")
-                else:
-                    print("Error occurred while fetching data.")
-
-                await asyncio.sleep(interval)
-
-    def watch(self, interval=1, silent=True, form=None, cik=None, ticker=None, callback=None):
-        if sum(x is not None for x in [cik, ticker]) > 1:
-            raise ValueError('Please provide no more than one identifier: cik or ticker')
-
-        if ticker:
-            cik = identifier_to_cik(ticker)
-
-        return asyncio.run(self._watch_efts(interval=interval, silent=silent, form=form, cik=cik, callback=callback))
-
-    async def _download_company_metadata(self):
-        # Define file paths
-        metadata_file = resource_filename('datamule', 'data/company_metadata.csv')
-        former_names_file = resource_filename('datamule', 'data/company_former_names.csv')
-
-        # Define temporary file paths
-        temp_metadata_file = metadata_file + '.temp'
-        temp_former_names_file = former_names_file + '.temp'
-
-        metadata_fields = ['cik', 'name', 'entityType', 'sic', 'sicDescription', 'ownerOrg',
-                           'insiderTransactionForOwnerExists', 'insiderTransactionForIssuerExists',
-                           'tickers', 'exchanges', 'ein', 'description', 'website', 'investorWebsite',
-                           'category', 'fiscalYearEnd', 'stateOfIncorporation', 'stateOfIncorporationDescription',
-                           'phone', 'flags', 'mailing_street1', 'mailing_street2', 'mailing_city',
-                           'mailing_stateOrCountry', 'mailing_zipCode', 'mailing_stateOrCountryDescription',
-                           'business_street1', 'business_street2', 'business_city', 'business_stateOrCountry',
-                           'business_zipCode', 'business_stateOrCountryDescription']
-
-        former_names_fields = ['cik', 'former_name', 'from_date', 'to_date']
-
-        company_tickers = load_package_csv('company_tickers')
-
-        async with aiohttp.ClientSession() as session:
-            with open(temp_metadata_file, 'w', newline='') as mf, open(temp_former_names_file, 'w', newline='') as fnf:
-                metadata_writer = csv.DictWriter(mf, fieldnames=metadata_fields)
-                metadata_writer.writeheader()
-
-                former_names_writer = csv.DictWriter(fnf, fieldnames=former_names_fields)
-                former_names_writer.writeheader()
-
-                for company in tqdm(company_tickers, desc="Updating company metadata"):
-                    cik = company['cik']
-                    url = f'https://data.sec.gov/submissions/CIK{cik.zfill(10)}.json'
-
-                    try:
-                        data = await self._fetch_json_from_url(session, url)
-
-                        metadata = {field: data.get(field, '') for field in metadata_fields if field not in ['tickers', 'exchanges']}
-                        metadata['cik'] = cik
-                        metadata['tickers'] = ','.join(data.get('tickers', []))
-                        metadata['exchanges'] = ','.join(data.get('exchanges', []))
-
-                        # Add address information
-                        for address_type in ['mailing', 'business']:
-                            address = data.get('addresses', {}).get(address_type, {})
-                            for key, value in address.items():
-                                metadata[f'{address_type}_{key}'] = value if value is not None else ''
-
-                        metadata_writer.writerow(metadata)
-
-                        for former_name in data.get('formerNames', []):
-                            former_names_writer.writerow({
-                                'cik': cik,
-                                'former_name': former_name['name'],
-                                'from_date': former_name['from'],
-                                'to_date': former_name['to']
-                            })
-
-                    except Exception as e:
-                        print(f"Error processing CIK {cik}: {str(e)}")
-
-        # Now we can safely replace the original files
-
-        try:
-            # Remove original files if they exist
-            if os.path.exists(metadata_file):
-                os.remove(metadata_file)
-            if os.path.exists(former_names_file):
-                os.remove(former_names_file)
-
-            # Rename temp files to original names
-            os.rename(temp_metadata_file, metadata_file)
-            os.rename(temp_former_names_file, former_names_file)
-
-            print(f"Metadata successfully updated in {metadata_file}")
-            print(f"Former names successfully updated in {former_names_file}")
-        except Exception as e:
-            print(f"Error occurred while finalizing file update: {str(e)}")
-            print("Temporary files have been kept. Please manually review and rename if necessary.")
-            return
-
-        # Clean up temp files if they still exist for some reason
-        for temp_file in [temp_metadata_file, temp_former_names_file]:
-            if os.path.exists(temp_file):
-                try:
-                    os.remove(temp_file)
-                except Exception as e:
-                    print(f"Warning: Could not remove temporary file {temp_file}: {str(e)}")
-
-    def update_company_metadata(self):
-        return asyncio.run(self._download_company_metadata())
-
-    async def _download_company_tickers(self):
-        url = 'https://www.sec.gov/files/company_tickers.json'
-
-        # Define file paths
-        json_file = resource_filename('datamule', 'data/company_tickers.json')
-        csv_file = resource_filename('datamule', 'data/company_tickers.csv')
-
-        # Define temporary file paths
-        temp_json_file = json_file + '.temp'
-        temp_csv_file = csv_file + '.temp'
-
-        async with aiohttp.ClientSession() as session:
-            try:
-                content = await self._fetch_content_from_url(session, url)
-
-                # Save the raw JSON file
-                await self.write_content_to_file(content, temp_json_file)
-
-                # Parse the JSON content
-                data = json.loads(content)
-
-                # Convert to CSV
-                with open(temp_csv_file, 'w', newline='') as csvfile:
-                    fieldnames = ['cik', 'ticker', 'title']
-                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-                    writer.writeheader()
-                    for _, company in data.items():
-                        writer.writerow({
-                            'cik': str(company['cik_str']).zfill(10),
-                            'ticker': company['ticker'],
-                            'title': company['title']
-                        })
-
-                # If everything went well, replace the original files
-                if os.path.exists(json_file):
-                    os.remove(json_file)
-                if os.path.exists(csv_file):
-                    os.remove(csv_file)
-
-                os.rename(temp_csv_file, csv_file)
-
-
-                print(f"Company tickers successfully updated in {csv_file}")
-
-            except Exception as e:
-                print(f"Error occurred while updating company tickers: {str(e)}")
-                print("Temporary files have been kept. Please manually review and rename if necessary.")
-                return
-
-            finally:
-                # Clean up temp files if they still exist
-                for temp_file in [temp_json_file, temp_csv_file]:
-                    if os.path.exists(temp_file):
-                        try:
-                            os.remove(temp_file)
-                        except Exception as e:
-                            print(f"Warning: Could not remove temporary file {temp_file}: {str(e)}")
-
-    def update_company_tickers(self):
-        asyncio.run(self._download_company_tickers())
-
-    def load_metadata(self, filepath):
-        metadata = []
-        with open(f"{filepath}/metadata.jsonl", 'r') as f:
-            for line in f:
-                if line.strip():  # Skip empty lines
-                    entry = json.loads(line)
-                    accession_num = next(iter(entry))
-                    doc_id = entry[accession_num]['_id']
-                    acc, filename = doc_id.split(':')
-                    row = {'accession_number': accession_num}
-                    row.update(entry[accession_num]['_source'])
-                    # Create primary doc URL using _id and cik
-                    cik = row['ciks'][0] if row.get('ciks') else ''
-                    acc_clean = acc.replace('-', '')
-                    row['primary_doc_url'] = f"https://www.sec.gov/Archives/edgar/data/{cik}/{acc_clean.zfill(18)}/{filename}"
-                    metadata.append(row)
-        self.metadata = metadata
-
-    def save_metadata_to_csv(self, output_filepath):
-        if not hasattr(self, 'metadata'):
-            return
-
-        fieldnames = {'accession_number', 'primary_doc_url'}  # Start with both required fields
-        max_lengths = {}
-
-        for item in self.metadata:
-            for key, value in item.items():
-                if key not in ['accession_number', 'primary_doc_url'] and isinstance(value, list):
-                    max_lengths[key] = max(max_lengths.get(key, 0), len(value))
-                    fieldnames.update(f"{key}_{i+1}" for i in range(len(value)))
-                else:
-                    fieldnames.add(key)
-
-        with open(output_filepath, 'w', newline='') as f:
-            writer = csv.DictWriter(f, sorted(fieldnames))
-            writer.writeheader()
-
-            for item in self.metadata:
-                row = {'accession_number': item['accession_number'], 'primary_doc_url': item['primary_doc_url']}
-                for key, value in item.items():
-                    if key not in ['accession_number', 'primary_doc_url']:
-                        if isinstance(value, list):
-                            for i, v in enumerate(value):
-                                row[f"{key}_{i+1}"] = v
-                        else:
-                            row[key] = value
-                writer.writerow(row)
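For context, the removed module above defined the single `Downloader` class that drove 0.380's EDGAR full-text-search downloads. Below is a minimal usage sketch based only on the method signatures visible in the hunk; the import path assumes the module layout shown in this diff (the 0.380 `__init__.py` may also have re-exported `Downloader` from the package root), and the ticker, dates, and User-Agent are placeholders.

```python
# Sketch of driving the removed 0.380 Downloader; this is not the 1.0.0 API.
from datamule.downloader.sec_downloader import Downloader  # path as packaged in 0.380

downloader = Downloader()
downloader.set_headers("Example Corp admin@example.com")  # SEC expects an identifying User-Agent
downloader.set_limiter("www.sec.gov", 5)                  # override the default 10 requests/second

# Bulk-download 8-K filings for one ticker over a date range via EDGAR full-text search,
# writing a metadata.jsonl alongside the filings (save_metadata=True).
downloader.download(
    output_dir="filings",
    ticker="AAPL",
    form="8-K",
    date=("2023-01-01", "2023-12-31"),
    save_metadata=True,
)

# XBRL company-facts ("company concepts") JSON for the same ticker.
downloader.download_company_concepts(output_dir="company_concepts", ticker="AAPL")
```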
datamule/filing_viewer/__init__.py (removed in 1.0.0)
@@ -1 +0,0 @@
-from .filing_viewer import create_interactive_filing, create_valid_id
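More generally, the removed downloader's core pattern, a per-domain `aiolimiter.AsyncLimiter` with HTTP 429 surfaced as a `RetryException` so the caller can back off, is a reusable technique independent of this package. A condensed, self-contained sketch of that pattern follows; the URL and User-Agent are placeholders, and the rate limits mirror the 10 requests/second defaults shown in the deleted code.

```python
# Condensed sketch of the throttle-and-retry pattern from the removed sec_downloader.py:
# one AsyncLimiter per domain, 429 responses raised as an exception for the caller to handle.
import asyncio
from urllib.parse import urlparse

import aiohttp
from aiolimiter import AsyncLimiter


class RetryException(Exception):
    def __init__(self, url, retry_after=601):
        self.url = url
        self.retry_after = retry_after


LIMITERS = {
    "www.sec.gov": AsyncLimiter(10, 1),  # 10 requests per 1-second window
    "default": AsyncLimiter(10, 1),
}


async def fetch(session, url, headers):
    # Pick the limiter for this domain, falling back to the default.
    limiter = LIMITERS.get(urlparse(url).netloc, LIMITERS["default"])
    async with limiter:
        async with session.get(url, headers=headers) as response:
            if response.status == 429:
                raise RetryException(url)  # caller decides how long to sleep
            response.raise_for_status()
            return await response.read()


async def main():
    headers = {"User-Agent": "Example Corp admin@example.com"}  # placeholder identity
    async with aiohttp.ClientSession() as session:
        content = await fetch(session, "https://www.sec.gov/files/company_tickers.json", headers)
        print(len(content), "bytes downloaded")


if __name__ == "__main__":
    asyncio.run(main())
```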
|