datamule 0.381__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- datamule/__init__.py +46 -86
- datamule/book.py +16 -0
- datamule/config.py +29 -0
- datamule/data/company_former_names.csv +8148 -8148
- datamule/data/company_metadata.csv +10049 -10049
- datamule/data/company_tickers.csv +9999 -10168
- datamule/data/sec-glossary.csv +728 -728
- datamule/data/xbrl_descriptions.csv +10024 -10024
- datamule/document.py +278 -0
- datamule/downloader/downloader.py +374 -0
- datamule/downloader/premiumdownloader.py +335 -0
- datamule/helper.py +123 -136
- datamule/mapping_dicts/txt_mapping_dicts.py +232 -0
- datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
- datamule/monitor.py +238 -0
- datamule/mulebot/__init__.py +1 -1
- datamule/mulebot/helper.py +34 -34
- datamule/mulebot/mulebot.py +129 -129
- datamule/mulebot/mulebot_server/server.py +86 -86
- datamule/mulebot/mulebot_server/static/css/minimalist.css +173 -173
- datamule/mulebot/mulebot_server/static/scripts/artifacts.js +67 -67
- datamule/mulebot/mulebot_server/static/scripts/chat.js +91 -91
- datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +55 -55
- datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +14 -14
- datamule/mulebot/mulebot_server/static/scripts/main.js +56 -56
- datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +26 -26
- datamule/mulebot/mulebot_server/static/scripts/suggestions.js +46 -46
- datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +128 -128
- datamule/mulebot/mulebot_server/static/scripts/utils.js +27 -27
- datamule/mulebot/mulebot_server/templates/chat-minimalist.html +90 -90
- datamule/mulebot/search.py +51 -51
- datamule/mulebot/tools.py +82 -82
- datamule/packageupdater.py +207 -0
- datamule/portfolio.py +106 -0
- datamule/submission.py +76 -0
- datamule-1.0.0.dist-info/METADATA +27 -0
- datamule-1.0.0.dist-info/RECORD +40 -0
- {datamule-0.381.dist-info → datamule-1.0.0.dist-info}/WHEEL +1 -1
- datamule/data/filing_types.csv +0 -485
- datamule/data/ftd_locations.csv +0 -388
- datamule/datamule_api.py +0 -21
- datamule/dataset_builder/_init.py +0 -1
- datamule/dataset_builder/dataset_builder.py +0 -260
- datamule/downloader/__init__.py +0 -0
- datamule/downloader/dropbox_downloader.py +0 -225
- datamule/downloader/ftd.py +0 -216
- datamule/downloader/information_table_13f.py +0 -231
- datamule/downloader/sec_downloader.py +0 -635
- datamule/filing_viewer/__init__.py +0 -1
- datamule/filing_viewer/filing_viewer.py +0 -256
- datamule/global_vars.py +0 -202
- datamule/parser/__init__.py +0 -1
- datamule/parser/basic_10k_parser.py +0 -82
- datamule/parser/basic_10q_parser.py +0 -73
- datamule/parser/basic_13d_parser.py +0 -58
- datamule/parser/basic_13g_parser.py +0 -61
- datamule/parser/basic_8k_parser.py +0 -84
- datamule/parser/company_concepts_parser.py +0 -0
- datamule/parser/form_d_parser.py +0 -70
- datamule/parser/generalized_item_parser.py +0 -78
- datamule/parser/generalized_xml_parser.py +0 -0
- datamule/parser/helper.py +0 -75
- datamule/parser/information_table_parser_13fhr.py +0 -41
- datamule/parser/insider_trading_parser.py +0 -158
- datamule/parser/mappings.py +0 -95
- datamule/parser/n_port_p_parser.py +0 -70
- datamule/parser/sec_parser.py +0 -79
- datamule/parser/sgml_parser.py +0 -180
- datamule/sec_filing.py +0 -126
- datamule/sec_search.py +0 -20
- datamule-0.381.dist-info/METADATA +0 -132
- datamule-0.381.dist-info/RECORD +0 -61
- {datamule-0.381.dist-info → datamule-1.0.0.dist-info}/top_level.txt +0 -0
datamule/downloader/premiumdownloader.py
ADDED
@@ -0,0 +1,335 @@
+import os
+import asyncio
+import aiohttp
+from pathlib import Path
+from tqdm import tqdm
+import time
+import shutil
+import ssl
+import zstandard as zstd
+import io
+import json
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from queue import Queue, Empty
+from threading import Thread
+from secsgml import parse_sgml_submission
+import urllib.parse
+from ..helper import identifier_to_cik
+
+class InsufficientBalanceError(Exception):
+    def __init__(self, required_cost, current_balance, total_urls):
+        self.required_cost = required_cost
+        self.current_balance = current_balance
+        self.total_urls = total_urls
+        message = (f"Insufficient balance. Required: ${required_cost:.4f}, "
+                   f"Current balance: ${current_balance:.4f}, "
+                   f"Total URLs: {total_urls}")
+        super().__init__(message)
+
+class PremiumDownloader:
+    def __init__(self, api_key=None):
+        self.BASE_URL = "https://library.datamule.xyz/original/nc/"
+        self.API_BASE_URL = "https://sec-library.jgfriedman99.workers.dev/"
+        self.CHUNK_SIZE = 2 * 1024 * 1024
+        self.MAX_CONCURRENT_DOWNLOADS = 100
+        self.MAX_DECOMPRESSION_WORKERS = 16
+        self.MAX_PROCESSING_WORKERS = 16
+        self.QUEUE_SIZE = 10
+        if api_key is not None:
+            self._api_key = api_key
+
+    @property
+    def api_key(self):
+        return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')
+
+    @api_key.setter
+    def api_key(self, value):
+        if not value:
+            raise ValueError("API key cannot be empty")
+        self._api_key = value
+
+    def _log_error(self, output_dir, filename, error_msg):
+        error_file = os.path.join(output_dir, 'errors.json')
+        try:
+            if os.path.exists(error_file):
+                with open(error_file, 'r') as f:
+                    errors = json.load(f)
+            else:
+                errors = {}
+
+            errors[filename] = str(error_msg)
+
+            with open(error_file, 'w') as f:
+                json.dump(errors, f, indent=2)
+        except Exception as e:
+            print(f"Failed to log error to {error_file}: {str(e)}")
+
+    async def _fetch_submissions(self, session, submission_type=None, cik=None, filing_date=None, page=1):
+        params = {
+            'api_key': self.api_key,
+            'page': page
+        }
+
+        if submission_type:
+            if isinstance(submission_type, list):
+                params['submission_type'] = ','.join(str(x) for x in submission_type)
+            else:
+                params['submission_type'] = str(submission_type)
+
+        if cik:
+            if isinstance(cik, list):
+                params['cik'] = ','.join(str(x) for x in cik)
+            else:
+                params['cik'] = str(cik)
+
+        if filing_date:
+            if isinstance(filing_date, tuple):
+                params['startdt'] = str(filing_date[0])
+                params['enddt'] = str(filing_date[1])
+            else:
+                if isinstance(filing_date, list):
+                    params['filing_date'] = ','.join(str(x) for x in filing_date)
+                else:
+                    params['filing_date'] = str(filing_date)
+
+        url = f"{self.API_BASE_URL}?{urllib.parse.urlencode(params)}"
+
+        async with session.get(url) as response:
+            data = await response.json()
+            if not data.get('success'):
+                raise ValueError(f"API request failed: {data.get('error')}")
+
+            charges = data['metadata']['billing']['charges']
+            print(f"\nCost: ${charges['results']:.12f} downloads + ${charges['rows_read']:.12f} row reads = ${charges['total']:.12f}")
+            print(f"Balance: ${data['metadata']['billing']['remaining_balance']:.12f}")
+
+            urls = [f"{self.BASE_URL}{str(sub['accession_number']).zfill(18)}.sgml{'.zst' if sub.get('compressed', '').lower() == 'true' else ''}" for sub in data['data']]
+            return urls, data['metadata']['pagination']
+
+    class FileProcessor:
+        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader):
+            self.processing_queue = Queue(maxsize=queue_size)
+            self.should_stop = False
+            self.processing_workers = []
+            self.output_dir = output_dir
+            self.max_workers = max_workers
+            self.batch_size = 10
+            self.pbar = pbar
+            self.downloader = downloader
+
+        def start_processing_workers(self):
+            for _ in range(self.max_workers):
+                worker = Thread(target=self._processing_worker)
+                worker.daemon = True
+                worker.start()
+                self.processing_workers.append(worker)
+
+        def _process_file(self, item):
+            filename, content = item
+            try:
+                parse_sgml_submission(output_dir=self.output_dir, content=content)
+                self.pbar.update(1)
+            except Exception as e:
+                accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
+                if os.path.exists(accession_dir):
+                    shutil.rmtree(accession_dir)
+                self.downloader._log_error(self.output_dir, filename, str(e))
+
+        def _processing_worker(self):
+            batch = []
+            while not self.should_stop:
+                try:
+                    item = self.processing_queue.get(timeout=1)
+                    if item is None:
+                        break
+
+                    batch.append(item)
+
+                    if len(batch) >= self.batch_size or self.processing_queue.empty():
+                        for item in batch:
+                            self._process_file(item)
+                            self.processing_queue.task_done()
+                        batch = []
+
+                except Empty:
+                    if batch:
+                        for item in batch:
+                            self._process_file(item)
+                            self.processing_queue.task_done()
+                        batch = []
+
+        def stop_workers(self):
+            self.should_stop = True
+            for _ in self.processing_workers:
+                self.processing_queue.put(None)
+            for worker in self.processing_workers:
+                worker.join()
+
+    def decompress_stream(self, compressed_chunks, filename, output_dir, processor):
+        dctx = zstd.ZstdDecompressor()
+        try:
+            input_buffer = io.BytesIO(b''.join(compressed_chunks))
+            decompressed_content = io.BytesIO()
+
+            with dctx.stream_reader(input_buffer) as reader:
+                shutil.copyfileobj(reader, decompressed_content)
+
+            content = decompressed_content.getvalue().decode('utf-8')
+            processor.processing_queue.put((filename, content))
+            return True
+
+        except Exception as e:
+            self._log_error(output_dir, filename, f"Decompression error: {str(e)}")
+            return False
+        finally:
+            try:
+                input_buffer.close()
+                decompressed_content.close()
+            except:
+                pass
+
+    def save_regular_file(self, chunks, filename, output_dir, processor):
+        try:
+            content = b''.join(chunks).decode('utf-8')
+            processor.processing_queue.put((filename, content))
+            return True
+
+        except Exception as e:
+            self._log_error(output_dir, filename, f"Error saving file: {str(e)}")
+            return False
+
+    async def download_and_process(self, session, url, semaphore, decompression_pool, output_dir, processor):
+        async with semaphore:
+            chunks = []
+            filename = url.split('/')[-1]
+
+            api_key = self.api_key
+            if not api_key:
+                raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+            try:
+                headers = {
+                    'Connection': 'keep-alive',
+                    'Accept-Encoding': 'gzip, deflate, br',
+                    'Authorization': f'Bearer {api_key}'
+                }
+
+                async with session.get(url, headers=headers) as response:
+                    if response.status == 200:
+                        async for chunk in response.content.iter_chunked(self.CHUNK_SIZE):
+                            chunks.append(chunk)
+
+                        loop = asyncio.get_running_loop()
+                        if filename.endswith('.zst'):
+                            success = await loop.run_in_executor(
+                                decompression_pool,
+                                partial(self.decompress_stream, chunks, filename, output_dir, processor)
+                            )
+                        else:
+                            success = await loop.run_in_executor(
+                                decompression_pool,
+                                partial(self.save_regular_file, chunks, filename, output_dir, processor)
+                            )
+
+                        if not success:
+                            self._log_error(output_dir, filename, "Failed to process file")
+                    elif response.status == 401:
+                        self._log_error(output_dir, filename, "Authentication failed: Invalid API key")
+                        raise ValueError("Invalid API key")
+                    else:
+                        self._log_error(output_dir, filename, f"Download failed: Status {response.status}")
+            except Exception as e:
+                self._log_error(output_dir, filename, str(e))
+
+    async def process_batch(self, urls, output_dir):
+        os.makedirs(output_dir, exist_ok=True)
+
+        with tqdm(total=len(urls), desc="Processing files") as pbar:
+            processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self)
+            processor.start_processing_workers()
+
+            semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
+            decompression_pool = ThreadPoolExecutor(max_workers=self.MAX_DECOMPRESSION_WORKERS)
+
+            connector = aiohttp.TCPConnector(
+                limit=self.MAX_CONCURRENT_DOWNLOADS,
+                force_close=False,
+                ssl=ssl.create_default_context(),
+                ttl_dns_cache=300,
+                keepalive_timeout=60
+            )
+
+            # timeout should be max 2 hours.
+            async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=7200)) as session:
+                tasks = [self.download_and_process(session, url, semaphore, decompression_pool, output_dir, processor) for url in urls]
+                await asyncio.gather(*tasks, return_exceptions=True)
+
+            processor.processing_queue.join()
+            processor.stop_workers()
+            decompression_pool.shutdown()
+
+    async def download_all_pages(self, submission_type=None, cik=None, filing_date=None, output_dir="download"):
+        connector = aiohttp.TCPConnector(ssl=ssl.create_default_context())
+        async with aiohttp.ClientSession(connector=connector) as session:
+            try:
+                urls, pagination = await self._fetch_submissions(session, submission_type=submission_type, cik=cik, filing_date=filing_date, page=1)
+                total_urls = urls.copy()
+                current_page = 1
+
+                while pagination.get('hasMore', False):
+                    current_page += 1
+                    more_urls, pagination = await self._fetch_submissions(session, submission_type=submission_type, cik=cik, filing_date=filing_date, page=current_page)
+                    total_urls.extend(more_urls)
+
+                if total_urls:
+                    total_urls = list(set(total_urls)) # Remove duplicates
+                    start_time = time.time()
+                    await self.process_batch(total_urls, output_dir)
+                    elapsed_time = time.time() - start_time
+                    print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
+                else:
+                    print("No submissions found matching the criteria")
+
+            except InsufficientBalanceError as e:
+                error_msg = {
+                    "error": "insufficient_balance",
+                    "required_cost": e.required_cost,
+                    "current_balance": e.current_balance,
+                    "total_urls": e.total_urls,
+                    "additional_funds_needed": e.required_cost - e.current_balance
+                }
+                self._log_error(output_dir, "balance_check", error_msg)
+                return
+
+    def download_submissions(self, submission_type=None, cik=None, ticker=None, filing_date=None, output_dir="download"):
+        if self.api_key is None:
+            raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+        if filing_date is not None:
+            if isinstance(filing_date, str):
+                filing_date = int(filing_date.replace('-', ''))
+            elif isinstance(filing_date, list):
+                filing_date = [int(x.replace('-', '')) for x in filing_date]
+            elif isinstance(filing_date, tuple):
+                filing_date = (int(filing_date[0].replace('-', '')), int(filing_date[1].replace('-', '')))
+
+        if ticker is not None:
+            cik = identifier_to_cik(ticker)
+
+        if cik is not None:
+            if isinstance(cik, str):
+                cik = [int(cik)]
+            elif isinstance(cik, int):
+                cik = [cik]
+            elif isinstance(cik, list):
+                cik = [int(x) for x in cik]
+
+        async def _download():
+            try:
+                await self.download_all_pages(submission_type=submission_type, cik=cik, filing_date=filing_date, output_dir=output_dir)
+            except Exception as e:
+                if not isinstance(e, InsufficientBalanceError):
+                    self._log_error(output_dir, "download_error", str(e))
+
+        asyncio.run(_download())
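For orientation, here is a minimal usage sketch of the `PremiumDownloader` added above. It assumes the import path implied by the wheel layout (`datamule/downloader/premiumdownloader.py`), a valid `DATAMULE_API_KEY` in the environment (or an `api_key` passed to the constructor), and the `secsgml` dependency; the form type, tickers, and date range are illustrative placeholders, not recommendations.

```python
from datamule.downloader.premiumdownloader import PremiumDownloader

# Sketch only: filter values below are placeholders.
downloader = PremiumDownloader()  # falls back to the DATAMULE_API_KEY env var

downloader.download_submissions(
    submission_type="10-K",                    # a single form type or a list of them
    ticker=["AAPL", "MSFT"],                   # resolved to CIKs via identifier_to_cik
    filing_date=("2023-01-01", "2023-12-31"),  # (start, end); converted to YYYYMMDD ints
    output_dir="filings"                       # parsed SGML output plus errors.json
)
```

`download_submissions` wraps the async pipeline in `asyncio.run`, so it is meant to be called from synchronous code; download, decompression, and parsing failures (including an insufficient balance) are recorded in `errors.json` under `output_dir` rather than raised to the caller.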
datamule/helper.py
CHANGED
@@ -1,136 +1,123 @@
 import requests
 import os
 from tqdm import tqdm
 import zipfile
 from pkg_resources import resource_filename
 import csv
 import re

 # Unused in current implementation.
 def construct_primary_doc_url(cik, accession_number,primary_doc_url):
     accession_number = accession_number.replace("-", "")
     return f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}/{primary_doc_url}"

 # DONE
 def _download_from_dropbox(url, output_path):
     headers = {'user-agent': 'Wget/1.16 (linux-gnu)'}
     r = requests.get(url, stream=True, headers=headers)
     total_size = int(r.headers.get('content-length', 0))

     with open(output_path, 'wb') as f, tqdm(
         desc="Downloading " + os.path.basename(output_path),
         total=total_size,
         unit='iB',
         unit_scale=True,
         unit_divisor=1024,
     ) as progress_bar:
         for chunk in r.iter_content(chunk_size=1024):
             size = f.write(chunk)
             progress_bar.update(size)

     # Check if the downloaded file is a zip file
     if zipfile.is_zipfile(output_path):
         extract_path = os.path.dirname(output_path)
         with zipfile.ZipFile(output_path, 'r') as zip_ref:
             for file_info in zip_ref.infolist():
                 extract_file_path = os.path.join(extract_path, file_info.filename)
                 with zip_ref.open(file_info) as file_in_zip, \
                      open(extract_file_path, 'wb') as output_file, \
                      tqdm(total=file_info.file_size, unit='B', unit_scale=True,
                           desc=f"Extracting {file_info.filename}") as pbar:
                     while True:
                         chunk = file_in_zip.read(8192)
                         if not chunk:
                             break
                         output_file.write(chunk)
                         pbar.update(len(chunk))

         # Remove the zip file after extraction
         os.remove(output_path)
         print(f"Extracted contents to {extract_path}")
     else:
         print(f"Downloaded file is not a zip. Saved to {output_path}")

 # May generalize to load any package resource
 def load_package_csv(name):
     """Load package CSV files"""
     csv_path = resource_filename('datamule', f'data/{name}.csv')
     company_tickers = []

     with open(csv_path, 'r') as csvfile:
         csv_reader = csv.DictReader(csvfile)
         for row in csv_reader:
             company_tickers.append(row)

     return company_tickers

 def load_package_dataset(dataset):
     if dataset == 'company_tickers':
         return load_package_csv('company_tickers')
     elif dataset =='company_former_names':
         return load_package_csv('company_former_names')
     elif dataset =='company_metadata':
         return load_package_csv('company_metadata')
     elif dataset == 'sec_glossary':
         return load_package_csv('sec-glossary')
     elif dataset == 'xbrl_descriptions':
         return load_package_csv('xbrl_descriptions')

 # DONE
 def identifier_to_cik(ticker):
     """Convert company tickers to CIK codes"""
     company_tickers = load_package_csv('company_tickers')
     if ticker:
         if isinstance(ticker, list):
             cik = []
             for t in ticker:
                 cik.extend([company['cik'] for company in company_tickers if t == company['ticker']])
         else:
             cik = [company['cik'] for company in company_tickers if ticker == company['ticker']]

     if not cik:
         raise ValueError("No matching companies found")

     return cik


 def fix_filing_url(url):
-
-
-
-
-
-    if
-    accession_number =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    match_suffix = re.search(r'/(\d{4})\.(.+?)$', url)
+    if match_suffix:
+        suffix_number = match_suffix.group(1)
+        file_ext = match_suffix.group(2)
+        match_accession = re.search(r'/(\d{18})/', url)
+        if match_accession:
+            accession_number = match_accession.group(1)
+            formatted_accession_number = f"{accession_number[:10]}-{accession_number[10:12]}-{accession_number[12:]}"
+            new_url = url.rsplit('/', 1)[0] + f'/{formatted_accession_number}-{suffix_number}.{file_ext}'
+            return new_url
+    return url
+
 def convert_to_dashed_accession(accession):
     # Remove any existing dashes or whitespace
     cleaned = ''.join(accession.split())

     # Check if the cleaned string has 18 characters
     if len(cleaned) != 18:
         raise ValueError("Invalid accession number format. Expected 18 characters.")

     # Insert dashes at the correct positions
     dashed = f"{cleaned[:10]}-{cleaned[10:12]}-{cleaned[12:]}"

     return dashed
+
+headers = {'User-Agent': 'John Smith johnsmith@gmail.com'}