datamule-0.380-py3-none-any.whl → datamule-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. datamule/__init__.py +46 -86
  2. datamule/book.py +16 -0
  3. datamule/config.py +29 -0
  4. datamule/data/company_former_names.csv +8148 -8148
  5. datamule/data/company_metadata.csv +10049 -10049
  6. datamule/data/company_tickers.csv +9999 -10168
  7. datamule/data/sec-glossary.csv +728 -728
  8. datamule/data/xbrl_descriptions.csv +10024 -10024
  9. datamule/document.py +278 -0
  10. datamule/downloader/downloader.py +374 -0
  11. datamule/downloader/premiumdownloader.py +335 -0
  12. datamule/helper.py +123 -136
  13. datamule/mapping_dicts/txt_mapping_dicts.py +232 -0
  14. datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
  15. datamule/monitor.py +238 -0
  16. datamule/mulebot/__init__.py +1 -1
  17. datamule/mulebot/helper.py +34 -34
  18. datamule/mulebot/mulebot.py +129 -129
  19. datamule/mulebot/mulebot_server/server.py +86 -86
  20. datamule/mulebot/mulebot_server/static/css/minimalist.css +173 -173
  21. datamule/mulebot/mulebot_server/static/scripts/artifacts.js +67 -67
  22. datamule/mulebot/mulebot_server/static/scripts/chat.js +91 -91
  23. datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +55 -55
  24. datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +14 -14
  25. datamule/mulebot/mulebot_server/static/scripts/main.js +56 -56
  26. datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +26 -26
  27. datamule/mulebot/mulebot_server/static/scripts/suggestions.js +46 -46
  28. datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +128 -128
  29. datamule/mulebot/mulebot_server/static/scripts/utils.js +27 -27
  30. datamule/mulebot/mulebot_server/templates/chat-minimalist.html +90 -90
  31. datamule/mulebot/search.py +51 -51
  32. datamule/mulebot/tools.py +82 -82
  33. datamule/packageupdater.py +207 -0
  34. datamule/portfolio.py +106 -0
  35. datamule/submission.py +76 -0
  36. datamule-1.0.0.dist-info/METADATA +27 -0
  37. datamule-1.0.0.dist-info/RECORD +40 -0
  38. {datamule-0.380.dist-info → datamule-1.0.0.dist-info}/WHEEL +1 -1
  39. datamule/data/filing_types.csv +0 -485
  40. datamule/data/ftd_locations.csv +0 -388
  41. datamule/datamule_api.py +0 -21
  42. datamule/dataset_builder/_init.py +0 -1
  43. datamule/dataset_builder/dataset_builder.py +0 -260
  44. datamule/downloader/__init__.py +0 -0
  45. datamule/downloader/dropbox_downloader.py +0 -225
  46. datamule/downloader/ftd.py +0 -216
  47. datamule/downloader/information_table_13f.py +0 -231
  48. datamule/downloader/sec_downloader.py +0 -635
  49. datamule/filing_viewer/__init__.py +0 -1
  50. datamule/filing_viewer/filing_viewer.py +0 -256
  51. datamule/global_vars.py +0 -202
  52. datamule/parser/__init__.py +0 -1
  53. datamule/parser/basic_10k_parser.py +0 -82
  54. datamule/parser/basic_10q_parser.py +0 -73
  55. datamule/parser/basic_13d_parser.py +0 -58
  56. datamule/parser/basic_13g_parser.py +0 -61
  57. datamule/parser/basic_8k_parser.py +0 -84
  58. datamule/parser/company_concepts_parser.py +0 -0
  59. datamule/parser/form_d_parser.py +0 -70
  60. datamule/parser/generalized_item_parser.py +0 -78
  61. datamule/parser/generalized_xml_parser.py +0 -0
  62. datamule/parser/helper.py +0 -75
  63. datamule/parser/information_table_parser_13fhr.py +0 -41
  64. datamule/parser/insider_trading_parser.py +0 -158
  65. datamule/parser/mappings.py +0 -95
  66. datamule/parser/n_port_p_parser.py +0 -70
  67. datamule/parser/sec_parser.py +0 -79
  68. datamule/parser/sgml_parser.py +0 -180
  69. datamule/sec_filing.py +0 -126
  70. datamule/sec_search.py +0 -20
  71. datamule-0.380.dist-info/METADATA +0 -110
  72. datamule-0.380.dist-info/RECORD +0 -61
  73. {datamule-0.380.dist-info → datamule-1.0.0.dist-info}/top_level.txt +0 -0
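
The summary above is generated from the file lists of the two wheels. For readers who want to reproduce a listing like this locally, here is a minimal sketch (not part of either package) that compares the contents of the two archives; it assumes both .whl files have already been downloaded into the working directory and relies only on the fact that a wheel is a plain ZIP archive:

import zipfile

# Hypothetical local copies of the two wheels being compared.
OLD_WHEEL = "datamule-0.380-py3-none-any.whl"
NEW_WHEEL = "datamule-1.0.0-py3-none-any.whl"

def wheel_files(path):
    # A wheel is a ZIP archive; namelist() returns every packaged file path.
    with zipfile.ZipFile(path) as zf:
        return set(zf.namelist())

old_files = wheel_files(OLD_WHEEL)
new_files = wheel_files(NEW_WHEEL)

print("Added in 1.0.0:")
for name in sorted(new_files - old_files):
    print(f"  {name}")

print("Removed since 0.380:")
for name in sorted(old_files - new_files):
    print(f"  {name}")

print("Present in both (content may still differ):")
for name in sorted(old_files & new_files):
    print(f"  {name}")

This only reports added and removed paths; the per-line counts shown above additionally require diffing the contents of files present in both wheels.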
datamule/downloader/sec_downloader.py (deleted)
@@ -1,635 +0,0 @@
- import asyncio
- import aiohttp
- from aiolimiter import AsyncLimiter
- import os
- from tqdm import tqdm
- from tqdm.asyncio import tqdm as atqdm
- from datetime import datetime, timedelta
- from urllib.parse import urlparse, parse_qs, urlencode
- import math
- import re
- import aiofiles
- import json
- import csv
- from pkg_resources import resource_filename
-
-
- from ..global_vars import headers, dataset_10q_url_list,dataset_10k_url_list
- from ..helper import identifier_to_cik, load_package_csv, fix_filing_url
- from .ftd import get_all_ftd_urls, process_all_ftd_zips
- from .dropbox_downloader import DropboxDownloader
- from .information_table_13f import download_and_process_13f_data
- class RetryException(Exception):
-     def __init__(self, url, retry_after=601):
-         self.url = url
-         self.retry_after = retry_after
-
- class Downloader:
-     def __init__(self):
-         self.headers = headers
-         self.dataset_path = 'datasets'
-         self.domain_limiters = {
-             'www.sec.gov': AsyncLimiter(10, 1),
-             'efts.sec.gov': AsyncLimiter(10, 1),
-             'data.sec.gov': AsyncLimiter(10, 1),
-             'default': AsyncLimiter(10, 1)
-         }
-         self.metadata = None
-
-     def set_limiter(self, domain, rate_limit):
-         self.domain_limiters[domain] = AsyncLimiter(rate_limit, 1)
-
-     def set_headers(self, user_agent):
-         self.headers = {'User-Agent': user_agent}
-
-
-     def get_limiter(self, url):
-         domain = urlparse(url).netloc
-         return self.domain_limiters.get(domain, self.domain_limiters['default'])
-
-     async def _fetch_content_from_url(self, session, url):
-         limiter = self.get_limiter(url)
-         async with limiter:
-             try:
-                 async with session.get(url, headers=self.headers) as response:
-                     if response.status == 429:
-                         raise RetryException(url)
-
-                     response.raise_for_status()
-                     return await response.read()
-
-             except aiohttp.ClientResponseError as e:
-                 if e.status == 429:
-                     raise RetryException(url)
-                 raise
-
-             except Exception as e:
-                 print(f"Error downloading {url}: {str(e)}")
-                 raise
-
-     async def write_content_to_file(self, content, filepath):
-         """Write content to a file asynchronously."""
-         os.makedirs(os.path.dirname(filepath), exist_ok=True)
-         async with aiofiles.open(filepath, 'wb') as f:
-             await f.write(content)
-
-     def generate_url(self, base_url, params):
-         return f"{base_url}?{urlencode(params)}"
-
-     async def download_file(self, session, url, output_path):
-         """Download a file from a URL and save it to the specified path."""
-         content = await self._fetch_content_from_url(session, url)
-         await self.write_content_to_file(content, output_path)
-         return output_path
-
-     async def _fetch_json_from_url(self, session, url):
-         """Asynchronously fetch JSON data from a URL."""
-         content = await self._fetch_content_from_url(session, url)
-         return json.loads(content)
-
-     async def _download_urls(self, urls, filenames, output_dir):
-         os.makedirs(output_dir, exist_ok=True)
-         async with aiohttp.ClientSession() as session:
-             total_files = len(urls)
-             completed_files = 0
-
-             with tqdm(total=total_files, desc="Downloading files") as pbar:
-                 while urls and completed_files < total_files:
-                     tasks = [asyncio.create_task(self.download_file(session, url, os.path.join(output_dir, filename)))
-                              for url, filename in zip(urls, filenames) if filename]
-
-                     rate_limited = False
-                     retry_after = 0
-
-                     pending = tasks
-                     while pending:
-                         done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
-
-                         for task in done:
-                             try:
-                                 result = await task
-                                 completed_files += 1
-                                 pbar.update(1)
-                             except RetryException as e:
-                                 print(f"\nRate limited for {e.url}. Will retry after {e.retry_after} seconds.")
-                                 rate_limited = True
-                                 retry_after = max(retry_after, e.retry_after)
-                                 break
-                             except Exception as e:
-                                 print(f"\nFailed to download: {str(e)}")
-                                 completed_files += 1
-                                 pbar.update(1)
-
-                         if rate_limited:
-                             break
-
-                     if rate_limited:
-                         for task in pending:
-                             task.cancel()
-
-                         print(f"\nRate limit hit. Sleeping for {retry_after} seconds before retrying.")
-                         await asyncio.sleep(retry_after)
-
-                         # Recreate the list of URLs and filenames that haven't been processed
-                         urls = [task.get_coro().cr_frame.f_locals['url'] for task in pending]
-                         filenames = [filename for url, filename in zip(urls, filenames) if url in urls]
-                     else:
-                         break # All tasks completed successfully
-
-             print(f"\nSuccessfully downloaded {completed_files} out of {total_files} URLs")
-             return completed_files
-
-     def run_download_urls(self, urls, filenames, output_dir='filings'):
-         """Download a list of URLs to a specified directory"""
-         return asyncio.run(self._download_urls(urls, filenames, output_dir))
-
-
-     async def _number_of_efts_filings(self, session, url):
-         """Get the number of filings from a given EFTS URL asynchronously."""
-         limiter = self.get_limiter(url)
-         async with limiter:
-             try:
-                 async with session.get(url, headers=self.headers) as response:
-                     if response.status == 429:
-                         raise RetryException(url)
-                     response.raise_for_status()
-                     data = await response.json()
-                     return sum(bucket['doc_count'] for bucket in data['aggregations']['form_filter']['buckets'])
-             except aiohttp.ClientResponseError as e:
-                 if e.status == 429:
-                     raise RetryException(url)
-                 raise
-             except Exception as e:
-                 print(f"Error fetching number of filings from {url}: {str(e)}")
-                 raise
-
-     def _subset_urls(self, full_url, total_filings, target_filings_per_range=1000):
-         """Split an EFTS URL into multiple URLs based on the number of filings."""
-         parsed_url = urlparse(full_url)
-         params = parse_qs(parsed_url.query)
-         start = datetime.strptime(params.get('startdt', [None])[0], "%Y-%m-%d")
-         end = datetime.strptime(params.get('enddt', [None])[0], "%Y-%m-%d")
-
-         if start == end:
-             forms = params.get('forms', [None])[0]
-             if forms == '-0':
-                 urls = []
-                 for form in ['SC 13G', 'SC 13G/A']:
-                     new_params = params.copy()
-                     new_params['forms'] = [form]
-                     urls.append(parsed_url._replace(query=urlencode(new_params, doseq=True)).geturl())
-                 return urls
-             else:
-                 return [full_url]
-
-         num_ranges = math.ceil(total_filings / target_filings_per_range)
-         days_per_range = math.ceil((end - start).days / num_ranges)
-
-         urls = []
-         current_start = start
-         for _ in range(num_ranges):
-             current_end = min(current_start + timedelta(days=days_per_range), end)
-             new_params = params.copy()
-             new_params['startdt'] = [current_start.strftime('%Y-%m-%d')]
-             new_params['enddt'] = [current_end.strftime('%Y-%m-%d')]
-             urls.append(parsed_url._replace(query=urlencode(new_params, doseq=True)).geturl())
-             if current_end == end:
-                 break
-             current_start = current_end + timedelta(days=1)
-
-         return urls[::-1]
-
-     async def _get_filing_urls_from_efts(self, base_url, sics=None, items=None, file_types=None, save_metadata=False, output_dir=None):
-         """Asynchronously fetch all filing URLs from a given EFTS URL."""
-         urls = []
-         start, page_size = 0, 100
-
-         if save_metadata:
-             metadata_file = os.path.join(output_dir, 'metadata.jsonl')
-             os.makedirs(output_dir, exist_ok=True)
-
-         async with aiohttp.ClientSession() as session:
-             while True:
-                 tasks = [self._fetch_json_from_url(session, f"{base_url}&from={start + i * page_size}") for i in range(10)]
-                 results = await atqdm.gather(*tasks, desc="Fetching URLs")
-                 for data in results:
-                     if data and 'hits' in data:
-                         hits = data['hits']['hits']
-                         if not hits:
-                             return urls
-
-                         for hit in hits:
-                             # Check SIC filter
-                             sic_match = sics is None or any(int(sic) in sics for sic in hit['_source'].get('sics', []))
-
-                             # Check item filter
-                             item_match = items is None or any(item in items for item in hit['_source'].get('items', []))
-
-                             # Check file type filter
-                             file_type_match = file_types is None or hit['_source'].get('file_type') in (file_types if isinstance(file_types, list) else [file_types])
-
-                             if sic_match and item_match and file_type_match:
-                                 url = f"https://www.sec.gov/Archives/edgar/data/{hit['_source']['ciks'][0]}/{hit['_id'].split(':')[0].replace('-', '')}/{hit['_id'].split(':')[1]}"
-                                 urls.append(url)
-
-                                 if save_metadata:
-                                     accession_num = hit['_id'].split(':')[0].replace('-', '')
-                                     metadata = {accession_num: hit}
-                                     async with aiofiles.open(metadata_file, 'a') as f:
-                                         await f.write(json.dumps(metadata) + '\n')
-
-                         if start + page_size > data['hits']['total']['value']:
-                             return urls
-                 start += 10 * page_size
-         return urls
-
-     async def _conductor(self, efts_url, output_dir, sics, items, file_types, save_metadata=False):
-         """Conduct the download process based on the number of filings."""
-         async with aiohttp.ClientSession() as session:
-             try:
-                 total_filings = await self._number_of_efts_filings(session, efts_url)
-             except RetryException as e:
-                 print(f"Rate limited when fetching number of filings. Retrying after {e.retry_after} seconds.")
-                 await asyncio.sleep(e.retry_after)
-                 return await self._conductor(efts_url, output_dir, sics, items, file_types, save_metadata)
-
-         if total_filings < 10000:
-             urls = await self._get_filing_urls_from_efts(efts_url, sics=sics, items=items, file_types=file_types, save_metadata=save_metadata, output_dir=output_dir)
-             print(f"{efts_url}\nTotal filings: {len(urls)}")
-             filenames = [f"{url.split('/')[7]}_{url.split('/')[-1]}" for url in urls]
-             await self._download_urls(urls=urls, filenames=filenames, output_dir=output_dir)
-         else:
-             for subset_url in self._subset_urls(efts_url, total_filings):
-                 await self._conductor(efts_url=subset_url, output_dir=output_dir, sics=sics, items=items, file_types=file_types, save_metadata=save_metadata)
-
-
-     def download(self, output_dir='filings', cik=None, ticker=None, form=None,
-                  date=None, sics=None, items=None, file_types=None, save_metadata=False):
-         base_url = "https://efts.sec.gov/LATEST/search-index"
-         params = {}
-
-         if sum(x is not None for x in [cik, ticker]) > 1:
-             raise ValueError('Please provide no more than one identifier: cik or ticker')
-
-         if ticker is not None:
-             cik = identifier_to_cik(ticker)
-
-         if cik:
-             if isinstance(cik, list):
-                 formatted_ciks = ','.join(str(c).zfill(10) for c in cik)
-             else:
-                 formatted_ciks = str(cik).zfill(10)
-             params['ciks'] = formatted_ciks
-
-         params['forms'] = ','.join(form) if isinstance(form, list) else form if form else "-0"
-
-         if file_types:
-             params['q'] = '-'
-             if isinstance(file_types, list):
-                 params['file_type'] = ','.join(file_types)
-             else:
-                 params['file_type'] = file_types
-
-         if isinstance(date, list):
-             efts_url_list = [self.generate_url(base_url, {**params, 'startdt': d, 'enddt': d}) for d in date]
-         elif isinstance(date, tuple):
-             efts_url_list = [self.generate_url(base_url, {**params, 'startdt': date[0], 'enddt': date[1]})]
-         else:
-             date_str = date if date else f"2001-01-01,{datetime.now().strftime('%Y-%m-%d')}"
-             efts_url_list = [self.generate_url(base_url, {**params, 'startdt': date_str.split(',')[0], 'enddt': date_str.split(',')[1]})]
-
-         for efts_url in efts_url_list:
-             asyncio.run(self._conductor(efts_url=efts_url, output_dir=output_dir, sics=sics, items=items, file_types=file_types, save_metadata=save_metadata))
-
-     def download_company_concepts(self, output_dir='company_concepts', cik=None, ticker=None):
-         if sum(x is not None for x in [cik, ticker]) > 1:
-             raise ValueError('Please provide no more than one identifier: cik or ticker')
-
-         ciks = None
-         if cik:
-             if isinstance(cik, list):
-                 ciks = cik
-             else:
-                 ciks = [cik]
-
-         if ticker is not None:
-             ciks = identifier_to_cik(ticker)
-
-         if ciks is None:
-             company_tickers = load_package_csv('company_tickers')
-             ciks = [company['cik'] for company in company_tickers]
-
-         os.makedirs(output_dir, exist_ok=True)
-
-         urls = [f'https://data.sec.gov/api/xbrl/companyfacts/CIK{str(cik).zfill(10)}.json' for cik in ciks]
-         filenames = [f"CIK{str(cik).zfill(10)}.json" for cik in ciks]
-         self.run_download_urls(urls=urls, filenames=filenames, output_dir=output_dir)
-
-
-     def download_dataset(self, dataset, dataset_path='datasets'):
-         if not os.path.exists(dataset_path):
-             os.makedirs(dataset_path)
-
-         if re.match(r"10k_(\d{4})$", dataset):
-             dropbox_downloader = DropboxDownloader()
-             year = int(dataset.split('_')[-1])
-             year_data = next((data for data in dataset_10k_url_list if data['year'] == year), None)
-
-             if year_data:
-                 output_dir = os.path.join(dataset_path, f'10K_{year}')
-                 os.makedirs(output_dir, exist_ok=True)
-
-                 dropbox_downloader.download(urls=year_data['urls'], output_dir=output_dir)
-             else:
-                 print(f"No data found for 10Q_{year}")
-         elif dataset == 'ftd':
-             output_dir = os.path.join(dataset_path, 'ftd')
-             urls = get_all_ftd_urls()
-             self.run_download_urls(urls, filenames=[url.split('/')[-1] for url in urls], output_dir=output_dir)
-             process_all_ftd_zips(output_dir)
-         elif dataset == '13f_information_table':
-             output_dir = os.path.join(dataset_path, '13f_information_table')
-             download_and_process_13f_data(self, output_dir)
-
-         elif re.match(r"10q_(\d{4})$", dataset):
-             dropbox_downloader = DropboxDownloader()
-             year = int(dataset.split('_')[-1])
-             year_data = next((data for data in dataset_10q_url_list if data['year'] == year), None)
-
-             if year_data:
-                 output_dir = os.path.join(dataset_path, f'10Q_{year}')
-                 os.makedirs(output_dir, exist_ok=True)
-
-                 dropbox_downloader.download(urls=year_data['urls'], output_dir=output_dir)
-             else:
-                 print(f"No data found for 10Q_{year}")
-
-         elif re.match(r"10k_(\d{4})$", dataset):
-             dropbox_downloader = DropboxDownloader()
-             year = int(dataset.split('_')[-1])
-             year_data = next((data for data in dataset_10k_url_list if data['year'] == year), None)
-
-             if year_data:
-                 output_dir = os.path.join(dataset_path, f'10K_{year}')
-                 os.makedirs(output_dir, exist_ok=True)
-
-                 dropbox_downloader.download(urls=year_data['urls'], output_dir=output_dir)
-             else:
-                 print(f"No data found for 10K_{year}")
-
-
-
-     async def _watch_efts(self, form=None, cik=None, interval=1, silent=False, callback=None):
-         """Watch the EFTS API for changes in the number of filings."""
-         params = {
-             "startdt": datetime.now().strftime("%Y-%m-%d"),
-             "enddt": datetime.now().strftime("%Y-%m-%d")
-         }
-
-         if form:
-             params["forms"] = ",".join(form) if isinstance(form, list) else form
-         else:
-             params["forms"] = "-0"
-
-         if cik:
-             if isinstance(cik, list):
-                 params['ciks'] = ','.join(str(c).zfill(10) for c in cik)
-             else:
-                 params['ciks'] = str(cik).zfill(10)
-
-         watch_url = self.generate_url("https://efts.sec.gov/LATEST/search-index", params)
-
-         previous_value = None
-         async with aiohttp.ClientSession() as session:
-             while True:
-                 data = await self._fetch_json_from_url(session, watch_url)
-
-                 if data:
-                     if not silent:
-                         print(f"URL: {watch_url}")
-
-                     current_value = data['hits']['total']['value']
-
-                     if previous_value is not None and current_value != previous_value:
-                         if not silent:
-                             print(f"Value changed from {previous_value} to {current_value}")
-                         if callback:
-                             callback(data)
-
-                     previous_value = current_value
-                     if not silent:
-                         print(f"Current value: {current_value}. Checking again in {interval} seconds.")
-                 else:
-                     print("Error occurred while fetching data.")
-
-                 await asyncio.sleep(interval)
-
-     def watch(self, interval=1, silent=True, form=None, cik=None, ticker=None, callback=None):
-         if sum(x is not None for x in [cik, ticker]) > 1:
-             raise ValueError('Please provide no more than one identifier: cik or ticker')
-
-         if ticker:
-             cik = identifier_to_cik(ticker)
-
-         return asyncio.run(self._watch_efts(interval=interval, silent=silent, form=form, cik=cik, callback=callback))
-
-     async def _download_company_metadata(self):
-         # Define file paths
-         metadata_file = resource_filename('datamule', 'data/company_metadata.csv')
-         former_names_file = resource_filename('datamule', 'data/company_former_names.csv')
-
-         # Define temporary file paths
-         temp_metadata_file = metadata_file + '.temp'
-         temp_former_names_file = former_names_file + '.temp'
-
-         metadata_fields = ['cik', 'name', 'entityType', 'sic', 'sicDescription', 'ownerOrg',
-                            'insiderTransactionForOwnerExists', 'insiderTransactionForIssuerExists',
-                            'tickers', 'exchanges', 'ein', 'description', 'website', 'investorWebsite',
-                            'category', 'fiscalYearEnd', 'stateOfIncorporation', 'stateOfIncorporationDescription',
-                            'phone', 'flags', 'mailing_street1', 'mailing_street2', 'mailing_city',
-                            'mailing_stateOrCountry', 'mailing_zipCode', 'mailing_stateOrCountryDescription',
-                            'business_street1', 'business_street2', 'business_city', 'business_stateOrCountry',
-                            'business_zipCode', 'business_stateOrCountryDescription']
-
-         former_names_fields = ['cik', 'former_name', 'from_date', 'to_date']
-
-         company_tickers = load_package_csv('company_tickers')
-
-         async with aiohttp.ClientSession() as session:
-             with open(temp_metadata_file, 'w', newline='') as mf, open(temp_former_names_file, 'w', newline='') as fnf:
-                 metadata_writer = csv.DictWriter(mf, fieldnames=metadata_fields)
-                 metadata_writer.writeheader()
-
-                 former_names_writer = csv.DictWriter(fnf, fieldnames=former_names_fields)
-                 former_names_writer.writeheader()
-
-                 for company in tqdm(company_tickers, desc="Updating company metadata"):
-                     cik = company['cik']
-                     url = f'https://data.sec.gov/submissions/CIK{cik.zfill(10)}.json'
-
-                     try:
-                         data = await self._fetch_json_from_url(session, url)
-
-                         metadata = {field: data.get(field, '') for field in metadata_fields if field not in ['tickers', 'exchanges']}
-                         metadata['cik'] = cik
-                         metadata['tickers'] = ','.join(data.get('tickers', []))
-                         metadata['exchanges'] = ','.join(data.get('exchanges', []))
-
-                         # Add address information
-                         for address_type in ['mailing', 'business']:
-                             address = data.get('addresses', {}).get(address_type, {})
-                             for key, value in address.items():
-                                 metadata[f'{address_type}_{key}'] = value if value is not None else ''
-
-                         metadata_writer.writerow(metadata)
-
-                         for former_name in data.get('formerNames', []):
-                             former_names_writer.writerow({
-                                 'cik': cik,
-                                 'former_name': former_name['name'],
-                                 'from_date': former_name['from'],
-                                 'to_date': former_name['to']
-                             })
-
-                     except Exception as e:
-                         print(f"Error processing CIK {cik}: {str(e)}")
-
-         # Now we can safely replace the original files
-
-         try:
-             # Remove original files if they exist
-             if os.path.exists(metadata_file):
-                 os.remove(metadata_file)
-             if os.path.exists(former_names_file):
-                 os.remove(former_names_file)
-
-             # Rename temp files to original names
-             os.rename(temp_metadata_file, metadata_file)
-             os.rename(temp_former_names_file, former_names_file)
-
-             print(f"Metadata successfully updated in {metadata_file}")
-             print(f"Former names successfully updated in {former_names_file}")
-         except Exception as e:
-             print(f"Error occurred while finalizing file update: {str(e)}")
-             print("Temporary files have been kept. Please manually review and rename if necessary.")
-             return
-
-         # Clean up temp files if they still exist for some reason
-         for temp_file in [temp_metadata_file, temp_former_names_file]:
-             if os.path.exists(temp_file):
-                 try:
-                     os.remove(temp_file)
-                 except Exception as e:
-                     print(f"Warning: Could not remove temporary file {temp_file}: {str(e)}")
-
-     def update_company_metadata(self):
-         return asyncio.run(self._download_company_metadata())
-
-     async def _download_company_tickers(self):
-         url = 'https://www.sec.gov/files/company_tickers.json'
-
-         # Define file paths
-         json_file = resource_filename('datamule', 'data/company_tickers.json')
-         csv_file = resource_filename('datamule', 'data/company_tickers.csv')
-
-         # Define temporary file paths
-         temp_json_file = json_file + '.temp'
-         temp_csv_file = csv_file + '.temp'
-
-         async with aiohttp.ClientSession() as session:
-             try:
-                 content = await self._fetch_content_from_url(session, url)
-
-                 # Save the raw JSON file
-                 await self.write_content_to_file(content, temp_json_file)
-
-                 # Parse the JSON content
-                 data = json.loads(content)
-
-                 # Convert to CSV
-                 with open(temp_csv_file, 'w', newline='') as csvfile:
-                     fieldnames = ['cik', 'ticker', 'title']
-                     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-                     writer.writeheader()
-                     for _, company in data.items():
-                         writer.writerow({
-                             'cik': str(company['cik_str']).zfill(10),
-                             'ticker': company['ticker'],
-                             'title': company['title']
-                         })
-
-                 # If everything went well, replace the original files
-                 if os.path.exists(json_file):
-                     os.remove(json_file)
-                 if os.path.exists(csv_file):
-                     os.remove(csv_file)
-
-                 os.rename(temp_csv_file, csv_file)
-
-
-                 print(f"Company tickers successfully updated in {csv_file}")
-
-             except Exception as e:
-                 print(f"Error occurred while updating company tickers: {str(e)}")
-                 print("Temporary files have been kept. Please manually review and rename if necessary.")
-                 return
-
-             finally:
-                 # Clean up temp files if they still exist
-                 for temp_file in [temp_json_file, temp_csv_file]:
-                     if os.path.exists(temp_file):
-                         try:
-                             os.remove(temp_file)
-                         except Exception as e:
-                             print(f"Warning: Could not remove temporary file {temp_file}: {str(e)}")
-
-     def update_company_tickers(self):
-         asyncio.run(self._download_company_tickers())
-
-     def load_metadata(self, filepath):
-         metadata = []
-         with open(f"{filepath}/metadata.jsonl", 'r') as f:
-             for line in f:
-                 if line.strip(): # Skip empty lines
-                     entry = json.loads(line)
-                     accession_num = next(iter(entry))
-                     doc_id = entry[accession_num]['_id']
-                     acc, filename = doc_id.split(':')
-                     row = {'accession_number': accession_num}
-                     row.update(entry[accession_num]['_source'])
-                     # Create primary doc URL using _id and cik
-                     cik = row['ciks'][0] if row.get('ciks') else ''
-                     acc_clean = acc.replace('-', '')
-                     row['primary_doc_url'] = f"https://www.sec.gov/Archives/edgar/data/{cik}/{acc_clean.zfill(18)}/{filename}"
-                     metadata.append(row)
-         self.metadata = metadata
-
-     def save_metadata_to_csv(self, output_filepath):
-         if not hasattr(self, 'metadata'):
-             return
-
-         fieldnames = {'accession_number', 'primary_doc_url'} # Start with both required fields
-         max_lengths = {}
-
-         for item in self.metadata:
-             for key, value in item.items():
-                 if key not in ['accession_number', 'primary_doc_url'] and isinstance(value, list):
-                     max_lengths[key] = max(max_lengths.get(key, 0), len(value))
-                     fieldnames.update(f"{key}_{i+1}" for i in range(len(value)))
-                 else:
-                     fieldnames.add(key)
-
-         with open(output_filepath, 'w', newline='') as f:
-             writer = csv.DictWriter(f, sorted(fieldnames))
-             writer.writeheader()
-
-             for item in self.metadata:
-                 row = {'accession_number': item['accession_number'], 'primary_doc_url': item['primary_doc_url']}
-                 for key, value in item.items():
-                     if key not in ['accession_number', 'primary_doc_url']:
-                         if isinstance(value, list):
-                             for i, v in enumerate(value):
-                                 row[f"{key}_{i+1}"] = v
-                         else:
-                             row[key] = value
-                 writer.writerow(row)
datamule/filing_viewer/__init__.py (deleted)
@@ -1 +0,0 @@
- from .filing_viewer import create_interactive_filing, create_valid_id
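
The removed sec_downloader.py above combined per-domain AsyncLimiter rate limits with a retry whenever the SEC returned HTTP 429. The following is a minimal, self-contained sketch of that pattern only; it is not code from either wheel, and the example URL, User-Agent, and retry values are assumptions (601 seconds mirrors the removed RetryException default):

import asyncio
from urllib.parse import urlparse

import aiohttp
from aiolimiter import AsyncLimiter

# Per-domain limiters, roughly as in the removed Downloader: ~10 requests per second each.
LIMITERS = {
    "www.sec.gov": AsyncLimiter(10, 1),
    "default": AsyncLimiter(10, 1),
}

async def fetch(session, url, max_attempts=3, retry_after=601):
    limiter = LIMITERS.get(urlparse(url).netloc, LIMITERS["default"])
    for _ in range(max_attempts):
        async with limiter:  # wait for a rate-limit slot before issuing the request
            async with session.get(url) as response:
                if response.status != 429:
                    response.raise_for_status()
                    return await response.read()
        # Got a 429: back off outside the request context, then retry.
        await asyncio.sleep(retry_after)
    raise RuntimeError(f"Gave up on {url} after {max_attempts} rate-limited attempts")

async def main():
    # Assumed example values; the SEC expects a descriptive User-Agent.
    headers = {"User-Agent": "example example@example.com"}
    async with aiohttp.ClientSession(headers=headers) as session:
        content = await fetch(session, "https://www.sec.gov/files/company_tickers.json")
        print(len(content), "bytes")

# asyncio.run(main())

In 1.0.0 this responsibility presumably moves into the new datamule/downloader/downloader.py and premiumdownloader.py modules listed above.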