datamule-0.381-py3-none-any.whl → datamule-1.0.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. datamule/__init__.py +46 -86
  2. datamule/book/book.py +34 -0
  3. datamule/book/eftsquery.py +127 -0
  4. datamule/book/xbrl_retriever.py +88 -0
  5. datamule/config.py +29 -0
  6. datamule/data/company_former_names.csv +8148 -8148
  7. datamule/data/company_metadata.csv +10049 -10049
  8. datamule/data/company_tickers.csv +9999 -10168
  9. datamule/data/sec-glossary.csv +728 -728
  10. datamule/data/xbrl_descriptions.csv +10024 -10024
  11. datamule/document.py +279 -0
  12. datamule/downloader/downloader.py +374 -0
  13. datamule/downloader/premiumdownloader.py +335 -0
  14. datamule/helper.py +123 -136
  15. datamule/mapping_dicts/txt_mapping_dicts.py +232 -0
  16. datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
  17. datamule/monitor.py +238 -0
  18. datamule/mulebot/__init__.py +1 -1
  19. datamule/mulebot/helper.py +34 -34
  20. datamule/mulebot/mulebot.py +129 -129
  21. datamule/mulebot/mulebot_server/server.py +86 -86
  22. datamule/mulebot/mulebot_server/static/css/minimalist.css +173 -173
  23. datamule/mulebot/mulebot_server/static/scripts/artifacts.js +67 -67
  24. datamule/mulebot/mulebot_server/static/scripts/chat.js +91 -91
  25. datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +55 -55
  26. datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +14 -14
  27. datamule/mulebot/mulebot_server/static/scripts/main.js +56 -56
  28. datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +26 -26
  29. datamule/mulebot/mulebot_server/static/scripts/suggestions.js +46 -46
  30. datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +128 -128
  31. datamule/mulebot/mulebot_server/static/scripts/utils.js +27 -27
  32. datamule/mulebot/mulebot_server/templates/chat-minimalist.html +90 -90
  33. datamule/mulebot/search.py +51 -51
  34. datamule/mulebot/tools.py +82 -82
  35. datamule/packageupdater.py +207 -0
  36. datamule/portfolio.py +106 -0
  37. datamule/submission.py +76 -0
  38. datamule-1.0.2.dist-info/METADATA +27 -0
  39. datamule-1.0.2.dist-info/RECORD +43 -0
  40. {datamule-0.381.dist-info → datamule-1.0.2.dist-info}/WHEEL +1 -1
  41. datamule/data/filing_types.csv +0 -485
  42. datamule/data/ftd_locations.csv +0 -388
  43. datamule/datamule_api.py +0 -21
  44. datamule/dataset_builder/_init.py +0 -1
  45. datamule/dataset_builder/dataset_builder.py +0 -260
  46. datamule/downloader/dropbox_downloader.py +0 -225
  47. datamule/downloader/ftd.py +0 -216
  48. datamule/downloader/information_table_13f.py +0 -231
  49. datamule/downloader/sec_downloader.py +0 -635
  50. datamule/filing_viewer/__init__.py +0 -1
  51. datamule/filing_viewer/filing_viewer.py +0 -256
  52. datamule/global_vars.py +0 -202
  53. datamule/parser/__init__.py +0 -1
  54. datamule/parser/basic_10k_parser.py +0 -82
  55. datamule/parser/basic_10q_parser.py +0 -73
  56. datamule/parser/basic_13d_parser.py +0 -58
  57. datamule/parser/basic_13g_parser.py +0 -61
  58. datamule/parser/basic_8k_parser.py +0 -84
  59. datamule/parser/company_concepts_parser.py +0 -0
  60. datamule/parser/form_d_parser.py +0 -70
  61. datamule/parser/generalized_item_parser.py +0 -78
  62. datamule/parser/generalized_xml_parser.py +0 -0
  63. datamule/parser/helper.py +0 -75
  64. datamule/parser/information_table_parser_13fhr.py +0 -41
  65. datamule/parser/insider_trading_parser.py +0 -158
  66. datamule/parser/mappings.py +0 -95
  67. datamule/parser/n_port_p_parser.py +0 -70
  68. datamule/parser/sec_parser.py +0 -79
  69. datamule/parser/sgml_parser.py +0 -180
  70. datamule/sec_filing.py +0 -126
  71. datamule/sec_search.py +0 -20
  72. datamule-0.381.dist-info/METADATA +0 -132
  73. datamule-0.381.dist-info/RECORD +0 -61
  74. /datamule/{downloader → book}/__init__.py +0 -0
  75. {datamule-0.381.dist-info → datamule-1.0.2.dist-info}/top_level.txt +0 -0
datamule/document.py ADDED
@@ -0,0 +1,279 @@
+ import json
+ import csv
+ from .helper import convert_to_dashed_accession
+ import re
+ from doc2dict import xml2dict, txt2dict, dict2dict
+ from doc2dict.mapping import flatten_hierarchy
+ from .mapping_dicts import txt_mapping_dicts
+ from .mapping_dicts import xml_mapping_dicts
+ from selectolax.parser import HTMLParser
+
+ class Document:
+     def __init__(self, type, filename):
+         self.type = type
+         self.path = filename
+
+         self.data = None
+         self.content = None
+
+
+     def load_content(self,encoding='utf-8'):
+         with open(self.path, 'r',encoding=encoding) as f:
+             self.content = f.read()
+
+     def _load_text_content(self):
+         with open(self.path) as f:
+             return f.read().translate(str.maketrans({
+                 '\xa0': ' ', '\u2003': ' ',
+                 '\u2018': "'", '\u2019': "'",
+                 '\u201c': '"', '\u201d': '"'
+             }))
+
+     # will deprecate this when we add html2dict
+     def _load_html_content(self):
+         with open(self.path,'rb') as f:
+             parser = HTMLParser(f.read(),detect_encoding=True,decode_errors='ignore')
+
+         # Remove hidden elements first
+         hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
+         for node in hidden_nodes:
+             node.decompose()
+
+         blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
+         lines = []
+         current_line = []
+
+         def flush_line():
+             if current_line:
+                 # Don't add spaces between adjacent spans
+                 lines.append(''.join(current_line))
+                 current_line.clear()
+
+         for node in parser.root.traverse(include_text=True):
+             if node.tag in ('script', 'style', 'css'):
+                 continue
+
+             if node.tag in blocks:
+                 flush_line()
+                 lines.append('')
+
+             if node.text_content:
+                 text = node.text_content.strip()
+                 if text:
+                     if node.tag in blocks:
+                         flush_line()
+                         lines.append(text)
+                         lines.append('')
+                     else:
+                         # Only add space if nodes aren't directly adjacent
+                         if current_line and not current_line[-1].endswith(' '):
+                             if node.prev and node.prev.text_content:
+                                 if node.parent != node.prev.parent or node.prev.next != node:
+                                     current_line.append(' ')
+                         current_line.append(text)
+
+         flush_line()
+
+         text = '\n'.join(lines)
+         while '\n\n\n' in text:
+             text = text.replace('\n\n\n', '\n\n')
+
+         return text.translate(str.maketrans({
+             '\xa0': ' ', '\u2003': ' ',
+             '\u2018': "'", '\u2019': "'",
+             '\u201c': '"', '\u201d': '"'
+         }))
+
+     def _load_file_content(self):
+         if self.path.suffix =='.txt':
+             self.content = self._load_text_content()
+         elif self.path.suffix in ['.html','.htm']:
+             self.content = self._load_html_content()
+         else:
+             raise ValueError(f"Unsupported file type: {self.path.suffix}")
+
+
+     def contains_string(self, pattern):
+         """Currently only works for .htm, .html, and .txt files"""
+         if self.path.suffix in ['.htm', '.html', '.txt']:
+             if self.content is None:
+                 self._load_file_content()
+             return bool(re.search(pattern, self.content))
+         return False
+
+     # Note: this method will be heavily modified in the future
+     def parse(self):
+         mapping_dict = None
+
+         if self.path.suffix == '.xml':
+             if self.type in ['3', '4', '5']:
+                 mapping_dict = xml_mapping_dicts.dict_345
+
+             self.load_content()
+             self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
+         # will deprecate this when we add html2dict
+         elif self.path.suffix in ['.htm', '.html','.txt']:
+             self._load_file_content()
+
+             if self.type == '10-K':
+                 mapping_dict = txt_mapping_dicts.dict_10k
+             elif self.type == '10-Q':
+                 mapping_dict = txt_mapping_dicts.dict_10q
+             elif self.type == '8-K':
+                 mapping_dict = txt_mapping_dicts.dict_8k
+             elif self.type == 'SC 13D':
+                 mapping_dict = txt_mapping_dicts.dict_13d
+             elif self.type == 'SC 13G':
+                 mapping_dict = txt_mapping_dicts.dict_13g
+
+             self.data = {}
+             self.data['document'] = dict2dict(txt2dict(content=self.content, mapping_dict=mapping_dict))
+         return self.data
+
+     def write_json(self, output_filename=None):
+         if not self.data:
+             self.parse()
+
+         if output_filename is None:
+             output_filename = f"{self.path.rsplit('.', 1)[0]}.json"
+
+         with open(output_filename, 'w',encoding='utf-8') as f:
+             json.dump(self.data, f, indent=2)
+
+     def write_csv(self, output_filename=None, accession_number=None):
+         self.parse()
+
+         if output_filename is None:
+             output_filename = f"{self.path.rsplit('.', 1)[0]}.csv"
+
+         with open(output_filename, 'w', newline='') as csvfile:
+             if not self.data:
+                 return output_filename
+
+             has_document = any('document' in item for item in self.data)
+
+             if has_document and 'document' in self.data:
+                 writer = csv.DictWriter(csvfile, ['section', 'text'], quoting=csv.QUOTE_ALL)
+                 writer.writeheader()
+                 flattened = self._flatten_dict(self.data['document'])
+                 for section, text in flattened.items():
+                     writer.writerow({'section': section, 'text': text})
+             else:
+                 fieldnames = list(self.data[0].keys())
+                 if accession_number:
+                     fieldnames.append('Accession Number')
+                 writer = csv.DictWriter(csvfile, fieldnames, quoting=csv.QUOTE_ALL)
+                 writer.writeheader()
+                 for row in self.data:
+                     if accession_number:
+                         row['Accession Number'] = convert_to_dashed_accession(accession_number)
+                     writer.writerow(row)
+
+         return output_filename
+
+     def _document_to_section_text(self, document_data, parent_key=''):
+         items = []
+
+         if isinstance(document_data, dict):
+             for key, value in document_data.items():
+                 # Build the section name
+                 section = f"{parent_key}_{key}" if parent_key else key
+
+                 # If the value is a dict, recurse
+                 if isinstance(value, dict):
+                     items.extend(self._document_to_section_text(value, section))
+                 # If it's a list, handle each item
+                 elif isinstance(value, list):
+                     for i, item in enumerate(value):
+                         if isinstance(item, dict):
+                             items.extend(self._document_to_section_text(item, f"{section}_{i+1}"))
+                         else:
+                             items.append({
+                                 'section': f"{section}_{i+1}",
+                                 'text': str(item)
+                             })
+                 # Base case - add the item
+                 else:
+                     items.append({
+                         'section': section,
+                         'text': str(value)
+                     })
+
+         return items
+
+     # we'll modify this for every dict
+     def _flatten_dict(self, d, parent_key=''):
+         items = {}
+
+         if isinstance(d, list):
+             return [self._flatten_dict(item) for item in d]
+
+         for k, v in d.items():
+             new_key = f"{parent_key}_{k}" if parent_key else k
+
+             if isinstance(v, dict):
+                 items.update(self._flatten_dict(v, new_key))
+             else:
+                 items[new_key] = str(v)
+
+         return items
+
+     # this will all have to be changed. default will be to flatten everything
+     def __iter__(self):
+         if not self.data:
+             self.parse()
+
+         # Let's remove XML iterable for now
+
+         # Handle text-based documents
+         if self.path.suffix in ['.txt', '.htm', '.html']:
+             document_data = self.data
+             if not document_data:
+                 return iter([])
+
+             # Find highest hierarchy level from mapping dict
+             highest_hierarchy = float('inf')
+             section_type = None
+
+             if self.type in ['10-K', '10-Q']:
+                 mapping_dict = txt_mapping_dicts.dict_10k if self.type == '10-K' else txt_mapping_dicts.dict_10q
+             elif self.type == '8-K':
+                 mapping_dict = txt_mapping_dicts.dict_8k
+             elif self.type == 'SC 13D':
+                 mapping_dict = txt_mapping_dicts.dict_13d
+             elif self.type == 'SC 13G':
+                 mapping_dict = txt_mapping_dicts.dict_13g
+             else:
+                 return iter([])
+
+             # Find section type with highest hierarchy number
+             highest_hierarchy = -1 # Start at -1 to find highest
+             for mapping in mapping_dict['rules']['mappings']:
+                 if mapping.get('hierarchy') is not None:
+                     if mapping['hierarchy'] > highest_hierarchy:
+                         highest_hierarchy = mapping['hierarchy']
+                         section_type = mapping['name']
+
+             if not section_type:
+                 return iter([])
+
+             # Extract sections of the identified type
+             def find_sections(data, target_type):
+                 sections = []
+                 if isinstance(data, dict):
+                     if data.get('type') == target_type:
+                         sections.append({
+                             'item': data.get('text', ''),
+                             'text': flatten_hierarchy(data.get('content', []))
+                         })
+                     for value in data.values():
+                         if isinstance(value, (dict, list)):
+                             sections.extend(find_sections(value, target_type))
+                 elif isinstance(data, list):
+                     for item in data:
+                         sections.extend(find_sections(item, target_type))
+                 return sections
+
+             return iter(find_sections(document_data, section_type))
+
+         return iter([])
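For context, a minimal usage sketch of the new Document class (not taken from the package docs; the file path, form type, and output filenames are hypothetical, and the path is assumed to be a pathlib.Path since the class relies on .suffix):

    from pathlib import Path
    from datamule.document import Document

    # Hypothetical local filing; Document handles .htm/.html/.txt here (and .xml for forms 3/4/5)
    doc = Document(type='8-K', filename=Path('filings/0001234567-25-000001/primary_doc.htm'))

    doc.parse()                                          # builds doc.data using the 8-K mapping dict
    doc.write_json(output_filename='primary_doc.json')
    doc.write_csv(output_filename='primary_doc_sections.csv')

    # Iteration yields the highest-hierarchy sections as {'item': ..., 'text': ...} dicts
    for section in doc:
        print(section['item'])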
datamule/downloader/downloader.py ADDED
@@ -0,0 +1,374 @@
+ import asyncio
+ import aiohttp
+ import os
+ from tqdm import tqdm
+ from datetime import datetime
+ from urllib.parse import urlencode
+ import aiofiles
+ import json
+ import time
+ from collections import deque
+
+ from ..helper import identifier_to_cik, load_package_csv, fix_filing_url, headers
+ from secsgml import parse_sgml_submission
+
+ class RetryException(Exception):
+     def __init__(self, url, retry_after=601):
+         self.url = url
+         self.retry_after = retry_after
+
+ class PreciseRateLimiter:
+     def __init__(self, rate, interval=1.0):
+         self.rate = rate # requests per interval
+         self.interval = interval # in seconds
+         self.token_time = self.interval / self.rate # time per token
+         self.last_time = time.time()
+         self.lock = asyncio.Lock()
+
+     async def acquire(self):
+         async with self.lock:
+             now = time.time()
+             wait_time = self.last_time + self.token_time - now
+             if wait_time > 0:
+                 await asyncio.sleep(wait_time)
+             self.last_time = time.time()
+             return True
+
+     async def __aenter__(self):
+         await self.acquire()
+         return self
+
+     async def __aexit__(self, exc_type, exc, tb):
+         pass
+
+ class RateMonitor:
+     def __init__(self, window_size=1.0):
+         self.window_size = window_size
+         self.requests = deque()
+         self._lock = asyncio.Lock()
+
+     async def add_request(self, size_bytes):
+         async with self._lock:
+             now = time.time()
+             self.requests.append((now, size_bytes))
+             while self.requests and self.requests[0][0] < now - self.window_size:
+                 self.requests.popleft()
+
+     def get_current_rates(self):
+         now = time.time()
+         while self.requests and self.requests[0][0] < now - self.window_size:
+             self.requests.popleft()
+
+         if not self.requests:
+             return 0, 0
+
+         request_count = len(self.requests)
+         byte_count = sum(size for _, size in self.requests)
+
+         requests_per_second = request_count / self.window_size
+         mb_per_second = (byte_count / 1024 / 1024) / self.window_size
+
+         return round(requests_per_second, 1), round(mb_per_second, 2)
+
+ class Downloader:
+     def __init__(self):
+         self.headers = headers
+         self.limiter = PreciseRateLimiter(5) # 5 requests per second
+         self.session = None
+         self.parse_filings = True
+         self.download_queue = asyncio.Queue()
+         self.rate_monitor = RateMonitor()
+         self.current_pbar = None
+         self.connection_semaphore = asyncio.Semaphore(5)
+
+     def update_progress_description(self):
+         if self.current_pbar:
+             reqs_per_sec, mb_per_sec = self.rate_monitor.get_current_rates()
+             self.current_pbar.set_description(
+                 f"Progress [Rate: {reqs_per_sec}/s | {mb_per_sec} MB/s]"
+             )
+
+     async def __aenter__(self):
+         await self._init_session()
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         await self._close()
+
+     async def _init_session(self):
+         if not self.session:
+             self.session = aiohttp.ClientSession(headers=self.headers)
+
+     async def _close(self):
+         if self.session:
+             await self.session.close()
+             self.session = None
+
+     async def _fetch_json(self, url):
+         """Fetch JSON with rate monitoring."""
+         async with self.limiter:
+             try:
+                 url = fix_filing_url(url)
+                 async with self.session.get(url) as response:
+                     if response.status == 429:
+                         raise RetryException(url)
+                     response.raise_for_status()
+                     content = await response.read()
+                     await self.rate_monitor.add_request(len(content))
+                     self.update_progress_description()
+                     return await response.json()
+             except aiohttp.ClientResponseError as e:
+                 if e.status == 429:
+                     raise RetryException(url)
+                 raise
+
+     async def _get_filing_urls_from_efts(self, base_url, submission_type=None):
+         """Fetch filing URLs from EFTS in batches with form type filtering."""
+         start = 0
+         page_size = 100
+         urls = []
+
+         data = await self._fetch_json(f"{base_url}&from=0&size=1")
+         if not data or 'hits' not in data:
+             return []
+
+         total_hits = data['hits']['total']['value']
+         if not total_hits:
+             return []
+
+         pbar = tqdm(total=total_hits, desc="Fetching URLs [Rate: 0/s | 0 MB/s]")
+         self.current_pbar = pbar
+
+         while start < total_hits:
+             try:
+                 tasks = [
+                     self._fetch_json(f"{base_url}&from={start + i * page_size}&size={page_size}")
+                     for i in range(10)
+                 ]
+
+                 results = await asyncio.gather(*tasks)
+
+                 for data in results:
+                     if data and 'hits' in data:
+                         hits = data['hits']['hits']
+                         if hits:
+                             # Filter hits based on exact form match
+                             if not submission_type or submission_type == "-0":
+                                 filtered_hits = hits
+                             else:
+                                 requested_forms = [submission_type] if isinstance(submission_type, str) else submission_type
+                                 filtered_hits = [
+                                     hit for hit in hits
+                                     if hit['_source'].get('form', '') in requested_forms
+                                 ]
+
+                             batch_urls = [
+                                 f"https://www.sec.gov/Archives/edgar/data/{hit['_source']['ciks'][0]}/{hit['_id'].split(':')[0]}.txt"
+                                 for hit in filtered_hits
+                             ]
+                             urls.extend(batch_urls)
+                             pbar.update(len(hits)) # Update progress based on total hits processed
+                             self.update_progress_description()
+
+                 start += 10 * page_size
+
+             except RetryException as e:
+                 print(f"\nRate limited. Sleeping for {e.retry_after} seconds...")
+                 await asyncio.sleep(e.retry_after)
+                 continue
+             except Exception as e:
+                 print(f"\nError fetching URLs batch at {start}: {str(e)}")
+                 break
+
+         pbar.close()
+         self.current_pbar = None
+         return urls
+
+     async def _download_file(self, url, filepath):
+         """Download single file with precise rate limiting."""
+         async with self.connection_semaphore:
+             async with self.limiter:
+                 try:
+                     url = fix_filing_url(url)
+                     async with self.session.get(url) as response:
+                         if response.status == 429:
+                             raise RetryException(url)
+                         response.raise_for_status()
+                         content = await response.read()
+                         await self.rate_monitor.add_request(len(content))
+                         self.update_progress_description()
+
+                     parsed_data = None
+                     if self.parse_filings:
+                         try:
+                             os.makedirs(os.path.dirname(filepath), exist_ok=True)
+                             async with aiofiles.open(filepath, 'wb') as f:
+                                 await f.write(content)
+
+                             parsed_data = parse_sgml_submission(
+                                 content=content.decode(),
+                                 output_dir=os.path.dirname(filepath)
+                             )
+
+                             try:
+                                 os.remove(filepath)
+                             except Exception as e:
+                                 print(f"\nError deleting original file {filepath}: {str(e)}")
+
+                         except Exception as e:
+                             print(f"\nError parsing {url}: {str(e)}")
+                             try:
+                                 os.remove(filepath)
+                                 parsed_dir = os.path.dirname(filepath) + f'/{url.split("/")[-1].split(".")[0].replace("-", "")}'
+                                 if os.path.exists(parsed_dir):
+                                     import shutil
+                                     shutil.rmtree(parsed_dir)
+                             except Exception as e:
+                                 print(f"\nError cleaning up files for {url}: {str(e)}")
+                     else:
+                         os.makedirs(os.path.dirname(filepath), exist_ok=True)
+                         async with aiofiles.open(filepath, 'wb') as f:
+                             await f.write(content)
+
+                     return filepath, parsed_data
+
+                 except Exception as e:
+                     print(f"\nError downloading {url}: {str(e)}")
+                     return None
+
+     async def _download_worker(self, pbar):
+         """Worker to process download queue."""
+         while True:
+             try:
+                 url, filepath = await self.download_queue.get()
+                 result = await self._download_file(url, filepath)
+                 if result:
+                     pbar.update(1)
+                 self.download_queue.task_done()
+             except asyncio.CancelledError:
+                 break
+             except Exception as e:
+                 print(f"\nWorker error processing {url}: {str(e)}")
+                 self.download_queue.task_done()
+
+     async def _download_and_process(self, urls, output_dir):
+         """Queue-based download processing."""
+         results = []
+         parsed_results = []
+
+         pbar = tqdm(total=len(urls), desc="Downloading files [Rate: 0/s | 0 MB/s]")
+         self.current_pbar = pbar
+
+         for url in urls:
+             filename = url.split('/')[-1]
+             filepath = os.path.join(output_dir, filename)
+             await self.download_queue.put((url, filepath))
+
+         workers = [asyncio.create_task(self._download_worker(pbar))
+                    for _ in range(5)] # Match number of workers to semaphore
+
+         await self.download_queue.join()
+
+         for worker in workers:
+             worker.cancel()
+
+         await asyncio.gather(*workers, return_exceptions=True)
+
+         pbar.close()
+         self.current_pbar = None
+         return results, parsed_results
+
+     def download_submissions(self, output_dir='filings', cik=None, ticker=None, submission_type=None, filing_date=None, parse=True):
+         """Main method to download SEC filings."""
+         self.parse_filings = parse
+
+         async def _download():
+             async with self as downloader:
+                 if ticker is not None:
+                     cik_value = identifier_to_cik(ticker)
+                 else:
+                     cik_value = cik
+
+                 params = {}
+                 if cik_value:
+                     if isinstance(cik_value, list):
+                         params['ciks'] = ','.join(str(c).zfill(10) for c in cik_value)
+                     else:
+                         params['ciks'] = str(cik_value).zfill(10)
+
+                 params['forms'] = ','.join(submission_type) if isinstance(submission_type, list) else submission_type if submission_type else "-0"
+
+                 if isinstance(filing_date, list):
+                     dates = [(d, d) for d in filing_date]
+                 elif isinstance(filing_date, tuple):
+                     dates = [filing_date]
+                 else:
+                     date_str = filing_date if filing_date else f"2001-01-01,{datetime.now().strftime('%Y-%m-%d')}"
+                     start, end = date_str.split(',')
+                     dates = [(start, end)]
+
+                 all_filepaths = []
+                 all_parsed_data = []
+
+                 for start_date, end_date in dates:
+                     params['startdt'] = start_date
+                     params['enddt'] = end_date
+                     base_url = "https://efts.sec.gov/LATEST/search-index"
+                     efts_url = f"{base_url}?{urlencode(params, doseq=True)}"
+
+                     urls = await self._get_filing_urls_from_efts(efts_url,submission_type)
+                     if urls:
+                         filepaths, parsed_data = await self._download_and_process(urls, output_dir)
+                         all_filepaths.extend(filepaths)
+                         all_parsed_data.extend(parsed_data)
+
+                 return all_filepaths, all_parsed_data
+
+         return asyncio.run(_download())
+
+     def download_company_concepts(self, output_dir='company_concepts', cik=None, ticker=None):
+         """Download company concept data."""
+         async def _download_concepts():
+             async with self as downloader:
+                 if ticker is not None:
+                     ciks = identifier_to_cik(ticker)
+                 elif cik:
+                     ciks = [cik] if not isinstance(cik, list) else cik
+                 else:
+                     company_tickers = load_package_csv('company_tickers')
+                     ciks = [company['cik'] for company in company_tickers]
+
+                 os.makedirs(output_dir, exist_ok=True)
+                 urls = [f'https://data.sec.gov/api/xbrl/companyfacts/CIK{str(cik).zfill(10)}.json' for cik in ciks]
+
+                 pbar = tqdm(total=len(urls), desc="Downloading concepts [Rate: 0/s | 0 MB/s]")
+                 self.current_pbar = pbar
+
+                 for url in urls:
+                     filename = url.split('/')[-1]
+                     filepath = os.path.join(output_dir, filename)
+                     await self.download_queue.put((url, filepath))
+
+                 workers = [asyncio.create_task(self._download_worker(pbar))
+                            for _ in range(5)]
+
+                 await self.download_queue.join()
+
+                 for worker in workers:
+                     worker.cancel()
+
+                 await asyncio.gather(*workers, return_exceptions=True)
+
+                 pbar.close()
+                 self.current_pbar = None
+
+                 results = []
+                 for url in urls:
+                     filename = url.split('/')[-1]
+                     filepath = os.path.join(output_dir, filename)
+                     if os.path.exists(filepath):
+                         results.append(filepath)
+
+                 return results
+
+         return asyncio.run(_download_concepts())
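A minimal sketch of driving the new Downloader (illustrative only; the ticker, CIK, and directory names are example values, and the import path simply mirrors the module location in this diff rather than a documented public entry point):

    from datamule.downloader.downloader import Downloader

    downloader = Downloader()

    # Download (and SGML-parse) 8-K submissions for one ticker over a date range
    filepaths, parsed = downloader.download_submissions(
        output_dir='filings',
        ticker='AAPL',
        submission_type='8-K',
        filing_date=('2024-01-01', '2024-06-30'),
        parse=True,
    )

    # Pull XBRL companyfacts JSON for a specific CIK
    concept_files = downloader.download_company_concepts(
        output_dir='company_concepts',
        cik=320193,
    )

Both methods wrap their async workers in asyncio.run, so they are intended to be called from synchronous code; requests are throttled by PreciseRateLimiter(5) and capped at five concurrent connections.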