datamule 0.380__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (73)
  1. datamule/__init__.py +46 -86
  2. datamule/book.py +16 -0
  3. datamule/config.py +29 -0
  4. datamule/data/company_former_names.csv +8148 -8148
  5. datamule/data/company_metadata.csv +10049 -10049
  6. datamule/data/company_tickers.csv +9999 -10168
  7. datamule/data/sec-glossary.csv +728 -728
  8. datamule/data/xbrl_descriptions.csv +10024 -10024
  9. datamule/document.py +278 -0
  10. datamule/downloader/downloader.py +374 -0
  11. datamule/downloader/premiumdownloader.py +335 -0
  12. datamule/helper.py +123 -136
  13. datamule/mapping_dicts/txt_mapping_dicts.py +232 -0
  14. datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
  15. datamule/monitor.py +238 -0
  16. datamule/mulebot/__init__.py +1 -1
  17. datamule/mulebot/helper.py +34 -34
  18. datamule/mulebot/mulebot.py +129 -129
  19. datamule/mulebot/mulebot_server/server.py +86 -86
  20. datamule/mulebot/mulebot_server/static/css/minimalist.css +173 -173
  21. datamule/mulebot/mulebot_server/static/scripts/artifacts.js +67 -67
  22. datamule/mulebot/mulebot_server/static/scripts/chat.js +91 -91
  23. datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +55 -55
  24. datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +14 -14
  25. datamule/mulebot/mulebot_server/static/scripts/main.js +56 -56
  26. datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +26 -26
  27. datamule/mulebot/mulebot_server/static/scripts/suggestions.js +46 -46
  28. datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +128 -128
  29. datamule/mulebot/mulebot_server/static/scripts/utils.js +27 -27
  30. datamule/mulebot/mulebot_server/templates/chat-minimalist.html +90 -90
  31. datamule/mulebot/search.py +51 -51
  32. datamule/mulebot/tools.py +82 -82
  33. datamule/packageupdater.py +207 -0
  34. datamule/portfolio.py +106 -0
  35. datamule/submission.py +76 -0
  36. datamule-1.0.0.dist-info/METADATA +27 -0
  37. datamule-1.0.0.dist-info/RECORD +40 -0
  38. {datamule-0.380.dist-info → datamule-1.0.0.dist-info}/WHEEL +1 -1
  39. datamule/data/filing_types.csv +0 -485
  40. datamule/data/ftd_locations.csv +0 -388
  41. datamule/datamule_api.py +0 -21
  42. datamule/dataset_builder/_init.py +0 -1
  43. datamule/dataset_builder/dataset_builder.py +0 -260
  44. datamule/downloader/__init__.py +0 -0
  45. datamule/downloader/dropbox_downloader.py +0 -225
  46. datamule/downloader/ftd.py +0 -216
  47. datamule/downloader/information_table_13f.py +0 -231
  48. datamule/downloader/sec_downloader.py +0 -635
  49. datamule/filing_viewer/__init__.py +0 -1
  50. datamule/filing_viewer/filing_viewer.py +0 -256
  51. datamule/global_vars.py +0 -202
  52. datamule/parser/__init__.py +0 -1
  53. datamule/parser/basic_10k_parser.py +0 -82
  54. datamule/parser/basic_10q_parser.py +0 -73
  55. datamule/parser/basic_13d_parser.py +0 -58
  56. datamule/parser/basic_13g_parser.py +0 -61
  57. datamule/parser/basic_8k_parser.py +0 -84
  58. datamule/parser/company_concepts_parser.py +0 -0
  59. datamule/parser/form_d_parser.py +0 -70
  60. datamule/parser/generalized_item_parser.py +0 -78
  61. datamule/parser/generalized_xml_parser.py +0 -0
  62. datamule/parser/helper.py +0 -75
  63. datamule/parser/information_table_parser_13fhr.py +0 -41
  64. datamule/parser/insider_trading_parser.py +0 -158
  65. datamule/parser/mappings.py +0 -95
  66. datamule/parser/n_port_p_parser.py +0 -70
  67. datamule/parser/sec_parser.py +0 -79
  68. datamule/parser/sgml_parser.py +0 -180
  69. datamule/sec_filing.py +0 -126
  70. datamule/sec_search.py +0 -20
  71. datamule-0.380.dist-info/METADATA +0 -110
  72. datamule-0.380.dist-info/RECORD +0 -61
  73. {datamule-0.380.dist-info → datamule-1.0.0.dist-info}/top_level.txt +0 -0
datamule/downloader/premiumdownloader.py ADDED
@@ -0,0 +1,335 @@
+ import os
+ import asyncio
+ import aiohttp
+ from pathlib import Path
+ from tqdm import tqdm
+ import time
+ import shutil
+ import ssl
+ import zstandard as zstd
+ import io
+ import json
+ from concurrent.futures import ThreadPoolExecutor
+ from functools import partial
+ from queue import Queue, Empty
+ from threading import Thread
+ from secsgml import parse_sgml_submission
+ import urllib.parse
+ from ..helper import identifier_to_cik
+
+ class InsufficientBalanceError(Exception):
+     def __init__(self, required_cost, current_balance, total_urls):
+         self.required_cost = required_cost
+         self.current_balance = current_balance
+         self.total_urls = total_urls
+         message = (f"Insufficient balance. Required: ${required_cost:.4f}, "
+                    f"Current balance: ${current_balance:.4f}, "
+                    f"Total URLs: {total_urls}")
+         super().__init__(message)
+
+ class PremiumDownloader:
+     def __init__(self, api_key=None):
+         self.BASE_URL = "https://library.datamule.xyz/original/nc/"
+         self.API_BASE_URL = "https://sec-library.jgfriedman99.workers.dev/"
+         self.CHUNK_SIZE = 2 * 1024 * 1024
+         self.MAX_CONCURRENT_DOWNLOADS = 100
+         self.MAX_DECOMPRESSION_WORKERS = 16
+         self.MAX_PROCESSING_WORKERS = 16
+         self.QUEUE_SIZE = 10
+         if api_key is not None:
+             self._api_key = api_key
+
+     @property
+     def api_key(self):
+         return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')
+
+     @api_key.setter
+     def api_key(self, value):
+         if not value:
+             raise ValueError("API key cannot be empty")
+         self._api_key = value
+
+     def _log_error(self, output_dir, filename, error_msg):
+         error_file = os.path.join(output_dir, 'errors.json')
+         try:
+             if os.path.exists(error_file):
+                 with open(error_file, 'r') as f:
+                     errors = json.load(f)
+             else:
+                 errors = {}
+
+             errors[filename] = str(error_msg)
+
+             with open(error_file, 'w') as f:
+                 json.dump(errors, f, indent=2)
+         except Exception as e:
+             print(f"Failed to log error to {error_file}: {str(e)}")
+
+     async def _fetch_submissions(self, session, submission_type=None, cik=None, filing_date=None, page=1):
+         params = {
+             'api_key': self.api_key,
+             'page': page
+         }
+
+         if submission_type:
+             if isinstance(submission_type, list):
+                 params['submission_type'] = ','.join(str(x) for x in submission_type)
+             else:
+                 params['submission_type'] = str(submission_type)
+
+         if cik:
+             if isinstance(cik, list):
+                 params['cik'] = ','.join(str(x) for x in cik)
+             else:
+                 params['cik'] = str(cik)
+
+         if filing_date:
+             if isinstance(filing_date, tuple):
+                 params['startdt'] = str(filing_date[0])
+                 params['enddt'] = str(filing_date[1])
+             else:
+                 if isinstance(filing_date, list):
+                     params['filing_date'] = ','.join(str(x) for x in filing_date)
+                 else:
+                     params['filing_date'] = str(filing_date)
+
+         url = f"{self.API_BASE_URL}?{urllib.parse.urlencode(params)}"
+
+         async with session.get(url) as response:
+             data = await response.json()
+             if not data.get('success'):
+                 raise ValueError(f"API request failed: {data.get('error')}")
+
+             charges = data['metadata']['billing']['charges']
+             print(f"\nCost: ${charges['results']:.12f} downloads + ${charges['rows_read']:.12f} row reads = ${charges['total']:.12f}")
+             print(f"Balance: ${data['metadata']['billing']['remaining_balance']:.12f}")
+
+             urls = [f"{self.BASE_URL}{str(sub['accession_number']).zfill(18)}.sgml{'.zst' if sub.get('compressed', '').lower() == 'true' else ''}" for sub in data['data']]
+             return urls, data['metadata']['pagination']
+
+     class FileProcessor:
+         def __init__(self, output_dir, max_workers, queue_size, pbar, downloader):
+             self.processing_queue = Queue(maxsize=queue_size)
+             self.should_stop = False
+             self.processing_workers = []
+             self.output_dir = output_dir
+             self.max_workers = max_workers
+             self.batch_size = 10
+             self.pbar = pbar
+             self.downloader = downloader
+
+         def start_processing_workers(self):
+             for _ in range(self.max_workers):
+                 worker = Thread(target=self._processing_worker)
+                 worker.daemon = True
+                 worker.start()
+                 self.processing_workers.append(worker)
+
+         def _process_file(self, item):
+             filename, content = item
+             try:
+                 parse_sgml_submission(output_dir=self.output_dir, content=content)
+                 self.pbar.update(1)
+             except Exception as e:
+                 accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
+                 if os.path.exists(accession_dir):
+                     shutil.rmtree(accession_dir)
+                 self.downloader._log_error(self.output_dir, filename, str(e))
+
+         def _processing_worker(self):
+             batch = []
+             while not self.should_stop:
+                 try:
+                     item = self.processing_queue.get(timeout=1)
+                     if item is None:
+                         break
+
+                     batch.append(item)
+
+                     if len(batch) >= self.batch_size or self.processing_queue.empty():
+                         for item in batch:
+                             self._process_file(item)
+                             self.processing_queue.task_done()
+                         batch = []
+
+                 except Empty:
+                     if batch:
+                         for item in batch:
+                             self._process_file(item)
+                             self.processing_queue.task_done()
+                         batch = []
+
+         def stop_workers(self):
+             self.should_stop = True
+             for _ in self.processing_workers:
+                 self.processing_queue.put(None)
+             for worker in self.processing_workers:
+                 worker.join()
+
+     def decompress_stream(self, compressed_chunks, filename, output_dir, processor):
+         dctx = zstd.ZstdDecompressor()
+         try:
+             input_buffer = io.BytesIO(b''.join(compressed_chunks))
+             decompressed_content = io.BytesIO()
+
+             with dctx.stream_reader(input_buffer) as reader:
+                 shutil.copyfileobj(reader, decompressed_content)
+
+             content = decompressed_content.getvalue().decode('utf-8')
+             processor.processing_queue.put((filename, content))
+             return True
+
+         except Exception as e:
+             self._log_error(output_dir, filename, f"Decompression error: {str(e)}")
+             return False
+         finally:
+             try:
+                 input_buffer.close()
+                 decompressed_content.close()
+             except:
+                 pass
+
+     def save_regular_file(self, chunks, filename, output_dir, processor):
+         try:
+             content = b''.join(chunks).decode('utf-8')
+             processor.processing_queue.put((filename, content))
+             return True
+
+         except Exception as e:
+             self._log_error(output_dir, filename, f"Error saving file: {str(e)}")
+             return False
+
+     async def download_and_process(self, session, url, semaphore, decompression_pool, output_dir, processor):
+         async with semaphore:
+             chunks = []
+             filename = url.split('/')[-1]
+
+             api_key = self.api_key
+             if not api_key:
+                 raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+             try:
+                 headers = {
+                     'Connection': 'keep-alive',
+                     'Accept-Encoding': 'gzip, deflate, br',
+                     'Authorization': f'Bearer {api_key}'
+                 }
+
+                 async with session.get(url, headers=headers) as response:
+                     if response.status == 200:
+                         async for chunk in response.content.iter_chunked(self.CHUNK_SIZE):
+                             chunks.append(chunk)
+
+                         loop = asyncio.get_running_loop()
+                         if filename.endswith('.zst'):
+                             success = await loop.run_in_executor(
+                                 decompression_pool,
+                                 partial(self.decompress_stream, chunks, filename, output_dir, processor)
+                             )
+                         else:
+                             success = await loop.run_in_executor(
+                                 decompression_pool,
+                                 partial(self.save_regular_file, chunks, filename, output_dir, processor)
+                             )
+
+                         if not success:
+                             self._log_error(output_dir, filename, "Failed to process file")
+                     elif response.status == 401:
+                         self._log_error(output_dir, filename, "Authentication failed: Invalid API key")
+                         raise ValueError("Invalid API key")
+                     else:
+                         self._log_error(output_dir, filename, f"Download failed: Status {response.status}")
+             except Exception as e:
+                 self._log_error(output_dir, filename, str(e))
+
+     async def process_batch(self, urls, output_dir):
+         os.makedirs(output_dir, exist_ok=True)
+
+         with tqdm(total=len(urls), desc="Processing files") as pbar:
+             processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self)
+             processor.start_processing_workers()
+
+             semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
+             decompression_pool = ThreadPoolExecutor(max_workers=self.MAX_DECOMPRESSION_WORKERS)
+
+             connector = aiohttp.TCPConnector(
+                 limit=self.MAX_CONCURRENT_DOWNLOADS,
+                 force_close=False,
+                 ssl=ssl.create_default_context(),
+                 ttl_dns_cache=300,
+                 keepalive_timeout=60
+             )
+
+             # timeout should be max 2 hours.
+             async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=7200)) as session:
+                 tasks = [self.download_and_process(session, url, semaphore, decompression_pool, output_dir, processor) for url in urls]
+                 await asyncio.gather(*tasks, return_exceptions=True)
+
+             processor.processing_queue.join()
+             processor.stop_workers()
+             decompression_pool.shutdown()
+
+     async def download_all_pages(self, submission_type=None, cik=None, filing_date=None, output_dir="download"):
+         connector = aiohttp.TCPConnector(ssl=ssl.create_default_context())
+         async with aiohttp.ClientSession(connector=connector) as session:
+             try:
+                 urls, pagination = await self._fetch_submissions(session, submission_type=submission_type, cik=cik, filing_date=filing_date, page=1)
+                 total_urls = urls.copy()
+                 current_page = 1
+
+                 while pagination.get('hasMore', False):
+                     current_page += 1
+                     more_urls, pagination = await self._fetch_submissions(session, submission_type=submission_type, cik=cik, filing_date=filing_date, page=current_page)
+                     total_urls.extend(more_urls)
+
+                 if total_urls:
+                     total_urls = list(set(total_urls))  # Remove duplicates
+                     start_time = time.time()
+                     await self.process_batch(total_urls, output_dir)
+                     elapsed_time = time.time() - start_time
+                     print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
+                 else:
+                     print("No submissions found matching the criteria")
+
+             except InsufficientBalanceError as e:
+                 error_msg = {
+                     "error": "insufficient_balance",
+                     "required_cost": e.required_cost,
+                     "current_balance": e.current_balance,
+                     "total_urls": e.total_urls,
+                     "additional_funds_needed": e.required_cost - e.current_balance
+                 }
+                 self._log_error(output_dir, "balance_check", error_msg)
+                 return
+
+     def download_submissions(self, submission_type=None, cik=None, ticker=None, filing_date=None, output_dir="download"):
+         if self.api_key is None:
+             raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+         if filing_date is not None:
+             if isinstance(filing_date, str):
+                 filing_date = int(filing_date.replace('-', ''))
+             elif isinstance(filing_date, list):
+                 filing_date = [int(x.replace('-', '')) for x in filing_date]
+             elif isinstance(filing_date, tuple):
+                 filing_date = (int(filing_date[0].replace('-', '')), int(filing_date[1].replace('-', '')))
+
+         if ticker is not None:
+             cik = identifier_to_cik(ticker)
+
+         if cik is not None:
+             if isinstance(cik, str):
+                 cik = [int(cik)]
+             elif isinstance(cik, int):
+                 cik = [cik]
+             elif isinstance(cik, list):
+                 cik = [int(x) for x in cik]
+
+         async def _download():
+             try:
+                 await self.download_all_pages(submission_type=submission_type, cik=cik, filing_date=filing_date, output_dir=output_dir)
+             except Exception as e:
+                 if not isinstance(e, InsufficientBalanceError):
+                     self._log_error(output_dir, "download_error", str(e))
+
+         asyncio.run(_download())
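Based on the download_submissions signature added above, a minimal usage sketch might look like the following. The import path and the example arguments are assumptions (the class may also be re-exported from the package root); this is an illustration of the call shape, not documentation from the diff.

from datamule.downloader.premiumdownloader import PremiumDownloader  # assumed import path

downloader = PremiumDownloader(api_key="your-datamule-api-key")  # or set DATAMULE_API_KEY
downloader.download_submissions(
    submission_type="10-K",                    # str or list, per _fetch_submissions
    ticker="AAPL",                             # resolved to a CIK via identifier_to_cik
    filing_date=("2023-01-01", "2023-12-31"),  # a tuple becomes a startdt/enddt range
    output_dir="filings",
)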
datamule/helper.py CHANGED
@@ -1,136 +1,123 @@
- import requests
- import os
- from tqdm import tqdm
- import zipfile
- from pkg_resources import resource_filename
- import csv
- import re
-
- # Unused in current implementation.
- def construct_primary_doc_url(cik, accession_number,primary_doc_url):
-     accession_number = accession_number.replace("-", "")
-     return f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}/{primary_doc_url}"
-
- # DONE
- def _download_from_dropbox(url, output_path):
-     headers = {'user-agent': 'Wget/1.16 (linux-gnu)'}
-     r = requests.get(url, stream=True, headers=headers)
-     total_size = int(r.headers.get('content-length', 0))
-
-     with open(output_path, 'wb') as f, tqdm(
-         desc="Downloading " + os.path.basename(output_path),
-         total=total_size,
-         unit='iB',
-         unit_scale=True,
-         unit_divisor=1024,
-     ) as progress_bar:
-         for chunk in r.iter_content(chunk_size=1024):
-             size = f.write(chunk)
-             progress_bar.update(size)
-
-     # Check if the downloaded file is a zip file
-     if zipfile.is_zipfile(output_path):
-         extract_path = os.path.dirname(output_path)
-         with zipfile.ZipFile(output_path, 'r') as zip_ref:
-             for file_info in zip_ref.infolist():
-                 extract_file_path = os.path.join(extract_path, file_info.filename)
-                 with zip_ref.open(file_info) as file_in_zip, \
-                      open(extract_file_path, 'wb') as output_file, \
-                      tqdm(total=file_info.file_size, unit='B', unit_scale=True,
-                           desc=f"Extracting {file_info.filename}") as pbar:
-                     while True:
-                         chunk = file_in_zip.read(8192)
-                         if not chunk:
-                             break
-                         output_file.write(chunk)
-                         pbar.update(len(chunk))
-
-         # Remove the zip file after extraction
-         os.remove(output_path)
-         print(f"Extracted contents to {extract_path}")
-     else:
-         print(f"Downloaded file is not a zip. Saved to {output_path}")
-
- # May generalize to load any package resource
- def load_package_csv(name):
-     """Load package CSV files"""
-     csv_path = resource_filename('datamule', f'data/{name}.csv')
-     company_tickers = []
-
-     with open(csv_path, 'r') as csvfile:
-         csv_reader = csv.DictReader(csvfile)
-         for row in csv_reader:
-             company_tickers.append(row)
-
-     return company_tickers
-
- def load_package_dataset(dataset):
-     if dataset == 'company_tickers':
-         return load_package_csv('company_tickers')
-     elif dataset =='company_former_names':
-         return load_package_csv('company_former_names')
-     elif dataset =='company_metadata':
-         return load_package_csv('company_metadata')
-     elif dataset == 'sec_glossary':
-         return load_package_csv('sec-glossary')
-     elif dataset == 'xbrl_descriptions':
-         return load_package_csv('xbrl_descriptions')
-
- # DONE
- def identifier_to_cik(ticker):
-     """Convert company tickers to CIK codes"""
-     company_tickers = load_package_csv('company_tickers')
-     if ticker:
-         if isinstance(ticker, list):
-             cik = []
-             for t in ticker:
-                 cik.extend([company['cik'] for company in company_tickers if t == company['ticker']])
-         else:
-             cik = [company['cik'] for company in company_tickers if ticker == company['ticker']]
-
-     if not cik:
-         raise ValueError("No matching companies found")
-
-     return cik
-
-
- def fix_filing_url(url):
-     """Some Filings URLs have the wrong path. This is an issue with EFTS, that is solved in SEC.gov index page, but SEC.gov still has links to broken url."""
-     # Check if the URL ends with '/0001.txt'
-     if url.endswith('/0001.txt'):
-         # Extract the accession number from the URL
-         match = re.search(r'/(\d{18})/', url)
-         if match:
-             accession_number = match.group(1)
-             # Add dashes to the accession number
-             formatted_accession_number = f"{accession_number[:10]}-{accession_number[10:12]}-{accession_number[12:]}"
-             # Construct the new URL
-             new_url = url.rsplit('/', 1)[0] + f'/{formatted_accession_number}-0001.txt'
-             return new_url
-     elif url.endswith('/0001.htm'):
-         # Extract the accession number from the URL
-         match = re.search(r'/(\d{18})/', url)
-         if match:
-             accession_number = match.group(1)
-             # Add dashes to the accession number
-             formatted_accession_number = f"{accession_number[:10]}-{accession_number[10:12]}-{accession_number[12:]}"
-             # Construct the new URL
-             new_url = url.rsplit('/', 1)[0] + f'/{formatted_accession_number}-0001.htm'
-             return new_url
-
-     # If the URL doesn't end with '/0001.txt' or doesn't contain a valid accession number,
-     # return the original URL
-     return url
-
- def convert_to_dashed_accession(accession):
-     # Remove any existing dashes or whitespace
-     cleaned = ''.join(accession.split())
-
-     # Check if the cleaned string has 18 characters
-     if len(cleaned) != 18:
-         raise ValueError("Invalid accession number format. Expected 18 characters.")
-
-     # Insert dashes at the correct positions
-     dashed = f"{cleaned[:10]}-{cleaned[10:12]}-{cleaned[12:]}"
-
-     return dashed
+ import requests
+ import os
+ from tqdm import tqdm
+ import zipfile
+ from pkg_resources import resource_filename
+ import csv
+ import re
+
+ # Unused in current implementation.
+ def construct_primary_doc_url(cik, accession_number,primary_doc_url):
+     accession_number = accession_number.replace("-", "")
+     return f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}/{primary_doc_url}"
+
+ # DONE
+ def _download_from_dropbox(url, output_path):
+     headers = {'user-agent': 'Wget/1.16 (linux-gnu)'}
+     r = requests.get(url, stream=True, headers=headers)
+     total_size = int(r.headers.get('content-length', 0))
+
+     with open(output_path, 'wb') as f, tqdm(
+         desc="Downloading " + os.path.basename(output_path),
+         total=total_size,
+         unit='iB',
+         unit_scale=True,
+         unit_divisor=1024,
+     ) as progress_bar:
+         for chunk in r.iter_content(chunk_size=1024):
+             size = f.write(chunk)
+             progress_bar.update(size)
+
+     # Check if the downloaded file is a zip file
+     if zipfile.is_zipfile(output_path):
+         extract_path = os.path.dirname(output_path)
+         with zipfile.ZipFile(output_path, 'r') as zip_ref:
+             for file_info in zip_ref.infolist():
+                 extract_file_path = os.path.join(extract_path, file_info.filename)
+                 with zip_ref.open(file_info) as file_in_zip, \
+                      open(extract_file_path, 'wb') as output_file, \
+                      tqdm(total=file_info.file_size, unit='B', unit_scale=True,
+                           desc=f"Extracting {file_info.filename}") as pbar:
+                     while True:
+                         chunk = file_in_zip.read(8192)
+                         if not chunk:
+                             break
+                         output_file.write(chunk)
+                         pbar.update(len(chunk))
+
+         # Remove the zip file after extraction
+         os.remove(output_path)
+         print(f"Extracted contents to {extract_path}")
+     else:
+         print(f"Downloaded file is not a zip. Saved to {output_path}")
+
+ # May generalize to load any package resource
+ def load_package_csv(name):
+     """Load package CSV files"""
+     csv_path = resource_filename('datamule', f'data/{name}.csv')
+     company_tickers = []
+
+     with open(csv_path, 'r') as csvfile:
+         csv_reader = csv.DictReader(csvfile)
+         for row in csv_reader:
+             company_tickers.append(row)
+
+     return company_tickers
+
+ def load_package_dataset(dataset):
+     if dataset == 'company_tickers':
+         return load_package_csv('company_tickers')
+     elif dataset =='company_former_names':
+         return load_package_csv('company_former_names')
+     elif dataset =='company_metadata':
+         return load_package_csv('company_metadata')
+     elif dataset == 'sec_glossary':
+         return load_package_csv('sec-glossary')
+     elif dataset == 'xbrl_descriptions':
+         return load_package_csv('xbrl_descriptions')
+
+ # DONE
+ def identifier_to_cik(ticker):
+     """Convert company tickers to CIK codes"""
+     company_tickers = load_package_csv('company_tickers')
+     if ticker:
+         if isinstance(ticker, list):
+             cik = []
+             for t in ticker:
+                 cik.extend([company['cik'] for company in company_tickers if t == company['ticker']])
+         else:
+             cik = [company['cik'] for company in company_tickers if ticker == company['ticker']]
+
+     if not cik:
+         raise ValueError("No matching companies found")
+
+     return cik
+
+
+ def fix_filing_url(url):
+     match_suffix = re.search(r'/(\d{4})\.(.+?)$', url)
+     if match_suffix:
+         suffix_number = match_suffix.group(1)
+         file_ext = match_suffix.group(2)
+         match_accession = re.search(r'/(\d{18})/', url)
+         if match_accession:
+             accession_number = match_accession.group(1)
+             formatted_accession_number = f"{accession_number[:10]}-{accession_number[10:12]}-{accession_number[12:]}"
+             new_url = url.rsplit('/', 1)[0] + f'/{formatted_accession_number}-{suffix_number}.{file_ext}'
+             return new_url
+     return url
+
+ def convert_to_dashed_accession(accession):
+     # Remove any existing dashes or whitespace
+     cleaned = ''.join(accession.split())
+
+     # Check if the cleaned string has 18 characters
+     if len(cleaned) != 18:
+         raise ValueError("Invalid accession number format. Expected 18 characters.")
+
+     # Insert dashes at the correct positions
+     dashed = f"{cleaned[:10]}-{cleaned[10:12]}-{cleaned[12:]}"
+
+     return dashed
+
+ headers = {'User-Agent': 'John Smith johnsmith@gmail.com'}
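The rewritten fix_filing_url generalizes the old /0001.txt and /0001.htm special cases to any /NNNN.ext document suffix. A small sketch of the expected behavior, where the URL and accession number are illustrative examples and the import path is an assumption:

from datamule.helper import fix_filing_url, convert_to_dashed_accession  # assumed import path

# Example EDGAR URL whose final path segment lacks the dashed accession prefix.
broken = "https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/0001.txt"
print(fix_filing_url(broken))
# -> https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/0000320193-23-000106-0001.txt

print(convert_to_dashed_accession("000032019323000106"))
# -> 0000320193-23-000106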