datamule 0.381__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. datamule/__init__.py +46 -86
  2. datamule/book.py +16 -0
  3. datamule/config.py +29 -0
  4. datamule/data/company_former_names.csv +8148 -8148
  5. datamule/data/company_metadata.csv +10049 -10049
  6. datamule/data/company_tickers.csv +9999 -10168
  7. datamule/data/sec-glossary.csv +728 -728
  8. datamule/data/xbrl_descriptions.csv +10024 -10024
  9. datamule/document.py +278 -0
  10. datamule/downloader/downloader.py +374 -0
  11. datamule/downloader/premiumdownloader.py +335 -0
  12. datamule/helper.py +123 -136
  13. datamule/mapping_dicts/txt_mapping_dicts.py +232 -0
  14. datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
  15. datamule/monitor.py +238 -0
  16. datamule/mulebot/__init__.py +1 -1
  17. datamule/mulebot/helper.py +34 -34
  18. datamule/mulebot/mulebot.py +129 -129
  19. datamule/mulebot/mulebot_server/server.py +86 -86
  20. datamule/mulebot/mulebot_server/static/css/minimalist.css +173 -173
  21. datamule/mulebot/mulebot_server/static/scripts/artifacts.js +67 -67
  22. datamule/mulebot/mulebot_server/static/scripts/chat.js +91 -91
  23. datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +55 -55
  24. datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +14 -14
  25. datamule/mulebot/mulebot_server/static/scripts/main.js +56 -56
  26. datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +26 -26
  27. datamule/mulebot/mulebot_server/static/scripts/suggestions.js +46 -46
  28. datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +128 -128
  29. datamule/mulebot/mulebot_server/static/scripts/utils.js +27 -27
  30. datamule/mulebot/mulebot_server/templates/chat-minimalist.html +90 -90
  31. datamule/mulebot/search.py +51 -51
  32. datamule/mulebot/tools.py +82 -82
  33. datamule/packageupdater.py +207 -0
  34. datamule/portfolio.py +106 -0
  35. datamule/submission.py +76 -0
  36. datamule-1.0.0.dist-info/METADATA +27 -0
  37. datamule-1.0.0.dist-info/RECORD +40 -0
  38. {datamule-0.381.dist-info → datamule-1.0.0.dist-info}/WHEEL +1 -1
  39. datamule/data/filing_types.csv +0 -485
  40. datamule/data/ftd_locations.csv +0 -388
  41. datamule/datamule_api.py +0 -21
  42. datamule/dataset_builder/_init.py +0 -1
  43. datamule/dataset_builder/dataset_builder.py +0 -260
  44. datamule/downloader/__init__.py +0 -0
  45. datamule/downloader/dropbox_downloader.py +0 -225
  46. datamule/downloader/ftd.py +0 -216
  47. datamule/downloader/information_table_13f.py +0 -231
  48. datamule/downloader/sec_downloader.py +0 -635
  49. datamule/filing_viewer/__init__.py +0 -1
  50. datamule/filing_viewer/filing_viewer.py +0 -256
  51. datamule/global_vars.py +0 -202
  52. datamule/parser/__init__.py +0 -1
  53. datamule/parser/basic_10k_parser.py +0 -82
  54. datamule/parser/basic_10q_parser.py +0 -73
  55. datamule/parser/basic_13d_parser.py +0 -58
  56. datamule/parser/basic_13g_parser.py +0 -61
  57. datamule/parser/basic_8k_parser.py +0 -84
  58. datamule/parser/company_concepts_parser.py +0 -0
  59. datamule/parser/form_d_parser.py +0 -70
  60. datamule/parser/generalized_item_parser.py +0 -78
  61. datamule/parser/generalized_xml_parser.py +0 -0
  62. datamule/parser/helper.py +0 -75
  63. datamule/parser/information_table_parser_13fhr.py +0 -41
  64. datamule/parser/insider_trading_parser.py +0 -158
  65. datamule/parser/mappings.py +0 -95
  66. datamule/parser/n_port_p_parser.py +0 -70
  67. datamule/parser/sec_parser.py +0 -79
  68. datamule/parser/sgml_parser.py +0 -180
  69. datamule/sec_filing.py +0 -126
  70. datamule/sec_search.py +0 -20
  71. datamule-0.381.dist-info/METADATA +0 -132
  72. datamule-0.381.dist-info/RECORD +0 -61
  73. {datamule-0.381.dist-info → datamule-1.0.0.dist-info}/top_level.txt +0 -0
datamule/dataset_builder/dataset_builder.py (deleted)
@@ -1,260 +0,0 @@
- import pandas as pd
- import json
- import os
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from tqdm import tqdm
- import google.generativeai as genai
- import time
- from datetime import datetime
- import psutil
- from threading import Lock
-
- class RateLimiter:
-     def __init__(self, max_rpm):
-         self.min_delay = 62.0 / max_rpm # 58 seconds to allow for some buffer WIP
-         self.last_request = time.time()
-         self.lock = Lock()
-         self.request_count = 0
-
-     def acquire(self):
-         with self.lock:
-             now = time.time()
-             time_since_last = now - self.last_request
-             delay_needed = self.min_delay - time_since_last
-             self.last_request = now + max(0, delay_needed) # Update based on expected completion
-             self.request_count += 1
-             count = self.request_count
-
-         # Sleep outside the lock
-         if delay_needed > 0:
-             time.sleep(delay_needed)
-
-         return count
-
- class DatasetBuilder:
-     def __init__(self):
-         self.base_prompt = None
-         self.response_schema = None
-         self.input_path = None
-         self.output_path = None
-         self.failed_path = None
-         self.max_rpm = 1450
-         self.max_workers = 30
-         self.save_frequency = 100
-         self.output_columns = None
-         self.buffer = []
-         self.buffer_lock = Lock()
-         self.failed_ids = set()
-         self.failed_lock = Lock()
-         self.model_name = "gemini-1.5-flash-8b" # Default model
-         self.model_config = {} # Additional model configuration
-         self.api_key = None
-
-     def set_api_key(self, api_key):
-         """Set the API key for Google's Generative AI."""
-         self.api_key = api_key
-         genai.configure(api_key=api_key)
-         return self
-
-     def set_paths(self, input_path, output_path, failed_path):
-         """Set input and output file paths."""
-         self.input_path = input_path
-         self.output_path = output_path
-         self.failed_path = failed_path
-         return self
-
-     def set_base_prompt(self, prompt):
-         """Set the base prompt for LLM processing."""
-         self.base_prompt = prompt
-         return self
-
-     def set_response_schema(self, schema):
-         """Set the response schema and derive output columns."""
-         self.response_schema = schema
-         # Derive output columns from schema
-         if schema and 'items' in schema and 'properties' in schema['items']:
-             properties = schema['items']['properties']
-             self.output_columns = ['accession_number'] + list(properties.keys())
-         return self
-
-     def set_rpm(self, max_rpm=1450):
-         """Set the maximum requests per minute."""
-         self.max_rpm = max_rpm
-         return self
-
-     def set_max_workers(self, max_workers=30):
-         """Set the maximum number of concurrent workers."""
-         self.max_workers = max_workers
-         return self
-
-     def set_save_frequency(self, frequency=100):
-         """Set how often to save progress."""
-         self.save_frequency = frequency
-         return self
-
-     def set_model(self, model_name="gemini-1.5-flash-8b", **model_config):
-         """Set the model name and configuration."""
-         self.model_name = model_name
-         self.model_config = model_config
-         return self
-
-     def validate_config(self):
-         """Validate that all required configurations are set."""
-         if not all([self.base_prompt, self.response_schema, self.input_path,
-                     self.output_path, self.failed_path, self.api_key]):
-             raise ValueError("""Missing required configuration. Please ensure you have set:
-             - API key
-             - Paths (input_path, output_path, failed_path)
-             - Base prompt
-             - Response schema""")
-
-     def get_processed_ids(self):
-         """Get set of processed accession numbers from output file."""
-         if not os.path.exists(self.output_path):
-             return set()
-
-         try:
-             # Read only the accession_number column for memory efficiency
-             df = pd.read_csv(self.output_path, usecols=['accession_number'])
-             return set(df['accession_number'])
-         except Exception as e:
-             print(f"Warning: Error reading processed IDs: {e}")
-             return set()
-
-     def save_data(self, df_new):
-         """Append new data to existing CSV."""
-         df_new.to_csv(self.output_path, mode='a', header=not os.path.exists(self.output_path), index=False)
-
-     def save_failed_ids(self):
-         """Save failed accession numbers to file."""
-         with open(self.failed_path, 'w') as f:
-             for acc in self.failed_ids:
-                 f.write(f"{acc}\n")
-
-     def process_text(self, args):
-         """Process a single text entry through the model."""
-         model, text, accession_number, rate_limiter = args
-
-         current_requests = rate_limiter.acquire()
-
-         full_prompt = self.base_prompt + "\n\nINFORMATION:\n" + text
-
-         try:
-             generation_config = genai.GenerationConfig(
-                 response_mime_type="application/json",
-                 response_schema=self.response_schema,
-                 **self.model_config
-             )
-
-             response = model.generate_content(
-                 full_prompt,
-                 generation_config=generation_config
-             )
-             results = json.loads(response.text)
-
-             for result in results:
-                 result['accession_number'] = accession_number
-
-             with self.buffer_lock:
-                 self.buffer.extend(results)
-
-             return True, current_requests
-         except Exception as e:
-             with self.failed_lock:
-                 self.failed_ids.add(accession_number)
-             return False, f"Error processing {accession_number}: {str(e)}"
-
-     def build(self):
-         """Main processing method to build the dataset."""
-         self.validate_config()
-
-         # Initialize model and rate limiter
-         model = genai.GenerativeModel(self.model_name)
-         rate_limiter = RateLimiter(self.max_rpm)
-
-         # Load data
-         print("Loading data...")
-         df_input = pd.read_csv(self.input_path)
-         processed_ids = self.get_processed_ids()
-         df_to_process = df_input[~df_input['accession_number'].isin(processed_ids)]
-
-         total_in_dataset = len(df_input)
-         already_processed = len(processed_ids)
-         to_process = len(df_to_process)
-
-         print(f"Total entries in dataset: {total_in_dataset}")
-         print(f"Already processed: {already_processed}")
-         print(f"New entries to process: {to_process}")
-
-         if len(df_to_process) == 0:
-             print("All entries already processed!")
-             return
-
-         work_items = [
-             (model, row['text'], row['accession_number'], rate_limiter)
-             for _, row in df_to_process.iterrows()
-         ]
-
-         start_time = time.time()
-         last_save_time = time.time()
-         processed_count = 0
-
-         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-             futures = {executor.submit(self.process_text, item): item for item in work_items}
-
-             with tqdm(total=total_in_dataset, initial=already_processed, desc="Processing entries") as pbar:
-                 for future in as_completed(futures):
-                     success, result = future.result()
-
-                     if not success:
-                         print(f"\n{result}")
-
-                     processed_count += 1
-                     pbar.update(1)
-
-                     elapsed = time.time() - start_time
-                     rpm = processed_count / (elapsed / 60)
-                     memory_usage = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
-
-                     current_progress = already_processed + processed_count
-                     pbar.set_description(
-                         f"Processed {current_progress}/{total_in_dataset} | {rpm:.0f} RPM | Mem: {memory_usage:.0f}MB"
-                     )
-
-                     # Save periodically using append
-                     if len(self.buffer) >= self.save_frequency:
-                         with self.buffer_lock:
-                             df_new = pd.DataFrame(self.buffer)
-                             self.buffer = []
-
-                         if not df_new.empty:
-                             self.save_data(df_new)
-                             last_save_time = time.time()
-
-                     # Save failed IDs periodically
-                     if self.failed_ids and time.time() - last_save_time > 300:
-                         self.save_failed_ids()
-                         last_save_time = time.time()
-
-         # Save any remaining results
-         if self.buffer:
-             with self.buffer_lock:
-                 df_new = pd.DataFrame(self.buffer)
-                 self.buffer = []
-
-             if not df_new.empty:
-                 self.save_data(df_new)
-
-         if self.failed_ids:
-             self.save_failed_ids()
-
-         # Print final statistics
-         elapsed = time.time() - start_time
-         final_rpm = processed_count / (elapsed / 60)
-
-         print(f"\nProcessing complete:")
-         print(f"Total processed in this run: {processed_count}")
-         print(f"Average speed: {final_rpm:.0f} RPM")
-         print(f"Failed entries: {len(self.failed_ids)}")
-         if self.failed_ids:
-             print(f"Failed entries saved to: {self.failed_path}")
datamule/downloader/__init__.py: file without changes
datamule/downloader/dropbox_downloader.py (deleted)
@@ -1,225 +0,0 @@
- import asyncio
- import aiohttp
- from aiolimiter import AsyncLimiter
- import os
- from tqdm import tqdm
- from tqdm.asyncio import tqdm as atqdm
- from datetime import datetime, timedelta
- from urllib.parse import urlparse, parse_qs, urlencode
- import math
- import re
- import aiofiles
- import json
- import csv
- from pkg_resources import resource_filename
- import zipfile
- import shutil
-
- class DropboxDownloader:
-     """
-     Asynchronous downloader for handling multiple file downloads with rate limiting and progress tracking.
-
-     This class provides functionality to download multiple files concurrently with rate limiting,
-     progress bars, and automatic handling of zip archives (including multi-part archives).
-
-     Parameters
-     ----------
-     concurrent_downloads : int, optional
-         Maximum number of concurrent downloads allowed (default is 5)
-     rate_limit : int, optional
-         Maximum number of requests per second (default is 10)
-
-     Attributes
-     ----------
-     semaphore : asyncio.Semaphore
-         Controls the number of concurrent downloads
-     rate_limiter : AsyncLimiter
-         Handles rate limiting of requests
-     session : aiohttp.ClientSession
-         HTTP session for making requests
-     progress_bars : dict
-         Dictionary of progress bars for active downloads
-     """
-
-     def __init__(self, concurrent_downloads=5, rate_limit=10):
-         """Initialize the DropboxDownloader with specified concurrency and rate limits."""
-         self.semaphore = asyncio.Semaphore(concurrent_downloads)
-         self.rate_limiter = AsyncLimiter(rate_limit, 1) # rate_limit requests per second
-         self.session = None
-         self.progress_bars = {}
-
-     async def create_session(self):
-         """
-         Create an aiohttp client session.
-
-         This method should be called before starting any downloads.
-         """
-         self.session = aiohttp.ClientSession()
-
-     async def close_session(self):
-         """
-         Close the aiohttp client session.
-
-         This method should be called after all downloads are complete.
-         """
-         if self.session:
-             await self.session.close()
-
-     async def download_file(self, url, dest_folder):
-         """
-         Download a single file with progress tracking and automatic unzipping.
-
-         Parameters
-         ----------
-         url : str
-             URL of the file to download
-         dest_folder : str
-             Destination folder where the file will be saved
-
-         Notes
-         -----
-         - Uses a progress bar to show download progress
-         - Automatically handles zip files and multi-part archives
-         - Rate limited based on the instance settings
-         """
-         async with self.semaphore:
-             await self.rate_limiter.acquire()
-             try:
-                 file_name = os.path.basename(urlparse(url).path)
-                 file_path = os.path.join(dest_folder, file_name)
-
-                 async with self.session.get(url) as response:
-                     if response.status != 200:
-                         print(f"Failed to download {url}: HTTP {response.status}")
-                         return
-
-                     file_size = int(response.headers.get('Content-Length', 0))
-
-                     if file_name not in self.progress_bars:
-                         self.progress_bars[file_name] = tqdm(
-                             total=file_size,
-                             unit='iB',
-                             unit_scale=True,
-                             desc=file_name
-                         )
-
-                     async with aiofiles.open(file_path, 'wb') as f:
-                         chunk_size = 8192
-                         downloaded = 0
-                         async for chunk in response.content.iter_chunked(chunk_size):
-                             await f.write(chunk)
-                             downloaded += len(chunk)
-                             self.progress_bars[file_name].update(len(chunk))
-
-                 self.progress_bars[file_name].close()
-                 del self.progress_bars[file_name]
-
-                 print(f"Downloaded {file_name}")
-
-                 if re.match(r'.*\.zip(\.001)?$', file_name):
-                     await self.unzip_file(file_path, dest_folder)
-
-             except Exception as e:
-                 print(f"Error downloading {url}: {str(e)}")
-
-     async def unzip_file(self, file_path, dest_folder):
-         """
-         Extract contents of a zip file and clean up archive files.
-
-         Handles both single zip files and multi-part archives.
-
-         Parameters
-         ----------
-         file_path : str
-             Path to the zip file
-         dest_folder : str
-             Destination folder for extracted contents
-
-         Notes
-         -----
-         - Automatically combines multi-part archives
-         - Deletes archive files after successful extraction
-         - Handles both .zip and .zip.001 format files
-         """
-         try:
-             base_name = os.path.splitext(file_path)[0]
-             if file_path.endswith('.001'):
-                 base_name = os.path.splitext(base_name)[0]
-
-             combined_zip = f"{base_name}.zip"
-
-             # Combine parts if necessary
-             if not os.path.exists(combined_zip):
-                 with open(combined_zip, 'wb') as outfile:
-                     part_num = 1
-                     while True:
-                         part_path = f"{base_name}.zip.{part_num:03d}"
-                         if not os.path.exists(part_path):
-                             break
-                         with open(part_path, 'rb') as infile:
-                             shutil.copyfileobj(infile, outfile)
-                         part_num += 1
-
-             # Unzip the combined file
-             with zipfile.ZipFile(combined_zip, 'r') as zip_ref:
-                 zip_ref.extractall(dest_folder)
-             print(f"Unzipped {os.path.basename(base_name)}")
-
-             # Remove archive files
-             if os.path.exists(combined_zip):
-                 os.remove(combined_zip)
-
-             part_num = 1
-             while True:
-                 part_path = f"{base_name}.zip.{part_num:03d}"
-                 if not os.path.exists(part_path):
-                     break
-                 os.remove(part_path)
-                 part_num += 1
-
-             standalone_zip = f"{base_name}.zip"
-             if os.path.exists(standalone_zip):
-                 os.remove(standalone_zip)
-
-             print(f"Cleaned up zip files for {os.path.basename(base_name)}")
-         except Exception as e:
-             print(f"Error unzipping {file_path}: {str(e)}")
-
-     async def _download_urls(self, urls, dest_folder):
-         """
-         Internal method to handle multiple URL downloads.
-
-         Parameters
-         ----------
-         urls : list of str
-             List of URLs to download
-         dest_folder : str
-             Destination folder for downloaded files
-         """
-         os.makedirs(dest_folder, exist_ok=True)
-         await self.create_session()
-         tasks = [self.download_file(url, dest_folder) for url in urls]
-         await asyncio.gather(*tasks)
-         await self.close_session()
-
-     def download(self, urls, output_dir):
-         """
-         Download multiple URLs to a specified directory.
-
-         This is the main method to use for downloading files. It handles the creation
-         and cleanup of the async event loop.
-
-         Parameters
-         ----------
-         urls : list of str
-             List of URLs to download
-         output_dir : str
-             Directory where files will be saved
-
-         Examples
-         --------
-         >>> downloader = DropboxDownloader(concurrent_downloads=3, rate_limit=5)
-         >>> urls = ['http://example.com/file1.zip', 'http://example.com/file2.zip']
-         >>> downloader.download(urls, '/path/to/output')
-         """
-         return asyncio.run(self._download_urls(urls, output_dir))
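The removed DropboxDownloader was driven through its synchronous download() wrapper, as its own docstring example shows. The sketch below restates that usage; the URLs and output directory are placeholders, and the .zip.001 name only illustrates the multi-part naming convention that unzip_file reassembles before extraction.

from datamule.downloader.dropbox_downloader import DropboxDownloader

# Placeholders: substitute real links and a real output directory.
urls = [
    "https://example.com/archive.zip",          # single archive: extracted, then deleted
    "https://example.com/big_archive.zip.001",  # first part of a multi-part archive
]

dl = DropboxDownloader(concurrent_downloads=3, rate_limit=5)
dl.download(urls, "downloads/")  # blocks until all downloads and unzips complete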
datamule/downloader/ftd.py (deleted)
@@ -1,216 +0,0 @@
- from datetime import datetime, timedelta
- import pkg_resources
- import io
- import re
- import os
- import zipfile
- import csv
- from concurrent.futures import ThreadPoolExecutor
- from tqdm import tqdm
-
- def process_ftd_zip(zip_path):
-     """
-     Process a single FTD (Fails-to-Deliver) ZIP file by converting its contents to CSV format.
-
-     This function extracts the contents of a ZIP file containing tab-delimited data,
-     converts it to CSV format, and removes the original ZIP file.
-
-     Parameters
-     ----------
-     zip_path : str
-         Path to the ZIP file to process
-
-     Notes
-     -----
-     - Assumes each ZIP file contains exactly one data file
-     - Uses '|' as the delimiter for input data
-     - Removes the original ZIP file after processing
-     - Handles UTF-8 encoding with replacement for invalid characters
-
-     Examples
-     --------
-     >>> process_ftd_zip('path/to/cnsfails202301a.zip')
-     """
-     base_name = os.path.splitext(zip_path)[0]
-     csv_path = f"{base_name}.csv"
-
-     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-         file_name = zip_ref.namelist()[0] # Assuming only one file per zip
-         with zip_ref.open(file_name) as file:
-             content = io.TextIOWrapper(file, encoding='utf-8', errors='replace').read()
-
-     # Convert tab-delimited content to CSV
-     with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
-         writer = csv.writer(csvfile)
-         for line in content.split('\n'):
-             writer.writerow(line.split('|'))
-
-     # Remove the original zip file
-     os.remove(zip_path)
-
- def process_all_ftd_zips(output_dir):
-     """
-     Process all FTD ZIP files in a directory concurrently.
-
-     Uses ThreadPoolExecutor for parallel processing with progress tracking.
-
-     Parameters
-     ----------
-     output_dir : str
-         Directory containing the ZIP files to process
-
-     Notes
-     -----
-     - Processes all files ending with '.zip' in the directory
-     - Shows progress bar during processing
-     - Uses system default number of worker threads
-
-     Examples
-     --------
-     >>> process_all_ftd_zips('/path/to/ftd/files')
-     """
-     zip_files = [f for f in os.listdir(output_dir) if f.endswith('.zip')]
-
-     # Use ThreadPoolExecutor for parallel processing with tqdm
-     with ThreadPoolExecutor() as executor:
-         list(tqdm(executor.map(lambda f: process_ftd_zip(os.path.join(output_dir, f)), zip_files),
-                   total=len(zip_files),
-                   desc="Processing ZIP files",
-                   unit="file"))
-
- def load_csv_data():
-     """
-     Load existing FTD locations data from the package's data directory.
-
-     Returns
-     -------
-     list of dict
-         List of dictionaries containing the CSV data, where each dictionary
-         represents a row with keys from the CSV header
-
-     Notes
-     -----
-     - Reads from 'ftd_locations.csv' in the package's data directory
-     - Uses UTF-8 encoding
-     - Assumes CSV has a header row
-
-     Examples
-     --------
-     >>> data = load_csv_data()
-     >>> print(data[0]['url']) # Print first URL in data
-     """
-     csv_content = pkg_resources.resource_string('datamule', 'data/ftd_locations.csv')
-     csv_data = []
-     csv_file = io.StringIO(csv_content.decode('utf-8'))
-     csv_reader = csv.DictReader(csv_file)
-     for row in csv_reader:
-         csv_data.append(row)
-     return csv_data
-
- def extract_date_from_url(url):
-     """
-     Extract the date from an FTD file URL.
-
-     Parameters
-     ----------
-     url : str
-         URL of the FTD file
-
-     Returns
-     -------
-     datetime or None
-         Datetime object representing the month and year from the URL,
-         or None if no date is found
-
-     Notes
-     -----
-     - Expects URLs in format containing 'cnsfails{YYYYMM}[a|b].zip'
-     - Returns None if the URL doesn't match the expected pattern
-
-     Examples
-     --------
-     >>> url = "https://www.sec.gov/files/data/fails-deliver-data/cnsfails202301a.zip"
-     >>> date = extract_date_from_url(url)
-     >>> print(date)
-     2023-01-01 00:00:00
-     """
-     match = re.search(r'cnsfails(\d{6})[ab]\.zip', url)
-     if match:
-         date_str = match.group(1)
-         return datetime.strptime(date_str, '%Y%m')
-     return None
-
- def generate_urls(start_date, end_date):
-     """
-     Generate FTD file URLs for a date range.
-
-     Parameters
-     ----------
-     start_date : datetime
-         Start date for URL generation
-     end_date : datetime
-         End date for URL generation
-
-     Returns
-     -------
-     list of str
-         List of URLs for FTD files in the date range
-
-     Notes
-     -----
-     - Generates two URLs per month ('a' and 'b' files)
-     - Uses 15-day intervals for half-month periods
-     - URLs follow SEC's file naming convention
-
-     Examples
-     --------
-     >>> start = datetime(2023, 1, 1)
-     >>> end = datetime(2023, 12, 31)
-     >>> urls = generate_urls(start, end)
-     """
-     urls = []
-     current_date = start_date
-     while current_date <= end_date:
-         for half in ['a', 'b']:
-             url = f"https://www.sec.gov/files/data/fails-deliver-data/cnsfails{current_date.strftime('%Y%m')}{half}.zip"
-             urls.append(url)
-             current_date += timedelta(days=15) # Move to the next half-month
-     return urls
-
- def get_all_ftd_urls():
-     """
-     Get a complete list of FTD URLs, including both existing and new ones.
-
-     Returns
-     -------
-     list of str
-         Combined list of existing and newly generated URLs
-
-     Notes
-     -----
-     - Loads existing URLs from the package's data file
-     - Finds the latest date in existing URLs
-     - Generates new URLs from the month after the latest date up to current date
-     - Combines and returns all URLs
-
-     Examples
-     --------
-     >>> urls = get_all_ftd_urls()
-     >>> print(len(urls)) # Number of URLs
-     >>> print(urls[-1]) # Most recent URL
-     """
-     # Load existing URLs
-     csv_data = load_csv_data()
-     existing_urls = [row['url'] for row in csv_data]
-
-     # Find the last date in the existing URLs
-     last_date = max(extract_date_from_url(url) for url in existing_urls if extract_date_from_url(url))
-
-     # Generate new URLs starting from the month after the last date
-     start_date = (last_date.replace(day=1) + timedelta(days=32)).replace(day=1)
-     end_date = datetime.now()
-
-     new_urls = generate_urls(start_date, end_date)
-
-     # Combine and return all URLs
-     return existing_urls + new_urls
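The removed ftd.py module was a two-stage helper: get_all_ftd_urls() merges the URLs bundled in data/ftd_locations.csv (also removed in 1.0.0) with newly generated month URLs, and process_all_ftd_zips() converts the downloaded archives to CSV. Below is a minimal sketch of that composition; the plain urllib download and the User-Agent string are stand-ins (0.381 shipped its own SEC downloader, likewise removed), and recently generated half-month URLs may not exist yet on sec.gov.

import os
import urllib.request

from datamule.downloader.ftd import get_all_ftd_urls, process_all_ftd_zips

output_dir = "ftd_data"  # placeholder output directory
os.makedirs(output_dir, exist_ok=True)

# Stage 1: fetch every bundled and newly generated URL.
# SEC's fair-access policy expects a descriptive User-Agent; this one is a placeholder.
headers = {"User-Agent": "example-name example@example.com"}
for url in get_all_ftd_urls():
    dest = os.path.join(output_dir, os.path.basename(url))
    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req) as resp, open(dest, "wb") as fh:
            fh.write(resp.read())
    except Exception as exc:  # e.g. a half-month file not yet published
        print(f"Skipping {url}: {exc}")

# Stage 2: convert each ZIP to pipe-split CSV in place (original archives are deleted).
process_all_ftd_zips(output_dir)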