datamule 0.381__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/__init__.py +46 -86
- datamule/book/book.py +34 -0
- datamule/book/eftsquery.py +127 -0
- datamule/book/xbrl_retriever.py +88 -0
- datamule/config.py +29 -0
- datamule/data/company_former_names.csv +8148 -8148
- datamule/data/company_metadata.csv +10049 -10049
- datamule/data/company_tickers.csv +9999 -10168
- datamule/data/sec-glossary.csv +728 -728
- datamule/data/xbrl_descriptions.csv +10024 -10024
- datamule/document.py +279 -0
- datamule/downloader/downloader.py +374 -0
- datamule/downloader/premiumdownloader.py +335 -0
- datamule/helper.py +123 -136
- datamule/mapping_dicts/txt_mapping_dicts.py +232 -0
- datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
- datamule/monitor.py +238 -0
- datamule/mulebot/__init__.py +1 -1
- datamule/mulebot/helper.py +34 -34
- datamule/mulebot/mulebot.py +129 -129
- datamule/mulebot/mulebot_server/server.py +86 -86
- datamule/mulebot/mulebot_server/static/css/minimalist.css +173 -173
- datamule/mulebot/mulebot_server/static/scripts/artifacts.js +67 -67
- datamule/mulebot/mulebot_server/static/scripts/chat.js +91 -91
- datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +55 -55
- datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +14 -14
- datamule/mulebot/mulebot_server/static/scripts/main.js +56 -56
- datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +26 -26
- datamule/mulebot/mulebot_server/static/scripts/suggestions.js +46 -46
- datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +128 -128
- datamule/mulebot/mulebot_server/static/scripts/utils.js +27 -27
- datamule/mulebot/mulebot_server/templates/chat-minimalist.html +90 -90
- datamule/mulebot/search.py +51 -51
- datamule/mulebot/tools.py +82 -82
- datamule/packageupdater.py +207 -0
- datamule/portfolio.py +106 -0
- datamule/submission.py +76 -0
- datamule-1.0.2.dist-info/METADATA +27 -0
- datamule-1.0.2.dist-info/RECORD +43 -0
- {datamule-0.381.dist-info → datamule-1.0.2.dist-info}/WHEEL +1 -1
- datamule/data/filing_types.csv +0 -485
- datamule/data/ftd_locations.csv +0 -388
- datamule/datamule_api.py +0 -21
- datamule/dataset_builder/_init.py +0 -1
- datamule/dataset_builder/dataset_builder.py +0 -260
- datamule/downloader/dropbox_downloader.py +0 -225
- datamule/downloader/ftd.py +0 -216
- datamule/downloader/information_table_13f.py +0 -231
- datamule/downloader/sec_downloader.py +0 -635
- datamule/filing_viewer/__init__.py +0 -1
- datamule/filing_viewer/filing_viewer.py +0 -256
- datamule/global_vars.py +0 -202
- datamule/parser/__init__.py +0 -1
- datamule/parser/basic_10k_parser.py +0 -82
- datamule/parser/basic_10q_parser.py +0 -73
- datamule/parser/basic_13d_parser.py +0 -58
- datamule/parser/basic_13g_parser.py +0 -61
- datamule/parser/basic_8k_parser.py +0 -84
- datamule/parser/company_concepts_parser.py +0 -0
- datamule/parser/form_d_parser.py +0 -70
- datamule/parser/generalized_item_parser.py +0 -78
- datamule/parser/generalized_xml_parser.py +0 -0
- datamule/parser/helper.py +0 -75
- datamule/parser/information_table_parser_13fhr.py +0 -41
- datamule/parser/insider_trading_parser.py +0 -158
- datamule/parser/mappings.py +0 -95
- datamule/parser/n_port_p_parser.py +0 -70
- datamule/parser/sec_parser.py +0 -79
- datamule/parser/sgml_parser.py +0 -180
- datamule/sec_filing.py +0 -126
- datamule/sec_search.py +0 -20
- datamule-0.381.dist-info/METADATA +0 -132
- datamule-0.381.dist-info/RECORD +0 -61
- /datamule/{downloader → book}/__init__.py +0 -0
- {datamule-0.381.dist-info → datamule-1.0.2.dist-info}/top_level.txt +0 -0
datamule/dataset_builder/dataset_builder.py
DELETED
@@ -1,260 +0,0 @@
-import pandas as pd
-import json
-import os
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from tqdm import tqdm
-import google.generativeai as genai
-import time
-from datetime import datetime
-import psutil
-from threading import Lock
-
-class RateLimiter:
-    def __init__(self, max_rpm):
-        self.min_delay = 62.0 / max_rpm # 58 seconds to allow for some buffer WIP
-        self.last_request = time.time()
-        self.lock = Lock()
-        self.request_count = 0
-
-    def acquire(self):
-        with self.lock:
-            now = time.time()
-            time_since_last = now - self.last_request
-            delay_needed = self.min_delay - time_since_last
-            self.last_request = now + max(0, delay_needed) # Update based on expected completion
-            self.request_count += 1
-            count = self.request_count
-
-        # Sleep outside the lock
-        if delay_needed > 0:
-            time.sleep(delay_needed)
-
-        return count
-
-class DatasetBuilder:
-    def __init__(self):
-        self.base_prompt = None
-        self.response_schema = None
-        self.input_path = None
-        self.output_path = None
-        self.failed_path = None
-        self.max_rpm = 1450
-        self.max_workers = 30
-        self.save_frequency = 100
-        self.output_columns = None
-        self.buffer = []
-        self.buffer_lock = Lock()
-        self.failed_ids = set()
-        self.failed_lock = Lock()
-        self.model_name = "gemini-1.5-flash-8b" # Default model
-        self.model_config = {} # Additional model configuration
-        self.api_key = None
-
-    def set_api_key(self, api_key):
-        """Set the API key for Google's Generative AI."""
-        self.api_key = api_key
-        genai.configure(api_key=api_key)
-        return self
-
-    def set_paths(self, input_path, output_path, failed_path):
-        """Set input and output file paths."""
-        self.input_path = input_path
-        self.output_path = output_path
-        self.failed_path = failed_path
-        return self
-
-    def set_base_prompt(self, prompt):
-        """Set the base prompt for LLM processing."""
-        self.base_prompt = prompt
-        return self
-
-    def set_response_schema(self, schema):
-        """Set the response schema and derive output columns."""
-        self.response_schema = schema
-        # Derive output columns from schema
-        if schema and 'items' in schema and 'properties' in schema['items']:
-            properties = schema['items']['properties']
-            self.output_columns = ['accession_number'] + list(properties.keys())
-        return self
-
-    def set_rpm(self, max_rpm=1450):
-        """Set the maximum requests per minute."""
-        self.max_rpm = max_rpm
-        return self
-
-    def set_max_workers(self, max_workers=30):
-        """Set the maximum number of concurrent workers."""
-        self.max_workers = max_workers
-        return self
-
-    def set_save_frequency(self, frequency=100):
-        """Set how often to save progress."""
-        self.save_frequency = frequency
-        return self
-
-    def set_model(self, model_name="gemini-1.5-flash-8b", **model_config):
-        """Set the model name and configuration."""
-        self.model_name = model_name
-        self.model_config = model_config
-        return self
-
-    def validate_config(self):
-        """Validate that all required configurations are set."""
-        if not all([self.base_prompt, self.response_schema, self.input_path,
-                    self.output_path, self.failed_path, self.api_key]):
-            raise ValueError("""Missing required configuration. Please ensure you have set:
-            - API key
-            - Paths (input_path, output_path, failed_path)
-            - Base prompt
-            - Response schema""")
-
-    def get_processed_ids(self):
-        """Get set of processed accession numbers from output file."""
-        if not os.path.exists(self.output_path):
-            return set()
-
-        try:
-            # Read only the accession_number column for memory efficiency
-            df = pd.read_csv(self.output_path, usecols=['accession_number'])
-            return set(df['accession_number'])
-        except Exception as e:
-            print(f"Warning: Error reading processed IDs: {e}")
-            return set()
-
-    def save_data(self, df_new):
-        """Append new data to existing CSV."""
-        df_new.to_csv(self.output_path, mode='a', header=not os.path.exists(self.output_path), index=False)
-
-    def save_failed_ids(self):
-        """Save failed accession numbers to file."""
-        with open(self.failed_path, 'w') as f:
-            for acc in self.failed_ids:
-                f.write(f"{acc}\n")
-
-    def process_text(self, args):
-        """Process a single text entry through the model."""
-        model, text, accession_number, rate_limiter = args
-
-        current_requests = rate_limiter.acquire()
-
-        full_prompt = self.base_prompt + "\n\nINFORMATION:\n" + text
-
-        try:
-            generation_config = genai.GenerationConfig(
-                response_mime_type="application/json",
-                response_schema=self.response_schema,
-                **self.model_config
-            )
-
-            response = model.generate_content(
-                full_prompt,
-                generation_config=generation_config
-            )
-            results = json.loads(response.text)
-
-            for result in results:
-                result['accession_number'] = accession_number
-
-            with self.buffer_lock:
-                self.buffer.extend(results)
-
-            return True, current_requests
-        except Exception as e:
-            with self.failed_lock:
-                self.failed_ids.add(accession_number)
-            return False, f"Error processing {accession_number}: {str(e)}"
-
-    def build(self):
-        """Main processing method to build the dataset."""
-        self.validate_config()
-
-        # Initialize model and rate limiter
-        model = genai.GenerativeModel(self.model_name)
-        rate_limiter = RateLimiter(self.max_rpm)
-
-        # Load data
-        print("Loading data...")
-        df_input = pd.read_csv(self.input_path)
-        processed_ids = self.get_processed_ids()
-        df_to_process = df_input[~df_input['accession_number'].isin(processed_ids)]
-
-        total_in_dataset = len(df_input)
-        already_processed = len(processed_ids)
-        to_process = len(df_to_process)
-
-        print(f"Total entries in dataset: {total_in_dataset}")
-        print(f"Already processed: {already_processed}")
-        print(f"New entries to process: {to_process}")
-
-        if len(df_to_process) == 0:
-            print("All entries already processed!")
-            return
-
-        work_items = [
-            (model, row['text'], row['accession_number'], rate_limiter)
-            for _, row in df_to_process.iterrows()
-        ]
-
-        start_time = time.time()
-        last_save_time = time.time()
-        processed_count = 0
-
-        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
-            futures = {executor.submit(self.process_text, item): item for item in work_items}
-
-            with tqdm(total=total_in_dataset, initial=already_processed, desc="Processing entries") as pbar:
-                for future in as_completed(futures):
-                    success, result = future.result()
-
-                    if not success:
-                        print(f"\n{result}")
-
-                    processed_count += 1
-                    pbar.update(1)
-
-                    elapsed = time.time() - start_time
-                    rpm = processed_count / (elapsed / 60)
-                    memory_usage = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
-
-                    current_progress = already_processed + processed_count
-                    pbar.set_description(
-                        f"Processed {current_progress}/{total_in_dataset} | {rpm:.0f} RPM | Mem: {memory_usage:.0f}MB"
-                    )
-
-                    # Save periodically using append
-                    if len(self.buffer) >= self.save_frequency:
-                        with self.buffer_lock:
-                            df_new = pd.DataFrame(self.buffer)
-                            self.buffer = []
-
-                        if not df_new.empty:
-                            self.save_data(df_new)
-                            last_save_time = time.time()
-
-                    # Save failed IDs periodically
-                    if self.failed_ids and time.time() - last_save_time > 300:
-                        self.save_failed_ids()
-                        last_save_time = time.time()
-
-        # Save any remaining results
-        if self.buffer:
-            with self.buffer_lock:
-                df_new = pd.DataFrame(self.buffer)
-                self.buffer = []
-
-            if not df_new.empty:
-                self.save_data(df_new)
-
-        if self.failed_ids:
-            self.save_failed_ids()
-
-        # Print final statistics
-        elapsed = time.time() - start_time
-        final_rpm = processed_count / (elapsed / 60)
-
-        print(f"\nProcessing complete:")
-        print(f"Total processed in this run: {processed_count}")
-        print(f"Average speed: {final_rpm:.0f} RPM")
-        print(f"Failed entries: {len(self.failed_ids)}")
-        if self.failed_ids:
-            print(f"Failed entries saved to: {self.failed_path}")
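
For reference, a minimal sketch of how this removed DatasetBuilder was wired together through its fluent setters (the import path follows the 0.381 layout listed above; the API key, file paths, prompt, and schema fields are hypothetical placeholders, and the input CSV is expected to carry 'accession_number' and 'text' columns, per build() and process_text() above):

    from datamule.dataset_builder.dataset_builder import DatasetBuilder  # path per the 0.381 layout

    # Hypothetical response schema; set_response_schema() derives the output
    # columns from schema['items']['properties'].
    schema = {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "company": {"type": "string"},
                "amount": {"type": "string"},
            },
        },
    }

    builder = (
        DatasetBuilder()
        .set_api_key("YOUR_GEMINI_API_KEY")                     # placeholder key
        .set_paths("filings.csv", "dataset.csv", "failed.txt")  # placeholder paths
        .set_base_prompt("Extract every company name and dollar amount mentioned.")
        .set_response_schema(schema)
        .set_model("gemini-1.5-flash-8b")
        .set_rpm(1450)
        .set_max_workers(30)
        .set_save_frequency(100)
    )
    builder.build()  # reads filings.csv, appends extracted rows to dataset.csv, logs failures to failed.txt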
datamule/downloader/dropbox_downloader.py
DELETED
@@ -1,225 +0,0 @@
-import asyncio
-import aiohttp
-from aiolimiter import AsyncLimiter
-import os
-from tqdm import tqdm
-from tqdm.asyncio import tqdm as atqdm
-from datetime import datetime, timedelta
-from urllib.parse import urlparse, parse_qs, urlencode
-import math
-import re
-import aiofiles
-import json
-import csv
-from pkg_resources import resource_filename
-import zipfile
-import shutil
-
-class DropboxDownloader:
-    """
-    Asynchronous downloader for handling multiple file downloads with rate limiting and progress tracking.
-
-    This class provides functionality to download multiple files concurrently with rate limiting,
-    progress bars, and automatic handling of zip archives (including multi-part archives).
-
-    Parameters
-    ----------
-    concurrent_downloads : int, optional
-        Maximum number of concurrent downloads allowed (default is 5)
-    rate_limit : int, optional
-        Maximum number of requests per second (default is 10)
-
-    Attributes
-    ----------
-    semaphore : asyncio.Semaphore
-        Controls the number of concurrent downloads
-    rate_limiter : AsyncLimiter
-        Handles rate limiting of requests
-    session : aiohttp.ClientSession
-        HTTP session for making requests
-    progress_bars : dict
-        Dictionary of progress bars for active downloads
-    """
-
-    def __init__(self, concurrent_downloads=5, rate_limit=10):
-        """Initialize the DropboxDownloader with specified concurrency and rate limits."""
-        self.semaphore = asyncio.Semaphore(concurrent_downloads)
-        self.rate_limiter = AsyncLimiter(rate_limit, 1) # rate_limit requests per second
-        self.session = None
-        self.progress_bars = {}
-
-    async def create_session(self):
-        """
-        Create an aiohttp client session.
-
-        This method should be called before starting any downloads.
-        """
-        self.session = aiohttp.ClientSession()
-
-    async def close_session(self):
-        """
-        Close the aiohttp client session.
-
-        This method should be called after all downloads are complete.
-        """
-        if self.session:
-            await self.session.close()
-
-    async def download_file(self, url, dest_folder):
-        """
-        Download a single file with progress tracking and automatic unzipping.
-
-        Parameters
-        ----------
-        url : str
-            URL of the file to download
-        dest_folder : str
-            Destination folder where the file will be saved
-
-        Notes
-        -----
-        - Uses a progress bar to show download progress
-        - Automatically handles zip files and multi-part archives
-        - Rate limited based on the instance settings
-        """
-        async with self.semaphore:
-            await self.rate_limiter.acquire()
-            try:
-                file_name = os.path.basename(urlparse(url).path)
-                file_path = os.path.join(dest_folder, file_name)
-
-                async with self.session.get(url) as response:
-                    if response.status != 200:
-                        print(f"Failed to download {url}: HTTP {response.status}")
-                        return
-
-                    file_size = int(response.headers.get('Content-Length', 0))
-
-                    if file_name not in self.progress_bars:
-                        self.progress_bars[file_name] = tqdm(
-                            total=file_size,
-                            unit='iB',
-                            unit_scale=True,
-                            desc=file_name
-                        )
-
-                    async with aiofiles.open(file_path, 'wb') as f:
-                        chunk_size = 8192
-                        downloaded = 0
-                        async for chunk in response.content.iter_chunked(chunk_size):
-                            await f.write(chunk)
-                            downloaded += len(chunk)
-                            self.progress_bars[file_name].update(len(chunk))
-
-                self.progress_bars[file_name].close()
-                del self.progress_bars[file_name]
-
-                print(f"Downloaded {file_name}")
-
-                if re.match(r'.*\.zip(\.001)?$', file_name):
-                    await self.unzip_file(file_path, dest_folder)
-
-            except Exception as e:
-                print(f"Error downloading {url}: {str(e)}")
-
-    async def unzip_file(self, file_path, dest_folder):
-        """
-        Extract contents of a zip file and clean up archive files.
-
-        Handles both single zip files and multi-part archives.
-
-        Parameters
-        ----------
-        file_path : str
-            Path to the zip file
-        dest_folder : str
-            Destination folder for extracted contents
-
-        Notes
-        -----
-        - Automatically combines multi-part archives
-        - Deletes archive files after successful extraction
-        - Handles both .zip and .zip.001 format files
-        """
-        try:
-            base_name = os.path.splitext(file_path)[0]
-            if file_path.endswith('.001'):
-                base_name = os.path.splitext(base_name)[0]
-
-            combined_zip = f"{base_name}.zip"
-
-            # Combine parts if necessary
-            if not os.path.exists(combined_zip):
-                with open(combined_zip, 'wb') as outfile:
-                    part_num = 1
-                    while True:
-                        part_path = f"{base_name}.zip.{part_num:03d}"
-                        if not os.path.exists(part_path):
-                            break
-                        with open(part_path, 'rb') as infile:
-                            shutil.copyfileobj(infile, outfile)
-                        part_num += 1
-
-            # Unzip the combined file
-            with zipfile.ZipFile(combined_zip, 'r') as zip_ref:
-                zip_ref.extractall(dest_folder)
-            print(f"Unzipped {os.path.basename(base_name)}")
-
-            # Remove archive files
-            if os.path.exists(combined_zip):
-                os.remove(combined_zip)
-
-            part_num = 1
-            while True:
-                part_path = f"{base_name}.zip.{part_num:03d}"
-                if not os.path.exists(part_path):
-                    break
-                os.remove(part_path)
-                part_num += 1
-
-            standalone_zip = f"{base_name}.zip"
-            if os.path.exists(standalone_zip):
-                os.remove(standalone_zip)
-
-            print(f"Cleaned up zip files for {os.path.basename(base_name)}")
-        except Exception as e:
-            print(f"Error unzipping {file_path}: {str(e)}")
-
-    async def _download_urls(self, urls, dest_folder):
-        """
-        Internal method to handle multiple URL downloads.
-
-        Parameters
-        ----------
-        urls : list of str
-            List of URLs to download
-        dest_folder : str
-            Destination folder for downloaded files
-        """
-        os.makedirs(dest_folder, exist_ok=True)
-        await self.create_session()
-        tasks = [self.download_file(url, dest_folder) for url in urls]
-        await asyncio.gather(*tasks)
-        await self.close_session()
-
-    def download(self, urls, output_dir):
-        """
-        Download multiple URLs to a specified directory.
-
-        This is the main method to use for downloading files. It handles the creation
-        and cleanup of the async event loop.
-
-        Parameters
-        ----------
-        urls : list of str
-            List of URLs to download
-        output_dir : str
-            Directory where files will be saved
-
-        Examples
-        --------
-        >>> downloader = DropboxDownloader(concurrent_downloads=3, rate_limit=5)
-        >>> urls = ['http://example.com/file1.zip', 'http://example.com/file2.zip']
-        >>> downloader.download(urls, '/path/to/output')
-        """
-        return asyncio.run(self._download_urls(urls, output_dir))
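
The throttling pattern at the heart of this removed class, an asyncio.Semaphore bounding in-flight downloads plus an aiolimiter.AsyncLimiter capping requests per second, is shown in isolation below as a minimal sketch; the task count and task body are placeholders, not datamule code:

    import asyncio
    from aiolimiter import AsyncLimiter

    semaphore = asyncio.Semaphore(3)   # at most 3 tasks in flight, like concurrent_downloads
    rate_limiter = AsyncLimiter(5, 1)  # at most 5 acquisitions per second, like rate_limit

    async def fetch(i):
        async with semaphore:
            await rate_limiter.acquire()
            await asyncio.sleep(0.1)   # placeholder for the actual download work
            print(f"finished task {i}")

    async def main():
        await asyncio.gather(*(fetch(i) for i in range(10)))

    asyncio.run(main())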
datamule/downloader/ftd.py
DELETED
@@ -1,216 +0,0 @@
-from datetime import datetime, timedelta
-import pkg_resources
-import io
-import re
-import os
-import zipfile
-import csv
-from concurrent.futures import ThreadPoolExecutor
-from tqdm import tqdm
-
-def process_ftd_zip(zip_path):
-    """
-    Process a single FTD (Fails-to-Deliver) ZIP file by converting its contents to CSV format.
-
-    This function extracts the contents of a ZIP file containing tab-delimited data,
-    converts it to CSV format, and removes the original ZIP file.
-
-    Parameters
-    ----------
-    zip_path : str
-        Path to the ZIP file to process
-
-    Notes
-    -----
-    - Assumes each ZIP file contains exactly one data file
-    - Uses '|' as the delimiter for input data
-    - Removes the original ZIP file after processing
-    - Handles UTF-8 encoding with replacement for invalid characters
-
-    Examples
-    --------
-    >>> process_ftd_zip('path/to/cnsfails202301a.zip')
-    """
-    base_name = os.path.splitext(zip_path)[0]
-    csv_path = f"{base_name}.csv"
-
-    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-        file_name = zip_ref.namelist()[0] # Assuming only one file per zip
-        with zip_ref.open(file_name) as file:
-            content = io.TextIOWrapper(file, encoding='utf-8', errors='replace').read()
-
-    # Convert tab-delimited content to CSV
-    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
-        writer = csv.writer(csvfile)
-        for line in content.split('\n'):
-            writer.writerow(line.split('|'))
-
-    # Remove the original zip file
-    os.remove(zip_path)
-
-def process_all_ftd_zips(output_dir):
-    """
-    Process all FTD ZIP files in a directory concurrently.
-
-    Uses ThreadPoolExecutor for parallel processing with progress tracking.
-
-    Parameters
-    ----------
-    output_dir : str
-        Directory containing the ZIP files to process
-
-    Notes
-    -----
-    - Processes all files ending with '.zip' in the directory
-    - Shows progress bar during processing
-    - Uses system default number of worker threads
-
-    Examples
-    --------
-    >>> process_all_ftd_zips('/path/to/ftd/files')
-    """
-    zip_files = [f for f in os.listdir(output_dir) if f.endswith('.zip')]
-
-    # Use ThreadPoolExecutor for parallel processing with tqdm
-    with ThreadPoolExecutor() as executor:
-        list(tqdm(executor.map(lambda f: process_ftd_zip(os.path.join(output_dir, f)), zip_files),
-                  total=len(zip_files),
-                  desc="Processing ZIP files",
-                  unit="file"))
-
-def load_csv_data():
-    """
-    Load existing FTD locations data from the package's data directory.
-
-    Returns
-    -------
-    list of dict
-        List of dictionaries containing the CSV data, where each dictionary
-        represents a row with keys from the CSV header
-
-    Notes
-    -----
-    - Reads from 'ftd_locations.csv' in the package's data directory
-    - Uses UTF-8 encoding
-    - Assumes CSV has a header row
-
-    Examples
-    --------
-    >>> data = load_csv_data()
-    >>> print(data[0]['url']) # Print first URL in data
-    """
-    csv_content = pkg_resources.resource_string('datamule', 'data/ftd_locations.csv')
-    csv_data = []
-    csv_file = io.StringIO(csv_content.decode('utf-8'))
-    csv_reader = csv.DictReader(csv_file)
-    for row in csv_reader:
-        csv_data.append(row)
-    return csv_data
-
-def extract_date_from_url(url):
-    """
-    Extract the date from an FTD file URL.
-
-    Parameters
-    ----------
-    url : str
-        URL of the FTD file
-
-    Returns
-    -------
-    datetime or None
-        Datetime object representing the month and year from the URL,
-        or None if no date is found
-
-    Notes
-    -----
-    - Expects URLs in format containing 'cnsfails{YYYYMM}[a|b].zip'
-    - Returns None if the URL doesn't match the expected pattern
-
-    Examples
-    --------
-    >>> url = "https://www.sec.gov/files/data/fails-deliver-data/cnsfails202301a.zip"
-    >>> date = extract_date_from_url(url)
-    >>> print(date)
-    2023-01-01 00:00:00
-    """
-    match = re.search(r'cnsfails(\d{6})[ab]\.zip', url)
-    if match:
-        date_str = match.group(1)
-        return datetime.strptime(date_str, '%Y%m')
-    return None
-
-def generate_urls(start_date, end_date):
-    """
-    Generate FTD file URLs for a date range.
-
-    Parameters
-    ----------
-    start_date : datetime
-        Start date for URL generation
-    end_date : datetime
-        End date for URL generation
-
-    Returns
-    -------
-    list of str
-        List of URLs for FTD files in the date range
-
-    Notes
-    -----
-    - Generates two URLs per month ('a' and 'b' files)
-    - Uses 15-day intervals for half-month periods
-    - URLs follow SEC's file naming convention
-
-    Examples
-    --------
-    >>> start = datetime(2023, 1, 1)
-    >>> end = datetime(2023, 12, 31)
-    >>> urls = generate_urls(start, end)
-    """
-    urls = []
-    current_date = start_date
-    while current_date <= end_date:
-        for half in ['a', 'b']:
-            url = f"https://www.sec.gov/files/data/fails-deliver-data/cnsfails{current_date.strftime('%Y%m')}{half}.zip"
-            urls.append(url)
-        current_date += timedelta(days=15) # Move to the next half-month
-    return urls
-
-def get_all_ftd_urls():
-    """
-    Get a complete list of FTD URLs, including both existing and new ones.
-
-    Returns
-    -------
-    list of str
-        Combined list of existing and newly generated URLs
-
-    Notes
-    -----
-    - Loads existing URLs from the package's data file
-    - Finds the latest date in existing URLs
-    - Generates new URLs from the month after the latest date up to current date
-    - Combines and returns all URLs
-
-    Examples
-    --------
-    >>> urls = get_all_ftd_urls()
-    >>> print(len(urls)) # Number of URLs
-    >>> print(urls[-1]) # Most recent URL
-    """
-    # Load existing URLs
-    csv_data = load_csv_data()
-    existing_urls = [row['url'] for row in csv_data]
-
-    # Find the last date in the existing URLs
-    last_date = max(extract_date_from_url(url) for url in existing_urls if extract_date_from_url(url))
-
-    # Generate new URLs starting from the month after the last date
-    start_date = (last_date.replace(day=1) + timedelta(days=32)).replace(day=1)
-    end_date = datetime.now()
-
-    new_urls = generate_urls(start_date, end_date)
-
-    # Combine and return all URLs
-    return existing_urls + new_urls
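
These removed helpers compose naturally: get_all_ftd_urls() extends the bundled ftd_locations.csv up to the present, the URLs are then fetched, and process_all_ftd_zips() converts each archive to CSV. The sketch below wires them together with plain urllib purely for illustration; the diff does not show how 0.381 actually fetched these files, and the output directory and User-Agent string are hypothetical:

    import os
    import urllib.request
    from datamule.downloader.ftd import get_all_ftd_urls, process_all_ftd_zips  # 0.381 module path shown above

    output_dir = "ftd_data"  # hypothetical destination folder
    os.makedirs(output_dir, exist_ok=True)

    urls = get_all_ftd_urls()  # URLs from data/ftd_locations.csv plus newly generated months
    for url in urls[:2]:       # only a couple of files for the sketch
        request = urllib.request.Request(url, headers={"User-Agent": "name@example.com"})  # SEC asks for an identifying UA
        with urllib.request.urlopen(request) as response:
            with open(os.path.join(output_dir, os.path.basename(url)), "wb") as f:
                f.write(response.read())

    process_all_ftd_zips(output_dir)  # convert each downloaded ZIP to CSV and remove the ZIP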