datamule-0.381-py3-none-any.whl → datamule-1.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/__init__.py +46 -86
- datamule/book/book.py +34 -0
- datamule/book/eftsquery.py +127 -0
- datamule/book/xbrl_retriever.py +88 -0
- datamule/config.py +29 -0
- datamule/data/company_former_names.csv +8148 -8148
- datamule/data/company_metadata.csv +10049 -10049
- datamule/data/company_tickers.csv +9999 -10168
- datamule/data/sec-glossary.csv +728 -728
- datamule/data/xbrl_descriptions.csv +10024 -10024
- datamule/document.py +279 -0
- datamule/downloader/downloader.py +374 -0
- datamule/downloader/premiumdownloader.py +335 -0
- datamule/helper.py +123 -136
- datamule/mapping_dicts/txt_mapping_dicts.py +232 -0
- datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
- datamule/monitor.py +238 -0
- datamule/mulebot/__init__.py +1 -1
- datamule/mulebot/helper.py +34 -34
- datamule/mulebot/mulebot.py +129 -129
- datamule/mulebot/mulebot_server/server.py +86 -86
- datamule/mulebot/mulebot_server/static/css/minimalist.css +173 -173
- datamule/mulebot/mulebot_server/static/scripts/artifacts.js +67 -67
- datamule/mulebot/mulebot_server/static/scripts/chat.js +91 -91
- datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +55 -55
- datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +14 -14
- datamule/mulebot/mulebot_server/static/scripts/main.js +56 -56
- datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +26 -26
- datamule/mulebot/mulebot_server/static/scripts/suggestions.js +46 -46
- datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +128 -128
- datamule/mulebot/mulebot_server/static/scripts/utils.js +27 -27
- datamule/mulebot/mulebot_server/templates/chat-minimalist.html +90 -90
- datamule/mulebot/search.py +51 -51
- datamule/mulebot/tools.py +82 -82
- datamule/packageupdater.py +207 -0
- datamule/portfolio.py +106 -0
- datamule/submission.py +76 -0
- datamule-1.0.2.dist-info/METADATA +27 -0
- datamule-1.0.2.dist-info/RECORD +43 -0
- {datamule-0.381.dist-info → datamule-1.0.2.dist-info}/WHEEL +1 -1
- datamule/data/filing_types.csv +0 -485
- datamule/data/ftd_locations.csv +0 -388
- datamule/datamule_api.py +0 -21
- datamule/dataset_builder/_init.py +0 -1
- datamule/dataset_builder/dataset_builder.py +0 -260
- datamule/downloader/dropbox_downloader.py +0 -225
- datamule/downloader/ftd.py +0 -216
- datamule/downloader/information_table_13f.py +0 -231
- datamule/downloader/sec_downloader.py +0 -635
- datamule/filing_viewer/__init__.py +0 -1
- datamule/filing_viewer/filing_viewer.py +0 -256
- datamule/global_vars.py +0 -202
- datamule/parser/__init__.py +0 -1
- datamule/parser/basic_10k_parser.py +0 -82
- datamule/parser/basic_10q_parser.py +0 -73
- datamule/parser/basic_13d_parser.py +0 -58
- datamule/parser/basic_13g_parser.py +0 -61
- datamule/parser/basic_8k_parser.py +0 -84
- datamule/parser/company_concepts_parser.py +0 -0
- datamule/parser/form_d_parser.py +0 -70
- datamule/parser/generalized_item_parser.py +0 -78
- datamule/parser/generalized_xml_parser.py +0 -0
- datamule/parser/helper.py +0 -75
- datamule/parser/information_table_parser_13fhr.py +0 -41
- datamule/parser/insider_trading_parser.py +0 -158
- datamule/parser/mappings.py +0 -95
- datamule/parser/n_port_p_parser.py +0 -70
- datamule/parser/sec_parser.py +0 -79
- datamule/parser/sgml_parser.py +0 -180
- datamule/sec_filing.py +0 -126
- datamule/sec_search.py +0 -20
- datamule-0.381.dist-info/METADATA +0 -132
- datamule-0.381.dist-info/RECORD +0 -61
- /datamule/{downloader → book}/__init__.py +0 -0
- {datamule-0.381.dist-info → datamule-1.0.2.dist-info}/top_level.txt +0 -0
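Note that the 0.381 modules (datamule/parser/*, downloader/sec_downloader.py, filing_viewer/, dataset_builder/) are removed outright rather than renamed, so imports of those paths will break on upgrade; the new surface lives in portfolio.py, submission.py, document.py, and the downloader/ package. A minimal, hedged way to check what the installed 1.0.2 wheel actually exports — it assumes only that the wheel installs, not any specific 1.0.2 class or function:

```python
# Inspect the reorganized package rather than assuming any API carried over
# from 0.381. Requires `pip install datamule==1.0.2`.
import importlib.metadata
import datamule

print(importlib.metadata.version("datamule"))
print(sorted(name for name in dir(datamule) if not name.startswith("_")))
```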
datamule/downloader/information_table_13f.py (deleted)
@@ -1,231 +0,0 @@
-from datetime import datetime, timedelta
-import os
-import zipfile
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from tqdm import tqdm
-import traceback
-import polars as pl
-import shutil
-import glob
-import re
-from aiolimiter import AsyncLimiter
-
-from ..sec_filing import Filing
-
-def generate_quarterly_urls(start_date, end_date):
-    urls = []
-    current_date = start_date
-    while current_date <= end_date and current_date < datetime(2024, 1, 1):
-        url = f"https://www.sec.gov/files/structureddata/data/form-13f-data-sets/{current_date.year}q{(current_date.month-1)//3+1}_form13f.zip"
-        urls.append(url)
-        current_date = (current_date.replace(day=1) + timedelta(days=92)).replace(day=1)
-    return urls
-
-def generate_new_format_urls(start_date, end_date):
-    urls = []
-    current_date = max(start_date, datetime(2024, 1, 1))
-
-    while current_date <= end_date:
-        if current_date.month in [1, 3, 6, 9]:
-            if current_date.month == 1:
-                next_date = datetime(current_date.year, 2, 28 if current_date.year % 4 else 29)
-            elif current_date.month == 3:
-                next_date = datetime(current_date.year, 5, 31)
-            elif current_date.month == 6:
-                next_date = datetime(current_date.year, 8, 31)
-            else: # September
-                next_date = datetime(current_date.year, 11, 30)
-
-            if next_date > end_date:
-                next_date = end_date
-
-            url = f"https://www.sec.gov/files/structureddata/data/form-13f-data-sets/{current_date.strftime('%d%b%Y').lower()}-{next_date.strftime('%d%b%Y').lower()}_form13f.zip"
-            urls.append(url)
-            current_date = next_date + timedelta(days=1)
-        else:
-            current_date = (current_date.replace(day=1) + timedelta(days=31)).replace(day=1)
-
-    return urls
-
-def get_all_13f_urls():
-    start_date = datetime(2013, 4, 1) # 2013 Q2
-    end_date = datetime.now()
-    quarterly_urls = generate_quarterly_urls(start_date, min(end_date, datetime(2023, 12, 31)))
-    new_format_urls = generate_new_format_urls(max(start_date, datetime(2024, 1, 1)), end_date)
-    return quarterly_urls + new_format_urls
-
-def unzip_file(zip_path, extract_dir):
-    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-        zip_ref.extractall(extract_dir)
-        return next((name for name in zip_ref.namelist() if name.endswith('INFOTABLE.tsv')), None)
-
-def convert_tsv_to_csv(tsv_path, csv_path):
-    dtypes = {
-        'OTHERMANAGER': pl.Utf8 # Treat OTHERMANAGER as string
-    }
-    df = pl.read_csv(tsv_path, separator='\t', truncate_ragged_lines=True, dtypes=dtypes)
-    df.write_csv(csv_path)
-
-def process_13f_zip(zip_path, output_dir):
-    try:
-        base_name = os.path.splitext(os.path.basename(zip_path))[0]
-        extract_dir = os.path.join(output_dir, base_name)
-        csv_path = os.path.join(output_dir, f"{base_name}_INFOTABLE.csv")
-
-        # Unzip file
-        infotable_file = unzip_file(zip_path, extract_dir)
-
-        if not infotable_file:
-            return f"INFOTABLE.tsv not found in {zip_path}"
-
-        # Convert TSV to CSV
-        tsv_path = os.path.join(extract_dir, infotable_file)
-        convert_tsv_to_csv(tsv_path, csv_path)
-
-        # Clean up: remove extracted folder and zip file
-        shutil.rmtree(extract_dir)
-        os.remove(zip_path)
-
-        return f"Successfully processed and cleaned up {zip_path}"
-
-    except Exception as e:
-        return f"Error processing {zip_path}: {str(e)}\n{traceback.format_exc()}"
-
-def process_all_13f_zips(output_dir):
-    zip_files = [os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.endswith('.zip')]
-
-    # Process files
-    print("Processing files...")
-    with ThreadPoolExecutor() as executor:
-        futures = [executor.submit(process_13f_zip, zip_file, output_dir) for zip_file in zip_files]
-        for future in tqdm(as_completed(futures), total=len(zip_files), desc="Processing", unit="file"):
-            try:
-                result = future.result()
-            except Exception as e:
-                print(f"Error processing: {str(e)}")
-
-    # Count remaining CSV files
-    csv_files = [f for f in os.listdir(output_dir) if f.endswith('.csv')]
-    print(f"Processed {len(csv_files)} files. CSV files are stored in {output_dir}")
-
-def get_13f_data_cutoff_date():
-    current_date = datetime.now()
-
-    # Define the end months for each period
-    period_end_months = [2, 5, 8, 11]
-
-    # Find the most recent period end date
-    year = current_date.year
-    month = current_date.month
-
-    # Find the most recent period end month
-    recent_end_month = max([m for m in period_end_months if m <= month] or [period_end_months[-1]])
-    if recent_end_month > month:
-        year -= 1
-
-    # Calculate the end date of the most recent period
-    if recent_end_month == 2:
-        recent_end_date = datetime(year, 2, 28 if year % 4 else 29)
-    else:
-        recent_end_date = datetime(year, recent_end_month, {5: 31, 8: 31, 11: 30}[recent_end_month])
-
-    # Add 7 days buffer
-    buffer_date = recent_end_date + timedelta(days=7)
-
-    # If current date is within the buffer period, go back to the previous period
-    if current_date <= buffer_date:
-        prev_end_month = period_end_months[(period_end_months.index(recent_end_month) - 1) % 4]
-        if prev_end_month > recent_end_month:
-            year -= 1
-        if prev_end_month == 2:
-            return datetime(year, 2, 28 if year % 4 else 29)
-        else:
-            return datetime(year, prev_end_month, {5: 31, 8: 31, 11: 30}[prev_end_month])
-    else:
-        return recent_end_date
-
-
-def process_xml_files(output_dir):
-    xml_files = glob.glob(os.path.join(output_dir, '*.xml'))
-
-    for xml_file in tqdm(xml_files, desc="Processing XML files"):
-        filing = Filing(xml_file, filing_type='13F-HR-INFORMATIONTABLE')
-        filing.parse_filing()
-
-        filename = os.path.basename(xml_file)
-        match = re.match(r'(\d+)_', filename)
-        if match:
-            accession_number = match.group(1)
-        else:
-            raise ValueError(f"Could not extract accession number from {filename}")
-
-        filing.write_csv(accession_number=accession_number)
-        os.remove(xml_file)
-
-def combine_csv_files(output_dir, cutoff_date):
-    csv_files = glob.glob(os.path.join(output_dir, '*.csv'))
-    combined_df = pl.DataFrame()
-
-    for csv_file in tqdm(csv_files, desc="Combining CSV files"):
-        # Infer schema from the file
-        inferred_schema = pl.read_csv(csv_file, infer_schema_length=1000).schema
-
-        # Read the CSV file with the inferred schema
-        df = pl.read_csv(csv_file, dtypes=inferred_schema)
-
-        # Cast all columns to strings
-        df = df.select([pl.col(col).cast(pl.Utf8) for col in df.columns])
-
-        if combined_df.is_empty():
-            combined_df = df
-        else:
-            combined_df = pl.concat([combined_df, df], how="diagonal")
-
-        os.remove(csv_file)
-
-    current_date = datetime.now().strftime('%Y-%m-%d')
-    combined_csv_name = f"13F_HR_{cutoff_date.strftime('%Y-%m-%d')}_{current_date}.csv"
-    combined_csv_path = os.path.join(output_dir, combined_csv_name)
-    combined_df.write_csv(combined_csv_path)
-    print(f"Combined CSV file saved as: {combined_csv_path}")
-
-def download_and_process_13f_data(downloader, output_dir='13f_information_table'):
-    os.makedirs(output_dir, exist_ok=True)
-
-    # Store original rate limiters
-    original_sec_limiter = downloader.domain_limiters['www.sec.gov']
-    original_efts_limiter = downloader.domain_limiters['efts.sec.gov']
-
-    try:
-        # Process Current 13F-HR filings
-        current_date = datetime.now().strftime('%Y-%m-%d')
-        cutoff_date = get_13f_data_cutoff_date()
-
-        # Set rate limiters for downloading filings
-        downloader.domain_limiters['www.sec.gov'] = AsyncLimiter(5, 1)
-        downloader.domain_limiters['efts.sec.gov'] = AsyncLimiter(5, 1)
-
-        downloader.download(output_dir=output_dir, form='13F-HR', date=(cutoff_date.strftime('%Y-%m-%d'), current_date), file_types=['INFORMATION TABLE'])
-
-        # Process XML files
-        process_xml_files(output_dir)
-
-        # Combine CSV files
-        combine_csv_files(output_dir, cutoff_date)
-
-        # Download bulk data
-        urls = get_all_13f_urls()
-
-        # Set rate limiters for bulk download
-        downloader.domain_limiters['www.sec.gov'] = AsyncLimiter(1, 1)
-        downloader.domain_limiters['efts.sec.gov'] = AsyncLimiter(1, 1)
-
-        downloader.run_download_urls(urls, filenames=[url.split('/')[-1] for url in urls], output_dir=output_dir)
-        process_all_13f_zips(output_dir)
-
-
-
-    finally:
-        # Restore original rate limiters
-        downloader.domain_limiters['www.sec.gov'] = original_sec_limiter
-        downloader.domain_limiters['efts.sec.gov'] = original_efts_limiter