datamule 0.381-py3-none-any.whl → 1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. datamule/__init__.py +46 -86
  2. datamule/book.py +16 -0
  3. datamule/config.py +29 -0
  4. datamule/data/company_former_names.csv +8148 -8148
  5. datamule/data/company_metadata.csv +10049 -10049
  6. datamule/data/company_tickers.csv +9999 -10168
  7. datamule/data/sec-glossary.csv +728 -728
  8. datamule/data/xbrl_descriptions.csv +10024 -10024
  9. datamule/document.py +278 -0
  10. datamule/downloader/downloader.py +374 -0
  11. datamule/downloader/premiumdownloader.py +335 -0
  12. datamule/helper.py +123 -136
  13. datamule/mapping_dicts/txt_mapping_dicts.py +232 -0
  14. datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
  15. datamule/monitor.py +238 -0
  16. datamule/mulebot/__init__.py +1 -1
  17. datamule/mulebot/helper.py +34 -34
  18. datamule/mulebot/mulebot.py +129 -129
  19. datamule/mulebot/mulebot_server/server.py +86 -86
  20. datamule/mulebot/mulebot_server/static/css/minimalist.css +173 -173
  21. datamule/mulebot/mulebot_server/static/scripts/artifacts.js +67 -67
  22. datamule/mulebot/mulebot_server/static/scripts/chat.js +91 -91
  23. datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +55 -55
  24. datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +14 -14
  25. datamule/mulebot/mulebot_server/static/scripts/main.js +56 -56
  26. datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +26 -26
  27. datamule/mulebot/mulebot_server/static/scripts/suggestions.js +46 -46
  28. datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +128 -128
  29. datamule/mulebot/mulebot_server/static/scripts/utils.js +27 -27
  30. datamule/mulebot/mulebot_server/templates/chat-minimalist.html +90 -90
  31. datamule/mulebot/search.py +51 -51
  32. datamule/mulebot/tools.py +82 -82
  33. datamule/packageupdater.py +207 -0
  34. datamule/portfolio.py +106 -0
  35. datamule/submission.py +76 -0
  36. datamule-1.0.0.dist-info/METADATA +27 -0
  37. datamule-1.0.0.dist-info/RECORD +40 -0
  38. {datamule-0.381.dist-info → datamule-1.0.0.dist-info}/WHEEL +1 -1
  39. datamule/data/filing_types.csv +0 -485
  40. datamule/data/ftd_locations.csv +0 -388
  41. datamule/datamule_api.py +0 -21
  42. datamule/dataset_builder/_init.py +0 -1
  43. datamule/dataset_builder/dataset_builder.py +0 -260
  44. datamule/downloader/__init__.py +0 -0
  45. datamule/downloader/dropbox_downloader.py +0 -225
  46. datamule/downloader/ftd.py +0 -216
  47. datamule/downloader/information_table_13f.py +0 -231
  48. datamule/downloader/sec_downloader.py +0 -635
  49. datamule/filing_viewer/__init__.py +0 -1
  50. datamule/filing_viewer/filing_viewer.py +0 -256
  51. datamule/global_vars.py +0 -202
  52. datamule/parser/__init__.py +0 -1
  53. datamule/parser/basic_10k_parser.py +0 -82
  54. datamule/parser/basic_10q_parser.py +0 -73
  55. datamule/parser/basic_13d_parser.py +0 -58
  56. datamule/parser/basic_13g_parser.py +0 -61
  57. datamule/parser/basic_8k_parser.py +0 -84
  58. datamule/parser/company_concepts_parser.py +0 -0
  59. datamule/parser/form_d_parser.py +0 -70
  60. datamule/parser/generalized_item_parser.py +0 -78
  61. datamule/parser/generalized_xml_parser.py +0 -0
  62. datamule/parser/helper.py +0 -75
  63. datamule/parser/information_table_parser_13fhr.py +0 -41
  64. datamule/parser/insider_trading_parser.py +0 -158
  65. datamule/parser/mappings.py +0 -95
  66. datamule/parser/n_port_p_parser.py +0 -70
  67. datamule/parser/sec_parser.py +0 -79
  68. datamule/parser/sgml_parser.py +0 -180
  69. datamule/sec_filing.py +0 -126
  70. datamule/sec_search.py +0 -20
  71. datamule-0.381.dist-info/METADATA +0 -132
  72. datamule-0.381.dist-info/RECORD +0 -61
  73. {datamule-0.381.dist-info → datamule-1.0.0.dist-info}/top_level.txt +0 -0
datamule/downloader/information_table_13f.py
@@ -1,231 +0,0 @@
- from datetime import datetime, timedelta
- import os
- import zipfile
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from tqdm import tqdm
- import traceback
- import polars as pl
- import shutil
- import glob
- import re
- from aiolimiter import AsyncLimiter
-
- from ..sec_filing import Filing
-
- def generate_quarterly_urls(start_date, end_date):
-     urls = []
-     current_date = start_date
-     while current_date <= end_date and current_date < datetime(2024, 1, 1):
-         url = f"https://www.sec.gov/files/structureddata/data/form-13f-data-sets/{current_date.year}q{(current_date.month-1)//3+1}_form13f.zip"
-         urls.append(url)
-         current_date = (current_date.replace(day=1) + timedelta(days=92)).replace(day=1)
-     return urls
-
- def generate_new_format_urls(start_date, end_date):
-     urls = []
-     current_date = max(start_date, datetime(2024, 1, 1))
-
-     while current_date <= end_date:
-         if current_date.month in [1, 3, 6, 9]:
-             if current_date.month == 1:
-                 next_date = datetime(current_date.year, 2, 28 if current_date.year % 4 else 29)
-             elif current_date.month == 3:
-                 next_date = datetime(current_date.year, 5, 31)
-             elif current_date.month == 6:
-                 next_date = datetime(current_date.year, 8, 31)
-             else:  # September
-                 next_date = datetime(current_date.year, 11, 30)
-
-             if next_date > end_date:
-                 next_date = end_date
-
-             url = f"https://www.sec.gov/files/structureddata/data/form-13f-data-sets/{current_date.strftime('%d%b%Y').lower()}-{next_date.strftime('%d%b%Y').lower()}_form13f.zip"
-             urls.append(url)
-             current_date = next_date + timedelta(days=1)
-         else:
-             current_date = (current_date.replace(day=1) + timedelta(days=31)).replace(day=1)
-
-     return urls
-
- def get_all_13f_urls():
-     start_date = datetime(2013, 4, 1)  # 2013 Q2
-     end_date = datetime.now()
-     quarterly_urls = generate_quarterly_urls(start_date, min(end_date, datetime(2023, 12, 31)))
-     new_format_urls = generate_new_format_urls(max(start_date, datetime(2024, 1, 1)), end_date)
-     return quarterly_urls + new_format_urls
-
- def unzip_file(zip_path, extract_dir):
-     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-         zip_ref.extractall(extract_dir)
-         return next((name for name in zip_ref.namelist() if name.endswith('INFOTABLE.tsv')), None)
-
- def convert_tsv_to_csv(tsv_path, csv_path):
-     dtypes = {
-         'OTHERMANAGER': pl.Utf8  # Treat OTHERMANAGER as string
-     }
-     df = pl.read_csv(tsv_path, separator='\t', truncate_ragged_lines=True, dtypes=dtypes)
-     df.write_csv(csv_path)
-
- def process_13f_zip(zip_path, output_dir):
-     try:
-         base_name = os.path.splitext(os.path.basename(zip_path))[0]
-         extract_dir = os.path.join(output_dir, base_name)
-         csv_path = os.path.join(output_dir, f"{base_name}_INFOTABLE.csv")
-
-         # Unzip file
-         infotable_file = unzip_file(zip_path, extract_dir)
-
-         if not infotable_file:
-             return f"INFOTABLE.tsv not found in {zip_path}"
-
-         # Convert TSV to CSV
-         tsv_path = os.path.join(extract_dir, infotable_file)
-         convert_tsv_to_csv(tsv_path, csv_path)
-
-         # Clean up: remove extracted folder and zip file
-         shutil.rmtree(extract_dir)
-         os.remove(zip_path)
-
-         return f"Successfully processed and cleaned up {zip_path}"
-
-     except Exception as e:
-         return f"Error processing {zip_path}: {str(e)}\n{traceback.format_exc()}"
-
- def process_all_13f_zips(output_dir):
-     zip_files = [os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.endswith('.zip')]
-
-     # Process files
-     print("Processing files...")
-     with ThreadPoolExecutor() as executor:
-         futures = [executor.submit(process_13f_zip, zip_file, output_dir) for zip_file in zip_files]
-         for future in tqdm(as_completed(futures), total=len(zip_files), desc="Processing", unit="file"):
-             try:
-                 result = future.result()
-             except Exception as e:
-                 print(f"Error processing: {str(e)}")
-
-     # Count remaining CSV files
-     csv_files = [f for f in os.listdir(output_dir) if f.endswith('.csv')]
-     print(f"Processed {len(csv_files)} files. CSV files are stored in {output_dir}")
-
- def get_13f_data_cutoff_date():
-     current_date = datetime.now()
-
-     # Define the end months for each period
-     period_end_months = [2, 5, 8, 11]
-
-     # Find the most recent period end date
-     year = current_date.year
-     month = current_date.month
-
-     # Find the most recent period end month
-     recent_end_month = max([m for m in period_end_months if m <= month] or [period_end_months[-1]])
-     if recent_end_month > month:
-         year -= 1
-
-     # Calculate the end date of the most recent period
-     if recent_end_month == 2:
-         recent_end_date = datetime(year, 2, 28 if year % 4 else 29)
-     else:
-         recent_end_date = datetime(year, recent_end_month, {5: 31, 8: 31, 11: 30}[recent_end_month])
-
-     # Add 7 days buffer
-     buffer_date = recent_end_date + timedelta(days=7)
-
-     # If current date is within the buffer period, go back to the previous period
-     if current_date <= buffer_date:
-         prev_end_month = period_end_months[(period_end_months.index(recent_end_month) - 1) % 4]
-         if prev_end_month > recent_end_month:
-             year -= 1
-         if prev_end_month == 2:
-             return datetime(year, 2, 28 if year % 4 else 29)
-         else:
-             return datetime(year, prev_end_month, {5: 31, 8: 31, 11: 30}[prev_end_month])
-     else:
-         return recent_end_date
-
-
- def process_xml_files(output_dir):
-     xml_files = glob.glob(os.path.join(output_dir, '*.xml'))
-
-     for xml_file in tqdm(xml_files, desc="Processing XML files"):
-         filing = Filing(xml_file, filing_type='13F-HR-INFORMATIONTABLE')
-         filing.parse_filing()
-
-         filename = os.path.basename(xml_file)
-         match = re.match(r'(\d+)_', filename)
-         if match:
-             accession_number = match.group(1)
-         else:
-             raise ValueError(f"Could not extract accession number from {filename}")
-
-         filing.write_csv(accession_number=accession_number)
-         os.remove(xml_file)
-
- def combine_csv_files(output_dir, cutoff_date):
-     csv_files = glob.glob(os.path.join(output_dir, '*.csv'))
-     combined_df = pl.DataFrame()
-
-     for csv_file in tqdm(csv_files, desc="Combining CSV files"):
-         # Infer schema from the file
-         inferred_schema = pl.read_csv(csv_file, infer_schema_length=1000).schema
-
-         # Read the CSV file with the inferred schema
-         df = pl.read_csv(csv_file, dtypes=inferred_schema)
-
-         # Cast all columns to strings
-         df = df.select([pl.col(col).cast(pl.Utf8) for col in df.columns])
-
-         if combined_df.is_empty():
-             combined_df = df
-         else:
-             combined_df = pl.concat([combined_df, df], how="diagonal")
-
-         os.remove(csv_file)
-
-     current_date = datetime.now().strftime('%Y-%m-%d')
-     combined_csv_name = f"13F_HR_{cutoff_date.strftime('%Y-%m-%d')}_{current_date}.csv"
-     combined_csv_path = os.path.join(output_dir, combined_csv_name)
-     combined_df.write_csv(combined_csv_path)
-     print(f"Combined CSV file saved as: {combined_csv_path}")
-
- def download_and_process_13f_data(downloader, output_dir='13f_information_table'):
-     os.makedirs(output_dir, exist_ok=True)
-
-     # Store original rate limiters
-     original_sec_limiter = downloader.domain_limiters['www.sec.gov']
-     original_efts_limiter = downloader.domain_limiters['efts.sec.gov']
-
-     try:
-         # Process Current 13F-HR filings
-         current_date = datetime.now().strftime('%Y-%m-%d')
-         cutoff_date = get_13f_data_cutoff_date()
-
-         # Set rate limiters for downloading filings
-         downloader.domain_limiters['www.sec.gov'] = AsyncLimiter(5, 1)
-         downloader.domain_limiters['efts.sec.gov'] = AsyncLimiter(5, 1)
-
-         downloader.download(output_dir=output_dir, form='13F-HR', date=(cutoff_date.strftime('%Y-%m-%d'), current_date), file_types=['INFORMATION TABLE'])
-
-         # Process XML files
-         process_xml_files(output_dir)
-
-         # Combine CSV files
-         combine_csv_files(output_dir, cutoff_date)
-
-         # Download bulk data
-         urls = get_all_13f_urls()
-
-         # Set rate limiters for bulk download
-         downloader.domain_limiters['www.sec.gov'] = AsyncLimiter(1, 1)
-         downloader.domain_limiters['efts.sec.gov'] = AsyncLimiter(1, 1)
-
-         downloader.run_download_urls(urls, filenames=[url.split('/')[-1] for url in urls], output_dir=output_dir)
-         process_all_13f_zips(output_dir)
-
-
-
-     finally:
-         # Restore original rate limiters
-         downloader.domain_limiters['www.sec.gov'] = original_sec_limiter
-         downloader.domain_limiters['efts.sec.gov'] = original_efts_limiter
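
For orientation, the entry point of this removed module is download_and_process_13f_data, which expects a downloader object exposing domain_limiters keyed by SEC hostnames plus download and run_download_urls methods. A minimal sketch of how it could have been invoked on 0.381 follows; the Downloader import path and its no-argument constructor are assumptions based on the also-removed datamule/downloader/sec_downloader.py, not something this diff confirms.

    # Sketch against datamule 0.381 only (both modules are deleted in 1.0.0).
    # Assumption: Downloader is the SEC downloader defined in
    # datamule/downloader/sec_downloader.py and provides the
    # domain_limiters / download / run_download_urls API used above.
    from datamule.downloader.sec_downloader import Downloader
    from datamule.downloader.information_table_13f import download_and_process_13f_data

    downloader = Downloader()

    # Fetches recent 13F-HR INFORMATION TABLE filings plus the SEC bulk zips,
    # converts everything to CSV, and writes the output to ./13f_information_table.
    download_and_process_13f_data(downloader, output_dir='13f_information_table')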