datamule-0.422-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. datamule/__init__.py +71 -0
  2. datamule/data/company_former_names.csv +8148 -0
  3. datamule/data/company_metadata.csv +10049 -0
  4. datamule/data/company_tickers.csv +9999 -0
  5. datamule/data/sec-glossary.csv +728 -0
  6. datamule/data/xbrl_descriptions.csv +10024 -0
  7. datamule/dataset_builder/dataset_builder.py +259 -0
  8. datamule/document.py +130 -0
  9. datamule/downloader/downloader.py +364 -0
  10. datamule/downloader/premiumdownloader.py +332 -0
  11. datamule/helper.py +123 -0
  12. datamule/monitor.py +236 -0
  13. datamule/mulebot/__init__.py +1 -0
  14. datamule/mulebot/helper.py +35 -0
  15. datamule/mulebot/mulebot.py +130 -0
  16. datamule/mulebot/mulebot_server/__init__.py +1 -0
  17. datamule/mulebot/mulebot_server/server.py +87 -0
  18. datamule/mulebot/mulebot_server/static/css/minimalist.css +174 -0
  19. datamule/mulebot/mulebot_server/static/scripts/artifacts.js +68 -0
  20. datamule/mulebot/mulebot_server/static/scripts/chat.js +92 -0
  21. datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +56 -0
  22. datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +15 -0
  23. datamule/mulebot/mulebot_server/static/scripts/main.js +57 -0
  24. datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +27 -0
  25. datamule/mulebot/mulebot_server/static/scripts/suggestions.js +47 -0
  26. datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +129 -0
  27. datamule/mulebot/mulebot_server/static/scripts/utils.js +28 -0
  28. datamule/mulebot/mulebot_server/templates/chat-minimalist.html +91 -0
  29. datamule/mulebot/search.py +52 -0
  30. datamule/mulebot/tools.py +82 -0
  31. datamule/packageupdater.py +207 -0
  32. datamule/parser/document_parsing/basic_10k_parser.py +82 -0
  33. datamule/parser/document_parsing/basic_10q_parser.py +73 -0
  34. datamule/parser/document_parsing/basic_13d_parser.py +58 -0
  35. datamule/parser/document_parsing/basic_13g_parser.py +61 -0
  36. datamule/parser/document_parsing/basic_8k_parser.py +84 -0
  37. datamule/parser/document_parsing/form_d_parser.py +70 -0
  38. datamule/parser/document_parsing/generalized_item_parser.py +78 -0
  39. datamule/parser/document_parsing/generalized_xml_parser.py +0 -0
  40. datamule/parser/document_parsing/helper.py +75 -0
  41. datamule/parser/document_parsing/information_table_parser_13fhr.py +41 -0
  42. datamule/parser/document_parsing/insider_trading_parser.py +158 -0
  43. datamule/parser/document_parsing/mappings.py +95 -0
  44. datamule/parser/document_parsing/n_port_p_parser.py +70 -0
  45. datamule/parser/document_parsing/sec_parser.py +73 -0
  46. datamule/parser/document_parsing/sgml_parser.py +94 -0
  47. datamule/parser/sgml_parsing/sgml_parser_cy.c +19082 -0
  48. datamule/parser/sgml_parsing/sgml_parser_cy.cpython-312-x86_64-linux-gnu.so +0 -0
  49. datamule/portfolio.py +21 -0
  50. datamule/submission.py +67 -0
  51. datamule-0.422.dist-info/METADATA +31 -0
  52. datamule-0.422.dist-info/RECORD +54 -0
  53. datamule-0.422.dist-info/WHEEL +6 -0
  54. datamule-0.422.dist-info/top_level.txt +1 -0
datamule/dataset_builder/dataset_builder.py ADDED
@@ -0,0 +1,259 @@
+ import pandas as pd
+ import json
+ import os
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from tqdm import tqdm
+ import google.generativeai as genai
+ import time
+ import psutil
+ from threading import Lock
+
+ class RateLimiter:
+     def __init__(self, max_rpm):
+         self.min_delay = 62.0 / max_rpm  # 62s per "minute" rather than 60 to leave some buffer (WIP)
+         self.last_request = time.time()
+         self.lock = Lock()
+         self.request_count = 0
+
+     def acquire(self):
+         with self.lock:
+             now = time.time()
+             time_since_last = now - self.last_request
+             delay_needed = self.min_delay - time_since_last
+             self.last_request = now + max(0, delay_needed)  # Update based on expected completion
+             self.request_count += 1
+             count = self.request_count
+
+         # Sleep outside the lock
+         if delay_needed > 0:
+             time.sleep(delay_needed)
+
+         return count
+
+ class DatasetBuilder:
+     def __init__(self):
+         self.base_prompt = None
+         self.response_schema = None
+         self.input_path = None
+         self.output_path = None
+         self.failed_path = None
+         self.max_rpm = 1450
+         self.max_workers = 30
+         self.save_frequency = 100
+         self.output_columns = None
+         self.buffer = []
+         self.buffer_lock = Lock()
+         self.failed_ids = set()
+         self.failed_lock = Lock()
+         self.model_name = "gemini-1.5-flash-8b"  # Default model
+         self.model_config = {}  # Additional model configuration
+         self.api_key = None
+
+     def set_api_key(self, api_key):
+         """Set the API key for Google's Generative AI."""
+         self.api_key = api_key
+         genai.configure(api_key=api_key)
+         return self
+
+     def set_paths(self, input_path, output_path, failed_path):
+         """Set input and output file paths."""
+         self.input_path = input_path
+         self.output_path = output_path
+         self.failed_path = failed_path
+         return self
+
+     def set_base_prompt(self, prompt):
+         """Set the base prompt for LLM processing."""
+         self.base_prompt = prompt
+         return self
+
+     def set_response_schema(self, schema):
+         """Set the response schema and derive output columns."""
+         self.response_schema = schema
+         # Derive output columns from schema
+         if schema and 'items' in schema and 'properties' in schema['items']:
+             properties = schema['items']['properties']
+             self.output_columns = ['accession_number'] + list(properties.keys())
+         return self
+
+     def set_rpm(self, max_rpm=1450):
+         """Set the maximum requests per minute."""
+         self.max_rpm = max_rpm
+         return self
+
+     def set_max_workers(self, max_workers=30):
+         """Set the maximum number of concurrent workers."""
+         self.max_workers = max_workers
+         return self
+
+     def set_save_frequency(self, frequency=100):
+         """Set how often to save progress."""
+         self.save_frequency = frequency
+         return self
+
+     def set_model(self, model_name="gemini-1.5-flash-8b", **model_config):
+         """Set the model name and configuration."""
+         self.model_name = model_name
+         self.model_config = model_config
+         return self
+
+     def validate_config(self):
+         """Validate that all required configurations are set."""
+         if not all([self.base_prompt, self.response_schema, self.input_path,
+                     self.output_path, self.failed_path, self.api_key]):
+             raise ValueError("""Missing required configuration. Please ensure you have set:
+                 - API key
+                 - Paths (input_path, output_path, failed_path)
+                 - Base prompt
+                 - Response schema""")
+
+     def get_processed_ids(self):
+         """Get set of processed accession numbers from output file."""
+         if not os.path.exists(self.output_path):
+             return set()
+
+         try:
+             # Read only the accession_number column for memory efficiency
+             df = pd.read_csv(self.output_path, usecols=['accession_number'])
+             return set(df['accession_number'])
+         except Exception as e:
+             print(f"Warning: Error reading processed IDs: {e}")
+             return set()
+
+     def save_data(self, df_new):
+         """Append new data to existing CSV."""
+         df_new.to_csv(self.output_path, mode='a', header=not os.path.exists(self.output_path), index=False)
+
+     def save_failed_ids(self):
+         """Save failed accession numbers to file."""
+         with open(self.failed_path, 'w') as f:
+             for acc in self.failed_ids:
+                 f.write(f"{acc}\n")
+
+     def process_text(self, args):
+         """Process a single text entry through the model."""
+         model, text, accession_number, rate_limiter = args
+
+         current_requests = rate_limiter.acquire()
+
+         full_prompt = self.base_prompt + "\n\nINFORMATION:\n" + text
+
+         try:
+             generation_config = genai.GenerationConfig(
+                 response_mime_type="application/json",
+                 response_schema=self.response_schema,
+                 **self.model_config
+             )
+
+             response = model.generate_content(
+                 full_prompt,
+                 generation_config=generation_config
+             )
+             results = json.loads(response.text)
+
+             for result in results:
+                 result['accession_number'] = accession_number
+
+             with self.buffer_lock:
+                 self.buffer.extend(results)
+
+             return True, current_requests
+         except Exception as e:
+             with self.failed_lock:
+                 self.failed_ids.add(accession_number)
+             return False, f"Error processing {accession_number}: {str(e)}"
+
+     def build(self):
+         """Main processing method to build the dataset."""
+         self.validate_config()
+
+         # Initialize model and rate limiter
+         model = genai.GenerativeModel(self.model_name)
+         rate_limiter = RateLimiter(self.max_rpm)
+
+         # Load data
+         print("Loading data...")
+         df_input = pd.read_csv(self.input_path)
+         processed_ids = self.get_processed_ids()
+         df_to_process = df_input[~df_input['accession_number'].isin(processed_ids)]
+
+         total_in_dataset = len(df_input)
+         already_processed = len(processed_ids)
+         to_process = len(df_to_process)
+
+         print(f"Total entries in dataset: {total_in_dataset}")
+         print(f"Already processed: {already_processed}")
+         print(f"New entries to process: {to_process}")
+
+         if len(df_to_process) == 0:
+             print("All entries already processed!")
+             return
+
+         work_items = [
+             (model, row['text'], row['accession_number'], rate_limiter)
+             for _, row in df_to_process.iterrows()
+         ]
+
+         start_time = time.time()
+         last_save_time = time.time()
+         processed_count = 0
+
+         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+             futures = {executor.submit(self.process_text, item): item for item in work_items}
+
+             with tqdm(total=total_in_dataset, initial=already_processed, desc="Processing entries") as pbar:
+                 for future in as_completed(futures):
+                     success, result = future.result()
+
+                     if not success:
+                         print(f"\n{result}")
+
+                     processed_count += 1
+                     pbar.update(1)
+
+                     elapsed = time.time() - start_time
+                     rpm = processed_count / (elapsed / 60)
+                     memory_usage = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
+
+                     current_progress = already_processed + processed_count
+                     pbar.set_description(
+                         f"Processed {current_progress}/{total_in_dataset} | {rpm:.0f} RPM | Mem: {memory_usage:.0f}MB"
+                     )
+
+                     # Save periodically using append
+                     if len(self.buffer) >= self.save_frequency:
+                         with self.buffer_lock:
+                             df_new = pd.DataFrame(self.buffer)
+                             self.buffer = []
+
+                         if not df_new.empty:
+                             self.save_data(df_new)
+                             last_save_time = time.time()
+
+                     # Save failed IDs periodically
+                     if self.failed_ids and time.time() - last_save_time > 300:
+                         self.save_failed_ids()
+                         last_save_time = time.time()
+
+         # Save any remaining results
+         if self.buffer:
+             with self.buffer_lock:
+                 df_new = pd.DataFrame(self.buffer)
+                 self.buffer = []
+
+             if not df_new.empty:
+                 self.save_data(df_new)
+
+         if self.failed_ids:
+             self.save_failed_ids()
+
+         # Print final statistics
+         elapsed = time.time() - start_time
+         final_rpm = processed_count / (elapsed / 60)
+
+         print(f"\nProcessing complete:")
+         print(f"Total processed in this run: {processed_count}")
+         print(f"Average speed: {final_rpm:.0f} RPM")
+         print(f"Failed entries: {len(self.failed_ids)}")
+         if self.failed_ids:
+             print(f"Failed entries saved to: {self.failed_path}")
datamule/document.py ADDED
@@ -0,0 +1,130 @@
+ import json
+ import csv
+ from .parser.document_parsing.sec_parser import Parser
+ from .helper import convert_to_dashed_accession
+
+ # we need to modify parse_filing to take an in-memory option
+
+ parser = Parser()
+
+ class Document:
+     def __init__(self, type, filename):
+         self.type = type
+         self.filename = filename
+
+         self.data = None
+
+     def parse(self):
+         self.data = parser.parse_filing(self.filename, self.type)
+         return self.data
+
+     def write_json(self, output_filename=None):
+         if not self.data:
+             raise ValueError("No data to write. Parse filing first.")
+
+         if output_filename is None:
+             output_filename = f"{self.filename.rsplit('.', 1)[0]}.json"
+
+         with open(output_filename, 'w') as f:
+             json.dump(self.data, f, indent=2)
+
+     def write_csv(self, output_filename=None, accession_number=None):
+         if self.data is None:
+             raise ValueError("No data available. Please call parse_filing() first.")
+
+         if output_filename is None:
+             output_filename = f"{self.filename.rsplit('.', 1)[0]}.csv"
+
+         with open(output_filename, 'w', newline='') as csvfile:
+             if not self.data:
+                 return output_filename
+
+             has_document = any('document' in item for item in self.data)
+
+             if has_document and 'document' in self.data:
+                 writer = csv.DictWriter(csvfile, ['section', 'text'], quoting=csv.QUOTE_ALL)
+                 writer.writeheader()
+                 flattened = self._flatten_dict(self.data['document'])
+                 for section, text in flattened.items():
+                     writer.writerow({'section': section, 'text': text})
+             else:
+                 fieldnames = list(self.data[0].keys())
+                 if accession_number:
+                     fieldnames.append('Accession Number')
+                 writer = csv.DictWriter(csvfile, fieldnames, quoting=csv.QUOTE_ALL)
+                 writer.writeheader()
+                 for row in self.data:
+                     if accession_number:
+                         row['Accession Number'] = convert_to_dashed_accession(accession_number)
+                     writer.writerow(row)
+
+         return output_filename
+
+     def _document_to_section_text(self, document_data, parent_key=''):
+         items = []
+
+         if isinstance(document_data, dict):
+             for key, value in document_data.items():
+                 # Build the section name
+                 section = f"{parent_key}_{key}" if parent_key else key
+
+                 # If the value is a dict, recurse
+                 if isinstance(value, dict):
+                     items.extend(self._document_to_section_text(value, section))
+                 # If it's a list, handle each item
+                 elif isinstance(value, list):
+                     for i, item in enumerate(value):
+                         if isinstance(item, dict):
+                             items.extend(self._document_to_section_text(item, f"{section}_{i+1}"))
+                         else:
+                             items.append({
+                                 'section': f"{section}_{i+1}",
+                                 'text': str(item)
+                             })
+                 # Base case - add the item
+                 else:
+                     items.append({
+                         'section': section,
+                         'text': str(value)
+                     })
+
+         return items
+
+     def _flatten_dict(self, d, parent_key=''):
+         items = {}
+
+         if isinstance(d, list):
+             return [self._flatten_dict(item) for item in d]
+
+         for k, v in d.items():
+             new_key = f"{parent_key}_{k}" if parent_key else k
+
+             if isinstance(v, dict):
+                 items.update(self._flatten_dict(v, new_key))
+             else:
+                 items[new_key] = str(v)
+
+         return items
+
+     def __iter__(self):
+         if not self.data:
+             self.parse()
+
+         if self.type == 'INFORMATION TABLE':
+             return iter(self.data)
+         elif self.type == '8-K':
+             return iter(self._document_to_section_text(self.data['document']))
+         elif self.type == '10-K':
+             return iter(self._document_to_section_text(self.data['document']))
+         elif self.type == '10-Q':
+             return iter(self._document_to_section_text(self.data['document']))
+         elif self.type in ['3', '4', '5']:
+             return iter(self._flatten_dict(self.data['holdings']))
+         elif self.type == 'D':
+             return iter(self._flatten_dict(self.data['document']['relatedPersonsList']['relatedPersonInfo']))
+         elif self.type == 'NPORT-P':
+             return iter(self._flatten_dict(self.data['document']['formData']['invstOrSecs']['invstOrSec']))
+         elif self.type == 'SC 13D':
+             return iter(self._document_to_section_text(self.data['document']))
+         elif self.type == 'SC 13G':
+             return iter(self._document_to_section_text(self.data['document']))
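Document wraps a single filing on disk and defers the actual work to the module-level sec_parser.Parser instance. A short usage sketch, assuming an already-downloaded 10-K filing; the file names are illustrative, and Document may also be importable from the top-level datamule package:

from datamule.document import Document

doc = Document(type='10-K', filename='some_10k.htm')  # hypothetical local filing file
doc.parse()                        # populates doc.data via parser.parse_filing()
doc.write_json('some_10k.json')    # dump the parsed structure as JSON
doc.write_csv('some_10k.csv')      # flatten the parsed document to section,text rows

# Iterating a narrative form such as a 10-K yields {'section': ..., 'text': ...} dicts
for item in doc:
    print(item['section'], item['text'][:80])

For structured forms (3/4/5, D, NPORT-P), iteration instead yields flattened key/value dicts from the relevant data sections, as the branches in __iter__ above show.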