datamule-0.415-cp39-cp39-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datamule has been flagged as possibly problematic.

Files changed (37)
  1. datamule/__init__.py +62 -0
  2. datamule/data/company_former_names.csv +8148 -0
  3. datamule/data/company_metadata.csv +10049 -0
  4. datamule/data/company_tickers.csv +9999 -0
  5. datamule/data/sec-glossary.csv +728 -0
  6. datamule/data/xbrl_descriptions.csv +10024 -0
  7. datamule/dataset_builder/dataset_builder.py +259 -0
  8. datamule/document.py +130 -0
  9. datamule/helper.py +123 -0
  10. datamule/monitor.py +236 -0
  11. datamule/mulebot/__init__.py +1 -0
  12. datamule/mulebot/helper.py +35 -0
  13. datamule/mulebot/mulebot.py +130 -0
  14. datamule/mulebot/mulebot_server/__init__.py +1 -0
  15. datamule/mulebot/mulebot_server/server.py +87 -0
  16. datamule/mulebot/mulebot_server/static/css/minimalist.css +174 -0
  17. datamule/mulebot/mulebot_server/static/scripts/artifacts.js +68 -0
  18. datamule/mulebot/mulebot_server/static/scripts/chat.js +92 -0
  19. datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +56 -0
  20. datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +15 -0
  21. datamule/mulebot/mulebot_server/static/scripts/main.js +57 -0
  22. datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +27 -0
  23. datamule/mulebot/mulebot_server/static/scripts/suggestions.js +47 -0
  24. datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +129 -0
  25. datamule/mulebot/mulebot_server/static/scripts/utils.js +28 -0
  26. datamule/mulebot/mulebot_server/templates/chat-minimalist.html +91 -0
  27. datamule/mulebot/search.py +52 -0
  28. datamule/mulebot/tools.py +82 -0
  29. datamule/packageupdater.py +207 -0
  30. datamule/parser/sgml_parsing/sgml_parser_cy.c +19082 -0
  31. datamule/parser/sgml_parsing/sgml_parser_cy.cpython-39-darwin.so +0 -0
  32. datamule/portfolio.py +16 -0
  33. datamule/submission.py +61 -0
  34. datamule-0.415.dist-info/METADATA +36 -0
  35. datamule-0.415.dist-info/RECORD +37 -0
  36. datamule-0.415.dist-info/WHEEL +5 -0
  37. datamule-0.415.dist-info/top_level.txt +1 -0
datamule/dataset_builder/dataset_builder.py ADDED
@@ -0,0 +1,259 @@
+ import pandas as pd
+ import json
+ import os
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from tqdm import tqdm
+ import google.generativeai as genai
+ import time
+ import psutil
+ from threading import Lock
+
+ class RateLimiter:
+     def __init__(self, max_rpm):
+         self.min_delay = 62.0 / max_rpm  # use a 62-second window rather than 60 to leave some buffer (WIP)
+         self.last_request = time.time()
+         self.lock = Lock()
+         self.request_count = 0
+
+     def acquire(self):
+         with self.lock:
+             now = time.time()
+             time_since_last = now - self.last_request
+             delay_needed = self.min_delay - time_since_last
+             self.last_request = now + max(0, delay_needed)  # Update based on expected completion
+             self.request_count += 1
+             count = self.request_count
+
+         # Sleep outside the lock
+         if delay_needed > 0:
+             time.sleep(delay_needed)
+
+         return count
+
+ class DatasetBuilder:
+     def __init__(self):
+         self.base_prompt = None
+         self.response_schema = None
+         self.input_path = None
+         self.output_path = None
+         self.failed_path = None
+         self.max_rpm = 1450
+         self.max_workers = 30
+         self.save_frequency = 100
+         self.output_columns = None
+         self.buffer = []
+         self.buffer_lock = Lock()
+         self.failed_ids = set()
+         self.failed_lock = Lock()
+         self.model_name = "gemini-1.5-flash-8b"  # Default model
+         self.model_config = {}  # Additional model configuration
+         self.api_key = None
+
+     def set_api_key(self, api_key):
+         """Set the API key for Google's Generative AI."""
+         self.api_key = api_key
+         genai.configure(api_key=api_key)
+         return self
+
+     def set_paths(self, input_path, output_path, failed_path):
+         """Set input and output file paths."""
+         self.input_path = input_path
+         self.output_path = output_path
+         self.failed_path = failed_path
+         return self
+
+     def set_base_prompt(self, prompt):
+         """Set the base prompt for LLM processing."""
+         self.base_prompt = prompt
+         return self
+
+     def set_response_schema(self, schema):
+         """Set the response schema and derive output columns."""
+         self.response_schema = schema
+         # Derive output columns from schema
+         if schema and 'items' in schema and 'properties' in schema['items']:
+             properties = schema['items']['properties']
+             self.output_columns = ['accession_number'] + list(properties.keys())
+         return self
+
+     def set_rpm(self, max_rpm=1450):
+         """Set the maximum requests per minute."""
+         self.max_rpm = max_rpm
+         return self
+
+     def set_max_workers(self, max_workers=30):
+         """Set the maximum number of concurrent workers."""
+         self.max_workers = max_workers
+         return self
+
+     def set_save_frequency(self, frequency=100):
+         """Set how often to save progress."""
+         self.save_frequency = frequency
+         return self
+
+     def set_model(self, model_name="gemini-1.5-flash-8b", **model_config):
+         """Set the model name and configuration."""
+         self.model_name = model_name
+         self.model_config = model_config
+         return self
+
+     def validate_config(self):
+         """Validate that all required configurations are set."""
+         if not all([self.base_prompt, self.response_schema, self.input_path,
+                     self.output_path, self.failed_path, self.api_key]):
+             raise ValueError("""Missing required configuration. Please ensure you have set:
+             - API key
+             - Paths (input_path, output_path, failed_path)
+             - Base prompt
+             - Response schema""")
+
+     def get_processed_ids(self):
+         """Get set of processed accession numbers from output file."""
+         if not os.path.exists(self.output_path):
+             return set()
+
+         try:
+             # Read only the accession_number column for memory efficiency
+             df = pd.read_csv(self.output_path, usecols=['accession_number'])
+             return set(df['accession_number'])
+         except Exception as e:
+             print(f"Warning: Error reading processed IDs: {e}")
+             return set()
+
+     def save_data(self, df_new):
+         """Append new data to existing CSV."""
+         df_new.to_csv(self.output_path, mode='a', header=not os.path.exists(self.output_path), index=False)
+
+     def save_failed_ids(self):
+         """Save failed accession numbers to file."""
+         with open(self.failed_path, 'w') as f:
+             for acc in self.failed_ids:
+                 f.write(f"{acc}\n")
+
+     def process_text(self, args):
+         """Process a single text entry through the model."""
+         model, text, accession_number, rate_limiter = args
+
+         current_requests = rate_limiter.acquire()
+
+         full_prompt = self.base_prompt + "\n\nINFORMATION:\n" + text
+
+         try:
+             generation_config = genai.GenerationConfig(
+                 response_mime_type="application/json",
+                 response_schema=self.response_schema,
+                 **self.model_config
+             )
+
+             response = model.generate_content(
+                 full_prompt,
+                 generation_config=generation_config
+             )
+             results = json.loads(response.text)
+
+             for result in results:
+                 result['accession_number'] = accession_number
+
+             with self.buffer_lock:
+                 self.buffer.extend(results)
+
+             return True, current_requests
+         except Exception as e:
+             with self.failed_lock:
+                 self.failed_ids.add(accession_number)
+             return False, f"Error processing {accession_number}: {str(e)}"
+
+     def build(self):
+         """Main processing method to build the dataset."""
+         self.validate_config()
+
+         # Initialize model and rate limiter
+         model = genai.GenerativeModel(self.model_name)
+         rate_limiter = RateLimiter(self.max_rpm)
+
+         # Load data
+         print("Loading data...")
+         df_input = pd.read_csv(self.input_path)
+         processed_ids = self.get_processed_ids()
+         df_to_process = df_input[~df_input['accession_number'].isin(processed_ids)]
+
+         total_in_dataset = len(df_input)
+         already_processed = len(processed_ids)
+         to_process = len(df_to_process)
+
+         print(f"Total entries in dataset: {total_in_dataset}")
+         print(f"Already processed: {already_processed}")
+         print(f"New entries to process: {to_process}")
+
+         if len(df_to_process) == 0:
+             print("All entries already processed!")
+             return
+
+         work_items = [
+             (model, row['text'], row['accession_number'], rate_limiter)
+             for _, row in df_to_process.iterrows()
+         ]
+
+         start_time = time.time()
+         last_save_time = time.time()
+         processed_count = 0
+
+         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+             futures = {executor.submit(self.process_text, item): item for item in work_items}
+
+             with tqdm(total=total_in_dataset, initial=already_processed, desc="Processing entries") as pbar:
+                 for future in as_completed(futures):
+                     success, result = future.result()
+
+                     if not success:
+                         print(f"\n{result}")
+
+                     processed_count += 1
+                     pbar.update(1)
+
+                     elapsed = time.time() - start_time
+                     rpm = processed_count / (elapsed / 60)
+                     memory_usage = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
+
+                     current_progress = already_processed + processed_count
+                     pbar.set_description(
+                         f"Processed {current_progress}/{total_in_dataset} | {rpm:.0f} RPM | Mem: {memory_usage:.0f}MB"
+                     )
+
+                     # Save periodically using append
+                     if len(self.buffer) >= self.save_frequency:
+                         with self.buffer_lock:
+                             df_new = pd.DataFrame(self.buffer)
+                             self.buffer = []
+
+                         if not df_new.empty:
+                             self.save_data(df_new)
+                             last_save_time = time.time()
+
+                     # Save failed IDs periodically
+                     if self.failed_ids and time.time() - last_save_time > 300:
+                         self.save_failed_ids()
+                         last_save_time = time.time()
+
+         # Save any remaining results
+         if self.buffer:
+             with self.buffer_lock:
+                 df_new = pd.DataFrame(self.buffer)
+                 self.buffer = []
+
+             if not df_new.empty:
+                 self.save_data(df_new)
+
+         if self.failed_ids:
+             self.save_failed_ids()
+
+         # Print final statistics
+         elapsed = time.time() - start_time
+         final_rpm = processed_count / (elapsed / 60)
+
+         print(f"\nProcessing complete:")
+         print(f"Total processed in this run: {processed_count}")
+         print(f"Average speed: {final_rpm:.0f} RPM")
+         print(f"Failed entries: {len(self.failed_ids)}")
+         if self.failed_ids:
+             print(f"Failed entries saved to: {self.failed_path}")
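For orientation, a minimal usage sketch of the DatasetBuilder added above. This is not part of the package: the file paths, prompt, schema, and API key below are made up, and the import path assumes the module location shown in the file list.

    from datamule.dataset_builder.dataset_builder import DatasetBuilder

    # Hypothetical response schema in the shape set_response_schema() expects:
    # a list of objects whose 'properties' become the output CSV columns.
    schema = {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "topic": {"type": "string"},
                "sentiment": {"type": "string"},
            },
            "required": ["topic", "sentiment"],
        },
    }

    builder = (
        DatasetBuilder()
        .set_api_key("YOUR_GEMINI_API_KEY")       # placeholder
        .set_paths(
            input_path="filings.csv",             # expects 'accession_number' and 'text' columns
            output_path="dataset.csv",
            failed_path="failed.txt",
        )
        .set_base_prompt("List the topics discussed and their sentiment.")
        .set_response_schema(schema)
        .set_rpm(1450)
        .set_max_workers(30)
        .set_save_frequency(100)
    )
    builder.build()  # resumable: accession numbers already in dataset.csv are skipped

Each setter returns self, which is what makes the chained configuration above possible.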
datamule/document.py ADDED
@@ -0,0 +1,130 @@
+ import json
+ import csv
+ from .parser.document_parsing.sec_parser import Parser
+ from .helper import convert_to_dashed_accession
+
+ # we need to modify parse filing to take option in memory
+
+ parser = Parser()
+
+ class Document:
+     def __init__(self, type, filename):
+         self.type = type
+         self.filename = filename
+
+         self.data = None
+
+     def parse(self):
+         self.data = parser.parse_filing(self.filename, self.type)
+         return self.data
+
+     def write_json(self, output_filename=None):
+         if not self.data:
+             raise ValueError("No data to write. Parse filing first.")
+
+         if output_filename is None:
+             output_filename = f"{self.filename.rsplit('.', 1)[0]}.json"
+
+         with open(output_filename, 'w') as f:
+             json.dump(self.data, f, indent=2)
+
+     def write_csv(self, output_filename=None, accession_number=None):
+         if self.data is None:
+             raise ValueError("No data available. Please call parse_filing() first.")
+
+         if output_filename is None:
+             output_filename = f"{self.filename.rsplit('.', 1)[0]}.csv"
+
+         with open(output_filename, 'w', newline='') as csvfile:
+             if not self.data:
+                 return output_filename
+
+             has_document = any('document' in item for item in self.data)
+
+             if has_document and 'document' in self.data:
+                 writer = csv.DictWriter(csvfile, ['section', 'text'], quoting=csv.QUOTE_ALL)
+                 writer.writeheader()
+                 flattened = self._flatten_dict(self.data['document'])
+                 for section, text in flattened.items():
+                     writer.writerow({'section': section, 'text': text})
+             else:
+                 fieldnames = list(self.data[0].keys())
+                 if accession_number:
+                     fieldnames.append('Accession Number')
+                 writer = csv.DictWriter(csvfile, fieldnames, quoting=csv.QUOTE_ALL)
+                 writer.writeheader()
+                 for row in self.data:
+                     if accession_number:
+                         row['Accession Number'] = convert_to_dashed_accession(accession_number)
+                     writer.writerow(row)
+
+         return output_filename
+
+     def _document_to_section_text(self, document_data, parent_key=''):
+         items = []
+
+         if isinstance(document_data, dict):
+             for key, value in document_data.items():
+                 # Build the section name
+                 section = f"{parent_key}_{key}" if parent_key else key
+
+                 # If the value is a dict, recurse
+                 if isinstance(value, dict):
+                     items.extend(self._document_to_section_text(value, section))
+                 # If it's a list, handle each item
+                 elif isinstance(value, list):
+                     for i, item in enumerate(value):
+                         if isinstance(item, dict):
+                             items.extend(self._document_to_section_text(item, f"{section}_{i+1}"))
+                         else:
+                             items.append({
+                                 'section': f"{section}_{i+1}",
+                                 'text': str(item)
+                             })
+                 # Base case - add the item
+                 else:
+                     items.append({
+                         'section': section,
+                         'text': str(value)
+                     })
+
+         return items
+
+     def _flatten_dict(self, d, parent_key=''):
+         items = {}
+
+         if isinstance(d, list):
+             return [self._flatten_dict(item) for item in d]
+
+         for k, v in d.items():
+             new_key = f"{parent_key}_{k}" if parent_key else k
+
+             if isinstance(v, dict):
+                 items.update(self._flatten_dict(v, new_key))
+             else:
+                 items[new_key] = str(v)
+
+         return items
+
+     def __iter__(self):
+         if not self.data:
+             self.parse()
+
+         if self.type == 'INFORMATION TABLE':
+             return iter(self.data)
+         elif self.type == '8-K':
+             return iter(self._document_to_section_text(self.data['document']))
+         elif self.type == '10-K':
+             return iter(self._document_to_section_text(self.data['document']))
+         elif self.type == '10-Q':
+             return iter(self._document_to_section_text(self.data['document']))
+         elif self.type in ['3', '4', '5']:
+             return iter(self._flatten_dict(self.data['holdings']))
+         elif self.type == 'D':
+             return iter(self._flatten_dict(self.data['document']['relatedPersonsList']['relatedPersonInfo']))
+         elif self.type == 'NPORT-P':
+             return iter(self._flatten_dict(self.data['document']['formData']['invstOrSecs']['invstOrSec']))
+         elif self.type == 'SC 13D':
+             return iter(self._document_to_section_text(self.data['document']))
+         elif self.type == 'SC 13G':
+             return iter(self._document_to_section_text(self.data['document']))
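A short illustrative sketch of how the Document class above might be used. The file name is hypothetical, and it assumes the filing's primary document has already been downloaded and that the bundled parser handles the 8-K type, as the __iter__ branches suggest.

    from datamule.document import Document

    doc = Document(type='8-K', filename='example_8k.htm')   # hypothetical file
    data = doc.parse()                 # runs the bundled SEC parser and caches the result on doc.data
    doc.write_json('example_8k.json')  # dumps the parsed structure as JSON
    doc.write_csv('example_8k.csv')    # writes section/text rows, or tabular rows, depending on the data

    # Iteration parses lazily if needed; for 8-K filings it yields {'section': ..., 'text': ...} dicts.
    for item in doc:
        print(item['section'])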
datamule/helper.py ADDED
@@ -0,0 +1,123 @@
+ import requests
+ import os
+ from tqdm import tqdm
+ import zipfile
+ from pkg_resources import resource_filename
+ import csv
+ import re
+
+ # Unused in current implementation.
+ def construct_primary_doc_url(cik, accession_number, primary_doc_url):
+     accession_number = accession_number.replace("-", "")
+     return f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}/{primary_doc_url}"
+
+ # DONE
+ def _download_from_dropbox(url, output_path):
+     headers = {'user-agent': 'Wget/1.16 (linux-gnu)'}
+     r = requests.get(url, stream=True, headers=headers)
+     total_size = int(r.headers.get('content-length', 0))
+
+     with open(output_path, 'wb') as f, tqdm(
+         desc="Downloading " + os.path.basename(output_path),
+         total=total_size,
+         unit='iB',
+         unit_scale=True,
+         unit_divisor=1024,
+     ) as progress_bar:
+         for chunk in r.iter_content(chunk_size=1024):
+             size = f.write(chunk)
+             progress_bar.update(size)
+
+     # Check if the downloaded file is a zip file
+     if zipfile.is_zipfile(output_path):
+         extract_path = os.path.dirname(output_path)
+         with zipfile.ZipFile(output_path, 'r') as zip_ref:
+             for file_info in zip_ref.infolist():
+                 extract_file_path = os.path.join(extract_path, file_info.filename)
+                 with zip_ref.open(file_info) as file_in_zip, \
+                      open(extract_file_path, 'wb') as output_file, \
+                      tqdm(total=file_info.file_size, unit='B', unit_scale=True,
+                           desc=f"Extracting {file_info.filename}") as pbar:
+                     while True:
+                         chunk = file_in_zip.read(8192)
+                         if not chunk:
+                             break
+                         output_file.write(chunk)
+                         pbar.update(len(chunk))
+
+         # Remove the zip file after extraction
+         os.remove(output_path)
+         print(f"Extracted contents to {extract_path}")
+     else:
+         print(f"Downloaded file is not a zip. Saved to {output_path}")
+
+ # May generalize to load any package resource
+ def load_package_csv(name):
+     """Load package CSV files"""
+     csv_path = resource_filename('datamule', f'data/{name}.csv')
+     company_tickers = []
+
+     with open(csv_path, 'r') as csvfile:
+         csv_reader = csv.DictReader(csvfile)
+         for row in csv_reader:
+             company_tickers.append(row)
+
+     return company_tickers
+
+ def load_package_dataset(dataset):
+     if dataset == 'company_tickers':
+         return load_package_csv('company_tickers')
+     elif dataset == 'company_former_names':
+         return load_package_csv('company_former_names')
+     elif dataset == 'company_metadata':
+         return load_package_csv('company_metadata')
+     elif dataset == 'sec_glossary':
+         return load_package_csv('sec-glossary')
+     elif dataset == 'xbrl_descriptions':
+         return load_package_csv('xbrl_descriptions')
+
+ # DONE
+ def identifier_to_cik(ticker):
+     """Convert company tickers to CIK codes"""
+     company_tickers = load_package_csv('company_tickers')
+     if ticker:
+         if isinstance(ticker, list):
+             cik = []
+             for t in ticker:
+                 cik.extend([company['cik'] for company in company_tickers if t == company['ticker']])
+         else:
+             cik = [company['cik'] for company in company_tickers if ticker == company['ticker']]
+
+     if not cik:
+         raise ValueError("No matching companies found")
+
+     return cik
+
+
+ def fix_filing_url(url):
+     match_suffix = re.search(r'/(\d{4})\.(.+?)$', url)
+     if match_suffix:
+         suffix_number = match_suffix.group(1)
+         file_ext = match_suffix.group(2)
+         match_accession = re.search(r'/(\d{18})/', url)
+         if match_accession:
+             accession_number = match_accession.group(1)
+             formatted_accession_number = f"{accession_number[:10]}-{accession_number[10:12]}-{accession_number[12:]}"
+             new_url = url.rsplit('/', 1)[0] + f'/{formatted_accession_number}-{suffix_number}.{file_ext}'
+             return new_url
+     return url
+
+ def convert_to_dashed_accession(accession):
+     # Remove any existing dashes or whitespace
+     cleaned = ''.join(accession.split())
+
+     # Check if the cleaned string has 18 characters
+     if len(cleaned) != 18:
+         raise ValueError("Invalid accession number format. Expected 18 characters.")
+
+     # Insert dashes at the correct positions
+     dashed = f"{cleaned[:10]}-{cleaned[10:12]}-{cleaned[12:]}"
+
+     return dashed
+
+ headers = {'User-Agent': 'John Smith johnsmith@gmail.com'}
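A few illustrative calls to the helpers above. The ticker, accession number, and URL are made up, and identifier_to_cik depends on whatever is in the bundled company_tickers.csv.

    from datamule.helper import identifier_to_cik, convert_to_dashed_accession, fix_filing_url

    print(identifier_to_cik('AAPL'))             # list of CIK strings whose ticker matches, per the bundled CSV
    print(identifier_to_cik(['AAPL', 'MSFT']))   # a list of tickers is accepted too

    print(convert_to_dashed_accession('000032019324000123'))
    # '0000320193-24-000123'

    url = 'https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/0001.txt'
    print(fix_filing_url(url))
    # '.../000032019324000123/0000320193-24-000123-0001.txt'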