datamule 1.0.3__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.
Files changed (43)
  1. datamule/__init__.py +2 -13
  2. datamule/document.py +0 -1
  3. datamule/helper.py +85 -105
  4. datamule/portfolio.py +105 -29
  5. datamule/submission.py +0 -38
  6. {datamule-1.0.3.dist-info → datamule-1.0.6.dist-info}/METADATA +2 -8
  7. datamule-1.0.6.dist-info/RECORD +10 -0
  8. datamule/book/__init__.py +0 -0
  9. datamule/book/book.py +0 -34
  10. datamule/book/eftsquery.py +0 -127
  11. datamule/book/xbrl_retriever.py +0 -88
  12. datamule/data/company_former_names.csv +0 -8148
  13. datamule/data/company_metadata.csv +0 -10049
  14. datamule/data/company_tickers.csv +0 -9999
  15. datamule/data/sec-glossary.csv +0 -728
  16. datamule/data/xbrl_descriptions.csv +0 -10024
  17. datamule/downloader/downloader.py +0 -374
  18. datamule/downloader/premiumdownloader.py +0 -335
  19. datamule/mapping_dicts/txt_mapping_dicts.py +0 -234
  20. datamule/mapping_dicts/xml_mapping_dicts.py +0 -19
  21. datamule/monitor.py +0 -283
  22. datamule/mulebot/__init__.py +0 -1
  23. datamule/mulebot/helper.py +0 -35
  24. datamule/mulebot/mulebot.py +0 -130
  25. datamule/mulebot/mulebot_server/__init__.py +0 -1
  26. datamule/mulebot/mulebot_server/server.py +0 -87
  27. datamule/mulebot/mulebot_server/static/css/minimalist.css +0 -174
  28. datamule/mulebot/mulebot_server/static/scripts/artifacts.js +0 -68
  29. datamule/mulebot/mulebot_server/static/scripts/chat.js +0 -92
  30. datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +0 -56
  31. datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +0 -15
  32. datamule/mulebot/mulebot_server/static/scripts/main.js +0 -57
  33. datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +0 -27
  34. datamule/mulebot/mulebot_server/static/scripts/suggestions.js +0 -47
  35. datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +0 -129
  36. datamule/mulebot/mulebot_server/static/scripts/utils.js +0 -28
  37. datamule/mulebot/mulebot_server/templates/chat-minimalist.html +0 -91
  38. datamule/mulebot/search.py +0 -52
  39. datamule/mulebot/tools.py +0 -82
  40. datamule/packageupdater.py +0 -207
  41. datamule-1.0.3.dist-info/RECORD +0 -43
  42. {datamule-1.0.3.dist-info → datamule-1.0.6.dist-info}/WHEEL +0 -0
  43. {datamule-1.0.3.dist-info → datamule-1.0.6.dist-info}/top_level.txt +0 -0
datamule/downloader/premiumdownloader.py
@@ -1,335 +0,0 @@
-import os
-import asyncio
-import aiohttp
-from pathlib import Path
-from tqdm import tqdm
-import time
-import shutil
-import ssl
-import zstandard as zstd
-import io
-import json
-from concurrent.futures import ThreadPoolExecutor
-from functools import partial
-from queue import Queue, Empty
-from threading import Thread
-from secsgml import parse_sgml_submission
-import urllib.parse
-from ..helper import identifier_to_cik
-
-class InsufficientBalanceError(Exception):
-    def __init__(self, required_cost, current_balance, total_urls):
-        self.required_cost = required_cost
-        self.current_balance = current_balance
-        self.total_urls = total_urls
-        message = (f"Insufficient balance. Required: ${required_cost:.4f}, "
-                   f"Current balance: ${current_balance:.4f}, "
-                   f"Total URLs: {total_urls}")
-        super().__init__(message)
-
-class PremiumDownloader:
-    def __init__(self, api_key=None):
-        self.BASE_URL = "https://library.datamule.xyz/original/nc/"
-        self.API_BASE_URL = "https://sec-library.jgfriedman99.workers.dev/"
-        self.CHUNK_SIZE = 2 * 1024 * 1024
-        self.MAX_CONCURRENT_DOWNLOADS = 100
-        self.MAX_DECOMPRESSION_WORKERS = 16
-        self.MAX_PROCESSING_WORKERS = 16
-        self.QUEUE_SIZE = 10
-        if api_key is not None:
-            self._api_key = api_key
-
-    @property
-    def api_key(self):
-        return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')
-
-    @api_key.setter
-    def api_key(self, value):
-        if not value:
-            raise ValueError("API key cannot be empty")
-        self._api_key = value
-
-    def _log_error(self, output_dir, filename, error_msg):
-        error_file = os.path.join(output_dir, 'errors.json')
-        try:
-            if os.path.exists(error_file):
-                with open(error_file, 'r') as f:
-                    errors = json.load(f)
-            else:
-                errors = {}
-
-            errors[filename] = str(error_msg)
-
-            with open(error_file, 'w') as f:
-                json.dump(errors, f, indent=2)
-        except Exception as e:
-            print(f"Failed to log error to {error_file}: {str(e)}")
-
-    async def _fetch_submissions(self, session, submission_type=None, cik=None, filing_date=None, page=1):
-        params = {
-            'api_key': self.api_key,
-            'page': page
-        }
-
-        if submission_type:
-            if isinstance(submission_type, list):
-                params['submission_type'] = ','.join(str(x) for x in submission_type)
-            else:
-                params['submission_type'] = str(submission_type)
-
-        if cik:
-            if isinstance(cik, list):
-                params['cik'] = ','.join(str(x) for x in cik)
-            else:
-                params['cik'] = str(cik)
-
-        if filing_date:
-            if isinstance(filing_date, tuple):
-                params['startdt'] = str(filing_date[0])
-                params['enddt'] = str(filing_date[1])
-            else:
-                if isinstance(filing_date, list):
-                    params['filing_date'] = ','.join(str(x) for x in filing_date)
-                else:
-                    params['filing_date'] = str(filing_date)
-
-        url = f"{self.API_BASE_URL}?{urllib.parse.urlencode(params)}"
-
-        async with session.get(url) as response:
-            data = await response.json()
-            if not data.get('success'):
-                raise ValueError(f"API request failed: {data.get('error')}")
-
-            charges = data['metadata']['billing']['charges']
-            print(f"\nCost: ${charges['results']:.12f} downloads + ${charges['rows_read']:.12f} row reads = ${charges['total']:.12f}")
-            print(f"Balance: ${data['metadata']['billing']['remaining_balance']:.12f}")
-
-            urls = [f"{self.BASE_URL}{str(sub['accession_number']).zfill(18)}.sgml{'.zst' if sub.get('compressed', '').lower() == 'true' else ''}" for sub in data['data']]
-            return urls, data['metadata']['pagination']
-
-    class FileProcessor:
-        def __init__(self, output_dir, max_workers, queue_size, pbar, downloader):
-            self.processing_queue = Queue(maxsize=queue_size)
-            self.should_stop = False
-            self.processing_workers = []
-            self.output_dir = output_dir
-            self.max_workers = max_workers
-            self.batch_size = 10
-            self.pbar = pbar
-            self.downloader = downloader
-
-        def start_processing_workers(self):
-            for _ in range(self.max_workers):
-                worker = Thread(target=self._processing_worker)
-                worker.daemon = True
-                worker.start()
-                self.processing_workers.append(worker)
-
-        def _process_file(self, item):
-            filename, content = item
-            try:
-                parse_sgml_submission(output_dir=self.output_dir, content=content)
-                self.pbar.update(1)
-            except Exception as e:
-                accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
-                if os.path.exists(accession_dir):
-                    shutil.rmtree(accession_dir)
-                self.downloader._log_error(self.output_dir, filename, str(e))
-
-        def _processing_worker(self):
-            batch = []
-            while not self.should_stop:
-                try:
-                    item = self.processing_queue.get(timeout=1)
-                    if item is None:
-                        break
-
-                    batch.append(item)
-
-                    if len(batch) >= self.batch_size or self.processing_queue.empty():
-                        for item in batch:
-                            self._process_file(item)
-                            self.processing_queue.task_done()
-                        batch = []
-
-                except Empty:
-                    if batch:
-                        for item in batch:
-                            self._process_file(item)
-                            self.processing_queue.task_done()
-                        batch = []
-
-        def stop_workers(self):
-            self.should_stop = True
-            for _ in self.processing_workers:
-                self.processing_queue.put(None)
-            for worker in self.processing_workers:
-                worker.join()
-
-    def decompress_stream(self, compressed_chunks, filename, output_dir, processor):
-        dctx = zstd.ZstdDecompressor()
-        try:
-            input_buffer = io.BytesIO(b''.join(compressed_chunks))
-            decompressed_content = io.BytesIO()
-
-            with dctx.stream_reader(input_buffer) as reader:
-                shutil.copyfileobj(reader, decompressed_content)
-
-            content = decompressed_content.getvalue().decode('utf-8')
-            processor.processing_queue.put((filename, content))
-            return True
-
-        except Exception as e:
-            self._log_error(output_dir, filename, f"Decompression error: {str(e)}")
-            return False
-        finally:
-            try:
-                input_buffer.close()
-                decompressed_content.close()
-            except:
-                pass
-
-    def save_regular_file(self, chunks, filename, output_dir, processor):
-        try:
-            content = b''.join(chunks).decode('utf-8')
-            processor.processing_queue.put((filename, content))
-            return True
-
-        except Exception as e:
-            self._log_error(output_dir, filename, f"Error saving file: {str(e)}")
-            return False
-
-    async def download_and_process(self, session, url, semaphore, decompression_pool, output_dir, processor):
-        async with semaphore:
-            chunks = []
-            filename = url.split('/')[-1]
-
-            api_key = self.api_key
-            if not api_key:
-                raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
-
-            try:
-                headers = {
-                    'Connection': 'keep-alive',
-                    'Accept-Encoding': 'gzip, deflate, br',
-                    'Authorization': f'Bearer {api_key}'
-                }
-
-                async with session.get(url, headers=headers) as response:
-                    if response.status == 200:
-                        async for chunk in response.content.iter_chunked(self.CHUNK_SIZE):
-                            chunks.append(chunk)
-
-                        loop = asyncio.get_running_loop()
-                        if filename.endswith('.zst'):
-                            success = await loop.run_in_executor(
-                                decompression_pool,
-                                partial(self.decompress_stream, chunks, filename, output_dir, processor)
-                            )
-                        else:
-                            success = await loop.run_in_executor(
-                                decompression_pool,
-                                partial(self.save_regular_file, chunks, filename, output_dir, processor)
-                            )
-
-                        if not success:
-                            self._log_error(output_dir, filename, "Failed to process file")
-                    elif response.status == 401:
-                        self._log_error(output_dir, filename, "Authentication failed: Invalid API key")
-                        raise ValueError("Invalid API key")
-                    else:
-                        self._log_error(output_dir, filename, f"Download failed: Status {response.status}")
-            except Exception as e:
-                self._log_error(output_dir, filename, str(e))
-
-    async def process_batch(self, urls, output_dir):
-        os.makedirs(output_dir, exist_ok=True)
-
-        with tqdm(total=len(urls), desc="Processing files") as pbar:
-            processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self)
-            processor.start_processing_workers()
-
-            semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
-            decompression_pool = ThreadPoolExecutor(max_workers=self.MAX_DECOMPRESSION_WORKERS)
-
-            connector = aiohttp.TCPConnector(
-                limit=self.MAX_CONCURRENT_DOWNLOADS,
-                force_close=False,
-                ssl=ssl.create_default_context(),
-                ttl_dns_cache=300,
-                keepalive_timeout=60
-            )
-
-            # timeout should be max 2 hours.
-            async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=7200)) as session:
-                tasks = [self.download_and_process(session, url, semaphore, decompression_pool, output_dir, processor) for url in urls]
-                await asyncio.gather(*tasks, return_exceptions=True)
-
-            processor.processing_queue.join()
-            processor.stop_workers()
-            decompression_pool.shutdown()
-
-    async def download_all_pages(self, submission_type=None, cik=None, filing_date=None, output_dir="download"):
-        connector = aiohttp.TCPConnector(ssl=ssl.create_default_context())
-        async with aiohttp.ClientSession(connector=connector) as session:
-            try:
-                urls, pagination = await self._fetch_submissions(session, submission_type=submission_type, cik=cik, filing_date=filing_date, page=1)
-                total_urls = urls.copy()
-                current_page = 1
-
-                while pagination.get('hasMore', False):
-                    current_page += 1
-                    more_urls, pagination = await self._fetch_submissions(session, submission_type=submission_type, cik=cik, filing_date=filing_date, page=current_page)
-                    total_urls.extend(more_urls)
-
-                if total_urls:
-                    total_urls = list(set(total_urls)) # Remove duplicates
-                    start_time = time.time()
-                    await self.process_batch(total_urls, output_dir)
-                    elapsed_time = time.time() - start_time
-                    print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
-                else:
-                    print("No submissions found matching the criteria")
-
-            except InsufficientBalanceError as e:
-                error_msg = {
-                    "error": "insufficient_balance",
-                    "required_cost": e.required_cost,
-                    "current_balance": e.current_balance,
-                    "total_urls": e.total_urls,
-                    "additional_funds_needed": e.required_cost - e.current_balance
-                }
-                self._log_error(output_dir, "balance_check", error_msg)
-                return
-
-    def download_submissions(self, submission_type=None, cik=None, ticker=None, filing_date=None, output_dir="download"):
-        if self.api_key is None:
-            raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
-
-        if filing_date is not None:
-            if isinstance(filing_date, str):
-                filing_date = int(filing_date.replace('-', ''))
-            elif isinstance(filing_date, list):
-                filing_date = [int(x.replace('-', '')) for x in filing_date]
-            elif isinstance(filing_date, tuple):
-                filing_date = (int(filing_date[0].replace('-', '')), int(filing_date[1].replace('-', '')))
-
-        if ticker is not None:
-            cik = identifier_to_cik(ticker)
-
-        if cik is not None:
-            if isinstance(cik, str):
-                cik = [int(cik)]
-            elif isinstance(cik, int):
-                cik = [cik]
-            elif isinstance(cik, list):
-                cik = [int(x) for x in cik]
-
-        async def _download():
-            try:
-                await self.download_all_pages(submission_type=submission_type, cik=cik, filing_date=filing_date, output_dir=output_dir)
-            except Exception as e:
-                if not isinstance(e, InsufficientBalanceError):
-                    self._log_error(output_dir, "download_error", str(e))
-
-        asyncio.run(_download())
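
For context, the hunk above removes the 1.0.3 premium downloader. A minimal usage sketch, inferred only from the signatures in this hunk, is shown below; the import path is assumed from the file location, the ticker, form type, and dates are made-up examples, and the snippet does not apply to 1.0.6, which no longer ships this module.

# Sketch against datamule 1.0.3 only; PremiumDownloader is removed in 1.0.6.
import os
from datamule.downloader.premiumdownloader import PremiumDownloader  # path assumed from file layout

os.environ.setdefault("DATAMULE_API_KEY", "your-key-here")  # or pass api_key=... to the constructor

downloader = PremiumDownloader()
downloader.download_submissions(
    submission_type="10-K",                    # a form type string, or a list of them
    ticker="AAPL",                             # example ticker; resolved to CIK(s) via identifier_to_cik
    filing_date=("2023-01-01", "2023-12-31"),  # a tuple becomes a startdt/enddt range
    output_dir="download",
)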
datamule/mapping_dicts/txt_mapping_dicts.py
@@ -1,234 +0,0 @@
-import copy
-
-dict_sgml = {
-    "rules": {
-        "join_text": "\n",
-        "remove": [
-            {
-                "pattern": r"^<PAGE>",
-            }
-        ],
-        "mappings": [
-            {
-                "name": "table",
-                "pattern": r"^<TABLE>",
-                "end": r"^</TABLE>"
-            },
-            {
-                "name": "caption",
-                "pattern": r"^<CAPTION>",
-                "end": r"^<S>",
-                "keep_end": True
-            },
-            {
-                "name": "footnote",
-                "pattern": r"^<FN>",
-                "end": r"^</FN>"
-            }
-        ]
-    }
-}
-
-item_pattern_mapping = r"^\n\n\s*(ITEM|Item)\s+(\d+[a-zA-Z]?|ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE|THIRTEEN|FOURTEEN|FIFTEEN|SIXTEEN|[0-9]+[a-zA-Z]?)\.?"
-item_pattern_mapping_8k = r"^\n\n\s*(ITEM|Item)\s+(\d+(?:\.\d+)?[a-zA-Z]?|ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE|THIRTEEN|FOURTEEN|FIFTEEN|SIXTEEN|[0-9]+[a-zA-Z]?)\.?"
-part_pattern_mapping = r"^\n\n\s*(PART|Part)\s+(?:I{1,3}|IV)\.?"
-
-item_pattern_standardization = r"^\s*(?:ITEM|Item)\s+(\d+[a-zA-Z]?|ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE|THIRTEEN|FOURTEEN|FIFTEEN|SIXTEEN|[0-9]+[a-zA-Z]?)\.?"
-item_pattern_standardization_8k = r"^\s*(?:ITEM|Item)\s+(\d+(?:\.\d+)?[a-zA-Z]?|ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE|THIRTEEN|FOURTEEN|FIFTEEN|SIXTEEN)\.?"
-part_pattern_standardization = r"^\s*(?:PART|Part)\s+([IVX]+)"
-
-
-dict_10k = copy.deepcopy(dict_sgml)
-dict_10k["rules"]["mappings"].extend([
-    {
-        "type": "hierarchy",
-        "name": "part",
-        "pattern": part_pattern_mapping,
-        "hierarchy": 0
-    },
-    {
-        "type": "hierarchy",
-        "name": "item",
-        "pattern": item_pattern_mapping,
-        "hierarchy": 1
-    },
-])
-
-# In the mapping dict:
-dict_10k['transformations'] = [
-    {
-        "type": "standardize",
-        "match": {
-            "type": "part",
-            "text_pattern": part_pattern_standardization
-        },
-        "output": {
-            "format": "part{}",
-            "field": "text" # Where to store the standardized value
-        }
-    },
-    {
-        "type": "standardize",
-        "match": {
-            "type": "item",
-            "text_pattern": item_pattern_standardization
-        },
-        "output": {
-            "format": "item{}",
-            "field": "text" # Could also be "text" or any other field name
-        }
-    },
-    {
-        "type": "merge_consecutive",
-        "match": {
-            "types": ["part", "item"] # sections types to check for merging
-        }
-    },
-    {
-        "type": "trim",
-        "match": {
-            "type": "item", # or "item"
-            "expected": 1
-        },
-        "output": {
-            "type": "introduction",
-            "separator": "\n"
-        }
-    }
-
-]
-
-dict_10q = copy.deepcopy(dict_sgml)
-dict_10q["rules"]["mappings"].extend([
-    {
-        "type": "hierarchy",
-        "name": "part",
-        "pattern": part_pattern_mapping,
-        "hierarchy": 0
-    },
-    {
-        "type": "hierarchy",
-        "name": "item",
-        "pattern": item_pattern_mapping,
-        "hierarchy": 1
-    },
-])
-
-# In the mapping dict:
-dict_10q['transformations'] = [
-    {
-        "type": "standardize",
-        "match": {
-            "type": "part",
-            "text_pattern": part_pattern_standardization
-        },
-        "output": {
-            "format": "part{}",
-            "field": "text" # Where to store the standardized value
-        }
-    },
-    {
-        "type": "standardize",
-        "match": {
-            "type": "item",
-            "text_pattern": item_pattern_standardization
-        },
-        "output": {
-            "format": "item{}",
-            "field": "text" # Could also be "text" or any other field name
-        }
-    },
-    {
-        "type": "merge_consecutive",
-        "match": {
-            "types": ["part", "item"] # sections types to check for merging
-        }
-    },
-    {
-        "type": "trim",
-        "match": {
-            "type": "item", # or "item"
-            "expected": 2
-        },
-        "output": {
-            "type": "introduction",
-            "separator": "\n"
-        }
-    }
-
-]
-
-dict_13d = copy.deepcopy(dict_sgml)
-dict_13d["rules"]["mappings"].extend([
-    {
-        "type": "hierarchy",
-        "name": "item",
-        "pattern": item_pattern_mapping,
-        "hierarchy": 0
-    },
-])
-
-dict_13d['transformations'] = [
-    {
-        "type": "standardize",
-        "match": {
-            "type": "item",
-            "text_pattern": item_pattern_standardization
-        },
-        "output": {
-            "format": "item{}",
-            "field": "text" # Could also be "text" or any other field name
-        }
-    },
-    {
-        "type": "merge_consecutive",
-        "match": {
-            "types": ["item"] # sections types to check for merging
-        }
-    }
-
-]
-
-dict_13g = copy.deepcopy(dict_13d)
-
-dict_8k = copy.deepcopy(dict_sgml)
-dict_8k["rules"]["mappings"].extend([
-    {
-        "type": "hierarchy",
-        "name": "item",
-        "pattern": item_pattern_mapping_8k,
-        "hierarchy": 0
-    },
-])
-
-dict_8k['transformations'] = [
-    {
-        "type": "standardize",
-        "match": {
-            "type": "item",
-            "text_pattern": item_pattern_standardization_8k
-        },
-        "output": {
-            "format": "item{}",
-            "field": "text" # Could also be "text" or any other field name
-        }
-    },
-    {
-        "type": "merge_consecutive",
-        "match": {
-            "types": ["item"] # sections types to check for merging
-        }
-    },
-    {
-        "type": "trim",
-        "match": {
-            "type": "item", # or "item"
-            "expected": 1
-        },
-        "output": {
-            "type": "introduction",
-            "separator": "\n"
-        }
-    }
-
-]
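
The removed txt_mapping_dicts.py above follows one pattern throughout: deep-copy the base dict_sgml rules, extend them with hierarchy mappings for the form's headings, then attach a transformations list that standardizes and merges the detected sections. A sketch of that pattern for a hypothetical additional form type follows; dict_def14a is not a name from the package, and the regexes are reused from the hunk above.

# Illustrative only: the deepcopy-extend-transform pattern used by the removed module.
import copy

dict_def14a = copy.deepcopy(dict_sgml)              # base SGML rules (tables, captions, footnotes)
dict_def14a["rules"]["mappings"].append({
    "type": "hierarchy",
    "name": "item",
    "pattern": item_pattern_mapping,                # detect "Item N" headings
    "hierarchy": 0,
})
dict_def14a["transformations"] = [
    {   # rewrite matched headings to a canonical "itemN" label
        "type": "standardize",
        "match": {"type": "item", "text_pattern": item_pattern_standardization},
        "output": {"format": "item{}", "field": "text"},
    },
    {   # merge adjacent sections of the same type
        "type": "merge_consecutive",
        "match": {"types": ["item"]},
    },
]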
datamule/mapping_dicts/xml_mapping_dicts.py
@@ -1,19 +0,0 @@
-dict_345 = {
-    "transformations": [
-        {
-            "search": {
-                "key": "footnoteId",
-                "identifier": "@id"
-            },
-            "match": {
-                "identifier": "@id",
-                "content": "#text",
-                "remove_after_use": True
-            },
-            "output": {
-                "key": "footnote",
-                "value": "content"
-            }
-        }
-    ]
-}
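
The dict_345 spec above declares a footnote-resolution step for parsed Form 3/4/5 XML: a node whose footnoteId carries an @id matching a footnote's @id gets that footnote's #text attached under a footnote key. The applier that consumes this spec is not part of this diff; the standalone sketch below (hypothetical helper and sample data, not package code) only illustrates the matching it describes.

# Illustrative only: resolve footnoteId references the way dict_345 describes.
def resolve_footnotes(node, footnotes_by_id):
    """Recursively replace {'footnoteId': {'@id': ...}} markers with the footnote text."""
    if isinstance(node, dict):
        ref = node.pop("footnoteId", None)
        if isinstance(ref, dict) and ref.get("@id") in footnotes_by_id:
            node["footnote"] = footnotes_by_id[ref["@id"]]
        for value in node.values():
            resolve_footnotes(value, footnotes_by_id)
    elif isinstance(node, list):
        for item in node:
            resolve_footnotes(item, footnotes_by_id)

# Hypothetical parsed ownership document (e.g. from an XML-to-dict parser):
doc = {
    "footnotes": {"footnote": [{"@id": "F1", "#text": "Shares held in a family trust."}]},
    "nonDerivativeTable": {"transaction": {"shares": "100", "footnoteId": {"@id": "F1"}}},
}
ids = {f["@id"]: f["#text"] for f in doc["footnotes"]["footnote"]}
resolve_footnotes(doc["nonDerivativeTable"], ids)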