datamule 0.381__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. datamule/__init__.py +46 -86
  2. datamule/book.py +16 -0
  3. datamule/config.py +29 -0
  4. datamule/data/company_former_names.csv +8148 -8148
  5. datamule/data/company_metadata.csv +10049 -10049
  6. datamule/data/company_tickers.csv +9999 -10168
  7. datamule/data/sec-glossary.csv +728 -728
  8. datamule/data/xbrl_descriptions.csv +10024 -10024
  9. datamule/document.py +278 -0
  10. datamule/downloader/downloader.py +374 -0
  11. datamule/downloader/premiumdownloader.py +335 -0
  12. datamule/helper.py +123 -136
  13. datamule/mapping_dicts/txt_mapping_dicts.py +232 -0
  14. datamule/mapping_dicts/xml_mapping_dicts.py +19 -0
  15. datamule/monitor.py +238 -0
  16. datamule/mulebot/__init__.py +1 -1
  17. datamule/mulebot/helper.py +34 -34
  18. datamule/mulebot/mulebot.py +129 -129
  19. datamule/mulebot/mulebot_server/server.py +86 -86
  20. datamule/mulebot/mulebot_server/static/css/minimalist.css +173 -173
  21. datamule/mulebot/mulebot_server/static/scripts/artifacts.js +67 -67
  22. datamule/mulebot/mulebot_server/static/scripts/chat.js +91 -91
  23. datamule/mulebot/mulebot_server/static/scripts/filingArtifacts.js +55 -55
  24. datamule/mulebot/mulebot_server/static/scripts/listArtifacts.js +14 -14
  25. datamule/mulebot/mulebot_server/static/scripts/main.js +56 -56
  26. datamule/mulebot/mulebot_server/static/scripts/prefilledPrompt.js +26 -26
  27. datamule/mulebot/mulebot_server/static/scripts/suggestions.js +46 -46
  28. datamule/mulebot/mulebot_server/static/scripts/tableArtifacts.js +128 -128
  29. datamule/mulebot/mulebot_server/static/scripts/utils.js +27 -27
  30. datamule/mulebot/mulebot_server/templates/chat-minimalist.html +90 -90
  31. datamule/mulebot/search.py +51 -51
  32. datamule/mulebot/tools.py +82 -82
  33. datamule/packageupdater.py +207 -0
  34. datamule/portfolio.py +106 -0
  35. datamule/submission.py +76 -0
  36. datamule-1.0.0.dist-info/METADATA +27 -0
  37. datamule-1.0.0.dist-info/RECORD +40 -0
  38. {datamule-0.381.dist-info → datamule-1.0.0.dist-info}/WHEEL +1 -1
  39. datamule/data/filing_types.csv +0 -485
  40. datamule/data/ftd_locations.csv +0 -388
  41. datamule/datamule_api.py +0 -21
  42. datamule/dataset_builder/_init.py +0 -1
  43. datamule/dataset_builder/dataset_builder.py +0 -260
  44. datamule/downloader/__init__.py +0 -0
  45. datamule/downloader/dropbox_downloader.py +0 -225
  46. datamule/downloader/ftd.py +0 -216
  47. datamule/downloader/information_table_13f.py +0 -231
  48. datamule/downloader/sec_downloader.py +0 -635
  49. datamule/filing_viewer/__init__.py +0 -1
  50. datamule/filing_viewer/filing_viewer.py +0 -256
  51. datamule/global_vars.py +0 -202
  52. datamule/parser/__init__.py +0 -1
  53. datamule/parser/basic_10k_parser.py +0 -82
  54. datamule/parser/basic_10q_parser.py +0 -73
  55. datamule/parser/basic_13d_parser.py +0 -58
  56. datamule/parser/basic_13g_parser.py +0 -61
  57. datamule/parser/basic_8k_parser.py +0 -84
  58. datamule/parser/company_concepts_parser.py +0 -0
  59. datamule/parser/form_d_parser.py +0 -70
  60. datamule/parser/generalized_item_parser.py +0 -78
  61. datamule/parser/generalized_xml_parser.py +0 -0
  62. datamule/parser/helper.py +0 -75
  63. datamule/parser/information_table_parser_13fhr.py +0 -41
  64. datamule/parser/insider_trading_parser.py +0 -158
  65. datamule/parser/mappings.py +0 -95
  66. datamule/parser/n_port_p_parser.py +0 -70
  67. datamule/parser/sec_parser.py +0 -79
  68. datamule/parser/sgml_parser.py +0 -180
  69. datamule/sec_filing.py +0 -126
  70. datamule/sec_search.py +0 -20
  71. datamule-0.381.dist-info/METADATA +0 -132
  72. datamule-0.381.dist-info/RECORD +0 -61
  73. {datamule-0.381.dist-info → datamule-1.0.0.dist-info}/top_level.txt +0 -0
datamule/mapping_dicts/txt_mapping_dicts.py ADDED
@@ -0,0 +1,232 @@
+ import copy
+
+ dict_sgml = {
+     "rules": {
+         "join_text": "\n",
+         "remove": [
+             {
+                 "pattern": r"^<PAGE>",
+             }
+         ],
+         "mappings": [
+             {
+                 "name": "table",
+                 "pattern": r"^<TABLE>",
+                 "end": r"^</TABLE>"
+             },
+             {
+                 "name": "caption",
+                 "pattern": r"^<CAPTION>",
+                 "end": r"^<S>",
+                 "keep_end": True
+             },
+             {
+                 "name": "footnote",
+                 "pattern": r"^<FN>",
+                 "end": r"^</FN>"
+             }
+         ]
+     }
+ }
+
+ item_pattern_mapping = r"^\n\n\s*(ITEM|Item)\b"
+ part_pattern_mapping = r"^\n\n\s*(PART|Part)\b"
+
+ item_pattern_standardization = r"^\s*(?:ITEM|Item)\s+(\d+[a-zA-Z]?|ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE|THIRTEEN|FOURTEEN|FIFTEEN|SIXTEEN|[0-9]+[a-zA-Z]?)\.?"
+ part_pattern_standardization = r"^\s*(?:PART|Part)\s+([IVX]+)"
+
+
+ dict_10k = copy.deepcopy(dict_sgml)
+ dict_10k["rules"]["mappings"].extend([
+     {
+         "type": "hierarchy",
+         "name": "part",
+         "pattern": part_pattern_mapping,
+         "hierarchy": 0
+     },
+     {
+         "type": "hierarchy",
+         "name": "item",
+         "pattern": item_pattern_mapping,
+         "hierarchy": 1
+     },
+ ])
+
+ # In the mapping dict:
+ dict_10k['transformations'] = [
+     {
+         "type": "standardize",
+         "match": {
+             "type": "part",
+             "text_pattern": part_pattern_standardization
+         },
+         "output": {
+             "format": "part{}",
+             "field": "text"  # Where to store the standardized value
+         }
+     },
+     {
+         "type": "standardize",
+         "match": {
+             "type": "item",
+             "text_pattern": item_pattern_standardization
+         },
+         "output": {
+             "format": "item{}",
+             "field": "text"  # Could also be "text" or any other field name
+         }
+     },
+     {
+         "type": "merge_consecutive",
+         "match": {
+             "types": ["part", "item"]  # sections types to check for merging
+         }
+     },
+     {
+         "type": "trim",
+         "match": {
+             "type": "item",  # or "item"
+             "expected": 1
+         },
+         "output": {
+             "type": "introduction",
+             "separator": "\n"
+         }
+     }
+
+ ]
+
+ dict_10q = copy.deepcopy(dict_sgml)
+ dict_10q["rules"]["mappings"].extend([
+     {
+         "type": "hierarchy",
+         "name": "part",
+         "pattern": part_pattern_mapping,
+         "hierarchy": 0
+     },
+     {
+         "type": "hierarchy",
+         "name": "item",
+         "pattern": item_pattern_mapping,
+         "hierarchy": 1
+     },
+ ])
+
+ # In the mapping dict:
+ dict_10q['transformations'] = [
+     {
+         "type": "standardize",
+         "match": {
+             "type": "part",
+             "text_pattern": part_pattern_standardization
+         },
+         "output": {
+             "format": "part{}",
+             "field": "text"  # Where to store the standardized value
+         }
+     },
+     {
+         "type": "standardize",
+         "match": {
+             "type": "item",
+             "text_pattern": item_pattern_standardization
+         },
+         "output": {
+             "format": "item{}",
+             "field": "text"  # Could also be "text" or any other field name
+         }
+     },
+     {
+         "type": "merge_consecutive",
+         "match": {
+             "types": ["part", "item"]  # sections types to check for merging
+         }
+     },
+     {
+         "type": "trim",
+         "match": {
+             "type": "item",  # or "item"
+             "expected": 2
+         },
+         "output": {
+             "type": "introduction",
+             "separator": "\n"
+         }
+     }
+
+ ]
+
+ dict_13d = copy.deepcopy(dict_sgml)
+ dict_13d["rules"]["mappings"].extend([
+     {
+         "type": "hierarchy",
+         "name": "item",
+         "pattern": item_pattern_mapping,
+         "hierarchy": 0
+     },
+ ])
+
+ dict_13d['transformations'] = [
+     {
+         "type": "standardize",
+         "match": {
+             "type": "item",
+             "text_pattern": item_pattern_standardization
+         },
+         "output": {
+             "format": "item{}",
+             "field": "text"  # Could also be "text" or any other field name
+         }
+     },
+     {
+         "type": "merge_consecutive",
+         "match": {
+             "types": ["item"]  # sections types to check for merging
+         }
+     }
+
+ ]
+
+ dict_13g = copy.deepcopy(dict_13d)
+
+ dict_8k = copy.deepcopy(dict_sgml)
+ dict_8k["rules"]["mappings"].extend([
+     {
+         "type": "hierarchy",
+         "name": "item",
+         "pattern": item_pattern_mapping,
+         "hierarchy": 0
+     },
+ ])
+
+ dict_8k['transformations'] = [
+     {
+         "type": "standardize",
+         "match": {
+             "type": "item",
+             "text_pattern": item_pattern_standardization
+         },
+         "output": {
+             "format": "item{}",
+             "field": "text"  # Could also be "text" or any other field name
+         }
+     },
+     {
+         "type": "merge_consecutive",
+         "match": {
+             "types": ["item"]  # sections types to check for merging
+         }
+     },
+     {
+         "type": "trim",
+         "match": {
+             "type": "item",  # or "item"
+             "expected": 1
+         },
+         "output": {
+             "type": "introduction",
+             "separator": "\n"
+         }
+     }
+
+ ]
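The "standardize" transformations above pair a regex with a capture group against an output format string such as "item{}". A minimal sketch of how one such rule could be applied to a raw heading line follows; the helper name standardize_heading and the lowercasing step are assumptions for illustration, not the package's actual transformation engine (which consumes these dicts elsewhere in 1.0.0), and the import path simply mirrors the file list above.

import re

# Import path taken from the file list above (datamule/mapping_dicts/txt_mapping_dicts.py).
from datamule.mapping_dicts.txt_mapping_dicts import item_pattern_standardization

def standardize_heading(text, pattern=item_pattern_standardization, fmt="item{}"):
    """Hypothetical helper: apply a 'standardize' rule to one heading line."""
    match = re.match(pattern, text)
    if not match:
        return None
    # The captured group is the item number (e.g. "1A"); lowercasing is an assumption.
    return fmt.format(match.group(1).lower())

print(standardize_heading("Item 1A. Risk Factors"))            # item1a
print(standardize_heading("ITEM 7. Management's Discussion"))  # item7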
datamule/mapping_dicts/xml_mapping_dicts.py ADDED
@@ -0,0 +1,19 @@
+ dict_345 = {
+     "transformations": [
+         {
+             "search": {
+                 "key": "footnoteId",
+                 "identifier": "@id"
+             },
+             "match": {
+                 "identifier": "@id",
+                 "content": "#text",
+                 "remove_after_use": True
+             },
+             "output": {
+                 "key": "footnote",
+                 "value": "content"
+             }
+         }
+     ]
+ }
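dict_345 targets ownership filings (Forms 3/4/5) parsed from XML: wherever a footnoteId carrying an @id appears, the footnote whose @id matches (with its text under #text) is looked up and written back under a footnote key. A rough sketch of that idea on an xmltodict-style dictionary; resolve_footnotes and the sample data are hypothetical and not datamule's API.

# Rough sketch of the footnote-resolution idea behind dict_345, applied to an
# xmltodict-style parse of a Form 4. Hypothetical helper, not part of datamule.
footnotes = {"F1": "Shares held indirectly by a family trust."}  # @id -> #text

entry = {"transactionShares": {"value": "1000", "footnoteId": {"@id": "F1"}}}

def resolve_footnotes(node):
    """Replace footnoteId references with the referenced footnote text."""
    if isinstance(node, dict):
        ref = node.pop("footnoteId", None)
        if ref is not None:
            # output rule: write the resolved text under a "footnote" key
            node["footnote"] = footnotes[ref["@id"]]
        for value in node.values():
            resolve_footnotes(value)
    elif isinstance(node, list):
        for item in node:
            resolve_footnotes(item)

resolve_footnotes(entry)
print(entry["transactionShares"]["footnote"])  # Shares held indirectly by a family trust.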
datamule/monitor.py ADDED
@@ -0,0 +1,238 @@
+ import asyncio
+ import aiohttp
+ from datetime import timedelta, datetime
+ import pytz
+ from collections import deque
+ import time
+ from .helper import headers, identifier_to_cik
+
+ def _get_current_eastern_date():
+     """Get current date in US Eastern timezone (automatically handles DST)"""
+     eastern = pytz.timezone('America/New_York')
+     return datetime.now(eastern)
+
+ class PreciseRateLimiter:
+     def __init__(self, rate, interval=1.0):
+         self.rate = rate  # requests per interval
+         self.interval = interval  # in seconds
+         self.token_time = self.interval / self.rate  # time per token
+         self.last_time = time.time()
+         self.lock = asyncio.Lock()
+
+     async def acquire(self):
+         async with self.lock:
+             now = time.time()
+             wait_time = self.last_time + self.token_time - now
+             if wait_time > 0:
+                 await asyncio.sleep(wait_time)
+             self.last_time = time.time()
+             return True
+
+     async def __aenter__(self):
+         await self.acquire()
+         return self
+
+     async def __aexit__(self, exc_type, exc, tb):
+         pass
+
+ class RateMonitor:
+     def __init__(self, window_size=1.0):
+         self.window_size = window_size
+         self.requests = deque()
+         self._lock = asyncio.Lock()
+
+     async def add_request(self, size_bytes):
+         async with self._lock:
+             now = time.time()
+             self.requests.append((now, size_bytes))
+             while self.requests and self.requests[0][0] < now - self.window_size:
+                 self.requests.popleft()
+
+     def get_current_rates(self):
+         now = time.time()
+         while self.requests and self.requests[0][0] < now - self.window_size:
+             self.requests.popleft()
+
+         if not self.requests:
+             return 0, 0
+
+         request_count = len(self.requests)
+         byte_count = sum(size for _, size in self.requests)
+
+         requests_per_second = request_count / self.window_size
+         mb_per_second = (byte_count / 1024 / 1024) / self.window_size
+
+         return round(requests_per_second, 1), round(mb_per_second, 2)
+
+ class Monitor:
+     def __init__(self):
+         self.last_total = 0
+         self.last_date = _get_current_eastern_date()
+         self.submissions = []
+         self.max_hits = 10000
+         self.limiter = PreciseRateLimiter(5)  # 5 requests per second
+         self.rate_monitor = RateMonitor()
+         self.headers = headers
+
+     async def _fetch_json(self, session, url):
+         """Fetch JSON with rate limiting and monitoring."""
+         async with self.limiter:
+             try:
+                 async with session.get(url) as response:
+                     response.raise_for_status()
+                     content = await response.read()
+                     await self.rate_monitor.add_request(len(content))
+                     return await response.json()
+             except Exception as e:
+                 print(f"Error fetching {url}: {str(e)}")
+                 return None
+
+     async def _poll(self, base_url, session, poll_interval, quiet):
+         """Poll API until new submissions are found."""
+         while True:
+             current_date = _get_current_eastern_date()
+             date_str = current_date.strftime('%Y-%m-%d')
+             timestamp = int(time.time())  # Add this line
+
+             if self.last_date != current_date.strftime('%Y-%m-%d'):
+                 print(f"New date: {date_str}")
+                 self.last_total = 0
+                 self.submissions = []
+                 self.last_date = date_str
+
+             poll_url = f"{base_url}&startdt={date_str}&enddt={date_str}&v={timestamp}"  # Modified this line
+             if not quiet:
+                 print(f"Polling {poll_url}")
+
+             try:
+                 data = await self._fetch_json(session, poll_url)
+                 if data:
+                     current_total = data['hits']['total']['value']
+                     if current_total > self.last_total:
+                         print(f"Found {current_total - self.last_total} new submissions")
+                         self.last_total = current_total
+                         return current_total, data, poll_url
+                     self.last_total = current_total
+             except Exception as e:
+                 print(f"Error in poll: {str(e)}")
+
+             await asyncio.sleep(poll_interval / 1000)
+
+     async def _retrieve_batch(self, session, poll_url, from_positions, quiet):
+         """Retrieve a batch of submissions concurrently."""
+         # The poll_url already contains the timestamp from _poll
+         tasks = [
+             self._fetch_json(
+                 session,
+                 f"{poll_url}&from={pos}"
+             )
+             for pos in from_positions
+         ]
+
+         results = await asyncio.gather(*tasks, return_exceptions=True)
+         submissions = []
+
+         for result in results:
+             if isinstance(result, Exception):
+                 print(f"Error in batch: {str(result)}")
+                 continue
+             if result and 'hits' in result:
+                 submissions.extend(result['hits']['hits'])
+
+         return submissions
+
+     async def _retrieve(self, poll_url, initial_data, session, quiet):
+         """Retrieve all submissions using parallel batch processing."""
+         batch_size = 10  # Number of concurrent requests
+         page_size = 100  # Results per request
+         max_position = min(self.max_hits, self.last_total)
+         submissions = []
+
+         # Process in batches of concurrent requests
+         for batch_start in range(0, max_position, batch_size * page_size):
+             from_positions = [
+                 pos for pos in range(
+                     batch_start,
+                     min(batch_start + batch_size * page_size, max_position),
+                     page_size
+                 )
+             ]
+
+             if not quiet:
+                 print(f"Retrieving batch from positions: {from_positions}")
+
+             batch_submissions = await self._retrieve_batch(
+                 session, poll_url, from_positions, quiet
+             )
+
+             if not batch_submissions:
+                 break
+
+             submissions.extend(batch_submissions)
+
+             # If we got fewer results than expected, we're done
+             if len(batch_submissions) < len(from_positions) * page_size:
+                 break
+
+         return submissions
+
+     async def _monitor(self, callback, form=None, cik=None, ticker=None, poll_interval=1000, quiet=True):
+         """Main monitoring loop with parallel processing."""
+         if poll_interval < 100:
+             raise ValueError("SEC rate limit is 10 requests per second, set poll_interval to 100ms or higher")
+
+         # Handle form parameter
+         if form is None:
+             form = ['-0']
+         elif isinstance(form, str):
+             form = [form]
+
+         # Handle CIK/ticker parameter
+         cik_param = None
+         if ticker is not None:
+             cik_param = identifier_to_cik(ticker)
+         elif cik is not None:
+             cik_param = cik if isinstance(cik, list) else [cik]
+
+         # Construct base URL
+         base_url = 'https://efts.sec.gov/LATEST/search-index?forms=' + ','.join(form)
+
+         # Add CIK parameter if specified
+         if cik_param:
+             cik_list = ','.join(str(c).zfill(10) for c in cik_param)
+             base_url += f"&ciks={cik_list}"
+
+         async with aiohttp.ClientSession(headers=self.headers) as session:
+             while True:
+                 try:
+                     # Poll until we find new submissions
+                     _, data, poll_url = await self._poll(base_url, session, poll_interval, quiet)
+
+                     # Retrieve all submissions in parallel
+                     submissions = await self._retrieve(poll_url, data, session, quiet)
+
+                     # Find new submissions
+                     existing_ids = {sub['_id'] for sub in self.submissions}
+                     new_submissions = [
+                         sub for sub in submissions
+                         if sub['_id'] not in existing_ids
+                     ]
+
+                     if new_submissions:
+                         self.submissions.extend(new_submissions)
+                         if callback:
+                             await callback(new_submissions)
+
+                     reqs_per_sec, mb_per_sec = self.rate_monitor.get_current_rates()
+                     if not quiet:
+                         print(f"Current rates: {reqs_per_sec} req/s, {mb_per_sec} MB/s")
+
+                 except Exception as e:
+                     print(f"Error in monitor: {str(e)}")
+                     await asyncio.sleep(poll_interval / 1000)
+
+                 await asyncio.sleep(poll_interval / 1000)
+
+     def monitor_submissions(self, callback=None, form=None, cik=None, ticker=None, poll_interval=1000, quiet=True):
+         """Start the monitoring process."""
+         asyncio.run(self._monitor(callback, form, cik, ticker, poll_interval, quiet))
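Monitor polls the SEC EFTS full-text search endpoint for the current Eastern-time day, caps retrieval at max_hits, and diffs each poll against the submissions it has already seen before invoking the callback. A minimal usage sketch follows; the import path simply mirrors the new file layout above (1.0.0 may also re-export Monitor at the package level), and the callback must be async because _monitor awaits it.

import asyncio
from datamule.monitor import Monitor  # path assumed from the file list above

async def on_new_submissions(new_submissions):
    # Each hit is an EFTS search result dict; '_id' identifies the filing.
    for sub in new_submissions:
        print(sub["_id"])

monitor = Monitor()
# Blocks and polls every 1000 ms for new 8-K filings from CIK 320193 (example values).
monitor.monitor_submissions(
    callback=on_new_submissions,
    form="8-K",
    cik=320193,
    poll_interval=1000,
    quiet=False,
)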
datamule/mulebot/__init__.py CHANGED
@@ -1 +1 @@
- from .mulebot import MuleBot
+ from .mulebot import MuleBot
datamule/mulebot/helper.py CHANGED
@@ -1,35 +1,35 @@
- import requests
- from datamule.global_vars import headers
- from datamule.helper import identifier_to_cik
- from datamule import Parser
-
- parser = Parser()
-
- def get_company_concept(ticker):
-
-     cik = identifier_to_cik(ticker)[0]
-     url = f'https://data.sec.gov/api/xbrl/companyfacts/CIK{str(cik).zfill(10)}.json'
-     response = requests.get(url,headers=headers)
-     data = response.json()
-
-     table_dict_list = parser.parse_company_concepts(data)
-
-     # drop tables where label is None
-     table_dict_list = [table_dict for table_dict in table_dict_list if table_dict['label'] is not None]
-
-     return table_dict_list
-
- def select_dict_by_title(data, title):
-     if isinstance(data, dict):
-         if data.get('title') == title:
-             return data
-         for value in data.values():
-             result = select_dict_by_title(value, title)
-             if result:
-                 return result
-     elif isinstance(data, list):
-         for item in data:
-             result = select_dict_by_title(item, title)
-             if result:
-                 return result
+ import requests
+ from datamule.global_vars import headers
+ from datamule.helper import identifier_to_cik
+ from datamule import Parser
+
+ parser = Parser()
+
+ def get_company_concept(ticker):
+
+     cik = identifier_to_cik(ticker)[0]
+     url = f'https://data.sec.gov/api/xbrl/companyfacts/CIK{str(cik).zfill(10)}.json'
+     response = requests.get(url,headers=headers)
+     data = response.json()
+
+     table_dict_list = parser.parse_company_concepts(data)
+
+     # drop tables where label is None
+     table_dict_list = [table_dict for table_dict in table_dict_list if table_dict['label'] is not None]
+
+     return table_dict_list
+
+ def select_dict_by_title(data, title):
+     if isinstance(data, dict):
+         if data.get('title') == title:
+             return data
+         for value in data.values():
+             result = select_dict_by_title(value, title)
+             if result:
+                 return result
+     elif isinstance(data, list):
+         for item in data:
+             result = select_dict_by_title(item, title)
+             if result:
+                 return result
      return None
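For context on the mulebot helper shown above: get_company_concept resolves a ticker to a CIK, downloads the SEC companyfacts JSON, and parses it into a list of labeled concept tables, while select_dict_by_title recursively searches a nested structure for a dict whose 'title' matches. A short usage sketch, assuming the pre-1.0 imports above still resolve; it needs network access, and the ticker and label filter are examples only.

# Usage sketch for the helper above (requires network access to data.sec.gov).
tables = get_company_concept("AAPL")          # list of concept-table dicts
print(len(tables), "labeled concept tables")

# Pick tables by their label text (example filter only).
revenue_tables = [t for t in tables if "Revenue" in t["label"]]
for table in revenue_tables[:3]:
    print(table["label"])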