datamule 1.9.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/datamule/datamule_mysql_rds.py +275 -3
- datamule/sheet.py +4 -4
- datamule/submission.py +0 -2
- {datamule-1.9.0.dist-info → datamule-2.0.0.dist-info}/METADATA +1 -1
- {datamule-1.9.0.dist-info → datamule-2.0.0.dist-info}/RECORD +7 -7
- {datamule-1.9.0.dist-info → datamule-2.0.0.dist-info}/WHEEL +0 -0
- {datamule-1.9.0.dist-info → datamule-2.0.0.dist-info}/top_level.txt +0 -0
datamule/datamule/datamule_mysql_rds.py
CHANGED
@@ -1,3 +1,275 @@
-
-
-
+import os
+import asyncio
+import aiohttp
+import json
+import ssl
+import time
+from tqdm import tqdm
+
+class DatamuleMySQL:
+    def __init__(self, api_key=None):
+        self.API_BASE_URL = "https://datamule-mysql-rds.jgfriedman99.workers.dev"
+        self._api_key = api_key
+        self.total_cost = 0
+        self.remaining_balance = None
+        self.start_time = None
+
+    @property
+    def api_key(self):
+        return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')
+
+    @api_key.setter
+    def api_key(self, value):
+        if not value:
+            raise ValueError("API key cannot be empty")
+        self._api_key = value
+
+    async def _fetch_page(self, session, table, database, filters, page=1, page_size=25000):
+        payload = {
+            "table": table,
+            "database": database,
+            "filters": filters,
+            "page": page,
+            "pageSize": page_size
+        }
+
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+        }
+
+        async with session.post(self.API_BASE_URL, json=payload, headers=headers) as response:
+            data = await response.json()
+            if not data.get('success'):
+                raise ValueError(f"API request failed: {data.get('error')}")
+
+            # Track costs and balance
+            billing = data['metadata']['billing']
+            page_cost = billing['total_charge']
+            self.total_cost += page_cost
+            self.remaining_balance = billing['remaining_balance']
+
+            return data['data'], data['metadata']['pagination'], page_cost
+
+    async def execute_query(self, table, **kwargs):
+        if self.api_key is None:
+            raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+        # Extract pagination and display options
+        page_size = kwargs.pop('page_size', 25000)
+        quiet = kwargs.pop('quiet', False)
+
+        # Determine database from table
+        if table == 'simple_xbrl':
+            database = 'xbrl_db'
+        elif table == 'accession_cik':
+            database = 'lookup_db'
+        elif table == 'submission_details':
+            database = 'lookup_db'
+        else:
+            raise ValueError(f"Unsupported table: {table}")
+
+        # Process filters: tuples = range, lists = OR, single = exact
+        filters = {}
+        for key, value in kwargs.items():
+            # Skip None values entirely
+            if value is None:
+                continue
+
+            # Special logic for cik
+            if key == 'cik':
+                if isinstance(value, list):
+                    value = [int(val) for val in value]
+                else:
+                    value = [int(value)]
+                filters[key] = {"type": "or", "values": value}
+            elif isinstance(value, tuple):
+                filters[key] = {"type": "range", "values": list(value)}
+            elif isinstance(value, list):
+                filters[key] = {"type": "or", "values": value}
+            else:
+                filters[key] = {"type": "or", "values": [value]}
+
+        self.start_time = time.time()
+        total_items = 0
+        pages_processed = 0
+
+        # Display query parameters
+        query_desc = [f"Table={table}"]
+        for key, filter_obj in filters.items():
+            if filter_obj["type"] == "range":
+                query_desc.append(f"{key}={filter_obj['values'][0]} to {filter_obj['values'][1]}")
+            elif len(filter_obj["values"]) == 1:
+                query_desc.append(f"{key}={filter_obj['values'][0]}")
+            else:
+                query_desc.append(f"{key}={filter_obj['values']}")
+
+        if not quiet:
+            print(f"QUERY: {', '.join(query_desc)}")
+
+        connector = aiohttp.TCPConnector(ssl=ssl.create_default_context())
+        async with aiohttp.ClientSession(connector=connector) as session:
+            # Initialize progress bar only if not quiet
+            if not quiet:
+                pbar = tqdm(unit="page", bar_format="{desc}: {n_fmt} {unit} [{elapsed}<{remaining}, {rate_fmt}{postfix}]")
+                pbar.set_description("Fetching data")
+
+            current_page = 1
+            has_more = True
+            results = []
+
+            while has_more:
+                # Fetch page
+                page_results, pagination, page_cost = await self._fetch_page(
+                    session,
+                    table=table,
+                    database=database,
+                    filters=filters,
+                    page=current_page,
+                    page_size=page_size
+                )
+
+                # Accumulate results
+                results.extend(page_results)
+
+                pages_processed += 1
+                total_items += len(page_results)
+
+                # Update progress bar only if not quiet
+                if not quiet:
+                    pbar.set_description(f"Fetching data (page {current_page})")
+                    pbar.set_postfix_str(f"cost=${self.total_cost:.4f} | balance=${self.remaining_balance:.2f}")
+                    pbar.update(1)
+
+                # Check if we need to fetch more pages
+                has_more = pagination.get('hasMore', False)
+                current_page += 1
+
+                # For the first page, display record info only if not quiet
+                if pages_processed == 1 and not quiet:
+                    records_per_page = pagination.get('currentPageRecords', len(page_results))
+                    if records_per_page > 0:
+                        pbar.write(f"Retrieved {records_per_page} records (page 1) - Fetching additional pages...")
+                    else:
+                        pbar.write("No records found matching criteria")
+                        break
+
+            if not quiet:
+                pbar.close()
+
+        # Final summary only if not quiet
+        if not quiet:
+            elapsed_time = time.time() - self.start_time
+            print("\nQuery complete:")
+            print(f"- Retrieved {total_items} records across {pages_processed} pages")
+            print(f"- Total cost: ${self.total_cost:.4f}")
+            print(f"- Remaining balance: ${self.remaining_balance:.2f}")
+            print(f"- Time: {elapsed_time:.1f} seconds")
+
+        return results
+
+
+def query_mysql_rds(table, api_key=None, **kwargs):
+    """
+    Query MySQL RDS data from Datamule with optional filtering and automatic pagination
+
+    Parameters:
+    - table: Table name (e.g., 'simple_xbrl')
+    - cik: Company CIK number(s), can be int, string, or list
+    - Any other filter parameters as keyword arguments
+    - page_size: Number of records per page (max 25000, default 25000)
+    - quiet: Boolean, whether to suppress progress output and summary (default False)
+    - api_key: Optional API key (can also use DATAMULE_API_KEY environment variable)
+
+    Filter value types:
+    - Single value: Exact match
+    - List: OR condition (any of the values)
+    - Tuple: Range condition (between first and second values)
+
+    Returns:
+    - List of dictionaries containing the requested data (ready for pandas DataFrame)
+    """
+    # For backwards compatibility, handle non-paginated single requests
+    if kwargs.get('_single_page', False):
+        # Remove the flag and use original synchronous implementation
+        kwargs.pop('_single_page')
+        return _query_mysql_rds_single(table, api_key, **kwargs)
+
+    # Create a DatamuleMySQL instance for this request
+    dm = DatamuleMySQL(api_key=api_key)
+
+    # Run the paginated query and return results
+    return asyncio.run(dm.execute_query(table=table, **kwargs))
+
+
+def _query_mysql_rds_single(table, api_key=None, **kwargs):
+    """Original synchronous implementation for single page requests"""
+    import urllib.request
+    import urllib.error
+
+    endpoint_url = "https://datamule-mysql-rds.jgfriedman99.workers.dev"
+
+    # Get API key from parameter or environment
+    if api_key is None:
+        api_key = os.getenv('DATAMULE_API_KEY')
+
+    if not api_key:
+        return {"error": "API key required. Pass api_key parameter or set DATAMULE_API_KEY environment variable"}
+
+    # Process filters: tuples = range, lists = OR, single = exact
+    filters = {}
+    for key, value in kwargs.items():
+        # Skip None values entirely
+        if value is None:
+            continue
+
+        # special logic for cik
+        if key == 'cik':
+            if isinstance(value, list):
+                value = [int(val) for val in value]
+            else:
+                value = [int(value)]
+            filters[key] = {"type": "or", "values": value}
+        elif isinstance(value, tuple):
+            filters[key] = {"type": "range", "values": list(value)}
+        elif isinstance(value, list):
+            filters[key] = {"type": "or", "values": value}
+        else:
+            filters[key] = {"type": "or", "values": [value]}
+
+    payload = {"filters": filters}
+    # add table to payload
+    payload['table'] = table
+
+    if table == 'simple_xbrl':
+        payload['database'] = 'xbrl_db'
+    else:
+        raise ValueError("table not found")
+
+    data = json.dumps(payload).encode('utf-8')
+    req = urllib.request.Request(
+        endpoint_url,
+        data=data,
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {api_key}",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+        }
+    )
+
+    try:
+        with urllib.request.urlopen(req, timeout=6000) as response:
+            result = json.loads(response.read().decode('utf-8'))
+            # Return just the data for single page requests
+            return result.get('data', []) if result.get('success') else result
+    except urllib.error.HTTPError as e:
+        # Print the error response body
+        error_body = e.read().decode('utf-8')
+        print(f"HTTP Error {e.code}: {error_body}")
+        try:
+            error_json = json.loads(error_body)
+            print(f"Error details: {error_json}")
+        except json.JSONDecodeError:
+            print(f"Raw error response: {error_body}")
+        raise
datamule/sheet.py
CHANGED
@@ -4,7 +4,7 @@ import os
 from .helper import _process_cik_and_metadata_filters, load_package_dataset
 from .sec.xbrl.downloadcompanyfacts import download_company_facts
 from .datamule.datamule_lookup import datamule_lookup
-
+from .datamule.datamule_mysql_rds import query_mysql_rds
 # slated for deprecation?
 from .seclibrary.bq import get_information_table, get_345, get_proxy_voting_record
 
@@ -19,9 +19,9 @@ class Sheet:
         return datamule_lookup(cik, accession_number, submission_type, filing_date,
                                columns, distinct, page_size, quiet, api_key)
 
-
-
-
+    def get_table(self,table,cik=None,ticker=None,**kwargs):
+        cik = _process_cik_and_metadata_filters(cik, ticker)
+        return query_mysql_rds(table=table,cik=cik,**kwargs)
 
     def download_xbrl(
         self,
{datamule-1.9.0.dist-info → datamule-2.0.0.dist-info}/RECORD
CHANGED
@@ -5,12 +5,12 @@ datamule/index.py,sha256=Rrcna9FJV-Oh_K6O2IuUEIDmtay_7UZ4l4jgKCi7A7I,2079
 datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
 datamule/portfolio.py,sha256=YViG1JgJ9SFhg8N3tOOhBI8oc6Pmi2vwnHeHmlkC_5U,12119
 datamule/portfolio_compression_utils.py,sha256=8OPYEN5zAdV1FiTxgVN3S7cTKs99Elv74bwgoIJP4QY,12654
-datamule/sheet.py,sha256=
-datamule/submission.py,sha256=
+datamule/sheet.py,sha256=GnF9wA42iDw6purPmgshALymFssBp7gjmfjIs86CNJY,22997
+datamule/submission.py,sha256=TkD_SVCEGjxOmHm5hjQm69j8DqQWr3YtgjTdKRWm26k,11205
 datamule/data/listed_filer_metadata.csv,sha256=dT9fQ8AC5P1-Udf_UF0ZkdXJ88jNxJb_tuhi5YYL1rc,2426827
 datamule/datamule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/datamule/datamule_lookup.py,sha256=e8djAg-ctSyHiKk7BjbtgugZ3p8roUjzsym5z3AihUg,9468
-datamule/datamule/datamule_mysql_rds.py,sha256=
+datamule/datamule/datamule_mysql_rds.py,sha256=Oj_xPTBKkzWsuRlb_tphjJrBW1eua1cOuxjGwJx581k,10591
 datamule/datamule/downloader.py,sha256=IbeBkvc4-xefHq37qktTxzCXh90cG8ayx80qQWehRvU,18527
 datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
 datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -65,7 +65,7 @@ datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,180
 datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
 datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
-datamule-
-datamule-
-datamule-
-datamule-
+datamule-2.0.0.dist-info/METADATA,sha256=Gg6gAtm4lGxYuXLiqNB8VtP0pb1A922QvKRw9kgCegk,560
+datamule-2.0.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-2.0.0.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-2.0.0.dist-info/RECORD,,
File without changes
|
File without changes
|