datamule 1.8.6__tar.gz → 2.0.0__tar.gz
This diff compares the contents of two publicly released versions of the package, exactly as they appear in their respective public registries. It is provided for informational purposes only.
- {datamule-1.8.6 → datamule-2.0.0}/PKG-INFO +2 -1
- {datamule-1.8.6/datamule/seclibrary → datamule-2.0.0/datamule/datamule}/datamule_lookup.py +0 -1
- datamule-2.0.0/datamule/datamule/datamule_mysql_rds.py +275 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/document.py +55 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/portfolio.py +1 -4
- {datamule-1.8.6 → datamule-2.0.0}/datamule/sheet.py +7 -2
- {datamule-1.8.6 → datamule-2.0.0}/datamule/submission.py +0 -2
- {datamule-1.8.6 → datamule-2.0.0}/datamule.egg-info/PKG-INFO +2 -1
- {datamule-1.8.6 → datamule-2.0.0}/datamule.egg-info/SOURCES.txt +3 -3
- {datamule-1.8.6 → datamule-2.0.0}/datamule.egg-info/requires.txt +1 -0
- {datamule-1.8.6 → datamule-2.0.0}/setup.py +2 -1
- datamule-1.8.6/datamule/seclibrary/query.py +0 -181
- {datamule-1.8.6 → datamule-2.0.0}/datamule/__init__.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/config.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/datamule/__init__.py +0 -0
- {datamule-1.8.6/datamule/seclibrary → datamule-2.0.0/datamule/datamule}/downloader.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/datamule/sec_connector.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/__init__.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/__init__.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/atsn.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/cfportal.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/d.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/ex102_abs.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/ex99a_sdr.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/ex99c_sdr.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/ex99g_sdr.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/ex99i_sdr.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/information_table.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/nmfp.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/npx.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/onefourtyfour.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/ownership.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/proxy_voting_record.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/sbs.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/sbsef.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/schedule13.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/sdr.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/submission_metadata.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/ta.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/thirteenfhr.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/twentyfivense.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/mappings/twentyfourf2nt.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/processing.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/document/table.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/helper.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/index.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/package_updater.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/portfolio_compression_utils.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/sec/__init__.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/sec/utils.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/seclibrary/__init__.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/seclibrary/bq.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/utils/__init__.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/utils/construct_submissions_data.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule/utils/format_accession.py +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/datamule.egg-info/top_level.txt +0 -0
- {datamule-1.8.6 → datamule-2.0.0}/setup.cfg +0 -0
{datamule-1.8.6 → datamule-2.0.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.8.6
+Version: 2.0.0
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -18,3 +18,4 @@ Requires-Dist: doc2dict
 Requires-Dist: secxbrl
 Requires-Dist: secsgml
 Requires-Dist: websocket-client
+Requires-Dist: company_fundamentals
datamule-2.0.0/datamule/datamule/datamule_mysql_rds.py (new file)

@@ -0,0 +1,275 @@
+import os
+import asyncio
+import aiohttp
+import json
+import ssl
+import time
+from tqdm import tqdm
+
+class DatamuleMySQL:
+    def __init__(self, api_key=None):
+        self.API_BASE_URL = "https://datamule-mysql-rds.jgfriedman99.workers.dev"
+        self._api_key = api_key
+        self.total_cost = 0
+        self.remaining_balance = None
+        self.start_time = None
+
+    @property
+    def api_key(self):
+        return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')
+
+    @api_key.setter
+    def api_key(self, value):
+        if not value:
+            raise ValueError("API key cannot be empty")
+        self._api_key = value
+
+    async def _fetch_page(self, session, table, database, filters, page=1, page_size=25000):
+        payload = {
+            "table": table,
+            "database": database,
+            "filters": filters,
+            "page": page,
+            "pageSize": page_size
+        }
+
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+        }
+
+        async with session.post(self.API_BASE_URL, json=payload, headers=headers) as response:
+            data = await response.json()
+            if not data.get('success'):
+                raise ValueError(f"API request failed: {data.get('error')}")
+
+            # Track costs and balance
+            billing = data['metadata']['billing']
+            page_cost = billing['total_charge']
+            self.total_cost += page_cost
+            self.remaining_balance = billing['remaining_balance']
+
+            return data['data'], data['metadata']['pagination'], page_cost
+
+    async def execute_query(self, table, **kwargs):
+        if self.api_key is None:
+            raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
+
+        # Extract pagination and display options
+        page_size = kwargs.pop('page_size', 25000)
+        quiet = kwargs.pop('quiet', False)
+
+        # Determine database from table
+        if table == 'simple_xbrl':
+            database = 'xbrl_db'
+        elif table == 'accession_cik':
+            database = 'lookup_db'
+        elif table == 'submission_details':
+            database = 'lookup_db'
+        else:
+            raise ValueError(f"Unsupported table: {table}")
+
+        # Process filters: tuples = range, lists = OR, single = exact
+        filters = {}
+        for key, value in kwargs.items():
+            # Skip None values entirely
+            if value is None:
+                continue
+
+            # Special logic for cik
+            if key == 'cik':
+                if isinstance(value, list):
+                    value = [int(val) for val in value]
+                else:
+                    value = [int(value)]
+                filters[key] = {"type": "or", "values": value}
+            elif isinstance(value, tuple):
+                filters[key] = {"type": "range", "values": list(value)}
+            elif isinstance(value, list):
+                filters[key] = {"type": "or", "values": value}
+            else:
+                filters[key] = {"type": "or", "values": [value]}
+
+        self.start_time = time.time()
+        total_items = 0
+        pages_processed = 0
+
+        # Display query parameters
+        query_desc = [f"Table={table}"]
+        for key, filter_obj in filters.items():
+            if filter_obj["type"] == "range":
+                query_desc.append(f"{key}={filter_obj['values'][0]} to {filter_obj['values'][1]}")
+            elif len(filter_obj["values"]) == 1:
+                query_desc.append(f"{key}={filter_obj['values'][0]}")
+            else:
+                query_desc.append(f"{key}={filter_obj['values']}")
+
+        if not quiet:
+            print(f"QUERY: {', '.join(query_desc)}")
+
+        connector = aiohttp.TCPConnector(ssl=ssl.create_default_context())
+        async with aiohttp.ClientSession(connector=connector) as session:
+            # Initialize progress bar only if not quiet
+            if not quiet:
+                pbar = tqdm(unit="page", bar_format="{desc}: {n_fmt} {unit} [{elapsed}<{remaining}, {rate_fmt}{postfix}]")
+                pbar.set_description("Fetching data")
+
+            current_page = 1
+            has_more = True
+            results = []
+
+            while has_more:
+                # Fetch page
+                page_results, pagination, page_cost = await self._fetch_page(
+                    session,
+                    table=table,
+                    database=database,
+                    filters=filters,
+                    page=current_page,
+                    page_size=page_size
+                )
+
+                # Accumulate results
+                results.extend(page_results)
+
+                pages_processed += 1
+                total_items += len(page_results)
+
+                # Update progress bar only if not quiet
+                if not quiet:
+                    pbar.set_description(f"Fetching data (page {current_page})")
+                    pbar.set_postfix_str(f"cost=${self.total_cost:.4f} | balance=${self.remaining_balance:.2f}")
+                    pbar.update(1)
+
+                # Check if we need to fetch more pages
+                has_more = pagination.get('hasMore', False)
+                current_page += 1
+
+                # For the first page, display record info only if not quiet
+                if pages_processed == 1 and not quiet:
+                    records_per_page = pagination.get('currentPageRecords', len(page_results))
+                    if records_per_page > 0:
+                        pbar.write(f"Retrieved {records_per_page} records (page 1) - Fetching additional pages...")
+                    else:
+                        pbar.write("No records found matching criteria")
+                        break
+
+            if not quiet:
+                pbar.close()
+
+        # Final summary only if not quiet
+        if not quiet:
+            elapsed_time = time.time() - self.start_time
+            print("\nQuery complete:")
+            print(f"- Retrieved {total_items} records across {pages_processed} pages")
+            print(f"- Total cost: ${self.total_cost:.4f}")
+            print(f"- Remaining balance: ${self.remaining_balance:.2f}")
+            print(f"- Time: {elapsed_time:.1f} seconds")
+
+        return results
+
+
+def query_mysql_rds(table, api_key=None, **kwargs):
+    """
+    Query MySQL RDS data from Datamule with optional filtering and automatic pagination
+
+    Parameters:
+    - table: Table name (e.g., 'simple_xbrl')
+    - cik: Company CIK number(s), can be int, string, or list
+    - Any other filter parameters as keyword arguments
+    - page_size: Number of records per page (max 25000, default 25000)
+    - quiet: Boolean, whether to suppress progress output and summary (default False)
+    - api_key: Optional API key (can also use DATAMULE_API_KEY environment variable)
+
+    Filter value types:
+    - Single value: Exact match
+    - List: OR condition (any of the values)
+    - Tuple: Range condition (between first and second values)
+
+    Returns:
+    - List of dictionaries containing the requested data (ready for pandas DataFrame)
+    """
+    # For backwards compatibility, handle non-paginated single requests
+    if kwargs.get('_single_page', False):
+        # Remove the flag and use original synchronous implementation
+        kwargs.pop('_single_page')
+        return _query_mysql_rds_single(table, api_key, **kwargs)
+
+    # Create a DatamuleMySQL instance for this request
+    dm = DatamuleMySQL(api_key=api_key)
+
+    # Run the paginated query and return results
+    return asyncio.run(dm.execute_query(table=table, **kwargs))
+
+
+def _query_mysql_rds_single(table, api_key=None, **kwargs):
+    """Original synchronous implementation for single page requests"""
+    import urllib.request
+    import urllib.error
+
+    endpoint_url = "https://datamule-mysql-rds.jgfriedman99.workers.dev"
+
+    # Get API key from parameter or environment
+    if api_key is None:
+        api_key = os.getenv('DATAMULE_API_KEY')
+
+    if not api_key:
+        return {"error": "API key required. Pass api_key parameter or set DATAMULE_API_KEY environment variable"}
+
+    # Process filters: tuples = range, lists = OR, single = exact
+    filters = {}
+    for key, value in kwargs.items():
+        # Skip None values entirely
+        if value is None:
+            continue
+
+        # special logic for cik
+        if key == 'cik':
+            if isinstance(value, list):
+                value = [int(val) for val in value]
+            else:
+                value = [int(value)]
+            filters[key] = {"type": "or", "values": value}
+        elif isinstance(value, tuple):
+            filters[key] = {"type": "range", "values": list(value)}
+        elif isinstance(value, list):
+            filters[key] = {"type": "or", "values": value}
+        else:
+            filters[key] = {"type": "or", "values": [value]}
+
+    payload = {"filters": filters}
+    # add table to payload
+    payload['table'] = table
+
+    if table == 'simple_xbrl':
+        payload['database'] = 'xbrl_db'
+    else:
+        raise ValueError("table not found")
+
+    data = json.dumps(payload).encode('utf-8')
+    req = urllib.request.Request(
+        endpoint_url,
+        data=data,
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {api_key}",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+        }
+    )
+
+    try:
+        with urllib.request.urlopen(req, timeout=6000) as response:
+            result = json.loads(response.read().decode('utf-8'))
+            # Return just the data for single page requests
+            return result.get('data', []) if result.get('success') else result
+    except urllib.error.HTTPError as e:
+        # Print the error response body
+        error_body = e.read().decode('utf-8')
+        print(f"HTTP Error {e.code}: {error_body}")
+        try:
+            error_json = json.loads(error_body)
+            print(f"Error details: {error_json}")
+        except json.JSONDecodeError:
+            print(f"Raw error response: {error_body}")
+        raise
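Taken with the docstring above, usage of the new entry point would look roughly like the sketch below. Assumptions: DATAMULE_API_KEY is set in the environment, the module is importable at the path SOURCES.txt shows further down, and the CIK values are purely illustrative.

import pandas as pd
from datamule.datamule.datamule_mysql_rds import query_mysql_rds

# Single values match exactly, lists become OR conditions, tuples become ranges.
records = query_mysql_rds(
    table='simple_xbrl',       # routed to 'xbrl_db' by execute_query
    cik=[320193, 789019],      # list -> OR condition; values are coerced to int
    quiet=True,                # suppress the tqdm progress bar and summary
)
df = pd.DataFrame(records)     # per the docstring, rows are DataFrame-ready dicts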
{datamule-1.8.6 → datamule-2.0.0}/datamule/document/document.py

@@ -13,6 +13,7 @@ from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
 from secxbrl import parse_inline_xbrl
+from company_fundamentals import construct_fundamentals
 
 class Document:
     def __init__(self, type, content, extension,accession,filing_date,path=None):
@@ -35,6 +36,7 @@ class Document:
         # this will be filled by parsed
         self.data = None
         self.xbrl = None
+        self.fundamentals = None
 
     #_load_text_content
     def _preprocess_txt_content(self):
@@ -113,6 +115,59 @@ class Document:
             self.xbrl = parse_inline_xbrl(self.content)
         else:
             raise ValueError("Only inline has been implemented so far.")
+
+    def parse_fundamentals(self,categories=None):
+        self.parse_xbrl()
+        # Transform XBRL records into the format needed by construct_fundamentals
+        xbrl = []
+
+        for xbrl_record in self.xbrl:
+            try:
+                # Extract basic fields
+                value = xbrl_record.get('_val', None)
+                taxonomy, name = xbrl_record['_attributes']['name'].split(':')
+
+                # Handle scaling if present
+                if xbrl_record.get('_attributes', {}).get('scale') is not None:
+                    scale = int(xbrl_record['_attributes']['scale'])
+                    try:
+                        value = str(Decimal(value.replace(',', '')) * (Decimal(10) ** scale))
+                    except:
+                        pass
+
+                # Extract period dates
+                period_start_date = None
+                period_end_date = None
+
+                if xbrl_record.get('_context'):
+                    context = xbrl_record['_context']
+                    period_start_date = context.get('context_period_instant') or context.get('context_period_startdate')
+                    period_end_date = context.get('context_period_enddate')
+
+                # Create record in the format expected by construct_fundamentals
+                record = {
+                    'taxonomy': taxonomy,
+                    'name': name,
+                    'value': value,
+                    'period_start_date': period_start_date,
+                    'period_end_date': period_end_date
+                }
+
+                xbrl.append(record)
+
+            except Exception as e:
+                # Skip malformed records
+                continue
+
+        # Call construct_fundamentals with the transformed data
+        fundamentals = construct_fundamentals(xbrl,
+                                              taxonomy_key='taxonomy',
+                                              concept_key='name',
+                                              start_date_key='period_start_date',
+                                              end_date_key='period_end_date',
+                                              categories=categories)
+
+        self.fundamentals = fundamentals
 
     # Note: this method will be heavily modified in the future
     def parse(self):
{datamule-1.8.6 → datamule-2.0.0}/datamule/portfolio.py

@@ -9,14 +9,11 @@ import os
 import tarfile
 from threading import Lock
 from .helper import _process_cik_and_metadata_filters
-from .seclibrary.downloader import download as seclibrary_download
+from .datamule.downloader import download as seclibrary_download
 from .sec.xbrl.filter_xbrl import filter_xbrl
 from .sec.submissions.monitor import Monitor
 from .portfolio_compression_utils import CompressionManager
 from .datamule.sec_connector import SecConnector
-from secsgml.utils import bytes_to_str, calculate_documents_locations_in_tar
-import json
-import io
 import shutil
 
 
{datamule-1.8.6 → datamule-2.0.0}/datamule/sheet.py

@@ -3,8 +3,8 @@ import csv
 import os
 from .helper import _process_cik_and_metadata_filters, load_package_dataset
 from .sec.xbrl.downloadcompanyfacts import download_company_facts
-from .seclibrary.datamule_lookup import datamule_lookup
-from .seclibrary.query import query
+from .datamule.datamule_lookup import datamule_lookup
+from .datamule.datamule_mysql_rds import query_mysql_rds
 # slated for deprecation?
 from .seclibrary.bq import get_information_table, get_345, get_proxy_voting_record
 
@@ -12,11 +12,16 @@ class Sheet:
     def __init__(self, path):
         self.path = Path(path)
 
+    # Keep
     def get_submissions(self,cik=None, accession_number=None, submission_type=None, filing_date=None,
                         columns=None, distinct=False, page_size=25000, quiet=False, api_key=None):
 
         return datamule_lookup(cik, accession_number, submission_type, filing_date,
                                columns, distinct, page_size, quiet, api_key)
+
+    def get_table(self,table,cik=None,ticker=None,**kwargs):
+        cik = _process_cik_and_metadata_filters(cik, ticker)
+        return query_mysql_rds(table=table,cik=cik,**kwargs)
 
     def download_xbrl(
         self,
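The corresponding high-level entry point, sketched under the assumptions that Sheet is re-exported from the package root and that ticker-to-CIK resolution goes through _process_cik_and_metadata_filters as wired above:

from datamule import Sheet    # assumption: Sheet is exported at the package level

sheet = Sheet('output')       # placeholder path
rows = sheet.get_table(table='simple_xbrl', ticker='AAPL', quiet=True)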
{datamule-1.8.6 → datamule-2.0.0}/datamule.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.8.6
+Version: 2.0.0
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -18,3 +18,4 @@ Requires-Dist: doc2dict
 Requires-Dist: secxbrl
 Requires-Dist: secsgml
 Requires-Dist: websocket-client
+Requires-Dist: company_fundamentals
{datamule-1.8.6 → datamule-2.0.0}/datamule.egg-info/SOURCES.txt

@@ -15,6 +15,9 @@ datamule.egg-info/requires.txt
 datamule.egg-info/top_level.txt
 datamule/data/listed_filer_metadata.csv
 datamule/datamule/__init__.py
+datamule/datamule/datamule_lookup.py
+datamule/datamule/datamule_mysql_rds.py
+datamule/datamule/downloader.py
 datamule/datamule/sec_connector.py
 datamule/document/__init__.py
 datamule/document/document.py
@@ -65,9 +68,6 @@ datamule/sec/xbrl/streamcompanyfacts.py
 datamule/sec/xbrl/xbrlmonitor.py
 datamule/seclibrary/__init__.py
 datamule/seclibrary/bq.py
-datamule/seclibrary/datamule_lookup.py
-datamule/seclibrary/downloader.py
-datamule/seclibrary/query.py
 datamule/utils/__init__.py
 datamule/utils/construct_submissions_data.py
 datamule/utils/format_accession.py
{datamule-1.8.6 → datamule-2.0.0}/setup.py

@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="1.8.6",
+    version="2.0.0",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",
@@ -51,6 +51,7 @@ setup(
         'secxbrl',
         'secsgml',
         'websocket-client',
+        'company_fundamentals'
     ],
     # Include the data directory in the package
     package_data={
datamule-1.8.6/datamule/seclibrary/query.py (deleted)

@@ -1,181 +0,0 @@
-import os
-import asyncio
-import aiohttp
-import urllib.parse
-import ssl
-import json
-import time
-from tqdm import tqdm
-
-class Query:
-    def __init__(self, api_key=None):
-        self.API_BASE_URL = "https://sec-library.jgfriedman99.workers.dev/"
-        self._api_key = api_key
-        self.total_cost = 0
-        self.remaining_balance = None
-        self.start_time = None
-
-    @property
-    def api_key(self):
-        return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')
-
-    @api_key.setter
-    def api_key(self, value):
-        if not value:
-            raise ValueError("API key cannot be empty")
-        self._api_key = value
-
-    async def _fetch_page(self, session, submission_type=None, cik=None, filing_date=None, page=1):
-        params = {
-            'api_key': self.api_key,
-            'page': page
-        }
-
-        # Handle submission_type parameter
-        if submission_type:
-            if isinstance(submission_type, list):
-                params['submission_type'] = ','.join(str(x) for x in submission_type)
-            else:
-                params['submission_type'] = str(submission_type)
-
-        # Handle CIK parameter
-        if cik:
-            if isinstance(cik, list):
-                params['cik'] = ','.join(str(x) for x in cik)
-            else:
-                params['cik'] = str(cik)
-
-        # Handle filing_date parameter
-        if filing_date:
-            if isinstance(filing_date, tuple):
-                params['startdt'] = str(filing_date[0])
-                params['enddt'] = str(filing_date[1])
-            else:
-                if isinstance(filing_date, list):
-                    params['filing_date'] = ','.join(str(x) for x in filing_date)
-                else:
-                    params['filing_date'] = str(filing_date)
-
-        url = f"{self.API_BASE_URL}?{urllib.parse.urlencode(params)}"
-
-        async with session.get(url) as response:
-            data = await response.json()
-            if not data.get('success'):
-                raise ValueError(f"API request failed: {data.get('error')}")
-
-            # Track costs and balance
-            charges = data['metadata']['billing']['charges']
-            page_cost = charges['total']
-            self.total_cost += page_cost
-            self.remaining_balance = data['metadata']['billing']['remaining_balance']
-
-            return data['data'], data['metadata']['pagination'], page_cost
-
-    async def execute_query(self, submission_type=None, cik=None, filing_date=None):
-        if self.api_key is None:
-            raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
-
-        self.start_time = time.time()
-        total_items = 0
-        pages_processed = 0
-
-        # Display query parameters
-        query_desc = []
-        if cik:
-            query_desc.append(f"CIK={cik}")
-        if submission_type:
-            query_desc.append(f"Type={submission_type}")
-        if filing_date:
-            if isinstance(filing_date, tuple):
-                query_desc.append(f"Date={filing_date[0]} to {filing_date[1]}")
-            else:
-                query_desc.append(f"Date={filing_date}")
-
-        if query_desc:
-            print(f"QUERY: {', '.join(query_desc)}")
-
-        connector = aiohttp.TCPConnector(ssl=ssl.create_default_context())
-        async with aiohttp.ClientSession(connector=connector) as session:
-            # Initialize progress bar with a custom format to avoid extra colons
-            pbar = tqdm(unit="page", bar_format="{desc}: {n_fmt} {unit} [{elapsed}<{remaining}, {rate_fmt}{postfix}]")
-            pbar.set_description("Fetching data")
-
-            current_page = 1
-            has_more = True
-            results = []
-
-            while has_more:
-                # Fetch page
-                page_results, pagination, page_cost = await self._fetch_page(session,
-                                                                             submission_type=submission_type,
-                                                                             cik=cik,
-                                                                             filing_date=filing_date,
-                                                                             page=current_page)
-
-                # Accumulate results
-                results.extend(page_results)
-
-                pages_processed += 1
-                total_items += len(page_results)
-
-                # Update progress bar with cleaner format
-                pbar.set_description(f"Fetching data (page {current_page})")
-                pbar.set_postfix_str(f"cost=${self.total_cost:.2f} | balance=${self.remaining_balance:.2f}")
-                pbar.update(1)
-
-                # Check if we need to fetch more pages
-                has_more = pagination.get('hasMore', False)
-                current_page += 1
-
-                # For the first page, display record info using pbar.write instead of print
-                if pages_processed == 1:
-                    records_per_page = pagination.get('currentPageRecords', len(page_results))
-                    total_records = pagination.get('totalRecords', None)
-                    if total_records:
-                        pbar.write(f"Retrieved {records_per_page} records (page 1) of {total_records} total - Fetching additional pages...")
-                    else:
-                        pbar.write(f"Retrieved {records_per_page} records (page 1) - Fetching additional pages...")
-
-            pbar.close()
-
-            # Final summary
-            elapsed_time = time.time() - self.start_time
-            print("\nQuery complete:")
-            print(f"- Retrieved {total_items} filings across {pages_processed} pages")
-            print(f"- Total cost: ${self.total_cost:.2f}")
-            print(f"- Remaining balance: ${self.remaining_balance:.2f}")
-            print(f"- Time: {elapsed_time:.1f} seconds")
-
-            return results
-
-
-def query(cik=None, submission_type=None, filing_date=None, api_key=None):
-    """
-    Query SEC filings data with optional filtering
-
-    Parameters:
-    - cik: Company CIK number(s), can be string, int, or list
-    - submission_type: Filing type(s), can be string or list (e.g., '10-K', ['10-K', '10-Q'])
-    - filing_date: Filing date(s), can be string, list, or tuple of (start_date, end_date)
-    - api_key: Optional API key (can also use DATAMULE_API_KEY environment variable)
-
-    Returns:
-    - List of all matching submission data
-    """
-    # Create a Query instance for this request
-    q = Query(api_key=api_key)
-    # remove dash from filing_date
-    if isinstance(filing_date, tuple):
-        filing_date = (filing_date[0].replace('-', ''), filing_date[1].replace('-', ''))
-    elif isinstance(filing_date, str):
-        filing_date = filing_date.replace('-', '')
-    elif isinstance(filing_date, list):
-        filing_date = [x.replace('-', '') for x in filing_date]
-
-    print(filing_date)
-    # Run the query and return results
-    return asyncio.run(q.execute_query(
-        submission_type=submission_type,
-        cik=cik,
-        filing_date=filing_date
-    ))
All other files listed above are unchanged between 1.8.6 and 2.0.0.
|