datamule 1.1.8__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/__init__.py +3 -1
- datamule/document/__init__.py +0 -0
- datamule/document/document.py +255 -0
- datamule/document/processing.py +604 -0
- datamule/document/table.py +260 -0
- datamule/package_updater.py +30 -0
- datamule/portfolio.py +5 -3
- datamule/sec/submissions/downloader.py +14 -37
- datamule/seclibrary/bq.py +349 -12
- datamule/seclibrary/downloader.py +50 -9
- datamule/sheet.py +458 -34
- datamule/submission.py +102 -7
- {datamule-1.1.8.dist-info → datamule-1.2.2.dist-info}/METADATA +1 -1
- {datamule-1.1.8.dist-info → datamule-1.2.2.dist-info}/RECORD +16 -12
- {datamule-1.1.8.dist-info → datamule-1.2.2.dist-info}/WHEEL +1 -1
- datamule/document.py +0 -472
- {datamule-1.1.8.dist-info → datamule-1.2.2.dist-info}/top_level.txt +0 -0
datamule/seclibrary/bq.py
CHANGED
@@ -3,9 +3,6 @@ import requests
|
|
3
3
|
import json
|
4
4
|
|
5
5
|
def get_information_table(
|
6
|
-
# Required parameters
|
7
|
-
table_type="INFORMATION_TABLE",
|
8
|
-
|
9
6
|
# Optional filtering parameters
|
10
7
|
columns=None,
|
11
8
|
name_of_issuer=None,
|
@@ -37,8 +34,6 @@ def get_information_table(
|
|
37
34
|
|
38
35
|
Parameters:
|
39
36
|
-----------
|
40
|
-
table_type : str
|
41
|
-
The table to query (default is "INFORMATION_TABLE")
|
42
37
|
columns : List[str], optional
|
43
38
|
Specific columns to return. If None, all columns are returned.
|
44
39
|
|
@@ -76,7 +71,7 @@ def get_information_table(
|
|
76
71
|
raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key parameter")
|
77
72
|
|
78
73
|
# 2. Build query parameters
|
79
|
-
params = {'table_type':
|
74
|
+
params = {'table_type': 'INFORMATION_TABLE'}
|
80
75
|
|
81
76
|
# Add columns parameter if provided
|
82
77
|
if columns:
|
@@ -138,7 +133,198 @@ def get_information_table(
|
|
138
133
|
# Exact match
|
139
134
|
params[api_param_name] = value
|
140
135
|
|
141
|
-
#
|
136
|
+
# Call common function to make API request
|
137
|
+
return _make_api_request(params, api_key, print_cost, verbose)
|
138
|
+
|
139
|
+
def get_345(
|
140
|
+
# Optional filtering parameters
|
141
|
+
columns=None,
|
142
|
+
is_derivative=None,
|
143
|
+
is_non_derivative=None,
|
144
|
+
security_title=None,
|
145
|
+
transaction_date=None,
|
146
|
+
document_type=None,
|
147
|
+
transaction_code=None,
|
148
|
+
equity_swap_involved=None,
|
149
|
+
transaction_timeliness=None,
|
150
|
+
transaction_shares=None,
|
151
|
+
transaction_price_per_share=None,
|
152
|
+
shares_owned_following_transaction=None,
|
153
|
+
ownership_type=None,
|
154
|
+
deemed_execution_date=None,
|
155
|
+
conversion_or_exercise_price=None,
|
156
|
+
exercise_date=None,
|
157
|
+
expiration_date=None,
|
158
|
+
underlying_security_title=None,
|
159
|
+
underlying_security_shares=None,
|
160
|
+
underlying_security_value=None,
|
161
|
+
accession=None,
|
162
|
+
reporting_owner_cik=None,
|
163
|
+
issuer_cik=None,
|
164
|
+
filing_date=None,
|
165
|
+
|
166
|
+
# API key handling
|
167
|
+
api_key=None,
|
168
|
+
|
169
|
+
# Additional options
|
170
|
+
print_cost=True,
|
171
|
+
verbose=False
|
172
|
+
):
|
173
|
+
"""
|
174
|
+
Query the SEC BigQuery API for Form 345 insider transaction data.
|
175
|
+
|
176
|
+
Parameters:
|
177
|
+
-----------
|
178
|
+
columns : List[str], optional
|
179
|
+
Specific columns to return. If None, all columns are returned.
|
180
|
+
|
181
|
+
# Filter parameters
|
182
|
+
is_derivative, security_title, etc. : Various filters that can be:
|
183
|
+
- str/bool: Exact match
|
184
|
+
- List[str]: Match any in list
|
185
|
+
- tuple: (min, max) range for numeric/date fields
|
186
|
+
|
187
|
+
reporting_owner_cik : str or List[str]
|
188
|
+
CIK(s) of the reporting insider(s). This is matched against an array in BigQuery.
|
189
|
+
Any match within the array will return the record.
|
190
|
+
|
191
|
+
issuer_cik : str or List[str]
|
192
|
+
CIK(s) of the company/companies
|
193
|
+
|
194
|
+
api_key : str, optional
|
195
|
+
SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
|
196
|
+
print_cost : bool
|
197
|
+
Whether to print the query cost information
|
198
|
+
verbose : bool
|
199
|
+
Whether to print additional information about the query
|
200
|
+
|
201
|
+
Returns:
|
202
|
+
--------
|
203
|
+
List[Dict]
|
204
|
+
A list of dictionaries containing the query results
|
205
|
+
|
206
|
+
Raises:
|
207
|
+
-------
|
208
|
+
ValueError
|
209
|
+
If API key is missing or invalid
|
210
|
+
Exception
|
211
|
+
For API errors or other issues
|
212
|
+
"""
|
213
|
+
|
214
|
+
# 1. Handle API key
|
215
|
+
if api_key is None:
|
216
|
+
api_key = os.getenv('DATAMULE_API_KEY')
|
217
|
+
|
218
|
+
if not api_key:
|
219
|
+
raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key parameter")
|
220
|
+
|
221
|
+
# 2. Build query parameters
|
222
|
+
params = {'table_type': 'FORM_345_TABLE'}
|
223
|
+
|
224
|
+
# Add columns parameter if provided
|
225
|
+
if columns:
|
226
|
+
if isinstance(columns, list):
|
227
|
+
params['columns'] = ','.join(columns)
|
228
|
+
else:
|
229
|
+
params['columns'] = columns
|
230
|
+
|
231
|
+
# Map Python parameter names to API parameter names
|
232
|
+
param_mapping = {
|
233
|
+
'is_derivative': 'isDerivative',
|
234
|
+
'is_non_derivative': 'isNonDerivative',
|
235
|
+
'security_title': 'securityTitle',
|
236
|
+
'transaction_date': 'transactionDate',
|
237
|
+
'document_type': 'documentType',
|
238
|
+
'transaction_code': 'transactionCode',
|
239
|
+
'equity_swap_involved': 'equitySwapInvolved',
|
240
|
+
'transaction_timeliness': 'transactionTimeliness',
|
241
|
+
'transaction_shares': 'transactionShares',
|
242
|
+
'transaction_price_per_share': 'transactionPricePerShare',
|
243
|
+
'shares_owned_following_transaction': 'sharesOwnedFollowingTransaction',
|
244
|
+
'ownership_type': 'ownershipType',
|
245
|
+
'deemed_execution_date': 'deemedExecutionDate',
|
246
|
+
'conversion_or_exercise_price': 'conversionOrExercisePrice',
|
247
|
+
'exercise_date': 'exerciseDate',
|
248
|
+
'expiration_date': 'expirationDate',
|
249
|
+
'underlying_security_title': 'underlyingSecurityTitle',
|
250
|
+
'underlying_security_shares': 'underlyingSecurityShares',
|
251
|
+
'underlying_security_value': 'underlyingSecurityValue',
|
252
|
+
'accession': 'accession',
|
253
|
+
'reporting_owner_cik': 'reportingOwnerCIK',
|
254
|
+
'issuer_cik': 'issuerCIK',
|
255
|
+
'filing_date': 'filingDate'
|
256
|
+
}
|
257
|
+
|
258
|
+
# Process all possible filter parameters
|
259
|
+
date_params = ['transaction_date', 'filing_date', 'deemed_execution_date', 'exercise_date', 'expiration_date']
|
260
|
+
boolean_params = ['is_derivative', 'is_non_derivative']
|
261
|
+
|
262
|
+
for param_name, api_param_name in param_mapping.items():
|
263
|
+
value = locals()[param_name]
|
264
|
+
if value is not None:
|
265
|
+
# Handle different filter types
|
266
|
+
if isinstance(value, list):
|
267
|
+
# List filter
|
268
|
+
params[api_param_name] = f"[{','.join(str(v) for v in value)}]"
|
269
|
+
elif isinstance(value, tuple):
|
270
|
+
# Range filter
|
271
|
+
if len(value) == 2:
|
272
|
+
min_val, max_val = value
|
273
|
+
# Handle date range specially
|
274
|
+
if param_name in date_params:
|
275
|
+
# Dates need to be in quotes within the parentheses
|
276
|
+
if min_val is None:
|
277
|
+
min_val = ''
|
278
|
+
else:
|
279
|
+
min_val = f"'{min_val}'"
|
280
|
+
|
281
|
+
if max_val is None:
|
282
|
+
max_val = ''
|
283
|
+
else:
|
284
|
+
max_val = f"'{max_val}'"
|
285
|
+
|
286
|
+
range_str = f"({min_val},{max_val})"
|
287
|
+
params[api_param_name] = range_str
|
288
|
+
else:
|
289
|
+
raise ValueError(f"Range filter for {param_name} must be a tuple of (min, max)")
|
290
|
+
elif param_name in boolean_params:
|
291
|
+
# Boolean values
|
292
|
+
params[api_param_name] = str(value).lower()
|
293
|
+
else:
|
294
|
+
# Exact match
|
295
|
+
params[api_param_name] = value
|
296
|
+
|
297
|
+
# Call common function to make API request
|
298
|
+
return _make_api_request(params, api_key, print_cost, verbose)
|
299
|
+
|
300
|
+
def _make_api_request(params, api_key, print_cost=True, verbose=False):
|
301
|
+
"""
|
302
|
+
Common function to make API requests to the SEC BigQuery API.
|
303
|
+
|
304
|
+
Parameters:
|
305
|
+
-----------
|
306
|
+
params : dict
|
307
|
+
Query parameters
|
308
|
+
api_key : str
|
309
|
+
API key for authentication
|
310
|
+
print_cost : bool
|
311
|
+
Whether to print cost information
|
312
|
+
verbose : bool
|
313
|
+
Whether to print debugging information
|
314
|
+
|
315
|
+
Returns:
|
316
|
+
--------
|
317
|
+
List[Dict]
|
318
|
+
Data returned from the API
|
319
|
+
|
320
|
+
Raises:
|
321
|
+
-------
|
322
|
+
ValueError
|
323
|
+
If API key is invalid
|
324
|
+
Exception
|
325
|
+
For other API errors
|
326
|
+
"""
|
327
|
+
# Make the API request
|
142
328
|
BASE_URL = "https://sec-bq.jgfriedman99.workers.dev/"
|
143
329
|
|
144
330
|
headers = {
|
@@ -166,7 +352,15 @@ def get_information_table(
|
|
166
352
|
# Extract metadata for cost reporting
|
167
353
|
metadata = result.get('metadata', {})
|
168
354
|
|
169
|
-
#
|
355
|
+
# Process the data to handle array fields
|
356
|
+
data = result.get('data', [])
|
357
|
+
for row in data:
|
358
|
+
# Check if reportingOwnerCIK is an array that needs processing
|
359
|
+
if 'reportingOwnerCIK' in row and isinstance(row['reportingOwnerCIK'], list):
|
360
|
+
# Transform from [{'v': 'value1'}, {'v': 'value2'}] to comma-separated string
|
361
|
+
row['reportingOwnerCIK'] = ','.join([item['v'] for item in row['reportingOwnerCIK'] if 'v' in item])
|
362
|
+
|
363
|
+
# Print cost information if requested
|
170
364
|
if print_cost and 'billing' in metadata:
|
171
365
|
billing = metadata['billing']
|
172
366
|
query_info = metadata.get('query_info', {})
|
@@ -181,11 +375,154 @@ def get_information_table(
|
|
181
375
|
print(f"Cache Hit: {query_info.get('cache_hit', False)}")
|
182
376
|
print("==============================\n")
|
183
377
|
|
184
|
-
#
|
185
|
-
return
|
378
|
+
# Return data
|
379
|
+
return data
|
186
380
|
|
187
381
|
except requests.exceptions.RequestException as e:
|
188
|
-
if response.status_code == 401:
|
382
|
+
if hasattr(e, 'response') and e.response is not None and e.response.status_code == 401:
|
189
383
|
raise ValueError("Authentication failed: Invalid API key")
|
190
384
|
else:
|
191
|
-
raise Exception(f"Request failed: {str(e)}")
|
385
|
+
raise Exception(f"Request failed: {str(e)}")
|
386
|
+
|
387
|
+
def get_proxy_voting_record(
|
388
|
+
# Optional filtering parameters
|
389
|
+
columns=None,
|
390
|
+
meeting_date=None,
|
391
|
+
isin=None,
|
392
|
+
cusip=None,
|
393
|
+
issuer_name=None,
|
394
|
+
vote_description=None,
|
395
|
+
shares_on_loan=None,
|
396
|
+
shares_voted=None,
|
397
|
+
vote_category=None,
|
398
|
+
vote_record=None,
|
399
|
+
vote_source=None,
|
400
|
+
how_voted=None,
|
401
|
+
figi=None,
|
402
|
+
management_recommendation=None,
|
403
|
+
accession=None,
|
404
|
+
reporting_owner_cik=None,
|
405
|
+
filing_date=None,
|
406
|
+
|
407
|
+
# API key handling
|
408
|
+
api_key=None,
|
409
|
+
|
410
|
+
# Additional options
|
411
|
+
print_cost=True,
|
412
|
+
verbose=False
|
413
|
+
):
|
414
|
+
"""
|
415
|
+
Query the SEC BigQuery API for NPX proxy voting record data.
|
416
|
+
|
417
|
+
Parameters:
|
418
|
+
-----------
|
419
|
+
columns : List[str], optional
|
420
|
+
Specific columns to return. If None, all columns are returned.
|
421
|
+
|
422
|
+
# Filter parameters
|
423
|
+
meeting_date, isin, cusip, etc. : Various filters that can be:
|
424
|
+
- str: Exact match
|
425
|
+
- List[str]: Match any in list
|
426
|
+
- tuple: (min, max) range for numeric/date fields
|
427
|
+
|
428
|
+
shares_on_loan, shares_voted : int/float or tuple
|
429
|
+
Numeric values or (min, max) range
|
430
|
+
|
431
|
+
filing_date : str or tuple
|
432
|
+
Date string in 'YYYY-MM-DD' format or (start_date, end_date) tuple
|
433
|
+
|
434
|
+
api_key : str, optional
|
435
|
+
SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
|
436
|
+
print_cost : bool
|
437
|
+
Whether to print the query cost information
|
438
|
+
verbose : bool
|
439
|
+
Whether to print additional information about the query
|
440
|
+
|
441
|
+
Returns:
|
442
|
+
--------
|
443
|
+
List[Dict]
|
444
|
+
A list of dictionaries containing the query results
|
445
|
+
|
446
|
+
Raises:
|
447
|
+
-------
|
448
|
+
ValueError
|
449
|
+
If API key is missing or invalid
|
450
|
+
Exception
|
451
|
+
For API errors or other issues
|
452
|
+
"""
|
453
|
+
|
454
|
+
# 1. Handle API key
|
455
|
+
if api_key is None:
|
456
|
+
api_key = os.getenv('DATAMULE_API_KEY')
|
457
|
+
|
458
|
+
if not api_key:
|
459
|
+
raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key parameter")
|
460
|
+
|
461
|
+
# 2. Build query parameters
|
462
|
+
params = {'table_type': 'NPX_VOTING_TABLE'}
|
463
|
+
|
464
|
+
# Add columns parameter if provided
|
465
|
+
if columns:
|
466
|
+
if isinstance(columns, list):
|
467
|
+
params['columns'] = ','.join(columns)
|
468
|
+
else:
|
469
|
+
params['columns'] = columns
|
470
|
+
|
471
|
+
# Map Python parameter names to API parameter names
|
472
|
+
param_mapping = {
|
473
|
+
'meeting_date': 'meetingDate',
|
474
|
+
'isin': 'isin',
|
475
|
+
'cusip': 'cusip',
|
476
|
+
'issuer_name': 'issuerName',
|
477
|
+
'vote_description': 'voteDescription',
|
478
|
+
'shares_on_loan': 'sharesOnLoan',
|
479
|
+
'shares_voted': 'sharesVoted',
|
480
|
+
'vote_category': 'voteCategory',
|
481
|
+
'vote_record': 'voteRecord',
|
482
|
+
'vote_source': 'voteSource',
|
483
|
+
'how_voted': 'howVoted',
|
484
|
+
'figi': 'figi',
|
485
|
+
'management_recommendation': 'managementRecommendation',
|
486
|
+
'accession': 'accession',
|
487
|
+
'reporting_owner_cik': 'reportingOwnerCIK',
|
488
|
+
'filing_date': 'filingDate'
|
489
|
+
}
|
490
|
+
|
491
|
+
# Process all possible filter parameters
|
492
|
+
date_params = ['meeting_date', 'filing_date']
|
493
|
+
numeric_params = ['shares_on_loan', 'shares_voted']
|
494
|
+
|
495
|
+
for param_name, api_param_name in param_mapping.items():
|
496
|
+
value = locals()[param_name]
|
497
|
+
if value is not None:
|
498
|
+
# Handle different filter types
|
499
|
+
if isinstance(value, list):
|
500
|
+
# List filter
|
501
|
+
params[api_param_name] = f"[{','.join(str(v) for v in value)}]"
|
502
|
+
elif isinstance(value, tuple):
|
503
|
+
# Range filter
|
504
|
+
if len(value) == 2:
|
505
|
+
min_val, max_val = value
|
506
|
+
# Handle date range specially
|
507
|
+
if param_name in date_params:
|
508
|
+
# Dates need to be in quotes within the parentheses
|
509
|
+
if min_val is None:
|
510
|
+
min_val = ''
|
511
|
+
else:
|
512
|
+
min_val = f"'{min_val}'"
|
513
|
+
|
514
|
+
if max_val is None:
|
515
|
+
max_val = ''
|
516
|
+
else:
|
517
|
+
max_val = f"'{max_val}'"
|
518
|
+
|
519
|
+
range_str = f"({min_val},{max_val})"
|
520
|
+
params[api_param_name] = range_str
|
521
|
+
else:
|
522
|
+
raise ValueError(f"Range filter for {param_name} must be a tuple of (min, max)")
|
523
|
+
else:
|
524
|
+
# Exact match
|
525
|
+
params[api_param_name] = value
|
526
|
+
|
527
|
+
# Call common function to make API request
|
528
|
+
return _make_api_request(params, api_key, print_cost, verbose)
|
@@ -16,17 +16,35 @@ from threading import Thread
|
|
16
16
|
from secsgml import parse_sgml_submission
|
17
17
|
from .query import query
|
18
18
|
from os import cpu_count
|
19
|
+
from ..submission import Submission
|
19
20
|
|
20
21
|
class Downloader:
|
21
22
|
def __init__(self, api_key=None):
|
22
23
|
self.BASE_URL = "https://library.datamule.xyz/original/nc/"
|
23
24
|
self.CHUNK_SIZE = 2 * 1024 * 1024
|
24
|
-
self.MAX_CONCURRENT_DOWNLOADS =
|
25
|
+
self.MAX_CONCURRENT_DOWNLOADS = 100
|
25
26
|
self.MAX_DECOMPRESSION_WORKERS = cpu_count()
|
26
27
|
self.MAX_PROCESSING_WORKERS = cpu_count()
|
27
28
|
self.QUEUE_SIZE = 10
|
28
29
|
if api_key is not None:
|
29
30
|
self._api_key = api_key
|
31
|
+
# Create a shared event loop for async operations
|
32
|
+
self.loop = asyncio.new_event_loop()
|
33
|
+
# Create a thread to run the event loop
|
34
|
+
self.loop_thread = Thread(target=self._run_event_loop, daemon=True)
|
35
|
+
self.loop_thread.start()
|
36
|
+
# Create a queue for async tasks
|
37
|
+
self.async_queue = Queue()
|
38
|
+
|
39
|
+
def _run_event_loop(self):
|
40
|
+
"""Run the event loop in a separate thread"""
|
41
|
+
asyncio.set_event_loop(self.loop)
|
42
|
+
self.loop.run_forever()
|
43
|
+
|
44
|
+
def _run_coroutine(self, coro):
|
45
|
+
"""Run a coroutine in the event loop and return its result"""
|
46
|
+
future = asyncio.run_coroutine_threadsafe(coro, self.loop)
|
47
|
+
return future.result()
|
30
48
|
|
31
49
|
@property
|
32
50
|
def api_key(self):
|
@@ -55,7 +73,7 @@ class Downloader:
|
|
55
73
|
print(f"Failed to log error to {error_file}: {str(e)}")
|
56
74
|
|
57
75
|
class FileProcessor:
|
58
|
-
def __init__(self, output_dir, max_workers, queue_size, pbar, downloader):
|
76
|
+
def __init__(self, output_dir, max_workers, queue_size, pbar, downloader, keep_document_types=None):
|
59
77
|
self.processing_queue = Queue(maxsize=queue_size)
|
60
78
|
self.should_stop = False
|
61
79
|
self.processing_workers = []
|
@@ -64,6 +82,7 @@ class Downloader:
|
|
64
82
|
self.batch_size = 50
|
65
83
|
self.pbar = pbar
|
66
84
|
self.downloader = downloader
|
85
|
+
self.keep_document_types = keep_document_types
|
67
86
|
|
68
87
|
def start_processing_workers(self):
|
69
88
|
for _ in range(self.max_workers):
|
@@ -75,7 +94,9 @@ class Downloader:
|
|
75
94
|
def _process_file(self, item):
|
76
95
|
filename, content = item
|
77
96
|
try:
|
78
|
-
|
97
|
+
submission = Submission(sgml_content=content, keep_document_types=self.keep_document_types)
|
98
|
+
# Use the shared event loop to run save_async
|
99
|
+
self.downloader._run_coroutine(submission.save_async(output_dir=self.output_dir))
|
79
100
|
self.pbar.update(1)
|
80
101
|
except Exception as e:
|
81
102
|
accession_dir = os.path.join(self.output_dir, filename.split('.')[0])
|
@@ -189,11 +210,11 @@ class Downloader:
|
|
189
210
|
except Exception as e:
|
190
211
|
self._log_error(output_dir, filename, str(e))
|
191
212
|
|
192
|
-
async def process_batch(self, urls, output_dir):
|
213
|
+
async def process_batch(self, urls, output_dir, keep_document_types=None):
|
193
214
|
os.makedirs(output_dir, exist_ok=True)
|
194
215
|
|
195
216
|
with tqdm(total=len(urls), desc="Processing files") as pbar:
|
196
|
-
processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self)
|
217
|
+
processor = self.FileProcessor(output_dir, self.MAX_PROCESSING_WORKERS, self.QUEUE_SIZE, pbar, self, keep_document_types=keep_document_types)
|
197
218
|
processor.start_processing_workers()
|
198
219
|
|
199
220
|
semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_DOWNLOADS)
|
@@ -216,7 +237,7 @@ class Downloader:
|
|
216
237
|
processor.stop_workers()
|
217
238
|
decompression_pool.shutdown()
|
218
239
|
|
219
|
-
def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None):
|
240
|
+
def download(self, submission_type=None, cik=None, filing_date=None, output_dir="downloads", accession_numbers=None, keep_document_types=None):
|
220
241
|
"""
|
221
242
|
Query SEC filings and download/process them.
|
222
243
|
|
@@ -225,6 +246,8 @@ class Downloader:
|
|
225
246
|
- cik: Company CIK number(s), string, int, or list
|
226
247
|
- filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
|
227
248
|
- output_dir: Directory to save downloaded files
|
249
|
+
- accession_numbers: List of specific accession numbers to download
|
250
|
+
- keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
|
228
251
|
"""
|
229
252
|
if self.api_key is None:
|
230
253
|
raise ValueError("No API key found. Please set DATAMULE_API_KEY environment variable or provide api_key in constructor")
|
@@ -262,15 +285,32 @@ class Downloader:
|
|
262
285
|
start_time = time.time()
|
263
286
|
|
264
287
|
# Process the batch asynchronously
|
265
|
-
asyncio.run(self.process_batch(urls, output_dir))
|
288
|
+
asyncio.run(self.process_batch(urls, output_dir, keep_document_types=keep_document_types))
|
266
289
|
|
267
290
|
# Calculate and display performance metrics
|
268
291
|
elapsed_time = time.time() - start_time
|
269
292
|
print(f"\nProcessing completed in {elapsed_time:.2f} seconds")
|
270
293
|
print(f"Processing speed: {len(urls)/elapsed_time:.2f} files/second")
|
294
|
+
|
295
|
+
def __del__(self):
|
296
|
+
"""Cleanup when the downloader is garbage collected"""
|
297
|
+
if hasattr(self, 'loop') and self.loop.is_running():
|
298
|
+
self.loop.call_soon_threadsafe(self.loop.stop)
|
271
299
|
|
272
300
|
|
273
|
-
def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None):
|
301
|
+
def download(submission_type=None, cik=None, filing_date=None, api_key=None, output_dir="downloads", accession_numbers=None, keep_document_types=None):
|
302
|
+
"""
|
303
|
+
Query SEC filings and download/process them.
|
304
|
+
|
305
|
+
Parameters:
|
306
|
+
- submission_type: Filing type(s), string or list (e.g., '10-K', ['10-K', '10-Q'])
|
307
|
+
- cik: Company CIK number(s), string, int, or list
|
308
|
+
- filing_date: Filing date(s), string, list, or tuple of (start_date, end_date)
|
309
|
+
- api_key: API key for datamule service (optional if DATAMULE_API_KEY env var is set)
|
310
|
+
- output_dir: Directory to save downloaded files
|
311
|
+
- accession_numbers: List of specific accession numbers to download
|
312
|
+
- keep_document_types: List of document types to keep (e.g., ['10-K', 'EX-10.1'])
|
313
|
+
"""
|
274
314
|
if accession_numbers:
|
275
315
|
accession_numbers = [int(str(x).replace('-', '')) for x in accession_numbers]
|
276
316
|
# check if acc no is empty list
|
@@ -282,5 +322,6 @@ def download(submission_type=None, cik=None, filing_date=None, api_key=None, out
|
|
282
322
|
cik=cik,
|
283
323
|
filing_date=filing_date,
|
284
324
|
output_dir=output_dir,
|
285
|
-
accession_numbers=accession_numbers
|
325
|
+
accession_numbers=accession_numbers,
|
326
|
+
keep_document_types=keep_document_types
|
286
327
|
)
|