datamule 1.1.6__py3-none-any.whl → 1.1.8__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- datamule/document.py +262 -68
- datamule/portfolio.py +7 -5
- datamule/sec/submissions/downloader.py +19 -2
- datamule/sec/submissions/eftsquery.py +129 -8
- datamule/sec/submissions/monitor.py +5 -1
- datamule/sec/submissions/streamer.py +59 -23
- datamule/sec/submissions/textsearch.py +33 -6
- datamule/seclibrary/bq.py +191 -0
- datamule/sheet.py +220 -6
- datamule/submission.py +94 -19
- {datamule-1.1.6.dist-info → datamule-1.1.8.dist-info}/METADATA +1 -1
- {datamule-1.1.6.dist-info → datamule-1.1.8.dist-info}/RECORD +14 -13
- {datamule-1.1.6.dist-info → datamule-1.1.8.dist-info}/WHEEL +0 -0
- {datamule-1.1.6.dist-info → datamule-1.1.8.dist-info}/top_level.txt +0 -0
datamule/document.py
CHANGED
@@ -8,31 +8,32 @@ from .mapping_dicts.xml_mapping_dicts import dict_345
 from selectolax.parser import HTMLParser
 
 class Document:
-    def __init__(self, type,
+    def __init__(self, type, content, extension):
+
         self.type = type
-
+        extension = extension.lower()
+        self.content = content
+        if extension == '.txt':
+            self.content = self._preprocess_txt_content()
+        elif extension in ['.htm', '.html']:
+            self.content = self._preprocess_html_content()
 
+        self.extension = extension
+        # this will be filled by parsed
         self.data = None
-        self.content = None
-
 
-
-
-        self.content
-
-    def _load_text_content(self):
-        with open(self.path) as f:
-            return f.read().translate(str.maketrans({
+    #_load_text_content
+    def _preprocess_txt_content(self):
+        return self.content.read().translate(str.maketrans({
             '\xa0': ' ', '\u2003': ' ',
             '\u2018': "'", '\u2019': "'",
             '\u201c': '"', '\u201d': '"'
         }))
 
     # will deprecate this when we add html2dict
-    def
-
-
-
+    def _preprocess_html_content(self):
+        parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
+
         # Remove hidden elements first
         hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
         for node in hidden_nodes:
@@ -83,36 +84,27 @@ class Document:
             '\u201c': '"', '\u201d': '"'
         }))
 
-    def _load_file_content(self):
-        if self.path.suffix =='.txt':
-            self.content = self._load_text_content()
-        elif self.path.suffix in ['.html','.htm']:
-            self.content = self._load_html_content()
-        else:
-            raise ValueError(f"Unsupported file type: {self.path.suffix}")
-
-
     def contains_string(self, pattern):
-        """
-        if self.
-        if self.content is None:
-            self.content = self._load_file_content(self.path)
+        """Works for select files"""
+        if self.extension in ['.htm', '.html', '.txt','.xml']:
             return bool(re.search(pattern, self.content))
         return False
 
     # Note: this method will be heavily modified in the future
     def parse(self):
+        # check if we have already parsed the content
+        if self.data:
+            return self.data
         mapping_dict = None
 
-        if self.
+        if self.extension == '.xml':
             if self.type in ['3', '4', '5']:
                 mapping_dict = dict_345
 
-            self.load_content()
             self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
+
         # will deprecate this when we add html2dict
-        elif self.
-            self._load_file_content()
+        elif self.extension in ['.htm', '.html','.txt']:
 
             if self.type == '10-K':
                 mapping_dict = dict_10k
@@ -133,43 +125,246 @@ class Document:
         if not self.data:
             self.parse()
 
-        if output_filename is None:
-            output_filename = f"{self.path.rsplit('.', 1)[0]}.json"
-
         with open(output_filename, 'w',encoding='utf-8') as f:
             json.dump(self.data, f, indent=2)
 
-    def
+    def to_tabular(self, accession_number=None):
         self.parse()
 
-        if
-
+        if self.type == "INFORMATION TABLE":
+            info_table = self.data['informationTable']['infoTable']
+            if isinstance(info_table, dict):
+                info_table = [info_table]
 
-
-        if not self.data:
-            return output_filename
+            flattened = self._flatten_dict(info_table)
 
-
+            # Original field names
+            original_columns = [
+                "nameOfIssuer", "titleOfClass", "cusip", "value",
+                "shrsOrPrnAmt_sshPrnamt", "shrsOrPrnAmt_sshPrnamtType",
+                "investmentDiscretion", "votingAuthority_Sole",
+                "votingAuthority_Shared", "votingAuthority_None",
+                "reportingOwnerCIK", "putCall", "otherManager", 'figi'
+            ]
+
+            # Define mapping from original to camelCase field names
+            field_mapping = {
+                "shrsOrPrnAmt_sshPrnamt": "sshPrnamt",
+                "shrsOrPrnAmt_sshPrnamtType": "sshPrnamtType",
+                "votingAuthority_Sole": "votingAuthoritySole",
+                "votingAuthority_Shared": "votingAuthorityShared",
+                "votingAuthority_None": "votingAuthorityNone"
+            }
+
+            # Create the new expected columns list with mapped field names
+            expected_columns = []
+            for column in original_columns:
+                if column in field_mapping:
+                    expected_columns.append(field_mapping[column])
+                else:
+                    expected_columns.append(column)
+
+            # Process each item in the flattened data
+            for item in flattened:
+                # Remove newlines from items
+                for key in item:
+                    if isinstance(item[key], str):
+                        item[key] = re.sub(r'\s+', ' ', item[key])
+
+                new_item = {}
+                for key, value in item.items():
+                    # Apply the mapping if the key is in our mapping dictionary
+                    if key in field_mapping:
+                        new_item[field_mapping[key]] = value
+                    else:
+                        new_item[key] = value
+
+                # Update the original item with the new keys
+                item.clear()
+                item.update(new_item)
+
+                # Ensure all expected columns exist
+                for column in expected_columns:
+                    if column not in item:
+                        item[column] = None
+
+                item['accession'] = accession_number
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Add this block to reorder the items to match the expected order
+            ordered_columns = ["nameOfIssuer", "titleOfClass", "cusip", "value", "sshPrnamt", "sshPrnamtType",
+                               "investmentDiscretion", "votingAuthoritySole", "votingAuthorityShared", "votingAuthorityNone",
+                               "reportingOwnerCIK", "putCall", "otherManager", "figi"]
+            if accession_number is not None:
+                ordered_columns.append("accession")
+
+            ordered_data = []
+            for item in flattened:
+                ordered_item = {column: item.get(column, None) for column in ordered_columns}
+                ordered_data.append(ordered_item)
+
+            return ordered_data
+
+        elif self.type in ["3", "4", "5"]:
+            # Master mapping dictionary - includes all possible fields
+            # The order of this dictionary will determine the output column order
+            master_mapping_dict = {
+                # Flag fields (will be set programmatically)
+                "isDerivative": "isDerivative",
+                "isNonDerivative": "isNonDerivative",
+
+                # Common fields across all types
+                "securityTitle_value": "securityTitle",
+                "transactionDate_value": "transactionDate",
+                "documentType": "documentType",
+                "transactionCoding_transactionFormType": "documentType",
+                "transactionCoding_transactionCode": "transactionCode",
+                "transactionAmounts_transactionAcquiredDisposedCode_value": "transactionCode",
+                "transactionCoding_equitySwapInvolved": "equitySwapInvolved",
+                "transactionTimeliness_value": "transactionTimeliness",
+                "transactionAmounts_transactionShares_value": "transactionShares",
+                "transactionAmounts_transactionPricePerShare_value": "transactionPricePerShare",
+                "postTransactionAmounts_sharesOwnedFollowingTransaction_value": "sharesOwnedFollowingTransaction",
+                "heldFollowingReport": "sharesOwnedFollowingTransaction",  # Form 3
+                "ownershipNature_directOrIndirectOwnership_value": "ownershipType",
+                "ownershipNature_natureOfOwnership_value": "ownershipType",
+                "deemedExecutionDate": "deemedExecutionDate",
+                "deemedExecutionDate_value": "deemedExecutionDate",
+
+                # Derivative-specific fields
+                "conversionOrExercisePrice_value": "conversionOrExercisePrice",
+                "exerciseDate_value": "exerciseDate",
+                "expirationDate_value": "expirationDate",
+                "underlyingSecurity_underlyingSecurityTitle_value": "underlyingSecurityTitle",
+                "underlyingSecurity_underlyingSecurityShares_value": "underlyingSecurityShares",
+                "underlyingSecurity_underlyingSecurityValue_value": "underlyingSecurityValue",
+
+                # Footnote fields
+                "transactionPricePerShareFootnote": "transactionPricePerShareFootnote",
+                "transactionAmounts_transactionPricePerShare_footnote": "transactionPricePerShareFootnote",
+                "transactionCodeFootnote": "transactionCodeFootnote",
+                "transactionAmounts_transactionAcquiredDisposedCode_footnote": "transactionCodeFootnote",
+                "transactionCoding_footnote": "transactionCodeFootnote",
+                "natureOfOwnershipFootnote": "natureOfOwnershipFootnote",
+                "ownershipNature_natureOfOwnership_footnote": "natureOfOwnershipFootnote",
+                "sharesOwnedFollowingTransactionFootnote": "sharesOwnedFollowingTransactionFootnote",
+                "postTransactionAmounts_sharesOwnedFollowingTransaction_footnote": "sharesOwnedFollowingTransactionFootnote",
+                "ownershipTypeFootnote": "ownershipTypeFootnote",
+                "ownershipNature_directOrIndirectOwnership_footnote": "ownershipTypeFootnote",
+                "securityTitleFootnote": "securityTitleFootnote",
+                "securityTitle_footnote": "securityTitleFootnote",
+                "transactionSharesFootnote": "transactionSharesFootnote",
+                "transactionAmounts_transactionShares_footnote": "transactionSharesFootnote",
+                "transactionDateFootnote": "transactionDateFootnote",
+                "transactionDate_footnote": "transactionDateFootnote",
+                "conversionOrExercisePriceFootnote": "conversionOrExercisePriceFootnote",
+                "conversionOrExercisePrice_footnote": "conversionOrExercisePriceFootnote",
+                "exerciseDateFootnote": "exerciseDateFootnote",
+                "exerciseDate_footnote": "exerciseDateFootnote",
+                "expirationDateFootnote": "expirationDateFootnote",
+                "expirationDate_footnote": "expirationDateFootnote",
+                "underlyingSecurityTitleFootnote": "underlyingSecurityTitleFootnote",
+                "underlyingSecurity_underlyingSecurityTitle_footnote": "underlyingSecurityTitleFootnote",
+                "underlyingSecuritySharesFootnote": "underlyingSecuritySharesFootnote",
+                "underlyingSecurity_underlyingSecurityShares_footnote": "underlyingSecuritySharesFootnote",
+                "underlyingSecurityValueFootnote": "underlyingSecurityValueFootnote",
+                "underlyingSecurity_underlyingSecurityValue_footnote": "underlyingSecurityValueFootnote"
+            }
+
+            # Get the unique target column names in order from the mapping dictionary
+            output_columns = []
+            for _, target_key in master_mapping_dict.items():
+                if target_key not in output_columns:
+                    output_columns.append(target_key)
+
+            # Process function that handles any table type
+            def process_table(table_data, is_derivative):
+                if isinstance(table_data, dict):
+                    table_data = [table_data]
+
+                flattened = self._flatten_dict(table_data)
+
+                # Apply mapping to the flattened data and ensure all expected columns are present
+                mapped_data = []
+                for item in flattened:
+                    mapped_item = {}
+                    # First, apply the mapping
+                    for old_key, value in item.items():
+                        target_key = master_mapping_dict.get(old_key, old_key)
+                        mapped_item[target_key] = value
+
+                    # Set the derivative/non-derivative flags
+                    mapped_item["isDerivative"] = 1 if is_derivative else 0
+                    mapped_item["isNonDerivative"] = 0 if is_derivative else 1
+
+                    # Create a new ordered dictionary with all columns
+                    ordered_item = {}
+                    for column in output_columns:
+                        ordered_item[column] = mapped_item.get(column, None)
+
+                    # Add accession_number if available
+                    if accession_number is not None:
+                        ordered_item['accession_number'] = accession_number
+
+                    mapped_data.append(ordered_item)
+
+                return mapped_data
+
+            # Results container
+            all_results = []
+
+            # Process non-derivative transactions if they exist
+            if 'nonDerivativeTable' in self.data['ownershipDocument'] and self.data['ownershipDocument']['nonDerivativeTable'] is not None:
+                if 'nonDerivativeTransaction' in self.data['ownershipDocument']['nonDerivativeTable']:
+                    non_deriv_trans = self.data['ownershipDocument']['nonDerivativeTable']['nonDerivativeTransaction']
+                    non_deriv_results = process_table(non_deriv_trans, is_derivative=False)
+                    all_results.extend(non_deriv_results)
+
+                # Process non-derivative holdings (for Form 3)
+                if 'nonDerivativeHolding' in self.data['ownershipDocument']['nonDerivativeTable']:
+                    non_deriv_hold = self.data['ownershipDocument']['nonDerivativeTable']['nonDerivativeHolding']
+                    non_deriv_hold_results = process_table(non_deriv_hold, is_derivative=False)
+                    all_results.extend(non_deriv_hold_results)
+
+            # Process derivative transactions if they exist
+            if 'derivativeTable' in self.data['ownershipDocument'] and self.data['ownershipDocument']['derivativeTable'] is not None:
+                if 'derivativeTransaction' in self.data['ownershipDocument']['derivativeTable']:
+                    deriv_trans = self.data['ownershipDocument']['derivativeTable']['derivativeTransaction']
+                    deriv_results = process_table(deriv_trans, is_derivative=True)
+                    all_results.extend(deriv_results)
+
+                # Process derivative holdings (for Form 3)
+                if 'derivativeHolding' in self.data['ownershipDocument']['derivativeTable']:
+                    deriv_hold = self.data['ownershipDocument']['derivativeTable']['derivativeHolding']
+                    deriv_hold_results = process_table(deriv_hold, is_derivative=True)
+                    all_results.extend(deriv_hold_results)
 
-
-
+            # check if any rows not in the mapping dict, raise error if so
+            for item in all_results:
+                for key in item.keys():
+                    if key not in master_mapping_dict.values() and key != 'accession_number':
+                        raise ValueError(f"Key '{key}' not found in mapping dictionary")
+
+
+            return all_results
+        else:
+            raise ValueError("sorry, rejigging conversion to tabular format")
+
+    def write_csv(self, output_filename, accession_number=None):
+
+        data = self.to_tabular(accession_number)
+
+        if not data:
+
+            return
+
+        fieldnames = data[0].keys()
+
+        with open(output_filename, 'w', newline='') as csvfile:
+            writer = csv.DictWriter(csvfile,fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
+            writer.writeheader()
+            writer.writerows(data)
+
+
     def _document_to_section_text(self, document_data, parent_key=''):
         items = []
 
@@ -203,7 +398,7 @@ class Document:
     # we'll modify this for every dict
     def _flatten_dict(self, d, parent_key=''):
         items = {}
-
+
         if isinstance(d, list):
             return [self._flatten_dict(item) for item in d]
 
@@ -219,13 +414,12 @@ class Document:
 
     # this will all have to be changed. default will be to flatten everything
     def __iter__(self):
-
-        self.parse()
+        self.parse()
 
         # Let's remove XML iterable for now
 
         # Handle text-based documents
-        if self.
+        if self.extension in ['.txt', '.htm', '.html']:
             document_data = self.data
             if not document_data:
                 return iter([])
@@ -235,13 +429,13 @@ class Document:
             section_type = None
 
             if self.type in ['10-K', '10-Q']:
-                mapping_dict =
+                mapping_dict = dict_10k if self.type == '10-K' else dict_10q
             elif self.type == '8-K':
-                mapping_dict =
+                mapping_dict = dict_8k
             elif self.type == 'SC 13D':
-                mapping_dict =
+                mapping_dict = dict_13d
            elif self.type == 'SC 13G':
-                mapping_dict =
+                mapping_dict = dict_13g
             else:
                 return iter([])
 
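Net effect of the document.py changes: a Document is now constructed from in-memory content plus a file extension instead of a path, parse() caches its result, and to_tabular()/write_csv() flatten INFORMATION TABLE and Form 3/4/5 filings into rows. A minimal usage sketch, not from the package docs (the filename and accession number are hypothetical; the import path simply mirrors the module path above):

    from datamule.document import Document

    # Hypothetical Form 4 filing; XML content is stored as-is by __init__
    # (only .txt and .htm/.html inputs are preprocessed)
    with open("form4.xml") as f:
        doc = Document(type="4", content=f.read(), extension=".xml")

    doc.parse()  # maps the XML via dict_345; repeat calls return the cached doc.data
    rows = doc.to_tabular(accession_number="0000000000-25-000000")  # list of row dicts
    doc.write_csv("form4.csv", accession_number="0000000000-25-000000")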
datamule/portfolio.py
CHANGED
@@ -119,7 +119,7 @@ class Portfolio:
         # First query, just set the accession numbers
         self.accession_numbers = new_accession_numbers
 
-    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None, **kwargs):
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,requests_per_second=5, **kwargs):
         if provider is None:
             config = Config()
             provider = config.get_default_source()
@@ -142,7 +142,7 @@ class Portfolio:
             cik=cik,
             submission_type=submission_type,
             filing_date=filing_date,
-            requests_per_second=
+            requests_per_second=requests_per_second,
             accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
         )
 
@@ -164,8 +164,6 @@ class Portfolio:
         )
 
 
-
-
     def __iter__(self):
         if not self.submissions_loaded:
             self._load_submissions()
@@ -179,4 +177,8 @@ class Portfolio:
             document_types = [document_types]
 
         for submission in self.submissions:
-            yield from submission.document_type(document_types)
+            yield from submission.document_type(document_types)
+
+    def keep(self,document_type):
+        for submission in self.__iter__():
+            submission.keep(document_type)
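The Portfolio change exposes the previously hardcoded SEC rate limit as requests_per_second (default 5) and adds keep(), which iterates the loaded submissions and delegates to each Submission.keep(). A hedged sketch of the resulting calls, assuming Portfolio is still constructed on a directory path as in earlier releases:

    from datamule import Portfolio

    portfolio = Portfolio("filings")  # directory argument assumed from earlier releases
    portfolio.download_submissions(ticker="TSLA", submission_type="10-K",
                                   requests_per_second=5)  # new knob, default 5
    portfolio.keep("10-K")  # new: calls keep() on each loaded submission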
datamule/sec/submissions/downloader.py
CHANGED
@@ -36,7 +36,8 @@ async def download_callback(hit, content, cik, accno, url, output_dir="filings")
         print(f"Error processing {accno}: {e}")
         return None
 
-def download(cik=None, submission_type=None, filing_date=None,
+def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
+             requests_per_second=5, output_dir="filings", accession_numbers=None, quiet=False):
     """
     Download SEC EDGAR filings and extract their documents.
 
@@ -44,12 +45,25 @@ def download(cik=None, submission_type=None, filing_date=None, requests_per_seco
     - cik: CIK number(s) to query for
     - submission_type: Filing type(s) to query for (default: 10-K)
     - filing_date: Date or date range to query for
+    - location: Location code to filter by (e.g., 'CA' for California)
+    - name: Company name to search for (alternative to providing CIK)
     - requests_per_second: Rate limit for SEC requests
     - output_dir: Directory to save documents
     - accession_numbers: Optional list of accession numbers to filter by
+    - quiet: Whether to suppress progress output
 
     Returns:
     - List of all document paths processed
+
+    Examples:
+    # Download filings by CIK
+    download(cik="1318605", submission_type="10-K")
+
+    # Download filings by company name
+    download(name="Tesla", submission_type="10-K")
+
+    # Download filings with location filter
+    download(name="Apple", location="CA", submission_type="10-K")
     """
 
     # Make sure output directory exists
@@ -62,9 +76,12 @@ def download(cik=None, submission_type=None, filing_date=None, requests_per_seco
     # Call the stream function with our callback
     return stream(
         cik=cik,
+        name=name,
         submission_type=submission_type,
         filing_date=filing_date,
+        location=location,
         requests_per_second=requests_per_second,
         document_callback=callback_wrapper,
-        accession_numbers=accession_numbers
+        accession_numbers=accession_numbers,
+        quiet=quiet
     )