datamule-1.1.7-py3-none-any.whl → datamule-1.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamule/document.py +231 -29
- datamule/portfolio.py +2 -4
- datamule/sec/submissions/monitor.py +5 -1
- datamule/seclibrary/bq.py +528 -0
- datamule/sheet.py +644 -13
- datamule/submission.py +2 -1
- {datamule-1.1.7.dist-info → datamule-1.2.0.dist-info}/METADATA +1 -1
- {datamule-1.1.7.dist-info → datamule-1.2.0.dist-info}/RECORD +10 -9
- {datamule-1.1.7.dist-info → datamule-1.2.0.dist-info}/WHEEL +1 -1
- {datamule-1.1.7.dist-info → datamule-1.2.0.dist-info}/top_level.txt +0 -0
datamule/document.py
CHANGED
@@ -11,8 +11,6 @@ class Document:
|
|
11
11
|
def __init__(self, type, content, extension):
|
12
12
|
|
13
13
|
self.type = type
|
14
|
-
# we will remove this later #
|
15
|
-
# make sure extension is in lower case
|
16
14
|
extension = extension.lower()
|
17
15
|
self.content = content
|
18
16
|
if extension == '.txt':
|
@@ -94,6 +92,9 @@ class Document:
|
|
94
92
|
|
95
93
|
# Note: this method will be heavily modified in the future
|
96
94
|
def parse(self):
|
95
|
+
# check if we have already parsed the content
|
96
|
+
if self.data:
|
97
|
+
return self.data
|
97
98
|
mapping_dict = None
|
98
99
|
|
99
100
|
if self.extension == '.xml':
|
@@ -127,34 +128,236 @@ class Document:
|
|
127
128
|
with open(output_filename, 'w',encoding='utf-8') as f:
|
128
129
|
json.dump(self.data, f, indent=2)
|
129
130
|
|
130
|
-
def
|
131
|
+
def to_tabular(self, accession_number=None):
|
132
|
+
"""
|
133
|
+
Convert the document to a tabular format suitable for CSV output.
|
134
|
+
|
135
|
+
Args:
|
136
|
+
accession_number: Optional accession number to include in the output
|
137
|
+
|
138
|
+
Returns:
|
139
|
+
list: List of dictionaries, each representing a row in the tabular output
|
140
|
+
"""
|
131
141
|
self.parse()
|
142
|
+
|
143
|
+
# Common function to normalize and process dictionaries
|
144
|
+
def process_records(records, mapping_dict, is_derivative=None):
|
145
|
+
"""
|
146
|
+
Process records into a standardized tabular format
|
147
|
+
|
148
|
+
Args:
|
149
|
+
records: List or single dictionary of records to process
|
150
|
+
mapping_dict: Dictionary mapping source keys to target keys
|
151
|
+
is_derivative: Boolean flag for derivative securities (or None if not applicable)
|
152
|
+
|
153
|
+
Returns:
|
154
|
+
list: Processed records in tabular format
|
155
|
+
"""
|
156
|
+
# Convert single dict to list for uniform processing
|
157
|
+
if isinstance(records, dict):
|
158
|
+
records = [records]
|
159
|
+
|
160
|
+
# Flatten nested dictionaries
|
161
|
+
flattened = self._flatten_dict(records)
|
162
|
+
|
163
|
+
# Process each record
|
164
|
+
result = []
|
165
|
+
for item in flattened:
|
166
|
+
# Normalize whitespace in all string values
|
167
|
+
for key in item:
|
168
|
+
if isinstance(item[key], str):
|
169
|
+
item[key] = re.sub(r'\s+', ' ', item[key])
|
170
|
+
|
171
|
+
# Map keys according to the mapping dictionary
|
172
|
+
mapped_item = {}
|
173
|
+
for old_key, value in item.items():
|
174
|
+
target_key = mapping_dict.get(old_key, old_key)
|
175
|
+
mapped_item[target_key] = value
|
176
|
+
|
177
|
+
# Set derivative flags if applicable
|
178
|
+
if is_derivative is not None:
|
179
|
+
mapped_item["isDerivative"] = 1 if is_derivative else 0
|
180
|
+
mapped_item["isNonDerivative"] = 0 if is_derivative else 1
|
181
|
+
|
182
|
+
# Ensure all expected columns exist
|
183
|
+
output_columns = list(dict.fromkeys(mapping_dict.values()))
|
184
|
+
ordered_item = {column: mapped_item.get(column, None) for column in output_columns}
|
185
|
+
|
186
|
+
# Add accession number if provided
|
187
|
+
if accession_number is not None:
|
188
|
+
ordered_item['accession'] = accession_number
|
189
|
+
|
190
|
+
result.append(ordered_item)
|
191
|
+
|
192
|
+
return result
|
193
|
+
|
194
|
+
# Handle different document types
|
195
|
+
if self.type == "INFORMATION TABLE":
|
196
|
+
# Information Table mapping dictionary
|
197
|
+
info_table_mapping = {
|
198
|
+
"nameOfIssuer": "nameOfIssuer",
|
199
|
+
"titleOfClass": "titleOfClass",
|
200
|
+
"cusip": "cusip",
|
201
|
+
"value": "value",
|
202
|
+
"shrsOrPrnAmt_sshPrnamt": "sshPrnamt",
|
203
|
+
"shrsOrPrnAmt_sshPrnamtType": "sshPrnamtType",
|
204
|
+
"investmentDiscretion": "investmentDiscretion",
|
205
|
+
"votingAuthority_Sole": "votingAuthoritySole",
|
206
|
+
"votingAuthority_Shared": "votingAuthorityShared",
|
207
|
+
"votingAuthority_None": "votingAuthorityNone",
|
208
|
+
"reportingOwnerCIK": "reportingOwnerCIK",
|
209
|
+
"putCall": "putCall",
|
210
|
+
"otherManager": "otherManager",
|
211
|
+
"figi": "figi"
|
212
|
+
}
|
213
|
+
|
214
|
+
# Process the information table
|
215
|
+
info_table = self.data['informationTable']['infoTable']
|
216
|
+
return process_records(info_table, info_table_mapping)
|
217
|
+
|
218
|
+
elif self.type == "PROXY VOTING RECORD":
|
219
|
+
# Proxy voting record mapping dictionary
|
220
|
+
proxy_mapping = {
|
221
|
+
'meetingDate': 'meetingDate',
|
222
|
+
'isin': 'isin',
|
223
|
+
'cusip': 'cusip',
|
224
|
+
'issuerName': 'issuerName',
|
225
|
+
'voteDescription': 'voteDescription',
|
226
|
+
'sharesOnLoan': 'sharesOnLoan',
|
227
|
+
'vote_voteRecord_sharesVoted': 'sharesVoted',
|
228
|
+
'voteCategories_voteCategory_categoryType': 'voteCategory',
|
229
|
+
'vote_voteRecord': 'voteRecord',
|
230
|
+
'sharesVoted': 'sharesVoted',
|
231
|
+
'voteSource': 'voteSource',
|
232
|
+
'vote_voteRecord_howVoted': 'howVoted',
|
233
|
+
'figi': 'figi',
|
234
|
+
'vote_voteRecord_managementRecommendation': 'managementRecommendation'
|
235
|
+
}
|
236
|
+
|
237
|
+
# Process proxy voting records if they exist
|
238
|
+
all_results = []
|
239
|
+
if 'proxyVoteTable' in self.data and 'proxyTable' in self.data['proxyVoteTable'] and self.data['proxyVoteTable']['proxyTable'] is not None:
|
240
|
+
proxy_records = self.data['proxyVoteTable']['proxyTable']
|
241
|
+
proxy_results = process_records(proxy_records, proxy_mapping)
|
242
|
+
all_results.extend(proxy_results)
|
243
|
+
|
244
|
+
return all_results
|
245
|
+
|
246
|
+
elif self.type in ["3", "4", "5"]:
|
247
|
+
# Forms 3, 4, 5 mapping dictionary
|
248
|
+
form_345_mapping = {
|
249
|
+
# Flag fields (will be set programmatically)
|
250
|
+
"isDerivative": "isDerivative",
|
251
|
+
"isNonDerivative": "isNonDerivative",
|
252
|
+
|
253
|
+
# Common fields across all types
|
254
|
+
"securityTitle_value": "securityTitle",
|
255
|
+
"transactionDate_value": "transactionDate",
|
256
|
+
"documentType": "documentType",
|
257
|
+
"transactionCoding_transactionFormType": "documentType",
|
258
|
+
"transactionCoding_transactionCode": "transactionCode",
|
259
|
+
"transactionAmounts_transactionAcquiredDisposedCode_value": "transactionCode",
|
260
|
+
"transactionCoding_equitySwapInvolved": "equitySwapInvolved",
|
261
|
+
"transactionTimeliness_value": "transactionTimeliness",
|
262
|
+
"transactionAmounts_transactionShares_value": "transactionShares",
|
263
|
+
"transactionAmounts_transactionPricePerShare_value": "transactionPricePerShare",
|
264
|
+
"postTransactionAmounts_sharesOwnedFollowingTransaction_value": "sharesOwnedFollowingTransaction",
|
265
|
+
"heldFollowingReport": "sharesOwnedFollowingTransaction", # Form 3
|
266
|
+
"ownershipNature_directOrIndirectOwnership_value": "ownershipType",
|
267
|
+
"ownershipNature_natureOfOwnership_value": "ownershipType",
|
268
|
+
"deemedExecutionDate": "deemedExecutionDate",
|
269
|
+
"deemedExecutionDate_value": "deemedExecutionDate",
|
270
|
+
|
271
|
+
# Derivative-specific fields
|
272
|
+
"conversionOrExercisePrice_value": "conversionOrExercisePrice",
|
273
|
+
"exerciseDate_value": "exerciseDate",
|
274
|
+
"expirationDate_value": "expirationDate",
|
275
|
+
"underlyingSecurity_underlyingSecurityTitle_value": "underlyingSecurityTitle",
|
276
|
+
"underlyingSecurity_underlyingSecurityShares_value": "underlyingSecurityShares",
|
277
|
+
"underlyingSecurity_underlyingSecurityValue_value": "underlyingSecurityValue",
|
278
|
+
|
279
|
+
# Footnote fields
|
280
|
+
"transactionPricePerShareFootnote": "transactionPricePerShareFootnote",
|
281
|
+
"transactionAmounts_transactionPricePerShare_footnote": "transactionPricePerShareFootnote",
|
282
|
+
"transactionCodeFootnote": "transactionCodeFootnote",
|
283
|
+
"transactionAmounts_transactionAcquiredDisposedCode_footnote": "transactionCodeFootnote",
|
284
|
+
"transactionCoding_footnote": "transactionCodeFootnote",
|
285
|
+
"natureOfOwnershipFootnote": "natureOfOwnershipFootnote",
|
286
|
+
"ownershipNature_natureOfOwnership_footnote": "natureOfOwnershipFootnote",
|
287
|
+
"sharesOwnedFollowingTransactionFootnote": "sharesOwnedFollowingTransactionFootnote",
|
288
|
+
"postTransactionAmounts_sharesOwnedFollowingTransaction_footnote": "sharesOwnedFollowingTransactionFootnote",
|
289
|
+
"ownershipTypeFootnote": "ownershipTypeFootnote",
|
290
|
+
"ownershipNature_directOrIndirectOwnership_footnote": "ownershipTypeFootnote",
|
291
|
+
"securityTitleFootnote": "securityTitleFootnote",
|
292
|
+
"securityTitle_footnote": "securityTitleFootnote",
|
293
|
+
"transactionSharesFootnote": "transactionSharesFootnote",
|
294
|
+
"transactionAmounts_transactionShares_footnote": "transactionSharesFootnote",
|
295
|
+
"transactionDateFootnote": "transactionDateFootnote",
|
296
|
+
"transactionDate_footnote": "transactionDateFootnote",
|
297
|
+
"conversionOrExercisePriceFootnote": "conversionOrExercisePriceFootnote",
|
298
|
+
"conversionOrExercisePrice_footnote": "conversionOrExercisePriceFootnote",
|
299
|
+
"exerciseDateFootnote": "exerciseDateFootnote",
|
300
|
+
"exerciseDate_footnote": "exerciseDateFootnote",
|
301
|
+
"expirationDateFootnote": "expirationDateFootnote",
|
302
|
+
"expirationDate_footnote": "expirationDateFootnote",
|
303
|
+
"underlyingSecurityTitleFootnote": "underlyingSecurityTitleFootnote",
|
304
|
+
"underlyingSecurity_underlyingSecurityTitle_footnote": "underlyingSecurityTitleFootnote",
|
305
|
+
"underlyingSecuritySharesFootnote": "underlyingSecuritySharesFootnote",
|
306
|
+
"underlyingSecurity_underlyingSecurityShares_footnote": "underlyingSecuritySharesFootnote",
|
307
|
+
"underlyingSecurityValueFootnote": "underlyingSecurityValueFootnote",
|
308
|
+
"underlyingSecurity_underlyingSecurityValue_footnote": "underlyingSecurityValueFootnote"
|
309
|
+
}
|
310
|
+
|
311
|
+
# Results container
|
312
|
+
all_results = []
|
313
|
+
|
314
|
+
# Process non-derivative transactions if they exist
|
315
|
+
if 'nonDerivativeTable' in self.data['ownershipDocument'] and self.data['ownershipDocument']['nonDerivativeTable'] is not None:
|
316
|
+
if 'nonDerivativeTransaction' in self.data['ownershipDocument']['nonDerivativeTable']:
|
317
|
+
non_deriv_trans = self.data['ownershipDocument']['nonDerivativeTable']['nonDerivativeTransaction']
|
318
|
+
non_deriv_results = process_records(non_deriv_trans, form_345_mapping, is_derivative=False)
|
319
|
+
all_results.extend(non_deriv_results)
|
320
|
+
|
321
|
+
# Process non-derivative holdings (for Form 3)
|
322
|
+
if 'nonDerivativeHolding' in self.data['ownershipDocument']['nonDerivativeTable']:
|
323
|
+
non_deriv_hold = self.data['ownershipDocument']['nonDerivativeTable']['nonDerivativeHolding']
|
324
|
+
non_deriv_hold_results = process_records(non_deriv_hold, form_345_mapping, is_derivative=False)
|
325
|
+
all_results.extend(non_deriv_hold_results)
|
326
|
+
|
327
|
+
# Process derivative transactions if they exist
|
328
|
+
if 'derivativeTable' in self.data['ownershipDocument'] and self.data['ownershipDocument']['derivativeTable'] is not None:
|
329
|
+
if 'derivativeTransaction' in self.data['ownershipDocument']['derivativeTable']:
|
330
|
+
deriv_trans = self.data['ownershipDocument']['derivativeTable']['derivativeTransaction']
|
331
|
+
deriv_results = process_records(deriv_trans, form_345_mapping, is_derivative=True)
|
332
|
+
all_results.extend(deriv_results)
|
333
|
+
|
334
|
+
# Process derivative holdings (for Form 3)
|
335
|
+
if 'derivativeHolding' in self.data['ownershipDocument']['derivativeTable']:
|
336
|
+
deriv_hold = self.data['ownershipDocument']['derivativeTable']['derivativeHolding']
|
337
|
+
deriv_hold_results = process_records(deriv_hold, form_345_mapping, is_derivative=True)
|
338
|
+
all_results.extend(deriv_hold_results)
|
339
|
+
|
340
|
+
return all_results
|
341
|
+
|
342
|
+
else:
|
343
|
+
raise ValueError(f"Document type '{self.type}' is not supported for tabular conversion")
|
344
|
+
|
345
|
+
def write_csv(self, output_filename, accession_number=None):
|
346
|
+
|
347
|
+
data = self.to_tabular(accession_number)
|
132
348
|
|
133
|
-
|
134
|
-
if not self.data:
|
135
|
-
return output_filename
|
349
|
+
if not data:
|
136
350
|
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
else:
|
146
|
-
fieldnames = list(self.data[0].keys())
|
147
|
-
if accession_number:
|
148
|
-
fieldnames.append('Accession Number')
|
149
|
-
writer = csv.DictWriter(csvfile, fieldnames, quoting=csv.QUOTE_ALL)
|
150
|
-
writer.writeheader()
|
151
|
-
for row in self.data:
|
152
|
-
if accession_number:
|
153
|
-
row['Accession Number'] = accession_number
|
154
|
-
writer.writerow(row)
|
351
|
+
return
|
352
|
+
|
353
|
+
fieldnames = data[0].keys()
|
354
|
+
|
355
|
+
with open(output_filename, 'w', newline='') as csvfile:
|
356
|
+
writer = csv.DictWriter(csvfile,fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
|
357
|
+
writer.writeheader()
|
358
|
+
writer.writerows(data)
|
155
359
|
|
156
|
-
|
157
|
-
|
360
|
+
|
158
361
|
def _document_to_section_text(self, document_data, parent_key=''):
|
159
362
|
items = []
|
160
363
|
|
@@ -188,7 +391,7 @@ class Document:
|
|
188
391
|
# we'll modify this for every dict
|
189
392
|
def _flatten_dict(self, d, parent_key=''):
|
190
393
|
items = {}
|
191
|
-
|
394
|
+
|
192
395
|
if isinstance(d, list):
|
193
396
|
return [self._flatten_dict(item) for item in d]
|
194
397
|
|
@@ -204,8 +407,7 @@ class Document:
|
|
204
407
|
|
205
408
|
# this will all have to be changed. default will be to flatten everything
|
206
409
|
def __iter__(self):
|
207
|
-
|
208
|
-
self.parse()
|
410
|
+
self.parse()
|
209
411
|
|
210
412
|
# Let's remove XML iterable for now
|
211
413
|
|
datamule/portfolio.py
CHANGED
@@ -119,7 +119,7 @@ class Portfolio:
|
|
119
119
|
# First query, just set the accession numbers
|
120
120
|
self.accession_numbers = new_accession_numbers
|
121
121
|
|
122
|
-
def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None, **kwargs):
|
122
|
+
def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,requests_per_second=5, **kwargs):
|
123
123
|
if provider is None:
|
124
124
|
config = Config()
|
125
125
|
provider = config.get_default_source()
|
@@ -142,7 +142,7 @@ class Portfolio:
|
|
142
142
|
cik=cik,
|
143
143
|
submission_type=submission_type,
|
144
144
|
filing_date=filing_date,
|
145
|
-
requests_per_second=
|
145
|
+
requests_per_second=requests_per_second,
|
146
146
|
accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
|
147
147
|
)
|
148
148
|
|
@@ -164,8 +164,6 @@ class Portfolio:
|
|
164
164
|
)
|
165
165
|
|
166
166
|
|
167
|
-
|
168
|
-
|
169
167
|
def __iter__(self):
|
170
168
|
if not self.submissions_loaded:
|
171
169
|
self._load_submissions()
|
datamule/sec/submissions/monitor.py
CHANGED
@@ -20,12 +20,16 @@ async def _process_efts_hits(hits, collected_accession_numbers, data_callback=No
|
|
20
20
|
submission_type = source.get('form')
|
21
21
|
ciks = source.get('ciks', [])
|
22
22
|
ciks = [str(int(cik)) for cik in ciks]
|
23
|
+
|
24
|
+
filing_date = source.get('file_date')
|
23
25
|
|
24
26
|
# Create standardized filing record
|
25
27
|
filing = {
|
26
28
|
'accession_number': accession_number,
|
27
29
|
'submission_type': submission_type,
|
28
|
-
'ciks': ciks
|
30
|
+
'ciks': ciks,
|
31
|
+
'filing_date': filing_date,
|
32
|
+
|
29
33
|
}
|
30
34
|
|
31
35
|
processed_hits.append(filing)
|