datamule 1.1.7__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
datamule/document.py CHANGED
@@ -11,8 +11,6 @@ class Document:
     def __init__(self, type, content, extension):
 
         self.type = type
-        # we will remove this later #
-        # make sure extension is in lower case
         extension = extension.lower()
         self.content = content
         if extension == '.txt':
@@ -94,6 +92,9 @@ class Document:
 
     # Note: this method will be heavily modified in the future
     def parse(self):
+        # check if we have already parsed the content
+        if self.data:
+            return self.data
         mapping_dict = None
 
         if self.extension == '.xml':
@@ -127,34 +128,236 @@ class Document:
         with open(output_filename, 'w',encoding='utf-8') as f:
             json.dump(self.data, f, indent=2)
 
-    def write_csv(self, output_filename=None, accession_number=None):
+    def to_tabular(self, accession_number=None):
+        """
+        Convert the document to a tabular format suitable for CSV output.
+
+        Args:
+            accession_number: Optional accession number to include in the output
+
+        Returns:
+            list: List of dictionaries, each representing a row in the tabular output
+        """
         self.parse()
+
+        # Common function to normalize and process dictionaries
+        def process_records(records, mapping_dict, is_derivative=None):
+            """
+            Process records into a standardized tabular format
+
+            Args:
+                records: List or single dictionary of records to process
+                mapping_dict: Dictionary mapping source keys to target keys
+                is_derivative: Boolean flag for derivative securities (or None if not applicable)
+
+            Returns:
+                list: Processed records in tabular format
+            """
+            # Convert single dict to list for uniform processing
+            if isinstance(records, dict):
+                records = [records]
+
+            # Flatten nested dictionaries
+            flattened = self._flatten_dict(records)
+
+            # Process each record
+            result = []
+            for item in flattened:
+                # Normalize whitespace in all string values
+                for key in item:
+                    if isinstance(item[key], str):
+                        item[key] = re.sub(r'\s+', ' ', item[key])
+
+                # Map keys according to the mapping dictionary
+                mapped_item = {}
+                for old_key, value in item.items():
+                    target_key = mapping_dict.get(old_key, old_key)
+                    mapped_item[target_key] = value
+
+                # Set derivative flags if applicable
+                if is_derivative is not None:
+                    mapped_item["isDerivative"] = 1 if is_derivative else 0
+                    mapped_item["isNonDerivative"] = 0 if is_derivative else 1
+
+                # Ensure all expected columns exist
+                output_columns = list(dict.fromkeys(mapping_dict.values()))
+                ordered_item = {column: mapped_item.get(column, None) for column in output_columns}
+
+                # Add accession number if provided
+                if accession_number is not None:
+                    ordered_item['accession'] = accession_number
+
+                result.append(ordered_item)
+
+            return result
+
+        # Handle different document types
+        if self.type == "INFORMATION TABLE":
+            # Information Table mapping dictionary
+            info_table_mapping = {
+                "nameOfIssuer": "nameOfIssuer",
+                "titleOfClass": "titleOfClass",
+                "cusip": "cusip",
+                "value": "value",
+                "shrsOrPrnAmt_sshPrnamt": "sshPrnamt",
+                "shrsOrPrnAmt_sshPrnamtType": "sshPrnamtType",
+                "investmentDiscretion": "investmentDiscretion",
+                "votingAuthority_Sole": "votingAuthoritySole",
+                "votingAuthority_Shared": "votingAuthorityShared",
+                "votingAuthority_None": "votingAuthorityNone",
+                "reportingOwnerCIK": "reportingOwnerCIK",
+                "putCall": "putCall",
+                "otherManager": "otherManager",
+                "figi": "figi"
+            }
+
+            # Process the information table
+            info_table = self.data['informationTable']['infoTable']
+            return process_records(info_table, info_table_mapping)
+
+        elif self.type == "PROXY VOTING RECORD":
+            # Proxy voting record mapping dictionary
+            proxy_mapping = {
+                'meetingDate': 'meetingDate',
+                'isin': 'isin',
+                'cusip': 'cusip',
+                'issuerName': 'issuerName',
+                'voteDescription': 'voteDescription',
+                'sharesOnLoan': 'sharesOnLoan',
+                'vote_voteRecord_sharesVoted': 'sharesVoted',
+                'voteCategories_voteCategory_categoryType': 'voteCategory',
+                'vote_voteRecord': 'voteRecord',
+                'sharesVoted': 'sharesVoted',
+                'voteSource': 'voteSource',
+                'vote_voteRecord_howVoted': 'howVoted',
+                'figi': 'figi',
+                'vote_voteRecord_managementRecommendation': 'managementRecommendation'
+            }
+
+            # Process proxy voting records if they exist
+            all_results = []
+            if 'proxyVoteTable' in self.data and 'proxyTable' in self.data['proxyVoteTable'] and self.data['proxyVoteTable']['proxyTable'] is not None:
+                proxy_records = self.data['proxyVoteTable']['proxyTable']
+                proxy_results = process_records(proxy_records, proxy_mapping)
+                all_results.extend(proxy_results)
+
+            return all_results
+
+        elif self.type in ["3", "4", "5"]:
+            # Forms 3, 4, 5 mapping dictionary
+            form_345_mapping = {
+                # Flag fields (will be set programmatically)
+                "isDerivative": "isDerivative",
+                "isNonDerivative": "isNonDerivative",
+
+                # Common fields across all types
+                "securityTitle_value": "securityTitle",
+                "transactionDate_value": "transactionDate",
+                "documentType": "documentType",
+                "transactionCoding_transactionFormType": "documentType",
+                "transactionCoding_transactionCode": "transactionCode",
+                "transactionAmounts_transactionAcquiredDisposedCode_value": "transactionCode",
+                "transactionCoding_equitySwapInvolved": "equitySwapInvolved",
+                "transactionTimeliness_value": "transactionTimeliness",
+                "transactionAmounts_transactionShares_value": "transactionShares",
+                "transactionAmounts_transactionPricePerShare_value": "transactionPricePerShare",
+                "postTransactionAmounts_sharesOwnedFollowingTransaction_value": "sharesOwnedFollowingTransaction",
+                "heldFollowingReport": "sharesOwnedFollowingTransaction", # Form 3
+                "ownershipNature_directOrIndirectOwnership_value": "ownershipType",
+                "ownershipNature_natureOfOwnership_value": "ownershipType",
+                "deemedExecutionDate": "deemedExecutionDate",
+                "deemedExecutionDate_value": "deemedExecutionDate",
+
+                # Derivative-specific fields
+                "conversionOrExercisePrice_value": "conversionOrExercisePrice",
+                "exerciseDate_value": "exerciseDate",
+                "expirationDate_value": "expirationDate",
+                "underlyingSecurity_underlyingSecurityTitle_value": "underlyingSecurityTitle",
+                "underlyingSecurity_underlyingSecurityShares_value": "underlyingSecurityShares",
+                "underlyingSecurity_underlyingSecurityValue_value": "underlyingSecurityValue",
+
+                # Footnote fields
+                "transactionPricePerShareFootnote": "transactionPricePerShareFootnote",
+                "transactionAmounts_transactionPricePerShare_footnote": "transactionPricePerShareFootnote",
+                "transactionCodeFootnote": "transactionCodeFootnote",
+                "transactionAmounts_transactionAcquiredDisposedCode_footnote": "transactionCodeFootnote",
+                "transactionCoding_footnote": "transactionCodeFootnote",
+                "natureOfOwnershipFootnote": "natureOfOwnershipFootnote",
+                "ownershipNature_natureOfOwnership_footnote": "natureOfOwnershipFootnote",
+                "sharesOwnedFollowingTransactionFootnote": "sharesOwnedFollowingTransactionFootnote",
+                "postTransactionAmounts_sharesOwnedFollowingTransaction_footnote": "sharesOwnedFollowingTransactionFootnote",
+                "ownershipTypeFootnote": "ownershipTypeFootnote",
+                "ownershipNature_directOrIndirectOwnership_footnote": "ownershipTypeFootnote",
+                "securityTitleFootnote": "securityTitleFootnote",
+                "securityTitle_footnote": "securityTitleFootnote",
+                "transactionSharesFootnote": "transactionSharesFootnote",
+                "transactionAmounts_transactionShares_footnote": "transactionSharesFootnote",
+                "transactionDateFootnote": "transactionDateFootnote",
+                "transactionDate_footnote": "transactionDateFootnote",
+                "conversionOrExercisePriceFootnote": "conversionOrExercisePriceFootnote",
+                "conversionOrExercisePrice_footnote": "conversionOrExercisePriceFootnote",
+                "exerciseDateFootnote": "exerciseDateFootnote",
+                "exerciseDate_footnote": "exerciseDateFootnote",
+                "expirationDateFootnote": "expirationDateFootnote",
+                "expirationDate_footnote": "expirationDateFootnote",
+                "underlyingSecurityTitleFootnote": "underlyingSecurityTitleFootnote",
+                "underlyingSecurity_underlyingSecurityTitle_footnote": "underlyingSecurityTitleFootnote",
+                "underlyingSecuritySharesFootnote": "underlyingSecuritySharesFootnote",
+                "underlyingSecurity_underlyingSecurityShares_footnote": "underlyingSecuritySharesFootnote",
+                "underlyingSecurityValueFootnote": "underlyingSecurityValueFootnote",
+                "underlyingSecurity_underlyingSecurityValue_footnote": "underlyingSecurityValueFootnote"
+            }
+
+            # Results container
+            all_results = []
+
+            # Process non-derivative transactions if they exist
+            if 'nonDerivativeTable' in self.data['ownershipDocument'] and self.data['ownershipDocument']['nonDerivativeTable'] is not None:
+                if 'nonDerivativeTransaction' in self.data['ownershipDocument']['nonDerivativeTable']:
+                    non_deriv_trans = self.data['ownershipDocument']['nonDerivativeTable']['nonDerivativeTransaction']
+                    non_deriv_results = process_records(non_deriv_trans, form_345_mapping, is_derivative=False)
+                    all_results.extend(non_deriv_results)
+
+                # Process non-derivative holdings (for Form 3)
+                if 'nonDerivativeHolding' in self.data['ownershipDocument']['nonDerivativeTable']:
+                    non_deriv_hold = self.data['ownershipDocument']['nonDerivativeTable']['nonDerivativeHolding']
+                    non_deriv_hold_results = process_records(non_deriv_hold, form_345_mapping, is_derivative=False)
+                    all_results.extend(non_deriv_hold_results)
+
+            # Process derivative transactions if they exist
+            if 'derivativeTable' in self.data['ownershipDocument'] and self.data['ownershipDocument']['derivativeTable'] is not None:
+                if 'derivativeTransaction' in self.data['ownershipDocument']['derivativeTable']:
+                    deriv_trans = self.data['ownershipDocument']['derivativeTable']['derivativeTransaction']
+                    deriv_results = process_records(deriv_trans, form_345_mapping, is_derivative=True)
+                    all_results.extend(deriv_results)
+
+                # Process derivative holdings (for Form 3)
+                if 'derivativeHolding' in self.data['ownershipDocument']['derivativeTable']:
+                    deriv_hold = self.data['ownershipDocument']['derivativeTable']['derivativeHolding']
+                    deriv_hold_results = process_records(deriv_hold, form_345_mapping, is_derivative=True)
+                    all_results.extend(deriv_hold_results)
+
+            return all_results
+
+        else:
+            raise ValueError(f"Document type '{self.type}' is not supported for tabular conversion")
+
+    def write_csv(self, output_filename, accession_number=None):
+
+        data = self.to_tabular(accession_number)
 
-        with open(output_filename, 'w', newline='') as csvfile:
-            if not self.data:
-                return output_filename
+        if not data:
 
-            has_document = any('document' in item for item in self.data)
-
-            if has_document and 'document' in self.data:
-                writer = csv.DictWriter(csvfile, ['section', 'text'], quoting=csv.QUOTE_ALL)
-                writer.writeheader()
-                flattened = self._flatten_dict(self.data['document'])
-                for section, text in flattened.items():
-                    writer.writerow({'section': section, 'text': text})
-            else:
-                fieldnames = list(self.data[0].keys())
-                if accession_number:
-                    fieldnames.append('Accession Number')
-                writer = csv.DictWriter(csvfile, fieldnames, quoting=csv.QUOTE_ALL)
-                writer.writeheader()
-                for row in self.data:
-                    if accession_number:
-                        row['Accession Number'] = accession_number
-                    writer.writerow(row)
+            return
+
+        fieldnames = data[0].keys()
+
+        with open(output_filename, 'w', newline='') as csvfile:
+            writer = csv.DictWriter(csvfile,fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
+            writer.writeheader()
+            writer.writerows(data)
 
-        return output_filename
-
+
     def _document_to_section_text(self, document_data, parent_key=''):
         items = []
 
@@ -188,7 +391,7 @@ class Document:
     # we'll modify this for every dict
    def _flatten_dict(self, d, parent_key=''):
        items = {}
-
+
        if isinstance(d, list):
            return [self._flatten_dict(item) for item in d]
 
@@ -204,8 +407,7 @@ class Document:
 
    # this will all have to be changed. default will be to flatten everything
    def __iter__(self):
-        if not self.data:
-            self.parse()
+        self.parse()
 
        # Let's remove XML iterable for now
 
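Taken together, the document.py changes split tabular conversion from CSV serialization: parse() now returns its cached result on repeat calls, to_tabular() flattens the parsed filing into a list of row dictionaries keyed by the mapping tables above, and write_csv() simply serializes that list with csv.DictWriter. Note that write_csv() now requires output_filename (it previously defaulted to None) and no longer returns the filename, and to_tabular() raises ValueError for document types other than INFORMATION TABLE, PROXY VOTING RECORD, and Forms 3/4/5. A minimal usage sketch, assuming a Document built from a Form 4 XML primary document (the file path, accession number, and the way the Document is constructed here are illustrative, not taken from the diff):

    from datamule.document import Document

    # Illustrative only: a Form 4 XML primary document fetched separately
    with open('form4.xml', encoding='utf-8') as f:
        content = f.read()

    doc = Document(type='4', content=content, extension='.xml')

    # List of flat row dicts; parse() is called (and cached) internally
    rows = doc.to_tabular(accession_number='0000000000-25-000000')

    # Writes one CSV row per dict returned by to_tabular()
    doc.write_csv('form4.csv', accession_number='0000000000-25-000000')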
datamule/portfolio.py CHANGED
@@ -119,7 +119,7 @@ class Portfolio:
             # First query, just set the accession numbers
             self.accession_numbers = new_accession_numbers
 
-    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None, **kwargs):
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,requests_per_second=5, **kwargs):
        if provider is None:
            config = Config()
            provider = config.get_default_source()
@@ -142,7 +142,7 @@ class Portfolio:
                cik=cik,
                submission_type=submission_type,
                filing_date=filing_date,
-                requests_per_second=5,
+                requests_per_second=requests_per_second,
                accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
            )
 
@@ -164,8 +164,6 @@ class Portfolio:
            )
 
 
-
-
    def __iter__(self):
        if not self.submissions_loaded:
            self._load_submissions()
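The Portfolio changes expose the previously hard-coded rate limit: download_submissions() now accepts requests_per_second (default 5) and passes it through to the downloader instead of the fixed value. A sketch of the new call, with the portfolio path, ticker, and date range chosen for illustration only:

    from datamule.portfolio import Portfolio

    portfolio = Portfolio('apple_filings')  # directory path is illustrative

    portfolio.download_submissions(
        ticker='AAPL',
        submission_type='4',
        filing_date=('2024-01-01', '2024-12-31'),  # date-range format is illustrative
        requests_per_second=5,  # previously fixed at 5 inside the method
    )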
@@ -20,12 +20,16 @@ async def _process_efts_hits(hits, collected_accession_numbers, data_callback=No
        submission_type = source.get('form')
        ciks = source.get('ciks', [])
        ciks = [str(int(cik)) for cik in ciks]
+
+        filing_date = source.get('file_date')
 
        # Create standardized filing record
        filing = {
            'accession_number': accession_number,
            'submission_type': submission_type,
-            'ciks': ciks
+            'ciks': ciks,
+            'filing_date': filing_date,
+
        }
 
        processed_hits.append(filing)
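With this last hunk, each standardized filing record produced by _process_efts_hits also carries the filing date taken from the EFTS hit's file_date field. The resulting record has roughly this shape (values invented for illustration):

    filing = {
        'accession_number': '0001234567-24-000001',  # invented
        'submission_type': '4',
        'ciks': ['1234567'],
        'filing_date': '2024-06-28',  # new in 1.2.0, from source.get('file_date')
    }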