datamule 1.1.7__tar.gz → 1.2.0__tar.gz

This diff compares publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (44)
  1. datamule-1.2.0/PKG-INFO +20 -0
  2. datamule-1.2.0/datamule/document.py +465 -0
  3. {datamule-1.1.7 → datamule-1.2.0}/datamule/portfolio.py +2 -4
  4. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/submissions/monitor.py +5 -1
  5. datamule-1.2.0/datamule/seclibrary/bq.py +528 -0
  6. datamule-1.2.0/datamule/sheet.py +672 -0
  7. {datamule-1.1.7 → datamule-1.2.0}/datamule/submission.py +2 -1
  8. datamule-1.2.0/datamule.egg-info/PKG-INFO +20 -0
  9. {datamule-1.1.7 → datamule-1.2.0}/datamule.egg-info/SOURCES.txt +1 -0
  10. {datamule-1.1.7 → datamule-1.2.0}/setup.py +1 -1
  11. datamule-1.1.7/PKG-INFO +0 -6
  12. datamule-1.1.7/datamule/document.py +0 -263
  13. datamule-1.1.7/datamule/sheet.py +0 -41
  14. datamule-1.1.7/datamule.egg-info/PKG-INFO +0 -6
  15. {datamule-1.1.7 → datamule-1.2.0}/datamule/__init__.py +0 -0
  16. {datamule-1.1.7 → datamule-1.2.0}/datamule/config.py +0 -0
  17. {datamule-1.1.7 → datamule-1.2.0}/datamule/helper.py +0 -0
  18. {datamule-1.1.7 → datamule-1.2.0}/datamule/index.py +0 -0
  19. {datamule-1.1.7 → datamule-1.2.0}/datamule/mapping_dicts/__init__.py +0 -0
  20. {datamule-1.1.7 → datamule-1.2.0}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
  21. {datamule-1.1.7 → datamule-1.2.0}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
  22. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/__init__.py +0 -0
  23. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/infrastructure/__init__.py +0 -0
  24. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
  25. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/rss/__init__.py +0 -0
  26. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/rss/monitor.py +0 -0
  27. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/submissions/__init__.py +0 -0
  28. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/submissions/downloader.py +0 -0
  29. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/submissions/eftsquery.py +0 -0
  30. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/submissions/streamer.py +0 -0
  31. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/submissions/textsearch.py +0 -0
  32. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/utils.py +0 -0
  33. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/xbrl/__init__.py +0 -0
  34. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
  35. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/xbrl/filter_xbrl.py +0 -0
  36. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
  37. {datamule-1.1.7 → datamule-1.2.0}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
  38. {datamule-1.1.7 → datamule-1.2.0}/datamule/seclibrary/__init__.py +0 -0
  39. {datamule-1.1.7 → datamule-1.2.0}/datamule/seclibrary/downloader.py +0 -0
  40. {datamule-1.1.7 → datamule-1.2.0}/datamule/seclibrary/query.py +0 -0
  41. {datamule-1.1.7 → datamule-1.2.0}/datamule.egg-info/dependency_links.txt +0 -0
  42. {datamule-1.1.7 → datamule-1.2.0}/datamule.egg-info/requires.txt +0 -0
  43. {datamule-1.1.7 → datamule-1.2.0}/datamule.egg-info/top_level.txt +0 -0
  44. {datamule-1.1.7 → datamule-1.2.0}/setup.cfg +0 -0
@@ -0,0 +1,20 @@
+ Metadata-Version: 2.1
+ Name: datamule
+ Version: 1.2.0
+ Summary: Making it easier to use SEC filings.
+ Home-page: https://github.com/john-friedman/datamule-python
+ Author: John Friedman
+ Requires-Dist: aiohttp
+ Requires-Dist: aiolimiter
+ Requires-Dist: tqdm
+ Requires-Dist: requests
+ Requires-Dist: nest_asyncio
+ Requires-Dist: aiofiles
+ Requires-Dist: polars
+ Requires-Dist: setuptools
+ Requires-Dist: selectolax
+ Requires-Dist: pytz
+ Requires-Dist: zstandard
+ Requires-Dist: doc2dict
+ Requires-Dist: secsgml
+ Requires-Dist: lxml
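The 1.2.0 metadata pins the package version and lists the full runtime dependency set. A quick post-upgrade sanity check, as a minimal sketch (assumes a standard pip install so distribution metadata is present):

```python
# Sketch: confirm the upgrade took (assumes pip-installed metadata is available).
from importlib.metadata import version, requires

assert version('datamule') == '1.2.0'
print(requires('datamule'))  # should list aiohttp, aiolimiter, tqdm, ...
```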
@@ -0,0 +1,465 @@
+ import json
+ import csv
+ import re
+ from doc2dict import xml2dict, txt2dict, dict2dict
+ from doc2dict.mapping import flatten_hierarchy
+ from .mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
+ from .mapping_dicts.xml_mapping_dicts import dict_345
+ from selectolax.parser import HTMLParser
+
+ class Document:
+     def __init__(self, type, content, extension):
+
+         self.type = type
+         extension = extension.lower()
+         self.content = content
+         if extension == '.txt':
+             self.content = self._preprocess_txt_content()
+         elif extension in ['.htm', '.html']:
+             self.content = self._preprocess_html_content()
+
+         self.extension = extension
+         # this will be filled by parsed
+         self.data = None
+
+     #_load_text_content
+     def _preprocess_txt_content(self):
+         return self.content.read().translate(str.maketrans({
+             '\xa0': ' ', '\u2003': ' ',
+             '\u2018': "'", '\u2019': "'",
+             '\u201c': '"', '\u201d': '"'
+         }))
+
+     # will deprecate this when we add html2dict
+     def _preprocess_html_content(self):
+         parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
+
+         # Remove hidden elements first
+         hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
+         for node in hidden_nodes:
+             node.decompose()
+
+         blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
+         lines = []
+         current_line = []
+
+         def flush_line():
+             if current_line:
+                 # Don't add spaces between adjacent spans
+                 lines.append(''.join(current_line))
+                 current_line.clear()
+
+         for node in parser.root.traverse(include_text=True):
+             if node.tag in ('script', 'style', 'css'):
+                 continue
+
+             if node.tag in blocks:
+                 flush_line()
+                 lines.append('')
+
+             if node.text_content:
+                 text = node.text_content.strip()
+                 if text:
+                     if node.tag in blocks:
+                         flush_line()
+                         lines.append(text)
+                         lines.append('')
+                     else:
+                         # Only add space if nodes aren't directly adjacent
+                         if current_line and not current_line[-1].endswith(' '):
+                             if node.prev and node.prev.text_content:
+                                 if node.parent != node.prev.parent or node.prev.next != node:
+                                     current_line.append(' ')
+                         current_line.append(text)
+
+         flush_line()
+
+         text = '\n'.join(lines)
+         while '\n\n\n' in text:
+             text = text.replace('\n\n\n', '\n\n')
+
+         return text.translate(str.maketrans({
+             '\xa0': ' ', '\u2003': ' ',
+             '\u2018': "'", '\u2019': "'",
+             '\u201c': '"', '\u201d': '"'
+         }))
+
+     def contains_string(self, pattern):
+         """Works for select files"""
+         if self.extension in ['.htm', '.html', '.txt','.xml']:
+             return bool(re.search(pattern, self.content))
+         return False
+
+     # Note: this method will be heavily modified in the future
+     def parse(self):
+         # check if we have already parsed the content
+         if self.data:
+             return self.data
+         mapping_dict = None
+
+         if self.extension == '.xml':
+             if self.type in ['3', '4', '5']:
+                 mapping_dict = dict_345
+
+             self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
+
+         # will deprecate this when we add html2dict
+         elif self.extension in ['.htm', '.html','.txt']:
+
+             if self.type == '10-K':
+                 mapping_dict = dict_10k
+             elif self.type == '10-Q':
+                 mapping_dict = dict_10q
+             elif self.type == '8-K':
+                 mapping_dict = dict_8k
+             elif self.type == 'SC 13D':
+                 mapping_dict = dict_13d
+             elif self.type == 'SC 13G':
+                 mapping_dict = dict_13g
+
+             self.data = {}
+             self.data['document'] = dict2dict(txt2dict(content=self.content, mapping_dict=mapping_dict))
+         return self.data
+
+     def write_json(self, output_filename=None):
+         if not self.data:
+             self.parse()
+
+         with open(output_filename, 'w',encoding='utf-8') as f:
+             json.dump(self.data, f, indent=2)
+
+     def to_tabular(self, accession_number=None):
+         """
+         Convert the document to a tabular format suitable for CSV output.
+
+         Args:
+             accession_number: Optional accession number to include in the output
+
+         Returns:
+             list: List of dictionaries, each representing a row in the tabular output
+         """
+         self.parse()
+
+         # Common function to normalize and process dictionaries
+         def process_records(records, mapping_dict, is_derivative=None):
+             """
+             Process records into a standardized tabular format
+
+             Args:
+                 records: List or single dictionary of records to process
+                 mapping_dict: Dictionary mapping source keys to target keys
+                 is_derivative: Boolean flag for derivative securities (or None if not applicable)
+
+             Returns:
+                 list: Processed records in tabular format
+             """
+             # Convert single dict to list for uniform processing
+             if isinstance(records, dict):
+                 records = [records]
+
+             # Flatten nested dictionaries
+             flattened = self._flatten_dict(records)
+
+             # Process each record
+             result = []
+             for item in flattened:
+                 # Normalize whitespace in all string values
+                 for key in item:
+                     if isinstance(item[key], str):
+                         item[key] = re.sub(r'\s+', ' ', item[key])
+
+                 # Map keys according to the mapping dictionary
+                 mapped_item = {}
+                 for old_key, value in item.items():
+                     target_key = mapping_dict.get(old_key, old_key)
+                     mapped_item[target_key] = value
+
+                 # Set derivative flags if applicable
+                 if is_derivative is not None:
+                     mapped_item["isDerivative"] = 1 if is_derivative else 0
+                     mapped_item["isNonDerivative"] = 0 if is_derivative else 1
+
+                 # Ensure all expected columns exist
+                 output_columns = list(dict.fromkeys(mapping_dict.values()))
+                 ordered_item = {column: mapped_item.get(column, None) for column in output_columns}
+
+                 # Add accession number if provided
+                 if accession_number is not None:
+                     ordered_item['accession'] = accession_number
+
+                 result.append(ordered_item)
+
+             return result
+
+         # Handle different document types
+         if self.type == "INFORMATION TABLE":
+             # Information Table mapping dictionary
+             info_table_mapping = {
+                 "nameOfIssuer": "nameOfIssuer",
+                 "titleOfClass": "titleOfClass",
+                 "cusip": "cusip",
+                 "value": "value",
+                 "shrsOrPrnAmt_sshPrnamt": "sshPrnamt",
+                 "shrsOrPrnAmt_sshPrnamtType": "sshPrnamtType",
+                 "investmentDiscretion": "investmentDiscretion",
+                 "votingAuthority_Sole": "votingAuthoritySole",
+                 "votingAuthority_Shared": "votingAuthorityShared",
+                 "votingAuthority_None": "votingAuthorityNone",
+                 "reportingOwnerCIK": "reportingOwnerCIK",
+                 "putCall": "putCall",
+                 "otherManager": "otherManager",
+                 "figi": "figi"
+             }
+
+             # Process the information table
+             info_table = self.data['informationTable']['infoTable']
+             return process_records(info_table, info_table_mapping)
+
+         elif self.type == "PROXY VOTING RECORD":
+             # Proxy voting record mapping dictionary
+             proxy_mapping = {
+                 'meetingDate': 'meetingDate',
+                 'isin': 'isin',
+                 'cusip': 'cusip',
+                 'issuerName': 'issuerName',
+                 'voteDescription': 'voteDescription',
+                 'sharesOnLoan': 'sharesOnLoan',
+                 'vote_voteRecord_sharesVoted': 'sharesVoted',
+                 'voteCategories_voteCategory_categoryType': 'voteCategory',
+                 'vote_voteRecord': 'voteRecord',
+                 'sharesVoted': 'sharesVoted',
+                 'voteSource': 'voteSource',
+                 'vote_voteRecord_howVoted': 'howVoted',
+                 'figi': 'figi',
+                 'vote_voteRecord_managementRecommendation': 'managementRecommendation'
+             }
+
+             # Process proxy voting records if they exist
+             all_results = []
+             if 'proxyVoteTable' in self.data and 'proxyTable' in self.data['proxyVoteTable'] and self.data['proxyVoteTable']['proxyTable'] is not None:
+                 proxy_records = self.data['proxyVoteTable']['proxyTable']
+                 proxy_results = process_records(proxy_records, proxy_mapping)
+                 all_results.extend(proxy_results)
+
+             return all_results
+
+         elif self.type in ["3", "4", "5"]:
+             # Forms 3, 4, 5 mapping dictionary
+             form_345_mapping = {
+                 # Flag fields (will be set programmatically)
+                 "isDerivative": "isDerivative",
+                 "isNonDerivative": "isNonDerivative",
+
+                 # Common fields across all types
+                 "securityTitle_value": "securityTitle",
+                 "transactionDate_value": "transactionDate",
+                 "documentType": "documentType",
+                 "transactionCoding_transactionFormType": "documentType",
+                 "transactionCoding_transactionCode": "transactionCode",
+                 "transactionAmounts_transactionAcquiredDisposedCode_value": "transactionCode",
+                 "transactionCoding_equitySwapInvolved": "equitySwapInvolved",
+                 "transactionTimeliness_value": "transactionTimeliness",
+                 "transactionAmounts_transactionShares_value": "transactionShares",
+                 "transactionAmounts_transactionPricePerShare_value": "transactionPricePerShare",
+                 "postTransactionAmounts_sharesOwnedFollowingTransaction_value": "sharesOwnedFollowingTransaction",
+                 "heldFollowingReport": "sharesOwnedFollowingTransaction", # Form 3
+                 "ownershipNature_directOrIndirectOwnership_value": "ownershipType",
+                 "ownershipNature_natureOfOwnership_value": "ownershipType",
+                 "deemedExecutionDate": "deemedExecutionDate",
+                 "deemedExecutionDate_value": "deemedExecutionDate",
+
+                 # Derivative-specific fields
+                 "conversionOrExercisePrice_value": "conversionOrExercisePrice",
+                 "exerciseDate_value": "exerciseDate",
+                 "expirationDate_value": "expirationDate",
+                 "underlyingSecurity_underlyingSecurityTitle_value": "underlyingSecurityTitle",
+                 "underlyingSecurity_underlyingSecurityShares_value": "underlyingSecurityShares",
+                 "underlyingSecurity_underlyingSecurityValue_value": "underlyingSecurityValue",
+
+                 # Footnote fields
+                 "transactionPricePerShareFootnote": "transactionPricePerShareFootnote",
+                 "transactionAmounts_transactionPricePerShare_footnote": "transactionPricePerShareFootnote",
+                 "transactionCodeFootnote": "transactionCodeFootnote",
+                 "transactionAmounts_transactionAcquiredDisposedCode_footnote": "transactionCodeFootnote",
+                 "transactionCoding_footnote": "transactionCodeFootnote",
+                 "natureOfOwnershipFootnote": "natureOfOwnershipFootnote",
+                 "ownershipNature_natureOfOwnership_footnote": "natureOfOwnershipFootnote",
+                 "sharesOwnedFollowingTransactionFootnote": "sharesOwnedFollowingTransactionFootnote",
+                 "postTransactionAmounts_sharesOwnedFollowingTransaction_footnote": "sharesOwnedFollowingTransactionFootnote",
+                 "ownershipTypeFootnote": "ownershipTypeFootnote",
+                 "ownershipNature_directOrIndirectOwnership_footnote": "ownershipTypeFootnote",
+                 "securityTitleFootnote": "securityTitleFootnote",
+                 "securityTitle_footnote": "securityTitleFootnote",
+                 "transactionSharesFootnote": "transactionSharesFootnote",
+                 "transactionAmounts_transactionShares_footnote": "transactionSharesFootnote",
+                 "transactionDateFootnote": "transactionDateFootnote",
+                 "transactionDate_footnote": "transactionDateFootnote",
+                 "conversionOrExercisePriceFootnote": "conversionOrExercisePriceFootnote",
+                 "conversionOrExercisePrice_footnote": "conversionOrExercisePriceFootnote",
+                 "exerciseDateFootnote": "exerciseDateFootnote",
+                 "exerciseDate_footnote": "exerciseDateFootnote",
+                 "expirationDateFootnote": "expirationDateFootnote",
+                 "expirationDate_footnote": "expirationDateFootnote",
+                 "underlyingSecurityTitleFootnote": "underlyingSecurityTitleFootnote",
+                 "underlyingSecurity_underlyingSecurityTitle_footnote": "underlyingSecurityTitleFootnote",
+                 "underlyingSecuritySharesFootnote": "underlyingSecuritySharesFootnote",
+                 "underlyingSecurity_underlyingSecurityShares_footnote": "underlyingSecuritySharesFootnote",
+                 "underlyingSecurityValueFootnote": "underlyingSecurityValueFootnote",
+                 "underlyingSecurity_underlyingSecurityValue_footnote": "underlyingSecurityValueFootnote"
+             }
+
+             # Results container
+             all_results = []
+
+             # Process non-derivative transactions if they exist
+             if 'nonDerivativeTable' in self.data['ownershipDocument'] and self.data['ownershipDocument']['nonDerivativeTable'] is not None:
+                 if 'nonDerivativeTransaction' in self.data['ownershipDocument']['nonDerivativeTable']:
+                     non_deriv_trans = self.data['ownershipDocument']['nonDerivativeTable']['nonDerivativeTransaction']
+                     non_deriv_results = process_records(non_deriv_trans, form_345_mapping, is_derivative=False)
+                     all_results.extend(non_deriv_results)
+
+                 # Process non-derivative holdings (for Form 3)
+                 if 'nonDerivativeHolding' in self.data['ownershipDocument']['nonDerivativeTable']:
+                     non_deriv_hold = self.data['ownershipDocument']['nonDerivativeTable']['nonDerivativeHolding']
+                     non_deriv_hold_results = process_records(non_deriv_hold, form_345_mapping, is_derivative=False)
+                     all_results.extend(non_deriv_hold_results)
+
+             # Process derivative transactions if they exist
+             if 'derivativeTable' in self.data['ownershipDocument'] and self.data['ownershipDocument']['derivativeTable'] is not None:
+                 if 'derivativeTransaction' in self.data['ownershipDocument']['derivativeTable']:
+                     deriv_trans = self.data['ownershipDocument']['derivativeTable']['derivativeTransaction']
+                     deriv_results = process_records(deriv_trans, form_345_mapping, is_derivative=True)
+                     all_results.extend(deriv_results)
+
+                 # Process derivative holdings (for Form 3)
+                 if 'derivativeHolding' in self.data['ownershipDocument']['derivativeTable']:
+                     deriv_hold = self.data['ownershipDocument']['derivativeTable']['derivativeHolding']
+                     deriv_hold_results = process_records(deriv_hold, form_345_mapping, is_derivative=True)
+                     all_results.extend(deriv_hold_results)
+
+             return all_results
+
+         else:
+             raise ValueError(f"Document type '{self.type}' is not supported for tabular conversion")
+
+     def write_csv(self, output_filename, accession_number=None):
+
+         data = self.to_tabular(accession_number)
+
+         if not data:
+
+             return
+
+         fieldnames = data[0].keys()
+
+         with open(output_filename, 'w', newline='') as csvfile:
+             writer = csv.DictWriter(csvfile,fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
+             writer.writeheader()
+             writer.writerows(data)
+
+
+     def _document_to_section_text(self, document_data, parent_key=''):
+         items = []
+
+         if isinstance(document_data, dict):
+             for key, value in document_data.items():
+                 # Build the section name
+                 section = f"{parent_key}_{key}" if parent_key else key
+
+                 # If the value is a dict, recurse
+                 if isinstance(value, dict):
+                     items.extend(self._document_to_section_text(value, section))
+                 # If it's a list, handle each item
+                 elif isinstance(value, list):
+                     for i, item in enumerate(value):
+                         if isinstance(item, dict):
+                             items.extend(self._document_to_section_text(item, f"{section}_{i+1}"))
+                         else:
+                             items.append({
+                                 'section': f"{section}_{i+1}",
+                                 'text': str(item)
+                             })
+                 # Base case - add the item
+                 else:
+                     items.append({
+                         'section': section,
+                         'text': str(value)
+                     })
+
+         return items
+
+     # we'll modify this for every dict
+     def _flatten_dict(self, d, parent_key=''):
+         items = {}
+
+         if isinstance(d, list):
+             return [self._flatten_dict(item) for item in d]
+
+         for k, v in d.items():
+             new_key = f"{parent_key}_{k}" if parent_key else k
+
+             if isinstance(v, dict):
+                 items.update(self._flatten_dict(v, new_key))
+             else:
+                 items[new_key] = str(v)
+
+         return items
+
+     # this will all have to be changed. default will be to flatten everything
+     def __iter__(self):
+         self.parse()
+
+         # Let's remove XML iterable for now
+
+         # Handle text-based documents
+         if self.extension in ['.txt', '.htm', '.html']:
+             document_data = self.data
+             if not document_data:
+                 return iter([])
+
+             # Find highest hierarchy level from mapping dict
+             highest_hierarchy = float('inf')
+             section_type = None
+
+             if self.type in ['10-K', '10-Q']:
+                 mapping_dict = dict_10k if self.type == '10-K' else dict_10q
+             elif self.type == '8-K':
+                 mapping_dict = dict_8k
+             elif self.type == 'SC 13D':
+                 mapping_dict = dict_13d
+             elif self.type == 'SC 13G':
+                 mapping_dict = dict_13g
+             else:
+                 return iter([])
+
+             # Find section type with highest hierarchy number
+             highest_hierarchy = -1 # Start at -1 to find highest
+             for mapping in mapping_dict['rules']['mappings']:
+                 if mapping.get('hierarchy') is not None:
+                     if mapping['hierarchy'] > highest_hierarchy:
+                         highest_hierarchy = mapping['hierarchy']
+                         section_type = mapping['name']
+
+             if not section_type:
+                 return iter([])
+
+             # Extract sections of the identified type
+             def find_sections(data, target_type):
+                 sections = []
+                 if isinstance(data, dict):
+                     if data.get('type') == target_type:
+                         sections.append({
+                             'item': data.get('text', ''),
+                             'text': flatten_hierarchy(data.get('content', []))
+                         })
+                     for value in data.values():
+                         if isinstance(value, (dict, list)):
+                             sections.extend(find_sections(value, target_type))
+                 elif isinstance(data, list):
+                     for item in data:
+                         sections.extend(find_sections(item, target_type))
+                 return sections
+
+             return iter(find_sections(document_data, section_type))
+
+         return iter([])
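Taken together, the new document.py gives Document a parse-to-table pipeline: parse() routes XML through xml2dict (with dict_345 for Forms 3/4/5) and text/HTML through txt2dict, to_tabular() flattens the parsed tree and renames keys via the mapping dictionaries above, and write_csv() serializes the rows. A minimal usage sketch (the file path, form type, and accession number are hypothetical; note the .txt branch calls content.read(), so plain-text filings expect a file-like object rather than a string):

```python
from datamule.document import Document

# Hypothetical Form 4 filing downloaded beforehand. XML/HTML content is
# passed as a string; .txt content should be an open file handle instead.
with open('form4.xml', 'r', encoding='utf-8') as f:
    doc = Document(type='4', content=f.read(), extension='.xml')

# One dict per (non-)derivative transaction or holding, keyed by the
# standardized column names from form_345_mapping.
rows = doc.to_tabular(accession_number='0000000000-25-000000')
doc.write_csv('form4.csv', accession_number='0000000000-25-000000')
```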
@@ -119,7 +119,7 @@ class Portfolio:
         # First query, just set the accession numbers
         self.accession_numbers = new_accession_numbers
 
-    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None, **kwargs):
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None,requests_per_second=5, **kwargs):
         if provider is None:
             config = Config()
             provider = config.get_default_source()
@@ -142,7 +142,7 @@ class Portfolio:
             cik=cik,
             submission_type=submission_type,
             filing_date=filing_date,
-            requests_per_second=5,
+            requests_per_second=requests_per_second,
             accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
         )
 
@@ -164,8 +164,6 @@
         )
 
 
-
-
     def __iter__(self):
         if not self.submissions_loaded:
            self._load_submissions()
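The practical effect of the portfolio.py change: the SEC request rate, previously hard-coded to 5 requests per second, is now caller-configurable via download_submissions. A minimal sketch (the Portfolio path and query values are illustrative, not from the package docs):

```python
from datamule import Portfolio

portfolio = Portfolio('filings')  # illustrative output directory
portfolio.download_submissions(
    ticker='AAPL',            # illustrative query
    submission_type='10-K',
    requests_per_second=10,   # new in 1.2.0; was fixed at 5 in 1.1.7
)
```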
@@ -20,12 +20,16 @@ async def _process_efts_hits(hits, collected_accession_numbers, data_callback=No
         submission_type = source.get('form')
         ciks = source.get('ciks', [])
         ciks = [str(int(cik)) for cik in ciks]
+
+        filing_date = source.get('file_date')
 
         # Create standardized filing record
         filing = {
             'accession_number': accession_number,
             'submission_type': submission_type,
-            'ciks': ciks
+            'ciks': ciks,
+            'filing_date': filing_date,
+
         }
 
         processed_hits.append(filing)
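With this monitor.py change, each standardized filing record produced by _process_efts_hits now carries the filing date taken from the EFTS hit's file_date field. Illustrative shape of a record after the change (field values are invented for the example):

```python
# Record passed to data_callback in 1.2.0 (values are illustrative):
filing = {
    'accession_number': '0000320193-25-000001',
    'submission_type': '8-K',
    'ciks': ['320193'],
    'filing_date': '2025-01-02',  # new: sourced from EFTS 'file_date'
}
```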