datamule-1.1.8-py3-none-any.whl → datamule-1.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamule-1.1.8.dist-info/METADATA → datamule-1.2.2.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 1.1.8
+Version: 1.2.2
 Summary: Making it easier to use SEC filings.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
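
The only METADATA change is the version bump. To confirm which of the two releases is installed locally, the standard library's importlib.metadata can be queried (a minimal sketch; Python 3.8+):

    # Report the installed datamule version; assumes the package is
    # installed in the active environment.
    from importlib.metadata import version

    print(version("datamule"))  # '1.1.8' before the upgrade, '1.2.2' after
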
datamule-1.1.8.dist-info/RECORD → datamule-1.2.2.dist-info/RECORD
@@ -1,11 +1,15 @@
-datamule/__init__.py,sha256=l6YlwT5EeRxPlCtO5Jd4I8l266rSRUJyfFe97cRtSCM,991
+datamule/__init__.py,sha256=8KioESb9y0Xwy72WuTfsYZnnMFdCrRhSv8DW-kZ4-To,1066
 datamule/config.py,sha256=Y--CVv7JcgrjJkMOSLrvm2S8B9ost6RMSkGviP-MKtg,883
-datamule/document.py,sha256=qShyVKHQ1nSCNvSfrhAOMVXprOd1br1rFKLy52S9WnE,22007
 datamule/helper.py,sha256=xgOVnea-lUlQ5I-U0vYUp0VeKPNZehNhqjJvegA3lYE,3342
 datamule/index.py,sha256=0txvbzPcvY1GsdxA-wGdLzAByxSeE_1VyyBp9mZEQRM,2292
-datamule/portfolio.py,sha256=yWt5gYTjV7rJsLiPUmhc6Vmr3lfvfCR5MSpLQ_6Gdp4,7104
-datamule/sheet.py,sha256=QaArtx7LpT7bwyteelJV67C-lK0RjQbGS3ka7ftdi8w,7978
-datamule/submission.py,sha256=LI7Zr60YbE_tU-v2N09k2dGjfztSgplKZACT3eRUkFE,4463
+datamule/package_updater.py,sha256=Z9zaa_y0Z5cknpRn8oPea3gg4kquFHfpfhduKKCZ6NU,958
+datamule/portfolio.py,sha256=so6j2KrkcZOToHIqkANAu3CC4QsfgaUN1zk9CrbRe1E,7225
+datamule/sheet.py,sha256=TvFqK9eAYuVoJ2uWdAlx5EN6vS9lke-aZf7FqtUiDBc,22304
+datamule/submission.py,sha256=tc4-8houjT2gfSK0P7ekowPduT31rj5_zt0axwZUacc,8483
+datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamule/document/document.py,sha256=BRnHPVt-vIT7EZTF-c-Ulv3N33xX9zE02Q9mKXVDeuY,9474
+datamule/document/processing.py,sha256=fw-1OWfbmZhG1R8XpJx_vcGwz3_djmk0FrblHAMPmwc,27476
+datamule/document/table.py,sha256=Sv9jTGiVhnWIY9nHaynUUixwbCrvbLsf0fdOnFR-NCY,10791
 datamule/mapping_dicts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/mapping_dicts/txt_mapping_dicts.py,sha256=DQPrGYbAPQxomRUtt4iiMGrwuF7BHc_LeFBQuYBzU9o,6311
 datamule/mapping_dicts/xml_mapping_dicts.py,sha256=Z22yDVwKYonUfM5foQP00dVDE8EHhhMKp0CLqVKV5OI,438
@@ -16,7 +20,7 @@ datamule/sec/infrastructure/submissions_metadata.py,sha256=f1KarzFSryKm0EV8DCDNs
 datamule/sec/rss/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/sec/rss/monitor.py,sha256=6r4EYaSlGu6VYErlj9zXJsIMLVie1cfacSZU-ESfuBI,18231
 datamule/sec/submissions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/sec/submissions/downloader.py,sha256=IB08W8-lQD5Bb0LgzrTN4Xi4HsCw24DybRLHqE1AUrU,3290
+datamule/sec/submissions/downloader.py,sha256=60wX2Yml1UCuxOtU0xMxqqeyHhrypCmlDQ0jZF-StJo,2665
 datamule/sec/submissions/eftsquery.py,sha256=mSZon8rlW8dxma7M49ZW5V02Fn-ENOdt9TNO6elBrhE,27983
 datamule/sec/submissions/monitor.py,sha256=Im2kgnUehhTgyY2Vq3uk07n4Vkj4PjII_SsRDi8ehAE,5384
 datamule/sec/submissions/streamer.py,sha256=EXyWNCD9N6mZmvm9lFSCFodF19zSQ8jfIbWPZNp0K5Y,11253
@@ -27,10 +31,10 @@ datamule/sec/xbrl/filter_xbrl.py,sha256=g9OT4zrNS0tiUJeBIwbCs_zMisOBkpFnMR3tV4Tr
 datamule/sec/xbrl/streamcompanyfacts.py,sha256=WyJIwuy5mNMXWpx_IkhFzDMe9MOfQ-vNkWl_JzBzFmc,3323
 datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/seclibrary/bq.py,sha256=C6kafFXWtm-MUjf70H1wTtpwv1Rxpcbk-Kfy8fkBPfo,6469
-datamule/seclibrary/downloader.py,sha256=Zb1TxsIz887tO3MJVP66siYVtNus89ti-g9oZ6VywrM,11500
+datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
+datamule/seclibrary/downloader.py,sha256=fJztJ_sEfv2oHHbDff07DRlXLmztXnzt3Yvv5YaZgGk,13718
 datamule/seclibrary/query.py,sha256=qGuursTERRbOGfoDcYcpo4oWkW3PCBW6x1Qf1Puiak4,7352
-datamule-1.1.8.dist-info/METADATA,sha256=8HRRMz6l928E5tuHXkPi1_Kf-8nfPSjWQnnfReSxdPM,512
-datamule-1.1.8.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-datamule-1.1.8.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
-datamule-1.1.8.dist-info/RECORD,,
+datamule-1.2.2.dist-info/METADATA,sha256=QpXbg-4cnRknynj-W4Z2Sc1zKlWan62zEG8OrN2_E-A,512
+datamule-1.2.2.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-1.2.2.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-1.2.2.dist-info/RECORD,,
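
Each RECORD row is a CSV triple: path, sha256 digest (urlsafe base64, unpadded), and size in bytes. The listing above therefore doubles as an integrity manifest; note that document.py disappears, a datamule/document/ subpackage appears, and sheet.py grows from 7,978 to 22,304 bytes. A minimal sketch of re-checking those digests against an installed copy (the site-packages path below is a placeholder for your environment):

    # Verify installed files against the RECORD digests shown above.
    import base64
    import csv
    import hashlib
    import pathlib

    site_packages = pathlib.Path("venv/lib/python3.11/site-packages")  # placeholder
    record = site_packages / "datamule-1.2.2.dist-info" / "RECORD"

    with record.open(newline="") as f:
        for path, digest, size in csv.reader(f):
            if not digest:  # RECORD lists itself with an empty hash and size
                continue
            algo, _, expected = digest.partition("=")
            data = (site_packages / path).read_bytes()
            actual = base64.urlsafe_b64encode(hashlib.new(algo, data).digest())
            assert actual.rstrip(b"=").decode() == expected, f"hash mismatch: {path}"
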
datamule-1.1.8.dist-info/WHEEL → datamule-1.2.2.dist-info/WHEEL
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.45.1)
+Generator: bdist_wheel (0.42.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
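
Note that the Generator line moved to an older bdist_wheel (0.45.1 to 0.42.0), so 1.2.2 was built with a downgraded toolchain; the purelib flag and the py3-none-any tag are unchanged. Since a wheel is a zip archive, these fields can be read directly (a sketch; the local wheel filename is an assumption):

    # Inspect the WHEEL metadata inside a downloaded wheel archive.
    import zipfile

    with zipfile.ZipFile("datamule-1.2.2-py3-none-any.whl") as whl:  # assumed filename
        print(whl.read("datamule-1.2.2.dist-info/WHEEL").decode())
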
datamule/document.py DELETED
@@ -1,472 +0,0 @@
-import json
-import csv
-import re
-from doc2dict import xml2dict, txt2dict, dict2dict
-from doc2dict.mapping import flatten_hierarchy
-from .mapping_dicts.txt_mapping_dicts import dict_10k, dict_10q, dict_8k, dict_13d, dict_13g
-from .mapping_dicts.xml_mapping_dicts import dict_345
-from selectolax.parser import HTMLParser
-
-class Document:
-    def __init__(self, type, content, extension):
-
-        self.type = type
-        extension = extension.lower()
-        self.content = content
-        if extension == '.txt':
-            self.content = self._preprocess_txt_content()
-        elif extension in ['.htm', '.html']:
-            self.content = self._preprocess_html_content()
-
-        self.extension = extension
-        # this will be filled by parsed
-        self.data = None
-
-    #_load_text_content
-    def _preprocess_txt_content(self):
-        return self.content.read().translate(str.maketrans({
-            '\xa0': ' ', '\u2003': ' ',
-            '\u2018': "'", '\u2019': "'",
-            '\u201c': '"', '\u201d': '"'
-        }))
-
-    # will deprecate this when we add html2dict
-    def _preprocess_html_content(self):
-        parser = HTMLParser(self.content,detect_encoding=True,decode_errors='ignore')
-
-        # Remove hidden elements first
-        hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
-        for node in hidden_nodes:
-            node.decompose()
-
-        blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
-        lines = []
-        current_line = []
-
-        def flush_line():
-            if current_line:
-                # Don't add spaces between adjacent spans
-                lines.append(''.join(current_line))
-                current_line.clear()
-
-        for node in parser.root.traverse(include_text=True):
-            if node.tag in ('script', 'style', 'css'):
-                continue
-
-            if node.tag in blocks:
-                flush_line()
-                lines.append('')
-
-            if node.text_content:
-                text = node.text_content.strip()
-                if text:
-                    if node.tag in blocks:
-                        flush_line()
-                        lines.append(text)
-                        lines.append('')
-                    else:
-                        # Only add space if nodes aren't directly adjacent
-                        if current_line and not current_line[-1].endswith(' '):
-                            if node.prev and node.prev.text_content:
-                                if node.parent != node.prev.parent or node.prev.next != node:
-                                    current_line.append(' ')
-                        current_line.append(text)
-
-        flush_line()
-
-        text = '\n'.join(lines)
-        while '\n\n\n' in text:
-            text = text.replace('\n\n\n', '\n\n')
-
-        return text.translate(str.maketrans({
-            '\xa0': ' ', '\u2003': ' ',
-            '\u2018': "'", '\u2019': "'",
-            '\u201c': '"', '\u201d': '"'
-        }))
-
-    def contains_string(self, pattern):
-        """Works for select files"""
-        if self.extension in ['.htm', '.html', '.txt','.xml']:
-            return bool(re.search(pattern, self.content))
-        return False
-
-    # Note: this method will be heavily modified in the future
-    def parse(self):
-        # check if we have already parsed the content
-        if self.data:
-            return self.data
-        mapping_dict = None
-
-        if self.extension == '.xml':
-            if self.type in ['3', '4', '5']:
-                mapping_dict = dict_345
-
-            self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
-
-        # will deprecate this when we add html2dict
-        elif self.extension in ['.htm', '.html','.txt']:
-
-            if self.type == '10-K':
-                mapping_dict = dict_10k
-            elif self.type == '10-Q':
-                mapping_dict = dict_10q
-            elif self.type == '8-K':
-                mapping_dict = dict_8k
-            elif self.type == 'SC 13D':
-                mapping_dict = dict_13d
-            elif self.type == 'SC 13G':
-                mapping_dict = dict_13g
-
-            self.data = {}
-            self.data['document'] = dict2dict(txt2dict(content=self.content, mapping_dict=mapping_dict))
-        return self.data
-
-    def write_json(self, output_filename=None):
-        if not self.data:
-            self.parse()
-
-        with open(output_filename, 'w',encoding='utf-8') as f:
-            json.dump(self.data, f, indent=2)
-
-    def to_tabular(self, accession_number=None):
-        self.parse()
-
-        if self.type == "INFORMATION TABLE":
-            info_table = self.data['informationTable']['infoTable']
-            if isinstance(info_table, dict):
-                info_table = [info_table]
-
-            flattened = self._flatten_dict(info_table)
-
-            # Original field names
-            original_columns = [
-                "nameOfIssuer", "titleOfClass", "cusip", "value",
-                "shrsOrPrnAmt_sshPrnamt", "shrsOrPrnAmt_sshPrnamtType",
-                "investmentDiscretion", "votingAuthority_Sole",
-                "votingAuthority_Shared", "votingAuthority_None",
-                "reportingOwnerCIK", "putCall", "otherManager", 'figi'
-            ]
-
-            # Define mapping from original to camelCase field names
-            field_mapping = {
-                "shrsOrPrnAmt_sshPrnamt": "sshPrnamt",
-                "shrsOrPrnAmt_sshPrnamtType": "sshPrnamtType",
-                "votingAuthority_Sole": "votingAuthoritySole",
-                "votingAuthority_Shared": "votingAuthorityShared",
-                "votingAuthority_None": "votingAuthorityNone"
-            }
-
-            # Create the new expected columns list with mapped field names
-            expected_columns = []
-            for column in original_columns:
-                if column in field_mapping:
-                    expected_columns.append(field_mapping[column])
-                else:
-                    expected_columns.append(column)
-
-            # Process each item in the flattened data
-            for item in flattened:
-                # Remove newlines from items
-                for key in item:
-                    if isinstance(item[key], str):
-                        item[key] = re.sub(r'\s+', ' ', item[key])
-
-                new_item = {}
-                for key, value in item.items():
-                    # Apply the mapping if the key is in our mapping dictionary
-                    if key in field_mapping:
-                        new_item[field_mapping[key]] = value
-                    else:
-                        new_item[key] = value
-
-                # Update the original item with the new keys
-                item.clear()
-                item.update(new_item)
-
-                # Ensure all expected columns exist
-                for column in expected_columns:
-                    if column not in item:
-                        item[column] = None
-
-                item['accession'] = accession_number
-
-            # Add this block to reorder the items to match the expected order
-            ordered_columns = ["nameOfIssuer", "titleOfClass", "cusip", "value", "sshPrnamt", "sshPrnamtType",
-                               "investmentDiscretion", "votingAuthoritySole", "votingAuthorityShared", "votingAuthorityNone",
-                               "reportingOwnerCIK", "putCall", "otherManager", "figi"]
-            if accession_number is not None:
-                ordered_columns.append("accession")
-
-            ordered_data = []
-            for item in flattened:
-                ordered_item = {column: item.get(column, None) for column in ordered_columns}
-                ordered_data.append(ordered_item)
-
-            return ordered_data
-
-        elif self.type in ["3", "4", "5"]:
-            # Master mapping dictionary - includes all possible fields
-            # The order of this dictionary will determine the output column order
-            master_mapping_dict = {
-                # Flag fields (will be set programmatically)
-                "isDerivative": "isDerivative",
-                "isNonDerivative": "isNonDerivative",
-
-                # Common fields across all types
-                "securityTitle_value": "securityTitle",
-                "transactionDate_value": "transactionDate",
-                "documentType": "documentType",
-                "transactionCoding_transactionFormType": "documentType",
-                "transactionCoding_transactionCode": "transactionCode",
-                "transactionAmounts_transactionAcquiredDisposedCode_value": "transactionCode",
-                "transactionCoding_equitySwapInvolved": "equitySwapInvolved",
-                "transactionTimeliness_value": "transactionTimeliness",
-                "transactionAmounts_transactionShares_value": "transactionShares",
-                "transactionAmounts_transactionPricePerShare_value": "transactionPricePerShare",
-                "postTransactionAmounts_sharesOwnedFollowingTransaction_value": "sharesOwnedFollowingTransaction",
-                "heldFollowingReport": "sharesOwnedFollowingTransaction", # Form 3
-                "ownershipNature_directOrIndirectOwnership_value": "ownershipType",
-                "ownershipNature_natureOfOwnership_value": "ownershipType",
-                "deemedExecutionDate": "deemedExecutionDate",
-                "deemedExecutionDate_value": "deemedExecutionDate",
-
-                # Derivative-specific fields
-                "conversionOrExercisePrice_value": "conversionOrExercisePrice",
-                "exerciseDate_value": "exerciseDate",
-                "expirationDate_value": "expirationDate",
-                "underlyingSecurity_underlyingSecurityTitle_value": "underlyingSecurityTitle",
-                "underlyingSecurity_underlyingSecurityShares_value": "underlyingSecurityShares",
-                "underlyingSecurity_underlyingSecurityValue_value": "underlyingSecurityValue",
-
-                # Footnote fields
-                "transactionPricePerShareFootnote": "transactionPricePerShareFootnote",
-                "transactionAmounts_transactionPricePerShare_footnote": "transactionPricePerShareFootnote",
-                "transactionCodeFootnote": "transactionCodeFootnote",
-                "transactionAmounts_transactionAcquiredDisposedCode_footnote": "transactionCodeFootnote",
-                "transactionCoding_footnote": "transactionCodeFootnote",
-                "natureOfOwnershipFootnote": "natureOfOwnershipFootnote",
-                "ownershipNature_natureOfOwnership_footnote": "natureOfOwnershipFootnote",
-                "sharesOwnedFollowingTransactionFootnote": "sharesOwnedFollowingTransactionFootnote",
-                "postTransactionAmounts_sharesOwnedFollowingTransaction_footnote": "sharesOwnedFollowingTransactionFootnote",
-                "ownershipTypeFootnote": "ownershipTypeFootnote",
-                "ownershipNature_directOrIndirectOwnership_footnote": "ownershipTypeFootnote",
-                "securityTitleFootnote": "securityTitleFootnote",
-                "securityTitle_footnote": "securityTitleFootnote",
-                "transactionSharesFootnote": "transactionSharesFootnote",
-                "transactionAmounts_transactionShares_footnote": "transactionSharesFootnote",
-                "transactionDateFootnote": "transactionDateFootnote",
-                "transactionDate_footnote": "transactionDateFootnote",
-                "conversionOrExercisePriceFootnote": "conversionOrExercisePriceFootnote",
-                "conversionOrExercisePrice_footnote": "conversionOrExercisePriceFootnote",
-                "exerciseDateFootnote": "exerciseDateFootnote",
-                "exerciseDate_footnote": "exerciseDateFootnote",
-                "expirationDateFootnote": "expirationDateFootnote",
-                "expirationDate_footnote": "expirationDateFootnote",
-                "underlyingSecurityTitleFootnote": "underlyingSecurityTitleFootnote",
-                "underlyingSecurity_underlyingSecurityTitle_footnote": "underlyingSecurityTitleFootnote",
-                "underlyingSecuritySharesFootnote": "underlyingSecuritySharesFootnote",
-                "underlyingSecurity_underlyingSecurityShares_footnote": "underlyingSecuritySharesFootnote",
-                "underlyingSecurityValueFootnote": "underlyingSecurityValueFootnote",
-                "underlyingSecurity_underlyingSecurityValue_footnote": "underlyingSecurityValueFootnote"
-            }
-
-            # Get the unique target column names in order from the mapping dictionary
-            output_columns = []
-            for _, target_key in master_mapping_dict.items():
-                if target_key not in output_columns:
-                    output_columns.append(target_key)
-
-            # Process function that handles any table type
-            def process_table(table_data, is_derivative):
-                if isinstance(table_data, dict):
-                    table_data = [table_data]
-
-                flattened = self._flatten_dict(table_data)
-
-                # Apply mapping to the flattened data and ensure all expected columns are present
-                mapped_data = []
-                for item in flattened:
-                    mapped_item = {}
-                    # First, apply the mapping
-                    for old_key, value in item.items():
-                        target_key = master_mapping_dict.get(old_key, old_key)
-                        mapped_item[target_key] = value
-
-                    # Set the derivative/non-derivative flags
-                    mapped_item["isDerivative"] = 1 if is_derivative else 0
-                    mapped_item["isNonDerivative"] = 0 if is_derivative else 1
-
-                    # Create a new ordered dictionary with all columns
-                    ordered_item = {}
-                    for column in output_columns:
-                        ordered_item[column] = mapped_item.get(column, None)
-
-                    # Add accession_number if available
-                    if accession_number is not None:
-                        ordered_item['accession_number'] = accession_number
-
-                    mapped_data.append(ordered_item)
-
-                return mapped_data
-
-            # Results container
-            all_results = []
-
-            # Process non-derivative transactions if they exist
-            if 'nonDerivativeTable' in self.data['ownershipDocument'] and self.data['ownershipDocument']['nonDerivativeTable'] is not None:
-                if 'nonDerivativeTransaction' in self.data['ownershipDocument']['nonDerivativeTable']:
-                    non_deriv_trans = self.data['ownershipDocument']['nonDerivativeTable']['nonDerivativeTransaction']
-                    non_deriv_results = process_table(non_deriv_trans, is_derivative=False)
-                    all_results.extend(non_deriv_results)
-
-                # Process non-derivative holdings (for Form 3)
-                if 'nonDerivativeHolding' in self.data['ownershipDocument']['nonDerivativeTable']:
-                    non_deriv_hold = self.data['ownershipDocument']['nonDerivativeTable']['nonDerivativeHolding']
-                    non_deriv_hold_results = process_table(non_deriv_hold, is_derivative=False)
-                    all_results.extend(non_deriv_hold_results)
-
-            # Process derivative transactions if they exist
-            if 'derivativeTable' in self.data['ownershipDocument'] and self.data['ownershipDocument']['derivativeTable'] is not None:
-                if 'derivativeTransaction' in self.data['ownershipDocument']['derivativeTable']:
-                    deriv_trans = self.data['ownershipDocument']['derivativeTable']['derivativeTransaction']
-                    deriv_results = process_table(deriv_trans, is_derivative=True)
-                    all_results.extend(deriv_results)
-
-                # Process derivative holdings (for Form 3)
-                if 'derivativeHolding' in self.data['ownershipDocument']['derivativeTable']:
-                    deriv_hold = self.data['ownershipDocument']['derivativeTable']['derivativeHolding']
-                    deriv_hold_results = process_table(deriv_hold, is_derivative=True)
-                    all_results.extend(deriv_hold_results)
-
-            # check if any rows not in the mapping dict, raise error if so
-            for item in all_results:
-                for key in item.keys():
-                    if key not in master_mapping_dict.values() and key != 'accession_number':
-                        raise ValueError(f"Key '{key}' not found in mapping dictionary")
-
-
-            return all_results
-        else:
-            raise ValueError("sorry, rejigging conversion to tabular format")
-
-    def write_csv(self, output_filename, accession_number=None):
-
-        data = self.to_tabular(accession_number)
-
-        if not data:
-
-            return
-
-        fieldnames = data[0].keys()
-
-        with open(output_filename, 'w', newline='') as csvfile:
-            writer = csv.DictWriter(csvfile,fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
-            writer.writeheader()
-            writer.writerows(data)
-
-
-    def _document_to_section_text(self, document_data, parent_key=''):
-        items = []
-
-        if isinstance(document_data, dict):
-            for key, value in document_data.items():
-                # Build the section name
-                section = f"{parent_key}_{key}" if parent_key else key
-
-                # If the value is a dict, recurse
-                if isinstance(value, dict):
-                    items.extend(self._document_to_section_text(value, section))
-                # If it's a list, handle each item
-                elif isinstance(value, list):
-                    for i, item in enumerate(value):
-                        if isinstance(item, dict):
-                            items.extend(self._document_to_section_text(item, f"{section}_{i+1}"))
-                        else:
-                            items.append({
-                                'section': f"{section}_{i+1}",
-                                'text': str(item)
-                            })
-                # Base case - add the item
-                else:
-                    items.append({
-                        'section': section,
-                        'text': str(value)
-                    })
-
-        return items
-
-    # we'll modify this for every dict
-    def _flatten_dict(self, d, parent_key=''):
-        items = {}
-
-        if isinstance(d, list):
-            return [self._flatten_dict(item) for item in d]
-
-        for k, v in d.items():
-            new_key = f"{parent_key}_{k}" if parent_key else k
-
-            if isinstance(v, dict):
-                items.update(self._flatten_dict(v, new_key))
-            else:
-                items[new_key] = str(v)
-
-        return items
-
-    # this will all have to be changed. default will be to flatten everything
-    def __iter__(self):
-        self.parse()
-
-        # Let's remove XML iterable for now
-
-        # Handle text-based documents
-        if self.extension in ['.txt', '.htm', '.html']:
-            document_data = self.data
-            if not document_data:
-                return iter([])
-
-            # Find highest hierarchy level from mapping dict
-            highest_hierarchy = float('inf')
-            section_type = None
-
-            if self.type in ['10-K', '10-Q']:
-                mapping_dict = dict_10k if self.type == '10-K' else dict_10q
-            elif self.type == '8-K':
-                mapping_dict = dict_8k
-            elif self.type == 'SC 13D':
-                mapping_dict = dict_13d
-            elif self.type == 'SC 13G':
-                mapping_dict = dict_13g
-            else:
-                return iter([])
-
-            # Find section type with highest hierarchy number
-            highest_hierarchy = -1 # Start at -1 to find highest
-            for mapping in mapping_dict['rules']['mappings']:
-                if mapping.get('hierarchy') is not None:
-                    if mapping['hierarchy'] > highest_hierarchy:
-                        highest_hierarchy = mapping['hierarchy']
-                        section_type = mapping['name']
-
-            if not section_type:
-                return iter([])
-
-            # Extract sections of the identified type
-            def find_sections(data, target_type):
-                sections = []
-                if isinstance(data, dict):
-                    if data.get('type') == target_type:
-                        sections.append({
-                            'item': data.get('text', ''),
-                            'text': flatten_hierarchy(data.get('content', []))
-                        })
-                    for value in data.values():
-                        if isinstance(value, (dict, list)):
-                            sections.extend(find_sections(value, target_type))
-                elif isinstance(data, list):
-                    for item in data:
-                        sections.extend(find_sections(item, target_type))
-                return sections
-
-            return iter(find_sections(document_data, section_type))
-
-        return iter([])
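
For orientation, the class removed above exposed parse(), write_json(), to_tabular(), write_csv(), contains_string(), and section iteration; per the RECORD diff, its responsibilities now live in the datamule/document/ subpackage. A hypothetical driver for the old interface (constructor arguments and method names are taken from the deleted source; the Form 4 file path and accession number are made up for illustration):

    # Hypothetical usage of the deleted Document class. The path and
    # accession number below are illustrative only, not real data.
    raw_xml = open("form4_example.xml", encoding="utf-8").read()

    doc = Document(type="4", content=raw_xml, extension=".xml")
    rows = doc.to_tabular(accession_number="0000000000-00-000000")  # calls parse() itself
    doc.write_csv("form4_example.csv", accession_number="0000000000-00-000000")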