datamule 1.1.6__py3-none-any.whl → 1.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamule/document.py CHANGED
@@ -8,31 +8,32 @@ from .mapping_dicts.xml_mapping_dicts import dict_345
 from selectolax.parser import HTMLParser
 
 class Document:
-    def __init__(self, type, filename):
+    def __init__(self, type, content, extension):
+
         self.type = type
-        self.path = filename
+        extension = extension.lower()
+        self.content = content
+        if extension == '.txt':
+            self.content = self._preprocess_txt_content()
+        elif extension in ['.htm', '.html']:
+            self.content = self._preprocess_html_content()
 
+        self.extension = extension
+        # this will be filled by parse()
         self.data = None
-        self.content = None
-
 
-    def load_content(self, encoding='utf-8'):
-        with open(self.path, 'r', encoding=encoding) as f:
-            self.content = f.read()
-
-    def _load_text_content(self):
-        with open(self.path) as f:
-            return f.read().translate(str.maketrans({
+    # formerly _load_text_content
+    def _preprocess_txt_content(self):
+        return self.content.read().translate(str.maketrans({
             '\xa0': ' ', '\u2003': ' ',
             '\u2018': "'", '\u2019': "'",
             '\u201c': '"', '\u201d': '"'
         }))
 
     # will deprecate this when we add html2dict
-    def _load_html_content(self):
-        with open(self.path, 'rb') as f:
-            parser = HTMLParser(f.read(), detect_encoding=True, decode_errors='ignore')
-
+    def _preprocess_html_content(self):
+        parser = HTMLParser(self.content, detect_encoding=True, decode_errors='ignore')
+
         # Remove hidden elements first
         hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
         for node in hidden_nodes:
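
The central API change in 1.1.8: a `Document` is now built from in-memory content plus an extension hint instead of a file path, with `.txt` and `.htm`/`.html` content normalized up front. A minimal usage sketch of the new constructor, assuming content is read elsewhere (the file name and form type below are illustrative, not taken from the package):

    # Hypothetical: construct a Document from raw bytes read separately.
    with open('primary_doc.xml', 'rb') as f:
        raw = f.read()

    doc = Document(type='4', content=raw, extension='.XML')  # extension is lower-cased internally
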
@@ -83,36 +84,27 @@ class Document:
             '\u201c': '"', '\u201d': '"'
         }))
 
-    def _load_file_content(self):
-        if self.path.suffix == '.txt':
-            self.content = self._load_text_content()
-        elif self.path.suffix in ['.html', '.htm']:
-            self.content = self._load_html_content()
-        else:
-            raise ValueError(f"Unsupported file type: {self.path.suffix}")
-
-
     def contains_string(self, pattern):
-        """Currently only works for .htm, .html, and .txt files"""
-        if self.path.suffix in ['.htm', '.html', '.txt']:
-            if self.content is None:
-                self.content = self._load_file_content(self.path)
+        """Works for select files"""
+        if self.extension in ['.htm', '.html', '.txt', '.xml']:
             return bool(re.search(pattern, self.content))
         return False
 
     # Note: this method will be heavily modified in the future
     def parse(self):
+        # check if we have already parsed the content
+        if self.data:
+            return self.data
         mapping_dict = None
 
-        if self.path.suffix == '.xml':
+        if self.extension == '.xml':
             if self.type in ['3', '4', '5']:
                 mapping_dict = dict_345
 
-            self.load_content()
             self.data = xml2dict(content=self.content, mapping_dict=mapping_dict)
+
         # will deprecate this when we add html2dict
-        elif self.path.suffix in ['.htm', '.html', '.txt']:
-            self._load_file_content()
+        elif self.extension in ['.htm', '.html', '.txt']:
 
             if self.type == '10-K':
                 mapping_dict = dict_10k
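
`parse()` is now memoized: once `self.data` is populated, later calls return it unchanged, and `contains_string` searches the preloaded content directly, with `.xml` newly supported. A sketch, reusing the hypothetical `doc` from above:

    doc.parse()   # parses the XML via xml2dict and caches the result in doc.data
    doc.parse()   # short-circuits and returns the cached doc.data immediately

    # contains_string now runs against in-memory content; match the pattern
    # type (str vs. bytes) to whatever was passed in as content.
    found = doc.contains_string('ownershipDocument')
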
@@ -133,43 +125,246 @@ class Document:
         if not self.data:
             self.parse()
 
-        if output_filename is None:
-            output_filename = f"{self.path.rsplit('.', 1)[0]}.json"
-
         with open(output_filename, 'w', encoding='utf-8') as f:
             json.dump(self.data, f, indent=2)
 
-    def write_csv(self, output_filename=None, accession_number=None):
+    def to_tabular(self, accession_number=None):
         self.parse()
 
-        if output_filename is None:
-            output_filename = f"{self.path.rsplit('.', 1)[0]}.csv"
+        if self.type == "INFORMATION TABLE":
+            info_table = self.data['informationTable']['infoTable']
+            if isinstance(info_table, dict):
+                info_table = [info_table]
 
-        with open(output_filename, 'w', newline='') as csvfile:
-            if not self.data:
-                return output_filename
+            flattened = self._flatten_dict(info_table)
 
-            has_document = any('document' in item for item in self.data)
+            # Original field names
+            original_columns = [
+                "nameOfIssuer", "titleOfClass", "cusip", "value",
+                "shrsOrPrnAmt_sshPrnamt", "shrsOrPrnAmt_sshPrnamtType",
+                "investmentDiscretion", "votingAuthority_Sole",
+                "votingAuthority_Shared", "votingAuthority_None",
+                "reportingOwnerCIK", "putCall", "otherManager", 'figi'
+            ]
+
+            # Define mapping from original to camelCase field names
+            field_mapping = {
+                "shrsOrPrnAmt_sshPrnamt": "sshPrnamt",
+                "shrsOrPrnAmt_sshPrnamtType": "sshPrnamtType",
+                "votingAuthority_Sole": "votingAuthoritySole",
+                "votingAuthority_Shared": "votingAuthorityShared",
+                "votingAuthority_None": "votingAuthorityNone"
+            }
+
+            # Create the new expected columns list with mapped field names
+            expected_columns = []
+            for column in original_columns:
+                if column in field_mapping:
+                    expected_columns.append(field_mapping[column])
+                else:
+                    expected_columns.append(column)
+
+            # Process each item in the flattened data
+            for item in flattened:
+                # Collapse runs of whitespace (including newlines) to single spaces
+                for key in item:
+                    if isinstance(item[key], str):
+                        item[key] = re.sub(r'\s+', ' ', item[key])
+
+                new_item = {}
+                for key, value in item.items():
+                    # Apply the mapping if the key is in our mapping dictionary
+                    if key in field_mapping:
+                        new_item[field_mapping[key]] = value
+                    else:
+                        new_item[key] = value
+
+                # Update the original item with the new keys
+                item.clear()
+                item.update(new_item)
+
+                # Ensure all expected columns exist
+                for column in expected_columns:
+                    if column not in item:
+                        item[column] = None
+
+                item['accession'] = accession_number
 
-            if has_document and 'document' in self.data:
-                writer = csv.DictWriter(csvfile, ['section', 'text'], quoting=csv.QUOTE_ALL)
-                writer.writeheader()
-                flattened = self._flatten_dict(self.data['document'])
-                for section, text in flattened.items():
-                    writer.writerow({'section': section, 'text': text})
-            else:
-                fieldnames = list(self.data[0].keys())
-                if accession_number:
-                    fieldnames.append('Accession Number')
-                writer = csv.DictWriter(csvfile, fieldnames, quoting=csv.QUOTE_ALL)
-                writer.writeheader()
-                for row in self.data:
-                    if accession_number:
-                        row['Accession Number'] = convert_to_dashed_accession(accession_number)
-                    writer.writerow(row)
+            # Reorder the items to match the expected column order
+            ordered_columns = ["nameOfIssuer", "titleOfClass", "cusip", "value", "sshPrnamt", "sshPrnamtType",
+                               "investmentDiscretion", "votingAuthoritySole", "votingAuthorityShared", "votingAuthorityNone",
+                               "reportingOwnerCIK", "putCall", "otherManager", "figi"]
+            if accession_number is not None:
+                ordered_columns.append("accession")
+
+            ordered_data = []
+            for item in flattened:
+                ordered_item = {column: item.get(column, None) for column in ordered_columns}
+                ordered_data.append(ordered_item)
+
+            return ordered_data
+
+        elif self.type in ["3", "4", "5"]:
+            # Master mapping dictionary - includes all possible fields.
+            # The order of this dictionary determines the output column order.
+            master_mapping_dict = {
+                # Flag fields (will be set programmatically)
+                "isDerivative": "isDerivative",
+                "isNonDerivative": "isNonDerivative",
+
+                # Common fields across all types
+                "securityTitle_value": "securityTitle",
+                "transactionDate_value": "transactionDate",
+                "documentType": "documentType",
+                "transactionCoding_transactionFormType": "documentType",
+                "transactionCoding_transactionCode": "transactionCode",
+                "transactionAmounts_transactionAcquiredDisposedCode_value": "transactionCode",
+                "transactionCoding_equitySwapInvolved": "equitySwapInvolved",
+                "transactionTimeliness_value": "transactionTimeliness",
+                "transactionAmounts_transactionShares_value": "transactionShares",
+                "transactionAmounts_transactionPricePerShare_value": "transactionPricePerShare",
+                "postTransactionAmounts_sharesOwnedFollowingTransaction_value": "sharesOwnedFollowingTransaction",
+                "heldFollowingReport": "sharesOwnedFollowingTransaction",  # Form 3
+                "ownershipNature_directOrIndirectOwnership_value": "ownershipType",
+                "ownershipNature_natureOfOwnership_value": "ownershipType",
+                "deemedExecutionDate": "deemedExecutionDate",
+                "deemedExecutionDate_value": "deemedExecutionDate",
+
+                # Derivative-specific fields
+                "conversionOrExercisePrice_value": "conversionOrExercisePrice",
+                "exerciseDate_value": "exerciseDate",
+                "expirationDate_value": "expirationDate",
+                "underlyingSecurity_underlyingSecurityTitle_value": "underlyingSecurityTitle",
+                "underlyingSecurity_underlyingSecurityShares_value": "underlyingSecurityShares",
+                "underlyingSecurity_underlyingSecurityValue_value": "underlyingSecurityValue",
+
+                # Footnote fields
+                "transactionPricePerShareFootnote": "transactionPricePerShareFootnote",
+                "transactionAmounts_transactionPricePerShare_footnote": "transactionPricePerShareFootnote",
+                "transactionCodeFootnote": "transactionCodeFootnote",
+                "transactionAmounts_transactionAcquiredDisposedCode_footnote": "transactionCodeFootnote",
+                "transactionCoding_footnote": "transactionCodeFootnote",
+                "natureOfOwnershipFootnote": "natureOfOwnershipFootnote",
+                "ownershipNature_natureOfOwnership_footnote": "natureOfOwnershipFootnote",
+                "sharesOwnedFollowingTransactionFootnote": "sharesOwnedFollowingTransactionFootnote",
+                "postTransactionAmounts_sharesOwnedFollowingTransaction_footnote": "sharesOwnedFollowingTransactionFootnote",
+                "ownershipTypeFootnote": "ownershipTypeFootnote",
+                "ownershipNature_directOrIndirectOwnership_footnote": "ownershipTypeFootnote",
+                "securityTitleFootnote": "securityTitleFootnote",
+                "securityTitle_footnote": "securityTitleFootnote",
+                "transactionSharesFootnote": "transactionSharesFootnote",
+                "transactionAmounts_transactionShares_footnote": "transactionSharesFootnote",
+                "transactionDateFootnote": "transactionDateFootnote",
+                "transactionDate_footnote": "transactionDateFootnote",
+                "conversionOrExercisePriceFootnote": "conversionOrExercisePriceFootnote",
+                "conversionOrExercisePrice_footnote": "conversionOrExercisePriceFootnote",
+                "exerciseDateFootnote": "exerciseDateFootnote",
+                "exerciseDate_footnote": "exerciseDateFootnote",
+                "expirationDateFootnote": "expirationDateFootnote",
+                "expirationDate_footnote": "expirationDateFootnote",
+                "underlyingSecurityTitleFootnote": "underlyingSecurityTitleFootnote",
+                "underlyingSecurity_underlyingSecurityTitle_footnote": "underlyingSecurityTitleFootnote",
+                "underlyingSecuritySharesFootnote": "underlyingSecuritySharesFootnote",
+                "underlyingSecurity_underlyingSecurityShares_footnote": "underlyingSecuritySharesFootnote",
+                "underlyingSecurityValueFootnote": "underlyingSecurityValueFootnote",
+                "underlyingSecurity_underlyingSecurityValue_footnote": "underlyingSecurityValueFootnote"
+            }
+
+            # Get the unique target column names in order from the mapping dictionary
+            output_columns = []
+            for _, target_key in master_mapping_dict.items():
+                if target_key not in output_columns:
+                    output_columns.append(target_key)
+
+            # Process function that handles any table type
+            def process_table(table_data, is_derivative):
+                if isinstance(table_data, dict):
+                    table_data = [table_data]
+
+                flattened = self._flatten_dict(table_data)
+
+                # Apply mapping to the flattened data and ensure all expected columns are present
+                mapped_data = []
+                for item in flattened:
+                    mapped_item = {}
+                    # First, apply the mapping
+                    for old_key, value in item.items():
+                        target_key = master_mapping_dict.get(old_key, old_key)
+                        mapped_item[target_key] = value
+
+                    # Set the derivative/non-derivative flags
+                    mapped_item["isDerivative"] = 1 if is_derivative else 0
+                    mapped_item["isNonDerivative"] = 0 if is_derivative else 1
+
+                    # Create a new ordered dictionary with all columns
+                    ordered_item = {}
+                    for column in output_columns:
+                        ordered_item[column] = mapped_item.get(column, None)
+
+                    # Add accession_number if available
+                    if accession_number is not None:
+                        ordered_item['accession_number'] = accession_number
+
+                    mapped_data.append(ordered_item)
+
+                return mapped_data
+
+            # Results container
+            all_results = []
+
+            # Process non-derivative transactions if they exist
+            if 'nonDerivativeTable' in self.data['ownershipDocument'] and self.data['ownershipDocument']['nonDerivativeTable'] is not None:
+                if 'nonDerivativeTransaction' in self.data['ownershipDocument']['nonDerivativeTable']:
+                    non_deriv_trans = self.data['ownershipDocument']['nonDerivativeTable']['nonDerivativeTransaction']
+                    non_deriv_results = process_table(non_deriv_trans, is_derivative=False)
+                    all_results.extend(non_deriv_results)
+
+                # Process non-derivative holdings (for Form 3)
+                if 'nonDerivativeHolding' in self.data['ownershipDocument']['nonDerivativeTable']:
+                    non_deriv_hold = self.data['ownershipDocument']['nonDerivativeTable']['nonDerivativeHolding']
+                    non_deriv_hold_results = process_table(non_deriv_hold, is_derivative=False)
+                    all_results.extend(non_deriv_hold_results)
+
+            # Process derivative transactions if they exist
+            if 'derivativeTable' in self.data['ownershipDocument'] and self.data['ownershipDocument']['derivativeTable'] is not None:
+                if 'derivativeTransaction' in self.data['ownershipDocument']['derivativeTable']:
+                    deriv_trans = self.data['ownershipDocument']['derivativeTable']['derivativeTransaction']
+                    deriv_results = process_table(deriv_trans, is_derivative=True)
+                    all_results.extend(deriv_results)
+
+                # Process derivative holdings (for Form 3)
+                if 'derivativeHolding' in self.data['ownershipDocument']['derivativeTable']:
+                    deriv_hold = self.data['ownershipDocument']['derivativeTable']['derivativeHolding']
+                    deriv_hold_results = process_table(deriv_hold, is_derivative=True)
+                    all_results.extend(deriv_hold_results)
 
-        return output_filename
-
+            # Check for any keys not in the mapping dict and raise an error if found
+            for item in all_results:
+                for key in item.keys():
+                    if key not in master_mapping_dict.values() and key != 'accession_number':
+                        raise ValueError(f"Key '{key}' not found in mapping dictionary")
+
+            return all_results
+        else:
+            raise ValueError("sorry, rejigging conversion to tabular format")
+
+    def write_csv(self, output_filename, accession_number=None):
+        data = self.to_tabular(accession_number)
+
+        if not data:
+            return
+
+        fieldnames = data[0].keys()
+
+        with open(output_filename, 'w', newline='') as csvfile:
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
+            writer.writeheader()
+            writer.writerows(data)
+
 
     def _document_to_section_text(self, document_data, parent_key=''):
         items = []
 
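`write_csv` loses its path-derived default filename (there is no path anymore) and now delegates to the new `to_tabular`, which returns a list of row dicts in a fixed, mapping-defined column order for 13F information tables and Forms 3/4/5, and raises for other types. A sketch under those assumptions, continuing with the hypothetical Form 4 `doc` (the accession number is illustrative):

    rows = doc.to_tabular(accession_number='0001234567-25-000001')
    # e.g. rows[0]['transactionCode'] or rows[0]['isDerivative'] for a Form 4

    # An explicit output filename is now required.
    doc.write_csv('form4.csv', accession_number='0001234567-25-000001')
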
@@ -203,7 +398,7 @@ class Document:
     # we'll modify this for every dict
     def _flatten_dict(self, d, parent_key=''):
         items = {}
-
+
         if isinstance(d, list):
             return [self._flatten_dict(item) for item in d]
 
@@ -219,13 +414,12 @@ class Document:
 
     # this will all have to be changed. default will be to flatten everything
     def __iter__(self):
-        if not self.data:
-            self.parse()
+        self.parse()
 
         # Let's remove XML iterable for now
 
         # Handle text-based documents
-        if self.path.suffix in ['.txt', '.htm', '.html']:
+        if self.extension in ['.txt', '.htm', '.html']:
             document_data = self.data
             if not document_data:
                 return iter([])
@@ -235,13 +429,13 @@ class Document:
             section_type = None
 
             if self.type in ['10-K', '10-Q']:
-                mapping_dict = txt_mapping_dicts.dict_10k if self.type == '10-K' else txt_mapping_dicts.dict_10q
+                mapping_dict = dict_10k if self.type == '10-K' else dict_10q
             elif self.type == '8-K':
-                mapping_dict = txt_mapping_dicts.dict_8k
+                mapping_dict = dict_8k
             elif self.type == 'SC 13D':
-                mapping_dict = txt_mapping_dicts.dict_13d
+                mapping_dict = dict_13d
             elif self.type == 'SC 13G':
-                mapping_dict = txt_mapping_dicts.dict_13g
+                mapping_dict = dict_13g
             else:
                 return iter([])
 
datamule/portfolio.py CHANGED
@@ -119,7 +119,7 @@ class Portfolio:
         # First query, just set the accession numbers
         self.accession_numbers = new_accession_numbers
 
-    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None, **kwargs):
+    def download_submissions(self, cik=None, ticker=None, submission_type=None, filing_date=None, provider=None, requests_per_second=5, **kwargs):
         if provider is None:
             config = Config()
             provider = config.get_default_source()
@@ -142,7 +142,7 @@ class Portfolio:
             cik=cik,
             submission_type=submission_type,
             filing_date=filing_date,
-            requests_per_second=5,  # Revisit this later.
+            requests_per_second=requests_per_second,
            accession_numbers=self.accession_numbers if hasattr(self, 'accession_numbers') else None
         )
 
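`Portfolio.download_submissions` now exposes the previously hard-coded rate limit as a parameter. A sketch (the portfolio directory and ticker are illustrative, and the path-based constructor is assumed):

    from datamule import Portfolio

    portfolio = Portfolio('filings')  # assumed path-based constructor
    portfolio.download_submissions(
        ticker='TSLA',
        submission_type='10-K',
        requests_per_second=10,  # was pinned to 5 in 1.1.6
    )
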
@@ -164,8 +164,6 @@ class Portfolio:
         )
 
 
-
-
     def __iter__(self):
         if not self.submissions_loaded:
             self._load_submissions()
@@ -179,4 +177,8 @@ class Portfolio:
             document_types = [document_types]
 
         for submission in self.submissions:
-            yield from submission.document_type(document_types)
+            yield from submission.document_type(document_types)
+
+    def keep(self, document_type):
+        for submission in self.__iter__():
+            submission.keep(document_type)
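
The new `Portfolio.keep` fans a document-type filter out to every submission by delegating to `Submission.keep`, which this diff does not show; presumably it discards non-matching documents. A sketch under that assumption:

    # Keep only 10-K documents in each submission (assumed semantics).
    portfolio.keep('10-K')
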
@@ -36,7 +36,8 @@ async def download_callback(hit, content, cik, accno, url, output_dir="filings")
         print(f"Error processing {accno}: {e}")
         return None
 
-def download(cik=None, submission_type=None, filing_date=None, requests_per_second=5, output_dir="filings", accession_numbers=None):
+def download(cik=None, submission_type=None, filing_date=None, location=None, name=None,
+             requests_per_second=5, output_dir="filings", accession_numbers=None, quiet=False):
     """
     Download SEC EDGAR filings and extract their documents.
 
@@ -44,12 +45,25 @@ def download(cik=None, submission_type=None, filing_date=None, requests_per_second=5, output_dir="filings", accession_numbers=None):
     - cik: CIK number(s) to query for
     - submission_type: Filing type(s) to query for (default: 10-K)
     - filing_date: Date or date range to query for
+    - location: Location code to filter by (e.g., 'CA' for California)
+    - name: Company name to search for (alternative to providing CIK)
     - requests_per_second: Rate limit for SEC requests
     - output_dir: Directory to save documents
     - accession_numbers: Optional list of accession numbers to filter by
+    - quiet: Whether to suppress progress output
 
     Returns:
     - List of all document paths processed
+
+    Examples:
+    # Download filings by CIK
+    download(cik="1318605", submission_type="10-K")
+
+    # Download filings by company name
+    download(name="Tesla", submission_type="10-K")
+
+    # Download filings with location filter
+    download(name="Apple", location="CA", submission_type="10-K")
     """
 
     # Make sure output directory exists
@@ -62,9 +76,12 @@ def download(cik=None, submission_type=None, filing_date=None, requests_per_second=5, output_dir="filings", accession_numbers=None):
     # Call the stream function with our callback
     return stream(
         cik=cik,
+        name=name,
         submission_type=submission_type,
         filing_date=filing_date,
+        location=location,
         requests_per_second=requests_per_second,
         document_callback=callback_wrapper,
-        accession_numbers=accession_numbers
+        accession_numbers=accession_numbers,
+        quiet=quiet
     )
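
Putting the new `download` parameters together, an end-to-end sketch (the company, dates, and the date-range format are illustrative assumptions):

    # Search by company name and state rather than CIK, with output suppressed.
    paths = download(
        name='Apple',
        location='CA',                             # new in 1.1.8
        submission_type='10-K',
        filing_date=('2023-01-01', '2023-12-31'),  # date-range format assumed
        output_dir='filings',
        quiet=True,                                # new in 1.1.8
    )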