datamule 1.1.8__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamule/sheet.py CHANGED
@@ -3,7 +3,7 @@ import csv
3
3
  import os
4
4
  from .helper import _process_cik_and_metadata_filters, load_package_dataset
5
5
  from .sec.xbrl.downloadcompanyfacts import download_company_facts
6
- from .seclibrary.bq import get_information_table
6
+ from .seclibrary.bq import get_information_table, get_345, get_proxy_voting_record
7
7
 
8
8
  class Sheet:
9
9
  def __init__(self, path):
@@ -31,9 +31,6 @@ class Sheet:
31
31
 
32
32
  def get_information_table(
33
33
  self,
34
- # Required parameters
35
- table_type="INFORMATION_TABLE",
36
-
37
34
  # Optional filtering parameters
38
35
  columns=None,
39
36
  name_of_issuer=None,
@@ -65,8 +62,6 @@ class Sheet:
65
62
 
66
63
  Parameters:
67
64
  -----------
68
- table_type : str
69
- The table to query (default is "INFORMATION_TABLE")
70
65
  columns : List[str], optional
71
66
  Specific columns to return. If None, all columns are returned.
72
67
 
@@ -97,7 +92,6 @@ class Sheet:
97
92
  """
98
93
 
99
94
  return get_information_table(
100
- table_type=table_type,
101
95
  columns=columns,
102
96
  name_of_issuer=name_of_issuer,
103
97
  title_of_class=title_of_class,
@@ -124,12 +118,164 @@ class Sheet:
124
118
  verbose=verbose
125
119
  )
126
120
 
121
+ def get_345(
122
+ self,
123
+ # Optional filtering parameters
124
+ columns=None,
125
+ is_derivative=None,
126
+ is_non_derivative=None,
127
+ security_title=None,
128
+ transaction_date=None,
129
+ document_type=None,
130
+ transaction_code=None,
131
+ equity_swap_involved=None,
132
+ transaction_timeliness=None,
133
+ transaction_shares=None,
134
+ transaction_price_per_share=None,
135
+ shares_owned_following_transaction=None,
136
+ ownership_type=None,
137
+ deemed_execution_date=None,
138
+ conversion_or_exercise_price=None,
139
+ exercise_date=None,
140
+ expiration_date=None,
141
+ underlying_security_title=None,
142
+ underlying_security_shares=None,
143
+ underlying_security_value=None,
144
+ accession=None,
145
+ reporting_owner_cik=None,
146
+ issuer_cik=None,
147
+ filing_date=None,
148
+
149
+ # API key handling
150
+ api_key=None,
151
+
152
+ # Additional options
153
+ print_cost=True,
154
+ verbose=False
155
+ ):
156
+ """
157
+ Query the SEC BigQuery API for insider transaction data from SEC Forms 3, 4, and 5.
158
+
159
+ Parameters:
160
+ -----------
161
+ columns : List[str], optional
162
+ Specific columns to return. If None, all columns are returned.
163
+
164
+ # Filter parameters
165
+ is_derivative, security_title, etc. : Various filters that can be:
166
+ - str/bool: Exact match
167
+ - List[str]: Match any in list
168
+ - tuple: (min, max) range for numeric/date fields
169
+
170
+ reporting_owner_cik : str or List[str]
171
+ CIK(s) of the reporting insider(s). This is matched against an array in BigQuery.
172
+ Any match within the array will return the record.
173
+
174
+ issuer_cik : str or List[str]
175
+ CIK(s) of the company/companies
176
+
177
+ api_key : str, optional
178
+ SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
179
+ print_cost : bool
180
+ Whether to print the query cost information
181
+ verbose : bool
182
+ Whether to print additional information about the query
183
+
184
+ Returns:
185
+ --------
186
+ List[Dict]
187
+ A list of dictionaries containing the query results
188
+
189
+ Raises:
190
+ -------
191
+ ValueError
192
+ If API key is missing or invalid
193
+ Exception
194
+ For API errors or other issues
195
+ """
196
+
197
+ return get_345(
198
+ columns=columns,
199
+ is_derivative=is_derivative,
200
+ is_non_derivative=is_non_derivative,
201
+ security_title=security_title,
202
+ transaction_date=transaction_date,
203
+ document_type=document_type,
204
+ transaction_code=transaction_code,
205
+ equity_swap_involved=equity_swap_involved,
206
+ transaction_timeliness=transaction_timeliness,
207
+ transaction_shares=transaction_shares,
208
+ transaction_price_per_share=transaction_price_per_share,
209
+ shares_owned_following_transaction=shares_owned_following_transaction,
210
+ ownership_type=ownership_type,
211
+ deemed_execution_date=deemed_execution_date,
212
+ conversion_or_exercise_price=conversion_or_exercise_price,
213
+ exercise_date=exercise_date,
214
+ expiration_date=expiration_date,
215
+ underlying_security_title=underlying_security_title,
216
+ underlying_security_shares=underlying_security_shares,
217
+ underlying_security_value=underlying_security_value,
218
+ accession=accession,
219
+ reporting_owner_cik=reporting_owner_cik,
220
+ issuer_cik=issuer_cik,
221
+ filing_date=filing_date,
222
+
223
+ # API key handling
224
+ api_key=api_key,
225
+
226
+ # Additional options
227
+ print_cost=print_cost,
228
+ verbose=verbose
229
+ )
230
+
231
+ def _download_to_csv(self, data, filepath, verbose=False):
232
+ """
233
+ Helper method to write already-fetched data to a CSV file.
234
+
235
+ Parameters:
236
+ -----------
237
+ data : List[Dict]
238
+ The data to save
239
+ filepath : str or Path
240
+ Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
241
+ verbose : bool
242
+ Whether to print additional information
243
+
244
+ Returns:
245
+ --------
246
+ List[Dict]
247
+ The input data (for method chaining)
248
+ """
249
+ # If no data returned, nothing to save
250
+ if not data:
251
+ if verbose:
252
+ print("No data returned from API. No file was created.")
253
+ return data
254
+
255
+ # Resolve filepath - if it's not absolute, make it relative to self.path
256
+ filepath_obj = Path(filepath)
257
+ if not filepath_obj.is_absolute():
258
+ filepath_obj = self.path / filepath_obj
259
+
260
+ # Create directory if it doesn't exist
261
+ os.makedirs(filepath_obj.parent, exist_ok=True)
262
+
263
+ # Get fieldnames from the first record
264
+ fieldnames = data[0].keys()
265
+
266
+ # Write to CSV
267
+ with open(filepath_obj, 'w', newline='') as csvfile:
268
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
269
+ writer.writeheader()
270
+ writer.writerows(data)
271
+
272
+ if verbose:
273
+ print(f"Saved {len(data)} records to {filepath_obj}")
274
+
275
+
127
276
  def download_information_table(
128
277
  self,
129
278
  filepath,
130
- # Required parameters
131
- table_type="INFORMATION_TABLE",
132
-
133
279
  # Optional filtering parameters
134
280
  columns=None,
135
281
  name_of_issuer=None,
@@ -164,8 +310,6 @@ class Sheet:
164
310
  filepath : str
165
311
  Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
166
312
 
167
- table_type : str
168
- The table to query (default is "INFORMATION_TABLE")
169
313
  columns : List[str], optional
170
314
  Specific columns to return. If None, all columns are returned.
171
315
 
@@ -196,7 +340,6 @@ class Sheet:
196
340
  """
197
341
  # Get the data from the API
198
342
  data = self.get_information_table(
199
- table_type=table_type,
200
343
  columns=columns,
201
344
  name_of_issuer=name_of_issuer,
202
345
  title_of_class=title_of_class,
@@ -219,30 +362,311 @@ class Sheet:
219
362
  verbose=verbose
220
363
  )
221
364
 
222
- # If no data returned, nothing to save
223
- if not data:
224
- if verbose:
225
- print("No data returned from API. No file was created.")
226
- return data
365
+ # Save to CSV using the helper method
366
+ return self._download_to_csv(data, filepath, verbose)
367
+
368
+ def download_345(
369
+ self,
370
+ filepath,
371
+ # Optional filtering parameters
372
+ columns=None,
373
+ is_derivative=None,
374
+ is_non_derivative=None,
375
+ security_title=None,
376
+ transaction_date=None,
377
+ document_type=None,
378
+ transaction_code=None,
379
+ equity_swap_involved=None,
380
+ transaction_timeliness=None,
381
+ transaction_shares=None,
382
+ transaction_price_per_share=None,
383
+ shares_owned_following_transaction=None,
384
+ ownership_type=None,
385
+ deemed_execution_date=None,
386
+ conversion_or_exercise_price=None,
387
+ exercise_date=None,
388
+ expiration_date=None,
389
+ underlying_security_title=None,
390
+ underlying_security_shares=None,
391
+ underlying_security_value=None,
392
+ accession=None,
393
+ reporting_owner_cik=None,
394
+ issuer_cik=None,
395
+ filing_date=None,
227
396
 
228
- # Resolve filepath - if it's not absolute, make it relative to self.path
229
- filepath_obj = Path(filepath)
230
- if not filepath_obj.is_absolute():
231
- filepath_obj = self.path / filepath_obj
397
+ # API key handling
398
+ api_key=None,
232
399
 
233
- # Create directory if it doesn't exist
234
- os.makedirs(filepath_obj.parent, exist_ok=True)
400
+ # Additional options
401
+ print_cost=True,
402
+ verbose=False
403
+ ):
404
+ """
405
+ Query the SEC BigQuery API for insider transaction data from SEC Forms 3, 4, and 5 and save it to CSV.
235
406
 
236
- # Get fieldnames from the first record
237
- fieldnames = data[0].keys()
407
+ Parameters:
408
+ -----------
409
+ filepath : str
410
+ Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
238
411
 
239
- # Write to CSV
240
- with open(filepath_obj, 'w', newline='') as csvfile:
241
- writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
242
- writer.writeheader()
243
- writer.writerows(data)
412
+ columns : List[str], optional
413
+ Specific columns to return. If None, all columns are returned.
414
+
415
+ # Filter parameters
416
+ is_derivative, security_title, etc. : Various filters that can be:
417
+ - str/bool: Exact match
418
+ - List[str]: Match any in list
419
+ - tuple: (min, max) range for numeric/date fields
244
420
 
245
- if verbose:
246
- print(f"Saved {len(data)} records to {filepath_obj}")
421
+ reporting_owner_cik : str or List[str]
422
+ CIK(s) of the reporting insider(s). This is matched against an array in BigQuery.
423
+ Any match within the array will return the record.
424
+
425
+ issuer_cik : str or List[str]
426
+ CIK(s) of the company/companies
427
+
428
+ api_key : str, optional
429
+ SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
430
+ print_cost : bool
431
+ Whether to print the query cost information
432
+ verbose : bool
433
+ Whether to print additional information about the query
247
434
 
248
- return data
435
+ Returns:
436
+ --------
437
+ List[Dict]
438
+ A list of dictionaries containing the query results
439
+
440
+ Raises:
441
+ -------
442
+ ValueError
443
+ If API key is missing or invalid
444
+ Exception
445
+ For API errors or other issues
446
+ """
447
+ # Get the data from the API
448
+ data = self.get_345(
449
+ columns=columns,
450
+ is_derivative=is_derivative,
451
+ is_non_derivative=is_non_derivative,
452
+ security_title=security_title,
453
+ transaction_date=transaction_date,
454
+ document_type=document_type,
455
+ transaction_code=transaction_code,
456
+ equity_swap_involved=equity_swap_involved,
457
+ transaction_timeliness=transaction_timeliness,
458
+ transaction_shares=transaction_shares,
459
+ transaction_price_per_share=transaction_price_per_share,
460
+ shares_owned_following_transaction=shares_owned_following_transaction,
461
+ ownership_type=ownership_type,
462
+ deemed_execution_date=deemed_execution_date,
463
+ conversion_or_exercise_price=conversion_or_exercise_price,
464
+ exercise_date=exercise_date,
465
+ expiration_date=expiration_date,
466
+ underlying_security_title=underlying_security_title,
467
+ underlying_security_shares=underlying_security_shares,
468
+ underlying_security_value=underlying_security_value,
469
+ accession=accession,
470
+ reporting_owner_cik=reporting_owner_cik,
471
+ issuer_cik=issuer_cik,
472
+ filing_date=filing_date,
473
+ api_key=api_key,
474
+ print_cost=print_cost,
475
+ verbose=verbose
476
+ )
477
+
478
+ # Save to CSV using the helper method
479
+ return self._download_to_csv(data, filepath, verbose)
480
+
481
+ def get_proxy_voting_record(
482
+ self,
483
+ # Optional filtering parameters
484
+ columns=None,
485
+ meeting_date=None,
486
+ isin=None,
487
+ cusip=None,
488
+ issuer_name=None,
489
+ vote_description=None,
490
+ shares_on_loan=None,
491
+ shares_voted=None,
492
+ vote_category=None,
493
+ vote_record=None,
494
+ vote_source=None,
495
+ how_voted=None,
496
+ figi=None,
497
+ management_recommendation=None,
498
+ accession=None,
499
+ reporting_owner_cik=None,
500
+ filing_date=None,
501
+
502
+ # API key handling
503
+ api_key=None,
504
+
505
+ # Additional options
506
+ print_cost=True,
507
+ verbose=False
508
+ ):
509
+ """
510
+ Query the SEC BigQuery API for NPX proxy voting record data.
511
+
512
+ Parameters:
513
+ -----------
514
+ columns : List[str], optional
515
+ Specific columns to return. If None, all columns are returned.
516
+
517
+ # Filter parameters
518
+ meeting_date, isin, cusip, etc. : Various filters that can be:
519
+ - str: Exact match
520
+ - List[str]: Match any in list
521
+ - tuple: (min, max) range for numeric/date fields
522
+
523
+ shares_on_loan, shares_voted : int/float or tuple
524
+ Numeric values or (min, max) range
525
+
526
+ filing_date : str or tuple
527
+ Date string in 'YYYY-MM-DD' format or (start_date, end_date) tuple
528
+
529
+ api_key : str, optional
530
+ SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
531
+ print_cost : bool
532
+ Whether to print the query cost information
533
+ verbose : bool
534
+ Whether to print additional information about the query
535
+
536
+ Returns:
537
+ --------
538
+ List[Dict]
539
+ A list of dictionaries containing the query results
540
+
541
+ Raises:
542
+ -------
543
+ ValueError
544
+ If API key is missing or invalid
545
+ Exception
546
+ For API errors or other issues
547
+ """
548
+
549
+ return get_proxy_voting_record(
550
+ columns=columns,
551
+ meeting_date=meeting_date,
552
+ isin=isin,
553
+ cusip=cusip,
554
+ issuer_name=issuer_name,
555
+ vote_description=vote_description,
556
+ shares_on_loan=shares_on_loan,
557
+ shares_voted=shares_voted,
558
+ vote_category=vote_category,
559
+ vote_record=vote_record,
560
+ vote_source=vote_source,
561
+ how_voted=how_voted,
562
+ figi=figi,
563
+ management_recommendation=management_recommendation,
564
+ accession=accession,
565
+ reporting_owner_cik=reporting_owner_cik,
566
+ filing_date=filing_date,
567
+
568
+ # API key handling
569
+ api_key=api_key,
570
+
571
+ # Additional options
572
+ print_cost=print_cost,
573
+ verbose=verbose
574
+ )
575
+
576
+ def download_proxy_voting_record(
577
+ self,
578
+ filepath,
579
+ # Optional filtering parameters
580
+ columns=None,
581
+ meeting_date=None,
582
+ isin=None,
583
+ cusip=None,
584
+ issuer_name=None,
585
+ vote_description=None,
586
+ shares_on_loan=None,
587
+ shares_voted=None,
588
+ vote_category=None,
589
+ vote_record=None,
590
+ vote_source=None,
591
+ how_voted=None,
592
+ figi=None,
593
+ management_recommendation=None,
594
+ accession=None,
595
+ reporting_owner_cik=None,
596
+ filing_date=None,
597
+
598
+ # API key handling
599
+ api_key=None,
600
+
601
+ # Additional options
602
+ print_cost=True,
603
+ verbose=False
604
+ ):
605
+ """
606
+ Query the SEC BigQuery API for NPX proxy voting record data and save to CSV.
607
+
608
+ Parameters:
609
+ -----------
610
+ filepath : str
611
+ Path where to save the CSV file. If relative, it will be relative to the Sheet's path.
612
+
613
+ columns : List[str], optional
614
+ Specific columns to return. If None, all columns are returned.
615
+
616
+ # Filter parameters
617
+ meeting_date, isin, cusip, etc. : Various filters that can be:
618
+ - str: Exact match
619
+ - List[str]: Match any in list
620
+ - tuple: (min, max) range for numeric/date fields
621
+
622
+ shares_on_loan, shares_voted : int/float or tuple
623
+ Numeric values or (min, max) range
624
+
625
+ filing_date : str or tuple
626
+ Date string in 'YYYY-MM-DD' format or (start_date, end_date) tuple
627
+
628
+ api_key : str, optional
629
+ SEC BigQuery API key. If None, looks for DATAMULE_API_KEY env variable.
630
+ print_cost : bool
631
+ Whether to print the query cost information
632
+ verbose : bool
633
+ Whether to print additional information about the query
634
+
635
+ Returns:
636
+ --------
637
+ List[Dict]
638
+ A list of dictionaries containing the query results
639
+
640
+ Raises:
641
+ -------
642
+ ValueError
643
+ If API key is missing or invalid
644
+ Exception
645
+ For API errors or other issues
646
+ """
647
+ # Get the data from the API
648
+ data = self.get_proxy_voting_record(
649
+ columns=columns,
650
+ meeting_date=meeting_date,
651
+ isin=isin,
652
+ cusip=cusip,
653
+ issuer_name=issuer_name,
654
+ vote_description=vote_description,
655
+ shares_on_loan=shares_on_loan,
656
+ shares_voted=shares_voted,
657
+ vote_category=vote_category,
658
+ vote_record=vote_record,
659
+ vote_source=vote_source,
660
+ how_voted=how_voted,
661
+ figi=figi,
662
+ management_recommendation=management_recommendation,
663
+ accession=accession,
664
+ reporting_owner_cik=reporting_owner_cik,
665
+ filing_date=filing_date,
666
+ api_key=api_key,
667
+ print_cost=print_cost,
668
+ verbose=verbose
669
+ )
670
+
671
+ # Save to CSV using the helper method
672
+ return self._download_to_csv(data, filepath, verbose)
datamule/submission.py CHANGED
@@ -1,8 +1,10 @@
1
1
  from pathlib import Path
2
2
  import json
3
- from .document import Document
3
+ from .document.document import Document
4
4
  from secsgml import parse_sgml_submission_into_memory
5
- from pathlib import Path
5
+ import os
6
+ import aiofiles
7
+
6
8
 
7
9
  class Submission:
8
10
  def __init__(self, path=None,sgml_content=None,keep_document_types=None):
@@ -14,7 +16,12 @@ class Submission:
14
16
  if sgml_content is not None:
15
17
  self.path = None
16
18
  self.metadata, raw_documents = parse_sgml_submission_into_memory(sgml_content)
19
+
20
+ self.accession = self.metadata['accession-number']
21
+ self.filing_date= f"{self.metadata['filing-date'][:4]}-{self.metadata['filing-date'][4:6]}-{self.metadata['filing-date'][6:8]}"
22
+
17
23
  self.documents = []
24
+ filtered_metadata_documents = []
18
25
 
19
26
  for idx,doc in enumerate(self.metadata['documents']):
20
27
  type = doc.get('type')
@@ -24,15 +31,19 @@ class Submission:
24
31
  continue
25
32
  filename = doc.get('filename')
26
33
  extension = Path(filename).suffix
27
- self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension))
34
+ self.documents.append(Document(type=type, content=raw_documents[idx], extension=extension,filing_date=self.filing_date,accession=self.accession))
28
35
 
36
+ filtered_metadata_documents.append(doc)
37
+
38
+ self.metadata['documents'] = filtered_metadata_documents
29
39
 
30
40
  if path is not None:
31
41
  self.path = Path(path)
32
42
  metadata_path = self.path / 'metadata.json'
33
43
  with metadata_path.open('r') as f:
34
44
  self.metadata = json.load(f)
35
-
45
+
46
+
36
47
 
37
48
  def document_type(self, document_type):
38
49
  # Convert single document type to list for consistent handling
@@ -57,7 +68,7 @@ class Submission:
57
68
  with document_path.open('r') as f:
58
69
  content = f.read()
59
70
 
60
- yield Document(type=doc['type'], content=content, extension=extension)
71
+ yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
61
72
  # if loaded from sgml_content
62
73
  else:
63
74
  yield self.documents[idx]
@@ -81,7 +92,7 @@ class Submission:
81
92
  with document_path.open('r') as f:
82
93
  content = f.read()
83
94
 
84
- yield Document(type=doc['type'], content=content, extension=extension)
95
+ yield Document(type=doc['type'], content=content, extension=extension,filing_date=self.filing_date,accession=self.accession,path=document_path)
85
96
  else:
86
97
  print(f"Warning: File {document_path} does not exist likely due to keep types in downloading.")
87
98
 
@@ -110,4 +121,88 @@ class Submission:
110
121
  # delete the file
111
122
  document_path.unlink()
112
123
  else:
113
- print("Warning: keep() method is only available when loading from path.")
124
+ print("Warning: keep() method is only available when loading from path.")
125
+
126
+
127
+
128
+ def save(self, output_dir="filings"):
129
+ file_dir = Path(output_dir) / str(self.accession)
130
+ file_dir.mkdir(parents=True, exist_ok=True)
131
+
132
+ metadata_path = file_dir / "metadata.json"
133
+ with open(metadata_path, 'w') as f:
134
+ json.dump(self.metadata, f, indent=4)
135
+
136
+ for idx, doc in enumerate(self.metadata['documents']):
137
+ try:
138
+ filename = doc.get('filename')
139
+ if filename is None:
140
+ filename = f"{doc.get('sequence', idx)}.txt"
141
+ except (KeyError, IndexError):
142
+ filename = f"{idx}.txt"
143
+
144
+ doc_path = file_dir / filename
145
+
146
+ if self.path is not None:
147
+ if hasattr(self, 'documents') and self.documents:
148
+ content = self.documents[idx].content
149
+ else:
150
+ orig_doc_path = self.path / filename
151
+ if orig_doc_path.exists():
152
+ with open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
153
+ content = f.read()
154
+ else:
155
+ print(f"Warning: File {orig_doc_path} does not exist, skipping.")
156
+ continue
157
+ else:
158
+ content = self.documents[idx].content
159
+
160
+ if isinstance(content, bytes):
161
+ with open(doc_path, 'wb') as f:
162
+ f.write(content)
163
+ else:
164
+ with open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
165
+ f.write(content)
166
+
167
+ return file_dir
168
+
169
+ async def save_async(self, output_dir="filings"):
170
+ file_dir = Path(output_dir) / str(self.accession)
171
+ os.makedirs(file_dir, exist_ok=True)
172
+
173
+ metadata_path = file_dir / "metadata.json"
174
+ async with aiofiles.open(metadata_path, 'w') as f:
175
+ await f.write(json.dumps(self.metadata, indent=4))
176
+
177
+ for idx, doc in enumerate(self.metadata['documents']):
178
+ try:
179
+ filename = doc.get('filename')
180
+ if filename is None:
181
+ filename = f"{doc.get('sequence', idx)}.txt"
182
+ except (KeyError, IndexError):
183
+ filename = f"{idx}.txt"
184
+
185
+ doc_path = file_dir / filename
186
+
187
+ if self.path is not None:
188
+ if hasattr(self, 'documents') and self.documents:
189
+ content = self.documents[idx].content
190
+ else:
191
+ orig_doc_path = self.path / filename
192
+ if orig_doc_path.exists():
193
+ async with aiofiles.open(orig_doc_path, 'r', encoding='utf-8', errors='replace') as f:
194
+ content = await f.read()
195
+ else:
196
+ print(f"Warning: File {orig_doc_path} does not exist, skipping.")
197
+ continue
198
+ else:
199
+ content = self.documents[idx].content
200
+
201
+ if isinstance(content, bytes):
202
+ async with aiofiles.open(doc_path, 'wb') as f:
203
+ await f.write(content)
204
+ else:
205
+ async with aiofiles.open(doc_path, 'w', encoding='utf-8', errors='replace') as f:
206
+ await f.write(content)
207
+
208
+ return file_dir