dayhoff-tools 1.1.10__py3-none-any.whl → 1.13.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. dayhoff_tools/__init__.py +10 -0
  2. dayhoff_tools/cli/cloud_commands.py +179 -43
  3. dayhoff_tools/cli/engine1/__init__.py +323 -0
  4. dayhoff_tools/cli/engine1/engine_core.py +703 -0
  5. dayhoff_tools/cli/engine1/engine_lifecycle.py +136 -0
  6. dayhoff_tools/cli/engine1/engine_maintenance.py +431 -0
  7. dayhoff_tools/cli/engine1/engine_management.py +505 -0
  8. dayhoff_tools/cli/engine1/shared.py +501 -0
  9. dayhoff_tools/cli/engine1/studio_commands.py +825 -0
  10. dayhoff_tools/cli/engines_studios/__init__.py +6 -0
  11. dayhoff_tools/cli/engines_studios/api_client.py +351 -0
  12. dayhoff_tools/cli/engines_studios/auth.py +144 -0
  13. dayhoff_tools/cli/engines_studios/engine-studio-cli.md +1230 -0
  14. dayhoff_tools/cli/engines_studios/engine_commands.py +1151 -0
  15. dayhoff_tools/cli/engines_studios/progress.py +260 -0
  16. dayhoff_tools/cli/engines_studios/simulators/cli-simulators.md +151 -0
  17. dayhoff_tools/cli/engines_studios/simulators/demo.sh +75 -0
  18. dayhoff_tools/cli/engines_studios/simulators/engine_list_simulator.py +319 -0
  19. dayhoff_tools/cli/engines_studios/simulators/engine_status_simulator.py +369 -0
  20. dayhoff_tools/cli/engines_studios/simulators/idle_status_simulator.py +476 -0
  21. dayhoff_tools/cli/engines_studios/simulators/simulator_utils.py +180 -0
  22. dayhoff_tools/cli/engines_studios/simulators/studio_list_simulator.py +374 -0
  23. dayhoff_tools/cli/engines_studios/simulators/studio_status_simulator.py +164 -0
  24. dayhoff_tools/cli/engines_studios/studio_commands.py +755 -0
  25. dayhoff_tools/cli/main.py +106 -7
  26. dayhoff_tools/cli/utility_commands.py +896 -179
  27. dayhoff_tools/deployment/base.py +70 -6
  28. dayhoff_tools/deployment/deploy_aws.py +165 -25
  29. dayhoff_tools/deployment/deploy_gcp.py +78 -5
  30. dayhoff_tools/deployment/deploy_utils.py +20 -7
  31. dayhoff_tools/deployment/job_runner.py +9 -4
  32. dayhoff_tools/deployment/processors.py +230 -418
  33. dayhoff_tools/deployment/swarm.py +47 -12
  34. dayhoff_tools/embedders.py +28 -26
  35. dayhoff_tools/fasta.py +181 -64
  36. dayhoff_tools/warehouse.py +268 -1
  37. {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/METADATA +20 -5
  38. dayhoff_tools-1.13.12.dist-info/RECORD +54 -0
  39. {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/WHEEL +1 -1
  40. dayhoff_tools-1.1.10.dist-info/RECORD +0 -32
  41. {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/entry_points.txt +0 -0
@@ -128,23 +128,58 @@ def publish_cards(
     names: List[str],
     firestore_collection: str,
 ):
-    """Publish cards to Firebase. Expects a list of filenames (not full paths),
-    which will each be published as a new document in the collection."""
+    """Publish cards to Firebase using batch writes for optimal performance.
+
+    Expects a list of filenames (not full paths), which will each be published
+    as a new document in the collection. Uses Firestore batch writes to minimize
+    network round-trips and improve performance.
+
+    Args:
+        names: List of packet filenames to publish as cards
+        firestore_collection: Name of the Firestore collection to write to
+    """
+    if not names:
+        print("No cards to upload.")
+        return
 
     initialize_firebase()
-    collection = firestore.client().collection(firestore_collection)
+    db = firestore.client()
+    collection = db.collection(firestore_collection)
+
+    # Firestore batch limit is 500 operations
+    BATCH_SIZE = 500
+    total_cards = len(names)
+    cards_processed = 0
+
+    # Process names in batches of up to 500
+    for i in range(0, total_cards, BATCH_SIZE):
+        batch = db.batch()
+        batch_names = names[i : i + BATCH_SIZE]
+
+        # Add all operations for this batch
+        for name in batch_names:
+            doc_ref = collection.document()  # Auto-generate document ID
+            batch.set(
+                doc_ref,
+                {
+                    "status": "available",
+                    "packet_filename": name,
+                    "created": datetime.now(ZoneInfo("America/Los_Angeles")),
+                },
+            )
 
-    for name in names:
-        collection.document().set(
-            {
-                "status": "available",
-                "packet_filename": name,
-                "created": datetime.now(ZoneInfo("America/Los_Angeles")),
-            }
+        # Commit the entire batch atomically
+        batch.commit()
+        cards_processed += len(batch_names)
+
+        print(
+            f"Batch {i // BATCH_SIZE + 1}: Created {len(batch_names)} cards "
+            f"({cards_processed}/{total_cards} total)"
         )
-        print(f"Creating card {name}")
 
-    print(f"Uploaded {len(names)} cards.")
+    print(
+        f"Successfully uploaded {total_cards} cards in {(total_cards + BATCH_SIZE - 1) // BATCH_SIZE} batch(es)."
+    )
 
 
 @transactional
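Note on the hunk above: Firestore caps a write batch at 500 operations, so the new publish_cards chunks the names list and commits one batch per chunk. A minimal sketch of the same pattern in isolation, assuming firebase_admin has already been initialized (write_in_batches and its arguments are illustrative, not part of the package):

    from firebase_admin import firestore

    def write_in_batches(collection_name: str, payloads: list[dict], batch_size: int = 500) -> None:
        """Write documents in chunks of at most 500, Firestore's per-batch limit."""
        db = firestore.client()  # assumes firebase_admin.initialize_app() was already called
        collection = db.collection(collection_name)
        for start in range(0, len(payloads), batch_size):
            batch = db.batch()
            for payload in payloads[start : start + batch_size]:
                batch.set(collection.document(), payload)  # auto-generated document ID
            batch.commit()  # one atomic commit (and network round-trip) per chunk

Each commit is atomic only within its own chunk; an upload of more than 500 cards is not atomic as a whole, matching the behavior of the code above.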
@@ -1,7 +1,6 @@
 import logging
 import os
 import time
-from abc import ABC, abstractmethod
 from typing import Dict, List, Literal, Optional, Tuple, cast
 
 import h5py
@@ -443,35 +442,38 @@ class Embedder(Processor):
         for seq_id, seq in small_seqs_sorted:
             seq_len = len(seq)
 
-            if current_size + seq_len > self.batch_residue_limit:
-                if current_batch:
-                    small_batch_count += 1
-                    logger.info(
-                        f"Processing small batch {small_batch_count}/{total_small_batches} with {len(current_batch)} sequences"
-                    )
-                    batch_results = self.embed_batch(current_batch)
-                    results.update(batch_results)
-                    self.cleanup_memory()
+            # Check if adding this sequence would exceed the limit
+            if current_batch and current_size + seq_len > self.batch_residue_limit:
+                # Process current batch before adding the new sequence
+                small_batch_count += 1
+                logger.info(
+                    f"Processing small batch {small_batch_count}/{total_small_batches} with {len(current_batch)} sequences"
+                )
+                batch_results = self.embed_batch(current_batch)
+                results.update(batch_results)
+                self.cleanup_memory()
 
-                    # Update progress
-                    processed_sequences += len(current_batch)
-                    elapsed_time = time.time() - start_time
-                    remaining_sequences = total_sequences - processed_sequences
-                    avg_time_per_seq = (
-                        elapsed_time / processed_sequences
-                        if processed_sequences > 0
-                        else 0
-                    )
-                    estimated_time_left = avg_time_per_seq * remaining_sequences
+                # Update progress
+                processed_sequences += len(current_batch)
+                elapsed_time = time.time() - start_time
+                remaining_sequences = total_sequences - processed_sequences
+                avg_time_per_seq = (
+                    elapsed_time / processed_sequences
+                    if processed_sequences > 0
+                    else 0
+                )
+                estimated_time_left = avg_time_per_seq * remaining_sequences
 
-                    logger.info(
-                        f"Progress: {processed_sequences}/{total_sequences} sequences ({processed_sequences/total_sequences*100:.1f}%) | "
-                        f"Elapsed: {elapsed_time/60:.1f} min | "
-                        f"Est. remaining: {estimated_time_left/60:.1f} min"
-                    )
+                logger.info(
+                    f"Progress: {processed_sequences}/{total_sequences} sequences ({processed_sequences/total_sequences*100:.1f}%) | "
+                    f"Elapsed: {elapsed_time/60:.1f} min | "
+                    f"Est. remaining: {estimated_time_left/60:.1f} min"
+                )
+                # Start new batch
                 current_batch = []
                 current_size = 0
 
+            # Add the current sequence to the batch
             current_batch.append((seq_id, seq, seq_len))
             current_size += seq_len
 
@@ -681,7 +683,7 @@ class Embedder(Processor):
         sequence_ids, sequences, sequence_lengths = zip(*batch)
 
         # Prepare sequences for tokenization
-        tokenizer_input = self.prepare_tokenizer_input(sequences)
+        tokenizer_input = self.prepare_tokenizer_input(list(sequences))
 
         # Tokenize sequences
         encoded_input = self.tokenizer.batch_encode_plus(
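Note on the Embedder changes above: the batching refactor flattens the old nested condition so a batch is flushed only when it is non-empty and adding the next sequence would push it over batch_residue_limit; the tokenizer change simply passes a list instead of a tuple. A standalone sketch of that greedy packing rule (names are illustrative, not the package's API):

    from typing import Iterable, List, Tuple

    def pack_by_residues(
        seqs: Iterable[Tuple[str, str]], residue_limit: int
    ) -> List[List[Tuple[str, str]]]:
        """Greedily group (id, sequence) pairs so each group stays within residue_limit."""
        batches: List[List[Tuple[str, str]]] = []
        current: List[Tuple[str, str]] = []
        current_size = 0
        for seq_id, seq in seqs:
            # Flush only a non-empty batch that the next sequence would overflow.
            if current and current_size + len(seq) > residue_limit:
                batches.append(current)
                current, current_size = [], 0
            current.append((seq_id, seq))
            current_size += len(seq)
        if current:
            batches.append(current)
        return batches

As in the diff, a single sequence longer than the limit still gets its own batch rather than being dropped.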
dayhoff_tools/fasta.py CHANGED
@@ -13,8 +13,6 @@ from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
 import requests
 from Bio import SeqIO
 from Bio.SeqRecord import SeqRecord
-from tqdm import tqdm
-from tqdm.notebook import tqdm as tqdm_notebook
 
 logger = logging.getLogger(__name__)
 
@@ -27,7 +25,7 @@ def _clean_noncanonical_fasta(
 ) -> Optional[dict[str, str]]:
     """
     Read in a FASTA file containing multiple sequences, replace non-canonical amino acids,
-    remove empty sequences, and either write the sequences to a new FASTA file or return them as a dictionary.
+    remove stop codons, remove empty sequences, and either write the sequences to a new FASTA file or return them as a dictionary.
 
     Args:
         input_path (str): Path to the input FASTA file.
@@ -50,7 +48,11 @@
         for line in fasta_file:
             if line.startswith(">"):
                 if seq_id and seq_lines:
-                    seq = "".join(seq_lines).translate(str.maketrans("OJUZB", "XLCED"))
+                    seq = (
+                        "".join(seq_lines)
+                        .translate(str.maketrans("OJUZB", "XLCED"))
+                        .replace("*", "")
+                    )
                     if seq.strip():  # Only process non-empty sequences
                         sequences[seq_id] = seq
                         if output_path:
@@ -63,7 +65,11 @@
 
         # Process the last sequence
         if seq_id and seq_lines:
-            seq = "".join(seq_lines).translate(str.maketrans("OJUZB", "XLCED"))
+            seq = (
+                "".join(seq_lines)
+                .translate(str.maketrans("OJUZB", "XLCED"))
+                .replace("*", "")
+            )
             if seq.strip():  # Only process non-empty sequences
                 sequences[seq_id] = seq
                 if output_path:
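Note: both occurrences above now run each record through the same cleanup chain: translate the non-canonical letters O, J, U, Z, B to X, L, C, E, D, then strip "*" stop symbols. The same chain in isolation (clean_sequence is an illustrative name, not a package function):

    _NONCANONICAL = str.maketrans("OJUZB", "XLCED")

    def clean_sequence(raw: str) -> str:
        """Replace non-canonical amino acids and drop '*' stop symbols."""
        return raw.translate(_NONCANONICAL).replace("*", "")

    assert clean_sequence("MKTU*ZB") == "MKTCED"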
@@ -94,7 +100,7 @@ def clean_noncanonical_fasta(
 ):
     """
     Read in a FASTA file containing multiple sequences and write the sequences to a new FASTA file.
-    Replace non-canonical amino acids along the way.
+    Replace non-canonical amino acids and remove stop codons along the way.
 
     Args:
         input_path (str): Path to the input FASTA file.
@@ -114,7 +120,7 @@ def clean_noncanonical_fasta_to_dict(
 ) -> dict[str, str]:
     """
     Read in a FASTA file containing multiple sequences and return the sequences as a dictionary.
-    Replace non-canonical amino acids along the way.
+    Replace non-canonical amino acids and remove stop codons along the way.
 
     Args:
         input_path (str): Path to the input FASTA file.
@@ -140,6 +146,8 @@ def combine_fasta_files(input_path: Union[str, List[str]], output_path: str) ->
     Raises:
         FileExistsError: If the output file already exists.
     """
+    from tqdm import tqdm
+
     _check_output_file(output_path)
 
     if isinstance(input_path, str):
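Note: this hunk and the ones that follow move tqdm and Bio imports from module level into the functions that use them, so importing dayhoff_tools.fasta stays cheap when no progress bar or parser is needed. The deferral pattern in general form (count_headers is an illustrative example, not a package function):

    def count_headers(path: str) -> int:
        # Deferred import: tqdm is only loaded when this function actually runs.
        from tqdm import tqdm

        with open(path) as handle:
            return sum(1 for line in tqdm(handle, desc="Scanning") if line.startswith(">"))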
@@ -290,6 +298,11 @@ def split_fasta(
     Returns:
         int: The number of output files created.
     """
+    from typing import TYPE_CHECKING, Optional
+
+    if TYPE_CHECKING:
+        from tqdm import tqdm
+
     # Ensure the target folder exists
     os.makedirs(target_folder, exist_ok=True)
 
@@ -299,7 +312,7 @@
     files_created = 0
     current_output_file_sequence_count = 0
     current_output_file_bytes_written = 0
-    pbar: tqdm | None = None
+    pbar: Optional["tqdm"] = None
     output_file = None  # Will be opened when we encounter the first header line
     output_file_path = ""
 
@@ -314,6 +327,8 @@
     # Open the large FASTA file for reading
     with open(fasta_file, "r", buffering=1024 * 1024) as fasta:
         if show_progress:
+            from tqdm import tqdm
+
             total_size = os.path.getsize(fasta_file)
             pbar = tqdm(
                 total=total_size,
@@ -441,6 +456,9 @@ def subtract_fasta_files(file1: str, file2: str, output_file: str):
     Raises:
         FileExistsError: If the output file already exists.
     """
+    from Bio import SeqIO
+    from tqdm import tqdm
+
     _check_output_file(output_file)
 
     # Load sequences from file1 with a progress bar
@@ -497,6 +515,8 @@ def simplify_fasta_ids(
     Raises:
         FileExistsError: If the output file already exists.
     """
+    from Bio import SeqIO
+
     _check_output_file(output_fasta)
 
     count = 0
@@ -575,6 +595,9 @@ def extract_ids_from_fasta(fasta_file: str) -> Set[str]:
     Raises:
         ValueError: If there's an issue reading or parsing the input file.
     """
+    from Bio import SeqIO
+    from tqdm import tqdm
+
     sequence_ids: Set[str] = set()
     try:
         estimated_records = estimate_sequences(fasta_file)
@@ -604,29 +627,48 @@ def process_chunk(
 ) -> Tuple[List[str], Set[str]]:
     output_sequences = []
     written_ids = set()
-    current_id = ""
-    current_seq = []
-
-    def id_matches(seq_id: str) -> bool:
-        return any(part.lower() in target_ids_lower for part in seq_id.split("|"))
-
-    for line in chunk:
-        line = line.strip()
-        if line.startswith(">"):
-            if current_id and current_seq:
-                if id_matches(current_id) != exclude:
-                    output_sequences.append(f">{current_id}\n{''.join(current_seq)}\n")
-                    written_ids.add(current_id)
-            current_id = line[1:]
-            current_seq = []
-        elif current_id:
-            current_seq.append(line)
-
-    # Process the last sequence in the chunk
-    if current_id and current_seq and id_matches(current_id) != exclude:
-        output_sequences.append(f">{current_id}\n{''.join(current_seq)}\n")
-        written_ids.add(current_id)
+    current_id: str = ""
+    current_seq: List[str] = []
+
+    # Get a unique worker ID, could be process ID
+    worker_id = os.getpid()
+    logger.debug(
+        f"SUBSET_FASTA_PROCESS_CHUNK: Worker {worker_id} processing a chunk. Target IDs count: {len(target_ids_lower)}, Exclude: {exclude}"
+    )
+    try:
+
+        def id_matches(seq_id: str) -> bool:
+            return any(part.lower() in target_ids_lower for part in seq_id.split("|"))
 
+        for line in chunk:
+            line = line.strip()
+            if line.startswith(">"):
+                if current_id and current_seq:
+                    if id_matches(current_id) != exclude:
+                        output_sequences.append(
+                            f">{current_id}\n{''.join(current_seq)}\n"
+                        )
+                        written_ids.add(current_id)
+                current_id = line[1:]
+                current_seq = []
+            elif current_id:
+                current_seq.append(line)
+
+        # Process the last sequence in the chunk
+        if current_id and current_seq and id_matches(current_id) != exclude:
+            output_sequences.append(f">{current_id}\n{''.join(current_seq)}\n")
+            written_ids.add(current_id)
+
+    except Exception as e:
+        logger.error(
+            f"SUBSET_FASTA_PROCESS_CHUNK: Worker {worker_id} encountered error: {e}",
+            exc_info=True,
+        )
+        # Re-raising the exception so the main process's pool error handling can catch it
+        raise
+    logger.debug(
+        f"SUBSET_FASTA_PROCESS_CHUNK: Worker {worker_id} finished chunk. Output sequences: {len(output_sequences)}, Written IDs: {len(written_ids)}"
+    )
     return output_sequences, written_ids
 
 
@@ -655,54 +697,104 @@ def subset_fasta(
     Raises:
         FileExistsError: If the output file already exists.
     """
+    logger.info(
+        f"SUBSET_FASTA: Starting for input '{fasta_file}', output '{output_path}'. Target IDs: {len(target_ids)}, Exclude: {exclude}"
+    )
     _check_output_file(output_path)
 
     target_ids_lower = {id.lower() for id in target_ids}
     total_size = os.path.getsize(fasta_file)
-    chunk_size = max(
-        1, total_size // (multiprocessing.cpu_count() * 2)
-    )  # Adjust chunk size based on CPU count
 
-    def chunk_reader(file_obj, chunk_size: int):
+    # Determine a reasonable number of processes
+    num_processes = multiprocessing.cpu_count()
+    # Adjust chunk size based on number of processes to balance load vs memory
+    # Aim for at least a few chunks per process if possible, but not too many small chunks.
+    # This is a heuristic and might need tuning.
+    # Let's make chunks reasonably large, e.g., 10-50MB, or ensure at least num_processes chunks.
+    # If total_size is very small, chunk_size could become 0 if not handled.
+    desired_chunk_size_mb = 32
+    chunk_size = max(1, desired_chunk_size_mb * 1024 * 1024)
+    num_chunks = max(1, math.ceil(total_size / chunk_size))
+
+    def chunk_reader(
+        file_obj, cs: int
+    ) -> Iterator[List[str]]:  # Explicitly Iterator[List[str]]
         chunk = []
         chunk_bytes = 0
         for line in file_obj:
             chunk.append(line)
             chunk_bytes += len(line)
-            if chunk_bytes >= chunk_size and line.startswith(">"):
+            if chunk_bytes >= cs and line.startswith(">"):
                 yield chunk
                 chunk = [line]
                 chunk_bytes = len(line)
         if chunk:
             yield chunk
 
-    open_func = gzip.open if fasta_file.endswith(".gz") else open
-    mode = "rt" if fasta_file.endswith(".gz") else "r"
+    mode = "rt"  # text mode for both gzip and regular open
 
-    with open_func(fasta_file, mode) as input_file:
-        with multiprocessing.Pool() as pool:
-            process_func = partial(
-                process_chunk, target_ids_lower=target_ids_lower, exclude=exclude
+    all_written_ids: Set[str] = set()
+    try:
+        with open(fasta_file, mode) as input_file:
+            logger.info(
+                f"SUBSET_FASTA: Using up to {num_processes} worker processes for {num_chunks} potential chunks."
             )
-            results = list(
-                tqdm(
-                    pool.imap(process_func, chunk_reader(input_file, chunk_size)),
-                    total=total_size // chunk_size,
-                    desc="Processing FASTA",
+
+            with multiprocessing.Pool(processes=num_processes) as pool:
+                logger.info(
+                    f"SUBSET_FASTA: Multiprocessing pool created (intended processes: {num_processes})."
                 )
-            )
 
-    all_written_ids = set()
-    with open(output_path, "w") as output_file:
-        for output_sequences, written_ids in results:
-            output_file.writelines(output_sequences)
-            all_written_ids.update(written_ids)
+                process_func = partial(
+                    process_chunk, target_ids_lower=target_ids_lower, exclude=exclude
+                )
 
-    print(f"Wrote {len(all_written_ids)} sequences to {output_path}")
+                # Using imap_unordered can sometimes be better for memory with many results,
+                # as results are processed as they complete.
+                # However, for aggregation later, order doesn't strictly matter for building the final set/list of strings.
+                # tqdm will work with imap and imap_unordered.
+
+                # Calculate total for tqdm more robustly
+                actual_num_chunks_for_tqdm = num_chunks  # Use the calculated num_chunks
+
+                try:
+                    from tqdm import tqdm
+
+                    results_buffer = []
+                    for result_tuple in tqdm(
+                        pool.imap(process_func, chunk_reader(input_file, chunk_size)),
+                        total=actual_num_chunks_for_tqdm,  # Use calculated number of chunks
+                        desc="Processing FASTA (subset_fasta)",
+                    ):
+                        results_buffer.append(result_tuple)
+                    logger.debug("SUBSET_FASTA: pool.imap completed.")
+                except Exception as e_pool:
+                    logger.error(
+                        f"SUBSET_FASTA: Error during multiprocessing pool.imap: {e_pool}",
+                        exc_info=True,
+                    )
+                    raise
+
+                logger.debug(
+                    f"SUBSET_FASTA: Aggregating results from {len(results_buffer)} processed chunks."
+                )
+                with open(output_path, "w") as output_file:
+                    for output_sequences, written_ids_chunk in results_buffer:
+                        output_file.writelines(output_sequences)
+                        all_written_ids.update(written_ids_chunk)
+    except Exception as e_main:
+        logger.error(
+            f"SUBSET_FASTA: Error in main processing logic: {e_main}", exc_info=True
+        )
+        raise
+
+    logger.info(
+        f"SUBSET_FASTA: Wrote {len(all_written_ids)} sequences to {output_path}. Finished."
+    )
     return all_written_ids if return_written_ids else None
 
 
-def load_fasta_as_dict(fasta_file: str) -> Dict[str, SeqRecord]:
+def load_fasta_as_dict(fasta_file: str) -> Dict[str, "SeqRecord"]:
     """
     Load a FASTA file into a dictionary with record IDs as keys.
     Keep only the first instance of each identifier.
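Note on the subset_fasta rework above: the input is cut into roughly 32 MB chunks that break on ">" headers, the chunks are fanned out to a multiprocessing.Pool via pool.imap, and the per-chunk results are aggregated into a single output file, so the caller-facing behavior is unchanged. A usage sketch, assuming the keyword arguments match the (unshown) signature and using illustrative paths and IDs:

    written = subset_fasta(
        fasta_file="all_proteins.fasta",
        output_path="filtered.fasta",
        target_ids={"P12345", "Q67890"},
        exclude=False,
        return_written_ids=True,
    )
    print(f"{len(written)} matching records written")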
@@ -713,6 +805,10 @@ def load_fasta_as_dict(fasta_file: str) -> Dict[str, SeqRecord]:
     Returns:
         Dict[str, SeqRecord]: A dictionary with record IDs as keys and SeqRecord objects as values.
     """
+    from Bio import SeqIO
+    from Bio.SeqRecord import SeqRecord
+    from tqdm import tqdm
+
     record_dict: Dict[str, SeqRecord] = {}
     estimated_sequences = estimate_sequences(fasta_file)
 
@@ -748,6 +844,9 @@ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> No
     Example:
         fasta_to_sqlite("proteins.fasta", "proteins.db")
     """
+    from Bio import SeqIO
+    from tqdm import tqdm
+
     _check_output_file(db_file)
 
     if not os.path.exists(fasta_file):
@@ -779,7 +878,7 @@ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> No
     batch = []
 
     for protein_id, sequence in tqdm(
-        _protein_generator(fasta_file),
+        _protein_generator(Path(fasta_file)),  # Pass as Path object
         total=estimated_records,
         desc="Processing proteins",
     ):
@@ -804,22 +903,29 @@ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> No
     print(f"Conversion completed. SQLite database saved to {db_file}")
 
 
-def _protein_generator(fasta_path: Path) -> Iterator[tuple[str, str]]:
+def _protein_generator(
+    fasta_path: Path,
+) -> Iterator[tuple[str, str]]:  # fasta_path is Path
     """
     Generate protein data from a FASTA file.
-
     Args:
         fasta_path (Path): Path to the FASTA file.
-
     Yields:
         tuple[str, str]: A tuple containing protein_id and sequence.
     """
-    for record in SeqIO.parse(fasta_path, "fasta"):
-        protein_id = record.id.split()[
-            0
-        ]  # Assumes the first part of the id is the protein_id
-        sequence = str(record.seq)
-        yield protein_id, sequence
+    from Bio import SeqIO
+
+    # Ensure we use 'rt' for text mode reading, especially if gzipped
+    open_func = gzip.open if str(fasta_path).endswith(".gz") else open
+    mode = "rt"
+
+    with open_func(fasta_path, mode) as handle:
+        for record in SeqIO.parse(handle, "fasta"):
+            protein_id = record.id.split()[
+                0
+            ]  # Assumes the first part of the id is the protein_id
+            sequence = str(record.seq)
+            yield protein_id, sequence
 
 
 def check_fasta_duplicates(fasta_path: str) -> tuple[set[str], set[str]]:
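Note: _protein_generator now opens the file itself, choosing gzip.open for .gz paths, so gzipped FASTA inputs work transparently. The dispatch in isolation (open_text is an illustrative helper, not part of the package):

    import gzip
    from pathlib import Path

    def open_text(path: Path):
        """Open a plain or gzip-compressed file uniformly in text mode."""
        opener = gzip.open if str(path).endswith(".gz") else open
        return opener(path, "rt")

    # Example: with open_text(Path("proteins.fasta.gz")) as handle: ...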
@@ -839,6 +945,9 @@ def check_fasta_duplicates(fasta_path: str) -> tuple[set[str], set[str]]:
         FileNotFoundError: If the input file doesn't exist
         ValueError: If the FASTA file is malformed
     """
+    from Bio import SeqIO
+    from tqdm import tqdm
+
     if not os.path.exists(fasta_path):
         raise FileNotFoundError(f"FASTA file not found: {fasta_path}")
 
@@ -915,6 +1024,12 @@ def clean_fasta_duplicates(
         FileExistsError: If the output file already exists
         FileNotFoundError: If the input file doesn't exist
     """
+    from Bio import SeqIO
+    from tqdm import tqdm
+
+    if not os.path.exists(input_path):
+        raise FileNotFoundError(f"Input FASTA file not found: {input_path}")
+
     _check_output_file(output_path)
 
     # First pass: collect sequence hashes for each ID
@@ -1003,6 +1118,8 @@ def fetch_uniprot_fasta(
     Returns:
         tuple: (success_count, failed_count, output_filepath, failed_accessions)
     """
+    from tqdm.notebook import tqdm as tqdm_notebook
+
     # Convert set to list for batch processing
     accession_list = list(accession_set)