dayhoff-tools 1.1.10__py3-none-any.whl → 1.13.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dayhoff_tools/__init__.py +10 -0
- dayhoff_tools/cli/cloud_commands.py +179 -43
- dayhoff_tools/cli/engine1/__init__.py +323 -0
- dayhoff_tools/cli/engine1/engine_core.py +703 -0
- dayhoff_tools/cli/engine1/engine_lifecycle.py +136 -0
- dayhoff_tools/cli/engine1/engine_maintenance.py +431 -0
- dayhoff_tools/cli/engine1/engine_management.py +505 -0
- dayhoff_tools/cli/engine1/shared.py +501 -0
- dayhoff_tools/cli/engine1/studio_commands.py +825 -0
- dayhoff_tools/cli/engines_studios/__init__.py +6 -0
- dayhoff_tools/cli/engines_studios/api_client.py +351 -0
- dayhoff_tools/cli/engines_studios/auth.py +144 -0
- dayhoff_tools/cli/engines_studios/engine-studio-cli.md +1230 -0
- dayhoff_tools/cli/engines_studios/engine_commands.py +1151 -0
- dayhoff_tools/cli/engines_studios/progress.py +260 -0
- dayhoff_tools/cli/engines_studios/simulators/cli-simulators.md +151 -0
- dayhoff_tools/cli/engines_studios/simulators/demo.sh +75 -0
- dayhoff_tools/cli/engines_studios/simulators/engine_list_simulator.py +319 -0
- dayhoff_tools/cli/engines_studios/simulators/engine_status_simulator.py +369 -0
- dayhoff_tools/cli/engines_studios/simulators/idle_status_simulator.py +476 -0
- dayhoff_tools/cli/engines_studios/simulators/simulator_utils.py +180 -0
- dayhoff_tools/cli/engines_studios/simulators/studio_list_simulator.py +374 -0
- dayhoff_tools/cli/engines_studios/simulators/studio_status_simulator.py +164 -0
- dayhoff_tools/cli/engines_studios/studio_commands.py +755 -0
- dayhoff_tools/cli/main.py +106 -7
- dayhoff_tools/cli/utility_commands.py +896 -179
- dayhoff_tools/deployment/base.py +70 -6
- dayhoff_tools/deployment/deploy_aws.py +165 -25
- dayhoff_tools/deployment/deploy_gcp.py +78 -5
- dayhoff_tools/deployment/deploy_utils.py +20 -7
- dayhoff_tools/deployment/job_runner.py +9 -4
- dayhoff_tools/deployment/processors.py +230 -418
- dayhoff_tools/deployment/swarm.py +47 -12
- dayhoff_tools/embedders.py +28 -26
- dayhoff_tools/fasta.py +181 -64
- dayhoff_tools/warehouse.py +268 -1
- {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/METADATA +20 -5
- dayhoff_tools-1.13.12.dist-info/RECORD +54 -0
- {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/WHEEL +1 -1
- dayhoff_tools-1.1.10.dist-info/RECORD +0 -32
- {dayhoff_tools-1.1.10.dist-info → dayhoff_tools-1.13.12.dist-info}/entry_points.txt +0 -0
@@ -128,23 +128,58 @@ def publish_cards(
     names: List[str],
     firestore_collection: str,
 ):
-    """Publish cards to Firebase
-
+    """Publish cards to Firebase using batch writes for optimal performance.
+
+    Expects a list of filenames (not full paths), which will each be published
+    as a new document in the collection. Uses Firestore batch writes to minimize
+    network round-trips and improve performance.
+
+    Args:
+        names: List of packet filenames to publish as cards
+        firestore_collection: Name of the Firestore collection to write to
+    """
+    if not names:
+        print("No cards to upload.")
+        return

     initialize_firebase()
-
+    db = firestore.client()
+    collection = db.collection(firestore_collection)
+
+    # Firestore batch limit is 500 operations
+    BATCH_SIZE = 500
+    total_cards = len(names)
+    cards_processed = 0
+
+    # Process names in batches of up to 500
+    for i in range(0, total_cards, BATCH_SIZE):
+        batch = db.batch()
+        batch_names = names[i : i + BATCH_SIZE]
+
+        # Add all operations for this batch
+        for name in batch_names:
+            doc_ref = collection.document()  # Auto-generate document ID
+            batch.set(
+                doc_ref,
+                {
+                    "status": "available",
+                    "packet_filename": name,
+                    "created": datetime.now(ZoneInfo("America/Los_Angeles")),
+                },
+            )

-
-
-
-
-
-
-            }
+        # Commit the entire batch atomically
+        batch.commit()
+        cards_processed += len(batch_names)
+
+        print(
+            f"Batch {i // BATCH_SIZE + 1}: Created {len(batch_names)} cards "
+            f"({cards_processed}/{total_cards} total)"
         )
-        print(f"Creating card {name}")

-        print(
+    print(
+        f"Successfully uploaded {total_cards} cards in {(total_cards + BATCH_SIZE - 1) // BATCH_SIZE} batch(es)."
+    )


 @transactional
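The batching logic above is the heart of this change: Firestore allows at most 500 operations per committed batch, so the new `publish_cards` chunks `names` into groups of up to 500 and commits each group once. A minimal standalone sketch of that chunking arithmetic follows; `commit_batch` is a hypothetical stand-in for building and committing a Firestore batch, not part of dayhoff_tools.

```python
from typing import Callable, List

BATCH_SIZE = 500  # Firestore's per-batch operation limit


def publish_in_batches(names: List[str], commit_batch: Callable[[List[str]], None]) -> int:
    """Send `names` in groups of at most BATCH_SIZE, committing each group once."""
    for i in range(0, len(names), BATCH_SIZE):
        # Each slice becomes one batch of set() operations in the real code.
        commit_batch(names[i : i + BATCH_SIZE])
    # Ceiling division: the same formula the new code uses to report the batch count.
    return (len(names) + BATCH_SIZE - 1) // BATCH_SIZE


if __name__ == "__main__":
    chunks: List[List[str]] = []
    n = publish_in_batches([f"packet_{i}.json" for i in range(1234)], chunks.append)
    print(n, [len(c) for c in chunks])  # 3 [500, 500, 234]
```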
dayhoff_tools/embedders.py CHANGED

@@ -1,7 +1,6 @@
 import logging
 import os
 import time
-from abc import ABC, abstractmethod
 from typing import Dict, List, Literal, Optional, Tuple, cast

 import h5py
@@ -443,35 +442,38 @@ class Embedder(Processor):
         for seq_id, seq in small_seqs_sorted:
             seq_len = len(seq)

-            if
-
-
-
-
-            )
-
-
-
+            # Check if adding this sequence would exceed the limit
+            if current_batch and current_size + seq_len > self.batch_residue_limit:
+                # Process current batch before adding the new sequence
+                small_batch_count += 1
+                logger.info(
+                    f"Processing small batch {small_batch_count}/{total_small_batches} with {len(current_batch)} sequences"
+                )
+                batch_results = self.embed_batch(current_batch)
+                results.update(batch_results)
+                self.cleanup_memory()

-
-
-
-
-
-
-
-
-
-
+                # Update progress
+                processed_sequences += len(current_batch)
+                elapsed_time = time.time() - start_time
+                remaining_sequences = total_sequences - processed_sequences
+                avg_time_per_seq = (
+                    elapsed_time / processed_sequences
+                    if processed_sequences > 0
+                    else 0
+                )
+                estimated_time_left = avg_time_per_seq * remaining_sequences

-
-
-
-
-
+                logger.info(
+                    f"Progress: {processed_sequences}/{total_sequences} sequences ({processed_sequences/total_sequences*100:.1f}%) | "
+                    f"Elapsed: {elapsed_time/60:.1f} min | "
+                    f"Est. remaining: {estimated_time_left/60:.1f} min"
+                )
+                # Start new batch
                 current_batch = []
                 current_size = 0

+            # Add the current sequence to the batch
             current_batch.append((seq_id, seq, seq_len))
             current_size += seq_len

@@ -681,7 +683,7 @@ class Embedder(Processor):
         sequence_ids, sequences, sequence_lengths = zip(*batch)

         # Prepare sequences for tokenization
-        tokenizer_input = self.prepare_tokenizer_input(sequences)
+        tokenizer_input = self.prepare_tokenizer_input(list(sequences))

         # Tokenize sequences
         encoded_input = self.tokenizer.batch_encode_plus(
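The rewritten loop in `Embedder` packs sorted sequences into a batch until adding the next one would push the total residue count past `batch_residue_limit`, flushes that batch, then continues. A self-contained sketch of the same greedy packing, with illustrative names rather than the class's real attributes:

```python
from typing import List, Tuple


def pack_by_residue_limit(
    seqs: List[Tuple[str, str]], residue_limit: int
) -> List[List[Tuple[str, str, int]]]:
    """Group (id, sequence) pairs so each group's total length stays within
    residue_limit, flushing the current group before an overflow would occur."""
    batches: List[List[Tuple[str, str, int]]] = []
    current: List[Tuple[str, str, int]] = []
    current_size = 0
    for seq_id, seq in seqs:
        seq_len = len(seq)
        # Same check as the diff: flush only if the batch is non-empty and would overflow.
        if current and current_size + seq_len > residue_limit:
            batches.append(current)
            current, current_size = [], 0
        current.append((seq_id, seq, seq_len))
        current_size += seq_len
    if current:
        batches.append(current)
    return batches


if __name__ == "__main__":
    demo = [("a", "M" * 300), ("b", "M" * 500), ("c", "M" * 400), ("d", "M" * 100)]
    print([[sid for sid, _, _ in b] for b in pack_by_residue_limit(demo, 1000)])
    # [['a', 'b'], ['c', 'd']]
```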
dayhoff_tools/fasta.py CHANGED

@@ -13,8 +13,6 @@ from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
 import requests
 from Bio import SeqIO
 from Bio.SeqRecord import SeqRecord
-from tqdm import tqdm
-from tqdm.notebook import tqdm as tqdm_notebook

 logger = logging.getLogger(__name__)

@@ -27,7 +25,7 @@ def _clean_noncanonical_fasta(
 ) -> Optional[dict[str, str]]:
     """
     Read in a FASTA file containing multiple sequences, replace non-canonical amino acids,
-    remove empty sequences, and either write the sequences to a new FASTA file or return them as a dictionary.
+    remove stop codons, remove empty sequences, and either write the sequences to a new FASTA file or return them as a dictionary.

     Args:
         input_path (str): Path to the input FASTA file.
@@ -50,7 +48,11 @@ def _clean_noncanonical_fasta(
     for line in fasta_file:
         if line.startswith(">"):
             if seq_id and seq_lines:
-                seq =
+                seq = (
+                    "".join(seq_lines)
+                    .translate(str.maketrans("OJUZB", "XLCED"))
+                    .replace("*", "")
+                )
                 if seq.strip():  # Only process non-empty sequences
                     sequences[seq_id] = seq
                     if output_path:
@@ -63,7 +65,11 @@ def _clean_noncanonical_fasta(

     # Process the last sequence
     if seq_id and seq_lines:
-        seq =
+        seq = (
+            "".join(seq_lines)
+            .translate(str.maketrans("OJUZB", "XLCED"))
+            .replace("*", "")
+        )
         if seq.strip():  # Only process non-empty sequences
             sequences[seq_id] = seq
             if output_path:
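Both hunks above add the same cleaning expression: translate the non-canonical residues O, J, U, Z, B to X, L, C, E, D and strip `*` stop codons before keeping a sequence. Illustrated in isolation:

```python
# The same translate-then-replace cleanup the diff adds to _clean_noncanonical_fasta.
CLEANUP_TABLE = str.maketrans("OJUZB", "XLCED")


def clean_sequence(raw: str) -> str:
    return raw.translate(CLEANUP_TABLE).replace("*", "")


if __name__ == "__main__":
    print(clean_sequence("MKTAYIAKQRUZB*"))  # MKTAYIAKQRCED
```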
@@ -94,7 +100,7 @@ def clean_noncanonical_fasta(
 ):
     """
     Read in a FASTA file containing multiple sequences and write the sequences to a new FASTA file.
-    Replace non-canonical amino acids along the way.
+    Replace non-canonical amino acids and remove stop codons along the way.

     Args:
         input_path (str): Path to the input FASTA file.
@@ -114,7 +120,7 @@ def clean_noncanonical_fasta_to_dict(
 ) -> dict[str, str]:
     """
     Read in a FASTA file containing multiple sequences and return the sequences as a dictionary.
-    Replace non-canonical amino acids along the way.
+    Replace non-canonical amino acids and remove stop codons along the way.

     Args:
         input_path (str): Path to the input FASTA file.
@@ -140,6 +146,8 @@ def combine_fasta_files(input_path: Union[str, List[str]], output_path: str) ->
     Raises:
         FileExistsError: If the output file already exists.
     """
+    from tqdm import tqdm
+
     _check_output_file(output_path)

     if isinstance(input_path, str):
@@ -290,6 +298,11 @@ def split_fasta(
     Returns:
         int: The number of output files created.
     """
+    from typing import TYPE_CHECKING, Optional
+
+    if TYPE_CHECKING:
+        from tqdm import tqdm
+
     # Ensure the target folder exists
     os.makedirs(target_folder, exist_ok=True)

@@ -299,7 +312,7 @@ def split_fasta(
     files_created = 0
     current_output_file_sequence_count = 0
     current_output_file_bytes_written = 0
-    pbar: tqdm
+    pbar: Optional["tqdm"] = None
     output_file = None  # Will be opened when we encounter the first header line
     output_file_path = ""

@@ -314,6 +327,8 @@ def split_fasta(
     # Open the large FASTA file for reading
     with open(fasta_file, "r", buffering=1024 * 1024) as fasta:
         if show_progress:
+            from tqdm import tqdm
+
             total_size = os.path.getsize(fasta_file)
             pbar = tqdm(
                 total=total_size,
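These hunks share one theme: heavy dependencies such as `tqdm` move from module level into the functions that use them, with `TYPE_CHECKING` guarding names needed only in annotations, so importing the module stays cheap. A generic sketch of that pattern (function and variable names here are illustrative, not the library's API):

```python
from typing import TYPE_CHECKING, List, Optional

if TYPE_CHECKING:
    # Seen only by type checkers; nothing is imported at runtime.
    from tqdm import tqdm


def iterate_with_progress(items: List[str], show_progress: bool = True) -> None:
    pbar: Optional["tqdm"] = None
    if show_progress:
        # Deferred import: the dependency loads only when the feature is used.
        from tqdm import tqdm

        pbar = tqdm(total=len(items), desc="Processing")
    for _ in items:
        if pbar is not None:
            pbar.update(1)
    if pbar is not None:
        pbar.close()
```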
@@ -441,6 +456,9 @@ def subtract_fasta_files(file1: str, file2: str, output_file: str):
     Raises:
         FileExistsError: If the output file already exists.
     """
+    from Bio import SeqIO
+    from tqdm import tqdm
+
     _check_output_file(output_file)

     # Load sequences from file1 with a progress bar
@@ -497,6 +515,8 @@ def simplify_fasta_ids(
     Raises:
         FileExistsError: If the output file already exists.
     """
+    from Bio import SeqIO
+
     _check_output_file(output_fasta)

     count = 0
@@ -575,6 +595,9 @@ def extract_ids_from_fasta(fasta_file: str) -> Set[str]:
     Raises:
         ValueError: If there's an issue reading or parsing the input file.
     """
+    from Bio import SeqIO
+    from tqdm import tqdm
+
     sequence_ids: Set[str] = set()
     try:
         estimated_records = estimate_sequences(fasta_file)
@@ -604,29 +627,48 @@ def process_chunk(
 ) -> Tuple[List[str], Set[str]]:
     output_sequences = []
     written_ids = set()
-    current_id = ""
-    current_seq = []
-
-
-
-
-
-
-
-
-
-
-            written_ids.add(current_id)
-            current_id = line[1:]
-            current_seq = []
-        elif current_id:
-            current_seq.append(line)
-
-    # Process the last sequence in the chunk
-    if current_id and current_seq and id_matches(current_id) != exclude:
-        output_sequences.append(f">{current_id}\n{''.join(current_seq)}\n")
-        written_ids.add(current_id)
+    current_id: str = ""
+    current_seq: List[str] = []
+
+    # Get a unique worker ID, could be process ID
+    worker_id = os.getpid()
+    logger.debug(
+        f"SUBSET_FASTA_PROCESS_CHUNK: Worker {worker_id} processing a chunk. Target IDs count: {len(target_ids_lower)}, Exclude: {exclude}"
+    )
+    try:
+
+        def id_matches(seq_id: str) -> bool:
+            return any(part.lower() in target_ids_lower for part in seq_id.split("|"))

+        for line in chunk:
+            line = line.strip()
+            if line.startswith(">"):
+                if current_id and current_seq:
+                    if id_matches(current_id) != exclude:
+                        output_sequences.append(
+                            f">{current_id}\n{''.join(current_seq)}\n"
+                        )
+                        written_ids.add(current_id)
+                current_id = line[1:]
+                current_seq = []
+            elif current_id:
+                current_seq.append(line)
+
+        # Process the last sequence in the chunk
+        if current_id and current_seq and id_matches(current_id) != exclude:
+            output_sequences.append(f">{current_id}\n{''.join(current_seq)}\n")
+            written_ids.add(current_id)
+
+    except Exception as e:
+        logger.error(
+            f"SUBSET_FASTA_PROCESS_CHUNK: Worker {worker_id} encountered error: {e}",
+            exc_info=True,
+        )
+        # Re-raising the exception so the main process's pool error handling can catch it
+        raise
+    logger.debug(
+        f"SUBSET_FASTA_PROCESS_CHUNK: Worker {worker_id} finished chunk. Output sequences: {len(output_sequences)}, Written IDs: {len(written_ids)}"
+    )
     return output_sequences, written_ids

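The new `id_matches` helper inside `process_chunk` compares case-insensitively against every pipe-delimited component of a header ID, so both bare accessions and UniProt-style `db|ACCESSION|NAME` headers can match. For example:

```python
def id_matches(seq_id: str, target_ids_lower: set) -> bool:
    # True if any '|'-separated part of the header ID is in the lowercased target set.
    return any(part.lower() in target_ids_lower for part in seq_id.split("|"))


if __name__ == "__main__":
    targets = {"p12345", "q99999"}
    print(id_matches("sp|P12345|THRB_HUMAN", targets))  # True
    print(id_matches("P67890", targets))                # False
```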
@@ -655,54 +697,104 @@ def subset_fasta(
     Raises:
         FileExistsError: If the output file already exists.
     """
+    logger.info(
+        f"SUBSET_FASTA: Starting for input '{fasta_file}', output '{output_path}'. Target IDs: {len(target_ids)}, Exclude: {exclude}"
+    )
     _check_output_file(output_path)

     target_ids_lower = {id.lower() for id in target_ids}
     total_size = os.path.getsize(fasta_file)
-    chunk_size = max(
-        1, total_size // (multiprocessing.cpu_count() * 2)
-    )  # Adjust chunk size based on CPU count

-
+    # Determine a reasonable number of processes
+    num_processes = multiprocessing.cpu_count()
+    # Adjust chunk size based on number of processes to balance load vs memory
+    # Aim for at least a few chunks per process if possible, but not too many small chunks.
+    # This is a heuristic and might need tuning.
+    # Let's make chunks reasonably large, e.g., 10-50MB, or ensure at least num_processes chunks.
+    # If total_size is very small, chunk_size could become 0 if not handled.
+    desired_chunk_size_mb = 32
+    chunk_size = max(1, desired_chunk_size_mb * 1024 * 1024)
+    num_chunks = max(1, math.ceil(total_size / chunk_size))
+
+    def chunk_reader(
+        file_obj, cs: int
+    ) -> Iterator[List[str]]:  # Explicitly Iterator[List[str]]
         chunk = []
         chunk_bytes = 0
         for line in file_obj:
             chunk.append(line)
             chunk_bytes += len(line)
-            if chunk_bytes >=
+            if chunk_bytes >= cs and line.startswith(">"):
                 yield chunk
                 chunk = [line]
                 chunk_bytes = len(line)
         if chunk:
             yield chunk

-
-    mode = "rt" if fasta_file.endswith(".gz") else "r"
+    mode = "rt"  # text mode for both gzip and regular open

-
-
-
-
+    all_written_ids: Set[str] = set()
+    try:
+        with open(fasta_file, mode) as input_file:
+            logger.info(
+                f"SUBSET_FASTA: Using up to {num_processes} worker processes for {num_chunks} potential chunks."
             )
-
-
-
-
-                desc="Processing FASTA",
+
+            with multiprocessing.Pool(processes=num_processes) as pool:
+                logger.info(
+                    f"SUBSET_FASTA: Multiprocessing pool created (intended processes: {num_processes})."
                 )
-            )

-
-
-
-                output_file.writelines(output_sequences)
-                all_written_ids.update(written_ids)
+                process_func = partial(
+                    process_chunk, target_ids_lower=target_ids_lower, exclude=exclude
+                )

-
+                # Using imap_unordered can sometimes be better for memory with many results,
+                # as results are processed as they complete.
+                # However, for aggregation later, order doesn't strictly matter for building the final set/list of strings.
+                # tqdm will work with imap and imap_unordered.
+
+                # Calculate total for tqdm more robustly
+                actual_num_chunks_for_tqdm = num_chunks  # Use the calculated num_chunks
+
+                try:
+                    from tqdm import tqdm
+
+                    results_buffer = []
+                    for result_tuple in tqdm(
+                        pool.imap(process_func, chunk_reader(input_file, chunk_size)),
+                        total=actual_num_chunks_for_tqdm,  # Use calculated number of chunks
+                        desc="Processing FASTA (subset_fasta)",
+                    ):
+                        results_buffer.append(result_tuple)
+                    logger.debug("SUBSET_FASTA: pool.imap completed.")
+                except Exception as e_pool:
+                    logger.error(
+                        f"SUBSET_FASTA: Error during multiprocessing pool.imap: {e_pool}",
+                        exc_info=True,
+                    )
+                    raise
+
+                logger.debug(
+                    f"SUBSET_FASTA: Aggregating results from {len(results_buffer)} processed chunks."
+                )
+                with open(output_path, "w") as output_file:
+                    for output_sequences, written_ids_chunk in results_buffer:
+                        output_file.writelines(output_sequences)
+                        all_written_ids.update(written_ids_chunk)
+    except Exception as e_main:
+        logger.error(
+            f"SUBSET_FASTA: Error in main processing logic: {e_main}", exc_info=True
+        )
+        raise
+
+    logger.info(
+        f"SUBSET_FASTA: Wrote {len(all_written_ids)} sequences to {output_path}. Finished."
+    )
     return all_written_ids if return_written_ids else None


-def load_fasta_as_dict(fasta_file: str) -> Dict[str, SeqRecord]:
+def load_fasta_as_dict(fasta_file: str) -> Dict[str, "SeqRecord"]:
     """
     Load a FASTA file into a dictionary with record IDs as keys.
     Keep only the first instance of each identifier.
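The new `chunk_reader` in `subset_fasta` only cuts a chunk once the byte budget is reached and the current line is a `>` header; that boundary header ends the finished chunk and also starts the next one, and `process_chunk` ignores a trailing header with no sequence lines, so every record is still emitted exactly once. A self-contained sketch of that boundary rule, reading from an in-memory buffer:

```python
import io
from typing import Iterator, List


def chunk_reader(file_obj, chunk_bytes_target: int) -> Iterator[List[str]]:
    """Yield lists of lines, cutting only at '>' header lines once the byte
    target is exceeded; the boundary header is repeated at the next chunk's start."""
    chunk: List[str] = []
    chunk_bytes = 0
    for line in file_obj:
        chunk.append(line)
        chunk_bytes += len(line)
        if chunk_bytes >= chunk_bytes_target and line.startswith(">"):
            yield chunk
            chunk = [line]
            chunk_bytes = len(line)
    if chunk:
        yield chunk


if __name__ == "__main__":
    fasta = ">a\nMKT\n>b\nGGG\n>c\nAAA\n"
    for i, chunk in enumerate(chunk_reader(io.StringIO(fasta), 8)):
        print(i, chunk)
```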
@@ -713,6 +805,10 @@ def load_fasta_as_dict(fasta_file: str) -> Dict[str, SeqRecord]:
     Returns:
         Dict[str, SeqRecord]: A dictionary with record IDs as keys and SeqRecord objects as values.
     """
+    from Bio import SeqIO
+    from Bio.SeqRecord import SeqRecord
+    from tqdm import tqdm
+
     record_dict: Dict[str, SeqRecord] = {}
     estimated_sequences = estimate_sequences(fasta_file)

@@ -748,6 +844,9 @@ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> No
     Example:
         fasta_to_sqlite("proteins.fasta", "proteins.db")
     """
+    from Bio import SeqIO
+    from tqdm import tqdm
+
     _check_output_file(db_file)

     if not os.path.exists(fasta_file):
@@ -779,7 +878,7 @@ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> No
     batch = []

     for protein_id, sequence in tqdm(
-        _protein_generator(fasta_file),
+        _protein_generator(Path(fasta_file)),  # Pass as Path object
         total=estimated_records,
         desc="Processing proteins",
     ):
@@ -804,22 +903,29 @@ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> No
     print(f"Conversion completed. SQLite database saved to {db_file}")


-def _protein_generator(
+def _protein_generator(
+    fasta_path: Path,
+) -> Iterator[tuple[str, str]]:  # fasta_path is Path
     """
     Generate protein data from a FASTA file.
-
     Args:
         fasta_path (Path): Path to the FASTA file.
-
     Yields:
         tuple[str, str]: A tuple containing protein_id and sequence.
     """
-
-
-
-
-
-
+    from Bio import SeqIO
+
+    # Ensure we use 'rt' for text mode reading, especially if gzipped
+    open_func = gzip.open if str(fasta_path).endswith(".gz") else open
+    mode = "rt"
+
+    with open_func(fasta_path, mode) as handle:
+        for record in SeqIO.parse(handle, "fasta"):
+            protein_id = record.id.split()[
+                0
+            ]  # Assumes the first part of the id is the protein_id
+            sequence = str(record.seq)
+            yield protein_id, sequence


 def check_fasta_duplicates(fasta_path: str) -> tuple[set[str], set[str]]:
@@ -839,6 +945,9 @@ def check_fasta_duplicates(fasta_path: str) -> tuple[set[str], set[str]]:
         FileNotFoundError: If the input file doesn't exist
         ValueError: If the FASTA file is malformed
     """
+    from Bio import SeqIO
+    from tqdm import tqdm
+
     if not os.path.exists(fasta_path):
         raise FileNotFoundError(f"FASTA file not found: {fasta_path}")

@@ -915,6 +1024,12 @@ def clean_fasta_duplicates(
         FileExistsError: If the output file already exists
         FileNotFoundError: If the input file doesn't exist
     """
+    from Bio import SeqIO
+    from tqdm import tqdm
+
+    if not os.path.exists(input_path):
+        raise FileNotFoundError(f"Input FASTA file not found: {input_path}")
+
     _check_output_file(output_path)

     # First pass: collect sequence hashes for each ID
@@ -1003,6 +1118,8 @@ def fetch_uniprot_fasta(
     Returns:
         tuple: (success_count, failed_count, output_filepath, failed_accessions)
     """
+    from tqdm.notebook import tqdm as tqdm_notebook
+
     # Convert set to list for batch processing
     accession_list = list(accession_set)
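Finally, the reworked `_protein_generator` dispatches between `gzip.open` and plain `open` based on the filename suffix and always reads in text mode ("rt"). The same dispatch, sketched on its own with illustrative function names:

```python
import gzip
from pathlib import Path
from typing import Iterator


def open_text(path: Path):
    # gzip.open and the builtin open both accept mode "rt" for text reading.
    opener = gzip.open if str(path).endswith(".gz") else open
    return opener(path, "rt")


def iter_fasta_ids(path: Path) -> Iterator[str]:
    # Yield the first whitespace-delimited token of each header line.
    with open_text(path) as handle:
        for line in handle:
            if line.startswith(">"):
                yield line[1:].split()[0]
```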