dayhoff-tools 1.1.36__py3-none-any.whl → 1.1.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dayhoff_tools/deployment/deploy_gcp.py +1 -0
- dayhoff_tools/fasta.py +41 -4
- {dayhoff_tools-1.1.36.dist-info → dayhoff_tools-1.1.38.dist-info}/METADATA +1 -1
- {dayhoff_tools-1.1.36.dist-info → dayhoff_tools-1.1.38.dist-info}/RECORD +6 -6
- {dayhoff_tools-1.1.36.dist-info → dayhoff_tools-1.1.38.dist-info}/WHEEL +0 -0
- {dayhoff_tools-1.1.36.dist-info → dayhoff_tools-1.1.38.dist-info}/entry_points.txt +0 -0
dayhoff_tools/deployment/deploy_gcp.py
CHANGED
@@ -61,6 +61,7 @@ def create_batch_job_config(config: dict, image_uri: str) -> dict:
         "allocation_policy",  # Goes into batch_config.allocationPolicy
         "logs_policy",  # Goes into batch_config.logsPolicy
         "batch_job",  # Contains detailed task and resource specs
+        "image_uri",
         # Keys like job_name, region, registry_uri, repository are used by other functions
         # or for other purposes, not directly for constructing the core batch_config JSON here.
     }
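The single addition extends the set of top-level keys that `create_batch_job_config` deliberately leaves out of the Batch request body; `image_uri` already reaches the function as its own argument, so this presumably keeps a copy living in `config` from leaking into the request JSON. A minimal sketch of the exclusion-set pattern, assuming a plain dict-comprehension filter (the helper below is illustrative, not the package's actual code):

```python
# Top-level config keys consumed by other deployment steps; they must not
# be copied verbatim into the Batch request body.
EXCLUDED_KEYS = {
    "allocation_policy",
    "logs_policy",
    "batch_job",
    "image_uri",
}

def build_batch_config(config: dict) -> dict:
    """Keep only the keys destined for the core batch_config JSON."""
    return {k: v for k, v in config.items() if k not in EXCLUDED_KEYS}
```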
dayhoff_tools/fasta.py
CHANGED
@@ -13,8 +13,6 @@ from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
 import requests
 from Bio import SeqIO
 from Bio.SeqRecord import SeqRecord
-from tqdm import tqdm
-from tqdm.notebook import tqdm as tqdm_notebook
 
 logger = logging.getLogger(__name__)
 
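Every remaining hunk in this file applies the same deferred-import pattern that starts here: heavyweight dependencies move from module scope into the bodies of the functions that use them, so `import dayhoff_tools.fasta` itself stays fast. A self-contained sketch of the pattern (the function name is illustrative):

```python
import logging

logger = logging.getLogger(__name__)  # stdlib only at module scope

def count_records(fasta_file: str) -> int:
    # Imported on first call instead of at module import; later calls hit
    # the sys.modules cache, so the cost is paid once per process.
    from Bio import SeqIO

    return sum(1 for _ in SeqIO.parse(fasta_file, "fasta"))
```

The trade-off is that a missing dependency now surfaces at call time rather than at import time.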
@@ -140,6 +138,8 @@ def combine_fasta_files(input_path: Union[str, List[str]], output_path: str) ->
     Raises:
         FileExistsError: If the output file already exists.
     """
+    from tqdm import tqdm
+
     _check_output_file(output_path)
 
     if isinstance(input_path, str):
@@ -290,6 +290,11 @@ def split_fasta(
     Returns:
         int: The number of output files created.
     """
+    from typing import TYPE_CHECKING, Optional
+
+    if TYPE_CHECKING:
+        from tqdm import tqdm
+
     # Ensure the target folder exists
     os.makedirs(target_folder, exist_ok=True)
 
@@ -299,7 +304,7 @@ def split_fasta(
     files_created = 0
     current_output_file_sequence_count = 0
     current_output_file_bytes_written = 0
-    pbar: tqdm
+    pbar: Optional["tqdm"] = None
     output_file = None  # Will be opened when we encounter the first header line
     output_file_path = ""
 
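With `tqdm` gone from module scope, the old bare `pbar: tqdm` annotation would leave type checkers with an undefined name, and `pbar` never received a value on the no-progress path. The fix combines three pieces: a `TYPE_CHECKING`-guarded import (seen by checkers, skipped at runtime), a quoted annotation the interpreter treats as a plain string, and a `None` default. A sketch of the combined pattern, using an illustrative `process` function:

```python
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Visible to type checkers only; never executed at runtime.
    from tqdm import tqdm

def process(total: int, show_progress: bool = True) -> None:
    pbar: Optional["tqdm"] = None  # quoted: no runtime name lookup
    if show_progress:
        from tqdm import tqdm  # the real import, only when needed

        pbar = tqdm(total=total)
    for _ in range(total):
        if pbar is not None:
            pbar.update(1)
    if pbar is not None:
        pbar.close()
```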
@@ -314,6 +319,8 @@
     # Open the large FASTA file for reading
     with open(fasta_file, "r", buffering=1024 * 1024) as fasta:
         if show_progress:
+            from tqdm import tqdm
+
             total_size = os.path.getsize(fasta_file)
             pbar = tqdm(
                 total=total_size,
@@ -441,6 +448,9 @@ def subtract_fasta_files(file1: str, file2: str, output_file: str):
     Raises:
         FileExistsError: If the output file already exists.
     """
+    from Bio import SeqIO
+    from tqdm import tqdm
+
     _check_output_file(output_file)
 
     # Load sequences from file1 with a progress bar
@@ -497,6 +507,8 @@ def simplify_fasta_ids(
     Raises:
         FileExistsError: If the output file already exists.
     """
+    from Bio import SeqIO
+
     _check_output_file(output_fasta)
 
     count = 0
@@ -575,6 +587,9 @@ def extract_ids_from_fasta(fasta_file: str) -> Set[str]:
     Raises:
         ValueError: If there's an issue reading or parsing the input file.
     """
+    from Bio import SeqIO
+    from tqdm import tqdm
+
     sequence_ids: Set[str] = set()
     try:
         estimated_records = estimate_sequences(fasta_file)
@@ -735,6 +750,8 @@ def subset_fasta(
     actual_num_chunks_for_tqdm = num_chunks  # Use the calculated num_chunks
 
     try:
+        from tqdm import tqdm
+
         results_buffer = []
         for result_tuple in tqdm(
             pool.imap(process_func, chunk_reader(input_file, chunk_size)),
@@ -769,7 +786,7 @@ def subset_fasta(
     return all_written_ids if return_written_ids else None
 
 
-def load_fasta_as_dict(fasta_file: str) -> Dict[str, SeqRecord]:
+def load_fasta_as_dict(fasta_file: str) -> Dict[str, "SeqRecord"]:
     """
     Load a FASTA file into a dictionary with record IDs as keys.
     Keep only the first instance of each identifier.
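Return annotations, unlike local-variable annotations, are evaluated when the `def` statement runs, so quoting `"SeqRecord"` turns it into a forward reference that type checkers resolve while the interpreter sees only a string. That keeps the signature valid even if the module-level `SeqRecord` import is later removed like the `tqdm` ones were. A sketch under that assumption (the function name is illustrative):

```python
from typing import Dict

def load_records(path: str) -> Dict[str, "SeqRecord"]:
    # Forward reference above; the names below exist only inside the call.
    from Bio import SeqIO
    from Bio.SeqRecord import SeqRecord

    records: Dict[str, SeqRecord] = {}
    for record in SeqIO.parse(path, "fasta"):
        records.setdefault(record.id, record)  # keep the first occurrence
    return records
```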
@@ -780,6 +797,10 @@ def load_fasta_as_dict(fasta_file: str) -> Dict[str, SeqRecord]:
     Returns:
         Dict[str, SeqRecord]: A dictionary with record IDs as keys and SeqRecord objects as values.
     """
+    from Bio import SeqIO
+    from Bio.SeqRecord import SeqRecord
+    from tqdm import tqdm
+
     record_dict: Dict[str, SeqRecord] = {}
     estimated_sequences = estimate_sequences(fasta_file)
 
@@ -815,6 +836,9 @@ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> None:
     Example:
        fasta_to_sqlite("proteins.fasta", "proteins.db")
     """
+    from Bio import SeqIO
+    from tqdm import tqdm
+
     _check_output_file(db_file)
 
     if not os.path.exists(fasta_file):
@@ -881,6 +905,8 @@ def _protein_generator(
     Yields:
         tuple[str, str]: A tuple containing protein_id and sequence.
     """
+    from Bio import SeqIO
+
     # Ensure we use 'rt' for text mode reading, especially if gzipped
     open_func = gzip.open if str(fasta_path).endswith(".gz") else open
     mode = "rt"
@@ -911,6 +937,9 @@ def check_fasta_duplicates(fasta_path: str) -> tuple[set[str], set[str]]:
         FileNotFoundError: If the input file doesn't exist
         ValueError: If the FASTA file is malformed
     """
+    from Bio import SeqIO
+    from tqdm import tqdm
+
     if not os.path.exists(fasta_path):
         raise FileNotFoundError(f"FASTA file not found: {fasta_path}")
 
@@ -987,6 +1016,12 @@ def clean_fasta_duplicates(
         FileExistsError: If the output file already exists
         FileNotFoundError: If the input file doesn't exist
     """
+    from Bio import SeqIO
+    from tqdm import tqdm
+
+    if not os.path.exists(input_path):
+        raise FileNotFoundError(f"Input FASTA file not found: {input_path}")
+
     _check_output_file(output_path)
 
     # First pass: collect sequence hashes for each ID
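Beyond the deferred imports, this hunk adds fail-fast input validation: the input path is checked before `_check_output_file`, so a caller with a bad input path gets a specific `FileNotFoundError` instead of a later, more confusing parse error. A condensed sketch of that ordering (the function name and the inline stand-in for `_check_output_file` are illustrative):

```python
import os

def clean_duplicates(input_path: str, output_path: str) -> None:
    # Validate the input first: a missing input should be reported even
    # when the output path also happens to be problematic.
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input FASTA file not found: {input_path}")
    if os.path.exists(output_path):  # stand-in for _check_output_file
        raise FileExistsError(f"File already exists: {output_path}")
    ...
```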
@@ -1075,6 +1110,8 @@ def fetch_uniprot_fasta(
     Returns:
         tuple: (success_count, failed_count, output_filepath, failed_accessions)
     """
+    from tqdm.notebook import tqdm as tqdm_notebook
+
     # Convert set to list for batch processing
     accession_list = list(accession_set)
 
{dayhoff_tools-1.1.36.dist-info → dayhoff_tools-1.1.38.dist-info}/RECORD
CHANGED
@@ -8,13 +8,13 @@ dayhoff_tools/cli/swarm_commands.py,sha256=5EyKj8yietvT5lfoz8Zx0iQvVaNgc3SJX1z2z
 dayhoff_tools/cli/utility_commands.py,sha256=ER4VrJt4hu904MwrcltUXjwBWT4uFrP-aPXjdXyT3F8,24685
 dayhoff_tools/deployment/base.py,sha256=8tXwsPYvRo-zV-aNhHw1c7Rji-KWg8S5xoCCznFnVVI,17412
 dayhoff_tools/deployment/deploy_aws.py,sha256=jQyQ0fbm2793jEHFO84lr5tNqiOpdBg6U0S5zCVJr1M,17884
-dayhoff_tools/deployment/deploy_gcp.py,sha256=
+dayhoff_tools/deployment/deploy_gcp.py,sha256=xgaOVsUDmP6wSEMYNkm1yRNcVskfdz80qJtCulkBIAM,8860
 dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqqDkpaIOaAn_4,26027
 dayhoff_tools/deployment/job_runner.py,sha256=hljvFpH2Bw96uYyUup5Ths72PZRL_X27KxlYzBMgguo,5086
 dayhoff_tools/deployment/processors.py,sha256=A7zvF47TfCkuLTCvaqZmk1M9ZgZcv6CAoXZCV6rEXuE,34599
 dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
 dayhoff_tools/embedders.py,sha256=svP_ksm3FdyVZ8i8R9R5uoGu2qI_hVQ_eztG0drXkN8,36477
-dayhoff_tools/fasta.py,sha256=
+dayhoff_tools/fasta.py,sha256=_kA2Cpiy7JAGbBqLrjElkzbcUD_p-nO2d5Aj1LVmOvc,50509
 dayhoff_tools/file_ops.py,sha256=JlGowvr-CUJFidV-4g_JmhUTN9bsYuaxtqKmnKomm-Q,8506
 dayhoff_tools/h5.py,sha256=j1nxxaiHsMidVX_XwB33P1Pz9d7K8ZKiDZwJWQUUQSY,21158
 dayhoff_tools/intake/gcp.py,sha256=uCeEskhbEwJIYpN6ne6siT1dbpTizCjjel-hRe0kReE,3030
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
 dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
 dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
 dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
-dayhoff_tools-1.1.
-dayhoff_tools-1.1.
-dayhoff_tools-1.1.
-dayhoff_tools-1.1.
+dayhoff_tools-1.1.38.dist-info/METADATA,sha256=nDSK0SHTOMdieTxWDLScNArXB4g5TLAocONnt4xD89k,2843
+dayhoff_tools-1.1.38.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+dayhoff_tools-1.1.38.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
+dayhoff_tools-1.1.38.dist-info/RECORD,,
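Both hunks are routine wheel-metadata churn: the two edited modules get new digests and sizes, and the dist-info paths move from 1.1.36 to 1.1.38 (the removed 1.1.36 entries are truncated in this view). Per the wheel spec, each RECORD line has the form `path,sha256=<digest>,<size>`, where the digest is SHA-256 in unpadded urlsafe base64, and the RECORD file lists itself with empty hash and size fields. A sketch of how such an entry is computed (the helper name is illustrative):

```python
import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    # Produces "path,sha256=<unpadded urlsafe-b64 digest>,<size in bytes>".
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"
```

Running this over fasta.py as shipped in the 1.1.38 wheel should reproduce the `_kA2Cpiy…,50509` entry above.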
{dayhoff_tools-1.1.36.dist-info → dayhoff_tools-1.1.38.dist-info}/WHEEL
File without changes
{dayhoff_tools-1.1.36.dist-info → dayhoff_tools-1.1.38.dist-info}/entry_points.txt
File without changes