dayhoff-tools 1.1.37__py3-none-any.whl → 1.1.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dayhoff_tools/deployment/deploy_gcp.py +1 -0
- dayhoff_tools/embedders.py +3 -8
- dayhoff_tools/fasta.py +15 -1
- {dayhoff_tools-1.1.37.dist-info → dayhoff_tools-1.1.38.dist-info}/METADATA +1 -1
- {dayhoff_tools-1.1.37.dist-info → dayhoff_tools-1.1.38.dist-info}/RECORD +7 -7
- {dayhoff_tools-1.1.37.dist-info → dayhoff_tools-1.1.38.dist-info}/WHEEL +0 -0
- {dayhoff_tools-1.1.37.dist-info → dayhoff_tools-1.1.38.dist-info}/entry_points.txt +0 -0
@@ -61,6 +61,7 @@ def create_batch_job_config(config: dict, image_uri: str) -> dict:
|
|
61
61
|
"allocation_policy", # Goes into batch_config.allocationPolicy
|
62
62
|
"logs_policy", # Goes into batch_config.logsPolicy
|
63
63
|
"batch_job", # Contains detailed task and resource specs
|
64
|
+
"image_uri",
|
64
65
|
# Keys like job_name, region, registry_uri, repository are used by other functions
|
65
66
|
# or for other purposes, not directly for constructing the core batch_config JSON here.
|
66
67
|
}
|
dayhoff_tools/embedders.py
CHANGED
@@ -7,11 +7,14 @@ import h5py
|
|
7
7
|
import numpy as np
|
8
8
|
import pandas as pd
|
9
9
|
import torch
|
10
|
+
import torch.utils.data
|
10
11
|
from dayhoff_tools.deployment.processors import Processor
|
11
12
|
from dayhoff_tools.fasta import (
|
12
13
|
clean_noncanonical_fasta,
|
13
14
|
clean_noncanonical_fasta_to_dict,
|
14
15
|
)
|
16
|
+
from esm import FastaBatchedDataset, pretrained
|
17
|
+
from transformers import T5EncoderModel, T5Tokenizer
|
15
18
|
|
16
19
|
logger = logging.getLogger(__name__)
|
17
20
|
|
@@ -49,8 +52,6 @@ class ESMEmbedder(Processor):
|
|
49
52
|
|
50
53
|
def _load_model(self):
|
51
54
|
"""Download pre-trained model and load onto device"""
|
52
|
-
from esm import pretrained
|
53
|
-
|
54
55
|
self.model, self.alphabet = pretrained.load_model_and_alphabet(self.model_name)
|
55
56
|
self.model.eval()
|
56
57
|
if torch.cuda.is_available():
|
@@ -61,8 +62,6 @@ class ESMEmbedder(Processor):
|
|
61
62
|
|
62
63
|
def _load_dataset(self, fasta_file: str) -> None:
|
63
64
|
"""Load FASTA file into batched dataset and dataloader"""
|
64
|
-
from esm import FastaBatchedDataset
|
65
|
-
|
66
65
|
if not fasta_file.endswith(".fasta"):
|
67
66
|
raise ValueError("Input file must have .fasta extension.")
|
68
67
|
|
@@ -764,8 +763,6 @@ class ProstT5Embedder(Embedder):
|
|
764
763
|
The model automatically selects half precision (float16) when running on GPU
|
765
764
|
and full precision (float32) when running on CPU.
|
766
765
|
"""
|
767
|
-
from transformers import T5EncoderModel, T5Tokenizer
|
768
|
-
|
769
766
|
tokenizer = T5Tokenizer.from_pretrained(
|
770
767
|
"Rostlab/ProstT5", do_lower_case=False, legacy=True
|
771
768
|
)
|
@@ -854,8 +851,6 @@ class T5Embedder(Embedder):
|
|
854
851
|
The model automatically handles memory management and batch processing
|
855
852
|
based on sequence sizes and available resources.
|
856
853
|
"""
|
857
|
-
from transformers import T5EncoderModel, T5Tokenizer
|
858
|
-
|
859
854
|
tokenizer = T5Tokenizer.from_pretrained(
|
860
855
|
"Rostlab/prot_t5_xl_half_uniref50-enc", do_lower_case=False
|
861
856
|
)
|
dayhoff_tools/fasta.py
CHANGED
@@ -11,6 +11,8 @@ from pathlib import Path
|
|
11
11
|
from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
|
12
12
|
|
13
13
|
import requests
|
14
|
+
from Bio import SeqIO
|
15
|
+
from Bio.SeqRecord import SeqRecord
|
14
16
|
|
15
17
|
logger = logging.getLogger(__name__)
|
16
18
|
|
@@ -136,6 +138,8 @@ def combine_fasta_files(input_path: Union[str, List[str]], output_path: str) ->
|
|
136
138
|
Raises:
|
137
139
|
FileExistsError: If the output file already exists.
|
138
140
|
"""
|
141
|
+
from tqdm import tqdm
|
142
|
+
|
139
143
|
_check_output_file(output_path)
|
140
144
|
|
141
145
|
if isinstance(input_path, str):
|
@@ -286,6 +290,11 @@ def split_fasta(
|
|
286
290
|
Returns:
|
287
291
|
int: The number of output files created.
|
288
292
|
"""
|
293
|
+
from typing import TYPE_CHECKING, Optional
|
294
|
+
|
295
|
+
if TYPE_CHECKING:
|
296
|
+
from tqdm import tqdm
|
297
|
+
|
289
298
|
# Ensure the target folder exists
|
290
299
|
os.makedirs(target_folder, exist_ok=True)
|
291
300
|
|
@@ -295,7 +304,7 @@ def split_fasta(
|
|
295
304
|
files_created = 0
|
296
305
|
current_output_file_sequence_count = 0
|
297
306
|
current_output_file_bytes_written = 0
|
298
|
-
pbar: tqdm
|
307
|
+
pbar: Optional["tqdm"] = None
|
299
308
|
output_file = None # Will be opened when we encounter the first header line
|
300
309
|
output_file_path = ""
|
301
310
|
|
@@ -310,6 +319,8 @@ def split_fasta(
|
|
310
319
|
# Open the large FASTA file for reading
|
311
320
|
with open(fasta_file, "r", buffering=1024 * 1024) as fasta:
|
312
321
|
if show_progress:
|
322
|
+
from tqdm import tqdm
|
323
|
+
|
313
324
|
total_size = os.path.getsize(fasta_file)
|
314
325
|
pbar = tqdm(
|
315
326
|
total=total_size,
|
@@ -438,6 +449,7 @@ def subtract_fasta_files(file1: str, file2: str, output_file: str):
|
|
438
449
|
FileExistsError: If the output file already exists.
|
439
450
|
"""
|
440
451
|
from Bio import SeqIO
|
452
|
+
from tqdm import tqdm
|
441
453
|
|
442
454
|
_check_output_file(output_file)
|
443
455
|
|
@@ -738,6 +750,8 @@ def subset_fasta(
|
|
738
750
|
actual_num_chunks_for_tqdm = num_chunks # Use the calculated num_chunks
|
739
751
|
|
740
752
|
try:
|
753
|
+
from tqdm import tqdm
|
754
|
+
|
741
755
|
results_buffer = []
|
742
756
|
for result_tuple in tqdm(
|
743
757
|
pool.imap(process_func, chunk_reader(input_file, chunk_size)),
|
@@ -8,13 +8,13 @@ dayhoff_tools/cli/swarm_commands.py,sha256=5EyKj8yietvT5lfoz8Zx0iQvVaNgc3SJX1z2z
|
|
8
8
|
dayhoff_tools/cli/utility_commands.py,sha256=ER4VrJt4hu904MwrcltUXjwBWT4uFrP-aPXjdXyT3F8,24685
|
9
9
|
dayhoff_tools/deployment/base.py,sha256=8tXwsPYvRo-zV-aNhHw1c7Rji-KWg8S5xoCCznFnVVI,17412
|
10
10
|
dayhoff_tools/deployment/deploy_aws.py,sha256=jQyQ0fbm2793jEHFO84lr5tNqiOpdBg6U0S5zCVJr1M,17884
|
11
|
-
dayhoff_tools/deployment/deploy_gcp.py,sha256=
|
11
|
+
dayhoff_tools/deployment/deploy_gcp.py,sha256=xgaOVsUDmP6wSEMYNkm1yRNcVskfdz80qJtCulkBIAM,8860
|
12
12
|
dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqqDkpaIOaAn_4,26027
|
13
13
|
dayhoff_tools/deployment/job_runner.py,sha256=hljvFpH2Bw96uYyUup5Ths72PZRL_X27KxlYzBMgguo,5086
|
14
14
|
dayhoff_tools/deployment/processors.py,sha256=A7zvF47TfCkuLTCvaqZmk1M9ZgZcv6CAoXZCV6rEXuE,34599
|
15
15
|
dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
|
16
|
-
dayhoff_tools/embedders.py,sha256=
|
17
|
-
dayhoff_tools/fasta.py,sha256=
|
16
|
+
dayhoff_tools/embedders.py,sha256=svP_ksm3FdyVZ8i8R9R5uoGu2qI_hVQ_eztG0drXkN8,36477
|
17
|
+
dayhoff_tools/fasta.py,sha256=_kA2Cpiy7JAGbBqLrjElkzbcUD_p-nO2d5Aj1LVmOvc,50509
|
18
18
|
dayhoff_tools/file_ops.py,sha256=JlGowvr-CUJFidV-4g_JmhUTN9bsYuaxtqKmnKomm-Q,8506
|
19
19
|
dayhoff_tools/h5.py,sha256=j1nxxaiHsMidVX_XwB33P1Pz9d7K8ZKiDZwJWQUUQSY,21158
|
20
20
|
dayhoff_tools/intake/gcp.py,sha256=uCeEskhbEwJIYpN6ne6siT1dbpTizCjjel-hRe0kReE,3030
|
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
|
|
26
26
|
dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
|
27
27
|
dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
|
28
28
|
dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
|
29
|
-
dayhoff_tools-1.1.
|
30
|
-
dayhoff_tools-1.1.
|
31
|
-
dayhoff_tools-1.1.
|
32
|
-
dayhoff_tools-1.1.
|
29
|
+
dayhoff_tools-1.1.38.dist-info/METADATA,sha256=nDSK0SHTOMdieTxWDLScNArXB4g5TLAocONnt4xD89k,2843
|
30
|
+
dayhoff_tools-1.1.38.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
31
|
+
dayhoff_tools-1.1.38.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
|
32
|
+
dayhoff_tools-1.1.38.dist-info/RECORD,,
|
File without changes
|
File without changes
|