dayhoff-tools 1.1.36__py3-none-any.whl → 1.1.37__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- dayhoff_tools/embedders.py +8 -3
- dayhoff_tools/fasta.py +28 -5
- {dayhoff_tools-1.1.36.dist-info → dayhoff_tools-1.1.37.dist-info}/METADATA +1 -1
- {dayhoff_tools-1.1.36.dist-info → dayhoff_tools-1.1.37.dist-info}/RECORD +6 -6
- {dayhoff_tools-1.1.36.dist-info → dayhoff_tools-1.1.37.dist-info}/WHEEL +0 -0
- {dayhoff_tools-1.1.36.dist-info → dayhoff_tools-1.1.37.dist-info}/entry_points.txt +0 -0
dayhoff_tools/embedders.py
CHANGED
@@ -7,14 +7,11 @@ import h5py
|
|
7
7
|
import numpy as np
|
8
8
|
import pandas as pd
|
9
9
|
import torch
|
10
|
-
import torch.utils.data
|
11
10
|
from dayhoff_tools.deployment.processors import Processor
|
12
11
|
from dayhoff_tools.fasta import (
|
13
12
|
clean_noncanonical_fasta,
|
14
13
|
clean_noncanonical_fasta_to_dict,
|
15
14
|
)
|
16
|
-
from esm import FastaBatchedDataset, pretrained
|
17
|
-
from transformers import T5EncoderModel, T5Tokenizer
|
18
15
|
|
19
16
|
logger = logging.getLogger(__name__)
|
20
17
|
|
@@ -52,6 +49,8 @@ class ESMEmbedder(Processor):
|
|
52
49
|
|
53
50
|
def _load_model(self):
|
54
51
|
"""Download pre-trained model and load onto device"""
|
52
|
+
from esm import pretrained
|
53
|
+
|
55
54
|
self.model, self.alphabet = pretrained.load_model_and_alphabet(self.model_name)
|
56
55
|
self.model.eval()
|
57
56
|
if torch.cuda.is_available():
|
@@ -62,6 +61,8 @@ class ESMEmbedder(Processor):
|
|
62
61
|
|
63
62
|
def _load_dataset(self, fasta_file: str) -> None:
|
64
63
|
"""Load FASTA file into batched dataset and dataloader"""
|
64
|
+
from esm import FastaBatchedDataset
|
65
|
+
|
65
66
|
if not fasta_file.endswith(".fasta"):
|
66
67
|
raise ValueError("Input file must have .fasta extension.")
|
67
68
|
|
@@ -763,6 +764,8 @@ class ProstT5Embedder(Embedder):
|
|
763
764
|
The model automatically selects half precision (float16) when running on GPU
|
764
765
|
and full precision (float32) when running on CPU.
|
765
766
|
"""
|
767
|
+
from transformers import T5EncoderModel, T5Tokenizer
|
768
|
+
|
766
769
|
tokenizer = T5Tokenizer.from_pretrained(
|
767
770
|
"Rostlab/ProstT5", do_lower_case=False, legacy=True
|
768
771
|
)
|
@@ -851,6 +854,8 @@ class T5Embedder(Embedder):
|
|
851
854
|
The model automatically handles memory management and batch processing
|
852
855
|
based on sequence sizes and available resources.
|
853
856
|
"""
|
857
|
+
from transformers import T5EncoderModel, T5Tokenizer
|
858
|
+
|
854
859
|
tokenizer = T5Tokenizer.from_pretrained(
|
855
860
|
"Rostlab/prot_t5_xl_half_uniref50-enc", do_lower_case=False
|
856
861
|
)
|
dayhoff_tools/fasta.py
CHANGED
@@ -11,10 +11,6 @@ from pathlib import Path
|
|
11
11
|
from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
|
12
12
|
|
13
13
|
import requests
|
14
|
-
from Bio import SeqIO
|
15
|
-
from Bio.SeqRecord import SeqRecord
|
16
|
-
from tqdm import tqdm
|
17
|
-
from tqdm.notebook import tqdm as tqdm_notebook
|
18
14
|
|
19
15
|
logger = logging.getLogger(__name__)
|
20
16
|
|
@@ -441,6 +437,8 @@ def subtract_fasta_files(file1: str, file2: str, output_file: str):
|
|
441
437
|
Raises:
|
442
438
|
FileExistsError: If the output file already exists.
|
443
439
|
"""
|
440
|
+
from Bio import SeqIO
|
441
|
+
|
444
442
|
_check_output_file(output_file)
|
445
443
|
|
446
444
|
# Load sequences from file1 with a progress bar
|
@@ -497,6 +495,8 @@ def simplify_fasta_ids(
|
|
497
495
|
Raises:
|
498
496
|
FileExistsError: If the output file already exists.
|
499
497
|
"""
|
498
|
+
from Bio import SeqIO
|
499
|
+
|
500
500
|
_check_output_file(output_fasta)
|
501
501
|
|
502
502
|
count = 0
|
@@ -575,6 +575,9 @@ def extract_ids_from_fasta(fasta_file: str) -> Set[str]:
|
|
575
575
|
Raises:
|
576
576
|
ValueError: If there's an issue reading or parsing the input file.
|
577
577
|
"""
|
578
|
+
from Bio import SeqIO
|
579
|
+
from tqdm import tqdm
|
580
|
+
|
578
581
|
sequence_ids: Set[str] = set()
|
579
582
|
try:
|
580
583
|
estimated_records = estimate_sequences(fasta_file)
|
@@ -769,7 +772,7 @@ def subset_fasta(
|
|
769
772
|
return all_written_ids if return_written_ids else None
|
770
773
|
|
771
774
|
|
772
|
-
def load_fasta_as_dict(fasta_file: str) -> Dict[str, SeqRecord]:
|
775
|
+
def load_fasta_as_dict(fasta_file: str) -> Dict[str, "SeqRecord"]:
|
773
776
|
"""
|
774
777
|
Load a FASTA file into a dictionary with record IDs as keys.
|
775
778
|
Keep only the first instance of each identifier.
|
@@ -780,6 +783,10 @@ def load_fasta_as_dict(fasta_file: str) -> Dict[str, SeqRecord]:
|
|
780
783
|
Returns:
|
781
784
|
Dict[str, SeqRecord]: A dictionary with record IDs as keys and SeqRecord objects as values.
|
782
785
|
"""
|
786
|
+
from Bio import SeqIO
|
787
|
+
from Bio.SeqRecord import SeqRecord
|
788
|
+
from tqdm import tqdm
|
789
|
+
|
783
790
|
record_dict: Dict[str, SeqRecord] = {}
|
784
791
|
estimated_sequences = estimate_sequences(fasta_file)
|
785
792
|
|
@@ -815,6 +822,9 @@ def fasta_to_sqlite(fasta_file: str, db_file: str, batch_size: int = 1000) -> No
|
|
815
822
|
Example:
|
816
823
|
fasta_to_sqlite("proteins.fasta", "proteins.db")
|
817
824
|
"""
|
825
|
+
from Bio import SeqIO
|
826
|
+
from tqdm import tqdm
|
827
|
+
|
818
828
|
_check_output_file(db_file)
|
819
829
|
|
820
830
|
if not os.path.exists(fasta_file):
|
@@ -881,6 +891,8 @@ def _protein_generator(
|
|
881
891
|
Yields:
|
882
892
|
tuple[str, str]: A tuple containing protein_id and sequence.
|
883
893
|
"""
|
894
|
+
from Bio import SeqIO
|
895
|
+
|
884
896
|
# Ensure we use 'rt' for text mode reading, especially if gzipped
|
885
897
|
open_func = gzip.open if str(fasta_path).endswith(".gz") else open
|
886
898
|
mode = "rt"
|
@@ -911,6 +923,9 @@ def check_fasta_duplicates(fasta_path: str) -> tuple[set[str], set[str]]:
|
|
911
923
|
FileNotFoundError: If the input file doesn't exist
|
912
924
|
ValueError: If the FASTA file is malformed
|
913
925
|
"""
|
926
|
+
from Bio import SeqIO
|
927
|
+
from tqdm import tqdm
|
928
|
+
|
914
929
|
if not os.path.exists(fasta_path):
|
915
930
|
raise FileNotFoundError(f"FASTA file not found: {fasta_path}")
|
916
931
|
|
@@ -987,6 +1002,12 @@ def clean_fasta_duplicates(
|
|
987
1002
|
FileExistsError: If the output file already exists
|
988
1003
|
FileNotFoundError: If the input file doesn't exist
|
989
1004
|
"""
|
1005
|
+
from Bio import SeqIO
|
1006
|
+
from tqdm import tqdm
|
1007
|
+
|
1008
|
+
if not os.path.exists(input_path):
|
1009
|
+
raise FileNotFoundError(f"Input FASTA file not found: {input_path}")
|
1010
|
+
|
990
1011
|
_check_output_file(output_path)
|
991
1012
|
|
992
1013
|
# First pass: collect sequence hashes for each ID
|
@@ -1075,6 +1096,8 @@ def fetch_uniprot_fasta(
|
|
1075
1096
|
Returns:
|
1076
1097
|
tuple: (success_count, failed_count, output_filepath, failed_accessions)
|
1077
1098
|
"""
|
1099
|
+
from tqdm.notebook import tqdm as tqdm_notebook
|
1100
|
+
|
1078
1101
|
# Convert set to list for batch processing
|
1079
1102
|
accession_list = list(accession_set)
|
1080
1103
|
|
@@ -13,8 +13,8 @@ dayhoff_tools/deployment/deploy_utils.py,sha256=StFwbqnr2_FWiKVg3xnJF4kagTHzndqq
|
|
13
13
|
dayhoff_tools/deployment/job_runner.py,sha256=hljvFpH2Bw96uYyUup5Ths72PZRL_X27KxlYzBMgguo,5086
|
14
14
|
dayhoff_tools/deployment/processors.py,sha256=A7zvF47TfCkuLTCvaqZmk1M9ZgZcv6CAoXZCV6rEXuE,34599
|
15
15
|
dayhoff_tools/deployment/swarm.py,sha256=MGcS2_x4RNFtnVjWlU_SwNfhICz8NlGYr9cYBK4ZKDA,21688
|
16
|
-
dayhoff_tools/embedders.py,sha256=
|
17
|
-
dayhoff_tools/fasta.py,sha256=
|
16
|
+
dayhoff_tools/embedders.py,sha256=yO13jwVSqd-gk-PUhBE_7lnlRUIHn2tirxZ_mFAVAgM,36557
|
17
|
+
dayhoff_tools/fasta.py,sha256=UARjl3w4O6hlZSgzuZZkG-Mi89TLxvxUDL-WEt0-5OU,50210
|
18
18
|
dayhoff_tools/file_ops.py,sha256=JlGowvr-CUJFidV-4g_JmhUTN9bsYuaxtqKmnKomm-Q,8506
|
19
19
|
dayhoff_tools/h5.py,sha256=j1nxxaiHsMidVX_XwB33P1Pz9d7K8ZKiDZwJWQUUQSY,21158
|
20
20
|
dayhoff_tools/intake/gcp.py,sha256=uCeEskhbEwJIYpN6ne6siT1dbpTizCjjel-hRe0kReE,3030
|
@@ -26,7 +26,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
|
|
26
26
|
dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
|
27
27
|
dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
|
28
28
|
dayhoff_tools/warehouse.py,sha256=TqV8nex1AluNaL4JuXH5zuu9P7qmE89lSo6f_oViy6U,14965
|
29
|
-
dayhoff_tools-1.1.
|
30
|
-
dayhoff_tools-1.1.
|
31
|
-
dayhoff_tools-1.1.
|
32
|
-
dayhoff_tools-1.1.
|
29
|
+
dayhoff_tools-1.1.37.dist-info/METADATA,sha256=wN6qyLCZ5vAQaZdS-8KNuzFnlHIWSz1moDJSugGyVjY,2843
|
30
|
+
dayhoff_tools-1.1.37.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
31
|
+
dayhoff_tools-1.1.37.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
|
32
|
+
dayhoff_tools-1.1.37.dist-info/RECORD,,
|
File without changes
|
File without changes
|