enzymetk-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- enzymetk/__init__.py +56 -0
- enzymetk/annotateEC_CLEAN_step.py +122 -0
- enzymetk/annotateEC_CREEP_step.py +82 -0
- enzymetk/annotateEC_proteinfer_step.py +136 -0
- enzymetk/dock_chai_step.py +51 -0
- enzymetk/dock_vina_step.py +63 -0
- enzymetk/embedchem_chemberta_step.py +61 -0
- enzymetk/embedchem_rxnfp_run.py +28 -0
- enzymetk/embedchem_rxnfp_step.py +55 -0
- enzymetk/embedchem_selformer_run.py +28 -0
- enzymetk/embedchem_selformer_step.py +39 -0
- enzymetk/embedchem_unimol_step.py +57 -0
- enzymetk/embedprotein_esm_step.py +123 -0
- enzymetk/esm-extract.py +140 -0
- enzymetk/filter_sequence_step.py +0 -0
- enzymetk/filter_structure_step.py +0 -0
- enzymetk/generate_msa_step.py +61 -0
- enzymetk/generate_oligopool_step.py +0 -0
- enzymetk/generate_tree_step.py +74 -0
- enzymetk/inpaint_ligandMPNN_step.py +65 -0
- enzymetk/main.py +37 -0
- enzymetk/metagenomics_porechop_trim_reads_step.py +55 -0
- enzymetk/metagenomics_prokka_annotate_genes.py +59 -0
- enzymetk/pipeline.py +1 -0
- enzymetk/predict_activity_step.py +0 -0
- enzymetk/predict_catalyticsite_run.py +47 -0
- enzymetk/predict_catalyticsite_step.py +70 -0
- enzymetk/reducedim_pca_run.py +67 -0
- enzymetk/reducedim_vae_run.py +67 -0
- enzymetk/reducedim_vae_step.py +12 -0
- enzymetk/save_step.py +13 -0
- enzymetk/sequence_search_blast.py +80 -0
- enzymetk/similarity_foldseek_step.py +114 -0
- enzymetk/similarity_mmseqs_step.py +80 -0
- enzymetk/similarity_reaction_step.py +60 -0
- enzymetk/similarity_substrate_step.py +59 -0
- enzymetk/step.py +60 -0
- enzymetk-0.0.1.data/data/LICENSE +0 -0
- enzymetk-0.0.1.dist-info/LICENSE +0 -0
- enzymetk-0.0.1.dist-info/METADATA +370 -0
- enzymetk-0.0.1.dist-info/RECORD +44 -0
- enzymetk-0.0.1.dist-info/WHEEL +5 -0
- enzymetk-0.0.1.dist-info/entry_points.txt +2 -0
- enzymetk-0.0.1.dist-info/top_level.txt +1 -0
enzymetk/embedchem_selformer_run.py
ADDED
@@ -0,0 +1,28 @@
import argparse
import os


def run_selformer(output_filename, input_filename, label, selformer_dir, model_file):
    print(f'Running SELFormer on {input_filename} with label {label}')
    # The SELFormer scripts expect to be run from their own checkout with the input
    # data under <selformer_dir>/data, so copy the input in, run the two scripts,
    # copy the embedding out and clean up. Note: selformer_dir must end with a
    # trailing slash because it is concatenated directly in the f-strings below.
    os.chdir(selformer_dir)
    os.system(f'cp {input_filename} {selformer_dir}data/{label}.txt')
    os.system(f'conda run -n SELFormer_env python3 {selformer_dir}generate_selfies.py --smiles_dataset=data/{label}.txt --selfies_dataset=data/{label}.csv')
    os.system(f'conda run -n SELFormer_env python3 {selformer_dir}produce_embeddings.py --selfies_dataset=data/{label}.csv --model_file={model_file} --embed_file=data/{label}_embedding.csv')
    os.system(f'cp {selformer_dir}data/{label}_embedding.csv {output_filename}')
    os.system(f'rm data/{label}.txt')
    os.system(f'rm data/{label}.csv')
    os.system(f'rm data/{label}_embedding.csv')


def parse_args():
    parser = argparse.ArgumentParser(description="Run SELFormer on a dataset")
    parser.add_argument('-out', '--out', required=True, help='path to the output embedding CSV file')
    parser.add_argument('-input', '--input', type=str, required=True, help='path to the input SMILES file')
    parser.add_argument('-label', '--label', type=str, required=True, help='label used to name the intermediate files')
    parser.add_argument('-dir', '--dir', type=str, required=True, help='path to the SELFormer directory')
    parser.add_argument('-model', '--model', type=str, required=True, help='path to the SELFormer model file')
    return parser.parse_args()


def main():
    args = parse_args()
    run_selformer(args.out, args.input, args.label, args.dir, args.model)


if __name__ == '__main__':
    main()
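A minimal sketch of calling the helper directly rather than through the step below. The paths are illustrative assumptions; the SELFormer checkout must end with a trailing slash because the f-strings concatenate it without a separator, and the SELFormer_env conda environment referenced in the script must exist.

from enzymetk.embedchem_selformer_run import run_selformer

# Illustrative paths; the SELFormer checkout and model checkpoint are assumptions.
run_selformer(
    output_filename='/tmp/demo_embedding.csv',   # where the embedding CSV is copied to
    input_filename='/tmp/demo_smiles.txt',       # SMILES input consumed by generate_selfies.py
    label='demo',                                 # used to name the intermediate files
    selformer_dir='/opt/SELFormer/',              # trailing slash required by the string concatenation
    model_file='/opt/SELFormer/models/modelO',    # path to a pretrained SELFormer model
)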
enzymetk/embedchem_selformer_step.py
ADDED
@@ -0,0 +1,39 @@
from enzymetk.step import Step
import pandas as pd
from tempfile import TemporaryDirectory
import subprocess
from pathlib import Path
import logging
import datetime


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class SelFormer(Step):

    def __init__(self, value_col: str, id_col: str, selformer_dir: str, model_file: str):
        self.value_col = value_col
        self.id_col = id_col
        self.selformer_dir = selformer_dir
        self.model_file = model_file

    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
        # Copy so the rename below does not touch the caller's DataFrame, and use
        # the column names the SELFormer scripts expect.
        sub_df = df[[self.id_col, self.value_col]].copy()
        sub_df.columns = ['chembl_id', 'canonical_smiles']
        with TemporaryDirectory() as tmp_dir:
            now = datetime.datetime.now()
            formatted_date = now.strftime("%Y%m%d%H%M%S")
            label = f'selformer_{formatted_date}'
            output_filename = f'{tmp_dir}/{label}.csv'
            input_filename = f'{tmp_dir}/{label}.tsv'
            sub_df.to_csv(input_filename, sep='\t', index=False)
            # The helper script ships in this package as embedchem_selformer_run.py.
            cmd = ['python', Path(__file__).parent/'embedchem_selformer_run.py', '--out', output_filename,
                   '--input', input_filename, '--label', label, '--dir', self.selformer_dir,
                   '--model', self.model_file]
            self.run(cmd)
            df = pd.read_csv(output_filename)

            return df
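A minimal usage sketch for the step above. The DataFrame, its column names and the paths are illustrative assumptions; value_col holds the SMILES and id_col the identifier, and both are renamed internally to the chembl_id / canonical_smiles headers the SELFormer scripts expect.

import pandas as pd
from enzymetk.embedchem_selformer_step import SelFormer

df = pd.DataFrame({
    'id': ['mol1', 'mol2'],
    'smiles': ['CCO', 'c1ccccc1O'],
})

step = SelFormer(value_col='smiles', id_col='id',
                 selformer_dir='/opt/SELFormer/',             # illustrative checkout path
                 model_file='/opt/SELFormer/models/modelO')   # illustrative checkpoint
embeddings_df = step.execute(df)   # DataFrame read back from the embedding CSV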
enzymetk/embedchem_unimol_step.py
ADDED
@@ -0,0 +1,57 @@
from enzymetk.step import Step
import pandas as pd
from tempfile import TemporaryDirectory
import logging
import numpy as np
from unimol_tools import UniMolRepr
from multiprocessing.dummy import Pool as ThreadPool

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# pip install unimol_tools


class UniMol(Step):

    def __init__(self, smiles_col: str, unimol_model='unimolv2', unimol_size='164m', num_threads=1):
        self.smiles_col = smiles_col
        self.num_threads = num_threads
        # Single-SMILES Uni-Mol representation model
        clf = UniMolRepr(data_type='molecule',
                         remove_hs=False,
                         model_name=unimol_model or 'unimolv2',  # available: unimolv1, unimolv2
                         model_size=unimol_size or '164m',  # used when model_name is unimolv2; available: 84m, 164m, 310m, 570m, 1.1B
                         )
        self.clf = clf

    def __execute(self, df: pd.DataFrame) -> pd.DataFrame:
        smiles_list = list(df[self.smiles_col].values)
        reprs = []
        for smile in smiles_list:
            try:
                unimol_repr = self.clf.get_repr([smile], return_atomic_reprs=True)
                reprs.append(unimol_repr['cls_repr'])
            except Exception as e:
                logger.warning(f"Error embedding SMILES {smile}: {e}")
                reprs.append(None)
        df['unimol_repr'] = reprs
        return df

    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
        with TemporaryDirectory() as tmp_dir:
            if self.num_threads > 1:
                # Split the DataFrame into chunks and embed them in a thread pool.
                data = []
                df_list = np.array_split(df, self.num_threads)
                for df_chunk in df_list:
                    data.append(df_chunk)
                pool = ThreadPool(self.num_threads)
                output_filenames = pool.map(self.__execute, data)
                df = pd.DataFrame()
                for tmp_df in output_filenames:
                    df = pd.concat([df, tmp_df])
                return df

            else:
                return self.__execute(df)
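A minimal usage sketch for the UniMol step. The DataFrame and its column name are illustrative; unimol_tools must be installed as noted in the file, and the model/size arguments simply mirror the UniMolRepr arguments in __init__.

import pandas as pd
from enzymetk.embedchem_unimol_step import UniMol

df = pd.DataFrame({'smiles': ['CCO', 'CC(=O)Oc1ccccc1C(=O)O']})   # illustrative input

step = UniMol(smiles_col='smiles', unimol_model='unimolv2', unimol_size='164m', num_threads=1)
df_with_reprs = step.execute(df)   # adds a 'unimol_repr' column (None where embedding failed)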
enzymetk/embedprotein_esm_step.py
ADDED
@@ -0,0 +1,123 @@
from enzymetk.step import Step
import pandas as pd
from tempfile import TemporaryDirectory
from pathlib import Path
import numpy as np
from tqdm import tqdm
import torch
import os


# First run this: nohup python esm-extract.py esm2_t33_650M_UR50D /disk1/ariane/vscode/degradeo/data/DEHP/uniprot/EC3.1.1_training.fasta /disk1/ariane/vscode/degradeo/data/DEHP/uniprot/encodings --include per_tok &
def extract_active_site_embedding(df, id_column, residue_columns, encoding_dir, rep_num=33):
    """ Expects that the entries of the active-site df are saved as the filenames in the encoding dir. """
    combined_tensors = []
    mean_tensors = []
    count_fail = 0
    count_success = 0
    for entry, residues in tqdm(df[[id_column, residue_columns]].values):
        try:
            file = Path(encoding_dir + f'/{entry}.pt')
            tensors = []
            if residues is not None and residues != 'None':
                try:
                    residues = [int(r) for r in residues.split('|')]
                except ValueError:
                    residues = []
            else:
                residues = []
            embedding_file = torch.load(file)
            tensor = embedding_file['representations'][rep_num]  # the final layer of the embeddings; very dependent on the ESM model used (33 for esm2_t33_650M_UR50D)
            tensors = []
            mean_tensors.append(np.mean(np.asarray(tensor).astype(np.float32), axis=0))
            for residue in residues:
                t = np.asarray(tensor[residue]).astype(np.float32)
                tensors.append(t)
            combined_tensors.append(tensors)
            count_success += 1
        except Exception as e:
            print(f'Error loading file {file}: {e}')
            count_fail += 1
            mean_tensors.append(None)
            combined_tensors.append(None)
    # Here is where you do something with the combined tensors
    df['active_embedding'] = combined_tensors
    df['esm_embedding'] = mean_tensors
    print(count_success, count_fail, count_fail + count_success)
    return df


# First run this: nohup python esm-extract.py esm2_t33_650M_UR50D /disk1/ariane/vscode/degradeo/data/DEHP/uniprot/EC3.1.1_training.fasta /disk1/ariane/vscode/degradeo/data/DEHP/uniprot/encodings --include per_tok &
def extract_mean_embedding(df, id_column, encoding_dir, rep_num=33):
    """ Expects that the entries of the df are saved as the filenames in the encoding dir. """
    tensors = []
    count_fail = 0
    count_success = 0
    for entry in tqdm(df[id_column].values):
        try:
            file = Path(os.path.join(encoding_dir, f'{entry}.pt'))
            embedding_file = torch.load(file)
            tensor = embedding_file['representations'][rep_num]  # the final layer of the embeddings; very dependent on the ESM model used (33 for esm2_t33_650M_UR50D)
            t = np.mean(np.asarray(tensor).astype(np.float32), axis=0)
            tensors.append(t)
            count_success += 1
        except Exception as e:
            print(f'Error loading file {file}: {e}')
            count_fail += 1
            tensors.append(None)

    df['embedding'] = tensors
    print(count_success, count_fail, count_fail + count_success)
    return df


class EmbedESM(Step):

    def __init__(self, id_col: str, seq_col: str, model='esm2_t33_650M_UR50D', extraction_method='mean',
                 active_site_col: str = None, num_threads=1, tmp_dir: str = None, env_name: str = 'enzymetk'):
        self.seq_col = seq_col
        self.id_col = id_col
        self.active_site_col = active_site_col
        self.model = model
        self.num_threads = num_threads or 1
        self.extraction_method = extraction_method
        self.tmp_dir = tmp_dir
        self.env_name = env_name

    def __execute(self, df: pd.DataFrame, tmp_dir: str) -> pd.DataFrame:
        input_filename = f'{tmp_dir}/input.fasta'
        # Skip entries that already have an embedding saved in tmp_dir
        files = os.listdir(tmp_dir)
        done_entries = set([f.split('.')[0] for f in files if f.endswith('.pt')])
        # Write the FASTA file that esm-extract.py takes as input
        with open(input_filename, 'w+') as fout:
            for entry, seq in df[[self.id_col, self.seq_col]].values:
                if entry not in done_entries:
                    fout.write(f'>{entry.strip()}\n{seq.strip()}\n')
        # Might have an issue if things are not correctly installed in the same directory
        cmd = ['conda', 'run', '-n', self.env_name, 'python', Path(__file__).parent/'esm-extract.py', self.model, input_filename, tmp_dir, '--include', 'per_tok']
        self.run(cmd)
        if self.extraction_method == 'mean':
            df = extract_mean_embedding(df, self.id_col, tmp_dir)
        elif self.extraction_method == 'active_site':
            if self.active_site_col is None:
                raise ValueError('active_site_col must be provided if extraction_method is active_site')
            df = extract_active_site_embedding(df, self.id_col, self.active_site_col, tmp_dir)

        return df

    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
        if self.tmp_dir is None:
            with TemporaryDirectory() as tmp_dir:
                if self.num_threads > 1:
                    dfs = []
                    df_list = np.array_split(df, self.num_threads)
                    for df_chunk in tqdm(df_list):
                        dfs.append(self.__execute(df_chunk, tmp_dir))
                    df = pd.DataFrame()
                    for tmp_df in tqdm(dfs):
                        df = pd.concat([df, tmp_df])
                    return df
                else:
                    df = self.__execute(df, tmp_dir)
                    return df
        else:
            df = self.__execute(df, self.tmp_dir)
            return df
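A minimal usage sketch for EmbedESM. The DataFrame, the sequence and the conda environment name are illustrative assumptions; the step shells out to the bundled esm-extract.py (shown next) inside that environment, then loads the saved per-token embeddings back.

import pandas as pd
from enzymetk.embedprotein_esm_step import EmbedESM

df = pd.DataFrame({
    'id': ['P12345'],
    'seq': ['MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ'],
})

# 'mean' averages the per-token layer-33 representation into an 'embedding' column;
# 'active_site' additionally needs active_site_col holding '|'-separated residue indices.
step = EmbedESM(id_col='id', seq_col='seq',
                model='esm2_t33_650M_UR50D',
                extraction_method='mean',
                env_name='enzymetk')   # conda env with fair-esm installed (assumption)
df_embedded = step.execute(df)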
enzymetk/esm-extract.py
ADDED
@@ -0,0 +1,140 @@
#!/usr/bin/env python3 -u
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import pathlib

import torch

from esm import Alphabet, FastaBatchedDataset, ProteinBertModel, pretrained, MSATransformer


def create_parser():
    parser = argparse.ArgumentParser(
        description="Extract per-token representations and model outputs for sequences in a FASTA file"  # noqa
    )

    parser.add_argument(
        "model_location",
        type=str,
        help="PyTorch model file OR name of pretrained model to download (see README for models)",
    )
    parser.add_argument(
        "fasta_file",
        type=pathlib.Path,
        help="FASTA file on which to extract representations",
    )
    parser.add_argument(
        "output_dir",
        type=pathlib.Path,
        help="output directory for extracted representations",
    )

    parser.add_argument("--toks_per_batch", type=int, default=4096, help="maximum batch size")
    parser.add_argument(
        "--repr_layers",
        type=int,
        default=[-1],
        nargs="+",
        help="layers indices from which to extract representations (0 to num_layers, inclusive)",
    )
    parser.add_argument(
        "--include",
        type=str,
        nargs="+",
        choices=["mean", "per_tok", "bos", "contacts"],
        help="specify which representations to return",
        required=True,
    )
    parser.add_argument(
        "--truncation_seq_length",
        type=int,
        default=1022,
        help="truncate sequences longer than the given value",
    )

    parser.add_argument("--nogpu", action="store_true", help="Do not use GPU even if available")
    return parser


def run(args):
    model, alphabet = pretrained.load_model_and_alphabet(args.model_location)
    model.eval()
    if isinstance(model, MSATransformer):
        raise ValueError(
            "This script currently does not handle models with MSA input (MSA Transformer)."
        )
    if torch.cuda.is_available() and not args.nogpu:
        model = model.cuda()
        print("Transferred model to GPU")

    dataset = FastaBatchedDataset.from_file(args.fasta_file)
    batches = dataset.get_batch_indices(args.toks_per_batch, extra_toks_per_seq=1)
    data_loader = torch.utils.data.DataLoader(
        dataset, collate_fn=alphabet.get_batch_converter(args.truncation_seq_length), batch_sampler=batches
    )
    print(f"Read {args.fasta_file} with {len(dataset)} sequences")

    args.output_dir.mkdir(parents=True, exist_ok=True)
    return_contacts = "contacts" in args.include

    assert all(-(model.num_layers + 1) <= i <= model.num_layers for i in args.repr_layers)
    repr_layers = [(i + model.num_layers + 1) % (model.num_layers + 1) for i in args.repr_layers]

    with torch.no_grad():
        for batch_idx, (labels, strs, toks) in enumerate(data_loader):
            print(
                f"Processing {batch_idx + 1} of {len(batches)} batches ({toks.size(0)} sequences)"
            )
            if torch.cuda.is_available() and not args.nogpu:
                toks = toks.to(device="cuda", non_blocking=True)

            out = model(toks, repr_layers=repr_layers, return_contacts=return_contacts)

            logits = out["logits"].to(device="cpu")
            representations = {
                layer: t.to(device="cpu") for layer, t in out["representations"].items()
            }
            if return_contacts:
                contacts = out["contacts"].to(device="cpu")

            for i, label in enumerate(labels):
                args.output_file = args.output_dir / f"{label}.pt"
                args.output_file.parent.mkdir(parents=True, exist_ok=True)
                result = {"label": label}
                truncate_len = min(args.truncation_seq_length, len(strs[i]))
                # Call clone on tensors to ensure tensors are not views into a larger representation
                # See https://github.com/pytorch/pytorch/issues/1995
                if "per_tok" in args.include:
                    result["representations"] = {
                        layer: t[i, 1 : truncate_len + 1].clone()
                        for layer, t in representations.items()
                    }
                if "mean" in args.include:
                    result["mean_representations"] = {
                        layer: t[i, 1 : truncate_len + 1].mean(0).clone()
                        for layer, t in representations.items()
                    }
                if "bos" in args.include:
                    result["bos_representations"] = {
                        layer: t[i, 0].clone() for layer, t in representations.items()
                    }
                if return_contacts:
                    result["contacts"] = contacts[i, : truncate_len, : truncate_len].clone()

                torch.save(
                    result,
                    args.output_file,
                )


def main():
    parser = create_parser()
    args = parser.parse_args()
    run(args)

if __name__ == "__main__":
    main()
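The EmbedESM step above reads the .pt files this script writes. A small sketch of what one record contains when run with --include per_tok (the path is illustrative, and layer 33 corresponds to the final layer of esm2_t33_650M_UR50D):

import torch

# Each record is a dict with the FASTA label plus the requested representations;
# with --include per_tok the per-residue embeddings live under 'representations',
# keyed by layer index.
record = torch.load('/tmp/encodings/P12345.pt')   # illustrative path
print(record['label'])
per_tok = record['representations'][33]           # tensor of shape (seq_len, embedding_dim)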
enzymetk/filter_sequence_step.py
File without changes

enzymetk/filter_structure_step.py
File without changes
enzymetk/generate_msa_step.py
ADDED
@@ -0,0 +1,61 @@
"""
Step to run multiple sequence alignment with the Clustal Omega tool.
./clustalo -i /home/helen/degradeo/pipeline/helen_data/sequences_test_fasta.txt
"""
from enzymetk.step import Step
import pandas as pd
import numpy as np
from tempfile import TemporaryDirectory
import os
import subprocess
import random
import string

class ClustalOmega(Step):

    def __init__(self, id_col: str, seq_col: str, tmp_dir: str = None):
        self.seq_col = seq_col
        self.id_col = id_col
        self.tmp_dir = tmp_dir

    def __execute(self, data: list) -> pd.DataFrame:
        df, tmp_dir = data
        tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
        fasta_file = os.path.join(tmp_dir, 'sequences.fasta')
        output_file = os.path.join(tmp_dir, f"{tmp_label}.aln")

        # Turn the dataframe into a FASTA file
        with open(fasta_file, 'w') as f:
            for seq_id, seq in df[[self.id_col, self.seq_col]].values:
                f.write(f">{seq_id}\n{seq}\n")

        # Run Clustal Omega on the generated FASTA file
        subprocess.run(['clustalo', '-i', fasta_file, '-o', output_file], check=True)

        sequences = {}

        # Read the output file
        with open(output_file, 'r') as f:
            current_id = None
            for line in f:
                line = line.strip()  # Remove leading/trailing whitespace or newline characters
                if line.startswith(">"):
                    # Header line with sequence ID
                    current_id = line[1:]  # Extract ID without ">"
                    sequences[current_id] = ""  # Initialize an empty string for this ID
                else:
                    # Sequence line; append it to the current ID's sequence
                    sequences[current_id] += line.strip()

        # Convert the sequences dictionary into a DataFrame
        df_aligned = pd.DataFrame(list(sequences.items()), columns=[self.id_col, 'aligned_sequence'])

        df = pd.merge(df, df_aligned, on=self.id_col, how='left')

        return df

    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
        if self.tmp_dir is not None:
            return self.__execute([df, self.tmp_dir])
        with TemporaryDirectory() as tmp_dir:
            return self.__execute([df, tmp_dir])
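A minimal usage sketch for the ClustalOmega step. The DataFrame and sequences are illustrative assumptions; clustalo must be on the PATH since the step calls it directly.

import pandas as pd
from enzymetk.generate_msa_step import ClustalOmega

df = pd.DataFrame({
    'id': ['seq1', 'seq2'],
    'seq': ['MKTAYIAKQR', 'MKTAYIAKQL'],
})

step = ClustalOmega(id_col='id', seq_col='seq')
aligned_df = step.execute(df)   # adds an 'aligned_sequence' column merged back on id_col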
enzymetk/generate_oligopool_step.py
File without changes
enzymetk/generate_tree_step.py
ADDED
@@ -0,0 +1,74 @@
# /home/ikumi/degradeo/software/FastTree -gtr -nt /home/ikumi/degradeo/pipeline/ikumi_data/Q04457_esterase-2.msa > /home/ikumi/degradeo/pipeline/ikumi_data/output_tree.tree
# /home/ikumi/degradeo/software/FastTree -wag /home/ikumi/degradeo/pipeline/ikumi_data/Q04457_esterase-2.msa > /home/ikumi/degradeo/pipeline/ikumi_data/output_tree.tree
"""
Install the tool and then you need to activate the environment and run via that.

Honestly it's a bit hacky the way they do it; not bothered to change things, so the data has to be saved to their
repo and then copied out of it.
"""
from enzymetk.step import Step
import pandas as pd
import numpy as np
from multiprocessing.dummy import Pool as ThreadPool
from tempfile import TemporaryDirectory
import os
import subprocess
import random
import string

class FastTree(Step):
    def __init__(self, fasttree_dir: str, id_col: str, seq_col: str, csv_file: str, output_dir: str):
        self.fasttree_dir = fasttree_dir
        self.id_col = id_col
        self.seq_col = seq_col
        self.csv_file = csv_file
        self.num_threads = 1
        self.output_dir = output_dir

    def create_alignment_file(self, df: pd.DataFrame) -> str:
        print(f"Creating MSA file from {len(df)} sequences")

        # Create the MSA file in the output directory
        msa_file = os.path.join(self.output_dir, 'ikumi.data.msa')
        with open(msa_file, 'w') as fout:
            for entry, seq in df[[self.id_col, self.seq_col]].values:
                fout.write(f">{entry.strip()}\n{seq.strip()}\n")

        print(f"Created MSA file at: {msa_file}")
        return msa_file

    def __execute(self, data: list) -> pd.DataFrame:
        df, tmp_dir = data
        tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
        # Get the MSA file
        msa_file = self.create_alignment_file(df)

        fasttree_executable = os.path.join(self.fasttree_dir, 'FastTree')
        output_tree_file = os.path.join(tmp_dir, f'{tmp_label}.tree')

        # Run FastTree and redirect its output (the Newick tree) to a file
        with open(output_tree_file, 'w') as outfile:
            subprocess.run([fasttree_executable, '-wag', msa_file], stdout=outfile, check=True)

        df = pd.read_csv(output_tree_file, header=None, sep='\t')

        print(df.head())

        return df

    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
        with TemporaryDirectory() as tmp_dir:
            if self.num_threads > 1:
                data = []
                df_list = np.array_split(df, self.num_threads)
                pool = ThreadPool(self.num_threads)
                for df_chunk in df_list:
                    data.append([df_chunk, tmp_dir])
                results = pool.map(self.__execute, data)
                df = pd.DataFrame()
                for dfs in results:
                    df = pd.concat([df, dfs])
                return df
            else:
                return self.__execute([df, tmp_dir])
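A minimal usage sketch for the FastTree step. The DataFrame, directories and the csv_file argument are illustrative assumptions; seq_col is expected to hold pre-aligned sequences, since the MSA file is written verbatim and passed to FastTree -wag, and csv_file is stored but not otherwise used by the code shown.

import pandas as pd
from enzymetk.generate_tree_step import FastTree

aligned_df = pd.DataFrame({
    'id': ['seq1', 'seq2'],
    'aligned_seq': ['MKTAYIAKQR', 'MKTAYIAKQL'],
})

step = FastTree(fasttree_dir='/opt/fasttree',      # directory containing the FastTree binary (illustrative)
                id_col='id', seq_col='aligned_seq',
                csv_file=None,                      # stored on the step but unused here
                output_dir='/tmp/fasttree_out')     # where the MSA file is written (illustrative)
tree_df = step.execute(aligned_df)                  # Newick text loaded back as a one-column DataFrame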
enzymetk/inpaint_ligandMPNN_step.py
ADDED
@@ -0,0 +1,65 @@
# os.system('python run.py --seed 111 --pdb_path "./test_outputs/QHH_0.pdb" --fixed_residues "A19 A20 A21 A59 A60 A61 A90 A91 A92" --checkpoint_path_sc "./model_params/ligandmpnn_sc_v_32_002_16.pt" --out_folder "./outputs/QHH"')
"""
Install LigandMPNN and then you need to activate the environment and run via that.

Honestly it's a bit hacky the way they do it; not bothered to change things, so the data has to be saved to their
repo and then copied out of it.
"""
from enzymetk.step import Step
import pandas as pd
import numpy as np
from tempfile import TemporaryDirectory
import subprocess
import logging
import os

class LigandMPNN(Step):

    def __init__(self, pdb_column_name: str, ligand_mpnn_dir: str, output_dir: str, tmp_dir: str = None, args=None, num_threads: int = 1, env_name: str = 'ligandmpnn_env'):
        self.pdb_column_name = pdb_column_name
        self.ligand_mpnn_dir = ligand_mpnn_dir
        self.output_dir = output_dir
        self.tmp_dir = tmp_dir
        self.args = args
        self.num_threads = num_threads
        self.env_name = env_name
        self.logger = logging.getLogger(__name__)

    def __execute(self, data: list) -> pd.DataFrame:
        df, tmp_dir = data
        # Run LigandMPNN on each PDB file in the column
        output_filenames = []
        # You have to change the directory to the LigandMPNN directory
        os.chdir(self.ligand_mpnn_dir)

        for pdb_file in df[self.pdb_column_name].values:
            cmd = ['conda', 'run', '-n', self.env_name, 'python3', f'{self.ligand_mpnn_dir}run.py', '--pdb_path', pdb_file, '--out_folder', f'{self.output_dir}']
            if self.args is not None:
                cmd.extend(self.args)
            # Capture output so stderr/stdout can actually be inspected below
            result = subprocess.run(cmd, check=True, capture_output=True, text=True)
            if result.stderr:
                self.logger.error(result.stderr)
            else:
                output_filenames.append(f'{self.output_dir}{pdb_file.split("/")[-1].split(".")[0]}')
                self.logger.info(result.stdout)
        # Note: this assumes every run succeeded, otherwise the column length won't match
        df['inpainted_pdb'] = output_filenames
        return df

    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
        if self.tmp_dir is not None:
            return self.__execute([df, self.tmp_dir])
        with TemporaryDirectory() as tmp_dir:
            if self.num_threads > 1:
                output_filenames = []
                df_list = np.array_split(df, self.num_threads)
                for df_chunk in df_list:
                    output_filenames.append(self.__execute([df_chunk, tmp_dir]))

                df = pd.DataFrame()
                for tmp_df in output_filenames:
                    df = pd.concat([df, tmp_df])
                return df

            return self.__execute([df, tmp_dir])
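A minimal usage sketch for the LigandMPNN step. The PDB path, directories and environment name are illustrative assumptions; both directory arguments need a trailing slash because they are concatenated directly, and extra run.py flags (for example the fixed residues or side-chain checkpoint in the commented-out examples above) can be passed through args.

import pandas as pd
from enzymetk.inpaint_ligandMPNN_step import LigandMPNN

df = pd.DataFrame({'pdb_path': ['/data/structures/QHH_0.pdb']})   # illustrative PDB path

step = LigandMPNN(pdb_column_name='pdb_path',
                  ligand_mpnn_dir='/opt/LigandMPNN/',   # checkout containing run.py (illustrative)
                  output_dir='/tmp/ligandmpnn_out/',    # where outputs are written (illustrative)
                  env_name='ligandmpnn_env')
designed_df = step.execute(df)                          # adds an 'inpainted_pdb' column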
enzymetk/main.py
ADDED
@@ -0,0 +1,37 @@
###############################################################################
#                                                                             #
#    This program is free software: you can redistribute it and/or modify    #
#    it under the terms of the GNU General Public License as published by    #
#    the Free Software Foundation, either version 3 of the License, or       #
#    (at your option) any later version.                                     #
#                                                                             #
#    This program is distributed in the hope that it will be useful,         #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of          #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the            #
#    GNU General Public License for more details.                            #
#                                                                             #
#    You should have received a copy of the GNU General Public License       #
#    along with this program. If not, see <http://www.gnu.org/licenses/>.    #
#                                                                             #
###############################################################################

"""
Author: Ariane Mora
Date: September 2024
"""
from enzymetk import *
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="Run on a dataframe")
    parser.add_argument('-out', '--out', required=True, help='Path to the output directory')
    parser.add_argument('-df', '--df', type=str, required=True, help='Fasta of the file of interest')
    return parser.parse_args()


def main():
    args = parse_args()
    # run(args.out, args.df)


if __name__ == "__main__":
    main()