enzymetk 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- enzymetk/__init__.py +56 -0
- enzymetk/annotateEC_CLEAN_step.py +122 -0
- enzymetk/annotateEC_CREEP_step.py +82 -0
- enzymetk/annotateEC_proteinfer_step.py +136 -0
- enzymetk/dock_chai_step.py +51 -0
- enzymetk/dock_vina_step.py +63 -0
- enzymetk/embedchem_chemberta_step.py +61 -0
- enzymetk/embedchem_rxnfp_run.py +28 -0
- enzymetk/embedchem_rxnfp_step.py +55 -0
- enzymetk/embedchem_selformer_run.py +28 -0
- enzymetk/embedchem_selformer_step.py +39 -0
- enzymetk/embedchem_unimol_step.py +57 -0
- enzymetk/embedprotein_esm_step.py +123 -0
- enzymetk/esm-extract.py +140 -0
- enzymetk/filter_sequence_step.py +0 -0
- enzymetk/filter_structure_step.py +0 -0
- enzymetk/generate_msa_step.py +61 -0
- enzymetk/generate_oligopool_step.py +0 -0
- enzymetk/generate_tree_step.py +74 -0
- enzymetk/inpaint_ligandMPNN_step.py +65 -0
- enzymetk/main.py +37 -0
- enzymetk/metagenomics_porechop_trim_reads_step.py +55 -0
- enzymetk/metagenomics_prokka_annotate_genes.py +59 -0
- enzymetk/pipeline.py +1 -0
- enzymetk/predict_activity_step.py +0 -0
- enzymetk/predict_catalyticsite_run.py +47 -0
- enzymetk/predict_catalyticsite_step.py +70 -0
- enzymetk/reducedim_pca_run.py +67 -0
- enzymetk/reducedim_vae_run.py +67 -0
- enzymetk/reducedim_vae_step.py +12 -0
- enzymetk/save_step.py +13 -0
- enzymetk/sequence_search_blast.py +80 -0
- enzymetk/similarity_foldseek_step.py +114 -0
- enzymetk/similarity_mmseqs_step.py +80 -0
- enzymetk/similarity_reaction_step.py +60 -0
- enzymetk/similarity_substrate_step.py +59 -0
- enzymetk/step.py +60 -0
- enzymetk-0.0.1.data/data/LICENSE +0 -0
- enzymetk-0.0.1.dist-info/LICENSE +0 -0
- enzymetk-0.0.1.dist-info/METADATA +370 -0
- enzymetk-0.0.1.dist-info/RECORD +44 -0
- enzymetk-0.0.1.dist-info/WHEEL +5 -0
- enzymetk-0.0.1.dist-info/entry_points.txt +2 -0
- enzymetk-0.0.1.dist-info/top_level.txt +1 -0
enzymetk/metagenomics_porechop_trim_reads_step.py
ADDED
@@ -0,0 +1,55 @@
+"""
+Install Porechop and then activate its environment and run via that.
+
+Honestly it's a bit hacky the way they do it; rather than change things, the data has to be saved
+out, run through their porechop-runner.py script and then read back in.
+"""
+from enzymetk.step import Step
+import pandas as pd
+import numpy as np
+from multiprocessing.dummy import Pool as ThreadPool
+from tempfile import TemporaryDirectory
+import os
+import subprocess
+import random
+import string
+
+
+class PoreChop(Step):
+
+    def __init__(self, porechop_dir: str, input_column_name: str, output_column_name: str, num_threads=1):
+        self.porechop_dir = porechop_dir
+        self.input_column_name = input_column_name
+        self.output_column_name = output_column_name
+        self.num_threads = num_threads
+
+    def __execute(self, data: list) -> pd.DataFrame:
+        df, tmp_dir = data
+        # e.g. ./porechop-runner.py -i {data_dir}fastq/{l}.fastq -o {data_dir}trimmed/{l}.fastq
+        runner = os.path.join(self.porechop_dir, 'porechop-runner.py')
+        file_created = []
+        for input_filename, output_filename in df[[self.input_column_name, self.output_column_name]].values:
+            subprocess.run([runner, '-i', input_filename, '-o', output_filename], check=True)
+            # Check that the trimmed file was created
+            file_created.append(os.path.exists(output_filename))
+        df['file_created'] = file_created
+        return df
+
+    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+        with TemporaryDirectory() as tmp_dir:
+            if self.num_threads > 1:
+                data = []
+                df_list = np.array_split(df, self.num_threads)
+                pool = ThreadPool(self.num_threads)
+                for df_chunk in df_list:
+                    data.append([df_chunk, tmp_dir])
+                results = pool.map(self.__execute, data)
+                df = pd.DataFrame()
+                for dfs in results:
+                    df = pd.concat([df, dfs])
+                return df
+            else:
+                return self.__execute([df, tmp_dir])
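A minimal usage sketch for the PoreChop step, assuming this section is enzymetk/metagenomics_porechop_trim_reads_step.py as the +55 entry in the listing suggests. The fastq paths, column names and Porechop checkout location are hypothetical, and the step is driven directly through its execute() method:

import pandas as pd
from enzymetk.metagenomics_porechop_trim_reads_step import PoreChop

# One row per read set: where the raw fastq lives and where the trimmed fastq should be written.
reads_df = pd.DataFrame({
    'raw_fastq': ['data/fastq/sample1.fastq', 'data/fastq/sample2.fastq'],
    'trimmed_fastq': ['data/trimmed/sample1.fastq', 'data/trimmed/sample2.fastq'],
})

step = PoreChop('/path/to/Porechop', 'raw_fastq', 'trimmed_fastq', num_threads=2)
reads_df = step.execute(reads_df)  # adds a boolean 'file_created' column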
enzymetk/metagenomics_prokka_annotate_genes.py
ADDED
@@ -0,0 +1,59 @@
+"""
+Step that annotates genes in assembled contigs with Prokka.
+
+Install: conda install -c conda-forge -c bioconda -c defaults prokka
+"""
+from enzymetk.step import Step
+import pandas as pd
+import numpy as np
+from multiprocessing.dummy import Pool as ThreadPool
+from tempfile import TemporaryDirectory
+import os
+import subprocess
+import random
+import string
+
+
+class Prokka(Step):
+
+    def __init__(self, porechop_dir: str, name: str, input_column_name: str, output_dir: str, num_threads=1):
+        # porechop_dir is stored but not used by this step; prokka itself is expected to be on the PATH.
+        # name, input_column_name and output_dir are column names in the DataFrame passed to execute().
+        self.porechop_dir = porechop_dir
+        self.name = name
+        self.input_column_name = input_column_name
+        self.output_dir = output_dir
+        self.num_threads = num_threads
+
+    def __execute(self, data: list) -> pd.DataFrame:
+        df, tmp_dir = data
+        # e.g. prokka --outdir {data_dir}prokka/{l} --prefix {l} {data_dir}flye/{l}/assembly.fasta
+        file_created = []
+        for name, input_filename, output_dir in df[[self.name, self.input_column_name, self.output_dir]].values:
+            # Note: the input is expected to be an assembly (e.g. the output from Flye)
+            subprocess.run(['prokka', '--outdir', output_dir, '--prefix', name, input_filename], check=True)
+            # Check that the annotation Prokka writes (<prefix>.gff in the output directory) was created
+            file_created.append(os.path.exists(os.path.join(output_dir, f'{name}.gff')))
+        df['file_created'] = file_created
+        return df
+
+    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+        with TemporaryDirectory() as tmp_dir:
+            if self.num_threads > 1:
+                data = []
+                df_list = np.array_split(df, self.num_threads)
+                pool = ThreadPool(self.num_threads)
+                for df_chunk in df_list:
+                    data.append([df_chunk, tmp_dir])
+                results = pool.map(self.__execute, data)
+                df = pd.DataFrame()
+                for dfs in results:
+                    df = pd.concat([df, dfs])
+                return df
+            else:
+                return self.__execute([df, tmp_dir])
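A matching sketch for the Prokka step (column names and paths are hypothetical; prokka itself is assumed to be on the PATH, and the unused porechop_dir argument is passed as an empty string):

import pandas as pd
from enzymetk.metagenomics_prokka_annotate_genes import Prokka

assemblies_df = pd.DataFrame({
    'sample': ['barcode01', 'barcode02'],
    'assembly': ['data/flye/barcode01/assembly.fasta', 'data/flye/barcode02/assembly.fasta'],
    'prokka_out': ['data/prokka/barcode01', 'data/prokka/barcode02'],
})

# The constructor takes the names of the columns holding the prefix, input assembly and output directory.
step = Prokka('', 'sample', 'assembly', 'prokka_out')
assemblies_df = step.execute(assemblies_df)  # adds a boolean 'file_created' column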
enzymetk/pipeline.py
ADDED
@@ -0,0 +1 @@
+
enzymetk/predict_catalyticsite_run.py
ADDED
@@ -0,0 +1,47 @@
+import argparse
+import os
+
+
+def run_as_inference(output_dir, fasta_file, squidly_dir, toks_per_batch, as_threshold, bs_threshold, cr_model_as,
+                     cr_model_bs, lstm_model_as, lstm_model_bs, esm2_model):
+    # Default to the 3B ESM2 model and pick the matching Squidly CR/LSTM weights unless they were overridden.
+    esm2_model = esm2_model or "esm2_t36_3B_UR50D"
+    if esm2_model == "esm2_t36_3B_UR50D":
+        cr_model_as = cr_model_as or f"{squidly_dir}Squidly_CL_3B.pt"
+        lstm_model_as = lstm_model_as or f"{squidly_dir}Squidly_LSTM_3B.pth"
+    elif esm2_model == "esm2_t48_15B_UR50D":
+        cr_model_as = cr_model_as or f"{squidly_dir}Squidly_CL_15B.pt"
+        lstm_model_as = lstm_model_as or f"{squidly_dir}Squidly_LSTM_15B.pth"
+        as_threshold = 0.99
+    # e.g. python SQUIDLY_run_model_LSTM.py ${FILE} ${ESM2_MODEL} ${CR_MODEL_AS} \
+    #          ${LSTM_MODEL_AS} ${OUT} --toks_per_batch ${TOKS_PER_BATCH} --AS_threshold ${AS_THRESHOLD} --monitor
+    command = f'conda run -n AS_inference python {squidly_dir}SQUIDLY_run_model_LSTM.py \
+        {fasta_file} {esm2_model} {cr_model_as} {lstm_model_as} {output_dir} \
+        --toks_per_batch {toks_per_batch} --AS_threshold {as_threshold}'
+    print(command)
+    os.system(command)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Run Squidly active site (AS) inference on a dataset")
+    parser.add_argument('-out', '--out', required=True, help='Path to the output directory')
+    parser.add_argument('-input', '--input', type=str, required=True, help='Path to the input fasta file')
+    parser.add_argument('--squidly_dir', type=str, required=True, help='Path to the Squidly directory')
+    parser.add_argument('--toks_per_batch', type=int, default=5, help='How many tokens per batch')
+    parser.add_argument('--as_threshold', type=float, default=0.90, help='The threshold for active site predictions')
+    parser.add_argument('--bs_threshold', type=float, default=0.85, help='The threshold for binding site predictions')
+    parser.add_argument('--cr_model_as', type=str, help='The path to the active site CR model')
+    parser.add_argument('--cr_model_bs', type=str, help='The path to the binding site CR model')
+    parser.add_argument('--lstm_model_as', type=str, help='The path to the active site LSTM model')
+    parser.add_argument('--lstm_model_bs', type=str, help='The path to the binding site LSTM model')
+    parser.add_argument('--esm2_model', type=str, help='ESM2 model name')
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    run_as_inference(args.out, args.input, args.squidly_dir, args.toks_per_batch, args.as_threshold, args.bs_threshold,
+                     args.cr_model_as, args.cr_model_bs, args.lstm_model_as, args.lstm_model_bs, args.esm2_model)
+
+
+# No `if __name__ == '__main__':` guard since this script is only run via subprocess from the step.
+main()
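For reference, a sketch of the call this script expects, expressed the same way the ActiveSitePred step below builds it (all paths here are hypothetical):

import subprocess

subprocess.run([
    'python', 'predict_catalyticsite_run.py',
    '--out', '/tmp/squidly_out',
    '--input', '/tmp/squidly_out/proteins.fasta',
    '--squidly_dir', '/path/to/squidly/',   # keep the trailing slash: model paths are built by string concatenation
    '--esm2_model', 'esm2_t36_3B_UR50D',
], check=True)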
enzymetk/predict_catalyticsite_step.py
ADDED
@@ -0,0 +1,70 @@
+from enzymetk.step import Step
+import pandas as pd
+from tempfile import TemporaryDirectory
+import subprocess
+from pathlib import Path
+import logging
+import numpy as np
+from tqdm import tqdm
+import random
+import string
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+class ActiveSitePred(Step):
+
+    def __init__(self, id_col: str, seq_col: str, squidly_dir: str, num_threads: int = 1,
+                 esm2_model='esm2_t36_3B_UR50D', tmp_dir: str = None):
+        self.id_col = id_col
+        self.seq_col = seq_col
+        self.num_threads = num_threads or 1
+        self.squidly_dir = squidly_dir
+        self.esm2_model = esm2_model
+        self.tmp_dir = tmp_dir
+
+    def __to_fasta(self, df: pd.DataFrame, tmp_dir: str):
+        tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
+        input_filename = f'{tmp_dir}/as_inference_{tmp_label}.fasta'
+        # Save the sequences as a fasta file
+        with open(input_filename, 'w+') as fout:
+            for entry, seq in df[[self.id_col, self.seq_col]].values:
+                fout.write(f'>{entry.strip()}\n{seq.strip()}\n')
+        return input_filename
+
+    def __execute(self, df: pd.DataFrame, tmp_dir: str):
+        input_filename = self.__to_fasta(df, tmp_dir)
+        # Might fail if Squidly and its models are not installed in the expected directory
+        result = subprocess.run(['python', Path(__file__).parent/'predict_catalyticsite_run.py', '--out', str(tmp_dir),
+                                 '--input', input_filename, '--squidly_dir', self.squidly_dir,
+                                 '--esm2_model', self.esm2_model], capture_output=True, text=True)
+        output_filename = input_filename.replace('.fasta', '_results.pkl')
+        if result.stderr:
+            logger.error(result.stderr)
+        logger.info(result.stdout)
+        return output_filename
+
+    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+        with TemporaryDirectory() as tmp_dir:
+            tmp_dir = self.tmp_dir if self.tmp_dir is not None else tmp_dir
+            if self.num_threads > 1:
+                output_filenames = []
+                # The chunks are run one after another; splitting just keeps each Squidly run small.
+                df_list = np.array_split(df, self.num_threads)
+                for df_chunk in tqdm(df_list):
+                    try:
+                        output_filenames.append(self.__execute(df_chunk, tmp_dir))
+                    except Exception as e:
+                        logger.error(f'Error running Squidly active site prediction: {e}')
+                        continue
+                df = pd.DataFrame()
+                for p in output_filenames:
+                    sub_df = pd.read_pickle(p)
+                    df = pd.concat([df, sub_df])
+                return df
+            else:
+                output_filename = self.__execute(df, tmp_dir)
+                return pd.read_pickle(output_filename)
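A minimal usage sketch of the ActiveSitePred step (the sequence and the Squidly checkout path are hypothetical; keep the trailing slash on squidly_dir since the model paths are concatenated onto it):

import pandas as pd
from enzymetk.predict_catalyticsite_step import ActiveSitePred

seqs_df = pd.DataFrame({
    'id': ['Q0HLQ7'],
    'sequence': ['MSTLKV...'],  # full amino acid sequence
})

step = ActiveSitePred(id_col='id', seq_col='sequence', squidly_dir='/path/to/squidly/')
results_df = step.execute(seqs_df)  # returns the DataFrame pickled by the Squidly run (*_results.pkl)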
enzymetk/reducedim_pca_run.py
ADDED
@@ -0,0 +1,67 @@
+import pickle
+import argparse
+import pandas as pd
+import numpy as np
+from scivae import VAE
+import json
+
+
+def run_vae(output_dir, input_filename, value_column, method, config_file, label, id_column):
+    # Load the input dataset
+    with open(input_filename, 'rb') as file:
+        chem_df = pickle.load(file)
+
+    chem_encodings = np.asarray([np.asarray(x) for x in chem_df[value_column].values])
+
+    # Load in the config
+    with open(config_file, 'r') as f:
+        config = json.load(f)
+
+    # Add some defaults
+    if config.get('batch_size') is None:
+        config['batch_size'] = 100
+    if config.get('epochs') is None:
+        config['epochs'] = 100
+    if config.get('early_stop') is None:
+        config['early_stop'] = True
+
+    vae_mse = VAE(chem_encodings, chem_encodings, np.ones(len(chem_encodings)), config, 'esm_label')
+    if method == 'train':
+        # Train, then save the weights, optimiser state and config
+        vae_mse.encode('default', epochs=config['epochs'], batch_size=config['batch_size'], early_stop=config['early_stop'])
+        vae_mse.save(weight_file_path=f'{output_dir}{label}_model_weights.h5',
+                     optimizer_file_path=f'{output_dir}{label}_model_optimiser.json',
+                     config_json=f'{output_dir}{label}_config.json')
+    elif method == 'encode':
+        # Load a previously trained model
+        vae_mse.load(weight_file_path=f'{output_dir}{label}_model_weights.h5',
+                     optimizer_file_path=f'{output_dir}{label}_model_optimiser.json',
+                     config_json=f'{output_dir}{label}_config.json')
+
+    encoded_data_vae_mse = vae_mse.encode_new_data(chem_encodings)
+
+    df = pd.DataFrame()
+    df['id'] = chem_df[id_column].values
+    df['encoding'] = [x for x in encoded_data_vae_mse]
+
+    # Save the encoded data
+    with open(f'{output_dir}{label}.pkl', 'wb') as file:
+        pickle.dump(df, file)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Run VAE dimensionality reduction on a dataset")
+    parser.add_argument('-o', '--out', required=True, help='Path to the output directory')
+    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the pickled dataframe')
+    parser.add_argument('-v', '--value', type=str, required=True, help='Label of the column holding the values to encode')
+    parser.add_argument('-m', '--method', type=str, required=True, help='Either "train" a VAE or "encode" with an existing one')
+    parser.add_argument('-c', '--config', type=str, required=True, help='Config file path (JSON)')
+    parser.add_argument('-l', '--label', type=str, required=True, help='Run label used when saving files')
+    parser.add_argument('-d', '--id', type=str, required=True, help='ID column')
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    run_vae(args.out, args.input, args.value, args.method, args.config, args.label, args.id)
+
+
+main()
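A sketch of how this script might be driven; the input pickle, config values and output prefix are hypothetical, and the script name assumes the listing's attribution of this first copy to reducedim_pca_run.py (the identical reducedim_vae_run.py below takes the same arguments):

import json
import subprocess

# Only batch_size, epochs and early_stop are defaulted above; any other keys the scivae VAE
# config requires (e.g. the layer/latent specification) would need to be supplied here as well.
with open('vae_config.json', 'w') as f:
    json.dump({'batch_size': 64, 'epochs': 50, 'early_stop': True}, f)

subprocess.run([
    'python', 'reducedim_pca_run.py',
    '--out', 'vae_out/',          # used as a filename prefix, so keep the trailing slash
    '--input', 'embeddings.pkl',  # pickled DataFrame with an id column and a column of vectors
    '--value', 'embedding',
    '--method', 'train',
    '--config', 'vae_config.json',
    '--label', 'run1',
    '--id', 'id',
], check=True)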
enzymetk/reducedim_vae_run.py
ADDED
@@ -0,0 +1,67 @@
+import pickle
+import argparse
+import pandas as pd
+import numpy as np
+from scivae import VAE
+import json
+
+
+def run_vae(output_dir, input_filename, value_column, method, config_file, label, id_column):
+    # Load the input dataset
+    with open(input_filename, 'rb') as file:
+        chem_df = pickle.load(file)
+
+    chem_encodings = np.asarray([np.asarray(x) for x in chem_df[value_column].values])
+
+    # Load in the config
+    with open(config_file, 'r') as f:
+        config = json.load(f)
+
+    # Add some defaults
+    if config.get('batch_size') is None:
+        config['batch_size'] = 100
+    if config.get('epochs') is None:
+        config['epochs'] = 100
+    if config.get('early_stop') is None:
+        config['early_stop'] = True
+
+    vae_mse = VAE(chem_encodings, chem_encodings, np.ones(len(chem_encodings)), config, 'esm_label')
+    if method == 'train':
+        # Train, then save the weights, optimiser state and config
+        vae_mse.encode('default', epochs=config['epochs'], batch_size=config['batch_size'], early_stop=config['early_stop'])
+        vae_mse.save(weight_file_path=f'{output_dir}{label}_model_weights.h5',
+                     optimizer_file_path=f'{output_dir}{label}_model_optimiser.json',
+                     config_json=f'{output_dir}{label}_config.json')
+    elif method == 'encode':
+        # Load a previously trained model
+        vae_mse.load(weight_file_path=f'{output_dir}{label}_model_weights.h5',
+                     optimizer_file_path=f'{output_dir}{label}_model_optimiser.json',
+                     config_json=f'{output_dir}{label}_config.json')
+
+    encoded_data_vae_mse = vae_mse.encode_new_data(chem_encodings)
+
+    df = pd.DataFrame()
+    df['id'] = chem_df[id_column].values
+    df['encoding'] = [x for x in encoded_data_vae_mse]
+
+    # Save the encoded data
+    with open(f'{output_dir}{label}.pkl', 'wb') as file:
+        pickle.dump(df, file)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Run VAE dimensionality reduction on a dataset")
+    parser.add_argument('-o', '--out', required=True, help='Path to the output directory')
+    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the pickled dataframe')
+    parser.add_argument('-v', '--value', type=str, required=True, help='Label of the column holding the values to encode')
+    parser.add_argument('-m', '--method', type=str, required=True, help='Either "train" a VAE or "encode" with an existing one')
+    parser.add_argument('-c', '--config', type=str, required=True, help='Config file path (JSON)')
+    parser.add_argument('-l', '--label', type=str, required=True, help='Run label used when saving files')
+    parser.add_argument('-d', '--id', type=str, required=True, help='ID column')
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    run_vae(args.out, args.input, args.value, args.method, args.config, args.label, args.id)
+
+
+main()
enzymetk/save_step.py
ADDED
@@ -0,0 +1,13 @@
+from enzymetk.step import Step
+
+import pandas as pd
+
+
+class Save(Step):
+
+    def __init__(self, output_filename: str):
+        self.output_filename = output_filename
+
+    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+        df.to_pickle(self.output_filename)
+        return df
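The steps in this package share the execute(df) -> df interface, so Save can be dropped onto the end of any chain to checkpoint results. A small sketch (the file name is hypothetical):

import pandas as pd
from enzymetk.save_step import Save

df = pd.DataFrame({'id': ['a', 'b'], 'value': [1, 2]})
df = Save('checkpoint.pkl').execute(df)  # writes checkpoint.pkl and passes the DataFrame through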
enzymetk/sequence_search_blast.py
ADDED
@@ -0,0 +1,80 @@
+"""
+Step to run a protein sequence similarity search with DIAMOND (blastp by default).
+"""
+from enzymetk.step import Step
+
+import pandas as pd
+import numpy as np
+from multiprocessing.dummy import Pool as ThreadPool
+from tempfile import TemporaryDirectory
+import os
+import subprocess
+import random
+import string
+
+
+class BLAST(Step):
+
+    def __init__(self, id_col: str, sequence_col: str, label_col=None, database=None, mode='blastp', args=None, tmp_dir=None):
+        self.id_col = id_col
+        self.seq_col = sequence_col
+        self.label_col = label_col  # Marks each row as either 'query' or 'reference'
+        self.mode = mode
+        self.database = database
+        self.args = args
+        self.tmp_dir = tmp_dir
+        if self.database is None and self.label_col is None:
+            raise ValueError('No database or label column set. Either pass a database (one you have already built, '
+                             'see the DIAMOND docs, or a fasta file to build one from), or include the reference '
+                             'sequences in your dataframe and pass label_col, a column with two values: "reference" '
+                             'for the sequences to search against and "query" for the sequences to search for.')
+
+    def __execute(self, data: list) -> pd.DataFrame:
+        df, tmp_dir = data
+        tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
+        query_fasta = os.path.join(tmp_dir, f'{tmp_label}_query.fasta')
+        ref_fasta = os.path.join(tmp_dir, f'{tmp_label}_ref.fasta')
+        db_label = os.path.join(tmp_dir, f'{tmp_label}_db')
+        # Write the fasta files which are the input for DIAMOND
+        if self.label_col is not None:
+            with open(query_fasta, 'w+') as fout:
+                query_df = df[df[self.label_col] == 'query']
+                for entry, seq in query_df[[self.id_col, self.seq_col]].values:
+                    fout.write(f'>{entry.strip()}\n{seq.strip()}\n')
+
+            with open(ref_fasta, 'w+') as fout:
+                ref_df = df[df[self.label_col] == 'reference']
+                for entry, seq in ref_df[[self.id_col, self.seq_col]].values:
+                    fout.write(f'>{entry.strip()}\n{seq.strip()}\n')
+            # Make the reference DB first
+            db_label = os.path.join(tmp_dir, f'{tmp_label}_refdb')
+            subprocess.run(['diamond', 'makedb', '--in', ref_fasta, '-d', db_label], check=True)
+        else:
+            with open(query_fasta, 'w+') as fout:
+                for entry, seq in df[[self.id_col, self.seq_col]].values:
+                    fout.write(f'>{entry.strip()}\n{seq.strip()}\n')
+            if os.path.exists(self.database):
+                # Here we're assuming a fasta file was passed as the database, so build a DIAMOND DB from it
+                subprocess.run(['diamond', 'makedb', '--in', self.database, '-d', db_label], check=True)
+            else:
+                # Otherwise assume it is the label of an existing DIAMOND database
+                db_label = self.database
+
+        # Run DIAMOND on the generated fasta file
+        matches_filename = os.path.join(tmp_dir, f'{tmp_label}_matches.tsv')
+        cmd = ['diamond', self.mode]
+        if self.args is not None:
+            cmd.extend(self.args)
+        cmd.extend(['-d', db_label, '-q', query_fasta, '-o', matches_filename])
+        self.run(cmd)
+        df = pd.read_csv(matches_filename, sep='\t', header=None)
+        df.columns = ['query', 'target', 'sequence identity', 'length', 'mismatch', 'gapopen',
+                      'query start', 'query end', 'target start', 'target end', 'e-value', 'bitscore']
+        return df
+
+    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+        if self.tmp_dir is not None:
+            return self.__execute([df, self.tmp_dir])
+        with TemporaryDirectory() as tmp_dir:
+            return self.__execute([df, tmp_dir])
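A minimal usage sketch of the DIAMOND step with the reference sequences carried in the dataframe (IDs, sequences and the role column name are hypothetical; diamond must be on the PATH):

import pandas as pd
from enzymetk.sequence_search_blast import BLAST

df = pd.DataFrame({
    'id': ['query_1', 'ref_1', 'ref_2'],
    'sequence': ['MKT...', 'MSE...', 'MAD...'],
    'role': ['query', 'reference', 'reference'],
})

hits = BLAST(id_col='id', sequence_col='sequence', label_col='role').execute(df)
# hits is the match table: query, target, sequence identity, ..., e-value, bitscore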
enzymetk/similarity_foldseek_step.py
ADDED
@@ -0,0 +1,114 @@
+#./foldseek easy-search /home/ariane/degradeo/data/pipeline/p1_predict_activity/p1b_encode_protein/e1_esm/chai/Q0HLQ7/chai/Q0HLQ7_0.cif /home/ariane/degradeo/data/pipeline/p1_predict_activity/p1b_encode_protein/e1_esm/chai/Q0HLQ7/chai/Q0HLQ7_1.cif pdb test_aln.fasta tmp
+"""
+Step to run FoldSeek structural similarity search or clustering over a dataframe of structures
+(or of sequences, which are converted to a structure database via ProstT5).
+"""
+from enzymetk.step import Step
+
+import pandas as pd
+import numpy as np
+from tempfile import TemporaryDirectory
+import subprocess
+import random
+import string
+
+
+def process_clustering(filename, df, id_column_name):
+    clustering = pd.read_csv(filename, delimiter='\t', header=None)
+    # Rename the headings to the cluster representative and the member id
+    clustering.columns = ['foldseek_representative_cluster_structure', id_column_name]
+    clustering.drop_duplicates(subset=id_column_name, keep='first', inplace=True)
+    # Move the chain into a separate column
+    clustering['chain'] = [c.split('_')[-1] for c in clustering[id_column_name].values]
+    clustering[id_column_name] = ['_'.join(c.split('_')[:-1]) for c in clustering[id_column_name].values]
+    clustering.set_index(id_column_name, inplace=True)
+    # Join the clustering with the df
+    df = df.set_index(id_column_name)
+    df = df.join(clustering, how='left')
+    df.reset_index(inplace=True)
+    return df
+
+
+class FoldSeek(Step):
+
+    def __init__(self, id_column_name: str, query_column_name: str, reference_database: str, method='search',
+                 query_type='structures', args=None, tmp_dir: str = None):
+        self.query_column_name = query_column_name
+        self.id_column_name = id_column_name
+        self.reference_database = reference_database  # pdb should be the default
+        self.tmp_dir = tmp_dir
+        self.method = method
+        self.args = args
+        self.query_type = query_type
+        if self.method not in ['search', 'cluster']:
+            print('Method must be either "search" or "cluster". Will likely fail...')
+        if self.query_type not in ['seqs', 'structures']:
+            print('query_type must be either "seqs" or "structures", i.e. amino acid sequences or paths to PDB files.')
+
+    def __execute(self, data: list) -> pd.DataFrame:
+        df, tmp_dir = data
+        tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
+
+        if self.query_type == 'seqs':
+            # Convert the sequences to a fasta file
+            with open(f'{tmp_dir}/{tmp_label}_seqs.fasta', 'w') as f:
+                for i, row in df.iterrows():
+                    f.write(f'>{row[self.id_column_name]}\n{row[self.query_column_name]}\n')
+
+        # Get the PDB files from the column
+        pdb_files = list(df[self.query_column_name].values)
+        output_filename = f'{tmp_dir}/{tmp_label}.txt'
+
+        if self.method == 'search':
+            cmd = ['foldseek', 'easy-search']
+            if self.query_type == 'structures':
+                cmd += pdb_files + [f'{self.reference_database}', output_filename, 'tmp']
+            else:
+                # Build a sequence database with ProstT5 and search with that instead
+                # ToDo: make this more efficient
+                subcmd = ['foldseek', 'databases', 'ProstT5', 'weights', 'tmp']
+                self.run(subcmd)
+                subcmd = ['foldseek', 'createdb', f'{tmp_dir}/{tmp_label}_seqs.fasta', f'db_{tmp_label}', '--prostt5-model', 'weights']
+                self.run(subcmd)
+                # Pass the newly created DB as the query
+                cmd += [f'db_{tmp_label}', f'{self.reference_database}', output_filename, 'tmp']
+
+        elif self.method == 'cluster':
+            cmd = ['foldseek', 'easy-cluster']
+            if self.query_type == 'structures':
+                cmd += pdb_files + [f'{tmp_dir}/clusterFolds', f'{tmp_dir}']
+            else:
+                # Build a sequence database with ProstT5 and cluster that
+                subcmd = ['foldseek', 'databases', 'ProstT5', 'weights', 'tmp']
+                self.run(subcmd)
+                subcmd = ['foldseek', 'createdb', f'{tmp_dir}/{tmp_label}_seqs.fasta', f'db_{tmp_label}', '--prostt5-model', 'weights']
+                self.run(subcmd)
+                cmd = ['foldseek', 'cluster']
+                cmd += [f'db_{tmp_label}', f'{tmp_dir}/clusterFolds', f'{tmp_dir}']
+
+        # Add in any extra args
+        if self.args is not None:
+            cmd.extend(self.args)
+
+        self.run(cmd)
+
+        if self.method == 'search':
+            df = pd.read_csv(output_filename, header=None, sep='\t')
+            df.columns = ['query', 'target', 'fident', 'alnlen', 'mismatch',
+                          'gapopen', 'qstart', 'qend', 'tstart', 'tend', 'evalue', 'bits']
+        elif self.method == 'cluster':
+            df = process_clustering(f'{tmp_dir}/clusterFolds_cluster.tsv', df, self.id_column_name)
+        return df
+
+    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+        if self.tmp_dir is not None:
+            return self.__execute([df, self.tmp_dir])
+        with TemporaryDirectory() as tmp_dir:
+            return self.__execute([df, tmp_dir])
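A minimal usage sketch of the FoldSeek step in search mode against a prebuilt reference database (the structure paths and database name are hypothetical; foldseek must be on the PATH and Step.run is assumed to execute the assembled command):

import pandas as pd
from enzymetk.similarity_foldseek_step import FoldSeek

structures_df = pd.DataFrame({
    'id': ['Q0HLQ7_0', 'Q0HLQ7_1'],
    'structure': ['chai/Q0HLQ7/Q0HLQ7_0.cif', 'chai/Q0HLQ7/Q0HLQ7_1.cif'],
})

step = FoldSeek(id_column_name='id', query_column_name='structure', reference_database='pdb', method='search')
hits = step.execute(structures_df)  # columns: query, target, fident, alnlen, ..., evalue, bits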