enzymetk 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. enzymetk/__init__.py +56 -0
  2. enzymetk/annotateEC_CLEAN_step.py +122 -0
  3. enzymetk/annotateEC_CREEP_step.py +82 -0
  4. enzymetk/annotateEC_proteinfer_step.py +136 -0
  5. enzymetk/dock_chai_step.py +51 -0
  6. enzymetk/dock_vina_step.py +63 -0
  7. enzymetk/embedchem_chemberta_step.py +61 -0
  8. enzymetk/embedchem_rxnfp_run.py +28 -0
  9. enzymetk/embedchem_rxnfp_step.py +55 -0
  10. enzymetk/embedchem_selformer_run.py +28 -0
  11. enzymetk/embedchem_selformer_step.py +39 -0
  12. enzymetk/embedchem_unimol_step.py +57 -0
  13. enzymetk/embedprotein_esm_step.py +123 -0
  14. enzymetk/esm-extract.py +140 -0
  15. enzymetk/filter_sequence_step.py +0 -0
  16. enzymetk/filter_structure_step.py +0 -0
  17. enzymetk/generate_msa_step.py +61 -0
  18. enzymetk/generate_oligopool_step.py +0 -0
  19. enzymetk/generate_tree_step.py +74 -0
  20. enzymetk/inpaint_ligandMPNN_step.py +65 -0
  21. enzymetk/main.py +37 -0
  22. enzymetk/metagenomics_porechop_trim_reads_step.py +55 -0
  23. enzymetk/metagenomics_prokka_annotate_genes.py +59 -0
  24. enzymetk/pipeline.py +1 -0
  25. enzymetk/predict_activity_step.py +0 -0
  26. enzymetk/predict_catalyticsite_run.py +47 -0
  27. enzymetk/predict_catalyticsite_step.py +70 -0
  28. enzymetk/reducedim_pca_run.py +67 -0
  29. enzymetk/reducedim_vae_run.py +67 -0
  30. enzymetk/reducedim_vae_step.py +12 -0
  31. enzymetk/save_step.py +13 -0
  32. enzymetk/sequence_search_blast.py +80 -0
  33. enzymetk/similarity_foldseek_step.py +114 -0
  34. enzymetk/similarity_mmseqs_step.py +80 -0
  35. enzymetk/similarity_reaction_step.py +60 -0
  36. enzymetk/similarity_substrate_step.py +59 -0
  37. enzymetk/step.py +60 -0
  38. enzymetk-0.0.1.data/data/LICENSE +0 -0
  39. enzymetk-0.0.1.dist-info/LICENSE +0 -0
  40. enzymetk-0.0.1.dist-info/METADATA +370 -0
  41. enzymetk-0.0.1.dist-info/RECORD +44 -0
  42. enzymetk-0.0.1.dist-info/WHEEL +5 -0
  43. enzymetk-0.0.1.dist-info/entry_points.txt +2 -0
  44. enzymetk-0.0.1.dist-info/top_level.txt +1 -0
enzymetk/metagenomics_porechop_trim_reads_step.py ADDED
@@ -0,0 +1,55 @@
+ """
+ Install the tool, then activate its environment and install and run via that.
+
+ Honestly it's a bit hacky the way they do it; rather than change things, we have to save the data to their
+ repo and then copy it out of it.
+ """
+ from enzymetk.step import Step
+ import pandas as pd
+ import numpy as np
+ from multiprocessing.dummy import Pool as ThreadPool
+ from tempfile import TemporaryDirectory
+ import os
+ import subprocess
+ import random
+ import string
+
+
+ class PoreChop(Step):
+
+     def __init__(self, porechop_dir: str, input_column_name: str, output_column_name: str, num_threads=1):
+         self.porechop_dir = porechop_dir
+         self.input_column_name = input_column_name
+         self.output_column_name = output_column_name
+         self.num_threads = num_threads
+
+     def __execute(self, data: list) -> pd.DataFrame:
+         df, tmp_dir = data
+         # f'./porechop-runner.py -i {data_dir}fastq/{l}.fastq -o {data_dir}trimmed/{l}.fastq'
+         file_created = []
+         for input_filename, output_filename in df[[self.input_column_name, self.output_column_name]].values:
+             subprocess.run([os.path.join(self.porechop_dir, 'porechop-runner.py'), '-i', input_filename, '-o', output_filename], check=True)
+             # Check that the trimmed file was created
+             if os.path.exists(output_filename):
+                 file_created.append(True)
+             else:
+                 file_created.append(False)
+         df['file_created'] = file_created
+         return df
+
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         with TemporaryDirectory() as tmp_dir:
+             if self.num_threads > 1:
+                 data = []
+                 df_list = np.array_split(df, self.num_threads)
+                 pool = ThreadPool(self.num_threads)
+                 for df_chunk in df_list:
+                     data.append([df_chunk, tmp_dir])
+                 results = pool.map(self.__execute, data)
+                 df = pd.DataFrame()
+                 for dfs in results:
+                     df = pd.concat([df, dfs])
+                 return df
+             else:
+                 return self.__execute([df, tmp_dir])
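A minimal usage sketch for the PoreChop step (the DataFrame columns, file paths and Porechop checkout location below are hypothetical, not part of the package):

import pandas as pd
from enzymetk.metagenomics_porechop_trim_reads_step import PoreChop

# Hypothetical example: columns hold paths to raw and trimmed fastq files
df = pd.DataFrame({
    'raw_fastq': ['reads/sample1.fastq', 'reads/sample2.fastq'],
    'trimmed_fastq': ['trimmed/sample1.fastq', 'trimmed/sample2.fastq'],
})
step = PoreChop('/path/to/Porechop/', 'raw_fastq', 'trimmed_fastq', num_threads=2)
df = step.execute(df)  # adds a 'file_created' column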
enzymetk/metagenomics_prokka_annotate_genes.py ADDED
@@ -0,0 +1,59 @@
+ """
+ Install the tool, then activate its environment and install and run via that.
+
+ Honestly it's a bit hacky the way they do it; rather than change things, we have to save the data to their
+ repo and then copy it out of it.
+ """
+ from enzymetk.step import Step
+ import pandas as pd
+ import numpy as np
+ from multiprocessing.dummy import Pool as ThreadPool
+ from tempfile import TemporaryDirectory
+ import os
+ import subprocess
+ import random
+ import string
+
+
+ """ Install: conda install -c conda-forge -c bioconda -c defaults prokka """
+ class Prokka(Step):
+
+     def __init__(self, porechop_dir: str, name: str, input_column_name: str, output_dir: str, num_threads=1):
+         # NOTE: porechop_dir is carried over from the Porechop step and is not used below
+         self.porechop_dir = porechop_dir
+         self.name = name
+         self.input_column_name = input_column_name
+         self.output_dir = output_dir
+         self.num_threads = num_threads
+
+     def __execute(self, data: list) -> pd.DataFrame:
+         df, tmp_dir = data
+         # f'prokka --outdir {data_dir}prokka/{l} --prefix {l} {data_dir}flye/{l}/assembly.fasta '
+         file_created = []
+         for name, input_filename, output_dir in df[[self.name, self.input_column_name, self.output_dir]].values:
+             # Note: it expects the input file to be the assembly output from flye
+             subprocess.run(['prokka', '--outdir', output_dir, '--prefix', name, input_filename], check=True)
+             # Check that the prokka output directory was created
+             if os.path.exists(output_dir):
+                 file_created.append(True)
+             else:
+                 file_created.append(False)
+         df['file_created'] = file_created
+         return df
+
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         with TemporaryDirectory() as tmp_dir:
+             if self.num_threads > 1:
+                 data = []
+                 df_list = np.array_split(df, self.num_threads)
+                 pool = ThreadPool(self.num_threads)
+                 for df_chunk in df_list:
+                     data.append([df_chunk, tmp_dir])
+                 results = pool.map(self.__execute, data)
+                 df = pd.DataFrame()
+                 for dfs in results:
+                     df = pd.concat([df, dfs])
+                 return df
+             else:
+                 return self.__execute([df, tmp_dir])
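A minimal usage sketch for the Prokka step (the sample names, assembly paths and output directories are hypothetical; prokka must be installed as above):

import pandas as pd
from enzymetk.metagenomics_prokka_annotate_genes import Prokka

# Hypothetical example: each row names a sample, its flye assembly, and a prokka output directory
df = pd.DataFrame({
    'sample': ['s1'],
    'assembly': ['flye/s1/assembly.fasta'],
    'prokka_out': ['prokka/s1'],
})
step = Prokka(porechop_dir='', name='sample', input_column_name='assembly', output_dir='prokka_out')
df = step.execute(df)  # adds a 'file_created' column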
enzymetk/pipeline.py ADDED
@@ -0,0 +1 @@
+
enzymetk/predict_catalyticsite_run.py ADDED
@@ -0,0 +1,47 @@
+ import argparse
+ import os
+
+
+ def run_as_inference(output_dir, fasta_file, squidly_dir, toks_per_batch, as_threshold, bs_threshold, cr_model_as,
+                      cr_model_bs, lstm_model_as, lstm_model_bs, esm2_model):
+     # bs_threshold, cr_model_bs and lstm_model_bs are accepted but not used by this script
+     esm2_model = esm2_model or "esm2_t36_3B_UR50D"
+     if esm2_model == "esm2_t36_3B_UR50D":
+         cr_model_as = cr_model_as or f"{squidly_dir}Squidly_CL_3B.pt"
+         lstm_model_as = lstm_model_as or f"{squidly_dir}Squidly_LSTM_3B.pth"
+     elif esm2_model == "esm2_t48_15B_UR50D":
+         cr_model_as = cr_model_as or f"{squidly_dir}Squidly_CL_15B.pt"
+         lstm_model_as = lstm_model_as or f"{squidly_dir}Squidly_LSTM_15B.pth"
+         as_threshold = 0.99
+     # python /scratch/project/squid/code_modular/SQUIDLY_run_model_LSTM.py ${FILE} ${ESM2_MODEL} ${CR_MODEL_AS}
+     # ${LSTM_MODEL_AS} ${OUT} --toks_per_batch ${TOKS_PER_BATCH} --AS_threshold ${AS_THRESHOLD} --monitor
+
+     command = f'conda run -n AS_inference python {squidly_dir}SQUIDLY_run_model_LSTM.py \
+         {fasta_file} {esm2_model} {cr_model_as} {lstm_model_as} {output_dir} \
+         --toks_per_batch {toks_per_batch} --AS_threshold {as_threshold}'
+     print(command)
+     os.system(command)
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Run active-site inference on a dataset")
+     parser.add_argument('-out', '--out', required=True, help='Path to the output directory')
+     parser.add_argument('-input', '--input', type=str, required=True, help='Path to the input fasta file')
+     parser.add_argument('--squidly_dir', type=str, required=True, help='Path to the squidly directory')
+     parser.add_argument('--toks_per_batch', type=int, default=5, help='How many tokens per batch')
+     parser.add_argument('--as_threshold', type=float, default=0.90, help='Threshold for active-site calls')
+     parser.add_argument('--bs_threshold', type=float, default=0.85, help='Threshold for binding-site calls')
+     parser.add_argument('--cr_model_as', type=str, help='Path to the active-site CR model')
+     parser.add_argument('--cr_model_bs', type=str, help='Path to the binding-site CR model')
+     parser.add_argument('--lstm_model_as', type=str, help='Path to the active-site LSTM model')
+     parser.add_argument('--lstm_model_bs', type=str, help='Path to the binding-site LSTM model')
+     parser.add_argument('--esm2_model', type=str, help='ESM2 model name')
+     return parser.parse_args()
+
+
+ def main():
+     args = parse_args()
+     run_as_inference(args.out, args.input, args.squidly_dir, args.toks_per_batch, args.as_threshold, args.bs_threshold,
+                      args.cr_model_as, args.cr_model_bs, args.lstm_model_as, args.lstm_model_bs, args.esm2_model)
+
+
+ # No `if __name__ == '__main__':` guard: this script is always run as a subprocess by the ActiveSitePred step.
+ main()
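For reference, a sketch of how this script can be invoked directly (all paths are hypothetical; the ActiveSitePred step below builds the equivalent call itself via subprocess):

import subprocess

# Hypothetical paths and output directory
subprocess.run([
    'python', 'enzymetk/predict_catalyticsite_run.py',
    '--out', './squidly_out/',
    '--input', './seqs.fasta',
    '--squidly_dir', '/path/to/squidly/',
    '--esm2_model', 'esm2_t36_3B_UR50D',
], check=True)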
enzymetk/predict_catalyticsite_step.py ADDED
@@ -0,0 +1,70 @@
+ from enzymetk.step import Step
+ import pandas as pd
+ from tempfile import TemporaryDirectory
+ import subprocess
+ from pathlib import Path
+ import logging
+ import numpy as np
+ from tqdm import tqdm
+ import random
+ import string
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+
+
+ class ActiveSitePred(Step):
+
+     def __init__(self, id_col: str, seq_col: str, squidly_dir: str, num_threads: int = 1,
+                  esm2_model='esm2_t36_3B_UR50D', tmp_dir: str = None):
+         self.id_col = id_col
+         self.seq_col = seq_col
+         self.num_threads = num_threads or 1
+         self.squidly_dir = squidly_dir
+         self.esm2_model = esm2_model
+         self.tmp_dir = tmp_dir
+
+     def __to_fasta(self, df: pd.DataFrame, tmp_dir: str):
+         tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
+         input_filename = f'{tmp_dir}/as_inference_{tmp_label}.fasta'
+         # Save as a fasta
+         with open(input_filename, 'w+') as fout:
+             for entry, seq in df[[self.id_col, self.seq_col]].values:
+                 fout.write(f'>{entry.strip()}\n{seq.strip()}\n')
+         return input_filename
+
+     def __execute(self, df: pd.DataFrame, tmp_dir: str):
+         input_filename = self.__to_fasta(df, tmp_dir)
+         # Might have an issue if squidly is not correctly installed in the expected directory
+         result = subprocess.run(['python', Path(__file__).parent / 'predict_catalyticsite_run.py', '--out', str(tmp_dir),
+                                  '--input', input_filename, '--squidly_dir', self.squidly_dir,
+                                  '--esm2_model', self.esm2_model], capture_output=True, text=True)
+         output_filename = input_filename.replace('.fasta', '_results.pkl')
+         if result.stderr:
+             logger.error(result.stderr)
+         logger.info(result.stdout)
+         return output_filename
+
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         with TemporaryDirectory() as tmp_dir:
+             tmp_dir = self.tmp_dir if self.tmp_dir is not None else tmp_dir
+             if self.num_threads > 1:
+                 output_filenames = []
+                 # Note: the chunks are processed sequentially; splitting keeps each fasta (and model batch) small
+                 df_list = np.array_split(df, self.num_threads)
+                 for df_chunk in tqdm(df_list):
+                     try:
+                         output_filenames.append(self.__execute(df_chunk, tmp_dir))
+                     except Exception as e:
+                         logger.error(f"Error in executing ESM2 model: {e}")
+                         continue
+                 df = pd.DataFrame()
+                 logger.info(output_filenames)
+                 for p in output_filenames:
+                     sub_df = pd.read_pickle(p)
+                     df = pd.concat([df, sub_df])
+                 return df
+             else:
+                 output_filename = self.__execute(df, tmp_dir)
+                 return pd.read_pickle(output_filename)
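A minimal usage sketch for the ActiveSitePred step (the column names, sequence and squidly checkout path are hypothetical):

import pandas as pd
from enzymetk.predict_catalyticsite_step import ActiveSitePred

df = pd.DataFrame({
    'id': ['P12345'],
    'sequence': ['MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ'],
})
step = ActiveSitePred('id', 'sequence', squidly_dir='/path/to/squidly/')
results = step.execute(df)  # returns the dataframe pickled by the squidly run script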
@@ -0,0 +1,67 @@
+ import pickle
+ import argparse
+ import pandas as pd
+ import numpy as np
+ from scivae import VAE
+ import json
+
+
+ def run_vae(output_dir, input_filename, value_column, method, config_file, label, id_column):
+     # Load the input dataset
+     with open(input_filename, 'rb') as file:
+         chem_df = pickle.load(file)
+
+     chem_encodings = np.asarray([np.asarray(x) for x in chem_df[value_column].values])
+
+     # Load in the config
+     with open(config_file, 'r') as f:
+         config = json.load(f)
+
+     # Add some defaults
+     if config.get('batch_size') is None:
+         config['batch_size'] = 100
+     if config.get('epochs') is None:
+         config['epochs'] = 100
+     if config.get('early_stop') is None:
+         config['early_stop'] = True
+     vae_mse = VAE(chem_encodings, chem_encodings, np.ones(len(chem_encodings)), config, 'esm_label')
+     # Set batch size and number of epochs
+     if method == 'train':
+         vae_mse.encode('default', epochs=config['epochs'], batch_size=config['batch_size'], early_stop=config['early_stop'])
+         # Save this
+         vae_mse.save(weight_file_path=f'{output_dir}{label}_model_weights.h5', optimizer_file_path=f'{output_dir}{label}_model_optimiser.json',
+                      config_json=f'{output_dir}{label}_config.json')
+     elif method == 'encode':
+         # Load this
+         vae_mse.load(weight_file_path=f'{output_dir}{label}_model_weights.h5', optimizer_file_path=f'{output_dir}{label}_model_optimiser.json',
+                      config_json=f'{output_dir}{label}_config.json')
+
+     encoded_data_vae_mse = vae_mse.encode_new_data(chem_encodings)
+
+     df = pd.DataFrame()
+     df['id'] = chem_df[f'{id_column}'].values
+     df['encoding'] = [x for x in encoded_data_vae_mse]
+
+     # Save the encoded data
+     with open(f'{output_dir}{label}.pkl', 'wb') as file:
+         pickle.dump(df, file)
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Run VAE dimensionality reduction on a dataset")
+     parser.add_argument('-o', '--out', required=True, help='Path to the output directory')
+     parser.add_argument('-i', '--input', type=str, required=True, help='path to the dataframe')
+     parser.add_argument('-v', '--value', type=str, required=True, help='label of the column which has the values for encoding')
+     parser.add_argument('-m', '--method', type=str, required=True, help='either to encode or train a VAE')
+     parser.add_argument('-c', '--config', type=str, required=True, help='config file path as JSON')
+     parser.add_argument('-l', '--label', type=str, required=True, help='run label for saving')
+     parser.add_argument('-d', '--id', type=str, required=True, help='id column')
+     return parser.parse_args()
+
+
+ def main():
+     args = parse_args()
+     # def run_vae(output_filename, input_filename, value_column, method, config_file, label_column, id_column):
+     run_vae(args.out, args.input, args.value, args.method, args.config, args.label, args.id)
+
+
+ main()
@@ -0,0 +1,67 @@
+ import pickle
+ import argparse
+ import pandas as pd
+ import numpy as np
+ from scivae import VAE
+ import json
+
+
+ def run_vae(output_dir, input_filename, value_column, method, config_file, label, id_column):
+     # Load the input dataset
+     with open(input_filename, 'rb') as file:
+         chem_df = pickle.load(file)
+
+     chem_encodings = np.asarray([np.asarray(x) for x in chem_df[value_column].values])
+
+     # Load in the config
+     with open(config_file, 'r') as f:
+         config = json.load(f)
+
+     # Add some defaults
+     if config.get('batch_size') is None:
+         config['batch_size'] = 100
+     if config.get('epochs') is None:
+         config['epochs'] = 100
+     if config.get('early_stop') is None:
+         config['early_stop'] = True
+     vae_mse = VAE(chem_encodings, chem_encodings, np.ones(len(chem_encodings)), config, 'esm_label')
+     # Set batch size and number of epochs
+     if method == 'train':
+         vae_mse.encode('default', epochs=config['epochs'], batch_size=config['batch_size'], early_stop=config['early_stop'])
+         # Save this
+         vae_mse.save(weight_file_path=f'{output_dir}{label}_model_weights.h5', optimizer_file_path=f'{output_dir}{label}_model_optimiser.json',
+                      config_json=f'{output_dir}{label}_config.json')
+     elif method == 'encode':
+         # Load this
+         vae_mse.load(weight_file_path=f'{output_dir}{label}_model_weights.h5', optimizer_file_path=f'{output_dir}{label}_model_optimiser.json',
+                      config_json=f'{output_dir}{label}_config.json')
+
+     encoded_data_vae_mse = vae_mse.encode_new_data(chem_encodings)
+
+     df = pd.DataFrame()
+     df['id'] = chem_df[f'{id_column}'].values
+     df['encoding'] = [x for x in encoded_data_vae_mse]
+
+     # Save the encoded data
+     with open(f'{output_dir}{label}.pkl', 'wb') as file:
+         pickle.dump(df, file)
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Run VAE dimensionality reduction on a dataset")
+     parser.add_argument('-o', '--out', required=True, help='Path to the output directory')
+     parser.add_argument('-i', '--input', type=str, required=True, help='path to the dataframe')
+     parser.add_argument('-v', '--value', type=str, required=True, help='label of the column which has the values for encoding')
+     parser.add_argument('-m', '--method', type=str, required=True, help='either to encode or train a VAE')
+     parser.add_argument('-c', '--config', type=str, required=True, help='config file path as JSON')
+     parser.add_argument('-l', '--label', type=str, required=True, help='run label for saving')
+     parser.add_argument('-d', '--id', type=str, required=True, help='id column')
+     return parser.parse_args()
+
+
+ def main():
+     args = parse_args()
+     # def run_vae(output_filename, input_filename, value_column, method, config_file, label_column, id_column):
+     run_vae(args.out, args.input, args.value, args.method, args.config, args.label, args.id)
+
+
+ main()
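A sketch of how either of the two reducedim run scripts above might be invoked (file names, the run label and the config contents are hypothetical; note the trailing slash on --out, since the script concatenates the output directory directly onto the file names):

import subprocess

# Hypothetical paths; the config JSON must contain whatever keys the scivae VAE expects,
# with batch_size, epochs and early_stop defaulted by the script if absent
subprocess.run([
    'python', 'enzymetk/reducedim_vae_run.py',
    '--out', './vae_out/',
    '--input', './embeddings.pkl',
    '--value', 'encoding',
    '--method', 'train',
    '--config', './vae_config.json',
    '--label', 'run1',
    '--id', 'id',
], check=True)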
enzymetk/reducedim_vae_step.py ADDED
@@ -0,0 +1,12 @@
+ from enzymetk.step import Step
+
+ import pandas as pd
+ from tempfile import TemporaryDirectory
+ import subprocess
+ from pathlib import Path
+ import logging
+ import datetime
+
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
enzymetk/save_step.py ADDED
@@ -0,0 +1,13 @@
+ from enzymetk.step import Step
+
+
+ import pandas as pd
+
+
+ class Save(Step):
+
+     def __init__(self, output_filename: str):
+         self.output_filename = output_filename
+
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         df.to_pickle(self.output_filename)
+         return df
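A minimal usage sketch for the Save step (the example data and output path are hypothetical):

import pandas as pd
from enzymetk.save_step import Save

df = pd.DataFrame({'id': ['a', 'b'], 'value': [1, 2]})
df = Save('results.pkl').execute(df)  # pickles the dataframe and passes it through unchanged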
enzymetk/sequence_search_blast.py ADDED
@@ -0,0 +1,80 @@
+ """
+ Step to run a protein sequence similarity search with DIAMOND (blastp by default).
+ Example: diamond blastp -d ref_db -q query.fasta -o matches.tsv
+ """
+ from enzymetk.step import Step
+
+ import pandas as pd
+ import numpy as np
+ from multiprocessing.dummy import Pool as ThreadPool
+ from tempfile import TemporaryDirectory
+ import os
+ import subprocess
+ import random
+ import string
+
+
+ class BLAST(Step):
+
+     def __init__(self, id_col: str, sequence_col: str, label_col=None, database=None, mode='blastp', args=None, tmp_dir=None):
+         self.id_col = id_col
+         self.seq_col = sequence_col
+         self.label_col = label_col  # This is whether a sequence is query or reference
+         self.mode = mode
+         self.database = database
+         self.args = args
+         self.tmp_dir = tmp_dir
+         if self.database is None and self.label_col is None:
+             raise ValueError('Database is not set. Either pass a database you have already created (see the DIAMOND '
+                              'documentation) or include the sequences in your dataframe and pass the label column, '
+                              'which must contain two values: "reference" (sequences you want to search against) and '
+                              '"query" (sequences you want to search for).')
+
+     def __execute(self, data: list) -> pd.DataFrame:
+         df, tmp_dir = data
+         tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
+         query_fasta = os.path.join(tmp_dir, f'{tmp_label}_query.fasta')
+         ref_fasta = os.path.join(tmp_dir, f'{tmp_label}_ref.fasta')
+         db_label = os.path.join(tmp_dir, f'{tmp_label}_db')
+         # Write the fasta files that are the input for DIAMOND
+         if self.label_col is not None:
+             with open(query_fasta, 'w+') as fout:
+                 query_df = df[df[self.label_col] == 'query']
+                 print(query_df)
+                 for entry, seq in query_df[[self.id_col, self.seq_col]].values:
+                     fout.write(f'>{entry.strip()}\n{seq.strip()}\n')
+
+             with open(ref_fasta, 'w+') as fout:
+                 ref_df = df[df[self.label_col] == 'reference']
+                 print(ref_df)
+                 for entry, seq in ref_df[[self.id_col, self.seq_col]].values:
+                     fout.write(f'>{entry.strip()}\n{seq.strip()}\n')
+             # Make the DB first
+             db_label = os.path.join(tmp_dir, f'{tmp_label}_refdb')
+             subprocess.run(['diamond', 'makedb', '--in', ref_fasta, '-d', db_label], check=True)
+         else:
+             with open(query_fasta, 'w+') as fout:
+                 for entry, seq in df[[self.id_col, self.seq_col]].values:
+                     fout.write(f'>{entry.strip()}\n{seq.strip()}\n')
+             if os.path.exists(self.database):
+                 # Here we're assuming they're passing a database as a fasta file
+                 subprocess.run(['diamond', 'makedb', '--in', self.database, '-d', db_label], check=True)
+             else:
+                 db_label = self.database
+
+         # Run the DIAMOND search against the generated database
+         matches_filename = os.path.join(tmp_dir, f'{tmp_label}_matches.tsv')
+         cmd = ['diamond', self.mode]
+         if self.args is not None:
+             cmd.extend(self.args)
+         cmd.extend(['-d', db_label, '-q', query_fasta, '-o', matches_filename])
+         print(cmd)
+         self.run(cmd)
+         df = pd.read_csv(matches_filename, sep='\t', header=None)
+         print(df)
+         df.columns = ['query', 'target', 'sequence identity', 'length', 'mismatch', 'gapopen',
+                       'query start', 'query end', 'target start', 'target end', 'e-value', 'bitscore']
+         return df
+
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         if self.tmp_dir is not None:
+             return self.__execute([df, self.tmp_dir])
+         with TemporaryDirectory() as tmp_dir:
+             return self.__execute([df, tmp_dir])
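A minimal usage sketch for the BLAST step using a label column (the ids, sequences and column names are hypothetical; DIAMOND must be on the PATH):

import pandas as pd
from enzymetk.sequence_search_blast import BLAST

df = pd.DataFrame({
    'id': ['ref1', 'q1'],
    'sequence': ['MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ', 'MKTAYIAKQRQISFVKSHF'],
    'role': ['reference', 'query'],
})
step = BLAST('id', 'sequence', label_col='role')
hits = step.execute(df)  # one row per DIAMOND hit: query, target, sequence identity, ...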
enzymetk/similarity_foldseek_step.py ADDED
@@ -0,0 +1,114 @@
+ # ./foldseek easy-search /home/ariane/degradeo/data/pipeline/p1_predict_activity/p1b_encode_protein/e1_esm/chai/Q0HLQ7/chai/Q0HLQ7_0.cif /home/ariane/degradeo/data/pipeline/p1_predict_activity/p1b_encode_protein/e1_esm/chai/Q0HLQ7/chai/Q0HLQ7_1.cif pdb test_aln.fasta tmp
+ """
+ Install the tool, then activate its environment and install and run via that.
+
+ Honestly it's a bit hacky the way they do it; rather than change things, we have to save the data to their
+ repo and then copy it out of it.
+ """
+ from enzymetk.step import Step
+
+ import pandas as pd
+ import numpy as np
+ from tempfile import TemporaryDirectory
+ import subprocess
+ import random
+ import string
+
+
+ def process_clustering(filename, df, id_column_name):
+     clustering = pd.read_csv(filename, delimiter='\t', header=None)
+     # Rename headings as cluster representative and id
+     clustering.columns = ['foldseek_representative_cluster_structure', id_column_name]
+     clustering.drop_duplicates(subset=id_column_name, keep='first', inplace=True)
+     # Move the chain to a separate column
+     clustering['chain'] = [c.split('_')[-1] for c in clustering[id_column_name].values]
+     clustering[id_column_name] = ['_'.join(c.split('_')[:-1]) for c in clustering[id_column_name].values]
+     clustering.set_index(id_column_name, inplace=True)
+     # Join the clustering with the df
+     df = df.set_index(id_column_name)
+     df = df.join(clustering, how='left')
+     df.reset_index(inplace=True)
+     return df
+
+
+ class FoldSeek(Step):
+
+     def __init__(self, id_column_name: str, query_column_name: str, reference_database: str, method='search', query_type='structures',
+                  args=None, tmp_dir: str = None):
+         self.query_column_name = query_column_name
+         self.id_column_name = id_column_name
+         self.reference_database = reference_database  # pdb should be the default
+         self.tmp_dir = tmp_dir
+         self.method = method
+         self.args = args
+         self.query_type = query_type
+         if self.method not in ['search', 'cluster']:
+             print('Method must be "search" or "cluster". Will likely fail...')
+         if self.query_type not in ['seqs', 'structures']:
+             print('query_type must be either "seqs" or "structures", i.e. is it an amino acid sequence or a path to pdb files?')
+
+     def __execute(self, data: list) -> pd.DataFrame:
+         df, tmp_dir = data
+         tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
+
+         if self.query_type == 'seqs':
+             # Convert to a fasta
+             with open(f'{tmp_dir}/{tmp_label}_seqs.fasta', 'w') as f:
+                 for i, row in df.iterrows():
+                     f.write(f'>{row[self.id_column_name]}\n{row[self.query_column_name]}\n')
+
+         # Get the PDB files from the column
+         pdb_files = list(df[self.query_column_name].values)
+
+         if self.method == 'search':
+             cmd = ['foldseek', 'easy-search']
+             if self.query_type == 'structures':
+                 cmd += pdb_files + [f'{self.reference_database}', f'{tmp_dir}/{tmp_label}.txt', 'tmp']
+             else:
+                 # Convert the file to a fasta and then pass that file name
+                 # Make a db from the seqs
+                 # ToDo: make this more efficient
+                 subcmd = ['foldseek', 'databases', 'ProstT5', 'weights', 'tmp']
+                 self.run(subcmd)
+
+                 subcmd = ['foldseek', 'createdb', f'{tmp_dir}/{tmp_label}_seqs.fasta', f'db_{tmp_label}', '--prostt5-model', 'weights']
+                 self.run(subcmd)
+
+                 # Pass your newly created DB
+                 cmd += [f'db_{tmp_label}', f'{self.reference_database}', f'{tmp_dir}/{tmp_label}.txt', 'tmp']
+
+         elif self.method == 'cluster':
+             cmd = ['foldseek', 'easy-cluster']
+             if self.query_type == 'structures':
+                 cmd += pdb_files + [f'{tmp_dir}/clusterFolds', f'{tmp_dir}']
+             else:
+                 subcmd = ['foldseek', 'databases', 'ProstT5', 'weights', 'tmp']
+                 self.run(subcmd)
+                 subcmd = ['foldseek', 'createdb', f'{tmp_dir}/{tmp_label}_seqs.fasta', f'db_{tmp_label}', '--prostt5-model', 'weights']
+                 self.run(subcmd)
+                 cmd = ['foldseek', 'cluster']
+                 # Cluster the newly created DB
+                 cmd += [f'db_{tmp_label}', f'{tmp_dir}/clusterFolds', f'{tmp_dir}']
+
+         # Add in any extra args
+         if self.args is not None:
+             cmd.extend(self.args)
+
+         self.run(cmd)
+
+         if self.method == 'search':
+             df = pd.read_csv(f'{tmp_dir}/{tmp_label}.txt', header=None, sep='\t')
+             df.columns = ['query', 'target', 'fident', 'alnlen', 'mismatch',
+                           'gapopen', 'qstart', 'qend', 'tstart', 'tend', 'evalue', 'bits']
+         elif self.method == 'cluster':
+             df = process_clustering(f'{tmp_dir}/clusterFolds_cluster.tsv', df, self.id_column_name)
+         return df
+
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         if self.tmp_dir is not None:
+             return self.__execute([df, self.tmp_dir])
+         with TemporaryDirectory() as tmp_dir:
+             return self.__execute([df, tmp_dir])
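A minimal usage sketch for the FoldSeek step in search mode (the column names, structure paths and reference database are hypothetical; foldseek must be on the PATH and the reference database already available):

import pandas as pd
from enzymetk.similarity_foldseek_step import FoldSeek

df = pd.DataFrame({
    'id': ['Q0HLQ7_0'],
    'structure': ['structures/Q0HLQ7_0.cif'],
})
step = FoldSeek('id', 'structure', reference_database='pdb', method='search', query_type='structures')
hits = step.execute(df)  # columns: query, target, fident, alnlen, ..., evalue, bits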