enzymetk-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. enzymetk/__init__.py +56 -0
  2. enzymetk/annotateEC_CLEAN_step.py +122 -0
  3. enzymetk/annotateEC_CREEP_step.py +82 -0
  4. enzymetk/annotateEC_proteinfer_step.py +136 -0
  5. enzymetk/dock_chai_step.py +51 -0
  6. enzymetk/dock_vina_step.py +63 -0
  7. enzymetk/embedchem_chemberta_step.py +61 -0
  8. enzymetk/embedchem_rxnfp_run.py +28 -0
  9. enzymetk/embedchem_rxnfp_step.py +55 -0
  10. enzymetk/embedchem_selformer_run.py +28 -0
  11. enzymetk/embedchem_selformer_step.py +39 -0
  12. enzymetk/embedchem_unimol_step.py +57 -0
  13. enzymetk/embedprotein_esm_step.py +123 -0
  14. enzymetk/esm-extract.py +140 -0
  15. enzymetk/filter_sequence_step.py +0 -0
  16. enzymetk/filter_structure_step.py +0 -0
  17. enzymetk/generate_msa_step.py +61 -0
  18. enzymetk/generate_oligopool_step.py +0 -0
  19. enzymetk/generate_tree_step.py +74 -0
  20. enzymetk/inpaint_ligandMPNN_step.py +65 -0
  21. enzymetk/main.py +37 -0
  22. enzymetk/metagenomics_porechop_trim_reads_step.py +55 -0
  23. enzymetk/metagenomics_prokka_annotate_genes.py +59 -0
  24. enzymetk/pipeline.py +1 -0
  25. enzymetk/predict_activity_step.py +0 -0
  26. enzymetk/predict_catalyticsite_run.py +47 -0
  27. enzymetk/predict_catalyticsite_step.py +70 -0
  28. enzymetk/reducedim_pca_run.py +67 -0
  29. enzymetk/reducedim_vae_run.py +67 -0
  30. enzymetk/reducedim_vae_step.py +12 -0
  31. enzymetk/save_step.py +13 -0
  32. enzymetk/sequence_search_blast.py +80 -0
  33. enzymetk/similarity_foldseek_step.py +114 -0
  34. enzymetk/similarity_mmseqs_step.py +80 -0
  35. enzymetk/similarity_reaction_step.py +60 -0
  36. enzymetk/similarity_substrate_step.py +59 -0
  37. enzymetk/step.py +60 -0
  38. enzymetk-0.0.1.data/data/LICENSE +0 -0
  39. enzymetk-0.0.1.dist-info/LICENSE +0 -0
  40. enzymetk-0.0.1.dist-info/METADATA +370 -0
  41. enzymetk-0.0.1.dist-info/RECORD +44 -0
  42. enzymetk-0.0.1.dist-info/WHEEL +5 -0
  43. enzymetk-0.0.1.dist-info/entry_points.txt +2 -0
  44. enzymetk-0.0.1.dist-info/top_level.txt +1 -0
enzymetk/__init__.py ADDED
@@ -0,0 +1,56 @@
+ ###############################################################################
+ #                                                                             #
+ # This program is free software: you can redistribute it and/or modify        #
+ # it under the terms of the GNU General Public License as published by        #
+ # the Free Software Foundation, either version 3 of the License, or           #
+ # (at your option) any later version.                                         #
+ #                                                                             #
+ # This program is distributed in the hope that it will be useful,             #
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of              #
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                #
+ # GNU General Public License for more details.                                #
+ #                                                                             #
+ # You should have received a copy of the GNU General Public License           #
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.        #
+ #                                                                             #
+ ###############################################################################
+ 
+ """
+ Author: Ariane Mora
+ Date: March 2025
+ """
+ __title__ = 'enzymetk'
+ __description__ = 'Toolkit for enzymes and what not'
+ __url__ = 'https://github.com/arianemora/enzyme-tk/'
+ __version__ = '0.0.1'
+ __author__ = 'Ariane Mora'
+ __author_email__ = 'ariane.n.mora@gmail.com'
+ __license__ = 'GPL3'
+ 
+ # from enzymetk.step import *
+ # from enzymetk.generate_msa_step import ClustalOmega
+ # from enzymetk.annotateEC_CLEAN_step import CLEAN
+ # from enzymetk.annotateEC_proteinfer_step import ProteInfer
+ # from enzymetk.dock_chai_step import Chai
+ # from enzymetk.dock_vina_step import Vina
+ # from enzymetk.embedchem_chemberta_step import ChemBERT
+ # from enzymetk.embedchem_rxnfp_step import RxnFP
+ # from enzymetk.embedchem_selformer_step import SelFormer
+ # from enzymetk.embedchem_unimol_step import UniMol
+ # from enzymetk.embedprotein_esm_step import EmbedESM
+ # from enzymetk.generate_tree_step import FastTree
+ # from enzymetk.inpaint_ligandMPNN_step import LigandMPNN
+ # from enzymetk.metagenomics_porechop_trim_reads_step import PoreChop
+ # from enzymetk.metagenomics_prokka_annotate_genes import Prokka
+ # # from enzymetk.predict_activity_step import
+ # from enzymetk.predict_catalyticsite_step import ActiveSitePred
+ # from enzymetk.sequence_search_blast import BLAST
+ # from enzymetk.similarity_foldseek_step import FoldSeek
+ # from enzymetk.similarity_mmseqs_step import MMseqs
+ # from enzymetk.similarity_reaction_step import ReactionDist
+ # from enzymetk.similarity_substrate_step import SubstrateDist
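The commented-out imports above sketch the intended public API: every tool is wrapped as a Step subclass that takes its configuration (column names, tool paths, a conda env name) in the constructor and exposes a single execute() method over a pandas DataFrame. A minimal toy sketch of that contract, assuming Step imposes no required constructor arguments (each shipped wrapper defines its own __init__ freely):

import pandas as pd
from enzymetk.step import Step

class Uppercase(Step):
    # Hypothetical step, for illustration only: upper-cases a sequence column.
    def __init__(self, seq_col: str):
        self.seq_col = seq_col

    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
        df[self.seq_col] = df[self.seq_col].str.upper()
        return df

df = pd.DataFrame({'seq': ['mkvl', 'mtey']})
print(Uppercase('seq').execute(df))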
enzymetk/annotateEC_CLEAN_step.py ADDED
@@ -0,0 +1,122 @@
+ """
+ Install CLEAN, then activate its conda environment and run it from there.
+ 
+ The way the CLEAN authors handle I/O is a bit hacky: rather than change their
+ code, we have to save the input data into their repo and then copy the results
+ back out of it.
+ """
+ from enzymetk.step import Step
+ import pandas as pd
+ import numpy as np
+ from tempfile import TemporaryDirectory
+ import os
+ import random
+ import string
+ from tqdm import tqdm
+ 
+ 
+ class CLEAN(Step):
+ 
+     def __init__(self, id_col: str, seq_col: str, clean_dir: str, num_threads: int = 1,
+                  ec1_filter: list = None, ec2_filter: list = None, ec3_filter: list = None,
+                  ec4_filter: list = None, env_name: str = 'clean', args: list = None):
+         self.env_name = env_name
+         self.args = args
+         self.id_col = id_col
+         self.clean_dir = clean_dir
+         self.seq_col = seq_col  # This is the column which has the sequence in it
+         self.num_threads = num_threads
+         self.ec1_filter = ec1_filter
+         self.ec2_filter = ec2_filter
+         self.ec3_filter = ec3_filter
+         self.ec4_filter = ec4_filter
+ 
+     def __filter_df(self, df: pd.DataFrame) -> pd.DataFrame:
+         # ------------- Separate out ECs ------------------
+         df['id'] = [c.split(',')[0] for c in df[0].values]
+         df['ec'] = [c.split(',')[1:] for c in df[0].values]
+         df = df.drop(columns=0)
+         df = df.explode('ec')
+         df['score'] = [float(ec.split('/')[1]) for ec in df['ec'].values]
+         df['ec'] = [str(ec.split('/')[0]) for ec in df['ec'].values]
+         df['predicted_ecs'] = [ec.split(':')[1] for ec in df['ec'].values]
+         df['EC1'] = [r.split('.')[0] for r in df['predicted_ecs'].values]
+         df['EC2'] = [r.split('.')[1] for r in df['predicted_ecs'].values]
+         df['EC3'] = [r.split('.')[2] for r in df['predicted_ecs'].values]
+         df['EC4'] = [r.split('.')[3] for r in df['predicted_ecs'].values]
+ 
+         if self.ec1_filter is not None:
+             df = df[df['EC1'].isin(self.ec1_filter)]
+         if self.ec2_filter is not None:
+             df = df[df['EC2'].isin(self.ec2_filter)]
+         if self.ec3_filter is not None:
+             df = df[df['EC3'].isin(self.ec3_filter)]
+         if self.ec4_filter is not None:
+             df = df[df['EC4'].isin(self.ec4_filter)]
+ 
+         df = df.sort_values(by='score', ascending=False)
+         # Drop duplicates based on id, keeping only the highest score
+         df.drop_duplicates(subset='id', keep='first', inplace=True)
+         return df
+ 
+     def __execute(self, data: list) -> pd.DataFrame:
+         df, tmp_dir = data
+         # Create the fasta file based on the id and the sequence value columns;
+         # the random label keeps parallel chunks from clobbering each other
+         tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
+         input_filename = f'{tmp_dir}/CLEAN_{tmp_label}.fasta'
+ 
+         # Write the fasta file which is the input for CLEAN
+         with open(input_filename, 'w+') as fout:
+             for entry, seq in df[[self.id_col, self.seq_col]].values:
+                 fout.write(f'>{entry.strip()}\n{seq.strip()}\n')
+ 
+         # CLEAN only reads inputs from its own data/inputs directory, so change
+         # into the CLEAN directory and copy the file there first
+         os.chdir(self.clean_dir)
+         cmd = ['cp', input_filename, f'{self.clean_dir}data/inputs/{tmp_label}.fasta']
+         self.run(cmd)
+         # Run CLEAN inside its conda environment. Since CLEAN is GPU hungry, we
+         # only run it on the ones that ProteInfer has predicted to be class 3.
+         cmd = ['conda', 'run', '-n', self.env_name, 'python3', f'{self.clean_dir}CLEAN_infer_fasta.py',
+                '--fasta_data', tmp_label]
+         if self.args is not None:
+             # Add the args to the command
+             cmd.extend(self.args)
+         self.run(cmd)
+         # Read the results file, then remove the copies from the CLEAN repo
+         df = pd.read_csv(f'{self.clean_dir}results/inputs/{tmp_label}_maxsep.csv', header=None, sep='\t')
+         self.run(['rm', f'{self.clean_dir}data/inputs/{tmp_label}.fasta'])
+         self.run(['rm', f'{self.clean_dir}results/inputs/{tmp_label}_maxsep.csv'])
+ 
+         # Apply the EC filters and keep only the top-scoring EC per sequence
+         df = self.__filter_df(df)
+ 
+         # Change back to the package directory
+         dir_path = os.path.dirname(os.path.realpath(__file__))
+         os.chdir(dir_path)
+ 
+         return df
+ 
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         with TemporaryDirectory() as tmp_dir:
+             if self.num_threads > 1:
+                 results = []
+                 df_list = np.array_split(df, self.num_threads)
+                 for df_chunk in tqdm(df_list):
+                     try:
+                         results.append(self.__execute([df_chunk, tmp_dir]))
+                     except Exception as e:
+                         print(f'Error executing CLEAN: {e}')
+                         continue
+                 df = pd.DataFrame()
+                 for sub_df in results:
+                     df = pd.concat([df, sub_df])
+                 return df
+             else:
+                 return self.__execute([df, tmp_dir])
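A hedged usage sketch for the CLEAN wrapper. The install location and the filter value are illustrative assumptions; clean_dir must end in a slash and contain CLEAN_infer_fasta.py plus the data/inputs/ and results/inputs/ directories the step copies through, and the 'clean' conda environment must exist.

import pandas as pd
from enzymetk.annotateEC_CLEAN_step import CLEAN

df = pd.DataFrame({'id': ['P12345'], 'sequence': ['MKVLAAGIVALL']})
step = CLEAN(id_col='id', seq_col='sequence',
             clean_dir='/opt/CLEAN/app/',  # assumed install location
             ec1_filter=['3'])             # keep only EC 3.-.-.- (hydrolase) predictions
predictions = step.execute(df)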
enzymetk/annotateEC_CREEP_step.py ADDED
@@ -0,0 +1,82 @@
+ from enzymetk.step import Step
+ import pandas as pd
+ from tempfile import TemporaryDirectory
+ import logging
+ import numpy as np
+ 
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+ 
+ """
+ Example input file:
+ 
+ Reaction,EC number,Reaction Text,EC3,EC2,EC1
+ O=C(OCC(CC)CCCC)C1=CC=CC=C1C(OCC(CC)CCCC)=O>>OC(C2=CC=CC=C2C(O)=O)=O,3.1.1.60,DEHP->PA,3.1.1,3.1,3
+ CCCCC(CC)COC(=O)c1ccccc1C(=O)OCC(CC)CCCC.O>>CCCCC(CC)CO.CCCCC(CC)COC(=O)c1ccccc1C(=O)O,3.1.1.60,DEHP-MEHP,3.1.1,3.1,3
+ 
+ This step wraps CREEP's two scripts, which are normally run as, e.g.:
+ 
+ python step_02_extract_CREEP.py --pretrained_folder=/disk1/share/software/CREEP/data/bioremediation_split --dataset=/disk1/share/software/CREEP/output/DEHP/bioremediation_reaction_test.csv --modality=reaction
+ python downstream_retrieval.py --pretrained_folder=CREEP/$OUTPUT_DIR --query_dataset=$TEST_SET --reference_dataset=all_ECs --query_modality=reaction --reference_modality=protein
+ """
+ 
+ class CREEP(Step):
+ 
+     def __init__(self, id_col: str, value_col: str, CREEP_dir: str, CREEP_cache_dir: str,
+                  modality: str, reference_modality: str, env_name: str = 'CREEP',
+                  args_extract: list = None, args_retrieval: list = None):
+         self.env_name = env_name
+         self.id_col = id_col
+         self.value_col = value_col
+         self.modality = modality
+         self.reference_modality = reference_modality
+         self.CREEP_dir = CREEP_dir
+         self.CREEP_cache_dir = CREEP_cache_dir
+         self.args_extract = args_extract
+         self.args_retrieval = args_retrieval
+ 
+     def __execute(self, df: pd.DataFrame, tmp_dir: str) -> str:
+         input_filename = f'{tmp_dir}/creep_input.csv'
+         df.to_csv(input_filename, index=False)
+         # Extract the representations for the query modality
+         cmd = ['conda', 'run', '-n', self.env_name, 'python', f'{self.CREEP_dir}scripts/step_02_extract_CREEP.py',
+                '--pretrained_folder', f'{self.CREEP_cache_dir}output/easy_split',
+                '--dataset', input_filename,
+                '--cache_dir', self.CREEP_dir,
+                '--modality', self.modality.strip(),
+                '--output_dir', f'{tmp_dir}']
+         if self.args_extract is not None:
+             cmd.extend(self.args_extract)
+         self.run(cmd)
+         # Retrieve similarities against the reference modality
+         cmd = ['conda', 'run', '-n', self.env_name, 'python', f'{self.CREEP_dir}scripts/downstream_retrieval.py',
+                '--pretrained_folder', f'{self.CREEP_cache_dir}output/easy_split',
+                '--query_dataset', input_filename,
+                '--reference_dataset', 'all_ECs',
+                '--query_modality', self.modality.strip(),
+                '--cache_dir', self.CREEP_cache_dir,
+                '--output_dir', f'{tmp_dir}',
+                '--reference_modality', self.reference_modality]
+         if self.args_retrieval is not None:
+             cmd.extend(self.args_retrieval)
+         self.run(cmd)
+         output_filename = f'{tmp_dir}/creep_reaction2protein_retrieval_similarities.npy'
+         return output_filename
+ 
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         with TemporaryDirectory() as tmp_dir:
+             output_filename = self.__execute(df, tmp_dir)
+             df = pd.read_csv(f"{self.CREEP_dir}/data/processed_data/EC_list.txt", header=None)
+             data = np.load(output_filename)
+             # Column sim_<i> holds query i's similarity to each EC in the reference list
+             for i, d in enumerate(data):
+                 df[f'sim_{i}'] = d
+             return df
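A hedged usage sketch for the CREEP wrapper, assuming a CREEP checkout with its scripts/ directory, a pretrained easy_split model under the cache directory, and a 'CREEP' conda environment; all paths are illustrative and must end in a slash.

import pandas as pd
from enzymetk.annotateEC_CREEP_step import CREEP

df = pd.DataFrame({'id': ['rxn1'],
                   'Reaction': ['CCO.O=O>>CC=O.O']})  # illustrative reaction SMILES
step = CREEP(id_col='id', value_col='Reaction',
             CREEP_dir='/opt/CREEP/', CREEP_cache_dir='/opt/CREEP/data/',
             modality='reaction', reference_modality='protein')
similarities = step.execute(df)  # EC list plus one sim_<i> column per query row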
enzymetk/annotateEC_proteinfer_step.py ADDED
@@ -0,0 +1,136 @@
+ from enzymetk.step import Step
+ import pandas as pd
+ import numpy as np
+ from multiprocessing.dummy import Pool as ThreadPool
+ from tempfile import TemporaryDirectory
+ import os
+ import random
+ import string
+ 
+ 
+ class ProteInfer(Step):
+ 
+     def __init__(self, id_col: str, seq_col: str, proteinfer_dir: str, num_threads: int = 1,
+                  ec1_filter: list = None, ec2_filter: list = None, ec3_filter: list = None,
+                  ec4_filter: list = None, env_name: str = 'proteinfer', args: list = None):
+         """Initialize the ProteInfer step for enzyme classification.
+ 
+         Filters are lists of strings giving the EC values to keep. If None, all EC values are kept.
+ 
+         Parameters
+         ----------
+         id_col : str
+             Name of the column containing sequence identifiers in the input DataFrame
+         seq_col : str
+             Name of the column containing protein sequences in the input DataFrame
+         proteinfer_dir : str
+             Path to the ProteInfer software directory containing the proteinfer.py script
+         num_threads : int, optional
+             Number of parallel threads to use for processing (default=1)
+         ec1_filter : list, optional
+             List of EC1 values to keep (default=None keeps all); use '-' to keep missing values
+         ec2_filter : list, optional
+             List of EC2 values to keep (default=None keeps all); use '-' to keep missing values
+         ec3_filter : list, optional
+             List of EC3 values to keep (default=None keeps all); use '-' to keep missing values, e.g. ['3', '-']
+         ec4_filter : list, optional
+             List of EC4 values to keep (default=None keeps all); use '-' to keep missing values, e.g. ['1', '-']
+ 
+         Notes
+         -----
+         ProteInfer requires the 'proteinfer' conda environment to be installed, and
+         proteinfer_dir must contain the proteinfer.py inference script.
+         """
+         self.env_name = env_name
+         self.args = args
+         self.id_col = id_col
+         self.proteinfer_dir = proteinfer_dir
+         self.seq_col = seq_col  # This is the column which has the sequence in it
+         self.num_threads = num_threads
+         self.ec1_filter = ec1_filter
+         self.ec2_filter = ec2_filter
+         self.ec3_filter = ec3_filter
+         self.ec4_filter = ec4_filter
+ 
+     def __execute(self, data: list) -> pd.DataFrame:
+         df, tmp_dir = data
+         # Create the fasta file based on the id and the sequence value columns;
+         # the random label keeps parallel chunks from clobbering each other
+         tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
+         input_filename = f'{tmp_dir}/proteinfer_{tmp_label}.fasta'
+         output_filename = f'{tmp_dir}/proteinfer_{tmp_label}.txt'
+ 
+         # Write the fasta file which is the input for ProteInfer
+         with open(input_filename, 'w+') as fout:
+             for entry, seq in df[[self.id_col, self.seq_col]].values:
+                 fout.write(f'>{entry.strip()}\n{seq.strip()}\n')
+ 
+         # ProteInfer has to be run from inside its own directory
+         os.chdir(self.proteinfer_dir)
+         cmd = ['conda', 'run', '-n', self.env_name, 'python3',
+                os.path.join(self.proteinfer_dir, 'proteinfer.py'),
+                '-i', input_filename,
+                '-o', output_filename]
+         if self.args is not None:
+             # Add the args to the command
+             cmd.extend(self.args)
+         self.run(cmd)
+         df = pd.read_csv(output_filename, sep='\t')
+ 
+         # Change back to the package directory
+         dir_path = os.path.dirname(os.path.realpath(__file__))
+         os.chdir(dir_path)
+ 
+         return df
+ 
+     def __clean_df(self, results: pd.DataFrame) -> pd.DataFrame:
+         """
+         Tidy the ProteInfer output and apply the EC filters.
+         """
+         results['predicted_ecs'] = [ec.split(':')[1] if 'EC:' in ec else 'None' for ec in results['predicted_label'].values]
+         # Remove missing ECs
+         results = results[results['predicted_ecs'] != 'None']
+ 
+         # ------------- Separate out ECs ------------------
+         results['EC1'] = [r.split('.')[0] for r in results['predicted_ecs'].values]
+         results['EC2'] = [r.split('.')[1] for r in results['predicted_ecs'].values]
+         results['EC3'] = [r.split('.')[2] for r in results['predicted_ecs'].values]
+         results['EC4'] = [r.split('.')[3] for r in results['predicted_ecs'].values]
+ 
+         # ------------- Group ------------------
+         # Keep only the highest-confidence assignment per sequence (ProteInfer's
+         # output has a confidence column alongside sequence_name and predicted_label)
+         df = results.sort_values(by='confidence', ascending=False)
+         df = df.drop_duplicates(subset='sequence_name', keep='first')
+ 
+         # ------------- Filter to EC XXXX ------------------
+         if self.ec1_filter is not None:
+             df = df[df['EC1'].isin(self.ec1_filter)]
+         if self.ec2_filter is not None:
+             df = df[df['EC2'].isin(self.ec2_filter)]
+         if self.ec3_filter is not None:
+             df = df[df['EC3'].isin(self.ec3_filter)]
+         if self.ec4_filter is not None:
+             df = df[df['EC4'].isin(self.ec4_filter)]
+         return df
+ 
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         with TemporaryDirectory() as tmp_dir:
+             if self.num_threads > 1:
+                 data = []
+                 df_list = np.array_split(df, self.num_threads)
+                 pool = ThreadPool(self.num_threads)
+                 for df_chunk in df_list:
+                     data.append([df_chunk, tmp_dir])
+                 results = pool.map(self.__execute, data)
+                 df = pd.DataFrame()
+                 for dfs in results:
+                     df = pd.concat([df, dfs])
+                 # df = self.__clean_df(df)
+                 return df
+             else:
+                 df = self.__execute([df, tmp_dir])
+                 # df = self.__clean_df(df)
+                 return df
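A hedged usage sketch for the ProteInfer wrapper; the checkout path and environment name are assumptions about the local install. Note that the EC filters only take effect through __clean_df, which execute() ships with commented out, so the raw ProteInfer table is returned.

import pandas as pd
from enzymetk.annotateEC_proteinfer_step import ProteInfer

df = pd.DataFrame({'id': ['seqA', 'seqB'],
                   'sequence': ['MKVLAAGIVALL', 'MTEYKLVVVGAG']})
step = ProteInfer(id_col='id', seq_col='sequence',
                  proteinfer_dir='/opt/proteinfer/',  # assumed checkout, trailing slash required
                  num_threads=2)
raw = step.execute(df)  # ProteInfer's TSV output as a DataFrame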
enzymetk/dock_chai_step.py ADDED
@@ -0,0 +1,51 @@
+ from enzymetk.step import Step
+ import pandas as pd
+ from docko.chai import run_chai
+ import logging
+ import numpy as np
+ 
+ 
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+ 
+ 
+ class Chai(Step):
+ 
+     def __init__(self, id_col: str, seq_col: str, substrate_col: str, output_dir: str, num_threads: int):
+         self.id_col = id_col
+         self.seq_col = seq_col
+         self.substrate_col = substrate_col
+         self.output_dir = output_dir or None
+         self.num_threads = num_threads or 1
+ 
+     def __execute(self, df: pd.DataFrame, output_dir: str) -> list:
+         # Might have an issue if things are not correctly installed in the same directory
+         output_filenames = []
+         for run_id, seq, substrate in df[[self.id_col, self.seq_col, self.substrate_col]].values:
+             # Rows with no substrate (e.g. NaN) are run without a ligand
+             if not isinstance(substrate, str):
+                 substrate = ''
+             logger.info(f'{run_id}: {substrate}')
+             run_chai(run_id,     # name
+                      seq,        # sequence
+                      substrate,  # ligand as SMILES
+                      output_dir)
+             output_filenames.append(f'{output_dir}/{run_id}/')
+         return output_filenames
+ 
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         if not self.output_dir:
+             raise ValueError('No output directory provided')
+         if self.num_threads > 1:
+             output_filenames = []
+             df_list = np.array_split(df, self.num_threads)
+             for df_chunk in df_list:
+                 output_filenames += self.__execute(df_chunk, self.output_dir)
+             df['output_dir'] = output_filenames
+             return df
+         else:
+             output_filenames = self.__execute(df, self.output_dir)
+             df['output_dir'] = output_filenames
+             return df
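A hedged usage sketch for the Chai wrapper, assuming the docko package is installed; it writes one subdirectory per row under output_dir and records the path in a new output_dir column. Paths and column names are illustrative.

import pandas as pd
from enzymetk.dock_chai_step import Chai

df = pd.DataFrame({'id': ['job1'],
                   'sequence': ['MKVLAAGIVALL'],
                   'smiles': ['CCO']})  # ligand as SMILES
step = Chai(id_col='id', seq_col='sequence', substrate_col='smiles',
            output_dir='/tmp/chai_runs/', num_threads=1)
df = step.execute(df)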
enzymetk/dock_vina_step.py ADDED
@@ -0,0 +1,63 @@
+ from enzymetk.step import Step
+ import pandas as pd
+ from docko.docko import *
+ import logging
+ import numpy as np
+ import os
+ from multiprocessing.dummy import Pool as ThreadPool
+ 
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+ 
+ 
+ class Vina(Step):
+ 
+     def __init__(self, id_col: str, structure_col: str, sequence_col: str,
+                  substrate_col: str, substrate_name_col: str, active_site_col: str,
+                  output_dir: str, num_threads: int):
+         print('Expects active site residues as a string separated by |. Zero indexed.')
+         self.id_col = id_col
+         self.structure_col = structure_col
+         self.sequence_col = sequence_col
+         self.substrate_col = substrate_col
+         self.substrate_name_col = substrate_name_col
+         self.active_site_col = active_site_col  # Active-site residues as a '|'-separated string
+         self.output_dir = output_dir or None
+         self.num_threads = num_threads or 1
+ 
+     def __execute(self, df: pd.DataFrame) -> list:
+         output_filenames = []
+         # ToDo: update to create from sequence if the path doesn't exist.
+         for label, structure_path, seq, substrate_smiles, substrate_name, residues in df[[
+                 self.id_col, self.structure_col, self.sequence_col, self.substrate_col,
+                 self.substrate_name_col, self.active_site_col]].values:
+             os.makedirs(f'{self.output_dir}{label}', exist_ok=True)
+             try:
+                 residues = str(residues)
+                 residues = [int(r) + 1 for r in residues.split('|')]  # Convert to 1-indexed
+                 if not os.path.exists(f'{structure_path}'):
+                     # Try to get the AF2 structure; we expect the label to be the UniProt id
+                     get_alphafold_structure(label, f'{self.output_dir}{label}/{label}_AF2.pdb')
+                     structure_path = f'{self.output_dir}{label}/{label}_AF2.pdb'
+                 clean_one_pdb(f'{structure_path}', f'{self.output_dir}{label}/{label}.pdb')
+                 pdb_to_pdbqt_protein(f'{self.output_dir}{label}/{label}.pdb', f'{self.output_dir}{label}/{label}.pdbqt')
+                 score = dock(sequence='', protein_name=label, smiles=substrate_smiles, ligand_name=substrate_name,
+                              residues=residues, protein_dir=f'{self.output_dir}', ligand_dir=f'{self.output_dir}',
+                              output_dir=f'{self.output_dir}{label}/', pH=7.4,
+                              method='vina', size_x=10.0, size_y=10.0, size_z=10.0)
+                 output_filenames.append(f'{self.output_dir}{label}/{label}.pdb')
+             except Exception as e:
+                 print(f'Error docking {label}: {e}')
+                 output_filenames.append(None)
+         return output_filenames
+ 
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         if not self.output_dir:
+             raise ValueError('No output directory provided')
+         if self.num_threads > 1:
+             pool = ThreadPool(self.num_threads)
+             df_list = np.array_split(df, self.num_threads)
+             results = pool.map(self.__execute, df_list)
+             # pool.map returns one list per chunk, so flatten before assigning
+             results = [r for chunk in results for r in chunk]
+         else:
+             results = self.__execute(df)
+         df['output_dir'] = results
+         return df
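A hedged usage sketch for the Vina wrapper, assuming docko and AutoDock Vina are installed. Active-site residues are passed as a zero-indexed, '|'-separated string, per the note in __init__; all paths and column names are illustrative.

import pandas as pd
from enzymetk.dock_vina_step import Vina

df = pd.DataFrame({'id': ['P12345'], 'pdb': ['/data/P12345.pdb'],
                   'sequence': ['MKVLAAGIVALL'], 'smiles': ['CCO'],
                   'ligand': ['ethanol'], 'active_site': ['104|187|203']})
step = Vina(id_col='id', structure_col='pdb', sequence_col='sequence',
            substrate_col='smiles', substrate_name_col='ligand',
            active_site_col='active_site', output_dir='/tmp/vina/', num_threads=1)
df = step.execute(df)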
enzymetk/embedchem_chemberta_step.py ADDED
@@ -0,0 +1,61 @@
+ from enzymetk.step import Step
+ import pandas as pd
+ import numpy as np
+ from multiprocessing.dummy import Pool as ThreadPool
+ from transformers import AutoModel, AutoTokenizer
+ 
+ 
+ class ChemBERT(Step):
+ 
+     def __init__(self, id_col: str, value_col: str, num_threads: int):
+         self.id_col = id_col
+         self.value_col = value_col
+         self.num_threads = num_threads
+         model_version = 'seyonec/PubChem10M_SMILES_BPE_450k'
+         self.model = AutoModel.from_pretrained(model_version, output_attentions=True)
+         self.tokenizer = AutoTokenizer.from_pretrained(model_version)
+         self.seq_len_limit = 500
+         self.embedding_len = 768
+ 
+     def __execute(self, data: list) -> list:
+         results = []
+         for i, smiles in data:
+             encoded_input = self.tokenizer(
+                 smiles,
+                 truncation=True,
+                 max_length=self.seq_len_limit,
+                 padding='max_length',
+                 return_tensors='pt')
+             output = self.model(**encoded_input)
+             # Use the [CLS] token embedding as the molecule representation
+             results.append((i, output['last_hidden_state'][:, 0][0].detach().numpy()))
+         return results
+ 
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         if self.num_threads > 1:
+             data = []
+             df_list = np.array_split(df, self.num_threads)
+             pool = ThreadPool(self.num_threads)
+             for df_chunk in df_list:
+                 data.append([(i, v) for i, v in df_chunk[[self.id_col, self.value_col]].values])
+             results = pool.map(self.__execute, data)
+             all_results_map = {}
+             for r in results:
+                 for j in r:
+                     all_results_map[j[0]] = j[1]
+             encodings = []
+             for uid in df[self.id_col].values:
+                 if all_results_map.get(uid) is None:
+                     # Pad missing results with a zero vector of the embedding length
+                     encodings.append(np.zeros(self.embedding_len))
+                 else:
+                     encodings.append(all_results_map.get(uid))
+             df['chemberta'] = encodings
+             return df
+         else:
+             data = [(i, v) for i, v in df[[self.id_col, self.value_col]].values]
+             results = self.__execute(data)
+             df['chemberta'] = [r[1] for r in results]
+             return df
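A usage sketch for the ChemBERT embedder: the seyonec/PubChem10M_SMILES_BPE_450k checkpoint is downloaded from the Hugging Face hub on first use, and a 768-dimensional vector is added per row in a chemberta column. Column names are illustrative.

import pandas as pd
from enzymetk.embedchem_chemberta_step import ChemBERT

df = pd.DataFrame({'id': ['m1', 'm2'], 'smiles': ['CCO', 'c1ccccc1']})
df = ChemBERT(id_col='id', value_col='smiles', num_threads=1).execute(df)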
enzymetk/embedchem_rxnfp_run.py ADDED
@@ -0,0 +1,28 @@
+ from rxnfp.transformer_fingerprints import RXNBERTFingerprintGenerator, get_default_model_and_tokenizer
+ import pandas as pd
+ import pickle
+ import argparse
+ 
+ 
+ def run_rxnfp(output_filename, input_filename, label):
+     df = pd.read_csv(input_filename)
+     rxns = df[label].values
+     model, tokenizer = get_default_model_and_tokenizer()
+     rxnfp_generator = RXNBERTFingerprintGenerator(model, tokenizer)
+     fps = rxnfp_generator.convert_batch(rxns)
+     df['rxnfp'] = fps
+     with open(output_filename, 'wb') as file:
+         pickle.dump(df, file)
+ 
+ 
+ def parse_args():
+     parser = argparse.ArgumentParser(description='Run rxnfp on a dataset')
+     parser.add_argument('-out', '--out', required=True, help='Path to the output pickle file')
+     parser.add_argument('-input', '--input', type=str, required=True, help='Path to the input dataframe (CSV)')
+     parser.add_argument('-label', '--label', type=str, required=True, help='Name of the column holding the reaction SMILES')
+     return parser.parse_args()
+ 
+ 
+ def main():
+     args = parse_args()
+     run_rxnfp(args.out, args.input, args.label)
+ 
+ 
+ if __name__ == '__main__':
+     main()
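This helper is normally driven by the RxnFP step below through conda run; a standalone invocation would look roughly like this (environment name and file names are illustrative):

conda run -n rxnfp python embedchem_rxnfp_run.py --out rxnfp.pkl --input reactions.csv --label rxn_smiles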
enzymetk/embedchem_rxnfp_step.py ADDED
@@ -0,0 +1,55 @@
+ from enzymetk.step import Step
+ import pandas as pd
+ from tempfile import TemporaryDirectory
+ import pickle
+ from pathlib import Path
+ import logging
+ import numpy as np
+ from tqdm import tqdm
+ import random
+ import string
+ 
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+ 
+ 
+ class RxnFP(Step):
+ 
+     def __init__(self, smiles_col: str, num_threads: int, env_name: str = 'rxnfp'):
+         self.value_col = smiles_col
+         self.num_threads = num_threads or 1
+         self.env_name = env_name
+ 
+     def __execute(self, df: pd.DataFrame, tmp_dir: str) -> str:
+         tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
+ 
+         output_filename = f'{tmp_dir}/rxnfp_{tmp_label}.pkl'
+         input_filename = f'{tmp_dir}/input_{tmp_label}.csv'
+         df.to_csv(input_filename, index=False)
+         # rxnfp lives in its own conda environment, so call the helper script
+         # shipped alongside this module via conda run. Might have an issue if
+         # things are not correctly installed in the same directory.
+         cmd = ['conda', 'run', '-n', self.env_name, 'python',
+                str(Path(__file__).parent / 'embedchem_rxnfp_run.py'),
+                '--out', output_filename, '--input', input_filename, '--label', self.value_col]
+         self.run(cmd)
+         return output_filename
+ 
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         with TemporaryDirectory() as tmp_dir:
+             if self.num_threads > 1:
+                 output_filenames = []
+                 df_list = np.array_split(df, self.num_threads)
+                 for df_chunk in tqdm(df_list, total=len(df_list)):
+                     output_filenames.append(self.__execute(df_chunk, tmp_dir))
+ 
+                 df = pd.DataFrame()
+                 for p in output_filenames:
+                     with open(p, 'rb') as file:
+                         tmp_df = pickle.load(file)
+                     df = pd.concat([df, tmp_df])
+                 return df
+             else:
+                 output_filename = self.__execute(df, tmp_dir)
+                 with open(output_filename, 'rb') as file:
+                     return pickle.load(file)
+ return pickle.load(file)