enzymetk 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. enzymetk/__init__.py +56 -0
  2. enzymetk/annotateEC_CLEAN_step.py +122 -0
  3. enzymetk/annotateEC_CREEP_step.py +82 -0
  4. enzymetk/annotateEC_proteinfer_step.py +136 -0
  5. enzymetk/dock_chai_step.py +51 -0
  6. enzymetk/dock_vina_step.py +63 -0
  7. enzymetk/embedchem_chemberta_step.py +61 -0
  8. enzymetk/embedchem_rxnfp_run.py +28 -0
  9. enzymetk/embedchem_rxnfp_step.py +55 -0
  10. enzymetk/embedchem_selformer_run.py +28 -0
  11. enzymetk/embedchem_selformer_step.py +39 -0
  12. enzymetk/embedchem_unimol_step.py +57 -0
  13. enzymetk/embedprotein_esm_step.py +123 -0
  14. enzymetk/esm-extract.py +140 -0
  15. enzymetk/filter_sequence_step.py +0 -0
  16. enzymetk/filter_structure_step.py +0 -0
  17. enzymetk/generate_msa_step.py +61 -0
  18. enzymetk/generate_oligopool_step.py +0 -0
  19. enzymetk/generate_tree_step.py +74 -0
  20. enzymetk/inpaint_ligandMPNN_step.py +65 -0
  21. enzymetk/main.py +37 -0
  22. enzymetk/metagenomics_porechop_trim_reads_step.py +55 -0
  23. enzymetk/metagenomics_prokka_annotate_genes.py +59 -0
  24. enzymetk/pipeline.py +1 -0
  25. enzymetk/predict_activity_step.py +0 -0
  26. enzymetk/predict_catalyticsite_run.py +47 -0
  27. enzymetk/predict_catalyticsite_step.py +70 -0
  28. enzymetk/reducedim_pca_run.py +67 -0
  29. enzymetk/reducedim_vae_run.py +67 -0
  30. enzymetk/reducedim_vae_step.py +12 -0
  31. enzymetk/save_step.py +13 -0
  32. enzymetk/sequence_search_blast.py +80 -0
  33. enzymetk/similarity_foldseek_step.py +114 -0
  34. enzymetk/similarity_mmseqs_step.py +80 -0
  35. enzymetk/similarity_reaction_step.py +60 -0
  36. enzymetk/similarity_substrate_step.py +59 -0
  37. enzymetk/step.py +60 -0
  38. enzymetk-0.0.1.data/data/LICENSE +0 -0
  39. enzymetk-0.0.1.dist-info/LICENSE +0 -0
  40. enzymetk-0.0.1.dist-info/METADATA +370 -0
  41. enzymetk-0.0.1.dist-info/RECORD +44 -0
  42. enzymetk-0.0.1.dist-info/WHEEL +5 -0
  43. enzymetk-0.0.1.dist-info/entry_points.txt +2 -0
  44. enzymetk-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,80 @@
1
+ """
2
+ Install clean and then you need to activate the environment and install and run via that.
3
+
4
+ Honestly it's a bit hacky the way they do it, not bothered to change things so have to save the data to their
5
+ repo and then copy it out of it.
6
+ """
7
+ from enzymetk.step import Step
8
+
9
+ import pandas as pd
10
+ import numpy as np
11
+ from tempfile import TemporaryDirectory
12
+ import subprocess
13
+ import random
14
+ import string
15
+
16
+
17
def process_clustering(filename, df, id_column_name):
    """Join mmseqs cluster assignments onto a dataframe.

    ``filename`` is the tab-separated easy-cluster output of
    (representative, member) pairs; the result gains a
    ``mmseqs_representative_cluster_seq`` column keyed on ``id_column_name``.
    """
    cluster_map = pd.read_csv(filename, delimiter='\t', header=None)
    cluster_map.columns = ['mmseqs_representative_cluster_seq', id_column_name]
    # Keep exactly one cluster assignment per sequence id.
    cluster_map = cluster_map.drop_duplicates(subset=id_column_name, keep='first')
    cluster_map = cluster_map.set_index(id_column_name)
    # Left-join so rows without a cluster assignment are preserved.
    joined = df.set_index(id_column_name).join(cluster_map, how='left')
    joined.reset_index(inplace=True)
    return joined
28
+
29
class MMseqs(Step):
    """Search or cluster protein sequences with MMseqs2.

    With ``method='search'`` the sequences are run through ``easy-search``
    against ``reference_database``; if no database is supplied, one is built
    on the fly from the input sequences (an all-vs-all self search). With
    ``method='cluster'`` the sequences are clustered with ``easy-cluster``
    and the cluster assignments are joined back onto the input dataframe.

    Optional command-line flags (e.g. ``--min-seq-id 0.5 -c 0.8 --cov-mode 1``)
    are passed through ``args`` as a list, like any command-line tool.
    """

    def __init__(self, id_column_name: str, seq_column_name: str, method='search', reference_database: str = None, tmp_dir: str = None, args: list = None):
        self.seq_column_name = seq_column_name
        self.id_column_name = id_column_name
        self.reference_database = reference_database  # pdb should be the default
        self.tmp_dir = tmp_dir  # if None, a fresh TemporaryDirectory is used per call
        self.args = args
        self.method = method

    def __execute(self, data: list) -> np.array:
        df, tmp_dir = data
        # Fail fast on an unknown method; previously this fell through to an
        # UnboundLocalError on `cmd` (or silently returned None).
        if self.method not in ('search', 'cluster'):
            raise ValueError(f"Unknown method '{self.method}': expected 'search' or 'cluster'.")
        tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
        fasta_file = f'{tmp_dir}/seqs.fasta'
        # Convert the dataframe to a fasta file for mmseqs.
        with open(fasta_file, 'w') as f:
            for _, row in df.iterrows():
                f.write(f'>{row[self.id_column_name]}\n{row[self.seq_column_name]}\n')

        reference_database = self.reference_database
        if reference_database is None and self.method == 'search':
            # No reference supplied: build the target database from the input
            # sequences. Keep it inside tmp_dir (previously it was written to
            # the cwd and self.reference_database was mutated, leaking state
            # between calls).
            print('Creating database')
            reference_database = f'{tmp_dir}/targetDB'
            self.run(['mmseqs', 'createdb', fasta_file, reference_database])
            self.run(['mmseqs', 'createindex', reference_database, f'{tmp_dir}/tmp'])

        if self.method == 'search':
            cmd = ['mmseqs', 'easy-search', fasta_file, reference_database,
                   f'{tmp_dir}/{tmp_label}.txt', f'{tmp_dir}/tmp']
        else:  # cluster
            cmd = ['mmseqs', 'easy-cluster', fasta_file, f'{tmp_dir}/clusterRes', f'{tmp_dir}/tmp']
        # Append any user-supplied flags.
        if self.args is not None:
            cmd.extend(self.args)
        self.run(cmd)

        if self.method == 'search':
            # Default easy-search output columns, see
            # https://github.com/soedinglab/MMseqs2/issues/458
            results = pd.read_csv(f'{tmp_dir}/{tmp_label}.txt', header=None, sep='\t')
            results.columns = ['Query', 'Target', 'Sequence Identity', 'Alignment Length',
                               'Mismatches', 'Gap Opens', 'Query Start', 'Query End',
                               'Target Start', 'Target End', 'E-value', 'Bit Score']
            return results
        return process_clustering(f'{tmp_dir}/clusterRes_cluster.tsv', df, self.id_column_name)

    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
        if self.tmp_dir is not None:
            return self.__execute([df, self.tmp_dir])
        with TemporaryDirectory() as tmp_dir:
            return self.__execute([df, tmp_dir])
@@ -0,0 +1,60 @@
1
+ from enzymetk.step import Step
2
+ import pandas as pd
3
+ import numpy as np
4
+ from tempfile import TemporaryDirectory
5
+ from rdkit import Chem
6
+ from rdkit import DataStructs
7
+ from rdkit.Chem import rdChemReactions
8
+ import pandas as pd
9
+ import os
10
+ from rdkit.DataStructs import FingerprintSimilarity
11
+ from rdkit.Chem.Fingerprints import FingerprintMols
12
+ import random
13
+ import string
14
+ from tqdm import tqdm
15
+ from multiprocessing.dummy import Pool as ThreadPool
16
+
17
+
18
class ReactionDist(Step):
    """Score every reaction in a dataframe against a reference reaction.

    The reference reaction SMARTS (``smiles_string``) is fingerprinted once,
    then each row's reaction is fingerprinted and compared with Tanimoto,
    Russel and Cosine similarity. Work is split across ``num_threads``
    threads when ``num_threads > 1``.
    """

    def __init__(self, id_column_name: str, smiles_column_name: str, smiles_string: str, num_threads=1):
        self.smiles_column_name = smiles_column_name
        self.id_column_name = id_column_name
        self.smiles_string = smiles_string
        self.num_threads = num_threads

    def __execute(self, data: list) -> np.array:
        reaction_df = data
        # Fingerprint the reference reaction once, outside the loop.
        ref_rxn = rdChemReactions.ReactionFromSmarts(self.smiles_string)
        ref_fp = rdChemReactions.CreateStructuralFingerprintForReaction(ref_rxn)
        rows = []
        for smile_id, smiles in tqdm(reaction_df[[self.id_column_name, self.smiles_column_name]].values):
            rxn = rdChemReactions.ReactionFromSmarts(smiles)
            fp = rdChemReactions.CreateStructuralFingerprintForReaction(rxn)
            rows.append([smile_id,
                         smiles,
                         DataStructs.TanimotoSimilarity(fp, ref_fp),
                         DataStructs.RusselSimilarity(fp, ref_fp),
                         DataStructs.CosineSimilarity(fp, ref_fp)])
        return pd.DataFrame(rows, columns=[self.id_column_name, 'TargetSmiles', 'TanimotoSimilarity', 'RusselSimilarity', 'CosineSimilarity'])

    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
        if self.num_threads <= 1:
            return self.__execute(df)
        chunks = np.array_split(df, self.num_threads)
        # Context manager closes and joins the pool (previously it leaked).
        with ThreadPool(self.num_threads) as pool:
            results = pool.map(self.__execute, chunks)
        # Single concat is linear; the old repeated-concat loop was quadratic.
        return pd.concat(results)
+
@@ -0,0 +1,59 @@
1
+ from enzymetk.step import Step
2
+ import pandas as pd
3
+ import numpy as np
4
+ from tempfile import TemporaryDirectory
5
+ from rdkit import Chem
6
+ from rdkit import DataStructs
7
+ from rdkit.Chem import rdChemReactions
8
+ import pandas as pd
9
+ import os
10
+ from rdkit.DataStructs import FingerprintSimilarity
11
+ from rdkit.Chem.Fingerprints import FingerprintMols
12
+ import random
13
+ import string
14
+ from tqdm import tqdm
15
+ from multiprocessing.dummy import Pool as ThreadPool
16
+
17
+
18
class SubstrateDist(Step):
    """Score every substrate SMILES in a dataframe against a reference molecule.

    The reference molecule (``smiles_string``) is fingerprinted once, then
    each row's SMILES is fingerprinted and compared with Tanimoto, Russel
    and Cosine similarity. Work is split across ``num_threads`` threads
    when ``num_threads > 1``.
    """

    def __init__(self, id_column_name: str, smiles_column_name: str, smiles_string: str, num_threads=1):
        self.smiles_column_name = smiles_column_name
        self.id_column_name = id_column_name
        self.smiles_string = smiles_string
        self.num_threads = num_threads

    def __execute(self, data: list) -> np.array:
        substrate_df = data
        # Fingerprint the reference molecule once, outside the loop.
        ref_mol = Chem.MolFromSmiles(self.smiles_string)
        if ref_mol is None:
            raise ValueError(f'Could not parse reference SMILES: {self.smiles_string}')
        ref_fp = FingerprintMols.FingerprintMol(ref_mol)
        rows = []
        for smile_id, smiles in tqdm(substrate_df[[self.id_column_name, self.smiles_column_name]].values):
            mol = Chem.MolFromSmiles(smiles)
            # MolFromSmiles returns None on a bad SMILES; raise a clear error
            # instead of the opaque crash FingerprintMol(None) would give.
            if mol is None:
                raise ValueError(f'Could not parse SMILES for id {smile_id}: {smiles}')
            fp = FingerprintMols.FingerprintMol(mol)
            rows.append([smile_id,
                         smiles,
                         DataStructs.TanimotoSimilarity(fp, ref_fp),
                         DataStructs.RusselSimilarity(fp, ref_fp),
                         DataStructs.CosineSimilarity(fp, ref_fp)])
        return pd.DataFrame(rows, columns=[self.id_column_name, 'TargetSmiles', 'TanimotoSimilarity', 'RusselSimilarity', 'CosineSimilarity'])

    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
        if self.num_threads <= 1:
            return self.__execute(df)
        chunks = np.array_split(df, self.num_threads)
        # Context manager closes and joins the pool (previously it leaked).
        with ThreadPool(self.num_threads) as pool:
            results = pool.map(self.__execute, chunks)
        # Single concat is linear; the old repeated-concat loop was quadratic.
        return pd.concat(results)
enzymetk/step.py ADDED
@@ -0,0 +1,60 @@
1
+ from __future__ import annotations
2
+ import pandas as pd
3
+ from sciutil import SciUtil
4
+ import timeit
5
+ import logging
6
+ import subprocess
7
+
8
+ u = SciUtil()
9
+ logger = logging.getLogger(__name__)
10
+ logger.setLevel(logging.INFO)
11
+
12
+
13
class Pipeline():
    """An ordered chain of Step objects applied to a dataframe.

    Pipelines compose with ``>>`` (appending a step) and are executed with
    ``df << pipeline``.
    """

    def __init__(self, *steps: Step):
        self.steps = list(steps)

    def __rshift__(self, other: Step) -> Step:
        # Appending returns a new Pipeline; the original is left untouched.
        return Pipeline(*self.steps, other)

    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run each step in order, threading the dataframe through."""
        result = df
        for current_step in self.steps:
            result = current_step.execute(result)
        return result

    def __rlshift__(self, other: pd.DataFrame) -> pd.DataFrame:
        # Enables the ``df << pipeline`` syntax.
        return self.execute(other)
31
+
32
+
33
class Step():
    """Base class for pipeline steps.

    Subclasses override ``execute``; ``run`` is a shared helper for shelling
    out to external tools. Steps compose with ``>>`` and are applied to a
    dataframe with ``df << step``.
    """

    def execute(self, df: pd.DataFrame) -> pd.DataFrame:
        """Identity by default; subclasses transform the dataframe."""
        return df

    def run(self, cmd: list) -> None:
        """Run an external command, echoing its output and the elapsed time."""
        started = timeit.default_timer()
        u.dp(['Running command', ' '.join([str(c) for c in cmd])])
        result = subprocess.run(cmd, capture_output=True, text=True)
        u.warn_p(['Output:'])
        print(result.stdout)
        u.err_p(['Error:', result.stderr])
        # NOTE(review): stderr is logged as an error even when the tool merely
        # writes progress there; the return code is not checked.
        if result.stderr:
            logger.error(result.stderr)
        logger.info(result.stdout)
        elapsed_minutes = (timeit.default_timer() - started) / 60
        u.dp(['Time for command to run (min): ', elapsed_minutes])

    def __rshift__(self, other: Step) -> Step:
        # ``step >> step`` builds a two-step Pipeline.
        return Pipeline(self, other)

    def __rlshift__(self, other: pd.DataFrame) -> pd.DataFrame:
        """Enable the ``df << step`` syntax: apply this step to the dataframe."""
        return self.execute(other)
60
+
File without changes
File without changes
@@ -0,0 +1,370 @@
1
+ Metadata-Version: 2.2
2
+ Name: enzymetk
3
+ Version: 0.0.1
4
+ Home-page: https://github.com/arianemora/enzyme-tk/
5
+ Author: Ariane Mora
6
+ Author-email: ariane.n.mora@gmail.com
7
+ License: GPL3
8
+ Project-URL: Bug Tracker, https://github.com/arianemora/enzyme-tk/
9
+ Project-URL: Documentation, https://github.com/arianemora/enzyme-tk/
10
+ Project-URL: Source Code, https://github.com/arianemora/enzyme-tk/
11
+ Keywords: enzymes,protein-engineering
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
14
+ Classifier: Natural Language :: English
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
18
+ Requires-Python: >=3.8
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: fair-esm
22
+ Requires-Dist: scikit-learn
23
+ Requires-Dist: numpy
24
+ Requires-Dist: seaborn
25
+ Requires-Dist: sciutil
26
+ Requires-Dist: pandas==2.1.4
27
+ Requires-Dist: biopython
28
+ Requires-Dist: sentence_transformers
29
+ Requires-Dist: pubchempy
30
+ Requires-Dist: pyfaidx
31
+ Requires-Dist: spacy
32
+ Dynamic: author
33
+ Dynamic: author-email
34
+ Dynamic: classifier
35
+ Dynamic: description
36
+ Dynamic: description-content-type
37
+ Dynamic: home-page
38
+ Dynamic: keywords
39
+ Dynamic: license
40
+ Dynamic: project-url
41
+ Dynamic: requires-dist
42
+ Dynamic: requires-python
43
+
44
+ # A pipeline for enzyme engineering
45
+
46
+ Enzyme-tk is a collection of tools for enzyme engineering, set up as interoperable modules that act on dataframes. These modules are designed to be imported into pipelines for specific functions. For this reason, `steps`, as each module is called (e.g. finding similar proteins with `BLAST` would be considered a step), are designed to be as light as possible. An example of a pipeline is the [annotate-e](https://github.com/ArianeMora/annotate-e) pipeline, which acts to annotate a fasta with an ensemble of methods (each is designated as an Enzyme-tk step).
47
+
48
+ ## Installation
49
+
50
+ ```bash
51
+ source enzymetk/conda_envs/install_all.sh
52
+ ```
53
+
54
+ ## Install subsets of enzyme-tk
55
+
56
+ ```bash
57
+ git clone git@github.com:ArianeMora/enzyme-tk.git
58
+ python setup.py sdist bdist_wheel
59
+ pip install dist/enzymetk-0.0.1.tar.gz
60
+ ```
61
+
62
+ ## Usage
63
+
64
+ If you have any issues at all just email me using my caltech email: `amora at caltech . edu`
65
+
66
+ Here are some of the tools that have been implemented to be chained together as a pipeline:
67
+
68
+ [mmseqs2](https://github.com/soedinglab/mmseqs2)
69
+ [foldseek](https://github.com/steineggerlab/foldseek)
70
+ [diamond](https://github.com/bbuchfink/diamond)
71
+ [proteinfer](https://github.com/google-research/proteinfer)
72
+ [CLEAN](https://github.com/tttianhao/CLEAN)
73
+ [chai](https://github.com/chaidiscovery/chai-lab/)
74
+ [chemBERTa2](https://github.com/seyonechithrananda/bert-loves-chemistry)
75
+ [SELFormer](https://github.com/HUBioDataLab/SELFormer)
76
+ [rxnfp](https://github.com/rxn4chemistry/rxnfp)
77
+ [clustalomega](http://www.clustal.org/omega/)
78
+ [CREEP](https://github.com/jsunn-y/CARE)
79
+ [esm](https://github.com/facebookresearch/esm)
80
+ [LigandMPNN](https://github.com/dauparas/LigandMPNN)
81
+ [vina](https://vina.scripps.edu/)
82
+ [Uni-Mol](https://github.com/deepmodeling/Uni-Mol)
83
+ [fasttree](https://morgannprice.github.io/fasttree/)
84
+ [Porechop](https://github.com/rrwick/Porechop)
85
+ [prokka](https://github.com/tseemann/prokka)
86
+ ## Things to note
87
+
88
+ All the tools use the conda env of `enzymetk` by default.
89
+
90
+ If you want to use a different conda env, you can do so by passing the `env_name` argument to the constructor of the step.
91
+
92
+ For example:
93
+
94
+ ```python
95
+ proteinfer = ProteInfer(env_name='proteinfer')
96
+ ```
97
+
98
+ ## Arguments
99
+
100
+ All the arguments are passed to the constructor of the step, the ones that are required are passed as arguments to the constructor and the ones that are optional are passed as a list to the `args` argument, this needs to be a list as one would normally pass arguments to a command line tool.
101
+
102
+ For example:
103
+
104
+ ```python
105
+ proteinfer = ProteInfer(env_name='proteinfer', args=['--num_threads', '10'])
106
+ ```
107
+ For those wanting to use specific arguments, check the individual tools for specifics.
108
+
109
+ ## Steps
110
+
111
+ The steps are the main building blocks of the pipeline. They are responsible for executing the individual tools.
112
+
113
+ ### BLAST
114
+
115
+ BLAST is a tool for searching a database of sequences for similar sequences. Here you can either pass a database that you have already created or pass the sequences as part of your dataframe and pass the label column (this needs to have two values: reference and query) reference refers to sequences that you want to search against and query refers to sequences that you want to search for.
116
+
117
+ ```python
118
+ id_col = 'Entry'
119
+ seq_col = 'Sequence'
120
+ label_col = 'label'
121
+ rows = [['AXE2_TALPU', 'query', 'MHSKFFAASLLGLGAAAIPLEGVMEKRSCPAIHVFGARETTASPGYGSSSTVVNGVLSAYPGSTAEAINYPACGGQSSCGGASYSSSVAQGIAAVASAVNSFNSQCPSTKIVLVGYSQGGEIMDVALCGGGDPNQGYTNTAVQLSSSAVNMVKAAIFMGDPMFRAGLSYEVGTCAAGGFDQRPAGFSCPSAAKIKSYCDASDPYCCNGSNAATHQGYGSEYGSQALAFVKSKLG'],
122
+ ['AXE2_TALPU', 'reference', 'MHSKFFAASLLGLGAAAIPLEGVMEKRSCPAIHVFGARETTASPGYGSSSTVVNGVLSAYPGSTAEAINYPACGGQSSCGGASYSSSVAQGIAAVASAVNSFNSQCPSTKIVLVGYSQGGEIMDVALCGGGDPNQGYTNTAVQLSSSAVNMVKAAIFMGDPMFRAGLSYEVGTCAAGGFDQRPAGFSCPSAAKIKSYCDASDPYCCNGSNAATHQGYGSEYGSQALAFVKSKLG'],
123
+ ['AXE2_GEOSE', 'reference', 'MKIGSGEKLLFIGDSITDCGRARPEGEGSFGALGTGYVAYVVGLLQAVYPELGIRVVNKGISGNTVRDLKARWEEDVIAQKPDWVSIMIGINDVWRQYDLPFMKEKHVYLDEYEATLRSLVLETKPLVKGIILMTPFYIEGNEQDPMRRTMDQYGRVVKQIAEETNSLFVDTQAAFNEVLKTLYPAALAWDRVHPSVAGHMILARAFLREIGFEWVRSR'],
124
+ ['AXE7A_XYLR2', 'referece', 'MFNFAPKQTTEMKKLLFTLVFVLGSMATALAENYPYRADYLWLTVPNHADWLYKTGERAKVEVSFCLYGMPQNVEVAYEIGPDMMPATSSGKVTLKNGRAVIDMGTMKKPGFLDMRLSVDGKYQHHVKVGFSPELLKPYTKNPQDFDAFWKANLDEARKTPVSVSCNKVDKYTTDAFDCYLLKIKTDRRHSIYGYLTKPKKAGKYPVVLCPPGAGIKTIKEPMRSTFYAKNGFIRLEMEIHGLNPEMTDEQFKEITTAFDYENGYLTNGLDDRDNYYMKHVYVACVRAIDYLTSLPDWDGKNVFVQGGSQGGALSLVTAGLDPRVTACVANHPALSDMAGYLDNRAGGYPHFNRLKNMFTPEKVNTMAYYDVVNFARRITCPVYITWGYNDNVCPPTTSYIVWNLITAPKESLITPINEHWTTSETNYTQMLWLKKQVK'],
125
+ ['A0A0B8RHP0_LISMN', 'reference', 'MKKLLFLGDSVTDAGRDFENDRELGHGYVKIIADQLEQEDVTVINRGVSANRVADLHRRIEADAISLQPDVVTIMIGINDTWFSFSRWEDTSVTAFKEVYRVILNRIKTETNAELILMEPFVLPYPEDRKEWRGDLDPKIGAVRELAAEFGATLIPLDGLMNALAIKHGPTFLAEDGVHPTKAGHEAIASTWLEFTK']]
126
+ df = pd.DataFrame(rows, columns=[id_col, label_col, seq_col])
127
+ df << (BLAST(id_col, seq_col, label_col) >> Save('tmp/blast_test.pkl'))
128
+ ```
129
+
130
+ ### ActiveSitePred
131
+
132
+ ActiveSitePred is a tool for predicting the active site of an enzyme. This returns a dataframe with the active site prediction for each sequence, and the probability of the active site. Note we use a zero index for the active site prediction while UniProt uses a one index.
133
+
134
+ ```python
135
+ squidly_dir = '/disk1/share/software/AS_inference/' # This should be where you downloaded the data from zotero, there is a folder in there called AS_inference
136
+ num_threads = 1
137
+ id_col = 'Entry'
138
+ seq_col = 'Sequence'
139
+ rows = [['AXE2', 'MKIGSGEKLLFIGDSITDCGRARPEGEGSFGALGTGYVAYVVGLLQAVYPELGIRVVNKGISGNTVRDLKARWEEDVIAQKPDWVSIMIGINDVWRQYDLPFMKEKHVYLDEYEATLRSLVLETKPLVKGIILMTPFYIEGNEQDPMRRTMDQYGRVVKQIAEETNSLFVDTQAAFNEVLKTLYPAALAWDRVHPSVAGHMILARAFLREIGFEWVRSR'],
140
+ ['H7C0D0', 'XRAHREIKDIFYKAIQKRRQSQEKIDDILQTLLDATYKDGRPLTDDEVAGMLIGLLLAGQHTSSTTSAWMGFFLARDKTLQKKCYLEQKTVCGENLPPLTYDQLKDLNLLDRCIKETLRLRPPIMIMMRMARTPQTVAGYTIPPGHQDNPASGEKFAYVPFGAGRHRCIGENFAYVQIKTIWSTMLRLYEFDLIDGYFPTVNYTTMIHTPENPVIRYKRRSK']]
141
+ df = pd.DataFrame(rows, columns=[id_col, seq_col])
142
+ print(df)
143
+ df << (ActiveSitePred(id_col, seq_col, squidly_dir, num_threads) >> Save('tmp/squidly_as_pred.pkl'))
144
+
145
+ ```
146
+
147
+ ### Chai
148
+
149
+ Chai is a tool for predicting the structure of a protein and a ligand, this tool outputs the data to a new folder and creates directories based on the id that is passed. We return the paths to the specific structure for each id in the returned dataframe.
150
+
151
+ Requires the `docko` conda environment to be created.
152
+
153
+ ```python
154
+ output_dir = 'tmp/'
155
+ num_threads = 1
156
+ id_col = 'Entry'
157
+ seq_col = 'Sequence'
158
+ substrate_col = 'Substrate'
159
+ rows = [['P0DP23', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC'],
160
+ ['AXE2', 'MKIGSGEKLLFIGDSITDCGRARPEGEGSFGALGTGYVAYVVGLLQAVYPELGIRVVNKGISGNTVRDLKARWEEDVIAQKPDWVSIMIGINDVWRQYDLPFMKEKHVYLDEYEATLRSLVLETKPLVKGIILMTPFYIEGNEQDPMRRTMDQYGRVVKQIAEETNSLFVDTQAAFNEVLKTLYPAALAWDRVHPSVAGHMILARAFLREIGFEWVRSR', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC']]
161
+ df = pd.DataFrame(rows, columns=[id_col, seq_col, substrate_col])
162
+ print(df)
163
+ df << (Chai(id_col, seq_col, substrate_col, f'{output_dir}', num_threads) >> Save(f'{output_dir}test.pkl'))
164
+
165
+ ```
166
+
167
+ ### ChemBERTa
168
+
169
+ ChemBERTa2 encodes reactions and SMILES strings into a vector space. Note this requires the base environment, i.e. `enzymetk` conda env.
170
+
171
+ ```python
172
+ from steps.embedchem_chemberta_step import ChemBERT
173
+ from steps.save_step import Save
174
+
175
+ output_dir = 'tmp/'
176
+ num_threads = 1
177
+ id_col = 'Entry'
178
+ seq_col = 'Sequence'
179
+ substrate_col = 'Substrate'
180
+ rows = [['P0DP23', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC'],
181
+ ['P0DP24', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC']]
182
+ df = pd.DataFrame(rows, columns=[id_col, seq_col, substrate_col])
183
+ df << (ChemBERT(id_col, substrate_col, num_threads) >> Save(f'{output_dir}chemberta.pkl'))
184
+ ```
185
+
186
+ ### CLEAN
187
+
188
+ CLEAN is a tool for predicting the EC number of an enzyme.
189
+
190
+ ```python
191
+
192
+ output_dir = 'tmp/'
193
+ num_threads = 1
194
+ id_col = 'Entry'
195
+ seq_col = 'Sequence'
196
+ substrate_col = 'Substrate'
197
+ rows = [['P0DP23', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC'],
198
+ ['AXE2', 'MKIGSGEKLLFIGDSITDCGRARPEGEGSFGALGTGYVAYVVGLLQAVYPELGIRVVNKGISGNTVRDLKARWEEDVIAQKPDWVSIMIGINDVWRQYDLPFMKEKHVYLDEYEATLRSLVLETKPLVKGIILMTPFYIEGNEQDPMRRTMDQYGRVVKQIAEETNSLFVDTQAAFNEVLKTLYPAALAWDRVHPSVAGHMILARAFLREIGFEWVRSR', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC']]
199
+ df = pd.DataFrame(rows, columns=[id_col, seq_col, substrate_col])
200
+ # This should be relative to the location of the script if you installed via the install_all.sh script
201
+ # Note you need to have downloaded their predictive models (ToDo )
202
+ clean_dir = 'software/CLEAN/app/'
203
+ df << (CLEAN(id_col, seq_col, clean_dir, num_threads=num_threads) >> Save(f'clean_missing_EC_seqs.pkl'))
204
+
205
+
206
+ ```
207
+ ### ClustalOmega
208
+
209
+ ClustalOmega is a tool for aligning a set of sequences. This gets installed to the system (expecting a linux machine) and added to the bash path.
210
+
211
+ ```python
212
+ from steps.generate_msa_step import ClustalOmega
213
+ from steps.save_step import Save
214
+ import pandas as pd
215
+
216
+ id_col = 'Entry'
217
+ seq_col = 'Sequence'
218
+ label_col = 'label'
219
+ rows = [['AXE2_TALPU', 'query', 'MHSKFFAASLLGLGAAAIPLEGVMEKRSCPAIHVFGARETTASPGYGSSSTVVNGVLSAYPGSTAEAINYPACGGQSSCGGASYSSSVAQGIAAVASAVNSFNSQCPSTKIVLVGYSQGGEIMDVALCGGGDPNQGYTNTAVQLSSSAVNMVKAAIFMGDPMFRAGLSYEVGTCAAGGFDQRPAGFSCPSAAKIKSYCDASDPYCCNGSNAATHQGYGSEYGSQALAFVKSKLG'],
220
+ ['AXE2_TALPU', 'reference', 'MHSKFFAASLLGLGAAAIPLEGVMEKRSCPAIHVFGARETTASPGYGSSSTVVNGVLSAYPGSTAEAINYPACGGQSSCGGASYSSSVAQGIAAVASAVNSFNSQCPSTKIVLVGYSQGGEIMDVALCGGGDPNQGYTNTAVQLSSSAVNMVKAAIFMGDPMFRAGLSYEVGTCAAGGFDQRPAGFSCPSAAKIKSYCDASDPYCCNGSNAATHQGYGSEYGSQALAFVKSKLG'],
221
+ ['AXE2_GEOSE', 'reference', 'MKIGSGEKLLFIGDSITDCGRARPEGEGSFGALGTGYVAYVVGLLQAVYPELGIRVVNKGISGNTVRDLKARWEEDVIAQKPDWVSIMIGINDVWRQYDLPFMKEKHVYLDEYEATLRSLVLETKPLVKGIILMTPFYIEGNEQDPMRRTMDQYGRVVKQIAEETNSLFVDTQAAFNEVLKTLYPAALAWDRVHPSVAGHMILARAFLREIGFEWVRSR'],
222
+ ['AXE7A_XYLR2', 'referece', 'MFNFAPKQTTEMKKLLFTLVFVLGSMATALAENYPYRADYLWLTVPNHADWLYKTGERAKVEVSFCLYGMPQNVEVAYEIGPDMMPATSSGKVTLKNGRAVIDMGTMKKPGFLDMRLSVDGKYQHHVKVGFSPELLKPYTKNPQDFDAFWKANLDEARKTPVSVSCNKVDKYTTDAFDCYLLKIKTDRRHSIYGYLTKPKKAGKYPVVLCPPGAGIKTIKEPMRSTFYAKNGFIRLEMEIHGLNPEMTDEQFKEITTAFDYENGYLTNGLDDRDNYYMKHVYVACVRAIDYLTSLPDWDGKNVFVQGGSQGGALSLVTAGLDPRVTACVANHPALSDMAGYLDNRAGGYPHFNRLKNMFTPEKVNTMAYYDVVNFARRITCPVYITWGYNDNVCPPTTSYIVWNLITAPKESLITPINEHWTTSETNYTQMLWLKKQVK'],
223
+ ['A0A0B8RHP0_LISMN', 'reference', 'MKKLLFLGDSVTDAGRDFENDRELGHGYVKIIADQLEQEDVTVINRGVSANRVADLHRRIEADAISLQPDVVTIMIGINDTWFSFSRWEDTSVTAFKEVYRVILNRIKTETNAELILMEPFVLPYPEDRKEWRGDLDPKIGAVRELAAEFGATLIPLDGLMNALAIKHGPTFLAEDGVHPTKAGHEAIASTWLEFTK']]
224
+ df = pd.DataFrame(rows, columns=[id_col, label_col, seq_col])
225
+ df << (ClustalOmega(id_col, seq_col) >> Save('tmp/clustalomega_test.pkl'))
226
+ ```
227
+
228
+ ### CREEP
229
+
230
+ CREEP is a tool for predicting the EC number of a reaction. At the moment it only supports reactions to EC however we are extending this to other modalities.
231
+
232
+ ```python
233
+ from steps.annotateEC_CREEP_step import CREEP
234
+ from steps.save_step import Save
235
+ import pandas as pd
236
+
237
+ # CREEP expects you to have downloaded the data from the zotero page and put it in the data/CREEP folder
238
+ output_dir = 'tmp/'
239
+ df = pd.DataFrame({'EC number': ['1.1.1.1', '1.1.1.2'],
240
+ 'Sequence': ['MALWMRLLPLLALLALWGPDPAAA', 'MALWMRLLPLLALLALWGPDPAAA'],
241
+ 'Reaction': ['O=P(OC1=CC=CC=C1)(OC2=CC=CC=C2)OC3=CC=CC=C3>>O=P(O)(OC4=CC=CC=C4)OC5=CC=CC=C5.OC6=CC=CC=C6',
242
+ 'O=P(OC1=CC=CC=C1)(OC2=CC=CC=C2)OC3=CC=CC=C3>>O=P(O)(OC4=CC=CC=C4)OC5=CC=CC=C5.OC6=CC=CC=C6']})
243
+ id_col = 'Entry'
244
+ reaction_col = 'Reaction'
245
+
246
+ df << (CREEP(id_col, reaction_col, CREEP_cache_dir='/disk1/share/software/CREEP/data/', CREEP_dir='/disk1/share/software/CREEP/',
247
+ modality='reaction', reference_modality='protein') >> Save(f'{output_dir}CREEP_test_protein.pkl'))
248
+ ```
249
+
250
+ ### EmbedESM
251
+
252
+ EmbedESM is a tool for embedding a set of sequences using ESM2.
253
+
254
+ ```python
255
+ from steps.embedprotein_esm_step import EmbedESM
256
+ from steps.save_step import Save
257
+ import pandas as pd
258
+
259
+ id_col = 'Entry'
260
+ seq_col = 'Sequence'
261
+ label_col = 'ActiveSite'
262
+ rows = [['AXE2_TALPU', '10', 'MHSKFFAASLLGLGAAAIPLEGVMEKRSCPAIHVFGARETTASPGYGSSSTVVNGVLSAYPGSTAEAINYPACGGQSSCGGASYSSSVAQGIAAVASAVNSFNSQCPSTKIVLVGYSQGGEIMDVALCGGGDPNQGYTNTAVQLSSSAVNMVKAAIFMGDPMFRAGLSYEVGTCAAGGFDQRPAGFSCPSAAKIKSYCDASDPYCCNGSNAATHQGYGSEYGSQALAFVKSKLG'],
263
+ ['AXE2_GEOSE', '1|2', 'MKIGSGEKLLFIGDSITDCGRARPEGEGSFGALGTGYVAYVVGLLQAVYPELGIRVVNKGISGNTVRDLKARWEEDVIAQKPDWVSIMIGINDVWRQYDLPFMKEKHVYLDEYEATLRSLVLETKPLVKGIILMTPFYIEGNEQDPMRRTMDQYGRVVKQIAEETNSLFVDTQAAFNEVLKTLYPAALAWDRVHPSVAGHMILARAFLREIGFEWVRSR'],
264
+ ['AXE7A_XYLR2', '1', 'MFNFAPKQTTEMKKLLFTLVFVLGSMATALAENYPYRADYLWLTVPNHADWLYKTGERAKVEVSFCLYGMPQNVEVAYEIGPDMMPATSSGKVTLKNGRAVIDMGTMKKPGFLDMRLSVDGKYQHHVKVGFSPELLKPYTKNPQDFDAFWKANLDEARKTPVSVSCNKVDKYTTDAFDCYLLKIKTDRRHSIYGYLTKPKKAGKYPVVLCPPGAGIKTIKEPMRSTFYAKNGFIRLEMEIHGLNPEMTDEQFKEITTAFDYENGYLTNGLDDRDNYYMKHVYVACVRAIDYLTSLPDWDGKNVFVQGGSQGGALSLVTAGLDPRVTACVANHPALSDMAGYLDNRAGGYPHFNRLKNMFTPEKVNTMAYYDVVNFARRITCPVYITWGYNDNVCPPTTSYIVWNLITAPKESLITPINEHWTTSETNYTQMLWLKKQVK'],
265
+ ['A0A0B8RHP0_LISMN', '2', 'MKKLLFLGDSVTDAGRDFENDRELGHGYVKIIADQLEQEDVTVINRGVSANRVADLHRRIEADAISLQPDVVTIMIGINDTWFSFSRWEDTSVTAFKEVYRVILNRIKTETNAELILMEPFVLPYPEDRKEWRGDLDPKIGAVRELAAEFGATLIPLDGLMNALAIKHGPTFLAEDGVHPTKAGHEAIASTWLEFTK']]
266
+ df = pd.DataFrame(rows, columns=[id_col, label_col, seq_col])
267
+ df << (EmbedESM(id_col, seq_col, extraction_method='mean', tmp_dir='tmp/') >> Save('tmp/esm2_test.pkl'))
268
+ # You can also extract the active site embedding in addition to the mean embedding
269
+ df << (EmbedESM(id_col, seq_col, extraction_method='active_site', active_site_col='ActiveSite', tmp_dir='tmp/') >> Save('tmp/esm2_test_active_site.pkl'))
270
+ ```
271
+
272
+ ### FoldSeek
273
+
274
+ See: [FoldSeek](https://github.com/steineggerlab/foldseek)
275
+
276
+ FoldSeek does a similarity search against a database of structures, it runs in the `enzyme-tk` environment. Similarly to the diamond blast, you can either create databases yourself before hand using the
277
+ foldseek documentation or you can create a database on the fly by passing the dataframe with a column called `label` that has two values: `reference` and `query`.
278
+ If you pass a database, you need to pass the path to the database.
279
+
280
+ The columns expect a path to a pdb file i.e. the output from the `Chai` step.
281
+
282
+ ```python
283
+ from steps.similarity_foldseek_step import FoldSeek
284
+ from steps.save_step import Save
285
+ import pandas as pd
286
+
287
+ # id_col: str, seq_col: str, proteinfer_dir: str,
288
+ output_dir = 'tmp/'
289
+ rows = [['tmp/P0DP24/chai/P0DP24_3.cif'],
290
+ ['tmp/P0DP24/chai/P0DP24_1.cif']]
291
+ df = pd.DataFrame(rows, columns=['pdbs'])
292
+ # foldseek_dir: str, pdb_column_name: str, reference_database: str
293
+ pdb_column_name = 'pdbs'
294
+ # The foldseek database was created using the following command in this location:
295
+ # foldseek databases PDB pdb tmp
296
+ reference_database = '/disk1/share/software/foldseek/structures/pdb/pdb'
297
+ df << (FoldSeek(pdb_column_name, reference_database) >> Save(f'{output_dir}pdb_files.pkl'))
298
+
299
+ ```
300
+
301
+ ### LigandMPNN
302
+
303
+ LigandMPNN is a tool for inpainting the sequence for a protein backbone that has been generated by a generative model.
304
+
305
+ See: [LigandMPNN](https://github.com/dauparas/LigandMPNN)
306
+
307
+ ```python
308
+ from steps.inpaint_ligandMPNN_step import LigandMPNN
309
+ from steps.save_step import Save
310
+ import pandas as pd
311
+
312
+ # id_col: str, seq_col: str, proteinfer_dir: str,
313
+ # This needs to be the full path to the file since LigandMPNN requires the full path (otherwise it will save to the ligandmpnn directory)
314
+ output_dir = '/disk1/ariane/vscode/enzyme-tk/examples/tmp/'
315
+ # These have to be the full path to the file since LigandMPNN requires the full path.
316
+ rows = [['/disk1/ariane/vscode/enzyme-tk/examples/tmp/P0DP24/chai/P0DP24_3.cif'],
317
+ ['/disk1/ariane/vscode/enzyme-tk/examples/tmp/P0DP24/chai/P0DP24_1.cif']]
318
+ df = pd.DataFrame(rows, columns=['pdbs'])
319
+ # foldseek_dir: str, pdb_column_name: str, reference_database: str
320
+ pdb_column_name = 'pdbs'
321
+ ligand_mpnn_dir = '/disk1/share/software/LigandMPNN/'
322
+ # See how you need to enclose the fixed residues in quotes make sure any spaces are closed in double quotes!
323
+ args = ['--fixed_residues', '"A19 A20 A21 A59 A60 A61 A90 A91 A92"', '--checkpoint_path_sc', f'{ligand_mpnn_dir}model_params/ligandmpnn_sc_v_32_002_16.pt']
324
+ df << (LigandMPNN(pdb_column_name, ligand_mpnn_dir, output_dir,args=args) >> Save(f'{output_dir}ligandmpnn_inpainted.pkl'))
325
+
326
+ ```
327
+
328
+ ### Proteinfer
329
+
330
+ Proteinfer is a tool for predicting the EC number of an enzyme.
331
+
332
+ ```python
333
+
334
+ output_dir = 'tmp/'
335
+ num_threads = 1
336
+ id_col = 'Entry'
337
+ seq_col = 'Sequence'
338
+ substrate_col = 'Substrate'
339
+ rows = [['P0DP23', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC'],
340
+ ['AXE2', 'MKIGSGEKLLFIGDSITDCGRARPEGEGSFGALGTGYVAYVVGLLQAVYPELGIRVVNKGISGNTVRDLKARWEEDVIAQKPDWVSIMIGINDVWRQYDLPFMKEKHVYLDEYEATLRSLVLETKPLVKGIILMTPFYIEGNEQDPMRRTMDQYGRVVKQIAEETNSLFVDTQAAFNEVLKTLYPAALAWDRVHPSVAGHMILARAFLREIGFEWVRSR', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC']]
341
+ df = pd.DataFrame(rows, columns=[id_col, seq_col, substrate_col])
342
+ # This should be relative to the location of the script if you installed via the install_all.sh script
343
+ # Note you need to have downloaded their predictive models (ToDo )
344
+ proteinfer_dir = 'software/proteinfer/'
345
+ df << (ProteInfer(id_col, seq_col, proteinfer_dir, num_threads=num_threads) >> Save(f'proteinfer.pkl'))
346
+ ```
347
+
348
+ ## Tools and references
349
+ Being a toolkit this is a collection of other tools, which means if you use any of these tools then cite the ones relevant to your work:
350
+
351
+ [mmseqs2](https://github.com/soedinglab/mmseqs2)
352
+ [foldseek](https://github.com/steineggerlab/foldseek)
353
+ [diamond](https://github.com/bbuchfink/diamond)
354
+ [proteinfer](https://github.com/google-research/proteinfer)
355
+ [CLEAN](https://github.com/tttianhao/CLEAN)
356
+ [chai](https://github.com/chaidiscovery/chai-lab/)
357
+ [chemBERTa2](https://github.com/seyonechithrananda/bert-loves-chemistry)
358
+ [SELFormer](https://github.com/HUBioDataLab/SELFormer)
359
+ [rxnfp](https://github.com/rxn4chemistry/rxnfp)
360
+ [clustalomega](http://www.clustal.org/omega/)
361
+ [CREEP](https://github.com/jsunn-y/CARE)
362
+ [esm](https://github.com/facebookresearch/esm)
363
+ [LigandMPNN](https://github.com/dauparas/LigandMPNN)
364
+ [vina](https://vina.scripps.edu/)
365
+ [Uni-Mol](https://github.com/deepmodeling/Uni-Mol)
366
+ [fasttree](https://morgannprice.github.io/fasttree/)
367
+ [Porechop](https://github.com/rrwick/Porechop)
368
+ [prokka](https://github.com/tseemann/prokka)
369
+
370
+