enzymetk 0.0.1__tar.gz → 0.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of enzymetk might be problematic. Click here for more details.
- {enzymetk-0.0.1 → enzymetk-0.0.3}/PKG-INFO +60 -18
- {enzymetk-0.0.1 → enzymetk-0.0.3}/README.md +57 -16
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/__init__.py +1 -24
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/annotateEC_CLEAN_step.py +2 -2
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/annotateEC_CREEP_step.py +1 -1
- enzymetk-0.0.3/enzymetk/dock_boltz_step.py +46 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/embedprotein_esm_step.py +5 -4
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/predict_catalyticsite_run.py +1 -1
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/predict_catalyticsite_step.py +20 -12
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/sequence_search_blast.py +33 -4
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/similarity_foldseek_step.py +26 -6
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/similarity_reaction_step.py +12 -8
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/step.py +5 -3
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk.egg-info/PKG-INFO +60 -18
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk.egg-info/SOURCES.txt +1 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/LICENSE +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/annotateEC_proteinfer_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/dock_chai_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/dock_vina_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/embedchem_chemberta_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/embedchem_rxnfp_run.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/embedchem_rxnfp_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/embedchem_selformer_run.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/embedchem_selformer_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/embedchem_unimol_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/esm-extract.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/filter_sequence_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/filter_structure_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/generate_msa_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/generate_oligopool_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/generate_tree_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/inpaint_ligandMPNN_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/main.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/metagenomics_porechop_trim_reads_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/metagenomics_prokka_annotate_genes.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/pipeline.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/predict_activity_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/reducedim_pca_run.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/reducedim_vae_run.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/reducedim_vae_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/save_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/similarity_mmseqs_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk/similarity_substrate_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk.egg-info/dependency_links.txt +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk.egg-info/entry_points.txt +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk.egg-info/requires.txt +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/enzymetk.egg-info/top_level.txt +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/setup.cfg +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.3}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: enzymetk
|
|
3
|
-
Version: 0.0.1
|
|
3
|
+
Version: 0.0.3
|
|
4
4
|
Home-page: https://github.com/arianemora/enzyme-tk/
|
|
5
5
|
Author: Ariane Mora
|
|
6
6
|
Author-email: ariane.n.mora@gmail.com
|
|
@@ -37,6 +37,7 @@ Dynamic: description-content-type
|
|
|
37
37
|
Dynamic: home-page
|
|
38
38
|
Dynamic: keywords
|
|
39
39
|
Dynamic: license
|
|
40
|
+
Dynamic: license-file
|
|
40
41
|
Dynamic: project-url
|
|
41
42
|
Dynamic: requires-dist
|
|
42
43
|
Dynamic: requires-python
|
|
@@ -45,26 +46,36 @@ Dynamic: requires-python
|
|
|
45
46
|
|
|
46
47
|
Enzyme-tk is a collection of tools for enzyme engineering, set up as interoperable modules that act on dataframes. These modules are designed to be imported into pipelines for specific functions. For this reason, `steps`, as each module is called (e.g. finding similar proteins with `BLAST` would be considered a step), are designed to be as light as possible. An example of a pipeline is the [annotate-e](https://github.com/ArianeMora/annotate-e) pipeline, which annotates a fasta with an ensemble of methods (each is designated as an Enzyme-tk step).
|
|
47
48
|
|
|
49
|
+
|
|
50
|
+
**If you have any issues installing, let me know - this has been tested only on Linux/Ubuntu. Please post an issue!**
|
|
51
|
+
|
|
48
52
|
## Installation
|
|
49
53
|
|
|
54
|
+
## Install base package to import modules
|
|
55
|
+
|
|
50
56
|
```bash
|
|
51
|
-
|
|
57
|
+
pip install enzymetk
|
|
52
58
|
```
|
|
53
59
|
|
|
54
|
-
|
|
60
|
+
### Install only the specific requirements you need (recommended)
|
|
55
61
|
|
|
62
|
+
For this, clone the repo and then install the requirements for the specific modules you use.
|
|
56
63
|
```bash
|
|
57
64
|
git clone git@github.com:ArianeMora/enzyme-tk.git
|
|
58
|
-
|
|
59
|
-
|
|
65
|
+
cd enzymetk/conda_envs/ # would recommend looking at these
|
|
66
|
+
# e.g. to install all from within that folder you would do
|
|
67
|
+
source install_all.sh
|
|
60
68
|
```
|
|
61
69
|
|
|
62
70
|
## Usage
|
|
63
71
|
|
|
64
72
|
If you have any issues at all just email me using my caltech email: `amora at caltech . edu`
|
|
65
73
|
|
|
74
|
+
This is a work in progress! Some tools (e.g. proteInfer and CLEAN) require extra data, such as model weights, to be downloaded in order to run. I'm working on integrating these at the moment; contact me if you need this!
|
|
75
|
+
|
|
66
76
|
Here are some of the tools that have been implemented to be chained together as a pipeline:
|
|
67
77
|
|
|
78
|
+
[boltz2](https://github.com/jwohlwend/boltz)
|
|
68
79
|
[mmseqs2](https://github.com/soedinglab/mmseqs2)
|
|
69
80
|
[foldseek](https://github.com/steineggerlab/foldseek)
|
|
70
81
|
[diamond](https://github.com/bbuchfink/diamond)
|
|
@@ -83,6 +94,7 @@ Here are some of the tools that have been implemented to be chained together as
|
|
|
83
94
|
[fasttree](https://morgannprice.github.io/fasttree/)
|
|
84
95
|
[Porechop](https://github.com/rrwick/Porechop)
|
|
85
96
|
[prokka](https://github.com/tseemann/prokka)
|
|
97
|
+
|
|
86
98
|
## Things to note
|
|
87
99
|
|
|
88
100
|
All the tools use the conda env of `enzymetk` by default.
|
|
@@ -114,6 +126,8 @@ The steps are the main building blocks of the pipeline. They are responsible for
|
|
|
114
126
|
|
|
115
127
|
BLAST is a tool for searching a database of sequences for similar sequences. Here you can either pass a database that you have already created, or pass the sequences as part of your dataframe together with the label column. The label column needs to have two values, `reference` and `query`: `reference` marks sequences that you want to search against, and `query` marks sequences that you want to search for.
|
|
116
128
|
|
|
129
|
+
Note you need to have installed the BLAST environment.
|
|
130
|
+
|
|
117
131
|
```python
|
|
118
132
|
id_col = 'Entry'
|
|
119
133
|
seq_col = 'Sequence'
|
|
@@ -142,6 +156,34 @@ df = pd.DataFrame(rows, columns=[id_col, seq_col])
|
|
|
142
156
|
print(df)
|
|
143
157
|
df << (ActiveSitePred(id_col, seq_col, squidly_dir, num_threads) >> Save('tmp/squidly_as_pred.pkl'))
|
|
144
158
|
|
|
159
|
+
```
|
|
160
|
+
### Boltz2
|
|
161
|
+
|
|
162
|
+
Boltz2 is a model for predicting structures. Note that you need `docko` installed, as Boltz2 is run through it.
|
|
163
|
+
|
|
164
|
+
Below is an example that uses Boltz with 4 threads and a cofactor (the intermediate, in this case). Set the intermediate to `None` for a single-substrate version.
|
|
165
|
+
```
|
|
166
|
+
import sys
|
|
167
|
+
from enzymetk.dock_boltz_step import Boltz
|
|
168
|
+
from enzymetk.save_step import Save
|
|
169
|
+
import pandas as pd
|
|
170
|
+
import os
|
|
171
|
+
os.environ['MKL_THREADING_LAYER'] = 'GNU'
|
|
172
|
+
|
|
173
|
+
output_dir = 'tmp/'
|
|
174
|
+
num_threads = 4
|
|
175
|
+
id_col = 'Entry'
|
|
176
|
+
seq_col = 'Sequence'
|
|
177
|
+
substrate_col = 'Substrate'
|
|
178
|
+
intermediate_col = 'Intermediate'
|
|
179
|
+
|
|
180
|
+
rows = [['P0DP23_boltz_8999', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]'],
|
|
181
|
+
['P0DP24_boltz_p1', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]'],
|
|
182
|
+
['P0DP23_boltz_p2', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]'],
|
|
183
|
+
['P0DP24_boltz_p3', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]'],
|
|
184
|
+
['P0DP24_boltz_p4', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]']]
|
|
185
|
+
df = pd.DataFrame(rows, columns=[id_col, seq_col, substrate_col, intermediate_col])
|
|
186
|
+
df << (Boltz(id_col, seq_col, substrate_col, intermediate_col, f'{output_dir}', num_threads) >> Save(f'{output_dir}test.pkl'))
|
|
145
187
|
```
|
|
146
188
|
|
|
147
189
|
### Chai
|
|
@@ -169,8 +211,8 @@ df << (Chai(id_col, seq_col, substrate_col, f'{output_dir}', num_threads) >> Sav
|
|
|
169
211
|
ChemBERTa2 encodes reactions and SMILES strings into a vector space. Note this requires the base environment, i.e. `enzymetk` conda env.
|
|
170
212
|
|
|
171
213
|
```python
|
|
172
|
-
from
|
|
173
|
-
from
|
|
214
|
+
from enzymetk.embedchem_chemberta_step import ChemBERT
|
|
215
|
+
from enzymetk.save_step import Save
|
|
174
216
|
|
|
175
217
|
output_dir = 'tmp/'
|
|
176
218
|
num_threads = 1
|
|
@@ -180,7 +222,7 @@ substrate_col = 'Substrate'
|
|
|
180
222
|
rows = [['P0DP23', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC'],
|
|
181
223
|
['P0DP24', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC']]
|
|
182
224
|
df = pd.DataFrame(rows, columns=[id_col, seq_col, substrate_col])
|
|
183
|
-
df << (ChemBERT(id_col, substrate_col, num_threads) >> Save(f'{output_dir}chemberta.pkl'))
|
|
225
|
+
new_df = (df << (ChemBERT(id_col, substrate_col, num_threads) >> Save(f'{output_dir}chemberta.pkl')))
|
|
184
226
|
```
|
|
185
227
|
|
|
186
228
|
### CLEAN
|
|
@@ -206,11 +248,11 @@ df << (CLEAN(id_col, seq_col, clean_dir, num_threads=num_threads) >> Save(f'clea
|
|
|
206
248
|
```
|
|
207
249
|
### ClustalOmega
|
|
208
250
|
|
|
209
|
-
ClustalOmega is a tool for aligning a set of sequences. This gets installed to the system (expecting a linux machine) and added to the bash path.
|
|
251
|
+
ClustalOmega is a tool for aligning a set of sequences. This gets installed to the system (expecting a linux machine) and added to the bash path. You need to have installed it first (check out the `conda_envs` directory in enzymetk.)
|
|
210
252
|
|
|
211
253
|
```python
|
|
212
|
-
from
|
|
213
|
-
from
|
|
254
|
+
from enzymetk.generate_msa_step import ClustalOmega
|
|
255
|
+
from enzymetk.save_step import Save
|
|
214
256
|
import pandas as pd
|
|
215
257
|
|
|
216
258
|
id_col = 'Entry'
|
|
@@ -230,8 +272,8 @@ df << (ClustalOmega(id_col, seq_col) >> Save('tmp/clustalomega_test.pkl'))
|
|
|
230
272
|
CREEP is a tool for predicting the EC number of a reaction. At the moment it only supports reactions to EC; however, we are extending this to other modalities.
|
|
231
273
|
|
|
232
274
|
```python
|
|
233
|
-
from
|
|
234
|
-
from
|
|
275
|
+
from enzymetk.annotateEC_CREEP_step import CREEP
|
|
276
|
+
from enzymetk.save_step import Save
|
|
235
277
|
import pandas as pd
|
|
236
278
|
|
|
237
279
|
# CREEP expects you to have downloaded the data from the zotero page and put it in the data/CREEP folder
|
|
@@ -252,8 +294,8 @@ df << (CREEP(id_col, reaction_col, CREEP_cache_dir='/disk1/share/software/CREEP/
|
|
|
252
294
|
EmbedESM is a tool for embedding a set of sequences using ESM2.
|
|
253
295
|
|
|
254
296
|
```python
|
|
255
|
-
from
|
|
256
|
-
from
|
|
297
|
+
from enzymetk.embedprotein_esm_step import EmbedESM
|
|
298
|
+
from enzymetk.save_step import Save
|
|
257
299
|
import pandas as pd
|
|
258
300
|
|
|
259
301
|
id_col = 'Entry'
|
|
@@ -280,8 +322,8 @@ If you pass a database, you need to pass the path to the database.
|
|
|
280
322
|
The columns expect a path to a pdb file i.e. the output from the `Chai` step.
|
|
281
323
|
|
|
282
324
|
```python
|
|
283
|
-
from
|
|
284
|
-
from
|
|
325
|
+
from enzymetk.similarity_foldseek_step import FoldSeek
|
|
326
|
+
from enzymetk.save_step import Save
|
|
285
327
|
import pandas as pd
|
|
286
328
|
|
|
287
329
|
# id_col: str, seq_col: str, proteinfer_dir: str,
|
|
@@ -2,26 +2,36 @@
|
|
|
2
2
|
|
|
3
3
|
Enzyme-tk is a collection of tools for enzyme engineering, set up as interoperable modules that act on dataframes. These modules are designed to be imported into pipelines for specific functions. For this reason, `steps`, as each module is called (e.g. finding similar proteins with `BLAST` would be considered a step), are designed to be as light as possible. An example of a pipeline is the [annotate-e](https://github.com/ArianeMora/annotate-e) pipeline, which annotates a fasta with an ensemble of methods (each is designated as an Enzyme-tk step).
|
|
4
4
|
|
|
5
|
+
|
|
6
|
+
**If you have any issues installing, let me know - this has been tested only on Linux/Ubuntu. Please post an issue!**
|
|
7
|
+
|
|
5
8
|
## Installation
|
|
6
9
|
|
|
10
|
+
## Install base package to import modules
|
|
11
|
+
|
|
7
12
|
```bash
|
|
8
|
-
|
|
13
|
+
pip install enzymetk
|
|
9
14
|
```
|
|
10
15
|
|
|
11
|
-
|
|
16
|
+
### Install only the specific requirements you need (recommended)
|
|
12
17
|
|
|
18
|
+
For this, clone the repo and then install the requirements for the specific modules you use.
|
|
13
19
|
```bash
|
|
14
20
|
git clone git@github.com:ArianeMora/enzyme-tk.git
|
|
15
|
-
|
|
16
|
-
|
|
21
|
+
cd enzymetk/conda_envs/ # would recommend looking at these
|
|
22
|
+
# e.g. to install all from within that folder you would do
|
|
23
|
+
source install_all.sh
|
|
17
24
|
```
|
|
18
25
|
|
|
19
26
|
## Usage
|
|
20
27
|
|
|
21
28
|
If you have any issues at all just email me using my caltech email: `amora at caltech . edu`
|
|
22
29
|
|
|
30
|
+
This is a work in progress! Some tools (e.g. proteInfer and CLEAN) require extra data, such as model weights, to be downloaded in order to run. I'm working on integrating these at the moment; contact me if you need this!
|
|
31
|
+
|
|
23
32
|
Here are some of the tools that have been implemented to be chained together as a pipeline:
|
|
24
33
|
|
|
34
|
+
[boltz2](https://github.com/jwohlwend/boltz)
|
|
25
35
|
[mmseqs2](https://github.com/soedinglab/mmseqs2)
|
|
26
36
|
[foldseek](https://github.com/steineggerlab/foldseek)
|
|
27
37
|
[diamond](https://github.com/bbuchfink/diamond)
|
|
@@ -40,6 +50,7 @@ Here are some of the tools that have been implemented to be chained together as
|
|
|
40
50
|
[fasttree](https://morgannprice.github.io/fasttree/)
|
|
41
51
|
[Porechop](https://github.com/rrwick/Porechop)
|
|
42
52
|
[prokka](https://github.com/tseemann/prokka)
|
|
53
|
+
|
|
43
54
|
## Things to note
|
|
44
55
|
|
|
45
56
|
All the tools use the conda env of `enzymetk` by default.
|
|
@@ -71,6 +82,8 @@ The steps are the main building blocks of the pipeline. They are responsible for
|
|
|
71
82
|
|
|
72
83
|
BLAST is a tool for searching a database of sequences for similar sequences. Here you can either pass a database that you have already created, or pass the sequences as part of your dataframe together with the label column. The label column needs to have two values, `reference` and `query`: `reference` marks sequences that you want to search against, and `query` marks sequences that you want to search for.
|
|
73
84
|
|
|
85
|
+
Note you need to have installed the BLAST environment.
|
|
86
|
+
|
|
74
87
|
```python
|
|
75
88
|
id_col = 'Entry'
|
|
76
89
|
seq_col = 'Sequence'
|
|
@@ -99,6 +112,34 @@ df = pd.DataFrame(rows, columns=[id_col, seq_col])
|
|
|
99
112
|
print(df)
|
|
100
113
|
df << (ActiveSitePred(id_col, seq_col, squidly_dir, num_threads) >> Save('tmp/squidly_as_pred.pkl'))
|
|
101
114
|
|
|
115
|
+
```
|
|
116
|
+
### Boltz2
|
|
117
|
+
|
|
118
|
+
Boltz2 is a model for predicting structures. Note that you need `docko` installed, as Boltz2 is run through it.
|
|
119
|
+
|
|
120
|
+
Below is an example that uses Boltz with 4 threads and a cofactor (the intermediate, in this case). Set the intermediate to `None` for a single-substrate version.
|
|
121
|
+
```
|
|
122
|
+
import sys
|
|
123
|
+
from enzymetk.dock_boltz_step import Boltz
|
|
124
|
+
from enzymetk.save_step import Save
|
|
125
|
+
import pandas as pd
|
|
126
|
+
import os
|
|
127
|
+
os.environ['MKL_THREADING_LAYER'] = 'GNU'
|
|
128
|
+
|
|
129
|
+
output_dir = 'tmp/'
|
|
130
|
+
num_threads = 4
|
|
131
|
+
id_col = 'Entry'
|
|
132
|
+
seq_col = 'Sequence'
|
|
133
|
+
substrate_col = 'Substrate'
|
|
134
|
+
intermediate_col = 'Intermediate'
|
|
135
|
+
|
|
136
|
+
rows = [['P0DP23_boltz_8999', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]'],
|
|
137
|
+
['P0DP24_boltz_p1', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]'],
|
|
138
|
+
['P0DP23_boltz_p2', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]'],
|
|
139
|
+
['P0DP24_boltz_p3', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]'],
|
|
140
|
+
['P0DP24_boltz_p4', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]']]
|
|
141
|
+
df = pd.DataFrame(rows, columns=[id_col, seq_col, substrate_col, intermediate_col])
|
|
142
|
+
df << (Boltz(id_col, seq_col, substrate_col, intermediate_col, f'{output_dir}', num_threads) >> Save(f'{output_dir}test.pkl'))
|
|
102
143
|
```
|
|
103
144
|
|
|
104
145
|
### Chai
|
|
@@ -126,8 +167,8 @@ df << (Chai(id_col, seq_col, substrate_col, f'{output_dir}', num_threads) >> Sav
|
|
|
126
167
|
ChemBERTa2 encodes reactions and SMILES strings into a vector space. Note this requires the base environment, i.e. `enzymetk` conda env.
|
|
127
168
|
|
|
128
169
|
```python
|
|
129
|
-
from
|
|
130
|
-
from
|
|
170
|
+
from enzymetk.embedchem_chemberta_step import ChemBERT
|
|
171
|
+
from enzymetk.save_step import Save
|
|
131
172
|
|
|
132
173
|
output_dir = 'tmp/'
|
|
133
174
|
num_threads = 1
|
|
@@ -137,7 +178,7 @@ substrate_col = 'Substrate'
|
|
|
137
178
|
rows = [['P0DP23', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC'],
|
|
138
179
|
['P0DP24', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC']]
|
|
139
180
|
df = pd.DataFrame(rows, columns=[id_col, seq_col, substrate_col])
|
|
140
|
-
df << (ChemBERT(id_col, substrate_col, num_threads) >> Save(f'{output_dir}chemberta.pkl'))
|
|
181
|
+
new_df = (df << (ChemBERT(id_col, substrate_col, num_threads) >> Save(f'{output_dir}chemberta.pkl')))
|
|
141
182
|
```
|
|
142
183
|
|
|
143
184
|
### CLEAN
|
|
@@ -163,11 +204,11 @@ df << (CLEAN(id_col, seq_col, clean_dir, num_threads=num_threads) >> Save(f'clea
|
|
|
163
204
|
```
|
|
164
205
|
### ClustalOmega
|
|
165
206
|
|
|
166
|
-
ClustalOmega is a tool for aligning a set of sequences. This gets installed to the system (expecting a linux machine) and added to the bash path.
|
|
207
|
+
ClustalOmega is a tool for aligning a set of sequences. This gets installed to the system (expecting a linux machine) and added to the bash path. You need to have installed it first (check out the `conda_envs` directory in enzymetk.)
|
|
167
208
|
|
|
168
209
|
```python
|
|
169
|
-
from
|
|
170
|
-
from
|
|
210
|
+
from enzymetk.generate_msa_step import ClustalOmega
|
|
211
|
+
from enzymetk.save_step import Save
|
|
171
212
|
import pandas as pd
|
|
172
213
|
|
|
173
214
|
id_col = 'Entry'
|
|
@@ -187,8 +228,8 @@ df << (ClustalOmega(id_col, seq_col) >> Save('tmp/clustalomega_test.pkl'))
|
|
|
187
228
|
CREEP is a tool for predicting the EC number of a reaction. At the moment it only supports reactions to EC; however, we are extending this to other modalities.
|
|
188
229
|
|
|
189
230
|
```python
|
|
190
|
-
from
|
|
191
|
-
from
|
|
231
|
+
from enzymetk.annotateEC_CREEP_step import CREEP
|
|
232
|
+
from enzymetk.save_step import Save
|
|
192
233
|
import pandas as pd
|
|
193
234
|
|
|
194
235
|
# CREEP expects you to have downloaded the data from the zotero page and put it in the data/CREEP folder
|
|
@@ -209,8 +250,8 @@ df << (CREEP(id_col, reaction_col, CREEP_cache_dir='/disk1/share/software/CREEP/
|
|
|
209
250
|
EmbedESM is a tool for embedding a set of sequences using ESM2.
|
|
210
251
|
|
|
211
252
|
```python
|
|
212
|
-
from
|
|
213
|
-
from
|
|
253
|
+
from enzymetk.embedprotein_esm_step import EmbedESM
|
|
254
|
+
from enzymetk.save_step import Save
|
|
214
255
|
import pandas as pd
|
|
215
256
|
|
|
216
257
|
id_col = 'Entry'
|
|
@@ -237,8 +278,8 @@ If you pass a database, you need to pass the path to the database.
|
|
|
237
278
|
The columns expect a path to a pdb file i.e. the output from the `Chai` step.
|
|
238
279
|
|
|
239
280
|
```python
|
|
240
|
-
from
|
|
241
|
-
from
|
|
281
|
+
from enzymetk.similarity_foldseek_step import FoldSeek
|
|
282
|
+
from enzymetk.save_step import Save
|
|
242
283
|
import pandas as pd
|
|
243
284
|
|
|
244
285
|
# id_col: str, seq_col: str, proteinfer_dir: str,
|
|
@@ -22,34 +22,11 @@ Date: March 2025
|
|
|
22
22
|
__title__ = 'enzymetk'
|
|
23
23
|
__description__ = 'Toolkit for enzymes and what not'
|
|
24
24
|
__url__ = 'https://github.com/arianemora/enzyme-tk/'
|
|
25
|
-
__version__ = '0.0.
|
|
25
|
+
__version__ = '0.0.3'
|
|
26
26
|
__author__ = 'Ariane Mora'
|
|
27
27
|
__author_email__ = 'ariane.n.mora@gmail.com'
|
|
28
28
|
__license__ = 'GPL3'
|
|
29
29
|
|
|
30
|
-
# from enzymetk.step import *
|
|
31
|
-
# from enzymetk.generate_msa_step import ClustalOmega
|
|
32
|
-
# from enzymetk.annotateEC_CLEAN_step import CLEAN
|
|
33
|
-
# from enzymetk.annotateEC_proteinfer_step import ProteInfer
|
|
34
|
-
# from enzymetk.dock_chai_step import Chai
|
|
35
|
-
# from enzymetk.dock_vina_step import Vina
|
|
36
|
-
# from enzymetk.embedchem_chemberta_step import ChemBERT
|
|
37
|
-
# from enzymetk.embedchem_rxnfp_step import RxnFP
|
|
38
|
-
# from enzymetk.embedchem_selformer_step import SelFormer
|
|
39
|
-
# from enzymetk.embedchem_unimol_step import UniMol
|
|
40
|
-
# from enzymetk.embedprotein_esm_step import EmbedESM
|
|
41
|
-
# from enzymetk.generate_tree_step import FastTree
|
|
42
|
-
# from enzymetk.inpaint_ligandMPNN_step import LigandMPNN
|
|
43
|
-
# from enzymetk.metagenomics_porechop_trim_reads_step import PoreChop
|
|
44
|
-
# from enzymetk.metagenomics_prokka_annotate_genes import Prokka
|
|
45
|
-
# #from enzymetk.predict_activity_step import
|
|
46
|
-
# from enzymetk.predict_catalyticsite_step import ActiveSitePred
|
|
47
|
-
# from enzymetk.sequence_search_blast import BLAST
|
|
48
|
-
# from enzymetk.similarity_foldseek_step import FoldSeek
|
|
49
|
-
# from enzymetk.similarity_mmseqs_step import MMseqs
|
|
50
|
-
# from enzymetk.similarity_reaction_step import ReactionDist
|
|
51
|
-
# from enzymetk.similarity_substrate_step import SubstrateDist
|
|
52
|
-
|
|
53
30
|
|
|
54
31
|
|
|
55
32
|
|
|
@@ -116,7 +116,7 @@ class CLEAN(Step):
|
|
|
116
116
|
print(output_filenames)
|
|
117
117
|
for sub_df in output_filenames:
|
|
118
118
|
df = pd.concat([df, sub_df])
|
|
119
|
-
return df
|
|
119
|
+
return self.__filter_df(df)
|
|
120
120
|
else:
|
|
121
|
-
return self.__execute([df, tmp_dir])
|
|
121
|
+
return self.__filter_df(self.__execute([df, tmp_dir]))
|
|
122
122
|
return df
|
|
@@ -38,7 +38,7 @@ class CREEP(Step):
|
|
|
38
38
|
self.args_extract = args_extract
|
|
39
39
|
self.args_retrieval = args_retrieval
|
|
40
40
|
|
|
41
|
-
def __execute(self, df: pd.DataFrame, tmp_dir: str)
|
|
41
|
+
def __execute(self, df: pd.DataFrame, tmp_dir: str):
|
|
42
42
|
tmp_dir = '/disk1/ariane/vscode/degradeo/pipeline/tmp/'
|
|
43
43
|
input_filename = f'{tmp_dir}/creepasjkdkajshdkja.csv'
|
|
44
44
|
df.to_csv(input_filename, index=False)
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from enzymetk.step import Step
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from docko.boltz import run_boltz_affinity
|
|
4
|
+
import logging
|
|
5
|
+
import numpy as np
|
|
6
|
+
from multiprocessing.dummy import Pool as ThreadPool
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
logger.setLevel(logging.INFO)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Boltz(Step):
|
|
14
|
+
|
|
15
|
+
def __init__(self, id_col: str, seq_col: str, substrate_col: str, intermediate_col: str, output_dir: str, num_threads: int):
|
|
16
|
+
self.id_col = id_col
|
|
17
|
+
self.seq_col = seq_col
|
|
18
|
+
self.substrate_col = substrate_col
|
|
19
|
+
self.intermediate_col = intermediate_col
|
|
20
|
+
self.output_dir = output_dir or None
|
|
21
|
+
self.num_threads = num_threads or 1
|
|
22
|
+
|
|
23
|
+
def __execute(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
24
|
+
output_filenames = []
|
|
25
|
+
|
|
26
|
+
for run_id, seq, substrate, intermediate in df[[self.id_col, self.seq_col, self.substrate_col, self.intermediate_col]].values:
|
|
27
|
+
# Might have an issue if the things are not correctly installed in the same directory
|
|
28
|
+
if not isinstance(substrate, str):
|
|
29
|
+
substrate = ''
|
|
30
|
+
print(run_id, seq, substrate)
|
|
31
|
+
run_boltz_affinity(run_id, seq, substrate, self.output_dir, intermediate)
|
|
32
|
+
output_filenames.append(f'{self.output_dir}/{run_id}/')
|
|
33
|
+
return output_filenames
|
|
34
|
+
|
|
35
|
+
def execute(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
36
|
+
if self.output_dir:
|
|
37
|
+
if self.num_threads > 1:
|
|
38
|
+
pool = ThreadPool(self.num_threads)
|
|
39
|
+
df_list = np.array_split(df, self.num_threads)
|
|
40
|
+
results = pool.map(self.__execute, df_list)
|
|
41
|
+
else:
|
|
42
|
+
results = self.__execute(df)
|
|
43
|
+
df['output_dir'] = results
|
|
44
|
+
return df
|
|
45
|
+
else:
|
|
46
|
+
print('No output directory provided')
|
|
@@ -70,8 +70,8 @@ def extract_mean_embedding(df, id_column, encoding_dir, rep_num=33):
|
|
|
70
70
|
|
|
71
71
|
class EmbedESM(Step):
|
|
72
72
|
|
|
73
|
-
def __init__(self, id_col: str, seq_col: str, model='
|
|
74
|
-
active_site_col: str = None, num_threads=1, tmp_dir: str = None, env_name: str = 'enzymetk'):
|
|
73
|
+
def __init__(self, id_col: str, seq_col: str, model='esm2_t36_3B_UR50D', extraction_method='mean',
|
|
74
|
+
active_site_col: str = None, num_threads=1, tmp_dir: str = None, env_name: str = 'enzymetk', rep_num=36):
|
|
75
75
|
self.seq_col = seq_col
|
|
76
76
|
self.id_col = id_col
|
|
77
77
|
self.active_site_col = active_site_col
|
|
@@ -80,6 +80,7 @@ class EmbedESM(Step):
|
|
|
80
80
|
self.extraction_method = extraction_method
|
|
81
81
|
self.tmp_dir = tmp_dir
|
|
82
82
|
self.env_name = env_name
|
|
83
|
+
self.rep_num = rep_num
|
|
83
84
|
|
|
84
85
|
def __execute(self, df: pd.DataFrame, tmp_dir: str) -> pd.DataFrame:
|
|
85
86
|
input_filename = f'{tmp_dir}/input.fasta'
|
|
@@ -95,11 +96,11 @@ class EmbedESM(Step):
|
|
|
95
96
|
cmd = ['conda', 'run', '-n', self.env_name, 'python', Path(__file__).parent/'esm-extract.py', self.model, input_filename, tmp_dir, '--include', 'per_tok']
|
|
96
97
|
self.run(cmd)
|
|
97
98
|
if self.extraction_method == 'mean':
|
|
98
|
-
df = extract_mean_embedding(df, self.id_col, tmp_dir)
|
|
99
|
+
df = extract_mean_embedding(df, self.id_col, tmp_dir, rep_num=self.rep_num)
|
|
99
100
|
elif self.extraction_method == 'active_site':
|
|
100
101
|
if self.active_site_col is None:
|
|
101
102
|
raise ValueError('active_site_col must be provided if extraction_method is active_site')
|
|
102
|
-
df = extract_active_site_embedding(df, self.id_col, self.active_site_col, tmp_dir)
|
|
103
|
+
df = extract_active_site_embedding(df, self.id_col, self.active_site_col, tmp_dir, rep_num=self.rep_num)
|
|
103
104
|
|
|
104
105
|
return df
|
|
105
106
|
|
|
@@ -11,7 +11,7 @@ def run_as_inference(output_dir, fasta_file, squidly_dir, toks_per_batch, as_thr
|
|
|
11
11
|
elif esm2_model == "esm2_t48_15B_UR50D":
|
|
12
12
|
cr_model_as = cr_model_as or f"{squidly_dir}Squidly_CL_15B.pt"
|
|
13
13
|
lstm_model_as = lstm_model_as or f"{squidly_dir}Squidly_LSTM_15B.pth"
|
|
14
|
-
as_threshold = 0.
|
|
14
|
+
as_threshold = 0.97
|
|
15
15
|
#esm2_model = "esm2_t48_15B_UR50D"
|
|
16
16
|
# python /scratch/project/squid/code_modular/SQUIDLY_run_model_LSTM.py ${FILE} ${ESM2_MODEL} ${CR_MODEL_AS}
|
|
17
17
|
# ${LSTM_MODEL_AS} ${OUT} --toks_per_batch ${TOKS_PER_BATCH} --AS_threshold ${AS_THRESHOLD} --monitor
|
|
@@ -8,6 +8,8 @@ import numpy as np
|
|
|
8
8
|
from tqdm import tqdm
|
|
9
9
|
import random
|
|
10
10
|
import string
|
|
11
|
+
import logging
|
|
12
|
+
import os
|
|
11
13
|
|
|
12
14
|
logger = logging.getLogger(__name__)
|
|
13
15
|
logger.setLevel(logging.INFO)
|
|
@@ -15,15 +17,17 @@ logger.setLevel(logging.INFO)
|
|
|
15
17
|
|
|
16
18
|
class ActiveSitePred(Step):
|
|
17
19
|
|
|
18
|
-
def __init__(self, id_col: str, seq_col: str,
|
|
19
|
-
esm2_model = 'esm2_t36_3B_UR50D', tmp_dir: str = None):
|
|
20
|
+
def __init__(self, id_col: str, seq_col: str, num_threads: int = 1,
|
|
21
|
+
esm2_model = 'esm2_t36_3B_UR50D', tmp_dir: str = None, args=None):
|
|
20
22
|
self.id_col = id_col
|
|
21
23
|
self.seq_col = seq_col
|
|
22
24
|
self.num_threads = num_threads or 1
|
|
23
|
-
self.squidly_dir = squidly_dir
|
|
24
25
|
self.esm2_model = esm2_model
|
|
25
26
|
self.tmp_dir = tmp_dir
|
|
26
|
-
|
|
27
|
+
self.args = None
|
|
28
|
+
self.logger = logging.getLogger(__name__)
|
|
29
|
+
print('Predicting Active Sites using Squidly')
|
|
30
|
+
|
|
27
31
|
def __to_fasta(self, df: pd.DataFrame, tmp_dir: str):
|
|
28
32
|
tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
|
|
29
33
|
|
|
@@ -37,13 +41,17 @@ class ActiveSitePred(Step):
|
|
|
37
41
|
def __execute(self, df: pd.DataFrame, tmp_dir: str):
|
|
38
42
|
input_filename = self.__to_fasta(df, tmp_dir)
|
|
39
43
|
# Might have an issue if the things are not correctly installed in the same dicrectory
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
44
|
+
cmd = []
|
|
45
|
+
cmd = ['squidly', 'run', input_filename, self.esm2_model, tmp_dir]
|
|
46
|
+
if self.args is not None:
|
|
47
|
+
cmd.extend(self.args)
|
|
48
|
+
result = self.run(cmd)
|
|
43
49
|
if result.stderr:
|
|
44
|
-
logger.error(result.stderr)
|
|
45
|
-
|
|
46
|
-
|
|
50
|
+
self.logger.error(result.stderr)
|
|
51
|
+
print(result.stderr)
|
|
52
|
+
else:
|
|
53
|
+
self.logger.info(result.stdout)
|
|
54
|
+
output_filename = os.path.join(tmp_dir, 'squidly_ensemble.csv')
|
|
47
55
|
return output_filename
|
|
48
56
|
|
|
49
57
|
def execute(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -61,10 +69,10 @@ class ActiveSitePred(Step):
|
|
|
61
69
|
df = pd.DataFrame()
|
|
62
70
|
print(output_filenames)
|
|
63
71
|
for p in output_filenames:
|
|
64
|
-
sub_df = pd.
|
|
72
|
+
sub_df = pd.read_csv(p)
|
|
65
73
|
df = pd.concat([df, sub_df])
|
|
66
74
|
return df
|
|
67
75
|
|
|
68
76
|
else:
|
|
69
77
|
output_filename = self.__execute(df, tmp_dir)
|
|
70
|
-
return pd.
|
|
78
|
+
return pd.read_csv(output_filename)
|
|
@@ -3,6 +3,7 @@ Step to run multiple sequence alignment with the Clustal Omega tool.
|
|
|
3
3
|
./clustalo -i /home/helen/degradeo/pipeline/helen_data/sequences_test_fasta.txt
|
|
4
4
|
"""
|
|
5
5
|
from enzymetk.step import Step
|
|
6
|
+
import logging
|
|
6
7
|
|
|
7
8
|
import pandas as pd
|
|
8
9
|
import numpy as np
|
|
@@ -12,10 +13,17 @@ import os
|
|
|
12
13
|
import subprocess
|
|
13
14
|
import random
|
|
14
15
|
import string
|
|
16
|
+
from tqdm import tqdm
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
logger.setLevel(logging.INFO)
|
|
21
|
+
|
|
15
22
|
|
|
16
23
|
class BLAST(Step):
|
|
17
24
|
|
|
18
|
-
def __init__(self, id_col: str, sequence_col: str, label_col=None, database=None,
|
|
25
|
+
def __init__(self, id_col: str, sequence_col: str, label_col=None, database=None,
|
|
26
|
+
mode='blastp', args=None, tmp_dir=None, num_threads=1):
|
|
19
27
|
self.id_col = id_col
|
|
20
28
|
self.seq_col = sequence_col
|
|
21
29
|
self.label_col = label_col # This is whether it is query or reference
|
|
@@ -23,6 +31,7 @@ class BLAST(Step):
|
|
|
23
31
|
self.database = database
|
|
24
32
|
self.args = args
|
|
25
33
|
self.tmp_dir = tmp_dir
|
|
34
|
+
self.num_threads = num_threads
|
|
26
35
|
if self.database is None and self.label_col is None:
|
|
27
36
|
raise ValueError('Database is not set, you can pass a database that you have already created see diamond for more information or the sequences \
|
|
28
37
|
as part of your dataframe and pass the label column (this needs to have two values: reference and query) reference \
|
|
@@ -74,7 +83,27 @@ class BLAST(Step):
|
|
|
74
83
|
return df
|
|
75
84
|
|
|
76
85
|
def execute(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
77
|
-
if self.tmp_dir is not None:
|
|
78
|
-
return self.__execute([df, self.tmp_dir])
|
|
79
86
|
with TemporaryDirectory() as tmp_dir:
|
|
80
|
-
|
|
87
|
+
tmp_dir = self.tmp_dir if self.tmp_dir is not None else tmp_dir
|
|
88
|
+
if self.num_threads > 1:
|
|
89
|
+
output_filenames = []
|
|
90
|
+
df_list = np.array_split(df, self.num_threads)
|
|
91
|
+
for df_chunk in tqdm(df_list):
|
|
92
|
+
try:
|
|
93
|
+
output_filenames.append(self.__execute([df_chunk, tmp_dir]))
|
|
94
|
+
except Exception as e:
|
|
95
|
+
logger.error(f"Error in executing ESM2 model: {e}")
|
|
96
|
+
continue
|
|
97
|
+
df = pd.DataFrame()
|
|
98
|
+
for sub_df in output_filenames:
|
|
99
|
+
df = pd.concat([df, sub_df])
|
|
100
|
+
return df
|
|
101
|
+
|
|
102
|
+
else:
|
|
103
|
+
return self.__execute([df, tmp_dir])
|
|
104
|
+
|
|
105
|
+
# def execute(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
106
|
+
# if self.tmp_dir is not None:
|
|
107
|
+
# return self.__execute([df, self.tmp_dir])
|
|
108
|
+
# with TemporaryDirectory() as tmp_dir:
|
|
109
|
+
# return self.__execute([df, tmp_dir])
|
|
@@ -7,13 +7,17 @@ repo and then copy it out of it.
|
|
|
7
7
|
"""
|
|
8
8
|
from enzymetk.step import Step
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
import logging
|
|
11
11
|
import pandas as pd
|
|
12
12
|
import numpy as np
|
|
13
13
|
from tempfile import TemporaryDirectory
|
|
14
14
|
import subprocess
|
|
15
15
|
import random
|
|
16
16
|
import string
|
|
17
|
+
from tqdm import tqdm
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
logger.setLevel(logging.INFO)
|
|
17
21
|
|
|
18
22
|
|
|
19
23
|
def process_clustering(filename, df, id_column_name):
|
|
@@ -34,13 +38,14 @@ def process_clustering(filename, df, id_column_name):
|
|
|
34
38
|
class FoldSeek(Step):
|
|
35
39
|
|
|
36
40
|
def __init__(self, id_column_name: str, query_column_name: str, reference_database: str, method='search', query_type='structures',
|
|
37
|
-
args=None, tmp_dir: str = None):
|
|
41
|
+
args=None, num_threads=1, tmp_dir: str = None):
|
|
38
42
|
self.query_column_name = query_column_name
|
|
39
43
|
self.id_column_name = id_column_name
|
|
40
44
|
self.reference_database = reference_database # pdb should be the default
|
|
41
45
|
self.tmp_dir = tmp_dir
|
|
42
46
|
self.method = method
|
|
43
47
|
self.args = args
|
|
48
|
+
self.num_threads = num_threads
|
|
44
49
|
self.query_type = query_type
|
|
45
50
|
if self.method not in ['search', 'cluster']:
|
|
46
51
|
print('Method must be in "search" or "cluster". Will likely fail... ')
|
|
@@ -107,8 +112,23 @@ class FoldSeek(Step):
|
|
|
107
112
|
return df
|
|
108
113
|
|
|
109
114
|
def execute(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
110
|
-
if self.tmp_dir is not None:
|
|
111
|
-
return self.__execute([df, self.tmp_dir])
|
|
112
115
|
with TemporaryDirectory() as tmp_dir:
|
|
113
|
-
|
|
114
|
-
|
|
116
|
+
tmp_dir = self.tmp_dir if self.tmp_dir is not None else tmp_dir
|
|
117
|
+
if self.num_threads > 1:
|
|
118
|
+
output_filenames = []
|
|
119
|
+
df_list = np.array_split(df, self.num_threads)
|
|
120
|
+
for df_chunk in tqdm(df_list):
|
|
121
|
+
try:
|
|
122
|
+
output_filenames.append(self.__execute([df_chunk, tmp_dir]))
|
|
123
|
+
except Exception as e:
|
|
124
|
+
logger.error(f"Error in executing ESM2 model: {e}")
|
|
125
|
+
continue
|
|
126
|
+
df = pd.DataFrame()
|
|
127
|
+
print(output_filenames)
|
|
128
|
+
for sub_df in output_filenames:
|
|
129
|
+
df = pd.concat([df, sub_df])
|
|
130
|
+
return df
|
|
131
|
+
|
|
132
|
+
else:
|
|
133
|
+
df = self.__execute([df, tmp_dir])
|
|
134
|
+
return df
|
|
@@ -24,22 +24,26 @@ class ReactionDist(Step):
|
|
|
24
24
|
self.num_threads = num_threads
|
|
25
25
|
|
|
26
26
|
def __execute(self, data: list) -> np.array:
|
|
27
|
-
reaction_df = data
|
|
28
|
-
tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
|
|
29
|
-
|
|
30
|
-
rxn = rdChemReactions.ReactionFromSmarts(self.smiles_string)
|
|
31
|
-
rxn_fp = rdChemReactions.CreateStructuralFingerprintForReaction(rxn)
|
|
27
|
+
reaction_df = data
|
|
32
28
|
rows = []
|
|
33
29
|
# compare all fp pairwise without duplicates
|
|
34
30
|
for smile_id, smiles in tqdm(reaction_df[[self.id_column_name, self.smiles_column_name]].values): # -1 so the last fp will not be used
|
|
35
31
|
mol_ = rdChemReactions.ReactionFromSmarts(smiles)
|
|
36
|
-
|
|
37
|
-
|
|
32
|
+
fp_params = rdChemReactions.ReactionFingerprintParams()
|
|
33
|
+
# Note: if you don't pass , ReactionFingerPrintParams=fp_params you get different results
|
|
34
|
+
# i.e. reactions that don't appear to be the same are reported as similar of 1.0
|
|
35
|
+
# https://github.com/rdkit/rdkit/discussions/5263
|
|
36
|
+
rxn = rdChemReactions.ReactionFromSmarts(self.smiles_string)
|
|
37
|
+
|
|
38
|
+
rxn_fp = rdChemReactions.CreateStructuralFingerprintForReaction(rxn, ReactionFingerPrintParams=fp_params)
|
|
39
|
+
fps = rdChemReactions.CreateStructuralFingerprintForReaction(mol_, ReactionFingerPrintParams=fp_params)
|
|
40
|
+
rows.append([smile_id,
|
|
41
|
+
self.smiles_string,
|
|
38
42
|
smiles,
|
|
39
43
|
DataStructs.TanimotoSimilarity(fps, rxn_fp),
|
|
40
44
|
DataStructs.RusselSimilarity(fps, rxn_fp),
|
|
41
45
|
DataStructs.CosineSimilarity(fps, rxn_fp)])
|
|
42
|
-
distance_df = pd.DataFrame(rows, columns=[self.id_column_name, 'TargetSmiles', 'TanimotoSimilarity', 'RusselSimilarity', 'CosineSimilarity'])
|
|
46
|
+
distance_df = pd.DataFrame(rows, columns=[self.id_column_name, 'QuerySmiles', 'TargetSmiles', 'TanimotoSimilarity', 'RusselSimilarity', 'CosineSimilarity'])
|
|
43
47
|
return distance_df
|
|
44
48
|
|
|
45
49
|
def execute(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -36,8 +36,9 @@ class Step():
|
|
|
36
36
|
""" Execute some shit """
|
|
37
37
|
return df
|
|
38
38
|
|
|
39
|
-
def run(self, cmd: list)
|
|
40
|
-
""" Run a command """
|
|
39
|
+
def run(self, cmd: list):
|
|
40
|
+
""" Run a command """
|
|
41
|
+
result = None
|
|
41
42
|
start = timeit.default_timer()
|
|
42
43
|
u.dp(['Running command', ' '.join([str(c) for c in cmd])])
|
|
43
44
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
|
@@ -48,8 +49,9 @@ class Step():
|
|
|
48
49
|
logger.error(result.stderr)
|
|
49
50
|
logger.info(result.stdout)
|
|
50
51
|
u.dp(['Time for command to run (min): ', (timeit.default_timer() - start)/60])
|
|
52
|
+
return result
|
|
51
53
|
|
|
52
|
-
def __rshift__(self, other: Step)
|
|
54
|
+
def __rshift__(self, other: Step) :
|
|
53
55
|
return Pipeline(self, other)
|
|
54
56
|
|
|
55
57
|
def __rlshift__(self, other: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: enzymetk
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.3
|
|
4
4
|
Home-page: https://github.com/arianemora/enzyme-tk/
|
|
5
5
|
Author: Ariane Mora
|
|
6
6
|
Author-email: ariane.n.mora@gmail.com
|
|
@@ -37,6 +37,7 @@ Dynamic: description-content-type
|
|
|
37
37
|
Dynamic: home-page
|
|
38
38
|
Dynamic: keywords
|
|
39
39
|
Dynamic: license
|
|
40
|
+
Dynamic: license-file
|
|
40
41
|
Dynamic: project-url
|
|
41
42
|
Dynamic: requires-dist
|
|
42
43
|
Dynamic: requires-python
|
|
@@ -45,26 +46,36 @@ Dynamic: requires-python
|
|
|
45
46
|
|
|
46
47
|
Enzyme-tk is a collection of tools for enzyme engineering, set up as interoperable modules that act on dataframes. These modules are designed to be imported into pipelines for specific functions. For this reason, `steps`, as each module is called (e.g. finding similar proteins with `BLAST` would be considered a step), are designed to be as light as possible. An example of a pipeline is the [annotate-e](https://github.com/ArianeMora/annotate-e) pipeline, which annotates a FASTA file with an ensemble of methods (each is designated as an Enzyme-tk step).
|
|
47
48
|
|
|
49
|
+
|
|
50
|
+
**If you have any issues installing, let me know - this has been tested only on Linux/Ubuntu. Please post an issue!**
|
|
51
|
+
|
|
48
52
|
## Installation
|
|
49
53
|
|
|
54
|
+
## Install base package to import modules
|
|
55
|
+
|
|
50
56
|
```bash
|
|
51
|
-
|
|
57
|
+
pip install enzymetk
|
|
52
58
|
```
|
|
53
59
|
|
|
54
|
-
|
|
60
|
+
### Install only the specific requirements you need (recommended)
|
|
55
61
|
|
|
62
|
+
For this clone the repo and then install the requirements for the specific modules you use
|
|
56
63
|
```bash
|
|
57
64
|
git clone git@github.com:ArianeMora/enzyme-tk.git
|
|
58
|
-
|
|
59
|
-
|
|
65
|
+
cd enzymetk/conda_envs/ # would recommend looking at these
|
|
66
|
+
# e.g. to install all from within that folder you would do
|
|
67
|
+
source install_all.sh
|
|
60
68
|
```
|
|
61
69
|
|
|
62
70
|
## Usage
|
|
63
71
|
|
|
64
72
|
If you have any issues at all just email me using my caltech email: `amora at caltech . edu`
|
|
65
73
|
|
|
74
|
+
This is a work in progress! Some tools (e.g. proteInfer and CLEAN) require extra data, such as model weights, to be downloaded before they can run. I'm working on integrating these at the moment; contact me if you need this!
|
|
75
|
+
|
|
66
76
|
Here are some of the tools that have been implemented to be chained together as a pipeline:
|
|
67
77
|
|
|
78
|
+
[boltz2](https://github.com/jwohlwend/boltz)
|
|
68
79
|
[mmseqs2](https://github.com/soedinglab/mmseqs2)
|
|
69
80
|
[foldseek](https://github.com/steineggerlab/foldseek)
|
|
70
81
|
[diamond](https://github.com/bbuchfink/diamond)
|
|
@@ -83,6 +94,7 @@ Here are some of the tools that have been implemented to be chained together as
|
|
|
83
94
|
[fasttree](https://morgannprice.github.io/fasttree/)
|
|
84
95
|
[Porechop](https://github.com/rrwick/Porechop)
|
|
85
96
|
[prokka](https://github.com/tseemann/prokka)
|
|
97
|
+
|
|
86
98
|
## Things to note
|
|
87
99
|
|
|
88
100
|
All the tools use the conda env of `enzymetk` by default.
|
|
@@ -114,6 +126,8 @@ The steps are the main building blocks of the pipeline. They are responsible for
|
|
|
114
126
|
|
|
115
127
|
BLAST is a tool for searching a database of sequences for similar sequences. Here you can either pass a database that you have already created, or pass the sequences as part of your dataframe together with the label column (this needs to have two values: `reference` and `query`). `reference` refers to sequences that you want to search against and `query` refers to sequences that you want to search for.
|
|
116
128
|
|
|
129
|
+
Note you need to have installed the BLAST environment.
|
|
130
|
+
|
|
117
131
|
```python
|
|
118
132
|
id_col = 'Entry'
|
|
119
133
|
seq_col = 'Sequence'
|
|
@@ -142,6 +156,34 @@ df = pd.DataFrame(rows, columns=[id_col, seq_col])
|
|
|
142
156
|
print(df)
|
|
143
157
|
df << (ActiveSitePred(id_col, seq_col, squidly_dir, num_threads) >> Save('tmp/squidly_as_pred.pkl'))
|
|
144
158
|
|
|
159
|
+
```
|
|
160
|
+
### Boltz2
|
|
161
|
+
|
|
162
|
+
Boltz2 is a model for predicting structures. Note that you need `docko` installed, as this step runs Boltz2 via that package.
|
|
163
|
+
|
|
164
|
+
Below is an example that runs Boltz with 4 threads and uses a cofactor (an intermediate in this case). Set the cofactor to `None` for a single-substrate version.
|
|
165
|
+
```
|
|
166
|
+
import sys
|
|
167
|
+
from enzymetk.dock_boltz_step import Boltz
|
|
168
|
+
from enzymetk.save_step import Save
|
|
169
|
+
import pandas as pd
|
|
170
|
+
import os
|
|
171
|
+
os.environ['MKL_THREADING_LAYER'] = 'GNU'
|
|
172
|
+
|
|
173
|
+
output_dir = 'tmp/'
|
|
174
|
+
num_threads = 4
|
|
175
|
+
id_col = 'Entry'
|
|
176
|
+
seq_col = 'Sequence'
|
|
177
|
+
substrate_col = 'Substrate'
|
|
178
|
+
intermediate_col = 'Intermediate'
|
|
179
|
+
|
|
180
|
+
rows = [['P0DP23_boltz_8999', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]'],
|
|
181
|
+
['P0DP24_boltz_p1', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]'],
|
|
182
|
+
['P0DP23_boltz_p2', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]'],
|
|
183
|
+
['P0DP24_boltz_p3', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]'],
|
|
184
|
+
['P0DP24_boltz_p4', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]']]
|
|
185
|
+
df = pd.DataFrame(rows, columns=[id_col, seq_col, substrate_col, intermediate_col])
|
|
186
|
+
df << (Boltz(id_col, seq_col, substrate_col, intermediate_col, f'{output_dir}', num_threads) >> Save(f'{output_dir}test.pkl'))
|
|
145
187
|
```
|
|
146
188
|
|
|
147
189
|
### Chai
|
|
@@ -169,8 +211,8 @@ df << (Chai(id_col, seq_col, substrate_col, f'{output_dir}', num_threads) >> Sav
|
|
|
169
211
|
ChemBERTa2 encodes reactions and SMILES strings into a vector space. Note this requires the base environment, i.e. `enzymetk` conda env.
|
|
170
212
|
|
|
171
213
|
```python
|
|
172
|
-
from
|
|
173
|
-
from
|
|
214
|
+
from enzymetk.embedchem_chemberta_step import ChemBERT
|
|
215
|
+
from enzymetk.save_step import Save
|
|
174
216
|
|
|
175
217
|
output_dir = 'tmp/'
|
|
176
218
|
num_threads = 1
|
|
@@ -180,7 +222,7 @@ substrate_col = 'Substrate'
|
|
|
180
222
|
rows = [['P0DP23', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC'],
|
|
181
223
|
['P0DP24', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC']]
|
|
182
224
|
df = pd.DataFrame(rows, columns=[id_col, seq_col, substrate_col])
|
|
183
|
-
df << (ChemBERT(id_col, substrate_col, num_threads) >> Save(f'{output_dir}chemberta.pkl'))
|
|
225
|
+
new_df = (df << (ChemBERT(id_col, substrate_col, num_threads) >> Save(f'{output_dir}chemberta.pkl')))
|
|
184
226
|
```
|
|
185
227
|
|
|
186
228
|
### CLEAN
|
|
@@ -206,11 +248,11 @@ df << (CLEAN(id_col, seq_col, clean_dir, num_threads=num_threads) >> Save(f'clea
|
|
|
206
248
|
```
|
|
207
249
|
### ClustalOmega
|
|
208
250
|
|
|
209
|
-
ClustalOmega is a tool for aligning a set of sequences. This gets installed to the system (expecting a linux machine) and added to the bash path.
|
|
251
|
+
ClustalOmega is a tool for aligning a set of sequences. This gets installed to the system (expecting a Linux machine) and added to the bash path. You need to have installed it first (check out the `conda_envs` directory in enzymetk).
|
|
210
252
|
|
|
211
253
|
```python
|
|
212
|
-
from
|
|
213
|
-
from
|
|
254
|
+
from enzymetk.generate_msa_step import ClustalOmega
|
|
255
|
+
from enzymetk.save_step import Save
|
|
214
256
|
import pandas as pd
|
|
215
257
|
|
|
216
258
|
id_col = 'Entry'
|
|
@@ -230,8 +272,8 @@ df << (ClustalOmega(id_col, seq_col) >> Save('tmp/clustalomega_test.pkl'))
|
|
|
230
272
|
CREEP is a tool for predicting the EC number of a reaction. At the moment it only supports reactions to EC however we are extending this to other modalities.
|
|
231
273
|
|
|
232
274
|
```python
|
|
233
|
-
from
|
|
234
|
-
from
|
|
275
|
+
from enzymetk.annotateEC_CREEP_step import CREEP
|
|
276
|
+
from enzymetk.save_step import Save
|
|
235
277
|
import pandas as pd
|
|
236
278
|
|
|
237
279
|
# CREEP expects you to have downloaded the data from the zotero page and put it in the data/CREEP folder
|
|
@@ -252,8 +294,8 @@ df << (CREEP(id_col, reaction_col, CREEP_cache_dir='/disk1/share/software/CREEP/
|
|
|
252
294
|
EmbedESM is a tool for embedding a set of sequences using ESM2.
|
|
253
295
|
|
|
254
296
|
```python
|
|
255
|
-
from
|
|
256
|
-
from
|
|
297
|
+
from enzymetk.embedprotein_esm_step import EmbedESM
|
|
298
|
+
from enzymetk.save_step import Save
|
|
257
299
|
import pandas as pd
|
|
258
300
|
|
|
259
301
|
id_col = 'Entry'
|
|
@@ -280,8 +322,8 @@ If you pass a database, you need to pass the path to the database.
|
|
|
280
322
|
The columns expect a path to a pdb file i.e. the output from the `Chai` step.
|
|
281
323
|
|
|
282
324
|
```python
|
|
283
|
-
from
|
|
284
|
-
from
|
|
325
|
+
from enzymetk.similarity_foldseek_step import FoldSeek
|
|
326
|
+
from enzymetk.save_step import Save
|
|
285
327
|
import pandas as pd
|
|
286
328
|
|
|
287
329
|
# id_col: str, seq_col: str, proteinfer_dir: str,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|