enzymetk 0.0.1__tar.gz → 0.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {enzymetk-0.0.1 → enzymetk-0.0.2}/PKG-INFO +23 -17
- {enzymetk-0.0.1 → enzymetk-0.0.2}/README.md +22 -16
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/__init__.py +1 -1
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/predict_catalyticsite_run.py +1 -1
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/sequence_search_blast.py +33 -4
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/similarity_foldseek_step.py +27 -6
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk.egg-info/PKG-INFO +23 -17
- {enzymetk-0.0.1 → enzymetk-0.0.2}/LICENSE +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/annotateEC_CLEAN_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/annotateEC_CREEP_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/annotateEC_proteinfer_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/dock_chai_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/dock_vina_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/embedchem_chemberta_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/embedchem_rxnfp_run.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/embedchem_rxnfp_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/embedchem_selformer_run.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/embedchem_selformer_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/embedchem_unimol_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/embedprotein_esm_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/esm-extract.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/filter_sequence_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/filter_structure_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/generate_msa_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/generate_oligopool_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/generate_tree_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/inpaint_ligandMPNN_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/main.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/metagenomics_porechop_trim_reads_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/metagenomics_prokka_annotate_genes.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/pipeline.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/predict_activity_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/predict_catalyticsite_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/reducedim_pca_run.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/reducedim_vae_run.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/reducedim_vae_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/save_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/similarity_mmseqs_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/similarity_reaction_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/similarity_substrate_step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/step.py +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk.egg-info/SOURCES.txt +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk.egg-info/dependency_links.txt +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk.egg-info/entry_points.txt +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk.egg-info/requires.txt +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk.egg-info/top_level.txt +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/setup.cfg +0 -0
- {enzymetk-0.0.1 → enzymetk-0.0.2}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: enzymetk
|
|
3
|
-
Version: 0.0.1
|
|
3
|
+
Version: 0.0.2
|
|
4
4
|
Home-page: https://github.com/arianemora/enzyme-tk/
|
|
5
5
|
Author: Ariane Mora
|
|
6
6
|
Author-email: ariane.n.mora@gmail.com
|
|
@@ -47,22 +47,28 @@ Enzyme-tk is a collection of tools for enzyme engineering, setup as interoperabl
|
|
|
47
47
|
|
|
48
48
|
## Installation
|
|
49
49
|
|
|
50
|
+
## Install base package to import modules
|
|
51
|
+
|
|
50
52
|
```bash
|
|
51
|
-
|
|
53
|
+
pip install enzymetk
|
|
52
54
|
```
|
|
53
55
|
|
|
54
|
-
|
|
56
|
+
### Install only the specific requirements you need (recommended)
|
|
55
57
|
|
|
58
|
+
For this, clone the repo and then install the requirements for the specific modules you use.
|
|
56
59
|
```bash
|
|
57
60
|
git clone git@github.com:ArianeMora/enzyme-tk.git
|
|
58
|
-
|
|
59
|
-
|
|
61
|
+
cd enzymetk/conda_envs/ # would recommend looking at these
|
|
62
|
+
# e.g. to install all from within that folder you would do
|
|
63
|
+
source install_all.sh
|
|
60
64
|
```
|
|
61
65
|
|
|
62
66
|
## Usage
|
|
63
67
|
|
|
64
68
|
If you have any issues at all just email me using my caltech email: `amora at caltech . edu`
|
|
65
69
|
|
|
70
|
+
This is a work in progress! Some tools (e.g. proteInfer and CLEAN) require extra data to be downloaded in order to run (like model weights). I'm working on integrating these at the moment, so get in touch if you need this!
|
|
71
|
+
|
|
66
72
|
Here are some of the tools that have been implemented to be chained together as a pipeline:
|
|
67
73
|
|
|
68
74
|
[mmseqs2](https://github.com/soedinglab/mmseqs2)
|
|
@@ -169,8 +175,8 @@ df << (Chai(id_col, seq_col, substrate_col, f'{output_dir}', num_threads) >> Sav
|
|
|
169
175
|
ChemBERTa2 encodes reactions and SMILES strings into a vector space. Note this requires the base environment, i.e. `enzymetk` conda env.
|
|
170
176
|
|
|
171
177
|
```python
|
|
172
|
-
from
|
|
173
|
-
from
|
|
178
|
+
from enzymetk.embedchem_chemberta_step import ChemBERT
|
|
179
|
+
from enzymetk.save_step import Save
|
|
174
180
|
|
|
175
181
|
output_dir = 'tmp/'
|
|
176
182
|
num_threads = 1
|
|
@@ -180,7 +186,7 @@ substrate_col = 'Substrate'
|
|
|
180
186
|
rows = [['P0DP23', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC'],
|
|
181
187
|
['P0DP24', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC']]
|
|
182
188
|
df = pd.DataFrame(rows, columns=[id_col, seq_col, substrate_col])
|
|
183
|
-
df << (ChemBERT(id_col, substrate_col, num_threads) >> Save(f'{output_dir}chemberta.pkl'))
|
|
189
|
+
new_df = (df << (ChemBERT(id_col, substrate_col, num_threads) >> Save(f'{output_dir}chemberta.pkl')))
|
|
184
190
|
```
|
|
185
191
|
|
|
186
192
|
### CLEAN
|
|
@@ -206,11 +212,11 @@ df << (CLEAN(id_col, seq_col, clean_dir, num_threads=num_threads) >> Save(f'clea
|
|
|
206
212
|
```
|
|
207
213
|
### ClustalOmega
|
|
208
214
|
|
|
209
|
-
ClustalOmega is a tool for aligning a set of sequences. This gets installed to the system (expecting a linux machine) and added to the bash path.
|
|
215
|
+
ClustalOmega is a tool for aligning a set of sequences. This gets installed to the system (expecting a linux machine) and added to the bash path. You need to have installed it first (check out the `conda_envs` directory in enzymetk.)
|
|
210
216
|
|
|
211
217
|
```python
|
|
212
|
-
from
|
|
213
|
-
from
|
|
218
|
+
from enzymetk.generate_msa_step import ClustalOmega
|
|
219
|
+
from enzymetk.save_step import Save
|
|
214
220
|
import pandas as pd
|
|
215
221
|
|
|
216
222
|
id_col = 'Entry'
|
|
@@ -230,8 +236,8 @@ df << (ClustalOmega(id_col, seq_col) >> Save('tmp/clustalomega_test.pkl'))
|
|
|
230
236
|
CREEP is a tool for predicting the EC number of a reaction. At the moment it only supports reactions to EC however we are extending this to other modalities.
|
|
231
237
|
|
|
232
238
|
```python
|
|
233
|
-
from
|
|
234
|
-
from
|
|
239
|
+
from enzymetk.annotateEC_CREEP_step import CREEP
|
|
240
|
+
from enzymetk.save_step import Save
|
|
235
241
|
import pandas as pd
|
|
236
242
|
|
|
237
243
|
# CREEP expects you to have downloaded the data from the zotero page and put it in the data/CREEP folder
|
|
@@ -252,8 +258,8 @@ df << (CREEP(id_col, reaction_col, CREEP_cache_dir='/disk1/share/software/CREEP/
|
|
|
252
258
|
EmbedESM is a tool for embedding a set of sequences using ESM2.
|
|
253
259
|
|
|
254
260
|
```python
|
|
255
|
-
from
|
|
256
|
-
from
|
|
261
|
+
from enzymetk.embedprotein_esm_step import EmbedESM
|
|
262
|
+
from enzymetk.save_step import Save
|
|
257
263
|
import pandas as pd
|
|
258
264
|
|
|
259
265
|
id_col = 'Entry'
|
|
@@ -280,8 +286,8 @@ If you pass a database, you need to pass the path to the database.
|
|
|
280
286
|
The columns expect a path to a pdb file i.e. the output from the `Chai` step.
|
|
281
287
|
|
|
282
288
|
```python
|
|
283
|
-
from
|
|
284
|
-
from
|
|
289
|
+
from enzymetk.similarity_foldseek_step import FoldSeek
|
|
290
|
+
from enzymetk.save_step import Save
|
|
285
291
|
import pandas as pd
|
|
286
292
|
|
|
287
293
|
# id_col: str, seq_col: str, proteinfer_dir: str,
|
|
@@ -4,22 +4,28 @@ Enzyme-tk is a collection of tools for enzyme engineering, setup as interoperabl
|
|
|
4
4
|
|
|
5
5
|
## Installation
|
|
6
6
|
|
|
7
|
+
## Install base package to import modules
|
|
8
|
+
|
|
7
9
|
```bash
|
|
8
|
-
|
|
10
|
+
pip install enzymetk
|
|
9
11
|
```
|
|
10
12
|
|
|
11
|
-
|
|
13
|
+
### Install only the specific requirements you need (recommended)
|
|
12
14
|
|
|
15
|
+
For this, clone the repo and then install the requirements for the specific modules you use.
|
|
13
16
|
```bash
|
|
14
17
|
git clone git@github.com:ArianeMora/enzyme-tk.git
|
|
15
|
-
|
|
16
|
-
|
|
18
|
+
cd enzymetk/conda_envs/ # would recommend looking at these
|
|
19
|
+
# e.g. to install all from within that folder you would do
|
|
20
|
+
source install_all.sh
|
|
17
21
|
```
|
|
18
22
|
|
|
19
23
|
## Usage
|
|
20
24
|
|
|
21
25
|
If you have any issues at all just email me using my caltech email: `amora at caltech . edu`
|
|
22
26
|
|
|
27
|
+
This is a work in progress! Some tools (e.g. proteInfer and CLEAN) require extra data to be downloaded in order to run (like model weights). I'm working on integrating these at the moment, so get in touch if you need this!
|
|
28
|
+
|
|
23
29
|
Here are some of the tools that have been implemented to be chained together as a pipeline:
|
|
24
30
|
|
|
25
31
|
[mmseqs2](https://github.com/soedinglab/mmseqs2)
|
|
@@ -126,8 +132,8 @@ df << (Chai(id_col, seq_col, substrate_col, f'{output_dir}', num_threads) >> Sav
|
|
|
126
132
|
ChemBERTa2 encodes reactions and SMILES strings into a vector space. Note this requires the base environment, i.e. `enzymetk` conda env.
|
|
127
133
|
|
|
128
134
|
```python
|
|
129
|
-
from
|
|
130
|
-
from
|
|
135
|
+
from enzymetk.embedchem_chemberta_step import ChemBERT
|
|
136
|
+
from enzymetk.save_step import Save
|
|
131
137
|
|
|
132
138
|
output_dir = 'tmp/'
|
|
133
139
|
num_threads = 1
|
|
@@ -137,7 +143,7 @@ substrate_col = 'Substrate'
|
|
|
137
143
|
rows = [['P0DP23', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC'],
|
|
138
144
|
['P0DP24', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC']]
|
|
139
145
|
df = pd.DataFrame(rows, columns=[id_col, seq_col, substrate_col])
|
|
140
|
-
df << (ChemBERT(id_col, substrate_col, num_threads) >> Save(f'{output_dir}chemberta.pkl'))
|
|
146
|
+
new_df = (df << (ChemBERT(id_col, substrate_col, num_threads) >> Save(f'{output_dir}chemberta.pkl')))
|
|
141
147
|
```
|
|
142
148
|
|
|
143
149
|
### CLEAN
|
|
@@ -163,11 +169,11 @@ df << (CLEAN(id_col, seq_col, clean_dir, num_threads=num_threads) >> Save(f'clea
|
|
|
163
169
|
```
|
|
164
170
|
### ClustalOmega
|
|
165
171
|
|
|
166
|
-
ClustalOmega is a tool for aligning a set of sequences. This gets installed to the system (expecting a linux machine) and added to the bash path.
|
|
172
|
+
ClustalOmega is a tool for aligning a set of sequences. This gets installed to the system (expecting a linux machine) and added to the bash path. You need to have installed it first (check out the `conda_envs` directory in enzymetk.)
|
|
167
173
|
|
|
168
174
|
```python
|
|
169
|
-
from
|
|
170
|
-
from
|
|
175
|
+
from enzymetk.generate_msa_step import ClustalOmega
|
|
176
|
+
from enzymetk.save_step import Save
|
|
171
177
|
import pandas as pd
|
|
172
178
|
|
|
173
179
|
id_col = 'Entry'
|
|
@@ -187,8 +193,8 @@ df << (ClustalOmega(id_col, seq_col) >> Save('tmp/clustalomega_test.pkl'))
|
|
|
187
193
|
CREEP is a tool for predicting the EC number of a reaction. At the moment it only supports reactions to EC however we are extending this to other modalities.
|
|
188
194
|
|
|
189
195
|
```python
|
|
190
|
-
from
|
|
191
|
-
from
|
|
196
|
+
from enzymetk.annotateEC_CREEP_step import CREEP
|
|
197
|
+
from enzymetk.save_step import Save
|
|
192
198
|
import pandas as pd
|
|
193
199
|
|
|
194
200
|
# CREEP expects you to have downloaded the data from the zotero page and put it in the data/CREEP folder
|
|
@@ -209,8 +215,8 @@ df << (CREEP(id_col, reaction_col, CREEP_cache_dir='/disk1/share/software/CREEP/
|
|
|
209
215
|
EmbedESM is a tool for embedding a set of sequences using ESM2.
|
|
210
216
|
|
|
211
217
|
```python
|
|
212
|
-
from
|
|
213
|
-
from
|
|
218
|
+
from enzymetk.embedprotein_esm_step import EmbedESM
|
|
219
|
+
from enzymetk.save_step import Save
|
|
214
220
|
import pandas as pd
|
|
215
221
|
|
|
216
222
|
id_col = 'Entry'
|
|
@@ -237,8 +243,8 @@ If you pass a database, you need to pass the path to the database.
|
|
|
237
243
|
The columns expect a path to a pdb file i.e. the output from the `Chai` step.
|
|
238
244
|
|
|
239
245
|
```python
|
|
240
|
-
from
|
|
241
|
-
from
|
|
246
|
+
from enzymetk.similarity_foldseek_step import FoldSeek
|
|
247
|
+
from enzymetk.save_step import Save
|
|
242
248
|
import pandas as pd
|
|
243
249
|
|
|
244
250
|
# id_col: str, seq_col: str, proteinfer_dir: str,
|
|
@@ -22,7 +22,7 @@ Date: March 2025
|
|
|
22
22
|
__title__ = 'enzymetk'
|
|
23
23
|
__description__ = 'Toolkit for enzymes and what not'
|
|
24
24
|
__url__ = 'https://github.com/arianemora/enzyme-tk/'
|
|
25
|
-
__version__ = '0.0.
|
|
25
|
+
__version__ = '0.0.2'
|
|
26
26
|
__author__ = 'Ariane Mora'
|
|
27
27
|
__author_email__ = 'ariane.n.mora@gmail.com'
|
|
28
28
|
__license__ = 'GPL3'
|
|
@@ -11,7 +11,7 @@ def run_as_inference(output_dir, fasta_file, squidly_dir, toks_per_batch, as_thr
|
|
|
11
11
|
elif esm2_model == "esm2_t48_15B_UR50D":
|
|
12
12
|
cr_model_as = cr_model_as or f"{squidly_dir}Squidly_CL_15B.pt"
|
|
13
13
|
lstm_model_as = lstm_model_as or f"{squidly_dir}Squidly_LSTM_15B.pth"
|
|
14
|
-
as_threshold = 0.
|
|
14
|
+
as_threshold = 0.97
|
|
15
15
|
#esm2_model = "esm2_t48_15B_UR50D"
|
|
16
16
|
# python /scratch/project/squid/code_modular/SQUIDLY_run_model_LSTM.py ${FILE} ${ESM2_MODEL} ${CR_MODEL_AS}
|
|
17
17
|
# ${LSTM_MODEL_AS} ${OUT} --toks_per_batch ${TOKS_PER_BATCH} --AS_threshold ${AS_THRESHOLD} --monitor
|
|
@@ -3,6 +3,7 @@ Step to run multiple sequence alignment with the Clustal Omega tool.
|
|
|
3
3
|
./clustalo -i /home/helen/degradeo/pipeline/helen_data/sequences_test_fasta.txt
|
|
4
4
|
"""
|
|
5
5
|
from enzymetk.step import Step
|
|
6
|
+
import logging
|
|
6
7
|
|
|
7
8
|
import pandas as pd
|
|
8
9
|
import numpy as np
|
|
@@ -12,10 +13,17 @@ import os
|
|
|
12
13
|
import subprocess
|
|
13
14
|
import random
|
|
14
15
|
import string
|
|
16
|
+
from tqdm import tqdm
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
logger.setLevel(logging.INFO)
|
|
21
|
+
|
|
15
22
|
|
|
16
23
|
class BLAST(Step):
|
|
17
24
|
|
|
18
|
-
def __init__(self, id_col: str, sequence_col: str, label_col=None, database=None,
|
|
25
|
+
def __init__(self, id_col: str, sequence_col: str, label_col=None, database=None,
|
|
26
|
+
mode='blastp', args=None, tmp_dir=None, num_threads=1):
|
|
19
27
|
self.id_col = id_col
|
|
20
28
|
self.seq_col = sequence_col
|
|
21
29
|
self.label_col = label_col # This is whether it is query or reference
|
|
@@ -23,6 +31,7 @@ class BLAST(Step):
|
|
|
23
31
|
self.database = database
|
|
24
32
|
self.args = args
|
|
25
33
|
self.tmp_dir = tmp_dir
|
|
34
|
+
self.num_threads = num_threads
|
|
26
35
|
if self.database is None and self.label_col is None:
|
|
27
36
|
raise ValueError('Database is not set, you can pass a database that you have already created see diamond for more information or the sequences \
|
|
28
37
|
as part of your dataframe and pass the label column (this needs to have two values: reference and query) reference \
|
|
@@ -74,7 +83,27 @@ class BLAST(Step):
|
|
|
74
83
|
return df
|
|
75
84
|
|
|
76
85
|
def execute(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
77
|
-
if self.tmp_dir is not None:
|
|
78
|
-
return self.__execute([df, self.tmp_dir])
|
|
79
86
|
with TemporaryDirectory() as tmp_dir:
|
|
80
|
-
|
|
87
|
+
tmp_dir = self.tmp_dir if self.tmp_dir is not None else tmp_dir
|
|
88
|
+
if self.num_threads > 1:
|
|
89
|
+
output_filenames = []
|
|
90
|
+
df_list = np.array_split(df, self.num_threads)
|
|
91
|
+
for df_chunk in tqdm(df_list):
|
|
92
|
+
try:
|
|
93
|
+
output_filenames.append(self.__execute([df_chunk, tmp_dir]))
|
|
94
|
+
except Exception as e:
|
|
95
|
+
logger.error(f"Error in executing ESM2 model: {e}")
|
|
96
|
+
continue
|
|
97
|
+
df = pd.DataFrame()
|
|
98
|
+
for sub_df in output_filenames:
|
|
99
|
+
df = pd.concat([df, sub_df])
|
|
100
|
+
return df
|
|
101
|
+
|
|
102
|
+
else:
|
|
103
|
+
return self.__execute([df, tmp_dir])
|
|
104
|
+
|
|
105
|
+
# def execute(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
106
|
+
# if self.tmp_dir is not None:
|
|
107
|
+
# return self.__execute([df, self.tmp_dir])
|
|
108
|
+
# with TemporaryDirectory() as tmp_dir:
|
|
109
|
+
# return self.__execute([df, tmp_dir])
|
|
@@ -7,13 +7,17 @@ repo and then copy it out of it.
|
|
|
7
7
|
"""
|
|
8
8
|
from enzymetk.step import Step
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
import logging
|
|
11
11
|
import pandas as pd
|
|
12
12
|
import numpy as np
|
|
13
13
|
from tempfile import TemporaryDirectory
|
|
14
14
|
import subprocess
|
|
15
15
|
import random
|
|
16
16
|
import string
|
|
17
|
+
from tqdm import tqdm
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
logger.setLevel(logging.INFO)
|
|
17
21
|
|
|
18
22
|
|
|
19
23
|
def process_clustering(filename, df, id_column_name):
|
|
@@ -34,13 +38,14 @@ def process_clustering(filename, df, id_column_name):
|
|
|
34
38
|
class FoldSeek(Step):
|
|
35
39
|
|
|
36
40
|
def __init__(self, id_column_name: str, query_column_name: str, reference_database: str, method='search', query_type='structures',
|
|
37
|
-
args=None, tmp_dir: str = None):
|
|
41
|
+
args=None, num_threads=1, tmp_dir: str = None):
|
|
38
42
|
self.query_column_name = query_column_name
|
|
39
43
|
self.id_column_name = id_column_name
|
|
40
44
|
self.reference_database = reference_database # pdb should be the default
|
|
41
45
|
self.tmp_dir = tmp_dir
|
|
42
46
|
self.method = method
|
|
43
47
|
self.args = args
|
|
48
|
+
self.num_threads = num_threads
|
|
44
49
|
self.query_type = query_type
|
|
45
50
|
if self.method not in ['search', 'cluster']:
|
|
46
51
|
print('Method must be in "search" or "cluster". Will likely fail... ')
|
|
@@ -107,8 +112,24 @@ class FoldSeek(Step):
|
|
|
107
112
|
return df
|
|
108
113
|
|
|
109
114
|
def execute(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
110
|
-
if self.tmp_dir is not None:
|
|
111
|
-
return self.__execute([df, self.tmp_dir])
|
|
112
115
|
with TemporaryDirectory() as tmp_dir:
|
|
113
|
-
|
|
114
|
-
|
|
116
|
+
tmp_dir = self.tmp_dir if self.tmp_dir is not None else tmp_dir
|
|
117
|
+
if self.num_threads > 1:
|
|
118
|
+
output_filenames = []
|
|
119
|
+
df_list = np.array_split(df, self.num_threads)
|
|
120
|
+
for df_chunk in tqdm(df_list):
|
|
121
|
+
try:
|
|
122
|
+
output_filenames.append(self.__execute([df_chunk, tmp_dir]))
|
|
123
|
+
except Exception as e:
|
|
124
|
+
logger.error(f"Error in executing ESM2 model: {e}")
|
|
125
|
+
continue
|
|
126
|
+
df = pd.DataFrame()
|
|
127
|
+
print(output_filenames)
|
|
128
|
+
for p in output_filenames:
|
|
129
|
+
sub_df = pd.read_pickle(p)
|
|
130
|
+
df = pd.concat([df, sub_df])
|
|
131
|
+
return df
|
|
132
|
+
|
|
133
|
+
else:
|
|
134
|
+
output_filename = self.__execute([df, tmp_dir])
|
|
135
|
+
return pd.read_pickle(output_filename)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: enzymetk
|
|
3
|
-
Version: 0.0.1
|
|
3
|
+
Version: 0.0.2
|
|
4
4
|
Home-page: https://github.com/arianemora/enzyme-tk/
|
|
5
5
|
Author: Ariane Mora
|
|
6
6
|
Author-email: ariane.n.mora@gmail.com
|
|
@@ -47,22 +47,28 @@ Enzyme-tk is a collection of tools for enzyme engineering, setup as interoperabl
|
|
|
47
47
|
|
|
48
48
|
## Installation
|
|
49
49
|
|
|
50
|
+
## Install base package to import modules
|
|
51
|
+
|
|
50
52
|
```bash
|
|
51
|
-
|
|
53
|
+
pip install enzymetk
|
|
52
54
|
```
|
|
53
55
|
|
|
54
|
-
|
|
56
|
+
### Install only the specific requirements you need (recommended)
|
|
55
57
|
|
|
58
|
+
For this, clone the repo and then install the requirements for the specific modules you use.
|
|
56
59
|
```bash
|
|
57
60
|
git clone git@github.com:ArianeMora/enzyme-tk.git
|
|
58
|
-
|
|
59
|
-
|
|
61
|
+
cd enzymetk/conda_envs/ # would recommend looking at these
|
|
62
|
+
# e.g. to install all from within that folder you would do
|
|
63
|
+
source install_all.sh
|
|
60
64
|
```
|
|
61
65
|
|
|
62
66
|
## Usage
|
|
63
67
|
|
|
64
68
|
If you have any issues at all just email me using my caltech email: `amora at caltech . edu`
|
|
65
69
|
|
|
70
|
+
This is a work in progress! Some tools (e.g. proteInfer and CLEAN) require extra data to be downloaded in order to run (like model weights). I'm working on integrating these at the moment, so get in touch if you need this!
|
|
71
|
+
|
|
66
72
|
Here are some of the tools that have been implemented to be chained together as a pipeline:
|
|
67
73
|
|
|
68
74
|
[mmseqs2](https://github.com/soedinglab/mmseqs2)
|
|
@@ -169,8 +175,8 @@ df << (Chai(id_col, seq_col, substrate_col, f'{output_dir}', num_threads) >> Sav
|
|
|
169
175
|
ChemBERTa2 encodes reactions and SMILES strings into a vector space. Note this requires the base environment, i.e. `enzymetk` conda env.
|
|
170
176
|
|
|
171
177
|
```python
|
|
172
|
-
from
|
|
173
|
-
from
|
|
178
|
+
from enzymetk.embedchem_chemberta_step import ChemBERT
|
|
179
|
+
from enzymetk.save_step import Save
|
|
174
180
|
|
|
175
181
|
output_dir = 'tmp/'
|
|
176
182
|
num_threads = 1
|
|
@@ -180,7 +186,7 @@ substrate_col = 'Substrate'
|
|
|
180
186
|
rows = [['P0DP23', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC'],
|
|
181
187
|
['P0DP24', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC']]
|
|
182
188
|
df = pd.DataFrame(rows, columns=[id_col, seq_col, substrate_col])
|
|
183
|
-
df << (ChemBERT(id_col, substrate_col, num_threads) >> Save(f'{output_dir}chemberta.pkl'))
|
|
189
|
+
new_df = (df << (ChemBERT(id_col, substrate_col, num_threads) >> Save(f'{output_dir}chemberta.pkl')))
|
|
184
190
|
```
|
|
185
191
|
|
|
186
192
|
### CLEAN
|
|
@@ -206,11 +212,11 @@ df << (CLEAN(id_col, seq_col, clean_dir, num_threads=num_threads) >> Save(f'clea
|
|
|
206
212
|
```
|
|
207
213
|
### ClustalOmega
|
|
208
214
|
|
|
209
|
-
ClustalOmega is a tool for aligning a set of sequences. This gets installed to the system (expecting a linux machine) and added to the bash path.
|
|
215
|
+
ClustalOmega is a tool for aligning a set of sequences. This gets installed to the system (expecting a linux machine) and added to the bash path. You need to have installed it first (check out the `conda_envs` directory in enzymetk.)
|
|
210
216
|
|
|
211
217
|
```python
|
|
212
|
-
from
|
|
213
|
-
from
|
|
218
|
+
from enzymetk.generate_msa_step import ClustalOmega
|
|
219
|
+
from enzymetk.save_step import Save
|
|
214
220
|
import pandas as pd
|
|
215
221
|
|
|
216
222
|
id_col = 'Entry'
|
|
@@ -230,8 +236,8 @@ df << (ClustalOmega(id_col, seq_col) >> Save('tmp/clustalomega_test.pkl'))
|
|
|
230
236
|
CREEP is a tool for predicting the EC number of a reaction. At the moment it only supports reactions to EC however we are extending this to other modalities.
|
|
231
237
|
|
|
232
238
|
```python
|
|
233
|
-
from
|
|
234
|
-
from
|
|
239
|
+
from enzymetk.annotateEC_CREEP_step import CREEP
|
|
240
|
+
from enzymetk.save_step import Save
|
|
235
241
|
import pandas as pd
|
|
236
242
|
|
|
237
243
|
# CREEP expects you to have downloaded the data from the zotero page and put it in the data/CREEP folder
|
|
@@ -252,8 +258,8 @@ df << (CREEP(id_col, reaction_col, CREEP_cache_dir='/disk1/share/software/CREEP/
|
|
|
252
258
|
EmbedESM is a tool for embedding a set of sequences using ESM2.
|
|
253
259
|
|
|
254
260
|
```python
|
|
255
|
-
from
|
|
256
|
-
from
|
|
261
|
+
from enzymetk.embedprotein_esm_step import EmbedESM
|
|
262
|
+
from enzymetk.save_step import Save
|
|
257
263
|
import pandas as pd
|
|
258
264
|
|
|
259
265
|
id_col = 'Entry'
|
|
@@ -280,8 +286,8 @@ If you pass a database, you need to pass the path to the database.
|
|
|
280
286
|
The columns expect a path to a pdb file i.e. the output from the `Chai` step.
|
|
281
287
|
|
|
282
288
|
```python
|
|
283
|
-
from
|
|
284
|
-
from
|
|
289
|
+
from enzymetk.similarity_foldseek_step import FoldSeek
|
|
290
|
+
from enzymetk.save_step import Save
|
|
285
291
|
import pandas as pd
|
|
286
292
|
|
|
287
293
|
# id_col: str, seq_col: str, proteinfer_dir: str,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|