enzymetk 0.0.6__tar.gz → 0.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {enzymetk-0.0.6 → enzymetk-0.0.7}/PKG-INFO +106 -4
- {enzymetk-0.0.6 → enzymetk-0.0.7}/README.md +98 -1
- enzymetk-0.0.7/enzymetk/__init__.py +122 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/annotateEC_CREEP_step.py +10 -2
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/annotateEC_proteinfer_step.py +9 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/dock_boltz_step.py +30 -3
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/dock_chai_step.py +23 -2
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/dock_vina_step.py +23 -2
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/embedchem_chemberta_step.py +0 -1
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/embedchem_rxnfp_step.py +20 -2
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/embedchem_unimol_step.py +40 -14
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/embedprotein_esm3_step.py +6 -3
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/embedprotein_esm_step.py +101 -7
- enzymetk-0.0.7/enzymetk/main.py +251 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/predict_catalyticsite_step.py +9 -5
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/sequence_search_blast.py +31 -13
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/similarity_foldseek_step.py +0 -7
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/similarity_mmseqs_step.py +2 -1
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/similarity_reaction_step.py +13 -11
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/similarity_substrate_step.py +15 -11
- enzymetk-0.0.7/enzymetk/step.py +134 -0
- enzymetk-0.0.7/enzymetk/structure_search_foldseek.py +88 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk.egg-info/PKG-INFO +106 -4
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk.egg-info/SOURCES.txt +5 -7
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk.egg-info/requires.txt +4 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/setup.py +9 -5
- enzymetk-0.0.7/tests/test_embedprotein_esm_step.py +363 -0
- enzymetk-0.0.7/tests/test_esm2.py +18 -0
- enzymetk-0.0.7/tests/test_foldseek.py +136 -0
- enzymetk-0.0.6/enzymetk/__init__.py +0 -33
- enzymetk-0.0.6/enzymetk/esm-extract.py +0 -140
- enzymetk-0.0.6/enzymetk/main.py +0 -37
- enzymetk-0.0.6/enzymetk/predict_activity_step.py +0 -0
- enzymetk-0.0.6/enzymetk/predict_catalyticsite_run.py +0 -47
- enzymetk-0.0.6/enzymetk/reducedim_pca_run.py +0 -67
- enzymetk-0.0.6/enzymetk/reducedim_vae_run.py +0 -67
- enzymetk-0.0.6/enzymetk/reducedim_vae_step.py +0 -12
- enzymetk-0.0.6/enzymetk/step.py +0 -62
- {enzymetk-0.0.6 → enzymetk-0.0.7}/LICENSE +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/annotateEC_CLEAN_step.py +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/embedchem_rxnfp_run.py +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/embedchem_selformer_run.py +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/embedchem_selformer_step.py +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/filter_sequence_step.py +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/filter_structure_step.py +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/generate_msa_step.py +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/generate_oligopool_step.py +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/generate_tree_step.py +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/inpaint_ligandMPNN_step.py +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/metagenomics_porechop_trim_reads_step.py +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/metagenomics_prokka_annotate_genes.py +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/pipeline.py +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/save_step.py +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk.egg-info/dependency_links.txt +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk.egg-info/entry_points.txt +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk.egg-info/top_level.txt +0 -0
- {enzymetk-0.0.6 → enzymetk-0.0.7}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: enzymetk
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.7
|
|
4
4
|
Home-page: https://github.com/arianemora/enzyme-tk/
|
|
5
5
|
Author: Ariane Mora
|
|
6
6
|
Author-email: ariane.n.mora@gmail.com
|
|
@@ -13,17 +13,22 @@ Classifier: Intended Audience :: Science/Research
|
|
|
13
13
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
14
14
|
Classifier: Natural Language :: English
|
|
15
15
|
Classifier: Operating System :: OS Independent
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
18
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
18
|
-
Requires-Python: >=3.
|
|
19
|
+
Requires-Python: >=3.10
|
|
19
20
|
Description-Content-Type: text/markdown
|
|
20
21
|
License-File: LICENSE
|
|
21
22
|
Requires-Dist: scikit-learn
|
|
22
23
|
Requires-Dist: numpy
|
|
23
24
|
Requires-Dist: seaborn
|
|
24
25
|
Requires-Dist: sciutil
|
|
26
|
+
Requires-Dist: tqdm
|
|
25
27
|
Requires-Dist: pandas
|
|
26
28
|
Requires-Dist: biopython
|
|
29
|
+
Requires-Dist: transformers
|
|
30
|
+
Requires-Dist: torch
|
|
31
|
+
Requires-Dist: huggingface_hub
|
|
27
32
|
Dynamic: author
|
|
28
33
|
Dynamic: author-email
|
|
29
34
|
Dynamic: classifier
|
|
@@ -49,8 +54,91 @@ Enzyme-tk is a collection of tools for enzyme engineering, setup as interoperabl
|
|
|
49
54
|
## Install base package to import modules
|
|
50
55
|
|
|
51
56
|
```bash
|
|
57
|
+
conda create --name enzymetk python==3.12 -y
|
|
52
58
|
pip install enzymetk
|
|
59
|
+
# Install torch for your specific cuda version
|
|
60
|
+
pip install torch torchvision #--index-url https://download.pytorch.org/whl/cu130
|
|
53
61
|
```
|
|
62
|
+
## If you're on the bleeding edge (e.g. transformers 5 or newer) and want to use older models such as chemBERTa2, you may need to downgrade transformers:
|
|
63
|
+
```
|
|
64
|
+
pip uninstall transformers -y
|
|
65
|
+
pip install "transformers<5"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## For each module, run `install()` the first time you use it
|
|
69
|
+
This will install as a venv where possible and conda where the tools don't allow for venvs.
|
|
70
|
+
See specific tools for info.
|
|
71
|
+
```
|
|
72
|
+
bm = BLAST(id_col, seq_col, label_col)
|
|
73
|
+
bm.install() # by default will create a venv or if needed a conda env
|
|
74
|
+
```
|
|
75
|
+
Note: if you want to use your own environment, you can install the tool externally and override the default venv or conda env, e.g.:
|
|
76
|
+
```
|
|
77
|
+
bm = BLAST(id_col, seq_col, label_col)
|
|
78
|
+
bm.conda = 'blast_env' # an already installed env on your computer
|
|
79
|
+
bm.venv = None # so it knows to use conda i.e. forces it not to use venv
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Modules requiring conda
|
|
83
|
+
|
|
84
|
+
- CREEP [not tested again]
|
|
85
|
+
- CLEAN [not tested again]
|
|
86
|
+
- ProteInfer [not tested again]
|
|
87
|
+
|
|
88
|
+
## Modules able to run in venv
|
|
89
|
+
- BLAST [cpu, tested with both, see notebook]
|
|
90
|
+
- ChemBERTA [cpu, colab]
|
|
91
|
+
- Boltz
|
|
92
|
+
- Chai: conda install -c conda-forge pdbfixer
|
|
93
|
+
|
|
94
|
+
- esm2/3 [cpu, see notebook]
|
|
95
|
+
- foldseek [tested and works]
|
|
96
|
+
- ligandmpnn
|
|
97
|
+
- mmseqs [can get working...]
|
|
98
|
+
- msa []
|
|
99
|
+
- reaction_similarity [good, cpu]
|
|
100
|
+
- rxnfp [needs a specific Python version, so not easy in Colab]; requires conda, hence install with `enzymetk install rxnfp`
|
|
101
|
+
- substrate_similarity [good, cpu]
|
|
102
|
+
- tree
|
|
103
|
+
- unimol [good, cpu]
|
|
104
|
+
|
|
105
|
+
Docko git@github.com:ArianeMora/docko.git
|
|
106
|
+
ValueError: CCD component ALA not found!
|
|
107
|
+
boltz predict boltz.fasta --use_msa_server --cache ./mol
|
|
108
|
+
|
|
109
|
+
srun -p gpu --qos=normal --gres=gpu:1 --pty --mem=64G --time=000:30:00 bash
|
|
110
|
+
|
|
111
|
+
pipelines: reads --> poreChop --> Flye --> Prokka --> Squidly --> Foldseek --> Boltz --> Chai
|
|
112
|
+
pipelines: seqs --> BLAST --> Proteinfer --> Foldseek --> MMseqs --> ClustalOmega --> FastTree
|
|
113
|
+
pipelines: reactions --> rxnFP --> selformer --> uniMol --> chemBERTa2 --> RDkit reaction similarity
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
| Module | Name | Description | Colab ipynb|
|
|
117
|
+
|------------------------------|---------------|-----------------------------------------------------------------------------------|------------|
|
|
118
|
+
| Metagenomics | PoreChop | Used to filter adapters for nanopore sequences in metagenomics pipeline. | y |
|
|
119
|
+
| Metagenomics | Flye | Used to assemble the metagenomes. | ? |
|
|
120
|
+
| Metagenomics | Prokka | Annotation of genes within the genome. | ? |
|
|
121
|
+
| Function prediction | Proteinfer | Annotation of genes to function (GO or EC class) using ML. | 33 |
|
|
122
|
+
| Function prediction | CLEAN | Annotation of genes to EC class using ML. | 11 |
|
|
123
|
+
| Function prediction | CREEP | Annotation of genes to EC class using ML. | 13 |
|
|
124
|
+
| Function prediction | Func-e | Annotation of genes to reaction using ML. | This study. |
|
|
125
|
+
| Function prediction | Squidly | Annotation of catalytic residues using ML. | 36 |
|
|
126
|
+
| Embedding generation | ESM2 & 3 | Conversion of amino acid sequence to a numerical embedding using a PLM. | 46,47 |
|
|
127
|
+
| Embedding generation | RxnFP | Conversion of reaction smiles to a numerical embedding using a language model. | 48 |
|
|
128
|
+
| Embedding generation | Selformer | Conversion of reaction selfies to a numerical embedding using a language model. | 49 |
|
|
129
|
+
| Embedding generation | Uni-mol | Conversion of molecule smiles to a numerical embedding using a language model. | 50 |
|
|
130
|
+
| Embedding generation | ChemBERTa2 | Conversion of reaction smiles to a numerical embedding using a language model. | 51 |
|
|
131
|
+
| Docking | Chai | Diffusion based folding of a protein and ligand. | 42 |
|
|
132
|
+
| Docking | Boltz | Diffusion based folding of a protein and ligand. | 52 |
|
|
133
|
+
| Similarity | Diamond | Sequence similarity calculation using basic local alignment search. | 53 |
|
|
134
|
+
| Similarity | Foldseek | Fast structure similarity search. | 54 |
|
|
135
|
+
| Similarity | MMseqs | Fast sequence clustering. | 55 |
|
|
136
|
+
| Docking | StructureZyme | Alignment and calculation of structure metrics. | 56 |
|
|
137
|
+
| Oligo design | Oligopoolio | Calculation of oligo fragments for gene assembly. | This study. |
|
|
138
|
+
| Sequencing | LevSeq | Sequence verification of protein variants. | 34 |
|
|
139
|
+
| MSA generation | ClustalOmega | Creation of multiple sequence alignments (MSA). | 57 |
|
|
140
|
+
| Phylogenetic tree generation | FastTree | Creation of multiple phylogenetic trees. | 58 |
|
|
141
|
+
|
|
54
142
|
|
|
55
143
|
### Install only the specific requirements you need (recommended)
|
|
56
144
|
|
|
@@ -121,7 +209,11 @@ The steps are the main building blocks of the pipeline. They are responsible for
|
|
|
121
209
|
|
|
122
210
|
BLAST is a tool for searching a database of sequences for similar sequences. You can either pass a database that you have already created, or pass the sequences as part of your dataframe together with the label column. The label column needs to have two values, `reference` and `query`: `reference` marks the sequences that you want to search against, and `query` marks the sequences that you want to search for.
|
|
123
211
|
|
|
124
|
-
Note you
|
|
212
|
+
Note: there are two ways to install. The first is with a conda env from the command line:
|
|
213
|
+
|
|
214
|
+
```
|
|
215
|
+
enzymetk install_diamond
|
|
216
|
+
```
|
|
125
217
|
|
|
126
218
|
```python
|
|
127
219
|
id_col = 'Entry'
|
|
@@ -288,6 +380,16 @@ df << (CREEP(id_col, reaction_col, CREEP_cache_dir='/disk1/share/software/CREEP/
|
|
|
288
380
|
|
|
289
381
|
EmbedESM is a tool for embedding a set of sequences using ESM2.
|
|
290
382
|
|
|
383
|
+
Either install it in your own conda env with `pip install fair-esm`, or run:
|
|
384
|
+
|
|
385
|
+
```
|
|
386
|
+
id_col = 'Entry'
|
|
387
|
+
seq_col = 'Sequence'
|
|
388
|
+
label_col = 'ActiveSite'
|
|
389
|
+
esm = EmbedESM(id_col, seq_col, extraction_method='mean', tmp_dir='tmp', rep_num=36) # i.e. the representation number you want usually the last layer
|
|
390
|
+
esm.install() # And follow the instructions to activate the env
|
|
391
|
+
```
|
|
392
|
+
|
|
291
393
|
```python
|
|
292
394
|
from enzymetk.embedprotein_esm_step import EmbedESM
|
|
293
395
|
from enzymetk.save_step import Save
|
|
@@ -10,8 +10,91 @@ Enzyme-tk is a collection of tools for enzyme engineering, setup as interoperabl
|
|
|
10
10
|
## Install base package to import modules
|
|
11
11
|
|
|
12
12
|
```bash
|
|
13
|
+
conda create --name enzymetk python==3.12 -y
|
|
13
14
|
pip install enzymetk
|
|
15
|
+
# Install torch for your specific cuda version
|
|
16
|
+
pip install torch torchvision #--index-url https://download.pytorch.org/whl/cu130
|
|
14
17
|
```
|
|
18
|
+
## If you're on the bleeding edge (e.g. transformers 5 or newer) and want to use older models such as chemBERTa2, you may need to downgrade transformers:
|
|
19
|
+
```
|
|
20
|
+
pip uninstall transformers -y
|
|
21
|
+
pip install "transformers<5"
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## For each module, run `install()` the first time you use it
|
|
25
|
+
This will install as a venv where possible and conda where the tools don't allow for venvs.
|
|
26
|
+
See specific tools for info.
|
|
27
|
+
```
|
|
28
|
+
bm = BLAST(id_col, seq_col, label_col)
|
|
29
|
+
bm.install() # by default will create a venv or if needed a conda env
|
|
30
|
+
```
|
|
31
|
+
Note: if you want to use your own environment, you can install the tool externally and override the default venv or conda env, e.g.:
|
|
32
|
+
```
|
|
33
|
+
bm = BLAST(id_col, seq_col, label_col)
|
|
34
|
+
bm.conda = 'blast_env' # an already installed env on your computer
|
|
35
|
+
bm.venv = None # so it knows to use conda i.e. forces it not to use venv
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Modules requiring conda
|
|
39
|
+
|
|
40
|
+
- CREEP [not tested again]
|
|
41
|
+
- CLEAN [not tested again]
|
|
42
|
+
- ProteInfer [not tested again]
|
|
43
|
+
|
|
44
|
+
## Modules able to run in venv
|
|
45
|
+
- BLAST [cpu, tested with both, see notebook]
|
|
46
|
+
- ChemBERTA [cpu, colab]
|
|
47
|
+
- Boltz
|
|
48
|
+
- Chai: conda install -c conda-forge pdbfixer
|
|
49
|
+
|
|
50
|
+
- esm2/3 [cpu, see notebook]
|
|
51
|
+
- foldseek [tested and works]
|
|
52
|
+
- ligandmpnn
|
|
53
|
+
- mmseqs [can get working...]
|
|
54
|
+
- msa []
|
|
55
|
+
- reaction_similarity [good, cpu]
|
|
56
|
+
- rxnfp [needs a specific Python version, so not easy in Colab]; requires conda, hence install with `enzymetk install rxnfp`
|
|
57
|
+
- substrate_similarity [good, cpu]
|
|
58
|
+
- tree
|
|
59
|
+
- unimol [good, cpu]
|
|
60
|
+
|
|
61
|
+
Docko git@github.com:ArianeMora/docko.git
|
|
62
|
+
ValueError: CCD component ALA not found!
|
|
63
|
+
boltz predict boltz.fasta --use_msa_server --cache ./mol
|
|
64
|
+
|
|
65
|
+
srun -p gpu --qos=normal --gres=gpu:1 --pty --mem=64G --time=000:30:00 bash
|
|
66
|
+
|
|
67
|
+
pipelines: reads --> poreChop --> Flye --> Prokka --> Squidly --> Foldseek --> Boltz --> Chai
|
|
68
|
+
pipelines: seqs --> BLAST --> Proteinfer --> Foldseek --> MMseqs --> ClustalOmega --> FastTree
|
|
69
|
+
pipelines: reactions --> rxnFP --> selformer --> uniMol --> chemBERTa2 --> RDkit reaction similarity
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
| Module | Name | Description | Colab ipynb|
|
|
73
|
+
|------------------------------|---------------|-----------------------------------------------------------------------------------|------------|
|
|
74
|
+
| Metagenomics | PoreChop | Used to filter adapters for nanopore sequences in metagenomics pipeline. | y |
|
|
75
|
+
| Metagenomics | Flye | Used to assemble the metagenomes. | ? |
|
|
76
|
+
| Metagenomics | Prokka | Annotation of genes within the genome. | ? |
|
|
77
|
+
| Function prediction | Proteinfer | Annotation of genes to function (GO or EC class) using ML. | 33 |
|
|
78
|
+
| Function prediction | CLEAN | Annotation of genes to EC class using ML. | 11 |
|
|
79
|
+
| Function prediction | CREEP | Annotation of genes to EC class using ML. | 13 |
|
|
80
|
+
| Function prediction | Func-e | Annotation of genes to reaction using ML. | This study. |
|
|
81
|
+
| Function prediction | Squidly | Annotation of catalytic residues using ML. | 36 |
|
|
82
|
+
| Embedding generation | ESM2 & 3 | Conversion of amino acid sequence to a numerical embedding using a PLM. | 46,47 |
|
|
83
|
+
| Embedding generation | RxnFP | Conversion of reaction smiles to a numerical embedding using a language model. | 48 |
|
|
84
|
+
| Embedding generation | Selformer | Conversion of reaction selfies to a numerical embedding using a language model. | 49 |
|
|
85
|
+
| Embedding generation | Uni-mol | Conversion of molecule smiles to a numerical embedding using a language model. | 50 |
|
|
86
|
+
| Embedding generation | ChemBERTa2 | Conversion of reaction smiles to a numerical embedding using a language model. | 51 |
|
|
87
|
+
| Docking | Chai | Diffusion based folding of a protein and ligand. | 42 |
|
|
88
|
+
| Docking | Boltz | Diffusion based folding of a protein and ligand. | 52 |
|
|
89
|
+
| Similarity | Diamond | Sequence similarity calculation using basic local alignment search. | 53 |
|
|
90
|
+
| Similarity | Foldseek | Fast structure similarity search. | 54 |
|
|
91
|
+
| Similarity | MMseqs | Fast sequence clustering. | 55 |
|
|
92
|
+
| Docking | StructureZyme | Alignment and calculation of structure metrics. | 56 |
|
|
93
|
+
| Oligo design | Oligopoolio | Calculation of oligo fragments for gene assembly. | This study. |
|
|
94
|
+
| Sequencing | LevSeq | Sequence verification of protein variants. | 34 |
|
|
95
|
+
| MSA generation | ClustalOmega | Creation of multiple sequence alignments (MSA). | 57 |
|
|
96
|
+
| Phylogenetic tree generation | FastTree | Creation of multiple phylogenetic trees. | 58 |
|
|
97
|
+
|
|
15
98
|
|
|
16
99
|
### Install only the specific requirements you need (recommended)
|
|
17
100
|
|
|
@@ -82,7 +165,11 @@ The steps are the main building blocks of the pipeline. They are responsible for
|
|
|
82
165
|
|
|
83
166
|
BLAST is a tool for searching a database of sequences for similar sequences. You can either pass a database that you have already created, or pass the sequences as part of your dataframe together with the label column. The label column needs to have two values, `reference` and `query`: `reference` marks the sequences that you want to search against, and `query` marks the sequences that you want to search for.
|
|
84
167
|
|
|
85
|
-
Note you
|
|
168
|
+
Note: there are two ways to install. The first is with a conda env from the command line:
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
enzymetk install_diamond
|
|
172
|
+
```
|
|
86
173
|
|
|
87
174
|
```python
|
|
88
175
|
id_col = 'Entry'
|
|
@@ -249,6 +336,16 @@ df << (CREEP(id_col, reaction_col, CREEP_cache_dir='/disk1/share/software/CREEP/
|
|
|
249
336
|
|
|
250
337
|
EmbedESM is a tool for embedding a set of sequences using ESM2.
|
|
251
338
|
|
|
339
|
+
Either install it in your own conda env with `pip install fair-esm`, or run:
|
|
340
|
+
|
|
341
|
+
```
|
|
342
|
+
id_col = 'Entry'
|
|
343
|
+
seq_col = 'Sequence'
|
|
344
|
+
label_col = 'ActiveSite'
|
|
345
|
+
esm = EmbedESM(id_col, seq_col, extraction_method='mean', tmp_dir='tmp', rep_num=36) # i.e. the representation number you want usually the last layer
|
|
346
|
+
esm.install() # And follow the instructions to activate the env
|
|
347
|
+
```
|
|
348
|
+
|
|
252
349
|
```python
|
|
253
350
|
from enzymetk.embedprotein_esm_step import EmbedESM
|
|
254
351
|
from enzymetk.save_step import Save
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
###############################################################################
|
|
2
|
+
# #
|
|
3
|
+
# This program is free software: you can redistribute it and/or modify #
|
|
4
|
+
# it under the terms of the GNU General Public License as published by #
|
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or #
|
|
6
|
+
# (at your option) any later version. #
|
|
7
|
+
# #
|
|
8
|
+
# This program is distributed in the hope that it will be useful, #
|
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
|
|
11
|
+
# GNU General Public License for more details. #
|
|
12
|
+
# #
|
|
13
|
+
# You should have received a copy of the GNU General Public License #
|
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>. #
|
|
15
|
+
# #
|
|
16
|
+
###############################################################################
|
|
17
|
+
|
|
18
|
+
"""
|
|
19
|
+
Author: Ariane Mora
|
|
20
|
+
Date: March 2025
|
|
21
|
+
"""
|
|
22
|
+
__title__ = 'enzymetk'
|
|
23
|
+
__description__ = 'Toolkit for enzymes and what not'
|
|
24
|
+
__url__ = 'https://github.com/arianemora/enzyme-tk/'
|
|
25
|
+
__version__ = '0.0.7'
|
|
26
|
+
__author__ = 'Ariane Mora'
|
|
27
|
+
__author_email__ = 'ariane.n.mora@gmail.com'
|
|
28
|
+
__license__ = 'GPL3'
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Core classes
|
|
32
|
+
from enzymetk.step import Step, Pipeline
|
|
33
|
+
from enzymetk.save_step import Save
|
|
34
|
+
|
|
35
|
+
# EC Annotation
|
|
36
|
+
from enzymetk.annotateEC_CLEAN_step import CLEAN
|
|
37
|
+
from enzymetk.annotateEC_CREEP_step import CREEP
|
|
38
|
+
from enzymetk.annotateEC_proteinfer_step import ProteInfer
|
|
39
|
+
|
|
40
|
+
# Docking
|
|
41
|
+
from enzymetk.dock_boltz_step import Boltz
|
|
42
|
+
from enzymetk.dock_chai_step import Chai
|
|
43
|
+
from enzymetk.dock_vina_step import Vina
|
|
44
|
+
|
|
45
|
+
# Chemical Embeddings
|
|
46
|
+
from enzymetk.embedchem_chemberta_step import ChemBERT
|
|
47
|
+
from enzymetk.embedchem_rxnfp_step import RxnFP
|
|
48
|
+
from enzymetk.embedchem_selformer_step import SelFormer
|
|
49
|
+
from enzymetk.embedchem_unimol_step import UniMol
|
|
50
|
+
|
|
51
|
+
# Protein Embeddings
|
|
52
|
+
from enzymetk.embedprotein_esm_step import EmbedESM
|
|
53
|
+
from enzymetk.embedprotein_esm3_step import EmbedESM3
|
|
54
|
+
|
|
55
|
+
# Sequence Generation/Alignment
|
|
56
|
+
from enzymetk.generate_msa_step import ClustalOmega
|
|
57
|
+
from enzymetk.generate_tree_step import FastTree
|
|
58
|
+
|
|
59
|
+
# Protein Design
|
|
60
|
+
from enzymetk.inpaint_ligandMPNN_step import LigandMPNN
|
|
61
|
+
|
|
62
|
+
# Metagenomics
|
|
63
|
+
from enzymetk.metagenomics_porechop_trim_reads_step import PoreChop
|
|
64
|
+
from enzymetk.metagenomics_prokka_annotate_genes import Prokka
|
|
65
|
+
|
|
66
|
+
# Prediction
|
|
67
|
+
from enzymetk.predict_catalyticsite_step import ActiveSitePred
|
|
68
|
+
|
|
69
|
+
# Sequence Search
|
|
70
|
+
from enzymetk.sequence_search_blast import BLAST
|
|
71
|
+
|
|
72
|
+
# Similarity Search
|
|
73
|
+
from enzymetk.similarity_foldseek_step import FoldSeek
|
|
74
|
+
from enzymetk.similarity_mmseqs_step import MMseqs
|
|
75
|
+
from enzymetk.similarity_reaction_step import ReactionDist
|
|
76
|
+
from enzymetk.similarity_substrate_step import SubstrateDist
|
|
77
|
+
|
|
78
|
+
# Structure Search (aliased to avoid conflict with similarity_foldseek_step.FoldSeek)
|
|
79
|
+
from enzymetk.structure_search_foldseek import FoldSeek as StructureFoldSeek
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
__all__ = [
|
|
83
|
+
# Core
|
|
84
|
+
'Step',
|
|
85
|
+
'Pipeline',
|
|
86
|
+
'Save',
|
|
87
|
+
# EC Annotation
|
|
88
|
+
'CLEAN',
|
|
89
|
+
'CREEP',
|
|
90
|
+
'ProteInfer',
|
|
91
|
+
# Docking
|
|
92
|
+
'Boltz',
|
|
93
|
+
'Chai',
|
|
94
|
+
'Vina',
|
|
95
|
+
# Chemical Embeddings
|
|
96
|
+
'ChemBERT',
|
|
97
|
+
'RxnFP',
|
|
98
|
+
'SelFormer',
|
|
99
|
+
'UniMol',
|
|
100
|
+
# Protein Embeddings
|
|
101
|
+
'EmbedESM',
|
|
102
|
+
'EmbedESM3',
|
|
103
|
+
# Sequence Generation/Alignment
|
|
104
|
+
'ClustalOmega',
|
|
105
|
+
'FastTree',
|
|
106
|
+
# Protein Design
|
|
107
|
+
'LigandMPNN',
|
|
108
|
+
# Metagenomics
|
|
109
|
+
'PoreChop',
|
|
110
|
+
'Prokka',
|
|
111
|
+
# Prediction
|
|
112
|
+
'ActiveSitePred',
|
|
113
|
+
# Sequence Search
|
|
114
|
+
'BLAST',
|
|
115
|
+
# Similarity Search
|
|
116
|
+
'FoldSeek',
|
|
117
|
+
'MMseqs',
|
|
118
|
+
'ReactionDist',
|
|
119
|
+
'SubstrateDist',
|
|
120
|
+
# Structure Search
|
|
121
|
+
'StructureFoldSeek',
|
|
122
|
+
]
|
|
@@ -5,9 +5,12 @@ import subprocess
|
|
|
5
5
|
import logging
|
|
6
6
|
import numpy as np
|
|
7
7
|
import os
|
|
8
|
+
from enzymetk.step import run_script
|
|
9
|
+
from pathlib import Path
|
|
8
10
|
|
|
9
11
|
logger = logging.getLogger(__name__)
|
|
10
12
|
logger.setLevel(logging.INFO)
|
|
13
|
+
SCRIPT_DIR = Path(__file__).parent.resolve()
|
|
11
14
|
|
|
12
15
|
"""
|
|
13
16
|
import os
|
|
@@ -38,9 +41,14 @@ class CREEP(Step):
|
|
|
38
41
|
self.args_extract = args_extract
|
|
39
42
|
self.args_retrieval = args_retrieval
|
|
40
43
|
|
|
44
|
+
def install(self, env_args=None):
|
|
45
|
+
# Try to automatically install CREEP conda env
|
|
46
|
+
run_script('install_CREEP.sh', verbose=True)
|
|
47
|
+
self.CREEP_dir = SCRIPT_DIR.parent.resolve() / 'conda_envs' / 'CREEP'
|
|
48
|
+
self.CREEP_cache_dir = f'{self.CREEP_dir}/data/'
|
|
49
|
+
|
|
41
50
|
def __execute(self, df: pd.DataFrame, tmp_dir: str):
|
|
42
|
-
|
|
43
|
-
input_filename = f'{tmp_dir}/creepasjkdkajshdkja.csv'
|
|
51
|
+
input_filename = f'{tmp_dir}/input.csv'
|
|
44
52
|
df.to_csv(input_filename, index=False)
|
|
45
53
|
cmd = ['conda', 'run', '-n', self.env_name, 'python', f'{self.CREEP_dir}scripts/step_02_extract_CREEP.py', '--pretrained_folder',
|
|
46
54
|
f'{self.CREEP_cache_dir}output/easy_split',
|
|
@@ -5,7 +5,10 @@ from multiprocessing.dummy import Pool as ThreadPool
|
|
|
5
5
|
from tempfile import TemporaryDirectory
|
|
6
6
|
import os
|
|
7
7
|
import subprocess
|
|
8
|
+
from enzymetk.step import run_script
|
|
9
|
+
from pathlib import Path
|
|
8
10
|
|
|
11
|
+
SCRIPT_DIR = Path(__file__).parent.resolve()
|
|
9
12
|
|
|
10
13
|
class ProteInfer(Step):
|
|
11
14
|
|
|
@@ -53,6 +56,12 @@ class ProteInfer(Step):
|
|
|
53
56
|
self.ec3_filter = ec3_filter
|
|
54
57
|
self.ec4_filter = ec4_filter
|
|
55
58
|
|
|
59
|
+
def install(self, env_args=None):
|
|
60
|
+
# Try to automatically install CREEP conda env
|
|
61
|
+
run_script('install_CREEP.sh', verbose=True)
|
|
62
|
+
self.CREEP_dir = SCRIPT_DIR.parent.resolve() / 'conda_envs' / 'CREEP'
|
|
63
|
+
self.CREEP_cache_dir = f'{self.CREEP_dir}/data/'
|
|
64
|
+
|
|
56
65
|
def __execute(self, data: list) -> np.array:
|
|
57
66
|
df, tmp_dir = data
|
|
58
67
|
# Make sure in the directory of proteinfer
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from enzymetk.step import Step
|
|
2
2
|
import pandas as pd
|
|
3
|
-
from docko.boltz import run_boltz_affinity
|
|
4
3
|
import logging
|
|
5
4
|
import numpy as np
|
|
6
5
|
from multiprocessing.dummy import Pool as ThreadPool
|
|
@@ -9,16 +8,40 @@ from multiprocessing.dummy import Pool as ThreadPool
|
|
|
9
8
|
logger = logging.getLogger(__name__)
|
|
10
9
|
logger.setLevel(logging.INFO)
|
|
11
10
|
|
|
11
|
+
try:
|
|
12
|
+
from docko.boltz import run_boltz_affinity
|
|
13
|
+
except ImportError as e:
|
|
14
|
+
print("Boltz: Needs docko package. Install with: pip install docko.")
|
|
15
|
+
|
|
12
16
|
|
|
13
17
|
class Boltz(Step):
|
|
14
18
|
|
|
15
|
-
def __init__(self, id_col: str, seq_col: str, substrate_col: str, intermediate_col: str, output_dir: str,
|
|
19
|
+
def __init__(self, id_col: str, seq_col: str, substrate_col: str, intermediate_col: str, output_dir: str,
|
|
20
|
+
num_threads: 1, env_name = None, args=None):
|
|
21
|
+
super().__init__()
|
|
16
22
|
self.id_col = id_col
|
|
17
23
|
self.seq_col = seq_col
|
|
18
24
|
self.substrate_col = substrate_col
|
|
19
25
|
self.intermediate_col = intermediate_col
|
|
20
26
|
self.output_dir = output_dir or None
|
|
21
27
|
self.num_threads = num_threads or 1
|
|
28
|
+
self.conda = env_name
|
|
29
|
+
self.env_name = env_name
|
|
30
|
+
self.args = args
|
|
31
|
+
|
|
32
|
+
def install(self, env_args=None):
|
|
33
|
+
# e.g. env args could by python=='3.1.1.
|
|
34
|
+
self.install_venv(env_args)
|
|
35
|
+
# Now the specific
|
|
36
|
+
try:
|
|
37
|
+
cmd = [f'{self.env_name}/bin/pip', 'install', 'docko']
|
|
38
|
+
self.run(cmd)
|
|
39
|
+
except Exception as e:
|
|
40
|
+
cmd = [f'{self.env_name}/bin/pip3', 'install', 'docko']
|
|
41
|
+
self.run(cmd)
|
|
42
|
+
self.run(cmd)
|
|
43
|
+
# Now set the venv to be the location:
|
|
44
|
+
self.venv = f'{self.env_name}/bin/python'
|
|
22
45
|
|
|
23
46
|
def __execute(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
24
47
|
output_filenames = []
|
|
@@ -28,11 +51,15 @@ class Boltz(Step):
|
|
|
28
51
|
if not isinstance(substrate, str):
|
|
29
52
|
substrate = ''
|
|
30
53
|
print(run_id, seq, substrate)
|
|
31
|
-
|
|
54
|
+
if self.args:
|
|
55
|
+
run_boltz_affinity(run_id, seq, substrate, self.output_dir, intermediate, self.args)
|
|
56
|
+
else:
|
|
57
|
+
run_boltz_affinity(run_id, seq, substrate, self.output_dir, intermediate)
|
|
32
58
|
output_filenames.append(f'{self.output_dir}/{run_id}/')
|
|
33
59
|
return output_filenames
|
|
34
60
|
|
|
35
61
|
def execute(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
62
|
+
|
|
36
63
|
if self.output_dir:
|
|
37
64
|
if self.num_threads > 1:
|
|
38
65
|
pool = ThreadPool(self.num_threads)
|
|
@@ -1,9 +1,13 @@
|
|
|
1
1
|
from enzymetk.step import Step
|
|
2
2
|
import pandas as pd
|
|
3
|
-
|
|
3
|
+
|
|
4
4
|
import logging
|
|
5
5
|
import numpy as np
|
|
6
6
|
|
|
7
|
+
try:
|
|
8
|
+
from docko.chai import run_chai
|
|
9
|
+
except ImportError as e:
|
|
10
|
+
print("Chai: Needs docko package. Install with: pip install docko.")
|
|
7
11
|
|
|
8
12
|
logger = logging.getLogger(__name__)
|
|
9
13
|
logger.setLevel(logging.INFO)
|
|
@@ -11,7 +15,9 @@ logger.setLevel(logging.INFO)
|
|
|
11
15
|
|
|
12
16
|
class Chai(Step):
|
|
13
17
|
|
|
14
|
-
def __init__(self, id_col: str, seq_col: str, substrate_col: str, cofactor_col: str, output_dir: str,
|
|
18
|
+
def __init__(self, id_col: str, seq_col: str, substrate_col: str, cofactor_col: str, output_dir: str,
|
|
19
|
+
num_threads: 1, venv_name = 'enzymetk', env_name = None):
|
|
20
|
+
super().__init__()
|
|
15
21
|
self.id_col = id_col
|
|
16
22
|
self.seq_col = seq_col
|
|
17
23
|
self.substrate_col = substrate_col
|
|
@@ -19,6 +25,21 @@ class Chai(Step):
|
|
|
19
25
|
self.output_dir = output_dir or None
|
|
20
26
|
self.num_threads = num_threads or 1
|
|
21
27
|
|
|
28
|
+
def install(self, env_args=None):
|
|
29
|
+
# e.g. env args could by python=='3.1.1.
|
|
30
|
+
self.install_venv(env_args)
|
|
31
|
+
# Now the specific
|
|
32
|
+
try:
|
|
33
|
+
cmd = [f'{self.env_name}/bin/pip', 'install', 'docko']
|
|
34
|
+
self.run(cmd)
|
|
35
|
+
except Exception as e:
|
|
36
|
+
cmd = [f'{self.env_name}/bin/pip3', 'install', 'docko']
|
|
37
|
+
self.run(cmd)
|
|
38
|
+
self.run(cmd)
|
|
39
|
+
# Now set the venv to be the location:
|
|
40
|
+
self.venv = f'{self.env_name}/bin/python'
|
|
41
|
+
|
|
42
|
+
|
|
22
43
|
def __execute(self, df: pd.DataFrame, tmp_dir: str) -> pd.DataFrame:
|
|
23
44
|
output_filenames = []
|
|
24
45
|
for run_id, seq, substrate, cofactor in df[[self.id_col, self.seq_col, self.substrate_col, self.cofactor_col]].values:
|
|
@@ -1,12 +1,18 @@
|
|
|
1
1
|
from enzymetk.step import Step
|
|
2
2
|
import pandas as pd
|
|
3
|
-
|
|
3
|
+
|
|
4
4
|
import logging
|
|
5
5
|
import numpy as np
|
|
6
6
|
import os
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
from multiprocessing.dummy import Pool as ThreadPool
|
|
9
9
|
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
from docko.docko import *
|
|
13
|
+
except ImportError as e:
|
|
14
|
+
print("Vina: Needs docko package. Install with: pip install docko.")
|
|
15
|
+
|
|
10
16
|
logger = logging.getLogger(__name__)
|
|
11
17
|
logger.setLevel(logging.INFO)
|
|
12
18
|
|
|
@@ -14,7 +20,8 @@ logger.setLevel(logging.INFO)
|
|
|
14
20
|
class Vina(Step):
|
|
15
21
|
|
|
16
22
|
def __init__(self, id_col: str, structure_col: str, sequence_col: str,
|
|
17
|
-
substrate_col: str, substrate_name_col: str, active_site_col: str, output_dir: str, num_threads:
|
|
23
|
+
substrate_col: str, substrate_name_col: str, active_site_col: str, output_dir: str, num_threads: 1,
|
|
24
|
+
venv_name = 'enzymetk', env_name = None):
|
|
18
25
|
print('Expects active site residues as a string separated by |. Zero indexed.')
|
|
19
26
|
self.id_col = id_col
|
|
20
27
|
self.structure_col = structure_col
|
|
@@ -25,6 +32,20 @@ class Vina(Step):
|
|
|
25
32
|
self.output_dir = Path( output_dir) or None
|
|
26
33
|
self.num_threads = num_threads or 1
|
|
27
34
|
|
|
35
|
+
def install(self, env_args=None):
|
|
36
|
+
# e.g. env args could by python=='3.1.1.
|
|
37
|
+
self.install_venv(env_args)
|
|
38
|
+
# Now the specific
|
|
39
|
+
try:
|
|
40
|
+
cmd = [f'{self.env_name}/bin/pip', 'install', 'docko']
|
|
41
|
+
self.run(cmd)
|
|
42
|
+
except Exception as e:
|
|
43
|
+
cmd = [f'{self.env_name}/bin/pip3', 'install', 'docko']
|
|
44
|
+
self.run(cmd)
|
|
45
|
+
self.run(cmd)
|
|
46
|
+
# Now set the venv to be the location:
|
|
47
|
+
self.venv = f'{self.env_name}/bin/python'
|
|
48
|
+
|
|
28
49
|
def __execute(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
29
50
|
output_filenames = []
|
|
30
51
|
# ToDo: update to create from sequence if the path doesn't exist.
|
|
@@ -16,10 +16,28 @@ logger.setLevel(logging.INFO)
|
|
|
16
16
|
|
|
17
17
|
class RxnFP(Step):
|
|
18
18
|
|
|
19
|
-
def __init__(self, smiles_col: str, num_threads:
|
|
19
|
+
def __init__(self, smiles_col: str, num_threads: 1,
|
|
20
|
+
env_name = 'rxnfp', venv_name = None):
|
|
21
|
+
super().__init__()
|
|
20
22
|
self.value_col = smiles_col
|
|
21
23
|
self.num_threads = num_threads or 1
|
|
24
|
+
self.conda = env_name
|
|
22
25
|
self.env_name = env_name
|
|
26
|
+
self.venv = venv_name if venv_name else f'{env_name}/bin/python'
|
|
27
|
+
|
|
28
|
+
def install(self, env_args=['--python', '3.8']):
|
|
29
|
+
# e.g. env args could by python=='3.1.1.
|
|
30
|
+
self.install_conda(env_args=env_args)
|
|
31
|
+
# Now the specific
|
|
32
|
+
try:
|
|
33
|
+
cmd = [f'pip', 'install', 'rxnfp', 'rdkit=2020.03.3', 'tmap', 'numpy==1.23', 'sciutil']
|
|
34
|
+
self.run(cmd)
|
|
35
|
+
except Exception as e:
|
|
36
|
+
cmd = [f'pip', 'install', 'rxnfp', 'rdkit=2020.03.3', 'tmap', 'numpy==1.23', 'sciutil']
|
|
37
|
+
self.run(cmd)
|
|
38
|
+
self.run(cmd)
|
|
39
|
+
# Now set the venv to be the location:
|
|
40
|
+
self.conda = f'{self.env_name}'
|
|
23
41
|
|
|
24
42
|
def __execute(self, df: pd.DataFrame, tmp_dir: str) -> pd.DataFrame:
|
|
25
43
|
tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
|
|
@@ -27,7 +45,7 @@ class RxnFP(Step):
|
|
|
27
45
|
output_filename = f'{tmp_dir}/rxnfp_{tmp_label}.pkl'
|
|
28
46
|
input_filename = f'{tmp_dir}/input_{tmp_label}.csv'
|
|
29
47
|
df.to_csv(input_filename, index=False)
|
|
30
|
-
cmd = ['
|
|
48
|
+
cmd = ['python', Path(__file__).parent/'embedchem_rxnfp_run.py', '--out', output_filename,
|
|
31
49
|
'--input', input_filename, '--label', self.value_col]
|
|
32
50
|
self.run(cmd)
|
|
33
51
|
# Might have an issue if the things are not correctly installed in the same dicrectory
|