enzymetk 0.0.6__tar.gz → 0.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. {enzymetk-0.0.6 → enzymetk-0.0.7}/PKG-INFO +106 -4
  2. {enzymetk-0.0.6 → enzymetk-0.0.7}/README.md +98 -1
  3. enzymetk-0.0.7/enzymetk/__init__.py +122 -0
  4. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/annotateEC_CREEP_step.py +10 -2
  5. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/annotateEC_proteinfer_step.py +9 -0
  6. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/dock_boltz_step.py +30 -3
  7. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/dock_chai_step.py +23 -2
  8. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/dock_vina_step.py +23 -2
  9. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/embedchem_chemberta_step.py +0 -1
  10. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/embedchem_rxnfp_step.py +20 -2
  11. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/embedchem_unimol_step.py +40 -14
  12. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/embedprotein_esm3_step.py +6 -3
  13. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/embedprotein_esm_step.py +101 -7
  14. enzymetk-0.0.7/enzymetk/main.py +251 -0
  15. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/predict_catalyticsite_step.py +9 -5
  16. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/sequence_search_blast.py +31 -13
  17. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/similarity_foldseek_step.py +0 -7
  18. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/similarity_mmseqs_step.py +2 -1
  19. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/similarity_reaction_step.py +13 -11
  20. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/similarity_substrate_step.py +15 -11
  21. enzymetk-0.0.7/enzymetk/step.py +134 -0
  22. enzymetk-0.0.7/enzymetk/structure_search_foldseek.py +88 -0
  23. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk.egg-info/PKG-INFO +106 -4
  24. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk.egg-info/SOURCES.txt +5 -7
  25. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk.egg-info/requires.txt +4 -0
  26. {enzymetk-0.0.6 → enzymetk-0.0.7}/setup.py +9 -5
  27. enzymetk-0.0.7/tests/test_embedprotein_esm_step.py +363 -0
  28. enzymetk-0.0.7/tests/test_esm2.py +18 -0
  29. enzymetk-0.0.7/tests/test_foldseek.py +136 -0
  30. enzymetk-0.0.6/enzymetk/__init__.py +0 -33
  31. enzymetk-0.0.6/enzymetk/esm-extract.py +0 -140
  32. enzymetk-0.0.6/enzymetk/main.py +0 -37
  33. enzymetk-0.0.6/enzymetk/predict_activity_step.py +0 -0
  34. enzymetk-0.0.6/enzymetk/predict_catalyticsite_run.py +0 -47
  35. enzymetk-0.0.6/enzymetk/reducedim_pca_run.py +0 -67
  36. enzymetk-0.0.6/enzymetk/reducedim_vae_run.py +0 -67
  37. enzymetk-0.0.6/enzymetk/reducedim_vae_step.py +0 -12
  38. enzymetk-0.0.6/enzymetk/step.py +0 -62
  39. {enzymetk-0.0.6 → enzymetk-0.0.7}/LICENSE +0 -0
  40. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/annotateEC_CLEAN_step.py +0 -0
  41. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/embedchem_rxnfp_run.py +0 -0
  42. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/embedchem_selformer_run.py +0 -0
  43. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/embedchem_selformer_step.py +0 -0
  44. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/filter_sequence_step.py +0 -0
  45. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/filter_structure_step.py +0 -0
  46. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/generate_msa_step.py +0 -0
  47. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/generate_oligopool_step.py +0 -0
  48. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/generate_tree_step.py +0 -0
  49. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/inpaint_ligandMPNN_step.py +0 -0
  50. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/metagenomics_porechop_trim_reads_step.py +0 -0
  51. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/metagenomics_prokka_annotate_genes.py +0 -0
  52. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/pipeline.py +0 -0
  53. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk/save_step.py +0 -0
  54. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk.egg-info/dependency_links.txt +0 -0
  55. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk.egg-info/entry_points.txt +0 -0
  56. {enzymetk-0.0.6 → enzymetk-0.0.7}/enzymetk.egg-info/top_level.txt +0 -0
  57. {enzymetk-0.0.6 → enzymetk-0.0.7}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: enzymetk
3
- Version: 0.0.6
3
+ Version: 0.0.7
4
4
  Home-page: https://github.com/arianemora/enzyme-tk/
5
5
  Author: Ariane Mora
6
6
  Author-email: ariane.n.mora@gmail.com
@@ -13,17 +13,22 @@ Classifier: Intended Audience :: Science/Research
13
13
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
14
14
  Classifier: Natural Language :: English
15
15
  Classifier: Operating System :: OS Independent
16
- Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
17
18
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
18
- Requires-Python: >=3.8
19
+ Requires-Python: >=3.10
19
20
  Description-Content-Type: text/markdown
20
21
  License-File: LICENSE
21
22
  Requires-Dist: scikit-learn
22
23
  Requires-Dist: numpy
23
24
  Requires-Dist: seaborn
24
25
  Requires-Dist: sciutil
26
+ Requires-Dist: tqdm
25
27
  Requires-Dist: pandas
26
28
  Requires-Dist: biopython
29
+ Requires-Dist: transformers
30
+ Requires-Dist: torch
31
+ Requires-Dist: huggingface_hub
27
32
  Dynamic: author
28
33
  Dynamic: author-email
29
34
  Dynamic: classifier
@@ -49,8 +54,91 @@ Enzyme-tk is a collection of tools for enzyme engineering, setup as interoperabl
49
54
  ## Install base package to import modules
50
55
 
51
56
  ```bash
57
+ conda create --name enzymetk python==3.12 -y
52
58
  pip install enzymetk
59
+ # Install torch for your specific cuda version
60
+ pip install torch torchvision #--index-url https://download.pytorch.org/whl/cu130
53
61
  ```
62
+ ## If you're on the bleeding edge and plan to use older models (e.g. chemBERTa2), you may need to run:
63
+ ```
64
+ pip uninstall transformers -y
65
+ pip install "transformers<5"
66
+ ```
67
+
68
+ ## For each module, run `install()` the first time you use it
69
+ This installs into a venv where possible, and into a conda env where the tool does not support venvs.
70
+ See specific tools for info.
71
+ ```
72
+ bm = BLAST(id_col, seq_col, label_col)
73
+ bm.install() # by default will create a venv or if needed a conda env
74
+ ```
75
+ Note: if you want to use your own environment, you can install the tool externally and override the installed venv or conda env, e.g.:
76
+ ```
77
+ bm = BLAST(id_col, seq_col, label_col)
78
+ bm.conda = 'blast_env' # an already installed env on your computer
79
+ bm.venv = None # so it knows to use conda i.e. forces it not to use venv
80
+ ```
81
+
82
+ ## Modules requiring conda
83
+
84
+ - CREEP [not tested again]
85
+ - CLEAN [not tested again]
86
+ - ProteInfer [not tested again]
87
+
88
+ ## Modules able to run in venv
89
+ - BLAST [cpu, tested with both, see notebook]
90
+ - ChemBERTA [cpu, colab]
91
+ - Boltz
92
+ - Chai: conda install -c conda-forge pdbfixer
93
+
94
+ - esm2/3 [cpu, see notebook]
95
+ - foldseek [tested and works]
96
+ - ligandmpnn
97
+ - mmseqs [can get working...]
98
+ - msa []
99
+ - reaction_similarity [good, cpu]
100
+ - rxnfp [needs a specific Python version, so not easy in Colab]; install with `enzymetk install rxnfp` (requires conda)
101
+ - substrate_similarity [good, cpu]
102
+ - tree
103
+ - unimol [good, cpu]
104
+
105
+ Docko git@github.com:ArianeMora/docko.git
106
+ ValueError: CCD component ALA not found!
107
+ boltz predict boltz.fasta --use_msa_server --cache ./mol
108
+
109
+ srun -p gpu --qos=normal --gres=gpu:1 --pty --mem=64G --time=000:30:00 bash
110
+
111
+ pipelines: reads --> poreChop --> Flye --> Prokka --> Squidly --> Foldseek --> Boltz --> Chai
112
+ pipelines: seqs --> BLAST --> Proteinfer --> Foldseek --> MMseqs --> ClustalOmega --> FastTree
113
+ pipelines: reactions --> rxnFP --> selformer --> uniMol --> chemBERTa2 --> RDkit reaction similarity
114
+
115
+
116
+ | Module | Name | Description | Colab ipynb|
117
+ |------------------------------|---------------|-----------------------------------------------------------------------------------|------------|
118
+ | Metagenomics | PoreChop | Used to filter adapters for nanopore sequences in metagenomics pipeline. | y |
119
+ | Metagenomics | Flye | Used to assemble the metagenomes. | ? |
120
+ | Metagenomics | Prokka | Annotation of genes within the genome. | ? |
121
+ | Function prediction | Proteinfer | Annotation of genes to function (GO or EC class) using ML. | 33 |
122
+ | Function prediction | CLEAN | Annotation of genes to EC class using ML. | 11 |
123
+ | Function prediction | CREEP | Annotation of genes to EC class using ML. | 13 |
124
+ | Function prediction | Func-e | Annotation of genes to reaction using ML. | This study. |
125
+ | Function prediction | Squidly | Annotation of catalytic residues using ML. | 36 |
126
+ | Embedding generation | ESM2 & 3 | Conversion of amino acid sequence to a numerical embedding using a PLM. | 46,47 |
127
+ | Embedding generation | RxnFP | Conversion of reaction smiles to a numerical embedding using a language model. | 48 |
128
+ | Embedding generation | Selformer | Conversion of reaction selfies to a numerical embedding using a language model. | 49 |
129
+ | Embedding generation | Uni-mol | Conversion of molecule smiles to a numerical embedding using a language model. | 50 |
130
+ | Embedding generation | ChemBERTa2 | Conversion of reaction smiles to a numerical embedding using a language model. | 51 |
131
+ | Docking | Chai | Diffusion based folding of a protein and ligand. | 42 |
132
+ | Docking | Boltz | Diffusion based folding of a protein and ligand. | 52 |
133
+ | Similarity | Diamond | Sequence similarity calculation using basic local alignment search. | 53 |
134
+ | Similarity | Foldseek | Fast structure similarity search. | 54 |
135
+ | Similarity | MMseqs | Fast sequence clustering. | 55 |
136
+ | Docking | StructureZyme | Alignment and calculation of structure metrics. | 56 |
137
+ | Oligo design | Oligopoolio | Calculation of oligo fragments for gene assembly. | This study. |
138
+ | Sequencing | LevSeq | Sequence verification of protein variants. | 34 |
139
+ | MSA generation | ClustalOmega | Creation of multiple sequence alignments (MSA). | 57 |
140
+ | Phylogenetic tree generation | FastTree | Creation of multiple phylogenetic trees. | 58 |
141
+
54
142
 
55
143
  ### Install only the specific requirements you need (recommended)
56
144
 
@@ -121,7 +209,11 @@ The steps are the main building blocks of the pipeline. They are responsible for
121
209
 
122
210
  BLAST is a tool for searching a database of sequences for similar sequences. You can either pass a database that you have already created, or pass the sequences as part of your dataframe together with a label column. The label column must have two values: `reference` (sequences that you want to search against) and `query` (sequences that you want to search for).
123
211
 
124
- Note you need to have installed the BLAST environment.
212
+ Note you can install in two ways: from Python with `BLAST(...).install()` (see above), or with a conda env from the command line:
213
+
214
+ ```
215
+ enzymetk install_diamond
216
+ ```
125
217
 
126
218
  ```python
127
219
  id_col = 'Entry'
@@ -288,6 +380,16 @@ df << (CREEP(id_col, reaction_col, CREEP_cache_dir='/disk1/share/software/CREEP/
288
380
 
289
381
  EmbedESM is a tool for embedding a set of sequences using ESM2.
290
382
 
383
+ Either install it in your own conda env with `pip install fair-esm`, or run:
384
+
385
+ ```
386
+ id_col = 'Entry'
387
+ seq_col = 'Sequence'
388
+ label_col = 'ActiveSite'
389
+ esm = EmbedESM(id_col, seq_col, extraction_method='mean', tmp_dir='tmp', rep_num=36) # i.e. the representation number you want usually the last layer
390
+ esm.install() # And follow the instructions to activate the env
391
+ ```
392
+
291
393
  ```python
292
394
  from enzymetk.embedprotein_esm_step import EmbedESM
293
395
  from enzymetk.save_step import Save
@@ -10,8 +10,91 @@ Enzyme-tk is a collection of tools for enzyme engineering, setup as interoperabl
10
10
  ## Install base package to import modules
11
11
 
12
12
  ```bash
13
+ conda create --name enzymetk python==3.12 -y
13
14
  pip install enzymetk
15
+ # Install torch for your specific cuda version
16
+ pip install torch torchvision #--index-url https://download.pytorch.org/whl/cu130
14
17
  ```
18
+ ## If you're on the bleeding edge and plan to use older models (e.g. chemBERTa2), you may need to run:
19
+ ```
20
+ pip uninstall transformers -y
21
+ pip install "transformers<5"
22
+ ```
23
+
24
+ ## For each module, run `install()` the first time you use it
25
+ This installs into a venv where possible, and into a conda env where the tool does not support venvs.
26
+ See specific tools for info.
27
+ ```
28
+ bm = BLAST(id_col, seq_col, label_col)
29
+ bm.install() # by default will create a venv or if needed a conda env
30
+ ```
31
+ Note: if you want to use your own environment, you can install the tool externally and override the installed venv or conda env, e.g.:
32
+ ```
33
+ bm = BLAST(id_col, seq_col, label_col)
34
+ bm.conda = 'blast_env' # an already installed env on your computer
35
+ bm.venv = None # so it knows to use conda i.e. forces it not to use venv
36
+ ```
37
+
38
+ ## Modules requiring conda
39
+
40
+ - CREEP [not tested again]
41
+ - CLEAN [not tested again]
42
+ - ProteInfer [not tested again]
43
+
44
+ ## Modules able to run in venv
45
+ - BLAST [cpu, tested with both, see notebook]
46
+ - ChemBERTA [cpu, colab]
47
+ - Boltz
48
+ - Chai: conda install -c conda-forge pdbfixer
49
+
50
+ - esm2/3 [cpu, see notebook]
51
+ - foldseek [tested and works]
52
+ - ligandmpnn
53
+ - mmseqs [can get working...]
54
+ - msa []
55
+ - reaction_similarity [good, cpu]
56
+ - rxnfp [needs a specific Python version, so not easy in Colab]; install with `enzymetk install rxnfp` (requires conda)
57
+ - substrate_similarity [good, cpu]
58
+ - tree
59
+ - unimol [good, cpu]
60
+
61
+ Docko git@github.com:ArianeMora/docko.git
62
+ ValueError: CCD component ALA not found!
63
+ boltz predict boltz.fasta --use_msa_server --cache ./mol
64
+
65
+ srun -p gpu --qos=normal --gres=gpu:1 --pty --mem=64G --time=000:30:00 bash
66
+
67
+ pipelines: reads --> poreChop --> Flye --> Prokka --> Squidly --> Foldseek --> Boltz --> Chai
68
+ pipelines: seqs --> BLAST --> Proteinfer --> Foldseek --> MMseqs --> ClustalOmega --> FastTree
69
+ pipelines: reactions --> rxnFP --> selformer --> uniMol --> chemBERTa2 --> RDkit reaction similarity
70
+
71
+
72
+ | Module | Name | Description | Colab ipynb|
73
+ |------------------------------|---------------|-----------------------------------------------------------------------------------|------------|
74
+ | Metagenomics | PoreChop | Used to filter adapters for nanopore sequences in metagenomics pipeline. | y |
75
+ | Metagenomics | Flye | Used to assemble the metagenomes. | ? |
76
+ | Metagenomics | Prokka | Annotation of genes within the genome. | ? |
77
+ | Function prediction | Proteinfer | Annotation of genes to function (GO or EC class) using ML. | 33 |
78
+ | Function prediction | CLEAN | Annotation of genes to EC class using ML. | 11 |
79
+ | Function prediction | CREEP | Annotation of genes to EC class using ML. | 13 |
80
+ | Function prediction | Func-e | Annotation of genes to reaction using ML. | This study. |
81
+ | Function prediction | Squidly | Annotation of catalytic residues using ML. | 36 |
82
+ | Embedding generation | ESM2 & 3 | Conversion of amino acid sequence to a numerical embedding using a PLM. | 46,47 |
83
+ | Embedding generation | RxnFP | Conversion of reaction smiles to a numerical embedding using a language model. | 48 |
84
+ | Embedding generation | Selformer | Conversion of reaction selfies to a numerical embedding using a language model. | 49 |
85
+ | Embedding generation | Uni-mol | Conversion of molecule smiles to a numerical embedding using a language model. | 50 |
86
+ | Embedding generation | ChemBERTa2 | Conversion of reaction smiles to a numerical embedding using a language model. | 51 |
87
+ | Docking | Chai | Diffusion based folding of a protein and ligand. | 42 |
88
+ | Docking | Boltz | Diffusion based folding of a protein and ligand. | 52 |
89
+ | Similarity | Diamond | Sequence similarity calculation using basic local alignment search. | 53 |
90
+ | Similarity | Foldseek | Fast structure similarity search. | 54 |
91
+ | Similarity | MMseqs | Fast sequence clustering. | 55 |
92
+ | Docking | StructureZyme | Alignment and calculation of structure metrics. | 56 |
93
+ | Oligo design | Oligopoolio | Calculation of oligo fragments for gene assembly. | This study. |
94
+ | Sequencing | LevSeq | Sequence verification of protein variants. | 34 |
95
+ | MSA generation | ClustalOmega | Creation of multiple sequence alignments (MSA). | 57 |
96
+ | Phylogenetic tree generation | FastTree | Creation of multiple phylogenetic trees. | 58 |
97
+
15
98
 
16
99
  ### Install only the specific requirements you need (recommended)
17
100
 
@@ -82,7 +165,11 @@ The steps are the main building blocks of the pipeline. They are responsible for
82
165
 
83
166
  BLAST is a tool for searching a database of sequences for similar sequences. You can either pass a database that you have already created, or pass the sequences as part of your dataframe together with a label column. The label column must have two values: `reference` (sequences that you want to search against) and `query` (sequences that you want to search for).
84
167
 
85
- Note you need to have installed the BLAST environment.
168
+ Note you can install in two ways: from Python with `BLAST(...).install()` (see above), or with a conda env from the command line:
169
+
170
+ ```
171
+ enzymetk install_diamond
172
+ ```
86
173
 
87
174
  ```python
88
175
  id_col = 'Entry'
@@ -249,6 +336,16 @@ df << (CREEP(id_col, reaction_col, CREEP_cache_dir='/disk1/share/software/CREEP/
249
336
 
250
337
  EmbedESM is a tool for embedding a set of sequences using ESM2.
251
338
 
339
+ Either install it in your own conda env with `pip install fair-esm`, or run:
340
+
341
+ ```
342
+ id_col = 'Entry'
343
+ seq_col = 'Sequence'
344
+ label_col = 'ActiveSite'
345
+ esm = EmbedESM(id_col, seq_col, extraction_method='mean', tmp_dir='tmp', rep_num=36) # i.e. the representation number you want usually the last layer
346
+ esm.install() # And follow the instructions to activate the env
347
+ ```
348
+
252
349
  ```python
253
350
  from enzymetk.embedprotein_esm_step import EmbedESM
254
351
  from enzymetk.save_step import Save
@@ -0,0 +1,122 @@
1
+ ###############################################################################
2
+ # #
3
+ # This program is free software: you can redistribute it and/or modify #
4
+ # it under the terms of the GNU General Public License as published by #
5
+ # the Free Software Foundation, either version 3 of the License, or #
6
+ # (at your option) any later version. #
7
+ # #
8
+ # This program is distributed in the hope that it will be useful, #
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
11
+ # GNU General Public License for more details. #
12
+ # #
13
+ # You should have received a copy of the GNU General Public License #
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>. #
15
+ # #
16
+ ###############################################################################
17
+
18
+ """
19
+ Author: Ariane Mora
20
+ Date: March 2025
21
+ """
22
+ __title__ = 'enzymetk'
23
+ __description__ = 'Toolkit for enzymes and what not'
24
+ __url__ = 'https://github.com/arianemora/enzyme-tk/'
25
+ __version__ = '0.0.7'
26
+ __author__ = 'Ariane Mora'
27
+ __author_email__ = 'ariane.n.mora@gmail.com'
28
+ __license__ = 'GPL3'
29
+
30
+
31
+ # Core classes
32
+ from enzymetk.step import Step, Pipeline
33
+ from enzymetk.save_step import Save
34
+
35
+ # EC Annotation
36
+ from enzymetk.annotateEC_CLEAN_step import CLEAN
37
+ from enzymetk.annotateEC_CREEP_step import CREEP
38
+ from enzymetk.annotateEC_proteinfer_step import ProteInfer
39
+
40
+ # Docking
41
+ from enzymetk.dock_boltz_step import Boltz
42
+ from enzymetk.dock_chai_step import Chai
43
+ from enzymetk.dock_vina_step import Vina
44
+
45
+ # Chemical Embeddings
46
+ from enzymetk.embedchem_chemberta_step import ChemBERT
47
+ from enzymetk.embedchem_rxnfp_step import RxnFP
48
+ from enzymetk.embedchem_selformer_step import SelFormer
49
+ from enzymetk.embedchem_unimol_step import UniMol
50
+
51
+ # Protein Embeddings
52
+ from enzymetk.embedprotein_esm_step import EmbedESM
53
+ from enzymetk.embedprotein_esm3_step import EmbedESM3
54
+
55
+ # Sequence Generation/Alignment
56
+ from enzymetk.generate_msa_step import ClustalOmega
57
+ from enzymetk.generate_tree_step import FastTree
58
+
59
+ # Protein Design
60
+ from enzymetk.inpaint_ligandMPNN_step import LigandMPNN
61
+
62
+ # Metagenomics
63
+ from enzymetk.metagenomics_porechop_trim_reads_step import PoreChop
64
+ from enzymetk.metagenomics_prokka_annotate_genes import Prokka
65
+
66
+ # Prediction
67
+ from enzymetk.predict_catalyticsite_step import ActiveSitePred
68
+
69
+ # Sequence Search
70
+ from enzymetk.sequence_search_blast import BLAST
71
+
72
+ # Similarity Search
73
+ from enzymetk.similarity_foldseek_step import FoldSeek
74
+ from enzymetk.similarity_mmseqs_step import MMseqs
75
+ from enzymetk.similarity_reaction_step import ReactionDist
76
+ from enzymetk.similarity_substrate_step import SubstrateDist
77
+
78
+ # Structure Search (aliased to avoid conflict with similarity_foldseek_step.FoldSeek)
79
+ from enzymetk.structure_search_foldseek import FoldSeek as StructureFoldSeek
80
+
81
+
82
+ __all__ = [
83
+ # Core
84
+ 'Step',
85
+ 'Pipeline',
86
+ 'Save',
87
+ # EC Annotation
88
+ 'CLEAN',
89
+ 'CREEP',
90
+ 'ProteInfer',
91
+ # Docking
92
+ 'Boltz',
93
+ 'Chai',
94
+ 'Vina',
95
+ # Chemical Embeddings
96
+ 'ChemBERT',
97
+ 'RxnFP',
98
+ 'SelFormer',
99
+ 'UniMol',
100
+ # Protein Embeddings
101
+ 'EmbedESM',
102
+ 'EmbedESM3',
103
+ # Sequence Generation/Alignment
104
+ 'ClustalOmega',
105
+ 'FastTree',
106
+ # Protein Design
107
+ 'LigandMPNN',
108
+ # Metagenomics
109
+ 'PoreChop',
110
+ 'Prokka',
111
+ # Prediction
112
+ 'ActiveSitePred',
113
+ # Sequence Search
114
+ 'BLAST',
115
+ # Similarity Search
116
+ 'FoldSeek',
117
+ 'MMseqs',
118
+ 'ReactionDist',
119
+ 'SubstrateDist',
120
+ # Structure Search
121
+ 'StructureFoldSeek',
122
+ ]
@@ -5,9 +5,12 @@ import subprocess
5
5
  import logging
6
6
  import numpy as np
7
7
  import os
8
+ from enzymetk.step import run_script
9
+ from pathlib import Path
8
10
 
9
11
  logger = logging.getLogger(__name__)
10
12
  logger.setLevel(logging.INFO)
13
+ SCRIPT_DIR = Path(__file__).parent.resolve()
11
14
 
12
15
  """
13
16
  import os
@@ -38,9 +41,14 @@ class CREEP(Step):
38
41
  self.args_extract = args_extract
39
42
  self.args_retrieval = args_retrieval
40
43
 
44
+ def install(self, env_args=None):
45
+ # Try to automatically install CREEP conda env
46
+ run_script('install_CREEP.sh', verbose=True)
47
+ self.CREEP_dir = SCRIPT_DIR.parent.resolve() / 'conda_envs' / 'CREEP'
48
+ self.CREEP_cache_dir = f'{self.CREEP_dir}/data/'
49
+
41
50
  def __execute(self, df: pd.DataFrame, tmp_dir: str):
42
- tmp_dir = '/disk1/ariane/vscode/degradeo/pipeline/tmp/'
43
- input_filename = f'{tmp_dir}/creepasjkdkajshdkja.csv'
51
+ input_filename = f'{tmp_dir}/input.csv'
44
52
  df.to_csv(input_filename, index=False)
45
53
  cmd = ['conda', 'run', '-n', self.env_name, 'python', f'{self.CREEP_dir}scripts/step_02_extract_CREEP.py', '--pretrained_folder',
46
54
  f'{self.CREEP_cache_dir}output/easy_split',
@@ -5,7 +5,10 @@ from multiprocessing.dummy import Pool as ThreadPool
5
5
  from tempfile import TemporaryDirectory
6
6
  import os
7
7
  import subprocess
8
+ from enzymetk.step import run_script
9
+ from pathlib import Path
8
10
 
11
+ SCRIPT_DIR = Path(__file__).parent.resolve()
9
12
 
10
13
  class ProteInfer(Step):
11
14
 
@@ -53,6 +56,12 @@ class ProteInfer(Step):
53
56
  self.ec3_filter = ec3_filter
54
57
  self.ec4_filter = ec4_filter
55
58
 
59
+ def install(self, env_args=None):
60
+ # Try to automatically install CREEP conda env
61
+ run_script('install_CREEP.sh', verbose=True)
62
+ self.CREEP_dir = SCRIPT_DIR.parent.resolve() / 'conda_envs' / 'CREEP'
63
+ self.CREEP_cache_dir = f'{self.CREEP_dir}/data/'
64
+
56
65
  def __execute(self, data: list) -> np.array:
57
66
  df, tmp_dir = data
58
67
  # Make sure in the directory of proteinfer
@@ -1,6 +1,5 @@
1
1
  from enzymetk.step import Step
2
2
  import pandas as pd
3
- from docko.boltz import run_boltz_affinity
4
3
  import logging
5
4
  import numpy as np
6
5
  from multiprocessing.dummy import Pool as ThreadPool
@@ -9,16 +8,40 @@ from multiprocessing.dummy import Pool as ThreadPool
9
8
  logger = logging.getLogger(__name__)
10
9
  logger.setLevel(logging.INFO)
11
10
 
11
+ try:
12
+ from docko.boltz import run_boltz_affinity
13
+ except ImportError as e:
14
+ print("Boltz: Needs docko package. Install with: pip install docko.")
15
+
12
16
 
13
17
  class Boltz(Step):
14
18
 
15
- def __init__(self, id_col: str, seq_col: str, substrate_col: str, intermediate_col: str, output_dir: str, num_threads: int):
19
+ def __init__(self, id_col: str, seq_col: str, substrate_col: str, intermediate_col: str, output_dir: str,
20
+ num_threads: 1, env_name = None, args=None):
21
+ super().__init__()
16
22
  self.id_col = id_col
17
23
  self.seq_col = seq_col
18
24
  self.substrate_col = substrate_col
19
25
  self.intermediate_col = intermediate_col
20
26
  self.output_dir = output_dir or None
21
27
  self.num_threads = num_threads or 1
28
+ self.conda = env_name
29
+ self.env_name = env_name
30
+ self.args = args
31
+
32
+ def install(self, env_args=None):
33
+ # e.g. env args could by python=='3.1.1.
34
+ self.install_venv(env_args)
35
+ # Now the specific
36
+ try:
37
+ cmd = [f'{self.env_name}/bin/pip', 'install', 'docko']
38
+ self.run(cmd)
39
+ except Exception as e:
40
+ cmd = [f'{self.env_name}/bin/pip3', 'install', 'docko']
41
+ self.run(cmd)
42
+ self.run(cmd)
43
+ # Now set the venv to be the location:
44
+ self.venv = f'{self.env_name}/bin/python'
22
45
 
23
46
  def __execute(self, df: pd.DataFrame) -> pd.DataFrame:
24
47
  output_filenames = []
@@ -28,11 +51,15 @@ class Boltz(Step):
28
51
  if not isinstance(substrate, str):
29
52
  substrate = ''
30
53
  print(run_id, seq, substrate)
31
- run_boltz_affinity(run_id, seq, substrate, self.output_dir, intermediate)
54
+ if self.args:
55
+ run_boltz_affinity(run_id, seq, substrate, self.output_dir, intermediate, self.args)
56
+ else:
57
+ run_boltz_affinity(run_id, seq, substrate, self.output_dir, intermediate)
32
58
  output_filenames.append(f'{self.output_dir}/{run_id}/')
33
59
  return output_filenames
34
60
 
35
61
  def execute(self, df: pd.DataFrame) -> pd.DataFrame:
62
+
36
63
  if self.output_dir:
37
64
  if self.num_threads > 1:
38
65
  pool = ThreadPool(self.num_threads)
@@ -1,9 +1,13 @@
1
1
  from enzymetk.step import Step
2
2
  import pandas as pd
3
- from docko.chai import run_chai
3
+
4
4
  import logging
5
5
  import numpy as np
6
6
 
7
+ try:
8
+ from docko.chai import run_chai
9
+ except ImportError as e:
10
+ print("Chai: Needs docko package. Install with: pip install docko.")
7
11
 
8
12
  logger = logging.getLogger(__name__)
9
13
  logger.setLevel(logging.INFO)
@@ -11,7 +15,9 @@ logger.setLevel(logging.INFO)
11
15
 
12
16
  class Chai(Step):
13
17
 
14
- def __init__(self, id_col: str, seq_col: str, substrate_col: str, cofactor_col: str, output_dir: str, num_threads: int):
18
+ def __init__(self, id_col: str, seq_col: str, substrate_col: str, cofactor_col: str, output_dir: str,
19
+ num_threads: 1, venv_name = 'enzymetk', env_name = None):
20
+ super().__init__()
15
21
  self.id_col = id_col
16
22
  self.seq_col = seq_col
17
23
  self.substrate_col = substrate_col
@@ -19,6 +25,21 @@ class Chai(Step):
19
25
  self.output_dir = output_dir or None
20
26
  self.num_threads = num_threads or 1
21
27
 
28
+ def install(self, env_args=None):
29
+ # e.g. env args could by python=='3.1.1.
30
+ self.install_venv(env_args)
31
+ # Now the specific
32
+ try:
33
+ cmd = [f'{self.env_name}/bin/pip', 'install', 'docko']
34
+ self.run(cmd)
35
+ except Exception as e:
36
+ cmd = [f'{self.env_name}/bin/pip3', 'install', 'docko']
37
+ self.run(cmd)
38
+ self.run(cmd)
39
+ # Now set the venv to be the location:
40
+ self.venv = f'{self.env_name}/bin/python'
41
+
42
+
22
43
  def __execute(self, df: pd.DataFrame, tmp_dir: str) -> pd.DataFrame:
23
44
  output_filenames = []
24
45
  for run_id, seq, substrate, cofactor in df[[self.id_col, self.seq_col, self.substrate_col, self.cofactor_col]].values:
@@ -1,12 +1,18 @@
1
1
  from enzymetk.step import Step
2
2
  import pandas as pd
3
- from docko.docko import *
3
+
4
4
  import logging
5
5
  import numpy as np
6
6
  import os
7
7
  from pathlib import Path
8
8
  from multiprocessing.dummy import Pool as ThreadPool
9
9
 
10
+
11
+ try:
12
+ from docko.docko import *
13
+ except ImportError as e:
14
+ print("Vina: Needs docko package. Install with: pip install docko.")
15
+
10
16
  logger = logging.getLogger(__name__)
11
17
  logger.setLevel(logging.INFO)
12
18
 
@@ -14,7 +20,8 @@ logger.setLevel(logging.INFO)
14
20
  class Vina(Step):
15
21
 
16
22
  def __init__(self, id_col: str, structure_col: str, sequence_col: str,
17
- substrate_col: str, substrate_name_col: str, active_site_col: str, output_dir: str, num_threads: int):
23
+ substrate_col: str, substrate_name_col: str, active_site_col: str, output_dir: str, num_threads: 1,
24
+ venv_name = 'enzymetk', env_name = None):
18
25
  print('Expects active site residues as a string separated by |. Zero indexed.')
19
26
  self.id_col = id_col
20
27
  self.structure_col = structure_col
@@ -25,6 +32,20 @@ class Vina(Step):
25
32
  self.output_dir = Path( output_dir) or None
26
33
  self.num_threads = num_threads or 1
27
34
 
35
+ def install(self, env_args=None):
36
+ # e.g. env args could by python=='3.1.1.
37
+ self.install_venv(env_args)
38
+ # Now the specific
39
+ try:
40
+ cmd = [f'{self.env_name}/bin/pip', 'install', 'docko']
41
+ self.run(cmd)
42
+ except Exception as e:
43
+ cmd = [f'{self.env_name}/bin/pip3', 'install', 'docko']
44
+ self.run(cmd)
45
+ self.run(cmd)
46
+ # Now set the venv to be the location:
47
+ self.venv = f'{self.env_name}/bin/python'
48
+
28
49
  def __execute(self, df: pd.DataFrame) -> pd.DataFrame:
29
50
  output_filenames = []
30
51
  # ToDo: update to create from sequence if the path doesn't exist.
@@ -16,7 +16,6 @@ class ChemBERT(Step):
16
16
  self.seq_len_limit = 500
17
17
  self.embedding_len = 768
18
18
 
19
-
20
19
  def __execute(self, data: list) -> np.array:
21
20
  results = []
22
21
  for v in data:
@@ -16,10 +16,28 @@ logger.setLevel(logging.INFO)
16
16
 
17
17
  class RxnFP(Step):
18
18
 
19
- def __init__(self, smiles_col: str, num_threads: int, env_name: str = 'rxnfp'):
19
+ def __init__(self, smiles_col: str, num_threads: 1,
20
+ env_name = 'rxnfp', venv_name = None):
21
+ super().__init__()
20
22
  self.value_col = smiles_col
21
23
  self.num_threads = num_threads or 1
24
+ self.conda = env_name
22
25
  self.env_name = env_name
26
+ self.venv = venv_name if venv_name else f'{env_name}/bin/python'
27
+
28
+ def install(self, env_args=['--python', '3.8']):
29
+ # e.g. env args could by python=='3.1.1.
30
+ self.install_conda(env_args=env_args)
31
+ # Now the specific
32
+ try:
33
+ cmd = [f'pip', 'install', 'rxnfp', 'rdkit=2020.03.3', 'tmap', 'numpy==1.23', 'sciutil']
34
+ self.run(cmd)
35
+ except Exception as e:
36
+ cmd = [f'pip', 'install', 'rxnfp', 'rdkit=2020.03.3', 'tmap', 'numpy==1.23', 'sciutil']
37
+ self.run(cmd)
38
+ self.run(cmd)
39
+ # Now set the venv to be the location:
40
+ self.conda = f'{self.env_name}'
23
41
 
24
42
  def __execute(self, df: pd.DataFrame, tmp_dir: str) -> pd.DataFrame:
25
43
  tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
@@ -27,7 +45,7 @@ class RxnFP(Step):
27
45
  output_filename = f'{tmp_dir}/rxnfp_{tmp_label}.pkl'
28
46
  input_filename = f'{tmp_dir}/input_{tmp_label}.csv'
29
47
  df.to_csv(input_filename, index=False)
30
- cmd = ['conda', 'run', '-n', self.env_name, 'python', Path(__file__).parent/'embedchem_rxnfp_run.py', '--out', output_filename,
48
+ cmd = ['python', Path(__file__).parent/'embedchem_rxnfp_run.py', '--out', output_filename,
31
49
  '--input', input_filename, '--label', self.value_col]
32
50
  self.run(cmd)
33
51
  # Might have an issue if the things are not correctly installed in the same dicrectory