enzymetk 0.0.2__tar.gz → 0.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. {enzymetk-0.0.2 → enzymetk-0.0.7}/PKG-INFO +143 -10
  2. {enzymetk-0.0.2 → enzymetk-0.0.7}/README.md +132 -0
  3. enzymetk-0.0.7/enzymetk/__init__.py +122 -0
  4. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/annotateEC_CLEAN_step.py +2 -2
  5. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/annotateEC_CREEP_step.py +11 -3
  6. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/annotateEC_proteinfer_step.py +9 -0
  7. enzymetk-0.0.7/enzymetk/dock_boltz_step.py +73 -0
  8. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/dock_chai_step.py +27 -4
  9. enzymetk-0.0.7/enzymetk/dock_vina_step.py +117 -0
  10. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/embedchem_chemberta_step.py +0 -1
  11. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/embedchem_rxnfp_step.py +20 -2
  12. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/embedchem_unimol_step.py +40 -14
  13. enzymetk-0.0.7/enzymetk/embedprotein_esm3_step.py +74 -0
  14. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/embedprotein_esm_step.py +104 -9
  15. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/inpaint_ligandMPNN_step.py +3 -2
  16. enzymetk-0.0.7/enzymetk/main.py +251 -0
  17. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/predict_catalyticsite_step.py +22 -10
  18. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/sequence_search_blast.py +31 -13
  19. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/similarity_foldseek_step.py +3 -11
  20. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/similarity_mmseqs_step.py +2 -1
  21. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/similarity_reaction_step.py +24 -18
  22. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/similarity_substrate_step.py +21 -11
  23. enzymetk-0.0.7/enzymetk/step.py +134 -0
  24. enzymetk-0.0.7/enzymetk/structure_search_foldseek.py +88 -0
  25. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk.egg-info/PKG-INFO +143 -10
  26. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk.egg-info/SOURCES.txt +7 -7
  27. enzymetk-0.0.7/enzymetk.egg-info/requires.txt +10 -0
  28. {enzymetk-0.0.2 → enzymetk-0.0.7}/setup.py +10 -10
  29. enzymetk-0.0.7/tests/test_embedprotein_esm_step.py +363 -0
  30. enzymetk-0.0.7/tests/test_esm2.py +18 -0
  31. enzymetk-0.0.7/tests/test_foldseek.py +136 -0
  32. enzymetk-0.0.2/enzymetk/__init__.py +0 -56
  33. enzymetk-0.0.2/enzymetk/dock_vina_step.py +0 -63
  34. enzymetk-0.0.2/enzymetk/esm-extract.py +0 -140
  35. enzymetk-0.0.2/enzymetk/main.py +0 -37
  36. enzymetk-0.0.2/enzymetk/predict_activity_step.py +0 -0
  37. enzymetk-0.0.2/enzymetk/predict_catalyticsite_run.py +0 -47
  38. enzymetk-0.0.2/enzymetk/reducedim_pca_run.py +0 -67
  39. enzymetk-0.0.2/enzymetk/reducedim_vae_run.py +0 -67
  40. enzymetk-0.0.2/enzymetk/reducedim_vae_step.py +0 -12
  41. enzymetk-0.0.2/enzymetk/step.py +0 -60
  42. enzymetk-0.0.2/enzymetk.egg-info/requires.txt +0 -11
  43. {enzymetk-0.0.2 → enzymetk-0.0.7}/LICENSE +0 -0
  44. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/embedchem_rxnfp_run.py +0 -0
  45. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/embedchem_selformer_run.py +0 -0
  46. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/embedchem_selformer_step.py +0 -0
  47. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/filter_sequence_step.py +0 -0
  48. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/filter_structure_step.py +0 -0
  49. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/generate_msa_step.py +0 -0
  50. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/generate_oligopool_step.py +0 -0
  51. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/generate_tree_step.py +0 -0
  52. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/metagenomics_porechop_trim_reads_step.py +0 -0
  53. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/metagenomics_prokka_annotate_genes.py +0 -0
  54. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/pipeline.py +0 -0
  55. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk/save_step.py +0 -0
  56. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk.egg-info/dependency_links.txt +0 -0
  57. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk.egg-info/entry_points.txt +0 -0
  58. {enzymetk-0.0.2 → enzymetk-0.0.7}/enzymetk.egg-info/top_level.txt +0 -0
  59. {enzymetk-0.0.2 → enzymetk-0.0.7}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: enzymetk
- Version: 0.0.2
+ Version: 0.0.7
  Home-page: https://github.com/arianemora/enzyme-tk/
  Author: Ariane Mora
  Author-email: ariane.n.mora@gmail.com
@@ -13,22 +13,22 @@ Classifier: Intended Audience :: Science/Research
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
  Classifier: Natural Language :: English
  Classifier: Operating System :: OS Independent
- Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
- Requires-Python: >=3.8
+ Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: fair-esm
  Requires-Dist: scikit-learn
  Requires-Dist: numpy
  Requires-Dist: seaborn
  Requires-Dist: sciutil
- Requires-Dist: pandas==2.1.4
+ Requires-Dist: tqdm
+ Requires-Dist: pandas
  Requires-Dist: biopython
- Requires-Dist: sentence_transformers
- Requires-Dist: pubchempy
- Requires-Dist: pyfaidx
- Requires-Dist: spacy
+ Requires-Dist: transformers
+ Requires-Dist: torch
+ Requires-Dist: huggingface_hub
  Dynamic: author
  Dynamic: author-email
  Dynamic: classifier
@@ -37,6 +37,7 @@ Dynamic: description-content-type
  Dynamic: home-page
  Dynamic: keywords
  Dynamic: license
+ Dynamic: license-file
  Dynamic: project-url
  Dynamic: requires-dist
  Dynamic: requires-python
@@ -45,14 +46,100 @@ Dynamic: requires-python
 
  Enzyme-tk is a collection of tools for enzyme engineering, set up as interoperable modules that act on dataframes. These modules are designed to be imported into pipelines for a specific function. For this reason, `steps`, as each module is called (e.g. finding similar proteins with `BLAST` would be considered a step), are designed to be as light as possible. An example of a pipeline is the [annotate-e](https://github.com/ArianeMora/annotate-e) pipeline, which annotates a fasta with an ensemble of methods (each designated as an Enzyme-tk step).
 
+
+ **If you have any issues installing, let me know - this has been tested only on Linux/Ubuntu. Please post an issue!**
+
  ## Installation
 
  ## Install base package to import modules
 
  ```bash
+ conda create --name enzymetk python==3.12 -y
  pip install enzymetk
+ # Install torch for your specific cuda version
+ pip install torch torchvision #--index-url https://download.pytorch.org/whl/cu130
+ ```
+ ## If you're on the bleeding edge and going to use older models (e.g. chemBERTa2), you may need to run
+ ```
+ pip uninstall transformers -y
+ pip install "transformers<5"
+ ```
+
+ ## For each module, run install the first time you use it
+ This will install into a venv where possible, and into a conda env where the tool does not work in a venv.
+ See the specific tools for info.
+ ```
+ bm = BLAST(id_col, seq_col, label_col)
+ bm.install() # by default will create a venv or, if needed, a conda env
+ ```
+ Note that if you want to use your own environment, you can install it externally and override the installed venv or conda env, e.g.
+ ```
+ bm = BLAST(id_col, seq_col, label_col)
+ bm.conda = 'blast_env' # an already installed env on your computer
+ bm.venv = None # so it knows to use conda, i.e. forces it not to use venv
  ```
 
+ ## Modules requiring conda
+
+ - CREEP [not tested again]
+ - CLEAN [not tested again]
+ - ProteInfer [not tested again]
+
+ ## Modules able to run in venv
+ - BLAST [cpu, tested with both, see notebook]
+ - ChemBERTA [cpu, colab]
+ - Boltz
+ - Chai: conda install -c conda-forge pdbfixer
+
+ - esm2/3 [cpu, see notebook]
+ - foldseek [tested and works]
+ - ligandmpnn
+ - mmseqs [can get working...]
+ - msa []
+ - reaction_similarity [good, cpu]
+ - rxnfp [needs a specific python version so not easy in colab]; hence install is via `enzymetk install rxnfp`, which requires conda
+ - substrate_similarity [good, cpu]
+ - tree
+ - unimol [good, cpu]
+
+ Docko git@github.com:ArianeMora/docko.git
+ ValueError: CCD component ALA not found!
+ boltz predict boltz.fasta --use_msa_server --cache ./mol
+
+ srun -p gpu --qos=normal --gres=gpu:1 --pty --mem=64G --time=000:30:00 bash
+
+ pipelines: reads --> poreChop --> Flye --> Prokka --> Squidly --> Foldseek --> Boltz --> Chai
+ pipelines: seqs --> BLAST --> Proteinfer --> Foldseek --> MMseqs --> ClustalOmega --> FastTree
+ pipelines: reactions --> rxnFP --> selformer --> uniMol --> chemBERTa2 --> RDkit reaction similarity
+
+
+ | Module | Name | Description | Colab ipynb|
+ |------------------------------|---------------|-----------------------------------------------------------------------------------|------------|
+ | Metagenomics | PoreChop | Used to filter adapters for nanopore sequences in metagenomics pipeline. | y |
+ | Metagenomics | Flye | Used to assemble the metagenomes. | ? |
+ | Metagenomics | Prokka | Annotation of genes within the genome. | ? |
+ | Function prediction | Proteinfer | Annotation of genes to function (GO or EC class) using ML. | 33 |
+ | Function prediction | CLEAN | Annotation of genes to EC class using ML. | 11 |
+ | Function prediction | CREEP | Annotation of genes to EC class using ML. | 13 |
+ | Function prediction | Func-e | Annotation of genes to reaction using ML. | This study. |
+ | Function prediction | Squidly | Annotation of catalytic residues using ML. | 36 |
+ | Embedding generation | ESM2 & 3 | Conversion of amino acid sequence to a numerical embedding using a PLM. | 46,47 |
+ | Embedding generation | RxnFP | Conversion of reaction smiles to a numerical embedding using a language model. | 48 |
+ | Embedding generation | Selformer | Conversion of reaction selfies to a numerical embedding using a language model. | 49 |
+ | Embedding generation | Uni-mol | Conversion of molecule smiles to a numerical embedding using a language model. | 50 |
+ | Embedding generation | ChemBERTa2 | Conversion of reaction smiles to a numerical embedding using a language model. | 51 |
+ | Docking | Chai | Diffusion based folding of a protein and ligand. | 42 |
+ | Docking | Boltz | Diffusion based folding of a protein and ligand. | 52 |
+ | Similarity | Diamond | Sequence similarity calculation using basic local alignment search. | 53 |
+ | Similarity | Foldseek | Fast structure similarity search. | 54 |
+ | Similarity | MMseqs | Fast sequence clustering. | 55 |
+ | Docking | StructureZyme | Alignment and calculation of structure metrics. | 56 |
+ | Oligo design | Oligopoolio | Calculation of oligo fragments for gene assembly. | This study. |
+ | Sequencing | LevSeq | Sequence verification of protein variants. | 34 |
+ | MSA generation | ClustalOmega | Creation of multiple sequence alignments (MSA). | 57 |
+ | Phylogenetic tree generation | FastTree | Creation of multiple phylogenetic trees. | 58 |
+
+
  ### Install only the specific requirements you need (recommended)
 
  For this, clone the repo and then install the requirements for the specific modules you use.
@@ -71,6 +158,7 @@ This is a work-in progress! e.g. some tools (e.g. proteInfer and CLEAN) require
 
  Here are some of the tools that have been implemented to be chained together as a pipeline:
 
+ [boltz2](https://github.com/jwohlwend/boltz)
  [mmseqs2](https://github.com/soedinglab/mmseqs2)
  [foldseek](https://github.com/steineggerlab/foldseek)
  [diamond](https://github.com/bbuchfink/diamond)
@@ -89,6 +177,7 @@ Here are some of the tools that have been implemented to be chained together as
  [fasttree](https://morgannprice.github.io/fasttree/)
  [Porechop](https://github.com/rrwick/Porechop)
  [prokka](https://github.com/tseemann/prokka)
+
  ## Things to note
 
  All the tools use the conda env of `enzymetk` by default.
@@ -120,6 +209,12 @@ The steps are the main building blocks of the pipeline. They are responsible for
 
  BLAST is a tool for searching a database of sequences for similar sequences. Here you can either pass a database that you have already created, or pass the sequences as part of your dataframe along with the label column (this needs to have two values: reference and query): reference refers to sequences that you want to search against, and query refers to sequences that you want to search for.
 
+ Note you can install it in two ways; with a conda env via the command line:
+
+ ```
+ enzymetk install_diamond
+ ```
+
  ```python
  id_col = 'Entry'
  seq_col = 'Sequence'
@@ -148,6 +243,34 @@ df = pd.DataFrame(rows, columns=[id_col, seq_col])
  print(df)
  df << (ActiveSitePred(id_col, seq_col, squidly_dir, num_threads) >> Save('tmp/squidly_as_pred.pkl'))
 
+ ```
+ ### Boltz2
+
+ Boltz2 is a model for predicting structures. Note you need docko installed, as Boltz is run via that package.
+
+ Below is an example using Boltz with 4 threads and a cofactor (an intermediate in this case). Just set it to None for a single-substrate version.
+ ```
+ import sys
+ from enzymetk.dock_boltz_step import Boltz
+ from enzymetk.save_step import Save
+ import pandas as pd
+ import os
+ os.environ['MKL_THREADING_LAYER'] = 'GNU'
+
+ output_dir = 'tmp/'
+ num_threads = 4
+ id_col = 'Entry'
+ seq_col = 'Sequence'
+ substrate_col = 'Substrate'
+ intermediate_col = 'Intermediate'
+
+ rows = [['P0DP23_boltz_8999', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]'],
+ ['P0DP24_boltz_p1', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]'],
+ ['P0DP23_boltz_p2', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]'],
+ ['P0DP24_boltz_p3', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]'],
+ ['P0DP24_boltz_p4', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC', 'CC1=C(C2=CC3=C(C(=C([N-]3)C=C4C(=C(C(=N4)C=C5C(=C(C(=N5)C=C1[N-]2)C)C=C)C)C=C)C)CCC(=O)[O-])CCC(=O)[O-].[Fe]']]
+ df = pd.DataFrame(rows, columns=[id_col, seq_col, substrate_col, intermediate_col])
+ df << (Boltz(id_col, seq_col, substrate_col, intermediate_col, f'{output_dir}', num_threads) >> Save(f'{output_dir}test.pkl'))
  ```
 
  ### Chai
@@ -257,6 +380,16 @@ df << (CREEP(id_col, reaction_col, CREEP_cache_dir='/disk1/share/software/CREEP/
 
  EmbedESM is a tool for embedding a set of sequences using ESM2.
 
+ Either in your own conda env: `pip install fair-esm`, or you can run:
+
+ ```
+ id_col = 'Entry'
+ seq_col = 'Sequence'
+ label_col = 'ActiveSite'
+ esm = EmbedESM(id_col, seq_col, extraction_method='mean', tmp_dir='tmp', rep_num=36) # i.e. the representation layer you want, usually the last layer
+ esm.install() # And follow the instructions to activate the env
+ ```
+
  ```python
  from enzymetk.embedprotein_esm_step import EmbedESM
  from enzymetk.save_step import Save
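
The README/description content above introduces the `df << (Step(...) >> Save(...))` chaining pattern only in fragments, so a compact, self-contained sketch of a single-step pipeline is shown here. It assumes the `BLAST(id_col, seq_col, label_col)` signature and the `reference`/`query` label convention quoted in the diff; the IDs, sequences and output path are illustrative only.

```python
# Minimal sketch of the dataframe-chaining pattern described in the README above.
# Assumes the BLAST(id_col, seq_col, label_col) signature and the reference/query
# label convention quoted in the diff; IDs, sequences and paths are illustrative.
import pandas as pd

from enzymetk.save_step import Save
from enzymetk.sequence_search_blast import BLAST

id_col, seq_col, label_col = 'Entry', 'Sequence', 'Label'
df = pd.DataFrame(
    [['P0DP23', 'MALWMRLLPLLALLALWGPDPAAA', 'reference'],   # sequence to search against
     ['QUERY_1', 'MALWMRLLPLLALLALWGPDPAAA', 'query']],     # sequence to search for
    columns=[id_col, seq_col, label_col],
)

# Each step transforms the dataframe; Save writes the final result to disk.
df << (BLAST(id_col, seq_col, label_col) >> Save('tmp/blast_hits.pkl'))
```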
@@ -0,0 +1,122 @@
+ ###############################################################################
+ # #
+ # This program is free software: you can redistribute it and/or modify #
+ # it under the terms of the GNU General Public License as published by #
+ # the Free Software Foundation, either version 3 of the License, or #
+ # (at your option) any later version. #
+ # #
+ # This program is distributed in the hope that it will be useful, #
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of #
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
+ # GNU General Public License for more details. #
+ # #
+ # You should have received a copy of the GNU General Public License #
+ # along with this program. If not, see <http://www.gnu.org/licenses/>. #
+ # #
+ ###############################################################################
+
+ """
+ Author: Ariane Mora
+ Date: March 2025
+ """
+ __title__ = 'enzymetk'
+ __description__ = 'Toolkit for enzymes and what not'
+ __url__ = 'https://github.com/arianemora/enzyme-tk/'
+ __version__ = '0.0.7'
+ __author__ = 'Ariane Mora'
+ __author_email__ = 'ariane.n.mora@gmail.com'
+ __license__ = 'GPL3'
+
+
+ # Core classes
+ from enzymetk.step import Step, Pipeline
+ from enzymetk.save_step import Save
+
+ # EC Annotation
+ from enzymetk.annotateEC_CLEAN_step import CLEAN
+ from enzymetk.annotateEC_CREEP_step import CREEP
+ from enzymetk.annotateEC_proteinfer_step import ProteInfer
+
+ # Docking
+ from enzymetk.dock_boltz_step import Boltz
+ from enzymetk.dock_chai_step import Chai
+ from enzymetk.dock_vina_step import Vina
+
+ # Chemical Embeddings
+ from enzymetk.embedchem_chemberta_step import ChemBERT
+ from enzymetk.embedchem_rxnfp_step import RxnFP
+ from enzymetk.embedchem_selformer_step import SelFormer
+ from enzymetk.embedchem_unimol_step import UniMol
+
+ # Protein Embeddings
+ from enzymetk.embedprotein_esm_step import EmbedESM
+ from enzymetk.embedprotein_esm3_step import EmbedESM3
+
+ # Sequence Generation/Alignment
+ from enzymetk.generate_msa_step import ClustalOmega
+ from enzymetk.generate_tree_step import FastTree
+
+ # Protein Design
+ from enzymetk.inpaint_ligandMPNN_step import LigandMPNN
+
+ # Metagenomics
+ from enzymetk.metagenomics_porechop_trim_reads_step import PoreChop
+ from enzymetk.metagenomics_prokka_annotate_genes import Prokka
+
+ # Prediction
+ from enzymetk.predict_catalyticsite_step import ActiveSitePred
+
+ # Sequence Search
+ from enzymetk.sequence_search_blast import BLAST
+
+ # Similarity Search
+ from enzymetk.similarity_foldseek_step import FoldSeek
+ from enzymetk.similarity_mmseqs_step import MMseqs
+ from enzymetk.similarity_reaction_step import ReactionDist
+ from enzymetk.similarity_substrate_step import SubstrateDist
+
+ # Structure Search (aliased to avoid conflict with similarity_foldseek_step.FoldSeek)
+ from enzymetk.structure_search_foldseek import FoldSeek as StructureFoldSeek
+
+
+ __all__ = [
+ # Core
+ 'Step',
+ 'Pipeline',
+ 'Save',
+ # EC Annotation
+ 'CLEAN',
+ 'CREEP',
+ 'ProteInfer',
+ # Docking
+ 'Boltz',
+ 'Chai',
+ 'Vina',
+ # Chemical Embeddings
+ 'ChemBERT',
+ 'RxnFP',
+ 'SelFormer',
+ 'UniMol',
+ # Protein Embeddings
+ 'EmbedESM',
+ 'EmbedESM3',
+ # Sequence Generation/Alignment
+ 'ClustalOmega',
+ 'FastTree',
+ # Protein Design
+ 'LigandMPNN',
+ # Metagenomics
+ 'PoreChop',
+ 'Prokka',
+ # Prediction
+ 'ActiveSitePred',
+ # Sequence Search
+ 'BLAST',
+ # Similarity Search
+ 'FoldSeek',
+ 'MMseqs',
+ 'ReactionDist',
+ 'SubstrateDist',
+ # Structure Search
+ 'StructureFoldSeek',
+ ]
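
The new `__init__.py` above re-exports every step at the package root (with the structure-search FoldSeek aliased to `StructureFoldSeek`), so steps no longer have to be imported from their individual modules. A minimal sketch of the flattened import style, reusing the `EmbedESM` keyword arguments from the README example; the column names and output path are illustrative.

```python
# Sketch: with the 0.0.7 __init__.py, steps can be imported straight from the
# package root rather than from their individual modules. The EmbedESM keyword
# arguments mirror the README example; data and paths are illustrative.
import pandas as pd

from enzymetk import EmbedESM, Save

df = pd.DataFrame(
    [['P0DP23', 'MALWMRLLPLLALLALWGPDPAAA']],
    columns=['Entry', 'Sequence'],
)

esm = EmbedESM('Entry', 'Sequence', extraction_method='mean', tmp_dir='tmp', rep_num=36)
df << (esm >> Save('tmp/esm_embeddings.pkl'))
```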
@@ -116,7 +116,7 @@ class CLEAN(Step):
  print(output_filenames)
  for sub_df in output_filenames:
  df = pd.concat([df, sub_df])
- return df
+ return self.__filter_df(df)
  else:
- return self.__execute([df, tmp_dir])
+ return self.__filter_df(self.__execute([df, tmp_dir]))
  return df
@@ -5,9 +5,12 @@ import subprocess
  import logging
  import numpy as np
  import os
+ from enzymetk.step import run_script
+ from pathlib import Path
 
  logger = logging.getLogger(__name__)
  logger.setLevel(logging.INFO)
+ SCRIPT_DIR = Path(__file__).parent.resolve()
 
  """
  import os
@@ -38,9 +41,14 @@ class CREEP(Step):
  self.args_extract = args_extract
  self.args_retrieval = args_retrieval
 
- def __execute(self, df: pd.DataFrame, tmp_dir: str) -> pd.DataFrame:
- tmp_dir = '/disk1/ariane/vscode/degradeo/pipeline/tmp/'
- input_filename = f'{tmp_dir}/creepasjkdkajshdkja.csv'
+ def install(self, env_args=None):
+ # Try to automatically install CREEP conda env
+ run_script('install_CREEP.sh', verbose=True)
+ self.CREEP_dir = SCRIPT_DIR.parent.resolve() / 'conda_envs' / 'CREEP'
+ self.CREEP_cache_dir = f'{self.CREEP_dir}/data/'
+
+ def __execute(self, df: pd.DataFrame, tmp_dir: str):
+ input_filename = f'{tmp_dir}/input.csv'
  df.to_csv(input_filename, index=False)
  cmd = ['conda', 'run', '-n', self.env_name, 'python', f'{self.CREEP_dir}scripts/step_02_extract_CREEP.py', '--pretrained_folder',
  f'{self.CREEP_cache_dir}output/easy_split',
@@ -5,7 +5,10 @@ from multiprocessing.dummy import Pool as ThreadPool
  from tempfile import TemporaryDirectory
  import os
  import subprocess
+ from enzymetk.step import run_script
+ from pathlib import Path
 
+ SCRIPT_DIR = Path(__file__).parent.resolve()
 
  class ProteInfer(Step):
 
@@ -53,6 +56,12 @@ class ProteInfer(Step):
  self.ec3_filter = ec3_filter
  self.ec4_filter = ec4_filter
 
+ def install(self, env_args=None):
+ # Try to automatically install CREEP conda env
+ run_script('install_CREEP.sh', verbose=True)
+ self.CREEP_dir = SCRIPT_DIR.parent.resolve() / 'conda_envs' / 'CREEP'
+ self.CREEP_cache_dir = f'{self.CREEP_dir}/data/'
+
  def __execute(self, data: list) -> np.array:
  df, tmp_dir = data
  # Make sure in the directory of proteinfer
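
Both the CREEP and ProteInfer diffs above add an `install()` method that delegates to `run_script(...)` imported from `enzymetk.step`; that helper's implementation is not part of this diff. Purely as an illustration of the pattern (a hypothetical sketch, not the package's actual helper), such a function could shell out to an install script that ships alongside the module:

```python
# Hypothetical minimal run_script-style helper (the real enzymetk.step.run_script
# is not shown in this diff): locate a shell script shipped next to the module
# and run it, optionally streaming its output.
import subprocess
from pathlib import Path


def run_script(script_name: str, verbose: bool = False) -> int:
    """Run a bundled install script (e.g. install_CREEP.sh) and return its exit code."""
    script = Path(__file__).parent / script_name
    result = subprocess.run(
        ['bash', str(script)],
        capture_output=not verbose,  # stream output straight to the console when verbose
        text=True,
    )
    return result.returncode
```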
@@ -0,0 +1,73 @@
+ from enzymetk.step import Step
+ import pandas as pd
+ import logging
+ import numpy as np
+ from multiprocessing.dummy import Pool as ThreadPool
+
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+
+ try:
+ from docko.boltz import run_boltz_affinity
+ except ImportError as e:
+ print("Boltz: Needs docko package. Install with: pip install docko.")
+
+
+ class Boltz(Step):
+
+ def __init__(self, id_col: str, seq_col: str, substrate_col: str, intermediate_col: str, output_dir: str,
+ num_threads: 1, env_name = None, args=None):
+ super().__init__()
+ self.id_col = id_col
+ self.seq_col = seq_col
+ self.substrate_col = substrate_col
+ self.intermediate_col = intermediate_col
+ self.output_dir = output_dir or None
+ self.num_threads = num_threads or 1
+ self.conda = env_name
+ self.env_name = env_name
+ self.args = args
+
+ def install(self, env_args=None):
+ # e.g. env args could be python=='3.1.1.
+ self.install_venv(env_args)
+ # Now the specific
+ try:
+ cmd = [f'{self.env_name}/bin/pip', 'install', 'docko']
+ self.run(cmd)
+ except Exception as e:
+ cmd = [f'{self.env_name}/bin/pip3', 'install', 'docko']
+ self.run(cmd)
+ self.run(cmd)
+ # Now set the venv to be the location:
+ self.venv = f'{self.env_name}/bin/python'
+
+ def __execute(self, df: pd.DataFrame) -> pd.DataFrame:
+ output_filenames = []
+
+ for run_id, seq, substrate, intermediate in df[[self.id_col, self.seq_col, self.substrate_col, self.intermediate_col]].values:
+ # Might have an issue if the things are not correctly installed in the same directory
+ if not isinstance(substrate, str):
+ substrate = ''
+ print(run_id, seq, substrate)
+ if self.args:
+ run_boltz_affinity(run_id, seq, substrate, self.output_dir, intermediate, self.args)
+ else:
+ run_boltz_affinity(run_id, seq, substrate, self.output_dir, intermediate)
+ output_filenames.append(f'{self.output_dir}/{run_id}/')
+ return output_filenames
+
+ def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+
+ if self.output_dir:
+ if self.num_threads > 1:
+ pool = ThreadPool(self.num_threads)
+ df_list = np.array_split(df, self.num_threads)
+ results = pool.map(self.__execute, df_list)
+ else:
+ results = self.__execute(df)
+ df['output_dir'] = results
+ return df
+ else:
+ print('No output directory provided')
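
`Boltz.execute()` above fans work out by splitting the dataframe with `np.array_split` and mapping chunks over a thread pool. Below is a standalone sketch of that split-and-map pattern with a stand-in worker; flattening the per-chunk results before attaching them as a column is an assumption of this sketch, not a claim about the package's exact behaviour.

```python
# Standalone sketch of the split/ThreadPool pattern used by Boltz.execute above.
# The worker is a stand-in for the per-row docking call, and flattening the
# per-chunk results before attaching them as a column is an assumption of this
# sketch rather than a description of the package's exact code.
from multiprocessing.dummy import Pool as ThreadPool

import numpy as np
import pandas as pd


def process_chunk(chunk: pd.DataFrame) -> list:
    # Stand-in worker: return one output path per row in the chunk.
    return [f'tmp/{run_id}/' for run_id in chunk['Entry']]


df = pd.DataFrame({'Entry': ['A1', 'A2', 'A3', 'A4']})
num_threads = 2

with ThreadPool(num_threads) as pool:
    chunks = np.array_split(df, num_threads)        # split the dataframe into chunks
    per_chunk = pool.map(process_chunk, chunks)     # one result list per chunk

df['output_dir'] = [path for paths in per_chunk for path in paths]
print(df)
```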