enzymetk 0.0.1.tar.gz → 0.0.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {enzymetk-0.0.1 → enzymetk-0.0.2}/PKG-INFO +23 -17
  2. {enzymetk-0.0.1 → enzymetk-0.0.2}/README.md +22 -16
  3. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/__init__.py +1 -1
  4. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/predict_catalyticsite_run.py +1 -1
  5. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/sequence_search_blast.py +33 -4
  6. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/similarity_foldseek_step.py +27 -6
  7. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk.egg-info/PKG-INFO +23 -17
  8. {enzymetk-0.0.1 → enzymetk-0.0.2}/LICENSE +0 -0
  9. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/annotateEC_CLEAN_step.py +0 -0
  10. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/annotateEC_CREEP_step.py +0 -0
  11. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/annotateEC_proteinfer_step.py +0 -0
  12. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/dock_chai_step.py +0 -0
  13. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/dock_vina_step.py +0 -0
  14. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/embedchem_chemberta_step.py +0 -0
  15. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/embedchem_rxnfp_run.py +0 -0
  16. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/embedchem_rxnfp_step.py +0 -0
  17. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/embedchem_selformer_run.py +0 -0
  18. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/embedchem_selformer_step.py +0 -0
  19. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/embedchem_unimol_step.py +0 -0
  20. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/embedprotein_esm_step.py +0 -0
  21. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/esm-extract.py +0 -0
  22. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/filter_sequence_step.py +0 -0
  23. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/filter_structure_step.py +0 -0
  24. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/generate_msa_step.py +0 -0
  25. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/generate_oligopool_step.py +0 -0
  26. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/generate_tree_step.py +0 -0
  27. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/inpaint_ligandMPNN_step.py +0 -0
  28. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/main.py +0 -0
  29. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/metagenomics_porechop_trim_reads_step.py +0 -0
  30. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/metagenomics_prokka_annotate_genes.py +0 -0
  31. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/pipeline.py +0 -0
  32. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/predict_activity_step.py +0 -0
  33. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/predict_catalyticsite_step.py +0 -0
  34. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/reducedim_pca_run.py +0 -0
  35. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/reducedim_vae_run.py +0 -0
  36. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/reducedim_vae_step.py +0 -0
  37. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/save_step.py +0 -0
  38. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/similarity_mmseqs_step.py +0 -0
  39. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/similarity_reaction_step.py +0 -0
  40. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/similarity_substrate_step.py +0 -0
  41. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/step.py +0 -0
  42. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk.egg-info/SOURCES.txt +0 -0
  43. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk.egg-info/dependency_links.txt +0 -0
  44. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk.egg-info/entry_points.txt +0 -0
  45. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk.egg-info/requires.txt +0 -0
  46. {enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk.egg-info/top_level.txt +0 -0
  47. {enzymetk-0.0.1 → enzymetk-0.0.2}/setup.cfg +0 -0
  48. {enzymetk-0.0.1 → enzymetk-0.0.2}/setup.py +0 -0
{enzymetk-0.0.1 → enzymetk-0.0.2}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: enzymetk
- Version: 0.0.1
+ Version: 0.0.2
  Home-page: https://github.com/arianemora/enzyme-tk/
  Author: Ariane Mora
  Author-email: ariane.n.mora@gmail.com
@@ -47,22 +47,28 @@ Enzyme-tk is a collection of tools for enzyme engineering, setup as interoperabl

  ## Installation

+ ## Install base package to import modules
+
  ```bash
- source enzymetk/conda_envs/install_all.sh
+ pip install enzymetk
  ```

- ## Install subsets of enzyme-tk
+ ### Install only the specific requirements you need (recomended)

+ For this clone the repo and then install the requirements for the specific modules you use
  ```bash
  git clone git@github.com:ArianeMora/enzyme-tk.git
- python setup.py sdist bdist_wheel
- pip install dist/enzymetk-0.0.1.tar.gz
+ cd enzymetk/conda_envs/ # would recommend looking at thes
+ # e.g. to install all from within that folder you would do
+ source install_all.sh
  ```

  ## Usage

  If you have any issues at all just email me using my caltech email: `amora at caltech . edu`

+ This is a work-in progress! e.g. some tools (e.g. proteInfer and CLEAN) require extra data to be downloaded in order to run (like model weights.) I'm working on integrating these atm, buzz me if you need this!
+
  Here are some of the tools that have been implemented to be chained together as a pipeline:

  [mmseqs2](https://github.com/soedinglab/mmseqs2)
@@ -169,8 +175,8 @@ df << (Chai(id_col, seq_col, substrate_col, f'{output_dir}', num_threads) >> Sav
  ChemBERTa2 encodes reactions and SMILES strings into a vector space. Note this requires the base environment, i.e. `enzymetk` conda env.

  ```python
- from steps.embedchem_chemberta_step import ChemBERT
- from steps.save_step import Save
+ from enzymetk.embedchem_chemberta_step import ChemBERT
+ from enzymetk.save_step import Save

  output_dir = 'tmp/'
  num_threads = 1
@@ -180,7 +186,7 @@ substrate_col = 'Substrate'
  rows = [['P0DP23', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC'],
  ['P0DP24', 'MALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAAMALWMRLLPLLALLALWGPDPAAA', 'CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)OCC(CC)CCCC']]
  df = pd.DataFrame(rows, columns=[id_col, seq_col, substrate_col])
- df << (ChemBERT(id_col, substrate_col, num_threads) >> Save(f'{output_dir}chemberta.pkl'))
+ new_df = (df << (ChemBERT(id_col, substrate_col, num_threads) >> Save(f'{output_dir}chemberta.pkl')))
  ```

  ### CLEAN
@@ -206,11 +212,11 @@ df << (CLEAN(id_col, seq_col, clean_dir, num_threads=num_threads) >> Save(f'clea
  ```
  ### ClustalOmega

- ClustalOmega is a tool for aligning a set of sequences. This gets installed to the system (expecting a linux machine) and added to the bash path.
+ ClustalOmega is a tool for aligning a set of sequences. This gets installed to the system (expecting a linux machine) and added to the bash path. You need to have installed it first (check out the `conda_envs` directory in enzymetk.)

  ```python
- from steps.generate_msa_step import ClustalOmega
- from steps.save_step import Save
+ from enzymetk.generate_msa_step import ClustalOmega
+ from enzymetk.save_step import Save
  import pandas as pd

  id_col = 'Entry'
@@ -230,8 +236,8 @@ df << (ClustalOmega(id_col, seq_col) >> Save('tmp/clustalomega_test.pkl'))
  CREEP is a tool for predicting the EC number of a reaction. At the moment it only supports reactions to EC however we are extending this to other modalities.

  ```python
- from steps.annotateEC_CREEP_step import CREEP
- from steps.save_step import Save
+ from enzymetk.annotateEC_CREEP_step import CREEP
+ from enzymetk.save_step import Save
  import pandas as pd

  # CREEP expects you to have downloaded the data from the zotero page and put it in the data/CREEP folder
@@ -252,8 +258,8 @@ df << (CREEP(id_col, reaction_col, CREEP_cache_dir='/disk1/share/software/CREEP/
  EmbedESM is a tool for embedding a set of sequences using ESM2.

  ```python
- from steps.embedprotein_esm_step import EmbedESM
- from steps.save_step import Save
+ from enzymetk.embedprotein_esm_step import EmbedESM
+ from enzymetk.save_step import Save
  import pandas as pd

  id_col = 'Entry'
@@ -280,8 +286,8 @@ If you pass a database, you need to pass the path to the database.
  The columns expect a path to a pdb file i.e. the output from the `Chai` step.

  ```python
- from steps.similarity_foldseek_step import FoldSeek
- from steps.save_step import Save
+ from enzymetk.similarity_foldseek_step import FoldSeek
+ from enzymetk.save_step import Save
  import pandas as pd

  # id_col: str, seq_col: str, proteinfer_dir: str,
{enzymetk-0.0.1 → enzymetk-0.0.2}/README.md

The hunks in this file (@@ -4,22 +4,28 @@, @@ -126,8 +132,8 @@, @@ -137,7 +143,7 @@, @@ -163,11 +169,11 @@, @@ -187,8 +193,8 @@, @@ -209,8 +215,8 @@ and @@ -237,8 +243,8 @@) carry exactly the same content changes as the long-description hunks of PKG-INFO above: the new installation instructions (`pip install enzymetk` plus the `conda_envs` scripts), the work-in-progress note about extra data for proteInfer and CLEAN, the switch from `steps.` to `enzymetk.` imports in the ChemBERTa2, ClustalOmega, CREEP, EmbedESM and FoldSeek examples, and the assignment of the ChemBERT pipeline result to `new_df`.
{enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/__init__.py

@@ -22,7 +22,7 @@ Date: March 2025
  __title__ = 'enzymetk'
  __description__ = 'Toolkit for enzymes and what not'
  __url__ = 'https://github.com/arianemora/enzyme-tk/'
- __version__ = '0.0.1'
+ __version__ = '0.0.2'
  __author__ = 'Ariane Mora'
  __author_email__ = 'ariane.n.mora@gmail.com'
  __license__ = 'GPL3'
{enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/predict_catalyticsite_run.py

@@ -11,7 +11,7 @@ def run_as_inference(output_dir, fasta_file, squidly_dir, toks_per_batch, as_thr
      elif esm2_model == "esm2_t48_15B_UR50D":
          cr_model_as = cr_model_as or f"{squidly_dir}Squidly_CL_15B.pt"
          lstm_model_as = lstm_model_as or f"{squidly_dir}Squidly_LSTM_15B.pth"
-         as_threshold = 0.99
+         as_threshold = 0.97
      #esm2_model = "esm2_t48_15B_UR50D"
      # python /scratch/project/squid/code_modular/SQUIDLY_run_model_LSTM.py ${FILE} ${ESM2_MODEL} ${CR_MODEL_AS}
      # ${LSTM_MODEL_AS} ${OUT} --toks_per_batch ${TOKS_PER_BATCH} --AS_threshold ${AS_THRESHOLD} --monitor
{enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/sequence_search_blast.py

@@ -3,6 +3,7 @@ Step to run multiple sequence alignment with the Clustal Omega tool.
  ./clustalo -i /home/helen/degradeo/pipeline/helen_data/sequences_test_fasta.txt
  """
  from enzymetk.step import Step
+ import logging

  import pandas as pd
  import numpy as np
@@ -12,10 +13,17 @@ import os
  import subprocess
  import random
  import string
+ from tqdm import tqdm
+
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+

  class BLAST(Step):

-     def __init__(self, id_col: str, sequence_col: str, label_col=None, database=None, mode='blastp', args=None, tmp_dir=None):
+     def __init__(self, id_col: str, sequence_col: str, label_col=None, database=None,
+                  mode='blastp', args=None, tmp_dir=None, num_threads=1):
          self.id_col = id_col
          self.seq_col = sequence_col
          self.label_col = label_col # This is whether it is query or reference
@@ -23,6 +31,7 @@ class BLAST(Step):
          self.database = database
          self.args = args
          self.tmp_dir = tmp_dir
+         self.num_threads = num_threads
          if self.database is None and self.label_col is None:
              raise ValueError('Database is not set, you can pass a database that you have already created see diamond for more information or the sequences \
                  as part of your dataframe and pass the label column (this needs to have two values: reference and query) reference \
@@ -74,7 +83,27 @@ class BLAST(Step):
          return df

      def execute(self, df: pd.DataFrame) -> pd.DataFrame:
-         if self.tmp_dir is not None:
-             return self.__execute([df, self.tmp_dir])
          with TemporaryDirectory() as tmp_dir:
-             return self.__execute([df, tmp_dir])
+             tmp_dir = self.tmp_dir if self.tmp_dir is not None else tmp_dir
+             if self.num_threads > 1:
+                 output_filenames = []
+                 df_list = np.array_split(df, self.num_threads)
+                 for df_chunk in tqdm(df_list):
+                     try:
+                         output_filenames.append(self.__execute([df_chunk, tmp_dir]))
+                     except Exception as e:
+                         logger.error(f"Error in executing ESM2 model: {e}")
+                         continue
+                 df = pd.DataFrame()
+                 for sub_df in output_filenames:
+                     df = pd.concat([df, sub_df])
+                 return df
+
+             else:
+                 return self.__execute([df, tmp_dir])
+
+     # def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+     #     if self.tmp_dir is not None:
+     #         return self.__execute([df, self.tmp_dir])
+     #     with TemporaryDirectory() as tmp_dir:
+     #         return self.__execute([df, tmp_dir])
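The rewritten `execute` now always enters the `TemporaryDirectory` context (falling back to `self.tmp_dir` when one was supplied), and with `num_threads > 1` it splits the DataFrame via `np.array_split`, runs each chunk through the private `__execute`, logs and skips failing chunks, and concatenates the surviving results. A hypothetical usage sketch of the new `num_threads` argument follows; the column names, sequences and output path are illustrative, not taken from the package.

```python
import pandas as pd

from enzymetk.sequence_search_blast import BLAST
from enzymetk.save_step import Save

id_col, seq_col, label_col = 'Entry', 'Sequence', 'Label'  # hypothetical column names
rows = [['Q1', 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ', 'query'],
        ['R1', 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ', 'reference']]
df = pd.DataFrame(rows, columns=[id_col, seq_col, label_col])

# label_col must hold 'query' and 'reference' rows when no prebuilt database is passed;
# note that with num_threads > 1 each chunk is searched independently, so passing a
# prebuilt `database` is probably the safer companion to chunking.
blast = BLAST(id_col, seq_col, label_col=label_col, mode='blastp', num_threads=2)
hits = (df << (blast >> Save('tmp/blast_hits.pkl')))
```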
{enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk/similarity_foldseek_step.py

@@ -7,13 +7,17 @@ repo and then copy it out of it.
  """
  from enzymetk.step import Step

-
+ import logging
  import pandas as pd
  import numpy as np
  from tempfile import TemporaryDirectory
  import subprocess
  import random
  import string
+ from tqdm import tqdm
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)


  def process_clustering(filename, df, id_column_name):
@@ -34,13 +38,14 @@ def process_clustering(filename, df, id_column_name):

  class FoldSeek(Step):

      def __init__(self, id_column_name: str, query_column_name: str, reference_database: str, method='search', query_type='structures',
-                  args=None, tmp_dir: str = None):
+                  args=None, num_threads=1, tmp_dir: str = None):
          self.query_column_name = query_column_name
          self.id_column_name = id_column_name
          self.reference_database = reference_database # pdb should be the default
          self.tmp_dir = tmp_dir
          self.method = method
          self.args = args
+         self.num_threads = num_threads
          self.query_type = query_type
          if self.method not in ['search', 'cluster']:
              print('Method must be in "search" or "cluster". Will likely fail... ')
@@ -107,8 +112,24 @@ class FoldSeek(Step):
          return df

      def execute(self, df: pd.DataFrame) -> pd.DataFrame:
-         if self.tmp_dir is not None:
-             return self.__execute([df, self.tmp_dir])
          with TemporaryDirectory() as tmp_dir:
-             return self.__execute([df, tmp_dir])
-         return df
+             tmp_dir = self.tmp_dir if self.tmp_dir is not None else tmp_dir
+             if self.num_threads > 1:
+                 output_filenames = []
+                 df_list = np.array_split(df, self.num_threads)
+                 for df_chunk in tqdm(df_list):
+                     try:
+                         output_filenames.append(self.__execute([df_chunk, tmp_dir]))
+                     except Exception as e:
+                         logger.error(f"Error in executing ESM2 model: {e}")
+                         continue
+                 df = pd.DataFrame()
+                 print(output_filenames)
+                 for p in output_filenames:
+                     sub_df = pd.read_pickle(p)
+                     df = pd.concat([df, sub_df])
+                 return df
+
+             else:
+                 output_filename = self.__execute([df, tmp_dir])
+                 return pd.read_pickle(output_filename)
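FoldSeek gains the same chunking pattern, with one behavioural difference: here `__execute` is treated as returning a path to a pickled result, so both the chunked and single-threaded branches now end in `pd.read_pickle` rather than returning the DataFrame directly. A hypothetical sketch of the new signature follows; the column names, reference database path and output path are illustrative only.

```python
import pandas as pd

from enzymetk.similarity_foldseek_step import FoldSeek
from enzymetk.save_step import Save

id_col, structure_col = 'Entry', 'structure_path'  # hypothetical column names
df = pd.DataFrame([['P0DP23', 'tmp/P0DP23.pdb'],
                   ['P0DP24', 'tmp/P0DP24.pdb']],
                  columns=[id_col, structure_col])

# 'search' against a prebuilt reference database; num_threads splits the query rows
foldseek = FoldSeek(id_col, structure_col, reference_database='tmp/ref_db',
                    method='search', query_type='structures', num_threads=2)
hits = (df << (foldseek >> Save('tmp/foldseek_hits.pkl')))
```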
{enzymetk-0.0.1 → enzymetk-0.0.2}/enzymetk.egg-info/PKG-INFO

Identical to the PKG-INFO diff above: the version bump in the metadata header (@@ -1,6 +1,6 @@) and the same long-description changes (@@ -47,22 +47,28 @@ through @@ -280,8 +286,8 @@) covering the installation instructions, the work-in-progress note, the `steps.` to `enzymetk.` import changes and the `new_df` assignment in the ChemBERT example.