enzymetk-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. enzymetk/__init__.py +56 -0
  2. enzymetk/annotateEC_CLEAN_step.py +122 -0
  3. enzymetk/annotateEC_CREEP_step.py +82 -0
  4. enzymetk/annotateEC_proteinfer_step.py +136 -0
  5. enzymetk/dock_chai_step.py +51 -0
  6. enzymetk/dock_vina_step.py +63 -0
  7. enzymetk/embedchem_chemberta_step.py +61 -0
  8. enzymetk/embedchem_rxnfp_run.py +28 -0
  9. enzymetk/embedchem_rxnfp_step.py +55 -0
  10. enzymetk/embedchem_selformer_run.py +28 -0
  11. enzymetk/embedchem_selformer_step.py +39 -0
  12. enzymetk/embedchem_unimol_step.py +57 -0
  13. enzymetk/embedprotein_esm_step.py +123 -0
  14. enzymetk/esm-extract.py +140 -0
  15. enzymetk/filter_sequence_step.py +0 -0
  16. enzymetk/filter_structure_step.py +0 -0
  17. enzymetk/generate_msa_step.py +61 -0
  18. enzymetk/generate_oligopool_step.py +0 -0
  19. enzymetk/generate_tree_step.py +74 -0
  20. enzymetk/inpaint_ligandMPNN_step.py +65 -0
  21. enzymetk/main.py +37 -0
  22. enzymetk/metagenomics_porechop_trim_reads_step.py +55 -0
  23. enzymetk/metagenomics_prokka_annotate_genes.py +59 -0
  24. enzymetk/pipeline.py +1 -0
  25. enzymetk/predict_activity_step.py +0 -0
  26. enzymetk/predict_catalyticsite_run.py +47 -0
  27. enzymetk/predict_catalyticsite_step.py +70 -0
  28. enzymetk/reducedim_pca_run.py +67 -0
  29. enzymetk/reducedim_vae_run.py +67 -0
  30. enzymetk/reducedim_vae_step.py +12 -0
  31. enzymetk/save_step.py +13 -0
  32. enzymetk/sequence_search_blast.py +80 -0
  33. enzymetk/similarity_foldseek_step.py +114 -0
  34. enzymetk/similarity_mmseqs_step.py +80 -0
  35. enzymetk/similarity_reaction_step.py +60 -0
  36. enzymetk/similarity_substrate_step.py +59 -0
  37. enzymetk/step.py +60 -0
  38. enzymetk-0.0.1.data/data/LICENSE +0 -0
  39. enzymetk-0.0.1.dist-info/LICENSE +0 -0
  40. enzymetk-0.0.1.dist-info/METADATA +370 -0
  41. enzymetk-0.0.1.dist-info/RECORD +44 -0
  42. enzymetk-0.0.1.dist-info/WHEEL +5 -0
  43. enzymetk-0.0.1.dist-info/entry_points.txt +2 -0
  44. enzymetk-0.0.1.dist-info/top_level.txt +1 -0
enzymetk/embedchem_selformer_run.py ADDED
@@ -0,0 +1,28 @@
+ import argparse
+ import os
+
+ def run_selformer(output_filename, input_filename, label, selformer_dir, model_file):
+     print(f'Running selformer on {input_filename} with label {label}')
+     # SELFormer expects its inputs under its own data/ directory, so copy the
+     # input in, run both scripts inside the SELFormer conda env, copy the
+     # embeddings back out, and clean up. Note: selformer_dir is assumed to end
+     # with a trailing slash.
+     os.chdir(selformer_dir)
+     os.system(f'cp {input_filename} {selformer_dir}data/{label}.txt')
+     os.system(f'conda run -n SELFormer_env python3 {selformer_dir}generate_selfies.py --smiles_dataset=data/{label}.txt --selfies_dataset=data/{label}.csv')
+     os.system(f'conda run -n SELFormer_env python3 {selformer_dir}produce_embeddings.py --selfies_dataset=data/{label}.csv --model_file={model_file} --embed_file=data/{label}_embedding.csv')
+     os.system(f'cp {selformer_dir}data/{label}_embedding.csv {output_filename}')
+     os.system(f'rm data/{label}.txt')
+     os.system(f'rm data/{label}.csv')
+     os.system(f'rm data/{label}_embedding.csv')
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Run SELFormer on a dataset")
+     parser.add_argument('-out', '--out', required=True, help='Path to the output embeddings file')
+     parser.add_argument('-input', '--input', type=str, required=True, help='Path to the input dataframe')
+     parser.add_argument('-label', '--label', type=str, required=True, help='Label used to name intermediate files')
+     parser.add_argument('-dir', '--dir', type=str, required=True, help='Path to the SELFormer directory')
+     parser.add_argument('-model', '--model', type=str, required=True, help='Path to the model file')
+     return parser.parse_args()
+
+ def main():
+     args = parse_args()
+     run_selformer(args.out, args.input, args.label, args.dir, args.model)
+
+ if __name__ == '__main__':
+     main()
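
In normal use this helper is launched by the SelFormer step in embedchem_selformer_step.py (next file), but it can also be driven directly; a hypothetical invocation, all paths being placeholders:

    import subprocess

    subprocess.run(['python', 'enzymetk/embedchem_selformer_run.py',
                    '--out', '/tmp/embeddings.csv',
                    '--input', '/tmp/smiles.tsv',
                    '--label', 'run1',
                    '--dir', '/opt/SELFormer/',
                    '--model', '/opt/SELFormer/model.ckpt'], check=True)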
enzymetk/embedchem_selformer_step.py ADDED
@@ -0,0 +1,39 @@
+ from enzymetk.step import Step
+ import pandas as pd
+ from tempfile import TemporaryDirectory
+ from pathlib import Path
+ import logging
+ import datetime
+
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+
+
+ class SelFormer(Step):
+
+     def __init__(self, value_col: str, id_col: str, selformer_dir: str, model_file: str):
+         self.value_col = value_col
+         self.id_col = id_col
+         self.selformer_dir = selformer_dir
+         self.model_file = model_file
+
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         sub_df = df[[self.id_col, self.value_col]].copy()
+         # Rename the columns to what SELFormer expects.
+         sub_df.columns = ['chembl_id', 'canonical_smiles']
+         with TemporaryDirectory() as tmp_dir:
+             now = datetime.datetime.now()
+             formatted_date = now.strftime("%Y%m%d%H%M%S")
+             label = f'selformer_{formatted_date}'
+             output_filename = f'{tmp_dir}/{label}.csv'
+             input_filename = f'{tmp_dir}/{label}.tsv'
+             sub_df.to_csv(input_filename, sep='\t', index=False)
+             # The helper script ships next to this module as embedchem_selformer_run.py.
+             cmd = ['python', str(Path(__file__).parent / 'embedchem_selformer_run.py'),
+                    '--out', output_filename, '--input', input_filename, '--label', label,
+                    '--dir', self.selformer_dir, '--model', self.model_file]
+             self.run(cmd)
+             # Read the result back before the temporary directory is cleaned up;
+             # note this replaces the input dataframe with the embeddings.
+             df = pd.read_csv(output_filename)
+
+         return df
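
A minimal usage sketch for this step; the SELFormer checkout and checkpoint paths are placeholders (note the trailing slash the run script assumes):

    import pandas as pd
    from enzymetk.embedchem_selformer_step import SelFormer

    df = pd.DataFrame({'id': ['mol1', 'mol2'], 'smiles': ['CCO', 'c1ccccc1']})
    step = SelFormer('smiles', 'id', '/opt/SELFormer/', '/opt/SELFormer/model.ckpt')
    embeddings = step.execute(df)  # returns the embeddings CSV as a dataframe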
enzymetk/embedchem_unimol_step.py ADDED
@@ -0,0 +1,57 @@
+ from enzymetk.step import Step
+ import pandas as pd
+ import logging
+ import numpy as np
+ from unimol_tools import UniMolRepr
+ from multiprocessing.dummy import Pool as ThreadPool
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+
+ # Requires: pip install unimol_tools
+
+
+ class UniMol(Step):
+
+     def __init__(self, smiles_col: str, unimol_model='unimolv2', unimol_size='164m', num_threads=1):
+         self.smiles_col = smiles_col
+         self.num_threads = num_threads
+         # Single-SMILES Uni-Mol representation model.
+         self.clf = UniMolRepr(data_type='molecule',
+                               remove_hs=False,
+                               model_name=unimol_model or 'unimolv2',  # available: unimolv1, unimolv2
+                               model_size=unimol_size or '164m',  # only used when model_name is unimolv2; available: 84m, 164m, 310m, 570m, 1.1B
+                               )
+
+     def __execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         smiles_list = list(df[self.smiles_col].values)
+         reprs = []
+         for smiles in smiles_list:
+             try:
+                 unimol_repr = self.clf.get_repr([smiles], return_atomic_reprs=True)
+                 reprs.append(unimol_repr['cls_repr'])
+             except Exception as e:
+                 logger.warning(f"Error embedding SMILES {smiles}: {e}")
+                 reprs.append(None)
+         df['unimol_repr'] = reprs
+         return df
+
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         if self.num_threads > 1:
+             # Split the dataframe into chunks and embed them in a thread pool.
+             df_list = np.array_split(df, self.num_threads)
+             with ThreadPool(self.num_threads) as pool:
+                 results = pool.map(self.__execute, df_list)
+             return pd.concat(results)
+
+         return self.__execute(df)
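
A minimal usage sketch (assumes unimol_tools is installed and can fetch its pretrained weights):

    import pandas as pd
    from enzymetk.embedchem_unimol_step import UniMol

    df = pd.DataFrame({'smiles': ['CCO', 'c1ccccc1']})
    df = UniMol('smiles', unimol_model='unimolv2', unimol_size='164m').execute(df)
    # df now carries a 'unimol_repr' column (None for molecules that failed to embed)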
enzymetk/embedprotein_esm_step.py ADDED
@@ -0,0 +1,123 @@
+ from enzymetk.step import Step
+ import pandas as pd
+ from tempfile import TemporaryDirectory
+ from pathlib import Path
+ import numpy as np
+ from tqdm import tqdm
+ import torch
+ import os
+
+
+ # First run this: nohup python esm-extract.py esm2_t33_650M_UR50D /disk1/ariane/vscode/degradeo/data/DEHP/uniprot/EC3.1.1_training.fasta /disk1/ariane/vscode/degradeo/data/DEHP/uniprot/encodings --include per_tok &
+ def extract_active_site_embedding(df, id_column, residue_columns, encoding_dir, rep_num=33):
+     """ Expects that the entries of the active site df are saved as the filenames in the encoding dir. """
+     combined_tensors = []
+     mean_tensors = []
+     count_fail = 0
+     count_success = 0
+     for entry, residues in tqdm(df[[id_column, residue_columns]].values):
+         try:
+             file = Path(encoding_dir + f'/{entry}.pt')
+             if residues is not None and residues != 'None':
+                 try:
+                     residues = [int(r) for r in residues.split('|')]
+                 except ValueError:
+                     residues = []
+             else:
+                 residues = []
+             embedding_file = torch.load(file)
+             # The representation layer is model-dependent: 33 for esm2_t33_650M_UR50D,
+             # 36 for the medium ESM2 model, etc.
+             tensor = embedding_file['representations'][rep_num]
+             tensors = []
+             for residue in residues:
+                 tensors.append(np.asarray(tensor[residue]).astype(np.float32))
+             # Only append once everything above has succeeded, so the two lists stay in sync.
+             mean_tensors.append(np.mean(np.asarray(tensor).astype(np.float32), axis=0))
+             combined_tensors.append(tensors)
+             count_success += 1
+         except Exception as e:
+             print(f'Error loading file {file}: {e}')
+             count_fail += 1
+             mean_tensors.append(None)
+             combined_tensors.append(None)
+     # Here is where you do something with the combined tensors
+     df['active_embedding'] = combined_tensors
+     df['esm_embedding'] = mean_tensors
+     print(count_success, count_fail, count_fail + count_success)
+     return df
+
+ def extract_mean_embedding(df, id_column, encoding_dir, rep_num=33):
+     """ Expects that the entries of the df are saved as the filenames in the encoding dir. """
+     tensors = []
+     count_fail = 0
+     count_success = 0
+     for entry in tqdm(df[id_column].values):
+         try:
+             file = Path(os.path.join(encoding_dir, f'{entry}.pt'))
+             embedding_file = torch.load(file)
+             # Layer index is model-dependent (see note above); 33 for esm2_t33_650M_UR50D.
+             tensor = embedding_file['representations'][rep_num]
+             t = np.mean(np.asarray(tensor).astype(np.float32), axis=0)
+             tensors.append(t)
+             count_success += 1
+         except Exception as e:
+             print(f'Error loading file {file}: {e}')
+             count_fail += 1
+             tensors.append(None)
+
+     df['embedding'] = tensors
+     print(count_success, count_fail, count_fail + count_success)
+     return df
+
+ class EmbedESM(Step):
+
+     def __init__(self, id_col: str, seq_col: str, model='esm2_t33_650M_UR50D', extraction_method='mean',
+                  active_site_col: str = None, num_threads=1, tmp_dir: str = None, env_name: str = 'enzymetk'):
+         self.seq_col = seq_col
+         self.id_col = id_col
+         self.active_site_col = active_site_col
+         self.model = model
+         self.num_threads = num_threads or 1
+         self.extraction_method = extraction_method
+         self.tmp_dir = tmp_dir
+         self.env_name = env_name
+
+     def __execute(self, df: pd.DataFrame, tmp_dir: str) -> pd.DataFrame:
+         input_filename = f'{tmp_dir}/input.fasta'
+         # Skip entries that already have an embedding file in tmp_dir.
+         files = os.listdir(tmp_dir)
+         done_entries = set([f.split('.')[0] for f in files if f.endswith('.pt')])
+         # Write the FASTA file that is the input for esm-extract.py.
+         with open(input_filename, 'w+') as fout:
+             for entry, seq in df[[self.id_col, self.seq_col]].values:
+                 if entry not in done_entries:
+                     fout.write(f'>{entry.strip()}\n{seq.strip()}\n')
+         # Might have an issue if things are not correctly installed in the same directory.
+         cmd = ['conda', 'run', '-n', self.env_name, 'python', str(Path(__file__).parent / 'esm-extract.py'),
+                self.model, input_filename, tmp_dir, '--include', 'per_tok']
+         self.run(cmd)
+         if self.extraction_method == 'mean':
+             df = extract_mean_embedding(df, self.id_col, tmp_dir)
+         elif self.extraction_method == 'active_site':
+             if self.active_site_col is None:
+                 raise ValueError('active_site_col must be provided if extraction_method is active_site')
+             df = extract_active_site_embedding(df, self.id_col, self.active_site_col, tmp_dir)
+
+         return df
+
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         if self.tmp_dir is None:
+             with TemporaryDirectory() as tmp_dir:
+                 if self.num_threads > 1:
+                     # Note: the chunks are processed sequentially here; splitting only
+                     # bounds the size of each FASTA file handed to esm-extract.py.
+                     df_list = np.array_split(df, self.num_threads)
+                     dfs = [self.__execute(df_chunk, tmp_dir) for df_chunk in tqdm(df_list)]
+                     return pd.concat(dfs)
+                 return self.__execute(df, tmp_dir)
+         return self.__execute(df, self.tmp_dir)
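
A minimal usage sketch for mean-pooled embeddings; the sequence is illustrative and env_name should be a conda env where the esm package is installed:

    import pandas as pd
    from enzymetk.embedprotein_esm_step import EmbedESM

    df = pd.DataFrame({'id': ['seq1'], 'seq': ['MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ']})
    step = EmbedESM('id', 'seq', model='esm2_t33_650M_UR50D', extraction_method='mean')
    df = step.execute(df)  # adds an 'embedding' column (mean over the layer-33 token representations)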
enzymetk/esm-extract.py ADDED
@@ -0,0 +1,140 @@
+ #!/usr/bin/env python3 -u
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import argparse
+ import pathlib
+
+ import torch
+
+ from esm import Alphabet, FastaBatchedDataset, ProteinBertModel, pretrained, MSATransformer
+
+
+ def create_parser():
+     parser = argparse.ArgumentParser(
+         description="Extract per-token representations and model outputs for sequences in a FASTA file"  # noqa
+     )
+
+     parser.add_argument(
+         "model_location",
+         type=str,
+         help="PyTorch model file OR name of pretrained model to download (see README for models)",
+     )
+     parser.add_argument(
+         "fasta_file",
+         type=pathlib.Path,
+         help="FASTA file on which to extract representations",
+     )
+     parser.add_argument(
+         "output_dir",
+         type=pathlib.Path,
+         help="output directory for extracted representations",
+     )
+
+     parser.add_argument("--toks_per_batch", type=int, default=4096, help="maximum batch size")
+     parser.add_argument(
+         "--repr_layers",
+         type=int,
+         default=[-1],
+         nargs="+",
+         help="layers indices from which to extract representations (0 to num_layers, inclusive)",
+     )
+     parser.add_argument(
+         "--include",
+         type=str,
+         nargs="+",
+         choices=["mean", "per_tok", "bos", "contacts"],
+         help="specify which representations to return",
+         required=True,
+     )
+     parser.add_argument(
+         "--truncation_seq_length",
+         type=int,
+         default=1022,
+         help="truncate sequences longer than the given value",
+     )
+
+     parser.add_argument("--nogpu", action="store_true", help="Do not use GPU even if available")
+     return parser
+
+
+ def run(args):
+     model, alphabet = pretrained.load_model_and_alphabet(args.model_location)
+     model.eval()
+     if isinstance(model, MSATransformer):
+         raise ValueError(
+             "This script currently does not handle models with MSA input (MSA Transformer)."
+         )
+     if torch.cuda.is_available() and not args.nogpu:
+         model = model.cuda()
+         print("Transferred model to GPU")
+
+     dataset = FastaBatchedDataset.from_file(args.fasta_file)
+     batches = dataset.get_batch_indices(args.toks_per_batch, extra_toks_per_seq=1)
+     data_loader = torch.utils.data.DataLoader(
+         dataset, collate_fn=alphabet.get_batch_converter(args.truncation_seq_length), batch_sampler=batches
+     )
+     print(f"Read {args.fasta_file} with {len(dataset)} sequences")
+
+     args.output_dir.mkdir(parents=True, exist_ok=True)
+     return_contacts = "contacts" in args.include
+
+     assert all(-(model.num_layers + 1) <= i <= model.num_layers for i in args.repr_layers)
+     repr_layers = [(i + model.num_layers + 1) % (model.num_layers + 1) for i in args.repr_layers]
+
+     with torch.no_grad():
+         for batch_idx, (labels, strs, toks) in enumerate(data_loader):
+             print(
+                 f"Processing {batch_idx + 1} of {len(batches)} batches ({toks.size(0)} sequences)"
+             )
+             if torch.cuda.is_available() and not args.nogpu:
+                 toks = toks.to(device="cuda", non_blocking=True)
+
+             out = model(toks, repr_layers=repr_layers, return_contacts=return_contacts)
+
+             logits = out["logits"].to(device="cpu")
+             representations = {
+                 layer: t.to(device="cpu") for layer, t in out["representations"].items()
+             }
+             if return_contacts:
+                 contacts = out["contacts"].to(device="cpu")
+
+             for i, label in enumerate(labels):
+                 args.output_file = args.output_dir / f"{label}.pt"
+                 args.output_file.parent.mkdir(parents=True, exist_ok=True)
+                 result = {"label": label}
+                 truncate_len = min(args.truncation_seq_length, len(strs[i]))
+                 # Call clone on tensors to ensure tensors are not views into a larger representation
+                 # See https://github.com/pytorch/pytorch/issues/1995
+                 if "per_tok" in args.include:
+                     result["representations"] = {
+                         layer: t[i, 1 : truncate_len + 1].clone()
+                         for layer, t in representations.items()
+                     }
+                 if "mean" in args.include:
+                     result["mean_representations"] = {
+                         layer: t[i, 1 : truncate_len + 1].mean(0).clone()
+                         for layer, t in representations.items()
+                     }
+                 if "bos" in args.include:
+                     result["bos_representations"] = {
+                         layer: t[i, 0].clone() for layer, t in representations.items()
+                     }
+                 if return_contacts:
+                     result["contacts"] = contacts[i, : truncate_len, : truncate_len].clone()
+
+                 torch.save(
+                     result,
+                     args.output_file,
+                 )
+
+
+ def main():
+     parser = create_parser()
+     args = parser.parse_args()
+     run(args)
+
+
+ if __name__ == "__main__":
+     main()
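
The saved .pt files can also be inspected directly; a quick sketch (the entry id is a placeholder, and 33 is the final layer of esm2_t33_650M_UR50D):

    import torch

    result = torch.load('encodings/P12345.pt')
    per_tok = result['representations'][33]  # (seq_len, hidden_dim) tensor, saved by --include per_tok
    mean_embedding = per_tok.mean(0)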
enzymetk/filter_sequence_step.py ADDED
File without changes
enzymetk/filter_structure_step.py ADDED
File without changes
enzymetk/generate_msa_step.py ADDED
@@ -0,0 +1,61 @@
+ """
+ Step to run multiple sequence alignment with the Clustal Omega tool, e.g.:
+ ./clustalo -i /home/helen/degradeo/pipeline/helen_data/sequences_test_fasta.txt
+ """
+ from enzymetk.step import Step
+ import pandas as pd
+ from tempfile import TemporaryDirectory
+ import os
+ import subprocess
+ import random
+ import string
+
+ class ClustalOmega(Step):
+
+     def __init__(self, id_col: str, seq_col: str, tmp_dir: str = None):
+         self.seq_col = seq_col
+         self.id_col = id_col
+         self.tmp_dir = tmp_dir
+
+     def __execute(self, data: list) -> pd.DataFrame:
+         df, tmp_dir = data
+         tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
+         fasta_file = os.path.join(tmp_dir, 'sequences.fasta')
+         output_file = os.path.join(tmp_dir, f"{tmp_label}.aln")
+
+         # Turn the dataframe into a FASTA file
+         with open(fasta_file, 'w') as f:
+             for seq_id, seq in df[[self.id_col, self.seq_col]].values:
+                 f.write(f">{seq_id}\n{seq}\n")
+
+         # Run Clustal Omega on the generated FASTA file
+         subprocess.run(['clustalo', '-i', fasta_file, '-o', output_file], check=True)
+
+         # Read the aligned FASTA output back into a dict of id -> aligned sequence
+         sequences = {}
+         with open(output_file, 'r') as f:
+             current_id = None
+             for line in f:
+                 line = line.strip()  # remove leading/trailing whitespace and newlines
+                 if line.startswith(">"):
+                     current_id = line[1:]  # header line: extract the ID without ">"
+                     sequences[current_id] = ""  # initialise an empty string for this ID
+                 else:
+                     sequences[current_id] += line  # sequence line: append to the current ID
+
+         # Convert the sequences dictionary into a DataFrame and merge it back in
+         df_aligned = pd.DataFrame(list(sequences.items()), columns=[self.id_col, 'aligned_sequence'])
+         df = pd.merge(df, df_aligned, on=self.id_col, how='left')
+
+         return df
+
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         if self.tmp_dir is not None:
+             return self.__execute([df, self.tmp_dir])
+         with TemporaryDirectory() as tmp_dir:
+             return self.__execute([df, tmp_dir])
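
A minimal usage sketch (assumes the clustalo binary is on PATH):

    import pandas as pd
    from enzymetk.generate_msa_step import ClustalOmega

    df = pd.DataFrame({'id': ['seq1', 'seq2'],
                       'seq': ['MKTAYIAKQR', 'MKTAYIAKQRQISF']})
    df = ClustalOmega('id', 'seq').execute(df)  # adds an 'aligned_sequence' column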
enzymetk/generate_oligopool_step.py ADDED
File without changes
enzymetk/generate_tree_step.py ADDED
@@ -0,0 +1,74 @@
+ # /home/ikumi/degradeo/software/FastTree -gtr -nt /home/ikumi/degradeo/pipeline/ikumi_data/Q04457_esterase-2.msa > /home/ikumi/degradeo/pipeline/ikumi_data/output_tree.tree
+ # /home/ikumi/degradeo/software/FastTree -wag /home/ikumi/degradeo/pipeline/ikumi_data/Q04457_esterase-2.msa > /home/ikumi/degradeo/pipeline/ikumi_data/output_tree.tree
+ """
+ Step to build a phylogenetic tree with FastTree (run with the WAG model on an
+ MSA written from the input dataframe), following the example commands above.
+ """
+ from enzymetk.step import Step
+ import pandas as pd
+ import numpy as np
+ from multiprocessing.dummy import Pool as ThreadPool
+ from tempfile import TemporaryDirectory
+ import os
+ import subprocess
+ import random
+ import string
+
+ class FastTree(Step):
+     def __init__(self, fasttree_dir: str, id_col: str, seq_col: str, csv_file: str, output_dir: str):
+         self.fasttree_dir = fasttree_dir
+         self.id_col = id_col
+         self.seq_col = seq_col
+         self.csv_file = csv_file  # currently unused
+         self.num_threads = 1
+         self.output_dir = output_dir
+
+     def create_alignment_file(self, df: pd.DataFrame) -> str:
+         print(f"Creating MSA file from {len(df)} sequences")
+
+         # Create the MSA file in the output directory. Note: the sequences are
+         # written as given, so they should already be aligned (e.g. with the
+         # ClustalOmega step) before this step runs.
+         msa_file = os.path.join(self.output_dir, 'alignment.msa')
+         with open(msa_file, 'w') as fout:
+             for entry, seq in df[[self.id_col, self.seq_col]].values:
+                 fout.write(f">{entry.strip()}\n{seq.strip()}\n")
+
+         print(f"Created MSA file at: {msa_file}")
+         return msa_file
+
+     def __execute(self, data: list) -> pd.DataFrame:
+         df, tmp_dir = data
+         tmp_label = ''.join(random.choices(string.ascii_letters + string.digits, k=10))
+         # Get the MSA file
+         msa_file = self.create_alignment_file(df)
+
+         fasttree_executable = os.path.join(self.fasttree_dir, 'FastTree')
+         output_tree_file = os.path.join(tmp_dir, f'{tmp_label}.tree')
+
+         # Run FastTree with the WAG model; its stdout (the newick tree) goes to a file
+         with open(output_tree_file, 'w') as outfile:
+             subprocess.run([fasttree_executable, '-wag', msa_file], stdout=outfile, check=True)
+
+         # The returned dataframe holds the newick string read back from the tree file
+         df = pd.read_csv(output_tree_file, header=None, sep='\t')
+
+         print(df.head())
+
+         return df
+
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         with TemporaryDirectory() as tmp_dir:
+             if self.num_threads > 1:
+                 df_list = np.array_split(df, self.num_threads)
+                 data = [[df_chunk, tmp_dir] for df_chunk in df_list]
+                 with ThreadPool(self.num_threads) as pool:
+                     results = pool.map(self.__execute, data)
+                 return pd.concat(results)
+             return self.__execute([df, tmp_dir])
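
A minimal usage sketch (assumes a FastTree binary inside fasttree_dir, pre-aligned sequences, and a writable output_dir; csv_file is currently unused, so None is passed):

    from enzymetk.generate_tree_step import FastTree

    step = FastTree('/opt/fasttree/', 'id', 'aligned_sequence', None, '/tmp/trees')
    tree_df = step.execute(aligned_df)  # one-cell dataframe holding the newick string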
enzymetk/inpaint_ligandMPNN_step.py ADDED
@@ -0,0 +1,65 @@
+ # Example: python run.py --seed 111 --pdb_path "./test_outputs/QHH_0.pdb" --fixed_residues "A19 A20 A21 A59 A60 A61 A90 A91 A92" --checkpoint_path_sc "./model_params/ligandmpnn_sc_v_32_002_16.pt" --out_folder "./outputs/QHH"
+ """
+ Step to inpaint protein sequences with LigandMPNN. The step changes into the
+ LigandMPNN checkout and runs its run.py on each PDB file via conda, following
+ the example command above.
+ """
+ from enzymetk.step import Step
+ import pandas as pd
+ import numpy as np
+ from tempfile import TemporaryDirectory
+ import subprocess
+ import logging
+ import os
+
+ class LigandMPNN(Step):
+
+     def __init__(self, pdb_column_name: str, ligand_mpnn_dir: str, output_dir: str, tmp_dir: str = None,
+                  args=None, num_threads: int = 1, env_name: str = 'ligandmpnn_env'):
+         self.pdb_column_name = pdb_column_name
+         self.ligand_mpnn_dir = ligand_mpnn_dir
+         self.output_dir = output_dir
+         self.tmp_dir = tmp_dir
+         self.args = args
+         self.num_threads = num_threads
+         self.env_name = env_name
+         self.logger = logging.getLogger(__name__)
+
+     def __execute(self, data: list) -> pd.DataFrame:
+         df, tmp_dir = data
+         output_filenames = []
+         # LigandMPNN has to be run from inside its own directory
+         os.chdir(self.ligand_mpnn_dir)
+
+         for pdb_file in df[self.pdb_column_name].values:
+             cmd = ['conda', 'run', '-n', self.env_name, 'python3', f'{self.ligand_mpnn_dir}run.py',
+                    '--pdb_path', pdb_file, '--out_folder', f'{self.output_dir}']
+             if self.args is not None:
+                 cmd.extend(self.args)
+             result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+             if result.stderr:
+                 self.logger.error(result.stderr)
+                 # Keep the list aligned with the dataframe rows
+                 output_filenames.append(None)
+             else:
+                 output_filenames.append(f'{self.output_dir}{pdb_file.split("/")[-1].split(".")[0]}')
+                 self.logger.info(result.stdout)
+         df['inpainted_pdb'] = output_filenames
+         return df
+
+     def execute(self, df: pd.DataFrame) -> pd.DataFrame:
+         if self.tmp_dir is not None:
+             return self.__execute([df, self.tmp_dir])
+         with TemporaryDirectory() as tmp_dir:
+             if self.num_threads > 1:
+                 results = []
+                 df_list = np.array_split(df, self.num_threads)
+                 for df_chunk in df_list:
+                     results.append(self.__execute([df_chunk, tmp_dir]))
+                 return pd.concat(results)
+
+             return self.__execute([df, tmp_dir])
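
A minimal usage sketch (assumes a LigandMPNN checkout with trailing slashes on the directory paths and a conda env named ligandmpnn_env; extra CLI flags are passed through args):

    import pandas as pd
    from enzymetk.inpaint_ligandMPNN_step import LigandMPNN

    df = pd.DataFrame({'pdb': ['/data/structures/QHH_0.pdb']})
    step = LigandMPNN('pdb', '/opt/LigandMPNN/', '/data/outputs/', args=['--seed', '111'])
    df = step.execute(df)  # adds an 'inpainted_pdb' column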
enzymetk/main.py ADDED
@@ -0,0 +1,37 @@
+ ###############################################################################
+ #                                                                             #
+ #     This program is free software: you can redistribute it and/or modify    #
+ #     it under the terms of the GNU General Public License as published by    #
+ #     the Free Software Foundation, either version 3 of the License, or       #
+ #     (at your option) any later version.                                     #
+ #                                                                             #
+ #     This program is distributed in the hope that it will be useful,         #
+ #     but WITHOUT ANY WARRANTY; without even the implied warranty of          #
+ #     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the            #
+ #     GNU General Public License for more details.                            #
+ #                                                                             #
+ #     You should have received a copy of the GNU General Public License       #
+ #     along with this program. If not, see <http://www.gnu.org/licenses/>.    #
+ #                                                                             #
+ ###############################################################################
+
+ """
+ Author: Ariane Mora
+ Date: September 2024
+ """
+ from enzymetk import *
+ import argparse
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Run on a dataframe")
+     parser.add_argument('-out', '--out', required=True, help='Path to the output directory')
+     parser.add_argument('-df', '--df', type=str, required=True, help='Path to the dataframe of interest')
+     return parser.parse_args()
+
+
+ def main():
+     args = parse_args()
+     # run(args.out, args.df)  # entry point is still a stub
+
+
+ if __name__ == "__main__":
+     main()