kdock 2025.10.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kdock/__init__.py +1 -0
- kdock/_modidx.py +131 -0
- kdock/af3/__init__.py +0 -0
- kdock/af3/analyze.py +162 -0
- kdock/af3/docker.py +120 -0
- kdock/af3/json.py +282 -0
- kdock/af3/protein_pairs.py +95 -0
- kdock/core/__init__.py +0 -0
- kdock/core/data.py +64 -0
- kdock/core/ligand.py +294 -0
- kdock/core/plot.py +89 -0
- kdock/core/protein.py +293 -0
- kdock/core/utils.py +156 -0
- kdock/gnina/__init__.py +0 -0
- kdock/gnina/dock.py +114 -0
- kdock/gnina/rescore.py +204 -0
- kdock/px/__init__.py +0 -0
- kdock/px/core.py +130 -0
- kdock/px/dock.py +117 -0
- kdock-2025.10.31.dist-info/METADATA +81 -0
- kdock-2025.10.31.dist-info/RECORD +25 -0
- kdock-2025.10.31.dist-info/WHEEL +5 -0
- kdock-2025.10.31.dist-info/entry_points.txt +2 -0
- kdock-2025.10.31.dist-info/licenses/LICENSE +201 -0
- kdock-2025.10.31.dist-info/top_level.txt +1 -0
kdock/core/utils.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/core/01_utils.ipynb.
|
|
2
|
+
|
|
3
|
+
# %% auto 0
|
|
4
|
+
__all__ = ['rglob', 'copy_files', 'get_rec_lig', 'get_box', 'view_mol', 'view_complex']
|
|
5
|
+
|
|
6
|
+
# %% ../../nbs/core/01_utils.ipynb 3
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
import subprocess,shutil,zipfile
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
import py3Dmol
|
|
12
|
+
from rdkit import Chem
|
|
13
|
+
|
|
14
|
+
# %% ../../nbs/core/01_utils.ipynb 6
|
|
15
|
+
def rglob(path, pattern, max_depth):
    "Yield paths under `path` that match `pattern` and lie at most `max_depth` components below it"
    root = Path(path).resolve()
    for candidate in root.rglob(pattern):
        # Depth counts path components relative to the root, so a direct child has depth 1.
        depth = len(candidate.relative_to(root).parts)
        if depth > max_depth:
            continue
        yield candidate
|
|
21
|
+
|
|
22
|
+
# %% ../../nbs/core/01_utils.ipynb 8
|
|
23
|
+
def copy_files(file_list, dest_dir):
    """Copy a list of files to the destination directory, or zip them if dest_dir ends with .zip.

    `file_list` may be any iterable of paths, including a one-shot generator such as
    the one returned by `rglob`. Files are stored flat under their base name, so two
    inputs sharing a file name end up as one file (directory copy) or as duplicate
    archive members (zip).
    """
    dest_path = Path(dest_dir)
    # Materialise once: the input is iterated and then counted, which a generator
    # cannot support (it has no len() and is exhausted after the first pass).
    files = [Path(f) for f in file_list]

    if dest_path.suffix == ".zip":
        # Make sure the folder that will hold the archive exists.
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(dest_path, 'w') as zipf:
            for file_path in files:
                zipf.write(file_path, arcname=file_path.name)
        print(f'Zipped {len(files)} files to {dest_path}')
    else:
        dest_path.mkdir(parents=True, exist_ok=True)
        for file_path in files:
            shutil.copy2(file_path, dest_path / file_path.name)
        print(f'Copied {len(files)} files to {dest_path}')
|
|
39
|
+
|
|
40
|
+
# %% ../../nbs/core/01_utils.ipynb 11
|
|
41
|
+
def get_rec_lig(pdb_id: str, # pdb id for download
                lig_id: str, # ligand id shown on the protein page
                out_dir = '.', # directory path to save pdb files
                ):
    """Download pdb and extract receptor and ligand from a PDB ID.

    Writes `{pdb_id}.pdb` (downloaded only if missing), `{pdb_id}_receptor.pdb`,
    `{pdb_id}_lig.pdb` and `{pdb_id}_lig.sdf` into `out_dir` and returns
    `(receptor_pdb_path, ligand_sdf_path)` as strings.
    Raises `ValueError` when no ligand record is found or RDKit cannot parse it,
    and `subprocess.CalledProcessError` when the download fails.
    """
    out_dir = Path(out_dir).expanduser().resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    pdb_file = out_dir / f"{pdb_id}.pdb"
    rec_file = out_dir / f"{pdb_id}_receptor.pdb"
    lig_pdb_file = out_dir / f"{pdb_id}_lig.pdb"
    lig_sdf_file = out_dir / f"{pdb_id}_lig.sdf"

    # Download if not exists
    if not pdb_file.exists():
        url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
        print(f'Downloading pdb: {pdb_id}')
        try:
            subprocess.run(["wget", url, "-O", str(pdb_file)], check=True,
                           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        except (subprocess.CalledProcessError, OSError):
            # `wget -O` creates the target file even when the transfer fails; remove it
            # so the next call does not mistake an empty file for a finished download.
            pdb_file.unlink(missing_ok=True)
            raise
    print(f'{pdb_id}.pdb is detected!')

    # Classify coordinate records in one pass. The residue name lives in the
    # fixed-width columns 18-20 of ATOM/HETATM records, so compare that field rather
    # than searching the whole line (a substring test also hits atom names etc.).
    wanted = lig_id.strip()
    rec_lines, lig_lines = [], []
    with open(pdb_file) as infile:
        for line in infile:
            if not line.startswith(("ATOM", "HETATM")):
                continue
            if line[17:20].strip() == wanted:
                lig_lines.append(line)
            elif line.startswith("ATOM"):
                rec_lines.append(line)

    if not lig_lines:
        raise ValueError(f"Ligand '{lig_id}' not found in {pdb_file}")

    with open(rec_file, 'w') as out_rec:
        out_rec.writelines(rec_lines)
    with open(lig_pdb_file, 'w') as out_lig:
        out_lig.writelines(lig_lines)

    # Convert ligand PDB to SDF using RDKit
    mol = Chem.MolFromPDBFile(str(lig_pdb_file), removeHs=False)
    if mol is None:
        raise ValueError("Failed to parse ligand from PDB.")

    writer = Chem.SDWriter(str(lig_sdf_file))
    try:
        writer.write(mol)
    finally:
        # Close (and flush) the SDF even if writing the molecule raises.
        writer.close()

    return str(rec_file), str(lig_sdf_file)
|
|
83
|
+
|
|
84
|
+
# %% ../../nbs/core/01_utils.ipynb 14
|
|
85
|
+
def get_box(sdf_file, autobox_add=4.0,tolist=False):
    """Compute a docking box around the first molecule of `sdf_file`.

    Returns center_x/y/z and size_x/y/z rounded to 3 decimals, as a dict, or as a
    list in that order when `tolist` is True. Raises `ValueError` if the first SDF
    record cannot be read.
    """
    supplier = Chem.SDMolSupplier(str(sdf_file), removeHs=False)
    mol = supplier[0]
    if mol is None:
        raise ValueError(f"Failed to read molecule from {sdf_file}")

    conformer = mol.GetConformer()
    xyz = np.array([list(conformer.GetAtomPosition(idx)) for idx in range(mol.GetNumAtoms())])

    lower = xyz.min(axis=0)
    upper = xyz.max(axis=0)

    center = (lower + upper) / 2
    # autobox_add is added once to each axis extent, i.e. autobox_add/2 of padding per side.
    # NOTE(review): GNINA documents --autobox_add as padding on every side; confirm whether
    # 2*autobox_add was intended here.
    size = (upper - lower) + autobox_add

    keys = ("center_x", "center_y", "center_z", "size_x", "size_y", "size_z")
    rounded = [round(float(value), 3) for value in (*center, *size)]
    box_dict = dict(zip(keys, rounded))
    if tolist:
        return list(box_dict.values())
    return box_dict
|
|
109
|
+
|
|
110
|
+
# %% ../../nbs/core/01_utils.ipynb 18
|
|
111
|
+
def view_mol(file, #sdf or pdb file
             ):
    "Visualize pdb or sdf file as sticks in a py3Dmol viewer"
    # Read via a context manager so the file handle is closed right away
    # instead of being left to the garbage collector.
    with open(file) as fh:
        data = fh.read()

    v = py3Dmol.view()
    v.addModel(data)
    v.setStyle({'stick':{}})
    v.zoomTo()
    v.show()
|
|
120
|
+
|
|
121
|
+
# %% ../../nbs/core/01_utils.ipynb 20
|
|
122
|
+
def view_complex(receptor, # protein file
                 ligand, # ligand (green), or docked ligand
                 ori_ligand=None, # original ligand (yellow)
                 box=None # optional box: [x, y, z, sizeX, sizeY, sizeZ], or the dict returned by get_box
                 ):
    "Visualize the receptor, ligand, optional original ligand, and optional box via py3Dmol."

    def _read(path):
        # Context manager so each structure file is closed as soon as it is read.
        with open(path) as fh:
            return fh.read()

    v = py3Dmol.view()

    # Load receptor (model 0); this style call has no selection, so it applies to everything loaded so far
    v.addModel(_read(receptor))
    v.setStyle({'cartoon': {}, 'stick': {'radius': 0.15}})

    # Load docked ligand (model 1)
    v.addModel(_read(ligand))
    v.setStyle({'model': 1}, {'stick': {'colorscheme': 'greenCarbon'}})

    # Load original ligand if provided (model 2)
    if ori_ligand is not None:
        v.addModel(_read(ori_ligand))
        v.setStyle({'model': 2}, {'stick': {'colorscheme': 'yellowCarbon'}})

    # get_box returns a dict unless tolist=True. Unpacking that dict directly would
    # yield its six string keys, so convert it to the positional form first.
    if isinstance(box, dict):
        box = [box[key] for key in ('center_x', 'center_y', 'center_z',
                                    'size_x', 'size_y', 'size_z')]

    # Add bounding box if specified (anything that is not 6 values is ignored, as before)
    if box is not None and len(box) == 6:
        x, y, z, sizeX, sizeY, sizeZ = box
        v.addBox({
            'center': {'x': x, 'y': y, 'z': z},
            'dimensions': {'w': sizeX, 'h': sizeY, 'd': sizeZ},
            'color': 'red',
            'opacity': 1,
            'wireframe': True
        })

    v.zoomTo({'model': 1})
    v.show()
|
kdock/gnina/__init__.py
ADDED
|
File without changes
|
kdock/gnina/dock.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/gnina/04_gnina_docking.ipynb.
|
|
2
|
+
|
|
3
|
+
# %% auto 0
|
|
4
|
+
__all__ = ['setup_gnina_local', 'setup_gnina_docker', 'extract_gnina_dock', 'gnina_dock']
|
|
5
|
+
|
|
6
|
+
# %% ../../nbs/gnina/04_gnina_docking.ipynb 3
|
|
7
|
+
# basics
|
|
8
|
+
import re,subprocess, py3Dmol
|
|
9
|
+
from tqdm import tqdm
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
import pandas as pd,numpy as np
|
|
12
|
+
|
|
13
|
+
# rdkit
|
|
14
|
+
from rdkit import Chem
|
|
15
|
+
from rdkit.Chem import AllChem
|
|
16
|
+
|
|
17
|
+
# %% ../../nbs/gnina/04_gnina_docking.ipynb 8
|
|
18
|
+
def setup_gnina_local(version='v1.3'):
    """Download and install gnina in the current directory.

    Installs openbabel through `sudo apt-get`, downloads the `gnina` release binary
    for `version` from GitHub and marks it executable. Any failing command raises
    `subprocess.CalledProcessError`.
    """
    # Check CUDA availability
    # try:
    #     subprocess.run(["nvidia-smi"], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    # except subprocess.CalledProcessError:
    #     raise EnvironmentError("CUDA not detected. Please make sure a CUDA-capable GPU is available and drivers are installed.")
    # except FileNotFoundError:
    #     raise EnvironmentError("nvidia-smi not found. Make sure NVIDIA drivers and CUDA are installed.")

    subprocess.run(["sudo", "apt-get", "update", "-yq"], check=True,stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    subprocess.run(["sudo", "apt-get", "install", "-yq", "openbabel"], check=True)

    gnina_url = f"https://github.com/gnina/gnina/releases/download/{version}/gnina"
    print(f'Downloading {version} gnina')

    # Download to a scratch name and move it into place only on success. Without an
    # explicit -O, wget saves a repeated download as `gnina.1` and the old binary stays
    # in use; downloading straight onto `gnina` would truncate a working binary if the
    # transfer failed.
    scratch = Path('gnina.download')
    try:
        subprocess.run(["wget", gnina_url, "-O", str(scratch)], check=True,
                       stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except (subprocess.CalledProcessError, OSError):
        scratch.unlink(missing_ok=True)
        raise
    scratch.replace('gnina')
    subprocess.run(["chmod", "+x", 'gnina'], check=True)

    print('Finish setup!')
|
|
37
|
+
|
|
38
|
+
# %% ../../nbs/gnina/04_gnina_docking.ipynb 13
|
|
39
|
+
def setup_gnina_docker():
    "Pull the gnina/gnina image with `docker pull`"
    pull_cmd = ["docker", "pull", "gnina/gnina"]
    print("Pulling GNINA Docker image: gnina/gnina")
    # docker's own output is discarded; a non-zero exit raises CalledProcessError.
    subprocess.run(
        pull_cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=True,
    )
    print("GNINA Docker image is ready.")
|
|
44
|
+
|
|
45
|
+
# %% ../../nbs/gnina/04_gnina_docking.ipynb 18
|
|
46
|
+
def extract_gnina_dock(gnina_output):
    """Extract the scores of docking mode 1 from gnina's stdout.

    Returns `(affinity, cnn_pose_score, cnn_affinity)` as floats, or `None` when no
    mode-1 row is found. Both table layouts are handled: three score columns
    (affinity, CNN pose score, CNN affinity) and four score columns, where an
    intramolecular-energy column sits between affinity and the CNN scores.
    """
    num = r'(-?\d+\.\d+)'
    # Anchor to a whole line that starts with mode number 1 so the numbers cannot be
    # picked up from elsewhere in the log; the 4th score column is optional.
    row = re.search(
        rf'^[ \t]*1[ \t]+{num}[ \t]+{num}[ \t]+{num}(?:[ \t]+{num})?[ \t]*$',
        gnina_output,
        re.MULTILINE,
    )
    if row is None:
        return None

    cols = [float(g) for g in row.groups() if g is not None]
    if len(cols) == 4:
        # affinity | intramol | CNN pose score | CNN affinity: drop the intramol column
        # so the CNN values are not shifted into the wrong slots.
        affinity, _intramol, cnn_pose_score, cnn_affinity = cols
    else:
        affinity, cnn_pose_score, cnn_affinity = cols

    return affinity, cnn_pose_score, cnn_affinity
|
|
58
|
+
|
|
59
|
+
# %% ../../nbs/gnina/04_gnina_docking.ipynb 19
|
|
60
|
+
def gnina_dock(receptor, # receptor file
               ligand, # ligand file
               autobox_ligand, # ligand file isolated from the complex
               output = 'docked.sdf', # output file (sdf or sdf.gz) to be saved
               seed=0, # set seeds
               exhaustiveness=None, # number of MC chains, default is 8 if None, the higher the better (16,32); for whole protein, use 64
               ):
    "Dock one ligand with the local `./gnina` binary; return (affinity, cnn_pose_score, cnn_affinity), or None if no score row is parsed"

    # str() so pathlib.Path arguments are accepted as well as plain strings.
    command = ['./gnina',
               '-r', str(receptor),
               '-l', str(ligand),
               '--autobox_ligand', str(autobox_ligand),
               '-o', str(output),
               '--seed', str(seed)]

    if exhaustiveness is not None:
        command.extend(['--exhaustiveness', str(exhaustiveness)])

    output_txt = subprocess.run(command, capture_output=True, text=True).stdout

    print(f'save the docked file as {output}')

    values = extract_gnina_dock(output_txt)

    print(f'affinity, cnn_pose_score, and cnn_affinity are: {values}')

    return values

# %% ../../nbs/gnina/04_gnina_docking.ipynb 21
# This batch function used to be a second `def gnina_dock`, which replaced the
# single-ligand function above and then called itself with file names, so neither call
# style could run. It now has its own name.
# NOTE(review): add 'gnina_dock_df' to __all__ (via the source notebook) if it should
# be exported by `import *`.
def gnina_dock_df(df,
                  ID_col = 'ID',
                  smi_col = 'SMILES',
                  output_dir = 'gnina_docked',
                  receptor = 'rec.pdb', # receptor file used for every row
                  autobox_ligand = 'lig.pdb', # reference ligand defining the box
                  ligand_dir = 'ligand', # where generated ligand conformers are written
                  ):
    """Dock every SMILES in `df` with `gnina_dock` and return a copy of `df` with score columns.

    For each row a conformer is written to `{ligand_dir}/{ID}.sdf` and the docked pose
    to `{output_dir}/docked_{ID}.sdf`. The returned copy gains the columns `Affinity`,
    `CNN_Pose_Score` and `CNN_Affinity`; rows whose gnina output cannot be parsed get NaN.
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True,exist_ok=True)
    lig_dir = Path(ligand_dir)
    lig_dir.mkdir(parents=True,exist_ok=True)

    scores = []
    for i, r in tqdm(df.iterrows(),total=len(df),desc='Docking'):
        ligand_sdf = lig_dir / f'{r[ID_col]}.sdf'
        # NOTE(review): `rdkit_conformer` is neither defined nor imported in this module;
        # presumably it lives in kdock.core.ligand. Confirm and import it.
        rdkit_conformer(SMILES=r[smi_col], output = str(ligand_sdf), visualize=False)
        values = gnina_dock(receptor, ligand_sdf, autobox_ligand, out_dir / f'docked_{r[ID_col]}.sdf')
        # gnina_dock returns None when no score row is parsed; keep the row, mark it missing.
        scores.append(values if values is not None else (np.nan, np.nan, np.nan))

    df = df.copy()
    df['Affinity'] = [s[0] for s in scores]
    df['CNN_Pose_Score'] = [s[1] for s in scores]
    df['CNN_Affinity'] = [s[2] for s in scores]

    return df
|
kdock/gnina/rescore.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/gnina/05_gnina_AF3_rescore.ipynb.
|
|
2
|
+
|
|
3
|
+
# %% auto 0
|
|
4
|
+
__all__ = ['ChainSelect', 'rename_residues', 'split_cif', 'pdb2sdf', 'prepare_rec_lig', 'gnina_rescore_local',
|
|
5
|
+
'gnina_rescore_docker', 'extract_gnina_rescore', 'get_gnina_rescore', 'get_gnina_rescore_folder']
|
|
6
|
+
|
|
7
|
+
# %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 3
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import re, os, subprocess, py3Dmol
|
|
10
|
+
from Bio.PDB import MMCIFParser, PDBIO, Select
|
|
11
|
+
from rdkit import Chem
|
|
12
|
+
from rdkit.Chem import AllChem
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from fastcore.all import L
|
|
15
|
+
from tqdm.contrib.concurrent import process_map
|
|
16
|
+
from functools import partial
|
|
17
|
+
|
|
18
|
+
# %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 5
|
|
19
|
+
class ChainSelect(Select):
    "Bio.PDB `Select` filter that keeps only the chains whose id is in `chain_ids`"

    def __init__(self, chain_ids):
        # Any container supporting `in` works; with a plain string this is a substring test.
        self.chain_ids = chain_ids

    def accept_chain(self, chain):
        wanted = self.chain_ids
        return chain.get_id() in wanted
|
|
25
|
+
|
|
26
|
+
# %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 6
|
|
27
|
+
def rename_residues(structure, chain_id, new_resname='LIG'):
    "Set `resname` to `new_resname` on every residue of chain `chain_id`, in every model (the original LIG_L name is too long and leads to an error in RDKit)"
    matching_chains = (
        chain
        for model in structure
        for chain in model
        if chain.id == chain_id
    )
    for chain in matching_chains:
        for res in chain:
            res.resname = new_resname
|
|
34
|
+
|
|
35
|
+
# %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 7
|
|
36
|
+
def split_cif(cif_path, rec_chain_id,lig_chain_id, rec_pdb_path, lig_pdb_path):
    "Parse a CIF, rename the ligand chain's residues to LIG, and save the receptor and ligand chains as two PDB files"
    structure = MMCIFParser(QUIET=True).get_structure('complex', cif_path)
    rename_residues(structure, chain_id=lig_chain_id, new_resname='LIG')

    pdb_writer = PDBIO()
    pdb_writer.set_structure(structure)
    # Same structure, written twice with a different chain filter: receptor, then ligand.
    targets = ((rec_pdb_path, rec_chain_id), (lig_pdb_path, lig_chain_id))
    for out_path, chain_ids in targets:
        pdb_writer.save(str(out_path), ChainSelect(chain_ids))
|
|
45
|
+
|
|
46
|
+
# %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 8
|
|
47
|
+
def pdb2sdf(pdb_path, sdf_path):
    """Convert ligand pdb to sdf file.

    Returns `None` on success. If RDKit cannot parse the PDB, prints a message and
    returns `pdb_path` so callers can collect the failures.
    """
    # Coerce to str: callers pass pathlib.Path objects (see get_gnina_rescore), and the
    # other RDKit calls in this package are always given plain string paths.
    mol = Chem.MolFromPDBFile(str(pdb_path), sanitize=True, removeHs=False)
    if not mol:
        print('Conversion failed for:', pdb_path)
        return pdb_path

    writer = Chem.SDWriter(str(sdf_path))
    try:
        writer.write(mol)
    finally:
        # Always close so the SDF is flushed and the handle released, even on error.
        writer.close()
    return None
|
|
58
|
+
|
|
59
|
+
# %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 9
|
|
60
|
+
def prepare_rec_lig(cif_path, rec_chain_id, lig_chain_id, rec_pdb_path, lig_pdb_path):
    """Split AF3 cif to protein.pdb (chainA) and ligand.sdf (chainL).

    The receptor chain is written to `rec_pdb_path` as PDB. The ligand chain is written
    to a scratch PDB and converted to SDF at `lig_pdb_path` (despite its name, this
    argument is the SDF output path). Returns `None` on success, or the scratch PDB file
    name when the PDB -> SDF conversion failed.
    """
    import tempfile  # local import: not part of this module's import block

    tmp_name = f'{Path(cif_path).stem}_lig.pdb'
    # Keep the intermediate ligand PDB in a private temporary directory rather than the
    # current working directory: concurrent runs on CIFs sharing a stem would otherwise
    # overwrite or delete each other's file, and the CWD may not be writable.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = str(Path(tmp_dir) / tmp_name)
        split_cif(cif_path, rec_chain_id,lig_chain_id, rec_pdb_path, tmp_path)
        failed = pdb2sdf(tmp_path, lig_pdb_path)

    # Report the same marker value as before (the scratch file name), not the
    # already-removed temporary path.
    return None if failed is None else tmp_name
|
|
72
|
+
|
|
73
|
+
# %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 15
|
|
74
|
+
def gnina_rescore_local(protein_pdb, # receptor file
                        ligand_sdf, # ligand file
                        CNN_affinity=True,
                        vinardo=False, # if True, use vinardo instead of vina
                        ):
    "Run the local `./gnina` binary with --minimize on a receptor/ligand pair and return its stdout text"
    # Optional scoring switches, appended after the fixed arguments
    scoring_flags = []
    if not CNN_affinity:
        scoring_flags += ['--cnn_scoring', 'none']
    if vinardo:
        scoring_flags += ['--scoring', 'vinardo']

    argv = ['./gnina', '-r', protein_pdb, '-l', ligand_sdf, '--minimize', *scoring_flags]
    # The exit status is not checked; callers only parse whatever was printed to stdout.
    return subprocess.run(argv, capture_output=True, text=True).stdout
|
|
93
|
+
|
|
94
|
+
# %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 17
|
|
95
|
+
def gnina_rescore_docker(protein_pdb,
                         ligand_sdf,
                         CNN_affinity=True,
                         vinardo=False):
    "Run gnina --minimize inside the gnina/gnina Docker image and return its stdout; receptor and ligand may live in different folders"
    receptor = Path(protein_pdb).resolve()
    ligand = Path(ligand_sdf).resolve()

    # Each file's parent folder is bind-mounted at its own fixed path in the container
    rec_mount, lig_mount = '/recdata', '/ligdata'
    docker_part = [
        'docker', 'run', '--rm',
        '-v', f'{receptor.parent}:{rec_mount}',
        '-v', f'{ligand.parent}:{lig_mount}',
        'gnina/gnina',
    ]
    gnina_part = [
        'gnina',
        '-r', f'{rec_mount}/{receptor.name}',
        '-l', f'{lig_mount}/{ligand.name}',
        '--minimize',
    ]
    if not CNN_affinity:
        gnina_part += ['--cnn_scoring', 'none']
    if vinardo:
        gnina_part += ['--scoring', 'vinardo']

    # The exit status is not checked; callers only parse whatever was printed to stdout.
    completed = subprocess.run(docker_part + gnina_part, capture_output=True, text=True)
    return completed.stdout
|
|
126
|
+
|
|
127
|
+
# %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 19
|
|
128
|
+
def extract_gnina_rescore(txt):
    """Extract GNINA output metrics into a dictionary (partial match allowed).

    Looks for `Affinity:` (first number -> 'binding_energy', second number ->
    'uncertainty'), `RMSD:`, `CNNscore:`, `CNNaffinity:` and `CNNvariance:` in `txt`.
    Only the metrics actually found are returned, so the result may be empty.
    """
    # A real float pattern: optional sign, digits with optional fraction (or a bare
    # fraction), optional exponent. The earlier character class `[-.\d]+` stopped at
    # "e" in exponent notation (5.0e-05 -> 5.0) and could capture a lone "-" or ".",
    # which made float() raise.
    num = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'

    patterns = {
        'binding_energy': rf'Affinity:\s+({num})',
        'uncertainty': rf'Affinity:\s+{num}\s+({num})',
        'RMSD': rf'RMSD:\s+({num})',
        'CNNscore': rf'CNNscore:\s+({num})',
        'CNNaffinity': rf'CNNaffinity:\s+({num})',
        'CNNvariance': rf'CNNvariance:\s+({num})',
    }

    result = {}
    for key, pat in patterns.items():
        match = re.search(pat, txt)
        if match:
            result[key] = float(match.group(1))

    return result
|
|
147
|
+
|
|
148
|
+
# %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 24
|
|
149
|
+
def get_gnina_rescore(cif_path,
                      rec_chain_id='A',
                      lig_chain_id='L',
                      CNN_affinity=True,
                      vinardo=False,
                      is_local=True):
    """Split the CIF into receptor and ligand folders, then extract the GNINA rescored affinity score.

    For `<folder>/<stem>.cif` the receptor goes to `<folder>_receptor/<stem>.pdb` and the
    ligand to `<folder>_ligand/<stem>.sdf`. gnina runs locally when `is_local` is True,
    otherwise through Docker. Returns the metric dict from `extract_gnina_rescore`; it is
    empty when the ligand could not be converted or gnina printed no metrics.
    """
    # Make the path absolute first. A bare relative file name has parent '.', which
    # would otherwise produce the hidden folders '._receptor' and '._ligand'.
    cif_path = Path(cif_path).expanduser().absolute()
    parent,stem = cif_path.parent,cif_path.stem

    rec_dir,lig_dir = Path(str(parent) + '_receptor'),Path(str(parent) + '_ligand')
    rec_path,lig_path = rec_dir/f'{stem}.pdb',lig_dir/f'{stem}.sdf'

    rec_dir.mkdir(exist_ok=True)
    lig_dir.mkdir(exist_ok=True)

    failed = prepare_rec_lig(cif_path,rec_chain_id, lig_chain_id,rec_path,lig_path)
    if failed:
        # No SDF was written for this model. Running gnina anyway would either fail or
        # silently score a stale ligand file left by an earlier run.
        return {}

    rescore = gnina_rescore_local if is_local else gnina_rescore_docker
    gnina_output = rescore(rec_path,lig_path,CNN_affinity,vinardo)
    return extract_gnina_rescore(gnina_output)
|
|
172
|
+
|
|
173
|
+
# %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 29
|
|
174
|
+
def get_gnina_rescore_folder(cif_folder,
                             rec_chain_id='A',
                             lig_chain_id='L',
                             CNN_affinity=True,
                             vinardo=False,
                             is_local=True,
                             max_workers=4, # number of worker processes
                             ):
    """Parallel processing to get gnina rescore given folder path.

    Runs `get_gnina_rescore` on every `*.cif` directly inside `cif_folder` and returns
    one DataFrame row per file, with the file stem in column `ID`. The columns
    binding_energy / uncertainty / RMSD are prefixed with `vinardo_` or `vina_`
    according to `vinardo`.
    """
    # Sorted so the row order does not depend on the file system's listing order.
    cifs = L(sorted(Path(cif_folder).expanduser().glob("*.cif"))) # just take cif file

    func = partial(get_gnina_rescore,
                   rec_chain_id=rec_chain_id,
                   lig_chain_id=lig_chain_id,
                   CNN_affinity=CNN_affinity,
                   vinardo=vinardo,
                   is_local=is_local)
    results = process_map(func, cifs, max_workers=max_workers)

    # use path.stem as df index
    results_dict = dict(zip([p.stem for p in cifs], results))
    result_df = pd.DataFrame(results_dict).T.reset_index(names='ID')

    prefix = "vinardo_" if vinardo else "vina_"
    result_df = result_df.rename(columns={
        "binding_energy": f"{prefix}binding_energy",
        "uncertainty": f"{prefix}uncertainty",
        "RMSD": f"{prefix}RMSD"
    })

    return result_df
|
|
203
|
+
|
|
204
|
+
|
kdock/px/__init__.py
ADDED
|
File without changes
|
kdock/px/core.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/protenix/07_protenix.ipynb.
|
|
2
|
+
|
|
3
|
+
# %% auto 0
|
|
4
|
+
__all__ = ['get_single_job', 'get_single_protein_ligand_json', 'get_protein_ligand_df_json', 'get_virtual_screening_json']
|
|
5
|
+
|
|
6
|
+
# %% ../../nbs/protenix/07_protenix.ipynb 6
|
|
7
|
+
import json
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
# %% ../../nbs/protenix/07_protenix.ipynb 9
|
|
11
|
+
def get_single_job(job_name, protein_seq, msa_dir, SMILES=None,CCD=None):
    "Build one protenix job dict: a single protein chain (with precomputed MSA dir) plus a single ligand given as SMILES or as a CCD code"
    has_smiles, has_ccd = bool(SMILES), bool(CCD)
    if has_smiles and has_ccd:
        raise ValueError("Please provide only one of SMILES or CCD, not both.")
    if not (has_smiles or has_ccd):
        raise ValueError("You must provide either SMILES or CCD.")

    # SMILES strings are passed through unchanged; CCD codes get a "CCD_" prefix
    if has_smiles:
        ligand_value = SMILES
    else:
        ligand_value = f"CCD_{CCD}"

    protein_entry = {
        "proteinChain": {
            "count": 1,
            "sequence": protein_seq,
            "msa": {
                "precomputed_msa_dir": msa_dir,
                "pairing_db": "uniref100",
            },
        }
    }
    ligand_entry = {
        "ligand": {
            "count": 1,
            "ligand": ligand_value,
        }
    }
    return {"name": job_name, "sequences": [protein_entry, ligand_entry]}
|
|
42
|
+
|
|
43
|
+
# %% ../../nbs/protenix/07_protenix.ipynb 11
|
|
44
|
+
def get_single_protein_ligand_json(job_name,
                                   protein_seq,
                                   msa_dir,
                                   SMILES=None,
                                   CCD=None,
                                   json_path=None):
    "Return a one-element job list for a protein-ligand pair; also write it as JSON when `json_path` is given"
    jobs = [get_single_job(job_name, protein_seq, msa_dir, SMILES=SMILES, CCD=CCD)]

    if not json_path:
        return jobs

    # Saving requested: create missing parent folders, then dump with 4-space indent
    target = Path(json_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w") as handle:
        json.dump(jobs, handle, indent=4)
    print(f"JSON saved to {target}")
    return jobs
|
|
61
|
+
|
|
62
|
+
# %% ../../nbs/protenix/07_protenix.ipynb 16
|
|
63
|
+
def get_protein_ligand_df_json(df,
                               id_col,
                               seq_col,
                               msa_col,
                               smi_col=None,
                               ccd_col=None,
                               save_json=None):
    """Get json file of protein and ligand in a dataframe.

    Builds one job per row from the id, sequence and MSA-dir columns plus exactly one
    ligand column (`smi_col` or `ccd_col`). Returns the list of job dicts (empty for an
    empty dataframe) and also writes it to `save_json` when given.
    Raises `ValueError` if both or neither ligand column is provided.
    """
    if smi_col and ccd_col:
        raise ValueError("Provide only one of smi_col or ccd_col, not both.")
    if not smi_col and not ccd_col:
        raise ValueError("You must provide either smi_col or ccd_col.")

    # Same truthiness test as the validation above, so an empty-string smi_col combined
    # with a real ccd_col selects the CCD column.
    use_smiles = bool(smi_col)

    def build_job(row):
        job_name = row[id_col]
        protein_seq = row[seq_col]
        msa_dir = row[msa_col]
        SMILES = row[smi_col] if use_smiles else None
        CCD = None if use_smiles else row[ccd_col]
        return get_single_job(job_name, protein_seq, msa_dir, SMILES=SMILES, CCD=CCD)

    # Iterate rows explicitly: DataFrame.apply(axis=1) returns a DataFrame for an empty
    # frame, which has no .tolist().
    all_jobs = [build_job(row) for _, row in df.iterrows()]

    if save_json:
        save_path = Path(save_json)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        with save_path.open("w") as f:
            json.dump(all_jobs, f, indent=4)
        print(f"JSON saved to {save_path}")

    return all_jobs
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# %% ../../nbs/protenix/07_protenix.ipynb 19
|
|
100
|
+
def get_virtual_screening_json(df,
                               protein_seq,
                               msa_dir,
                               id_col,
                               smi_col=None,
                               ccd_col=None,
                               save_json=None):
    """Get json file of single protein against multiple SMILES in a dataframe.

    Every row of `df` becomes one job that pairs the fixed `protein_seq` / `msa_dir`
    with the row's ligand, taken from exactly one of `smi_col` or `ccd_col`. Returns the
    list of job dicts (empty for an empty dataframe) and also writes it to `save_json`
    when given. Raises `ValueError` if both or neither ligand column is provided.
    """
    if smi_col and ccd_col:
        raise ValueError("Provide only one of smi_col or ccd_col, not both.")
    if not smi_col and not ccd_col:
        raise ValueError("You must provide either smi_col or ccd_col.")

    # Same truthiness test as the validation above, so an empty-string smi_col combined
    # with a real ccd_col selects the CCD column.
    use_smiles = bool(smi_col)

    def build_job(row):
        job_name = row[id_col]
        SMILES = row[smi_col] if use_smiles else None
        CCD = None if use_smiles else row[ccd_col]
        return get_single_job(job_name, protein_seq, msa_dir, SMILES=SMILES, CCD=CCD)

    # Iterate rows explicitly: DataFrame.apply(axis=1) returns a DataFrame for an empty
    # frame, which has no .tolist().
    all_jobs = [build_job(row) for _, row in df.iterrows()]

    if save_json:
        save_path = Path(save_json)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        with save_path.open("w") as f:
            json.dump(all_jobs, f, indent=4)
        print(f"JSON saved to {save_path}")

    return all_jobs
|