kdock 2025.10.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kdock/core/utils.py ADDED
@@ -0,0 +1,156 @@
1
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/core/01_utils.ipynb.
2
+
3
+ # %% auto 0
4
+ __all__ = ['rglob', 'copy_files', 'get_rec_lig', 'get_box', 'view_mol', 'view_complex']
5
+
6
+ # %% ../../nbs/core/01_utils.ipynb 3
7
+ from pathlib import Path
8
+ import subprocess,shutil,zipfile
9
+ import numpy as np
10
+
11
+ import py3Dmol
12
+ from rdkit import Chem
13
+
14
+ # %% ../../nbs/core/01_utils.ipynb 6
15
def rglob(path, pattern, max_depth):
    """Lazily yield paths under `path` that match `pattern`, limited to `max_depth` levels.

    Depth is the number of path components relative to the resolved `path`,
    so an entry sitting directly inside `path` has depth 1.
    """
    root = Path(path).resolve()
    shallow_enough = (
        hit for hit in root.rglob(pattern)
        if not len(hit.relative_to(root).parts) > max_depth
    )
    yield from shallow_enough
21
+
22
+ # %% ../../nbs/core/01_utils.ipynb 8
23
def copy_files(file_list, dest_dir):
    """Copy files into `dest_dir`, or pack them into a zip archive if `dest_dir` ends with `.zip`.

    `file_list` may be any iterable of paths, including a one-shot generator
    such as the one returned by `rglob`. Files are stored flat under their base
    name, so inputs that share a file name end up under the same name.
    """
    dest_path = Path(dest_dir)

    # Materialise once: a generator supports neither `len()` nor a second pass,
    # and both are needed below (the loop and the summary message).
    files = [Path(f) for f in file_list]

    if dest_path.suffix == ".zip":
        # ZipFile does not create missing parent folders for the archive.
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(dest_path, 'w') as zipf:
            for file_path in files:
                zipf.write(file_path, arcname=file_path.name)
        print(f'Zipped {len(files)} files to {dest_path}')
    else:
        dest_path.mkdir(parents=True, exist_ok=True)
        for file_path in files:
            shutil.copy2(file_path, dest_path / file_path.name)
        print(f'Copied {len(files)} files to {dest_path}')
39
+
40
+ # %% ../../nbs/core/01_utils.ipynb 11
41
def get_rec_lig(pdb_id: str, # pdb id for download
                lig_id: str, # ligand id shown on the protein page
                out_dir = '.', # directory path to save pdb files
                ):
    """Download a PDB entry and split it into a receptor PDB and a ligand SDF.

    Writes `{pdb_id}.pdb`, `{pdb_id}_receptor.pdb`, `{pdb_id}_lig.pdb` and
    `{pdb_id}_lig.sdf` into `out_dir` and returns
    `(receptor_pdb_path, ligand_sdf_path)` as strings.

    Every record whose residue name equals `lig_id` goes to the ligand file, so
    all copies of the ligand present in the entry are kept.

    Raises `subprocess.CalledProcessError` (or `OSError` if `wget` is missing)
    when the download fails, and `ValueError` when no ligand atoms are found or
    RDKit cannot parse them.
    """
    out_dir = Path(out_dir).expanduser().resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    pdb_file = out_dir / f"{pdb_id}.pdb"
    rec_file = out_dir / f"{pdb_id}_receptor.pdb"
    lig_pdb_file = out_dir / f"{pdb_id}_lig.pdb"
    lig_sdf_file = out_dir / f"{pdb_id}_lig.sdf"

    # Download if missing; an empty file is the leftover of an earlier failed download.
    if not pdb_file.exists() or pdb_file.stat().st_size == 0:
        url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
        print(f'Downloading pdb: {pdb_id}')
        try:
            subprocess.run(["wget", url, "-O", str(pdb_file)], check=True,
                           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        except (subprocess.CalledProcessError, OSError):
            # `wget -O` creates the target before transferring anything; remove
            # it so the next call retries instead of reusing a broken file.
            if pdb_file.exists():
                pdb_file.unlink()
            raise
    print(f'{pdb_id}.pdb is detected!')

    # Split in a single pass. The residue name lives in columns 18-20 of
    # ATOM/HETATM records; comparing that field (rather than searching the whole
    # line) keeps ids such as "CA" or "NA" from matching atom names.
    lig_name = lig_id.strip()
    n_lig_atoms = 0
    with open(pdb_file) as infile, \
         open(rec_file, 'w') as out_rec, \
         open(lig_pdb_file, 'w') as out_lig:
        for line in infile:
            if not line.startswith(("ATOM", "HETATM")):
                continue
            if line[17:20].strip() == lig_name:
                out_lig.write(line)
                n_lig_atoms += 1
            elif line.startswith("ATOM"):
                # receptor = protein ATOM records only (waters/ions/other HETATM dropped)
                out_rec.write(line)

    if n_lig_atoms == 0:
        raise ValueError(f"No atoms with residue name {lig_id!r} found in {pdb_file.name}.")

    # Convert ligand PDB to SDF using RDKit
    mol = Chem.MolFromPDBFile(str(lig_pdb_file), removeHs=False)
    if mol is None:
        raise ValueError("Failed to parse ligand from PDB.")

    writer = Chem.SDWriter(str(lig_sdf_file))
    try:
        writer.write(mol)
    finally:
        writer.close()  # flush and release the file even if writing fails

    return str(rec_file), str(lig_sdf_file)
83
+
84
+ # %% ../../nbs/core/01_utils.ipynb 14
85
def get_box(sdf_file, autobox_add=4.0,tolist=False):
    """Compute a docking box around the first molecule stored in `sdf_file`.

    The box is centred on the midpoint of the ligand's axis-aligned bounding
    box; each side length is the ligand extent plus `autobox_add`. Values are
    rounded to 3 decimals. Returns a dict keyed `center_x/y/z`, `size_x/y/z`,
    or the same six numbers as a list when `tolist` is true.

    Raises `ValueError` if RDKit cannot read the first record.
    """
    supplier = Chem.SDMolSupplier(str(sdf_file), removeHs=False)
    mol = supplier[0]
    if mol is None:
        raise ValueError(f"Failed to read molecule from {sdf_file}")

    conformer = mol.GetConformer()
    xyz = np.array([list(conformer.GetAtomPosition(idx)) for idx in range(mol.GetNumAtoms())])

    lower, upper = xyz.min(axis=0), xyz.max(axis=0)
    center = (lower + upper) / 2
    # NOTE(review): the padding is added once to the total extent (i.e.
    # autobox_add / 2 per side) - confirm this matches the GNINA semantics intended.
    size = (upper - lower) + autobox_add

    labels = ("center_x", "center_y", "center_z", "size_x", "size_y", "size_z")
    numbers = [round(float(v), 3) for v in (*center, *size)]
    box_dict = dict(zip(labels, numbers))

    if tolist:
        return list(box_dict.values())
    return box_dict
109
+
110
+ # %% ../../nbs/core/01_utils.ipynb 18
111
def view_mol(file, #sdf or pdb file
            ):
    "Visualize pdb or sdf file as sticks in a py3Dmol viewer"
    # Read through a context manager so the file handle is closed promptly.
    with open(file) as fh:
        model_text = fh.read()

    v = py3Dmol.view()
    v.addModel(model_text)
    v.setStyle({'stick':{}})
    v.zoomTo()
    v.show()
120
+
121
+ # %% ../../nbs/core/01_utils.ipynb 20
122
def view_complex(receptor, # protein file
                 ligand, # ligand (green), or docked ligand
                 ori_ligand=None, # original ligand (yellow)
                 box=None # optional box: [x, y, z, sizeX, sizeY, sizeZ], or the dict returned by `get_box`
                ):
    "Visualize the receptor, ligand, optional original ligand, and optional box via py3Dmol."

    def _read(path):
        # context manager => no file handles left open
        with open(path) as fh:
            return fh.read()

    v = py3Dmol.view()

    # Load receptor (model 0)
    v.addModel(_read(receptor))
    v.setStyle({'cartoon': {}, 'stick': {'radius': 0.15}})

    # Load docked ligand (model 1)
    v.addModel(_read(ligand))
    v.setStyle({'model': 1}, {'stick': {'colorscheme': 'greenCarbon'}})

    # Load original ligand if provided (model 2)
    if ori_ligand is not None:
        v.addModel(_read(ori_ligand))
        v.setStyle({'model': 2}, {'stick': {'colorscheme': 'yellowCarbon'}})

    # Add bounding box if specified
    if box is not None:
        box_keys = ('center_x', 'center_y', 'center_z', 'size_x', 'size_y', 'size_z')
        # Accept the dict form produced by `get_box`; unpacking a dict directly
        # would yield its key names rather than the numbers.
        if isinstance(box, dict) and all(k in box for k in box_keys):
            box = [box[k] for k in box_keys]
        if len(box) == 6:
            x, y, z, sizeX, sizeY, sizeZ = box
            v.addBox({
                'center': {'x': x, 'y': y, 'z': z},
                'dimensions': {'w': sizeX, 'h': sizeY, 'd': sizeZ},
                'color': 'red',
                'opacity': 1,
                'wireframe': True
            })

    # Focus the camera on the (docked) ligand
    v.zoomTo({'model': 1})
    v.show()
File without changes
kdock/gnina/dock.py ADDED
@@ -0,0 +1,114 @@
1
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/gnina/04_gnina_docking.ipynb.
2
+
3
+ # %% auto 0
4
+ __all__ = ['setup_gnina_local', 'setup_gnina_docker', 'extract_gnina_dock', 'gnina_dock']
5
+
6
+ # %% ../../nbs/gnina/04_gnina_docking.ipynb 3
7
+ # basics
8
+ import re,subprocess, py3Dmol
9
+ from tqdm import tqdm
10
+ from pathlib import Path
11
+ import pandas as pd,numpy as np
12
+
13
+ # rdkit
14
+ from rdkit import Chem
15
+ from rdkit.Chem import AllChem
16
+
17
+ # %% ../../nbs/gnina/04_gnina_docking.ipynb 8
18
def setup_gnina_local(version='v1.3'):
    """Download and install gnina in the current directory.

    Installs Open Babel through `sudo apt-get`, then downloads the `gnina`
    release binary for `version` into `./gnina` and marks it executable.
    Requires `sudo`, `apt-get` and `wget`; any failing step raises
    `subprocess.CalledProcessError`. No check for a CUDA-capable GPU is made.
    """
    subprocess.run(["sudo", "apt-get", "update", "-yq"], check=True,stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    subprocess.run(["sudo", "apt-get", "install", "-yq", "openbabel"], check=True)

    gnina_url = f"https://github.com/gnina/gnina/releases/download/{version}/gnina"
    print(f'Downloading {version} gnina')
    # Name the output explicitly: without -O, wget saves to `gnina.1` when a
    # `gnina` file already exists, and the chmod below would hit the old file.
    subprocess.run(["wget", gnina_url, "-O", "gnina"], check=True,stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    subprocess.run(["chmod", "+x", 'gnina'], check=True)

    print('Finish setup!')
37
+
38
+ # %% ../../nbs/gnina/04_gnina_docking.ipynb 13
39
def setup_gnina_docker():
    """Pull the `gnina/gnina` Docker image, hiding docker's own output.

    Raises `subprocess.CalledProcessError` if `docker pull` fails.
    """
    pull_cmd = ["docker", "pull", "gnina/gnina"]
    silence = {"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL}

    print("Pulling GNINA Docker image: gnina/gnina")
    subprocess.run(pull_cmd, check=True, **silence)
    print("GNINA Docker image is ready.")
44
+
45
+ # %% ../../nbs/gnina/04_gnina_docking.ipynb 18
46
def extract_gnina_dock(gnina_output):
    """Extract the scores of the top-ranked pose (mode 1) from gnina's docking output.

    Returns `(affinity, cnn_pose_score, cnn_affinity)` as floats, or `None` if
    no mode-1 row with at least three numbers is present.

    Two table layouts are handled: three score columns
    (affinity, CNN pose score, CNN affinity) and four score columns, where an
    extra intramolecular-energy column follows the affinity. In the latter case
    the first and the last two numbers of the row are used.
    """
    if not gnina_output:
        return None

    # A result row is a line consisting solely of the mode index followed by
    # numbers. Anchoring to a whole line avoids matching a stray "1" elsewhere,
    # and [ \t] (not \s) keeps the match from running on into the next row.
    row_pattern = re.compile(r'^[ \t]*1((?:[ \t]+-?\d+(?:\.\d+)?)+)[ \t\r]*$', re.MULTILINE)

    for row in row_pattern.finditer(gnina_output):
        numbers = [float(tok) for tok in row.group(1).split()]
        if len(numbers) == 3:
            return numbers[0], numbers[1], numbers[2]
        if len(numbers) > 3:
            # affinity first, CNN pose score and CNN affinity last
            return numbers[0], numbers[-2], numbers[-1]

    return None
58
+
59
+ # %% ../../nbs/gnina/04_gnina_docking.ipynb 19
60
def gnina_dock(receptor, # receptor file
               ligand, # ligand file
               autobox_ligand, # ligand file isolated from the complex
               output = 'docked.sdf', # output file (sdf or sdf.gz) to be saved
               seed=0, # set seeds
               exhaustiveness=None, # number of MC chains, default is 8 if None, the higher the better (16,32); for whole protein, use 64
               ):
    """Dock one ligand into a receptor with the `./gnina` binary in the current directory.

    Returns whatever `extract_gnina_dock` finds in gnina's stdout: the
    `(affinity, cnn_pose_score, cnn_affinity)` tuple of the top pose, or `None`
    when no score table is present (for example because gnina failed).
    """
    command = ['./gnina',
               '-r', str(receptor),
               '-l', str(ligand),
               '--autobox_ligand', str(autobox_ligand),
               '-o', str(output),
               '--seed', str(seed)]

    if exhaustiveness is not None:
        command.extend(['--exhaustiveness', str(exhaustiveness)])

    result = subprocess.run(command, capture_output=True, text=True)

    if result.returncode != 0:
        # Do not claim the output was saved when gnina itself reported failure.
        print(f'gnina exited with status {result.returncode}: {result.stderr.strip()}')
    else:
        print(f'save the docked file as {output}')

    values = extract_gnina_dock(result.stdout)

    print(f'affinity, cnn_pose_score, and cnn_affinity are: {values}')

    return values


def _smiles_to_sdf(smiles, sdf_path, seed=0):
    "Embed one 3D conformer for `smiles` with RDKit and write it to `sdf_path`."
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Could not parse SMILES: {smiles!r}")
    mol = Chem.AddHs(mol)
    if AllChem.EmbedMolecule(mol, randomSeed=seed) != 0:
        raise ValueError(f"Could not embed a 3D conformer for: {smiles!r}")
    AllChem.MMFFOptimizeMolecule(mol)  # best effort; the embedded geometry is kept if MMFF cannot be set up
    writer = Chem.SDWriter(str(sdf_path))
    try:
        writer.write(mol)
    finally:
        writer.close()


def gnina_dock_df(df,
                  ID_col = 'ID',
                  smi_col = 'SMILES',
                  output_dir = 'gnina_docked',
                  receptor = 'rec.pdb', # receptor file shared by all ligands
                  autobox_ligand = 'lig.pdb', # reference ligand that defines the search box
                  ):
    """Dock every SMILES in `df` against one receptor and return a scored copy of `df`.

    This batch routine was originally also named `gnina_dock`. That second
    definition replaced the single-ligand function above and then called itself
    with the single-ligand arguments, so it could never run. It also relied on
    an `rdkit_conformer` helper that is not defined in this module; conformers
    are now generated by `_smiles_to_sdf`.

    Ligand conformers go to `{output_dir}/ligand/{ID}.sdf` and docked poses to
    `{output_dir}/docked/docked_{ID}.sdf`. The returned copy of `df` gains the
    columns `Affinity`, `CNN_Pose_Score` and `CNN_Affinity`; rows whose ligand
    could not be prepared or scored hold NaN.
    """
    out_root = Path(output_dir)
    lig_dir, dock_dir = out_root / 'ligand', out_root / 'docked'
    for folder in (lig_dir, dock_dir):
        folder.mkdir(parents=True, exist_ok=True)

    missing = (np.nan, np.nan, np.nan)
    scores = []

    for _, r in tqdm(df.iterrows(), total=len(df), desc='Docking'):
        lig_sdf = lig_dir / f'{r[ID_col]}.sdf'
        docked_sdf = dock_dir / f'docked_{r[ID_col]}.sdf'
        try:
            _smiles_to_sdf(r[smi_col], lig_sdf)
        except ValueError as err:
            # one bad ligand must not abort the whole screen
            print(f'Skipping {r[ID_col]}: {err}')
            scores.append(missing)
            continue
        values = gnina_dock(receptor, lig_sdf, autobox_ligand, docked_sdf)
        scores.append(values if values is not None else missing)

    df = df.copy()
    df['Affinity'] = [s[0] for s in scores]
    df['CNN_Pose_Score'] = [s[1] for s in scores]
    df['CNN_Affinity'] = [s[2] for s in scores]

    return df
kdock/gnina/rescore.py ADDED
@@ -0,0 +1,204 @@
1
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/gnina/05_gnina_AF3_rescore.ipynb.
2
+
3
+ # %% auto 0
4
+ __all__ = ['ChainSelect', 'rename_residues', 'split_cif', 'pdb2sdf', 'prepare_rec_lig', 'gnina_rescore_local',
5
+ 'gnina_rescore_docker', 'extract_gnina_rescore', 'get_gnina_rescore', 'get_gnina_rescore_folder']
6
+
7
+ # %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 3
8
+ import pandas as pd
9
+ import re, os, subprocess, py3Dmol
10
+ from Bio.PDB import MMCIFParser, PDBIO, Select
11
+ from rdkit import Chem
12
+ from rdkit.Chem import AllChem
13
+ from pathlib import Path
14
+ from fastcore.all import L
15
+ from tqdm.contrib.concurrent import process_map
16
+ from functools import partial
17
+
18
+ # %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 5
19
class ChainSelect(Select):
    """Selection filter that keeps only the chains listed in `chain_ids`."""

    def __init__(self, chain_ids):
        # IDs of the chains to keep; anything supporting `in` works
        self.chain_ids = chain_ids

    def accept_chain(self, chain):
        "Return True when the chain's ID is one of `chain_ids`."
        wanted = self.chain_ids
        current_id = chain.get_id()
        return current_id in wanted
25
+
26
+ # %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 6
27
def rename_residues(structure, chain_id, new_resname='LIG'):
    """Set `resname` to `new_resname` on every residue of the chains whose id is `chain_id`.

    Applies to all models in `structure` and modifies the residues in place.
    Used to shorten names such as LIG_L, which are too long for the PDB
    residue-name field and make RDKit fail.
    """
    matching_chains = (
        chain
        for model in structure
        for chain in model
        if chain.id == chain_id
    )
    for chain in matching_chains:
        for residue in chain:
            residue.resname = new_resname
34
+
35
+ # %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 7
36
def split_cif(cif_path, rec_chain_id,lig_chain_id, rec_pdb_path, lig_pdb_path):
    """Split an AF3 output CIF into a receptor PDB and a ligand PDB.

    Residues of the ligand chain are renamed to `LIG` before writing; the
    chains selected by `rec_chain_id` and `lig_chain_id` are then saved to
    `rec_pdb_path` and `lig_pdb_path` respectively.
    """
    structure = MMCIFParser(QUIET=True).get_structure('complex', cif_path)
    rename_residues(structure, chain_id=lig_chain_id, new_resname='LIG')

    pdb_writer = PDBIO()
    pdb_writer.set_structure(structure)

    # receptor first, then ligand
    outputs = ((rec_chain_id, rec_pdb_path), (lig_chain_id, lig_pdb_path))
    for chain_ids, out_path in outputs:
        pdb_writer.save(str(out_path), ChainSelect(chain_ids))
45
+
46
+ # %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 8
47
def pdb2sdf(pdb_path, sdf_path):
    """Convert a ligand PDB file to SDF with RDKit.

    Returns `None` on success. If RDKit cannot build a molecule from
    `pdb_path`, a message is printed and `pdb_path` is returned so callers can
    collect the failures.
    """
    # Hand RDKit plain strings, as the rest of the package does; callers in this
    # module pass pathlib.Path objects.
    mol = Chem.MolFromPDBFile(str(pdb_path), sanitize=True, removeHs=False)
    if not mol:
        print('Conversion failed for:', pdb_path)
        return pdb_path

    writer = Chem.SDWriter(str(sdf_path))
    try:
        writer.write(mol)
    finally:
        writer.close()  # always flush/release the output file
    return None
58
+
59
+ # %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 9
60
def prepare_rec_lig(cif_path, rec_chain_id, lig_chain_id, rec_pdb_path, lig_pdb_path):
    """Split an AF3 CIF into a receptor PDB and a ligand SDF.

    The receptor chain is written to `rec_pdb_path`. The ligand chain is first
    written to a scratch PDB and then converted to SDF at `lig_pdb_path`
    (despite its name, this parameter is the SDF output path).

    Returns `None` on success, or `str(cif_path)` when the ligand could not be
    converted. The scratch PDB is gone by then, so the CIF is the useful
    identifier for the caller.
    """
    import tempfile

    # A private temporary directory keeps the scratch file out of the working
    # directory and guarantees cleanup even if splitting/conversion raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        tmp_path = str(Path(scratch_dir) / f'{Path(cif_path).stem}_lig.pdb')
        split_cif(cif_path, rec_chain_id, lig_chain_id, rec_pdb_path, tmp_path)
        failed = pdb2sdf(tmp_path, str(lig_pdb_path))

    return None if failed is None else str(cif_path)
72
+
73
+ # %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 15
74
def gnina_rescore_local(protein_pdb, # receptor file
                        ligand_sdf, # ligand file
                        CNN_affinity=True,
                        vinardo=False, # if True, use vinardo instead of vina
                        ):
    """Run `./gnina --minimize` on a receptor/ligand pair and return its stdout.

    CNN scoring is switched off when `CNN_affinity` is false; the vinardo
    scoring function is requested when `vinardo` is true.
    """
    cnn_flags = [] if CNN_affinity else ['--cnn_scoring', 'none']
    scoring_flags = ['--scoring', 'vinardo'] if vinardo else []

    argv = [
        './gnina',
        '-r', protein_pdb,
        '-l', ligand_sdf,
        '--minimize',  # always include this
        *cnn_flags,
        *scoring_flags,
    ]

    completed = subprocess.run(argv, capture_output=True, text=True)
    return completed.stdout
93
+
94
+ # %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 17
95
def gnina_rescore_docker(protein_pdb,
                         ligand_sdf,
                         CNN_affinity=True,
                         vinardo=False):
    "Run GNINA rescoring using Docker. Supports receptor and ligand in different folders."

    rec_file = Path(protein_pdb).resolve()
    lig_file = Path(ligand_sdf).resolve()

    # Each input's parent folder is mounted at a fixed location in the container
    rec_mount, lig_mount = '/recdata', '/ligdata'

    docker_part = [
        'docker', 'run', '--rm',
        '-v', f'{rec_file.parent}:{rec_mount}',  # mount receptor directory
        '-v', f'{lig_file.parent}:{lig_mount}',  # mount ligand directory
        'gnina/gnina',
    ]
    gnina_part = [
        'gnina',
        '-r', f'{rec_mount}/{rec_file.name}',
        '-l', f'{lig_mount}/{lig_file.name}',
        '--minimize',  # always include
    ]
    if not CNN_affinity:
        gnina_part.extend(['--cnn_scoring', 'none'])
    if vinardo:
        gnina_part.extend(['--scoring', 'vinardo'])

    completed = subprocess.run(docker_part + gnina_part, capture_output=True, text=True)
    return completed.stdout
126
+
127
+ # %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 19
128
def extract_gnina_rescore(txt):
    """Parse GNINA rescoring output into a dict of floats.

    Each metric is searched for independently; metrics that do not occur in
    `txt` are left out of the result. `uncertainty` is the second number on
    the `Affinity:` line.
    """
    metric_patterns = (
        ('binding_energy', r'Affinity:\s+([-.\d]+)'),
        ('uncertainty', r'Affinity:\s+[-.\d]+\s+([-.\d]+)'),
        ('RMSD', r'RMSD:\s+([-.\d]+)'),
        ('CNNscore', r'CNNscore:\s+([-.\d]+)'),
        ('CNNaffinity', r'CNNaffinity:\s+([-.\d]+)'),
        ('CNNvariance', r'CNNvariance:\s+([-.\d]+)'),
    )

    searches = ((name, re.search(regex, txt)) for name, regex in metric_patterns)
    return {name: float(hit.group(1)) for name, hit in searches if hit}
147
+
148
+ # %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 24
149
def get_gnina_rescore(cif_path,
                      rec_chain_id='A',
                      lig_chain_id='L',
                      CNN_affinity=True,
                      vinardo=False,
                      is_local=True):
    """Split the CIF into receptor and ligand folders, then extract the GNINA rescored affinity score.

    For a CIF in folder `X`, the receptor PDB is written to `X_receptor/` and
    the ligand SDF to `X_ligand/`, both named after the CIF stem. Rescoring
    runs through the local binary when `is_local` is true, otherwise through
    Docker. Returns the metrics dict from `extract_gnina_rescore`; the dict is
    empty when the ligand could not be converted or gnina printed no scores.
    """
    # Resolve first so a bare file name yields sibling folders of its real
    # parent directory instead of folders derived from ".".
    cif_path = Path(cif_path).expanduser().resolve()
    parent,stem = cif_path.parent,cif_path.stem

    rec_dir,lig_dir = Path(str(parent) + '_receptor'),Path(str(parent) + '_ligand')
    rec_path,lig_path = rec_dir/f'{stem}.pdb',lig_dir/f'{stem}.sdf'

    rec_dir.mkdir(parents=True, exist_ok=True)
    lig_dir.mkdir(parents=True, exist_ok=True)

    failed = prepare_rec_lig(cif_path,rec_chain_id, lig_chain_id,rec_path,lig_path)
    if failed is not None:
        # No fresh ligand SDF was produced; scoring now could silently pick up
        # a stale file left by an earlier run.
        return {}

    rescore = gnina_rescore_local if is_local else gnina_rescore_docker
    gnina_output = rescore(rec_path,lig_path,CNN_affinity,vinardo)
    return extract_gnina_rescore(gnina_output)
172
+
173
+ # %% ../../nbs/gnina/05_gnina_AF3_rescore.ipynb 29
174
def get_gnina_rescore_folder(cif_folder,
                             rec_chain_id='A',
                             lig_chain_id='L',
                             CNN_affinity=True,
                             vinardo=False,
                             is_local=True,
                             max_workers=4, # number of parallel worker processes
                             ):
    """Parallel processing to get gnina rescore given folder path.

    Runs `get_gnina_rescore` on every `*.cif` directly inside `cif_folder` and
    returns one DataFrame row per file, with the file stem in column `ID`. The
    energy-related columns are prefixed with `vinardo_` or `vina_` depending on
    the scoring function used.
    """
    # Sort so the row order is reproducible; glob order depends on the filesystem.
    cifs = sorted(Path(cif_folder).expanduser().glob("*.cif")) # just take cif file

    func = partial(get_gnina_rescore,
                   rec_chain_id=rec_chain_id,
                   lig_chain_id=lig_chain_id,
                   CNN_affinity=CNN_affinity,
                   vinardo=vinardo,
                   is_local=is_local)
    results = process_map(func, cifs, max_workers=max_workers)

    # use path.stem as df index
    results_dict = dict(zip([p.stem for p in cifs], results))
    result_df = pd.DataFrame(results_dict).T.reset_index(names='ID')

    prefix = "vinardo_" if vinardo else "vina_"
    result_df = result_df.rename(columns={
        "binding_energy": f"{prefix}binding_energy",
        "uncertainty": f"{prefix}uncertainty",
        "RMSD": f"{prefix}RMSD"
    })

    return result_df
203
+
204
+
kdock/px/__init__.py ADDED
File without changes
kdock/px/core.py ADDED
@@ -0,0 +1,130 @@
1
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/protenix/07_protenix.ipynb.
2
+
3
+ # %% auto 0
4
+ __all__ = ['get_single_job', 'get_single_protein_ligand_json', 'get_protein_ligand_df_json', 'get_virtual_screening_json']
5
+
6
+ # %% ../../nbs/protenix/07_protenix.ipynb 6
7
+ import json
8
+ from pathlib import Path
9
+
10
+ # %% ../../nbs/protenix/07_protenix.ipynb 9
11
def get_single_job(job_name, protein_seq, msa_dir, SMILES=None,CCD=None):
    """Build one protenix job entry for a protein chain plus a single ligand.

    Exactly one of `SMILES` / `CCD` must be given; a CCD code is written as
    `CCD_<code>`. Raises `ValueError` when both or neither are supplied.
    """
    has_smiles, has_ccd = bool(SMILES), bool(CCD)
    if has_smiles and has_ccd:
        raise ValueError("Please provide only one of SMILES or CCD, not both.")
    if not (has_smiles or has_ccd):
        raise ValueError("You must provide either SMILES or CCD.")

    ligand_value = f"CCD_{CCD}" if not has_smiles else SMILES

    protein_entry = {
        "proteinChain": {
            "count": 1,
            "sequence": protein_seq,
            "msa": {
                "precomputed_msa_dir": msa_dir,
                "pairing_db": "uniref100",
            },
        }
    }
    ligand_entry = {
        "ligand": {
            "count": 1,
            "ligand": ligand_value,
        }
    }

    return {"name": job_name, "sequences": [protein_entry, ligand_entry]}
42
+
43
+ # %% ../../nbs/protenix/07_protenix.ipynb 11
44
def get_single_protein_ligand_json(job_name,
                                   protein_seq,
                                   msa_dir,
                                   SMILES=None,
                                   CCD=None,
                                   json_path=None):
    """Generate json input for one protein-ligand job.

    Returns a one-element list holding the job dict. When `json_path` is given
    the list is also written there (indent 4), creating parent folders as needed.
    """
    jobs = [get_single_job(job_name, protein_seq, msa_dir, SMILES=SMILES, CCD=CCD)]

    if not json_path:
        return jobs

    target = Path(json_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w") as handle:
        json.dump(jobs, handle, indent=4)
    print(f"JSON saved to {target}")

    return jobs
61
+
62
+ # %% ../../nbs/protenix/07_protenix.ipynb 16
63
def get_protein_ligand_df_json(df,
                               id_col,
                               seq_col,
                               msa_col,
                               smi_col=None,
                               ccd_col=None,
                               save_json=None):
    """Get json file of protein and ligand in a dataframe.

    Each row becomes one job: name from `id_col`, sequence from `seq_col`, MSA
    directory from `msa_col`, ligand from whichever of `smi_col` / `ccd_col`
    is given. Returns the list of job dicts and, if `save_json` is set, also
    writes it to that path. Raises `ValueError` when both or neither ligand
    column is supplied.
    """
    if smi_col and ccd_col:
        raise ValueError("Provide only one of smi_col or ccd_col, not both.")
    if not (smi_col or ccd_col):
        raise ValueError("You must provide either smi_col or ccd_col.")

    # Pick the ligand column and the matching keyword of `get_single_job` once
    if smi_col is not None:
        lig_col, lig_kwarg = smi_col, "SMILES"
    else:
        lig_col, lig_kwarg = ccd_col, "CCD"

    def row_to_job(row):
        name, seq, msa = row[id_col], row[seq_col], row[msa_col]
        return get_single_job(name, seq, msa, **{lig_kwarg: row[lig_col]})

    all_jobs = df.apply(row_to_job, axis=1).tolist()

    if not save_json:
        return all_jobs

    target = Path(save_json)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w") as handle:
        json.dump(all_jobs, handle, indent=4)
    print(f"JSON saved to {target}")

    return all_jobs
97
+
98
+
99
+ # %% ../../nbs/protenix/07_protenix.ipynb 19
100
def get_virtual_screening_json(df,
                               protein_seq,
                               msa_dir,
                               id_col,
                               smi_col=None,
                               ccd_col=None,
                               save_json=None):
    """Get json file of single protein against multiple SMILES in a dataframe.

    Every row of `df` yields one job that pairs the fixed `protein_seq` /
    `msa_dir` with that row's ligand, taken from `smi_col` or `ccd_col`.
    Returns the list of job dicts and, if `save_json` is set, also writes it to
    that path. Raises `ValueError` when both or neither ligand column is supplied.
    """
    if smi_col and ccd_col:
        raise ValueError("Provide only one of smi_col or ccd_col, not both.")
    if not (smi_col or ccd_col):
        raise ValueError("You must provide either smi_col or ccd_col.")

    # Pick the ligand column and the matching keyword of `get_single_job` once
    if smi_col is not None:
        lig_col, lig_kwarg = smi_col, "SMILES"
    else:
        lig_col, lig_kwarg = ccd_col, "CCD"

    def row_to_job(row):
        name = row[id_col]
        return get_single_job(name, protein_seq, msa_dir, **{lig_kwarg: row[lig_col]})

    all_jobs = df.apply(row_to_job, axis=1).tolist()

    if not save_json:
        return all_jobs

    target = Path(save_json)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w") as handle:
        json.dump(all_jobs, handle, indent=4)
    print(f"JSON saved to {target}")

    return all_jobs