kdock 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kdock/af3/json.py ADDED
@@ -0,0 +1,282 @@
1
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/af3/00_json.ipynb.
2
+
3
+ # %% auto 0
4
+ __all__ = ['dump_json', 'get_protein_json', 'read_json', 'get_protein_smiles_json', 'get_protein_ccdcode_json',
5
+ 'assign_atom_names_from_graph', 'mol_to_ccd_text', 'sdf2ccd', 'get_protein_ccd_json', 'split_nfolder']
6
+
7
+ # %% ../../nbs/af3/00_json.ipynb 2
8
+ import re, shutil, json, pandas as pd, numpy as np
9
+ from pathlib import Path
10
+
11
+ from rdkit import Chem as rd_chem
12
+ from rdkit.Chem import AllChem,rdmolfiles
13
+ from rdkit import Chem
14
+
15
+ from Bio.PDB import PDBParser
16
+
17
+ # %% ../../nbs/af3/00_json.ipynb 4
18
def dump_json(data, save_path):
    "Write `data` to `save_path` as JSON indented by 4 spaces"
    with open(save_path, 'w') as out_handle:
        json.dump(data, out_handle, indent=4)
22
+
23
+ # %% ../../nbs/af3/00_json.ipynb 5
24
def get_protein_json(name, # job name
                     seq, # aa sequence
                     save_path=None, # .json
                     seeds=None, # model seeds; defaults to [1]
                     ):
    """Generate json of single protein sequence for input of docker command.

    Returns the job dict (one protein chain with id "A"); when `save_path` is
    given, the dict is also written there and missing parent folders are created.
    """
    # Build a fresh list on every call: a shared default list would end up inside
    # the returned dict, so editing one result would leak into later calls.
    model_seeds = [1] if seeds is None else list(seeds)

    json_data = {
        "name": name,
        "modelSeeds": model_seeds,
        "sequences": [
            {
                "protein": {
                    "id": "A",
                    "sequence": seq,
                }
            },
        ],
        "bondedAtomPairs": [],
        "dialect": "alphafold3",
        "version": 3
    }
    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        dump_json(json_data, save_path)
    return json_data
50
+
51
+ # %% ../../nbs/af3/00_json.ipynb 9
52
def read_json(file_path):
    "Load the JSON file at `file_path` and return the parsed content"
    with open(file_path, 'r') as in_handle:
        return json.load(in_handle)
56
+
57
+ # %% ../../nbs/af3/00_json.ipynb 11
58
def get_protein_smiles_json(smi_id:str, # job name
                            SMILES:str, # ligand SMILES string
                            protein_json, # json type
                            save_path=None, # .json
                            seeds=None, # model seeds; defaults to [1]
                            ):
    """Get json for protein-ligand docking task.

    The ligand (id "L") is described by `SMILES`; the protein entry is the first
    item of `protein_json["sequences"]` that has a "protein" key (StopIteration
    is raised when there is none).
    """
    # No manual escaping is needed: the json module escapes backslashes itself
    # when the dict is serialized.
    smiles = str(SMILES)
    # Build a fresh list on every call instead of sharing one default list.
    model_seeds = [1] if seeds is None else list(seeds)

    protein_entry = next(item["protein"] for item in protein_json["sequences"] if "protein" in item)
    json_data = {
        "name": smi_id,
        "modelSeeds": model_seeds,
        "sequences": [
            {
                "ligand": {
                    "id": "L",
                    "smiles": smiles,
                }
            },
            {
                "protein": protein_entry
            },
        ],
        "bondedAtomPairs": [],
        "dialect": "alphafold3",
        "version": 2
    }
    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        dump_json(json_data, save_path)
    return json_data
90
+
91
+ # %% ../../nbs/af3/00_json.ipynb 18
92
def get_protein_ccdcode_json(protein_json, # dict with protein sequence
                             ccd_code, # str or list of str
                             job_id: str, # job/task ID
                             save_path=None, # optional output path
                             seeds=None): # optional random seeds; defaults to [1]
    """Create AlphaFold3 docking JSON with CCD code(s).

    Raises TypeError when `ccd_code` is neither a string nor a list, and
    StopIteration when `protein_json["sequences"]` has no "protein" entry.
    """
    # Normalize ccd_code to a list
    if isinstance(ccd_code, str):
        ccd_code = [ccd_code]
    elif not isinstance(ccd_code, list):
        raise TypeError("ccd_code must be a string or a list of strings.")

    # Build a fresh list on every call instead of sharing one default list.
    model_seeds = [1] if seeds is None else list(seeds)

    protein_entry = next(item["protein"] for item in protein_json["sequences"] if "protein" in item)

    json_data = {
        "name": job_id,
        "modelSeeds": model_seeds,
        "sequences": [
            {
                "ligand": {
                    "id": "L",
                    "ccdCodes": ccd_code
                }
            },
            {
                "protein": protein_entry
            },
        ],
        "dialect": "alphafold3",
        "version": 3
    }

    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        dump_json(json_data, save_path)

    return json_data
130
+
131
+ # %% ../../nbs/af3/00_json.ipynb 22
132
# Lookup from RDKit bond types to the value_order codes written into the CCD text
_RDKIT_BOND_TYPE_TO_MMCIF = {
    rd_chem.BondType.SINGLE: 'SING',
    rd_chem.BondType.DOUBLE: 'DOUB',
    rd_chem.BondType.TRIPLE: 'TRIP',
    rd_chem.BondType.AROMATIC: 'AROM',
}
139
+
140
def assign_atom_names_from_graph(mol):
    "Set an 'atom_name' property on every atom (element symbol + 1-based position) and return `mol`"
    for position, atom in enumerate(mol.GetAtoms(), start=1):
        label = f"{atom.GetSymbol()}{position}"
        atom.SetProp('atom_name', label)
    return mol
144
+
145
def mol_to_ccd_text(mol, component_id, pdbx_smiles=None, include_hydrogens=False):
    """Render an RDKit molecule as CCD (mmCIF chemical component) text.

    mol: RDKit molecule with at least one conformer; it is copied, not modified.
    component_id: used for the data block name, `_chem_comp.id` and `_chem_comp.name`.
    pdbx_smiles: optional value for a `_chem_comp.pdbx_smiles` line.
    include_hydrogens: when True, hydrogens are added and written; when False,
        hydrogen atoms and any bond touching one are left out.

    Returns the CCD text as one newline-joined string.
    Raises ValueError when the molecule has no conformers, and KeyError for a bond
    type missing from `_RDKIT_BOND_TYPE_TO_MMCIF`.
    """
    mol = rd_chem.Mol(mol)  # work on a copy so the caller's molecule is untouched
    if include_hydrogens:
        # addCoords=True derives positions for the new hydrogens from the heavy-atom
        # geometry; without it RDKit leaves every added hydrogen at (0, 0, 0).
        mol = rd_chem.AddHs(mol, addCoords=True)
    # NOTE(review): clearAromaticFlags=True marks every bond non-aromatic, so the
    # pdbx_aromatic_flag column below is expected to be 'N' throughout - confirm intended.
    rd_chem.Kekulize(mol, clearAromaticFlags=True)

    if mol.GetNumConformers() == 0:
        raise ValueError('The molecule has no conformers')
    coords = mol.GetConformer().GetPositions()

    mol = assign_atom_names_from_graph(mol)
    atom_map = {atom.GetIdx(): atom.GetProp('atom_name') for atom in mol.GetAtoms()}

    def _is_dropped(atom):
        "True for hydrogens when they are excluded from the output."
        return not include_hydrogens and atom.GetSymbol() == 'H'

    lines = [
        f"data_{component_id}",
        "#",
        f"_chem_comp.id {component_id}",
        f"_chem_comp.name '{component_id}'",
        "_chem_comp.type non-polymer",
        "_chem_comp.formula '?'",
        "_chem_comp.mon_nstd_parent_comp_id ?",
        "_chem_comp.pdbx_synonyms ?",
        "_chem_comp.formula_weight '?'",
    ]
    if pdbx_smiles:
        lines.append(f"_chem_comp.pdbx_smiles {pdbx_smiles}")

    # Atom table: one row per written atom.
    lines += [
        "#",
        "loop_",
        "_chem_comp_atom.comp_id",
        "_chem_comp_atom.atom_id",
        "_chem_comp_atom.type_symbol",
        "_chem_comp_atom.charge",
        "_chem_comp_atom.pdbx_leaving_atom_flag",
        "_chem_comp_atom.pdbx_model_Cartn_x_ideal",
        "_chem_comp_atom.pdbx_model_Cartn_y_ideal",
        "_chem_comp_atom.pdbx_model_Cartn_z_ideal",
    ]
    for atom in mol.GetAtoms():
        if _is_dropped(atom):
            continue
        x, y, z = coords[atom.GetIdx()]
        lines.append(f"{component_id} {atom_map[atom.GetIdx()]} {atom.GetSymbol()} {atom.GetFormalCharge()} N {x:.3f} {y:.3f} {z:.3f}")

    # Bond table: one row per bond between written atoms.
    lines += [
        "#",
        "loop_",
        "_chem_comp_bond.atom_id_1",
        "_chem_comp_bond.atom_id_2",
        "_chem_comp_bond.value_order",
        "_chem_comp_bond.pdbx_aromatic_flag",
    ]
    for bond in mol.GetBonds():
        a1, a2 = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        if _is_dropped(mol.GetAtomWithIdx(a1)) or _is_dropped(mol.GetAtomWithIdx(a2)):
            continue
        bond_type = _RDKIT_BOND_TYPE_TO_MMCIF[bond.GetBondType()]
        aromatic_flag = 'Y' if bond.GetIsAromatic() else 'N'
        lines.append(f"{atom_map[a1]} {atom_map[a2]} {bond_type} {aromatic_flag}")
    lines.append("#")

    return "\n".join(lines)
210
+
211
+ # %% ../../nbs/af3/00_json.ipynb 23
212
def sdf2ccd(sdf_path, # path to the ligand .sdf file (str or Path)
            CCD_name='lig-1', # do not use '_'; use as few letters as possible, 'lig-any' leads to extra ligands
            ):
    """Convert the compound to the AF3 required CCD format.

    Only the first molecule of the SDF file is converted.
    Raises ValueError when that first record cannot be parsed.
    """
    # SDMolSupplier expects a plain string file name, so Path inputs are converted.
    supplier = Chem.SDMolSupplier(str(sdf_path))
    mol = supplier[0] # Get the first molecule
    if mol is None:
        # RDKit yields None for a record it cannot parse; fail here with a clear message.
        raise ValueError(f"Could not parse the first molecule in {sdf_path}")
    return mol_to_ccd_text(mol, CCD_name)
220
+
221
+ # %% ../../nbs/af3/00_json.ipynb 26
222
def get_protein_ccd_json(protein_json, # dict with protein sequence
                         rec_residue_num:int, # 1-indexed, for bondedAtomPairs, e.g., ["A", 145, "SG"]
                         rec_atom_id:str, # for bondedAtomPairs, e.g., ["A", 145, "SG"]
                         lig_sdf_path, # path to the ligand .sdf file; converted to userCCD text
                         lig_atom_id:str, # ligand atom name in the userCCD (element symbol + 1-based atom position, e.g. "C4"), for bondedAtomPairs
                         job_id:str, # str, job/task ID
                         save_path=None,# optional output path
                         seeds=None, # optional random seeds; defaults to [1]
                         ):
    """Create AlphaFold3 docking JSON with customized CCD ligand and bondedAtomPairs.

    The ligand SDF is converted with `sdf2ccd` and embedded as "userCCD"; one bond
    is declared between chain "A" (`rec_residue_num`, `rec_atom_id`) and the
    ligand chain "L" (residue 1, `lig_atom_id`).
    """
    # get userCCD
    userCCD = sdf2ccd(lig_sdf_path)
    # Read the component id back from the generated text so "ccdCodes" always matches it.
    ccd_id = re.search(r"_chem_comp\.id\s+([^\s#]+)", userCCD).group(1)

    # Build a fresh list on every call instead of sharing one default list.
    model_seeds = [1] if seeds is None else list(seeds)

    protein_entry = next(item["protein"] for item in protein_json["sequences"] if "protein" in item)

    json_data = {
        "name": job_id,
        "modelSeeds": model_seeds,
        "sequences": [
            {
                "ligand": {
                    "id": "L",
                    "ccdCodes": [ccd_id]
                }
            },
            {
                "protein": protein_entry
            },
        ],
        "bondedAtomPairs": [[["A", rec_residue_num, rec_atom_id],["L", 1, lig_atom_id]]],
        "userCCD": userCCD,
        "dialect": "alphafold3",
        "version": 3
    }

    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        dump_json(json_data, save_path)

    return json_data
264
+
265
+ # %% ../../nbs/af3/00_json.ipynb 30
266
def split_nfolder(folder_dir, # folder that directly contains the .json files
                  n=4): # number of subfolders, must be >= 1
    """Move json files from a folder into subfolders (folder_0, folder_1, ..., folder_{n-1}).

    Files are taken in sorted name order and dealt out round-robin. Only `*.json`
    files directly inside `folder_dir` are moved.
    Raises ValueError when `n` is smaller than 1.
    """
    if n < 1:
        # n == 0 would otherwise fail with ZeroDivisionError on the first file, and a
        # negative n would create no folders at all.
        raise ValueError(f"n must be at least 1, got {n}")

    folder_dir = Path(folder_dir)

    files = sorted(folder_dir.glob("*.json"))
    subfolders = [folder_dir / f"folder_{i}" for i in range(n)]
    for folder in subfolders:
        folder.mkdir(exist_ok=True)

    for idx, file in enumerate(files):
        target_folder = subfolders[idx % n]
        shutil.move(str(file), target_folder / file.name)

    print(f"Distributed {len(files)} files into {n} folders.")
@@ -0,0 +1,95 @@
1
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/af3/02_protein_pairs.ipynb.
2
+
3
+ # %% auto 0
4
+ __all__ = ['get_colabfold_cmd', 'copy_a3m', 'a3m_to_seq', 'get_protein_subjson', 'dump_json_folder', 'get_multi_protein_json',
5
+ 'generate_pair_df']
6
+
7
+ # %% ../../nbs/af3/02_protein_pairs.ipynb 4
8
+ import os, json, shutil, pandas as pd
9
+ from tqdm import tqdm
10
+ from itertools import combinations
11
+ from pathlib import Path
12
+ from .json import *
13
+ from .docker import *
14
+
15
+ # %% ../../nbs/af3/02_protein_pairs.ipynb 9
16
def get_colabfold_cmd(csv_path,project_name):
    "Print the colabfold_batch command (MSA only) to run in a terminal"
    command = f'\n colabfold_batch {csv_path} msa_{project_name} --msa-only'
    for message in ('Run below in terminal:', command):
        print(message)
19
+
20
+ # %% ../../nbs/af3/02_protein_pairs.ipynb 13
21
def copy_a3m(a3m_dir: str, # Path to the source directory containing .a3m files.
             dest_dir: str, # Path to the destination directory where files will be copied
             ):
    "Copy every .a3m file found directly in `a3m_dir` into `dest_dir`, creating `dest_dir` if needed."
    a3m_dir = Path(a3m_dir)
    dest_dir = Path(dest_dir)
    dest_dir.mkdir(parents=True, exist_ok=True)

    files = list(a3m_dir.glob('*.a3m'))
    progress = tqdm(files, desc="Copying files", unit="file")
    for source_file in progress:
        destination = dest_dir / source_file.name
        shutil.copy(source_file, destination)

    print(f"Copied {len(files)} a3m files from {a3m_dir} to {dest_dir}")
35
+
36
+ # %% ../../nbs/af3/02_protein_pairs.ipynb 17
37
def a3m_to_seq(file_path:Path): # path to the .a3m file (Path or str)
    """Get protein sequence from a3m file.

    Returns the line that follows the first '>' header. In the layout the original
    code assumed ('#...' line, '>' header, sequence) that is the third line, and
    files without the leading '#' line are handled as well.
    Raises IndexError when no line follows a '>' header.
    """
    lines = Path(file_path).read_text().splitlines()
    for idx, line in enumerate(lines):
        if line.startswith('>'):
            if idx + 1 < len(lines):
                return lines[idx + 1]
            break
    raise IndexError(f"No sequence line found after a '>' header in {file_path}")
40
+
41
+ # %% ../../nbs/af3/02_protein_pairs.ipynb 19
42
def get_protein_subjson(gene_name, # file stem of the .a3m file
                        a3m_dir=".", # folder holding {gene_name}.a3m
                        idx = 'A', # chain id written to the 'id' field
                        run_template=True): # True -> 'templates' is None, False -> 'templates' is []
    """Get subjson (protein part) with colabfold unpairedMSA .a3m path.

    The sequence is read from `{a3m_dir}/{gene_name}.a3m`; the same relative path,
    prefixed with /root, is stored as 'unpairedMsaPath'.
    """
    file_path = Path(a3m_dir)/f"{gene_name}.a3m"
    protein_sequence = a3m_to_seq(file_path)

    # The path is consumed inside the docker container, so it has to use forward
    # slashes whatever the host OS is; as_posix() equals str() on POSIX hosts.
    docker_msa_path = ("/root"/file_path).as_posix() # for docker path, ECD under af_input

    json_data = {
        'id': idx,
        'sequence': protein_sequence,
        'modifications': [],
        'unpairedMsaPath': docker_msa_path,
        'pairedMsa': '',
        # NOTE(review): presumably None lets AlphaFold3 run its own template search
        # while [] disables templates - confirm against the AF3 input spec.
        'templates': None if run_template else []
    }

    return json_data
57
+
58
+ # %% ../../nbs/af3/02_protein_pairs.ipynb 22
59
def dump_json_folder(json_data, # job dict; its 'name' value becomes the file stem
                     folder): # output folder, created when missing
    "Save json under a folder as {name}.json"
    folder = Path(folder)
    # Create the folder first, like the other writers in this package do for their
    # parent directories; otherwise open() fails on a fresh output location.
    folder.mkdir(parents=True, exist_ok=True)
    file_path = folder/f"{json_data['name']}.json"
    with open(file_path,'w') as f: json.dump(json_data,f,indent=4)
63
+
64
+ # %% ../../nbs/af3/02_protein_pairs.ipynb 23
65
def get_multi_protein_json(gene_list,a3m_dir,run_template=True,save_folder=None):
    'Build one job json holding every gene in `gene_list` as a protein chain (ids A, B, C, ...) with its colabfold unpaired MSA path'
    chain_ids = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    sequences = [
        {'protein': get_protein_subjson(gene, a3m_dir, idx=chain_ids[position], run_template=run_template)}
        for position, gene in enumerate(gene_list)
    ]
    json_data = {
        "name": '_'.join(gene_list),
        "modelSeeds": [1],
        "sequences": sequences,
        "bondedAtomPairs": [],
        "dialect": "alphafold3",
        "version": 2
    }
    if save_folder:
        dump_json_folder(json_data, save_folder)
    return json_data
84
+
85
+ # %% ../../nbs/af3/02_protein_pairs.ipynb 27
86
def generate_pair_df(gene_list,self_pair=True):
    "Dataframe (Gene1, Gene2) of every unordered gene pair, followed by each gene paired with itself when `self_pair` is set"
    frames = [pd.DataFrame(list(combinations(gene_list, 2)), columns=["Gene1", "Gene2"])]
    if self_pair:
        frames.append(pd.DataFrame({'Gene1': gene_list, 'Gene2': gene_list}))

    combined = pd.concat(frames) if len(frames) > 1 else frames[0]
    return combined.reset_index(drop=True)
kdock/core/__init__.py ADDED
File without changes
kdock/core/data.py ADDED
@@ -0,0 +1,64 @@
1
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/core/00_data.ipynb.
2
+
3
+ # %% auto 0
4
+ __all__ = ['BASE_URL', 'fetch_csv', 'Collins', 'Kras']
5
+
6
+ # %% ../../nbs/core/00_data.ipynb 3
7
+ import pandas as pd
8
+ import requests
9
+ from functools import lru_cache
10
+
11
+ # %% ../../nbs/core/00_data.ipynb 7
12
# Prefix joined with a relative "dataset/<file>.csv" path by the dataset loaders below
BASE_URL = "https://github.com/sky1ove/kdock/raw/main/"
13
+
14
+ # %% ../../nbs/core/00_data.ipynb 8
15
@lru_cache()
def _fetch_csv_cached(url):
    "Read the CSV at `url` once per distinct `url` and keep the parsed dataframe."
    return pd.read_csv(url)

def fetch_csv(url):
    """Load the CSV at `url` as a dataframe, reading it only once per `url`.

    Every call returns an independent copy, so edits made by one caller never
    leak into the cached data seen by later callers.
    """
    return _fetch_csv_cached(url).copy()

# Keep the cache-management hooks that the lru_cache-decorated function used to expose.
fetch_csv.cache_clear = _fetch_csv_cached.cache_clear
fetch_csv.cache_info = _fetch_csv_cached.cache_info
18
+
19
+ # %% ../../nbs/core/00_data.ipynb 9
20
class Collins:
    "Loaders for the compound datasets from the Collins lab."

    @staticmethod
    def get_antibiotics_2k():
        """
        Antibiotics dataset: 2,560 compounds screened at 50 µM in E. coli K12 BW25113.
        2,335 unique compounds after deduplication.
        Table S1B from 2020 Cell: A Deep Learning Approach to Antibiotic Discovery.
        """
        url = f"{BASE_URL}dataset/antibiotics_2k.csv"
        return fetch_csv(url)

    @staticmethod
    def get_antibiotics_39k():
        """
        Antibiotics dataset: 39,128 compounds screened at 50 µM in E. coli K12 BW25113.
        Supplementary dataset EV1 from 2022 Molecular Systems Biology: Benchmarking AlphaFold-enabled molecular docking predictions for antibiotic discovery.
        """
        url = f"{BASE_URL}dataset/antibiotics_39k.csv"
        return fetch_csv(url)

    @staticmethod
    def get_antibiotics_enzyme():
        """
        Antibiotics enzymatic inhibition dataset: 218 compounds at 100 µM against 12 essential proteins in E. coli K12 BW25113.
        Flattened benchmark dataset/Supplementary EV4 from 2022 Molecular Systems Biology: Benchmarking AlphaFold-enabled molecular docking predictions for antibiotic discovery.
        """
        url = f"{BASE_URL}dataset/antibiotics_enzyme.csv"
        return fetch_csv(url)
47
+
48
+ # %% ../../nbs/core/00_data.ipynb 19
49
class Kras:
    "Loaders for the KRAS datasets."

    @staticmethod
    def get_mirati_g12d():
        "G12D dataset from the Mirati paper and patents, deduplicated."
        url = f"{BASE_URL}dataset/KRASi_g12d_dedup.csv"
        return fetch_csv(url)

    @staticmethod
    def get_mirati_g12d_raw():
        "G12D dataset from the paper and patents as collected, without deduplication."
        url = f"{BASE_URL}dataset/KRASi_g12d.csv"
        return fetch_csv(url)

    @staticmethod
    def get_seq():
        "Protein sequences of human KRAS and its G12D and G12C mutants."
        url = f"{BASE_URL}dataset/kras_seq.csv"
        return fetch_csv(url)