kdock-0.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kdock/__init__.py +1 -0
- kdock/_modidx.py +131 -0
- kdock/af3/__init__.py +0 -0
- kdock/af3/analyze.py +162 -0
- kdock/af3/docker.py +120 -0
- kdock/af3/json.py +282 -0
- kdock/af3/protein_pairs.py +95 -0
- kdock/core/__init__.py +0 -0
- kdock/core/data.py +64 -0
- kdock/core/ligand.py +294 -0
- kdock/core/plot.py +89 -0
- kdock/core/protein.py +283 -0
- kdock/core/utils.py +156 -0
- kdock/gnina/__init__.py +0 -0
- kdock/gnina/dock.py +114 -0
- kdock/gnina/rescore.py +204 -0
- kdock/px/__init__.py +0 -0
- kdock/px/core.py +130 -0
- kdock/px/dock.py +117 -0
- kdock-0.0.2.dist-info/METADATA +80 -0
- kdock-0.0.2.dist-info/RECORD +25 -0
- kdock-0.0.2.dist-info/WHEEL +5 -0
- kdock-0.0.2.dist-info/entry_points.txt +2 -0
- kdock-0.0.2.dist-info/licenses/LICENSE +201 -0
- kdock-0.0.2.dist-info/top_level.txt +1 -0
kdock/af3/json.py
ADDED
@@ -0,0 +1,282 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/af3/00_json.ipynb.

# %% auto 0
__all__ = ['dump_json', 'get_protein_json', 'read_json', 'get_protein_smiles_json', 'get_protein_ccdcode_json',
           'assign_atom_names_from_graph', 'mol_to_ccd_text', 'sdf2ccd', 'get_protein_ccd_json', 'split_nfolder']

# %% ../../nbs/af3/00_json.ipynb 2
import re, shutil, json, pandas as pd, numpy as np
from pathlib import Path

from rdkit import Chem as rd_chem
from rdkit.Chem import AllChem,rdmolfiles
from rdkit import Chem

from Bio.PDB import PDBParser

# %% ../../nbs/af3/00_json.ipynb 4
def dump_json(data, save_path):
    "Save json data into a file"
    with open(save_path,'w') as f:
        json.dump(data,f,indent=4)

# %% ../../nbs/af3/00_json.ipynb 5
def get_protein_json(name, # job name
                     seq, # aa sequence
                     save_path=None, # .json
                     seeds=[1]
                     ):
    "Generate json of single protein sequence for input of docker command"

    json_data = {
        "name": name,
        "modelSeeds": seeds,
        "sequences": [
            {
                "protein": {
                    "id": "A",
                    "sequence": seq,
                }
            },
        ],
        "bondedAtomPairs": [],
        "dialect": "alphafold3",
        "version": 3
    }
    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        dump_json(json_data,save_path)
    return json_data

# %% ../../nbs/af3/00_json.ipynb 9
def read_json(file_path):
    with open(file_path,'r') as f:
        data = json.load(f)
    return data

# %% ../../nbs/af3/00_json.ipynb 11
def get_protein_smiles_json(smi_id:str,
                            SMILES:str,
                            protein_json, # json type
                            save_path=None, # .json
                            seeds=[1]
                            ):

    "Get json for protein-ligand docking task"
    raw_smiles = r"{}".format(SMILES) # JSON escaping, \ to \\
    protein_index = next(i for i, item in enumerate(protein_json["sequences"]) if "protein" in item)
    json_data = {
        "name": smi_id,
        "modelSeeds": seeds,
        "sequences": [
            {
                "ligand": {
                    "id": "L",
                    "smiles": raw_smiles,
                }
            },
            {
                "protein": protein_json["sequences"][protein_index]["protein"]
            },
        ],
        "bondedAtomPairs": [],
        "dialect": "alphafold3",
        "version": 2
    }
    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        dump_json(json_data,save_path)
    return json_data

# %% ../../nbs/af3/00_json.ipynb 18
def get_protein_ccdcode_json(protein_json, # dict with protein sequence
                             ccd_code, # str or list of str
                             job_id: str, # job/task ID
                             save_path=None, # optional output path
                             seeds=[1]): # optional random seeds
    "Create AlphaFold3 docking JSON with CCD code(s)."

    # Normalize ccd_code to a list
    if isinstance(ccd_code, str):
        ccd_code = [ccd_code]
    elif not isinstance(ccd_code, list):
        raise TypeError("ccd_code must be a string or a list of strings.")

    protein_index = next(i for i, item in enumerate(protein_json["sequences"]) if "protein" in item)

    json_data = {
        "name": job_id,
        "modelSeeds": seeds,
        "sequences": [
            {
                "ligand": {
                    "id": "L",
                    "ccdCodes": ccd_code
                }
            },
            {
                "protein": protein_json["sequences"][protein_index]["protein"]
            },
        ],
        "dialect": "alphafold3",
        "version": 3
    }

    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        dump_json(json_data, save_path)

    return json_data

# %% ../../nbs/af3/00_json.ipynb 22
# Mapping bond types to mmCIF-compatible values
_RDKIT_BOND_TYPE_TO_MMCIF = {
    rd_chem.BondType.SINGLE: 'SING',
    rd_chem.BondType.DOUBLE: 'DOUB',
    rd_chem.BondType.TRIPLE: 'TRIP',
    rd_chem.BondType.AROMATIC: 'AROM'
}

def assign_atom_names_from_graph(mol):
    for i, atom in enumerate(mol.GetAtoms()):
        atom.SetProp('atom_name', f"{atom.GetSymbol()}{i+1}")
    return mol

def mol_to_ccd_text(mol, component_id, pdbx_smiles=None, include_hydrogens=False):
    mol = rd_chem.Mol(mol)
    if include_hydrogens:
        mol = rd_chem.AddHs(mol)
    rd_chem.Kekulize(mol, clearAromaticFlags=True)

    if mol.GetNumConformers() == 0:
        raise ValueError('The molecule has no conformers')
    conf = mol.GetConformer()
    coords = conf.GetPositions()

    mol = assign_atom_names_from_graph(mol)
    atom_map = {atom.GetIdx(): atom.GetProp('atom_name') for atom in mol.GetAtoms()}

    lines = [
        f"data_{component_id}",
        "#",
        f"_chem_comp.id {component_id}",
        f"_chem_comp.name '{component_id}'",
        "_chem_comp.type non-polymer",
        "_chem_comp.formula '?'",
        "_chem_comp.mon_nstd_parent_comp_id ?",
        "_chem_comp.pdbx_synonyms ?",
        "_chem_comp.formula_weight '?'",
    ]
    if pdbx_smiles:
        lines.append(f"_chem_comp.pdbx_smiles {pdbx_smiles}")
    lines += [
        "#",
        "loop_",
        "_chem_comp_atom.comp_id",
        "_chem_comp_atom.atom_id",
        "_chem_comp_atom.type_symbol",
        "_chem_comp_atom.charge",
        "_chem_comp_atom.pdbx_leaving_atom_flag",
        "_chem_comp_atom.pdbx_model_Cartn_x_ideal",
        "_chem_comp_atom.pdbx_model_Cartn_y_ideal",
        "_chem_comp_atom.pdbx_model_Cartn_z_ideal"
    ]

    for i, atom in enumerate(mol.GetAtoms()):
        if not include_hydrogens and atom.GetSymbol() == 'H':
            continue
        x, y, z = coords[i]
        lines.append(f"{component_id} {atom_map[atom.GetIdx()]} {atom.GetSymbol()} {atom.GetFormalCharge()} N {x:.3f} {y:.3f} {z:.3f}")

    lines += [
        "#",
        "loop_",
        "_chem_comp_bond.atom_id_1",
        "_chem_comp_bond.atom_id_2",
        "_chem_comp_bond.value_order",
        "_chem_comp_bond.pdbx_aromatic_flag"
    ]

    for bond in mol.GetBonds():
        a1, a2 = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        if not include_hydrogens and (mol.GetAtomWithIdx(a1).GetSymbol() == 'H' or mol.GetAtomWithIdx(a2).GetSymbol() == 'H'):
            continue
        bond_type = _RDKIT_BOND_TYPE_TO_MMCIF[bond.GetBondType()]
        aromatic_flag = 'Y' if bond.GetIsAromatic() else 'N'
        lines.append(f"{atom_map[a1]} {atom_map[a2]} {bond_type} {aromatic_flag}")
    lines.append("#")

    return "\n".join(lines)

# %% ../../nbs/af3/00_json.ipynb 23
def sdf2ccd(sdf_path,
            CCD_name='lig-1', # do not use '_'; use as few letters as possible; 'lig-any' leads to extra ligands
            ):

    "Convert the compound to the AF3 required CCD format"
    supplier = Chem.SDMolSupplier(sdf_path)
    mol = supplier[0] # Get the first molecule
    return mol_to_ccd_text(mol,CCD_name)

# %% ../../nbs/af3/00_json.ipynb 26
def get_protein_ccd_json(protein_json, # dict with protein sequence
                         rec_residue_num:int, # 1-indexed, for bondedAtomPairs, e.g., ["A", 145, "SG"]
                         rec_atom_id:str, # for bondedAtomPairs, e.g., ["A", 145, "SG"]
                         lig_sdf_path, # ccd text
                         lig_atom_id:str, # 0-indexed, for bondedAtomPairs, ["L", 1, "C04"]
                         job_id:str, # str, job/task ID
                         save_path=None, # optional output path
                         seeds=[1], # optional random seeds
                         ):
    "Create AlphaFold3 docking JSON with customized CCD ligand and bondedAtomPairs."

    # get userCCD
    userCCD=sdf2ccd(lig_sdf_path)
    ccd_id = re.search(r"_chem_comp.id\s+([^\s#]+)", userCCD).group(1)

    protein_index = next(i for i, item in enumerate(protein_json["sequences"]) if "protein" in item)

    json_data = {
        "name": job_id,
        "modelSeeds": seeds,
        "sequences": [
            {
                "ligand": {
                    "id": "L",
                    "ccdCodes": [ccd_id]
                }
            },
            {
                "protein": protein_json["sequences"][protein_index]["protein"]
            },
        ],
        "bondedAtomPairs": [[["A", rec_residue_num, rec_atom_id],["L", 1, lig_atom_id]]],
        "userCCD": userCCD,
        "dialect": "alphafold3",
        "version": 3
    }

    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        dump_json(json_data, save_path)

    return json_data

# %% ../../nbs/af3/00_json.ipynb 30
def split_nfolder(folder_dir,
                  n=4):
    "Move json files from a folder into subfolders (folder_0, folder_1, ..., folder_N)."

    folder_dir = Path(folder_dir)

    files = sorted(folder_dir.glob("*.json"))
    # print(len(files))
    subfolders = [folder_dir / f"folder_{i}" for i in range(n)]
    for folder in subfolders:
        folder.mkdir(exist_ok=True)

    for idx, file in enumerate(files):
        target_folder = subfolders[idx % n]
        shutil.move(str(file), target_folder / file.name)

    print(f"Distributed {len(files)} files into {n} folders.")
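For orientation, below is a minimal usage sketch of the json.py helpers above. It is not part of the package diff; it assumes the wheel is installed as kdock, and the sequence, SMILES string, and paths are made-up placeholders.

# Illustrative only: placeholder sequence/SMILES and paths.
from kdock.af3.json import get_protein_json, get_protein_smiles_json, split_nfolder

# Build a single-protein AF3 job JSON, then a protein + SMILES ligand job from it.
protein_json = get_protein_json("kras_wt", "MTEYKLVVVGAGGVGKSALT",
                                save_path="af_input/kras_wt.json")
get_protein_smiles_json("lig-001", "CCO", protein_json,
                        save_path="af_input/kras_wt_lig-001.json")

# Spread the generated JSONs across folder_0 ... folder_3 for parallel AF3 runs.
split_nfolder("af_input", n=4)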
kdock/af3/protein_pairs.py
ADDED
@@ -0,0 +1,95 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/af3/02_protein_pairs.ipynb.

# %% auto 0
__all__ = ['get_colabfold_cmd', 'copy_a3m', 'a3m_to_seq', 'get_protein_subjson', 'dump_json_folder', 'get_multi_protein_json',
           'generate_pair_df']

# %% ../../nbs/af3/02_protein_pairs.ipynb 4
import os, json, shutil, pandas as pd
from tqdm import tqdm
from itertools import combinations
from pathlib import Path
from .json import *
from .docker import *

# %% ../../nbs/af3/02_protein_pairs.ipynb 9
def get_colabfold_cmd(csv_path,project_name):
    print('Run below in terminal:')
    print(f'\n colabfold_batch {csv_path} msa_{project_name} --msa-only')

# %% ../../nbs/af3/02_protein_pairs.ipynb 13
def copy_a3m(a3m_dir: str, # Path to the source directory containing .a3m files.
             dest_dir: str, # Path to the destination directory where files will be copied
             ):
    "Copies all .a3m files from the source directory to the destination directory."

    a3m_dir,dest_dir = Path(a3m_dir),Path(dest_dir)
    dest_dir.mkdir(parents=True, exist_ok=True)

    files = list(a3m_dir.glob('*.a3m'))

    for file in tqdm(files, desc="Copying files", unit="file"):
        shutil.copy(file, dest_dir / file.name)

    print(f"Copied {len(files)} a3m files from {a3m_dir} to {dest_dir}")

# %% ../../nbs/af3/02_protein_pairs.ipynb 17
def a3m_to_seq(file_path:Path):
    "Get protein sequence from a3m file"
    return file_path.read_text().splitlines()[2] # protein sequence is located on line 2

# %% ../../nbs/af3/02_protein_pairs.ipynb 19
def get_protein_subjson(gene_name, a3m_dir=".",idx = 'A',run_template=True):
    "Get subjson (protein part) with colabfold unpairedMSA .a3m path"
    file_path = Path(a3m_dir)/f"{gene_name}.a3m"
    protein_sequence = a3m_to_seq(file_path)

    json_data = {
        'id': idx,
        'sequence': protein_sequence,
        'modifications': [],
        'unpairedMsaPath': str("/root"/file_path), # for docker path, ECD under af_input
        'pairedMsa': '',
        'templates': None if run_template else []
    }

    return json_data

# %% ../../nbs/af3/02_protein_pairs.ipynb 22
def dump_json_folder(json_data, folder):
    "Save json under a folder"
    file_path = Path(folder)/f"{json_data['name']}.json"
    with open(file_path,'w') as f: json.dump(json_data,f,indent=4)

# %% ../../nbs/af3/02_protein_pairs.ipynb 23
def get_multi_protein_json(gene_list,a3m_dir,run_template=True,save_folder=None):
    'Get json of multiple proteins, with unpaired MSA path indicated (from colabfold MSA)'
    sequences = []
    alphabets = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    for index, gene in enumerate(gene_list):
        sub_json=get_protein_subjson(gene,a3m_dir,idx=alphabets[index],run_template=run_template)
        sequences.append({'protein':sub_json})
    name = '_'.join(gene_list)
    json_data = {
        "name": name,
        "modelSeeds": [1],
        "sequences": sequences,
        "bondedAtomPairs": [],
        "dialect": "alphafold3",
        "version": 2
    }
    if save_folder:
        dump_json_folder(json_data,save_folder)
    return json_data

# %% ../../nbs/af3/02_protein_pairs.ipynb 27
def generate_pair_df(gene_list,self_pair=True):
    "Unique pair genes in a gene list"
    pairs = list(combinations(gene_list, 2))
    pair_df = pd.DataFrame(pairs,columns=["Gene1", "Gene2"])

    if self_pair:
        self_pair_df = pd.DataFrame({'Gene1':gene_list, 'Gene2':gene_list})
        pair_df = pd.concat([pair_df,self_pair_df])

    return pair_df.reset_index(drop=True)
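A similar hedged sketch for the pairwise helpers above: the gene names and directories are placeholders, and it assumes colabfold-generated <gene>.a3m files already exist under the MSA directory.

# Illustrative only: placeholder gene names and directories.
from pathlib import Path
from kdock.af3.protein_pairs import generate_pair_df, get_multi_protein_json

genes = ["KRAS", "RAF1", "PIK3CA"]
pair_df = generate_pair_df(genes, self_pair=True)   # unique pairs plus self-pairs

out_dir = Path("af_input/pair_jsons")
out_dir.mkdir(parents=True, exist_ok=True)          # dump_json_folder expects the folder to exist

for row in pair_df.itertuples(index=False):
    # One AF3 job JSON per pair, pointing at the unpaired MSA path for each gene.
    get_multi_protein_json([row.Gene1, row.Gene2], a3m_dir="af_input/msa",
                           save_folder=out_dir)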
kdock/core/__init__.py
ADDED
File without changes
kdock/core/data.py
ADDED
@@ -0,0 +1,64 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/core/00_data.ipynb.

# %% auto 0
__all__ = ['BASE_URL', 'fetch_csv', 'Collins', 'Kras']

# %% ../../nbs/core/00_data.ipynb 3
import pandas as pd
import requests
from functools import lru_cache

# %% ../../nbs/core/00_data.ipynb 7
BASE_URL = "https://github.com/sky1ove/kdock/raw/main/"

# %% ../../nbs/core/00_data.ipynb 8
@lru_cache()
def fetch_csv(url):
    return pd.read_csv(url)

# %% ../../nbs/core/00_data.ipynb 9
class Collins:
    "A class of loading compound datasets from Collins lab."

    @staticmethod
    def get_antibiotics_2k():
        """
        Antibiotics dataset of 50 µM 2,560 compounds screening in E. coli K12 BW25113.
        2,335 unique compounds after deduplicated.
        Table S1B from 2020 Cell: A Deep Learning Approach to Antibiotic Discovery.
        """
        return fetch_csv(BASE_URL + "dataset/antibiotics_2k.csv")

    @staticmethod
    def get_antibiotics_39k():
        """
        Antibiotics dataset of 50 µM 39,128 compounds screening in E. coli K12 BW25113.
        Supplementary dataset EV1 from 2022 Molecular Systems Biology: Benchmarking AlphaFold-enabled molecular docking predictions for antibiotic discovery.
        """
        return fetch_csv(BASE_URL + "dataset/antibiotics_39k.csv")

    @staticmethod
    def get_antibiotics_enzyme():
        """
        Antibiotics enzymatic inhibition dataset of 100 µM 218 compounds and 12 essential proteins in E. coli K12 BW25113.
        Flattened benchmark dataset/Supplementary EV4 from 2022 Molecular Systems Biology: Benchmarking AlphaFold-enabled molecular docking predictions for antibiotic discovery.
        """
        return fetch_csv(BASE_URL + "dataset/antibiotics_enzyme.csv")

# %% ../../nbs/core/00_data.ipynb 19
class Kras:
    "A class of fetching various KRAS datasets."
    @staticmethod
    def get_mirati_g12d():
        "Deduplicated G12D dataset from the mirati paper and patents."
        return fetch_csv(BASE_URL + "dataset/KRASi_g12d_dedup.csv")

    @staticmethod
    def get_mirati_g12d_raw():
        "Raw G12D dataset from the paper and patents without deduplication."
        return fetch_csv(BASE_URL + "dataset/KRASi_g12d.csv")

    @staticmethod
    def get_seq():
        "Protein sequence of human KRAS and its mutants G12D and G12C."
        return fetch_csv(BASE_URL + "dataset/kras_seq.csv")