kdock-2025.10.31-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- kdock/__init__.py +1 -0
- kdock/_modidx.py +131 -0
- kdock/af3/__init__.py +0 -0
- kdock/af3/analyze.py +162 -0
- kdock/af3/docker.py +120 -0
- kdock/af3/json.py +282 -0
- kdock/af3/protein_pairs.py +95 -0
- kdock/core/__init__.py +0 -0
- kdock/core/data.py +64 -0
- kdock/core/ligand.py +294 -0
- kdock/core/plot.py +89 -0
- kdock/core/protein.py +293 -0
- kdock/core/utils.py +156 -0
- kdock/gnina/__init__.py +0 -0
- kdock/gnina/dock.py +114 -0
- kdock/gnina/rescore.py +204 -0
- kdock/px/__init__.py +0 -0
- kdock/px/core.py +130 -0
- kdock/px/dock.py +117 -0
- kdock-2025.10.31.dist-info/METADATA +81 -0
- kdock-2025.10.31.dist-info/RECORD +25 -0
- kdock-2025.10.31.dist-info/WHEEL +5 -0
- kdock-2025.10.31.dist-info/entry_points.txt +2 -0
- kdock-2025.10.31.dist-info/licenses/LICENSE +201 -0
- kdock-2025.10.31.dist-info/top_level.txt +1 -0
kdock/core/ligand.py
ADDED
@@ -0,0 +1,294 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/core/03_ligand.ipynb.
+
+# %% auto 0
+__all__ = ['plot_drug', 'rdkit_conformer', 'tanimoto', 'get_rdkit', 'get_rdkit_3d', 'get_rdkit_all', 'remove_hi_corr',
+           'preprocess', 'get_rdkit_df', 'get_fp', 'compress_fp', 'tanimoto_numba', 'hash_fp', 'get_same_mol_group']
+
+# %% ../../nbs/core/03_ligand.ipynb 3
+import pandas as pd, numpy as np
+from pathlib import Path
+from rdkit import Chem, RDLogger, DataStructs
+from rdkit.Chem import Draw, rdDepictor, Descriptors, Descriptors3D, AllChem
+from skfp.fingerprints import AtomPairFingerprint, ECFPFingerprint, MACCSFingerprint
+from sklearn.preprocessing import StandardScaler
+import hashlib, numba
+from .data import *
+
+from tqdm.contrib.concurrent import process_map
+from tqdm import tqdm
+RDLogger.DisableLog('rdApp.warning')
+
+# %% ../../nbs/core/03_ligand.ipynb 5
+def plot_drug(drug_dict, flip_list=None, save_path=None):
+    flip_list = flip_list or []
+    mols = []
+
+    for name, smi in drug_dict.items():
+        mol = Chem.MolFromSmiles(smi)
+        rdDepictor.Compute2DCoords(mol)
+
+        # Flip horizontally if requested
+        if name in flip_list:
+            conf = mol.GetConformer()
+            for i in range(conf.GetNumAtoms()):
+                x, y, z = conf.GetAtomPosition(i)
+                conf.SetAtomPosition(i, (-x, y, z))
+
+        mol.SetProp("_Name", name)
+        mols.append(mol)
+
+    # Draw to SVG
+    svg_obj = Draw.MolsToGridImage(
+        mols,
+        molsPerRow=3,
+        subImgSize=(300, 250),
+        legends=[m.GetProp("_Name") for m in mols],
+        useSVG=True
+    )
+
+    svg_str = getattr(svg_obj, "data", getattr(svg_obj, "GetDrawingText", lambda: svg_obj)())
+
+    if save_path:
+        with open(save_path, "w", encoding="utf-8") as f:
+            f.write(svg_str)
+        print(f"✅ Saved structure grid to: {save_path}")
+    else:
+        # from IPython.display import SVG, display
+        # display(SVG(svg_str))
+        return svg_obj
+
+# %% ../../nbs/core/03_ligand.ipynb 8
+def rdkit_conformer(SMILES, # SMILES string
+                    output=None, # ".sdf" file to be saved
+                    method='ETKDG', # optimization method: 'UFF', 'MMFF' or 'ETKDG'
+                    visualize=True, # whether or not to visualize the compound
+                    seed=3, # random seed for the 3D conformation
+                    ):
+    "Generate 3D conformers from SMILES"
+    np.random.seed(seed)
+    mol = Chem.MolFromSmiles(SMILES)
+
+    # Generate a 3D conformation of the molecule
+    AllChem.EmbedMolecule(mol)
+
+    # Optimize the 3D conformation using the specified force field method
+    if method == 'UFF':
+        AllChem.UFFOptimizeMolecule(mol)
+    elif method == 'MMFF':
+        AllChem.MMFFOptimizeMolecule(mol)
+    elif method == 'ETKDG':
+        AllChem.EmbedMultipleConfs(mol, numConfs=1, useExpTorsionAnglePrefs=True,
+                                   useBasicKnowledge=True, enforceChirality=True, randomSeed=seed)
+        # AllChem.ETKDGv3() # no-op: the returned parameter object was never used
+        AllChem.UFFOptimizeMolecule(mol)
+    else:
+        raise ValueError('Invalid method specified')
+
+    # Remove hydrogens from the molecule
+    # mol = Chem.RemoveHs(mol)
+
+    if output is not None:
+        Path(output).parent.mkdir(parents=True, exist_ok=True)
+        w = Chem.SDWriter(output)
+        w.write(mol)
+        w.close()
+    return mol
+
+# %% ../../nbs/core/03_ligand.ipynb 10
+def tanimoto(df, # df with SMILES and ID columns
+             smiles_col='SMILES', # colname of SMILES
+             id_col='ID', # colname of compound ID
+             target_col=None, # colname of compound values (e.g., IC50)
+             radius=2, # radius of the Morgan fingerprint
+             ):
+    "Calculate the Tanimoto similarity scores between all pairs of molecules in a pandas DataFrame."
+    df = df.copy()
+    # Convert SMILES to molecule objects
+    df['Molecule'] = df[smiles_col].apply(lambda x: Chem.MolFromSmiles(x))
+
+    # Calculate fingerprints
+    df['Fingerprint'] = df['Molecule'].apply(lambda x: AllChem.GetMorganFingerprintAsBitVect(x, radius))
+
+    # Calculate similarity scores
+    similarity_scores = []
+    for i in range(len(df)):
+        for j in range(i+1, len(df)):
+            sim_score = DataStructs.TanimotoSimilarity(df['Fingerprint'][i], df['Fingerprint'][j])
+            if target_col is not None:
+                similarity_scores.append((df[id_col][i], df[id_col][j], df[smiles_col][i], df[smiles_col][j], sim_score, df[target_col][i], df[target_col][j]))
+            else:
+                similarity_scores.append((df[id_col][i], df[id_col][j], df[smiles_col][i], df[smiles_col][j], sim_score))
+
+    # Create a new DataFrame with the similarity scores
+    if target_col is not None:
+        result_df = pd.DataFrame(similarity_scores, columns=['ID1', 'ID2', 'SMILES1', 'SMILES2', 'SimilarityScore', 'Target1', 'Target2'])
+    else:
+        result_df = pd.DataFrame(similarity_scores, columns=['ID1', 'ID2', 'SMILES1', 'SMILES2', 'SimilarityScore'])
+
+    # Sort by similarity score in descending order
+    result_df.sort_values('SimilarityScore', ascending=False, inplace=True)
+    result_df = result_df.reset_index(drop=True)
+
+    return result_df
+
+# %% ../../nbs/core/03_ligand.ipynb 14
+def get_rdkit(SMILES: str):
+    """
+    Extract chemical features from SMILES.
+    Reference: https://greglandrum.github.io/rdkit-blog/posts/2022-12-23-descriptor-tutorial.html
+    """
+    mol = Chem.MolFromSmiles(SMILES)
+    return Descriptors.CalcMolDescriptors(mol)
+
+# %% ../../nbs/core/03_ligand.ipynb 15
+def get_rdkit_3d(SMILES: str):
+    "Extract 3D features from SMILES"
+    mol = Chem.MolFromSmiles(SMILES)
+    mol = Chem.AddHs(mol)
+    AllChem.EmbedMolecule(mol, AllChem.ETKDG())
+    AllChem.UFFOptimizeMolecule(mol)
+    return Descriptors3D.CalcMolDescriptors3D(mol)
+
+# %% ../../nbs/core/03_ligand.ipynb 16
+def get_rdkit_all(SMILES: str):
+    "Extract chemical features and 3D features from SMILES"
+    feat = get_rdkit(SMILES)
+    feat_3d = get_rdkit_3d(SMILES)
+    return feat | feat_3d
+
+# %% ../../nbs/core/03_ligand.ipynb 17
+def remove_hi_corr(df: pd.DataFrame,
+                   thr=0.99 # threshold
+                   ):
+    "Remove highly correlated features in a dataframe given a Pearson threshold"
+    corr_matrix = df.corr().abs()
+    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+    to_drop = [column for column in upper.columns if any(upper[column] > thr)]
+    return df.drop(to_drop, axis=1), to_drop
+
+# %% ../../nbs/core/03_ligand.ipynb 18
+def preprocess(df: pd.DataFrame, thr=0.99):
+    "Remove features with no variance, and highly correlated features based on threshold."
+    col_ori = df.columns
+
+    # Remove columns with std == 0
+    std_zero_cols = df.columns[df.std() == 0].tolist()
+
+    if std_zero_cols:
+        n = len(std_zero_cols)
+        print(f"\n {n} Columns with zero std: {std_zero_cols}")
+        df = df.loc[:, df.std() != 0].copy()
+
+    # Remove highly correlated columns
+    df, high_corr_cols = remove_hi_corr(df, thr)
+    if high_corr_cols:
+        n = len(high_corr_cols)
+        print(f"\n {n} Columns removed due to high similarity (Pearson>{thr}): {high_corr_cols}")
+
+    dropping_col = set(col_ori) - set(df.columns)
+    n = len(dropping_col)
+    print(f"\n Total removed columns: {n}")
+
+    return df
+
+# %% ../../nbs/core/03_ligand.ipynb 19
+def get_rdkit_df(df: pd.DataFrame,
+                 include_3d=False,
+                 col='SMILES',
+                 postprocess=False,
+                 chunksize=128, # for parallel process_map
+                 ):
+    "Extract rdkit features (optionally in parallel with 3D) from SMILES in a df"
+    func = get_rdkit_all if include_3d else get_rdkit
+    smiles_list = df[col].tolist()
+
+    features = process_map(func, smiles_list, chunksize=chunksize)
+
+    out = pd.DataFrame(features)
+
+    if postprocess:
+        out = pd.DataFrame(StandardScaler().fit_transform(out), columns=out.columns)
+        out = preprocess(out) # remove redundant features
+
+    return out
+
+# %% ../../nbs/core/03_ligand.ipynb 25
+def get_fp(SMILES,
+           name='ecfp',
+           ELEMENTS_PER_WORKER=1_000_000):
+    "Super fast method to get molecule fingerprints using scikit-fingerprints"
+    if name == 'ecfp':
+        fp_transformer = ECFPFingerprint(fp_size=2048, radius=3, n_jobs=-1)
+    elif name == 'ap':
+        fp_transformer = AtomPairFingerprint(fp_size=2048, n_jobs=-1)
+    elif name == 'maccs':
+        fp_transformer = MACCSFingerprint(n_jobs=-1)
+    else:
+        raise Exception('Wrong fingerprint name!')
+
+    middle_parts = []
+    k_splits = len(SMILES) // ELEMENTS_PER_WORKER
+
+    for i in tqdm(range(k_splits)):
+        middle_parts.append(fp_transformer.transform(SMILES[i * ELEMENTS_PER_WORKER: (i + 1) * ELEMENTS_PER_WORKER]))
+
+    if len(SMILES) % ELEMENTS_PER_WORKER > 0:
+        middle_parts.append(fp_transformer.transform(SMILES[k_splits * ELEMENTS_PER_WORKER:]))
+
+    return np.concatenate(middle_parts)
+
+# %% ../../nbs/core/03_ligand.ipynb 29
+def compress_fp(array):
+    "Compress binary fingerprints using np.packbits"
+    return np.packbits(array, axis=1)
+
+# %% ../../nbs/core/03_ligand.ipynb 47
+@numba.njit(parallel=True)
+def tanimoto_numba(fps):
+    "Get an NxN matrix of Tanimoto similarity among N compounds (fps must be an N x D array of 0/1 values, not packed bits)."
+    n = fps.shape[0]
+    result = np.zeros((n, n), dtype=np.float32)
+
+    for i in numba.prange(n):
+        for j in range(i, n): # only upper triangle
+            inter = np.bitwise_and(fps[i], fps[j]).sum()
+            union = np.bitwise_or(fps[i], fps[j]).sum()
+            sim = inter / union if union > 0 else 0.0
+            result[i, j] = sim
+            result[j, i] = sim # fill symmetric position
+    return result
+
+# %% ../../nbs/core/03_ligand.ipynb 50
+def hash_fp(fp_row):
+    "Hash a binary fingerprint row using SHA256"
+    return hashlib.sha256(fp_row.tobytes()).hexdigest()
+
+# %% ../../nbs/core/03_ligand.ipynb 52
+def get_same_mol_group(df, smi_col='SMILES'):
+    "Assign a group number to identical compounds by hashing their Morgan fingerprints with SHA256."
+    df = df.copy()
+    smiles = df[smi_col].tolist()
+
+    fps = get_fp(smiles)
+
+    # Hash each fingerprint
+    # hashes = [hash_fp(fp) for fp in fps] # non-parallel
+    hashes = process_map(hash_fp, fps, chunksize=256) # if parallel
+
+    # Assign a group number based on hash buckets
+    hash_to_group = {}
+    group_ids = []
+    group_counter = 0
+    for h in hashes:
+        if h not in hash_to_group:
+            hash_to_group[h] = group_counter
+            group_counter += 1
+        group_ids.append(hash_to_group[h])
+
+    df['group'] = group_ids
+    return df
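For reviewers who want to exercise these ligand helpers end to end, here is a minimal smoke-test sketch. It is not part of the package: the three SMILES strings, the variable names, and the `__main__` guard (added because `get_fp` and `get_same_mol_group` spawn worker processes via `process_map`) are all illustrative assumptions.

# Hypothetical smoke test for kdock.core.ligand -- not shipped with the wheel.
import pandas as pd
from kdock.core.ligand import tanimoto, get_fp, tanimoto_numba, get_same_mol_group

if __name__ == '__main__':  # required: process_map spawns worker processes
    df = pd.DataFrame({
        'ID': ['aspirin', 'caffeine', 'caffeine_dup'],
        'SMILES': ['CC(=O)Oc1ccccc1C(=O)O',        # aspirin
                   'Cn1cnc2c1c(=O)n(C)c(=O)n2C',   # caffeine
                   'Cn1cnc2c1c(=O)n(C)c(=O)n2C'],  # duplicate of caffeine
    })

    pairs = tanimoto(df)                 # pairwise Morgan/Tanimoto table, most similar first
    fps = get_fp(df['SMILES'].tolist())  # (3, 2048) ECFP matrix via scikit-fingerprints
    sims = tanimoto_numba(fps)           # 3x3 similarity matrix on the raw 0/1 bit columns
    grouped = get_same_mol_group(df)     # the two caffeine rows should share one 'group' id

    print(pairs.head())
    print(sims)
    print(grouped['group'].tolist())     # e.g. [0, 1, 1]

Note that `tanimoto_numba` is fed the unpacked 0/1 matrix; the `compress_fp`/`np.packbits` output would not give correct bit counts here, since the kernel sums element values rather than popcounts.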
kdock/core/plot.py
ADDED
@@ -0,0 +1,89 @@
+"""Functions to plot on 2D"""
+
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/core/04_plot.ipynb.
+
+# %% auto 0
+__all__ = ['reduce_feature', 'set_sns', 'plot_2d', 'plot_corr']
+
+# %% ../../nbs/core/04_plot.ipynb 3
+import pandas as pd, seaborn as sns
+from fastcore.meta import delegates
+from matplotlib import pyplot as plt
+
+# # kdock
+# from kdock.core.data import *
+
+# dimensionality reduction
+from sklearn.decomposition import PCA
+from sklearn.manifold import TSNE
+from umap import UMAP
+
+# %% ../../nbs/core/04_plot.ipynb 6
+def reduce_feature(data, # df or numpy array
+                   method='pca', # dimensionality reduction method; accepts both upper and lower case
+                   complexity=20, # unused for PCA; perplexity for TSNE (recommend 30); n_neighbors for UMAP (recommend 15)
+                   n=2, # n_components
+                   seed: int=123, # seed for random_state
+                   **kwargs, # keyword arguments forwarded to PCA, TSNE, or UMAP, depending on the method
+                   ):
+    "Reduce the dimensionality given a dataframe of values"
+    method = method.lower()
+    assert method in ['pca', 'tsne', 'umap'], "Please choose a method among PCA, TSNE, and UMAP"
+
+    if method == 'pca':
+        reducer = PCA(n_components=n, random_state=seed, **kwargs)
+    elif method == 'tsne':
+        reducer = TSNE(n_components=n,
+                       random_state=seed,
+                       perplexity=complexity, # official default is 30
+                       **kwargs)
+    elif method == 'umap':
+        reducer = UMAP(n_components=n,
+                       random_state=seed,
+                       n_neighbors=complexity, # official default is 15; try 15-200
+                       **kwargs)
+    else:
+        raise ValueError('Invalid method specified')
+
+    proj = reducer.fit_transform(data)
+    embedding_df = pd.DataFrame(proj).set_index(data.index) if isinstance(data, pd.DataFrame) else pd.DataFrame(proj)
+    embedding_df.columns = [f"{method.upper()}{i}" for i in range(1, n + 1)]
+
+    return embedding_df
+
+# %% ../../nbs/core/04_plot.ipynb 10
+def set_sns():
+    sns.set(rc={"figure.dpi": 300, 'savefig.dpi': 300})
+    sns.set_context('notebook')
+    sns.set_style("ticks")
+
+# %% ../../nbs/core/04_plot.ipynb 12
+@delegates(sns.scatterplot)
+def plot_2d(X: pd.DataFrame, # a dataframe whose first column is x and second column is y
+            **kwargs, # arguments for sns.scatterplot
+            ):
+    "Make a 2D plot from a dataframe whose first column is x and second column is y"
+    plt.figure(figsize=(7, 7))
+    sns.scatterplot(data=X, x=X.columns[0], y=X.columns[1], alpha=0.7, **kwargs)
+
+# %% ../../nbs/core/04_plot.ipynb 14
+def plot_corr(x, # a column of a df
+              y, # a column of a df
+              xlabel=None, # x axis label
+              ylabel=None, # y axis label
+              order=3, # polynomial order; use order=1 for a straight line
+              ):
+    sns.regplot(x=x,
+                y=y,
+                order=order,
+                line_kws={'color': 'gray'}
+                )
+
+    if xlabel is not None:
+        plt.xlabel(xlabel)
+
+    if ylabel is not None:
+        plt.ylabel(ylabel)
+    # plt.text(x=0.8, y=0.1, s=f'Spearman: {correlation:.2f}', transform=plt.gca().transAxes, ha='center', va='center');
kdock/core/protein.py
ADDED
@@ -0,0 +1,293 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/core/02_protein.ipynb.
+
+# %% auto 0
+__all__ = ['get_uniprot_seq', 'get_uniprot_features', 'get_uniprot_kd', 'get_uniprot_type', 'apply_mut_single',
+           'apply_mut_complex', 'compare_seq']
+
+# %% ../../nbs/core/02_protein.ipynb 3
+import pandas as pd
+import requests, re
+from functools import lru_cache
+
+# for compare_seq
+from Bio.Align import PairwiseAligner
+
+# %% ../../nbs/core/02_protein.ipynb 5
+@lru_cache()
+def get_uniprot_seq(uniprot_id):
+    "Query the UniProt database to retrieve the protein sequence for a given UniProt ID."
+    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
+    response = requests.get(url)
+
+    # Check if the request was successful (status code 200)
+    if response.status_code == 200:
+        data = response.text
+        # The sequence starts after the first line, which is a description
+        sequence = ''.join(data.split('\n')[1:]).strip()
+        return sequence
+    else:
+        return f"Error: Unable to retrieve sequence for UniProt ID {uniprot_id}. Status code: {response.status_code}"
+
+# %% ../../nbs/core/02_protein.ipynb 7
+@lru_cache()
+def get_uniprot_features(uniprot_id):
+    "Given a uniprot_id, get the specific regions for UniProt features."
+    # UniProt REST API
+    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"
+    response = requests.get(url)
+
+    if response.status_code == 200:
+        data = response.json()
+        # Extract the "features" section, which contains the annotations
+        features = data.get('features', [])
+
+        protein_name = (
+            data.get("proteinDescription", {})
+            .get("recommendedName", {})
+            .get("fullName", {})
+            .get("value")
+        )
+
+        gene_name = (
+            data.get("genes", [{}])[0]
+            .get("geneName", {})
+            .get("value")
+        )
+        return {
+            "uniprot_id": uniprot_id,
+            "protein_name": protein_name,
+            "gene_name": gene_name,
+            "features": features
+        }
+    else:
+        raise ValueError(f"Failed to retrieve UniProt features for {uniprot_id}")
+
+# %% ../../nbs/core/02_protein.ipynb 9
+def get_uniprot_kd(uniprot_id):
+    "Query 'Domain: Protein kinase' based on UniProt ID and get its sequence info."
+    data = get_uniprot_features(uniprot_id)
+    seq = get_uniprot_seq(uniprot_id)
+    out = []
+
+    for feature in data['features']:
+        if feature.get("type") == "Domain" and "Protein kinase" in feature.get("description", ""):
+            start = feature['location']['start']['value']
+            end = feature['location']['end']['value']
+            out.append({
+                "uniprot_id": uniprot_id,
+                "protein_name": data["protein_name"],
+                "gene_name": data["gene_name"],
+                "start": start,
+                "end": end,
+                "description": feature.get("description", ""),
+                "sequence": seq[start-1:end]
+            })
+
+    return out
+
+# %% ../../nbs/core/02_protein.ipynb 11
+def get_uniprot_type(uniprot_id, type_='Signal'):
+    "Get region sequences based on UniProt ID features."
+    data = get_uniprot_features(uniprot_id)
+    seq = get_uniprot_seq(uniprot_id)
+
+    out = []
+
+    for feature in data['features']:
+        if feature.get("type") == type_:
+            start = feature['location']['start']['value']
+            end = feature['location']['end']['value']
+            region = {
+                'uniprot_id': uniprot_id,
+                'type': feature['type'],
+                "protein_name": data["protein_name"],
+                "gene_name": data["gene_name"],
+                'start': start,
+                'end': end,
+                'description': feature['description'],
+                'sequence': seq[start-1:end]
+            }
+            out.append(region)
+
+    if not out:
+        available = sorted({f.get("type") for f in data['features'] if f.get("type")})
+        print(f"No feature of type '{type_}' found for {uniprot_id}.")
+        print(f"Available feature types: {', '.join(available)}")
+        return available
+
+    return out
+
+# %% ../../nbs/core/02_protein.ipynb 16
+def apply_mut_single(seq, # protein sequence
+                     *mutations, # e.g., E709A
+                     start_pos=1, # if the sequence does not start at residue 1, give the start index so positions match the mutations
+                     ):
+    "Apply point mutations to a protein sequence."
+    seq_list = list(seq) # convert to list for mutability
+
+    for mut in mutations:
+        # check mutation format
+        if len(mut) < 3: raise ValueError(f"Invalid mutation format: {mut}")
+
+        from_aa, pos, to_aa = mut[0], int(mut[1:-1]) - start_pos, mut[-1]
+
+        # make sure the position is within the sequence length
+        if pos < 0 or pos >= len(seq_list): raise IndexError(f"Position {pos + 1} out of range for sequence length {len(seq_list)}")
+        # make sure the residue in the mutation matches the residue in the sequence
+        if seq_list[pos] != from_aa: raise ValueError(f"Expected {from_aa} at position {pos + 1}, found {seq_list[pos]}")
+
+        seq_list[pos] = to_aa
+        print('Converted:', mut)
+
+    return ''.join(seq_list)
+
+# %% ../../nbs/core/02_protein.ipynb 18
+def apply_mut_complex(seq, # protein sequence
+                      mut, # mutation (e.g., G776delinsVC/S783C, G778dupGSP)
+                      start_pos=1, # if the sequence is truncated, give the residue number where it starts so mutation positions match
+                      ):
+    """
+    Apply a composite mutation like 'G776delinsVC/S783C' to `seq`,
+    assuming `seq[0]` corresponds to residue number `start_pos`.
+
+    * At most one delins **or** dup is allowed.
+    * Point substitutions are executed first; the indel/dup is done last.
+    """
+    _sub_pat = re.compile(r'^([A-Z])(\d+)([A-Z])$') # e.g. S783C
+    _delins_pat = re.compile(r'^([A-Z])(\d+)delins([A-Z]+)$') # e.g. G776delinsVC
+    _dup_pat = re.compile(r'^([A-Z])(\d+)dup([A-Z]+)$') # e.g. G778dupGSP
+
+    seq = list(seq)
+    tokens = mut.split('/')
+
+    # ---------- 1) substitutions (length-neutral) ----------
+    for m in tokens:
+        if _sub_pat.match(m):
+            orig, pos, new = _sub_pat.match(m).groups()
+            idx = int(pos) - start_pos
+            if seq[idx] != orig:
+                raise ValueError(
+                    f"Mismatch at position {pos}: expected {orig}, got {seq[idx]}"
+                )
+            seq[idx] = new
+
+    # ---------- 2) the single length-changing event ----------
+    for m in tokens:
+        if _delins_pat.match(m):
+            orig, pos, ins = _delins_pat.match(m).groups()
+            idx = int(pos) - start_pos
+            if seq[idx] != orig:
+                raise ValueError(
+                    f"Mismatch at position {pos}: expected {orig}, got {seq[idx]}"
+                )
+            seq[idx : idx + 1] = list(ins) # replace 1 residue with many
+            break
+
+        if _dup_pat.match(m):
+            orig, pos, dup = _dup_pat.match(m).groups()
+            idx = int(pos) - start_pos
+            if seq[idx] != orig:
+                raise ValueError(
+                    f"Mismatch at position {pos}: expected {orig}, got {seq[idx]}"
+                )
+            seq[idx + 1 : idx + 1] = list(dup) # insert right after the residue
+            break
+
+    return ''.join(seq)
+
+# %% ../../nbs/core/02_protein.ipynb 22
+def compare_seq(
+    seq1: str, # original
+    seq2: str, # mutant
+    *,
+    start_pos: int = 1,
+    label1: str = "Original",
+    label2: str = "Mutant",
+    visualize: bool = True
+):
+    """
+    Align two protein sequences and summarise their differences.
+    """
+    # ----- global alignment using PairwiseAligner -----
+    aligner = PairwiseAligner()
+    aligner.mode = "global"
+    aligner.match_score = 2
+    aligner.mismatch_score = -1
+    aligner.open_gap_score = -5
+    aligner.extend_gap_score = -0.5
+
+    alignment = aligner.align(seq1, seq2)[0]
+    aln1 = alignment.aligned[0]
+    aln2 = alignment.aligned[1]
+
+    # Reconstruct aligned strings from aligned segments
+    aligned_seq1 = []
+    aligned_seq2 = []
+    i1, i2 = 0, 0
+
+    for (start1, end1), (start2, end2) in zip(aln1, aln2):
+        # Add gaps to make sequences align
+        while i1 < start1:
+            aligned_seq1.append(seq1[i1])
+            aligned_seq2.append('-')
+            i1 += 1
+        while i2 < start2:
+            aligned_seq1.append('-')
+            aligned_seq2.append(seq2[i2])
+            i2 += 1
+
+        # Add aligned part
+        for j in range(end1 - start1):
+            aligned_seq1.append(seq1[i1])
+            aligned_seq2.append(seq2[i2])
+            i1 += 1
+            i2 += 1
+
+    # Remaining tails
+    while i1 < len(seq1):
+        aligned_seq1.append(seq1[i1])
+        aligned_seq2.append('-')
+        i1 += 1
+    while i2 < len(seq2):
+        aligned_seq1.append('-')
+        aligned_seq2.append(seq2[i2])
+        i2 += 1
+
+    aln1_str = ''.join(aligned_seq1)
+    aln2_str = ''.join(aligned_seq2)
+
+    # ----- find differences -----
+    diffs, raw_i1, raw_i2 = [], 0, 0
+    for a1, a2 in zip(aln1_str, aln2_str):
+        if a1 != '-' and a2 != '-':
+            if a1 != a2:
+                diffs.append((start_pos + raw_i1, a1, a2, 'substitution'))
+            raw_i1 += 1
+            raw_i2 += 1
+        elif a1 == '-' and a2 != '-':
+            diffs.append((start_pos + raw_i1, '-', a2, 'insertion'))
+            raw_i2 += 1
+        elif a1 != '-' and a2 == '-':
+            diffs.append((start_pos + raw_i1, a1, '-', 'deletion'))
+            raw_i1 += 1
+
+    # ----- visualization -----
+    if visualize:
+        for block in range(0, len(aln1_str), 80):
+            s1_block = aln1_str[block:block + 80]
+            s2_block = aln2_str[block:block + 80]
+            marker = ''.join(' ' if x == y else '^' for x, y in zip(s1_block, s2_block))
+            left_idx = start_pos + len(aln1_str[:block].replace('-', ''))
+            right_idx = left_idx + len(s1_block.replace('-', '')) - 1
+            print(f"{label1:<10} {left_idx:>5}-{right_idx:<5}: {s1_block}")
+            print(f"{label2:<10} {'':>11}: {s2_block}")
+            print(f"{'':>22} {marker}\n")
+
+    # ----- summary list -----
+    print("Differences:")
+    for pos, ref, new, kind in diffs:
+        print(f" {kind:<12} at {pos:>4}: {ref} → {new}")
+
+    # return diffs