kdock-2025.10.31-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kdock/core/ligand.py ADDED
@@ -0,0 +1,294 @@
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/core/03_ligand.ipynb.
+
+ # %% auto 0
+ __all__ = ['plot_drug', 'rdkit_conformer', 'tanimoto', 'get_rdkit', 'get_rdkit_3d', 'get_rdkit_all', 'remove_hi_corr',
+            'preprocess', 'get_rdkit_df', 'get_fp', 'compress_fp', 'tanimoto_numba', 'hash_fp', 'get_same_mol_group']
+
+ # %% ../../nbs/core/03_ligand.ipynb 3
+ import pandas as pd, numpy as np
+ from pathlib import Path # used when writing SDF output in rdkit_conformer
+ from rdkit import Chem, RDLogger, DataStructs
+ from rdkit.Chem import Draw, rdDepictor, Descriptors, Descriptors3D, AllChem
+ from skfp.fingerprints import AtomPairFingerprint, ECFPFingerprint, MACCSFingerprint
+ from sklearn.preprocessing import StandardScaler # used by get_rdkit_df(postprocess=True)
+ import hashlib, numba
+ from .data import *
+
+ from tqdm.contrib.concurrent import process_map
+ from tqdm import tqdm
+ RDLogger.DisableLog('rdApp.warning')
+
+ # %% ../../nbs/core/03_ligand.ipynb 5
+ def plot_drug(drug_dict, flip_list=None, save_path=None):
+     "Draw a grid of 2D structures from a {name: SMILES} dict, optionally saving it as SVG."
+     flip_list = flip_list or []
+     mols = []
+
+     for name, smi in drug_dict.items():
+         mol = Chem.MolFromSmiles(smi)
+         rdDepictor.Compute2DCoords(mol)
+
+         # Flip horizontally if requested
+         if name in flip_list:
+             conf = mol.GetConformer()
+             for i in range(conf.GetNumAtoms()):
+                 x, y, z = conf.GetAtomPosition(i)
+                 conf.SetAtomPosition(i, (-x, y, z))
+
+         mol.SetProp("_Name", name)
+         mols.append(mol)
+
+     # Draw to SVG
+     svg_obj = Draw.MolsToGridImage(
+         mols,
+         molsPerRow=3,
+         subImgSize=(300, 250),
+         legends=[m.GetProp("_Name") for m in mols],
+         useSVG=True
+     )
+
+     # Notebook SVG objects expose .data; raw drawers expose GetDrawingText()
+     svg_str = getattr(svg_obj, "data", getattr(svg_obj, "GetDrawingText", lambda: svg_obj)())
+
+     if save_path:
+         with open(save_path, "w", encoding="utf-8") as f:
+             f.write(svg_str)
+         print(f"✅ Saved structure grid to: {save_path}")
+     else:
+         # from IPython.display import SVG, display
+         # display(SVG(svg_str))
+         return svg_obj
+
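+ # Usage sketch (illustrative, not part of the package; SMILES are the standard
+ # strings for the named drugs):
+ # svg = plot_drug({'aspirin': 'CC(=O)Oc1ccccc1C(=O)O',
+ #                  'caffeine': 'Cn1cnc2c1c(=O)n(C)c(=O)n2C'},
+ #                 save_path='drugs.svg')
+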
+ # %% ../../nbs/core/03_ligand.ipynb 8
+ def rdkit_conformer(SMILES, # SMILES string
+                     output=None, # ".sdf" file to be saved
+                     method='ETKDG', # optimization method; can be 'UFF', 'MMFF', or 'ETKDG'
+                     visualize=True, # whether or not to visualize the compound
+                     seed=3, # randomness of the 3D conformation
+                     ):
+
+     "Generate 3D conformers from SMILES"
+
+     np.random.seed(seed)
+     mol = Chem.MolFromSmiles(SMILES)
+
+     # Generate a 3D conformation of the molecule
+     AllChem.EmbedMolecule(mol)
+
+
+     # Optimize the 3D conformation using the specified force field method
+     if method == 'UFF':
+         AllChem.UFFOptimizeMolecule(mol)
+     elif method == 'MMFF':
+         AllChem.MMFFOptimizeMolecule(mol)
+     elif method == 'ETKDG':
+         # Re-embed with ETKDG torsion/knowledge preferences, then relax with UFF
+         AllChem.EmbedMultipleConfs(mol, numConfs=1, useExpTorsionAnglePrefs=True,
+                                    useBasicKnowledge=True, enforceChirality=True, randomSeed=seed)
+         AllChem.UFFOptimizeMolecule(mol)
+
+     else:
+         raise ValueError('Invalid method specified')
+
+
+     # Remove hydrogens from the molecule
+     # mol = Chem.RemoveHs(mol)
+
+     if output is not None:
+         Path(output).parent.mkdir(parents=True, exist_ok=True)
+
+         w = Chem.SDWriter(output)
+         w.write(mol)
+         w.close()
+     return mol
+
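+ # Usage sketch (illustrative): embed and save a 3D conformer of ethanol.
+ # mol = rdkit_conformer('CCO', output='out/ethanol.sdf', method='ETKDG')
+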
+ # %% ../../nbs/core/03_ligand.ipynb 10
+ def tanimoto(df, # df with SMILES and ID columns
+              smiles_col='SMILES', # colname of SMILES
+              id_col='ID', # colname of compound ID
+              target_col=None, # colname of compound values (e.g., IC50)
+              radius=2, # radius of the Morgan fingerprint
+              ):
+     "Calculates the Tanimoto similarity scores between all pairs of molecules in a pandas DataFrame."
+
+     df = df.copy().reset_index(drop=True) # positional indexing below assumes a default RangeIndex
+     # Convert SMILES to molecule objects
+     df['Molecule'] = df[smiles_col].apply(lambda x: Chem.MolFromSmiles(x))
+
+     # Calculate fingerprints
+     df['Fingerprint'] = df['Molecule'].apply(lambda x: AllChem.GetMorganFingerprintAsBitVect(x, radius))
+
+     # Calculate similarity scores
+     similarity_scores = []
+     for i in range(len(df)):
+         for j in range(i+1, len(df)):
+             sim_score = DataStructs.TanimotoSimilarity(df['Fingerprint'][i], df['Fingerprint'][j])
+             if target_col is not None:
+                 similarity_scores.append((df[id_col][i], df[id_col][j], df[smiles_col][i], df[smiles_col][j], sim_score, df[target_col][i], df[target_col][j]))
+             else:
+                 similarity_scores.append((df[id_col][i], df[id_col][j], df[smiles_col][i], df[smiles_col][j], sim_score))
+
+     # Create a new DataFrame with the similarity scores
+     if target_col is not None:
+         result_df = pd.DataFrame(similarity_scores, columns=['ID1', 'ID2', 'SMILES1', 'SMILES2', 'SimilarityScore', 'Target1', 'Target2'])
+     else:
+         result_df = pd.DataFrame(similarity_scores, columns=['ID1', 'ID2', 'SMILES1', 'SMILES2', 'SimilarityScore'])
+
+     # Sort by similarity score in descending order
+     result_df.sort_values('SimilarityScore', ascending=False, inplace=True)
+     result_df = result_df.reset_index(drop=True)
+
+     return result_df
+
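+ # Usage sketch (illustrative): Tanimoto = |A ∩ B| / |A ∪ B| over fingerprint bits.
+ # pairs = tanimoto(pd.DataFrame({'ID': ['m1', 'm2', 'm3'],
+ #                                'SMILES': ['CCO', 'CCN', 'CCC']}))
+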
+ # %% ../../nbs/core/03_ligand.ipynb 14
+ def get_rdkit(SMILES: str):
+     """
+     Extract chemical features from SMILES.
+     Reference: https://greglandrum.github.io/rdkit-blog/posts/2022-12-23-descriptor-tutorial.html
+     """
+     mol = Chem.MolFromSmiles(SMILES)
+     return Descriptors.CalcMolDescriptors(mol)
+
+ # %% ../../nbs/core/03_ligand.ipynb 15
+ def get_rdkit_3d(SMILES: str):
+     "Extract 3D features from SMILES"
+     mol = Chem.MolFromSmiles(SMILES)
+     mol = Chem.AddHs(mol)
+     AllChem.EmbedMolecule(mol, AllChem.ETKDG())
+     AllChem.UFFOptimizeMolecule(mol)
+     return Descriptors3D.CalcMolDescriptors3D(mol)
+
+ # %% ../../nbs/core/03_ligand.ipynb 16
+ def get_rdkit_all(SMILES: str):
+     "Extract chemical features and 3D features from SMILES"
+     feat = get_rdkit(SMILES)
+     feat_3d = get_rdkit_3d(SMILES)
+     return feat | feat_3d
+
+ # %% ../../nbs/core/03_ligand.ipynb 17
+ def remove_hi_corr(df: pd.DataFrame,
+                    thr=0.99 # threshold
+                    ):
+     "Remove highly correlated features in a dataframe given a Pearson threshold"
+     corr_matrix = df.corr().abs()
+     # keep only the upper triangle (k=1 excludes the diagonal) so each pair is tested once
+     upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+     to_drop = [column for column in upper.columns if any(upper[column] > thr)]
+     return df.drop(to_drop, axis=1), to_drop
+
+ # %% ../../nbs/core/03_ligand.ipynb 18
+ def preprocess(df: pd.DataFrame, thr=0.99):
+     "Remove features with no variance, and highly correlated features based on threshold."
+     col_ori = df.columns
+
+     # Remove columns with std == 0
+     std_zero_cols = df.columns[df.std() == 0].tolist()
+
+     if std_zero_cols:
+         n = len(std_zero_cols)
+         print(f"\n {n} Columns with zero std: {std_zero_cols}")
+         df = df.loc[:, df.std() != 0].copy()
+
+     # Remove highly correlated columns
+     df, high_corr_cols = remove_hi_corr(df, thr)
+     if high_corr_cols:
+         n = len(high_corr_cols)
+         print(f"\n {n} Columns removed due to high similarity (Pearson > {thr}): {high_corr_cols}")
+
+     dropped_cols = set(col_ori) - set(df.columns)
+     n = len(dropped_cols)
+     print(f"\n Total removed columns: {n}")
+
+     return df
+
+ # %% ../../nbs/core/03_ligand.ipynb 19
+ def get_rdkit_df(df: pd.DataFrame,
+                  include_3d=False,
+                  col='SMILES',
+                  postprocess=False,
+                  chunksize=128, # for parallel process_map
+                  ):
+     "Extract rdkit features (optionally in parallel with 3D) from SMILES in a df"
+     func = get_rdkit_all if include_3d else get_rdkit
+     smiles_list = df[col].tolist()
+
+     features = process_map(func, smiles_list, chunksize=chunksize)
+
+     out = pd.DataFrame(features)
+
+     if postprocess:
+         # keep a labeled DataFrame so preprocess() can report column names
+         out = pd.DataFrame(StandardScaler().fit_transform(out), columns=out.columns, index=out.index)
+         out = preprocess(out) # remove redundant features
+
+     return out
+
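+ # Usage sketch (illustrative): process_map spawns worker processes, so call this
+ # from a script under `if __name__ == '__main__':`.
+ # feats = get_rdkit_df(pd.DataFrame({'SMILES': ['CCO', 'c1ccccc1']}))
+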
+ # %% ../../nbs/core/03_ligand.ipynb 25
+ def get_fp(SMILES,
+            name='ecfp',
+            ELEMENTS_PER_WORKER=1_000_000):
+     "Fast molecule fingerprints using scikit-fingerprints, computed in chunks"
+     if name == 'ecfp':
+         fp_transformer = ECFPFingerprint(fp_size=2048, radius=3, n_jobs=-1)
+     elif name == 'ap':
+         fp_transformer = AtomPairFingerprint(fp_size=2048, n_jobs=-1)
+     elif name == 'maccs':
+         fp_transformer = MACCSFingerprint(n_jobs=-1)
+     else:
+         raise ValueError('Wrong fingerprint name!')
+
+     middle_parts = []
+     k_splits = len(SMILES) // ELEMENTS_PER_WORKER
+
+     for i in tqdm(range(k_splits)):
+         middle_parts.append(fp_transformer.transform(SMILES[i * ELEMENTS_PER_WORKER: (i + 1) * ELEMENTS_PER_WORKER]))
+
+     # handle the remainder that does not fill a whole chunk
+     if len(SMILES) % ELEMENTS_PER_WORKER > 0:
+         middle_parts.append(fp_transformer.transform(SMILES[k_splits * ELEMENTS_PER_WORKER:]))
+
+     return np.concatenate(middle_parts)
+
+ # %% ../../nbs/core/03_ligand.ipynb 29
+ def compress_fp(array):
+     "Compress binary fingerprints using np.packbits"
+     return np.packbits(array, axis=1)
+
+ # %% ../../nbs/core/03_ligand.ipynb 47
+ @numba.njit(parallel=True)
+ def tanimoto_numba(fps):
+     "Get an NxN matrix of Tanimoto similarity among N compounds; expects unpacked binary (0/1) fingerprint rows."
+     n = fps.shape[0]
+     result = np.zeros((n, n), dtype=np.float32)
+
+     for i in numba.prange(n):
+         for j in range(i, n): # only upper triangle
+             inter = np.bitwise_and(fps[i], fps[j]).sum()
+             union = np.bitwise_or(fps[i], fps[j]).sum()
+             sim = inter / union if union > 0 else 0.0
+             result[i, j] = sim
+             result[j, i] = sim # fill symmetric position
+     return result
+
+ # %% ../../nbs/core/03_ligand.ipynb 50
+ def hash_fp(fp_row):
+     "Hash a binary fingerprint row using SHA-256"
+     return hashlib.sha256(fp_row.tobytes()).hexdigest()
+
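+ # Usage sketch (illustrative; assumes get_fp returns 0/1 rows): these feed
+ # tanimoto_numba directly, while compress_fp/hash_fp serve storage and
+ # deduplication rather than bit counting.
+ # fps = get_fp(['CCO', 'CCN', 'CCC'], name='ecfp')
+ # sim = tanimoto_numba(fps) # 3x3 similarity matrix
+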
+ # %% ../../nbs/core/03_ligand.ipynb 52
+ def get_same_mol_group(df, smi_col='SMILES'):
+     "Assign a group number to identical compounds by SHA-256-hashing their Morgan/ECFP fingerprints."
+     df = df.copy()
+     smiles = df[smi_col].tolist()
+
+     fps = get_fp(smiles)
+
+     # Hash each fingerprint
+     # hashes = [hash_fp(fp) for fp in fps] # non-parallel
+     hashes = process_map(hash_fp, fps, chunksize=256) # if parallel
+
+     # Assign a group number based on hash buckets
+     hash_to_group = {}
+     group_ids = []
+     group_counter = 0
+     for h in hashes:
+         if h not in hash_to_group:
+             hash_to_group[h] = group_counter
+             group_counter += 1
+         group_ids.append(hash_to_group[h])
+
+     df['group'] = group_ids
+     return df
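+
+ # Usage sketch (illustrative): 'CCO' and 'OCC' are the same molecule, so they
+ # receive the same group id.
+ # grouped = get_same_mol_group(pd.DataFrame({'SMILES': ['CCO', 'OCC', 'CCN']}))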
kdock/core/plot.py ADDED
@@ -0,0 +1,89 @@
+ """Functions to plot on 2D"""
+
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/core/04_plot.ipynb.
+
+ # %% auto 0
+ __all__ = ['reduce_feature', 'set_sns', 'plot_2d', 'plot_corr']
+
+ # %% ../../nbs/core/04_plot.ipynb 3
+ import pandas as pd, seaborn as sns
+ from fastcore.meta import delegates
+ from matplotlib import pyplot as plt
+
+ # # kdock
+ # from kdock.core.data import *
+
+ # dimensionality reduction
+ from sklearn.decomposition import PCA
+ from sklearn.manifold import TSNE
+ from umap import UMAP
+
+ # %% ../../nbs/core/04_plot.ipynb 6
+ def reduce_feature(data, # df or numpy array
+                    method='pca', # dimensionality reduction method; accepts both upper and lower case
+                    complexity=20, # None for PCA; perplexity for TSNE (recommend 30); n_neighbors for UMAP (recommend 15)
+                    n=2, # n_components
+                    seed: int = 123, # seed for random_state
+                    **kwargs, # arguments for PCA, TSNE, or UMAP depending on which method is used
+                    ):
+
+     "Reduce the dimensionality given a dataframe of values"
+
+     method = method.lower()
+     assert method in ['pca', 'tsne', 'umap'], "Please choose a method among PCA, TSNE, and UMAP"
+
+     if method == 'pca':
+         reducer = PCA(n_components=n, random_state=seed, **kwargs)
+     elif method == 'tsne':
+         reducer = TSNE(n_components=n,
+                        random_state=seed,
+                        perplexity=complexity, # official default is 30
+                        **kwargs)
+     elif method == 'umap':
+         reducer = UMAP(n_components=n,
+                        random_state=seed,
+                        n_neighbors=complexity, # official default is 15; try 15-200
+                        **kwargs)
+     else:
+         raise ValueError('Invalid method specified')
+
+     proj = reducer.fit_transform(data)
+     embedding_df = pd.DataFrame(proj).set_index(data.index) if isinstance(data, pd.DataFrame) else pd.DataFrame(proj)
+     embedding_df.columns = [f"{method.upper()}{i}" for i in range(1, n + 1)]
+
+     return embedding_df
+
+ # %% ../../nbs/core/04_plot.ipynb 10
+ def set_sns():
+     "Set seaborn defaults: 300-dpi figures, notebook context, ticks style."
+     sns.set(rc={"figure.dpi": 300, 'savefig.dpi': 300})
+     sns.set_context('notebook')
+     sns.set_style("ticks")
+
+ # %% ../../nbs/core/04_plot.ipynb 12
+ @delegates(sns.scatterplot)
+ def plot_2d(X: pd.DataFrame, # a dataframe whose first column is x and second column is y
+             **kwargs, # arguments for sns.scatterplot
+             ):
+     "Make a 2D scatter plot from a dataframe whose first column is x and second column is y"
+     plt.figure(figsize=(7, 7))
+     sns.scatterplot(data=X, x=X.columns[0], y=X.columns[1], alpha=0.7, **kwargs)
+
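+ # Usage sketch (illustrative; feature_df and labels are placeholders):
+ # emb = reduce_feature(feature_df, method='umap', complexity=15)
+ # plot_2d(emb, hue=labels) # extra kwargs are delegated to sns.scatterplot
+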
71
+ # %% ../../nbs/core/04_plot.ipynb 14
72
+ def plot_corr(x,#a column of df
73
+ y,#a column of df
74
+ xlabel=None,# x axis label
75
+ ylabel=None,# y axis label
76
+ order=3, # polynomial level, if straight, order=1
77
+ ):
78
+ sns.regplot(x=x,
79
+ y=y,
80
+ order=order,
81
+ line_kws={'color': 'gray'}
82
+ )
83
+
84
+ if xlabel is not None:
85
+ plt.xlabel(xlabel)
86
+
87
+ if ylabel is not None:
88
+ plt.ylabel(ylabel)
89
+ # plt.text(x=0.8, y=0.1, s=f'Spearman: {correlation:.2f}', transform=plt.gca().transAxes, ha='center', va='center');
kdock/core/protein.py ADDED
@@ -0,0 +1,293 @@
+ # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/core/02_protein.ipynb.
+
+ # %% auto 0
+ __all__ = ['get_uniprot_seq', 'get_uniprot_features', 'get_uniprot_kd', 'get_uniprot_type', 'apply_mut_single',
+            'apply_mut_complex', 'compare_seq']
+
+ # %% ../../nbs/core/02_protein.ipynb 3
+ import pandas as pd
+ import requests, re
+ from functools import lru_cache
+
+ # for compare_seq
+ from Bio.Align import PairwiseAligner
+
+ # %% ../../nbs/core/02_protein.ipynb 5
+ @lru_cache()
+ def get_uniprot_seq(uniprot_id):
+     "Queries the UniProt database to retrieve the protein sequence for a given UniProt ID."
+
+     url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
+     response = requests.get(url)
+
+     # Check if the request was successful (status code 200)
+     if response.status_code == 200:
+         data = response.text
+         # The sequence starts after the first line, which is a description
+         sequence = ''.join(data.split('\n')[1:]).strip()
+         return sequence
+     else:
+         return f"Error: Unable to retrieve sequence for UniProt ID {uniprot_id}. Status code: {response.status_code}"
+
+ # %% ../../nbs/core/02_protein.ipynb 7
+ @lru_cache()
+ def get_uniprot_features(uniprot_id):
+     "Given uniprot_id, get protein/gene names and the feature annotations from UniProt."
+     # UniProt REST API
+     url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"
+     response = requests.get(url)
+
+     if response.status_code == 200:
+         data = response.json()
+         # Extract the "features" section, which contains the annotations
+         features = data.get('features', [])
+
+         protein_name = (
+             data.get("proteinDescription", {})
+             .get("recommendedName", {})
+             .get("fullName", {})
+             .get("value")
+         )
+
+         gene_name = (
+             data.get("genes", [{}])[0]
+             .get("geneName", {})
+             .get("value")
+         )
+         return {
+             "uniprot_id": uniprot_id,
+             "protein_name": protein_name,
+             "gene_name": gene_name,
+             "features": features
+         }
+     else:
+         raise ValueError(f"Failed to retrieve UniProt features for {uniprot_id}")
+
+ # %% ../../nbs/core/02_protein.ipynb 9
+ def get_uniprot_kd(uniprot_id):
+     "Query 'Domain: Protein kinase' based on UniProt ID and get its sequence info."
+     data = get_uniprot_features(uniprot_id)
+     seq = get_uniprot_seq(uniprot_id)
+     out = []
+
+     for feature in data['features']:
+         if feature.get("type") == "Domain" and "Protein kinase" in feature.get("description", ""):
+             start = feature['location']['start']['value']
+             end = feature['location']['end']['value']
+             out.append({
+                 "uniprot_id": uniprot_id,
+                 "protein_name": data["protein_name"],
+                 "gene_name": data["gene_name"],
+                 "start": start,
+                 "end": end,
+                 "description": feature.get("description", ""),
+                 "sequence": seq[start-1:end] # UniProt positions are 1-based and inclusive
+             })
+
+     return out
+
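+ # Usage sketch (illustrative): kinase-domain record(s) for EGFR (UniProt P00533).
+ # kd = get_uniprot_kd('P00533') # [{'start': ..., 'end': ..., 'sequence': ...}]
+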
89
+ # %% ../../nbs/core/02_protein.ipynb 11
90
+ def get_uniprot_type(uniprot_id,type_='Signal'):
91
+ "Get region sequences based on UniProt ID features."
92
+ data = get_uniprot_features(uniprot_id)
93
+ seq = get_uniprot_seq(uniprot_id)
94
+
95
+ out = []
96
+
97
+ for feature in data['features']:
98
+ if feature.get("type") == type_:
99
+ start = feature['location']['start']['value']
100
+ end = feature['location']['end']['value']
101
+ region = {
102
+ 'uniprot_id': uniprot_id,
103
+ 'type': feature['type'],
104
+ "protein_name": data["protein_name"],
105
+ "gene_name": data["gene_name"],
106
+ 'start': start,
107
+ 'end': end,
108
+ 'description': feature['description'],
109
+ 'sequence': seq[start-1:end]
110
+ }
111
+ out.append(region)
112
+
113
+ if not out:
114
+ available = sorted({f.get("type") for f in data['features'] if f.get("type")})
115
+ print(f"No feature of type '{type_}' found for {uniprot_id}.")
116
+ print(f"Available feature types: {', '.join(available)}")
117
+ return available
118
+
119
+ return out
120
+
+ # %% ../../nbs/core/02_protein.ipynb 16
+ def apply_mut_single(seq, # protein sequence
+                      *mutations, # e.g., E709A
+                      start_pos=1, # if the sequence does not start at residue 1, give its start index so positions match the mutations
+                      ):
+     "Apply point mutations to a protein sequence."
+     seq_list = list(seq) # convert to list for mutability
+
+     for mut in mutations:
+         # check mutation format
+         if len(mut) < 3: raise ValueError(f"Invalid mutation format: {mut}")
+
+         from_aa, pos, to_aa = mut[0], int(mut[1:-1]) - start_pos, mut[-1]
+
+         # make sure the position is within the sequence length
+         if pos < 0 or pos >= len(seq_list): raise IndexError(f"Position {pos + 1} out of range for sequence length {len(seq_list)}")
+         # make sure the reference residue in the mutation matches the sequence
+         if seq_list[pos] != from_aa: raise ValueError(f"Expected {from_aa} at position {pos + 1}, found {seq_list[pos]}")
+
+         seq_list[pos] = to_aa
+         print('Converted:', mut)
+
+     return ''.join(seq_list)
+
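+ # Usage sketch (illustrative): positions are 1-based by default.
+ # apply_mut_single('MPKS', 'P2A') # -> 'MAKS'
+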
+ # %% ../../nbs/core/02_protein.ipynb 18
+ def apply_mut_complex(seq, # protein sequence
+                       mut, # mutation (e.g., G776delinsVC/S783C, G778dupGSP)
+                       start_pos=1, # if the sequence is truncated, give its first residue number so mutation positions match
+                       ):
+     """
+     Apply a composite mutation like 'G776delinsVC/S783C' to `seq`,
+     assuming `seq[0]` corresponds to residue number `start_pos`.
+
+     * At most one delins **or** dup is allowed.
+     * Point substitutions are executed first; the indel/dup is done last.
+     """
+     _sub_pat = re.compile(r'^([A-Z])(\d+)([A-Z])$') # e.g. S783C
+     _delins_pat = re.compile(r'^([A-Z])(\d+)delins([A-Z]+)$') # e.g. G776delinsVC
+     _dup_pat = re.compile(r'^([A-Z])(\d+)dup([A-Z]+)$') # e.g. G778dupGSP
+
+     seq = list(seq)
+     tokens = mut.split('/')
+
+     # ---------- 1) substitutions (length-neutral) ----------
+     for m in tokens:
+         if _sub_pat.match(m):
+             orig, pos, new = _sub_pat.match(m).groups()
+             idx = int(pos) - start_pos
+             if seq[idx] != orig:
+                 raise ValueError(
+                     f"Mismatch at position {pos}: expected {orig}, got {seq[idx]}"
+                 )
+             seq[idx] = new
+
+     # ---------- 2) the single length-changing event ----------
+     for m in tokens:
+         if _delins_pat.match(m):
+             orig, pos, ins = _delins_pat.match(m).groups()
+             idx = int(pos) - start_pos
+             if seq[idx] != orig:
+                 raise ValueError(
+                     f"Mismatch at position {pos}: expected {orig}, got {seq[idx]}"
+                 )
+             seq[idx : idx + 1] = list(ins) # replace 1 residue with many
+             break
+
+         if _dup_pat.match(m):
+             orig, pos, dup = _dup_pat.match(m).groups()
+             idx = int(pos) - start_pos
+             if seq[idx] != orig:
+                 raise ValueError(
+                     f"Mismatch at position {pos}: expected {orig}, got {seq[idx]}"
+                 )
+             seq[idx + 1 : idx + 1] = list(dup) # insert right after the residue
+             break
+
+     return ''.join(seq)
+
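+ # Usage sketch (illustrative, toy sequence; start_pos anchors the numbering):
+ # apply_mut_complex('GSK', 'G776delinsVC', start_pos=776) # -> 'VCSK'
+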
+ # %% ../../nbs/core/02_protein.ipynb 22
+ def compare_seq(
+     seq1: str, # original
+     seq2: str, # mutant
+     *,
+     start_pos: int = 1,
+     label1: str = "Original",
+     label2: str = "Mutant",
+     visualize: bool = True
+ ):
+     """
+     Align two protein sequences and summarise differences.
+     """
+
+     # ----- global alignment using PairwiseAligner -----
+     aligner = PairwiseAligner()
+     aligner.mode = "global"
+     aligner.match_score = 2
+     aligner.mismatch_score = -1
+     aligner.open_gap_score = -5
+     aligner.extend_gap_score = -0.5
+
+     alignment = aligner.align(seq1, seq2)[0]
+     aln1 = alignment.aligned[0]
+     aln2 = alignment.aligned[1]
+
+     # Reconstruct aligned strings from aligned segments
+     aligned_seq1 = []
+     aligned_seq2 = []
+     i1, i2 = 0, 0
+
+     for (start1, end1), (start2, end2) in zip(aln1, aln2):
+         # Add gaps to make sequences align
+         while i1 < start1:
+             aligned_seq1.append(seq1[i1])
+             aligned_seq2.append('-')
+             i1 += 1
+         while i2 < start2:
+             aligned_seq1.append('-')
+             aligned_seq2.append(seq2[i2])
+             i2 += 1
+
+         # Add aligned part
+         for j in range(end1 - start1):
+             aligned_seq1.append(seq1[i1])
+             aligned_seq2.append(seq2[i2])
+             i1 += 1
+             i2 += 1
+
+     # Remaining tails
+     while i1 < len(seq1):
+         aligned_seq1.append(seq1[i1])
+         aligned_seq2.append('-')
+         i1 += 1
+     while i2 < len(seq2):
+         aligned_seq1.append('-')
+         aligned_seq2.append(seq2[i2])
+         i2 += 1
+
+     aln1_str = ''.join(aligned_seq1)
+     aln2_str = ''.join(aligned_seq2)
+
+     # ----- find differences -----
+     diffs, raw_i1, raw_i2 = [], 0, 0
+     for a1, a2 in zip(aln1_str, aln2_str):
+         if a1 != '-' and a2 != '-':
+             if a1 != a2:
+                 diffs.append((start_pos + raw_i1, a1, a2, 'substitution'))
+             raw_i1 += 1
+             raw_i2 += 1
+         elif a1 == '-' and a2 != '-':
+             diffs.append((start_pos + raw_i1, '-', a2, 'insertion'))
+             raw_i2 += 1
+         elif a1 != '-' and a2 == '-':
+             diffs.append((start_pos + raw_i1, a1, '-', 'deletion'))
+             raw_i1 += 1
+
+     # ----- visualization -----
+     if visualize:
+         for block in range(0, len(aln1_str), 80):
+             s1_block = aln1_str[block:block + 80]
+             s2_block = aln2_str[block:block + 80]
+             marker = ''.join(' ' if x == y else '^' for x, y in zip(s1_block, s2_block))
+             left_idx = start_pos + len(aln1_str[:block].replace('-', ''))
+             right_idx = left_idx + len(s1_block.replace('-', '')) - 1
+             print(f"{label1:<10} {left_idx:>5}-{right_idx:<5}: {s1_block}")
+             print(f"{label2:<10} {'':>11}: {s2_block}")
+             print(f"{'':>22} {marker}\n")
+
+     # ----- summary list -----
+     print("Differences:")
+     for pos, ref, new, kind in diffs:
+         print(f" {kind:<12} at {pos:>4}: {ref} → {new}")
+
+     # return diffs
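+
+ # Usage sketch (illustrative): prints the aligned blocks and a difference summary.
+ # compare_seq('MPKSE', 'MAKSE') # reports the P -> A substitution at position 2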