stcrpy 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +0 -0
- examples/egnn.py +425 -0
- stcrpy/__init__.py +5 -0
- stcrpy/tcr_datasets/__init__.py +0 -0
- stcrpy/tcr_datasets/tcr_graph_dataset.py +499 -0
- stcrpy/tcr_datasets/tcr_selector.py +0 -0
- stcrpy/tcr_datasets/tcr_structure_dataset.py +0 -0
- stcrpy/tcr_datasets/utils.py +350 -0
- stcrpy/tcr_formats/__init__.py +0 -0
- stcrpy/tcr_formats/tcr_formats.py +114 -0
- stcrpy/tcr_formats/tcr_haddock.py +556 -0
- stcrpy/tcr_geometry/TCRCoM.py +350 -0
- stcrpy/tcr_geometry/TCRCoM_LICENCE +168 -0
- stcrpy/tcr_geometry/TCRDock.py +261 -0
- stcrpy/tcr_geometry/TCRGeom.py +450 -0
- stcrpy/tcr_geometry/TCRGeomFiltering.py +273 -0
- stcrpy/tcr_geometry/__init__.py +0 -0
- stcrpy/tcr_geometry/reference_data/__init__.py +0 -0
- stcrpy/tcr_geometry/reference_data/dock_reference_1_imgt_numbered.pdb +6549 -0
- stcrpy/tcr_geometry/reference_data/dock_reference_2_imgt_numbered.pdb +6495 -0
- stcrpy/tcr_geometry/reference_data/reference_A.pdb +31 -0
- stcrpy/tcr_geometry/reference_data/reference_B.pdb +31 -0
- stcrpy/tcr_geometry/reference_data/reference_D.pdb +31 -0
- stcrpy/tcr_geometry/reference_data/reference_G.pdb +31 -0
- stcrpy/tcr_geometry/reference_data/reference_data.py +104 -0
- stcrpy/tcr_interactions/PLIPParser.py +147 -0
- stcrpy/tcr_interactions/TCRInteractionProfiler.py +433 -0
- stcrpy/tcr_interactions/TCRpMHC_PLIP_Model_Parser.py +133 -0
- stcrpy/tcr_interactions/__init__.py +0 -0
- stcrpy/tcr_interactions/utils.py +170 -0
- stcrpy/tcr_methods/__init__.py +0 -0
- stcrpy/tcr_methods/tcr_batch_operations.py +223 -0
- stcrpy/tcr_methods/tcr_methods.py +150 -0
- stcrpy/tcr_methods/tcr_reformatting.py +18 -0
- stcrpy/tcr_metrics/__init__.py +2 -0
- stcrpy/tcr_metrics/constants.py +39 -0
- stcrpy/tcr_metrics/tcr_interface_rmsd.py +237 -0
- stcrpy/tcr_metrics/tcr_rmsd.py +179 -0
- stcrpy/tcr_ml/__init__.py +0 -0
- stcrpy/tcr_ml/geometry_predictor.py +3 -0
- stcrpy/tcr_processing/AGchain.py +89 -0
- stcrpy/tcr_processing/Chemical_components.py +48915 -0
- stcrpy/tcr_processing/Entity.py +301 -0
- stcrpy/tcr_processing/Fragment.py +58 -0
- stcrpy/tcr_processing/Holder.py +24 -0
- stcrpy/tcr_processing/MHC.py +449 -0
- stcrpy/tcr_processing/MHCchain.py +149 -0
- stcrpy/tcr_processing/Model.py +37 -0
- stcrpy/tcr_processing/Select.py +145 -0
- stcrpy/tcr_processing/TCR.py +532 -0
- stcrpy/tcr_processing/TCRIO.py +47 -0
- stcrpy/tcr_processing/TCRParser.py +1230 -0
- stcrpy/tcr_processing/TCRStructure.py +148 -0
- stcrpy/tcr_processing/TCRchain.py +160 -0
- stcrpy/tcr_processing/__init__.py +3 -0
- stcrpy/tcr_processing/annotate.py +480 -0
- stcrpy/tcr_processing/utils/__init__.py +0 -0
- stcrpy/tcr_processing/utils/common.py +67 -0
- stcrpy/tcr_processing/utils/constants.py +367 -0
- stcrpy/tcr_processing/utils/region_definitions.py +782 -0
- stcrpy/utils/__init__.py +0 -0
- stcrpy/utils/error_stream.py +12 -0
- stcrpy-1.0.0.dist-info/METADATA +173 -0
- stcrpy-1.0.0.dist-info/RECORD +68 -0
- stcrpy-1.0.0.dist-info/WHEEL +5 -0
- stcrpy-1.0.0.dist-info/licenses/LICENCE +28 -0
- stcrpy-1.0.0.dist-info/licenses/stcrpy/tcr_geometry/TCRCoM_LICENCE +168 -0
- stcrpy-1.0.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,556 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
import warnings
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
import Bio
|
|
7
|
+
from Bio.PDB.Superimposer import Superimposer
|
|
8
|
+
|
|
9
|
+
from .. import tcr_processing
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class HADDOCKFormatter:
    """Write TCR and pMHC structures as PDB files with HADDOCK-compatible numbering.

    For every structure written, three files are produced in a sub-directory
    of ``save_dir`` named after the structure: the renumbered PDB file, a text
    file listing the "active" residue numbers, and a text file mapping the
    original residue IDs to the new ones (readable by
    ``parse_renumbered_line``).
    """

    def __init__(self, save_dir: str = None):
        """Constructor HADDOCK formatting object.

        Args:
            save_dir (str, optional): Path to save formatted files to.
                Defaults to None, in which case the current directory "." is used.
        """
        self.save_dir = save_dir if save_dir is not None else "."

    def tcr_to_haddock(self, tcr: "TCR"):
        """Bound reformatting of TCR structure object to HADDOCK compatible PDB file.

        Args:
            tcr (TCR): TCR structure object

        Returns:
            str: Path of the PDB file written by ``write_TCR_pdb_file``.
        """
        return self.write_TCR_pdb_file(tcr, self.save_dir)

    def pMHC_to_haddock(self, mhc: "MHC", antigen: list["Antigen"]):
        """Bound reformatting of MHC and antigen structures object to HADDOCK compatible PDB file.

        Args:
            mhc (MHC): MHC structure object
            antigen (list[Antigen]): List containing the antigen chain(s).

        Returns:
            str: Path of the PDB file written by ``write_antigen_pdb_file``.
        """
        return self.write_antigen_pdb_file(mhc, antigen, self.save_dir)

    def write_TCR_pdb_file(self, tcr: "TCR", save_dir: str):
        """
        Writes TCR structure to a PDB file in a format HADDOCK can deal with.
        Generates a PDB file, a mapping from the old to the new numbering,
        and a list of active residues to restrain the HADDOCK simulation.

        Only residues numbered 1 to 129 are kept. Residues carrying an
        insertion code are renumbered to ``10 * number + 200 * i + k`` (``k``
        from ``imgt_insertion_char_to_int``) with a blank insertion code; all
        other residues are shifted by ``200 * i``, where ``i`` is the index of
        the chain within the TCR.

        Args:
            tcr (TCR): The TCR structure.
            save_dir (str): The directory to save the files.

        Returns:
            str: The filename of the saved TCR PDB file.
        """
        tcr_id = f"{tcr.parent.parent.id}_{tcr.id}"
        new_tcr_structure = Bio.PDB.Model.Model(id=0)
        residue_conversion = {}
        for i, chain in enumerate(tcr.get_chains()):
            chain_offset = 200 * i
            chain_conversion = residue_conversion[chain.id] = {}
            new_chain = Bio.PDB.Chain.Chain(id=chain.id)
            for residue in Bio.PDB.Selection.unfold_entities(chain, "R"):
                # keep residues numbered 1..129 only
                if not 1 <= residue.id[1] < 130:
                    continue
                # work on a copy so the input structure keeps its numbering
                new_residue = residue.copy()
                if new_residue.id[-1] != " ":
                    # handle insertion numbering for HADDOCK: fold the
                    # insertion letter into the residue number
                    new_residue.id = (
                        new_residue.id[0],
                        10 * new_residue.id[1]
                        + chain_offset
                        + imgt_insertion_char_to_int(new_residue.id[-1]),
                        " ",
                    )
                else:
                    new_residue.id = (
                        new_residue.id[0],
                        new_residue.id[1] + chain_offset,
                        new_residue.id[-1],
                    )
                # only residues whose ID actually changed are recorded
                if new_residue.id != residue.id:
                    chain_conversion[residue.id] = new_residue.id
                new_chain.add(new_residue)
            new_tcr_structure.add(new_chain)

        # exist_ok avoids the check-then-create race and also creates save_dir
        # itself when it does not exist yet
        output_dir = os.path.join(save_dir, tcr_id)
        os.makedirs(output_dir, exist_ok=True)

        # active residues are the CDR residues, expressed in the new numbering
        active_residues = []
        for chain in tcr.get_chains():
            chain_conversion = residue_conversion[chain.id]
            for cdr in chain.get_CDRs():
                for res in cdr.get_residues():
                    active_residues.append(chain_conversion.get(res.id, res.id)[1])

        with open(
            os.path.join(output_dir, f"{tcr_id}_haddock_active_residues.txt"), "w"
        ) as f:
            f.write("TCR ACTIVE RESIDUES FOR HADDOCK\n")
            f.write(",".join([str(r) for r in active_residues]))
            f.write("\n")

        with open(
            os.path.join(output_dir, f"{tcr_id}_haddock_renumbering.txt"), "w"
        ) as f:
            f.write("TCR RESIDUE RENUMBERING FOR HADDOCK\n")
            for chain_id, chain_conversion in residue_conversion.items():
                for res, new_res in chain_conversion.items():
                    # the line layout is what parse_renumbered_line reads back
                    r_as_str = f"({chain_id},({res[0]},{res[1]},{res[2]}),({new_res[0]},{new_res[1]},{new_res[2]})\n"
                    f.write(r_as_str)

        pdb_io = Bio.PDB.PDBIO()
        pdb_io.set_structure(new_tcr_structure)
        filename = os.path.join(output_dir, f"{tcr_id}_tcr_for_docking.pdb")
        pdb_io.save(filename)
        return filename

    def write_antigen_pdb_file(
        self, mhc: "MHC", antigen: list["Antigen"], save_dir: str
    ):
        """
        Writes the antigen PDB file for docking with HADDOCK.
        Generates a PDB file, a file containing the renumbering mapping, and a list of active residues to restrict the simulation.

        MHC chains whose ``chain_type`` is "B2M" are left out. Residue numbers
        of the i-th written chain are shifted by ``500 * i``. The active
        residues are all residues of the chains in ``antigen``.

        Args:
            mhc (MHC): MHC structure object.
            antigen (list[Antigen]): List containing antigen chain. Should be length 1.
            save_dir (str): The directory to save the PDB file.

        Returns:
            str: The filename of the saved antigen PDB file.
        """
        mhc_chains = [c for c in mhc.get_chains() if c.chain_type != "B2M"]
        antigen_chains = mhc_chains + antigen
        mhc_id = f"{mhc.parent.parent.id}_MHC_{''.join([c.id for c in antigen_chains])}"

        new_antigen_structure = Bio.PDB.Model.Model(id=0)
        residue_conversion = {}
        for i, chain in enumerate(antigen_chains):
            chain_offset = 500 * i
            chain_conversion = residue_conversion[chain.id] = {}
            new_chain = Bio.PDB.Chain.Chain(id=chain.id)
            for residue in chain.get_residues():
                # work on a copy so the input structure keeps its numbering
                new_residue = residue.copy()
                new_residue.id = (
                    new_residue.id[0],
                    new_residue.id[1] + chain_offset,
                    new_residue.id[-1],
                )
                # only residues whose ID actually changed are recorded
                if new_residue.id != residue.id:
                    chain_conversion[residue.id] = new_residue.id
                new_chain.add(new_residue)
            new_antigen_structure.add(new_chain)

        # exist_ok avoids the check-then-create race and also creates save_dir
        # itself when it does not exist yet
        output_dir = os.path.join(save_dir, mhc_id)
        os.makedirs(output_dir, exist_ok=True)

        # get peptide numbering and select as active
        active_residues = []
        for chain in antigen:
            chain_conversion = residue_conversion[chain.id]
            for res in chain.get_residues():
                active_residues.append(chain_conversion.get(res.id, res.id)[1])

        # NOTE(review): both text files are opened in append mode, so running
        # this twice for the same complex stacks a second section onto the
        # existing file - confirm this is intended before switching to "w".
        with open(
            os.path.join(output_dir, f"{mhc_id}_haddock_active_residues.txt"), "a"
        ) as f:
            f.write("ANTIGEN ACTIVE RESIDUES FOR HADDOCK\n")
            f.write(",".join([str(r) for r in active_residues]))
            f.write("\n")

        with open(
            os.path.join(output_dir, f"{mhc_id}_haddock_renumbering.txt"), "a"
        ) as f:
            f.write("ANTIGEN RESIDUE RENUMBERING FOR HADDOCK\n")
            for chain_id, chain_conversion in residue_conversion.items():
                for res, new_res in chain_conversion.items():
                    # the line layout is what parse_renumbered_line reads back
                    r_as_str = f"({chain_id},({res[0]},{res[1]},{res[2]}),({new_res[0]},{new_res[1]},{new_res[2]})\n"
                    f.write(r_as_str)

        pdb_io = Bio.PDB.PDBIO()
        pdb_io.set_structure(new_antigen_structure)
        filename = os.path.join(output_dir, f"{mhc_id}_antigen_for_docking.pdb")
        pdb_io.save(filename)
        return filename
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
class HADDOCKResultsParser:
    """Parse HADDOCK output: restore original residue numbering and load docking scores."""

    def __init__(
        self,
        haddock_results_dir: str,
        tcr_renumbering_file: str = None,
        pmhc_renumbering_file: str = None,
    ):
        """Parser for results from HADDOCK simulations. Renumbers TCR, MHC and Antigen using renumbering files, and parses result metrics.

        Emits a warning (but does not fail) when ``haddock_results_dir`` ends
        with ".tgz".

        Args:
            haddock_results_dir (str): path to HADDOCK simulation results.
            tcr_renumbering_file (str, optional): path to text file containing TCR renumbering to restore from HADDOCK compatible numbering. Defaults to None.
            pmhc_renumbering_file (str, optional): path to text file containing MHC and antigen renumbering to restore from HADDOCK compatible numbering. Defaults to None.
        """

        self.haddock_results_dir = haddock_results_dir
        self.tcr_renumbering_file = tcr_renumbering_file
        self.pmhc_renumbering_file = pmhc_renumbering_file

        if self.haddock_results_dir.endswith(".tgz"):
            warnings.warn(
                "HADDOCK results are compressed. Decompress results before proceeding."
            )

    def renumber_all_haddock_predictions(self):
        """Renumber all haddock predictions contained in results folder. Requires standard HADDOCK output directory format.

        Every file in ``<haddock_results_dir>/structures/it1/`` whose name
        starts with ``complex_`` and contains ``.pdb`` is passed to
        ``renumber_haddock_prediction``.
        """
        path = os.path.join(self.haddock_results_dir, "structures/it1/")
        # match() anchors at the start of the name, so the "renumbered_..."
        # files written by renumber_haddock_prediction are not picked up again
        pattern = re.compile(r"complex_.*\.pdb")

        for filename in os.listdir(path):
            if pattern.match(filename):
                file_path = os.path.join(path, filename)
                self.renumber_haddock_prediction(
                    file_path,
                    self.tcr_renumbering_file,
                    self.pmhc_renumbering_file,
                )

    @staticmethod
    def _renumbered_output_path(docked_prediction_file: str) -> str:
        """Return the path of the 'renumbered_' sibling file of a docked prediction."""
        # os.path.split keeps the leading separator of absolute paths, which a
        # manual split("/") followed by os.path.join(*parts) would lose
        directory, basename = os.path.split(docked_prediction_file)
        return os.path.join(directory, "renumbered_" + basename)

    def renumber_haddock_prediction(
        self,
        docked_prediction_file: str,
        haddock_renumbering_file: str,
        antigen_renumbering_file: str = None,
    ) -> "Bio.PDB.Model.Model":
        """
        Renumber the HADDOCK prediction based on the renumbering files.

        The renumbered complex is saved next to ``docked_prediction_file``
        with the prefix "renumbered_" added to the file name.

        Args:
            docked_prediction_file (str): Path to the docked prediction file.
            haddock_renumbering_file (str): Path to the HADDOCK renumbering file.
            antigen_renumbering_file (str, optional): Path to the antigen renumbering file.
                Needed for TCR only PDBs with no antigen. Defaults to None.

        Returns:
            Bio.PDB.Model.Model: The renumbered HADDOCK prediction.

        Raises:
            ValueError: If the antigen renumbering header is not found in
                ``antigen_renumbering_file``.

        """

        # initialise file parsers
        tcr_parser = tcr_processing.TCRParser.TCRParser()
        bio_parser = Bio.PDB.PDBParser()

        # find chain ID of TCR to distinguish TCR from antigen
        tcr_chain_id = list(
            tcr_parser.get_tcr_structure("tmp", docked_prediction_file).get_TCRchains()
        )[0].get_id()
        docked_prediction = bio_parser.get_structure("docked", docked_prediction_file)

        # get chains of HADDOCK dock: the TCR chain and the first other chain
        merged_tcr_chain = docked_prediction[0][tcr_chain_id]
        merged_antigen_chain = [
            chain
            for chain in docked_prediction.get_chains()
            if chain.id != merged_tcr_chain.id
        ][0]

        # get renumbering
        with open(haddock_renumbering_file, "r") as f:
            lines = f.readlines()

        try:
            antigen_renumbering_index = lines.index(
                "ANTIGEN RESIDUE RENUMBERING FOR HADDOCK\n"
            )
            antigen_renumber_indices = (
                antigen_renumbering_index + 1,
                -1,
            )
        except ValueError:
            # NOTE(review): with no antigen header and no separate antigen
            # file, the slices below treat the last line of the file as the
            # antigen section - confirm whether that path is ever intended.
            antigen_renumbering_index = -1
            antigen_renumber_indices = (
                -1,
                -1,
            )
        tcr_renumber_indices = (1, antigen_renumbering_index)

        # if antigen renumbering file is provided, get antigen renumbering from there
        if antigen_renumbering_file is not None:
            # drop any antigen section of the first file, keeping header + TCR lines
            lines = (
                lines[: antigen_renumber_indices[0] - 1]
                if antigen_renumber_indices[0] != -1
                else lines
            )
            # every line after the header is a TCR renumbering line; stopping
            # at len(lines) - 1 would silently drop the last of them
            tcr_renumber_indices = (1, len(lines))
            # + 1 skips the antigen header appended by the extend() below
            antigen_renumber_indices = (len(lines) + 1, -1)
            with open(antigen_renumbering_file, "r") as f:
                antigen_xtal_lines = f.readlines()
            antigen_renumbering_index = antigen_xtal_lines.index(
                "ANTIGEN RESIDUE RENUMBERING FOR HADDOCK\n"
            )
            lines.extend(antigen_xtal_lines[antigen_renumbering_index:])

        # renumber TCR by creating new PDB model and populating with residues
        # (blank lines, e.g. a trailing empty line, carry no mapping and are skipped)
        tcr_parsed_lines = [
            parse_renumbered_line(line)
            for line in lines[tcr_renumber_indices[0] : tcr_renumber_indices[1]]
            if line.strip()
        ]
        changed_tcr_chain_ids, _, _ = list(zip(*tcr_parsed_lines))

        tcr = Bio.PDB.Model.Model(id=0)

        # the chain with the fewest renumbered residues is treated as the one
        # whose remaining residues kept their numbering
        if len(set(changed_tcr_chain_ids)) > 1:
            id_for_conserved_numbered_chain = min(
                set(changed_tcr_chain_ids), key=changed_tcr_chain_ids.count
            )
        else:
            id_for_conserved_numbered_chain = merged_tcr_chain.id
        tcr.add(Bio.PDB.Chain.Chain(id=id_for_conserved_numbered_chain))
        try:
            tcr.add(
                Bio.PDB.Chain.Chain(
                    id=max(set(changed_tcr_chain_ids), key=changed_tcr_chain_ids.count)
                )
            )
            second_tcr_chain_id = None
        except Bio.PDB.PDBExceptions.PDBConstructionException:
            # chain ID already taken: fall back to the first free ID
            for id_to_try in "ABCDEFGH":
                try:
                    tcr.add(Bio.PDB.Chain.Chain(id=id_to_try))
                    second_tcr_chain_id = id_to_try
                    break
                except Bio.PDB.PDBExceptions.PDBConstructionException:
                    continue

        renumbered_residues = {}
        for renumbering in tcr_parsed_lines:
            try:
                # renumbering is (chain ID, original ID, HADDOCK ID)
                residue = merged_tcr_chain[renumbering[-1]]
                merged_tcr_chain.detach_child(renumbering[-1])
                residue.id = renumbering[1]
                target_chain_id = (
                    renumbering[0]
                    if second_tcr_chain_id is None
                    else second_tcr_chain_id
                )
                renumbered_residues.setdefault(target_chain_id, []).append(residue)
            except KeyError as e:
                warnings.warn(
                    f"""Renumbering {renumbering} failed with Key Error {e}"""
                )
        # residues still attached to the merged chain were never renumbered
        for residue in merged_tcr_chain.get_residues():
            renumbered_residues.setdefault(id_for_conserved_numbered_chain, []).append(
                residue
            )

        # sort the residues
        for chain_id in renumbered_residues:
            sorted_residues = sort_residues_by_imgt_numbering(
                renumbered_residues[chain_id]
            )
            for res in sorted_residues:
                tcr[chain_id].add(res)

        # renumber antigen
        antigen_parsed_lines = [
            parse_renumbered_line(line)
            for line in lines[antigen_renumber_indices[0] :]
            if line.strip()
        ]
        changed_antigen_chain_ids, _, _ = list(zip(*antigen_parsed_lines))
        try:
            tcr.add(Bio.PDB.Chain.Chain(id=merged_antigen_chain.id))
        except Bio.PDB.PDBExceptions.PDBConstructionException:
            # chain ID already taken: pick a free ID that is not the renumbered one
            for id_to_try in "ABCDEFGH":
                if id_to_try == set(changed_antigen_chain_ids).pop():
                    continue
                try:
                    tcr.add(Bio.PDB.Chain.Chain(id=id_to_try))
                    merged_antigen_chain.id = id_to_try
                    break
                except Bio.PDB.PDBExceptions.PDBConstructionException:
                    continue
        assert (
            len(set(changed_antigen_chain_ids)) == 1
        ), "More than one chain renumbered in renumbering file"
        try:
            tcr.add(Bio.PDB.Chain.Chain(id=set(changed_antigen_chain_ids).pop()))
            new_antigen_chain_id = None
        except Bio.PDB.PDBExceptions.PDBConstructionException:
            for id_to_try in "ABCDEFGH":
                if id_to_try == set(changed_antigen_chain_ids).pop():
                    continue
                try:
                    tcr.add(Bio.PDB.Chain.Chain(id=id_to_try))
                    new_antigen_chain_id = id_to_try
                    break
                except Bio.PDB.PDBExceptions.PDBConstructionException:
                    continue

        for renumbering in antigen_parsed_lines:
            try:
                residue = merged_antigen_chain[renumbering[-1]]
                merged_antigen_chain.detach_child(renumbering[-1])
                residue.id = renumbering[1]
                if new_antigen_chain_id is None:
                    tcr[renumbering[0]].add(residue)
                else:
                    tcr[new_antigen_chain_id].add(residue)
            except KeyError as e:
                warnings.warn(
                    f"""Renumbering {renumbering} failed with Key Error {e}"""
                )
        # residues still attached to the merged chain were never renumbered
        for residue in merged_antigen_chain.get_residues():
            tcr[merged_antigen_chain.id].add(residue)

        # create structure object and save
        tcr_struct = Bio.PDB.Structure.Structure(id=0)
        tcr_struct.add(tcr)

        pdb_io = Bio.PDB.PDBIO()
        pdb_io.set_structure(tcr_struct)
        filename = self._renumbered_output_path(docked_prediction_file)
        pdb_io.save(filename)
        return tcr

    def get_haddock_scores(self) -> "pandas.DataFrame":
        """Retrieve HADDOCK energy scores and RMSD evaluations from simulation output.

        Reads the space-separated file ``complex_HS_irmsd_lrmsd_fnat.list``
        from ``haddock_results_dir`` and labels its columns:

            "haddock_score", "interface_rmsd", "ligand_rmsd",
            "frac_common_contacts", "E_vdw", "E_elec", "E_air", "E_desolv",
            "ligand_rmsd_2", "cluster_id"

        Raises:
            FileNotFoundError: HADDOCK file containing scores not found.

        Returns:
            pandas.DataFrame: DataFrame with HADDOCK simulation metrics.
        """
        # imported here so pandas is only required when scores are requested
        import pandas as pd

        haddock_columns = [
            # 'idx',
            "haddock_score",
            "interface_rmsd",
            "ligand_rmsd",
            "frac_common_contacts",
            "E_vdw",
            "E_elec",
            "E_air",
            "E_desolv",
            "ligand_rmsd_2",
            "cluster_id",
        ]
        haddock_scores_file = "complex_HS_irmsd_lrmsd_fnat.list"
        try:
            return pd.read_csv(
                os.path.join(self.haddock_results_dir, haddock_scores_file),
                sep=" ",
                names=haddock_columns,
            )
        except FileNotFoundError as err:
            raise FileNotFoundError(
                f"File: complex_HS_irmsd_lrmsd_fnat.list containing HADDOCK docking metrics not found in {self.haddock_results_dir}"
            ) from err
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
def imgt_insertion_char_to_int(char: str) -> int:
    """
    Map an IMGT insertion letter to its 1-based position in the alphabet.

    Args:
        char (str): A single insertion character, e.g. "A".

    Returns:
        int: Code-point distance from "A" plus one, so "A" -> 1, "B" -> 2.
    """
    distance_from_a = ord(char) - ord("A")
    return distance_from_a + 1
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
def parse_renumbered_line(line: str) -> tuple:
    """
    Parses a renumbered line from a file and extracts the chain ID, original numbering, and HADDOCK numbering.

    The expected layout is ``(<chain>,(<het>,<number>,<icode>),(<het>,<number>,<icode>)``.
    Fields are stripped of surrounding whitespace; a field that is a
    (possibly negative) integer is returned as ``int``, an empty field is
    returned as a single space, and anything else is returned as a string.

    Args:
        line (str): The renumbered line to parse.

    Returns:
        tuple: A tuple containing the chain ID, original numbering, and HADDOCK numbering.

    Example:
        line = "(O,( ,3, ),( ,203, )"
        result = parse_renumbered_line(line)
        # Output: ('O', (' ', 3, ' '), (' ', 203, ' '))
    """
    # the chain ID is the single character right after the opening bracket
    chain_id = line[1]
    # non-greedy, so the first match still carries the "<chain>,(" prefix
    content = re.findall(r"\((.*?)\)", line)

    def to_residue_id(fields):
        residue_id = []
        for field in fields:
            field = field.strip()
            # str.isdigit() rejects a leading minus sign, which would leave
            # negative residue numbers as strings
            if re.fullmatch(r"-?\d+", field):
                residue_id.append(int(field))
            else:
                # blank hetero flags and insertion codes are a single space
                residue_id.append(field if field != "" else " ")
        return tuple(residue_id)

    original_numbering = to_residue_id(content[0].split("(")[-1].split(","))
    haddock_numbering = to_residue_id(content[1].split(","))
    return chain_id, original_numbering, haddock_numbering
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
def sort_residues_by_imgt_numbering(
    residues: "list[Bio.PDB.Residue]",
) -> "list[Bio.PDB.Residue]":
    """Return the residues ordered by IMGT numbering.

    Residues are ordered by residue number, then insertion code. Residues
    numbered 112 are the exception: within that run the insertion codes are
    placed in descending order.

    Args:
        residues (list[Bio.PDB.Residue]): List of IMGT numbered residues.

    Returns:
        list[Bio.PDB.Residue]: Sorted list of IMGT numbered residues.
    """
    ordered = sorted(residues, key=lambda residue: (residue.id[1], residue.id[2]))
    # slots currently occupied by residues numbered 112
    slots_112 = [
        position for position, residue in enumerate(ordered) if residue.id[1] == 112
    ]
    if slots_112:
        descending_112 = sorted(
            (ordered[position] for position in slots_112),
            key=lambda residue: residue.id[2],
            reverse=True,
        )
        # refill the same slots with the insertion codes in descending order
        for position, residue in zip(slots_112, descending_112):
            ordered[position] = residue
    return ordered
|