stcrpy 1.0.3__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,404 @@
1
+ # Adapted from https://github.com/bjornwallner/DockQ
2
+ # Reference: https://doi.org/10.1093/bioinformatics/btae586
3
+
4
+ # MIT License applies:
5
+ # MIT License
6
+
7
+ # Copyright (c) 2024 bjornwallner
8
+
9
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ # of this software and associated documentation files (the "Software"), to deal
11
+ # in the Software without restriction, including without limitation the rights
12
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ # copies of the Software, and to permit persons to whom the Software is
14
+ # furnished to do so, subject to the following conditions:
15
+
16
+ # The above copyright notice and this permission notice shall be included in all
17
+ # copies or substantial portions of the Software.
18
+
19
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ # SOFTWARE.
26
+
27
+ import warnings
28
+ from collections import namedtuple
29
+ import itertools
30
+ import json
31
+ from functools import partial
32
+ import logging
33
+ import numpy as np
34
+ import os
35
+ import tempfile
36
+ import shutil
37
+ import Bio
38
+
39
+ from DockQ import DockQ as dockq
40
+
41
+ from stcrpy import tcr_formats
42
+
43
+
44
class TCRDockQ:
    """Score predicted TCR:pMHC complexes against a reference structure with DockQ."""

    def __init__(
        self,
        TCR_pMHC_interface=True,
        mapping=None,
        small_molecule=None,
        capri_peptide=False,
        no_align=False,
        json=None,
        n_cpu=8,
        max_chunk=512,
        allowed_mismatches=0,
        verbose=False,
        short=False,
        print_results=False
    ):
        """
        Initialize the TCRDockQ class. See `DockQ --help` for more information on the arguments. Some of the keyword arguments are inherited from DockQ and are not relevant for TCR:pMHC complexes.

        Args:
            TCR_pMHC_interface (bool): Whether to evaluate the entire TCR-pMHC interface or to evaluate separate chains. Defaults to True. This creates one merged pseudo-chain for the TCR ("T") and one for the antigen and MHC ("M").
            capri_peptide (bool): Whether to use a CAPRI peptide for the docking. DockQ cannot be trusted for this setting.
            small_molecule (bool): Whether the ligand is a small molecule.
            short (bool): Short output
            json (str): Write outputs to a chosen json file. Any other truthy value writes to an automatically named file.
            verbose (bool): Verbose output
            no_align (bool): Do not align native and model using sequence alignments, but use the numbering of residues instead
            n_cpu (int): Number of cores to use
            max_chunk (int): Maximum size of chunks given to the cores, actual chunksize is min(max_chunk,combos/cpus)
            allowed_mismatches (int): Number of allowed mismatches when mapping model sequence to native sequence.
            mapping (str): DockQ-style chain mapping between model and native structure, e.g. 'AB:HL' when model chains A and B are models of native chains H and L. NOTE: the value is stored in `self.args`, but `tcr_dockq` does not read it; it derives the mapping from the chain types of the two complexes and uses 'TM:TM' when TCR_pMHC_interface is True.
            print_results (bool): Print the results to the console.
        """

        self.TCR_pMHC_interface = TCR_pMHC_interface
        self.args = {
            "mapping": mapping,
            "small_molecule": small_molecule,
            "capri_peptide": capri_peptide,
            "no_align": no_align,
            "json": json,
            "n_cpu": n_cpu,
            "max_chunk": max_chunk,
            "allowed_mismatches": allowed_mismatches,
            "verbose": verbose,
            "short": short,
            "print_results": print_results
        }

    def try_tcr_mapping(self, dock: "abTCR", reference: "abTCR"):
        """
        Pair up the chains of two complexes that have the same chain type.

        Args:
            dock (abTCR): Predicted complex; `get_chain_mapping()` must return {chain id: chain type}.
            reference (abTCR): Reference complex with the same accessor.

        Returns:
            tuple: `(mapping, string_mapping)` where `mapping` is `[dock_chain_ids, reference_chain_ids]`
            (two tuples in matching order) and `string_mapping` is the DockQ-style string
            '<dock ids>:<reference ids>'. `(None, None)` if the chains cannot be paired.
        """
        try:
            # Invert {chain id: type} to {type: chain id}. If several chains
            # share a type, the last one wins.
            dock_by_type = {
                chain_type: chain_id
                for chain_id, chain_type in dock.get_chain_mapping().items()
            }
            ref_by_type = {
                chain_type: chain_id
                for chain_id, chain_type in reference.get_chain_mapping().items()
            }
        except KeyError:
            return None, None

        pairs = [
            (dock_by_type[chain_type], ref_by_type[chain_type])
            for chain_type in dock_by_type
            if chain_type in ref_by_type
        ]
        if not pairs:
            # No chain type in common: there is nothing to compare.
            return None, None

        mapping = list(zip(*pairs))
        string_mapping = f"{''.join(mapping[0])}:{''.join(mapping[1])}"
        return mapping, string_mapping

    def retrieve_chain(self, tcr: "TCR", chain_id: str):
        """
        Return the chain object with id `chain_id` from a TCR complex.

        TCR variable chains are looked up on the TCR itself, antigen chains in
        `tcr.get_antigen()`, and every other chain type in the first MHC of
        `tcr.get_MHC()`.

        Raises:
            KeyError: If `chain_id` is not in the chain mapping of `tcr`.
            IndexError: If the chain is typed "Ag" but no antigen has that id.
        """
        chain_type = tcr.get_chain_mapping()[chain_id]
        if chain_type in ("VA", "VB", "VD", "VG"):
            return tcr[chain_id]
        if chain_type == "Ag":
            return [ag for ag in tcr.get_antigen() if ag.id == chain_id][0]
        return tcr.get_MHC()[0][chain_id]

    def filter_residues(self, dock: "abTCR", reference: "abTCR", mapping):
        """
        Restrict both complexes to the residues that align between paired chains.

        Args:
            dock (abTCR): Predicted complex.
            reference (abTCR): Reference complex.
            mapping: `[dock_chain_ids, reference_chain_ids]` as returned by `try_tcr_mapping`.

        Returns:
            tuple: `(filtered_dock, filtered_reference)`, copies of the inputs in which
            each mapped chain only holds residues aligned to a residue in its partner.
            The inputs are left unmodified.
        """
        filtered_dock = dock.copy()
        filtered_reference = reference.copy()

        for dock_id, ref_id in zip(mapping[0], mapping[1]):
            dock_chain = self.retrieve_chain(dock, dock_id)
            ref_chain = self.retrieve_chain(reference, ref_id)

            # Align the two chain sequences; positions aligned to a gap in
            # either sequence are dropped from both chains.
            dock_seq = tcr_formats.tcr_formats.get_sequences(dock_chain, amino_acids_only=False)[dock_chain.id]
            ref_seq = tcr_formats.tcr_formats.get_sequences(ref_chain, amino_acids_only=False)[ref_chain.id]
            aligner = Bio.Align.PairwiseAligner(match=1, mismatch=-1, gap_score=-1)
            alignment = aligner.align(dock_seq, ref_seq)[0]
            dock_indices, ref_indices = alignment.indices
            # -1 marks a gap in that sequence's row of the alignment.
            aligned = np.logical_and(ref_indices != -1, dock_indices != -1)

            # NOTE(review): assumes sequence position i corresponds to
            # child_list[i] of the chain - confirm against get_sequences.
            kept_dock_residues = [
                dock_chain.child_list[idx].copy() for idx in dock_indices[aligned]
            ]
            kept_ref_residues = [
                ref_chain.child_list[idx].copy() for idx in ref_indices[aligned]
            ]

            filtered_dock_chain = self.retrieve_chain(filtered_dock, dock_id)
            filtered_ref_chain = self.retrieve_chain(filtered_reference, ref_id)

            # Empty the copied chains (ids taken from the untouched originals,
            # so we never mutate a chain while iterating over it) ...
            for residue in dock_chain:
                filtered_dock_chain.detach_child(residue.id)
            for residue in ref_chain:
                filtered_ref_chain.detach_child(residue.id)

            # ... and refill them with the aligned residues only.
            for residue in kept_dock_residues:
                filtered_dock_chain.add(residue)
            for residue in kept_ref_residues:
                filtered_ref_chain.add(residue)

        return filtered_dock, filtered_reference

    def tcr_to_structure(
        self, tcr: "abTCR", mapping_filter=None, merge_chains=True
    ) -> "Bio.PDB.Model.Model":
        """
        Build a Biopython model holding the chains of a TCR:pMHC complex.

        Args:
            tcr (abTCR): Complex to convert.
            mapping_filter: Chain ids to include when merging. `None` keeps every chain.
                Only used when `merge_chains` is True.
            merge_chains (bool): If True, merge the TCR chains into one chain "T" and the
                antigen and MHC chains into one chain "M". If False, add every TCR,
                antigen and MHC chain individually.

        Returns:
            Bio.PDB.Model.Model: Model 0 with `is_het` and `sequence` set on every chain.
        """
        model = Bio.PDB.Model.Model(0)

        if merge_chains:
            def keep(chain):
                return mapping_filter is None or chain.id in mapping_filter

            tcr_chain = tcr_formats.tcr_formats.merge_chains(
                [c for c in tcr.get_chains() if keep(c)], new_chain_id="T"
            )
            tcr_chain.is_het = False
            tcr_chain.sequence = tcr_formats.tcr_formats.get_sequences(tcr_chain)["T"]
            model.add(tcr_chain)

            # Copy the antigen list before extending it, so the list returned
            # by the complex is never mutated.
            antigen_MHC_chains = list(tcr.get_antigen())
            antigen_MHC_chains.extend(
                c for m in tcr.get_MHC() for c in m.get_chains()
            )
            pMHC_chain = tcr_formats.tcr_formats.merge_chains(
                [c for c in antigen_MHC_chains if keep(c)], new_chain_id="M"
            )
            pMHC_chain.is_het = False
            pMHC_chain.sequence = tcr_formats.tcr_formats.get_sequences(pMHC_chain)["M"]
            model.add(pMHC_chain)

        else:
            sequences = tcr_formats.tcr_formats.get_sequences(tcr)
            for chain in tcr.get_chains():
                chain.is_het = False
                chain.sequence = sequences[chain.id]
                model.add(chain)
            for antigen in tcr.get_antigen():
                sequences = tcr_formats.tcr_formats.get_sequences(antigen)
                antigen.is_het = False
                antigen.sequence = sequences[antigen.id]
                model.add(antigen)
            for mhc in tcr.get_MHC():
                sequences = tcr_formats.tcr_formats.get_sequences(mhc)
                for chain in mhc.get_chains():
                    chain.is_het = False
                    chain.sequence = sequences[chain.id]
                    model.add(chain)
        return model

    def tcr_dockq(self, dock: "abTCR", reference: "abTCR", save_merged_complex: bool = False) -> "dict | None":
        """
        Calculate DockQ metrics for a TCR-pMHC complex.

        This method evaluates the quality of a predicted TCR-pMHC complex (the "dock" structure)
        against a reference (native) structure using the DockQ scoring system. It supports both
        merged TCR-pMHC interfaces and separate chain evaluations, depending on the class setting.

        Args:
            dock (abTCR): The predicted TCR-pMHC complex structure to be evaluated.
            reference (abTCR): The reference (native) TCR-pMHC complex structure.
            save_merged_complex (bool, optional): If True, saves the structures handed to DockQ
                as PDB files in the current working directory. Defaults to False.

        Returns:
            dict | None: A dictionary with the keys "model", "native", "best_dockq",
            "best_result", "GlobalDockQ", "best_mapping" and "best_mapping_str", or None
            if the chains cannot be mapped, fewer than two chains are available, or no
            native interface is found.
        """

        mapping, string_mapping = self.try_tcr_mapping(dock, reference)
        if mapping is None:
            logging.error(
                "Could not map the chains of the model to the chains of the native structure."
            )
            return None

        filtered_dock, filtered_reference = self.filter_residues(
            dock, reference, mapping
        )

        model_structure = self.tcr_to_structure(
            filtered_dock,
            mapping_filter=mapping[0],
            merge_chains=self.TCR_pMHC_interface,
        )
        native_structure = self.tcr_to_structure(
            filtered_reference,
            mapping_filter=mapping[1],
            merge_chains=self.TCR_pMHC_interface,
        )

        model_name = f"{dock.parent.parent.id}_{dock.id}"
        native_name = f"{reference.parent.parent.id}_{reference.id}"
        model_file = f"model_structure_{model_name}.pdb"
        native_file = f"native_structure_{native_name}.pdb"

        # DockQ loads structures from disk, so round-trip both models through
        # temporary PDB files.
        with tempfile.TemporaryDirectory() as tmpdir:
            model_path = os.path.join(tmpdir, model_file)
            native_path = os.path.join(tmpdir, native_file)

            io = Bio.PDB.PDBIO()
            io.set_structure(model_structure)
            io.save(model_path)
            io.set_structure(native_structure)
            io.save(native_path)

            if self.TCR_pMHC_interface:
                # The merged pseudo-chains are always named "T" and "M".
                string_mapping = "TM:TM"

            initial_mapping, model_chains, native_chains = dockq.format_mapping(
                string_mapping, small_molecule=self.args["small_molecule"]
            )

            model_structure = dockq.load_PDB(
                model_path,
                chains=model_chains,
                small_molecule=self.args["small_molecule"],
            )
            native_structure = dockq.load_PDB(
                native_path,
                chains=native_chains,
                small_molecule=self.args["small_molecule"],
            )

            if save_merged_complex:
                shutil.copyfile(model_path, model_file)
                shutil.copyfile(native_path, native_file)

        # Fall back to every chain in the structure when no chains were given.
        model_chains = (
            [c.id for c in model_structure] if not model_chains else model_chains
        )
        native_chains = (
            [c.id for c in native_structure] if not native_chains else native_chains
        )

        if len(model_chains) < 2 or len(native_chains) < 2:
            print("Need at least two chains in the two inputs\n")
            return None

        # permute chains and run on a for loop
        best_dockq = -1
        best_result = None
        best_mapping = None

        model_chains_to_combo = [
            mc for mc in model_chains if mc not in initial_mapping.values()
        ]
        native_chains_to_combo = [
            nc for nc in native_chains if nc not in initial_mapping.keys()
        ]

        chain_clusters, reverse_map = dockq.group_chains(
            model_structure,
            native_structure,
            model_chains_to_combo,
            native_chains_to_combo,
            self.args["allowed_mismatches"],
        )
        chain_maps = dockq.get_all_chain_maps(
            chain_clusters,
            initial_mapping,
            reverse_map,
            model_chains_to_combo,
            native_chains_to_combo,
        )

        num_chain_combinations = dockq.count_chain_combinations(chain_clusters)
        # copy iterator to use later
        chain_maps, chain_maps_ = itertools.tee(chain_maps)

        low_memory = num_chain_combinations > 100
        run_chain_map = partial(
            dockq.run_on_all_native_interfaces,
            model_structure,
            native_structure,
            no_align=self.args["no_align"],
            capri_peptide=self.args["capri_peptide"],
            low_memory=low_memory,
        )

        if num_chain_combinations > 1:
            cpus = min(num_chain_combinations, self.args["n_cpu"])
            chunk_size = min(
                self.args["max_chunk"], max(1, num_chain_combinations // cpus)
            )

            # for large num_chain_combinations it should be possible to divide the chain_maps in chunks
            result_this_mappings = dockq.progress_map(
                run_chain_map,
                chain_maps,
                total=num_chain_combinations,
                n_cpu=cpus,
                chunk_size=chunk_size,
            )

            for chain_map, (result_this_mapping, total_dockq) in zip(
                chain_maps_, result_this_mappings
            ):
                if total_dockq > best_dockq:
                    best_dockq = total_dockq
                    best_result = result_this_mapping
                    best_mapping = chain_map

            if low_memory:
                # retrieve the full output by rerunning the best chain mapping
                best_result, total_dockq = dockq.run_on_all_native_interfaces(
                    model_structure,
                    native_structure,
                    chain_map=best_mapping,
                    no_align=self.args["no_align"],
                    capri_peptide=self.args["capri_peptide"],
                    low_memory=False,
                )

        else:  # skip multi-threading for single jobs (skip the bar basically)
            best_mapping = next(chain_maps)
            best_result, best_dockq = run_chain_map(best_mapping)

        if not best_result:
            logging.error(
                "Could not find interfaces in the native model. Please double check the inputs or select different chains with the --mapping flag."
            )
            return None

        info = dict()
        info["model"] = model_name
        info["native"] = native_name
        info["best_dockq"] = best_dockq
        info["best_result"] = best_result
        info["GlobalDockQ"] = best_dockq / len(best_result)
        info["best_mapping"] = best_mapping
        info["best_mapping_str"] = f"{dockq.format_mapping_string(best_mapping)}"

        if self.args["json"]:
            if not isinstance(self.args["json"], str):
                json_file = f"dockq_{model_structure.id}_{dock.id}_{native_structure.id}_{reference.id}.json"
            else:
                json_file = self.args["json"]
            with open(json_file, "w") as fp:
                json.dump(info, fp)

        if self.args["print_results"]:
            dockq.print_results(
                info,
                self.args["short"],
                self.args["verbose"],
                self.args["capri_peptide"],
                self.args["small_molecule"],
            )

        return info