stcrpy 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. examples/__init__.py +0 -0
  2. examples/egnn.py +425 -0
  3. stcrpy/__init__.py +5 -0
  4. stcrpy/tcr_datasets/__init__.py +0 -0
  5. stcrpy/tcr_datasets/tcr_graph_dataset.py +499 -0
  6. stcrpy/tcr_datasets/tcr_selector.py +0 -0
  7. stcrpy/tcr_datasets/tcr_structure_dataset.py +0 -0
  8. stcrpy/tcr_datasets/utils.py +350 -0
  9. stcrpy/tcr_formats/__init__.py +0 -0
  10. stcrpy/tcr_formats/tcr_formats.py +114 -0
  11. stcrpy/tcr_formats/tcr_haddock.py +556 -0
  12. stcrpy/tcr_geometry/TCRCoM.py +350 -0
  13. stcrpy/tcr_geometry/TCRCoM_LICENCE +168 -0
  14. stcrpy/tcr_geometry/TCRDock.py +261 -0
  15. stcrpy/tcr_geometry/TCRGeom.py +450 -0
  16. stcrpy/tcr_geometry/TCRGeomFiltering.py +273 -0
  17. stcrpy/tcr_geometry/__init__.py +0 -0
  18. stcrpy/tcr_geometry/reference_data/__init__.py +0 -0
  19. stcrpy/tcr_geometry/reference_data/dock_reference_1_imgt_numbered.pdb +6549 -0
  20. stcrpy/tcr_geometry/reference_data/dock_reference_2_imgt_numbered.pdb +6495 -0
  21. stcrpy/tcr_geometry/reference_data/reference_A.pdb +31 -0
  22. stcrpy/tcr_geometry/reference_data/reference_B.pdb +31 -0
  23. stcrpy/tcr_geometry/reference_data/reference_D.pdb +31 -0
  24. stcrpy/tcr_geometry/reference_data/reference_G.pdb +31 -0
  25. stcrpy/tcr_geometry/reference_data/reference_data.py +104 -0
  26. stcrpy/tcr_interactions/PLIPParser.py +147 -0
  27. stcrpy/tcr_interactions/TCRInteractionProfiler.py +433 -0
  28. stcrpy/tcr_interactions/TCRpMHC_PLIP_Model_Parser.py +133 -0
  29. stcrpy/tcr_interactions/__init__.py +0 -0
  30. stcrpy/tcr_interactions/utils.py +170 -0
  31. stcrpy/tcr_methods/__init__.py +0 -0
  32. stcrpy/tcr_methods/tcr_batch_operations.py +223 -0
  33. stcrpy/tcr_methods/tcr_methods.py +150 -0
  34. stcrpy/tcr_methods/tcr_reformatting.py +18 -0
  35. stcrpy/tcr_metrics/__init__.py +2 -0
  36. stcrpy/tcr_metrics/constants.py +39 -0
  37. stcrpy/tcr_metrics/tcr_interface_rmsd.py +237 -0
  38. stcrpy/tcr_metrics/tcr_rmsd.py +179 -0
  39. stcrpy/tcr_ml/__init__.py +0 -0
  40. stcrpy/tcr_ml/geometry_predictor.py +3 -0
  41. stcrpy/tcr_processing/AGchain.py +89 -0
  42. stcrpy/tcr_processing/Chemical_components.py +48915 -0
  43. stcrpy/tcr_processing/Entity.py +301 -0
  44. stcrpy/tcr_processing/Fragment.py +58 -0
  45. stcrpy/tcr_processing/Holder.py +24 -0
  46. stcrpy/tcr_processing/MHC.py +449 -0
  47. stcrpy/tcr_processing/MHCchain.py +149 -0
  48. stcrpy/tcr_processing/Model.py +37 -0
  49. stcrpy/tcr_processing/Select.py +145 -0
  50. stcrpy/tcr_processing/TCR.py +532 -0
  51. stcrpy/tcr_processing/TCRIO.py +47 -0
  52. stcrpy/tcr_processing/TCRParser.py +1230 -0
  53. stcrpy/tcr_processing/TCRStructure.py +148 -0
  54. stcrpy/tcr_processing/TCRchain.py +160 -0
  55. stcrpy/tcr_processing/__init__.py +3 -0
  56. stcrpy/tcr_processing/annotate.py +480 -0
  57. stcrpy/tcr_processing/utils/__init__.py +0 -0
  58. stcrpy/tcr_processing/utils/common.py +67 -0
  59. stcrpy/tcr_processing/utils/constants.py +367 -0
  60. stcrpy/tcr_processing/utils/region_definitions.py +782 -0
  61. stcrpy/utils/__init__.py +0 -0
  62. stcrpy/utils/error_stream.py +12 -0
  63. stcrpy-1.0.0.dist-info/METADATA +173 -0
  64. stcrpy-1.0.0.dist-info/RECORD +68 -0
  65. stcrpy-1.0.0.dist-info/WHEEL +5 -0
  66. stcrpy-1.0.0.dist-info/licenses/LICENCE +28 -0
  67. stcrpy-1.0.0.dist-info/licenses/stcrpy/tcr_geometry/TCRCoM_LICENCE +168 -0
  68. stcrpy-1.0.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,1230 @@
1
+ """
2
+ Created on 3 April 2024
3
+ @author: Nele Quast, based on leem
4
+
5
+ TCRParser object which is based on ABDB's AntibodyParser and BioPython's PDB parser.
6
+ """
7
+
8
+ from itertools import combinations, product
9
+ import sys
10
+ import os
11
+ from collections import defaultdict
12
+
13
+ from Bio.PDB.PDBParser import PDBParser
14
+ from Bio.PDB.MMCIFParser import MMCIFParser
15
+ from Bio.PDB import NeighborSearch
16
+
17
+ # TCRDB
18
+ from .annotate import annotate, extract_sequence, align_numbering
19
+
20
+ from ..utils.error_stream import ErrorStream
21
+
22
+ from .TCRStructure import TCRStructure
23
+ from .Model import Model
24
+ from .TCR import TCR, abTCR, gdTCR
25
+ from .MHC import MHC, MH1, MH2, CD1, MR1, scMH1, scCD1
26
+ from .Holder import Holder
27
+ from .TCRchain import TCRchain
28
+ from .MHCchain import MHCchain
29
+ from .AGchain import AGchain
30
+ from Bio.PDB.Residue import Residue
31
+ from .Fragment import Fragment
32
+ from .Chemical_components import is_aa, is_common_buffer, get_res_type, is_carbohydrate
33
+
34
# Upper bounds used by TCRParser._pair_mhc when deciding whether two MHC chains
# belong together. Keys are "<chain type>:<chain type>" with the two chain
# types sorted in reverse alphabetical order; each value lines up, position by
# position, with the four CA-CA distances that _pair_mhc measures.
MHC_CUTOFF = {
    pairing: list(limits)
    for pairing, limits in (
        ("MH1:B2M", (32, 37, 32, 32)),
        ("CD1:B2M", (32, 37, 32, 32)),
        ("GA1L:B2M", (33, 37, 33, 32)),  # added for GA1L case 2po6
        ("MR1:B2M", (32, 37, 32, 32)),
        ("GA1:B2M", (32, 37, 32, 32)),
        ("GB:GA", (22, 32, 35, 29)),
    )
}
42
+
43
+
44
+ class TCRParser(PDBParser, MMCIFParser):
45
def __init__(self, PERMISSIVE=True, get_header=True, QUIET=False):
    """
    Initialise the TCR parser.

    Two Biopython parsers are created and kept on the instance, one for PDB
    files and one for mmCIF files. Structures are numbered with anarci using
    the IMGT numbering scheme and the IMGT CDR definition.

    PERMISSIVE: forwarded to the PDB parser.
    get_header: forwarded to the PDB parser.
    QUIET: forwarded to both parsers and stored as self.QUIET.
    """
    # Keyword arguments are deliberate. Passing QUIET positionally to
    # MMCIFParser lands on a different parameter in Biopython releases whose
    # signature is (structure_builder, auth_chains, auth_residues, QUIET):
    # QUIET would be ignored and the chain-id handling silently altered.
    self.pdb_parser = PDBParser(
        PERMISSIVE=PERMISSIVE,
        get_header=get_header,
        structure_builder=None,
        QUIET=QUIET,
    )
    self.mmcif_parser = MMCIFParser(structure_builder=None, QUIET=QUIET)
    self.QUIET = QUIET

    # Structures are numbered using anarci.
    self.numbering_method = "anarci"

    # Numbering scheme and CDR definition; IMGT is used for both. Kept on the
    # instance for reference.
    self.numbering_scheme = "imgt"
    self.definition = "imgt"
59
+
60
+ def _create_chain(self, chain, new_chain_id, numbering, chain_type):
61
+ """
62
+ Create a new TCR or MHC chain.
63
+ Residues before the numbered region are now ignored.
64
+ """
65
+ if chain_type in ["D", "A", "B", "G"]:
66
+ newchain = TCRchain(new_chain_id)
67
+ elif chain_type in [
68
+ "MH1",
69
+ "CD1",
70
+ "B2M",
71
+ "GA",
72
+ "GB",
73
+ "GA1L",
74
+ "GA2L",
75
+ "GA1",
76
+ "GA2",
77
+ "MR1",
78
+ ]:
79
+ newchain = MHCchain(new_chain_id)
80
+
81
+ newchain.numbering = numbering
82
+ unnumbered_list = []
83
+ added = False
84
+
85
+ for residue in chain.get_list():
86
+ # check whether to add the residue to the new chain (have we got numbering for it)
87
+ add = False
88
+ if residue.id in numbering:
89
+ if numbering[residue.id]:
90
+ add = True
91
+ res_id = (
92
+ residue.id[0],
93
+ numbering[residue.id][0],
94
+ numbering[residue.id][1],
95
+ ) # field and reannotated things.
96
+
97
+ # if we should add it, add it
98
+ if add:
99
+ added = True
100
+ newresidue = Residue(res_id, residue.resname, residue.segid)
101
+ for atom in residue.get_list():
102
+ newresidue.add(atom.copy())
103
+ newresidue.imgt_numbered = True
104
+ newchain.add(newresidue)
105
+
106
+ # else add it to the unnumbered list - this will include the HETATOMs - analyse them to find haptens.
107
+ elif added:
108
+ unnumbered_list.append(residue)
109
+
110
+ # add the unnumbered residues into the chain - renumbering so that they follow on from the numbered regions.
111
+ ended = sorted([i for i in numbering.values() if i != ""])[-1][0]
112
+ for residue in unnumbered_list:
113
+ ended += 1
114
+ res_id = (residue.id[0], ended, " ")
115
+ newresidue = Residue(res_id, residue.resname, residue.segid)
116
+ for atom in residue.get_list():
117
+ newresidue.add(atom.copy())
118
+ newchain.add(newresidue)
119
+ newchain.add_unnumbered(newresidue)
120
+
121
+ newchain.analyse(chain_type)
122
+ return newchain
123
+
124
def _create_scTCR_chains(
    self, chain, new_chain_id, numbering_1, numbering_2, chain_type1, chain_type2
):
    """
    Split one single-chain TCR chain into two TCR chains to be paired as a TCR.

    The first chain gets the lower-cased chain id and numbering_1, the second
    the upper-cased id and numbering_2. If the second domain is of type "G" or
    "A" (and the first is not), the two domains are swapped first. Residues
    that appear after the first numbered residue and are in neither numbering
    are appended to the first chain only, numbered consecutively after the
    highest position of numbering_1, and registered with add_unnumbered.

    chain: the parsed chain holding both variable domains.
    new_chain_id: id of the original chain; its lower/upper-case forms name the
        two new chains.
    numbering_1, numbering_2: mappings from original residue id to new
        numbering, one per domain.
    chain_type1, chain_type2: chain type of each domain.

    Returns (newchain1, newchain2) after analyse() has been called on both.
    """
    swap_types = ("G", "A")
    # Swap the domains when the second one is gamma/alpha. The extra check on
    # chain_type1 prevents endless recursion if both domains are G/A.
    if chain_type2 in swap_types and chain_type1 not in swap_types:
        return self._create_scTCR_chains(
            chain, new_chain_id, numbering_2, numbering_1, chain_type2, chain_type1
        )

    newchain1 = TCRchain(new_chain_id.lower())
    newchain2 = TCRchain(new_chain_id.upper())
    newchain1.numbering = numbering_1
    newchain2.numbering = numbering_2

    unnumbered_set = set()
    # NOTE(review): this flag is shared by both passes below, so it is already
    # True for the whole second pass - confirm that is intended.
    added = False
    numbered_pos = set(numbering_1.keys()) | set(numbering_2.keys())

    for newchain, numbering in ((newchain1, numbering_1), (newchain2, numbering_2)):
        for residue in chain.get_list():
            # Only residues with a non-empty numbering entry are copied.
            if residue.id in numbering and numbering[residue.id]:
                added = True
                res_id = (
                    residue.id[0],
                    numbering[residue.id][0],
                    numbering[residue.id][1],
                )  # hetero field plus the re-annotated number and insertion code
                newresidue = Residue(res_id, residue.resname, residue.segid)
                for atom in residue.get_list():
                    newresidue.add(atom.copy())
                newresidue.imgt_numbered = True
                newchain.add(newresidue)

            # Otherwise collect it as unnumbered - this will include the
            # HETATMs, which are analysed later to find haptens.
            elif added and residue.id not in numbered_pos:
                unnumbered_set.add(residue)

    # Last numbered position of the first domain. Empty entries are skipped,
    # as in _create_chain, so that they cannot break the sort.
    ended = sorted([i for i in numbering_1.values() if i != ""])[-1][0]

    # Unnumbered residues follow on from the numbered region of the first chain.
    for residue in sorted(unnumbered_set, key=lambda z: z.id[1]):
        ended += 1
        res_id = (residue.id[0], ended, " ")
        newresidue = Residue(res_id, residue.resname, residue.segid)
        for atom in residue.get_list():
            newresidue.add(atom.copy())

        newchain1.add(newresidue)
        newchain1.add_unnumbered(newresidue)

    newchain1.analyse(chain_type1)
    newchain2.analyse(chain_type2)

    return newchain1, newchain2
195
+
196
def get_tcr_structure(
    self, id, file, prenumbering=None, ali_dict=None, crystal_contacts=None
):
    """
    Parse a structure file and post-process it into a TCR context.

    Every model of the parsed structure is rebuilt chain by chain: each chain
    is numbered (with annotate, or from the supplied prenumbering) and copied
    into a TCR chain, an MHC chain or an antigen chain. TCR chains are then
    paired into TCR objects, MHC chains are paired into MHC objects (or wrapped
    as single-chain MHCs), and _match_units is called on the result.

    id: a string to identify the structure
    file: the path to the structure file; the extensions ".pdb", ".cif" and
        ".mmcif" are recognised (case-insensitive)

    optional:
        prenumbering: prenumbering for the chains in the structure, keyed by
            chain id.
        ali_dict: forwarded to _prenumbered together with prenumbering.
        crystal_contacts: forwarded to _match_units.

    Returns the new TCRStructure; the collected warnings are attached to it as
    .warnings and, unless QUIET is set, echoed to stderr.
    Raises ValueError if the file extension is not recognised or if an MHC
    pairing cannot be assigned to an MHC class.
    """
    # Fresh containers per call instead of shared mutable default arguments.
    if ali_dict is None:
        ali_dict = {}
    if crystal_contacts is None:
        crystal_contacts = []

    unusual_pairing = (
        "Unusual pairing between %s (V%s) and %s (V%s) has been detected. "
        "Treating as separate TCR chains.\n"
    )

    def _assemble_tcr(first, second):
        # Build the TCR object for two paired chains, or return None when the
        # combination of chain types is not recognised.
        observed = {first.chain_type, second.chain_type}
        if observed <= {"A", "B"}:
            return abTCR(first, second)
        if observed <= {"G", "D"}:
            return gdTCR(first, second)
        if observed <= {"B", "D"}:
            # initial way to deal with anarci misclassification of alpha
            # chains as delta chains (a dedicated dbTCR is not used here)
            return abTCR(first, second)
        return None

    self.warnings = ErrorStream()

    # get a structure object from biopython.
    _, ext = os.path.splitext(file)
    ext = ext.lower()
    if ext == ".pdb":
        structure = self.pdb_parser.get_structure(id, file)
        self.current_parser = self.pdb_parser
    elif ext in (".cif", ".mmcif"):
        structure = self.mmcif_parser.get_structure(id, file)
        self.current_parser = self.mmcif_parser
    else:
        message = f"Unrecognised structure file format: {file}"
        self.warnings.write(message)
        raise ValueError(message)

    # Create a new TCRStructure object
    tcrstructure = TCRStructure(structure.id)

    # Set and analyse header information
    tcrstructure.set_header(structure.header)
    self._analyse_header(tcrstructure)

    tcr_chain_types = ("G", "D", "B", "A")
    mhc_chain_types = (
        "MH1",
        "CD1",
        "GA",
        "GB",
        "B2M",
        "GA1L",
        "GA2L",
        "GA1",
        "GA2",
        "MR1",
    )

    # iterate backwards through the model list - delete old structure as we go
    # e.g. NMR structures will be extremely memory expensive (72 models!)
    for mid in range(len(structure.child_list) - 1, -1, -1):
        # add a model to the TCR structure
        model = structure.child_list[mid]
        newmodel = Model(model.id)
        tcrstructure.add(newmodel)

        # holder objects for TCR, MHC and non-TCR/non-MHC (antigen) chains.
        agchains = Holder("Antigen")
        trchains = Holder("TCRchain")
        mhchains = Holder("MHCchain")
        newmodel.add(agchains)
        newmodel.add(trchains)
        newmodel.add(mhchains)

        # iterate over the chains in the model
        for chain in model.get_list():
            # Only annotate() supplies germline information; reset it for every
            # chain so that prenumbered chains neither fail on an unbound name
            # nor inherit the value of the previous chain.
            germline_info = None

            # try to number the sequence found in the structure
            if prenumbering and chain.id in prenumbering:
                if len(prenumbering[chain.id]) == 2:
                    numbering = [{}, {}]
                    region_types = ["", ""]

                    numbering[0], region_types[0] = self._prenumbered(
                        chain, prenumbering, ali_dict, n=0
                    )
                    numbering[1], region_types[1] = self._prenumbered(
                        chain, prenumbering, ali_dict, n=1
                    )
                    rtypes = sorted(region_types)

                    # Check that we have a beta/alpha domain or gamma/delta domain
                    if rtypes == ["A", "B"] or rtypes == ["D", "G"]:
                        chain_type = "".join(region_types)
                        scTCR = True
                    # if not, just take the first region and warn the user
                    else:
                        chain_type = region_types[0]
                        numbering = numbering[0]
                        scTCR = False
                        print(
                            "Warning multiple variable regions of the same type (%s) found on chain %s.\nTaking the first variable region only."
                            % (chain_type, chain.id),
                            file=self.warnings,
                        )

                elif prenumbering[chain.id][0][-1] not in ["B", "A", "D", "G"]:
                    # annotate() yields four values (see the fallback below);
                    # unpacking only three of them raised a ValueError here.
                    numbering, chain_type, germline_info, scTCR = annotate(chain)

                else:
                    numbering, chain_type = self._prenumbered(
                        chain, prenumbering, ali_dict, n=0
                    )
                    scTCR = False

            else:
                numbering, chain_type, germline_info, scTCR = annotate(chain)

            chain_details = tcrstructure.header["chain_details"]
            if chain.id in chain_details:
                details = chain_details[chain.id]
                engineered = details["engineered"]
            else:
                engineered = False
                details = {"molecule": "unknown", "engineered": False}

            details["genetic_origin"] = germline_info

            if numbering and chain_type in tcr_chain_types:
                # create a new TCR chain
                newchain = self._create_chain(chain, chain.id, numbering, chain_type)
                newchain.set_engineered(engineered)
                newchain.xtra.update(details)
                trchains.add(newchain)

            elif numbering and chain_type in mhc_chain_types:
                # create a new MHC chain
                newchain = self._create_chain(chain, chain.id, numbering, chain_type)
                newchain.set_engineered(engineered)
                newchain.xtra.update(details)
                mhchains.add(newchain)

            elif numbering and scTCR:
                # Separate numbering into two domains
                types = list(chain_type)
                domain1, domain2 = numbering

                chain1, chain2 = self._create_scTCR_chains(
                    chain, chain.id, domain1, domain2, types[0], types[1]
                )
                for sc_chain in (chain1, chain2):
                    sc_chain.set_engineered(engineered)
                    sc_chain.xtra.update(details)

                # We know this is a TCR -- except for 2p1y.
                if (
                    chain1.child_dict[(" ", 104, " ")]["CA"]
                    - chain2.child_dict[(" ", 104, " ")]["CA"]
                ) <= 22:
                    tcr = _assemble_tcr(chain1, chain2)
                    if tcr is None:
                        # Previously this left tcr unbound (or pointing at the
                        # TCR of an earlier chain); handle it like an unusual
                        # pairing instead.
                        self.warnings.write(
                            unusual_pairing
                            % (
                                chain1.id,
                                chain1.chain_type,
                                chain2.id,
                                chain2.chain_type,
                            )
                        )
                        trchains.add(chain1)
                        trchains.add(chain2)
                    else:
                        tcr.scTCR = True
                        newmodel.add(tcr)
                        if chain1.id in trchains:
                            trchains.detach_child(chain1.id)
                        if chain2.id in trchains:
                            trchains.detach_child(chain2.id)

                else:
                    trchains.add(chain1)
                    trchains.add(chain2)

            # add chain to "antigen" chains
            else:
                newchain = self._create_ag_chain(chain)
                newchain.set_engineered(engineered)
                newchain.xtra.update(details)
                agchains.add(newchain)

        # try to pair the TCR chains to form TCRs. Use a heuristic for now.
        # The scTCR and non-scTCR cases used two identical copies of this loop;
        # with no unpaired TCR chains _pair_chains simply yields no pairs.
        for pair in self._pair_chains(trchains):
            trchains.detach_child(pair[0].id)
            trchains.detach_child(pair[1].id)

            tcr = _assemble_tcr(pair[0], pair[1])
            if tcr is None:
                self.warnings.write(
                    unusual_pairing
                    % (
                        pair[0].id,
                        pair[0].chain_type,
                        pair[1].id,
                        pair[1].chain_type,
                    )
                )
                trchains.add(pair[0])
                trchains.add(pair[1])
                continue
            newmodel.add(tcr)

        # Pair up the MHC chains -- whether that's GA and GB or MH1 with B2M
        for pair in self._pair_mhc(mhchains):
            mhchains.detach_child(pair[0].id)
            mhchains.detach_child(pair[1].id)

            obs_chaintypes = {pair[0].chain_type, pair[1].chain_type}
            if (
                obs_chaintypes <= {"MH1", "B2M"}
                or obs_chaintypes <= {"GA1", "GA2"}
                or obs_chaintypes <= {"GA1", "B2M"}
            ):
                mhc = MH1(pair[0], pair[1])
            elif obs_chaintypes <= {"GA", "GB"}:
                mhc = MH2(pair[0], pair[1])
            elif (
                obs_chaintypes <= {"CD1", "B2M"}
                or obs_chaintypes <= {"GA1L", "GA2L"}
                or obs_chaintypes <= {"GA1L", "B2M"}
            ):
                mhc = CD1(pair[0], pair[1])
            elif obs_chaintypes <= {"MR1", "B2M"}:
                mhc = MR1(pair[0], pair[1])
            else:
                raise ValueError(f'MHC pairing {pair} could not be assigned.')

            newmodel.add(mhc)

        # allow instantiation of single chain MH1 type MH class if the alpha
        # helices forming chain has been observed
        ids_to_detach = []
        for mhc_chain in mhchains:
            if mhc_chain.chain_type in ["MH1", "GA1", "GA2"]:
                ids_to_detach.append(mhc_chain.id)
                newmodel.add(scMH1(mhc_chain))
            elif mhc_chain.chain_type in ["CD1", "GA1L"]:
                ids_to_detach.append(mhc_chain.id)
                newmodel.add(scCD1(mhc_chain))

        # Detach after the loop so the holder is not modified while iterating.
        for mhc_chain_id in ids_to_detach:
            mhchains.detach_child(mhc_chain_id)

        # Match MHC+antigen complex with a TCR
        self._match_units(newmodel, trchains, mhchains, agchains, crystal_contacts)

        # delete the processed model (goes backwards so indexing is not affected)
        del structure.child_list[mid]

        # Delete empty holders
        empty_holders = [
            holder.id for holder in newmodel.child_list if not holder.child_list
        ]
        for holder_id in empty_holders:
            newmodel.detach_child(holder_id)

    del structure
    if not self.QUIET and self.warnings.log:
        sys.stderr.write("\n".join(self.warnings.log))
        sys.stderr.write("\n")
    tcrstructure.warnings = self.warnings

    return tcrstructure
498
+
499
+ def _analyse_header(self, header):
500
+ """
501
+ Analysis of the header that has been parsed by Biopython
502
+ We add information for the various chains and have a look for engineered and hapten flags.
503
+ Add more information to this parser.
504
+ """
505
+ if isinstance(header, TCRStructure):
506
+ header = header.get_header()
507
+ elif not header:
508
+ header = {}
509
+
510
+ header["chain_details"] = {}
511
+ if "compound" in header:
512
+ for compound in header["compound"]:
513
+ # iteration over details.
514
+ if "chain" in header["compound"][compound]:
515
+ # get the chains that the compound is refering to.
516
+ chains = [
517
+ c.strip().upper()
518
+ for c in header["compound"][compound]["chain"].split(",")
519
+ if len(c.strip()) == 1
520
+ ]
521
+
522
+ for chain in chains:
523
+ if chain not in header["chain_details"]:
524
+ header["chain_details"][chain] = {}
525
+
526
+ if "molecule" in header["compound"][compound]:
527
+ # add molecule annotation to each chain
528
+ for chain in chains:
529
+ header["chain_details"][chain]["molecule"] = header[
530
+ "compound"
531
+ ][compound]["molecule"]
532
+ else:
533
+ for chain in chains:
534
+ header["chain_details"][chain]["molecule"] = "unknown"
535
+
536
+ if "engineered" in header["compound"][compound]:
537
+ if (
538
+ "no" in header["compound"][compound]["engineered"]
539
+ or "false" in header["compound"][compound]["engineered"]
540
+ or not header["compound"][compound]["engineered"]
541
+ ):
542
+ header["compound"][compound]["engineered"] = False
543
+ else:
544
+ header["compound"][compound]["engineered"] = True
545
+ for chain in chains:
546
+ header["chain_details"][chain]["engineered"] = header[
547
+ "compound"
548
+ ][compound]["engineered"]
549
+ else:
550
+ for chain in chains:
551
+ header["chain_details"][chain]["engineered"] = False
552
+ else:
553
+ continue
554
+
555
+ # analyse the journal reference and the title for references to hapten or scfv
556
+ # compile title-like text
557
+ title = (
558
+ header["journal_reference"].lower()
559
+ + " ".join(header["structure_reference"]).lower()
560
+ )
561
+ if "journal" in header:
562
+ title += header["journal"].lower()
563
+ else:
564
+ sys.stderr.write("Header could not be parsed")
565
+
566
def _create_ag_chain(self, chain):
    """
    Copy a chain into a new 'antigen' chain - i.e. one that is handled as
    neither a TCR chain nor an MHC chain.

    Every residue keeps its original id, residue name and segid, and every
    atom is copied. set_type() is called on the new chain before it is
    returned.
    """
    ag_chain = AGchain(chain.id)
    for source_residue in chain.get_list():
        residue_copy = Residue(
            source_residue.id, source_residue.resname, source_residue.segid
        )
        # The residue is attached to the chain first and filled with atoms
        # afterwards.
        ag_chain.add(residue_copy)
        for source_atom in source_residue.get_list():
            residue_copy.add(source_atom.copy())
    ag_chain.set_type()
    return ag_chain
578
+
579
+ def _pair_chains(self, chains):
580
+ """
581
+ Method to pair beta/alpha and gamma/delta chains to form TCRs.
582
+ Currently this is based off of ABDB.AbPDB's chain pairing method where the
583
+ distance between positions 104 are calculated using the same 22A cutoff.
584
+ This is a simple heuristic for now.
585
+ """
586
+ pairings = []
587
+ # We use a known distance between conserved cysteine residues at the interface
588
+ points = {
589
+ "B": (" ", 104, " "),
590
+ "A": (" ", 104, " "),
591
+ "D": (" ", 104, " "),
592
+ "G": (" ", 104, " "),
593
+ }
594
+
595
+ for pair in combinations(chains, 2):
596
+ if pair[0].chain_type != pair[1].chain_type:
597
+ try:
598
+ a1 = pair[0].child_dict[points[pair[0].chain_type]].child_dict["CA"]
599
+ a2 = pair[1].child_dict[points[pair[1].chain_type]].child_dict["CA"]
600
+ except KeyError:
601
+ continue
602
+ if a1 - a2 < 22:
603
+ pairings.append(pair)
604
+ return pairings
605
+
606
def _pair_mhc(self, chains):
    """
    Heuristic that pairs MHC chains together.

    Only the chain type combinations MH1/B2M, GB/GA, CD1/B2M, MR1/B2M, GA1/B2M
    and GA1L/B2M are considered. For each candidate pair, two anchor residues
    are looked up on each chain and four CA-CA distances between them are
    measured; the pair is accepted when every distance is within the matching
    entry of MHC_CUTOFF for that combination. Pairs where an anchor residue or
    its CA atom is missing are skipped.

    Returns a list of (chain, chain) tuples, each in the order the two chains
    were encountered.
    """
    anchor_residues = {
        "MH1": [(" ", 15, " "), (" ", 51, " ")],
        "GA1": [(" ", 15, " "), (" ", 51, " ")],
        "GA2": [(" ", 15, " "), (" ", 51, " ")],
        "CD1": [(" ", 15, " "), (" ", 51, " ")],  # pretty similar to MH1
        "GA1L": [(" ", 15, " "), (" ", 51, " ")],  # pretty similar to MH1
        "MR1": [(" ", 15, " "), (" ", 51, " ")],  # pretty similar to MH1
        "B2M": [(" ", 23, " "), (" ", 104, " ")],
        "GA": [(" ", 29, " "), (" ", 37, " ")],
        "GB": [(" ", 39, " "), (" ", 64, " ")],
    }
    pairable = {
        "MH1:B2M",
        "GB:GA",
        "CD1:B2M",
        "MR1:B2M",
        "GA1:B2M",
        "GA1L:B2M",
    }

    accepted = []
    for candidate in combinations(chains, 2):
        left, right = candidate

        # What type of MHC are we pairing?
        the_type = ":".join(sorted([left.chain_type, right.chain_type], reverse=True))
        if the_type not in pairable:
            continue

        # The chain whose type sorts higher comes first, matching the key order.
        if left.chain_type > right.chain_type:
            major, minor = left, right
        else:
            major, minor = right, left

        try:
            major_first, major_second = anchor_residues[major.chain_type]
            major_ca1 = major[major_first]["CA"]
            major_ca2 = major[major_second]["CA"]

            minor_first, minor_second = anchor_residues[minor.chain_type]
            minor_ca1 = minor[minor_first]["CA"]
            minor_ca2 = minor[minor_second]["CA"]

            distances = [
                minor_ca1 - major_ca1,
                minor_ca2 - major_ca2,
                minor_ca2 - major_ca1,
                minor_ca1 - major_ca2,
            ]
            limits = MHC_CUTOFF[the_type]
            within_limits = all(
                [measured <= limits[slot] for slot, measured in enumerate(distances)]
            )
            if within_limits:
                accepted.append(candidate)
        except KeyError:
            continue

    return accepted
672
+
673
+ def _get_sugar_fragments(self, sugar):
674
+ """
675
+ Get connected hetatoms to form sugar molecules.
676
+ """
677
+ # Make a sugar dictionary
678
+ sugar = dict(list(zip([s.id for s in sugar], sugar)))
679
+
680
+ # Get the connect records for the bonded atoms
681
+ # 1 - 6 Record name "CONECT"
682
+ # 7 - 11 Integer serial Atom serial number
683
+ # 12 - 16 Integer serial Serial number of bonded atom
684
+ # 17 - 21 Integer serial Serial number of bonded atom
685
+ # 22 - 26 Integer serial Serial number of bonded atom
686
+ # 27 - 31 Integer serial Serial number of bonded atom
687
+ connect_records = {}
688
+ for c in [
689
+ line.strip() for line in self.current_parser.trailer if "CONECT" in line
690
+ ]:
691
+ try:
692
+ connect_records[int(c[6:11])] = []
693
+ except IndexError:
694
+ continue
695
+ for b, e in [(11, 16), (16, 21), (21, 26), (26, 31)]:
696
+ try:
697
+ if c[b:e].strip():
698
+ connect_records[int(c[6:11])].append(int(c[b:e]))
699
+ else:
700
+ break
701
+ except IndexError:
702
+ break
703
+ except ValueError:
704
+ self.warnings.write(
705
+ "Warning: unexpected CONECT record format %s" % c.strip()
706
+ )
707
+
708
+ monomer_atoms = []
709
+ polymers = []
710
+ if connect_records:
711
+ # Get the serial_numbers to residue id.
712
+ atomid_to_resid = {}
713
+ for r in sugar:
714
+ for atom in sugar[r]:
715
+ atomid_to_resid[atom.serial_number] = sugar[r].id
716
+
717
+ # Get the residue connections
718
+ r_connections = {}
719
+ for a in connect_records:
720
+ if a in atomid_to_resid:
721
+ try:
722
+ r_connections[atomid_to_resid[a]].update(
723
+ [
724
+ atomid_to_resid[ai]
725
+ for ai in connect_records[a]
726
+ if ai in atomid_to_resid
727
+ ]
728
+ )
729
+ except KeyError:
730
+ r_connections[atomid_to_resid[a]] = set(
731
+ [
732
+ atomid_to_resid[ai]
733
+ for ai in connect_records[a]
734
+ if ai in atomid_to_resid
735
+ ]
736
+ )
737
+
738
+ connected_sets = []
739
+ for r in sorted(r_connections, key=lambda x: x[1]):
740
+ added = 0
741
+ for i in range(len(connected_sets)):
742
+ if connected_sets[i] & r_connections[r]:
743
+ connected_sets[i].update(r_connections[r])
744
+ added = 1
745
+ break
746
+ if not added:
747
+ connected_sets.append(r_connections[r])
748
+
749
+ n = 0
750
+ for mol in connected_sets:
751
+ if len(mol) > 1:
752
+ polymers.append(Fragment("sugar%d" % n))
753
+ for r in sorted(mol, key=lambda x: x[1]):
754
+ polymers[n].add(sugar[r])
755
+ n += 1
756
+ else:
757
+ monomer_atoms += [atom for atom in sugar[list(mol)[0]]]
758
+
759
+ else:
760
+ for s in sugar:
761
+ monomer_atoms += [atom for atom in sugar[s]]
762
+
763
+ return polymers, monomer_atoms
764
+
765
def _find_chain_hetatoms(self, chain):
    """
    Sort the unnumbered residues of a TCR or MHC chain into hetatoms and sugars.

    Waters and residues recognised by is_aa(..., standard=False) are skipped.
    Carbohydrate residues are returned whole; for every other residue its
    atoms are returned.

    Returns (hetatoms, sugars): a flat list of atoms and a list of residues.
    """
    loose_atoms = []
    carbohydrate_residues = []
    for unnumbered in chain.get_unnumbered():
        is_water = unnumbered.id[0] == "W"
        if is_water or is_aa(unnumbered, standard=False):
            continue
        if is_carbohydrate(unnumbered):
            carbohydrate_residues.append(unnumbered)
            continue
        loose_atoms += list(unnumbered.get_atoms())

    return loose_atoms, carbohydrate_residues
780
+
781
+ def _match_units(self, model, trchains, mhchains, agchains, crystal_contacts=[]):
782
+ """
783
+ Match MHC+Peptide chains to TCR chains.
784
+ model is the current model - extract the TCRs from it (paired chains have been removed)
785
+ trchains contains those TCR chains that have been unable to be paired to form TCRs
786
+ agchains contains non-TCR chains that are potential antigens.
787
+
788
+ Goal: Match TCR <-> MHC + peptide antigen.
789
+ """
790
+ # Get all T-cell receptor-like objects (TCR, TCRchain), and MHC-like objects.
791
+ tcell_receptors = [h for h in model if isinstance(h, TCR)] + trchains.child_list
792
+ mhc_complexes = [h for h in model if isinstance(h, MHC)] + mhchains.child_list
793
+
794
+ # Initialise 5 dictionaries which carries a list of atoms per chain ID.
795
+ antigen_atoms, cdr_atoms, mh_atoms, antigen_hetatoms, antigen_sugars = (
796
+ defaultdict(list),
797
+ defaultdict(list),
798
+ defaultdict(list),
799
+ defaultdict(list),
800
+ defaultdict(list),
801
+ )
802
+
803
+ # Look through TCR and MHC and see if there are any weird hetatoms and sugars in the structure.
804
+ for tr in tcell_receptors:
805
+ for cdr in tr.get_CDRs():
806
+ # Only get CDR3?
807
+ if "3" not in cdr.id:
808
+ continue
809
+ # only look at CA or CB atoms of the CDR; this is used later.
810
+ cdr_atoms[tr.id] += [
811
+ atom
812
+ for atom in cdr.get_atoms()
813
+ if atom.id == "CB" or atom.id == "CA"
814
+ ]
815
+
816
+ # get TCR type and get chain's hetatoms accordingly
817
+ if isinstance(tr, TCR) and tr.get_TCR_type() == "abTCR":
818
+ beta_chain = tr.get_VB()
819
+ alpha_chain = tr.get_VA()
820
+
821
+ antigen_hetatoms[tr.VB], antigen_sugars[tr.VB] = (
822
+ self._find_chain_hetatoms(beta_chain)
823
+ )
824
+ antigen_hetatoms[tr.VA], antigen_sugars[tr.VA] = (
825
+ self._find_chain_hetatoms(alpha_chain)
826
+ )
827
+
828
+ elif isinstance(tr, TCR) and tr.get_TCR_type() == "gdTCR":
829
+ delta_chain = tr.get_VD()
830
+ gamma_chain = tr.get_VG()
831
+ antigen_hetatoms[tr.VD], antigen_sugars[tr.VD] = (
832
+ self._find_chain_hetatoms(delta_chain)
833
+ )
834
+ antigen_hetatoms[tr.VG], antigen_sugars[tr.VG] = (
835
+ self._find_chain_hetatoms(gamma_chain)
836
+ )
837
+
838
+ elif isinstance(tr, TCR) and tr.get_TCR_type() == "dbTCR":
839
+ beta_chain = tr.get_VB()
840
+ delta_chain = tr.get_VD()
841
+ antigen_hetatoms[tr.VB], antigen_sugars[tr.VB] = (
842
+ self._find_chain_hetatoms(beta_chain)
843
+ )
844
+ antigen_hetatoms[tr.VD], antigen_sugars[tr.VD] = (
845
+ self._find_chain_hetatoms(delta_chain)
846
+ )
847
+
848
+ # Unpaired TCR chain
849
+ elif isinstance(tr, TCRchain):
850
+ antigen_hetatoms[tr.id], antigen_sugars[tr.id] = (
851
+ self._find_chain_hetatoms(tr)
852
+ )
853
+
854
+ # Do the same for MHC.
855
+ for mh in mhc_complexes:
856
+ # Keep G domain atoms; Get the Helix region of MHC
857
+ mh_atoms[mh.id] = [
858
+ atom
859
+ for atom in mh.get_atoms()
860
+ if (atom.id == "CB" or atom.id == "CA") and atom.region == "Helix"
861
+ ]
862
+ if isinstance(mh, MHC) and mh.MHC_type == "MH1":
863
+ MH1, B2M = mh.get_MH1(), mh.get_B2M()
864
+ if MH1 is not None:
865
+ antigen_hetatoms[mh.MH1], antigen_sugars[mh.MH1] = (
866
+ self._find_chain_hetatoms(MH1)
867
+ )
868
+ else:
869
+ GA1 = mh.get_GA1()
870
+ antigen_hetatoms[mh.GA1], antigen_sugars[mh.GA1] = (
871
+ self._find_chain_hetatoms(GA1)
872
+ )
873
+ if B2M is not None: # handle single chain MH1 case
874
+ antigen_hetatoms[mh.B2M], antigen_sugars[mh.B2M] = (
875
+ self._find_chain_hetatoms(B2M)
876
+ )
877
+
878
+ elif isinstance(mh, MHC) and mh.MHC_type == "CD1":
879
+ CD1, B2M = mh.get_CD1(), mh.get_B2M()
880
+ if CD1 is not None:
881
+ antigen_hetatoms[mh.CD1], antigen_sugars[mh.CD1] = (
882
+ self._find_chain_hetatoms(CD1)
883
+ )
884
+ if B2M is not None:
885
+ antigen_hetatoms[mh.B2M], antigen_sugars[mh.B2M] = (
886
+ self._find_chain_hetatoms(B2M)
887
+ )
888
+
889
+ elif isinstance(mh, MHC) and mh.MHC_type == "MR1":
890
+ MR1, B2M = mh.get_MR1(), mh.get_B2M()
891
+ antigen_hetatoms[mh.MR1], antigen_sugars[mh.MR1] = (
892
+ self._find_chain_hetatoms(MR1)
893
+ )
894
+ antigen_hetatoms[mh.B2M], antigen_sugars[mh.B2M] = (
895
+ self._find_chain_hetatoms(B2M)
896
+ )
897
+
898
+ elif isinstance(mh, MHC) and mh.MHC_type == "MH2":
899
+ GA, GB = mh.get_GA(), mh.get_GB()
900
+ antigen_hetatoms[mh.GA], antigen_sugars[mh.GA] = (
901
+ self._find_chain_hetatoms(GA)
902
+ )
903
+ antigen_hetatoms[mh.GB], antigen_sugars[mh.GB] = (
904
+ self._find_chain_hetatoms(GB)
905
+ )
906
+
907
+ # Unpaired MHC chains -- if any, go here.
908
+ elif isinstance(mh, MHCchain):
909
+ antigen_hetatoms[mh.id], antigen_sugars[mh.id] = (
910
+ self._find_chain_hetatoms(mh)
911
+ )
912
+
913
+ for antigen in agchains:
914
+ antigen_atoms[antigen.id] = [
915
+ a
916
+ for a in antigen.get_atoms()
917
+ if a.parent.id[0] == " " or is_aa(a.parent)
918
+ ] # test ATOM records or amino acid HETATM records
919
+ antigen_hetatoms[antigen.id] = [
920
+ a
921
+ for a in antigen.get_atoms()
922
+ if a.parent.id[0].startswith("H") and not is_aa(a.parent)
923
+ ] # hetatm and not an amino acid
924
+
925
+ # Problem here with carbohydrate molecules as units not recognised as polymers.
926
+ # Have to use connect records to join them
927
+ # Then consider them in the same way.
928
+ sugars = []
929
+ for chain_id in antigen_sugars:
930
+ if antigen_sugars[chain_id]:
931
+ polymers, monomer_atoms = self._get_sugar_fragments(
932
+ antigen_sugars[chain_id]
933
+ )
934
+ sugars += polymers
935
+ antigen_hetatoms[chain_id] += monomer_atoms
936
+
937
+ # We look through hetatms -- sometimes, hetatms can be associated to TR or MH chains, so we separate and whisk these out.
938
+ # Protein/peptide entities override small molecules that are more likely to be buffer or cofactor molecules.
939
+ # Get non-empty antigen hetatoms first
940
+ non_empty_ag = [k for k in antigen_hetatoms if antigen_hetatoms[k]]
941
+
942
+ # Pair proteins/peptides with TCR then MHC
943
+ self._protein_peptide_pass(
944
+ model, tcell_receptors, cdr_atoms, antigen_atoms, crystal_contacts
945
+ )
946
+ self._het_sugar_pass(
947
+ tcell_receptors,
948
+ cdr_atoms,
949
+ non_empty_ag,
950
+ antigen_hetatoms,
951
+ sugars,
952
+ distance=8.0,
953
+ )
954
+
955
+ # If a TCR does not have a detected MHC chain, then skip the remaining MHC-specific parsing bits.
956
+ if not mhc_complexes:
957
+ return
958
+
959
+ # Have a very tight cutoff for MHCs that present het atoms (e.g. CD1 types)
960
+ self._het_sugar_pass(
961
+ mhc_complexes,
962
+ mh_atoms,
963
+ non_empty_ag,
964
+ antigen_hetatoms,
965
+ sugars,
966
+ distance=3.5,
967
+ )
968
+
969
+ if antigen_atoms:
970
+ self._protein_peptide_pass(
971
+ model, mhc_complexes, mh_atoms, antigen_atoms, crystal_contacts
972
+ )
973
+
974
+ # Pair a TCR with an MHC and vice-versa; go through all possible combinations of TCR/MHC
975
+ # We see if a CB/CA atom of the helix region of an MHC is within 8A of a TCR CDR loop's CB/CA atoms.
976
+ # This is similar to the _protein_peptide_pass algorithm; we find the number of contacts between MHC and TCR,
977
+ # and use the MHC with highest no. of contacts
978
+ contact_freq = defaultdict(int)
979
+ tr_mh_pairs = list(product(tcell_receptors, mhc_complexes))
980
+
981
+ for tr, mh in tr_mh_pairs:
982
+ ns = NeighborSearch(cdr_atoms[tr.id])
983
+ for atom in mh_atoms[mh.id]:
984
+ # This is a generous cutoff to be used for now.
985
+ contacts = ns.search(atom.get_coord(), 8.0, level="R")
986
+ for c in contacts:
987
+ contact_freq[(tr.id, mh.id)] += 1
988
+
989
+ # Sort TR-MH pairs by number of contacts and then get the highest-frequency pairs
990
+ sorted_contacts = sorted(
991
+ list(contact_freq.items()), key=lambda z: z[1], reverse=True
992
+ )
993
+ paired_tr_mh = set()
994
+ for pair, contacts in sorted_contacts:
995
+ tr, mh = pair
996
+ # If the TCR has already been paired, or if we know that the TCR and MHC are forming crystal contacts, move on.
997
+ if tr in paired_tr_mh or (tr, mh) in crystal_contacts:
998
+ continue
999
+ model[tr]._add_mhc(model[mh])
1000
+ model[mh]._add_tcr(model[tr])
1001
+ paired_tr_mh.add(tr)
1002
+
1003
+ def _protein_peptide_pass(
1004
+ self, model, complexes, receptor_atoms, antigen_atoms, crystal_contacts=[]
1005
+ ):
1006
+ """
1007
+ This is a generic method to process which proteins/peptides belong to a TCR or MHC. Needs testing.
1008
+ @param complexes: list of TCR/TCRchain objects or MHC/MHCchain objects
1009
+ @param receptor_atoms: list of atom subset that will likely contact the antigen (e.g. cdr_atoms)
1010
+ @param antigen_atoms: list of atoms in the antigen.
1011
+ """
1012
+ ns = NeighborSearch(
1013
+ [atom for chain in receptor_atoms for atom in receptor_atoms[chain]]
1014
+ + [atom for chain in antigen_atoms for atom in antigen_atoms[chain]]
1015
+ )
1016
+ contacts = [con for con in ns.search_all(8.0, "R")]
1017
+ contact_freq = defaultdict(lambda: defaultdict(int))
1018
+
1019
+ # all_cpx_chains is a dictionary that has a TCR/MHC chain as a key and the ID of the TCR/MHC as value
1020
+ all_cpx_chains = dict()
1021
+ for cpx in complexes:
1022
+ cpx_ch = list(cpx.id)
1023
+ for c in cpx_ch:
1024
+ all_cpx_chains[c] = cpx.id
1025
+
1026
+ # trids stores all paired/unpaired TR chains
1027
+ cpxids = set(all_cpx_chains.values())
1028
+ ags = set()
1029
+
1030
+ for c in contacts:
1031
+ p1 = str(c[0].parent.id) # get the chain id
1032
+ p2 = str(c[1].parent.id)
1033
+
1034
+ # Reject cases where contacts are from the same chain, or the combination of chains is a TCR
1035
+ potential_contact = p1 + p2
1036
+ potential_contact2 = p2 + p1
1037
+
1038
+ if (
1039
+ p1 == p2
1040
+ or potential_contact in contact_freq
1041
+ or potential_contact2 in contact_freq
1042
+ ):
1043
+ continue
1044
+
1045
+ # If the potential contacting set of chains (p1+p2) is not a TR and p1 is a TR chain but p2 is NOT a TR chain, then p2 is an AG
1046
+ if (
1047
+ (potential_contact not in cpxids)
1048
+ and (p1 in all_cpx_chains)
1049
+ and (p2 not in all_cpx_chains)
1050
+ ):
1051
+ T = all_cpx_chains[p1]
1052
+ ag = p2
1053
+ # If the second set of potential contacting set of chains (p2+p1) is not a TR and p2 is a TR chain but p1 is NOT a TR chain,
1054
+ # then p1 is an AG
1055
+ elif (
1056
+ (potential_contact2 not in cpxids)
1057
+ and (p2 in all_cpx_chains)
1058
+ and (p1 not in all_cpx_chains)
1059
+ ):
1060
+ T = all_cpx_chains[p2]
1061
+ ag = p1
1062
+ else:
1063
+ continue
1064
+
1065
+ # T is either the paired TCR id or an id of a single TCR chain
1066
+ contact_freq[T][ag] += 1
1067
+ ags.add(ag)
1068
+
1069
+ # Iterate over the TR identifiers
1070
+ for cpx_id in cpxids:
1071
+
1072
+ # If there are detected antigen contacts
1073
+ if contact_freq[cpx_id]:
1074
+ # Get the antigen
1075
+ ag = max(contact_freq[cpx_id], key=lambda x: contact_freq[cpx_id][x])
1076
+
1077
+ if (cpx_id, ag) not in crystal_contacts:
1078
+ model[cpx_id].antigen = (
1079
+ []
1080
+ ) # disregard smaller antigens if peptide or protein present.
1081
+ model[cpx_id]._add_antigen(
1082
+ model[ag]
1083
+ ) # pair up an antigen to the TCR.
1084
+
1085
+ # Remove the antigen now, as it is paired up with a TR.
1086
+ if ag in ags:
1087
+ ags.remove(ag)
1088
+
1089
+ # iterate over the remaining antigens to see if they are also bound.
1090
+ for ag in ags:
1091
+ cmax = 0
1092
+ for C in contact_freq:
1093
+ if ag in contact_freq[C] and (C, ag) not in crystal_contacts:
1094
+ if contact_freq[C][ag] > cmax:
1095
+ paired_cpx = C
1096
+ cmax = contact_freq[C][ag]
1097
+ if cmax:
1098
+ if len(contact_freq) > 1:
1099
+ self.warnings.write(
1100
+ "Crystal Contact Warning: antigen %s has been paired with TCR %s"
1101
+ % (str(ag), str(paired_cpx))
1102
+ )
1103
+ model[paired_cpx]._add_antigen(model[ag])
1104
+ else:
1105
+ model[paired_cpx]._add_antigen(model[ag])
1106
+
1107
+ def _het_sugar_pass(
1108
+ self,
1109
+ receptors,
1110
+ receptor_atoms,
1111
+ non_empty_ag,
1112
+ antigen_hetatoms,
1113
+ sugars,
1114
+ distance=8.0,
1115
+ ):
1116
+ """ """
1117
+ # Iterate through every possible pair of TR and hetatom chain
1118
+ for rec, antigen_het in product(receptors, non_empty_ag):
1119
+ # Initialise a NeighborSearch based on the atoms for a particualr chain of hetatoms
1120
+ ns = NeighborSearch(antigen_hetatoms[antigen_het])
1121
+
1122
+ # Look through CDR atoms (CA/CB)
1123
+ for atom in receptor_atoms[rec.id]:
1124
+ # use 8.0A from the CDR CA/CB to the antigen. Using level = "R" returns Residue objects.
1125
+ contacts = ns.search(atom.get_coord(), distance, level="R")
1126
+ if contacts:
1127
+ for contact in contacts:
1128
+ # we assume that each contact residue is a single molecule (need to test its not just a residue)
1129
+ if self._check_het_antigen(contact):
1130
+ residue_type = get_res_type(contact)
1131
+
1132
+ if residue_type == "Hapten":
1133
+ self.warnings.write(
1134
+ """Warning: Multiple hapten-antigen like molecules found in binding site -
1135
+ this needs attention as could be solvent/cofactor."""
1136
+ )
1137
+ if residue_type == "non-polymer":
1138
+ contact.type = "Hapten" # add a antigen type attribute to the residue
1139
+ contact.get_type = (
1140
+ lambda: "Hapten"
1141
+ ) # add a get antigen type method to the residue
1142
+ elif residue_type == "nucleic-acid":
1143
+ contact.type = "nucleic-acid" # add a antigen type attribute to the residue
1144
+ contact.get_type = (
1145
+ lambda: "nucleic-acid"
1146
+ ) # add a get antigen type method to the residue
1147
+ elif residue_type == "saccharide":
1148
+ contact.type = "carbohydrate" # add a antigen type attribute to the residue
1149
+ contact.get_type = (
1150
+ lambda: "carbohydrate"
1151
+ ) # add a get antigen type method to the residue
1152
+ rec._add_antigen(contact)
1153
+
1154
+ # Iterate through sugar fragments
1155
+ # for rec, sugar_fragment in product( receptors, sugars ):
1156
+ # ns = NeighborSearch([atom for atom in sugar_fragment.get_atoms()])
1157
+ # for atom in receptor_atoms[rec.id]:
1158
+ # contacts = ns.search(atom.get_coord(), distance, level="R")
1159
+ # if contacts:
1160
+ # sugar_fragment.type = "carbohydrate" # add a antigen type attribute to the fragment
1161
+ # sugar_fragment.get_type = lambda: "carbohydrate" # add a get antigen type method to the fragment
1162
+ # rec._add_antigen(sugar_fragment)
1163
+
1164
def _check_het_antigen(self, residue):
    """
    Decide whether a hetatm residue may be treated as an antigen.

    Two checks are applied:
    1. The residue must not be an amino acid (standard or modified) - a modified
       residue should not be reported as a hapten.
    2. The residue must not be a common buffer molecule.

    A residue rejected by check 2 is reported through self.warnings unless
    self.QUIET is set.

    @param residue: the candidate residue.
    @return: True if the residue passes both checks, otherwise False.
    """
    # check 1: amino acids are never het antigens
    if is_aa(residue, standard=False):
        return False

    # check 2: anything that is not a common buffer is accepted
    if not is_common_buffer(residue):
        return True

    # common buffers/unlikely haptens are rejected, and reported unless running quietly
    if not self.QUIET:
        self.warnings.write(
            "Common molecule %s found in the binding site - not considered an antigen"
            % residue.get_resname()
        )
    return False
1191
+
1192
def _prenumbered(self, chain, prenumbering, ali_dict=None, n=0):
    """
    Method to deal with numbering supplied by the user. (or from the database)

    @param chain: chain whose sequence is extracted; its `id` indexes prenumbering and ali_dict.
    @param prenumbering: mapping of chain id to a sequence of (annotation, chain_type) entries.
    @param ali_dict: optional mapping of chain id to a sequence of alignment dictionaries.
                     When omitted or empty, an empty alignment is passed to align_numbering.
    @param n: index of the entry to use for this chain in prenumbering (and ali_dict).
    @return: tuple (numbering, chain_type).
    """
    if ali_dict is None:
        # A fresh dict per call: a shared mutable default would leak state between
        # calls if align_numbering ever modified it.
        ali_dict = {}
    elif ali_dict:
        ali_dict = ali_dict[chain.id][n]

    annotation, chain_type = prenumbering[chain.id][n]

    try:
        sequence_list, _, seq_warnings = extract_sequence(
            chain, return_warnings=True
        )
        numbering = align_numbering(annotation, sequence_list, ali_dict)
    except AssertionError:
        # If the user has an alignment file generated before hetatoms were included,
        # retry with the hetatoms left out of the sequence.
        sequence_list, _, seq_warnings = extract_sequence(
            chain, return_warnings=True, ignore_hets=True
        )
        numbering = align_numbering(annotation, sequence_list, ali_dict)
    self.warnings.log += seq_warnings

    return numbering, chain_type
1217
+
1218
+
1219
+ # class error_stream:
1220
+ # def __init__(self):
1221
+ # self.log = []
1222
+
1223
+ # def __str__(self):
1224
+ # return "\n".join(self.log)
1225
+
1226
+ # def __repr__(self):
1227
+ # return self.__str__()
1228
+
1229
+ # def write(self, s):
1230
+ # self.log.append(str(s).strip("\n"))