stcrpy 1.0.3__py3-none-any.whl → 1.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stcrpy/__init__.py +1 -1
- stcrpy/tcr_formats/tcr_formats.py +20 -1
- stcrpy/tcr_geometry/TCRAngle.py +177 -0
- stcrpy/tcr_geometry/reference_data/Acoreset.txt +30 -0
- stcrpy/tcr_geometry/reference_data/Bcoreset.txt +30 -0
- stcrpy/tcr_geometry/reference_data/consensus_A.pdb +31 -0
- stcrpy/tcr_geometry/reference_data/consensus_B.pdb +31 -0
- stcrpy/tcr_geometry/reference_data/consensus_D.pdb +31 -0
- stcrpy/tcr_geometry/reference_data/consensus_G.pdb +31 -0
- stcrpy/tcr_geometry/reference_data/pcA.txt +3 -0
- stcrpy/tcr_geometry/reference_data/pcB.txt +3 -0
- stcrpy/tcr_interactions/TCRInteractionProfiler.py +8 -1
- stcrpy/tcr_methods/tcr_batch_operations.py +14 -10
- stcrpy/tcr_methods/tcr_methods.py +23 -22
- stcrpy/tcr_metrics/tcr_dockq.py +404 -0
- stcrpy/tcr_processing/MHC.py +389 -4
- stcrpy/tcr_processing/TCR.py +252 -0
- stcrpy/tcr_processing/TCRParser.py +351 -189
- stcrpy/tcr_processing/annotate.py +6 -1
- stcrpy/tcr_processing/utils/region_definitions.py +9 -0
- stcrpy/tcr_processing/utils/symmetry_mates.py +96 -0
- stcrpy-1.0.6.dist-info/METADATA +286 -0
- {stcrpy-1.0.3.dist-info → stcrpy-1.0.6.dist-info}/RECORD +27 -16
- {stcrpy-1.0.3.dist-info → stcrpy-1.0.6.dist-info}/WHEEL +1 -1
- stcrpy-1.0.3.dist-info/METADATA +0 -173
- {stcrpy-1.0.3.dist-info → stcrpy-1.0.6.dist-info}/licenses/LICENCE +0 -0
- {stcrpy-1.0.3.dist-info → stcrpy-1.0.6.dist-info}/licenses/stcrpy/tcr_geometry/TCRCoM_LICENCE +0 -0
- {stcrpy-1.0.3.dist-info → stcrpy-1.0.6.dist-info}/top_level.txt +0 -0
|
@@ -8,12 +8,14 @@ TCRParser object which is based on ABDB's AntibodyParser and BioPython's PDB par
|
|
|
8
8
|
from itertools import combinations, product
|
|
9
9
|
import sys
|
|
10
10
|
import os
|
|
11
|
+
import tempfile
|
|
11
12
|
from collections import defaultdict
|
|
12
13
|
import warnings
|
|
13
14
|
|
|
14
15
|
from Bio.PDB.PDBParser import PDBParser
|
|
15
16
|
from Bio.PDB.MMCIFParser import MMCIFParser
|
|
16
17
|
from Bio.PDB import NeighborSearch
|
|
18
|
+
from Bio.PDB import PDBIO
|
|
17
19
|
|
|
18
20
|
# TCRDB
|
|
19
21
|
from .annotate import annotate, extract_sequence, align_numbering
|
|
@@ -58,6 +60,10 @@ class TCRParser(PDBParser, MMCIFParser):
|
|
|
58
60
|
self.numbering_scheme = "imgt"
|
|
59
61
|
self.definition = "imgt"
|
|
60
62
|
|
|
63
|
+
self.current_file = (
|
|
64
|
+
None # the current file being processed, populated by get_tcr_structure
|
|
65
|
+
)
|
|
66
|
+
|
|
61
67
|
def _create_chain(self, chain, new_chain_id, numbering, chain_type):
|
|
62
68
|
"""
|
|
63
69
|
Create a new TCR or MHC chain.
|
|
@@ -194,19 +200,62 @@ class TCRParser(PDBParser, MMCIFParser):
|
|
|
194
200
|
|
|
195
201
|
return newchain1, newchain2
|
|
196
202
|
|
|
197
|
-
def
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
203
|
+
def _number_and_annotate_chain(self, chain, prenumbering=None, ali_dict={}):
|
|
204
|
+
# try to number the sequence found in the structure
|
|
205
|
+
if prenumbering and chain.id in prenumbering:
|
|
206
|
+
if len(prenumbering[chain.id]) == 2:
|
|
207
|
+
numbering = [{}, {}]
|
|
208
|
+
region_types = ["", ""]
|
|
202
209
|
|
|
203
|
-
|
|
204
|
-
|
|
210
|
+
numbering[0], region_types[0] = self._prenumbered(
|
|
211
|
+
chain, prenumbering, ali_dict, n=0
|
|
212
|
+
)
|
|
213
|
+
numbering[1], region_types[1] = self._prenumbered(
|
|
214
|
+
chain, prenumbering, ali_dict, n=1
|
|
215
|
+
)
|
|
216
|
+
rtypes = sorted(region_types)
|
|
205
217
|
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
218
|
+
# Check that we have a beta/alpha domain or gamma/delta domain
|
|
219
|
+
if rtypes == ["A", "B"] or rtypes == ["D", "G"]:
|
|
220
|
+
chain_type = "".join(region_types)
|
|
221
|
+
scTCR = True
|
|
222
|
+
# if not, just take the first region and warn the user
|
|
223
|
+
else:
|
|
224
|
+
chain_type = region_types[0]
|
|
225
|
+
numbering = numbering[0]
|
|
226
|
+
scTCR = False
|
|
227
|
+
print(
|
|
228
|
+
"Warning multiple variable regions of the same type (%s) found on chain %s.\nTaking the first variable region only."
|
|
229
|
+
% (chain_type, chain.id),
|
|
230
|
+
file=self.warnings,
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
elif prenumbering[chain.id][0][-1] not in ["B", "A", "D", "G"]:
|
|
234
|
+
numbering, chain_type, scTCR = annotate(chain)
|
|
235
|
+
|
|
236
|
+
else:
|
|
237
|
+
numbering, chain_type = self._prenumbered(
|
|
238
|
+
chain, prenumbering, ali_dict, n=0
|
|
239
|
+
)
|
|
240
|
+
scTCR = False
|
|
241
|
+
|
|
242
|
+
else:
|
|
243
|
+
numbering, chain_type, germline_info, scTCR = annotate(chain)
|
|
244
|
+
|
|
245
|
+
return numbering, chain_type, germline_info, scTCR
|
|
246
|
+
|
|
247
|
+
def _get_header_info(self, tcrstructure, chain, germline_info):
|
|
248
|
+
if chain.id in tcrstructure.header["chain_details"]: # clean this up!!!
|
|
249
|
+
engineered = tcrstructure.header["chain_details"][chain.id]["engineered"]
|
|
250
|
+
details = tcrstructure.header["chain_details"][chain.id]
|
|
251
|
+
else:
|
|
252
|
+
engineered = False
|
|
253
|
+
details = {"molecule": "unknown", "engineered": False}
|
|
254
|
+
|
|
255
|
+
details["genetic_origin"] = germline_info
|
|
256
|
+
return details, engineered
|
|
257
|
+
|
|
258
|
+
def _read_structure_file(self, file, id):
|
|
210
259
|
# get a structure object from biopython.
|
|
211
260
|
_, ext = os.path.splitext(file)
|
|
212
261
|
if ext.lower() == ".pdb":
|
|
@@ -225,6 +274,45 @@ class TCRParser(PDBParser, MMCIFParser):
|
|
|
225
274
|
# Set and analyse header information
|
|
226
275
|
tcrstructure.set_header(structure.header)
|
|
227
276
|
self._analyse_header(tcrstructure)
|
|
277
|
+
return structure, tcrstructure
|
|
278
|
+
|
|
279
|
+
def _initialise_model(self, model):
|
|
280
|
+
newmodel = Model(model.id)
|
|
281
|
+
|
|
282
|
+
# initialise holder objects for holding TCR, MHC and non-TCR/non-MHC (antigen) chains.
|
|
283
|
+
agchains = Holder("Antigen")
|
|
284
|
+
trchains = Holder("TCRchain")
|
|
285
|
+
mhchains = Holder("MHCchain")
|
|
286
|
+
newmodel.add(agchains)
|
|
287
|
+
newmodel.add(trchains)
|
|
288
|
+
newmodel.add(mhchains)
|
|
289
|
+
return newmodel, agchains, trchains, mhchains
|
|
290
|
+
|
|
291
|
+
def get_tcr_structure(
|
|
292
|
+
self,
|
|
293
|
+
id,
|
|
294
|
+
file,
|
|
295
|
+
prenumbering=None,
|
|
296
|
+
ali_dict={},
|
|
297
|
+
crystal_contacts=[],
|
|
298
|
+
include_symmetry_mates=True,
|
|
299
|
+
):
|
|
300
|
+
"""
|
|
301
|
+
Post processing of the TCRPDB.Bio.PDB structure object into a TCR context.
|
|
302
|
+
|
|
303
|
+
id: a string to identify the structure
|
|
304
|
+
file: the path to the .pdb file
|
|
305
|
+
|
|
306
|
+
optional:
|
|
307
|
+
prenumbering: prenumbering for the chains in the structure.
|
|
308
|
+
"""
|
|
309
|
+
self.warnings = ErrorStream()
|
|
310
|
+
self.include_symmetry_mates = include_symmetry_mates
|
|
311
|
+
self.current_file = file
|
|
312
|
+
|
|
313
|
+
structure, tcrstructure = self._read_structure_file(
|
|
314
|
+
file, id
|
|
315
|
+
) # structure: Bio.PDB.Structure from file; tcrstructure: initialised empty TCRStructure object to be populated
|
|
228
316
|
|
|
229
317
|
# iterate over the models in the structure
|
|
230
318
|
# iterate backwards through the model list - delete old structure as we go
|
|
@@ -233,70 +321,18 @@ class TCRParser(PDBParser, MMCIFParser):
|
|
|
233
321
|
for mid in range(len(structure.child_list) - 1, -1, -1):
|
|
234
322
|
# add a model to the TCR structure
|
|
235
323
|
model = structure.child_list[mid]
|
|
236
|
-
newmodel =
|
|
324
|
+
newmodel, agchains, trchains, mhchains = self._initialise_model(model)
|
|
237
325
|
tcrstructure.add(newmodel)
|
|
238
326
|
|
|
239
|
-
# initialise holder objects for holding TCR, MHC and non-TCR/non-MHC (antigen) chains.
|
|
240
|
-
agchains = Holder("Antigen")
|
|
241
|
-
trchains = Holder("TCRchain")
|
|
242
|
-
mhchains = Holder("MHCchain")
|
|
243
|
-
newmodel.add(agchains)
|
|
244
|
-
newmodel.add(trchains)
|
|
245
|
-
newmodel.add(mhchains)
|
|
246
|
-
|
|
247
327
|
# iterate over the chains in the model
|
|
248
328
|
for chain in model.get_list():
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
numbering = [{}, {}]
|
|
253
|
-
region_types = ["", ""]
|
|
254
|
-
|
|
255
|
-
numbering[0], region_types[0] = self._prenumbered(
|
|
256
|
-
chain, prenumbering, ali_dict, n=0
|
|
257
|
-
)
|
|
258
|
-
numbering[1], region_types[1] = self._prenumbered(
|
|
259
|
-
chain, prenumbering, ali_dict, n=1
|
|
260
|
-
)
|
|
261
|
-
rtypes = sorted(region_types)
|
|
262
|
-
|
|
263
|
-
# Check that we have a beta/alpha domain or gamma/delta domain
|
|
264
|
-
if rtypes == ["A", "B"] or rtypes == ["D", "G"]:
|
|
265
|
-
chain_type = "".join(region_types)
|
|
266
|
-
scTCR = True
|
|
267
|
-
# if not, just take the first region and warn the user
|
|
268
|
-
else:
|
|
269
|
-
chain_type = region_types[0]
|
|
270
|
-
numbering = numbering[0]
|
|
271
|
-
scTCR = False
|
|
272
|
-
print(
|
|
273
|
-
"Warning multiple variable regions of the same type (%s) found on chain %s.\nTaking the first variable region only."
|
|
274
|
-
% (chain_type, chain.id),
|
|
275
|
-
file=self.warnings,
|
|
276
|
-
)
|
|
277
|
-
|
|
278
|
-
elif prenumbering[chain.id][0][-1] not in ["B", "A", "D", "G"]:
|
|
279
|
-
numbering, chain_type, scTCR = annotate(chain)
|
|
280
|
-
|
|
281
|
-
else:
|
|
282
|
-
numbering, chain_type = self._prenumbered(
|
|
283
|
-
chain, prenumbering, ali_dict, n=0
|
|
284
|
-
)
|
|
285
|
-
scTCR = False
|
|
286
|
-
|
|
287
|
-
else:
|
|
288
|
-
numbering, chain_type, germline_info, scTCR = annotate(chain)
|
|
289
|
-
|
|
290
|
-
if chain.id in tcrstructure.header["chain_details"]: # clean this up!!!
|
|
291
|
-
engineered = tcrstructure.header["chain_details"][chain.id][
|
|
292
|
-
"engineered"
|
|
293
|
-
]
|
|
294
|
-
details = tcrstructure.header["chain_details"][chain.id]
|
|
295
|
-
else:
|
|
296
|
-
engineered = False
|
|
297
|
-
details = {"molecule": "unknown", "engineered": False}
|
|
329
|
+
numbering, chain_type, germline_info, scTCR = (
|
|
330
|
+
self._number_and_annotate_chain(chain, prenumbering, ali_dict)
|
|
331
|
+
)
|
|
298
332
|
|
|
299
|
-
details
|
|
333
|
+
details, engineered = self._get_header_info(
|
|
334
|
+
tcrstructure, chain, germline_info
|
|
335
|
+
)
|
|
300
336
|
|
|
301
337
|
if numbering and chain_type in ["G", "D", "B", "A"]:
|
|
302
338
|
# create a new TCR chain
|
|
@@ -350,7 +386,9 @@ class TCRParser(PDBParser, MMCIFParser):
|
|
|
350
386
|
elif not obs_chaintypes - set(["G", "D"]):
|
|
351
387
|
tcr = gdTCR(chain1, chain2)
|
|
352
388
|
elif not obs_chaintypes - set(["B", "D"]):
|
|
353
|
-
tcr = abTCR(
|
|
389
|
+
tcr = abTCR(
|
|
390
|
+
chain1, chain2
|
|
391
|
+
) # initial way to deal with anarci missclassification of alpha chains as delta chains
|
|
354
392
|
# tcr = dbTCR(chain1, chain2)
|
|
355
393
|
|
|
356
394
|
tcr.scTCR = True #
|
|
@@ -503,6 +541,7 @@ class TCRParser(PDBParser, MMCIFParser):
|
|
|
503
541
|
sys.stderr.write("\n")
|
|
504
542
|
tcrstructure.warnings = self.warnings
|
|
505
543
|
|
|
544
|
+
self.current_file = None # reset the current file
|
|
506
545
|
return tcrstructure
|
|
507
546
|
|
|
508
547
|
def _analyse_header(self, header):
|
|
@@ -787,19 +826,120 @@ class TCRParser(PDBParser, MMCIFParser):
|
|
|
787
826
|
|
|
788
827
|
return hetatoms, sugars
|
|
789
828
|
|
|
790
|
-
def
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
829
|
+
def _prepare_tcr(self, tr, cdr_atoms, antigen_hetatoms, antigen_sugars):
|
|
830
|
+
for cdr in tr.get_CDRs():
|
|
831
|
+
# Only get CDR3?
|
|
832
|
+
if "3" not in cdr.id:
|
|
833
|
+
continue
|
|
834
|
+
# only look at CA or CB atoms of the CDR; this is used later.
|
|
835
|
+
cdr_atoms[tr.id] += [
|
|
836
|
+
atom for atom in cdr.get_atoms() if atom.id == "CB" or atom.id == "CA"
|
|
837
|
+
]
|
|
796
838
|
|
|
797
|
-
|
|
798
|
-
""
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
839
|
+
# get TCR type and get chain's hetatoms accordingly
|
|
840
|
+
if isinstance(tr, TCR) and tr.get_TCR_type() == "abTCR":
|
|
841
|
+
beta_chain = tr.get_VB()
|
|
842
|
+
alpha_chain = tr.get_VA()
|
|
843
|
+
|
|
844
|
+
antigen_hetatoms[tr.VB], antigen_sugars[tr.VB] = self._find_chain_hetatoms(
|
|
845
|
+
beta_chain
|
|
846
|
+
)
|
|
847
|
+
antigen_hetatoms[tr.VA], antigen_sugars[tr.VA] = self._find_chain_hetatoms(
|
|
848
|
+
alpha_chain
|
|
849
|
+
)
|
|
850
|
+
|
|
851
|
+
elif isinstance(tr, TCR) and tr.get_TCR_type() == "gdTCR":
|
|
852
|
+
delta_chain = tr.get_VD()
|
|
853
|
+
gamma_chain = tr.get_VG()
|
|
854
|
+
antigen_hetatoms[tr.VD], antigen_sugars[tr.VD] = self._find_chain_hetatoms(
|
|
855
|
+
delta_chain
|
|
856
|
+
)
|
|
857
|
+
antigen_hetatoms[tr.VG], antigen_sugars[tr.VG] = self._find_chain_hetatoms(
|
|
858
|
+
gamma_chain
|
|
859
|
+
)
|
|
860
|
+
|
|
861
|
+
elif isinstance(tr, TCR) and tr.get_TCR_type() == "dbTCR":
|
|
862
|
+
beta_chain = tr.get_VB()
|
|
863
|
+
delta_chain = tr.get_VD()
|
|
864
|
+
antigen_hetatoms[tr.VB], antigen_sugars[tr.VB] = self._find_chain_hetatoms(
|
|
865
|
+
beta_chain
|
|
866
|
+
)
|
|
867
|
+
antigen_hetatoms[tr.VD], antigen_sugars[tr.VD] = self._find_chain_hetatoms(
|
|
868
|
+
delta_chain
|
|
869
|
+
)
|
|
870
|
+
|
|
871
|
+
# Unpaired TCR chain
|
|
872
|
+
elif isinstance(tr, TCRchain):
|
|
873
|
+
antigen_hetatoms[tr.id], antigen_sugars[tr.id] = self._find_chain_hetatoms(
|
|
874
|
+
tr
|
|
875
|
+
)
|
|
876
|
+
|
|
877
|
+
def _prepare_mhc(self, mh, mh_atoms, antigen_hetatoms, antigen_sugars):
|
|
878
|
+
# Keep G domain atoms; Get the Helix region of MHC
|
|
879
|
+
mh_atoms[mh.id] = [
|
|
880
|
+
atom
|
|
881
|
+
for atom in mh.get_atoms()
|
|
882
|
+
if (atom.id == "CB" or atom.id == "CA") and atom.region == "Helix"
|
|
883
|
+
]
|
|
884
|
+
if isinstance(mh, MHC) and mh.MHC_type == "MH1":
|
|
885
|
+
MH1, B2M = mh.get_MH1(), mh.get_B2M()
|
|
886
|
+
if MH1 is not None:
|
|
887
|
+
antigen_hetatoms[mh.MH1], antigen_sugars[mh.MH1] = (
|
|
888
|
+
self._find_chain_hetatoms(MH1)
|
|
889
|
+
)
|
|
890
|
+
else:
|
|
891
|
+
GA1 = mh.get_GA1()
|
|
892
|
+
antigen_hetatoms[mh.GA1], antigen_sugars[mh.GA1] = (
|
|
893
|
+
self._find_chain_hetatoms(GA1)
|
|
894
|
+
)
|
|
895
|
+
if B2M is not None: # handle single chain MH1 case
|
|
896
|
+
antigen_hetatoms[mh.B2M], antigen_sugars[mh.B2M] = (
|
|
897
|
+
self._find_chain_hetatoms(B2M)
|
|
898
|
+
)
|
|
899
|
+
|
|
900
|
+
elif isinstance(mh, MHC) and mh.MHC_type == "CD1":
|
|
901
|
+
CD1, B2M = mh.get_CD1(), mh.get_B2M()
|
|
902
|
+
if CD1 is not None:
|
|
903
|
+
antigen_hetatoms[mh.CD1], antigen_sugars[mh.CD1] = (
|
|
904
|
+
self._find_chain_hetatoms(CD1)
|
|
905
|
+
)
|
|
906
|
+
if B2M is not None:
|
|
907
|
+
antigen_hetatoms[mh.B2M], antigen_sugars[mh.B2M] = (
|
|
908
|
+
self._find_chain_hetatoms(B2M)
|
|
909
|
+
)
|
|
910
|
+
|
|
911
|
+
elif isinstance(mh, MHC) and mh.MHC_type == "MR1":
|
|
912
|
+
MR1, B2M = mh.get_MR1(), mh.get_B2M()
|
|
913
|
+
antigen_hetatoms[mh.MR1], antigen_sugars[mh.MR1] = (
|
|
914
|
+
self._find_chain_hetatoms(MR1)
|
|
915
|
+
)
|
|
916
|
+
antigen_hetatoms[mh.B2M], antigen_sugars[mh.B2M] = (
|
|
917
|
+
self._find_chain_hetatoms(B2M)
|
|
918
|
+
)
|
|
802
919
|
|
|
920
|
+
elif isinstance(mh, MHC) and mh.MHC_type == "MH2":
|
|
921
|
+
GA, GB = mh.get_GA(), mh.get_GB()
|
|
922
|
+
antigen_hetatoms[mh.GA], antigen_sugars[mh.GA] = self._find_chain_hetatoms(
|
|
923
|
+
GA
|
|
924
|
+
)
|
|
925
|
+
antigen_hetatoms[mh.GB], antigen_sugars[mh.GB] = self._find_chain_hetatoms(
|
|
926
|
+
GB
|
|
927
|
+
)
|
|
928
|
+
|
|
929
|
+
# Unpaired MHC chains -- if any, go here.
|
|
930
|
+
elif isinstance(mh, MHCchain):
|
|
931
|
+
antigen_hetatoms[mh.id], antigen_sugars[mh.id] = self._find_chain_hetatoms(
|
|
932
|
+
mh
|
|
933
|
+
)
|
|
934
|
+
|
|
935
|
+
def _prepare_tcrs_mhcs_and_antigens_for_pairing(
|
|
936
|
+
self,
|
|
937
|
+
model,
|
|
938
|
+
tcell_receptors,
|
|
939
|
+
mhc_complexes,
|
|
940
|
+
agchains,
|
|
941
|
+
crystal_contacts,
|
|
942
|
+
):
|
|
803
943
|
# Initialise 5 dictionaries which carries a list of atoms per chain ID.
|
|
804
944
|
antigen_atoms, cdr_atoms, mh_atoms, antigen_hetatoms, antigen_sugars = (
|
|
805
945
|
defaultdict(list),
|
|
@@ -811,113 +951,11 @@ class TCRParser(PDBParser, MMCIFParser):
|
|
|
811
951
|
|
|
812
952
|
# Look through TCR and MHC and see if there are any weird hetatoms and sugars in the structure.
|
|
813
953
|
for tr in tcell_receptors:
|
|
814
|
-
|
|
815
|
-
# Only get CDR3?
|
|
816
|
-
if "3" not in cdr.id:
|
|
817
|
-
continue
|
|
818
|
-
# only look at CA or CB atoms of the CDR; this is used later.
|
|
819
|
-
cdr_atoms[tr.id] += [
|
|
820
|
-
atom
|
|
821
|
-
for atom in cdr.get_atoms()
|
|
822
|
-
if atom.id == "CB" or atom.id == "CA"
|
|
823
|
-
]
|
|
824
|
-
|
|
825
|
-
# get TCR type and get chain's hetatoms accordingly
|
|
826
|
-
if isinstance(tr, TCR) and tr.get_TCR_type() == "abTCR":
|
|
827
|
-
beta_chain = tr.get_VB()
|
|
828
|
-
alpha_chain = tr.get_VA()
|
|
829
|
-
|
|
830
|
-
antigen_hetatoms[tr.VB], antigen_sugars[tr.VB] = (
|
|
831
|
-
self._find_chain_hetatoms(beta_chain)
|
|
832
|
-
)
|
|
833
|
-
antigen_hetatoms[tr.VA], antigen_sugars[tr.VA] = (
|
|
834
|
-
self._find_chain_hetatoms(alpha_chain)
|
|
835
|
-
)
|
|
836
|
-
|
|
837
|
-
elif isinstance(tr, TCR) and tr.get_TCR_type() == "gdTCR":
|
|
838
|
-
delta_chain = tr.get_VD()
|
|
839
|
-
gamma_chain = tr.get_VG()
|
|
840
|
-
antigen_hetatoms[tr.VD], antigen_sugars[tr.VD] = (
|
|
841
|
-
self._find_chain_hetatoms(delta_chain)
|
|
842
|
-
)
|
|
843
|
-
antigen_hetatoms[tr.VG], antigen_sugars[tr.VG] = (
|
|
844
|
-
self._find_chain_hetatoms(gamma_chain)
|
|
845
|
-
)
|
|
846
|
-
|
|
847
|
-
elif isinstance(tr, TCR) and tr.get_TCR_type() == "dbTCR":
|
|
848
|
-
beta_chain = tr.get_VB()
|
|
849
|
-
delta_chain = tr.get_VD()
|
|
850
|
-
antigen_hetatoms[tr.VB], antigen_sugars[tr.VB] = (
|
|
851
|
-
self._find_chain_hetatoms(beta_chain)
|
|
852
|
-
)
|
|
853
|
-
antigen_hetatoms[tr.VD], antigen_sugars[tr.VD] = (
|
|
854
|
-
self._find_chain_hetatoms(delta_chain)
|
|
855
|
-
)
|
|
856
|
-
|
|
857
|
-
# Unpaired TCR chain
|
|
858
|
-
elif isinstance(tr, TCRchain):
|
|
859
|
-
antigen_hetatoms[tr.id], antigen_sugars[tr.id] = (
|
|
860
|
-
self._find_chain_hetatoms(tr)
|
|
861
|
-
)
|
|
954
|
+
self._prepare_tcr(tr, cdr_atoms, antigen_hetatoms, antigen_sugars)
|
|
862
955
|
|
|
863
956
|
# Do the same for MHC.
|
|
864
957
|
for mh in mhc_complexes:
|
|
865
|
-
|
|
866
|
-
mh_atoms[mh.id] = [
|
|
867
|
-
atom
|
|
868
|
-
for atom in mh.get_atoms()
|
|
869
|
-
if (atom.id == "CB" or atom.id == "CA") and atom.region == "Helix"
|
|
870
|
-
]
|
|
871
|
-
if isinstance(mh, MHC) and mh.MHC_type == "MH1":
|
|
872
|
-
MH1, B2M = mh.get_MH1(), mh.get_B2M()
|
|
873
|
-
if MH1 is not None:
|
|
874
|
-
antigen_hetatoms[mh.MH1], antigen_sugars[mh.MH1] = (
|
|
875
|
-
self._find_chain_hetatoms(MH1)
|
|
876
|
-
)
|
|
877
|
-
else:
|
|
878
|
-
GA1 = mh.get_GA1()
|
|
879
|
-
antigen_hetatoms[mh.GA1], antigen_sugars[mh.GA1] = (
|
|
880
|
-
self._find_chain_hetatoms(GA1)
|
|
881
|
-
)
|
|
882
|
-
if B2M is not None: # handle single chain MH1 case
|
|
883
|
-
antigen_hetatoms[mh.B2M], antigen_sugars[mh.B2M] = (
|
|
884
|
-
self._find_chain_hetatoms(B2M)
|
|
885
|
-
)
|
|
886
|
-
|
|
887
|
-
elif isinstance(mh, MHC) and mh.MHC_type == "CD1":
|
|
888
|
-
CD1, B2M = mh.get_CD1(), mh.get_B2M()
|
|
889
|
-
if CD1 is not None:
|
|
890
|
-
antigen_hetatoms[mh.CD1], antigen_sugars[mh.CD1] = (
|
|
891
|
-
self._find_chain_hetatoms(CD1)
|
|
892
|
-
)
|
|
893
|
-
if B2M is not None:
|
|
894
|
-
antigen_hetatoms[mh.B2M], antigen_sugars[mh.B2M] = (
|
|
895
|
-
self._find_chain_hetatoms(B2M)
|
|
896
|
-
)
|
|
897
|
-
|
|
898
|
-
elif isinstance(mh, MHC) and mh.MHC_type == "MR1":
|
|
899
|
-
MR1, B2M = mh.get_MR1(), mh.get_B2M()
|
|
900
|
-
antigen_hetatoms[mh.MR1], antigen_sugars[mh.MR1] = (
|
|
901
|
-
self._find_chain_hetatoms(MR1)
|
|
902
|
-
)
|
|
903
|
-
antigen_hetatoms[mh.B2M], antigen_sugars[mh.B2M] = (
|
|
904
|
-
self._find_chain_hetatoms(B2M)
|
|
905
|
-
)
|
|
906
|
-
|
|
907
|
-
elif isinstance(mh, MHC) and mh.MHC_type == "MH2":
|
|
908
|
-
GA, GB = mh.get_GA(), mh.get_GB()
|
|
909
|
-
antigen_hetatoms[mh.GA], antigen_sugars[mh.GA] = (
|
|
910
|
-
self._find_chain_hetatoms(GA)
|
|
911
|
-
)
|
|
912
|
-
antigen_hetatoms[mh.GB], antigen_sugars[mh.GB] = (
|
|
913
|
-
self._find_chain_hetatoms(GB)
|
|
914
|
-
)
|
|
915
|
-
|
|
916
|
-
# Unpaired MHC chains -- if any, go here.
|
|
917
|
-
elif isinstance(mh, MHCchain):
|
|
918
|
-
antigen_hetatoms[mh.id], antigen_sugars[mh.id] = (
|
|
919
|
-
self._find_chain_hetatoms(mh)
|
|
920
|
-
)
|
|
958
|
+
self._prepare_mhc(mh, mh_atoms, antigen_hetatoms, antigen_sugars)
|
|
921
959
|
|
|
922
960
|
for antigen in agchains:
|
|
923
961
|
antigen_atoms[antigen.id] = [
|
|
@@ -963,7 +1001,18 @@ class TCRParser(PDBParser, MMCIFParser):
|
|
|
963
1001
|
|
|
964
1002
|
# If a TCR does not have a detected MHC chain, then skip the remaining MHC-specific parsing bits.
|
|
965
1003
|
if not mhc_complexes:
|
|
966
|
-
return
|
|
1004
|
+
return (
|
|
1005
|
+
model,
|
|
1006
|
+
tcell_receptors,
|
|
1007
|
+
mhc_complexes,
|
|
1008
|
+
agchains,
|
|
1009
|
+
crystal_contacts,
|
|
1010
|
+
antigen_atoms,
|
|
1011
|
+
cdr_atoms,
|
|
1012
|
+
mh_atoms,
|
|
1013
|
+
antigen_hetatoms,
|
|
1014
|
+
antigen_sugars,
|
|
1015
|
+
)
|
|
967
1016
|
|
|
968
1017
|
# Have a very tight cutoff for MHCs that present het atoms (e.g. CD1 types)
|
|
969
1018
|
self._het_sugar_pass(
|
|
@@ -979,14 +1028,36 @@ class TCRParser(PDBParser, MMCIFParser):
|
|
|
979
1028
|
self._protein_peptide_pass(
|
|
980
1029
|
model, mhc_complexes, mh_atoms, antigen_atoms, crystal_contacts
|
|
981
1030
|
)
|
|
1031
|
+
return (
|
|
1032
|
+
model,
|
|
1033
|
+
tcell_receptors,
|
|
1034
|
+
mhc_complexes,
|
|
1035
|
+
agchains,
|
|
1036
|
+
crystal_contacts,
|
|
1037
|
+
antigen_atoms,
|
|
1038
|
+
cdr_atoms,
|
|
1039
|
+
mh_atoms,
|
|
1040
|
+
antigen_hetatoms,
|
|
1041
|
+
antigen_sugars,
|
|
1042
|
+
)
|
|
982
1043
|
|
|
1044
|
+
def _pair_tcr_and_mhc(
|
|
1045
|
+
self,
|
|
1046
|
+
model,
|
|
1047
|
+
tcell_receptors,
|
|
1048
|
+
mhc_complexes,
|
|
1049
|
+
cdr_atoms,
|
|
1050
|
+
mh_atoms,
|
|
1051
|
+
crystal_contacts,
|
|
1052
|
+
already_paired_tr_mh=set(),
|
|
1053
|
+
):
|
|
983
1054
|
# Pair a TCR with an MHC and vice-versa; go through all possible combinations of TCR/MHC
|
|
984
1055
|
# We see if a CB/CA atom of the helix region of an MHC is within 8A of a TCR CDR loop's CB/CA atoms.
|
|
985
1056
|
# This is similar to the _protein_peptide_pass algorithm; we find the number of contacts between MHC and TCR,
|
|
986
1057
|
# and use the MHC with highest no. of contacts
|
|
987
1058
|
contact_freq = defaultdict(int)
|
|
988
|
-
tr_mh_pairs = list(product(tcell_receptors, mhc_complexes))
|
|
989
1059
|
|
|
1060
|
+
tr_mh_pairs = list(product(tcell_receptors, mhc_complexes))
|
|
990
1061
|
for tr, mh in tr_mh_pairs:
|
|
991
1062
|
ns = NeighborSearch(cdr_atoms[tr.id])
|
|
992
1063
|
for atom in mh_atoms[mh.id]:
|
|
@@ -999,15 +1070,106 @@ class TCRParser(PDBParser, MMCIFParser):
|
|
|
999
1070
|
sorted_contacts = sorted(
|
|
1000
1071
|
list(contact_freq.items()), key=lambda z: z[1], reverse=True
|
|
1001
1072
|
)
|
|
1002
|
-
paired_tr_mh = set()
|
|
1073
|
+
paired_tr_mh = set() if not already_paired_tr_mh else already_paired_tr_mh
|
|
1003
1074
|
for pair, contacts in sorted_contacts:
|
|
1004
1075
|
tr, mh = pair
|
|
1005
1076
|
# If the TCR has already been paired, or if we know that the TCR and MHC are forming crystal contacts, move on.
|
|
1006
1077
|
if tr in paired_tr_mh or (tr, mh) in crystal_contacts:
|
|
1007
1078
|
continue
|
|
1079
|
+
if mh not in model:
|
|
1080
|
+
model.add([mhc for mhc in mhc_complexes if mhc.id == mh][0])
|
|
1008
1081
|
model[tr]._add_mhc(model[mh])
|
|
1009
1082
|
model[mh]._add_tcr(model[tr])
|
|
1010
1083
|
paired_tr_mh.add(tr)
|
|
1084
|
+
return model, paired_tr_mh
|
|
1085
|
+
|
|
1086
|
+
def _match_units(self, model, trchains, mhchains, agchains, crystal_contacts=[]):
|
|
1087
|
+
"""
|
|
1088
|
+
Match MHC+Peptide chains to TCR chains.
|
|
1089
|
+
model is the current model - extract the TCRs from it (paired chains have been removed)
|
|
1090
|
+
trchains contains those TCR chains that have been unable to be paired to form TCRs
|
|
1091
|
+
agchains contains non-TCR chains that are potential antigens.
|
|
1092
|
+
|
|
1093
|
+
Goal: Match TCR <-> MHC + peptide antigen.
|
|
1094
|
+
"""
|
|
1095
|
+
# Get all T-cell receptor-like objects (TCR, TCRchain), and MHC-like objects.
|
|
1096
|
+
tcell_receptors = [h for h in model if isinstance(h, TCR)] + trchains.child_list
|
|
1097
|
+
mhc_complexes = [h for h in model if isinstance(h, MHC)] + mhchains.child_list
|
|
1098
|
+
|
|
1099
|
+
(
|
|
1100
|
+
model,
|
|
1101
|
+
tcell_receptors,
|
|
1102
|
+
mhc_complexes,
|
|
1103
|
+
agchains,
|
|
1104
|
+
crystal_contacts,
|
|
1105
|
+
antigen_atoms,
|
|
1106
|
+
cdr_atoms,
|
|
1107
|
+
mh_atoms,
|
|
1108
|
+
antigen_hetatoms,
|
|
1109
|
+
antigen_sugars,
|
|
1110
|
+
) = self._prepare_tcrs_mhcs_and_antigens_for_pairing(
|
|
1111
|
+
model,
|
|
1112
|
+
tcell_receptors,
|
|
1113
|
+
mhc_complexes,
|
|
1114
|
+
agchains,
|
|
1115
|
+
crystal_contacts,
|
|
1116
|
+
)
|
|
1117
|
+
|
|
1118
|
+
model, paired_tr_mh = self._pair_tcr_and_mhc(
|
|
1119
|
+
model=model,
|
|
1120
|
+
tcell_receptors=tcell_receptors,
|
|
1121
|
+
mhc_complexes=mhc_complexes,
|
|
1122
|
+
cdr_atoms=cdr_atoms,
|
|
1123
|
+
mh_atoms=mh_atoms,
|
|
1124
|
+
crystal_contacts=crystal_contacts,
|
|
1125
|
+
)
|
|
1126
|
+
|
|
1127
|
+
if (
|
|
1128
|
+
self.include_symmetry_mates
|
|
1129
|
+
and len(paired_tr_mh) != len(tcell_receptors)
|
|
1130
|
+
and len(mhc_complexes) > 0
|
|
1131
|
+
): # check if all TCRs have been paired if MHC is present.
|
|
1132
|
+
# try searching for symmetry mates
|
|
1133
|
+
try:
|
|
1134
|
+
symmetry_mates = self._generate_symmetry_mates()
|
|
1135
|
+
mhc_complexes.extend([m for t in symmetry_mates for m in t.get_MHCs()])
|
|
1136
|
+
except Exception as e:
|
|
1137
|
+
warnings.warn(f"Symmetry mate generation failed with: {str(e)}")
|
|
1138
|
+
(
|
|
1139
|
+
model,
|
|
1140
|
+
tcell_receptors,
|
|
1141
|
+
mhc_complexes,
|
|
1142
|
+
agchains,
|
|
1143
|
+
crystal_contacts,
|
|
1144
|
+
antigen_atoms,
|
|
1145
|
+
cdr_atoms,
|
|
1146
|
+
mh_atoms,
|
|
1147
|
+
antigen_hetatoms,
|
|
1148
|
+
antigen_sugars,
|
|
1149
|
+
) = self._prepare_tcrs_mhcs_and_antigens_for_pairing(
|
|
1150
|
+
model,
|
|
1151
|
+
tcell_receptors,
|
|
1152
|
+
mhc_complexes,
|
|
1153
|
+
agchains,
|
|
1154
|
+
crystal_contacts,
|
|
1155
|
+
)
|
|
1156
|
+
model, paired_tr_mh = self._pair_tcr_and_mhc(
|
|
1157
|
+
model,
|
|
1158
|
+
tcell_receptors,
|
|
1159
|
+
mhc_complexes,
|
|
1160
|
+
cdr_atoms,
|
|
1161
|
+
mh_atoms,
|
|
1162
|
+
crystal_contacts,
|
|
1163
|
+
already_paired_tr_mh=paired_tr_mh,
|
|
1164
|
+
)
|
|
1165
|
+
|
|
1166
|
+
def _generate_symmetry_mates(self):
|
|
1167
|
+
print("Generating symmetry mates to pair antigens.")
|
|
1168
|
+
from .utils.symmetry_mates import (
|
|
1169
|
+
get_symmetry_mates,
|
|
1170
|
+
) # import here to avoid circular import
|
|
1171
|
+
|
|
1172
|
+
return get_symmetry_mates(self.current_file)
|
|
1011
1173
|
|
|
1012
1174
|
def _protein_peptide_pass(
|
|
1013
1175
|
self, model, complexes, receptor_atoms, antigen_atoms, crystal_contacts=[]
|
|
@@ -45,7 +45,12 @@ def call_anarci(
|
|
|
45
45
|
Returns:
|
|
46
46
|
numbering, chain type, germline information
|
|
47
47
|
"""
|
|
48
|
-
|
|
48
|
+
try:
|
|
49
|
+
from anarci import number as anarci_number
|
|
50
|
+
except ImportError as e:
|
|
51
|
+
f"""ANARCI import failed, is ANARCI installed and built? \nInstall ANARCI MHC with: \npip install anarci-mhc \n
|
|
52
|
+
Once installed, build the HMMs with: \nANARCI --build_models. \nError raised was {e}"""
|
|
53
|
+
raise e
|
|
49
54
|
|
|
50
55
|
numbering, chain_type, germline_info = anarci_number(
|
|
51
56
|
seq, allow=allow, assign_germline=True
|
|
@@ -10,6 +10,15 @@ IMGT_CDR_BOUNDARIES = {
|
|
|
10
10
|
"3": {"imgt": (105, 117)},
|
|
11
11
|
}
|
|
12
12
|
|
|
13
|
+
IMGT_VARIABLE_DOMAIN: set[int] = set(range(1, 128 + 1))
|
|
14
|
+
'''Variable domain range for IMGT numbered immunoglobulin structures.'''
|
|
15
|
+
|
|
16
|
+
IMGT_MH1_ABD: set[int] = set(range(1, 92)) | set(range(1001, 1092))
|
|
17
|
+
'''IMGT ranges of the antigen binding domain of MHC class I molecules.'''
|
|
18
|
+
|
|
19
|
+
IMGT_MH2_ABD: set[int] = set(range(1, 92))
|
|
20
|
+
'''IMGT ranges of the antigen binding domain of MHC class II molecules.'''
|
|
21
|
+
|
|
13
22
|
# regions for TCR
|
|
14
23
|
_regions = {"imgt": {}}
|
|
15
24
|
_regions["imgt"]["A"] = _regions["imgt"]["B"] = (
|