stcrpy 1.0.3__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,12 +8,14 @@ TCRParser object which is based on ABDB's AntibodyParser and BioPython's PDB par
8
8
  from itertools import combinations, product
9
9
  import sys
10
10
  import os
11
+ import tempfile
11
12
  from collections import defaultdict
12
13
  import warnings
13
14
 
14
15
  from Bio.PDB.PDBParser import PDBParser
15
16
  from Bio.PDB.MMCIFParser import MMCIFParser
16
17
  from Bio.PDB import NeighborSearch
18
+ from Bio.PDB import PDBIO
17
19
 
18
20
  # TCRDB
19
21
  from .annotate import annotate, extract_sequence, align_numbering
@@ -58,6 +60,10 @@ class TCRParser(PDBParser, MMCIFParser):
58
60
  self.numbering_scheme = "imgt"
59
61
  self.definition = "imgt"
60
62
 
63
+ self.current_file = (
64
+ None # the current file being processed, populated by get_tcr_structure
65
+ )
66
+
61
67
  def _create_chain(self, chain, new_chain_id, numbering, chain_type):
62
68
  """
63
69
  Create a new TCR or MHC chain.
@@ -194,19 +200,62 @@ class TCRParser(PDBParser, MMCIFParser):
194
200
 
195
201
  return newchain1, newchain2
196
202
 
197
- def get_tcr_structure(
198
- self, id, file, prenumbering=None, ali_dict={}, crystal_contacts=[]
199
- ):
200
- """
201
- Post processing of the TCRPDB.Bio.PDB structure object into a TCR context.
203
+ def _number_and_annotate_chain(self, chain, prenumbering=None, ali_dict={}):
204
+ # try to number the sequence found in the structure
205
+ if prenumbering and chain.id in prenumbering:
206
+ if len(prenumbering[chain.id]) == 2:
207
+ numbering = [{}, {}]
208
+ region_types = ["", ""]
202
209
 
203
- id: a string to identify the structure
204
- file: the path to the .pdb file
210
+ numbering[0], region_types[0] = self._prenumbered(
211
+ chain, prenumbering, ali_dict, n=0
212
+ )
213
+ numbering[1], region_types[1] = self._prenumbered(
214
+ chain, prenumbering, ali_dict, n=1
215
+ )
216
+ rtypes = sorted(region_types)
205
217
 
206
- optional:
207
- prenumbering: prenumbering for the chains in the structure.
208
- """
209
- self.warnings = ErrorStream()
218
+ # Check that we have a beta/alpha domain or gamma/delta domain
219
+ if rtypes == ["A", "B"] or rtypes == ["D", "G"]:
220
+ chain_type = "".join(region_types)
221
+ scTCR = True
222
+ # if not, just take the first region and warn the user
223
+ else:
224
+ chain_type = region_types[0]
225
+ numbering = numbering[0]
226
+ scTCR = False
227
+ print(
228
+ "Warning multiple variable regions of the same type (%s) found on chain %s.\nTaking the first variable region only."
229
+ % (chain_type, chain.id),
230
+ file=self.warnings,
231
+ )
232
+
233
+ elif prenumbering[chain.id][0][-1] not in ["B", "A", "D", "G"]:
234
+ numbering, chain_type, scTCR = annotate(chain)
235
+
236
+ else:
237
+ numbering, chain_type = self._prenumbered(
238
+ chain, prenumbering, ali_dict, n=0
239
+ )
240
+ scTCR = False
241
+
242
+ else:
243
+ numbering, chain_type, germline_info, scTCR = annotate(chain)
244
+
245
+ return numbering, chain_type, germline_info, scTCR
246
+
247
+ def _get_header_info(self, tcrstructure, chain, germline_info):
248
+ if chain.id in tcrstructure.header["chain_details"]: # clean this up!!!
249
+ engineered = tcrstructure.header["chain_details"][chain.id]["engineered"]
250
+ details = tcrstructure.header["chain_details"][chain.id]
251
+ else:
252
+ engineered = False
253
+ details = {"molecule": "unknown", "engineered": False}
254
+
255
+ details["genetic_origin"] = germline_info
256
+ return details, engineered
257
+
258
+ def _read_structure_file(self, file, id):
210
259
  # get a structure object from biopython.
211
260
  _, ext = os.path.splitext(file)
212
261
  if ext.lower() == ".pdb":
@@ -225,6 +274,45 @@ class TCRParser(PDBParser, MMCIFParser):
225
274
  # Set and analyse header information
226
275
  tcrstructure.set_header(structure.header)
227
276
  self._analyse_header(tcrstructure)
277
+ return structure, tcrstructure
278
+
279
def _initialise_model(self, model):
    """
    Create an empty model carrying the three chain holders.

    model: the source model; only model.id is read

    Returns (newmodel, agchains, trchains, mhchains) where the holders for
    antigen (non-TCR/non-MHC), TCR and MHC chains have already been added to
    newmodel, in that order.
    """
    newmodel = Model(model.id)

    # One holder per category of chain: antigen, TCR and MHC.
    agchains, trchains, mhchains = (
        Holder(label) for label in ("Antigen", "TCRchain", "MHCchain")
    )
    for holder in (agchains, trchains, mhchains):
        newmodel.add(holder)

    return newmodel, agchains, trchains, mhchains
290
+
291
+ def get_tcr_structure(
292
+ self,
293
+ id,
294
+ file,
295
+ prenumbering=None,
296
+ ali_dict={},
297
+ crystal_contacts=[],
298
+ include_symmetry_mates=True,
299
+ ):
300
+ """
301
+ Post processing of the TCRPDB.Bio.PDB structure object into a TCR context.
302
+
303
+ id: a string to identify the structure
304
+ file: the path to the .pdb file
305
+
306
+ optional:
307
+ prenumbering: prenumbering for the chains in the structure.
308
+ """
309
+ self.warnings = ErrorStream()
310
+ self.include_symmetry_mates = include_symmetry_mates
311
+ self.current_file = file
312
+
313
+ structure, tcrstructure = self._read_structure_file(
314
+ file, id
315
+ ) # structure: Bio.PDB.Structure from file; tcrstructure: initialised empty TCRStructure object to be populated
228
316
 
229
317
  # iterate over the models in the structure
230
318
  # iterate backwards through the model list - delete old structure as we go
@@ -233,70 +321,18 @@ class TCRParser(PDBParser, MMCIFParser):
233
321
  for mid in range(len(structure.child_list) - 1, -1, -1):
234
322
  # add a model to the TCR structure
235
323
  model = structure.child_list[mid]
236
- newmodel = Model(model.id)
324
+ newmodel, agchains, trchains, mhchains = self._initialise_model(model)
237
325
  tcrstructure.add(newmodel)
238
326
 
239
- # initialise holder objects for holding TCR, MHC and non-TCR/non-MHC (antigen) chains.
240
- agchains = Holder("Antigen")
241
- trchains = Holder("TCRchain")
242
- mhchains = Holder("MHCchain")
243
- newmodel.add(agchains)
244
- newmodel.add(trchains)
245
- newmodel.add(mhchains)
246
-
247
327
  # iterate over the chains in the model
248
328
  for chain in model.get_list():
249
- # try to number the sequence found in the structure
250
- if prenumbering and chain.id in prenumbering:
251
- if len(prenumbering[chain.id]) == 2:
252
- numbering = [{}, {}]
253
- region_types = ["", ""]
254
-
255
- numbering[0], region_types[0] = self._prenumbered(
256
- chain, prenumbering, ali_dict, n=0
257
- )
258
- numbering[1], region_types[1] = self._prenumbered(
259
- chain, prenumbering, ali_dict, n=1
260
- )
261
- rtypes = sorted(region_types)
262
-
263
- # Check that we have a beta/alpha domain or gamma/delta domain
264
- if rtypes == ["A", "B"] or rtypes == ["D", "G"]:
265
- chain_type = "".join(region_types)
266
- scTCR = True
267
- # if not, just take the first region and warn the user
268
- else:
269
- chain_type = region_types[0]
270
- numbering = numbering[0]
271
- scTCR = False
272
- print(
273
- "Warning multiple variable regions of the same type (%s) found on chain %s.\nTaking the first variable region only."
274
- % (chain_type, chain.id),
275
- file=self.warnings,
276
- )
277
-
278
- elif prenumbering[chain.id][0][-1] not in ["B", "A", "D", "G"]:
279
- numbering, chain_type, scTCR = annotate(chain)
280
-
281
- else:
282
- numbering, chain_type = self._prenumbered(
283
- chain, prenumbering, ali_dict, n=0
284
- )
285
- scTCR = False
286
-
287
- else:
288
- numbering, chain_type, germline_info, scTCR = annotate(chain)
289
-
290
- if chain.id in tcrstructure.header["chain_details"]: # clean this up!!!
291
- engineered = tcrstructure.header["chain_details"][chain.id][
292
- "engineered"
293
- ]
294
- details = tcrstructure.header["chain_details"][chain.id]
295
- else:
296
- engineered = False
297
- details = {"molecule": "unknown", "engineered": False}
329
+ numbering, chain_type, germline_info, scTCR = (
330
+ self._number_and_annotate_chain(chain, prenumbering, ali_dict)
331
+ )
298
332
 
299
- details["genetic_origin"] = germline_info
333
+ details, engineered = self._get_header_info(
334
+ tcrstructure, chain, germline_info
335
+ )
300
336
 
301
337
  if numbering and chain_type in ["G", "D", "B", "A"]:
302
338
  # create a new TCR chain
@@ -350,7 +386,9 @@ class TCRParser(PDBParser, MMCIFParser):
350
386
  elif not obs_chaintypes - set(["G", "D"]):
351
387
  tcr = gdTCR(chain1, chain2)
352
388
  elif not obs_chaintypes - set(["B", "D"]):
353
- tcr = abTCR(chain1, chain2) # initial way to deal with narci missclassification of alpha chains as delta chains
389
+ tcr = abTCR(
390
+ chain1, chain2
391
+ ) # initial way to deal with anarci misclassification of alpha chains as delta chains
354
392
  # tcr = dbTCR(chain1, chain2)
355
393
 
356
394
  tcr.scTCR = True #
@@ -503,6 +541,7 @@ class TCRParser(PDBParser, MMCIFParser):
503
541
  sys.stderr.write("\n")
504
542
  tcrstructure.warnings = self.warnings
505
543
 
544
+ self.current_file = None # reset the current file
506
545
  return tcrstructure
507
546
 
508
547
  def _analyse_header(self, header):
@@ -787,19 +826,120 @@ class TCRParser(PDBParser, MMCIFParser):
787
826
 
788
827
  return hetatoms, sugars
789
828
 
790
- def _match_units(self, model, trchains, mhchains, agchains, crystal_contacts=[]):
791
- """
792
- Match MHC+Peptide chains to TCR chains.
793
- model is the current model - extract the TCRs from it (paired chains have been removed)
794
- trchains contains those TCR chains that have been unable to be paired to form TCRs
795
- agchains contains non-TCR chains that are potential antigens.
829
def _prepare_tcr(self, tr, cdr_atoms, antigen_hetatoms, antigen_sugars):
    """
    Record the CDR3 atoms and the hetatoms/sugars belonging to one receptor.

    tr: a TCR or an unpaired TCRchain
    cdr_atoms: mapping updated in place; CA/CB atoms of every CDR whose id
        contains "3" are appended under tr.id
    antigen_hetatoms, antigen_sugars: mappings updated in place with the two
        results of self._find_chain_hetatoms for each chain of the receptor
    """
    # Only CDR3 loops are considered, and only their CA/CB atoms; this is used later.
    for cdr in tr.get_CDRs():
        if "3" in cdr.id:
            cdr_atoms[tr.id] += [
                atom for atom in cdr.get_atoms() if atom.id in ("CB", "CA")
            ]

    # Variable domains to inspect for each paired TCR type, in processing order.
    domains_by_type = {
        "abTCR": ("VB", "VA"),
        "gdTCR": ("VD", "VG"),
        "dbTCR": ("VB", "VD"),
    }
    domains = domains_by_type.get(tr.get_TCR_type()) if isinstance(tr, TCR) else None

    if domains is not None:
        # Fetch both chains up front, then search each for hetatoms and sugars.
        chains = [getattr(tr, "get_" + domain)() for domain in domains]
        for domain, chain in zip(domains, chains):
            hetatoms, sugars = self._find_chain_hetatoms(chain)
            key = getattr(tr, domain)
            antigen_hetatoms[key] = hetatoms
            antigen_sugars[key] = sugars

    # Unpaired TCR chain
    elif isinstance(tr, TCRchain):
        hetatoms, sugars = self._find_chain_hetatoms(tr)
        antigen_hetatoms[tr.id] = hetatoms
        antigen_sugars[tr.id] = sugars
876
+
877
def _prepare_mhc(self, mh, mh_atoms, antigen_hetatoms, antigen_sugars):
    """
    Record the helix atoms and the hetatoms/sugars belonging to one MHC.

    mh: an MHC or an unpaired MHCchain
    mh_atoms: mapping updated in place; mh.id is set to the CA/CB atoms of mh
        whose region is "Helix"
    antigen_hetatoms, antigen_sugars: mappings updated in place with the two
        results of self._find_chain_hetatoms for each chain of the MHC
    """

    def _record(attr, chain):
        # Search one chain and file the results under the id held in mh.<attr>.
        hetatoms, sugars = self._find_chain_hetatoms(chain)
        key = getattr(mh, attr)
        antigen_hetatoms[key] = hetatoms
        antigen_sugars[key] = sugars

    # Keep G domain atoms; get the Helix region of MHC
    helix_atoms = []
    for atom in mh.get_atoms():
        if atom.id in ("CB", "CA") and atom.region == "Helix":
            helix_atoms.append(atom)
    mh_atoms[mh.id] = helix_atoms

    if isinstance(mh, MHC):
        mhc_type = mh.MHC_type

        if mhc_type == "MH1":
            MH1, B2M = mh.get_MH1(), mh.get_B2M()
            if MH1 is None:
                _record("GA1", mh.get_GA1())
            else:
                _record("MH1", MH1)
            if B2M is not None:  # handle single chain MH1 case
                _record("B2M", B2M)
            return

        if mhc_type == "CD1":
            CD1, B2M = mh.get_CD1(), mh.get_B2M()
            if CD1 is not None:
                _record("CD1", CD1)
            if B2M is not None:
                _record("B2M", B2M)
            return

        if mhc_type == "MR1":
            MR1, B2M = mh.get_MR1(), mh.get_B2M()
            _record("MR1", MR1)
            _record("B2M", B2M)
            return

        if mhc_type == "MH2":
            GA, GB = mh.get_GA(), mh.get_GB()
            _record("GA", GA)
            _record("GB", GB)
            return

    # Unpaired MHC chains -- if any, go here.
    if isinstance(mh, MHCchain):
        _record("id", mh)
934
+
935
+ def _prepare_tcrs_mhcs_and_antigens_for_pairing(
936
+ self,
937
+ model,
938
+ tcell_receptors,
939
+ mhc_complexes,
940
+ agchains,
941
+ crystal_contacts,
942
+ ):
803
943
  # Initialise 5 dictionaries which carries a list of atoms per chain ID.
804
944
  antigen_atoms, cdr_atoms, mh_atoms, antigen_hetatoms, antigen_sugars = (
805
945
  defaultdict(list),
@@ -811,113 +951,11 @@ class TCRParser(PDBParser, MMCIFParser):
811
951
 
812
952
  # Look through TCR and MHC and see if there are any weird hetatoms and sugars in the structure.
813
953
  for tr in tcell_receptors:
814
- for cdr in tr.get_CDRs():
815
- # Only get CDR3?
816
- if "3" not in cdr.id:
817
- continue
818
- # only look at CA or CB atoms of the CDR; this is used later.
819
- cdr_atoms[tr.id] += [
820
- atom
821
- for atom in cdr.get_atoms()
822
- if atom.id == "CB" or atom.id == "CA"
823
- ]
824
-
825
- # get TCR type and get chain's hetatoms accordingly
826
- if isinstance(tr, TCR) and tr.get_TCR_type() == "abTCR":
827
- beta_chain = tr.get_VB()
828
- alpha_chain = tr.get_VA()
829
-
830
- antigen_hetatoms[tr.VB], antigen_sugars[tr.VB] = (
831
- self._find_chain_hetatoms(beta_chain)
832
- )
833
- antigen_hetatoms[tr.VA], antigen_sugars[tr.VA] = (
834
- self._find_chain_hetatoms(alpha_chain)
835
- )
836
-
837
- elif isinstance(tr, TCR) and tr.get_TCR_type() == "gdTCR":
838
- delta_chain = tr.get_VD()
839
- gamma_chain = tr.get_VG()
840
- antigen_hetatoms[tr.VD], antigen_sugars[tr.VD] = (
841
- self._find_chain_hetatoms(delta_chain)
842
- )
843
- antigen_hetatoms[tr.VG], antigen_sugars[tr.VG] = (
844
- self._find_chain_hetatoms(gamma_chain)
845
- )
846
-
847
- elif isinstance(tr, TCR) and tr.get_TCR_type() == "dbTCR":
848
- beta_chain = tr.get_VB()
849
- delta_chain = tr.get_VD()
850
- antigen_hetatoms[tr.VB], antigen_sugars[tr.VB] = (
851
- self._find_chain_hetatoms(beta_chain)
852
- )
853
- antigen_hetatoms[tr.VD], antigen_sugars[tr.VD] = (
854
- self._find_chain_hetatoms(delta_chain)
855
- )
856
-
857
- # Unpaired TCR chain
858
- elif isinstance(tr, TCRchain):
859
- antigen_hetatoms[tr.id], antigen_sugars[tr.id] = (
860
- self._find_chain_hetatoms(tr)
861
- )
954
+ self._prepare_tcr(tr, cdr_atoms, antigen_hetatoms, antigen_sugars)
862
955
 
863
956
  # Do the same for MHC.
864
957
  for mh in mhc_complexes:
865
- # Keep G domain atoms; Get the Helix region of MHC
866
- mh_atoms[mh.id] = [
867
- atom
868
- for atom in mh.get_atoms()
869
- if (atom.id == "CB" or atom.id == "CA") and atom.region == "Helix"
870
- ]
871
- if isinstance(mh, MHC) and mh.MHC_type == "MH1":
872
- MH1, B2M = mh.get_MH1(), mh.get_B2M()
873
- if MH1 is not None:
874
- antigen_hetatoms[mh.MH1], antigen_sugars[mh.MH1] = (
875
- self._find_chain_hetatoms(MH1)
876
- )
877
- else:
878
- GA1 = mh.get_GA1()
879
- antigen_hetatoms[mh.GA1], antigen_sugars[mh.GA1] = (
880
- self._find_chain_hetatoms(GA1)
881
- )
882
- if B2M is not None: # handle single chain MH1 case
883
- antigen_hetatoms[mh.B2M], antigen_sugars[mh.B2M] = (
884
- self._find_chain_hetatoms(B2M)
885
- )
886
-
887
- elif isinstance(mh, MHC) and mh.MHC_type == "CD1":
888
- CD1, B2M = mh.get_CD1(), mh.get_B2M()
889
- if CD1 is not None:
890
- antigen_hetatoms[mh.CD1], antigen_sugars[mh.CD1] = (
891
- self._find_chain_hetatoms(CD1)
892
- )
893
- if B2M is not None:
894
- antigen_hetatoms[mh.B2M], antigen_sugars[mh.B2M] = (
895
- self._find_chain_hetatoms(B2M)
896
- )
897
-
898
- elif isinstance(mh, MHC) and mh.MHC_type == "MR1":
899
- MR1, B2M = mh.get_MR1(), mh.get_B2M()
900
- antigen_hetatoms[mh.MR1], antigen_sugars[mh.MR1] = (
901
- self._find_chain_hetatoms(MR1)
902
- )
903
- antigen_hetatoms[mh.B2M], antigen_sugars[mh.B2M] = (
904
- self._find_chain_hetatoms(B2M)
905
- )
906
-
907
- elif isinstance(mh, MHC) and mh.MHC_type == "MH2":
908
- GA, GB = mh.get_GA(), mh.get_GB()
909
- antigen_hetatoms[mh.GA], antigen_sugars[mh.GA] = (
910
- self._find_chain_hetatoms(GA)
911
- )
912
- antigen_hetatoms[mh.GB], antigen_sugars[mh.GB] = (
913
- self._find_chain_hetatoms(GB)
914
- )
915
-
916
- # Unpaired MHC chains -- if any, go here.
917
- elif isinstance(mh, MHCchain):
918
- antigen_hetatoms[mh.id], antigen_sugars[mh.id] = (
919
- self._find_chain_hetatoms(mh)
920
- )
958
+ self._prepare_mhc(mh, mh_atoms, antigen_hetatoms, antigen_sugars)
921
959
 
922
960
  for antigen in agchains:
923
961
  antigen_atoms[antigen.id] = [
@@ -963,7 +1001,18 @@ class TCRParser(PDBParser, MMCIFParser):
963
1001
 
964
1002
  # If a TCR does not have a detected MHC chain, then skip the remaining MHC-specific parsing bits.
965
1003
  if not mhc_complexes:
966
- return
1004
+ return (
1005
+ model,
1006
+ tcell_receptors,
1007
+ mhc_complexes,
1008
+ agchains,
1009
+ crystal_contacts,
1010
+ antigen_atoms,
1011
+ cdr_atoms,
1012
+ mh_atoms,
1013
+ antigen_hetatoms,
1014
+ antigen_sugars,
1015
+ )
967
1016
 
968
1017
  # Have a very tight cutoff for MHCs that present het atoms (e.g. CD1 types)
969
1018
  self._het_sugar_pass(
@@ -979,14 +1028,36 @@ class TCRParser(PDBParser, MMCIFParser):
979
1028
  self._protein_peptide_pass(
980
1029
  model, mhc_complexes, mh_atoms, antigen_atoms, crystal_contacts
981
1030
  )
1031
+ return (
1032
+ model,
1033
+ tcell_receptors,
1034
+ mhc_complexes,
1035
+ agchains,
1036
+ crystal_contacts,
1037
+ antigen_atoms,
1038
+ cdr_atoms,
1039
+ mh_atoms,
1040
+ antigen_hetatoms,
1041
+ antigen_sugars,
1042
+ )
982
1043
 
1044
+ def _pair_tcr_and_mhc(
1045
+ self,
1046
+ model,
1047
+ tcell_receptors,
1048
+ mhc_complexes,
1049
+ cdr_atoms,
1050
+ mh_atoms,
1051
+ crystal_contacts,
1052
+ already_paired_tr_mh=set(),
1053
+ ):
983
1054
  # Pair a TCR with an MHC and vice-versa; go through all possible combinations of TCR/MHC
984
1055
  # We see if a CB/CA atom of the helix region of an MHC is within 8A of a TCR CDR loop's CB/CA atoms.
985
1056
  # This is similar to the _protein_peptide_pass algorithm; we find the number of contacts between MHC and TCR,
986
1057
  # and use the MHC with highest no. of contacts
987
1058
  contact_freq = defaultdict(int)
988
- tr_mh_pairs = list(product(tcell_receptors, mhc_complexes))
989
1059
 
1060
+ tr_mh_pairs = list(product(tcell_receptors, mhc_complexes))
990
1061
  for tr, mh in tr_mh_pairs:
991
1062
  ns = NeighborSearch(cdr_atoms[tr.id])
992
1063
  for atom in mh_atoms[mh.id]:
@@ -999,15 +1070,106 @@ class TCRParser(PDBParser, MMCIFParser):
999
1070
  sorted_contacts = sorted(
1000
1071
  list(contact_freq.items()), key=lambda z: z[1], reverse=True
1001
1072
  )
1002
- paired_tr_mh = set()
1073
+ paired_tr_mh = set() if not already_paired_tr_mh else already_paired_tr_mh
1003
1074
  for pair, contacts in sorted_contacts:
1004
1075
  tr, mh = pair
1005
1076
  # If the TCR has already been paired, or if we know that the TCR and MHC are forming crystal contacts, move on.
1006
1077
  if tr in paired_tr_mh or (tr, mh) in crystal_contacts:
1007
1078
  continue
1079
+ if mh not in model:
1080
+ model.add([mhc for mhc in mhc_complexes if mhc.id == mh][0])
1008
1081
  model[tr]._add_mhc(model[mh])
1009
1082
  model[mh]._add_tcr(model[tr])
1010
1083
  paired_tr_mh.add(tr)
1084
+ return model, paired_tr_mh
1085
+
1086
def _match_units(self, model, trchains, mhchains, agchains, crystal_contacts=[]):
    """
    Match MHC+Peptide chains to TCR chains.

    model: the current model; the TCR and MHC objects are extracted from it
    trchains: holder of the TCR chains that could not be paired to form TCRs
    mhchains: holder of the unpaired MHC chains
    agchains: non-TCR chains that are potential antigens

    optional:
        crystal_contacts: forwarded to the preparation and pairing helpers

    A first pairing pass is always run. If self.include_symmetry_mates is set,
    at least one MHC is present and the number of paired receptors differs
    from the number of receptors, the MHCs of the generated symmetry mates are
    added and a second pass is run. Returns None.
    """
    # Get all T-cell receptor-like objects (TCR, TCRchain), and MHC-like objects.
    tcell_receptors = [
        unit for unit in model if isinstance(unit, TCR)
    ] + trchains.child_list
    mhc_complexes = [
        unit for unit in model if isinstance(unit, MHC)
    ] + mhchains.child_list

    prepared = self._prepare_tcrs_mhcs_and_antigens_for_pairing(
        model,
        tcell_receptors,
        mhc_complexes,
        agchains,
        crystal_contacts,
    )
    (
        model,
        tcell_receptors,
        mhc_complexes,
        agchains,
        crystal_contacts,
        _antigen_atoms,
        cdr_atoms,
        mh_atoms,
        _antigen_hetatoms,
        _antigen_sugars,
    ) = prepared

    model, paired_tr_mh = self._pair_tcr_and_mhc(
        model=model,
        tcell_receptors=tcell_receptors,
        mhc_complexes=mhc_complexes,
        cdr_atoms=cdr_atoms,
        mh_atoms=mh_atoms,
        crystal_contacts=crystal_contacts,
    )

    # check if all TCRs have been paired if MHC is present.
    needs_second_pass = (
        self.include_symmetry_mates
        and len(paired_tr_mh) != len(tcell_receptors)
        and len(mhc_complexes) > 0
    )
    if not needs_second_pass:
        return

    # try searching for symmetry mates; a failure is only reported as a warning
    # and the second pass below still runs.
    try:
        symmetry_mates = self._generate_symmetry_mates()
        mate_mhcs = [mhc for mate in symmetry_mates for mhc in mate.get_MHCs()]
        mhc_complexes.extend(mate_mhcs)
    except Exception as e:
        warnings.warn(f"Symmetry mate generation failed with: {str(e)}")

    prepared = self._prepare_tcrs_mhcs_and_antigens_for_pairing(
        model,
        tcell_receptors,
        mhc_complexes,
        agchains,
        crystal_contacts,
    )
    (
        model,
        tcell_receptors,
        mhc_complexes,
        agchains,
        crystal_contacts,
        _antigen_atoms,
        cdr_atoms,
        mh_atoms,
        _antigen_hetatoms,
        _antigen_sugars,
    ) = prepared

    # Receptors paired in the first pass are handed over so they are not paired again.
    model, paired_tr_mh = self._pair_tcr_and_mhc(
        model,
        tcell_receptors,
        mhc_complexes,
        cdr_atoms,
        mh_atoms,
        crystal_contacts,
        already_paired_tr_mh=paired_tr_mh,
    )
1165
+
1166
def _generate_symmetry_mates(self):
    """
    Generate the symmetry mates of the file currently being parsed.

    Reads self.current_file and returns whatever get_symmetry_mates produces
    for it.
    """
    print("Generating symmetry mates to pair antigens.")

    # import here to avoid circular import
    from .utils.symmetry_mates import get_symmetry_mates

    structure_file = self.current_file
    return get_symmetry_mates(structure_file)
1011
1173
 
1012
1174
  def _protein_peptide_pass(
1013
1175
  self, model, complexes, receptor_atoms, antigen_atoms, crystal_contacts=[]
@@ -45,7 +45,12 @@ def call_anarci(
45
45
  Returns:
46
46
  numbering, chain type, germline information
47
47
  """
48
- from anarci import number as anarci_number
48
try:
    from anarci import number as anarci_number
except ImportError as e:
    # Previously the guidance below was built as a bare f-string expression and
    # thrown away, so users only ever saw the raw import failure. Attach it to
    # the raised exception, keeping the original error chained as the cause and
    # the exception type catchable as ImportError.
    raise ImportError(
        "ANARCI import failed, is ANARCI installed and built? \n"
        "Install ANARCI MHC with: \npip install anarci-mhc \n\n"
        "Once installed, build the HMMs with: \nANARCI --build_models. \n"
        f"Error raised was {e}",
        name=e.name,
        path=e.path,
    ) from e
49
54
 
50
55
  numbering, chain_type, germline_info = anarci_number(
51
56
  seq, allow=allow, assign_germline=True
@@ -10,6 +10,15 @@ IMGT_CDR_BOUNDARIES = {
10
10
  "3": {"imgt": (105, 117)},
11
11
  }
12
12
 
13
# IMGT positions 1 to 128 inclusive.
IMGT_VARIABLE_DOMAIN: set[int] = {*range(1, 129)}
"""Variable domain range for IMGT numbered immunoglobulin structures."""

# Positions 1 to 91 together with positions 1001 to 1091.
IMGT_MH1_ABD: set[int] = {*range(1, 92), *range(1001, 1092)}
"""IMGT ranges of the antigen binding domain of MHC class I molecules."""

# Positions 1 to 91.
IMGT_MH2_ABD: set[int] = {*range(1, 92)}
"""IMGT ranges of the antigen binding domain of MHC class II molecules."""
21
+
13
22
  # regions for TCR
14
23
  _regions = {"imgt": {}}
15
24
  _regions["imgt"]["A"] = _regions["imgt"]["B"] = (