stcrpy 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. examples/__init__.py +0 -0
  2. examples/egnn.py +425 -0
  3. stcrpy/__init__.py +5 -0
  4. stcrpy/tcr_datasets/__init__.py +0 -0
  5. stcrpy/tcr_datasets/tcr_graph_dataset.py +499 -0
  6. stcrpy/tcr_datasets/tcr_selector.py +0 -0
  7. stcrpy/tcr_datasets/tcr_structure_dataset.py +0 -0
  8. stcrpy/tcr_datasets/utils.py +350 -0
  9. stcrpy/tcr_formats/__init__.py +0 -0
  10. stcrpy/tcr_formats/tcr_formats.py +114 -0
  11. stcrpy/tcr_formats/tcr_haddock.py +556 -0
  12. stcrpy/tcr_geometry/TCRCoM.py +350 -0
  13. stcrpy/tcr_geometry/TCRCoM_LICENCE +168 -0
  14. stcrpy/tcr_geometry/TCRDock.py +261 -0
  15. stcrpy/tcr_geometry/TCRGeom.py +450 -0
  16. stcrpy/tcr_geometry/TCRGeomFiltering.py +273 -0
  17. stcrpy/tcr_geometry/__init__.py +0 -0
  18. stcrpy/tcr_geometry/reference_data/__init__.py +0 -0
  19. stcrpy/tcr_geometry/reference_data/dock_reference_1_imgt_numbered.pdb +6549 -0
  20. stcrpy/tcr_geometry/reference_data/dock_reference_2_imgt_numbered.pdb +6495 -0
  21. stcrpy/tcr_geometry/reference_data/reference_A.pdb +31 -0
  22. stcrpy/tcr_geometry/reference_data/reference_B.pdb +31 -0
  23. stcrpy/tcr_geometry/reference_data/reference_D.pdb +31 -0
  24. stcrpy/tcr_geometry/reference_data/reference_G.pdb +31 -0
  25. stcrpy/tcr_geometry/reference_data/reference_data.py +104 -0
  26. stcrpy/tcr_interactions/PLIPParser.py +147 -0
  27. stcrpy/tcr_interactions/TCRInteractionProfiler.py +433 -0
  28. stcrpy/tcr_interactions/TCRpMHC_PLIP_Model_Parser.py +133 -0
  29. stcrpy/tcr_interactions/__init__.py +0 -0
  30. stcrpy/tcr_interactions/utils.py +170 -0
  31. stcrpy/tcr_methods/__init__.py +0 -0
  32. stcrpy/tcr_methods/tcr_batch_operations.py +223 -0
  33. stcrpy/tcr_methods/tcr_methods.py +150 -0
  34. stcrpy/tcr_methods/tcr_reformatting.py +18 -0
  35. stcrpy/tcr_metrics/__init__.py +2 -0
  36. stcrpy/tcr_metrics/constants.py +39 -0
  37. stcrpy/tcr_metrics/tcr_interface_rmsd.py +237 -0
  38. stcrpy/tcr_metrics/tcr_rmsd.py +179 -0
  39. stcrpy/tcr_ml/__init__.py +0 -0
  40. stcrpy/tcr_ml/geometry_predictor.py +3 -0
  41. stcrpy/tcr_processing/AGchain.py +89 -0
  42. stcrpy/tcr_processing/Chemical_components.py +48915 -0
  43. stcrpy/tcr_processing/Entity.py +301 -0
  44. stcrpy/tcr_processing/Fragment.py +58 -0
  45. stcrpy/tcr_processing/Holder.py +24 -0
  46. stcrpy/tcr_processing/MHC.py +449 -0
  47. stcrpy/tcr_processing/MHCchain.py +149 -0
  48. stcrpy/tcr_processing/Model.py +37 -0
  49. stcrpy/tcr_processing/Select.py +145 -0
  50. stcrpy/tcr_processing/TCR.py +532 -0
  51. stcrpy/tcr_processing/TCRIO.py +47 -0
  52. stcrpy/tcr_processing/TCRParser.py +1230 -0
  53. stcrpy/tcr_processing/TCRStructure.py +148 -0
  54. stcrpy/tcr_processing/TCRchain.py +160 -0
  55. stcrpy/tcr_processing/__init__.py +3 -0
  56. stcrpy/tcr_processing/annotate.py +480 -0
  57. stcrpy/tcr_processing/utils/__init__.py +0 -0
  58. stcrpy/tcr_processing/utils/common.py +67 -0
  59. stcrpy/tcr_processing/utils/constants.py +367 -0
  60. stcrpy/tcr_processing/utils/region_definitions.py +782 -0
  61. stcrpy/utils/__init__.py +0 -0
  62. stcrpy/utils/error_stream.py +12 -0
  63. stcrpy-1.0.0.dist-info/METADATA +173 -0
  64. stcrpy-1.0.0.dist-info/RECORD +68 -0
  65. stcrpy-1.0.0.dist-info/WHEEL +5 -0
  66. stcrpy-1.0.0.dist-info/licenses/LICENCE +28 -0
  67. stcrpy-1.0.0.dist-info/licenses/stcrpy/tcr_geometry/TCRCoM_LICENCE +168 -0
  68. stcrpy-1.0.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,782 @@
1
+ """
2
+ A module to deal with region annotations for IMGT scheme.
3
+ """
4
+
5
+ from .constants import TCR_CHAINS, TCR_REGIONS
6
+
7
# Pair of IMGT positions delimiting each CDR, keyed by CDR number (as a string)
# and then by numbering scheme.
IMGT_CDR_BOUNDARIES = {
    cdr: {"imgt": bounds}
    for cdr, bounds in (("1", (27, 38)), ("2", (56, 65)), ("3", (105, 117)))
}

# Region state vectors for TCR chains, keyed by scheme and then chain type.
# Each character is a region code that is translated through _reg_one2three.
# All four TCR chain types (A, B, G, D) share one and the same state vector.
_regions = {
    "imgt": dict.fromkeys(
        ("A", "B", "G", "D"),
        "11111111111111111111111111222222222222333333333333333334444444444555555555555555555555555555555555555555666666666666677777777777",
    )
}
22
+
23
+
24
# For internal use only. These are not direct conversions and are handled heuristically.
# Only the IMGT numbering scheme is used at present; other schemes may be added later.
#
# Maps (scheme, chain type) -> {IMGT position: offset into the region state vector}.
# For the TCR variable domains the mapping is simply position n -> offset n - 1
# for positions 1..128. B and A get their own dict objects.
_index_to_imgt_state = {
    ("imgt", chain_type): {imgt_pos: imgt_pos - 1 for imgt_pos in range(1, 129)}
    for chain_type in ("B", "A")
}

# G reuses the B mapping and D reuses the A mapping (same dict objects, not copies).
_index_to_imgt_state[("imgt", "G")] = _index_to_imgt_state[("imgt", "B")]
_index_to_imgt_state[("imgt", "D")] = _index_to_imgt_state[("imgt", "A")]
292
# Translates a TCR state-vector region code ("1".."7") into a region-name template.
# The "%s" placeholder is filled with the lower-case chain letter by the caller,
# giving names such as "fwb1" or "cdra3".
_reg_one2three = {
    str(code): template
    for code, template in enumerate(
        ("fw%s1", "cdr%s1", "fw%s2", "cdr%s2", "fw%s3", "cdr%s3", "fw%s4"),
        start=1,
    )
}
301
+
302
# Region state vectors for MHC-like chains.
# Based on the state vector used for numbering MHCs in ANARCI:
# http://www.imgt.org/IMGTrepertoireMHC/Proteins/protein/G-DOMAIN/Gdomains.html
# (see also the IMGT domain align tool).
#
# The two comment lines above each vector are the authors' position guide
# (IMGT numbers, then a per-column marker line); they are reproduced as found
# and their column alignment is not guaranteed.

# 1 7 10 14 18 21 28 31 38 42 45 49 50 54 61 68 72 74 80 90
# 0987654321|.....|A..|...|...|..|......|..|......|...|..|...|1234567|...|ABC......|AB......|...|A.|.....|.........|..
# State vector for MHC G-domains; codes are translated through _reg_tostring.
mhc_svec = "1111111111111116662222222222266333333336664444444444444445555555555555555555555555555555555555555555566666"

# Manual observation for residues 88-92: this looks like a loop region; kept as "Turn" for now.
# 1 7 10 14 18 21 28 31 38 42 45 49 50 54 61 68 72 74 80 90
# 0987654321|.....|A..|...|...|..|......|..|......|...|..|...|1234567|...|A......|AB......|...|A.|.....|.........|..
# State vector for CD1/MR1 GA1-like and GA2-like domains.
cd1_svec = "11111111111111166622222222222663333333366644444444444444455555555555555555555555555555555555555555555555"

# 1 10 15 16 232627 383941 45 77 84 85 89 96 97 104105 117118
# 87654321|........|....|123|......|..||........||.|...|1234567|......|12345677654321|...|......|12|......||...........||.........|
# State vector for MHC C-domains (B2M); codes correspond to _reg_mhc_cdom.
mhc_cvec = "1111111111111119992222222222299999999993333333999999944444444999999999999995555555555559966666666999999999999977777777777"

# G-domain region names by state-vector code.
# http://www.imgt.org/IMGTScientificChart/Numbering/IMGTGsuperfamily.html
_reg_tostring = dict(
    zip(
        "123456",
        ("Astrand", "Bstrand", "Cstrand", "Dstrand", "Helix", "Turn"),
    )
)

# C-like domain region names by state-vector code: strands A-G are "1".."7",
# and "9" marks a turn (there is no code "8").
_reg_mhc_cdom = {
    str(code): "%sstrand" % strand
    for code, strand in enumerate("ABCDEFG", start=1)
}
_reg_mhc_cdom["9"] = "Turn"
342
+
343
# Register the state vector used for every MHC-like chain/domain type.
# G-domain types (MH1, GA, GB, GA1, GA2) use mhc_svec; the CD1/MR1 family
# (CD1, MR1, GA1L, GA2L) uses cd1_svec; B2M (C-like domain) uses mhc_cvec.
_regions["imgt"].update(
    {
        "MH1": mhc_svec,
        "CD1": cd1_svec,
        "MR1": cd1_svec,
        "GA": mhc_svec,
        "GB": mhc_svec,
        "GA1": mhc_svec,
        "GA2": mhc_svec,
        "GA1L": cd1_svec,
        "GA2L": cd1_svec,
        # C-LIKE and B2-Microglobulin regions
        "B2M": mhc_cvec,
    }
)
355
+
356
# IMGT position -> state-vector offset for MH1 G-domains (positions 1..92).
# The offset is the position plus a shift that is constant within each segment;
# each (first, last, shift) triple below covers positions first..last inclusive.
_index_to_imgt_state[("imgt", "MH1")] = {
    imgt_pos: imgt_pos + shift
    for first, last, shift in (
        (1, 7, -1),
        (8, 49, 0),
        (50, 54, 7),
        (55, 61, 10),
        (62, 72, 12),
        (73, 92, 13),
    )
    for imgt_pos in range(first, last + 1)
}

# Same layout for CD1-like domains; only the shifts from position 55 onwards differ.
_index_to_imgt_state[("imgt", "CD1")] = {
    imgt_pos: imgt_pos + shift
    for first, last, shift in (
        (1, 7, -1),
        (8, 49, 0),
        (50, 54, 7),
        (55, 61, 8),
        (62, 72, 10),
        (73, 92, 11),
    )
    for imgt_pos in range(first, last + 1)
}

# GA1 shares the MH1 dict object (an alias, not a copy).
_index_to_imgt_state[("imgt", "GA1")] = _index_to_imgt_state[("imgt", "MH1")]
# GA2 is a new dict holding the GA1 entries with every position shifted by +1000.
_index_to_imgt_state[("imgt", "GA2")] = {
    imgt_pos + 1000: offset
    for imgt_pos, offset in _index_to_imgt_state[("imgt", "GA1")].items()
}
549
+
550
+
551
# To map indices onto index_to_imgt_state: this works for everything apart from MHC constant domains.
def map_state_index_imgt(statevector):
    """
    Number the non-digit characters of a state vector consecutively from 1.

    Digit characters are skipped and do not consume a number.

    @param statevector: string to scan, one character per column.
    @return: dict mapping each consecutive number (starting at 1) to the
             0-based offset of the corresponding non-digit character. Empty
             when the string is empty or contains only digits.
    """
    numbered_offsets = (
        offset for offset, symbol in enumerate(statevector) if not symbol.isdigit()
    )
    return dict(enumerate(numbered_offsets, start=1))
560
+
561
+
562
# To map indices onto index_to_imgt_state
def map_state_index_mhc_clike(statevector):
    """
    Number the non-digit characters of a C-like domain state vector.

    Works like consecutive numbering from 1, except that the number following
    45 is 77 (the counter advances by 32 instead of 1 after 45 is assigned).
    Digit characters are skipped and do not consume a number.

    @param statevector: string to scan, one character per column.
    @return: dict mapping each assigned number to the 0-based offset of the
             corresponding non-digit character. Empty when the string is empty
             or contains only digits.
    """
    mapping = {}
    number = 1
    for offset, symbol in enumerate(statevector):
        if symbol.isdigit():
            continue
        mapping[number] = offset
        # Numbering jumps straight from 45 to 77.
        number += 32 if number == 45 else 1
    return mapping
574
+
575
+
576
# NOTE(review): mhc_cvec consists solely of digit characters, and
# map_state_index_mhc_clike only records non-digit characters, so this mapping
# comes out empty - confirm that is the intended way of leaving B2M unnumbered.
_index_to_imgt_state[("imgt", "B2M")] = map_state_index_mhc_clike(mhc_cvec)

# For N numbering, IMGT uses numbers 1000-1100.
# For now, we'll avoid numbering B2M domains and the C-LIKE domains.
# MH1 is extended in place with a +1000 copy of each of its entries; the
# comprehension is fully built before update() runs, so the dict is not
# modified while it is being iterated.
_index_to_imgt_state[("imgt", "MH1")].update(
    {
        imgt_pos + 1000: offset
        for imgt_pos, offset in _index_to_imgt_state[("imgt", "MH1")].items()
    }
)

# GA and GB are aliases of the GA1 dict object; GA1L is an alias of the CD1 dict.
_index_to_imgt_state[("imgt", "GA")] = _index_to_imgt_state[("imgt", "GA1")]
_index_to_imgt_state[("imgt", "GB")] = _index_to_imgt_state[("imgt", "GA1")]
_index_to_imgt_state[("imgt", "GA1L")] = _index_to_imgt_state[("imgt", "CD1")]

# GA2L is built from CD1 *before* CD1 is extended below, so it holds only the
# +1000-shifted positions.
_index_to_imgt_state[("imgt", "GA2L")] = {
    imgt_pos + 1000: offset
    for imgt_pos, offset in _index_to_imgt_state[("imgt", "CD1")].items()
}

# CD1 is then extended in place with its own +1000-shifted entries, and MR1
# aliases the extended CD1 dict.
_index_to_imgt_state[("imgt", "CD1")].update(
    {
        imgt_pos + 1000: offset
        for imgt_pos, offset in _index_to_imgt_state[("imgt", "CD1")].items()
    }
)
_index_to_imgt_state[("imgt", "MR1")] = _index_to_imgt_state[("imgt", "CD1")]
597
+
598
+
599
def get_region(position, chain):
    """
    Return the name of the region that an IMGT position falls in for a chain type.

    **Note** this function ignores the insertion code: only the numeric part of
    the position is looked up. It will therefore get the region annotation wrong
    when using non-equivalent scheme-definitions. Use annotate_regions for
    annotating a whole numbered sequence.

    @param position: (index, insertion) pair; only index is used.
    @param chain: chain type key (case-insensitive), e.g. "A", "B", "MH1".
    @return: for chain types listed in TCR_CHAINS, a name built from
             _reg_one2three such as "fwb1" or "cdra2"; for any other chain
             type, a name from _reg_tostring (G-domain annotation). "?" when
             the index has no entry for this chain type.
    @raise KeyError: if the chain type has no entry in the lookup tables.
    """
    index, _insertion = position
    chain = chain.upper()

    # IMGT position -> offset along the state vector (e.g. 1 -> 0 for TCR chains).
    offsets = _index_to_imgt_state[("imgt", chain)]
    # One region code per offset for this chain type.
    codes = _regions["imgt"][chain]

    if index not in offsets:
        return "?"

    code = codes[offsets[index]]
    if chain in TCR_CHAINS:
        # e.g. "fw%s1" -> "fwb1"
        return _reg_one2three[code] % chain.lower()
    # Everything else is annotated as an MHC G-domain (strand / helix / turn).
    return _reg_tostring[code]
633
+
634
+
635
def annotate_regions(numbered_sequence, chain):
    """
    Given a numbered sequence, annotate which region each residue belongs to.

    Currently, only the IMGT numbering and definition are implemented.

    Residues are processed in the order given. A residue accepted as CDR n is
    labelled "cdr<c>n", and from then on non-CDR residues are labelled with the
    following framework, "fw<c>(n+1)". Before any CDR has been seen the
    framework label is "fw<c>1". Here <c> is the lower-case chain letter.

    @param numbered_sequence: iterable of (position, residue) pairs, where
                              position is an (index, insertion) pair. It is
                              iterated exactly once, so a one-shot iterator is
                              acceptable.
    @param chain: chain type (case-insensitive).
    @return: list of (position, residue, region) tuples in input order. The
             region is "" for non-CDR residues whose index is greater than the
             largest position known for this chain type.
    @raise AssertionError: if "cdr<c>1".."cdr<c>3" are not defined regions for
                           this chain type (raised by Accept.set_regions).
    @raise KeyError: if the chain type is unknown to the lookup tables.
    """
    chain = chain.upper()
    c = chain.lower()

    # One acceptor per CDR, in CDR order; each recognises exactly one region.
    cdr_acceptors = []
    for cdr_number in (1, 2, 3):
        acceptor = Accept()
        acceptor.set_regions(["cdr%s%d" % (c, cdr_number)])
        cdr_acceptors.append(acceptor)

    # Largest position known for this chain type; anything beyond it is outside
    # the annotated domain.
    cterm = max(_index_to_imgt_state[("imgt", chain)])

    # We start off by annotating framework regions; this switches when we find a CDR.
    fw_region = "fw%s1" % c
    region_annotations = []

    for r, a in numbered_sequence:
        for cdr_number, acceptor in enumerate(cdr_acceptors, start=1):
            if acceptor.accept(r, chain):
                region_annotations.append((r, a, "cdr%s%d" % (c, cdr_number)))
                fw_region = "fw%s%d" % (c, cdr_number + 1)
                break
        else:
            # Not part of any CDR: current framework, or "" when past the end
            # of the annotated domain.
            region_annotations.append((r, a, fw_region if r[0] <= cterm else ""))

    return region_annotations
678
+
679
+
680
class Accept(object):
    """
    A class to select which positions should be compared.

    A selection is made of region names (see set_regions / add_regions) plus
    explicitly added positions, minus explicitly excluded positions. When
    constructed with NOT=True the region selection starts from every TCR region
    (the "tr" macro) and add_regions *removes* regions from it instead.
    """

    # Individual region names that may be selected directly.
    _defined_regions = TCR_REGIONS

    # Named groups of regions, per chain type.
    _macro_regions = {
        "bframework": {"fwb1", "fwb2", "fwb3", "fwb4"},
        "bcdrs": {"cdrb1", "cdrb2", "cdrb3"},
        "aframework": {"fwa1", "fwa2", "fwa3", "fwa4"},
        "acdrs": {"cdra1", "cdra2", "cdra3"},
        "gframework": {"fwg1", "fwg2", "fwg3", "fwg4"},
        "gcdrs": {"cdrg1", "cdrg2", "cdrg3"},
        "dframework": {"fwd1", "fwd2", "fwd3", "fwd4"},
        "dcdrs": {"cdrd1", "cdrd2", "cdrd3"},
    }

    # Groups spanning chain types, and whole variable domains per chain type.
    _macro_regions.update(
        {
            "framework": _macro_regions["bframework"]
            | _macro_regions["aframework"]
            | _macro_regions["dframework"]
            | _macro_regions["gframework"],
            "cdrs": _macro_regions["bcdrs"]
            | _macro_regions["acdrs"]
            | _macro_regions["dcdrs"]
            | _macro_regions["gcdrs"],
            "vb": _macro_regions["bcdrs"] | _macro_regions["bframework"],
            "va": _macro_regions["acdrs"] | _macro_regions["aframework"],
            "vg": _macro_regions["gcdrs"] | _macro_regions["gframework"],
            "vd": _macro_regions["dcdrs"] | _macro_regions["dframework"],
        }
    )

    # Paired variable domains, and "tr" = every region of every chain type.
    _macro_regions.update(
        {
            "ba": _macro_regions["vb"] | _macro_regions["va"],
            "dg": _macro_regions["vd"] | _macro_regions["vg"],
            "tr": _macro_regions["vb"]
            | _macro_regions["va"]
            | _macro_regions["vd"]
            | _macro_regions["vg"],
        }
    )

    # Reserved for named groups of positions; currently empty.
    _macro_positions = {}

    def __init__(self, NOT=False):
        """
        @param NOT: when true, invert the region selection: start from all TCR
                    regions and let add_regions remove regions.
        """
        self.NOT = NOT
        self.set_regions()
        # Explicitly added / excluded (index, insertion) positions per chain type.
        self.positions = {"B": set(), "A": set(), "D": set(), "G": set()}
        self.exclude = {"B": set(), "A": set(), "D": set(), "G": set()}

    def set_regions(self, regions=()):
        """
        Set the regions to be used. Will clear anything added using add_regions.

        @param regions: iterable of region or macro-region names.
        @raise AssertionError: if a name is not a known region or macro region.
        """
        if self.NOT:
            # Copy, so the instance never shares the class-level "tr" set.
            self.regions = set(self._macro_regions["tr"])
        else:
            self.regions = set()
        self.add_regions(regions)

    def add_regions(self, regions):
        """
        Add regions to the selection (or remove them, when NOT is set).

        Names are matched case-insensitively against the defined regions first
        and the macro regions second.

        @param regions: iterable of region or macro-region names.
        @raise AssertionError: if a name is not a known region or macro region.
                               Names processed before the offending one have
                               already been applied.
        """
        for region in regions:
            region = region.lower()
            if region in self._defined_regions:
                selected = {region}
            elif region in self._macro_regions:
                selected = self._macro_regions[region]
            else:
                raise AssertionError("Undefined region")

            if self.NOT:
                self.regions = self.regions - selected
            else:
                self.regions = self.regions | selected

    def add_positions(self, positions, chain):
        """
        Explicitly accept the given positions on a chain.

        @param positions: iterable of (index, insertion) pairs.
        @param chain: one of "B", "A", "D", "G".
        """
        for index, insertion in positions:
            self.positions[chain].add((index, insertion))

    def exclude_positions(self, positions, chain):
        """
        Explicitly reject the given positions on a chain. Exclusion takes
        precedence over both region and position selection in accept().

        @param positions: iterable of (index, insertion) pairs.
        @param chain: one of "B", "A", "D", "G".
        """
        for index, insertion in positions:
            self.exclude[chain].add((index, insertion))

    def accept(self, position, chain):
        """
        Decide whether a position on a chain is part of the selection.

        @param position: (index, insertion) pair.
        @param chain: one of "B", "A", "D", "G".
        @return: 1 if the position is not excluded and either lies in a selected
                 region or was explicitly added; 0 otherwise.
        """
        if position in self.exclude[chain]:
            return 0
        if (
            get_region(position, chain) in self.regions
            or position in self.positions[chain]
        ):
            return 1
        return 0