stcrpy 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +0 -0
- examples/egnn.py +425 -0
- stcrpy/__init__.py +5 -0
- stcrpy/tcr_datasets/__init__.py +0 -0
- stcrpy/tcr_datasets/tcr_graph_dataset.py +499 -0
- stcrpy/tcr_datasets/tcr_selector.py +0 -0
- stcrpy/tcr_datasets/tcr_structure_dataset.py +0 -0
- stcrpy/tcr_datasets/utils.py +350 -0
- stcrpy/tcr_formats/__init__.py +0 -0
- stcrpy/tcr_formats/tcr_formats.py +114 -0
- stcrpy/tcr_formats/tcr_haddock.py +556 -0
- stcrpy/tcr_geometry/TCRCoM.py +350 -0
- stcrpy/tcr_geometry/TCRCoM_LICENCE +168 -0
- stcrpy/tcr_geometry/TCRDock.py +261 -0
- stcrpy/tcr_geometry/TCRGeom.py +450 -0
- stcrpy/tcr_geometry/TCRGeomFiltering.py +273 -0
- stcrpy/tcr_geometry/__init__.py +0 -0
- stcrpy/tcr_geometry/reference_data/__init__.py +0 -0
- stcrpy/tcr_geometry/reference_data/dock_reference_1_imgt_numbered.pdb +6549 -0
- stcrpy/tcr_geometry/reference_data/dock_reference_2_imgt_numbered.pdb +6495 -0
- stcrpy/tcr_geometry/reference_data/reference_A.pdb +31 -0
- stcrpy/tcr_geometry/reference_data/reference_B.pdb +31 -0
- stcrpy/tcr_geometry/reference_data/reference_D.pdb +31 -0
- stcrpy/tcr_geometry/reference_data/reference_G.pdb +31 -0
- stcrpy/tcr_geometry/reference_data/reference_data.py +104 -0
- stcrpy/tcr_interactions/PLIPParser.py +147 -0
- stcrpy/tcr_interactions/TCRInteractionProfiler.py +433 -0
- stcrpy/tcr_interactions/TCRpMHC_PLIP_Model_Parser.py +133 -0
- stcrpy/tcr_interactions/__init__.py +0 -0
- stcrpy/tcr_interactions/utils.py +170 -0
- stcrpy/tcr_methods/__init__.py +0 -0
- stcrpy/tcr_methods/tcr_batch_operations.py +223 -0
- stcrpy/tcr_methods/tcr_methods.py +150 -0
- stcrpy/tcr_methods/tcr_reformatting.py +18 -0
- stcrpy/tcr_metrics/__init__.py +2 -0
- stcrpy/tcr_metrics/constants.py +39 -0
- stcrpy/tcr_metrics/tcr_interface_rmsd.py +237 -0
- stcrpy/tcr_metrics/tcr_rmsd.py +179 -0
- stcrpy/tcr_ml/__init__.py +0 -0
- stcrpy/tcr_ml/geometry_predictor.py +3 -0
- stcrpy/tcr_processing/AGchain.py +89 -0
- stcrpy/tcr_processing/Chemical_components.py +48915 -0
- stcrpy/tcr_processing/Entity.py +301 -0
- stcrpy/tcr_processing/Fragment.py +58 -0
- stcrpy/tcr_processing/Holder.py +24 -0
- stcrpy/tcr_processing/MHC.py +449 -0
- stcrpy/tcr_processing/MHCchain.py +149 -0
- stcrpy/tcr_processing/Model.py +37 -0
- stcrpy/tcr_processing/Select.py +145 -0
- stcrpy/tcr_processing/TCR.py +532 -0
- stcrpy/tcr_processing/TCRIO.py +47 -0
- stcrpy/tcr_processing/TCRParser.py +1230 -0
- stcrpy/tcr_processing/TCRStructure.py +148 -0
- stcrpy/tcr_processing/TCRchain.py +160 -0
- stcrpy/tcr_processing/__init__.py +3 -0
- stcrpy/tcr_processing/annotate.py +480 -0
- stcrpy/tcr_processing/utils/__init__.py +0 -0
- stcrpy/tcr_processing/utils/common.py +67 -0
- stcrpy/tcr_processing/utils/constants.py +367 -0
- stcrpy/tcr_processing/utils/region_definitions.py +782 -0
- stcrpy/utils/__init__.py +0 -0
- stcrpy/utils/error_stream.py +12 -0
- stcrpy-1.0.0.dist-info/METADATA +173 -0
- stcrpy-1.0.0.dist-info/RECORD +68 -0
- stcrpy-1.0.0.dist-info/WHEEL +5 -0
- stcrpy-1.0.0.dist-info/licenses/LICENCE +28 -0
- stcrpy-1.0.0.dist-info/licenses/stcrpy/tcr_geometry/TCRCoM_LICENCE +168 -0
- stcrpy-1.0.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,782 @@
|
|
|
1
|
+
"""
|
|
2
|
+
A module to deal with region annotations for IMGT scheme.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .constants import TCR_CHAINS, TCR_REGIONS
|
|
6
|
+
|
|
7
|
+
# (start, end) IMGT position numbers of each CDR loop, keyed by CDR number and
# then by numbering scheme.
IMGT_CDR_BOUNDARIES = {
    "1": {"imgt": (27, 38)},
    "2": {"imgt": (56, 65)},
    "3": {"imgt": (105, 117)},
}

# regions for TCR
# A state vector holds one character per IMGT position; get_region translates
# the characters "1".."7" into region names through _reg_one2three.
_regions = {"imgt": {}}
_regions["imgt"]["A"] = _regions["imgt"]["B"] = (
    "11111111111111111111111111222222222222333333333333333334444444444555555555555555555555555555555555555555666666666666677777777777"
)

# Set the IMGT definitions for TCR chain types.
# G reuses the B vector and D reuses the A vector (A and B are the same string).
_regions["imgt"]["G"] = _regions["imgt"]["B"]
_regions["imgt"]["D"] = _regions["imgt"]["A"]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# For internal use only. These are not direct conversions and are handled heuristically.
# Currently only using the IMGT numbering scheme although other numbering schemes may be introduced.
#
# Each table maps an IMGT position number onto its 0-based offset in the
# chain's state vector. For the TCR chains that is simply 1 -> 0 ... 128 -> 127.
# A and B are deliberately built as two separate (equal) dictionaries.
_index_to_imgt_state = {
    ("imgt", "B"): {position: position - 1 for position in range(1, 129)},
    ("imgt", "A"): {position: position - 1 for position in range(1, 129)},
}

# G shares the B table object and D shares the A table object; the A and B
# tables have identical contents, so all four chains resolve the same way.
_index_to_imgt_state[("imgt", "G")] = _index_to_imgt_state[("imgt", "B")]
_index_to_imgt_state[("imgt", "D")] = _index_to_imgt_state[("imgt", "A")]
|
|
292
|
+
# Templates that turn a TCR state character into a region name. The "%s"
# placeholder is filled with the lower-case chain letter by get_region,
# giving names such as "fwb1" or "cdra2".
_reg_one2three = {
    "1": "fw%s1",
    "2": "cdr%s1",
    "3": "fw%s2",
    "4": "cdr%s2",
    "5": "fw%s3",
    "6": "cdr%s3",
    "7": "fw%s4",
}
|
|
301
|
+
|
|
302
|
+
# regions for MHC
# This is based on the state vector used for numbering MHCs in ANARCI
# http://www.imgt.org/IMGTrepertoireMHC/Proteins/protein/G-DOMAIN/Gdomains.html
# Refer to IMGT domain align tool as well
# NOTE(review): the column alignment of the tick labels below was reconstructed
# so that each label starts above its '|' marker - confirm against upstream.
#           1     7   10  14  18 21     28 31     38  42 45  49      50  54        61       68  72 74    80        90
# 0987654321|.....|A..|...|...|..|......|..|......|...|..|...|1234567|...|ABC......|AB......|...|A.|.....|.........|..
mhc_svec = "1111111111111116662222222222266333333336664444444444444445555555555555555555555555555555555555555555566666"
# ^ This is the state vector for mhc g-domains for regions

# Manual observation for residues 88-92: this seems like a loopy region? keep it as "turn" for now.
# state vector for CD1/MR1 GA1-like and GA2-like domains
# NOTE(review): tick-label alignment reconstructed as above - confirm against upstream.
#           1     7   10  14  18 21     28 31     38  42 45  49      50  54      61       68  72 74    80        90
# 0987654321|.....|A..|...|...|..|......|..|......|...|..|...|1234567|...|A......|AB......|...|A.|.....|.........|..
cd1_svec = "11111111111111166622222222222663333333366644444444444444455555555555555555555555555555555555555555555555"
# ^ This is the state vector for CD1 ga-like-domains for regions


# This is the state vector for mhc c-domains (B2M)
# NOTE(review): the original column alignment of the tick labels on the next
# line was lost; the labels are kept in their original order.
# 1 10 15 16 232627 383941 45 77 84 85 89 96 97 104105 117118
# 87654321|........|....|123|......|..||........||.|...|1234567|......|12345677654321|...|......|12|......||...........||.........|
mhc_cvec = "1111111111111119992222222222299999999993333333999999944444444999999999999995555555555559966666666999999999999977777777777"

# State character -> region name for G-domain style vectors (mhc_svec, cd1_svec).
_reg_tostring = {
    "1": "Astrand",
    "2": "Bstrand",
    "3": "Cstrand",
    "4": "Dstrand",
    "5": "Helix",
    "6": "Turn",
}  # http://www.imgt.org/IMGTScientificChart/Numbering/IMGTGsuperfamily.html

# State character -> region name for the C-like vector (mhc_cvec); note that
# "8" is not used and "9" marks a turn.
_reg_mhc_cdom = {
    "1": "Astrand",
    "2": "Bstrand",
    "3": "Cstrand",
    "4": "Dstrand",
    "5": "Estrand",
    "6": "Fstrand",
    "7": "Gstrand",
    "9": "Turn",
}  # C-Like domain
|
|
342
|
+
|
|
343
|
+
# Register the MHC state vectors under every chain/domain label that uses them.
# Classical G-domains (MH1, GA, GB, GA1, GA2) use mhc_svec, the CD1/MR1 family
# (CD1, MR1, GA1L, GA2L) uses cd1_svec, and B2M uses the C-like vector.
# The literal keeps the original registration order.
_regions["imgt"].update(
    {
        "MH1": mhc_svec,
        "CD1": cd1_svec,
        "MR1": cd1_svec,
        "GA": mhc_svec,
        "GB": mhc_svec,
        "GA1": mhc_svec,
        "GA2": mhc_svec,
        "GA1L": cd1_svec,
        "GA2L": cd1_svec,
        # C-LIKE And B2-Microglobulin regions
        "B2M": mhc_cvec,
    }
)
|
|
355
|
+
|
|
356
|
+
# IMGT position -> state-vector offset for G-domains (offsets index mhc_svec).
# The table is piecewise linear: every (first, last, offset) segment below maps
# each position p in first..last (inclusive) to p + offset. The offset grows
# wherever the state vector reserves extra slots between two numbered positions.
_index_to_imgt_state[("imgt", "MH1")] = {
    position: position + offset
    for first, last, offset in (
        (1, 7, -1),
        (8, 49, 0),
        (50, 54, 7),
        (55, 61, 10),
        (62, 72, 12),
        (73, 92, 13),
    )
    for position in range(first, last + 1)
}

# Same layout for the CD1/MR1 family (offsets index cd1_svec); it differs from
# the MH1 table from position 55 onwards.
_index_to_imgt_state[("imgt", "CD1")] = {
    position: position + offset
    for first, last, offset in (
        (1, 7, -1),
        (8, 49, 0),
        (50, 54, 7),
        (55, 61, 8),
        (62, 72, 10),
        (73, 92, 11),
    )
    for position in range(first, last + 1)
}

# GA1 is the very same dictionary object as MH1.
_index_to_imgt_state[("imgt", "GA1")] = _index_to_imgt_state[("imgt", "MH1")]
# GA2 is a separate dictionary: the GA1 entries with every position shifted by 1000.
_index_to_imgt_state[("imgt", "GA2")] = {
    k + 1000: v for k, v in _index_to_imgt_state[("imgt", "GA1")].items()
}
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
# To map indices onto index_to_imgt_state: this works for everything apart from MHC constant domains.
|
|
552
|
+
def map_state_index_imgt(statevector):
    """
    Number the non-digit characters of a state vector consecutively from 1.

    Characters for which ``str.isdigit()`` is true are skipped entirely: they
    receive no number and do not advance the counter.

    Args:
        statevector: a string (or any iterable of one-character strings).

    Returns:
        dict mapping each assigned number (1, 2, 3, ...) to the 0-based index
        of the corresponding character in ``statevector``. Empty when the
        input is empty or consists only of digits.
    """
    kept_offsets = [i for i, char in enumerate(statevector) if not char.isdigit()]
    return dict(enumerate(kept_offsets, start=1))
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
# To map indices onto index_to_imgt_state
|
|
563
|
+
def map_state_index_mhc_clike(statevector):
    """
    Number the non-digit characters of a C-like state vector.

    Works like ``map_state_index_imgt`` except that the number following 45 is
    77: after a character has been numbered 45 the counter jumps by 32.
    Characters for which ``str.isdigit()`` is true are skipped: they receive no
    number and do not advance the counter.

    Args:
        statevector: a string (or any iterable of one-character strings).

    Returns:
        dict mapping each assigned number to the 0-based index of the
        corresponding character. Empty when the input is empty or consists
        only of digits.
    """
    numdict = {}
    next_number = 1
    for offset, char in enumerate(statevector):
        if char.isdigit():
            continue
        numdict[next_number] = offset
        # Numbers 46-76 are never handed out: 45 is followed directly by 77.
        next_number += 32 if next_number == 45 else 1
    return numdict
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
# NOTE(review): mhc_cvec consists only of digit characters and
# map_state_index_mhc_clike numbers non-digit characters only, so this table
# comes out empty and get_region answers "?" for every B2M position. That
# matches the remark below about not numbering B2M - confirm it is intended.
_index_to_imgt_state[("imgt", "B2M")] = map_state_index_mhc_clike(mhc_cvec)

# For N numbering, IMGT uses numbers 1000-1100.
# For now, we'll avoid numbering B2M domains and the C-LIKE domains.
# Extend MH1 in place with a copy of its entries shifted by 1000. The shifted
# entries are fully built before update() runs, so the dictionary is never
# modified while it is being iterated. GA1 is the same object and grows too.
_index_to_imgt_state[("imgt", "MH1")].update(
    {k + 1000: v for k, v in _index_to_imgt_state[("imgt", "MH1")].items()}
)
_index_to_imgt_state[("imgt", "GA")] = _index_to_imgt_state[("imgt", "GA1")]
_index_to_imgt_state[("imgt", "GB")] = _index_to_imgt_state[("imgt", "GA1")]
_index_to_imgt_state[("imgt", "GA1L")] = _index_to_imgt_state[("imgt", "CD1")]
# GA2L must be derived before CD1 is extended below, so that it only holds the
# shifted (+1000) positions.
_index_to_imgt_state[("imgt", "GA2L")] = {
    k + 1000: v for k, v in _index_to_imgt_state[("imgt", "CD1")].items()
}
_index_to_imgt_state[("imgt", "CD1")].update(
    {k + 1000: v for k, v in _index_to_imgt_state[("imgt", "CD1")].items()}
)
_index_to_imgt_state[("imgt", "MR1")] = _index_to_imgt_state[("imgt", "CD1")]
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
def get_region(position, chain):
    """
    Get the region in which the position belongs given the chain, numbering scheme and definition.

    **Note** this function does not know about insertions on the sequence. Therefore, it will get the region annotation
    wrong when using non-equivalent scheme-definitions.

    To get around this please use the annotate_regions function which implements heuristics to get the definition correct
    in the scheme.

    Args:
        position: ``(index, insertion)`` pair; only ``index`` is consulted.
        chain: chain or domain label; it is upper-cased before use.

    Returns:
        The region name. For chains in ``TCR_CHAINS`` this is a
        ``_reg_one2three`` template filled with the lower-case chain letter
        (e.g. "fwb1", "cdra2"); for "B2M" it is a ``_reg_mhc_cdom`` name; for
        any other label it is a ``_reg_tostring`` name. "?" is returned when
        ``index`` has no entry in the chain's position table.

    Raises:
        KeyError: if the (upper-cased) chain has no position table or no state vector.
    """
    index, _insertion = position
    chain = chain.upper()

    # imgt_state is a dictionary that maps an IMGT position (e.g. 1) onto the position along the state vector;
    # Thus, 1 maps to 0 because 1 is the first number in the IMGT numbering scheme but is the 0th position along the state vector.
    imgt_state = _index_to_imgt_state[("imgt", chain)]
    # Get the state vector corresponding to a particular chain; one state character per slot.
    state_vec = _regions["imgt"][chain]

    if index not in imgt_state:
        return "?"
    state = state_vec[imgt_state[index]]

    if chain in TCR_CHAINS:
        # Returns a fwb1, cdra2... etc.
        return _reg_one2three[state] % chain.lower()
    if chain == "B2M":
        # The B2M vector uses the C-like alphabet ("7" and "9" occur, "5" means
        # E-strand), which only _reg_mhc_cdom can translate.
        return _reg_mhc_cdom[state]
    # Returns whether helix or turn on the MHC (G-Domain annotation)
    return _reg_tostring[state]
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
def annotate_regions(numbered_sequence, chain):
    """
    Given a numbered sequence (list) annotate which region each residue belongs to.
    Currently, only the IMGT numbering and definition are implemented.
    If possible, use the corresponding numbering scheme and definition.

    This function automates the heuristics that recognise the different definitions in each scheme.
    However, some of the conversions are non-trivial.

    Args:
        numbered_sequence: iterable of ``(position, residue)`` pairs where
            ``position`` is an ``(index, insertion)`` pair. It is traversed
            exactly once, so a one-shot iterator is acceptable.
        chain: chain label; it is upper-cased before use.

    Returns:
        list of ``(position, residue, region)`` tuples in input order. A
        position accepted as CDR n is labelled "cdr<c>n"; any other position
        whose index does not exceed the largest tabulated position carries the
        current framework label, which starts as "fw<c>1" and becomes
        "fw<c>(n+1)" once a CDR n position has been seen; positions beyond
        that get "". ``<c>`` is the lower-case chain label.
    """
    chain = chain.upper()
    c = chain.lower()

    # One acceptor per CDR, each restricted to that single region.
    cdr_acceptors = {}
    for cdr_number in (1, 2, 3):
        acceptor = Accept()
        acceptor.set_regions(["cdr%s%d" % (c, cdr_number)])
        cdr_acceptors[cdr_number] = acceptor

    # We start off by annotating framework regions; this switches when we find a CDR
    fw_region = "fw%s1" % c
    region_annotations = []

    # Largest position number known for this chain.
    cterm = max(_index_to_imgt_state[("imgt", chain)].keys())
    for r, a in numbered_sequence:
        for cdr_number in (1, 2, 3):
            if cdr_acceptors[cdr_number].accept(r, chain):
                region_annotations.append((r, a, "cdr%s%d" % (c, cdr_number)))
                # Framework residues that follow belong to the next framework.
                fw_region = "fw%s%d" % (c, cdr_number + 1)
                break
        else:
            if r[0] <= cterm:
                region_annotations.append((r, a, fw_region))
            else:
                # Anything out of the variable region is not assigned a region i.e. ''
                region_annotations.append((r, a, ""))

    return region_annotations
|
|
678
|
+
|
|
679
|
+
|
|
680
|
+
class Accept(object):
    """
    A class to select which positions should be compared.

    A selection is made of region names (``self.regions``) plus explicitly
    added positions (``self.positions``) minus explicitly excluded positions
    (``self.exclude``). When constructed with ``NOT=True`` the selection starts
    from every TCR region ("tr") and ``add_regions`` removes regions instead of
    adding them.
    """

    # Names accepted as individual regions.
    _defined_regions = TCR_REGIONS
    # Named groups of regions, built up in three steps because later groups
    # are unions of earlier ones.
    _macro_regions = {
        "bframework": {"fwb1", "fwb2", "fwb3", "fwb4"},
        "bcdrs": {"cdrb1", "cdrb2", "cdrb3"},
        "aframework": {"fwa1", "fwa2", "fwa3", "fwa4"},
        "acdrs": {"cdra1", "cdra2", "cdra3"},
        "gframework": {"fwg1", "fwg2", "fwg3", "fwg4"},
        "gcdrs": {"cdrg1", "cdrg2", "cdrg3"},
        "dframework": {"fwd1", "fwd2", "fwd3", "fwd4"},
        "dcdrs": {"cdrd1", "cdrd2", "cdrd3"},
    }

    _macro_regions.update(
        {
            "framework": _macro_regions["bframework"]
            | _macro_regions["aframework"]
            | _macro_regions["dframework"]
            | _macro_regions["gframework"],
            "cdrs": _macro_regions["bcdrs"]
            | _macro_regions["acdrs"]
            | _macro_regions["dcdrs"]
            | _macro_regions["gcdrs"],
            "vb": _macro_regions["bcdrs"] | _macro_regions["bframework"],
            "va": _macro_regions["acdrs"] | _macro_regions["aframework"],
            "vg": _macro_regions["gcdrs"] | _macro_regions["gframework"],
            "vd": _macro_regions["dcdrs"] | _macro_regions["dframework"],
        }
    )

    _macro_regions.update(
        {
            "ba": _macro_regions["vb"] | _macro_regions["va"],
            "dg": _macro_regions["vd"] | _macro_regions["vg"],
            "tr": _macro_regions["vb"]
            | _macro_regions["va"]
            | _macro_regions["vd"]
            | _macro_regions["vg"],
        }
    )

    # No named position groups are defined.
    _macro_positions = {}

    def __init__(self, NOT=False):
        """
        Create a selection.

        Args:
            NOT: when true, start from all TCR regions and treat added regions
                as removals.
        """
        self.NOT = NOT
        self.set_regions()
        self.positions = {"B": set(), "A": set(), "D": set(), "G": set()}
        self.exclude = {"B": set(), "A": set(), "D": set(), "G": set()}

    def set_regions(self, regions=()):
        """
        Set the regions to be used. Will clear anything added using add regions.
        """
        if self.NOT:
            # Work on a copy so the class-level "tr" set can never be modified
            # through this instance.
            self.regions = set(self._macro_regions["tr"])
        else:
            self.regions = set()
        self.add_regions(regions)

    def add_regions(self, regions):
        """
        Add regions to the selection.

        Each name is lower-cased and must be a defined region or a macro
        region. In NOT mode the named regions are removed from the selection
        instead of being added.

        Raises:
            AssertionError: if a name is neither a defined nor a macro region.
        """
        for region in regions:
            region = region.lower()
            if region in self._defined_regions:
                named = {region}
            elif region in self._macro_regions:
                named = self._macro_regions[region]
            else:
                # Names found only in _macro_positions are rejected as well.
                raise AssertionError("Undefined region")

            if self.NOT:
                self.regions = self.regions - named
            else:
                self.regions = self.regions | named

    def add_positions(self, positions, chain):
        """
        Accept the given ``(index, insertion)`` positions on ``chain``
        regardless of their region. The chain label is upper-cased.
        """
        chain = chain.upper()
        for position in positions:
            index, insertion = position
            self.positions[chain].add((index, insertion))

    def exclude_positions(self, positions, chain):
        """
        Reject the given ``(index, insertion)`` positions on ``chain``
        regardless of their region. The chain label is upper-cased.
        """
        chain = chain.upper()
        for position in positions:
            index, insertion = position
            self.exclude[chain].add((index, insertion))

    def accept(self, position, chain):
        """
        Decide whether ``position`` on ``chain`` is part of the selection.

        Exclusions win over everything else; otherwise the position is
        accepted when its region is selected or it was added explicitly.

        Returns:
            1 if accepted, 0 otherwise.

        Raises:
            KeyError: if the (upper-cased) chain has no entry in
                ``self.exclude`` / ``self.positions``.
        """
        chain = chain.upper()
        if position in self.exclude[chain]:
            return 0
        if (
            get_region(position, chain) in self.regions
            or position in self.positions[chain]
        ):
            return 1
        return 0
|