gemmi-protools 0.1.16__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gemmi-protools might be problematic. Click here for more details.

@@ -0,0 +1,787 @@
1
+ """
2
+ @Author: Luo Jiejian
3
+ """
4
+ from collections import defaultdict
5
+ from copy import deepcopy
6
+
7
+ from gemmi_protools import StructureParser
8
+ from gemmi_protools.utils.pdb_annot import hash_sequence, annotate_pdb
9
+
10
+
11
class ImmuneComplex(object):
    """Find immune receptor / antigen complexes in an annotated structure.

    The annotation dict (output of ``pdb_annot.annotate_pdb``) is read through
    the keys ``path``, ``info``, ``polymers``, ``anarci``, ``mhc`` and
    ``interfaces``.  As read by this class, ``interfaces`` maps
    ``"<chain a>/<chain b>"`` (the two chain ids in ascending order) to a
    two-item sequence of residue lists, item 0 for the first chain of the key
    and item 1 for the second.
    """

    # anarci ``fv_type`` values that mark a polymer as an immune receptor chain.
    _IMMUNE_FV_TYPES = ("IG/VL", "IG/VH", "IG/scFv", "TR/VL", "TR/VH", "TR/scFv")

    # (output column prefix, key in an anarci annotation record); the domain
    # name ("VH" or "VL") is appended to the prefix.
    _FV_COLUMNS = (("", "Fv_aa"),
                   ("type_", "chain_type"),
                   ("v_gene_", "v_gene"),
                   ("j_gene_", "j_gene"),
                   ("cdr1_", "cdr1_aa"),
                   ("cdr2_", "cdr2_aa"),
                   ("cdr3_", "cdr3_aa"))

    # A non-MHC antigen chain is an MHC peptide candidate only when strictly
    # shorter than this many residues.
    _PEPTIDE_LEN_CUTOFF = 30

    def __init__(self, annotation: dict,
                 min_ppi_res_hl=8,
                 min_ppi_res_hl_a=4,
                 min_ppi_res_mhc1_globulin=10,
                 min_ppi_res_mhc2_ab=20):
        """

        Args:
            annotation: output from pdb_annot.annotate_pdb
            min_ppi_res_hl: min PPI (H-L) res, default 8
            min_ppi_res_hl_a: min PPI (HL-antigen) res on antigen, default 4
            min_ppi_res_mhc1_globulin: min PPI (MHC I - globulin) res on globulin, default 10
            min_ppi_res_mhc2_ab: min PPI (MHC II alpha - MHC II beta) res on MHC II alpha or MHC II beta, default 20
        """
        self.annotation = annotation
        self.min_ppi_res_hl = min_ppi_res_hl
        self.min_ppi_res_hl_a = min_ppi_res_hl_a
        self.min_ppi_res_mhc1_globulin = min_ppi_res_mhc1_globulin
        self.min_ppi_res_mhc2_ab = min_ppi_res_mhc2_ab
        # Chain pairing depends on ch_infos, so the order below matters.
        self.ch_infos = self._get_chain_infos()
        self.ig_hl_pairs, self.vhh_chains = self._get_ig_hl_pairs()
        self.tr_hl_pairs = self._get_tr_hl_pairs()

        self.load_assembly_info()

    def load_assembly_info(self):
        """Read the assemblies of the structure file and store them in the annotation.

        Writes ``self.annotation["assembly"]`` as a dict with
        ``assem_name2ch`` (assembly name -> sorted chain ids) and
        ``full_coverage`` (True when the recorded assemblies together contain
        every (chain id, sequence hash) pair of the annotation).
        """
        st = StructureParser()
        st.load_from_file(self.annotation["path"])
        st.set_default_model()
        st.STRUCT.remove_alternative_conformations()
        st.STRUCT.remove_ligands_and_waters()
        st.STRUCT.remove_hydrogens()
        st.STRUCT.remove_empty_chains()
        st.update_entity()

        original_keys = set(self.ch_infos["ch2hash_id"].items())

        try:
            values = dict()
            key_records = set()
            for name in st.assembly_names:
                assem = st.get_assembly(name)
                cur_keys = {(ch, hash_sequence(seq))
                            for ch, seq in assem.polymer_sequences.items()}

                if cur_keys.issubset(original_keys):
                    key_records.update(cur_keys)
                    # NOTE(review): the diff rendering lost indentation; recording
                    # only assemblies that pass the subset test is assumed here.
                    values[name] = sorted(assem.polymer_sequences.keys())

            full_coverage = key_records == original_keys
        except Exception:
            # Deliberate best effort: any failure while generating assemblies
            # leaves the complex search running without assembly refinement.
            values = dict()
            full_coverage = False

        self.annotation["assembly"] = dict(assem_name2ch=values, full_coverage=full_coverage)

    def _get_chain_infos(self):
        """Index the annotation by chain id.

        Returns:
            dict with keys ``antigens`` (chains of protein polymers that are not
            immune chains), ``immune`` (fv_type -> chains), ``mhc_type2ch``,
            ``ch2mhc_type``, ``seq_lens`` (chain -> sequence length) and
            ``ch2hash_id`` (chain -> polymer hash id).
        """
        polymers = self.annotation["polymers"]

        seq_lens = dict()
        ch2hash_id = dict()
        for hash_id, val in polymers.items():
            n = len(val["sequence"])
            for ch in val["chain_ids"]:
                seq_lens[ch] = n
                ch2hash_id[ch] = hash_id

        immune_dict = defaultdict(list)
        immune_hashes = set()
        for hash_id, ann_val in self.annotation["anarci"].items():
            fv_type = ann_val["fv_type"]
            chs = polymers[hash_id]["chain_ids"]
            if fv_type in self._IMMUNE_FV_TYPES:
                immune_dict[fv_type].extend(chs)
                # NOTE(review): the diff rendering lost indentation; only
                # recognised fv types are assumed to be excluded from antigens.
                immune_hashes.add(hash_id)

        mhc_dict = defaultdict(list)
        for hash_id, mhc_type in self.annotation["mhc"].items():
            mhc_dict[mhc_type].extend(polymers[hash_id]["chain_ids"])

        ch2mhc_type = dict()
        for mhc_type, chs in mhc_dict.items():
            for ch in chs:
                ch2mhc_type[ch] = mhc_type

        antigens = []
        for hash_id, val in polymers.items():
            if hash_id not in immune_hashes and val["type"] == "protein":
                antigens.extend(val["chain_ids"])

        return dict(antigens=antigens,
                    immune=dict(immune_dict),
                    mhc_type2ch=dict(mhc_dict),
                    ch2mhc_type=ch2mhc_type,
                    seq_lens=seq_lens,
                    ch2hash_id=ch2hash_id,
                    )

    @staticmethod
    def _interface_key(ch_a, ch_b):
        """Return the ``interfaces`` key of two chains (ids in ascending order)."""
        if ch_a > ch_b:
            ch_a, ch_b = ch_b, ch_a
        return "%s/%s" % (ch_a, ch_b)

    def _get_antigen_ppi_res(self, query_ch: str, query_antigen: str):
        """Return the interface entry of ``query_antigen`` facing ``query_ch``.

        Args:
            query_ch: str, chain id
            query_antigen: str, chain id

        Returns:
            The entry stored for ``query_antigen`` in the interface of the two
            chains, or an empty list when no such interface is recorded.
        """
        key = self._interface_key(query_ch, query_antigen)
        interfaces = self.annotation["interfaces"]
        if key not in interfaces:
            return []
        # The antigen is the second chain of the key only when its id sorts last.
        return interfaces[key][1 if query_antigen > query_ch else 0]

    def _find_globulin(self, ch_mhc_i):
        """Return the non-MHC antigen chain with most contacts to an MHC I chain.

        Only chains reaching ``min_ppi_res_mhc1_globulin`` qualify; ties go to
        the greater chain id.  Returns "" when nothing qualifies.
        """
        scored = []
        for ch in self.ch_infos["antigens"]:
            if ch in self.ch_infos["ch2mhc_type"]:
                continue
            n_res = len(self._get_antigen_ppi_res(ch_mhc_i, ch))
            if n_res >= self.min_ppi_res_mhc1_globulin:
                scored.append((n_res, ch))
        return max(scored)[1] if scored else ""

    def _find_mhc_chain_with_most_ppi_res(self, immune_chains):
        """Return ``(chain, mhc_type)`` of the MHC chain with most receptor contacts.

        # No Ok for all instances, not Use
        # outlier: 2ak4, 5ksb

        Args:
            immune_chains: chain ids of the immune receptor

        Returns:
            tuple (chain id, MHC type), or None when no MHC chain reaches
            ``min_ppi_res_hl_a``.
        """
        scored = []
        for mhc_type, mhc_chains in self.ch_infos["mhc_type2ch"].items():
            for ch_mhc in mhc_chains:
                n_res = self._update_antigen_ppi_res(immune_chains, [ch_mhc])
                if n_res >= self.min_ppi_res_hl_a:
                    scored.append((n_res, ch_mhc, mhc_type))

        if not scored:
            return None
        _, ch_mhc, mhc_type = max(scored)
        return ch_mhc, mhc_type

    def _find_mhc_peptide(self, mhc_chains):
        """Return the short non-MHC antigen chain with most contacts to ``mhc_chains``.

        Candidates are shorter than ``_PEPTIDE_LEN_CUTOFF`` and must reach
        ``min_ppi_res_hl_a``; ties go to the greater chain id.  Returns ""
        when nothing qualifies.
        """
        scored = []
        for ch in self.ch_infos["antigens"]:
            if ch in self.ch_infos["ch2mhc_type"]:
                continue
            if self.ch_infos["seq_lens"][ch] >= self._PEPTIDE_LEN_CUTOFF:
                continue
            n_res = self._update_antigen_ppi_res(mhc_chains, [ch])
            if n_res >= self.min_ppi_res_hl_a:
                scored.append((n_res, ch))
        return max(scored)[1] if scored else ""

    def _search_complexes(self, receptors, immune_type):
        """Collect the antigen chains contacted by each receptor.

        Args:
            receptors: iterable of ``(chain_H, chain_L)``; ``chain_L`` is ""
                for single-chain receptors (VHH, scFv).
            immune_type: label stored in the ``immune_type`` field.

        Returns:
            list of dicts (immune_type, chain_H, chain_L, ch_antigens,
            n_ppi_ag_res), one per receptor with at least one antigen chain
            reaching ``min_ppi_res_hl_a``.
        """
        results = []
        for chain_h, chain_l in receptors:
            records = []
            n_ppi_ag_res = 0
            for ag in self.ch_infos["antigens"]:
                contacts = self._get_antigen_ppi_res(chain_h, ag)
                if chain_l != "":
                    # Entries seen by both chains are counted once.
                    n_res = len(set(contacts + self._get_antigen_ppi_res(chain_l, ag)))
                else:
                    n_res = len(contacts)
                if n_res >= self.min_ppi_res_hl_a:
                    records.append(ag)
                    n_ppi_ag_res += n_res

            if records:
                records.sort()
                results.append(dict(immune_type=immune_type,
                                    chain_H=chain_h,
                                    chain_L=chain_l,
                                    ch_antigens=records,
                                    n_ppi_ag_res=n_ppi_ag_res))
        return results

    def search_complex_for_IG_scFv(self):
        """Search antigens of every IG scFv chain (immune_type "IGscFv")."""
        chains = self.ch_infos["immune"].get("IG/scFv", [])
        return self._search_complexes([(ch, "") for ch in chains], "IGscFv")

    def search_complex_for_TR_scFv(self):
        """Search antigens of every TR scFv chain (immune_type "TRscFv")."""
        chains = self.ch_infos["immune"].get("TR/scFv", [])
        return self._search_complexes([(ch, "") for ch in chains], "TRscFv")

    def _pair_heavy_light(self, heavy_type, light_type):
        """Pair each heavy-type chain with its best light-type partner.

        A partner needs at least ``min_ppi_res_hl`` interface entries (both
        sides added up); the partner with most entries wins, ties going to the
        greater light chain id.  One light chain may serve several heavy chains.

        Returns:
            tuple (pairs, unpaired): list of (VH, VL) and list of heavy chains
            left without partner.
        """
        immune = self.ch_infos["immune"]
        pairs = []
        unpaired = []
        for ch_vh in immune.get(heavy_type, []):
            candidates = []
            for ch_vl in immune.get(light_type, []):
                key = self._interface_key(ch_vh, ch_vl)
                interfaces = self.annotation["interfaces"]
                if key not in interfaces:
                    continue
                ppi_res_hl = len(interfaces[key][0]) + len(interfaces[key][1])
                if ppi_res_hl >= self.min_ppi_res_hl:
                    candidates.append((ppi_res_hl, ch_vh, ch_vl))

            if candidates:
                _, best_vh, best_vl = max(candidates)
                pairs.append((best_vh, best_vl))
            else:
                unpaired.append(ch_vh)
        return pairs, unpaired

    def _get_ig_hl_pairs(self):
        """

        Returns:
            a tuple (HL pairs, VHH chains)
            HL pairs: list of tuple of (VH, VL)
            VHH chains: list of IG/VH chains without a paired IG/VL chain
        """
        return self._pair_heavy_light("IG/VH", "IG/VL")

    def _get_tr_hl_pairs(self):
        """

        Returns:
            TR HL pairs, list of (VH, VL)
        """
        pairs, _ = self._pair_heavy_light("TR/VH", "TR/VL")
        return pairs

    def search_complex_for_IG_HL(self):
        """Search antigens of every paired IG VH/VL (immune_type "IG")."""
        return self._search_complexes(self.ig_hl_pairs, "IG")

    def search_complex_for_VHH(self):
        """Search antigens of every unpaired IG/VH chain (immune_type "VHH")."""
        return self._search_complexes([(ch, "") for ch in self.vhh_chains], "VHH")

    def search_complex_for_TR_HL(self):
        """Search antigens of every paired TR VH/VL (immune_type "TR")."""
        return self._search_complexes(self.tr_hl_pairs, "TR")

    def _update_antigen_ppi_res(self, query_chains, antigen_chains):
        """Count the distinct antigen-side interface entries over all chain pairs."""
        res = set()
        for q_ch in query_chains:
            for q_ag in antigen_chains:
                res.update(self._get_antigen_ppi_res(q_ch, q_ag))
        return len(res)

    def _double_check_with_assembly(self, item):
        """Drop antigen chains that lie outside the receptor's best assembly.

        Among assemblies containing all receptor chains, the one missing the
        fewest antigen chains (then the smallest one) is chosen.  When it
        misses some antigens but keeps at least one, ``ch_antigens`` and
        ``n_ppi_ag_res`` of ``item`` are replaced in place.

        Args:
            item: element produced by the ``search_complex_for_*`` methods

        Returns:
            the same ``item`` object
        """
        immune_chains = {item["chain_H"]}
        if item["chain_L"] != "":
            immune_chains.add(item["chain_L"])
        antigens = set(item["ch_antigens"])

        candidates = []
        for assem_chs in self.annotation["assembly"]["assem_name2ch"].values():
            members = set(assem_chs)
            if immune_chains.issubset(members):
                candidates.append((len(antigens - members), len(members), antigens & members))

        if not candidates:
            return item

        # Rank on the two counts only: comparing the sets themselves would be a
        # subset test, which does not define an order between arbitrary sets.
        n_diff, _, common_ags = min(candidates, key=lambda c: c[:2])
        if n_diff > 0 and len(common_ags) > 0:
            new_ag_chs = sorted(common_ags)
            item["n_ppi_ag_res"] = self._update_antigen_ppi_res(sorted(immune_chains),
                                                                antigen_chains=new_ag_chs)
            item["ch_antigens"] = new_ag_chs
        return item

    def _resolve_mhc1(self, mhc_chains, antigen_chains):
        """Pick the MHC I chain to report together with its globulin and peptide.

        With one MHC I chain it is always reported.  With several, the first
        chain whose peptide is itself among ``antigen_chains`` is reported.

        Returns:
            tuple (chains to report as [MHC I, globulin, peptide], chains used
            for the contact count), or None when no chain can be chosen.
        """
        peptide_must_be_antigen = len(mhc_chains) > 1
        for mhc_chain in mhc_chains:
            peptide = self._find_mhc_peptide(mhc_chains=[mhc_chain])
            if peptide_must_be_antigen and peptide not in antigen_chains:
                continue
            ch_globulin = self._find_globulin(mhc_chain)
            ag_chs = [mhc_chain, peptide] if peptide != "" else [mhc_chain]
            return [mhc_chain, ch_globulin, peptide], ag_chs
        return None

    def _first_mhc2_partner(self, anchor, pool):
        """Return the first chain of ``pool`` that sufficiently contacts ``anchor``.

        The contact is measured on the ``anchor`` side of the interface
        against ``min_ppi_res_mhc2_ab``.  Returns "" when none qualifies.
        """
        for candidate in pool:
            if len(self._get_antigen_ppi_res(candidate, anchor)) >= self.min_ppi_res_mhc2_ab:
                return candidate
        return ""

    def _resolve_mhc2(self, alphas, betas):
        """Pick the MHC II alpha/beta pair to report together with its peptide.

        Args:
            alphas: MHC II alpha chains among the antigens of the receptor
            betas: MHC II beta chains among the antigens of the receptor

        Returns:
            tuple (chains to report as [alpha, beta, peptide], chains used for
            the contact count), or None when no pair can be formed.
        """
        mhc_type2ch = self.ch_infos["mhc_type2ch"]
        if len(alphas) == 1 and len(betas) == 1:
            alpha, beta = alphas[0], betas[0]
        elif len(alphas) == 1:
            alpha = alphas[0]
            # The receptor may not touch the beta chain at all: fall back to
            # every beta chain of the structure (possibly none).
            pool = betas if betas else mhc_type2ch.get("MHC_II_beta", [])
            beta = self._first_mhc2_partner(alpha, pool)
        elif len(betas) == 1:
            beta = betas[0]
            pool = alphas if alphas else mhc_type2ch.get("MHC_II_alpha", [])
            alpha = self._first_mhc2_partner(beta, pool)
        else:
            return None

        if alpha == "" or beta == "":
            return None

        peptide = self._find_mhc_peptide(mhc_chains=[alpha, beta])
        ag_chs = [alpha, beta, peptide] if peptide != "" else [alpha, beta]
        return [alpha, beta, peptide], ag_chs

    def _check_mhc_antigens(self, item):
        """Finalize one search result, normalising MHC antigens.

        ``ch_antigens`` becomes a "/"-joined string and ``check_status`` and
        ``mhc_type_of_antigen`` are added.  For a resolved MHC the chains are
        ordered MHC I / globulin / peptide or MHC II alpha / MHC II beta /
        peptide and ``n_ppi_ag_res`` is recomputed.  ``check_status`` is "N"
        when MHC antigens are present but cannot be resolved.
        """
        antigen_chains = item["ch_antigens"]
        ch2mhc_type = self.ch_infos["ch2mhc_type"]

        cts = defaultdict(list)
        for ag in antigen_chains:
            if ag in ch2mhc_type:
                cts[ch2mhc_type[ag]].append(ag)

        status, mhc_label, resolved = "Y", "", None
        if cts:
            uniq_types = set(cts)
            if uniq_types == {"MHC_I"}:
                mhc_label = "MHC I"
                resolved = self._resolve_mhc1(cts["MHC_I"], antigen_chains)
            elif uniq_types.issubset({"MHC_II_alpha", "MHC_II_beta"}):
                mhc_label = "MHC II"
                resolved = self._resolve_mhc2(cts.get("MHC_II_alpha", []),
                                              cts.get("MHC_II_beta", []))
            # Anything else (e.g. MHC I mixed with MHC II) stays unresolved.
            if resolved is None:
                status, mhc_label = "N", ""

        if resolved is None:
            item["ch_antigens"] = "/".join(antigen_chains)
            item["check_status"] = status
        else:
            shown_chains, ag_chs = resolved
            if item["chain_L"] == "":
                immune_chains = [item["chain_H"]]
            else:
                immune_chains = [item["chain_H"], item["chain_L"]]
            item["ch_antigens"] = "/".join(shown_chains)
            item["check_status"] = status
            item["n_ppi_ag_res"] = self._update_antigen_ppi_res(immune_chains,
                                                                antigen_chains=ag_chs)
        item["mhc_type_of_antigen"] = mhc_label
        return item

    def run(self):
        """Search all receptor kinds and post-process every hit.

        Returns:
            list of dicts with keys immune_type, chain_H, chain_L, ch_antigens
            ("/"-joined), n_ppi_ag_res, check_status and mhc_type_of_antigen.
        """
        results = []
        results.extend(self.search_complex_for_IG_HL())
        results.extend(self.search_complex_for_TR_HL())
        results.extend(self.search_complex_for_VHH())
        results.extend(self.search_complex_for_IG_scFv())
        results.extend(self.search_complex_for_TR_scFv())

        # Refine with the assemblies first, then check MHC antigens.
        return [self._check_mhc_antigens(self._double_check_with_assembly(org_item))
                for org_item in results]

    def _chain_columns(self, suffix, hash_id=None):
        """Return the seq/specie/taxid columns of a polymer (blank when ``hash_id`` is None)."""
        if hash_id is None:
            sequence = specie = taxid = ""
        else:
            vals = self.annotation["polymers"][hash_id]
            sequence, specie, taxid = vals["sequence"], vals["specie"], vals["taxid"]
        return {"seq_" + suffix: sequence,
                "specie_" + suffix: specie,
                "taxid_" + suffix: taxid}

    @classmethod
    def _fv_columns(cls, domain, anarci_ann=None):
        """Return the Fv columns of one domain (blank when ``anarci_ann`` is None)."""
        return {prefix + domain: ("" if anarci_ann is None else anarci_ann[source])
                for prefix, source in cls._FV_COLUMNS}

    def add_information(self, element: dict):
        """Extend one result with sequence, Fv and entry level information.

        Args:
            element: element of output from self.run

        Returns:
            a new dict: a deep copy of ``element`` plus the chain, Fv and entry
            columns (``element`` itself is left untouched).

        Raises:
            RuntimeError: when ``element["immune_type"]`` is not one of
                IG, TR, VHH, IGscFv, TRscFv.
        """
        immune_type = element["immune_type"]
        if immune_type not in ("IG", "TR", "VHH", "TRscFv", "IGscFv"):
            raise RuntimeError("Unknown immune type: %s" % immune_type)

        anarci = self.annotation["anarci"]
        h_hash_id = self.ch_infos["ch2hash_id"][element["chain_H"]]
        l_hash_id = None

        if immune_type in ("IG", "TR"):
            l_hash_id = self.ch_infos["ch2hash_id"][element["chain_L"]]
            vh_ann = anarci[h_hash_id]["annotations"][0]
            vl_ann = anarci[l_hash_id]["annotations"][0]
        elif immune_type == "VHH":
            vh_ann = anarci[h_hash_id]["annotations"][0]
            vl_ann = None
        else:
            # scFv: both domains are annotated on the same chain.
            anarci_ann_1, anarci_ann_2 = anarci[h_hash_id]["annotations"]

            # {"IGH", "IGL"}, {"IGH", "IGK"}, {"TRA", "TRB"}, {"TRG", "TRD"}
            key = "%s%s" % (anarci_ann_1["classification"], anarci_ann_1["chain_type"])
            if key in ("IGH", "TRB", "TRD"):
                vh_ann, vl_ann = anarci_ann_1, anarci_ann_2
            else:
                vh_ann, vl_ann = anarci_ann_2, anarci_ann_1

        output = dict()
        output.update(self._chain_columns("H", h_hash_id))
        output.update(self._chain_columns("L", l_hash_id))
        output.update(self._fv_columns("VH", vh_ann))
        output.update(self._fv_columns("VL", vl_ann))

        info = self.annotation["info"]
        base_info = dict(path=self.annotation["path"],
                         pdb_id=info["pdb_id"],
                         exp_method=info["exp_method"],
                         deposition_date=info["deposition_date"],
                         resolution=info["resolution"],
                         title=info["title"],
                         antigen_hash_id=self.get_antigens_hash_id(element["ch_antigens"])
                         )

        merge_out = deepcopy(element)
        merge_out.update(output)
        merge_out.update(base_info)
        return merge_out

    def get_antigens_hash_id(self, antigens: str):
        """Translate "/"-joined chain ids into "/"-joined hash ids (blanks are kept)."""
        ch2hash_id = self.ch_infos["ch2hash_id"]
        return "/".join("" if ch == "" else ch2hash_id[ch] for ch in antigens.split("/"))

    def get_antigen_seqs(self, hash_ids: set):
        """Return one record (pdb_id, hash_id, specie, description, taxid, sequence) per hash id."""
        pdb_id = self.annotation["info"]["pdb_id"]
        polymers = self.annotation["polymers"]
        return [dict(pdb_id=pdb_id,
                     hash_id=hash_id,
                     specie=polymers[hash_id]["specie"],
                     description=polymers[hash_id]["description"],
                     taxid=polymers[hash_id]["taxid"],
                     sequence=polymers[hash_id]["sequence"])
                for hash_id in hash_ids]
748
+
749
+
750
def immune_complex_from_pdb(struct_file: str,
                            ppi_threshold: float = 4.5,
                            min_ppi_res_hl: int = 8,
                            min_ppi_res_hl_a: int = 4,
                            min_ppi_res_mhc1_globulin: int = 10,
                            min_ppi_res_mhc2_ab: int = 20,
                            n_cpus: int = 1,
                            max_seqs: int = 100):
    """
    Annotate a structure file and list its immune complexes.

    :param struct_file: str
        path of structure file, .pdb, .cif, .pdb.gz, .cif.gz
    :param ppi_threshold: float, default 4.5
        the maximum distance threshold between heavy atoms to identify interactions
    :param min_ppi_res_hl: int, default 8
        the minimum number of interacting residues between H and L chains
    :param min_ppi_res_hl_a: int, default 4
        the minimum number of interacting residues between HL and antigen chain
    :param min_ppi_res_mhc1_globulin: int, default 10
        the minimum number of interacting residues between MHC 1 and beta micro globulin chain
    :param min_ppi_res_mhc2_ab: int, default 20
        the minimum number of interacting residues between the MHC 2 alpha and MHC 2 beta chains
    :param n_cpus: int, default 1
        passed unchanged to annotate_pdb
    :param max_seqs: int, default 100
        passed unchanged to annotate_pdb
    :return: list of dict
        one entry per element of ImmuneComplex.run, extended by ImmuneComplex.add_information
    """
    searcher = ImmuneComplex(annotate_pdb(struct_file, ppi_threshold, n_cpus, max_seqs),
                             min_ppi_res_hl=min_ppi_res_hl,
                             min_ppi_res_hl_a=min_ppi_res_hl_a,
                             min_ppi_res_mhc1_globulin=min_ppi_res_mhc1_globulin,
                             min_ppi_res_mhc2_ab=min_ppi_res_mhc2_ab)
    return [searcher.add_information(hit) for hit in searcher.run()]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gemmi_protools
3
- Version: 0.1.16
3
+ Version: 0.1.17
4
4
  Summary: An Enhanced tool to process PDB structures based on Gemmi
5
5
  Author: Luo Jiejian
6
6
  Author-email: Luo Jiejian <luojiejian12@mails.ucas.ac.cn>
@@ -14,8 +14,6 @@ Requires-Dist: typeguard>=4.1.2
14
14
  Requires-Dist: numpy
15
15
  Requires-Dist: biopython>=1.84
16
16
  Requires-Dist: scipy>=1.14.1
17
- Requires-Dist: dockq
18
- Requires-Dist: hmmer
19
17
  Dynamic: author
20
18
  Dynamic: license-file
21
19
 
@@ -23,7 +21,18 @@ Dynamic: license-file
23
21
 
24
22
  # Install
25
23
  ```commandline
26
- conda create -n gp python=3.10 anarci -c bioconda
24
+ conda create -n gp python=3.10
25
+ conda install -n gp anarci hmmer -c bioconda
26
+ conda install -n gp dockq -c conda-forge
27
27
  conda activate gp
28
28
  pip install gemmi_protools
29
29
  ```
30
+
31
+ # Usage
32
+
33
+ ## read structures
34
+ ```python
35
+ from gemmi_protools import StructureParser
36
+ st=StructureParser()
37
+ st.load_from_file("your.pdb")
38
+ ```
@@ -17,10 +17,11 @@ gemmi_protools/utils/__init__.py,sha256=F6e1xNT_7lZAWQgNIneH06o2qtWYrHNr_xPUPTww
17
17
  gemmi_protools/utils/align.py,sha256=wyJDawxW10kdYWEM1F_LUEc3Qo-3_I7P5hFk-r-yqgY,7432
18
18
  gemmi_protools/utils/dockq.py,sha256=XmMwVEy-H4p6sH_HPcDWA3TP77OWdih0fE_BQJDr4pU,4189
19
19
  gemmi_protools/utils/fixer.py,sha256=yP9pTJ67n7z56UFfe2-eEsS3jkJfG2lP4KAEpXxlrnE,10142
20
+ gemmi_protools/utils/immune_complex.py,sha256=0ni6j0JpcJrPGDLydlbK5ouF7LsQWvDke3UgTOgKEJM,32213
20
21
  gemmi_protools/utils/pdb_annot.py,sha256=nnRlLpjczhCP1ojEgsO3FuVgfsyleDZ34QxqyI8-wr0,11143
21
22
  gemmi_protools/utils/ppi.py,sha256=VWYsdxWwQoS1xwEYj5KB96Zz3F8r5Eyuw6NT3ReD-wc,2330
22
- gemmi_protools-0.1.16.dist-info/licenses/LICENSE,sha256=JuQvKcgj6n11y5y6nXr9rABv3gJSswc4eTCd5WZBtSY,1062
23
- gemmi_protools-0.1.16.dist-info/METADATA,sha256=wxrsd_ApvtPBm1rLrPjFCx8mTt3boSg4-7wlx6mJaZU,750
24
- gemmi_protools-0.1.16.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
25
- gemmi_protools-0.1.16.dist-info/top_level.txt,sha256=P12mYJi5O5EKIn5u-RFaWxuix431CgLacSRD7rBid_U,15
26
- gemmi_protools-0.1.16.dist-info/RECORD,,
23
+ gemmi_protools-0.1.17.dist-info/licenses/LICENSE,sha256=JuQvKcgj6n11y5y6nXr9rABv3gJSswc4eTCd5WZBtSY,1062
24
+ gemmi_protools-0.1.17.dist-info/METADATA,sha256=sMzScJ0AFMKw9uMWCzxD9vYe6U3nATUX6BflEu6L1Ss,918
25
+ gemmi_protools-0.1.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
26
+ gemmi_protools-0.1.17.dist-info/top_level.txt,sha256=P12mYJi5O5EKIn5u-RFaWxuix431CgLacSRD7rBid_U,15
27
+ gemmi_protools-0.1.17.dist-info/RECORD,,