gemmi-protools 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gemmi-protools might be problematic. Click here for more details.

@@ -134,6 +134,12 @@ class StructureAligner(object):
134
134
  if isinstance(ref_chains, list):
135
135
  r_st.pick_chains(ref_chains)
136
136
 
137
+ q_ch_mapper = q_st.make_chain_names_to_one_letter()
138
+ r_ch_mapper = r_st.make_chain_names_to_one_letter()
139
+
140
+ q_ch_mapper_r = {v: k for k, v in q_ch_mapper.items()}
141
+ r_ch_mapper_r = {v: k for k, v in r_ch_mapper.items()}
142
+
137
143
  _tmp_a = os.path.join(tmp_dir, "a.pdb")
138
144
  q_st.to_pdb(_tmp_a)
139
145
 
@@ -158,6 +164,9 @@ class StructureAligner(object):
158
164
  self.is_aligned = True
159
165
  self.by_query = q_st.chain_ids if query_chains is None else query_chains
160
166
  self.by_ref = r_st.chain_ids if ref_chains is None else ref_chains
167
+ self.values["query_chain_ids"] = [q_ch_mapper_r.get(ch, ch) for ch in self.values["query_chain_ids"]]
168
+ self.values["ref_chain_ids"] = [r_ch_mapper_r.get(ch, ch) for ch in self.values["ref_chain_ids"]]
169
+
161
170
  finally:
162
171
  if os.path.isdir(tmp_dir):
163
172
  shutil.rmtree(tmp_dir)
@@ -0,0 +1,331 @@
1
+ """
2
+ @Author: Luo Jiejian
3
+ """
4
+ import hashlib
5
+ import itertools
6
+ import os
7
+ import re
8
+ import shutil
9
+ import subprocess
10
+ import uuid
11
+ from collections import defaultdict
12
+ from dataclasses import asdict
13
+ from importlib.resources import files
14
+ from typing import List
15
+
16
+ import numpy as np
17
+ from anarci import run_anarci
18
+ from anarci.germlines import all_germlines
19
+ from joblib import Parallel, delayed
20
+ from scipy.spatial import cKDTree
21
+
22
+ from gemmi_protools import StructureParser
23
+ from gemmi_protools.utils.ppi import _ppi_atoms
24
+
25
+
26
def hash_sequence(seq: str) -> str:
    """Return the SHA-256 hex digest of ``seq``.

    The string is encoded with ``str.encode()`` (default codec) before
    hashing, so identical sequences always yield the same hexadecimal key.
    """
    hasher = hashlib.sha256()
    hasher.update(seq.encode())
    return hasher.hexdigest()
29
+
30
+
31
def get_fv_region(in_sequence: str):
    """Number a sequence with ANARCI (IMGT scheme) and describe each domain found.

    Args:
        in_sequence: sequence passed to ``run_anarci`` under the name "input".

    Returns:
        A list with one dict per numbered domain, or an empty list when
        ANARCI reports no numbering for the sequence. Each dict holds:
            Fv_aa: all region residues joined in IMGT order with "-" removed,
                after FR1/FR4 gaps were filled from the assigned germlines
            classification: first two characters of the V gene name
            chain_type: ANARCI chain type of the domain
            v_gene / j_gene: "<species>/<gene>" of the assigned germlines
            cdr1_aa / cdr2_aa / cdr3_aa: CDR residues with "-" removed
    """
    # IMGT region boundaries, both ends inclusive.
    # https://www.imgt.org/IMGTScientificChart/Nomenclature/IMGT-FRCDRdefinition.html
    # alpha-beta TCR: light chain alpha, heavy chain beta
    # gamma-delta TCR: light chain gamma, heavy chain delta
    imgt_scheme = dict(
        fr1=(1, 26),
        cdr1=(27, 38),
        fr2=(39, 55),
        cdr2=(56, 65),
        fr3=(66, 104),
        cdr3=(105, 117),
        fr4=(118, 128),
    )
    # IMGT position -> region name
    region_of = {
        pos: name
        for name, (lo, hi) in imgt_scheme.items()
        for pos in range(lo, hi + 1)
    }

    def _ungapped(residues):
        """Join residues, dropping gap characters."""
        return "".join(aa for aa in residues if aa != "-")

    def _fill_from_germline(sites, region, germline_seq):
        """Collect the residues of ``region``; gaps at non-insertion sites take the germline residue."""
        lo, hi = imgt_scheme[region]
        template = dict(zip(range(lo, hi + 1), germline_seq[lo - 1:hi]))
        repaired = []
        for site in sites:
            pos, ins = site[0]
            if not lo <= pos <= hi:
                continue
            residue = site[1]
            # Only plain (non-insertion) gap sites are repaired, and only
            # when the germline itself has a residue at that position.
            if ins == ' ' and residue == "-" and template[pos] != "-":
                residue = template[pos]
            repaired.append(residue)
        return repaired

    _, numbered, alignment_details, _ = run_anarci(
        [("input", in_sequence)], scheme="imgt", assign_germline=True)
    if numbered[0] is None:
        return []

    outputs = []
    for (aligned_sites, _start, _end), details in zip(numbered[0], alignment_details[0]):
        # Bucket the aligned residues by IMGT region.
        regions = defaultdict(list)
        for site in aligned_sites:
            regions[region_of[site[0][0]]].append(site[1])

        # Pad positions after the last numbered one with gaps up to 128.
        # NOTE(review): these gaps are filtered out or overwritten below, so
        # a truncated FR4 tail is never filled from the germline -- confirm
        # whether that is intended.
        last_pos = aligned_sites[-1][0][0]
        for pos in range(last_pos + 1, 129):
            regions[region_of[pos]].append("-")

        cdr1_seq = _ungapped(regions["cdr1"])
        cdr2_seq = _ungapped(regions["cdr2"])
        cdr3_seq = _ungapped(regions["cdr3"])

        # Germline V gene repairs FR1, germline J gene repairs FR4.
        # NOTE(review): assumes both germlines were assigned; the unpacking
        # fails if ANARCI leaves one of them empty -- TODO confirm.
        chain_type = details["chain_type"]
        v_gene_specie, v_gene = details["germlines"]["v_gene"][0]
        j_gene_specie, j_gene = details["germlines"]["j_gene"][0]
        v_germline = all_germlines["V"][chain_type][v_gene_specie][v_gene]
        j_germline = all_germlines["J"][chain_type][j_gene_specie][j_gene]

        regions["fr1"] = _fill_from_germline(aligned_sites, "fr1", v_germline)
        regions["fr4"] = _fill_from_germline(aligned_sites, "fr4", j_germline)

        fixed_fv_seq = _ungapped(
            aa
            for r_name in ["fr1", "cdr1", "fr2", "cdr2", "fr3", "cdr3", "fr4"]
            for aa in regions[r_name]
        )

        outputs.append(dict(Fv_aa=fixed_fv_seq,
                            classification=v_gene[0:2],
                            chain_type=chain_type,
                            v_gene=v_gene_specie + "/" + v_gene,
                            j_gene=j_gene_specie + "/" + j_gene,
                            cdr1_aa=cdr1_seq,
                            cdr2_aa=cdr2_seq,
                            cdr3_aa=cdr3_seq,
                            )
                       )
    return outputs
132
+
133
+
134
def fv_region_type(inputs: list[dict]):
    """Classify a sequence from its list of annotated variable domains.

    Args:
        inputs: domain dicts, each carrying "classification" and
            "chain_type" (the shape produced by ``get_fv_region``).

    Returns:
        "not-Fv" for no domains; "<clf>/VH" or "<clf>/VL" for a single
        domain of a known chain; "<clf>/scFv" for two domains forming one of
        the accepted pairs; "other" in every remaining case.
    """
    single_domain = {
        "IGH": "VH", "TRB": "VH", "TRD": "VH",
        "IGK": "VL", "IGL": "VL", "TRA": "VL", "TRG": "VL",
    }
    accepted_pairs = ({"IGH", "IGL"}, {"IGH", "IGK"}, {"TRA", "TRB"}, {"TRG", "TRD"})

    def _label(entry):
        return "%s%s" % (entry["classification"], entry["chain_type"])

    count = len(inputs)
    if count == 0:
        return "not-Fv"

    if count == 1:
        clf = inputs[0]["classification"]
        domain = single_domain.get(_label(inputs[0]))
        return "other" if domain is None else "%s/%s" % (clf, domain)

    if count == 2:
        labels = {_label(entry) for entry in inputs}
        if labels in accepted_pairs:
            # Both labels of an accepted pair share the same two-letter prefix.
            return "%s/scFv" % labels.pop()[0:2]

    return "other"
158
+
159
+
160
def annotate_mhc(seq_dict: dict):
    """Scan sequences against the bundled MHC HMM library with ``hmmscan``.

    The sequences are written to a FASTA file in a scratch directory,
    ``hmmscan --tblout ... --cut_ga`` is run on it, and the tabular output
    is parsed. The scratch directory is always removed afterwards.

    Args:
        seq_dict: dict,
            key: ch_id (used as the FASTA record name)
            val: protein seq

    Returns:
        dict mapping the third column of each hit line (the query name,
        i.e. the key from ``seq_dict``) to the first column (the name of
        the matched profile). Sequences without a hit are absent. An empty
        ``seq_dict`` yields an empty dict without running hmmscan.

    Raises:
        RuntimeError: if ``hmmscan`` is not on PATH, or if it exits with a
            non-zero status (its stderr is included in the message).
    """
    # Nothing to scan: do not spawn hmmscan on an empty FASTA file.
    if not seq_dict:
        return dict()

    # Resolve the executable before creating anything on disk, so a missing
    # binary cannot leave a stray scratch directory behind.
    _path = shutil.which("hmmscan")
    if _path is None:
        raise RuntimeError("hmmscan is not found.")

    hmm_model = str(files("gemmi_protools.data") / "MHC" / "MHC_combined.hmm")

    # Scratch directory with a unique name inside the user's home directory.
    home_dir = os.path.expanduser("~")
    tmp_dir = os.path.join(home_dir, str(uuid.uuid4()))
    os.makedirs(tmp_dir)

    try:
        # Save all sequences to one FASTA file.
        fasta_file = os.path.join(tmp_dir, "input.fasta")
        with open(fasta_file, "w") as fo:
            for ch_id, seq in seq_dict.items():
                fo.write(">%s\n%s\n" % (ch_id, seq))

        result_file = os.path.join(tmp_dir, "result.txt")

        # Argument list without a shell: paths containing spaces or shell
        # metacharacters are passed through verbatim.
        cmd = [_path, "--tblout", result_file, "--cut_ga", hmm_model, fasta_file]
        try:
            subprocess.run(cmd, check=True,
                           stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as ce:
            stderr_text = ce.stderr.decode(errors="replace").strip() if ce.stderr else ""
            raise RuntimeError("%s\n%s" % (ce, stderr_text)) from ce

        out = dict()
        with open(result_file, "r") as fi:
            for li in fi:
                li = li.strip()
                # Skip blank lines and "#" comment/header lines.
                if not li or li.startswith("#"):
                    continue
                fields = li.split()
                if len(fields) < 3:
                    continue
                # NOTE(review): a later hit for the same query overwrites an
                # earlier one. If hmmscan lists a query's hits best-first
                # (TODO confirm), this keeps the weakest hit; use
                # out.setdefault(...) to keep the first instead.
                out[fields[2]] = fields[0]
        return out
    finally:
        if os.path.isdir(tmp_dir):
            shutil.rmtree(tmp_dir)
208
+
209
+
210
def _interface_residues(struct: StructureParser,
                        chains_x: List[str],
                        chains_y: List[str],
                        threshold: float = 4.5):
    """
    Find the residues of two chain groups that lie within a distance cutoff of each other.

    Atoms of both groups come from ``_ppi_atoms``; two KD-trees are built and
    every atom pair returned by ``sparse_distance_matrix`` for ``threshold``
    marks its residues as interface residues.

    :param struct: StructureParser
    :param chains_x: chain names of the first group
    :param chains_y: chain names of the second group
    :param threshold: maximum atom-atom distance passed to the KD-tree query
    :return:
        residues of chains_x, residues of chains_y; each a list of unique
        "ch_name/res_num/res_icode/res_name" strings (icode stripped)
    """
    residue_fields = ["ch_name", 'res_num', 'res_icode', 'res_name']

    def _labels(atom_ids, hit_rows):
        # presumably atom_ids is a NumPy structured array holding these
        # fields -- verify against _ppi_atoms
        unique_res = np.unique(atom_ids[hit_rows][residue_fields])
        return ["%s/%d/%s/%s" % (ch, num, icode.strip(), name)
                for ch, num, icode, name in unique_res.tolist()]

    x_coord, x_id = _ppi_atoms(struct, chains_x)
    y_coord, y_id = _ppi_atoms(struct, chains_y)

    tree_x = cKDTree(x_coord)
    tree_y = cKDTree(y_coord)
    contacts = tree_x.sparse_distance_matrix(tree_y, threshold, output_type='coo_matrix')

    # Rows index atoms of the x group, columns index atoms of the y group.
    return _labels(x_id, contacts.row), _labels(y_id, contacts.col)
238
+
239
+
240
def polymer_interface_residues(struct: StructureParser,
                               ppi_threshold: float = 4.5,
                               n_cpus: int = 1,
                               ):
    """
    Compute interface residues for every pair of polymer chains.

    Args:
        struct: parsed structure; chains whose type in ``struct.chain_types``
            is "protein", "dna" or "rna" are considered
        ppi_threshold: distance cutoff forwarded to ``_interface_residues``
        n_cpus: requested number of joblib workers; the parallel path is only
            taken when more than one worker is usable and there are at least
            100 chain pairs

    Returns:
        dict keyed by "<chain_1>/<chain_2>" with value
        [residues of chain_1, residues of chain_2]; pairs for which the first
        chain has no interface residues are omitted.
    """
    polymer_kinds = ("protein", "dna", "rna")
    chains = [name for name, kind in struct.chain_types.items() if kind in polymer_kinds]
    ch_pairs = sorted(itertools.combinations(chains, r=2))

    def _run(ch_1, ch_2):
        res_x, res_y = _interface_residues(struct, chains_x=[ch_1], chains_y=[ch_2],
                                           threshold=ppi_threshold)
        if not res_x:
            return dict()
        return {"%s/%s" % (ch_1, ch_2): [res_x, res_y]}

    # At least one worker, never more workers than pairs.
    cpu2use = max(min(n_cpus, len(ch_pairs)), 1)

    if cpu2use == 1 or len(ch_pairs) < 100:
        per_pair = [_run(ch_1, ch_2) for ch_1, ch_2 in ch_pairs]
    else:
        per_pair = Parallel(n_jobs=cpu2use)(delayed(_run)(c1, c2) for c1, c2 in ch_pairs)

    outputs = dict()
    for item in per_pair:
        outputs.update(item)
    return outputs
276
+
277
+
278
def annotate_pdb(struct_file: str, ppi_threshold: float = 4.5,
                 n_cpus: int = 1, max_seqs: int = 100):
    """Load a structure file and collect sequence, Fv, MHC and interface annotations.

    Args:
        struct_file: path of the structure file to load
        ppi_threshold: distance cutoff forwarded to ``polymer_interface_residues``
        n_cpus: worker count forwarded to ``polymer_interface_residues``
        max_seqs: maximum number of chains accepted after cleaning

    Returns:
        dict with keys
            path: absolute path of ``struct_file``
            info: structure info as a dict, plus ``resolution``; ``pdb_id``
                and ``exp_method`` lower-cased
            polymers: one record per unique sequence, keyed by its SHA-256
                hash (chain_ids, sequence, type, description, specie, taxid)
            anarci: per protein sequence hash, ``fv_type`` and the
                ``get_fv_region`` annotations (sequences typed "not-Fv" are
                left out)
            mhc: result of ``annotate_mhc`` on the unique sequences
            interfaces: result of ``polymer_interface_residues``

    Raises:
        RuntimeError: when the cleaned structure has more than ``max_seqs`` chains.
    """
    st = StructureParser()
    st.load_from_file(struct_file)
    st.set_default_model()
    st.STRUCT.remove_alternative_conformations()
    st.STRUCT.remove_ligands_and_waters()
    st.STRUCT.remove_hydrogens()
    st.STRUCT.remove_empty_chains()
    st.update_entity()

    n_chains = len(st.chain_ids)
    if n_chains > max_seqs:
        raise RuntimeError("Too many chains: %d > %d" % (n_chains, max_seqs))

    # Merge chains that share an identical sequence into one record.
    polymers = dict()
    for ch, seq in st.polymer_sequences.items():
        seq_hash = hash_sequence(seq)
        record = polymers.get(seq_hash)
        if record is not None:
            record["chain_ids"].append(ch)
            continue

        # Entity metadata is taken from the first chain seen with this sequence.
        chain_type = st.chain_types[ch]
        eid = st.ENTITY.polymer2eid[ch]
        polymers[seq_hash] = dict(chain_ids=[ch],
                                  sequence=seq,
                                  type=chain_type,
                                  description=st.ENTITY.eid2desc.get(eid, "Unknown"),
                                  specie=st.ENTITY.eid2specie.get(eid, "Unknown"),
                                  taxid=st.ENTITY.eid2taxid.get(eid, "Unknown"),
                                  )

    sdict = {seq_hash: record["sequence"] for seq_hash, record in polymers.items()}

    # Antibody / TCR variable-domain annotation for protein sequences only.
    results = dict()
    for seq_hash, record in polymers.items():
        record["chain_ids"].sort()
        if record["type"] != "protein":
            continue
        anarci_info = get_fv_region(record["sequence"])
        fvt = fv_region_type(anarci_info)
        if fvt != "not-Fv":
            results[seq_hash] = dict(fv_type=fvt, annotations=anarci_info)

    struct_info = asdict(st.INFO)
    struct_info.update(resolution=st.STRUCT.resolution)
    struct_info["pdb_id"] = struct_info["pdb_id"].lower()
    struct_info["exp_method"] = struct_info["exp_method"].lower()

    full_path = os.path.abspath(os.path.expanduser(struct_file))
    mhc_hits = annotate_mhc(sdict) if len(sdict) > 0 else dict()
    interfaces = polymer_interface_residues(st, ppi_threshold, n_cpus=n_cpus)

    return dict(path=full_path,
                info=struct_info,
                polymers=polymers,
                anarci=results,
                mhc=mhc_hits,
                interfaces=interfaces,
                )
@@ -1,7 +1,8 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gemmi_protools
3
- Version: 0.1.13
3
+ Version: 0.1.15
4
4
  Summary: An Enhanced tool to process PDB structures based on Gemmi
5
+ Author: Luo Jiejian
5
6
  Author-email: Luo Jiejian <luojiejian12@mails.ucas.ac.cn>
6
7
  License-Expression: MIT
7
8
  Requires-Python: >=3.10
@@ -14,6 +15,15 @@ Requires-Dist: numpy
14
15
  Requires-Dist: biopython>=1.84
15
16
  Requires-Dist: scipy>=1.14.1
16
17
  Requires-Dist: dockq
18
+ Requires-Dist: hmmer
19
+ Dynamic: author
17
20
  Dynamic: license-file
18
21
 
19
22
  # An Enhanced tool to process PDB structures based on Gemmi
23
+
24
+ # Install
25
+ ```commandline
26
+ conda create -n gp python=3.10 anarci -c bioconda
27
+ conda activate gp
28
+ pip install gemmi_protools
29
+ ```
@@ -1,4 +1,9 @@
1
1
  gemmi_protools/__init__.py,sha256=hwUw-EieCG0kwzHjTjzHF9Bc3D-J5R_l6G8PCcFegkw,331
2
+ gemmi_protools/data/MHC/MHC_combined.hmm,sha256=w0_vzPiEWne_d_kYmqR0OiSsCOpQioItKy3Zq-JMsH4,159451
3
+ gemmi_protools/data/MHC/MHC_combined.hmm.h3f,sha256=QGG4l-v76RYtysJ5rybnz5v6VgJg2RjoQQHUVWL5jmg,45522
4
+ gemmi_protools/data/MHC/MHC_combined.hmm.h3i,sha256=yn-700hoBSJB39Tj8Ia8UhSZWpYiCZFNcbnYAFNjReI,300
5
+ gemmi_protools/data/MHC/MHC_combined.hmm.h3m,sha256=CvNMCsobQiX-wL7iB4CreNcbpnElE31NmmjmMR0iYVI,66174
6
+ gemmi_protools/data/MHC/MHC_combined.hmm.h3p,sha256=-mK278pRedG3-KL-DtuVAQy7La9DgXg5FcP89D6X3Ck,78325
2
7
  gemmi_protools/io/__init__.py,sha256=F6e1xNT_7lZAWQgNIneH06o2qtWYrHNr_xPUPTwwx5E,29
3
8
  gemmi_protools/io/cif_opts.py,sha256=cYEhubRP2rymbwSlB3ZQPGUFQiXGb2UQ0gdXdTd3c-I,6646
4
9
  gemmi_protools/io/convert.py,sha256=780sQcwhslUD4Hj5UZMVlQdbicniJ6jNjncTl_7jaMk,3841
@@ -9,12 +14,13 @@ gemmi_protools/io/peptide.py,sha256=a2wiEutJmvhl6gDCIzzqRCbmyknk2mwgy2FZ53lXclU,
9
14
  gemmi_protools/io/reader.py,sha256=2AXg1JdYT2LxL6jWVsJkLHQREwAoYR7V-g-hQVgSgGg,16237
10
15
  gemmi_protools/io/struct_info.py,sha256=9nBj1Zer03S8_Wks7L7uRlc9PlbfCKzoaT32pKR58X8,2769
11
16
  gemmi_protools/utils/__init__.py,sha256=F6e1xNT_7lZAWQgNIneH06o2qtWYrHNr_xPUPTwwx5E,29
12
- gemmi_protools/utils/align.py,sha256=CZcrvjy-ZbX2u7OAn-YGblbxaj9YFUDX4CFZcpbpnB8,6959
17
+ gemmi_protools/utils/align.py,sha256=wyJDawxW10kdYWEM1F_LUEc3Qo-3_I7P5hFk-r-yqgY,7432
13
18
  gemmi_protools/utils/dockq.py,sha256=XmMwVEy-H4p6sH_HPcDWA3TP77OWdih0fE_BQJDr4pU,4189
14
19
  gemmi_protools/utils/fixer.py,sha256=WCk2BztM4tSKWp0EGoBFK4Rge330_7LPXNe2kmk-9f0,10046
20
+ gemmi_protools/utils/pdb_annot.py,sha256=nnRlLpjczhCP1ojEgsO3FuVgfsyleDZ34QxqyI8-wr0,11143
15
21
  gemmi_protools/utils/ppi.py,sha256=VWYsdxWwQoS1xwEYj5KB96Zz3F8r5Eyuw6NT3ReD-wc,2330
16
- gemmi_protools-0.1.13.dist-info/licenses/LICENSE,sha256=JuQvKcgj6n11y5y6nXr9rABv3gJSswc4eTCd5WZBtSY,1062
17
- gemmi_protools-0.1.13.dist-info/METADATA,sha256=pfbi9HsUmoRy6ABRoqTu5IrmaAs2ACc81On-37f1VuI,568
18
- gemmi_protools-0.1.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
19
- gemmi_protools-0.1.13.dist-info/top_level.txt,sha256=P12mYJi5O5EKIn5u-RFaWxuix431CgLacSRD7rBid_U,15
20
- gemmi_protools-0.1.13.dist-info/RECORD,,
22
+ gemmi_protools-0.1.15.dist-info/licenses/LICENSE,sha256=JuQvKcgj6n11y5y6nXr9rABv3gJSswc4eTCd5WZBtSY,1062
23
+ gemmi_protools-0.1.15.dist-info/METADATA,sha256=Zw8KLBPa2Q9qb2P2DOS03gSpDtA1ZZQTow6HA8cbU-s,750
24
+ gemmi_protools-0.1.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
25
+ gemmi_protools-0.1.15.dist-info/top_level.txt,sha256=P12mYJi5O5EKIn5u-RFaWxuix431CgLacSRD7rBid_U,15
26
+ gemmi_protools-0.1.15.dist-info/RECORD,,