gemmi-protools 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gemmi_protools/__init__.py +5 -0
- gemmi_protools/data/CD1/CD21029_review.fasta +141 -0
- gemmi_protools/data/MHC/MHC_combined.hmm +1092 -0
- gemmi_protools/data/MHC/MHC_combined.hmm.h3f +0 -0
- gemmi_protools/data/MHC/MHC_combined.hmm.h3i +0 -0
- gemmi_protools/data/MHC/MHC_combined.hmm.h3m +0 -0
- gemmi_protools/data/MHC/MHC_combined.hmm.h3p +0 -0
- gemmi_protools/io/__init__.py +3 -0
- gemmi_protools/io/convert.py +93 -0
- gemmi_protools/io/reader.py +911 -0
- gemmi_protools/tools/__init__.py +3 -0
- gemmi_protools/tools/align.py +336 -0
- gemmi_protools/tools/dockq.py +128 -0
- gemmi_protools/tools/mesh.py +197 -0
- gemmi_protools/tools/pdb_annot.py +862 -0
- gemmi_protools-1.1.0.dist-info/METADATA +46 -0
- gemmi_protools-1.1.0.dist-info/RECORD +20 -0
- gemmi_protools-1.1.0.dist-info/WHEEL +5 -0
- gemmi_protools-1.1.0.dist-info/licenses/LICENSE +21 -0
- gemmi_protools-1.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@Author: Luo Jiejian
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
import shutil
|
|
8
|
+
import string
|
|
9
|
+
import subprocess
|
|
10
|
+
import tempfile
|
|
11
|
+
from typing import Literal, Optional, Dict, Any, List
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
from Bio.Align import PairwiseAligner, substitution_matrices
|
|
15
|
+
from Bio.PDB import Superimposer
|
|
16
|
+
from gemmi_protools import StructureParser
|
|
17
|
+
from gemmi_protools.io.convert import gemmi2bio, bio2gemmi
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def check_sequence(seq: str):
    """
    Normalize a raw sequence string and validate its alphabet.

    The input is upper-cased, then every whitespace character (spaces, tabs,
    line breaks), every star (``*``) and every gap (``-``) is removed,
    wherever it occurs in the string.

    :param seq: str
        raw sequence, e.g. the (possibly line-wrapped) body of a FASTA record
    :return: str
        the cleaned sequence, made only of the letters A-Z
    :raises ValueError: if nothing is left after cleaning, or if a character
        other than A-Z remains
    """
    # The whitespace class (instead of a literal space) also drops tabs and
    # line breaks inside the sequence, so wrapped sequences are accepted.
    seq_clean = re.sub(pattern=r"[\s*-]", repl="", string=seq.upper())
    if not seq_clean:
        raise ValueError("Sequence is empty")

    # Whatever survives the removal of A-Z is, by definition, invalid.
    invalid = re.sub(pattern=r"[A-Z]", repl="", string=seq_clean)
    if invalid:
        raise ValueError("Sequence has Non-alphabetic characters: %s" % str(set(invalid)))

    return seq_clean
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def align_sequences(seq1: str,
                    seq2: str,
                    seq_type: Literal["dna", "rna", "protein"] = "protein",
                    mode: Literal["global", "local"] = "local",
                    substitution_matrix: Optional[str] = None,
                    open_gap_score: Optional[float] = None,
                    extend_gap_score: Optional[float] = None,
                    ):
    """
    Pairwise-align two sequences and map every position of seq1 onto seq2.

    Both sequences are first cleaned with ``check_sequence``. The first
    alignment returned by ``PairwiseAligner`` is used.

    :param seq1: str
    :param seq2: str
    :param seq_type: "dna", "rna" or "protein"
        selects the default substitution matrix and gap scores
    :param mode: "global" or "local"
    :param substitution_matrix: str, None for the default of seq_type
        dna/rna: NUC.4.4; protein: BLOSUM45/50/62/80/90, PAM30/70/250
    :param open_gap_score: float, None for the default of seq_type
    :param extend_gap_score: float, None for the default of seq_type
    :return: dict with keys
        seq1, seq2: the cleaned input sequences
        aligned_seq1, aligned_seq2: the two rows of the alignment
        alignment_length: length of the alignment
        aligned_aa_mapper: dict from each 1-based position of seq1 to a label;
            "<n>" when it is paired with position n of seq2,
            "<n><letter>" when it is an insertion after position n of seq2
            (letters run A-Z then a-z within one insertion),
            "E<position>" when it lies outside the aligned region
        identity: identities / alignment length, rounded to 3 digits
        coverage_1, coverage_2: (alignment length - gaps) / sequence length,
            rounded to 3 digits
    :raises ValueError: if the substitution matrix is not available for seq_type
    :raises IndexError: if a single insertion in seq1 is longer than the
        52 available insertion letters
    """
    default_params = {
        "dna": {
            "matrix": "NUC.4.4",
            "open_gap_score": -10.0,
            "extend_gap_score": -0.5,
        },
        "rna": {
            "matrix": "NUC.4.4",
            "open_gap_score": -10.0,
            "extend_gap_score": -0.5,
        },
        "protein": {
            "matrix": "BLOSUM62",
            "open_gap_score": -11.0,
            "extend_gap_score": -1.0,
        },
    }

    available_matrices = {
        "dna": ["NUC.4.4"],
        "rna": ["NUC.4.4"],
        "protein": ["BLOSUM45", "BLOSUM50", "BLOSUM62",
                    "BLOSUM80", "BLOSUM90",
                    "PAM30", "PAM70", "PAM250"]
    }

    seq1 = check_sequence(seq1)
    seq2 = check_sequence(seq2)

    params = default_params[seq_type].copy()

    if substitution_matrix is not None:
        if substitution_matrix not in available_matrices[seq_type]:
            raise ValueError("substitution matrix `%s` not support for %s" % (substitution_matrix, seq_type))
        params["matrix"] = substitution_matrix

    if open_gap_score is not None:
        params["open_gap_score"] = open_gap_score
    if extend_gap_score is not None:
        params["extend_gap_score"] = extend_gap_score

    # Finish parameters checking and setting
    aligner = PairwiseAligner()
    aligner.mode = mode
    aligner.substitution_matrix = substitution_matrices.load(params["matrix"])
    aligner.open_gap_score = params["open_gap_score"]
    aligner.extend_gap_score = params["extend_gap_score"]

    best_alignment = aligner.align(seq1, seq2)[0]

    aligned_seq1, aligned_seq2 = best_alignment

    # Walk the alignment columns. i / j count the residues of seq1 / seq2
    # consumed so far (1-based, relative to the start of the aligned region);
    # k counts the residues of the insertion currently being traversed.
    ins_letters = string.ascii_uppercase + string.ascii_lowercase
    aa_mapper = dict()
    i = 0
    j = 0
    k = 0

    for aa1, aa2 in zip(aligned_seq1, aligned_seq2):
        if aa1 != "-":
            i += 1
        if aa2 != "-":
            j += 1
            # a residue of seq2 ends any running insertion
            k = 0

        if aa1 == "-":
            continue

        if aa2 != "-":
            aa_mapper[i] = (j, "")
        else:
            # insertion of seq1: labelled with the last seq2 position plus a letter
            if k >= len(ins_letters):
                raise IndexError("insertion in seq1 after position %d of seq2 is longer than "
                                 "%d residues, no insertion letter left" % (j, len(ins_letters)))
            aa_mapper[i] = (j, ins_letters[k])
            k += 1

    # shift from aligned-region indices to full-sequence indices
    start_1, start_2 = best_alignment.coordinates[:, 0]
    _mapper = {pos + start_1: "%d%s" % (v[0] + start_2, v[1]) for pos, v in aa_mapper.items()}

    # head and tail of seq1 outside the aligned region get an E prefix
    out_mapper = dict()
    for pos in range(1, len(seq1) + 1):
        out_mapper[pos] = _mapper[pos] if pos in _mapper else "E%d" % pos

    counts = best_alignment.counts()
    ident = counts.identities / best_alignment.length
    n_aligned = best_alignment.length - counts.gaps

    coverage_1 = n_aligned / len(seq1)
    coverage_2 = n_aligned / len(seq2)

    return dict(seq1=seq1,
                seq2=seq2,
                aligned_seq1=aligned_seq1,
                aligned_seq2=aligned_seq2,
                alignment_length=best_alignment.length,
                aligned_aa_mapper=out_mapper,
                identity=round(ident, 3),
                coverage_1=round(coverage_1, 3),
                coverage_2=round(coverage_2, 3),
                )
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class StructureAligner(object):
    """
    Align a query structure onto a reference structure with the external
    MM-align program and keep the parsed results.

    After a successful ``make_alignment``:

    * ``values`` holds the fields parsed from the program's terminal output,
    * ``rot_mat`` holds the parsed transform as ``dict(R=..., T=...)``,
    * ``is_aligned`` is True,
    * ``by_query`` / ``by_ref`` hold the chain IDs used on each side.
    """

    def __init__(self, query_path: str, ref_path: str):
        """
        :param query_path: str
            path of the structure that is transformed by save_aligned_query
        :param ref_path: str
            path of the structure used as reference
        """
        self._query_st = StructureParser()
        self._query_st.load_from_file(query_path)

        self._ref_st = StructureParser()
        self._ref_st.load_from_file(ref_path)

        self.values = dict()
        self.rot_mat = None
        self.is_aligned = False
        self.by_query = None
        self.by_ref = None
        self.query_path = query_path
        self.ref_path = ref_path

    @property
    def __mmalign_path(self):
        """
        Path of the MM-align executable found on PATH
        (looked up as ``MMAlign``, then ``MMalign``).

        :raises RuntimeError: if neither name is found
        """
        _path = shutil.which("MMAlign") or shutil.which("MMalign")
        if _path is None:
            raise RuntimeError("Executable program MMAlign is not found. "
                               "Download from https://zhanggroup.org/MM-align/ ."
                               "Build it and add MMAlign to environment PATH")
        else:
            return _path

    @staticmethod
    def __parser_rotation_matrix(matrix_file: str):
        """
        Read the transform from the matrix file written by the ``-m`` option.

        Lines 3 to 5 of the file must each hold five whitespace separated
        columns: a row index, one translation component and three rotation
        components.

        :param matrix_file: str
        :return: dict(R=float32 array of shape (3, 3), T=float32 array of shape (3,))
        """
        rotation_matrix = []
        translation_vector = []

        with open(matrix_file, 'r') as file:
            lines = file.readlines()

        for cur_line in lines[2:5]:
            tmp = re.split(pattern=r"\s+", string=cur_line.strip())
            assert len(tmp) == 5
            rotation_matrix.append(tmp[2:])
            translation_vector.append(tmp[1])
        return dict(R=np.array(rotation_matrix).astype(np.float32),
                    T=np.array(translation_vector).astype(np.float32))

    @staticmethod
    def __parse_terminal_outputs(output_string: str) -> Dict[str, Any]:
        """
        Extract chain IDs, lengths, scores and aligned sequences from the
        text printed by the alignment program.

        Each field is taken from the first line that matches its pattern;
        fields that never match are simply absent from the result.

        :param output_string: str
        :return: dict, possible keys are query_chain_ids, ref_chain_ids (list of str),
            query_total_length, ref_total_length, aligned_length (int),
            rmsd, tmscore_by_query, tmscore_by_ref (float),
            query_sequences, ref_sequences (list of str, one item per chain)
        """
        lines = output_string.split("\n")

        patterns = dict(query_chain_ids=r"Structure_1.+\.pdb:([\w:]+)",
                        ref_chain_ids=r"Structure_2.+\.pdb:([\w:]+)",
                        query_total_length=r"Length of Structure_1.*?(\d+).*residues",
                        ref_total_length=r"Length of Structure_2.*?(\d+).*residues",
                        aligned_length=r"Aligned length=.*?(\d+)",
                        rmsd=r"RMSD=.*?([\d.]+)",
                        tmscore_by_query=r"TM-score=.*?([\d.]+).+Structure_1",
                        tmscore_by_ref=r"TM-score=.*?([\d.]+).+Structure_2",
                        aligned_seq_start=r"denotes other aligned residues",
                        )
        chain_keys = ('query_chain_ids', 'ref_chain_ids')
        int_keys = ('query_total_length', 'ref_total_length', 'aligned_length')
        float_keys = ('rmsd', 'tmscore_by_query', 'tmscore_by_ref')

        values = dict()
        for idx, line in enumerate(lines):
            # iterate over a snapshot of the keys: a matched pattern is removed
            # below so that only its first hit is used
            for key in list(patterns):
                tmp = re.search(patterns[key], line)
                if not tmp:
                    continue

                if key in chain_keys:
                    values[key] = tmp.group(1).split(":")
                elif key in int_keys:
                    values[key] = int(tmp.group(1))
                elif key in float_keys:
                    values[key] = float(tmp.group(1))
                else:
                    # aligned_seq_start: the aligned sequences 1 and 2 are on
                    # lines idx + 1 and idx + 3; chains are terminated by "*"
                    seq_1 = lines[idx + 1]
                    seq_2 = lines[idx + 3]

                    sp1 = seq_1.split("*")
                    sp2 = seq_2.split("*")
                    values["query_sequences"] = sp1[:-1] if "*" in seq_1 else sp1
                    values["ref_sequences"] = sp2[:-1] if "*" in seq_2 else sp2
                del patterns[key]
        return values

    def make_alignment(self, query_chains: Optional[List[str]] = None,
                       ref_chains: Optional[List[str]] = None, timeout=300.0):
        """
        Run the alignment program on the selected chains and store the results.

        On failure of the external program the error is printed and the
        attributes of the instance are left untouched.

        :param query_chains: list, None for all chains
        :param ref_chains: list, None for all chains
        :param timeout: default 300, seconds given to the external program
        :return: None
        :raises RuntimeError: if the MM-align executable is not found
        """

        program_path = self.__mmalign_path

        if isinstance(query_chains, list):
            q_st = self._query_st.pick_chains(query_chains)
        else:
            q_st = self._query_st

        if isinstance(ref_chains, list):
            r_st = self._ref_st.pick_chains(ref_chains)
        else:
            r_st = self._ref_st

        # NOTE(review): make_one_letter_chain presumably renames the chains of
        # the object it is called on; without a chain selection that object is
        # the parser loaded in __init__ - confirm this is intended.
        q_ch_mapper = q_st.make_one_letter_chain()
        r_ch_mapper = r_st.make_one_letter_chain()

        # reversed mappers translate the chain IDs reported by the program
        # back to the IDs they were mapped from
        q_ch_mapper_r = {v: k for k, v in q_ch_mapper.items()}
        r_ch_mapper_r = {v: k for k, v in r_ch_mapper.items()}

        with tempfile.TemporaryDirectory() as tmp_dir:
            _tmp_a = os.path.join(tmp_dir, "a.pdb")
            q_st.to_pdb(_tmp_a)

            _tmp_b = os.path.join(tmp_dir, "b.pdb")
            r_st.to_pdb(_tmp_b)

            matrix_file = os.path.join(tmp_dir, "m.txt")
            # argument list without a shell: paths containing spaces or shell
            # metacharacters are passed to the program unchanged
            _command = [program_path, _tmp_a, _tmp_b, "-m", matrix_file]

            try:
                result = subprocess.run(_command, check=True,
                                        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                        timeout=timeout)
            except Exception as e:
                print("%s: between files %s and %s; between chains: %s and %s" % (
                    str(e), self.query_path, self.ref_path,
                    str(q_st.chain_ids), str(r_st.chain_ids))
                      )
            else:
                self.values = self.__parse_terminal_outputs(result.stdout.decode())
                self.rot_mat = self.__parser_rotation_matrix(matrix_file)
                self.is_aligned = True
                self.by_query = q_st.chain_ids if query_chains is None else query_chains
                self.by_ref = r_st.chain_ids if ref_chains is None else ref_chains
                self.values["query_chain_ids"] = [q_ch_mapper_r.get(ch, ch) for ch in self.values["query_chain_ids"]]
                self.values["ref_chain_ids"] = [r_ch_mapper_r.get(ch, ch) for ch in self.values["ref_chain_ids"]]

    def save_aligned_query(self, out_file: str):
        """
        Apply the stored transform to the query structure and write it out.

        :param out_file: str
            path of the .cif file to write
        :return: None
        :raises RuntimeError: if make_alignment has not succeeded yet
        """
        if not self.is_aligned:
            raise RuntimeError("structure not aligned, run make_alignment first")

        super_imposer = Superimposer()
        # R is passed transposed - presumably because Superimposer multiplies
        # coordinates from the right; verify against Bio.PDB before changing
        super_imposer.rotran = (self.rot_mat["R"].T, self.rot_mat["T"])

        bio_s = gemmi2bio(self._query_st.STRUCT)
        super_imposer.apply(bio_s)
        query_st_aligned = bio2gemmi(bio_s)

        block = query_st_aligned.make_mmcif_block()
        block.write_file(out_file)
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@Author: Luo Jiejian
|
|
3
|
+
"""
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import shutil
|
|
7
|
+
import subprocess
|
|
8
|
+
import tempfile
|
|
9
|
+
from copy import deepcopy
|
|
10
|
+
from typing import List, Tuple
|
|
11
|
+
|
|
12
|
+
import gemmi
|
|
13
|
+
import pandas as pd
|
|
14
|
+
|
|
15
|
+
from gemmi_protools.io.reader import StructureParser
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def dockq_score_interface(query_model: str,
                          native_model: str,
                          partner_1_mapping: List[Tuple[str, str]],
                          partner_2_mapping: List[Tuple[str, str]],
                          ):
    """
    Calculate Dockq Score for an interface (partner 1 vs partner 2)

    The chains of each partner are merged into a single chain (partner 1 as
    chain A, partner 2 as chain B, residues renumbered from 1 without
    insertion codes) in both models, and the external DockQ program is run
    on the two merged models with the mapping AB:AB.

    :param query_model: str
        path of query model, support .pdb, .pdb.gz, .cif, .cif.gz
    :param native_model: str
        path of native model
    :param partner_1_mapping: a list of chain ID mapping between query and native for partner1 of the interface
        e.g. [(q chain1, n chain1), (q chain2, n chain2)]
    :param partner_2_mapping: same as partner_1_mapping, for partner2
    :return: dict(score=..., status=...)
        score is the DockQ value formatted with 4 decimals, or "" on failure;
        status is "Finished" or the error message
    :raises RuntimeError: if the DockQ executable is not found on PATH
    :raises ValueError: if a chain of the mappings is missing in its model
    """
    dockq_program = shutil.which("DockQ")
    if dockq_program is None:
        raise RuntimeError("DockQ is need")

    assert len(partner_1_mapping) > 0, "partner_1_mapping must be a list of chain ID tuples, can't be empty"
    assert len(partner_2_mapping) > 0, "partner_2_mapping must be a list of chain ID tuples, can't be empty"

    def merge_chains(st, chain_ids: List[str], new_name: str):
        # Concatenate the residues of chain_ids into one chain called new_name,
        # renumbered consecutively from 1 and without insertion codes.
        merged = gemmi.Chain(new_name)
        idx = 1
        for ch in chain_ids:
            for res in st.get_chain(ch):
                nr = deepcopy(res)
                nr.seqid.icode = " "
                nr.seqid.num = idx
                merged.add_residue(nr)
                idx += 1
        return merged

    def load_struct(path: str, partner_1: List[str], partner_2: List[str]):
        # Load a model and reduce it to two chains: A (partner 1) and B (partner 2).
        st = StructureParser()
        st.load_from_file(path)
        st.clean_structure()

        for ch in partner_1 + partner_2:
            if ch not in st.chain_ids:
                raise ValueError("Chain %s not found for %s (only [%s])" % (ch, path, " ".join(st.chain_ids)))

        model = gemmi.Model(1)
        model.add_chain(merge_chains(st, partner_1, "A"))
        model.add_chain(merge_chains(st, partner_2, "B"))

        struct = gemmi.Structure()
        struct.add_model(model)

        return StructureParser(struct)

    partner_1_query, partner_1_native = list(zip(*partner_1_mapping))
    partner_2_query, partner_2_native = list(zip(*partner_2_mapping))

    q_st = load_struct(query_model, list(partner_1_query), list(partner_2_query))
    n_st = load_struct(native_model, list(partner_1_native), list(partner_2_native))

    with tempfile.TemporaryDirectory() as tmp_dir:
        result_file = os.path.join(tmp_dir, "result.json")
        q_file = os.path.join(tmp_dir, "q.pdb")
        n_file = os.path.join(tmp_dir, "n.pdb")
        q_st.to_pdb(q_file, write_minimal_pdb=True)
        n_st.to_pdb(n_file, write_minimal_pdb=True)

        mapping = "AB:AB"

        # argument list without a shell: paths containing spaces or shell
        # metacharacters are passed to the program unchanged
        _command = [dockq_program, "--mapping", mapping, "--json", result_file, q_file, n_file]
        metrics = ['DockQ', 'F1', 'chain1', 'chain2']

        try:
            _ = subprocess.run(_command, check=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               timeout=300.0)
        except subprocess.CalledProcessError as e:
            # Handle errors in the called executable
            msg = e.stderr.decode()
            outputs = pd.DataFrame(columns=metrics)
        except Exception as e:
            # Handle other exceptions such as file not found or permissions issues
            msg = str(e)
            outputs = pd.DataFrame(columns=metrics)
        else:
            with open(result_file, "r") as fin:
                vals = json.load(fin)
            msg = "Finished"
            result = list(vals["best_result"].values())
            outputs = pd.DataFrame(result)[metrics]

    if len(outputs) > 0:
        score = "%.4f" % outputs.iloc[0]["DockQ"]
    else:
        score = ""

    return dict(score=score, status=msg)
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@Author: Luo Jiejian
|
|
3
|
+
"""
|
|
4
|
+
import os
|
|
5
|
+
import subprocess
|
|
6
|
+
import tempfile
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
from typing import List, Optional, Union
|
|
9
|
+
|
|
10
|
+
import freesasa
|
|
11
|
+
import numpy as np
|
|
12
|
+
import trimesh
|
|
13
|
+
from Bio.PDB import Selection
|
|
14
|
+
from Bio.PDB.ResidueDepth import _get_atom_radius, _read_vertex_array
|
|
15
|
+
|
|
16
|
+
from gemmi_protools import StructureParser
|
|
17
|
+
from gemmi_protools import gemmi2bio
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _read_face_array(filename: str):
|
|
21
|
+
with open(filename) as fp:
|
|
22
|
+
face_list = []
|
|
23
|
+
for line in fp:
|
|
24
|
+
sl = line.split()
|
|
25
|
+
if len(sl) != 5:
|
|
26
|
+
# skip header
|
|
27
|
+
continue
|
|
28
|
+
vl = [int(x) for x in sl[0:3]]
|
|
29
|
+
face_list.append(vl)
|
|
30
|
+
return np.array(face_list)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_mesh(struct_file: str, chains: Optional[List[str]] = None, MSMS: str = "msms"):
    """
    Build a surface mesh of a structure with the external MSMS program.

    The structure is cleaned (ligands removed), optionally reduced to the
    given chains, written as an x/y/z/radius table and passed to MSMS with a
    probe radius of 1.5. The resulting vertices and faces are loaded into a
    trimesh mesh, which is then de-duplicated.

    Any error raised while preparing the structure or running MSMS is
    printed and None is returned.

    :param struct_file: str
        .pdb, .cif, .pdb.gz, .cif.gz
    :param chains: a list of chain names
        default None to include all chains
    :param MSMS: str
        path of msms executable, https://ccsb.scripps.edu/msms/downloads/
        (an existing file path is used as is; any other value is split like a
        command line, so extra options may follow the program name)
    :return: trimesh.Trimesh, or None on failure
    """
    import shlex  # local import: only needed to tokenize the MSMS command

    # All intermediate files live in one directory that is removed on exit,
    # whatever happens; no file handle is kept open.
    with tempfile.TemporaryDirectory() as tmp_dir:
        xyz_file = os.path.join(tmp_dir, "atoms.xyzr")
        surface_base = os.path.join(tmp_dir, "surface")
        # MSMS appends these suffixes to the name given with -of
        face_file = surface_base + ".face"
        surface_file = surface_base + ".vert"

        try:
            st = StructureParser()
            st.load_from_file(struct_file)
            st.clean_structure(remove_ligand=True)

            if chains is None:
                st_p = st
            else:
                for ch in chains:
                    if ch not in st.chain_ids:
                        raise ValueError("Chain %s not found (only [%s])" % (ch, " ".join(st.chain_ids)))
                st_p = st.pick_chains(chains)

            bio_st = gemmi2bio(st_p.STRUCT)
            model = bio_st[0]

            # Replace pdb_to_xyzr
            # Make x,y,z,radius file
            atom_list = Selection.unfold_entities(model, "A")

            with open(xyz_file, "w") as pdb_to_xyzr:
                for atom in atom_list:
                    x, y, z = atom.coord
                    radius = _get_atom_radius(atom, rtype="united")
                    pdb_to_xyzr.write(f"{x:6.3f}\t{y:6.3f}\t{z:6.3f}\t{radius:1.2f}\n")

            # Make surface; argument list without a shell, program output discarded
            msms_argv = [MSMS] if os.path.isfile(MSMS) else shlex.split(MSMS)
            make_surface = msms_argv + ["-no_header", "-probe_radius", "1.5",
                                        "-if", xyz_file, "-of", surface_base]
            subprocess.call(make_surface, stdout=subprocess.DEVNULL)
            if not os.path.isfile(surface_file):
                raise RuntimeError(
                    "Failed to generate surface file using command:\n%s" % " ".join(make_surface)
                )

        except Exception as e:
            print(str(e))
            return None

        # Read surface vertices and faces before the directory is removed
        vertices = _read_vertex_array(surface_file)
        faces = _read_face_array(face_file)

    # faces - 1: the face file counts vertices from 1, the mesh from 0
    mesh = trimesh.Trimesh(vertices=vertices, faces=faces - 1)
    mesh.merge_vertices()
    mesh.update_faces(mesh.unique_faces())
    mesh.update_faces(mesh.nondegenerate_faces())
    mesh.remove_unreferenced_vertices()

    return mesh
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def get_surface_residues(struct_file: str,
                         chains: Optional[List[str]] = None,
                         relative_sasa_cutoff: Union[int, float] = 0.15):
    """
    List the residues exposed at the surface of a structure, using freesasa.

    A residue is reported when its relative total SASA is greater than
    relative_sasa_cutoff and at least one of its atoms has a SASA above 0.

    :param struct_file: str
        path of the structure file
    :param chains: a list of chain names
        default None to include all chains
    :param relative_sasa_cutoff: int or float, default 0.15
    :return: structured np.ndarray, one record per surface residue, with fields
        chain_name: chain ID as in the input structure
        residue_numi: residue number followed by the insertion code, if any
        residue_name: residue name
        sequential_residue_num: 1-based index of the residue in its chain
        centroid: mean position of the (up to) three atoms with the largest SASA
        relative_sasa: relative total SASA of the residue
    :raises ValueError: if chains is an empty list or names a missing chain
    """
    ####################
    # check and pick
    ####################
    st = StructureParser()
    st.load_from_file(struct_file)
    st.clean_structure()

    if chains is None:
        chains = st.chain_ids

    if isinstance(chains, list):
        if len(chains) == 0:
            raise ValueError("chains is not set")
        # check if chains valid
        for ch in chains:
            if ch not in st.chain_ids:
                raise ValueError("Chain %s not found" % ch)

    st_p = st.pick_chains(chains)

    # (chain, number + insertion code, name) -> index in the chain, start from 1
    seq_num_mapper = dict()
    for chain in st_p.MODEL:
        for i, res in enumerate(chain):
            key = (chain.name, str(res.seqid.num) + res.seqid.icode.strip(), res.name)
            seq_num_mapper[key] = i + 1

    # make one upper letter chain ID; mapper_r translates the IDs used in the
    # written file back to the IDs they were mapped from
    mapper = st_p.make_one_letter_chain(only_uppercase=True)
    mapper_r = {v: k for k, v in mapper.items()}

    ####################
    # save to pdb
    ####################
    # A directory (not an open NamedTemporaryFile) so that the file can be
    # written and re-opened by name on every platform.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdb_file = os.path.join(tmp_dir, "structure.pdb")
        st_p.to_pdb(pdb_file)
        structure = freesasa.Structure(pdb_file)
        result = freesasa.calc(structure)

    residue_areas = result.residueAreas()

    surface_residues_relative_sasa = dict()
    surface_atoms = defaultdict(list)
    for atom_index in range(structure.nAtoms()):
        # residue_areas is looked up with the chain label of the written file;
        # only the reported residue ID uses the chain ID it was mapped from
        sasa_chain = structure.chainLabel(atom_index)
        ch = mapper_r.get(sasa_chain, sasa_chain)

        res_num = structure.residueNumber(atom_index).strip()
        res_name = structure.residueName(atom_index)
        atom_sasa = result.atomArea(atom_index)

        res_id = (ch, res_num, res_name)
        res_relative_total = residue_areas[sasa_chain][res_num].relativeTotal
        if res_relative_total > relative_sasa_cutoff:
            if res_id not in surface_residues_relative_sasa:
                surface_residues_relative_sasa[res_id] = res_relative_total
            if atom_sasa > 0:
                atom_name = structure.atomName(atom_index).strip()
                pos = structure.coord(atom_index)
                surface_atoms[res_id].append((atom_sasa, atom_name, pos))

    results = []
    for res_id, query_atoms in surface_atoms.items():
        seq_loc = seq_num_mapper[res_id]

        # largest SASA first; the centroid uses at most the first three atoms
        query_atoms.sort(reverse=True)
        centroid = tuple(np.array([a[2] for a in query_atoms[0:3]]).mean(axis=0).tolist())
        results.append((res_id[0],
                        res_id[1],
                        res_id[2],
                        seq_loc,
                        centroid,
                        surface_residues_relative_sasa[res_id]
                        )
                       )
    dtype = [("chain_name", "U5"),
             ("residue_numi", "U8"),
             ("residue_name", "U5"),
             ("sequential_residue_num", "i4"),
             ("centroid", ("f4", (3,))),
             ("relative_sasa", "f4"),
             ]
    return np.array(results, dtype=dtype)
|