gemmi-protools 0.1.17__py3-none-any.whl → 1.0.1__py3-none-any.whl
- gemmi_protools/__init__.py +1 -4
- gemmi_protools/io/convert.py +0 -3
- gemmi_protools/io/reader.py +752 -309
- gemmi_protools/{utils → tools}/align.py +38 -54
- gemmi_protools/tools/dockq.py +128 -0
- gemmi_protools/tools/mesh.py +197 -0
- gemmi_protools/{utils → tools}/pdb_annot.py +21 -105
- {gemmi_protools-0.1.17.dist-info → gemmi_protools-1.0.1.dist-info}/METADATA +20 -12
- gemmi_protools-1.0.1.dist-info/RECORD +19 -0
- gemmi_protools/io/cif_opts.py +0 -173
- gemmi_protools/io/parse_pdb_header.py +0 -387
- gemmi_protools/io/parser.py +0 -292
- gemmi_protools/io/pdb_opts.py +0 -179
- gemmi_protools/io/peptide.py +0 -32
- gemmi_protools/io/struct_info.py +0 -91
- gemmi_protools/utils/dockq.py +0 -139
- gemmi_protools/utils/fixer.py +0 -274
- gemmi_protools/utils/immune_complex.py +0 -787
- gemmi_protools/utils/ppi.py +0 -74
- gemmi_protools-0.1.17.dist-info/RECORD +0 -27
- /gemmi_protools/{utils → tools}/__init__.py +0 -0
- {gemmi_protools-0.1.17.dist-info → gemmi_protools-1.0.1.dist-info}/WHEEL +0 -0
- {gemmi_protools-0.1.17.dist-info → gemmi_protools-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {gemmi_protools-0.1.17.dist-info → gemmi_protools-1.0.1.dist-info}/top_level.txt +0 -0
gemmi_protools/{utils → tools}/align.py

@@ -2,33 +2,26 @@
 @Author: Luo Jiejian
 """
 import os
-import pathlib
 import re
 import shutil
 import subprocess
 import tempfile
-import uuid
-from copy import deepcopy
-from typing import Union, Dict, Any, List, Optional
+from typing import Dict, Any, List, Optional
 
 import numpy as np
 from Bio.PDB import Superimposer
-from typeguard import typechecked
 
 from gemmi_protools.io.convert import gemmi2bio, bio2gemmi
 from gemmi_protools.io.reader import StructureParser
 
 
 class StructureAligner(object):
-
-    def __init__(self, query_path: Union[str, pathlib.Path], ref_path: Union[str, pathlib.Path]):
+    def __init__(self, query_path: str, ref_path: str):
         self._query_st = StructureParser()
         self._query_st.load_from_file(query_path)
-        self._query_st.set_default_model()
 
         self._ref_st = StructureParser()
         self._ref_st.load_from_file(ref_path)
-        self._ref_st.set_default_model()
 
         self.values = dict()
         self.rot_mat = None
@@ -49,8 +42,7 @@ class StructureAligner(object):
         return _path
 
     @staticmethod
-
-    def __parser_rotation_matrix(matrix_file: Union[str, pathlib.Path]):
+    def __parser_rotation_matrix(matrix_file: str):
         rotation_matrix = []
         translation_vector = []
 
@@ -66,7 +58,6 @@ class StructureAligner(object):
                     T=np.array(translation_vector).astype(np.float32))
 
     @staticmethod
-    @typechecked
     def __parse_terminal_outputs(output_string: str) -> Dict[str, Any]:
         lines = re.split(pattern=r"\n", string=output_string)
         # chain mapping
@@ -108,7 +99,6 @@ class StructureAligner(object):
                 del patterns[key]
         return values
 
-    @typechecked
     def make_alignment(self, query_chains: Optional[List[str]] = None,
                        ref_chains: Optional[List[str]] = None, timeout=300.0):
         """
@@ -122,56 +112,50 @@ class StructureAligner(object):
         program_path = self.__mmalign_path
 
         # clone
-        q_st = deepcopy(self._query_st)
-        r_st = deepcopy(self._ref_st)
-
-        tmp_dir = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
-        os.makedirs(tmp_dir)
-
         if isinstance(query_chains, list):
-            q_st.pick_chains(query_chains)
+            q_st = self._query_st.pick_chains(query_chains)
+        else:
+            q_st = self._query_st
 
         if isinstance(ref_chains, list):
-            r_st.pick_chains(
+            r_st = self._ref_st.pick_chains(query_chains)
+        else:
+            r_st = self._ref_st
 
-        q_ch_mapper = q_st.
-        r_ch_mapper = r_st.
+        q_ch_mapper = q_st.make_one_letter_chain()
+        r_ch_mapper = r_st.make_one_letter_chain()
 
         q_ch_mapper_r = {v: k for k, v in q_ch_mapper.items()}
         r_ch_mapper_r = {v: k for k, v in r_ch_mapper.items()}
 
-
-
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            _tmp_a = os.path.join(tmp_dir, "a.pdb")
+            q_st.to_pdb(_tmp_a)
+
+            _tmp_b = os.path.join(tmp_dir, "b.pdb")
+            r_st.to_pdb(_tmp_b)
+
+            matrix_file = os.path.join(tmp_dir, "m.txt")
+            _command = "%s %s %s -m %s" % (program_path, _tmp_a, _tmp_b, matrix_file)
+
+            try:
+                result = subprocess.run(_command, shell=True, check=True,
+                                        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                                        timeout=timeout)
+            except Exception as e:
+                print("%s: between files %s and %s; between chains: %s and %s" % (
+                    str(e), self.query_path, self.ref_path,
+                    str(q_st.chain_ids), str(r_st.chain_ids))
+                )
+            else:
+                self.values = self.__parse_terminal_outputs(result.stdout.decode())
+                self.rot_mat = self.__parser_rotation_matrix(matrix_file)
+                self.is_aligned = True
+                self.by_query = q_st.chain_ids if query_chains is None else query_chains
+                self.by_ref = r_st.chain_ids if ref_chains is None else ref_chains
+                self.values["query_chain_ids"] = [q_ch_mapper_r.get(ch, ch) for ch in self.values["query_chain_ids"]]
+                self.values["ref_chain_ids"] = [r_ch_mapper_r.get(ch, ch) for ch in self.values["ref_chain_ids"]]
 
-        _tmp_b = os.path.join(tmp_dir, "b.pdb")
-        r_st.to_pdb(_tmp_b)
-
-        matrix_file = os.path.join(tmp_dir, "m.txt")
-        _command = "%s %s %s -m %s" % (program_path, _tmp_a, _tmp_b, matrix_file)
-
-        try:
-            result = subprocess.run(_command, shell=True, check=True,
-                                    stdout=subprocess.PIPE, stderr=subprocess.PIPE,
-                                    timeout=timeout)
-        except Exception as e:
-            print("%s: between files %s and %s; between chains: %s and %s" % (
-                str(e), self.query_path, self.ref_path,
-                str(q_st.chain_ids), str(r_st.chain_ids))
-            )
-        else:
-            self.values = self.__parse_terminal_outputs(result.stdout.decode())
-            self.rot_mat = self.__parser_rotation_matrix(matrix_file)
-            self.is_aligned = True
-            self.by_query = q_st.chain_ids if query_chains is None else query_chains
-            self.by_ref = r_st.chain_ids if ref_chains is None else ref_chains
-            self.values["query_chain_ids"] = [q_ch_mapper_r.get(ch, ch) for ch in self.values["query_chain_ids"]]
-            self.values["ref_chain_ids"] = [r_ch_mapper_r.get(ch, ch) for ch in self.values["ref_chain_ids"]]
-
-        finally:
-            if os.path.isdir(tmp_dir):
-                shutil.rmtree(tmp_dir)
-
-    @typechecked
     def save_aligned_query(self, out_file: str):
         """
 
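
For orientation, a minimal usage sketch of the StructureAligner API as it stands after this change (plain str paths, optional chain subsets, results exposed on .values and .rot_mat). The file names and chain IDs below are placeholders, and the MM-align binary resolved internally by the class must be available:

from gemmi_protools.tools.align import StructureAligner

# query and reference structure files (placeholder paths); 1.0.1 takes str, not pathlib.Path
aligner = StructureAligner("query_model.pdb", "reference.cif")

# align only selected chains; both chain lists are optional and default to all chains
aligner.make_alignment(query_chains=["A", "B"], ref_chains=["H", "L"], timeout=120.0)

if getattr(aligner, "is_aligned", False):
    print(aligner.values["query_chain_ids"])   # chain IDs mapped back to their original names
    print(aligner.rot_mat)                     # rotation/translation parsed from the MM-align matrix file
    aligner.save_aligned_query("query_aligned.pdb")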
gemmi_protools/tools/dockq.py

@@ -0,0 +1,128 @@
+"""
+@Author: Luo Jiejian
+"""
+import json
+import os
+import shutil
+import subprocess
+import tempfile
+from copy import deepcopy
+from typing import List, Tuple
+
+import gemmi
+import pandas as pd
+
+from gemmi_protools.io.reader import StructureParser
+
+
+def dockq_score_interface(query_model: str,
+                          native_model: str,
+                          partner_1_mapping: List[Tuple[str, str]],
+                          partner_2_mapping: List[Tuple[str, str]],
+                          ):
+    """
+    Calculate Dockq Score for an interface (partner 1 vs partner 2)
+
+    :param query_model: str
+        path of query model, support .pdb, .pdb.gz, .cif, .cif.gz
+    :param native_model:
+    :param partner_1_mapping: a list of chain ID mapping between query and native for partner1 of the interface
+        e.g. [(q chain1, n chain1), (q chain2, n chain2)]
+    :param partner_2_mapping:
+    :return:
+    """
+    dockq_program = shutil.which("DockQ")
+    if dockq_program is None:
+        raise RuntimeError("DockQ is need")
+
+    assert len(partner_1_mapping) > 0, "partner_1_mapping must be a list of chain ID tuples, can't be empty"
+    assert len(partner_2_mapping) > 0, "partner_2_mapping must be a list of chain ID tuples, can't be empty"
+
+    def load_struct(path: str, partner_1: List[str], partner_2: List[str]):
+        st = StructureParser()
+        st.load_from_file(path)
+        st.clean_structure()
+
+        for ch in partner_1 + partner_2:
+            if ch not in st.chain_ids:
+                raise ValueError("Chain %s not found for %s (only [%s])" % (ch, path, " ".join(st.chain_ids)))
+
+        # merge chains in each each partner into on chain
+        # partner_1 with chain ID A
+        # partner_2 with chain ID B
+
+        chain_a = gemmi.Chain("A")
+        idx_a = 1
+        for ch in partner_1:
+            for res in st.get_chain(ch):
+                nr = deepcopy(res)
+                nr.seqid.icode = " "
+                nr.seqid.num = idx_a
+                chain_a.add_residue(nr)
+                idx_a += 1
+
+        chain_b = gemmi.Chain("B")
+        idx_b = 1
+        for ch in partner_2:
+            for res in st.get_chain(ch):
+                nr = deepcopy(res)
+                nr.seqid.icode = " "
+                nr.seqid.num = idx_b
+                chain_b.add_residue(nr)
+                idx_b += 1
+
+        model = gemmi.Model(1)
+        model.add_chain(chain_a)
+        model.add_chain(chain_b)
+
+        struct = gemmi.Structure()
+        struct.add_model(model)
+
+        output = StructureParser(struct)
+        return output
+
+    partner_1_query, partner_1_native = list(zip(*partner_1_mapping))
+    partner_2_query, partner_2_native = list(zip(*partner_2_mapping))
+
+    q_st = load_struct(query_model, list(partner_1_query), list(partner_2_query))
+    n_st = load_struct(native_model, list(partner_1_native), list(partner_2_native))
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        result_file = os.path.join(tmp_dir, "result.json")
+        q_file = os.path.join(tmp_dir, "q.pdb")
+        n_file = os.path.join(tmp_dir, "n.pdb")
+        q_st.to_pdb(q_file, write_minimal_pdb=True)
+        n_st.to_pdb(n_file, write_minimal_pdb=True)
+
+        mapping = "AB:AB"
+
+        _command = "%s --mapping %s --json %s %s %s" % (dockq_program, mapping, result_file, q_file, n_file)
+        metrics = ['DockQ', 'F1', 'chain1', 'chain2']
+
+        try:
+            _ = subprocess.run(_command, shell=True, check=True,
+                               stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                               timeout=300.0)
+        except subprocess.CalledProcessError as e:
+            # Handle errors in the called executable
+            msg = e.stderr.decode()
+            outputs = pd.DataFrame(columns=metrics)
+        except Exception as e:
+            # Handle other exceptions such as file not found or permissions issues
+            msg = str(e)
+            outputs = pd.DataFrame(columns=metrics)
+        else:
+            with open(result_file, "r") as fin:
+                vals = json.load(fin)
+            msg = "Finished"
+            result = []
+            for v in vals["best_result"].values():
+                result.append(v)
+            outputs = pd.DataFrame(result)[metrics]
+
+    if len(outputs) > 0:
+        score = "%.4f" % outputs.iloc[0]["DockQ"]
+    else:
+        score = ""
+
+    return dict(score=score, status=msg)
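
A hedged example of calling the new DockQ helper; the model paths and chain IDs are placeholders, and the DockQ command-line tool must be discoverable on PATH (the function locates it with shutil.which("DockQ")):

from gemmi_protools.tools.dockq import dockq_score_interface

# partner mappings are (query_chain, native_chain) tuples; each partner's chains are merged into one chain
out = dockq_score_interface(query_model="predicted.pdb",
                            native_model="native.cif",
                            partner_1_mapping=[("A", "H"), ("B", "L")],
                            partner_2_mapping=[("C", "A")])

print(out["score"])    # DockQ of the best mapping formatted to 4 decimals, or "" on failure
print(out["status"])   # "Finished" on success, otherwise the captured error message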
gemmi_protools/tools/mesh.py

@@ -0,0 +1,197 @@
+"""
+@Author: Luo Jiejian
+"""
+import os
+import subprocess
+import tempfile
+from collections import defaultdict
+from typing import List, Optional, Union
+
+import freesasa
+import numpy as np
+import trimesh
+from Bio.PDB import Selection
+from Bio.PDB.ResidueDepth import _get_atom_radius, _read_vertex_array
+
+from gemmi_protools import StructureParser
+from gemmi_protools import gemmi2bio
+
+
+def _read_face_array(filename: str):
+    with open(filename) as fp:
+        face_list = []
+        for line in fp:
+            sl = line.split()
+            if len(sl) != 5:
+                # skip header
+                continue
+            vl = [int(x) for x in sl[0:3]]
+            face_list.append(vl)
+    return np.array(face_list)
+
+
+def get_mesh(struct_file: str, chains: Optional[List[str]] = None, MSMS: str = "msms"):
+    """
+
+    :param struct_file: str
+        .pdb, .cif, .pdb.gz, .cif.gz
+    :param chains: a list of chain names
+        default None to include all chains
+    :param MSMS: str
+        path of msms executable
+    :return:
+        https://ccsb.scripps.edu/msms/downloads/
+    """
+    xyz_tmp = tempfile.NamedTemporaryFile(delete=False).name
+    surface_tmp = tempfile.NamedTemporaryFile(delete=False).name
+    msms_tmp = tempfile.NamedTemporaryFile(delete=False).name
+    face_file = surface_tmp + ".face"
+    surface_file = surface_tmp + ".vert"
+
+    try:
+        st = StructureParser()
+        st.load_from_file(struct_file)
+        st.clean_structure(remove_ligand=True)
+
+        if chains is None:
+            st_p = st
+        else:
+            for ch in chains:
+                if ch not in st.chain_ids:
+                    raise ValueError("Chain %s not found (only [%s])" % (ch, " ".join(st.chain_ids)))
+            st_p = st.pick_chains(chains)
+
+        bio_st = gemmi2bio(st_p.STRUCT)
+        model = bio_st[0]
+
+        # Replace pdb_to_xyzr
+        # Make x,y,z,radius file
+        atom_list = Selection.unfold_entities(model, "A")
+
+        with open(xyz_tmp, "w") as pdb_to_xyzr:
+            for atom in atom_list:
+                x, y, z = atom.coord
+                radius = _get_atom_radius(atom, rtype="united")
+                pdb_to_xyzr.write(f"{x:6.3f}\t{y:6.3f}\t{z:6.3f}\t{radius:1.2f}\n")
+
+        # Make surface
+        MSMS = MSMS + " -no_header -probe_radius 1.5 -if %s -of %s > " + msms_tmp
+        make_surface = MSMS % (xyz_tmp, surface_tmp)
+        subprocess.call(make_surface, shell=True)
+        if not os.path.isfile(surface_file):
+            raise RuntimeError(
+                f"Failed to generate surface file using command:\n{make_surface}"
+            )
+
+    except Exception as e:
+        print(str(e))
+        mesh = None
+    else:
+        # Read surface vertices from vertex file
+        vertices = _read_vertex_array(surface_file)
+        faces = _read_face_array(face_file)
+        mesh = trimesh.Trimesh(vertices=vertices, faces=faces - 1)
+        mesh.merge_vertices()
+        mesh.update_faces(mesh.unique_faces())
+        mesh.update_faces(mesh.nondegenerate_faces())
+        mesh.remove_unreferenced_vertices()
+
+    # Remove temporary files
+    for fn in [xyz_tmp, surface_tmp, msms_tmp, face_file, surface_file]:
+        try:
+            os.remove(fn)
+        except OSError:
+            pass
+
+    return mesh
+
+
+def get_surface_residues(struct_file: str,
+                         chains: Optional[List[str]] = None,
+                         relative_sasa_cutoff: Union[int, float] = 0.15):
+    ####################
+    # check and pick
+    ####################
+    st = StructureParser()
+    st.load_from_file(struct_file)
+    st.clean_structure()
+
+    if chains is None:
+        chains = st.chain_ids
+
+    if isinstance(chains, list):
+        if len(chains) == 0:
+            raise ValueError("chains is not set")
+        else:
+            # check if chains valid
+            for ch in chains:
+                if ch not in st.chain_ids:
+                    raise ValueError("Chain %s not found" % ch)
+
+    st_p = st.pick_chains(chains)
+    # sequences = {k: s.replace("-", "").upper() for k, s in st_p.polymer_sequences().items()}
+
+    # start from 1
+    seq_num_mapper = dict()
+    for chain in st_p.MODEL:
+        for i, res in enumerate(chain):
+            key = (chain.name, str(res.seqid.num) + res.seqid.icode.strip(), res.name)
+            seq_num_mapper[key] = i + 1
+
+    # make one upper letter chain ID
+    mapper = st_p.make_one_letter_chain(only_uppercase=True)
+    mapper_r = {v: k for k, v in mapper.items()}
+
+    ####################
+    # save to pdb
+    ####################
+    with tempfile.NamedTemporaryFile(delete=True, suffix=".pdb", mode='w') as tmp_file:
+        st_p.to_pdb(tmp_file.name)
+        structure = freesasa.Structure(tmp_file.name)
+
+    result = freesasa.calc(structure)
+
+    residue_areas = result.residueAreas()
+
+    surface_residues_relative_sasa = dict()
+    surface_atoms = defaultdict(list)
+    for atom_index in range(structure.nAtoms()):
+        ch = structure.chainLabel(atom_index)
+        ch = mapper_r.get(ch, ch)
+
+        res_num = structure.residueNumber(atom_index).strip()
+        res_name = structure.residueName(atom_index)
+        atom_sasa = result.atomArea(atom_index)
+
+        res_id = (ch, res_num, res_name)
+        res_relative_total = residue_areas[ch][res_num].relativeTotal
+        if res_relative_total > relative_sasa_cutoff:
+            if res_id not in surface_residues_relative_sasa:
+                surface_residues_relative_sasa[res_id] = res_relative_total
+            if atom_sasa > 0:
+                atom_name = structure.atomName(atom_index).strip()
+                pos = structure.coord(atom_index)
+                surface_atoms[res_id].append((atom_sasa, atom_name, pos))
+
+    results = []
+    for res_id, query_atoms in surface_atoms.items():
+        seq_loc = seq_num_mapper[res_id]
+
+        query_atoms.sort(reverse=True)
+        centroid = tuple(np.array([a[2] for a in query_atoms[0:3]]).mean(axis=0).tolist())
+        results.append((res_id[0],
+                        res_id[1],
+                        res_id[2],
+                        seq_loc,
+                        centroid,
+                        surface_residues_relative_sasa[res_id]
+                        )
+                       )
+    dtype = [("chain_name", "U5"),
+             ("residue_numi", "U8"),
+             ("residue_name", "U5"),
+             ("sequential_residue_num", "i4"),
+             ("centroid", ("f4", (3,))),
+             ("relative_sasa", "f4"),
+             ]
+    return np.array(results, dtype=dtype)
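
A hedged sketch of the two new surface utilities; the structure file is a placeholder, get_mesh needs the MSMS executable (https://ccsb.scripps.edu/msms/downloads/) on disk, and get_surface_residues relies on freesasa:

from gemmi_protools.tools.mesh import get_mesh, get_surface_residues

# molecular surface as a trimesh.Trimesh, or None if the MSMS call fails
mesh = get_mesh("example.cif.gz", chains=["A"], MSMS="/path/to/msms")
if mesh is not None:
    print(mesh.vertices.shape, mesh.faces.shape)

# structured array of solvent-exposed residues (relative SASA above the cutoff)
surf = get_surface_residues("example.cif.gz", chains=["A"], relative_sasa_cutoff=0.15)
print(surf.dtype.names)   # chain_name, residue_numi, residue_name, sequential_residue_num, centroid, relative_sasa
print(surf["chain_name"][:5], surf["relative_sasa"][:5])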
gemmi_protools/{utils → tools}/pdb_annot.py

@@ -2,25 +2,18 @@
 @Author: Luo Jiejian
 """
 import hashlib
-import itertools
 import os
 import re
 import shutil
 import subprocess
 import uuid
 from collections import defaultdict
-from dataclasses import asdict
 from importlib.resources import files
-from typing import List
 
-import numpy as np
 from anarci import run_anarci
 from anarci.germlines import all_germlines
-from joblib import Parallel, delayed
-from scipy.spatial import cKDTree
 
 from gemmi_protools import StructureParser
-from gemmi_protools.utils.ppi import _ppi_atoms
 
 
 def hash_sequence(seq: str) -> str:
@@ -207,125 +200,48 @@ def annotate_mhc(seq_dict: dict):
     return out
 
 
-def _interface_residues(struct: StructureParser,
-                        chains_x: List[str],
-                        chains_y: List[str],
-                        threshold: float = 4.5):
-    """
-    identify PPI among protein, DNA, RNA
-    :param struct: StructureParser
-    :param chains_x:
-    :param chains_y:
-    :param threshold:
-    :return:
-        PPI residues of chains_x, PPI residues of chains_y
-    """
-
-    x_coord, x_id = _ppi_atoms(struct, chains_x)
-    y_coord, y_id = _ppi_atoms(struct, chains_y)
-
-    kd_tree_x = cKDTree(x_coord)
-    kd_tree_y = cKDTree(y_coord)
-
-    pairs = kd_tree_x.sparse_distance_matrix(kd_tree_y, threshold, output_type='coo_matrix')
-
-    x_res = np.unique(x_id[pairs.row][["ch_name", 'res_num', 'res_icode', 'res_name']])
-    y_res = np.unique(y_id[pairs.col][["ch_name", 'res_num', 'res_icode', 'res_name']])
-
-    x_out = ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in x_res.tolist()]
-    y_out = ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in y_res.tolist()]
-    return x_out, y_out
-
-
-def polymer_interface_residues(struct: StructureParser,
-                               ppi_threshold: float = 4.5,
-                               n_cpus: int = 1,
-                               ):
-    """
-
-    Args:
-        struct:
-        ppi_threshold:
-
-    Returns:
-
-    """
-    chains = [ch for ch, ct in struct.chain_types.items() if ct in ["protein", "dna", "rna"]]
-    ch_pairs = list(itertools.combinations(chains, r=2))
-    ch_pairs.sort()
-
-    def _run(ch_1, ch_2):
-        key = "%s/%s" % (ch_1, ch_2)
-        res_x, res_y = _interface_residues(struct, chains_x=[ch_1], chains_y=[ch_2], threshold=ppi_threshold)
-        if len(res_x) > 0:
-            return {key: [res_x, res_y]}
-        else:
-            return dict()
-
-    cpu2use = max(min(n_cpus, len(ch_pairs)), 1)
-
-    outputs = dict()
-    if cpu2use == 1 or len(ch_pairs) < 100:
-        for ch_1, ch_2 in ch_pairs:
-            outputs.update(_run(ch_1, ch_2))
-    else:
-        results = Parallel(n_jobs=cpu2use)(delayed(_run)(c1, c2) for c1, c2 in ch_pairs)
-        for item in results:
-            outputs.update(item)
-    return outputs
-
-
-def annotate_pdb(struct_file: str, ppi_threshold: float = 4.5,
-                 n_cpus: int = 1, max_seqs: int = 100):
+def annotate_pdb(struct_file: str):
     st = StructureParser()
     st.load_from_file(struct_file)
-    st.
-    st.STRUCT.remove_alternative_conformations()
-    st.STRUCT.remove_ligands_and_waters()
-    st.STRUCT.remove_hydrogens()
-    st.STRUCT.remove_empty_chains()
-    st.update_entity()
+    st.clean_structure()
 
-
-
+    subchain_id2entity_id = dict()
+    for ent in st.STRUCT.entities:
+        for ch in ent.subchains:
+            subchain_id2entity_id[ch] = ent.name
 
     # Merge sequences
     polymers = dict()
-    for ch, seq in st.polymer_sequences.items():
-
+    for ch, seq in st.polymer_sequences().items():
+        subchain_id = st.get_chain(ch).get_polymer().subchain_id()
+        entity_id = subchain_id2entity_id[subchain_id]
+
+        hash_id = hash_sequence(seq.upper())
         if hash_id not in polymers:
             val = dict(chain_ids=[ch],
-                       sequence=seq,
-                       type=st.
-                       description=st.
-                       specie=st.ENTITY.eid2specie.get(st.ENTITY.polymer2eid[ch], "Unknown"),
-                       taxid=st.ENTITY.eid2taxid.get(st.ENTITY.polymer2eid[ch], "Unknown"),
+                       sequence=seq.upper(),
+                       type=st.polymer_types[ch].name,
+                       description=st.INFO["description"].get(entity_id, "Unknown"),
                        )
             polymers[hash_id] = val
         else:
            polymers[hash_id]["chain_ids"].append(ch)
 
-
-
+    proteins = dict()
     results = dict()
-    for
+    for hash_id, val in polymers.items():
        val["chain_ids"].sort()
-        if val["type"] == "
+        if val["type"] == "PeptideL":
+            proteins[hash_id] = val["sequence"]
            anarci_info = get_fv_region(val["sequence"])
            fvt = fv_region_type(anarci_info)
            if fvt != "not-Fv":
-                results[
-
-    struct_info = asdict(st.INFO)
-    struct_info.update(resolution=st.STRUCT.resolution)
-    struct_info["pdb_id"] = struct_info["pdb_id"].lower()
-    struct_info["exp_method"] = struct_info["exp_method"].lower()
+                results[hash_id] = dict(fv_type=fvt, annotations=anarci_info)
 
+    struct_info = {k: st.INFO[k] for k in ["resolution", "pdb_id", "deposition_date", "method", "title"]}
     return dict(path=os.path.abspath(os.path.expanduser(struct_file)),
                 info=struct_info,
                 polymers=polymers,
                 anarci=results,
-                mhc=annotate_mhc(
-                interfaces=polymer_interface_residues(st, ppi_threshold,
-                                                      n_cpus=n_cpus)
+                mhc=annotate_mhc(proteins) if len(proteins) > 0 else dict(),
                 )