gemmi-protools 0.1.16__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gemmi-protools might be problematic. Click here for more details.
- gemmi_protools/__init__.py +1 -4
- gemmi_protools/io/convert.py +0 -3
- gemmi_protools/io/reader.py +749 -310
- gemmi_protools/{utils → tools}/align.py +38 -55
- gemmi_protools/tools/dockq.py +127 -0
- gemmi_protools/tools/mesh.py +95 -0
- gemmi_protools/{utils → tools}/pdb_annot.py +21 -106
- gemmi_protools-1.0.0.dist-info/METADATA +41 -0
- gemmi_protools-1.0.0.dist-info/RECORD +19 -0
- gemmi_protools/io/cif_opts.py +0 -173
- gemmi_protools/io/parse_pdb_header.py +0 -387
- gemmi_protools/io/parser.py +0 -292
- gemmi_protools/io/pdb_opts.py +0 -179
- gemmi_protools/io/peptide.py +0 -32
- gemmi_protools/io/struct_info.py +0 -91
- gemmi_protools/utils/dockq.py +0 -139
- gemmi_protools/utils/fixer.py +0 -274
- gemmi_protools/utils/ppi.py +0 -74
- gemmi_protools-0.1.16.dist-info/METADATA +0 -29
- gemmi_protools-0.1.16.dist-info/RECORD +0 -26
- /gemmi_protools/{utils → tools}/__init__.py +0 -0
- {gemmi_protools-0.1.16.dist-info → gemmi_protools-1.0.0.dist-info}/WHEEL +0 -0
- {gemmi_protools-0.1.16.dist-info → gemmi_protools-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {gemmi_protools-0.1.16.dist-info → gemmi_protools-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -2,33 +2,25 @@
|
|
|
2
2
|
@Author: Luo Jiejian
|
|
3
3
|
"""
|
|
4
4
|
import os
|
|
5
|
-
import pathlib
|
|
6
5
|
import re
|
|
7
6
|
import shutil
|
|
8
7
|
import subprocess
|
|
9
8
|
import tempfile
|
|
10
|
-
import
|
|
11
|
-
from copy import deepcopy
|
|
12
|
-
from typing import Union, Dict, Any, List, Optional
|
|
9
|
+
from typing import Dict, Any, List, Optional
|
|
13
10
|
|
|
14
11
|
import numpy as np
|
|
15
12
|
from Bio.PDB import Superimposer
|
|
16
|
-
from typeguard import typechecked
|
|
17
|
-
|
|
18
13
|
from gemmi_protools.io.convert import gemmi2bio, bio2gemmi
|
|
19
14
|
from gemmi_protools.io.reader import StructureParser
|
|
20
15
|
|
|
21
16
|
|
|
22
17
|
class StructureAligner(object):
|
|
23
|
-
|
|
24
|
-
def __init__(self, query_path: Union[str, pathlib.Path], ref_path: Union[str, pathlib.Path]):
|
|
18
|
+
def __init__(self, query_path: str, ref_path: str):
|
|
25
19
|
self._query_st = StructureParser()
|
|
26
20
|
self._query_st.load_from_file(query_path)
|
|
27
|
-
self._query_st.set_default_model()
|
|
28
21
|
|
|
29
22
|
self._ref_st = StructureParser()
|
|
30
23
|
self._ref_st.load_from_file(ref_path)
|
|
31
|
-
self._ref_st.set_default_model()
|
|
32
24
|
|
|
33
25
|
self.values = dict()
|
|
34
26
|
self.rot_mat = None
|
|
@@ -49,8 +41,7 @@ class StructureAligner(object):
|
|
|
49
41
|
return _path
|
|
50
42
|
|
|
51
43
|
@staticmethod
|
|
52
|
-
|
|
53
|
-
def __parser_rotation_matrix(matrix_file: Union[str, pathlib.Path]):
|
|
44
|
+
def __parser_rotation_matrix(matrix_file: str):
|
|
54
45
|
rotation_matrix = []
|
|
55
46
|
translation_vector = []
|
|
56
47
|
|
|
@@ -66,7 +57,6 @@ class StructureAligner(object):
|
|
|
66
57
|
T=np.array(translation_vector).astype(np.float32))
|
|
67
58
|
|
|
68
59
|
@staticmethod
|
|
69
|
-
@typechecked
|
|
70
60
|
def __parse_terminal_outputs(output_string: str) -> Dict[str, Any]:
|
|
71
61
|
lines = re.split(pattern=r"\n", string=output_string)
|
|
72
62
|
# chain mapping
|
|
@@ -108,7 +98,6 @@ class StructureAligner(object):
|
|
|
108
98
|
del patterns[key]
|
|
109
99
|
return values
|
|
110
100
|
|
|
111
|
-
@typechecked
|
|
112
101
|
def make_alignment(self, query_chains: Optional[List[str]] = None,
|
|
113
102
|
ref_chains: Optional[List[str]] = None, timeout=300.0):
|
|
114
103
|
"""
|
|
@@ -122,56 +111,50 @@ class StructureAligner(object):
|
|
|
122
111
|
program_path = self.__mmalign_path
|
|
123
112
|
|
|
124
113
|
# clone
|
|
125
|
-
q_st = deepcopy(self._query_st)
|
|
126
|
-
r_st = deepcopy(self._ref_st)
|
|
127
|
-
|
|
128
|
-
tmp_dir = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
|
|
129
|
-
os.makedirs(tmp_dir)
|
|
130
|
-
|
|
131
114
|
if isinstance(query_chains, list):
|
|
132
|
-
q_st.pick_chains(query_chains)
|
|
115
|
+
q_st = self._query_st.pick_chains(query_chains)
|
|
116
|
+
else:
|
|
117
|
+
q_st = self._query_st
|
|
133
118
|
|
|
134
119
|
if isinstance(ref_chains, list):
|
|
135
|
-
r_st.pick_chains(
|
|
120
|
+
r_st = self._ref_st.pick_chains(query_chains)
|
|
121
|
+
else:
|
|
122
|
+
r_st = self._ref_st
|
|
136
123
|
|
|
137
|
-
q_ch_mapper = q_st.
|
|
138
|
-
r_ch_mapper = r_st.
|
|
124
|
+
q_ch_mapper = q_st.make_one_letter_chain()
|
|
125
|
+
r_ch_mapper = r_st.make_one_letter_chain()
|
|
139
126
|
|
|
140
127
|
q_ch_mapper_r = {v: k for k, v in q_ch_mapper.items()}
|
|
141
128
|
r_ch_mapper_r = {v: k for k, v in r_ch_mapper.items()}
|
|
142
129
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
130
|
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
131
|
+
_tmp_a = os.path.join(tmp_dir, "a.pdb")
|
|
132
|
+
q_st.to_pdb(_tmp_a)
|
|
133
|
+
|
|
134
|
+
_tmp_b = os.path.join(tmp_dir, "b.pdb")
|
|
135
|
+
r_st.to_pdb(_tmp_b)
|
|
136
|
+
|
|
137
|
+
matrix_file = os.path.join(tmp_dir, "m.txt")
|
|
138
|
+
_command = "%s %s %s -m %s" % (program_path, _tmp_a, _tmp_b, matrix_file)
|
|
139
|
+
|
|
140
|
+
try:
|
|
141
|
+
result = subprocess.run(_command, shell=True, check=True,
|
|
142
|
+
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
|
|
143
|
+
timeout=timeout)
|
|
144
|
+
except Exception as e:
|
|
145
|
+
print("%s: between files %s and %s; between chains: %s and %s" % (
|
|
146
|
+
str(e), self.query_path, self.ref_path,
|
|
147
|
+
str(q_st.chain_ids), str(r_st.chain_ids))
|
|
148
|
+
)
|
|
149
|
+
else:
|
|
150
|
+
self.values = self.__parse_terminal_outputs(result.stdout.decode())
|
|
151
|
+
self.rot_mat = self.__parser_rotation_matrix(matrix_file)
|
|
152
|
+
self.is_aligned = True
|
|
153
|
+
self.by_query = q_st.chain_ids if query_chains is None else query_chains
|
|
154
|
+
self.by_ref = r_st.chain_ids if ref_chains is None else ref_chains
|
|
155
|
+
self.values["query_chain_ids"] = [q_ch_mapper_r.get(ch, ch) for ch in self.values["query_chain_ids"]]
|
|
156
|
+
self.values["ref_chain_ids"] = [r_ch_mapper_r.get(ch, ch) for ch in self.values["ref_chain_ids"]]
|
|
148
157
|
|
|
149
|
-
matrix_file = os.path.join(tmp_dir, "m.txt")
|
|
150
|
-
_command = "%s %s %s -m %s" % (program_path, _tmp_a, _tmp_b, matrix_file)
|
|
151
|
-
|
|
152
|
-
try:
|
|
153
|
-
result = subprocess.run(_command, shell=True, check=True,
|
|
154
|
-
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
|
|
155
|
-
timeout=timeout)
|
|
156
|
-
except Exception as e:
|
|
157
|
-
print("%s: between files %s and %s; between chains: %s and %s" % (
|
|
158
|
-
str(e), self.query_path, self.ref_path,
|
|
159
|
-
str(q_st.chain_ids), str(r_st.chain_ids))
|
|
160
|
-
)
|
|
161
|
-
else:
|
|
162
|
-
self.values = self.__parse_terminal_outputs(result.stdout.decode())
|
|
163
|
-
self.rot_mat = self.__parser_rotation_matrix(matrix_file)
|
|
164
|
-
self.is_aligned = True
|
|
165
|
-
self.by_query = q_st.chain_ids if query_chains is None else query_chains
|
|
166
|
-
self.by_ref = r_st.chain_ids if ref_chains is None else ref_chains
|
|
167
|
-
self.values["query_chain_ids"] = [q_ch_mapper_r.get(ch, ch) for ch in self.values["query_chain_ids"]]
|
|
168
|
-
self.values["ref_chain_ids"] = [r_ch_mapper_r.get(ch, ch) for ch in self.values["ref_chain_ids"]]
|
|
169
|
-
|
|
170
|
-
finally:
|
|
171
|
-
if os.path.isdir(tmp_dir):
|
|
172
|
-
shutil.rmtree(tmp_dir)
|
|
173
|
-
|
|
174
|
-
@typechecked
|
|
175
158
|
def save_aligned_query(self, out_file: str):
|
|
176
159
|
"""
|
|
177
160
|
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@Author: Luo Jiejian
|
|
3
|
+
"""
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import shutil
|
|
7
|
+
import subprocess
|
|
8
|
+
import tempfile
|
|
9
|
+
from copy import deepcopy
|
|
10
|
+
from typing import List, Tuple
|
|
11
|
+
|
|
12
|
+
import gemmi
|
|
13
|
+
import pandas as pd
|
|
14
|
+
from gemmi_protools.io.reader import StructureParser
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _merge_into_chain(st, chain_ids, new_chain_id):
    """Copy the residues of ``chain_ids`` (in the given order) into one new gemmi chain.

    Residues are deep-copied, their insertion code is blanked and they are
    renumbered consecutively starting from 1, so the merged chain has a single
    gap-free numbering regardless of the source chains.
    """
    merged = gemmi.Chain(new_chain_id)
    next_num = 1
    for ch in chain_ids:
        for res in st.get_chain(ch):
            nr = deepcopy(res)
            nr.seqid.icode = " "
            nr.seqid.num = next_num
            merged.add_residue(nr)
            next_num += 1
    return merged


def _load_two_partner_struct(path, partner_1, partner_2):
    """Load ``path`` and rebuild it as a two-chain structure: partner 1 -> chain A, partner 2 -> chain B.

    :raises ValueError: when a requested chain ID is not present in the loaded structure
    """
    st = StructureParser()
    st.load_from_file(path)
    st.clean_structure()

    for ch in partner_1 + partner_2:
        if ch not in st.chain_ids:
            raise ValueError("Chain %s not found for %s (only [%s])" % (ch, path, " ".join(st.chain_ids)))

    # merge the chains of each partner into one chain:
    # partner_1 becomes chain A, partner_2 becomes chain B
    model = gemmi.Model(1)
    model.add_chain(_merge_into_chain(st, partner_1, "A"))
    model.add_chain(_merge_into_chain(st, partner_2, "B"))

    struct = gemmi.Structure()
    struct.add_model(model)
    return StructureParser(struct)


def dockq_score_interface(query_model: str,
                          native_model: str,
                          partner_1_mapping: List[Tuple[str, str]],
                          partner_2_mapping: List[Tuple[str, str]],
                          ):
    """
    Calculate Dockq Score for an interface (partner 1 vs partner 2)

    Both models are reduced to two merged chains (A = partner 1, B = partner 2),
    written to temporary PDB files and scored by the external ``DockQ`` program
    with the chain mapping ``AB:AB``.

    :param query_model: str or pathlib.Path
        Path of query model, support .pdb, .pdb.gz, .cif, .cif.gz
    :param native_model:
        Path of native model
    :param partner_1_mapping: a list of chain ID mapping between query and native for partner1 of the interface
        e.g. [(q chain1, n chain1), (q chain2, n chain2)]
    :param partner_2_mapping:
        same as partner_1_mapping, for partner2 of the interface
    :return: str
        DockQ score formatted with four decimals, or "" when DockQ fails or
        reports no result (the failure reason is printed)
    :raises RuntimeError: when the DockQ executable is not found on PATH
    :raises ValueError: when a chain ID of a mapping is missing from its structure
    """
    dockq_program = shutil.which("DockQ")
    if dockq_program is None:
        raise RuntimeError("DockQ is needed but was not found on PATH")

    assert len(partner_1_mapping) > 0, "partner_1_mapping must be a list of chain ID tuples, can't be empty"
    assert len(partner_2_mapping) > 0, "partner_2_mapping must be a list of chain ID tuples, can't be empty"

    partner_1_query, partner_1_native = zip(*partner_1_mapping)
    partner_2_query, partner_2_native = zip(*partner_2_mapping)

    q_st = _load_two_partner_struct(query_model, list(partner_1_query), list(partner_2_query))
    n_st = _load_two_partner_struct(native_model, list(partner_1_native), list(partner_2_native))

    with tempfile.TemporaryDirectory() as tmp_dir:
        result_file = os.path.join(tmp_dir, "result.json")
        q_file = os.path.join(tmp_dir, "q.pdb")
        n_file = os.path.join(tmp_dir, "n.pdb")
        q_st.to_pdb(q_file, write_minimal_pdb=True)
        n_st.to_pdb(n_file, write_minimal_pdb=True)

        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through unchanged.
        command = [dockq_program, "--mapping", "AB:AB", "--json", result_file, q_file, n_file]

        try:
            subprocess.run(command, check=True,
                           stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                           timeout=300.0)
        except subprocess.CalledProcessError as e:
            # DockQ itself exited with an error: report what it wrote to stderr
            print(e.stderr.decode())
            return ""
        except Exception as e:
            # Other failures such as a timeout or permission problems
            print(str(e))
            return ""

        with open(result_file, "r") as fin:
            vals = json.load(fin)

    # One interface is scored (AB vs AB); use the first reported entry.
    best_results = list(vals["best_result"].values())
    if not best_results:
        return ""
    return "%.4f" % best_results[0]["DockQ"]
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@Author: Luo Jiejian
|
|
3
|
+
"""
|
|
4
|
+
import os
|
|
5
|
+
import subprocess
|
|
6
|
+
import tempfile
|
|
7
|
+
from typing import Optional, List
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import trimesh
|
|
11
|
+
from Bio.PDB import Selection
|
|
12
|
+
from Bio.PDB.ResidueDepth import _get_atom_radius, _read_vertex_array
|
|
13
|
+
from gemmi_protools import StructureParser
|
|
14
|
+
from gemmi_protools import gemmi2bio
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _read_face_array(filename: str):
|
|
18
|
+
with open(filename) as fp:
|
|
19
|
+
face_list = []
|
|
20
|
+
for line in fp:
|
|
21
|
+
sl = line.split()
|
|
22
|
+
if len(sl) != 5:
|
|
23
|
+
# skip header
|
|
24
|
+
continue
|
|
25
|
+
vl = [int(x) for x in sl[0:3]]
|
|
26
|
+
face_list.append(vl)
|
|
27
|
+
return np.array(face_list)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_mesh(struct_file: str, chains: Optional[List[str]] = None, MSMS: str = "msms"):
    """
    Build a surface mesh of a structure with the external MSMS program.

    The structure is loaded and cleaned (ligands removed), converted with
    ``gemmi2bio`` and the atoms of its first model are written to an
    x/y/z/radius file. MSMS is run on that file (``-no_header``, probe radius
    1.5) and the ``.vert``/``.face`` files it writes are loaded into a
    ``trimesh.Trimesh``, which is then merged, de-duplicated and stripped of
    degenerate faces and unreferenced vertices.

    :param struct_file: str
            .pdb, .cif, .pdb.gz, .cif.gz
    :param chains: a list of chain names
            default None to include all chains
            NOTE(review): this argument is accepted but not used by the
            function body; all chains are always included -- confirm intent.
    :param MSMS: str
           path of msms executable
           (https://ccsb.scripps.edu/msms/downloads/)
    :return: trimesh.Trimesh, or None when loading the structure or generating
        the surface fails (the error message is printed)
    """
    # All scratch files live in one private directory that is removed when the
    # block exits, whichever step fails. (Previously the cleanup code referenced
    # file names that did not exist yet when an early step raised.)
    with tempfile.TemporaryDirectory() as tmp_dir:
        xyz_tmp = os.path.join(tmp_dir, "atoms.xyzr")
        surface_tmp = os.path.join(tmp_dir, "surface")
        msms_tmp = os.path.join(tmp_dir, "msms.log")
        face_file = surface_tmp + ".face"
        surface_file = surface_tmp + ".vert"

        try:
            st = StructureParser()
            st.load_from_file(struct_file)
            st.clean_structure(remove_ligand=True)

            bio_st = gemmi2bio(st.STRUCT)
            model = bio_st[0]

            # Replace pdb_to_xyzr
            # Make x,y,z,radius file
            atom_list = Selection.unfold_entities(model, "A")

            with open(xyz_tmp, "w") as pdb_to_xyzr:
                for atom in atom_list:
                    x, y, z = atom.coord
                    radius = _get_atom_radius(atom, rtype="united")
                    pdb_to_xyzr.write(f"{x:6.3f}\t{y:6.3f}\t{z:6.3f}\t{radius:1.2f}\n")

            # Make surface; MSMS console output is redirected to a scratch file
            make_surface = "%s -no_header -probe_radius 1.5 -if %s -of %s > %s" % (
                MSMS, xyz_tmp, surface_tmp, msms_tmp)
            subprocess.call(make_surface, shell=True)
            if not os.path.isfile(surface_file):
                raise RuntimeError(
                    f"Failed to generate surface file using command:\n{make_surface}"
                )
        except Exception as e:
            # Best effort: report the problem and signal failure with None
            print(str(e))
            return None

        # Read surface vertices and faces while the scratch files still exist
        vertices = _read_vertex_array(surface_file)
        faces = _read_face_array(face_file)

    # Face indices from the file are shifted down by one
    # (presumably MSMS numbers vertices from 1 -- confirm).
    mesh = trimesh.Trimesh(vertices=vertices, faces=faces - 1)
    mesh.merge_vertices()
    mesh.update_faces(mesh.unique_faces())
    mesh.update_faces(mesh.nondegenerate_faces())
    mesh.remove_unreferenced_vertices()
    return mesh
|
|
@@ -2,25 +2,17 @@
|
|
|
2
2
|
@Author: Luo Jiejian
|
|
3
3
|
"""
|
|
4
4
|
import hashlib
|
|
5
|
-
import itertools
|
|
6
5
|
import os
|
|
7
6
|
import re
|
|
8
7
|
import shutil
|
|
9
8
|
import subprocess
|
|
10
9
|
import uuid
|
|
11
10
|
from collections import defaultdict
|
|
12
|
-
from dataclasses import asdict
|
|
13
11
|
from importlib.resources import files
|
|
14
|
-
from typing import List
|
|
15
12
|
|
|
16
|
-
import numpy as np
|
|
17
13
|
from anarci import run_anarci
|
|
18
14
|
from anarci.germlines import all_germlines
|
|
19
|
-
from joblib import Parallel, delayed
|
|
20
|
-
from scipy.spatial import cKDTree
|
|
21
|
-
|
|
22
15
|
from gemmi_protools import StructureParser
|
|
23
|
-
from gemmi_protools.utils.ppi import _ppi_atoms
|
|
24
16
|
|
|
25
17
|
|
|
26
18
|
def hash_sequence(seq: str) -> str:
|
|
@@ -207,125 +199,48 @@ def annotate_mhc(seq_dict: dict):
|
|
|
207
199
|
return out
|
|
208
200
|
|
|
209
201
|
|
|
210
|
-
def
|
|
211
|
-
chains_x: List[str],
|
|
212
|
-
chains_y: List[str],
|
|
213
|
-
threshold: float = 4.5):
|
|
214
|
-
"""
|
|
215
|
-
identify PPI among protein, DNA, RNA
|
|
216
|
-
:param struct: StructureParser
|
|
217
|
-
:param chains_x:
|
|
218
|
-
:param chains_y:
|
|
219
|
-
:param threshold:
|
|
220
|
-
:return:
|
|
221
|
-
PPI residues of chains_x, PPI residues of chains_y
|
|
222
|
-
"""
|
|
223
|
-
|
|
224
|
-
x_coord, x_id = _ppi_atoms(struct, chains_x)
|
|
225
|
-
y_coord, y_id = _ppi_atoms(struct, chains_y)
|
|
226
|
-
|
|
227
|
-
kd_tree_x = cKDTree(x_coord)
|
|
228
|
-
kd_tree_y = cKDTree(y_coord)
|
|
229
|
-
|
|
230
|
-
pairs = kd_tree_x.sparse_distance_matrix(kd_tree_y, threshold, output_type='coo_matrix')
|
|
231
|
-
|
|
232
|
-
x_res = np.unique(x_id[pairs.row][["ch_name", 'res_num', 'res_icode', 'res_name']])
|
|
233
|
-
y_res = np.unique(y_id[pairs.col][["ch_name", 'res_num', 'res_icode', 'res_name']])
|
|
234
|
-
|
|
235
|
-
x_out = ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in x_res.tolist()]
|
|
236
|
-
y_out = ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in y_res.tolist()]
|
|
237
|
-
return x_out, y_out
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
def polymer_interface_residues(struct: StructureParser,
|
|
241
|
-
ppi_threshold: float = 4.5,
|
|
242
|
-
n_cpus: int = 1,
|
|
243
|
-
):
|
|
244
|
-
"""
|
|
245
|
-
|
|
246
|
-
Args:
|
|
247
|
-
struct:
|
|
248
|
-
ppi_threshold:
|
|
249
|
-
|
|
250
|
-
Returns:
|
|
251
|
-
|
|
252
|
-
"""
|
|
253
|
-
chains = [ch for ch, ct in struct.chain_types.items() if ct in ["protein", "dna", "rna"]]
|
|
254
|
-
ch_pairs = list(itertools.combinations(chains, r=2))
|
|
255
|
-
ch_pairs.sort()
|
|
256
|
-
|
|
257
|
-
def _run(ch_1, ch_2):
|
|
258
|
-
key = "%s/%s" % (ch_1, ch_2)
|
|
259
|
-
res_x, res_y = _interface_residues(struct, chains_x=[ch_1], chains_y=[ch_2], threshold=ppi_threshold)
|
|
260
|
-
if len(res_x) > 0:
|
|
261
|
-
return {key: [res_x, res_y]}
|
|
262
|
-
else:
|
|
263
|
-
return dict()
|
|
264
|
-
|
|
265
|
-
cpu2use = max(min(n_cpus, len(ch_pairs)), 1)
|
|
266
|
-
|
|
267
|
-
outputs = dict()
|
|
268
|
-
if cpu2use == 1 or len(ch_pairs) < 100:
|
|
269
|
-
for ch_1, ch_2 in ch_pairs:
|
|
270
|
-
outputs.update(_run(ch_1, ch_2))
|
|
271
|
-
else:
|
|
272
|
-
results = Parallel(n_jobs=cpu2use)(delayed(_run)(c1, c2) for c1, c2 in ch_pairs)
|
|
273
|
-
for item in results:
|
|
274
|
-
outputs.update(item)
|
|
275
|
-
return outputs
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
def annotate_pdb(struct_file: str, ppi_threshold: float = 4.5,
|
|
279
|
-
n_cpus: int = 1, max_seqs: int = 100):
|
|
202
|
+
def annotate_pdb(struct_file: str):
|
|
280
203
|
st = StructureParser()
|
|
281
204
|
st.load_from_file(struct_file)
|
|
282
|
-
st.
|
|
283
|
-
st.STRUCT.remove_alternative_conformations()
|
|
284
|
-
st.STRUCT.remove_ligands_and_waters()
|
|
285
|
-
st.STRUCT.remove_hydrogens()
|
|
286
|
-
st.STRUCT.remove_empty_chains()
|
|
287
|
-
st.update_entity()
|
|
205
|
+
st.clean_structure()
|
|
288
206
|
|
|
289
|
-
|
|
290
|
-
|
|
207
|
+
subchain_id2entity_id = dict()
|
|
208
|
+
for ent in st.STRUCT.entities:
|
|
209
|
+
for ch in ent.subchains:
|
|
210
|
+
subchain_id2entity_id[ch] = ent.name
|
|
291
211
|
|
|
292
212
|
# Merge sequences
|
|
293
213
|
polymers = dict()
|
|
294
|
-
for ch, seq in st.polymer_sequences.items():
|
|
295
|
-
|
|
214
|
+
for ch, seq in st.polymer_sequences().items():
|
|
215
|
+
subchain_id = st.get_chain(ch).get_polymer().subchain_id()
|
|
216
|
+
entity_id = subchain_id2entity_id[subchain_id]
|
|
217
|
+
|
|
218
|
+
hash_id = hash_sequence(seq.upper())
|
|
296
219
|
if hash_id not in polymers:
|
|
297
220
|
val = dict(chain_ids=[ch],
|
|
298
|
-
sequence=seq,
|
|
299
|
-
type=st.
|
|
300
|
-
description=st.
|
|
301
|
-
specie=st.ENTITY.eid2specie.get(st.ENTITY.polymer2eid[ch], "Unknown"),
|
|
302
|
-
taxid=st.ENTITY.eid2taxid.get(st.ENTITY.polymer2eid[ch], "Unknown"),
|
|
221
|
+
sequence=seq.upper(),
|
|
222
|
+
type=st.polymer_types[ch].name,
|
|
223
|
+
description=st.INFO["description"].get(entity_id, "Unknown"),
|
|
303
224
|
)
|
|
304
225
|
polymers[hash_id] = val
|
|
305
226
|
else:
|
|
306
227
|
polymers[hash_id]["chain_ids"].append(ch)
|
|
307
228
|
|
|
308
|
-
|
|
309
|
-
|
|
229
|
+
proteins = dict()
|
|
310
230
|
results = dict()
|
|
311
|
-
for
|
|
231
|
+
for hash_id, val in polymers.items():
|
|
312
232
|
val["chain_ids"].sort()
|
|
313
|
-
if val["type"] == "
|
|
233
|
+
if val["type"] == "PeptideL":
|
|
234
|
+
proteins[hash_id] = val["sequence"]
|
|
314
235
|
anarci_info = get_fv_region(val["sequence"])
|
|
315
236
|
fvt = fv_region_type(anarci_info)
|
|
316
237
|
if fvt != "not-Fv":
|
|
317
|
-
results[
|
|
318
|
-
|
|
319
|
-
struct_info = asdict(st.INFO)
|
|
320
|
-
struct_info.update(resolution=st.STRUCT.resolution)
|
|
321
|
-
struct_info["pdb_id"] = struct_info["pdb_id"].lower()
|
|
322
|
-
struct_info["exp_method"] = struct_info["exp_method"].lower()
|
|
238
|
+
results[hash_id] = dict(fv_type=fvt, annotations=anarci_info)
|
|
323
239
|
|
|
240
|
+
struct_info = {k: st.INFO[k] for k in ["resolution", "pdb_id", "deposition_date", "method", "title"]}
|
|
324
241
|
return dict(path=os.path.abspath(os.path.expanduser(struct_file)),
|
|
325
242
|
info=struct_info,
|
|
326
243
|
polymers=polymers,
|
|
327
244
|
anarci=results,
|
|
328
|
-
mhc=annotate_mhc(
|
|
329
|
-
interfaces=polymer_interface_residues(st, ppi_threshold,
|
|
330
|
-
n_cpus=n_cpus)
|
|
245
|
+
mhc=annotate_mhc(proteins) if len(proteins) > 0 else dict(),
|
|
331
246
|
)
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gemmi_protools
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: An Enhanced tool to process PDB structures based on Gemmi
|
|
5
|
+
Author: Luo Jiejian
|
|
6
|
+
Author-email: Luo Jiejian <luojiejian12@mails.ucas.ac.cn>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Requires-Python: <3.13,>=3.12
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: gemmi==0.7.3
|
|
12
|
+
Requires-Dist: biopython==1.85
|
|
13
|
+
Requires-Dist: dockq==2.1.3
|
|
14
|
+
Requires-Dist: pandas
|
|
15
|
+
Requires-Dist: typeguard
|
|
16
|
+
Requires-Dist: numpy
|
|
17
|
+
Requires-Dist: scipy
|
|
18
|
+
Requires-Dist: trimesh
|
|
19
|
+
Requires-Dist: freesasa==2.2.1
|
|
20
|
+
Dynamic: author
|
|
21
|
+
Dynamic: license-file
|
|
22
|
+
|
|
23
|
+
# An Enhanced tool to process PDB structures based on Gemmi
|
|
24
|
+
|
|
25
|
+
# Install
|
|
26
|
+
```commandline
|
|
27
|
+
conda create -n gemmi_protools python=3.12
|
|
28
|
+
conda install -n gemmi_protools anarci hmmer -c bioconda
|
|
29
|
+
conda install -n gemmi_protools dockq trimesh -c conda-forge
|
|
30
|
+
conda activate gemmi_protools
|
|
31
|
+
pip install gemmi_protools
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
# Usage
|
|
35
|
+
|
|
36
|
+
## read structures
|
|
37
|
+
```python
|
|
38
|
+
from gemmi_protools import StructureParser
|
|
39
|
+
st=StructureParser()
|
|
40
|
+
st.load_from_file("your.pdb")
|
|
41
|
+
```
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
gemmi_protools/__init__.py,sha256=_q8gXGIxrg2-3N6gxA3aa-_DRWDt3HFCLHyIh2XNHz0,157
|
|
2
|
+
gemmi_protools/data/MHC/MHC_combined.hmm,sha256=w0_vzPiEWne_d_kYmqR0OiSsCOpQioItKy3Zq-JMsH4,159451
|
|
3
|
+
gemmi_protools/data/MHC/MHC_combined.hmm.h3f,sha256=QGG4l-v76RYtysJ5rybnz5v6VgJg2RjoQQHUVWL5jmg,45522
|
|
4
|
+
gemmi_protools/data/MHC/MHC_combined.hmm.h3i,sha256=yn-700hoBSJB39Tj8Ia8UhSZWpYiCZFNcbnYAFNjReI,300
|
|
5
|
+
gemmi_protools/data/MHC/MHC_combined.hmm.h3m,sha256=CvNMCsobQiX-wL7iB4CreNcbpnElE31NmmjmMR0iYVI,66174
|
|
6
|
+
gemmi_protools/data/MHC/MHC_combined.hmm.h3p,sha256=-mK278pRedG3-KL-DtuVAQy7La9DgXg5FcP89D6X3Ck,78325
|
|
7
|
+
gemmi_protools/io/__init__.py,sha256=F6e1xNT_7lZAWQgNIneH06o2qtWYrHNr_xPUPTwwx5E,29
|
|
8
|
+
gemmi_protools/io/convert.py,sha256=A1i1vPgxG1LqMSUvWtegLl9LipgUQbfmKeGJ_f00UYo,3781
|
|
9
|
+
gemmi_protools/io/reader.py,sha256=hUIY0YKBXDCyiWTNgfX7KsZRUxBOb-v6KYKxlWWtzEk,33238
|
|
10
|
+
gemmi_protools/tools/__init__.py,sha256=F6e1xNT_7lZAWQgNIneH06o2qtWYrHNr_xPUPTwwx5E,29
|
|
11
|
+
gemmi_protools/tools/align.py,sha256=tsn8Fp-Xc9CulVyVst4uFgL6gQKVOEvoUmdgcfF8dCg,7084
|
|
12
|
+
gemmi_protools/tools/dockq.py,sha256=a6i4S0O7Z2jxqQMBQilbcxNEDm35i8hzm_anvJXB6uo,4419
|
|
13
|
+
gemmi_protools/tools/mesh.py,sha256=YiHPZUS-Y0gtLTbwdUXO7jS07PS5PP452LxvPTfHFJE,2986
|
|
14
|
+
gemmi_protools/tools/pdb_annot.py,sha256=MHl-2BAFr__eO1ohPPLfBR17G2wPZti7Lq9UlS7AEX4,8252
|
|
15
|
+
gemmi_protools-1.0.0.dist-info/licenses/LICENSE,sha256=JuQvKcgj6n11y5y6nXr9rABv3gJSswc4eTCd5WZBtSY,1062
|
|
16
|
+
gemmi_protools-1.0.0.dist-info/METADATA,sha256=bRuphRjLJsZz-CmSRKau3cxi8yCPnF-E9NDavUTS1DA,1038
|
|
17
|
+
gemmi_protools-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
18
|
+
gemmi_protools-1.0.0.dist-info/top_level.txt,sha256=P12mYJi5O5EKIn5u-RFaWxuix431CgLacSRD7rBid_U,15
|
|
19
|
+
gemmi_protools-1.0.0.dist-info/RECORD,,
|