gemmi-protools 0.1.17__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gemmi_protools/__init__.py +1 -4
- gemmi_protools/io/convert.py +0 -3
- gemmi_protools/io/reader.py +752 -309
- gemmi_protools/{utils → tools}/align.py +38 -54
- gemmi_protools/tools/dockq.py +128 -0
- gemmi_protools/tools/mesh.py +197 -0
- gemmi_protools/{utils → tools}/pdb_annot.py +21 -105
- {gemmi_protools-0.1.17.dist-info → gemmi_protools-1.0.1.dist-info}/METADATA +20 -12
- gemmi_protools-1.0.1.dist-info/RECORD +19 -0
- gemmi_protools/io/cif_opts.py +0 -173
- gemmi_protools/io/parse_pdb_header.py +0 -387
- gemmi_protools/io/parser.py +0 -292
- gemmi_protools/io/pdb_opts.py +0 -179
- gemmi_protools/io/peptide.py +0 -32
- gemmi_protools/io/struct_info.py +0 -91
- gemmi_protools/utils/dockq.py +0 -139
- gemmi_protools/utils/fixer.py +0 -274
- gemmi_protools/utils/immune_complex.py +0 -787
- gemmi_protools/utils/ppi.py +0 -74
- gemmi_protools-0.1.17.dist-info/RECORD +0 -27
- /gemmi_protools/{utils → tools}/__init__.py +0 -0
- {gemmi_protools-0.1.17.dist-info → gemmi_protools-1.0.1.dist-info}/WHEEL +0 -0
- {gemmi_protools-0.1.17.dist-info → gemmi_protools-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {gemmi_protools-0.1.17.dist-info → gemmi_protools-1.0.1.dist-info}/top_level.txt +0 -0
gemmi_protools/io/reader.py CHANGED

@@ -1,217 +1,598 @@
import gzip
import io
import itertools
import pathlib
import random
import string
from collections import defaultdict
from copy import deepcopy
from typing import Dict, Optional, List

import gemmi
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from scipy.spatial import cKDTree


def is_pdb(path: str) -> bool:
    """
    Check if input file is .pdb or .pdb.gz format
    :param path:
    :return:
        bool
    """
    path = pathlib.Path(path)

    if path.suffixes:
        if path.suffixes[-1] == ".pdb":
            return True
        elif "".join(path.suffixes[-2:]) == ".pdb.gz":
            return True
        else:
            return False
    else:
        return False


def is_cif(path: str) -> bool:
    """
    Check if input file is .cif or .cif.gz
    :param path:
    :return:
        bool
    """

    path = pathlib.Path(path)
    if path.suffixes:
        if path.suffixes[-1] == ".cif":
            return True
        elif "".join(path.suffixes[-2:]) == ".cif.gz":
            return True
        else:
            return False
    else:
        return False

def parse_cif(path: str) -> dict:
    """
    Parse CIF structure and info
    :param path: str
    :return:
        dict
    """
    if not is_cif(path):
        raise TypeError("Input file is not a cif file [.cif or .cif.gz]: %s" % path)

    doc = gemmi.cif.Document()
    st = gemmi.read_structure(path, save_doc=doc)
    st.setup_entities()
    st.assign_serial_numbers()
    block = doc.sole_block()

    def _read_src(query_block, category, name_col, taxid_col):
        dk = pd.DataFrame(query_block.get_mmcif_category(name=category, raw=False))
        dk[dk.isna()] = ""

        if dk.shape[0] > 0 and np.all(np.isin(["entity_id", name_col, taxid_col], dk.columns)):
            return {eid: [name, taxid]
                    for eid, name, taxid in dk[["entity_id", name_col, taxid_col]].to_numpy()
                    }
        else:
            return dict()

    desc = pd.DataFrame(block.get_mmcif_category(name="_entity", raw=False))
    desc[desc.isna()] = ""

    entityid2description = dict()
    if desc.shape[0] > 0 and np.all(np.isin(["id", "pdbx_description"], desc.columns)):
        entityid2description = dict(zip(desc["id"], desc["pdbx_description"]))

    entityid2src = dict()
    src_1 = _read_src(block, "_entity_src_gen.",
                      "pdbx_gene_src_scientific_name",
                      "pdbx_gene_src_ncbi_taxonomy_id")
    src_2 = _read_src(block, "_pdbx_entity_src_syn.",
                      "organism_scientific",
                      "ncbi_taxonomy_id")
    src_3 = _read_src(block, "_entity_src_nat.",
                      "pdbx_organism_scientific",
                      "pdbx_ncbi_taxonomy_id")
    entityid2src.update(src_1)

    for k, v in src_2.items():
        if k not in entityid2src:
            entityid2src[k] = v

    for k, v in src_3.items():
        if k not in entityid2src:
            entityid2src[k] = v

    info_map = dict(st.info)
    pdb_code = info_map.get("_entry.id", "").lower()
    info = dict(description={k: v for k, v in entityid2description.items() if v and v != "?"},
                source=entityid2src,
                resolution=st.resolution,
                pdb_id=pdb_code if gemmi.is_pdb_code(pdb_code) else "",
                method=info_map.get("_exptl.method", "").lower(),
                deposition_date=info_map.get("_pdbx_database_status.recvd_initial_deposition_date", ""),
                title=info_map.get("_struct.title", "")
                )
    return dict(structure=st, info=info)


def molecule_description(path: str):
    """
    Molecule description from PDB (.pdb or .pdb.gz)
    :param path:
    :return:
    """
    if is_pdb(path):
        cur_path = pathlib.Path(path)
        if cur_path.suffixes[-1] == ".pdb":
            with open(path, "r") as text_io:
                lines = text_io.readlines()
        else:
            with gzip.open(path, "rb") as gz_handle:
                with io.TextIOWrapper(gz_handle, encoding="utf-8") as text_io:
                    lines = text_io.readlines()
    else:
        raise ValueError("Only support .pdb or .pdb.gz file, but got %s" % path)

    values = {"COMPND": defaultdict(dict),
              "SOURCE": defaultdict(dict),
              }

    comp_molid = ""
    last_comp_key = ""

    for hh in lines:
        h = hh.strip()
        key = h[:6].strip()
        tt = h[10:].strip().strip(";")

        if key in ["COMPND", "SOURCE"]:
            tok = tt.split(":")
            if len(tok) >= 2:
                ckey = tok[0].lower().strip()
                cval = tok[1].strip()
                if ckey == "mol_id":
                    comp_molid = cval
                    values[key][comp_molid] = dict()
                else:
                    values[key][comp_molid][ckey] = cval
                    last_comp_key = ckey
            else:
                if last_comp_key != "":
                    values[key][comp_molid][last_comp_key] += " " + tok[0].strip()

    outputs = dict(description=dict(),
                   source=dict())

    ch_id2mol_id = dict()
    for mol_id, val in values["COMPND"].items():
        chain_str = val.get("chain", "").strip()
        if chain_str != "":
            chains = chain_str.split(",")
            for ch in chains:
                ch_id2mol_id[ch.strip()] = mol_id

    for mol_id, val in values["COMPND"].items():
        m = val.get("molecule", "").strip()
        if m != "":
            outputs["description"][mol_id] = m

    for mol_id, val in values["SOURCE"].items():
        name = val.get("organism_scientific", "").strip()
        taxid = val.get("organism_taxid", "").strip()
        if name not in ["", "?", "."] or taxid not in ["", "?", "."]:
            outputs["source"][mol_id] = [name, taxid]
    outputs["ch_id2mol_id"] = ch_id2mol_id
    return outputs


def parse_pdb(path: str) -> dict:
    if not is_pdb(path):
        raise TypeError("Input file is not a pdb file [.pdb or .pdb.gz]: %s" % path)

    st = gemmi.read_structure(path)
    st.setup_entities()
    st.assign_serial_numbers()

    values = molecule_description(path)

    mol_id2entity_name = dict()
    for ent in st.entities:
        if ent.name in values["ch_id2mol_id"]:
            mol_id = values["ch_id2mol_id"][ent.name]
            mol_id2entity_name[mol_id] = ent.name

    # replace mol_id to entity.name
    description = {mol_id2entity_name[mol_id]: v for mol_id, v in values["description"].items()
                   if mol_id in mol_id2entity_name}
    # add ligand and water entity description
    # gemmi use ligand name or water as entity name, take this as description
    for ent in st.entities:
        if (ent.name not in description
                and ent.polymer_type.name == "Unknown"
                and ent.name != ""
                and len(ent.name) > 1):
            description[ent.name] = ent.name

    source = {mol_id2entity_name[mol_id]: v for mol_id, v in values["source"].items()
              if mol_id in mol_id2entity_name}

    # assign digital entity names
    mapper = assign_digital_entity_names(st)

    info_map = dict(st.info)
    pdb_code = info_map.get("_entry.id", "").lower()
    info = dict(description={mapper.get(k, k): v for k, v in description.items()},
                source={mapper.get(k, k): v for k, v in source.items()},
                resolution=st.resolution,
                pdb_id=pdb_code if gemmi.is_pdb_code(pdb_code) else "",
                method=info_map.get("_exptl.method", "").lower(),
                deposition_date=info_map.get("_pdbx_database_status.recvd_initial_deposition_date", ""),
                title=info_map.get("_struct.title", ""),
                )
    return dict(structure=st, info=info)


def assign_digital_entity_names(structure: gemmi.Structure) -> Optional[Dict[str, str]]:
    """
    :param structure:
    :return:
        dict, original entity name to new digital entity name
    """
    all_digit_name = np.all([ent.name.isdigit() for ent in structure.entities])

    mapper = dict()
    if not all_digit_name:
        for ix, ent in enumerate(structure.entities):
            new_name = str(ix + 1)
            mapper[ent.name] = new_name
            ent.name = new_name
    return mapper
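A minimal usage sketch of these new module-level helpers (the input file name is hypothetical; both parse functions return a dict with "structure" and "info" keys, as defined above):

    >>> from gemmi_protools.io.reader import is_cif, is_pdb, parse_cif, parse_pdb
    >>> path = "1abc.cif.gz"                                  # hypothetical input file
    >>> parsed = parse_cif(path) if is_cif(path) else parse_pdb(path)
    >>> st, info = parsed["structure"], parsed["info"]        # gemmi.Structure and metadata dict
    >>> info["resolution"], info["method"], sorted(info["description"])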

class StructureParser(object):
    """
    Structure reader for .cif, .cif.gz, .pdb or .pdb.gz

    Read the first model
    """

    def __init__(self, structure: Optional[gemmi.Structure] = None):
        if not isinstance(structure, (type(None), gemmi.Structure)):
            raise ValueError("structure must be gemmi.Structure or None")
        if structure is None:
            # init with an empty model
            self.STRUCT = gemmi.Structure()
            self.MODEL = gemmi.Model(1)
            self.STRUCT.add_model(self.MODEL)
        elif isinstance(structure, gemmi.Structure):
            self.STRUCT = structure.clone()
        else:
            raise ValueError("structure must be gemmi.Structure or None")

        self._init_struct()

        info_map = dict(self.STRUCT.info)
        pdb_code = info_map.get("_entry.id", "").lower()
        self.INFO = dict(description=dict(),
                         source=dict(),
                         resolution=self.STRUCT.resolution,
                         pdb_id=pdb_code if gemmi.is_pdb_code(pdb_code) else "",
                         method=info_map.get("_exptl.method", "").lower(),
                         deposition_date=info_map.get("_pdbx_database_status.recvd_initial_deposition_date", ""),
                         title=info_map.get("_struct.title", ""),
                         )
        self.update_entity()

    def _init_struct(self):
        self.STRUCT.setup_entities()
        self.STRUCT.assign_serial_numbers()
        self.STRUCT.renumber_models()

        # keep the first model
        if len(self.STRUCT) > 1:
            for idx in reversed(list(range(1, len(self.STRUCT)))):
                del self.STRUCT[idx]

        self.MODEL = self.STRUCT[0]
        self.STRUCT.remove_alternative_conformations()
        self.STRUCT.remove_hydrogens()
        self.STRUCT.remove_empty_chains()
        self._update_full_sequences()

    def load_from_file(self, path: str):
        """
        Load model from file, default use the first model.
        :param path:
        :return:
        """
        if is_pdb(path):
            val = parse_pdb(path)
            self.STRUCT, self.INFO = val["structure"], val["info"]
        elif is_cif(path):
            val = parse_cif(path)
            self.STRUCT, self.INFO = val["structure"], val["info"]
        else:
            raise ValueError("path must be files with suffixes [ .cif, .cif.gz, .pdb or .pdb.gz]")

        self._init_struct()
        self.update_entity()

    def _update_full_sequences(self):
        for idx, ent in enumerate(self.STRUCT.entities):
            if ent.entity_type.name == "Polymer":
                self.STRUCT.entities[idx].full_sequence = [gemmi.Entity.first_mon(item) for item in ent.full_sequence]

                if len(ent.full_sequence) == 0:
                    sc = self.get_subchain(ent.subchains[0])
                    self.STRUCT.entities[idx].full_sequence = sc.extract_sequence()

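As a sketch of the constructor/loader flow shown above (the path is hypothetical):

    >>> from gemmi_protools.io.reader import StructureParser
    >>> sp = StructureParser()            # starts from an empty single-model structure
    >>> sp.load_from_file("1abc.pdb.gz")  # accepts .pdb, .pdb.gz, .cif or .cif.gz
    >>> sp.INFO["title"], sp.INFO["resolution"]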
    @property
    def chain_ids(self):
        return [ch.name for ch in self.MODEL]

    @property
    def subchain_ids(self):
        return [ch.subchain_id() for ch in self.MODEL.subchains()]

    @property
    def assembly_names(self):
        return [assem.name for assem in self.STRUCT.assemblies]

    @property
    def polymer_types(self):
        subchain_id2polymer = dict()
        for ent in self.STRUCT.entities:
            if ent.entity_type.name == "Polymer":
                for ch in ent.subchains:
                    subchain_id2polymer[ch] = ent.polymer_type

        out = dict()
        for chain in self.MODEL:
            polymer_ch = chain.get_polymer()
            seq = polymer_ch.extract_sequence()
            if seq:
                subchain_id = polymer_ch.subchain_id()
                if subchain_id in subchain_id2polymer:
                    out[chain.name] = subchain_id2polymer[subchain_id]
        return out

    def polymer_sequences(self, pdbx: bool = False):
        out = dict()
        for ch, polymer_type in self.polymer_types.items():
            polymer = self.get_chain(ch).get_polymer()
            if pdbx:
                s = gemmi.pdbx_one_letter_code(polymer.extract_sequence(), gemmi.sequence_kind(polymer_type))
            else:
                s = polymer.make_one_letter_sequence().replace("-", "")
            out[ch] = s
        return out

    def get_subchain(self, subchain_id: str):
        out = None
        for ch in self.MODEL.subchains():
            if ch.subchain_id() == subchain_id:
                out = ch
                break

        if out is None:
            raise ValueError("Sub-Chain %s not found (only [%s])" % (subchain_id, " ".join(self.subchain_ids)))

        return out

    @property
    def subchain_id_to_entity_id(self):
        return {ch: ent.name for ent in self.STRUCT.entities for ch in ent.subchains}

    @property
    def subchain_id_to_chain_id(self):
        return {sch.subchain_id(): chain.name for chain in self.MODEL for sch in chain.subchains()}

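The chain and sequence accessors above combine as follows (continuing the hypothetical sp object from the earlier sketch):

    >>> sp.chain_ids                      # e.g. ['A', 'B']
    >>> sp.polymer_types                  # chain id -> gemmi.PolymerType
    >>> sp.polymer_sequences()            # plain one-letter sequences per chain
    >>> sp.polymer_sequences(pdbx=True)   # PDBx-style one-letter codes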
    def get_chain(self, chain_id: str):
        return self.MODEL[chain_id]

    def pick_chains(self, chain_names: List[str]):
        struct = gemmi.Structure()
        struct.name = self.STRUCT.name
        model = gemmi.Model(1)
        for ch_id in chain_names:
            model.add_chain(self.get_chain(ch_id))

        struct.add_model(model)

        # add basic information
        struct.resolution = self.STRUCT.resolution

        vals = {"_exptl.method": self.INFO["method"],
                "_struct.title": "(Chains %s): " % " ".join(chain_names) + self.INFO["title"],
                "_pdbx_database_status.recvd_initial_deposition_date": self.INFO["deposition_date"],
                }
        if self.INFO["pdb_id"] != "":
            vals["_entry.id"] = self.INFO["pdb_id"]

        struct.info = gemmi.InfoMap(vals)
        new_struct = StructureParser(struct)

        new_struct.INFO["description"] = {ent.name: self.INFO["description"][ent.name]
                                          for ent in new_struct.STRUCT.entities
                                          if ent.name in self.INFO["description"]
                                          }
        new_struct.INFO["source"] = {ent.name: self.INFO["source"][ent.name]
                                     for ent in new_struct.STRUCT.entities
                                     if ent.name in self.INFO["source"]
                                     }
        return new_struct

    def _raw_marks(self):
        subchain2chain = dict()
        for chain in self.MODEL:
            for sub_chain in chain.subchains():
                subchain_id = sub_chain.subchain_id()
                subchain2chain[subchain_id] = chain.name

        entity2chains = dict()
        for ent in self.STRUCT.entities:
            val = [subchain2chain[sub_ch] for sub_ch in ent.subchains if sub_ch in subchain2chain]
            if len(val) > 0:
                entity2chains[ent.name] = val

        mol_id = 1
        n_line = 1
        compound_mol = "COMPND {n_line:>3} MOL_ID: {mol_id};"
        compound_molecule = "COMPND {n_line:>3} MOLECULE: {molecule};"
        compound_chain = "COMPND {n_line:>3} CHAIN: {chain};"

        outputs = []

        for ent in self.STRUCT.entities:
            if ent.entity_type.name == "Polymer":
                chain = ", ".join(entity2chains[ent.name])

                molecule = self.INFO["description"].get(ent.name, "")
                if n_line == 1:
                    outputs.append("COMPND MOL_ID: {mol_id};".format(mol_id=mol_id))
                else:
                    outputs.append(compound_mol.format(n_line=n_line, mol_id=mol_id))
                n_line += 1

                outputs.append(compound_molecule.format(n_line=n_line, molecule=molecule))
                n_line += 1

                outputs.append(compound_chain.format(n_line=n_line, chain=chain))
                n_line += 1

                mol_id += 1

        mol_id = 1
        n_line = 1
        source_mol = "SOURCE {n_line:>3} MOL_ID: {mol_id};"
        source_scientific = "SOURCE {n_line:>3} ORGANISM_SCIENTIFIC: {organism_scientific};"
        source_taxid = "SOURCE {n_line:>3} ORGANISM_TAXID: {organism_taxid};"

        for ent in self.STRUCT.entities:
            if ent.entity_type.name == "Polymer":
                src = self.INFO["source"].get(ent.name)
                if src is None:
                    organism_scientific, organism_taxid = "", ""
                else:
                    organism_scientific, organism_taxid = src

                if n_line == 1:
                    outputs.append("SOURCE MOL_ID: {mol_id};".format(mol_id=mol_id))
                else:
                    outputs.append(source_mol.format(n_line=n_line, mol_id=mol_id))
                n_line += 1

                outputs.append(source_scientific.format(n_line=n_line, organism_scientific=organism_scientific))
                n_line += 1

                outputs.append(source_taxid.format(n_line=n_line, organism_taxid=organism_taxid))
                n_line += 1

                mol_id += 1

        resolution_remarks = ["REMARK 2",
                              "REMARK 2 RESOLUTION. %.2f ANGSTROMS." % self.STRUCT.resolution
                              ]
        outputs.extend(resolution_remarks)
        return outputs

    def to_pdb(self, outfile: str, write_minimal_pdb=False):
        struct = self.STRUCT.clone()
        if write_minimal_pdb:
            struct.write_minimal_pdb(outfile)
        else:
            struct.raw_remarks = self._raw_marks()
            struct.write_pdb(outfile)

    @staticmethod
    def _item_index(block: gemmi.cif.Block, tag: str):
        mapper = dict()
        for idx, item in enumerate(block):
            if item.loop is not None:
                keys = item.loop.tags
                for k in keys:
                    mapper[k] = idx
            elif item.pair is not None:
                key = item.pair[0]
                mapper[key] = idx
        return mapper.get(tag)

    def to_cif(self, outfile: str):
        block = self.STRUCT.make_mmcif_block()
        #### add resolution
        # block.set_pair(tag="_refine.entry_id", value=gemmi.cif.quote(self.INFO["pdb_id"].upper()))
        # block.set_pair(tag="_refine.pdbx_refine_id", value=gemmi.cif.quote(self.INFO["method"].upper()))
        block.set_pair(tag="_refine.ls_d_res_high", value=gemmi.cif.quote(str(self.INFO["resolution"])))

        # tag_names = ["_exptl.entry_id",
        #              "_refine.entry_id", "_refine.pdbx_refine_id",
        #              "_refine.ls_d_res_high"]
        # for i in range(1, len(tag_names)):
        #     idx_1a = self._item_index(block, tag=tag_names[i])
        #     idx_2a = self._item_index(block, tag=tag_names[i - 1])
        #     block.move_item(idx_1a, idx_2a + 1)

        #### add entity description
        ta = block.find_mmcif_category(category="_entity.")
        da = pd.DataFrame(list(ta), columns=list(ta.tags))
        da["_entity.pdbx_description"] = da["_entity.id"].apply(
            lambda i: gemmi.cif.quote(self.INFO["description"].get(i, "?")))

        rows_1 = da.to_numpy().tolist()
        tags_1 = [s.replace("_entity.", "") for s in da.columns.tolist()]

        # erase
        qitem = block.find_loop_item("_entity.id")
        if isinstance(qitem, gemmi.cif.Item):
            qitem.erase()

        # add
        loop_1 = block.init_loop(prefix="_entity.", tags=tags_1)
        for r in rows_1:
            loop_1.add_row(r)

        idx_1b = self._item_index(block, tag="_entity.id")
        idx_2b = self._item_index(block, tag="_entity_poly.entity_id")

        # place _entity. before _entity_poly.
        if isinstance(idx_1b, int) and isinstance(idx_2b, int):
            block.move_item(idx_1b, idx_2b - 1)

        #### add source name and taxid
        loop_2 = block.init_loop(prefix="_entity_src_gen.", tags=["entity_id",
                                                                  "pdbx_gene_src_scientific_name",
                                                                  "pdbx_gene_src_ncbi_taxonomy_id"])

        for k, (name, taxid) in self.INFO["source"].items():
            name = name if name != "" else "?"
            taxid = taxid if taxid != "" else "?"

            loop_2.add_row([gemmi.cif.quote(k),
                            gemmi.cif.quote(name),
                            gemmi.cif.quote(taxid)]
                           )

        idx_1c = self._item_index(block, tag="_entity_src_gen.entity_id")
        idx_2c = self._item_index(block, tag="_entity_poly_seq.entity_id")
        # place _entity_src_gen. after _entity_poly_seq.
        if isinstance(idx_1c, int) and isinstance(idx_2c, int):
            block.move_item(idx_1c, idx_2c + 1)

        block.write_file(outfile)

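A short sketch of the export path shown above (output file names are hypothetical):

    >>> sub = sp.pick_chains(["A", "B"])  # new StructureParser holding only chains A and B
    >>> sub.to_pdb("AB.pdb")              # COMPND/SOURCE/REMARK 2 records come from _raw_marks()
    >>> sub.to_cif("AB.cif")              # mmCIF output with description/source/resolution re-inserted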
    def update_entity(self):
        """
        Update ENTITY, .entities .assemblies according to subchains
        :return:
        """
        subchains = self.subchain_ids

        # update .entities
        new_entities = gemmi.EntityList()
@@ -224,15 +605,9 @@ class StructureParser(object):
            ent_names.append(ent.name)
        self.STRUCT.entities = new_entities

        # update INFO
        self.INFO["description"] = {k: v for k, v in self.INFO["description"].items() if k in ent_names}
        self.INFO["source"] = {k: v for k, v in self.INFO["source"].items() if k in ent_names}

        # update .assemblies
        all_cid = self.chain_ids
@@ -262,189 +637,257 @@ class StructureParser(object):
        for dai in del_assembly_indexes:
            del self.STRUCT.assemblies[dai]

    def rename_chain(self, origin_name: str, target_name: str):
        if origin_name not in self.chain_ids:
            raise ValueError("Chain %s not found" % origin_name)

        other_chain_names = set(self.chain_ids) - {origin_name}

        if target_name in other_chain_names:
            raise ValueError("Chain %s has existed, please set a different target_name." % target_name)

        self.STRUCT.rename_chain(origin_name, target_name)

        for assembly in self.STRUCT.assemblies:
            for gen in assembly.generators:
                tmp = [target_name if c == origin_name else c for c in gen.chains]
                gen.chains = tmp

    def swap_chain_names(self, chain_name_1: str, chain_name_2: str):
        if chain_name_1 not in self.chain_ids:
            raise ValueError("Chain %s not found" % chain_name_1)
        if chain_name_2 not in self.chain_ids:
            raise ValueError("Chain %s not in found" % chain_name_2)

        flag = True
        while flag:
            characters = string.ascii_letters + string.digits
            sw_name = ''.join(random.choices(characters, k=4))
            if sw_name not in self.chain_ids:
                flag = False

        self.rename_chain(chain_name_1, sw_name)
        self.rename_chain(chain_name_2, chain_name_1)
        self.rename_chain(sw_name, chain_name_2)

    def make_one_letter_chain(self, only_uppercase: bool = True):
        uppercase_letters = list(string.ascii_uppercase)
        uppercase_letters.sort(reverse=True)

        lowercase_letters = list(string.ascii_lowercase)
        lowercase_letters.sort(reverse=True)

        digit_letters = list(string.digits)
        digit_letters.sort(reverse=True)

        if only_uppercase:
            letters = uppercase_letters
        else:
            letters = digit_letters + lowercase_letters + uppercase_letters

        if only_uppercase:
            msg = "The number of chains exceed the number of uppercase letters: %d > %d"
        else:
            msg = "The number of chains exceed the number of one-letter characters: %d > %d"

        if len(self.chain_ids) > len(letters):
            raise RuntimeError(msg % (len(self.chain_ids), len(letters)))

        # not use yet
        letters_valid = [l for l in letters if l not in self.chain_ids]
        chains2rename = [ch for ch in self.chain_ids if ch not in letters]
        mapper = {ch: letters_valid.pop() for ch in self.chain_ids if ch not in letters}

        for origin_name, target_name in mapper.items():
            self.rename_chain(origin_name, target_name)
        return mapper

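Chain renaming, sketched from the methods above:

    >>> sp.rename_chain("A", "X")
    >>> sp.swap_chain_names("H", "L")     # exchanges the two names via a random temporary chain id
    >>> sp.make_one_letter_chain()        # returns {old_chain_id: new_one_letter_id}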
705
|
+
def get_assembly(self, assembly_name: str,
|
|
706
|
+
how: gemmi.HowToNameCopiedChain = gemmi.HowToNameCopiedChain.AddNumber):
|
|
332
707
|
if assembly_name not in self.assembly_names:
|
|
333
|
-
raise ValueError("
|
|
708
|
+
raise ValueError("Assembly %s not found (only [%s])" % (assembly_name, ", ".join(self.assembly_names)))
|
|
334
709
|
|
|
335
|
-
struct
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
710
|
+
struct = self.STRUCT.clone()
|
|
711
|
+
struct.transform_to_assembly(assembly_name, how)
|
|
712
|
+
struct.info["_struct.title"] = "(Assembly %s): " % assembly_name + struct.info["_struct.title"]
|
|
713
|
+
|
|
714
|
+
new_struct = StructureParser(struct)
|
|
715
|
+
|
|
716
|
+
# find perfect match entities
|
|
717
|
+
entity_mapper = dict()
|
|
718
|
+
for new_ent in new_struct.STRUCT.entities:
|
|
719
|
+
for ent in self.STRUCT.entities:
|
|
720
|
+
if new_ent.entity_type == ent.entity_type:
|
|
721
|
+
if ent.entity_type.name == "Polymer":
|
|
722
|
+
if new_ent.full_sequence == ent.full_sequence:
|
|
723
|
+
entity_mapper[new_ent.name] = ent.name
|
|
724
|
+
break
|
|
725
|
+
else:
|
|
726
|
+
new_s = new_struct.get_subchain(new_ent.subchains[0]).extract_sequence()
|
|
727
|
+
s = self.get_subchain(ent.subchains[0]).extract_sequence()
|
|
728
|
+
if new_s == s:
|
|
729
|
+
entity_mapper[new_ent.name] = ent.name
|
|
730
|
+
break
|
|
339
731
|
|
|
340
|
-
# update
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
out.STRUCT.info = out.INFO.to_gemmi_structure_infomap()
|
|
344
|
-
return out
|
|
732
|
+
# update Info
|
|
733
|
+
desc = dict()
|
|
734
|
+
src = dict()
|
|
345
735
|
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
Merge a list of chains, target chain id is chains[0]
|
|
736
|
+
for ent in new_struct.STRUCT.entities:
|
|
737
|
+
if ent.name in entity_mapper and entity_mapper[ent.name] in self.INFO["description"]:
|
|
738
|
+
desc[ent.name] = self.INFO["description"][entity_mapper[ent.name]]
|
|
350
739
|
|
|
351
|
-
|
|
740
|
+
if ent.name in entity_mapper and entity_mapper[ent.name] in self.INFO["source"]:
|
|
741
|
+
src[ent.name] = self.INFO["source"][entity_mapper[ent.name]]
|
|
352
742
|
|
|
353
|
-
[
|
|
354
|
-
|
|
743
|
+
new_struct.INFO["description"] = desc
|
|
744
|
+
new_struct.INFO["source"] = src
|
|
745
|
+
return new_struct
|
|
746
|
+
|
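If the structure defines assemblies, the method above could be used as follows (a sketch, assuming at least one assembly exists):

    >>> sp.assembly_names
    >>> asm = sp.get_assembly(sp.assembly_names[0])   # new StructureParser for the expanded assembly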
    def clean_structure(self, remove_ligand=True):
        """
        Remove water by default

        :param remove_ligand:
        :return:
        """
        if remove_ligand:
            self.STRUCT.remove_waters()
        else:
            self.STRUCT.remove_ligands_and_waters()

        self.STRUCT.remove_empty_chains()
        self.update_entity()

    def met_to_mse(self):
        for chain in self.MODEL:
            for residue in chain:
                if residue.name == 'MET':
                    residue.name = 'MSE'
                    for atom in residue:
                        if atom.name == 'SD':
                            atom.name = 'SE'
                            atom.element = gemmi.Element('Se')

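The cleanup helpers above, as a sketch; note that, as written, remove_ligand=True removes only waters, while remove_ligand=False also strips ligands:

    >>> sp.clean_structure(remove_ligand=True)
    >>> sp.met_to_mse()    # relabel MET residues as MSE and SD atoms as SE (selenium)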
    def get_atoms(self, arg: str = "*"):
        """

        :param arg: str, "*", "/1/*//N,CA,C,O", "/1/*"
            see gemmi.Selection
        :return:
            np.ndarray
        """
        sel = gemmi.Selection(arg)
        res = []

        for model in sel.models(self.STRUCT):
            for chain in sel.chains(model):
                for residue in sel.residues(chain):
                    for atom in sel.atoms(residue):
                        val = (chain.name,
                               residue.seqid.num,
                               residue.seqid.icode,
                               residue.name,
                               atom.name,
                               atom.element.name,
                               atom.charge,
                               atom.b_iso,
                               atom.occ,
                               tuple(atom.pos.tolist()),
                               )
                        res.append(val)

        dtype = [("chain_name", "U5"),
                 ("residue_num", "i4"),
                 ("residue_icode", "U3"),
                 ("residue_name", "U5"),
                 ("atom_name", "U5"),
                 ("element", "U3"),
                 ("charge", "i1"),
                 ("b_factor", "f4"),
                 ("occupancy", "f4"),
                 ("coordinate", ("f4", (3,)))
                 ]
        return np.array(res, dtype=dtype)

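Selection-based atom extraction, as a sketch using the gemmi selection syntax shown in the docstring:

    >>> atoms = sp.get_atoms("/1/*//N,CA,C,O")        # backbone atoms of all chains in model 1
    >>> atoms["coordinate"].shape, atoms["b_factor"].mean()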
    def polymer_interface_residues(self,
                                   chains_x: List[str],
                                   chains_y: List[str],
                                   threshold: float = 4.5):
        """
        Identify PPI among protein, DNA, RNA using heavy atom distances.
        :param chains_x:
        :param chains_y:
        :param threshold:
        :return:
            PPI residues of chains_x, PPI residues of chains_y
        """
        for ch in chains_x + chains_y:
            if ch not in self.chain_ids:
                raise ValueError("Chain %s not found (only [%s])" % (ch, " ".join(self.chain_ids)))
            elif ch not in self.polymer_types:
                raise ValueError("Chain %s is not a polymer (only [%s])"
                                 % (ch, " ".join(list(self.polymer_types.keys())))
                                 )

        def ppi_atoms(struct, chains):
            # atoms for N and O of backbone and N, O, P, S of side chains, only for PPI searching
            protein_atoms = ['N', 'ND1', 'ND2', 'NE', 'NE1', 'NE2', 'NH1', 'NH2', 'NZ',
                             'O', 'OD1', 'OD2', 'OE1', 'OE2', 'OG', 'OG1', 'OH',
                             'SD', 'SG']
            xna_atoms = ['N1', 'N2', 'N3', 'N4', 'N6', 'N7', 'N9',
                         'O2', "O2'", "O3'", 'O4', "O4'", "O5'", 'O6',
                         'OP1', 'OP2', 'OP3', 'P']
            tag = "/1/%s//%s" % (",".join(chains), ",".join(protein_atoms + xna_atoms))
            z = struct.get_atoms(tag)
            return z

        query_struct = deepcopy(self)
        query_struct.clean_structure(remove_ligand=True)

        atom_x = ppi_atoms(query_struct, chains_x)
        atom_y = ppi_atoms(query_struct, chains_y)

        kd_tree_x = cKDTree(atom_x["coordinate"])
        kd_tree_y = cKDTree(atom_y["coordinate"])

        pairs = kd_tree_x.sparse_distance_matrix(kd_tree_y, threshold, output_type='coo_matrix')
        x_res = np.unique(atom_x[pairs.row][["chain_name", "residue_num", "residue_icode", "residue_name"]])
        y_res = np.unique(atom_y[pairs.col][["chain_name", "residue_num", "residue_icode", "residue_name"]])

        return x_res, y_res

    def polymer_interface_residues_all(self, ppi_threshold: float = 4.5, n_cpus: int = 4):
        """
        Identify PPI among protein, DNA, RNA using heavy atom distances between all chain pairs.

        :param ppi_threshold:
        :param n_cpus:
        :return:
        """
        chains = list(self.polymer_types.keys())
        ch_pairs = list(itertools.combinations(chains, r=2))
        ch_pairs.sort()

        def _run(ch_1, ch_2):
            key = "%s/%s" % (ch_1, ch_2)
            res_x, res_y = self.polymer_interface_residues(chains_x=[ch_1], chains_y=[ch_2], threshold=ppi_threshold)

            if len(res_x) > 0:
                vx = ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in res_x.tolist()]
                vy = ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in res_y.tolist()]
                return {key: [vx, vy]}
            else:
                return dict()

        cpu2use = max(min(n_cpus, len(ch_pairs)), 1)

        outputs = dict()
        if cpu2use == 1 or len(ch_pairs) < 50:
            for ch_1, ch_2 in ch_pairs:
                outputs.update(_run(ch_1, ch_2))
        else:
            results = Parallel(n_jobs=cpu2use)(delayed(_run)(c1, c2) for c1, c2 in ch_pairs)
            for item in results:
                outputs.update(item)
        return outputs
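A sketch of the interface search on the hypothetical sp object:

    >>> x_res, y_res = sp.polymer_interface_residues(chains_x=["A"], chains_y=["B"], threshold=4.5)
    >>> ifaces = sp.polymer_interface_residues_all(ppi_threshold=4.5, n_cpus=4)
    >>> # keys look like "A/B"; values are two lists of "chain/resnum/icode/resname" strings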