gemmi-protools 0.1.16__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gemmi_protools/__init__.py +1 -4
- gemmi_protools/io/convert.py +0 -3
- gemmi_protools/io/reader.py +749 -310
- gemmi_protools/{utils → tools}/align.py +38 -55
- gemmi_protools/tools/dockq.py +127 -0
- gemmi_protools/tools/mesh.py +95 -0
- gemmi_protools/{utils → tools}/pdb_annot.py +21 -106
- gemmi_protools-1.0.0.dist-info/METADATA +41 -0
- gemmi_protools-1.0.0.dist-info/RECORD +19 -0
- gemmi_protools/io/cif_opts.py +0 -173
- gemmi_protools/io/parse_pdb_header.py +0 -387
- gemmi_protools/io/parser.py +0 -292
- gemmi_protools/io/pdb_opts.py +0 -179
- gemmi_protools/io/peptide.py +0 -32
- gemmi_protools/io/struct_info.py +0 -91
- gemmi_protools/utils/dockq.py +0 -139
- gemmi_protools/utils/fixer.py +0 -274
- gemmi_protools/utils/ppi.py +0 -74
- gemmi_protools-0.1.16.dist-info/METADATA +0 -29
- gemmi_protools-0.1.16.dist-info/RECORD +0 -26
- /gemmi_protools/{utils → tools}/__init__.py +0 -0
- {gemmi_protools-0.1.16.dist-info → gemmi_protools-1.0.0.dist-info}/WHEEL +0 -0
- {gemmi_protools-0.1.16.dist-info → gemmi_protools-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {gemmi_protools-0.1.16.dist-info → gemmi_protools-1.0.0.dist-info}/top_level.txt +0 -0
gemmi_protools/io/reader.py
CHANGED
@@ -1,217 +1,594 @@
-
-@Author: Luo Jiejian
-"""
+import itertools
 import pathlib
+import random
 import string
-import
+from collections import defaultdict
 from copy import deepcopy
-from typing import
+from typing import Dict, Optional, List

 import gemmi
 import numpy as np
-
+import pandas as pd
+from joblib import Parallel, delayed
+from scipy.spatial import cKDTree

-
-
-
-
-
-
-
+
+def is_pdb(path: str) -> bool:
+    """
+    Check if input file is .pdb or .pdb.gz format
+    :param path:
+    :return:
+        bool
+    """
+    path = pathlib.Path(path)
+
+    if path.suffixes:
+        if path.suffixes[-1] == ".pdb":
+            return True
+        elif "".join(path.suffixes[-2:]) == ".pdb.gz":
+            return True
+        else:
+            return False
+    else:
+        return False
+
+
+def is_cif(path: str) -> bool:
+    """
+    Check if input file is .cif or .cif.gz
+    :param path:
+    :return:
+        bool
+    """
+
+    path = pathlib.Path(path)
+    if path.suffixes:
+        if path.suffixes[-1] == ".cif":
+            return True
+        elif "".join(path.suffixes[-2:]) == ".cif.gz":
+            return True
+        else:
+            return False
+    else:
+        return False
+
+
+def parse_cif(path: str) -> dict:
+    """
+    Parse CIF structure and info
+    :param path: str
+    :return:
+        dict
+    """
+    if not is_cif(path):
+        raise TypeError("Input file is not a cif file [.cif or .cif.gz]: %s" % path)
+
+    doc = gemmi.cif.Document()
+    st = gemmi.read_structure(path, save_doc=doc)
+    st.setup_entities()
+    st.assign_serial_numbers()
+    block = doc.sole_block()
+
+    def _read_src(query_block, category, name_col, taxid_col):
+        dk = pd.DataFrame(query_block.get_mmcif_category(name=category, raw=False))
+        dk[dk.isna()] = ""
+
+        if dk.shape[0] > 0 and np.all(np.isin(["entity_id", name_col, taxid_col], dk.columns)):
+            return {eid: [name, taxid]
+                    for eid, name, taxid in dk[["entity_id", name_col, taxid_col]].to_numpy()
+                    }
+        else:
+            return dict()
+
+    desc = pd.DataFrame(block.get_mmcif_category(name="_entity", raw=False))
+    desc[desc.isna()] = ""
+
+    entityid2description = dict()
+    if desc.shape[0] > 0 and np.all(np.isin(["id", "pdbx_description"], desc.columns)):
+        entityid2description = dict(zip(desc["id"], desc["pdbx_description"]))
+
+    entityid2src = dict()
+    src_1 = _read_src(block, "_entity_src_gen.",
+                      "pdbx_gene_src_scientific_name",
+                      "pdbx_gene_src_ncbi_taxonomy_id")
+    src_2 = _read_src(block, "_pdbx_entity_src_syn.",
+                      "organism_scientific",
+                      "ncbi_taxonomy_id")
+    src_3 = _read_src(block, "_entity_src_nat.",
+                      "pdbx_organism_scientific",
+                      "pdbx_ncbi_taxonomy_id")
+    entityid2src.update(src_1)
+
+    for k, v in src_2.items():
+        if k not in entityid2src:
+            entityid2src[k] = v
+
+    for k, v in src_3.items():
+        if k not in entityid2src:
+            entityid2src[k] = v
+
+    info_map = dict(st.info)
+    pdb_code = info_map.get("_entry.id", "").lower()
+    info = dict(description={k: v for k, v in entityid2description.items() if v and v != "?"},
+                source=entityid2src,
+                resolution=st.resolution,
+                pdb_id=pdb_code if gemmi.is_pdb_code(pdb_code) else "",
+                method=info_map.get("_exptl.method", "").lower(),
+                deposition_date=info_map.get("_pdbx_database_status.recvd_initial_deposition_date", ""),
+                title=info_map.get("_struct.title", "")
+                )
+    return dict(structure=st, info=info)
+
+
+def molecule_description(path: str):
+    """
+    Molecule description from PDB (.pdb or .pdb.gz)
+    :param path:
+    :return:
+    """
+    if is_pdb(path):
+        cur_path = pathlib.Path(path)
+        if cur_path.suffixes[-1] == ".pdb":
+            with open(path, "r") as text_io:
+                lines = text_io.readlines()
+        else:
+            with gzip.open(path, "rb") as gz_handle:
+                with io.TextIOWrapper(gz_handle, encoding="utf-8") as text_io:
+                    lines = text_io.readlines()
+    else:
+        raise ValueError("Only support .pdb or .pdb.gz file, but got %s" % path)
+
+    values = {"COMPND": defaultdict(dict),
+              "SOURCE": defaultdict(dict),
+              }
+
+    comp_molid = ""
+    last_comp_key = ""
+
+    for hh in lines:
+        h = hh.strip()
+        key = h[:6].strip()
+        tt = h[10:].strip().strip(";")
+
+        if key in ["COMPND", "SOURCE"]:
+            tok = tt.split(":")
+            if len(tok) >= 2:
+                ckey = tok[0].lower().strip()
+                cval = tok[1].strip()
+                if ckey == "mol_id":
+                    comp_molid = cval
+                    values[key][comp_molid] = dict()
+                else:
+                    values[key][comp_molid][ckey] = cval
+                    last_comp_key = ckey
+            else:
+                if last_comp_key != "":
+                    values[key][comp_molid][last_comp_key] += " " + tok[0].strip()
+
+    outputs = dict(description=dict(),
+                   source=dict())
+
+    ch_id2mol_id = dict()
+    for mol_id, val in values["COMPND"].items():
+        chain_str = val.get("chain", "").strip()
+        if chain_str != "":
+            chains = chain_str.split(",")
+            for ch in chains:
+                ch_id2mol_id[ch.strip()] = mol_id
+
+    for mol_id, val in values["COMPND"].items():
+        m = val.get("molecule", "").strip()
+        if m != "":
+            outputs["description"][mol_id] = m
+
+    for mol_id, val in values["SOURCE"].items():
+        name = val.get("organism_scientific", "").strip()
+        taxid = val.get("organism_taxid", "").strip()
+        if name not in ["", "?", "."] or taxid not in ["", "?", "."]:
+            outputs["source"][mol_id] = [name, taxid]
+    outputs["ch_id2mol_id"] = ch_id2mol_id
+    return outputs
+
+
+def parse_pdb(path: str) -> dict:
+    if not is_pdb(path):
+        raise TypeError("Input file is not a pdb file [.pdb or .pdb.gz]: %s" % path)
+
+    st = gemmi.read_structure(path)
+    st.setup_entities()
+    st.assign_serial_numbers()
+
+    values = molecule_description(path)
+
+    mol_id2entity_name = dict()
+    for ent in st.entities:
+        if ent.name in values["ch_id2mol_id"]:
+            mol_id = values["ch_id2mol_id"][ent.name]
+            mol_id2entity_name[mol_id] = ent.name
+
+    # replace mod_id to entity.name
+    description = {mol_id2entity_name[mol_id]: v for mol_id, v in values["description"].items()
+                   if mol_id in mol_id2entity_name}
+    # add ligand and water entity description
+    # gemmi use ligand name or water as entity name, take this as description
+    for ent in st.entities:
+        if (ent.name not in description
+                and ent.polymer_type.name == "Unknown"
+                and ent.name != ""
+                and len(ent.name) > 1):
+            description[ent.name] = ent.name
+
+    source = {mol_id2entity_name[mol_id]: v for mol_id, v in values["source"].items()
+              if mol_id in mol_id2entity_name}
+
+    # assign digital entity names
+    mapper = assign_digital_entity_names(st)
+
+    info_map = dict(st.info)
+    pdb_code = info_map.get("_entry.id", "").lower()
+    info = dict(description={mapper.get(k, k): v for k, v in description.items()},
+                source={mapper.get(k, k): v for k, v in source.items()},
+                resolution=st.resolution,
+                pdb_id=pdb_code if gemmi.is_pdb_code(pdb_code) else "",
+                method=info_map.get("_exptl.method", "").lower(),
+                deposition_date=info_map.get("_pdbx_database_status.recvd_initial_deposition_date", ""),
+                title=info_map.get("_struct.title", ""),
+                )
+    return dict(structure=st, info=info)
+
+
+def assign_digital_entity_names(structure: gemmi.Structure) -> Optional[Dict[str, str]]:
+    """
+    :param structure:
+    :return:
+        dict, original entity name to new digital entity name
+    """
+    all_digit_name = np.all([ent.name.isdigit() for ent in structure.entities])
+
+    mapper = dict()
+    if not all_digit_name:
+        for ix, ent in enumerate(structure.entities):
+            new_name = str(ix + 1)
+            mapper[ent.name] = new_name
+            ent.name = new_name
+    return mapper


 class StructureParser(object):
     """
-
+    Structure reader for .cif, .cif.gz, .pdb or .pdb.gz
+
+    Read the first model
     """

-    def __init__(self, structure: gemmi.Structure = None):
+    def __init__(self, structure: Optional[gemmi.Structure] = None):
         if not isinstance(structure, (type(None), gemmi.Structure)):
             raise ValueError("structure must be gemmi.Structure or None")
         if structure is None:
+            # init with an empty model
             self.STRUCT = gemmi.Structure()
+            self.MODEL = gemmi.Model(1)
+            self.STRUCT.add_model(self.MODEL)
         elif isinstance(structure, gemmi.Structure):
-            _assert_unique_chain_names_in_models(structure)
             self.STRUCT = structure.clone()
         else:
             raise ValueError("structure must be gemmi.Structure or None")
+
         self.STRUCT.setup_entities()
-
+        self.STRUCT.assign_serial_numbers()

-        self.
-
-
-
-        self.update_full_sequences()
-
-    def update_full_sequences(self):
-        for ent_idx, ent in enumerate(self.STRUCT.entities):
-            # get full sequence
-            full_seq = ent.full_sequence
-
-            # when missing, construct from Residues
-            if not full_seq:
-                sel_ch_id = None
-                sel_ch_len = 0
-                for ch_id, ent_id in self.ENTITY.polymer2eid.items():
-                    if ent_id == ent.name:
-                        cur_len = len(self.polymer_sequences[ch_id])
-                        if cur_len > sel_ch_len:
-                            sel_ch_id = ch_id
-                            sel_ch_len = cur_len
-
-                if sel_ch_id is not None and sel_ch_len > 0:
-                    full_seq = [r.name for r in self.STRUCT[0][sel_ch_id].get_polymer() if not r.is_water()]
-                    self.STRUCT.entities[ent_idx].full_sequence = full_seq
-
-    @typechecked
-    def load_from_file(self, path: Union[str, pathlib.PosixPath]):
-        if _is_pdb(path):
-            struct, entity = pdb_parser(path)
-        elif _is_cif(path):
-            struct, entity = cif_parser(path)
-        else:
-            raise ValueError("Only support .cif, .cif.gz, .pdb or .pdb.gz file, but got %s" % path)
+        self.STRUCT.renumber_models()
+        if len(self.STRUCT) > 1:
+            for idx in range(1, len(self.STRUCT)):
+                del self.STRUCT[idx]

-
-        self.STRUCT
-        self.
+        self.MODEL = self.STRUCT[0]
+        self.STRUCT.remove_alternative_conformations()
+        self.STRUCT.remove_hydrogens()
+        self.STRUCT.remove_empty_chains()
+        self._update_full_sequences()
+
+        info_map = dict(self.STRUCT.info)
+        pdb_code = info_map.get("_entry.id", "").lower()
+        self.INFO = dict(description=dict(),
+                         source=dict(),
+                         resolution=self.STRUCT.resolution,
+                         pdb_id=pdb_code if gemmi.is_pdb_code(pdb_code) else "",
+                         method=info_map.get("_exptl.method", "").lower(),
+                         deposition_date=info_map.get("_pdbx_database_status.recvd_initial_deposition_date", ""),
+                         title=info_map.get("_struct.title", ""),
+                         )
         self.update_entity()
-        self.update_full_sequences()
-
-    @typechecked
-    def to_pdb(self, outfile: str, write_minimal_pdb=False):
-        compound_source = _compound_source_string(self.ENTITY)
-        struct = self.STRUCT.clone()
-
-        rs = "REMARK 2 RESOLUTION. %.2f ANGSTROMS." % struct.resolution
-        resolution_remarks = ["%-80s" % "REMARK 2",
-                              "%-80s" % rs]

-
-        if write_minimal_pdb:
-            struct.write_minimal_pdb(outfile)
-        else:
-            struct.write_pdb(outfile)
-
-    @typechecked
-    def to_cif(self, outfile: str):
-        out_block = _cif_block_for_output(self.STRUCT, self.ENTITY)
-        out_block.write_file(outfile)
-
-    @property
-    def chain_ids(self):
-        vals = []
-        for m in self.STRUCT:
-            for c in m:
-                vals.append(c.name)
-        vals.sort()
-        return vals
-
-    @property
-    def model_numbers(self):
-        return [m.num for m in self.STRUCT]
-
-    @typechecked
-    def set_default_model(self, num: Optional[int] = None):
+    def load_from_file(self, path: str):
         """
-
-        :param
+        Load model from file, default use the first model.
+        :param path:
         :return:
         """
-        if
-
-
-
-
-
-        keep_model = self.STRUCT[0]
+        if is_pdb(path):
+            val = parse_pdb(path)
+            self.STRUCT, self.INFO = val["structure"], val["info"]
+        elif is_cif(path):
+            val = parse_cif(path)
+            self.STRUCT, self.INFO = val["structure"], val["info"]
         else:
-
-
-
-
+            raise ValueError("path must be files with suffixes [ .cif, .cif.gz, .pdb or .pdb.gz]")
+
+        # force to use first model when mulitple models exist
+        self.STRUCT.renumber_models()
+        if len(self.STRUCT) > 1:
+            for idx in range(1, len(self.STRUCT)):
+                del self.STRUCT[idx]
+
+        self.MODEL = self.STRUCT[0]
+        self.STRUCT.remove_alternative_conformations()
+        self.STRUCT.remove_hydrogens()
+        self.STRUCT.remove_empty_chains()
+        self._update_full_sequences()
+        self.update_entity()

-
-
+    def _update_full_sequences(self):
+        for idx, ent in enumerate(self.STRUCT.entities):
+            if ent.entity_type.name == "Polymer":
+                self.STRUCT.entities[idx].full_sequence = [gemmi.Entity.first_mon(item) for item in ent.full_sequence]

-
-
-
+                if len(ent.full_sequence) == 0:
+                    sc = self.get_subchain(ent.subchains[0])
+                    self.STRUCT.entities[idx].full_sequence = sc.extract_sequence()

-
-
+    @property
+    def chain_ids(self):
+        return [ch.name for ch in self.MODEL]

     @property
-    def
-        return
+    def subchain_ids(self):
+        return [ch.subchain_id() for ch in self.MODEL.subchains()]

     @property
     def assembly_names(self):
         return [assem.name for assem in self.STRUCT.assemblies]

     @property
-    def
-
+    def polymer_types(self):
+        subchain_id2polymer = dict()
+        for ent in self.STRUCT.entities:
+            if ent.entity_type.name == "Polymer":
+                for ch in ent.subchains:
+                    subchain_id2polymer[ch] = ent.polymer_type
+
         out = dict()
-        for
-
-
-
-
+        for chain in self.MODEL:
+            polymer_ch = chain.get_polymer()
+            seq = polymer_ch.extract_sequence()
+            if seq:
+                subchain_id = polymer_ch.subchain_id()
+                if subchain_id in subchain_id2polymer:
+                    out[chain.name] = subchain_id2polymer[subchain_id]
         return out

-
-    def polymer_residue_numbers(self):
-        cts = self.chain_types
+    def polymer_sequences(self, pdbx: bool = False):
         out = dict()
-
-        (
-
-
-
-
-
-        for chain in model:
-            ct = cts.get(chain.name, "other")
-            if ct != "other":
-                out[chain.name] = np.array([(chain.name, r.seqid.num, r.seqid.icode, r.name)
-                                            for r in chain.get_polymer()], dtype=id_type)
+        for ch, polymer_type in self.polymer_types.items():
+            polymer = self.get_chain(ch).get_polymer()
+            if pdbx:
+                s = gemmi.pdbx_one_letter_code(polymer.extract_sequence(), gemmi.sequence_kind(polymer_type))
+            else:
+                s = polymer.make_one_letter_sequence().replace("-", "")
+            out[ch] = s
         return out

-    def
-
-
-
-
-
+    def get_subchain(self, subchain_id: str):
+        out = None
+        for ch in self.MODEL.subchains():
+            if ch.subchain_id() == subchain_id:
+                out = ch
+                break
+
+        if out is None:
+            raise ValueError("Sub-Chain %s not found (only [%s])" % (subchain_id, " ".join(self.subchain_ids)))

-        out = dict()
-        for model in self.STRUCT:
-            for chain in model:
-                res_codes = []
-                for r in chain:
-                    if r.is_water():
-                        if with_water:
-                            res_codes.append(r.name)
-                    else:
-                        if polymer_only:
-                            if r.entity_type.name == "Polymer":
-                                res_codes.append(r.name)
-                        else:
-                            res_codes.append(r.name)
-                out[chain.name] = res_codes
         return out

+    def get_chain(self, chain_id: str):
+        return self.MODEL[chain_id]
+
+    def pick_chains(self, chain_names: List[str]):
+        struct = gemmi.Structure()
+        struct.name = self.STRUCT.name
+        model = gemmi.Model(1)
+        for ch_id in chain_names:
+            model.add_chain(self.get_chain(ch_id))
+
+        struct.add_model(model)
+
+        # add basic information
+        struct.resolution = self.STRUCT.resolution
+
+        vals = {"_exptl.method": self.INFO["method"],
+                "_struct.title": "(Chains %s): " % " ".join(chain_names) + self.INFO["title"],
+                "_pdbx_database_status.recvd_initial_deposition_date": self.INFO["deposition_date"],
+                }
+        if self.INFO["pdb_id"] != "":
+            vals["_entry.id"] = self.INFO["pdb_id"]
+
+        struct.info = gemmi.InfoMap(vals)
+        new_struct = StructureParser(struct)
+
+        new_struct.INFO["description"] = {ent.name: self.INFO["description"][ent.name]
+                                          for ent in new_struct.STRUCT.entities
+                                          if ent.name in self.INFO["description"]
+                                          }
+        new_struct.INFO["source"] = {ent.name: self.INFO["source"][ent.name]
+                                     for ent in new_struct.STRUCT.entities
+                                     if ent.name in self.INFO["source"]
+                                     }
+        return new_struct
+
+    def _raw_marks(self):
+        subchain2chain = dict()
+        for chain in self.MODEL:
+            for sub_chain in chain.subchains():
+                subchain_id = sub_chain.subchain_id()
+                subchain2chain[subchain_id] = chain.name
+
+        entity2chains = dict()
+        for ent in self.STRUCT.entities:
+            val = [subchain2chain[sub_ch] for sub_ch in ent.subchains if sub_ch in subchain2chain]
+            if len(val) > 0:
+                entity2chains[ent.name] = val
+
+        mol_id = 1
+        n_line = 1
+        compound_mol = "COMPND {n_line:>3} MOL_ID: {mol_id};"
+        compound_molecule = "COMPND {n_line:>3} MOLECULE: {molecule};"
+        compound_chain = "COMPND {n_line:>3} CHAIN: {chain};"
+
+        outputs = []
+
+        for ent in self.STRUCT.entities:
+            if ent.entity_type.name == "Polymer":
+                chain = ", ".join(entity2chains[ent.name])
+
+                molecule = self.INFO["description"].get(ent.name, "")
+                if n_line == 1:
+                    outputs.append("COMPND MOL_ID: {mol_id};".format(mol_id=mol_id))
+                else:
+                    outputs.append(compound_mol.format(n_line=n_line, mol_id=mol_id))
+                n_line += 1
+
+                outputs.append(compound_molecule.format(n_line=n_line, molecule=molecule))
+                n_line += 1
+
+                outputs.append(compound_chain.format(n_line=n_line, chain=chain))
+                n_line += 1
+
+                mol_id += 1
+
+        mol_id = 1
+        n_line = 1
+        source_mol = "SOURCE {n_line:>3} MOL_ID: {mol_id};"
+        source_scientific = "SOURCE {n_line:>3} ORGANISM_SCIENTIFIC: {organism_scientific};"
+        source_taxid = "SOURCE {n_line:>3} ORGANISM_TAXID: {organism_taxid};"
+
+        for ent in self.STRUCT.entities:
+            if ent.entity_type.name == "Polymer":
+                src = self.INFO["source"].get(ent.name)
+                if src is None:
+                    organism_scientific, organism_taxid = "", ""
+                else:
+                    organism_scientific, organism_taxid = src
+
+                if n_line == 1:
+                    outputs.append("SOURCE MOL_ID: {mol_id};".format(mol_id=mol_id))
+                else:
+                    outputs.append(source_mol.format(n_line=n_line, mol_id=mol_id))
+                n_line += 1
+
+                outputs.append(source_scientific.format(n_line=n_line, organism_scientific=organism_scientific))
+                n_line += 1
+
+                outputs.append(source_taxid.format(n_line=n_line, organism_taxid=organism_taxid))
+                n_line += 1
+
+                mol_id += 1
+
+        resolution_remarks = ["REMARK 2",
+                              "REMARK 2 RESOLUTION. %.2f ANGSTROMS." % self.STRUCT.resolution
+                              ]
+        outputs.extend(resolution_remarks)
+        return outputs
+
+    def to_pdb(self, outfile: str, write_minimal_pdb=False):
+        struct = self.STRUCT.clone()
+        if write_minimal_pdb:
+            struct.write_minimal_pdb(outfile)
+        else:
+            struct.raw_remarks = self._raw_marks()
+            struct.write_pdb(outfile)
+
+    @staticmethod
+    def _item_index(block: gemmi.cif.Block, tag: str):
+        mapper = dict()
+        for idx, item in enumerate(block):
+            if item.loop is not None:
+                keys = item.loop.tags
+                for k in keys:
+                    mapper[k] = idx
+            elif item.pair is not None:
+                key = item.pair[0]
+                mapper[key] = idx
+        return mapper.get(tag)
+
+    def to_cif(self, outfile: str):
+        block = self.STRUCT.make_mmcif_block()
+        #### add resolution
+        # block.set_pair(tag="_refine.entry_id", value=gemmi.cif.quote(self.INFO["pdb_id"].upper()))
+        # block.set_pair(tag="_refine.pdbx_refine_id", value=gemmi.cif.quote(self.INFO["method"].upper()))
+        block.set_pair(tag="_refine.ls_d_res_high", value=gemmi.cif.quote(str(self.INFO["resolution"])))
+
+        # tag_names = ["_exptl.entry_id",
+        #              "_refine.entry_id", "_refine.pdbx_refine_id",
+        #              "_refine.ls_d_res_high"]
+        # for i in range(1, len(tag_names)):
+        #     idx_1a = self._item_index(block, tag=tag_names[i])
+        #     idx_2a = self._item_index(block, tag=tag_names[i - 1])
+        #     block.move_item(idx_1a, idx_2a + 1)
+
+        #### add entity description
+        ta = block.find_mmcif_category(category="_entity.")
+        da = pd.DataFrame(list(ta), columns=list(ta.tags))
+        da["_entity.pdbx_description"] = da["_entity.id"].apply(
+            lambda i: gemmi.cif.quote(self.INFO["description"].get(i, "?")))
+
+        rows_1 = da.to_numpy().tolist()
+        tags_1 = [s.replace("_entity.", "") for s in da.columns.tolist()]
+
+        # erase
+        qitem = block.find_loop_item("_entity.id")
+        if isinstance(qitem, gemmi.cif.Item):
+            qitem.erase()
+
+        # add
+        loop_1 = block.init_loop(prefix="_entity.", tags=tags_1)
+        for r in rows_1:
+            loop_1.add_row(r)
+
+        idx_1b = self._item_index(block, tag="_entity.id")
+        idx_2b = self._item_index(block, tag="_entity_poly.entity_id")
+
+        # place _entity. before _entity_poly.
+        if isinstance(idx_1b, int) and isinstance(idx_2b, int):
+            block.move_item(idx_1b, idx_2b - 1)
+
+        #### add source name and taxid
+        loop_2 = block.init_loop(prefix="_entity_src_gen.", tags=["entity_id",
+                                                                  "pdbx_gene_src_scientific_name",
+                                                                  "pdbx_gene_src_ncbi_taxonomy_id"])
+
+        for k, (name, taxid) in self.INFO["source"].items():
+            name = name if name != "" else "?"
+            taxid = taxid if taxid != "" else "?"
+
+            loop_2.add_row([gemmi.cif.quote(k),
+                            gemmi.cif.quote(name),
+                            gemmi.cif.quote(taxid)]
+                           )
+
+        idx_1c = self._item_index(block, tag="_entity_src_gen.entity_id")
+        idx_2c = self._item_index(block, tag="_entity_poly_seq.entity_id")
+        # place _entity_src_gen. after _entity_poly_seq.
+        if isinstance(idx_1c, int) and isinstance(idx_2c, int):
+            block.move_item(idx_1c, idx_2c + 1)
+
+        block.write_file(outfile)
+
     def update_entity(self):
         """
         Update ENTITY, .entities .assemblies according to subchains
         :return:
         """
-        subchains =
-        for model in self.STRUCT:
-            for chain in model:
-                subchains.extend([sc.subchain_id() for sc in chain.subchains()])
+        subchains = self.subchain_ids

         # update .entities
         new_entities = gemmi.EntityList()
@@ -224,15 +601,9 @@ class StructureParser(object):
             ent_names.append(ent.name)
         self.STRUCT.entities = new_entities

-        # update
-        for
-
-            if eid not in ent_names:
-                del self.ENTITY[super_key][eid]
-
-        for cid, eid in list(self.ENTITY["polymer2eid"].items()):
-            if eid not in ent_names or cid not in self.chain_ids:
-                del self.ENTITY["polymer2eid"][cid]
+        # update INFO
+        self.INFO["description"] = {k: v for k, v in self.INFO["description"].items() if k in ent_names}
+        self.INFO["source"] = {k: v for k, v in self.INFO["source"].items() if k in ent_names}

         # update .assemblies
         all_cid = self.chain_ids
@@ -262,189 +633,257 @@ class StructureParser(object):
         for dai in del_assembly_indexes:
             del self.STRUCT.assemblies[dai]

-    @typechecked
     def rename_chain(self, origin_name: str, target_name: str):
         if origin_name not in self.chain_ids:
-            raise ValueError("
+            raise ValueError("Chain %s not found" % origin_name)
+
         other_chain_names = set(self.chain_ids) - {origin_name}

         if target_name in other_chain_names:
-            raise ValueError("
+            raise ValueError("Chain %s has existed, please set a different target_name." % target_name)

         self.STRUCT.rename_chain(origin_name, target_name)

-        # update .polymer2eid if exist
-        if origin_name in self.ENTITY.polymer2eid:
-            val = self.ENTITY.polymer2eid[origin_name]
-            del self.ENTITY.polymer2eid[origin_name]
-            self.ENTITY.polymer2eid[target_name] = val
-
-        # update .assemblies.generator.chain if exists, for .pdb loading structure
         for assembly in self.STRUCT.assemblies:
             for gen in assembly.generators:
                 tmp = [target_name if c == origin_name else c for c in gen.chains]
                 gen.chains = tmp

-
-    def switch_chain_names(self, chain_name_1: str, chain_name_2: str):
+    def swap_chain_names(self, chain_name_1: str, chain_name_2: str):
         if chain_name_1 not in self.chain_ids:
-            raise ValueError("
+            raise ValueError("Chain %s not found" % chain_name_1)
         if chain_name_2 not in self.chain_ids:
-            raise ValueError("
+            raise ValueError("Chain %s not in found" % chain_name_2)

-
-
+        flag = True
+        while flag:
+            characters = string.ascii_letters + string.digits
+            sw_name = ''.join(random.choices(characters, k=4))
+            if sw_name not in self.chain_ids:
+                flag = False

-        current_names = set(self.chain_ids)
-        l3_l = [n for n in l3 if n not in current_names]
-        sw_name = l3_l.pop()
         self.rename_chain(chain_name_1, sw_name)
         self.rename_chain(chain_name_2, chain_name_1)
         self.rename_chain(sw_name, chain_name_2)

-
-
-
+    def make_one_letter_chain(self, only_uppercase: bool = True):
+        uppercase_letters = list(string.ascii_uppercase)
+        uppercase_letters.sort(reverse=True)

-
-
-
-
-
-
-
-
-        for di in del_chain_indexes:
-            del self.STRUCT[0][di]
-        self.update_entity()
+        lowercase_letters = list(string.ascii_lowercase)
+        lowercase_letters.sort(reverse=True)
+
+        digit_letters = list(string.digits)
+        digit_letters.sort(reverse=True)
+
+        if only_uppercase:
+            letters = uppercase_letters
         else:
-
+            letters = digit_letters + lowercase_letters + uppercase_letters

-
-
-
-
+        if only_uppercase:
+            msg = "The number of chains exceed the number of uppercase letters: %d > %d"
+        else:
+            msg = "The number of chains exceed the number of one-letter characters: %d > %d"
+
+        if len(self.chain_ids) > len(letters):
+            raise RuntimeError(msg % (len(self.chain_ids), len(letters)))
+
+        # not use yet
+        letters_valid = [l for l in letters if l not in self.chain_ids]
+        chains2rename = [ch for ch in self.chain_ids if ch not in letters]
+        mapper = {ch: letters_valid.pop() for ch in self.chain_ids if ch not in letters}
+
+        for origin_name, target_name in mapper.items():
             self.rename_chain(origin_name, target_name)
-        return
+        return mapper

-
-
+    def get_assembly(self, assembly_name: str,
+                     how: gemmi.HowToNameCopiedChain = gemmi.HowToNameCopiedChain.AddNumber):
         if assembly_name not in self.assembly_names:
-            raise ValueError("
+            raise ValueError("Assembly %s not found (only [%s])" % (assembly_name, ", ".join(self.assembly_names)))

-        struct
-
-
-
+        struct = self.STRUCT.clone()
+        struct.transform_to_assembly(assembly_name, how)
+        struct.info["_struct.title"] = "(Assembly %s): " % assembly_name + struct.info["_struct.title"]
+
+        new_struct = StructureParser(struct)
+
+        # find perfect match entities
+        entity_mapper = dict()
+        for new_ent in new_struct.STRUCT.entities:
+            for ent in self.STRUCT.entities:
+                if new_ent.entity_type == ent.entity_type:
+                    if ent.entity_type.name == "Polymer":
+                        if new_ent.full_sequence == ent.full_sequence:
+                            entity_mapper[new_ent.name] = ent.name
+                            break
+                    else:
+                        new_s = new_struct.get_subchain(new_ent.subchains[0]).extract_sequence()
+                        s = self.get_subchain(ent.subchains[0]).extract_sequence()
+                        if new_s == s:
+                            entity_mapper[new_ent.name] = ent.name
+                            break

-        # update
-
-
-        out.STRUCT.info = out.INFO.to_gemmi_structure_infomap()
-        return out
+        # update Info
+        desc = dict()
+        src = dict()

-
-
-
-
+        for ent in new_struct.STRUCT.entities:
+            if ent.name in entity_mapper and entity_mapper[ent.name] in self.INFO["description"]:
+                desc[ent.name] = self.INFO["description"][entity_mapper[ent.name]]
+
+            if ent.name in entity_mapper and entity_mapper[ent.name] in self.INFO["source"]:
+                src[ent.name] = self.INFO["source"][entity_mapper[ent.name]]

-
+        new_struct.INFO["description"] = desc
+        new_struct.INFO["source"] = src
+        return new_struct

-
-
+    def clean_structure(self, remove_ligand=True):
+        """
+        Remove water by default
+
+        :param remove_ligand:
         :return:
-            GemmiLoader
         """
-
-
-                raise RuntimeError("Chain %s is not in the structure" % c)
-        if len(self.STRUCT) > 1:
-            print("Multiple models in structure, do nothing")
-        elif len(chains) < 2:
-            print("Query chains less than 2, do nothing")
-        else:
-            new_chain = gemmi.Chain(chains[0])
-            residue_index = 1
-
-            model = self.STRUCT[0]
-
-            for ch in model:
-                if ch.name in chains:
-                    for res in ch:
-                        nr = deepcopy(res)
-                        nr.seqid.icode = " "
-                        nr.seqid.num = residue_index
-                        new_chain.add_residue(nr)
-                        residue_index += 1
-
-            for c in chains:
-                self.STRUCT[0].remove_chain(c)
-
-            self.STRUCT[0].add_chain(new_chain, unique_name=True)
-
-    def get_atom_coords(self, chains: List[str], atoms: Optional[List[str]] = None):
-        for c in chains:
-            if c not in self.chain_ids:
-                warnings.warn("Chain %s is not in the structure" % c)
-
-        coord = []
-        atom_id = []
-        id_type = np.dtype([
-            ("ch_name", "U5"),
-            ("res_num", "i4"),
-            ("res_icode", "U3"),
-            ("res_name", "U5"),
-            ("atom_name", "U5")
-        ])
-
-        model = self.STRUCT[0]
-        for ch in model:
-            if ch.name in chains:
-                for res in ch:
-                    for atom in res:
-                        if atoms is None or atom.name in atoms:
-                            cur_id = (ch.name, res.seqid.num, res.seqid.icode, res.name, atom.name)
-                            cur_pos = atom.pos.tolist()
-                            coord.append(cur_pos)
-                            atom_id.append(cur_id)
-
-        if coord:
-            return np.array(coord, dtype=np.float32), np.array(atom_id, dtype=id_type)
+        if remove_ligand:
+            self.STRUCT.remove_waters()
         else:
-
-
-    def make_one_letter_sequence(self, chain_id):
-        c_type = self.chain_types[chain_id]
-        residues = self.chain_residues(polymer_only=True, with_water=False)[chain_id]
+            self.STRUCT.remove_ligands_and_waters()

-
-
-        elif c_type in ["dna", "rna"]:
-            one_letter_code = "".join([nucleic_3to1_mapper.get(r, "N") for r in residues])
-        else:
-            one_letter_code = ""
-        return one_letter_code
+        self.STRUCT.remove_empty_chains()
+        self.update_entity()

-    def
+    def met_to_mse(self):
+        for chain in self.MODEL:
+            for residue in chain:
+                if residue.name == 'MET':
+                    residue.name = 'MSE'
+                    for atom in residue:
+                        if atom.name == 'SD':
+                            atom.name = 'SE'
+                            atom.element = gemmi.Element('Se')
+
+    def get_atoms(self, arg: str = "*"):
         """
-        (1) remove_alternative_conformations
-        (2) remove_hydrogens
-        (3) remove_water
-        (4) remove_empty_chains

+        :param arg: str, "*", "/1/*//N,CA,C,O", "/1/*"
+            see gemmi.Selection
        :return:
+            np.ndarray
         """
-
-
-
+        sel = gemmi.Selection(arg)
+        res = []
+
+        for model in sel.models(self.STRUCT):
+            for chain in sel.chains(model):
+                for residue in sel.residues(chain):
+                    for atom in sel.atoms(residue):
+                        val = (chain.name,
+                               residue.seqid.num,
+                               residue.seqid.icode,
+                               residue.name,
+                               atom.name,
+                               atom.element.name,
+                               atom.charge,
+                               atom.b_iso,
+                               atom.occ,
+                               tuple(atom.pos.tolist()),
+                               )
+                        res.append(val)
+
+        dtype = [("chain_name", "U5"),
+                 ("residue_num", "i4"),
+                 ("residue_icode", "U3"),
+                 ("residue_name", "U5"),
+                 ("atom_name", "U5"),
+                 ("element", "U3"),
+                 ("charge", "i1"),
+                 ("b_factor", "f4"),
+                 ("occupancy", "f4"),
+                 ("coordinate", ("f4", (3,)))
+                 ]
+        return np.array(res, dtype=dtype)
+
+    def polymer_interface_residues(self,
+                                   chains_x: List[str],
+                                   chains_y: List[str],
+                                   threshold: float = 4.5):
+        """
+        Identify PPI among protein, DNA, RNA using heavy atom distances.
+        :param chains_x:
+        :param chains_y:
+        :param threshold:
+        :return:
+            PPI residues of chains_x, PPI residues of chains_y
+        """
+        for ch in chains_x + chains_y:
+            if ch not in self.chain_ids:
+                raise ValueError("Chain %s not found (only [%s])" % (ch, " ".join(self.chain_ids)))
+            elif ch not in self.polymer_types:
+                raise ValueError("Chain %s is not a polymer (only [%s])"
+                                 % (ch, " ".join(list(self.polymer_types.keys())))
+                                 )
+
+        def ppi_atoms(struct, chains):
+            # atoms for N and O of backbone and N, O, P, S of side chains, only for PPI searching
+            protein_atoms = ['N', 'ND1', 'ND2', 'NE', 'NE1', 'NE2', 'NH1', 'NH2', 'NZ',
+                             'O', 'OD1', 'OD2', 'OE1', 'OE2', 'OG', 'OG1', 'OH',
+                             'SD', 'SG']
+            xna_atoms = ['N1', 'N2', 'N3', 'N4', 'N6', 'N7', 'N9',
+                         'O2', "O2'", "O3'", 'O4', "O4'", "O5'", 'O6',
+                         'OP1', 'OP2', 'OP3', 'P']
+            tag = "/1/%s//%s" % (",".join(chains), ",".join(protein_atoms + xna_atoms))
+            z = struct.get_atoms(tag)
+            return z
+
+        query_struct = deepcopy(self)
+        query_struct.clean_structure(remove_ligand=True)
+
+        atom_x = ppi_atoms(query_struct, chains_x)
+        atom_y = ppi_atoms(query_struct, chains_y)
+
+        kd_tree_x = cKDTree(atom_x["coordinate"])
+        kd_tree_y = cKDTree(atom_y["coordinate"])
+
+        pairs = kd_tree_x.sparse_distance_matrix(kd_tree_y, threshold, output_type='coo_matrix')
+        x_res = np.unique(atom_x[pairs.row][["chain_name", "residue_num", "residue_icode", "residue_name"]])
+        y_res = np.unique(atom_y[pairs.col][["chain_name", "residue_num", "residue_icode", "residue_name"]])
+
+        return x_res, y_res
+
+    def polymer_interface_residues_all(self, ppi_threshold: float = 4.5, n_cpus: int = 4):
+        """
+        Identify PPI among protein, DNA, RNA using heavy atom distances between all chain pairs.

-
-
-
-
+        :param ppi_threshold:
+        :param n_cpus:
+        :return:
+        """
+        chains = list(self.polymer_types.keys())
+        ch_pairs = list(itertools.combinations(chains, r=2))
+        ch_pairs.sort()
+
+        def _run(ch_1, ch_2):
+            key = "%s/%s" % (ch_1, ch_2)
+            res_x, res_y = self.polymer_interface_residues(chains_x=[ch_1], chains_y=[ch_2], threshold=ppi_threshold)
+
+            if len(res_x) > 0:
+                vx = ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in res_x.tolist()]
+                vy = ["%s/%d/%s/%s" % (a, b, c.strip(), d) for a, b, c, d in res_y.tolist()]
+                return {key: [vx, vy]}
+            else:
+                return dict()

-
+        cpu2use = max(min(n_cpus, len(ch_pairs)), 1)

-
-
-
+        outputs = dict()
+        if cpu2use == 1 or len(ch_pairs) < 50:
+            for ch_1, ch_2 in ch_pairs:
+                outputs.update(_run(ch_1, ch_2))
+        else:
+            results = Parallel(n_jobs=cpu2use)(delayed(_run)(c1, c2) for c1, c2 in ch_pairs)
+            for item in results:
+                outputs.update(item)
+        return outputs
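
For orientation, below is a minimal usage sketch of the reader API introduced by this diff (StructureParser.load_from_file, chain_ids, polymer_sequences, polymer_interface_residues). It is not taken from the package's documentation; the import path follows the wheel layout listed above, and the file name and chain IDs are placeholders.

    # Minimal usage sketch (illustrative only, not shipped with the package).
    # Assumes the module keeps the gemmi_protools.io.reader path shown in the
    # file list above; "example.cif.gz" and chains "A"/"B" are placeholders.
    from gemmi_protools.io.reader import StructureParser

    parser = StructureParser()
    parser.load_from_file("example.cif.gz")  # accepts .cif, .cif.gz, .pdb or .pdb.gz

    print(parser.chain_ids)                  # chain names of the first model
    print(parser.INFO["method"], parser.INFO["resolution"])
    print(parser.polymer_sequences())        # one-letter sequence per polymer chain

    # Interface residues between two chain groups (4.5 A heavy-atom cutoff).
    res_x, res_y = parser.polymer_interface_residues(chains_x=["A"], chains_y=["B"])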