gemmi-protools 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gemmi-protools might be problematic.
- gemmi_protools/__init__.py +8 -0
- gemmi_protools/align.py +183 -0
- gemmi_protools/cif_opts.py +167 -0
- gemmi_protools/convert.py +96 -0
- gemmi_protools/dockq.py +139 -0
- gemmi_protools/parse_pdb_header.py +387 -0
- gemmi_protools/parser.py +279 -0
- gemmi_protools/pdb_opts.py +177 -0
- gemmi_protools/ppi.py +74 -0
- gemmi_protools/reader.py +371 -0
- gemmi_protools/struct_info.py +91 -0
- gemmi_protools-0.1.0.dist-info/METADATA +19 -0
- gemmi_protools-0.1.0.dist-info/RECORD +16 -0
- gemmi_protools-0.1.0.dist-info/WHEEL +5 -0
- gemmi_protools-0.1.0.dist-info/licenses/LICENSE +21 -0
- gemmi_protools-0.1.0.dist-info/top_level.txt +1 -0
gemmi_protools/parse_pdb_header.py
ADDED
@@ -0,0 +1,387 @@
#!/usr/bin/env python
# Copyright 2004 Kristian Rother.
# Revisions copyright 2004 Thomas Hamelryck.
# Revisions copyright 2024 James Krieger.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""Parse header of PDB files into a python dictionary.

Emerged from the Columba database project www.columba-db.de, original author
Kristian Rother.

Modify _parse_pdb_header_list
Don't perform lower() to chain id
By Luo Jiejian
"""

import re
from collections import defaultdict

from Bio import File


def _get_biomoltrans(inl):
    # REMARK 350
    # REMARK 350 COORDINATES FOR A COMPLETE MULTIMER REPRESENTING THE KNOWN
    # REMARK 350 BIOLOGICALLY SIGNIFICANT OLIGOMERIZATION STATE OF THE
    # REMARK 350 MOLECULE CAN BE GENERATED BY APPLYING BIOMT TRANSFORMATIONS
    # REMARK 350 GIVEN BELOW.  BOTH NON-CRYSTALLOGRAPHIC AND
    # REMARK 350 CRYSTALLOGRAPHIC OPERATIONS ARE GIVEN.
    # REMARK 350
    # REMARK 350 BIOMOLECULE: 1
    # REMARK 350 AUTHOR DETERMINED BIOLOGICAL UNIT: MONOMERIC
    # REMARK 350 APPLY THE FOLLOWING TO CHAINS: A
    # REMARK 350   BIOMT1   1  1.000000  0.000000  0.000000        0.00000
    # REMARK 350   BIOMT2   1  0.000000  1.000000  0.000000        0.00000
    # REMARK 350   BIOMT3   1  0.000000  0.000000  1.000000        0.00000
    biomolecule = defaultdict(list)
    for line in inl:
        if line.startswith("REMARK 350"):
            if line[11:23] == "BIOMOLECULE:":
                currentBiomolecule = line.split()[-1]
                applyToChains = []
            elif (
                line[11:41] == "APPLY THE FOLLOWING TO CHAINS:"
                or line[30:41] == "AND CHAINS:"
            ):
                applyToChains.extend(
                    line[41:].replace(" ", "").strip().strip(",").split(",")
                )
            elif line[13:18] == "BIOMT":
                biomt = biomolecule[currentBiomolecule]
                if line[13:19] == "BIOMT1":
                    if applyToChains == []:
                        applyToChains = biomt[0]
                    biomt.append(applyToChains)
                elif line[13:19]:
                    applyToChains = []
                biomt.append(line[23:])
    return dict(biomolecule)


def _get_journal(inl):
    # JRNL        AUTH   L.CHEN,M.DOI,F.S.MATHEWS,A.Y.CHISTOSERDOV,           2BBK   7
    journal = ""
    for line in inl:
        if re.search(r"\AJRNL", line):
            journal += line[19:72].lower()
    journal = re.sub(r"\s\s+", " ", journal)
    return journal


def _get_references(inl):
    # REMARK   1 REFERENCE 1                                        1CSE  11
    # REMARK   1  AUTH   W.BODE,E.PAPAMOKOS,D.MUSIL                  1CSE  12
    references = []
    actref = ""
    for line in inl:
        if re.search(r"\AREMARK   1", line):
            if re.search(r"\AREMARK   1 REFERENCE", line):
                if actref != "":
                    actref = re.sub(r"\s\s+", " ", actref)
                    if actref != " ":
                        references.append(actref)
                    actref = ""
            else:
                actref += line[19:72].lower()

    if actref != "":
        actref = re.sub(r"\s\s+", " ", actref)
        if actref != " ":
            references.append(actref)
    return references


# bring dates to format: 1909-01-08
def _format_date(pdb_date):
    """Convert dates from DD-Mon-YY to YYYY-MM-DD format (PRIVATE)."""
    date = ""
    year = int(pdb_date[7:])
    if year < 50:
        century = 2000
    else:
        century = 1900
    date = str(century + year) + "-"
    all_months = [
        "xxx",
        "Jan",
        "Feb",
        "Mar",
        "Apr",
        "May",
        "Jun",
        "Jul",
        "Aug",
        "Sep",
        "Oct",
        "Nov",
        "Dec",
    ]
    month = str(all_months.index(pdb_date[3:6]))
    if len(month) == 1:
        month = "0" + month
    date = date + month + "-" + pdb_date[:2]
    return date
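
# [Editorial example -- not part of the packaged file] A minimal sketch of the date
# helper above: two-digit years below 50 are interpreted as 20xx, and the month
# token is expected in the title case produced by _nice_case() (defined further down).
assert _format_date("08-Jan-09") == "2009-01-08"
assert _format_date("25-Dec-68") == "1968-12-25"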


def _chop_end_codes(line):
    """Chops lines ending with ' 1CSA 14' and the like (PRIVATE)."""
    return re.sub(r"\s\s\s\s+[\w]{4}.\s+\d*\Z", "", line)


def _chop_end_misc(line):
    """Chops lines ending with ' 14-JUL-97 1CSA' and the like (PRIVATE)."""
    return re.sub(r"\s+\d\d-\w\w\w-\d\d\s+[1-9][0-9A-Z]{3}\s*\Z", "", line)


def _nice_case(line):
    """Make A Lowercase String With Capitals (PRIVATE)."""
    line_lower = line.lower()
    s = ""
    i = 0
    nextCap = 1
    while i < len(line_lower):
        c = line_lower[i]
        if c >= "a" and c <= "z" and nextCap:
            c = c.upper()
            nextCap = 0
        elif c in " .,;:\t-_":
            nextCap = 1
        s += c
        i += 1
    return s


def parse_pdb_header(infile):
    """Return the header lines of a pdb file as a dictionary.

    Dictionary keys are: head, deposition_date, release_date, structure_method,
    resolution, structure_reference, journal_reference, author and
    compound.
    """
    header = []
    with File.as_handle(infile) as f:
        for line in f:
            record_type = line[0:6]
            if record_type in ("ATOM  ", "HETATM", "MODEL "):
                break
            header.append(line)
    return _parse_pdb_header_list(header)
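
# [Editorial example -- not part of the packaged file] A minimal usage sketch of the
# public entry point above; the helper name and the "example.pdb" path are
# hypothetical placeholders.
def _example_read_header(path="example.pdb"):
    header = parse_pdb_header(path)
    return header["idcode"], header["resolution"], header["structure_method"]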


def _parse_remark_465(line):
    """Parse missing residue remarks.

    Returns a dictionary describing the missing residue.
    The specification for REMARK 465 at
    http://www.wwpdb.org/documentation/file-format-content/format33/remarks2.html#REMARK%20465
    only gives templates, but does not say they have to be followed.
    So we assume that not all pdb-files with a REMARK 465 can be understood.

    Returns a dictionary with the following keys:
    "model", "res_name", "chain", "ssseq", "insertion"
    """
    if line:
        # Note that line has been stripped.
        assert line[0] != " " and line[-1] not in "\n ", "line has to be stripped"
    pattern = re.compile(
        r"""
        (\d+\s[\sA-Z][\sA-Z][A-Z] |   # Either model number + residue name
        [A-Z]{1,3})                   # Or only residue name with 1 (RNA) to 3 letters
        \s ([A-Za-z0-9])              # A single character chain
        \s+(-?\d+[A-Za-z]?)$          # Residue number: A digit followed by an optional
                                      # insertion code (Hetero-flags make no sense in
                                      # context with missing res)
        """,
        re.VERBOSE,
    )
    match = pattern.match(line)
    if match is None:
        return None
    residue = {}
    if " " in match.group(1):
        model, residue["res_name"] = match.group(1).split()
        residue["model"] = int(model)
    else:
        residue["model"] = None
        residue["res_name"] = match.group(1)
    residue["chain"] = match.group(2)
    try:
        residue["ssseq"] = int(match.group(3))
    except ValueError:
        residue["insertion"] = match.group(3)[-1]
        residue["ssseq"] = int(match.group(3)[:-1])
    else:
        residue["insertion"] = None
    return residue
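
# [Editorial example -- not part of the packaged file] A sketch of what the REMARK 465
# parser above returns for a typical stripped record tail.
assert _parse_remark_465("GLY A 46") == {
    "model": None,
    "res_name": "GLY",
    "chain": "A",
    "ssseq": 46,
    "insertion": None,
}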


def _parse_pdb_header_list(header):
    # database fields
    pdbh_dict = {
        "name": "",
        "head": "",
        "idcode": "",
        "deposition_date": "1909-01-08",
        "release_date": "1909-01-08",
        "structure_method": "unknown",
        "resolution": None,
        "structure_reference": "unknown",
        "journal_reference": "unknown",
        "author": "",
        "compound": {"1": {"misc": ""}},
        "source": {"1": {"misc": ""}},
        "has_missing_residues": False,
        "missing_residues": [],
        "biomoltrans": [],
    }

    pdbh_dict["structure_reference"] = _get_references(header)
    pdbh_dict["journal_reference"] = _get_journal(header)
    pdbh_dict["biomoltrans"] = _get_biomoltrans(header)
    comp_molid = "1"
    last_comp_key = "misc"
    last_src_key = "misc"

    for hh in header:
        h = re.sub(r"[\s\n\r]*\Z", "", hh)  # chop linebreaks off
        # key=re.sub("\s.+\s*","",h)
        key = h[:6].strip()
        # tail=re.sub("\A\w+\s+\d*\s*","",h)
        tail = h[10:].strip()
        # print("%s:%s" % (key, tail)

        # From here, all the keys from the header are being parsed
        if key == "TITLE":
            name = _chop_end_codes(tail).lower()
            pdbh_dict["name"] = " ".join([pdbh_dict["name"], name]).strip()
        elif key == "HEADER":
            rr = re.search(r"\d\d-\w\w\w-\d\d", tail)
            if rr is not None:
                pdbh_dict["deposition_date"] = _format_date(_nice_case(rr.group()))
            rr = re.search(r"\s+([1-9][0-9A-Z]{3})\s*\Z", tail)
            if rr is not None:
                pdbh_dict["idcode"] = rr.group(1)
            head = _chop_end_misc(tail).lower()
            pdbh_dict["head"] = head
        elif key == "COMPND":
            # LJJ
            # tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
            tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail))
            # look for E.C. numbers in COMPND lines
            rec = re.search(r"\d+\.\d+\.\d+\.\d+", tt)
            if rec:
                pdbh_dict["compound"][comp_molid]["ec_number"] = rec.group()
                tt = re.sub(r"\((e\.c\.)*\d+\.\d+\.\d+\.\d+\)", "", tt)
            tok = tt.split(":")
            if len(tok) >= 2:
                # lower ckey LJJ
                ckey = tok[0].lower()
                # ckey = tok[0]
                cval = re.sub(r"\A\s*", "", tok[1])
                if ckey == "mol_id":
                    # mol_id, keep original, usually digital string
                    pdbh_dict["compound"][cval] = {"misc": ""}
                    comp_molid = cval
                    last_comp_key = "misc"
                else:
                    # add two lines, lower all except chain value LJJ
                    if ckey != "chain":
                        cval = cval.lower()

                    pdbh_dict["compound"][comp_molid][ckey] = cval
                    last_comp_key = ckey
            else:
                # pdbh_dict["compound"][comp_molid][last_comp_key] += tok[0] + " "
                # concat and lower LJJ
                pdbh_dict["compound"][comp_molid][last_comp_key] += tok[0].lower() + " "
        elif key == "SOURCE":
            tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
            tok = tt.split(":")
            # print(tok)
            if len(tok) >= 2:
                ckey = tok[0]
                cval = re.sub(r"\A\s*", "", tok[1])
                if ckey == "mol_id":
                    pdbh_dict["source"][cval] = {"misc": ""}
                    comp_molid = cval
                    last_src_key = "misc"
                else:
                    pdbh_dict["source"][comp_molid][ckey] = cval
                    last_src_key = ckey
            else:
                pdbh_dict["source"][comp_molid][last_src_key] += tok[0] + " "
        elif key == "KEYWDS":
            kwd = _chop_end_codes(tail).lower()
            if "keywords" in pdbh_dict:
                pdbh_dict["keywords"] += " " + kwd
            else:
                pdbh_dict["keywords"] = kwd
        elif key == "EXPDTA":
            expd = _chop_end_codes(tail)
            # chop junk at end of lines for some structures
            expd = re.sub(r"\s\s\s\s\s\s\s.*\Z", "", expd)
            # if re.search('\Anmr',expd,re.IGNORECASE): expd='nmr'
            # if re.search('x-ray diffraction',expd,re.IGNORECASE): expd='x-ray diffraction'
            pdbh_dict["structure_method"] = expd.lower()
        elif key == "CAVEAT":
            # make Annotation entries out of these!!!
            pass
        elif key == "REVDAT":
            rr = re.search(r"\d\d-\w\w\w-\d\d", tail)
            if rr is not None:
                pdbh_dict["release_date"] = _format_date(_nice_case(rr.group()))
        elif key == "JRNL":
            # print("%s:%s" % (key, tail))
            if "journal" in pdbh_dict:
                pdbh_dict["journal"] += tail
            else:
                pdbh_dict["journal"] = tail
        elif key == "AUTHOR":
            auth = _nice_case(_chop_end_codes(tail))
            if "author" in pdbh_dict:
                pdbh_dict["author"] += auth
            else:
                pdbh_dict["author"] = auth
        elif key == "REMARK":
            if re.search("REMARK   2 RESOLUTION.", hh):
                r = _chop_end_codes(re.sub("REMARK   2 RESOLUTION.", "", hh))
                r = re.sub(r"\s+ANGSTROM.*", "", r)
                try:
                    pdbh_dict["resolution"] = float(r)
                except ValueError:
                    # print('nonstandard resolution %r' % r)
                    pdbh_dict["resolution"] = None
            elif hh.startswith("REMARK 465"):
                if tail:
                    pdbh_dict["has_missing_residues"] = True
                    missing_res_info = _parse_remark_465(tail)
                    if missing_res_info:
                        pdbh_dict["missing_residues"].append(missing_res_info)
            elif hh.startswith("REMARK  99 ASTRAL"):
                if tail:
                    remark_99_keyval = tail.replace("ASTRAL ", "").split(": ")
                    if (
                        isinstance(remark_99_keyval, list)
                        and len(remark_99_keyval) == 2
                    ):
                        if "astral" not in pdbh_dict:
                            pdbh_dict["astral"] = {
                                remark_99_keyval[0]: remark_99_keyval[1]
                            }
                        else:
                            pdbh_dict["astral"][remark_99_keyval[0]] = remark_99_keyval[
                                1
                            ]
        else:
            # print(key)
            pass
    if pdbh_dict["structure_method"] == "unknown":
        res = pdbh_dict["resolution"]
        if res is not None and res > 0.0:
            pdbh_dict["structure_method"] = "x-ray diffraction"
    return pdbh_dict
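
# [Editorial example -- not part of the packaged file] A sketch of the fork's stated
# change: COMPND "CHAIN" values keep their original case while other COMPND values
# are lower-cased. The header lines below are made up for illustration.
_example_header = [
    "COMPND    MOL_ID: 1;",
    "COMPND   2 MOLECULE: HEMOGLOBIN ALPHA CHAIN;",
    "COMPND   3 CHAIN: A, C;",
]
_example_info = _parse_pdb_header_list(_example_header)
assert _example_info["compound"]["1"]["molecule"] == "hemoglobin alpha chain"
assert _example_info["compound"]["1"]["chain"] == "A, C"
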
gemmi_protools/parser.py
ADDED
@@ -0,0 +1,279 @@
"""
@Author: Luo Jiejian
"""
import pathlib
from collections import defaultdict, Counter
from typing import Union, Optional, Dict, List

import gemmi
from typeguard import typechecked

from .cif_opts import _cif_entity_info, _is_cif, _get_cif_resolution
from .pdb_opts import _pdb_entity_info, _is_pdb, _get_pdb_resolution
from .struct_info import Entity


@typechecked
def _ent_from_structure(struct: gemmi.Structure) -> Entity:
    """
    Run .setup_entities() in advance
    :param struct:
    :return:
    """
    block = struct.make_mmcif_block()
    ent_info = _cif_entity_info(block)
    for ent in struct.entities:
        if ent.name not in ent_info["eid2desc"]:
            ent_info["eid2desc"][ent.name] = ent.name
    return ent_info


@typechecked
def cif_parser(path: Union[str, pathlib.Path]):
    """
    Parse .cif or .cif.gz
    :param path:
    :return: (gemmi.Structure, entity)
    """
    if _is_cif(path):
        doc = gemmi.cif.read(str(path))
        block0 = doc.sole_block()
        struct = gemmi.read_structure(str(path))
        struct.setup_entities()
        # sheet_id like 1' will get some strange errors
        # result in sheets with 0 strands
        # delete sheets with 0 strands
        # check here

        zero_sheet_ind = []
        for i, sheet in enumerate(struct.sheets):
            if len(sheet.strands) == 0:
                zero_sheet_ind.append(i)

        if zero_sheet_ind:
            zero_sheet_ind.sort(reverse=True)
            for i in zero_sheet_ind:
                del struct.sheets[i]

        # gemmi fail to parse right resolution, update here
        struct.resolution = _get_cif_resolution(block0)

        # ent information
        # from doc
        ent_0 = _cif_entity_info(block0)

        # init from struct
        ent_1 = _ent_from_structure(struct)

        # update ent_0 with ent_1
        for super_key in ["eid2desc", "polymer2eid"]:
            for key, val in ent_1[super_key].items():
                if key not in ent_0[super_key]:
                    ent_0[super_key][key] = val
        return struct, ent_0
    else:
        raise ValueError("Only support .cif or .cif.gz file, but got %s" % path)


@typechecked
def _assign_digital_entity_names(structure: gemmi.Structure) -> Optional[Dict[str, str]]:
    """
    Run .setup_entities() in advance
    :param structure:
    :return:
    """
    # rename entities' names to numbers if not
    not_digit_name = False
    for ent in structure.entities:
        if not ent.name.isdigit():
            not_digit_name = True
            break

    if not_digit_name:
        mapper = dict()
        for ix, ent in enumerate(structure.entities):
            new_name = str(ix + 1)
            mapper[ent.name] = new_name
            ent.name = new_name
        return mapper
    else:
        return None


@typechecked
def _update_entity_names(entity: Entity, mapper: Dict[str, str]):
    """
    Update entity names to new ones in eid2desc, eid2specie, eid2taxid in place.
    :param entity:
    :param mapper: {old_entity_name: new_entity_name}
    :return:
    """
    for super_key in ['eid2desc', 'eid2specie', 'eid2taxid']:
        tmp = dict()
        for key in entity[super_key]:
            tmp[mapper[key]] = entity[super_key][key]
        entity.__setattr__(super_key, tmp)


@typechecked
def pdb_parser(path: Union[str, pathlib.Path]):
    """
    Parse .pdb or .pdb.gz
    :param path:
    :return: (gemmi.Structure, entity)
    """
    if _is_pdb(path):
        struct = gemmi.read_structure(str(path))
        struct.resolution = _get_pdb_resolution(struct.raw_remarks)
        ent_0 = _pdb_entity_info(path)

        struct.setup_entities()

        # pdb have "A,B,C" chains
        # after setup, entities will merge
        block = struct.make_mmcif_block()
        ent_t = _cif_entity_info(block)
        rec = defaultdict(list)
        for cn, middle_eid in ent_t.polymer2eid.items():
            rec[middle_eid].append(cn)

        _mapper = _assign_digital_entity_names(struct)
        _mapper_n = dict()
        for middle_eid, new_eid in _mapper.items():
            old_eid = str(",".join(rec[middle_eid]))
            _mapper_n[old_eid] = new_eid

        if _mapper_n:
            _update_entity_names(ent_0, _mapper_n)

        ent_1 = _ent_from_structure(struct)

        # update ent_0 with ent_1
        for super_key in ["eid2desc", "polymer2eid"]:
            for key, val in ent_1[super_key].items():
                if key not in ent_0[super_key]:
                    ent_0[super_key][key] = val
        return struct, ent_0
    else:
        raise ValueError("Only support .pdb or .pdb.gz file, but got %s" % path)


@typechecked
def _chain_type(structure: gemmi.Structure, chain_id: str) -> str:
    out = None
    values = {"PeptideL": "protein",
              "Dna": "dna",
              "Rna": "rna"}

    for model in structure:
        for cur_chain in model:
            if cur_chain.name == chain_id:
                sc_types = set()
                for sc in cur_chain.subchains():
                    t = sc.check_polymer_type().name
                    if t != "Unknown":
                        sc_types.update({t})

                if len(sc_types) == 1:
                    out = sc_types.pop()
                else:
                    out = "Unknown"
    if out is None:
        raise RuntimeError("chain_id %s not in structure" % chain_id)
    else:
        return values.get(out, "other")


@typechecked
def _get_model_chain_names(model: gemmi.Model) -> List[str]:
    vals = []
    for ch in model:
        vals.append(ch.name)
    return vals


@typechecked
def _assert_unique_chain_names_in_models(structure: gemmi.Structure):
    for model in structure:
        names = _get_model_chain_names(model)
        nums = Counter(names)
        dup_names = [k for k, v in nums.items() if v > 1]

        if dup_names:
            raise RuntimeError("Duplicate chain names in model %d: %s" % (model.num, ",".join(dup_names)))


@typechecked
def _chain_names2one_letter(structure: gemmi.Structure, only_uppercase: bool = True) -> Dict[str, str]:
    """
    Automatically generate one letter mapper when the length of chain name > 1 or chain name is not uppercase letters

    (1) when only_uppercase is True, only supported when the number of chains of the one-model structure <= 26
    (2) when only_uppercase is False, only supported when the number of chains of the one-model structure <= 62

    If there are too many chains, make some splits or assemblies first,
    or just keep the longer chain names in .cif format.
    PDB only support the single letter chain name.
    """

    if len(structure) > 1:
        raise RuntimeError("> 1 models in structure, do nothing")

    _assert_unique_chain_names_in_models(structure)

    n_chains = len(structure[0])
    if only_uppercase:
        l1 = ['Z', 'Y', 'X', 'W', 'V', 'U', 'T', 'S', 'R', 'Q', 'P', 'O', 'N', 'M',
              'L', 'K', 'J', 'I', 'H', 'G', 'F', 'E', 'D', 'C', 'B', 'A']
        mode = "UPPERCASE"
    else:
        l1 = ['9', '8', '7', '6', '5', '4', '3', '2', '1', '0',
              'z', 'y', 'x', 'w', 'v', 'u', 't', 's', 'r', 'q',
              'p', 'o', 'n', 'm', 'l', 'k', 'j', 'i', 'h', 'g',
              'f', 'e', 'd', 'c', 'b', 'a', 'Z', 'Y', 'X', 'W',
              'V', 'U', 'T', 'S', 'R', 'Q', 'P', 'O', 'N', 'M',
              'L', 'K', 'J', 'I', 'H', 'G', 'F', 'E', 'D', 'C', 'B', 'A']
        mode = "UPPERCASE + LOWERCASE + DIGITAL"

    if n_chains > len(l1):
        raise RuntimeError("Support max %d chains under %s mode, but got %d chains in structure"
                           % (len(l1), mode, n_chains))

    existed_one_letter_ids = []
    for model in structure:
        for chain in model:
            if chain.name in l1 and chain.name not in existed_one_letter_ids:
                existed_one_letter_ids.append(chain.name)

    left_l1 = [i for i in l1 if i not in existed_one_letter_ids]

    name_mapper = dict()
    for model in structure:
        for chain in model:
            if chain.name not in l1:
                new_name = left_l1.pop()
                name_mapper[chain.name] = new_name
    return name_mapper
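
# [Editorial example -- not part of the packaged file] A sketch of building a
# single-letter renaming map for a parsed structure; the helper name and the path
# are hypothetical placeholders.
def _example_single_letter_map(path="example.cif"):
    structure, _ = cif_parser(path)
    # Maps every chain name that is not already an allowed single letter to an
    # unused uppercase letter, e.g. {"AAA": "A", "h": "B"}, depending on the
    # names already present in the structure.
    return _chain_names2one_letter(structure, only_uppercase=True)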


@typechecked
def get_assembly(structure: gemmi.Structure, assembly_name: str,
                 how: gemmi.HowToNameCopiedChain = gemmi.HowToNameCopiedChain.AddNumber):
    struct = structure.clone()
    struct.transform_to_assembly(assembly_name, how)

    # update ENTITY.polymer2eid
    scn2eid = dict()
    for ent in struct.entities:
        for scn in ent.subchains:
            scn2eid[scn] = ent.name

    polymer2eid = dict()
    for model in struct:
        for chain in model:
            for sc in chain.subchains():
                sc_t = sc.check_polymer_type().name
                if sc_t in ["PeptideL", "Dna", "Rna"]:
                    polymer2eid[chain.name] = scn2eid[sc.subchain_id()]
                    break
    return struct, polymer2eid
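
# [Editorial example -- not part of the packaged file] A sketch of expanding one
# biological assembly of a parsed structure; the assembly name "1", the helper name
# and the path are hypothetical placeholders.
def _example_assembly(path="example.cif"):
    structure, _ = cif_parser(path)
    assembly, polymer2eid = get_assembly(
        structure, "1", how=gemmi.HowToNameCopiedChain.AddNumber
    )
    return assembly, polymer2eid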