stjames 0.0.52__py3-none-any.whl → 0.0.53__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of stjames might be problematic. Click here for more details.
- stjames/atomium_stjames/__init__.py +5 -0
- stjames/atomium_stjames/data.py +377 -0
- stjames/atomium_stjames/mmcif.py +651 -0
- stjames/atomium_stjames/pdb.py +572 -0
- stjames/atomium_stjames/utilities.py +125 -0
- stjames/pdb.py +482 -10
- stjames/workflows/irc.py +14 -7
- {stjames-0.0.52.dist-info → stjames-0.0.53.dist-info}/METADATA +2 -1
- {stjames-0.0.52.dist-info → stjames-0.0.53.dist-info}/RECORD +12 -7
- {stjames-0.0.52.dist-info → stjames-0.0.53.dist-info}/LICENSE +0 -0
- {stjames-0.0.52.dist-info → stjames-0.0.53.dist-info}/WHEEL +0 -0
- {stjames-0.0.52.dist-info → stjames-0.0.53.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,572 @@
|
|
|
1
|
+
"""Contains functions for dealing with the .pdb file format."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from itertools import chain, groupby
|
|
6
|
+
from typing import Any, Callable
|
|
7
|
+
|
|
8
|
+
from .data import CODES
|
|
9
|
+
from .mmcif import add_secondary_structure_to_polymers
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def pdb_string_to_pdb_dict(filestring: str) -> dict[str, Any]:
    """Parse a .pdb filestring into a ``dict`` representing its record
    structure. Only lines which aren't empty are used.

    Most record types map to the list of their lines, so
    ``{"TITLE": ["TITLE line 1", "TITLE line 2"]}`` etc. The exceptions are
    the REMARK records, which are stored in a sub-dictionary keyed by remark
    number, and the structure records, which are collected into one list per
    model under the "MODEL" key.

    :param str filestring: the .pdb filestring to process.
    :rtype: ``dict``"""

    pdb_dict: dict[str, Any] = {}
    structure_heads = ("ATOM", "HETATM", "ANISOU", "MODEL", "TER", "ENDMDL")
    for raw in filestring.split("\n"):
        line = raw.rstrip()
        if not line:
            continue
        head = line[:6].rstrip()
        if head == "REMARK":
            remarks = pdb_dict.setdefault("REMARK", {})
            number = line.lstrip().split()[1]
            remarks.setdefault(number, []).append(line)
        elif head in structure_heads:
            models = pdb_dict.setdefault("MODEL", [[]])
            if head == "ENDMDL":
                # close the current model and start collecting the next one
                models.append([])
            elif head != "MODEL":
                models[-1].append(line)
        else:
            pdb_dict.setdefault(head, []).append(line)
    # drop the trailing empty model left behind by the final ENDMDL
    if "MODEL" in pdb_dict and not pdb_dict["MODEL"][-1]:
        pdb_dict["MODEL"].pop()
    return pdb_dict
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def update_dict(d: dict[str, Any], key: str, value: str) -> None:
    """Append *value* to the list stored at ``d[key]``, creating the list
    first if the key is not present.

    The dictionary is changed in place.

    :param dict d: the dictionary to update.
    :param str key: the location of the list.
    :param str value: the value to add to the list."""

    # setdefault replaces the original try/except-Exception pattern, which
    # could silently mask unrelated errors (e.g. if d[key] were not a list).
    d.setdefault(key, []).append(value)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def pdb_dict_to_data_dict(pdb_dict: dict[str, Any]) -> dict[str, Any]:
    """Converts a .pdb dictionary into an atomium data dictionary, with the
    same standard layout that the other file formats get converted into.

    :param dict pdb_dict: the .pdb dictionary.
    :rtype: ``dict``"""

    data_dict: dict[str, Any] = {
        "description": {
            "code": None, "title": None, "deposition_date": None,
            "classification": None, "keywords": [], "authors": [],
        },
        "experiment": {
            "technique": None, "source_organism": None,
            "expression_system": None, "missing_residues": [],
        },
        "quality": {"resolution": None, "rvalue": None, "rfree": None},
        "geometry": {"assemblies": [], "crystallography": {}},
        "models": [],
    }
    # each updater fills in one section of the data dictionary in place
    for updater in (
        update_description_dict, update_experiment_dict,
        update_quality_dict, update_geometry_dict, update_models_list,
    ):
        updater(pdb_dict, data_dict)
    return data_dict
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def update_description_dict(pdb_dict: dict[str, Any], data_dict: dict[str, Any]) -> None:
    """Fills in the description component of a standard atomium data
    dictionary from a .pdb dictionary.

    :param dict pdb_dict: The .pdb dictionary to read.
    :param dict data_dict: The data dictionary to update."""

    description = data_dict["description"]
    extract_header(pdb_dict, description)
    extract_title(pdb_dict, description)
    extract_keywords(pdb_dict, description)
    extract_authors(pdb_dict, description)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def update_experiment_dict(pdb_dict: dict[str, Any], data_dict: dict[str, Any]) -> None:
    """Fills in the experiment component of a standard atomium data
    dictionary from a .pdb dictionary.

    :param dict pdb_dict: The .pdb dictionary to read.
    :param dict data_dict: The data dictionary to update."""

    experiment = data_dict["experiment"]
    extract_technique(pdb_dict, experiment)
    extract_source(pdb_dict, experiment)
    extract_missing_residues(pdb_dict, experiment)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def update_quality_dict(pdb_dict: dict[str, Any], data_dict: dict[str, Any]) -> None:
    """Fills in the quality component of a standard atomium data dictionary
    from a .pdb dictionary.

    :param dict pdb_dict: The .pdb dictionary to read.
    :param dict data_dict: The data dictionary to update."""

    quality = data_dict["quality"]
    extract_resolution_remark(pdb_dict, quality)
    extract_rvalue_remark(pdb_dict, quality)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def update_geometry_dict(pdb_dict: dict[str, Any], data_dict: dict[str, Any]) -> None:
    """Fills in the geometry component of a standard atomium data dictionary
    from a .pdb dictionary.

    :param dict pdb_dict: The .pdb dictionary to read.
    :param dict data_dict: The data dictionary to update."""

    geometry = data_dict["geometry"]
    extract_assembly_remark(pdb_dict, geometry)
    extract_crystallography(pdb_dict, geometry)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def update_models_list(pdb_dict: dict[str, Any], data_dict: dict[str, Any]) -> None:
    """Creates model dictionaries in a data dictionary, one per MODEL block.

    Atoms before the final TER record of a model are treated as polymer
    atoms (keyed by chain letter); atoms at or after it go into the
    non-polymer or water sections, keyed by full residue ID.

    :param dict pdb_dict: The .pdb dictionary to read.
    :param dict data_dict: The data dictionary to update."""

    sequences = make_sequences(pdb_dict)
    secondary_structure = make_secondary_structure(pdb_dict)
    full_names = get_full_names(pdb_dict)
    for model_lines in pdb_dict["MODEL"]:
        aniso = make_aniso(model_lines)
        # index of the last TER record; atoms after it are not polymer
        last_ter = get_last_ter_line(model_lines)
        model: dict[str, Any] = {"polymer": {}, "non-polymer": {}, "water": {}}
        for index, line in enumerate(model_lines):
            # "ATOM" is padded to the full 6-character record-name field
            if line[:6] in ["ATOM  ", "HETATM"]:
                # polymer molecules are keyed by chain letter (column 21);
                # non-polymer molecules use the full residue ID instead
                chain_id = line[21] if index < last_ter else id_from_line(line)
                res_id = id_from_line(line)
                if index < last_ter:
                    add_atom_to_polymer(line, model, chain_id, res_id, aniso, full_names)
                else:
                    add_atom_to_non_polymer(line, model, res_id, aniso, full_names)

        # attach the SEQRES-derived sequence to each chain (empty if absent)
        for chain_id, _chain in model["polymer"].items():
            _chain["sequence"] = sequences.get(chain_id, "")
        add_secondary_structure_to_polymers(model, secondary_structure)
        data_dict["models"].append(model)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def extract_header(pdb_dict: dict[str, Any], description_dict: dict[str, Any]) -> None:
    """Parses the HEADER record into a description ``dict``.

    The fixed columns read are 10-50 (classification), 50-59 (deposition
    date, DD-MMM-YY) and 62-66 (PDB code).

    :param dict pdb_dict: the ``dict`` to read.
    :param dict description_dict: the ``dict`` to update."""

    headers = pdb_dict.get("HEADER")
    if not headers:
        return
    header = headers[0]
    date_field, code_field = header[50:59], header[62:66]
    if date_field.strip():
        description_dict["deposition_date"] = datetime.strptime(date_field, "%d-%b-%y").date()
    if code_field.strip():
        description_dict["code"] = code_field
    if header[10:50].strip():
        description_dict["classification"] = header[10:50].strip()
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def extract_title(pdb_dict: dict[str, Any], description_dict: dict[str, Any]) -> None:
    """Parses the TITLE records into a description ``dict``.

    Continuation lines are joined into a single space-separated string
    (each line is read from column 10 onwards and stripped).

    :param dict pdb_dict: the ``dict`` to read.
    :param dict description_dict: the ``dict`` to update."""

    title_lines = pdb_dict.get("TITLE")
    if title_lines:
        description_dict["title"] = " ".join(line[10:].strip() for line in title_lines)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def extract_keywords(pdb_dict: dict[str, Any], description_dict: dict[str, Any]) -> None:
    """Parses the KEYWDS records into a description ``dict``.

    The records are merged into one string, then split on commas.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict description_dict: the ``dict`` to update."""

    keyword_lines = pdb_dict.get("KEYWDS")
    if keyword_lines:
        merged = " ".join(line[10:].strip() for line in keyword_lines)
        description_dict["keywords"] = [keyword.strip() for keyword in merged.split(",")]
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def extract_authors(pdb_dict: dict[str, Any], description_dict: dict[str, Any]) -> None:
    """Parses the AUTHOR records into a description ``dict``.

    The records are merged into one string, then split on commas.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict description_dict: the ``dict`` to update."""

    author_lines = pdb_dict.get("AUTHOR")
    if author_lines:
        merged = " ".join(line[10:].strip() for line in author_lines)
        description_dict["authors"] = [author.strip() for author in merged.split(",")]
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def extract_technique(pdb_dict: dict[str, Any], experiment_dict: dict[str, Any]) -> None:
    """Parses the EXPDTA record into an experiment ``dict``.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict experiment_dict: the ``dict`` to update."""

    expdta = pdb_dict.get("EXPDTA")
    if expdta and expdta[0].strip():
        experiment_dict["technique"] = expdta[0][6:].strip()
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def extract_source(pdb_dict: dict[str, Any], experiment_dict: dict[str, Any]) -> None:
    """Parses the SOURCE records into an experiment ``dict``, pulling out
    the scientific organism name and the expression system if present.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict experiment_dict: the ``dict`` to update."""

    source_lines = pdb_dict.get("SOURCE")
    if not source_lines:
        return
    merged = " ".join(line[10:].strip() for line in source_lines)
    patterns = {
        "source_organism": r"ORGANISM_SCIENTIFIC\: (.+?);",
        "expression_system": r"EXPRESSION_SYSTEM\: (.+?);",
    }
    for attribute, pattern in patterns.items():
        match = re.search(pattern, merged)
        if match:
            experiment_dict[attribute] = match.group(1)
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def extract_missing_residues(pdb_dict: dict[str, Any], experiment_dict: dict[str, Any]) -> None:
    """Parses REMARK 465 records into an experiment ``dict``'s list of
    missing residues. Only data rows (exactly five whitespace-separated
    fields) are used; header rows have a different field count.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict experiment_dict: the ``dict`` to update."""

    for record in pdb_dict.get("REMARK", {}).get("465", []):
        fields = record.strip().split()
        if len(fields) == 5:
            name, chain_letter, number = fields[2], fields[3], fields[4]
            experiment_dict["missing_residues"].append({"name": name, "id": f"{chain_letter}.{number}"})
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def extract_resolution_remark(pdb_dict: dict[str, Any], quality_dict: dict[str, Any]) -> None:
    """Parses REMARK 2 records into a quality ``dict``'s resolution value.

    The first REMARK 2 line whose second field parses as a float is used;
    unparseable lines (e.g. "NOT APPLICABLE") are skipped.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict quality_dict: the ``dict`` to update."""

    for remark in pdb_dict.get("REMARK", {}).get("2", []):
        fields = remark[10:].strip().split()
        try:
            quality_dict["resolution"] = float(fields[1])
        except Exception:
            continue
        break
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def extract_rvalue_remark(pdb_dict: dict[str, Any], quality_dict: dict[str, Any]) -> None:
    """Parses REMARK 3 records into a quality ``dict``'s rvalue and rfree
    values. For each attribute, the first matching line is used; a match
    whose value fails to parse as a float is left unset.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict quality_dict: the ``dict`` to update."""

    remarks = pdb_dict.get("REMARK", {}).get("3", [])
    if not remarks:
        return
    patterns = {
        "rvalue": r"R VALUE.+WORKING.+?: (.+)",
        "rfree": r"FREE R VALUE[ ]{2,}: (.+)",
    }
    for attribute, pattern in patterns.items():
        for remark in remarks:
            found = re.search(pattern, remark.strip())
            if found:
                try:
                    quality_dict[attribute] = float(found.group(1).strip())
                except ValueError:
                    pass
                break
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def extract_assembly_remark(pdb_dict: dict[str, Any], geometry_dict: dict[str, Any]) -> None:
    """Parses REMARK 350 records into a geometry ``dict``'s list of
    biological assemblies.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict geometry_dict: the ``dict`` to update."""

    remarks = pdb_dict.get("REMARK", {}).get("350")
    if not remarks:
        return
    # Split the lines into alternating runs of "BIOMOLECULE:" header lines
    # and body lines, dropping the preamble before the first header.
    runs = [list(run) for _, run in groupby(remarks, lambda line: "ECULE:" in line)][1:]
    for header_run, body_run in zip(runs[::2], runs[1::2]):
        assembly_lines = list(chain(header_run, body_run))
        geometry_dict["assemblies"].append(assembly_lines_to_assembly_dict(assembly_lines))
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def assembly_lines_to_assembly_dict(lines: list[str]) -> dict[str, Any]:
    """Takes the lines representing a single biological assembly and turns
    them into an assembly dictionary.

    :param list lines: The REMARK lines to read.
    :rtype: ``dict``"""

    assembly: dict[str, Any] = {"transformations": [], "software": None, "buried_surface_area": None, "surface_area": None, "delta_energy": None, "id": 0}
    # (regex, assembly key, converter applied to the captured value)
    patterns: list[tuple[str, str, Callable[[str], Any]]] = [
        (r"(.+)SOFTWARE USED: (.+)", "software", lambda x: x),
        (r"(.+)BIOMOLECULE: (.+)", "id", int),
        (r"(.+)SURFACE AREA: (.+) [A-Z]", "buried_surface_area", float),
        (r"(.+)AREA OF THE COMPLEX: (.+) [A-Z]", "surface_area", float),
        (r"(.+)FREE ENERGY: (.+) [A-Z]", "delta_energy", float),
    ]
    # t is the transformation currently being accumulated; it is flushed to
    # assembly["transformations"] when a new one starts or the lines run out.
    t = None
    for line in lines:
        for pattern, key, converter in patterns:
            matches = re.findall(pattern, line)
            if matches:
                assembly[key] = converter(matches[0][1].strip())
        if "APPLY THE FOLLOWING" in line:
            # a new chain group begins: flush any finished transformation
            if t:
                assembly["transformations"].append(t)
            t = {"chains": [], "matrix": [], "vector": []}
        if "CHAINS:" in line and t:
            # chain IDs are comma-separated after the final colon
            t["chains"] += [c.strip() for c in line.split(":")[-1].strip().split(",") if c.strip()]
        if "BIOMT" in line and t:
            # each BIOMT row contributes 3 matrix values and 1 vector value;
            # a full 3x3 matrix means this row starts a new transformation
            # applying to the same chains
            values = [float(x) for x in line.split()[4:]]
            if len(t["matrix"]) == 3:
                assembly["transformations"].append(t)
                t = {"chains": t["chains"], "matrix": [], "vector": []}
            t["matrix"].append(values[:3])
            t["vector"].append(values[-1])
    # flush the final accumulated transformation, if any
    if t:
        assembly["transformations"].append(t)
    return assembly
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def extract_crystallography(pdb_dict: dict[str, Any], geometry_dict: dict[str, Any]) -> None:
    """Takes a ``dict`` and adds crystallographic information to it by
    parsing the CRYST1 record.

    The space group is read from columns 55-66, and the unit cell
    (a, b, c, alpha, beta, gamma) from the first six whitespace-separated
    values after the record name.

    :param dict pdb_dict: the ``dict`` to read.
    :param dict geometry_dict: the ``dict`` to update."""

    if pdb_dict.get("CRYST1"):
        line = pdb_dict["CRYST1"][0]
        values = line.split()
        geometry_dict["crystallography"]["space_group"] = line[55:66].strip()
        # values[0] is the "CRYST1" record name itself, so six cell
        # parameters require at least 7 tokens. The original ">= 6" check
        # let a truncated, 5-element unit cell through.
        geometry_dict["crystallography"]["unit_cell"] = [float(val) for val in values[1:7]] if len(values) >= 7 else []
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def make_sequences(pdb_dict: dict[str, Any]) -> dict[str, str]:
    """Creates a mapping of chain IDs to one-letter sequences by parsing
    SEQRES records. Residue names not found in CODES become "X".

    :param dict pdb_dict: the .pdb dictionary to read.
    :rtype: ``dict``"""

    residues_by_chain: dict[str, list[str]] = {}
    for record in pdb_dict.get("SEQRES", []):
        chain_id = record[11]
        residues_by_chain.setdefault(chain_id, []).extend(record[19:].strip().split())
    return {
        chain_id: "".join(CODES.get(residue, "X") for residue in residues)
        for chain_id, residues in residues_by_chain.items()
    }
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def inverse_make_sequences(seq: str, chain_id: str) -> list[str]:
    """Converts a one-letter sequence for a single chain back into SEQRES
    record lines (the inverse of ``make_sequences`` for one chain).

    One-letter codes without an entry in CODES are written as "UNK".

    :param str seq: the one-letter sequence to convert.
    :param str chain_id: the chain ID written into each record.
    :rtype: ``list``"""
    # Reverse CODES dictionary, mapping one-letter codes back to residue names
    REVERSE_CODES = {v: k for k, v in CODES.items()}

    seqres_lines = []
    residues = [REVERSE_CODES.get(aa, "UNK") for aa in seq]
    # SEQRES records are typically formatted into lines of up to 13 residues
    for i in range(0, len(residues), 13):
        seqres_lines.append(f"SEQRES {i // 13 + 1:>3} {chain_id} {len(seq):>4} " + " ".join(residues[i : i + 13]))

    return seqres_lines
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def make_secondary_structure(pdb_dict: dict[str, Any]) -> dict[str, Any]:
    """Creates a dictionary of helices and strands from HELIX and SHEET
    records. Each element is a [start, end] pair of residue IDs in
    "chain.numberinsertion" form.

    :param pdb_dict: the .pdb dict to read.
    :rtype: ``dict``"""

    helices = [
        [
            f"{rec[19]}.{rec[21:25].strip()}{rec[25].strip()}",
            f"{rec[31]}.{rec[33:37].strip()}{rec[37].strip() if len(rec) > 37 else ''}",
        ]
        for rec in pdb_dict.get("HELIX", [])
    ]
    strands = [
        [
            f"{rec[21]}.{rec[22:26].strip()}{rec[26].strip()}",
            f"{rec[32]}.{rec[33:37].strip()}{rec[37].strip() if len(rec) > 37 else ''}",
        ]
        for rec in pdb_dict.get("SHEET", [])
    ]
    return {"helices": helices, "strands": strands}
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def get_full_names(pdb_dict: dict[str, Any]) -> dict[str, Any]:
    """Creates a mapping of het codes to their full English names by
    parsing HETNAM records. Continuation records for the same het code are
    concatenated onto the name.

    :param pdb_dict: the .pdb dict to read.
    :rtype: ``dict``"""

    full_names: dict[str, Any] = {}
    for line in pdb_dict.get("HETNAM", []):
        code = line[11:14].strip()
        # dict.get replaces the original try/except-Exception pattern:
        # concatenate onto any existing name, otherwise start a new one.
        full_names[code] = full_names.get(code, "") + line[15:].strip()
    return full_names
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
def make_aniso(model_lines: list[str]) -> dict[int, list[float]]:
    """Creates a mapping of atom serial numbers to anisotropy values (six
    floats, scaled down by 10000) by parsing ANISOU records.

    :param list model_lines: the model's record lines to read.
    :rtype: ``dict``"""

    aniso: dict[int, list[float]] = {}
    for record in model_lines:
        if record[:6] != "ANISOU":
            continue
        serial = int(record[6:11].strip())
        # six 7-character integer fields starting at column 28
        aniso[serial] = [int(record[28 + n * 7 : 35 + n * 7]) / 10000 for n in range(6)]
    return aniso
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def get_last_ter_line(model_lines: list[str]) -> int:
    """Returns the index of the last TER record in a list of records, or 0
    if there are none.

    :param list model_lines: the lines to search.
    :rtype: ``int``"""

    for position in range(len(model_lines) - 1, -1, -1):
        if model_lines[position][:3] == "TER":
            return position
    return 0
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def id_from_line(line: str) -> str:
    """Creates a residue ID ("chain.numberinsertion") from an ATOM or
    HETATM record.

    :param str line: the ATOM or HETATM line record.
    :rtype: ``str``"""

    chain_letter, number, insertion = line[21], line[22:26].strip(), line[26].strip()
    return f"{chain_letter}.{number}{insertion}"
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def add_atom_to_polymer(line: str, model: dict[Any, Any], chain_id: str, res_id: str, aniso_dict: dict[Any, Any], full_names: dict[Any, Any]) -> None:
    """Takes an .pdb ATOM or HETATM record, converts it, and adds it to a
    polymer dictionary.

    Chain and residue entries are created on demand via an EAFP cascade:
    the first attempt assumes both already exist, the second creates the
    residue within an existing chain, and the final fallback creates the
    chain itself with this as its first residue.

    :param str line: the line to read.
    :param dict model: the model to update.
    :param str chain_id: the chain ID to add to.
    :param str res_id: the molecule ID to add to.
    :param dict aniso_dict: lookup dictionary for anisotropy information.
    :param dict full_names: lookup dictionary for full het names."""

    try:
        # Fast path: chain and residue both exist already.
        model["polymer"][chain_id]["residues"][res_id]["atoms"][int(line[6:11])] = atom_line_to_dict(line, aniso_dict)
    except Exception:
        name = line[17:20].strip()
        try:
            # The chain exists but this residue is new; number it after the
            # residues already present.
            model["polymer"][chain_id]["residues"][res_id] = {
                "name": name,
                "full_name": full_names.get(name),
                "atoms": {int(line[6:11]): atom_line_to_dict(line, aniso_dict)},
                "number": len(model["polymer"][chain_id]["residues"]) + 1,
            }
        except Exception:
            # The chain itself is new: create it with this first residue.
            # NOTE(review): this branch sets "full_name" to None even when
            # full_names has an entry, unlike the branch above — confirm
            # whether that asymmetry is intentional.
            model["polymer"][chain_id] = {
                "internal_id": chain_id,
                "helices": [],
                "strands": [],
                "residues": {
                    res_id: {
                        "name": line[17:20].strip(),
                        "atoms": {int(line[6:11]): atom_line_to_dict(line, aniso_dict)},
                        "number": 1,
                        "full_name": None,
                    }
                },
            }
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def add_atom_to_non_polymer(line: str, model: dict[Any, Any], res_id: str, aniso_dict: dict[Any, Any], full_names: dict[Any, Any]) -> None:
    """Takes an .pdb ATOM or HETATM record, converts it, and adds it to a
    non-polymer (or water) dictionary.

    :param str line: the line to read.
    :param dict model: the model to update.
    :param str res_id: the molecule ID to add to.
    :param dict aniso_dict: lookup dictionary for anisotropy information.
    :param dict full_names: lookup dictionary for full het names."""

    # Water molecules (HOH/DOD residue names) go in their own model section.
    key = "water" if line[17:20] in ["HOH", "DOD"] else "non-polymer"
    try:
        # Fast path: the molecule entry already exists, just add the atom.
        model[key][res_id]["atoms"][int(line[6:11])] = atom_line_to_dict(line, aniso_dict)
    except Exception:
        # First atom of this molecule: create its entry. Column 21 is the
        # chain letter the molecule is associated with.
        name = line[17:20].strip()
        model[key][res_id] = {
            "name": name,
            "full_name": full_names.get(name),
            "internal_id": line[21],
            "polymer": line[21],
            "atoms": {int(line[6:11]): atom_line_to_dict(line, aniso_dict)},
        }
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
def atom_line_to_dict(line: str, aniso_dict: dict[Any, Any]) -> dict[str, Any]:
    """Converts an ATOM or HETATM record to an atom dictionary.

    :param str line: the record to convert.
    :param dict aniso_dict: the anisotropy dictionary to use.
    :rtype: ``dict``"""

    serial = int(line[6:11].strip())
    atom: dict[str, Any] = {
        "occupancy": 1,
        "bvalue": None,
        "charge": 0,
        "anisotropy": aniso_dict.get(serial, None),
        "is_hetatm": line[:6] == "HETATM",
        "name": line[12:16].strip() or None,
        "alt_loc": line[16].strip() or None,
        "x": float(line[30:38].strip()),
        "y": float(line[38:46].strip()),
        "z": float(line[46:54].strip()),
    }
    occupancy_field = line[54:60].strip()
    if occupancy_field:
        atom["occupancy"] = float(occupancy_field)
    bvalue_field = line[60:66].strip()
    if bvalue_field:
        atom["bvalue"] = float(bvalue_field)
    atom["element"] = line[76:78].strip() or None
    charge_field = line[78:80].strip()
    if charge_field:
        try:
            atom["charge"] = int(charge_field)
        except Exception:
            # handle "1+" / "2-" style charges by reversing the column pair
            atom["charge"] = int(line[78:80][::-1].strip())

    # normalise falsy placeholders to None
    if atom["charge"] == 0:
        atom["charge"] = None
    if not atom["is_hetatm"]:
        atom["is_hetatm"] = None
    if not atom["alt_loc"]:
        atom["alt_loc"] = None
    return atom
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def merge_lines(lines: list[str], start: int, join: str = " ") -> str:
    """Gets a single continuous string from a sequence of record lines, by
    stripping each line from *start* onwards and joining the results.

    :param list lines: The lines to merge.
    :param int start: The start point in each record.
    :param str join: The string to join on.
    :rtype: ``str``"""

    return join.join(line[start:].strip() for line in lines)
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Contains various file handling helper functions."""
|
|
2
|
+
|
|
3
|
+
import builtins
|
|
4
|
+
import gzip
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from requests import get # type: ignore [import-untyped]
|
|
8
|
+
|
|
9
|
+
from .mmcif import mmcif_dict_to_data_dict, mmcif_string_to_mmcif_dict
|
|
10
|
+
from .pdb import pdb_dict_to_data_dict, pdb_string_to_pdb_dict
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def open(path: str, *args, **kwargs) -> Any:  # type: ignore [no-untyped-def]
    """Opens a file at a given path, works out what filetype it is, and
    parses it accordingly.

    For example:
        open('/path/to/file.pdb', data_dict=True)

    This will parse file.pdb as a .pdb file, but only go as far as
    converting it to an atomium data dictionary.

    If the file extension is .gz, the file will be unzipped first.

    :param str path: the location of the file.
    :param bool file_dict: if ``True``, parsing will stop at the file ``dict``.
    :param bool data_dict: if ``True``, parsing will stop at the data ``dict``.
    :rtype: ``File``"""

    # str() so pathlib.Path objects work throughout: the original only
    # converted for the ".gz" check and then sliced path[:-3] directly,
    # which raises TypeError for a Path.
    path_str = str(path)
    if path_str[-3:] == ".gz":
        try:
            with gzip.open(path) as f:
                filestring = f.read().decode()
        except Exception:
            # fall back to text mode for gzip members readable that way
            with gzip.open(path, "rt") as f:
                filestring = f.read()
        return parse_string(filestring, path_str[:-3], *args, **kwargs)
    else:
        try:
            with builtins.open(path) as f:
                filestring = f.read()
        except Exception:
            # fall back to binary mode for files that aren't valid text
            with builtins.open(path, "rb") as f:
                filestring = f.read()  # type: ignore [assignment]
        return parse_string(filestring, path_str, *args, **kwargs)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def fetch(code: str, *args, **kwargs) -> Any:  # type: ignore [no-untyped-def]
    """Fetches a file from a remote location via HTTP.

    If a PDB code is given, the .cif form of that structure will be fetched
    from the RCSB servers. If that code is given an extension, that file
    format will be obtained instead of .cif. If a URL is given, the function
    will simply look in that location.

    For example:
        fetch('1lol.mmtf', file_dict=True)

    This will get the .mmtf version of structure 1LOL, but only go as far as
    converting it to an atomium file dictionary.

    :param str code: the file to fetch.
    :param bool file_dict: if ``True``, parsing will stop at the file ``dict``.
    :param bool data_dict: if ``True``, parsing will stop at the data ``dict``.
    :raises ValueError: if no file is found.
    :rtype: ``File``"""

    if code.startswith("http"):
        url = code
    elif code.endswith(".mmtf"):
        url = "https://mmtf.rcsb.org/v1.0/full/{}".format(code[:-5].lower())
    else:
        if "." not in code:
            code += ".cif"
        url = "https://files.rcsb.org/view/" + code.lower()
    response = get(url, stream=True)
    if response.status_code == 200:
        # .mmtf is a binary format, so pass the raw bytes rather than text.
        # NOTE(review): get_parse_functions registers no .mmtf parser —
        # confirm .mmtf content can actually be parsed downstream.
        text = response.content if code.endswith(".mmtf") else response.text
        return parse_string(text, code, *args, **kwargs)
    raise ValueError("Could not find anything at {}".format(url))
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def parse_string(filestring: str, path: str, file_dict: bool = False, data_dict: bool = False) -> Any:
    """Takes a filestring and parses it in the appropriate way. You must
    provide the string to parse itself, and some other string that ends in
    either .cif, .mmtf, or .pdb - that will determine how the file is
    parsed.

    (If this cannot be inferred from the path string, atomium will guess
    based on the filestring contents.)

    :param str filestring: the contents of some file.
    :param str path: the filename of the file of origin.
    :param bool file_dict: if ``True``, parsing will stop at the file ``dict``.
    :param bool data_dict: if ``True``, parsing will stop at the data ``dict``.
    :rtype: ``File``"""

    to_file_dict, to_data_dict = get_parse_functions(filestring, path)
    result = to_file_dict(filestring)
    return result if file_dict else to_data_dict(result)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def get_parse_functions(filestring: str, path: str) -> Any:
    """Works out which parsing functions to use for a given filestring and
    returns them as a (string -> file dict, file dict -> data dict) pair.

    (If this cannot be inferred from the path string, atomium will guess
    based on the filestring contents.)

    :param str filestring: the filestring to inspect.
    :param str path: the path to inspect.
    :rtype: ``tuple``"""

    parsers = {
        "cif": (mmcif_string_to_mmcif_dict, mmcif_dict_to_data_dict),
        "pdb": (pdb_string_to_pdb_dict, pdb_dict_to_data_dict),
    }
    if "." in path:
        # Only extensions with a registered parser are honoured. The
        # original also accepted "mmtf" here and then raised KeyError on the
        # parser lookup, and returned None (instead of guessing) for any
        # other extension.
        ending = path.split(".")[-1]
        if ending in parsers:
            return parsers[ending]
    # No usable extension: guess from the content.
    if "_atom_sites" in filestring:
        return (mmcif_string_to_mmcif_dict, mmcif_dict_to_data_dict)
    return (pdb_string_to_pdb_dict, pdb_dict_to_data_dict)
|