stjames 0.0.52__py3-none-any.whl → 0.0.53__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of stjames might be problematic.

@@ -0,0 +1,572 @@
+ """Contains functions for dealing with the .pdb file format."""
+
+ import re
+ from datetime import datetime
+ from itertools import chain, groupby
+ from typing import Any, Callable
+
+ from .data import CODES
+ from .mmcif import add_secondary_structure_to_polymers
+
+
+ def pdb_string_to_pdb_dict(filestring: str) -> dict[str, Any]:
+     """Takes a .pdb filestring and turns it into a ``dict`` which represents its
+     record structure. Only lines which aren't empty are used.
+
+     The resultant dictionary has line types as the keys, which point to the
+     lines as their values. So ``{"TITLE": ["TITLE line 1", "TITLE line 2"]}`` etc.
+
+     The exceptions are the REMARK records, where there is a sub-dictionary with
+     REMARK numbers as keys, and the structure records themselves, which are just
+     arranged into lists - one for each model.
+
+     :param str filestring: the .pdb filestring to process.
+     :rtype: ``dict``"""
+
+     pdb_dict: dict[str, Any] = {}
+     lines_1 = list(filter(lambda l: bool(l.strip()), filestring.split("\n")))
+     lines: list[list[str]] = [[line[:6].rstrip(), line.rstrip()] for line in lines_1]
+     model_recs = ("ATOM", "HETATM", "ANISOU", "MODEL", "TER", "ENDMDL")
+     for head, line in lines:
+         if head == "REMARK":
+             if "REMARK" not in pdb_dict:
+                 pdb_dict["REMARK"] = {}
+             number = line.lstrip().split()[1]
+             update_dict(pdb_dict["REMARK"], number, line)
+         elif head in model_recs:
+             if "MODEL" not in pdb_dict:
+                 pdb_dict["MODEL"] = [[]]
+             if head == "ENDMDL":
+                 pdb_dict["MODEL"].append([])
+             elif head != "MODEL":
+                 pdb_dict["MODEL"][-1].append(line)
+         else:
+             update_dict(pdb_dict, head, line)
+     if "MODEL" in pdb_dict and not pdb_dict["MODEL"][-1]:
+         pdb_dict["MODEL"].pop()
+     return pdb_dict
+
+
+ def update_dict(d: dict[str, Any], key: str, value: str) -> None:
+     """Takes a dictionary where the values are lists, and adds a value to one of
+     the lists at the specific key. If the list doesn't exist, it creates it
+     first.
+
+     The dictionary is changed in place.
+
+     :param dict d: the dictionary to update.
+     :param str key: the location of the list.
+     :param str value: the value to add to the list."""
+
+     try:
+         d[key].append(value)
+     except Exception:
+         d[key] = [value]
+
+
+ def pdb_dict_to_data_dict(pdb_dict: dict[str, Any]) -> dict[str, Any]:
+     """Converts a .pdb dictionary into an atomium data dictionary, with the
+     same standard layout that the other file formats get converted into.
+
+     :param dict pdb_dict: the .pdb dictionary.
+     :rtype: ``dict``"""
+
+     data_dict = {
+         "description": {"code": None, "title": None, "deposition_date": None, "classification": None, "keywords": [], "authors": []},
+         "experiment": {"technique": None, "source_organism": None, "expression_system": None, "missing_residues": []},
+         "quality": {"resolution": None, "rvalue": None, "rfree": None},
+         "geometry": {"assemblies": [], "crystallography": {}},
+         "models": [],
+     }
+     update_description_dict(pdb_dict, data_dict)
+     update_experiment_dict(pdb_dict, data_dict)
+     update_quality_dict(pdb_dict, data_dict)
+     update_geometry_dict(pdb_dict, data_dict)
+     update_models_list(pdb_dict, data_dict)
+     return data_dict
+
+
+ def update_description_dict(pdb_dict: dict[str, Any], data_dict: dict[str, Any]) -> None:
+     """Creates the description component of a standard atomium data dictionary
+     from a .pdb dictionary.
+
+     :param dict pdb_dict: The .pdb dictionary to read.
+     :param dict data_dict: The data dictionary to update."""
+
+     extract_header(pdb_dict, data_dict["description"])
+     extract_title(pdb_dict, data_dict["description"])
+     extract_keywords(pdb_dict, data_dict["description"])
+     extract_authors(pdb_dict, data_dict["description"])
+
+
+ def update_experiment_dict(pdb_dict: dict[str, Any], data_dict: dict[str, Any]) -> None:
+     """Creates the experiment component of a standard atomium data dictionary
+     from a .pdb dictionary.
+
+     :param dict pdb_dict: The .pdb dictionary to read.
+     :param dict data_dict: The data dictionary to update."""
+
+     extract_technique(pdb_dict, data_dict["experiment"])
+     extract_source(pdb_dict, data_dict["experiment"])
+     extract_missing_residues(pdb_dict, data_dict["experiment"])
+
+
+ def update_quality_dict(pdb_dict: dict[str, Any], data_dict: dict[str, Any]) -> None:
+     """Creates the quality component of a standard atomium data dictionary
+     from a .pdb dictionary.
+
+     :param dict pdb_dict: The .pdb dictionary to read.
+     :param dict data_dict: The data dictionary to update."""
+
+     extract_resolution_remark(pdb_dict, data_dict["quality"])
+     extract_rvalue_remark(pdb_dict, data_dict["quality"])
+
+
+ def update_geometry_dict(pdb_dict: dict[str, Any], data_dict: dict[str, Any]) -> None:
+     """Creates the geometry component of a standard atomium data dictionary
+     from a .pdb dictionary.
+
+     :param dict pdb_dict: The .pdb dictionary to read.
+     :param dict data_dict: The data dictionary to update."""
+
+     extract_assembly_remark(pdb_dict, data_dict["geometry"])
+     extract_crystallography(pdb_dict, data_dict["geometry"])
+
+
+ def update_models_list(pdb_dict: dict[str, Any], data_dict: dict[str, Any]) -> None:
+     """Creates model dictionaries in a data dictionary.
+
+     :param dict pdb_dict: The .pdb dictionary to read.
+     :param dict data_dict: The data dictionary to update."""
+
+     sequences = make_sequences(pdb_dict)
+     secondary_structure = make_secondary_structure(pdb_dict)
+     full_names = get_full_names(pdb_dict)
+     for model_lines in pdb_dict["MODEL"]:
+         aniso = make_aniso(model_lines)
+         last_ter = get_last_ter_line(model_lines)
+         model: dict[str, Any] = {"polymer": {}, "non-polymer": {}, "water": {}}
+         for index, line in enumerate(model_lines):
+             if line[:6] in ["ATOM  ", "HETATM"]:
+                 chain_id = line[21] if index < last_ter else id_from_line(line)
+                 res_id = id_from_line(line)
+                 if index < last_ter:
+                     add_atom_to_polymer(line, model, chain_id, res_id, aniso, full_names)
+                 else:
+                     add_atom_to_non_polymer(line, model, res_id, aniso, full_names)
+
+         for chain_id, _chain in model["polymer"].items():
+             _chain["sequence"] = sequences.get(chain_id, "")
+         add_secondary_structure_to_polymers(model, secondary_structure)
+         data_dict["models"].append(model)
+
+
+ def extract_header(pdb_dict: dict[str, Any], description_dict: dict[str, Any]) -> None:
+     """Takes a ``dict`` and adds header information to it by parsing the HEADER
+     line.
+
+     :param dict pdb_dict: the ``dict`` to read.
+     :param dict description_dict: the ``dict`` to update."""
+
+     if pdb_dict.get("HEADER"):
+         line = pdb_dict["HEADER"][0]
+         if line[50:59].strip():
+             description_dict["deposition_date"] = datetime.strptime(line[50:59], "%d-%b-%y").date()
+         if line[62:66].strip():
+             description_dict["code"] = line[62:66]
+         if line[10:50].strip():
+             description_dict["classification"] = line[10:50].strip()
+
+
+ def extract_title(pdb_dict: dict[str, Any], description_dict: dict[str, Any]) -> None:
+     """Takes a ``dict`` and adds title information to it by parsing the TITLE
+     lines.
+
+     :param dict pdb_dict: the ``dict`` to read.
+     :param dict description_dict: the ``dict`` to update."""
+
+     if pdb_dict.get("TITLE"):
+         description_dict["title"] = merge_lines(pdb_dict["TITLE"], 10)
+
+
+ def extract_keywords(pdb_dict: dict[str, Any], description_dict: dict[str, Any]) -> None:
+     """Takes a ``dict`` and adds keyword information to it by parsing the KEYWDS
+     lines.
+
+     :param dict pdb_dict: the ``dict`` to read.
+     :param dict description_dict: the ``dict`` to update."""
+
+     if pdb_dict.get("KEYWDS"):
+         text = merge_lines(pdb_dict["KEYWDS"], 10)
+         description_dict["keywords"] = [w.strip() for w in text.split(",")]
+
+
+ def extract_authors(pdb_dict: dict[str, Any], description_dict: dict[str, Any]) -> None:
+     """Takes a ``dict`` and adds author information to it by parsing the AUTHOR
+     lines.
+
+     :param dict pdb_dict: the ``dict`` to read.
+     :param dict description_dict: the ``dict`` to update."""
+
+     if pdb_dict.get("AUTHOR"):
+         text = merge_lines(pdb_dict["AUTHOR"], 10)
+         description_dict["authors"] = [w.strip() for w in text.split(",")]
+
+
+ def extract_technique(pdb_dict: dict[str, Any], experiment_dict: dict[str, Any]) -> None:
+     """Takes a ``dict`` and adds technique information to it by parsing EXPDTA
+     lines.
+
+     :param dict pdb_dict: the ``dict`` to read.
+     :param dict experiment_dict: the ``dict`` to update."""
+
+     if pdb_dict.get("EXPDTA"):
+         if pdb_dict["EXPDTA"][0].strip():
+             experiment_dict["technique"] = pdb_dict["EXPDTA"][0][6:].strip()
+
+
+ def extract_source(pdb_dict: dict[str, Any], experiment_dict: dict[str, Any]) -> None:
+     """Takes a ``dict`` and adds source information to it by parsing SOURCE
+     lines.
+
+     :param dict pdb_dict: the ``dict`` to read.
+     :param dict experiment_dict: the ``dict`` to update."""
+
+     if pdb_dict.get("SOURCE"):
+         data = merge_lines(pdb_dict["SOURCE"], 10)
+         patterns = {"source_organism": r"ORGANISM_SCIENTIFIC\: (.+?);", "expression_system": r"EXPRESSION_SYSTEM\: (.+?);"}
+         for attribute, pattern in patterns.items():
+             matches = re.findall(pattern, data)
+             if matches:
+                 experiment_dict[attribute] = matches[0]
+
+
+ def extract_missing_residues(pdb_dict: dict[str, Any], experiment_dict: dict[str, Any]) -> None:
+     """Takes a ``dict`` and adds missing residue information to it by parsing
+     REMARK 465 lines.
+
+     :param dict pdb_dict: the ``dict`` to read.
+     :param dict experiment_dict: the ``dict`` to update."""
+
+     for line in pdb_dict.get("REMARK", {}).get("465", []):
+         chunks = line.strip().split()
+         if len(chunks) == 5:
+             experiment_dict["missing_residues"].append({"name": chunks[2], "id": f"{chunks[3]}.{chunks[4]}"})
+
+
+ def extract_resolution_remark(pdb_dict: dict[str, Any], quality_dict: dict[str, Any]) -> None:
+     """Takes a ``dict`` and adds resolution information to it by parsing REMARK
+     2 lines.
+
+     :param dict pdb_dict: the ``dict`` to read.
+     :param dict quality_dict: the ``dict`` to update."""
+
+     if pdb_dict.get("REMARK") and pdb_dict["REMARK"].get("2"):
+         for remark in pdb_dict["REMARK"]["2"]:
+             try:
+                 quality_dict["resolution"] = float(remark[10:].strip().split()[1])
+                 break
+             except Exception:
+                 pass
+
+
+ def extract_rvalue_remark(pdb_dict: dict[str, Any], quality_dict: dict[str, Any]) -> None:
+     """Takes a ``dict`` and adds R-value information to it by parsing REMARK
+     3 lines.
+
+     :param dict pdb_dict: the ``dict`` to read.
+     :param dict quality_dict: the ``dict`` to update."""
+
+     if pdb_dict.get("REMARK") and pdb_dict["REMARK"].get("3"):
+         patterns = {
+             "rvalue": r"R VALUE.+WORKING.+?: (.+)",
+             "rfree": r"FREE R VALUE[ ]{2,}: (.+)",
+         }
+         for attribute, pattern in patterns.items():
+             for remark in pdb_dict["REMARK"]["3"]:
+                 matches = re.findall(pattern, remark.strip())
+                 if matches:
+                     try:
+                         quality_dict[attribute] = float(matches[0].strip())
+                     except Exception:
+                         pass
+                     break
+
+
+ def extract_assembly_remark(pdb_dict: dict[str, Any], geometry_dict: dict[str, Any]) -> None:
+     """Takes a ``dict`` and adds assembly information to it by parsing REMARK
+     350 lines.
+
+     :param dict pdb_dict: the ``dict`` to read.
+     :param dict geometry_dict: the ``dict`` to update."""
+     if pdb_dict.get("REMARK") and pdb_dict["REMARK"].get("350"):
+         groups = [list(g) for k, g in groupby(pdb_dict["REMARK"]["350"], lambda x: "ECULE:" in x)][1:]
+         assemblies = [list(chain(*a)) for a in zip(groups[::2], groups[1::2])]
+         for a in assemblies:
+             geometry_dict["assemblies"].append(assembly_lines_to_assembly_dict(a))
+
+
+ def assembly_lines_to_assembly_dict(lines: list[str]) -> dict[str, Any]:
+     """Takes the lines representing a single biological assembly and turns
+     them into an assembly dictionary.
+
+     :param list lines: The REMARK lines to read.
+     :rtype: ``dict``"""
+
+     assembly: dict[str, Any] = {"transformations": [], "software": None, "buried_surface_area": None, "surface_area": None, "delta_energy": None, "id": 0}
+     patterns: list[tuple[str, str, Callable[[str], Any]]] = [
+         (r"(.+)SOFTWARE USED: (.+)", "software", lambda x: x),
+         (r"(.+)BIOMOLECULE: (.+)", "id", int),
+         (r"(.+)SURFACE AREA: (.+) [A-Z]", "buried_surface_area", float),
+         (r"(.+)AREA OF THE COMPLEX: (.+) [A-Z]", "surface_area", float),
+         (r"(.+)FREE ENERGY: (.+) [A-Z]", "delta_energy", float),
+     ]
+     t = None
+     for line in lines:
+         for pattern, key, converter in patterns:
+             matches = re.findall(pattern, line)
+             if matches:
+                 assembly[key] = converter(matches[0][1].strip())
+         if "APPLY THE FOLLOWING" in line:
+             if t:
+                 assembly["transformations"].append(t)
+             t = {"chains": [], "matrix": [], "vector": []}
+         if "CHAINS:" in line and t:
+             t["chains"] += [c.strip() for c in line.split(":")[-1].strip().split(",") if c.strip()]
+         if "BIOMT" in line and t:
+             values = [float(x) for x in line.split()[4:]]
+             if len(t["matrix"]) == 3:
+                 assembly["transformations"].append(t)
+                 t = {"chains": t["chains"], "matrix": [], "vector": []}
+             t["matrix"].append(values[:3])
+             t["vector"].append(values[-1])
+     if t:
+         assembly["transformations"].append(t)
+     return assembly
+
+
+ def extract_crystallography(pdb_dict: dict[str, Any], geometry_dict: dict[str, Any]) -> None:
+     """Takes a ``dict`` and adds crystallographic information to it by parsing
+     the CRYST1 record.
+
+     :param dict pdb_dict: the ``dict`` to read.
+     :param dict geometry_dict: the ``dict`` to update."""
+
+     if pdb_dict.get("CRYST1"):
+         line = pdb_dict["CRYST1"][0]
+         values = line.split()
+         geometry_dict["crystallography"]["space_group"] = line[55:66].strip()
+         geometry_dict["crystallography"]["unit_cell"] = [float(val) for val in values[1:7]] if len(values) >= 6 else []
+
+
+ def make_sequences(pdb_dict: dict[str, Any]) -> dict[str, str]:
+     """Creates a mapping of chain IDs to sequences, by parsing SEQRES records.
+
+     :param dict pdb_dict: the .pdb dictionary to read.
+     :rtype: ``dict``"""
+
+     seq: dict[str, Any] = {}
+     if pdb_dict.get("SEQRES"):
+         for line in pdb_dict["SEQRES"]:
+             chain, residues = line[11], line[19:].strip().split()
+             if chain not in seq:
+                 seq[chain] = []
+             seq[chain] += residues
+     return {k: "".join([CODES.get(r, "X") for r in v]) for k, v in seq.items()}
+
+
+ def inverse_make_sequences(seq: str, chain_id: str) -> list[str]:
+     """Converts a one-letter sequence back into SEQRES-format lines.
+
+     :param str seq: the one-letter sequence to convert.
+     :param str chain_id: the chain ID to write into each SEQRES record."""
+     # Reverse CODES dictionary
+     REVERSE_CODES = {v: k for k, v in CODES.items()}
+
+     seqres_lines = []
+     residues = [REVERSE_CODES.get(aa, "UNK") for aa in seq]
+     # SEQRES records are typically formatted into lines of up to 13 residues
+     for i in range(0, len(residues), 13):
+         seqres_lines.append(f"SEQRES {i // 13 + 1:>3} {chain_id} {len(seq):>4} " + " ".join(residues[i : i + 13]))
+
+     return seqres_lines
+
+
+ def make_secondary_structure(pdb_dict: dict[str, Any]) -> dict[str, Any]:
+     """Creates a dictionary of helices and strands, with each having a list of
+     start and end residues.
+
+     :param pdb_dict: the .pdb dict to read.
+     :rtype: ``dict``"""
+
+     helices, strands = [], []
+     for helix in pdb_dict.get("HELIX", []):
+         helices.append(
+             [
+                 f"{helix[19]}.{helix[21:25].strip()}{helix[25].strip()}",
+                 f"{helix[31]}.{helix[33:37].strip()}{helix[37].strip() if len(helix) > 37 else ''}",
+             ]
+         )
+     for strand in pdb_dict.get("SHEET", []):
+         strands.append(
+             [
+                 f"{strand[21]}.{strand[22:26].strip()}{strand[26].strip()}",
+                 f"{strand[32]}.{strand[33:37].strip()}{strand[37].strip() if len(strand) > 37 else ''}",
+             ]
+         )
+     return {"helices": helices, "strands": strands}
+
+
+ def get_full_names(pdb_dict: dict[str, Any]) -> dict[str, Any]:
+     """Creates a mapping of het names to full English names.
+
+     :param pdb_dict: the .pdb dict to read.
+     :rtype: ``dict``"""
+
+     full_names: dict[str, Any] = {}
+     for line in pdb_dict.get("HETNAM", []):
+         try:
+             full_names[line[11:14].strip()] += line[15:].strip()
+         except Exception:
+             full_names[line[11:14].strip()] = line[15:].strip()
+
+     return full_names
+
+
+ def make_aniso(model_lines: list[str]) -> dict[int, list[float]]:
+     """Creates a mapping of atom serial numbers to anisotropy, by parsing ANISOU records.
+
+     :param list model_lines: the model lines to read.
+     :rtype: ``dict``"""
+
+     return {int(line[6:11].strip()): [int(line[n * 7 + 28 : n * 7 + 35]) / 10000 for n in range(6)] for line in model_lines if line[:6] == "ANISOU"}
+
+
+ def get_last_ter_line(model_lines: list[str]) -> int:
+     """Gets the index of the last TER record in a list of records. 0 will be
+     returned if there are none.
+
+     :param list model_lines: the lines to search.
+     :rtype: ``int``"""
+
+     last_ter = 0
+     for index, line in enumerate(model_lines[::-1]):
+         if line[:3] == "TER":
+             last_ter = len(model_lines) - index - 1
+             break
+     return last_ter
+
+
+ def id_from_line(line: str) -> str:
+     """Creates a residue ID from an atom line.
+
+     :param str line: the ATOM or HETATM record.
+     :rtype: ``str``"""
+
+     return "{}.{}{}".format(line[21], line[22:26].strip(), line[26].strip())
+
+
+ def add_atom_to_polymer(line: str, model: dict[Any, Any], chain_id: str, res_id: str, aniso_dict: dict[Any, Any], full_names: dict[Any, Any]) -> None:
+     """Takes a .pdb ATOM or HETATM record, converts it, and adds it to a
+     polymer dictionary.
+
+     :param str line: the line to read.
+     :param dict model: the model to update.
+     :param str chain_id: the chain ID to add to.
+     :param str res_id: the molecule ID to add to.
+     :param dict aniso_dict: lookup dictionary for anisotropy information."""
+
+     try:
+         model["polymer"][chain_id]["residues"][res_id]["atoms"][int(line[6:11])] = atom_line_to_dict(line, aniso_dict)
+     except Exception:
+         name = line[17:20].strip()
+         try:
+             model["polymer"][chain_id]["residues"][res_id] = {
+                 "name": name,
+                 "full_name": full_names.get(name),
+                 "atoms": {int(line[6:11]): atom_line_to_dict(line, aniso_dict)},
+                 "number": len(model["polymer"][chain_id]["residues"]) + 1,
+             }
+         except Exception:
+             model["polymer"][chain_id] = {
+                 "internal_id": chain_id,
+                 "helices": [],
+                 "strands": [],
+                 "residues": {
+                     res_id: {
+                         "name": line[17:20].strip(),
+                         "atoms": {int(line[6:11]): atom_line_to_dict(line, aniso_dict)},
+                         "number": 1,
+                         "full_name": None,
+                     }
+                 },
+             }
+
+
+ def add_atom_to_non_polymer(line: str, model: dict[Any, Any], res_id: str, aniso_dict: dict[Any, Any], full_names: dict[Any, Any]) -> None:
+     """Takes a .pdb ATOM or HETATM record, converts it, and adds it to a
+     non-polymer dictionary.
+
+     :param str line: the line to read.
+     :param dict model: the model to update.
+     :param str res_id: the molecule ID to add to.
+     :param dict aniso_dict: lookup dictionary for anisotropy information."""
+
+     key = "water" if line[17:20] in ["HOH", "DOD"] else "non-polymer"
+     try:
+         model[key][res_id]["atoms"][int(line[6:11])] = atom_line_to_dict(line, aniso_dict)
+     except Exception:
+         name = line[17:20].strip()
+         model[key][res_id] = {
+             "name": name,
+             "full_name": full_names.get(name),
+             "internal_id": line[21],
+             "polymer": line[21],
+             "atoms": {int(line[6:11]): atom_line_to_dict(line, aniso_dict)},
+         }
+
+
+ def atom_line_to_dict(line: str, aniso_dict: dict[Any, Any]) -> dict[str, Any]:
+     """Converts an ATOM or HETATM record to an atom dictionary.
+
+     :param str line: the record to convert.
+     :param dict aniso_dict: the anisotropy dictionary to use.
+     :rtype: ``dict``"""
+
+     a = {"occupancy": 1, "bvalue": None, "charge": 0, "anisotropy": aniso_dict.get(int(line[6:11].strip()), None)}
+     a["is_hetatm"] = line[:6] == "HETATM"
+     a["name"] = line[12:16].strip() or None
+     a["alt_loc"] = line[16].strip() or None
+     a["x"] = float(line[30:38].strip())
+     a["y"] = float(line[38:46].strip())
+     a["z"] = float(line[46:54].strip())
+     if line[54:60].strip():
+         a["occupancy"] = float(line[54:60].strip())
+     if line[60:66].strip():
+         a["bvalue"] = float(line[60:66].strip())
+     a["element"] = line[76:78].strip() or None
+     if line[78:80].strip():
+         try:
+             a["charge"] = int(line[78:80].strip())
+         except Exception:
+             a["charge"] = int(line[78:80][::-1].strip())
+
+     if a["charge"] == 0:
+         a["charge"] = None
+     if not a["is_hetatm"]:
+         a["is_hetatm"] = None
+     if not a["alt_loc"]:
+         a["alt_loc"] = None
+     return a
+
+
+ def merge_lines(lines: list[str], start: int, join: str = " ") -> str:
+     """Gets a single continuous string from a sequence of lines.
+
+     :param list lines: The lines to merge.
+     :param int start: The start point in each record.
+     :param str join: The string to join on.
+     :rtype: ``str``"""
+
+     string = join.join([line[start:].strip() for line in lines])
+     return string
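The two entry points added above form a pipeline: pdb_string_to_pdb_dict groups the raw lines of a .pdb file by record type, and pdb_dict_to_data_dict reshapes that into the standard atomium-style data dictionary. A minimal usage sketch follows; the import path stjames.pdb is an assumption (the diff does not name the new module), the input records are invented for illustration, and it presumes that add_secondary_structure_to_polymers from the sibling mmcif module tolerates a structure with no HELIX or SHEET records:

    # Hypothetical usage sketch; import path assumed, input invented.
    from stjames.pdb import pdb_dict_to_data_dict, pdb_string_to_pdb_dict

    # Build one fixed-width ATOM record so the columns line up with the slices
    # read by atom_line_to_dict (serial 6-11, name 12-16, resName 17-20,
    # chainID 21, resSeq 22-26, x/y/z 30-54, occupancy 54-60, B-factor 60-66).
    atom = (
        "ATOM  " + "1".rjust(5) + " " + " N  " + " " + "ALA" + " " + "A"
        + "1".rjust(4) + " " * 4 + "11.104".rjust(8) + "13.207".rjust(8)
        + "2.100".rjust(8) + "1.00".rjust(6) + "20.00".rjust(6) + " " * 10 + " N"
    )
    pdb_text = "\n".join(["TITLE     EXAMPLE STRUCTURE", atom, "TER", "END"])

    pdb_dict = pdb_string_to_pdb_dict(pdb_text)  # keys: "TITLE", "MODEL", "END"
    data = pdb_dict_to_data_dict(pdb_dict)
    print(data["description"]["title"])          # EXAMPLE STRUCTURE
    print(list(data["models"][0]["polymer"]))    # ['A']
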
@@ -0,0 +1,125 @@
+ """Contains various file handling helper functions."""
+
+ import builtins
+ import gzip
+ from typing import Any
+
+ from requests import get  # type: ignore [import-untyped]
+
+ from .mmcif import mmcif_dict_to_data_dict, mmcif_string_to_mmcif_dict
+ from .pdb import pdb_dict_to_data_dict, pdb_string_to_pdb_dict
+
+
+ def open(path: str, *args, **kwargs) -> Any:  # type: ignore [no-untyped-def]
+     """Opens a file at a given path, works out what filetype it is, and parses
+     it accordingly.
+
+     For example:
+         open('/path/to/file.pdb', data_dict=True)
+
+     This will parse file.pdb as a .pdb file, but only go as far as converting it
+     to an atomium data dictionary.
+
+     If the file extension is .gz, the file will be unzipped first.
+
+     :param str path: the location of the file.
+     :param bool file_dict: if ``True``, parsing will stop at the file ``dict``.
+     :param bool data_dict: if ``True``, parsing will stop at the data ``dict``.
+     :rtype: ``dict``"""
+
+     if str(path)[-3:] == ".gz":
+         try:
+             with gzip.open(path) as f:
+                 filestring = f.read().decode()
+         except Exception:
+             with gzip.open(path, "rt") as f:
+                 filestring = f.read()
+         return parse_string(filestring, path[:-3], *args, **kwargs)
+     else:
+         try:
+             with builtins.open(path) as f:
+                 filestring = f.read()
+         except Exception:
+             with builtins.open(path, "rb") as f:
+                 filestring = f.read()  # type: ignore [assignment]
+         return parse_string(filestring, path, *args, **kwargs)
+
+
+ def fetch(code: str, *args, **kwargs) -> Any:  # type: ignore [no-untyped-def]
+     """Fetches a file from a remote location via HTTP.
+
+     If a PDB code is given, the .cif form of that structure will be fetched from
+     the RCSB servers. If that code is given an extension, that file format will
+     be obtained instead of .cif. If a URL is given, the function will simply
+     look in that location.
+
+     For example:
+         fetch('1lol.mmtf', file_dict=True)
+
+     This will get the .mmtf version of structure 1LOL, but only go as far as
+     converting it to an atomium file dictionary.
+
+     :param str code: the file to fetch.
+     :param bool file_dict: if ``True``, parsing will stop at the file ``dict``.
+     :param bool data_dict: if ``True``, parsing will stop at the data ``dict``.
+     :raises ValueError: if no file is found.
+     :rtype: ``dict``"""
+
+     if code.startswith("http"):
+         url = code
+     elif code.endswith(".mmtf"):
+         url = "https://mmtf.rcsb.org/v1.0/full/{}".format(code[:-5].lower())
+     else:
+         if "." not in code:
+             code += ".cif"
+         url = "https://files.rcsb.org/view/" + code.lower()
+     response = get(url, stream=True)
+     if response.status_code == 200:
+         text = response.content if code.endswith(".mmtf") else response.text
+         return parse_string(text, code, *args, **kwargs)
+     raise ValueError("Could not find anything at {}".format(url))
+
+
+ def parse_string(filestring: str, path: str, file_dict: bool = False, data_dict: bool = False) -> Any:
+     """Takes a filestring and parses it in the appropriate way. You must provide
+     the string to parse itself, and some other string that ends in either .pdb,
+     .cif, or .mmtf - that will determine how the file is parsed.
+
+     (If this cannot be inferred from the path string, atomium will guess based
+     on the filestring contents.)
+
+     :param str filestring: the contents of some file.
+     :param str path: the filename of the file of origin.
+     :param bool file_dict: if ``True``, parsing will stop at the file ``dict``.
+     :param bool data_dict: if ``True``, parsing will stop at the data ``dict``.
+     :rtype: ``dict``"""
+
+     file_func, data_func = get_parse_functions(filestring, path)
+     parsed = file_func(filestring)
+     if not file_dict:
+         parsed = data_func(parsed)
+     return parsed
+
+
+ def get_parse_functions(filestring: str, path: str) -> Any:
+     """Works out which parsing functions to use for a given filestring and
+     returns them.
+
+     (If this cannot be inferred from the path string, atomium will guess based
+     on the filestring contents.)
+
+     :param str filestring: the filestring to inspect.
+     :param str path: the path to inspect.
+     :rtype: ``tuple``"""
+
+     if "." in path:
+         ending = path.split(".")[-1]
+         if ending in ("mmtf", "cif", "pdb"):
+             return {
+                 "cif": (mmcif_string_to_mmcif_dict, mmcif_dict_to_data_dict),
+                 "pdb": (pdb_string_to_pdb_dict, pdb_dict_to_data_dict),
+             }[ending]
+     elif "_atom_sites" in filestring:
+         return (mmcif_string_to_mmcif_dict, mmcif_dict_to_data_dict)
+     else:
+         return (pdb_string_to_pdb_dict, pdb_dict_to_data_dict)
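
These helpers tie the parsers together: open reads (and, for .gz paths, decompresses) a local file, fetch retrieves a structure from the RCSB or an arbitrary URL, and both hand off to parse_string, which picks the mmCIF or PDB parser from the path hint. A small sketch; the import location stjames.utils is an assumption, since the diff does not show the module's filename:

    # Hypothetical usage sketch; module path assumed, no file or network needed.
    from stjames import utils

    # A ".pdb" path hint selects the PDB functions; file_dict=True stops at the
    # raw record dictionary instead of building the full data dictionary.
    records = utils.parse_string("TITLE     EXAMPLE STRUCTURE\nEND", "example.pdb", file_dict=True)
    print(sorted(records))  # ['END', 'TITLE']

    # Disk and network variants (not run here):
    #     utils.open("/path/to/file.pdb.gz", data_dict=True)
    #     utils.fetch("1LOL", file_dict=True)  # retrieves 1lol.cif from the RCSB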