stjames 0.0.52__py3-none-any.whl → 0.0.53__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of stjames might be problematic. Click here for more details.

@@ -0,0 +1,651 @@
1
+ """Contains functions for dealing with the .cif file format."""
2
+
3
+ import re
4
+ from collections import deque
5
+ from datetime import datetime
6
+ from typing import Any
7
+
8
+ import numpy as np
9
+
10
+ from .data import CODES
11
+
12
+
13
def mmcif_string_to_mmcif_dict(filestring: str) -> dict[str, Any]:
    """Parses the text of a .cif file into a ``dict`` of tables keyed by
    category name.

    Empty lines and lines beginning with ``#`` are discarded first. The
    remaining lines have their multi-line strings folded onto the owning
    line, are grouped into per-category blocks, and each block is converted
    to a list of row dictionaries. Surrounding quote marks are stripped from
    the cells as the final step.

    :param str filestring: the .cif filestring to process.
    :rtype: ``dict``"""

    kept = deque(row for row in filestring.split("\n") if row and row[0] != "#")
    blocks = mmcif_lines_to_mmcif_blocks(consolidate_strings(kept))
    mmcif_dict: dict[str, Any] = {}
    for block in blocks:
        # A block whose first line is exactly ``loop_`` is a multi-row table.
        is_loop = block["lines"][0] == "loop_"
        converter = loop_block_to_list if is_loop else non_loop_block_to_list
        mmcif_dict[block["category"]] = converter(block)
    strip_quotes(mmcif_dict)
    return mmcif_dict
36
+
37
+
38
def consolidate_strings(lines: deque[str]) -> deque[str]:
    """Folds ``;``-delimited multi-line strings onto the line that owns them.

    A line starting with ``;`` opens a text field that runs until the next
    line starting with ``;``. The pieces are joined with single spaces,
    wrapped in double quotes and appended to the previous output line. Quote
    marks inside the text are swapped for the placeholder characters
    ``\\x1a`` (for ``"``) and ``\\x1b`` (for ``'``) so that they cannot be
    mistaken for delimiters later on.

    Malformed input is tolerated rather than raising ``IndexError``: a text
    field that is never closed runs to the end of the input, and a text
    field with no preceding line becomes a line of its own.

    The ``lines`` deque is consumed (emptied) by this function.

    :param deque lines: the .cif file lines.
    :rtype: ``deque``"""

    new_lines: deque[str] = deque()
    while lines:
        line = lines.popleft()
        if not line.startswith(";"):
            new_lines.append(line)
            continue
        # Only the opening line is stripped; continuation lines are kept as-is.
        parts = [line[1:].strip()]
        while lines and not lines[0].startswith(";"):
            parts.append(lines.popleft())
        if lines:
            lines.popleft()  # discard the closing ";" line
        text = " ".join(parts).replace('"', "\x1a").replace("'", "\x1b")
        quoted = '"{}"'.format(text)
        if new_lines:
            new_lines[-1] += " " + quoted
        else:
            new_lines.append(quoted)
    return new_lines
59
+
60
+
61
def mmcif_lines_to_mmcif_blocks(lines: deque[str]) -> list[dict[str, Any]]:
    """Splits .cif file lines into per-category table blocks.

    Each block is a ``dict`` with a ``"category"`` name (the text before the
    first ``.`` of a ``_category.field`` line, minus the leading underscore)
    and the ``"lines"`` belonging to it. Lines starting with ``data_`` are
    skipped. A ``loop_`` line closes the current block and opens a new one
    whose category is read from the line that follows it.

    Lines that never get a category (stray lines with no ``_`` line before
    them, or a ``loop_`` that is the very last line) are dropped instead of
    raising an exception.

    The ``lines`` deque is consumed (emptied) by this function.

    :param deque lines: the .cif file lines.
    :rtype: ``list``"""

    category = None
    block: list[str] = []
    blocks: list[dict[str, Any]] = []
    while lines:
        line = lines.popleft()
        if line.startswith("data_"):
            continue
        if line.startswith("_"):
            line_category = line.split(".")[0]
            if line_category != category:
                if category:
                    blocks.append({"category": category[1:], "lines": block})
                category = line_category
                block = []
        elif line.startswith("loop_"):
            if category:
                blocks.append({"category": category[1:], "lines": block})
            # The loop's category comes from its first header line; a
            # trailing ``loop_`` has none.
            category = lines[0].split(".")[0] if lines else None
            block = []
        block.append(line)
    if block and category:
        blocks.append({"category": category[1:], "lines": block})
    return blocks
92
+
93
+
94
def non_loop_block_to_list(block: dict[str, Any]) -> list[dict[str, Any]]:
    """Takes a simple block ``dict`` with no loop and turns it into a table
    ``list`` holding a single row.

    Every line starting with ``_`` is a ``_category.field value`` pair. Any
    line that does not start with ``_`` is treated as a continuation of the
    most recent pair and is joined to it with a space, however many such
    lines follow one another. Continuation lines that come before the first
    pair are ignored. Runs of whitespace inside a value are collapsed to
    single spaces.

    The ``block`` passed in is not modified.

    :param dict block: the .cif block to process.
    :rtype: ``list``"""

    merged: list[str] = []
    for line in block["lines"]:
        if line.startswith("_"):
            merged.append(line)
        elif merged:
            merged[-1] += " " + line
    row: dict[str, Any] = {}
    for line in merged:
        # The field name sits between the first "." and the first whitespace.
        name = line.split(".")[1].split()[0]
        row[name] = " ".join(line.split()[1:])
    return [row]
113
+
114
+
115
def loop_block_to_list(block: dict[str, Any]) -> list[dict[str, Any]]:
    """Takes a loop block ``dict`` where the initial lines are table headers and
    turns it into a table ``list`` of row dictionaries.

    The first line (``loop_``) is skipped. The following lines that start
    with ``_<category>`` are the column headers, and everything after them
    is the table body. A row that is broken over several lines is stitched
    back together: adjacent lines are merged while their combined cell count
    does not exceed the number of columns.

    A loop that has headers but no body yields an empty list.

    :param dict block: the .cif block to process.
    :rtype: ``list``"""

    all_lines = block["lines"]
    header_prefix = "_" + block["category"]
    # Default to "no body" so that a header-only loop is not misread as data.
    body_start = len(all_lines)
    for index, line in enumerate(all_lines[1:], start=1):
        if not line.startswith(header_prefix):
            body_start = index
            break
    names = [header.split(".")[1].rstrip() for header in all_lines[1:body_start]]
    rows = [split_values(line) for line in all_lines[body_start:]]
    # ``rows`` shrinks while merging, hence the bounds re-check in the while.
    for n in range(len(rows) - 1):
        while n < len(rows) - 1 and len(rows[n]) + len(rows[n + 1]) <= len(names):
            rows[n] += rows[n + 1]
            rows.pop(n + 1)
    return [dict(zip(names, row)) for row in rows]
139
+
140
+
141
def split_values(line: str) -> list[str]:
    """The body of a .cif table is a series of lines, with each cell divided by
    whitespace. This function takes a string line and breaks it into cells.

    A cell may be enclosed in single or double quote marks, in which case
    whitespace inside it does not split the line. A quoted cell is opened
    only by a quote mark at the start of a cell, and closed only by the
    *same* quote mark followed by whitespace or the end of the line. Any
    other quote mark (for example the prime in ``C5'`` or the apostrophe in
    ``"it's"``) is ordinary text. The delimiting quote marks are not
    included in the returned cells.

    Empty cells (such as ``''``) are dropped from the result.

    :param str line: the .cif line to split.
    :rtype: ``list``"""

    if not re.search("['\"]", line):
        return line.split()
    text = line.strip()
    last = len(text) - 1
    values: list[str] = []
    current: list[str] = []
    quote = None  # the quote mark that opened the current cell, if any
    for index, char in enumerate(text):
        if quote is not None:
            closes = char == quote and (index == last or text[index + 1].isspace())
            if closes:
                quote = None
            else:
                current.append(char)
        elif char.isspace():
            if current:
                values.append("".join(current))
                current = []
        elif char in "'\"" and not current:
            quote = char
        else:
            current.append(char)
    if current:
        values.append("".join(current))
    return values
171
+
172
+
173
def strip_quotes(mmcif_dict: dict[str, Any]) -> None:
    """Goes through each table in the mmcif ``dict`` and removes any unneeded
    quote marks from the cells, updating the rows in place.

    A cell that starts and ends with the same quote mark (``'`` or ``"``)
    loses that first and last character. Afterwards the placeholder
    characters ``\\x1a`` and ``\\x1b`` are turned back into ``"`` and ``'``
    respectively. Empty cells are left as they are.

    :param dict mmcif_dict: the almost finished .mmcif dictionary to clean."""

    for table in mmcif_dict.values():
        for row in table:
            # Only existing keys are reassigned, which is safe during iteration.
            for key, value in row.items():
                cleaned = value
                if value and value[0] in "'\"" and value[0] == value[-1]:
                    cleaned = value[1:-1]
                row[key] = cleaned.replace("\x1a", '"').replace("\x1b", "'")
186
+
187
+
188
def mmcif_dict_to_data_dict(mmcif_dict: dict[str, Any]) -> dict[str, Any]:
    """Builds a data dictionary with the standard layout from an .mmcif
    dictionary.

    An empty skeleton is created first, and each ``update_*`` helper then
    fills in its own section from ``mmcif_dict``.

    :param dict mmcif_dict: the .mmcif dictionary.
    :rtype: ``dict``"""

    description = {"code": None, "title": None, "deposition_date": None, "classification": None, "keywords": [], "authors": []}
    experiment = {"technique": None, "source_organism": None, "expression_system": None, "missing_residues": []}
    quality = {"resolution": None, "rvalue": None, "rfree": None}
    geometry = {"assemblies": [], "crystallography": {}}
    data_dict: dict[str, Any] = {
        "description": description,
        "experiment": experiment,
        "quality": quality,
        "geometry": geometry,
        "models": [],
    }
    updaters = (
        update_description_dict,
        update_experiment_dict,
        update_quality_dict,
        update_geometry_dict,
        update_models_list,
    )
    for update in updaters:
        update(mmcif_dict, data_dict)
    return data_dict
208
+
209
+
210
def update_description_dict(mmcif_dict: dict[str, Any], data_dict: dict[str, Any]) -> None:
    """Fills the ``description`` section of a data dictionary from an .mmcif
    dictionary.

    :param dict mmcif_dict: the .mmcif dictionary to read.
    :param dict data_dict: the data dictionary to update."""

    # (data key, mmcif table, mmcif field, extra transfer options)
    transfers = (
        ("code", "entry", "id", {}),
        ("title", "struct", "title", {}),
        ("deposition_date", "pdbx_database_status", "recvd_initial_deposition_date", {"date": True}),
        ("classification", "struct_keywords", "pdbx_keywords", {}),
        ("keywords", "struct_keywords", "text", {"split": True}),
        ("authors", "audit_author", "name", {"multi": True}),
    )
    for d_key, m_table, m_key, options in transfers:
        mmcif_to_data_transfer(mmcif_dict, data_dict, "description", d_key, m_table, m_key, **options)
223
+
224
+
225
def update_experiment_dict(mmcif_dict: dict[str, Any], data_dict: dict[str, Any]) -> None:
    """Fills the ``experiment`` section of a data dictionary from an .mmcif
    dictionary.

    The source organism is looked up in three tables in turn, stopping at the
    first one that gives a value other than ``None`` or ``"?"``. Every row of
    ``pdbx_unobs_or_zero_occ_residues`` is recorded as a missing residue.

    :param dict mmcif_dict: the .mmcif dictionary to read.
    :param dict data_dict: the data dictionary to update."""

    mmcif_to_data_transfer(mmcif_dict, data_dict, "experiment", "technique", "exptl", "method")
    organism_sources = (
        ("entity_src_nat", "pdbx_organism_scientific"),
        ("entity_src_gen", "pdbx_gene_src_scientific_name"),
        ("pdbx_entity_src_syn", "organism_scientific"),
    )
    for table, field in organism_sources:
        mmcif_to_data_transfer(mmcif_dict, data_dict, "experiment", "source_organism", table, field)
        if data_dict["experiment"]["source_organism"] not in (None, "?"):
            break
    mmcif_to_data_transfer(mmcif_dict, data_dict, "experiment", "expression_system", "entity_src_gen", "pdbx_host_org_scientific_name")
    for residue in mmcif_dict.get("pdbx_unobs_or_zero_occ_residues", []):
        ins_code = residue["PDB_ins_code"]
        # "?" and "." both mean that there is no insertion code.
        suffix = ins_code if ins_code not in "?." else ""
        entry = {
            "id": f"{residue['auth_asym_id']}.{residue['auth_seq_id']}{suffix}",
            "name": residue["auth_comp_id"],
        }
        data_dict["experiment"]["missing_residues"].append(entry)
245
+
246
+
247
def update_quality_dict(mmcif_dict: dict[str, Any], data_dict: dict[str, Any]) -> None:
    """Fills the ``quality`` section of a data dictionary from an .mmcif
    dictionary.

    Resolution and R-value each have a fallback field which is only read when
    the preferred field left the value falsy.

    :param dict mmcif_dict: the .mmcif dictionary to read.
    :param dict data_dict: the data dictionary to update."""

    def read_float(d_key: str, m_table: str, m_key: str) -> None:
        mmcif_to_data_transfer(mmcif_dict, data_dict, "quality", d_key, m_table, m_key, func=float)

    read_float("resolution", "reflns", "d_resolution_high")
    if not data_dict["quality"]["resolution"]:
        read_float("resolution", "refine", "ls_d_res_high")

    read_float("rvalue", "refine", "ls_R_factor_R_work")
    if not data_dict["quality"]["rvalue"]:
        read_float("rvalue", "refine", "ls_R_factor_obs")

    read_float("rfree", "refine", "ls_R_factor_R_free")
261
+
262
+
263
def update_geometry_dict(mmcif_dict: dict[str, Any], data_dict: dict[str, Any]) -> None:
    """Fills the ``geometry`` section of a data dictionary from an .mmcif
    dictionary.

    One assembly entry is created per ``pdbx_struct_assembly`` row. Each
    ``pdbx_struct_oper_list`` row is turned into a 4x4 matrix (three rows of
    rotation plus translation, then ``[0, 0, 0, 1]``) keyed by its id. The
    assemblies are then given their metrics and transformations, and finally
    the crystallography sub-dictionary is updated.

    :param dict mmcif_dict: the .mmcif dictionary to read.
    :param dict data_dict: the data dictionary to update."""

    assemblies = []
    for record in mmcif_dict.get("pdbx_struct_assembly", []):
        assemblies.append(
            {
                "id": int(record["id"]),
                "software": record.get("method_details", None),
                "delta_energy": None,
                "buried_surface_area": None,
                "surface_area": None,
                "transformations": [],
            }
        )
    data_dict["geometry"]["assemblies"] = assemblies

    operations = {}
    for oper in mmcif_dict.get("pdbx_struct_oper_list", []):
        oper_id = oper["id"]
        matrix = []
        for r in (1, 2, 3):
            row = [float(oper["matrix[{}][{}]".format(r, c)]) for c in (1, 2, 3)]
            row.append(float(oper["vector[{}]".format(r)]))
            matrix.append(row)
        matrix.append([0, 0, 0, 1])
        operations[oper_id] = matrix

    for assembly in data_dict["geometry"]["assemblies"]:
        if assembly["software"] == "?":
            assembly["software"] = None
        assign_metrics_to_assembly(mmcif_dict, assembly)
        assign_transformations_to_assembly(mmcif_dict, operations, assembly)
    update_crystallography_dict(mmcif_dict, data_dict)
291
+
292
+
293
def assign_metrics_to_assembly(mmcif_dict: dict[str, Any], assembly: dict[str, Any]) -> None:
    """Copies energy and surface area values for one assembly out of the
    ``pdbx_struct_assembly_prop`` table.

    Only rows whose ``biol_id`` equals the assembly's id are used. The text
    before the first ``/`` of the row's value is converted to a ``float``.

    :param dict mmcif_dict: The dictionary to read.
    :param dict assembly: The assembly to update."""

    # Property type in the file -> key in the assembly dictionary.
    targets = {
        "MORE": "delta_energy",
        "SSA (A^2)": "surface_area",
        "ABSA (A^2)": "buried_surface_area",
    }
    for prop in mmcif_dict.get("pdbx_struct_assembly_prop", []):
        if prop["biol_id"] != str(assembly["id"]):
            continue
        field = targets.get(prop["type"])
        if field is not None:
            assembly[field] = float(prop["value"].split("/")[0])
308
+
309
+
310
def assign_transformations_to_assembly(mmcif_dict: dict[str, Any], operations: Any, assembly: dict[str, Any]) -> None:
    """Adds the transformations that generate an assembly to its
    ``transformations`` list.

    For every ``pdbx_struct_assembly_gen`` row belonging to the assembly, the
    operator expression is expanded into operation matrices. Each matrix is
    stored as a 3x3 ``matrix`` plus a 3-element ``vector`` (its last column),
    along with the chains it applies to.

    :param dict mmcif_dict: the .mmcif dictionary to read.
    :param dict operations: the processed operations matrices.
    :param dict assembly: the assembly to update."""

    for gen in mmcif_dict.get("pdbx_struct_assembly_gen", []):
        if gen["assembly_id"] != str(assembly["id"]):
            continue
        id_groups = get_operation_id_groups(gen["oper_expression"])
        for operation in operation_id_groups_to_operations(operations, id_groups):
            top_rows = operation[:3]
            transformation = {
                # Split afresh each time so that no two entries share a list.
                "chains": gen["asym_id_list"].split(","),
                "matrix": [row[:3] for row in top_rows],
                "vector": [row[-1] for row in top_rows],
            }
            assembly["transformations"].append(transformation)
326
+
327
+
328
def get_operation_id_groups(expression: str) -> list[list[str]]:
    """Takes an operator expression from an .mmcif transformation dict, and
    works out what transformation IDs it is referring to. The IDs are
    returned as strings. For example, ``(1,2,3)`` becomes
    ``[["1", "2", "3"]]``, ``(1-3)(8-11,17)`` becomes
    ``[["1", "2", "3"], ["8", "9", "10", "11", "17"]]``, and so on.

    An expression without brackets is treated as a single group. Whitespace
    around the expression and around each ID is ignored, and an empty
    expression gives an empty list.

    :param str expression: The expression to parse.
    :rtype: ``list``"""

    expression = expression.strip()
    if not expression:
        return []
    if expression[0] != "(":
        expression = "({})".format(expression)
    group_ids = []
    for group in re.findall(r"\((.+?)\)", expression):
        ids: list[str] = []
        for element in group.split(","):
            element = element.strip()
            if "-" in element:
                # An inclusive numeric range such as "8-11".
                bounds = [int(x) for x in element.split("-")]
                ids.extend(str(n) for n in range(bounds[0], bounds[1] + 1))
            else:
                ids.append(element)
        group_ids.append(ids)
    return group_ids
352
+
353
+
354
def update_crystallography_dict(mmcif_dict: dict[str, Any], data_dict: dict[str, Any]) -> None:
    """Fills the ``crystallography`` sub-dictionary of the ``geometry``
    section from an .mmcif dictionary.

    When a ``cell`` table is present, the space group name and the six unit
    cell parameters are stored (any ``?`` in a parameter is replaced by ``0``
    before it is converted to a ``float``). If the space group ends up as
    ``"NA"`` the sub-dictionary is reset to empty.

    :param dict mmcif_dict: the .mmcif dictionary to read.
    :param dict data_dict: the data dictionary to update."""

    cell_keys = ("length_a", "length_b", "length_c", "angle_alpha", "angle_beta", "angle_gamma")
    if mmcif_dict.get("cell"):
        mmcif_to_data_transfer(mmcif_dict, data_dict["geometry"], "crystallography", "space_group", "symmetry", "space_group_name_H-M")
        cell = mmcif_dict["cell"][0]
        unit_cell = [float(cell[key].replace("?", "0")) for key in cell_keys]
        data_dict["geometry"]["crystallography"]["unit_cell"] = unit_cell
    crystallography = data_dict["geometry"]["crystallography"]
    if crystallography.get("space_group") == "NA":
        data_dict["geometry"]["crystallography"] = {}
368
+
369
+
370
def operation_id_groups_to_operations(operations: Any, operation_id_groups: Any) -> Any:
    """Creates a list of operation matrices for an assembly, from a list of
    operation ID groups - cross multiplying as required.

    With a single group, the matrices for its IDs are returned as they are
    stored in ``operations``. With several groups, every combination of one
    matrix per group is multiplied together (left to right) using
    ``numpy.matmul``, so the results are numpy arrays. With no groups at all
    an empty list is returned.

    :param dict operations: the parsed .mmcif operations.
    :param list operation_id_groups: the operation IDs.
    :rtype: ``list``"""

    operation_groups = [[operations[i] for i in ids] for ids in operation_id_groups]
    if not operation_groups:
        return []
    combined = operation_groups[0]
    for group in operation_groups[1:]:
        combined = [np.matmul(op1, op2) for op1 in combined for op2 in group]
    return combined
386
+
387
+
388
def update_models_list(mmcif_dict: dict[str, Any], data_dict: dict[str, Any]) -> None:
    """Rebuilds the ``models`` list of a data dictionary from the
    ``atom_site`` table of an .mmcif dictionary.

    Atoms are read in file order. A new model is started each time the
    ``pdbx_PDB_model_num`` value changes. An atom goes into the ``polymer``
    part of the model when its entity type is ``polymer`` or ``branched``,
    and otherwise into the part named after its entity type. Once all atoms
    are placed, sequences and secondary structure are added to the polymers.

    :param dict mmcif_dict: the .mmcif dictionary to read.
    :param dict data_dict: the data dictionary to update."""

    def empty_model() -> dict[str, Any]:
        return {"polymer": {}, "non-polymer": {}, "water": {}, "branched": {}}

    data_dict["models"] = []
    types = {entity["id"]: entity["type"] for entity in mmcif_dict.get("entity", {})}
    names = {comp["id"]: comp["name"] for comp in mmcif_dict.get("chem_comp", {}) if comp["mon_nstd_flag"] != "y"}
    entities = {asym["id"]: asym["entity_id"] for asym in mmcif_dict.get("struct_asym", [])}
    secondary_structure = make_secondary_structure(mmcif_dict)
    aniso = make_aniso(mmcif_dict)

    atoms = mmcif_dict["atom_site"]
    model = empty_model()
    model_num = atoms[0]["pdbx_PDB_model_num"]
    for atom in atoms:
        if atom["pdbx_PDB_model_num"] != model_num:
            # The model number changed: store the finished model, start anew.
            data_dict["models"].append(model)
            model = empty_model()
            model_num = atom["pdbx_PDB_model_num"]
        mol_type = types[entities[atom["label_asym_id"]]]
        if mol_type in ("polymer", "branched"):
            add_atom_to_polymer(atom, aniso, model, names)
        else:
            add_atom_to_non_polymer(atom, aniso, model, mol_type, names)
    data_dict["models"].append(model)

    for model in data_dict["models"]:
        add_sequences_to_polymers(model, mmcif_dict, entities)
        add_secondary_structure_to_polymers(model, secondary_structure)
418
+
419
+
420
def make_aniso(mmcif_dict: dict[str, Any]) -> dict[int, Any]:
    """Maps integer atom IDs to their six anisotropy values, read from the
    ``atom_site_anisotrop`` table.

    The values are the ``U[1][1]``, ``U[2][2]``, ``U[3][3]``, ``U[1][2]``,
    ``U[1][3]`` and ``U[2][3]`` fields, in that order, as floats.

    :param mmcif_dict: the .mmcif dict to read.
    :rtype: ``dict``"""

    # Each two-character string unpacks into the two indices of a U field.
    components = ("11", "22", "33", "12", "13", "23")
    aniso: dict[int, Any] = {}
    for record in mmcif_dict.get("atom_site_anisotrop", []):
        atom_id = int(record["id"])
        aniso[atom_id] = [float(record["U[{}][{}]".format(x, y)]) for x, y in components]
    return aniso
430
+
431
+
432
def make_secondary_structure(mmcif_dict: dict[str, Any]) -> dict[str, Any]:
    """Creates a dictionary of helices and strands, with each having a list of
    ``[start, end]`` residue ID pairs.

    Helices come from every row of the ``struct_conf`` table and strands from
    every row of the ``struct_sheet_range`` table. Residue IDs have the form
    ``<chain>.<number><insertion code>``. An insertion code of ``?`` or ``.``
    counts as no insertion code, the same rule that ``make_residue_id``
    applies, so that the IDs made here can match the IDs of parsed residues.

    :param mmcif_dict: the .mmcif dict to read.
    :rtype: ``dict``"""

    def segment_ids(record: dict[str, Any]) -> list[str]:
        """Returns the start and end residue IDs of one table row."""
        ids = []
        for end in ("beg", "end"):
            ins_code = record[f"pdbx_{end}_PDB_ins_code"]
            if ins_code in ("?", "."):
                ins_code = ""
            ids.append("{}.{}{}".format(record[f"{end}_auth_asym_id"], record[f"{end}_auth_seq_id"], ins_code))
        return ids

    helices = [segment_ids(helix) for helix in mmcif_dict.get("struct_conf", [])]
    strands = [segment_ids(strand) for strand in mmcif_dict.get("struct_sheet_range", [])]
    return {"helices": helices, "strands": strands}
463
+
464
+
465
def add_atom_to_polymer(atom: dict[str, Any], aniso: dict[int, Any], model: dict[str, Any], names: dict[str, Any]) -> None:
    """Takes an MMCIF atom dictionary, converts it, and adds it to the
    ``polymer`` part of a model.

    The polymer is keyed by the atom's ``auth_asym_id`` and the residue by
    the ID from ``make_residue_id``. Either is created on first use: a new
    polymer starts with empty ``helices``, ``strands`` and ``residues``, and
    a new residue is numbered by its position of insertion into the polymer
    (starting from 1).

    Missing containers are detected with explicit lookups, so that an error
    raised while converting the atom is reported directly.

    :param dict atom: the .mmcif dictionary to read.
    :param dict aniso: lookup dictionary for anisotropy information.
    :param dict model: the model to update.
    :param dict names: the lookup dictionary for full name information."""

    mol_id = atom["auth_asym_id"]
    res_id = make_residue_id(atom)
    atom_id = int(atom["id"])
    atom_data = atom_dict_to_atom_dict(atom, aniso)

    polymers = model["polymer"]
    polymer = polymers.get(mol_id)
    if polymer is None:
        polymer = polymers[mol_id] = {
            "internal_id": atom["label_asym_id"],
            "helices": [],
            "strands": [],
            "residues": {},
        }

    residues = polymer["residues"]
    residue = residues.get(res_id)
    if residue is None:
        name = atom["auth_comp_id"]
        # The right-hand side is built before the insertion, so ``number``
        # counts the residues already present plus this one.
        residue = residues[res_id] = {
            "name": name,
            "full_name": names.get(name),
            "atoms": {},
            "number": len(residues) + 1,
        }
    residue["atoms"][atom_id] = atom_data
501
+
502
+
503
def add_atom_to_non_polymer(atom: dict[str, Any], aniso: dict[int, Any], model: dict[str, Any], mol_type: str, names: dict[str, Any]) -> None:
    """Takes an MMCIF atom dictionary, converts it, and adds it to the
    ``mol_type`` part of a model.

    The molecule is keyed by the ID from ``make_residue_id`` and is created
    the first time one of its atoms is seen. A missing molecule is detected
    with an explicit lookup, so that an error raised while converting the
    atom is reported directly.

    :param dict atom: the .mmcif dictionary to read.
    :param dict aniso: lookup dictionary for anisotropy information.
    :param dict model: the model to update.
    :param str mol_type: the key of the model section to add to.
    :param dict names: the lookup dictionary for full name information."""

    mol_id = make_residue_id(atom)
    atom_id = int(atom["id"])
    atom_data = atom_dict_to_atom_dict(atom, aniso)

    molecules = model[mol_type]
    molecule = molecules.get(mol_id)
    if molecule is None:
        name = atom["auth_comp_id"]
        molecule = molecules[mol_id] = {
            "name": name,
            "full_name": names.get(name),
            "internal_id": atom["label_asym_id"],
            "polymer": atom["auth_asym_id"],
            "atoms": {},
        }
    molecule["atoms"][atom_id] = atom_data
525
+
526
+
527
def make_residue_id(d: dict[str, Any]) -> str:
    """Builds the residue ID ``<chain>.<number><insertion code>`` for an
    atom, leaving the insertion code out when it is ``?`` or ``.``.

    :param dict d: the atom dictionary to read.
    :rtype: ``str``"""

    ins_code = d["pdbx_PDB_ins_code"]
    suffix = ins_code if ins_code not in "?." else ""
    return f"{d['auth_asym_id']}.{d['auth_seq_id']}{suffix}"
535
+
536
+
537
def add_sequences_to_polymers(model: dict[str, Any], mmcif_dict: dict[str, Any], entities: dict[str, Any]) -> None:
    """Sets the ``sequence`` of every polymer in a model.

    A polymer's ``internal_id`` is mapped to an entity ID via ``entities``,
    and the entity ID is mapped to a sequence. A polymer gets an empty string
    when either lookup finds nothing.

    :param dict model: the model to update.
    :param dict mmcif_dict: the .mmcif dictionary to read.
    :param dict entities: a mapping of chain IDs to entity IDs."""

    sequences = make_sequences(mmcif_dict)
    for polymer in model["polymer"].values():
        entity_id = entities.get(polymer["internal_id"], "")
        polymer["sequence"] = sequences.get(entity_id, "")
548
+
549
+
550
def add_secondary_structure_to_polymers(model: dict[str, Any], ss_dict: dict[str, Any]) -> None:
    """Adds helix and strand residue lists to the polymers of a model.

    For each ``[start, end]`` segment, the chain is the part of the start ID
    before its first ``.``. Segments whose chain is not in the model are
    skipped. Otherwise a new list is added to the chain's ``helices`` or
    ``strands``, and the chain's residue IDs are walked in order: collecting
    begins at the start ID, and the walk stops at the end ID. The new list
    stays empty if the start ID is not met before the walk stops.

    :param dict model: the model to update.
    :param dict ss_dict: the mapping to read."""

    for kind in ("helices", "strands"):
        for segment in ss_dict[kind]:
            chain_id = segment[0].split(".")[0]
            chain = model["polymer"].get(chain_id)
            if not chain:
                continue
            members: list[str] = []
            chain[kind].append(members)
            started = False
            for residue_id in chain["residues"]:
                started = started or residue_id == segment[0]
                if started:
                    members.append(residue_id)
                if residue_id == segment[1]:
                    break
570
+
571
+
572
def make_sequences(mmcif_dict: dict[str, Any]) -> dict[str, Any]:
    """Maps the ID of every ``polymer`` entity to its sequence string.

    The sequence is built from the ``entity_poly_seq`` rows of that entity,
    in table order. Each ``mon_id`` is looked up in ``CODES``, with ``X``
    used for any that are not found.

    :param dict mmcif_dict: the .mmcif dictionary to read.
    :rtype: ``dict``"""

    sequences: dict[str, Any] = {}
    for entity in mmcif_dict.get("entity", []):
        if entity["type"] != "polymer":
            continue
        entity_id = entity["id"]
        codes = [
            CODES.get(res["mon_id"], "X")
            for res in mmcif_dict.get("entity_poly_seq", [])
            if res["entity_id"] == entity_id
        ]
        sequences[entity_id] = "".join(codes)
    return sequences
583
+
584
+
585
def atom_dict_to_atom_dict(d: dict[str, Any], aniso_dict: dict[int, Any]) -> dict[str, Any]:
    """Turns an .mmcif atom dictionary into an atom data dictionary.

    Coordinates, charge, B-value and occupancy are converted to ``float``
    unless they are ``None``. For charge, occupancy and B-value, a missing
    field or a value of ``?`` or ``.`` gives the default instead (``0``,
    ``1`` and ``None`` respectively). An alternate location of ``.`` becomes
    ``None``. Anisotropy defaults to six zeros for atoms not in
    ``aniso_dict``.

    :param dict d: the .mmcif atom dictionary.
    :param dict aniso_dict: the mapping of atom IDs to anisotropy.
    :rtype: ``dict``"""

    def known(key: str, default: Any) -> Any:
        """Returns the field, or ``default`` if absent or a placeholder."""
        value = d.get(key, default)
        return default if value in ("?", ".") else value

    alt_loc = d.get("label_alt_id")
    atom = {
        "x": d["Cartn_x"],
        "y": d["Cartn_y"],
        "z": d["Cartn_z"],
        "element": d["type_symbol"],
        "name": d.get("label_atom_id"),
        "occupancy": known("occupancy", 1),
        "bvalue": known("B_iso_or_equiv", None),
        "charge": known("pdbx_formal_charge", 0),
        "alt_loc": alt_loc if alt_loc != "." else None,
        "anisotropy": aniso_dict.get(int(d["id"]), [0, 0, 0, 0, 0, 0]),
        "is_hetatm": d.get("group_PDB", "ATOM") == "HETATM",
    }
    for key in ["x", "y", "z", "charge", "bvalue", "occupancy"]:
        if atom[key] is not None:
            atom[key] = float(atom[key])
    return atom
610
+
611
+
612
def mmcif_to_data_transfer(
    mmcif_dict: dict[str, Any],
    data_dict: dict[str, Any],
    d_cat: Any,
    d_key: Any,
    m_table: Any,
    m_key: Any,
    date: bool = False,
    split: bool = False,
    multi: bool = False,
    func: Any = None,
) -> None:
    """Copies one piece of data from a .mmcif dictionary into a data
    dictionary, on a best-effort basis.

    The value is read from the first row of the table (or from every row, if
    ``multi``), then passed through the requested conversions in the order
    date, split, ``func``. A final value of ``"?"`` is stored as ``None``.
    If any step raises an exception (for example because the table or field
    is missing, or a conversion fails) the data dictionary is left untouched.

    :param dict mmcif_dict: the .mmcif dictionary to read.
    :param dict data_dict: the data dictionary to update.
    :param str d_cat: the top-level key in the data dictionary.
    :param str d_key: the data dictionary field to update.
    :param str m_table: the name of the .mmcif table to look in.
    :param str m_key: the .mmcif field to read.
    :param bool date: if True, the value will be converted to a date.
    :param bool split: if True, the value will be split on commas.
    :param bool multi: if True, every row in the table will be read.
    :param function func: if given, this will be applied to the value."""

    # (enabled?, conversion) pairs, applied in this order.
    conversions = (
        (date, lambda v: datetime.strptime(v, "%Y-%m-%d").date()),
        (split, lambda v: v.replace(", ", ",").split(",")),
        (func, func),
    )
    try:
        rows = mmcif_dict[m_table]
        value = [row[m_key] for row in rows] if multi else rows[0][m_key]
        for enabled, convert in conversions:
            if enabled:
                value = convert(value)
        data_dict[d_cat][d_key] = None if value == "?" else value
    except Exception:
        # Deliberate: absent or unparseable data means "do nothing".
        pass