boltz-vsynthes 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. boltz/__init__.py +7 -0
  2. boltz/data/__init__.py +0 -0
  3. boltz/data/const.py +1184 -0
  4. boltz/data/crop/__init__.py +0 -0
  5. boltz/data/crop/affinity.py +164 -0
  6. boltz/data/crop/boltz.py +296 -0
  7. boltz/data/crop/cropper.py +45 -0
  8. boltz/data/feature/__init__.py +0 -0
  9. boltz/data/feature/featurizer.py +1230 -0
  10. boltz/data/feature/featurizerv2.py +2208 -0
  11. boltz/data/feature/symmetry.py +602 -0
  12. boltz/data/filter/__init__.py +0 -0
  13. boltz/data/filter/dynamic/__init__.py +0 -0
  14. boltz/data/filter/dynamic/date.py +76 -0
  15. boltz/data/filter/dynamic/filter.py +24 -0
  16. boltz/data/filter/dynamic/max_residues.py +37 -0
  17. boltz/data/filter/dynamic/resolution.py +34 -0
  18. boltz/data/filter/dynamic/size.py +38 -0
  19. boltz/data/filter/dynamic/subset.py +42 -0
  20. boltz/data/filter/static/__init__.py +0 -0
  21. boltz/data/filter/static/filter.py +26 -0
  22. boltz/data/filter/static/ligand.py +37 -0
  23. boltz/data/filter/static/polymer.py +299 -0
  24. boltz/data/module/__init__.py +0 -0
  25. boltz/data/module/inference.py +307 -0
  26. boltz/data/module/inferencev2.py +429 -0
  27. boltz/data/module/training.py +684 -0
  28. boltz/data/module/trainingv2.py +660 -0
  29. boltz/data/mol.py +900 -0
  30. boltz/data/msa/__init__.py +0 -0
  31. boltz/data/msa/mmseqs2.py +235 -0
  32. boltz/data/pad.py +84 -0
  33. boltz/data/parse/__init__.py +0 -0
  34. boltz/data/parse/a3m.py +134 -0
  35. boltz/data/parse/csv.py +100 -0
  36. boltz/data/parse/fasta.py +138 -0
  37. boltz/data/parse/mmcif.py +1239 -0
  38. boltz/data/parse/mmcif_with_constraints.py +1607 -0
  39. boltz/data/parse/schema.py +1851 -0
  40. boltz/data/parse/yaml.py +68 -0
  41. boltz/data/sample/__init__.py +0 -0
  42. boltz/data/sample/cluster.py +283 -0
  43. boltz/data/sample/distillation.py +57 -0
  44. boltz/data/sample/random.py +39 -0
  45. boltz/data/sample/sampler.py +49 -0
  46. boltz/data/tokenize/__init__.py +0 -0
  47. boltz/data/tokenize/boltz.py +195 -0
  48. boltz/data/tokenize/boltz2.py +396 -0
  49. boltz/data/tokenize/tokenizer.py +24 -0
  50. boltz/data/types.py +777 -0
  51. boltz/data/write/__init__.py +0 -0
  52. boltz/data/write/mmcif.py +305 -0
  53. boltz/data/write/pdb.py +171 -0
  54. boltz/data/write/utils.py +23 -0
  55. boltz/data/write/writer.py +330 -0
  56. boltz/main.py +1292 -0
  57. boltz/model/__init__.py +0 -0
  58. boltz/model/layers/__init__.py +0 -0
  59. boltz/model/layers/attention.py +132 -0
  60. boltz/model/layers/attentionv2.py +111 -0
  61. boltz/model/layers/confidence_utils.py +231 -0
  62. boltz/model/layers/dropout.py +34 -0
  63. boltz/model/layers/initialize.py +100 -0
  64. boltz/model/layers/outer_product_mean.py +98 -0
  65. boltz/model/layers/pair_averaging.py +135 -0
  66. boltz/model/layers/pairformer.py +337 -0
  67. boltz/model/layers/relative.py +58 -0
  68. boltz/model/layers/transition.py +78 -0
  69. boltz/model/layers/triangular_attention/__init__.py +0 -0
  70. boltz/model/layers/triangular_attention/attention.py +189 -0
  71. boltz/model/layers/triangular_attention/primitives.py +409 -0
  72. boltz/model/layers/triangular_attention/utils.py +380 -0
  73. boltz/model/layers/triangular_mult.py +212 -0
  74. boltz/model/loss/__init__.py +0 -0
  75. boltz/model/loss/bfactor.py +49 -0
  76. boltz/model/loss/confidence.py +590 -0
  77. boltz/model/loss/confidencev2.py +621 -0
  78. boltz/model/loss/diffusion.py +171 -0
  79. boltz/model/loss/diffusionv2.py +134 -0
  80. boltz/model/loss/distogram.py +48 -0
  81. boltz/model/loss/distogramv2.py +105 -0
  82. boltz/model/loss/validation.py +1025 -0
  83. boltz/model/models/__init__.py +0 -0
  84. boltz/model/models/boltz1.py +1286 -0
  85. boltz/model/models/boltz2.py +1249 -0
  86. boltz/model/modules/__init__.py +0 -0
  87. boltz/model/modules/affinity.py +223 -0
  88. boltz/model/modules/confidence.py +481 -0
  89. boltz/model/modules/confidence_utils.py +181 -0
  90. boltz/model/modules/confidencev2.py +495 -0
  91. boltz/model/modules/diffusion.py +844 -0
  92. boltz/model/modules/diffusion_conditioning.py +116 -0
  93. boltz/model/modules/diffusionv2.py +677 -0
  94. boltz/model/modules/encoders.py +639 -0
  95. boltz/model/modules/encodersv2.py +565 -0
  96. boltz/model/modules/transformers.py +322 -0
  97. boltz/model/modules/transformersv2.py +261 -0
  98. boltz/model/modules/trunk.py +688 -0
  99. boltz/model/modules/trunkv2.py +828 -0
  100. boltz/model/modules/utils.py +303 -0
  101. boltz/model/optim/__init__.py +0 -0
  102. boltz/model/optim/ema.py +389 -0
  103. boltz/model/optim/scheduler.py +99 -0
  104. boltz/model/potentials/__init__.py +0 -0
  105. boltz/model/potentials/potentials.py +497 -0
  106. boltz/model/potentials/schedules.py +32 -0
  107. boltz_vsynthes-1.0.0.dist-info/METADATA +151 -0
  108. boltz_vsynthes-1.0.0.dist-info/RECORD +112 -0
  109. boltz_vsynthes-1.0.0.dist-info/WHEEL +5 -0
  110. boltz_vsynthes-1.0.0.dist-info/entry_points.txt +2 -0
  111. boltz_vsynthes-1.0.0.dist-info/licenses/LICENSE +21 -0
  112. boltz_vsynthes-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1239 @@
1
+ import contextlib
2
+ from collections import defaultdict
3
+ from dataclasses import dataclass, replace
4
+ from typing import Optional
5
+
6
+ import gemmi
7
+ import numpy as np
8
+ from rdkit import rdBase
9
+ from rdkit.Chem import AllChem
10
+ from rdkit.Chem.rdchem import Mol
11
+ from sklearn.neighbors import KDTree
12
+
13
+ from boltz.data import const
14
+ from boltz.data.mol import load_molecules
15
+ from boltz.data.types import (
16
+ AtomV2,
17
+ BondV2,
18
+ Chain,
19
+ Coords,
20
+ Ensemble,
21
+ Interface,
22
+ Residue,
23
+ StructureInfo,
24
+ StructureV2,
25
+ )
26
+
27
+ ####################################################################################################
28
+ # DATACLASSES
29
+ ####################################################################################################
30
+
31
+
32
+ @dataclass(frozen=True, slots=True)
33
+ class ParsedAtom:
34
+ """A parsed atom object."""
35
+
36
+ name: str
37
+ coords: tuple[float, float, float]
38
+ is_present: bool
39
+ bfactor: float
40
+ plddt: Optional[float] = None
41
+
42
+
43
+ @dataclass(frozen=True, slots=True)
44
+ class ParsedBond:
45
+ """A parsed bond object."""
46
+
47
+ atom_1: int
48
+ atom_2: int
49
+ type: int
50
+
51
+
52
+ @dataclass(frozen=True, slots=True)
53
+ class ParsedResidue:
54
+ """A parsed residue object."""
55
+
56
+ name: str
57
+ type: int
58
+ idx: int
59
+ atoms: list[ParsedAtom]
60
+ bonds: list[ParsedBond]
61
+ orig_idx: Optional[int]
62
+ atom_center: int
63
+ atom_disto: int
64
+ is_standard: bool
65
+ is_present: bool
66
+
67
+
68
+ @dataclass(frozen=True, slots=True)
69
+ class ParsedChain:
70
+ """A parsed chain object."""
71
+
72
+ name: str
73
+ entity: str
74
+ type: int
75
+ residues: list[ParsedResidue]
76
+ sequence: Optional[str] = None
77
+
78
+
79
+ @dataclass(frozen=True, slots=True)
80
+ class ParsedConnection:
81
+ """A parsed connection object."""
82
+
83
+ chain_1: str
84
+ chain_2: str
85
+ residue_index_1: int
86
+ residue_index_2: int
87
+ atom_index_1: str
88
+ atom_index_2: str
89
+
90
+
91
+ @dataclass(frozen=True, slots=True)
92
+ class ParsedStructure:
93
+ """A parsed structure object."""
94
+
95
+ data: StructureV2
96
+ info: StructureInfo
97
+ sequences: dict[str, str]
98
+
99
+
100
+ ####################################################################################################
101
+ # HELPERS
102
+ ####################################################################################################
103
+
104
+
105
def get_mol(ccd: str, mols: dict, moldir: str) -> Mol:
    """Fetch the molecule for a CCD code, loading and caching it on a miss.

    Parameters
    ----------
    ccd : str
        The CCD code to look up.
    mols : dict
        Cache of already loaded molecules. Anything exposing ``get`` works;
        non-dict caches must also expose ``set(ccd, mol)``.
    moldir : str
        Directory handed to ``load_molecules`` on a cache miss.

    Returns
    -------
    Mol
        The cached or freshly loaded molecule.

    """
    cached = mols.get(ccd)
    if cached is not None:
        return cached

    # Cache miss: load from disk
    loaded = load_molecules(moldir, [ccd])[ccd]

    # Remember it for later lookups
    if isinstance(mols, dict):
        mols[ccd] = loaded
    else:
        mols.set(ccd, loaded)

    return loaded
123
+
124
+
125
def get_dates(block: gemmi.cif.Block) -> tuple[str, str, str]:
    """Get the deposited, released, and last revision dates.

    Each lookup is best-effort and independent: a missing deposition date
    no longer prevents the revision history from being read.

    Parameters
    ----------
    block : gemmi.cif.Block
        The block to process.

    Returns
    -------
    str
        The deposited date, or "" if unavailable.
    str
        The released date (first entry of the revision history), or "".
    str
        The last revision date (last entry of the revision history), or "".

    """
    deposited_key = "_pdbx_database_status.recvd_initial_deposition_date"
    revision_key = "_pdbx_audit_revision_history.revision_date"

    deposit_date = release_date = revision_date = ""

    # Deposition date lives in its own category; failure here must not
    # affect the revision-history lookup below.
    with contextlib.suppress(Exception):
        deposit_date = block.find([deposited_key])[0][0]

    # The first revision is used as the release date, the last one as the
    # most recent revision date.
    with contextlib.suppress(Exception):
        revisions = block.find([revision_key])
        release_date = revisions[0][0]
        revision_date = revisions[-1][0]

    return deposit_date, release_date, revision_date
152
+
153
+
154
def get_resolution(block: gemmi.cif.Block) -> float:
    """Read the resolution from an mmCIF block.

    Several tags are tried in a fixed order; the first one whose first row
    parses as a float wins.

    Parameters
    ----------
    block : gemmi.cif.Block
        The block to process.

    Returns
    -------
    float
        The resolution, or 0.0 if no tag yields a parseable value.

    """
    candidate_keys = (
        "_refine.ls_d_res_high",
        "_em_3d_reconstruction.resolution",
        "_reflns.d_resolution_high",
    )
    for key in candidate_keys:
        try:
            return float(block.find([key])[0].str(0))
        except Exception:  # noqa: BLE001, PERF203
            # Missing tag or unparseable value: fall through to the next key
            continue
    return 0.0
178
+
179
+
180
def get_method(block: gemmi.cif.Block) -> str:
    """Read the experimental method(s) from an mmCIF block.

    Parameters
    ----------
    block : gemmi.cif.Block
        The block to process.

    Returns
    -------
    str
        The lower-cased ``_exptl.method`` values joined by commas, or ""
        if they cannot be read.

    """
    try:
        rows = block.find(["_exptl.method"])
        return ",".join([row.str(0).lower() for row in rows])
    except Exception:  # noqa: BLE001
        # Best-effort metadata: any failure yields an empty method
        return ""
201
+
202
+
203
def _first_float(block: gemmi.cif.Block, keys: list[str]) -> Optional[float]:
    """Return the first value among ``keys`` that parses as a float, else None."""
    for key in keys:
        # Missing tags and unparseable values are skipped, not fatal
        with contextlib.suppress(Exception):
            return float(block.find([key])[0][0])
    return None


def get_experiment_conditions(
    block: gemmi.cif.Block,
) -> tuple[Optional[float], Optional[float]]:
    """Get temperature and pH.

    For both quantities the crystallography tag is tried first and the NMR
    tag second; the first value that parses as a float is used. Each key is
    attempted independently, so a missing or malformed crystallography
    value no longer prevents the NMR value from being read (this was
    already the case for temperature, and now also holds for pH).

    Parameters
    ----------
    block : gemmi.cif.Block
        The block to process.

    Returns
    -------
    tuple[Optional[float], Optional[float]]
        The temperature and pH; each is None when unavailable.

    """
    keys_t = [
        "_exptl_crystal_grow.temp",
        "_pdbx_nmr_exptl_sample_conditions.temperature",
    ]
    keys_ph = ["_exptl_crystal_grow.pH", "_pdbx_nmr_exptl_sample_conditions.pH"]

    temperature = _first_float(block, keys_t)
    ph = _first_float(block, keys_ph)

    return temperature, ph
237
+
238
+
239
def get_unk_token(dtype: gemmi.PolymerType) -> str:
    """Return the unknown-residue token for a polymer type.

    Parameters
    ----------
    dtype : gemmi.PolymerType
        The polymer type.

    Returns
    -------
    str
        The unknown token for proteins, DNA or RNA.

    Raises
    ------
    ValueError
        If the polymer type is not PeptideL, Dna or Rna.

    """
    if dtype == gemmi.PolymerType.PeptideL:
        return const.unk_token["PROTEIN"]
    if dtype == gemmi.PolymerType.Dna:
        return const.unk_token["DNA"]
    if dtype == gemmi.PolymerType.Rna:
        return const.unk_token["RNA"]

    msg = f"Unknown polymer type: {dtype}"
    raise ValueError(msg)
264
+
265
+
266
def compute_covalent_ligands(
    connections: list[gemmi.Connection],
    subchain_map: dict[tuple[str, int], str],
    entities: dict[str, gemmi.Entity],
) -> set[str]:
    """Collect the ligand subchains that take part in a covalent connection.

    Parameters
    ----------
    connections: list[gemmi.Connection]
        The connections to process. Only those of type "Covale" are used.
    subchain_map: dict[tuple[str, int], str]
        The mapping from (chain name, residue id) to subchain name. The
        residue id used for the lookup is the string formed by the sequence
        number followed by the stripped insertion code.
    entities: dict[str, gemmi.Entity]
        The entities in the structure, keyed by subchain name.

    Returns
    -------
    set
        The subchains of non-polymer or branched entities that appear on
        either side of a covalent connection.

    """
    ligand_kinds = {"NonPolymer", "Branched"}
    covalent_subchains = set()

    for link in connections:
        if link.type.name != "Covale":
            continue

        # Resolve both partners to their subchain names
        subchains = []
        for partner in (link.partner1, link.partner2):
            seqid = partner.res_id.seqid
            res_id = str(seqid.num) + str(seqid.icode).strip()
            subchains.append(subchain_map[(partner.chain_name, res_id)])

        # Keep the partners whose entity is a ligand-like type
        kinds = [entities[subchain].entity_type.name for subchain in subchains]
        for subchain, kind in zip(subchains, kinds):
            if kind in ligand_kinds:
                covalent_subchains.add(subchain)

    return covalent_subchains
317
+
318
+
319
def compute_interfaces(atom_data: np.ndarray, chain_data: np.ndarray) -> np.ndarray:
    """Find the pairs of chains that have atoms within the interface cutoff.

    Parameters
    ----------
    atom_data : np.ndarray
        The atom data; the "coords" and "is_present" fields are read.
    chain_data : np.ndarray
        The chain data; the "atom_num" field of each chain is read, and
        atoms are assumed to be laid out chain after chain in that order.

    Returns
    -------
    np.ndarray
        Array of dtype ``Interface`` with one (chain_1, chain_2) entry per
        unique unordered pair, where chain_1 < chain_2.

    """
    # Label every atom with the index of the chain it belongs to
    atom_chain = np.array(
        [
            chain_idx
            for chain_idx, chain in enumerate(chain_data)
            for _ in range(chain["atom_num"])
        ]
    )

    # Only atoms with resolved coordinates take part in the search
    present = atom_data["is_present"]
    xyz = atom_data["coords"][present]
    atom_chain = atom_chain[present]

    # Neighbours of every atom within the cutoff radius
    neighbours = KDTree(xyz, metric="euclidean").query_radius(
        xyz, const.atom_interface_cutoff
    )

    # Collect (own chain, other chain) for every cross-chain contact
    contacts = set()
    for own_chain, hit_idx in zip(atom_chain, neighbours):
        touched = np.unique(atom_chain[hit_idx])
        for other_chain in touched[touched != own_chain]:
            contacts.add((own_chain, other_chain))

    # Normalise the orientation of each pair and drop duplicates
    unique_pairs = list({(int(min(a, b)), int(max(a, b))) for a, b in contacts})
    return np.array(unique_pairs, dtype=Interface)
364
+
365
+
366
+ ####################################################################################################
367
+ # PARSING
368
+ ####################################################################################################
369
+
370
+
371
def parse_ccd_residue(  # noqa: PLR0915, C901
    name: str,
    ref_mol: Mol,
    res_idx: int,
    gemmi_mol: Optional[gemmi.Residue] = None,
    is_covalent: bool = False,
) -> Optional[ParsedResidue]:
    """Parse a CCD residue (ligand or non-standard residue).

    Atoms follow the order of the hydrogen-stripped reference molecule and
    are matched to the structure by atom name. Bonds are taken from the
    reference molecule.

    Parameters
    ----------
    name: str
        The name of the molecule to parse.
    ref_mol : Mol
        The reference molecule; each atom must carry a "name" property.
    res_idx : int
        The residue index.
    gemmi_mol : Optional[gemmi.Residue]
        The PDB molecule, as a gemmi Residue object, if any.
    is_covalent : bool
        When True, reference atoms flagged as leaving atoms are dropped if
        they are not found in the structure.

    Returns
    -------
    ParsedResidue, optional
        The output ParsedResidue.

    """
    # A missing gemmi residue means the residue is only known from the sequence
    has_structure = gemmi_mol is not None

    # Original numbering (needed later to resolve connections)
    orig_idx = None
    if has_structure:
        seqid = gemmi_mol.seqid
        orig_idx = str(seqid.num) + str(seqid.icode).strip()

    # Work on the heavy-atom reference
    heavy_mol = AllChem.RemoveHs(ref_mol, sanitize=False)

    # Single-atom components: take the first structure atom, no name matching
    if heavy_mol.GetNumAtoms() == 1:
        if has_structure:
            observed = gemmi_mol[0]
            position = (observed.pos.x, observed.pos.y, observed.pos.z)
            b_iso = observed.b_iso
        else:
            position = (0, 0, 0)
            b_iso = 0

        lone_atom = ParsedAtom(
            name=heavy_mol.GetAtoms()[0].GetProp("name"),
            coords=position,
            is_present=has_structure,
            bfactor=b_iso,
        )
        return ParsedResidue(
            name=name,
            type=const.unk_token_ids["PROTEIN"],
            atoms=[lone_atom],
            bonds=[],
            idx=res_idx,
            orig_idx=orig_idx,
            atom_center=0,  # Placeholder, no center
            atom_disto=0,  # Placeholder, no center
            is_standard=False,
            is_present=has_structure,
        )

    # Multi-atom components: index structure atoms by name
    observed_pos = {}
    observed_b = {}
    if has_structure:
        for pdb_atom in gemmi_mol:
            observed_pos[pdb_atom.name] = (
                pdb_atom.pos.x,
                pdb_atom.pos.y,
                pdb_atom.pos.z,
            )
            observed_b[pdb_atom.name] = pdb_atom.b_iso

    # Walk the reference atoms in order, remembering where each one lands
    atoms = []
    ref_to_parsed = {}  # reference atom index -> index in ``atoms``
    for ref_idx, ref_atom in enumerate(heavy_mol.GetAtoms()):
        atom_name = ref_atom.GetProp("name")

        # Leaving atoms of covalently bound components are dropped when
        # the structure does not contain them
        is_leaving = (
            ref_atom.HasProp("leaving_atom")
            and int(ref_atom.GetProp("leaving_atom")) == 1
        )
        if is_leaving and is_covalent and (atom_name not in observed_pos):
            continue

        found = atom_name in observed_pos
        ref_to_parsed[ref_idx] = len(atoms)
        atoms.append(
            ParsedAtom(
                name=atom_name,
                coords=observed_pos[atom_name] if found else (0, 0, 0),
                is_present=found,
                bfactor=observed_b.get(atom_name, 0),
            )
        )

    # Bonds from the reference, restricted to the atoms that were kept
    fallback_bond = const.bond_type_ids[const.unk_bond_type]
    bonds = []
    for ref_bond in heavy_mol.GetBonds():
        begin = ref_to_parsed.get(ref_bond.GetBeginAtomIdx())
        end = ref_to_parsed.get(ref_bond.GetEndAtomIdx())
        if begin is None or end is None:
            continue

        low, high = sorted((begin, end))
        kind = const.bond_type_ids.get(ref_bond.GetBondType().name, fallback_bond)
        bonds.append(ParsedBond(low, high, kind))

    return ParsedResidue(
        name=name,
        type=const.unk_token_ids["PROTEIN"],
        atoms=atoms,
        bonds=bonds,
        idx=res_idx,
        atom_center=0,
        atom_disto=0,
        orig_idx=orig_idx,
        is_standard=False,
        is_present=has_structure,
    )
527
+
528
+
529
def parse_polymer(  # noqa: C901, PLR0915, PLR0912
    polymer: gemmi.ResidueSpan,
    polymer_type: gemmi.PolymerType,
    sequence: list[str],
    chain_id: str,
    entity: str,
    mols: dict[str, Mol],
    moldir: str,
) -> Optional[ParsedChain]:
    """Process a gemmi Polymer into a chain object.

    Performs alignment of the full sequence to the polymer
    residues. Loads coordinates and masks for the atoms in
    the polymer, following the ordering in const.ref_atoms.

    Parameters
    ----------
    polymer : gemmi.ResidueSpan
        The polymer to process.
    polymer_type : gemmi.PolymerType
        The polymer type. Must be PeptideL, Dna or Rna.
    sequence : list[str]
        The full sequence of the polymer, one residue name per position.
    chain_id : str
        The chain identifier.
    entity : str
        The entity name.
    mols : dict[str, Mol]
        Cache of reference molecules, filled on demand.
    moldir : str
        Directory from which missing reference molecules are loaded.

    Returns
    -------
    ParsedChain, optional
        The output chain.

    Raises
    ------
    ValueError
        If the polymer type is unsupported, or if an aligned structure
        residue does not match the sequence residue.

    """
    # Resolve the chain class first: previously an unsupported polymer type
    # only surfaced at the very end as an unbound-variable error.
    if polymer_type == gemmi.PolymerType.PeptideL:
        chain_type = const.chain_type_ids["PROTEIN"]
    elif polymer_type == gemmi.PolymerType.Dna:
        chain_type = const.chain_type_ids["DNA"]
    elif polymer_type == gemmi.PolymerType.Rna:
        chain_type = const.chain_type_ids["RNA"]
    else:
        msg = f"Unsupported polymer type: {polymer_type}"
        raise ValueError(msg)

    # Ignore microheterogeneities (pick first)
    sequence = [gemmi.Entity.first_mon(item) for item in sequence]

    # Align full sequence to polymer residues
    # This is a simple way to handle all the different numbering schemes
    result = gemmi.align_sequence_to_polymer(
        sequence,
        polymer,
        polymer_type,
        gemmi.AlignmentScoring(),
    )

    # Get coordinates and masks
    i = 0  # index of the next unconsumed residue in ``polymer``
    ref_res = set(const.tokens)
    parsed = []
    for j, match in enumerate(result.match_string):
        # Get residue name from sequence
        res_name = sequence[j]

        # Check if we have a match in the structure
        res = None
        name_to_atom = {}

        if match == "|":
            # Get pdb residue
            res = polymer[i]
            name_to_atom = {a.name.upper(): a for a in res}

            # Double check the match
            if res.name != res_name:
                msg = "Alignment mismatch!"
                raise ValueError(msg)

            # Increment polymer index
            i += 1

        # Map MSE to MET, put the selenium atom in the sulphur column
        if res_name == "MSE":
            res_name = "MET"
            if "SE" in name_to_atom:
                name_to_atom["SD"] = name_to_atom["SE"]

        # Handle non-standard residues
        elif res_name not in ref_res:
            modified_mol = get_mol(res_name, mols, moldir)
            if modified_mol is not None:
                residue = parse_ccd_residue(
                    name=res_name,
                    ref_mol=modified_mol,
                    res_idx=j,
                    gemmi_mol=res,
                    is_covalent=True,
                )
                parsed.append(residue)
                continue
            else:  # noqa: RET507
                res_name = "UNK"

        # Load regular residues
        ref_mol = get_mol(res_name, mols, moldir)
        ref_mol = AllChem.RemoveHs(ref_mol, sanitize=False)

        # Only use reference atoms set in constants
        ref_name_to_atom = {a.GetProp("name"): a for a in ref_mol.GetAtoms()}
        ref_atoms = [ref_name_to_atom[a] for a in const.ref_atoms[res_name]]

        # Iterate, always in the same order
        atoms: list[ParsedAtom] = []

        for ref_atom in ref_atoms:
            # Get atom name
            atom_name = ref_atom.GetProp("name")

            # Get coordinates from PDB
            if atom_name in name_to_atom:
                atom: gemmi.Atom = name_to_atom[atom_name]
                atom_is_present = True
                coords = (atom.pos.x, atom.pos.y, atom.pos.z)
                bfactor = atom.b_iso
            else:
                atom_is_present = False
                coords = (0, 0, 0)
                bfactor = 0

            # Add atom to list
            atoms.append(
                ParsedAtom(
                    name=atom_name,
                    coords=coords,
                    is_present=atom_is_present,
                    bfactor=bfactor,
                )
            )

        # Fix naming errors in arginine residues where NH2 is
        # incorrectly assigned to be closer to CD than NH1
        if (res is not None) and (res_name == "ARG"):
            arg_names: list[str] = const.ref_atoms["ARG"]
            cd_idx = arg_names.index("CD")
            nh1_idx = arg_names.index("NH1")
            nh2_idx = arg_names.index("NH2")

            cd = atoms[cd_idx]
            nh1 = atoms[nh1_idx]
            nh2 = atoms[nh2_idx]

            cd_coords = np.array(cd.coords)
            nh1_coords = np.array(nh1.coords)
            nh2_coords = np.array(nh2.coords)

            # Only the coordinates are exchanged; the other fields stay put
            if all(atom.is_present for atom in (cd, nh1, nh2)) and (
                np.linalg.norm(nh1_coords - cd_coords)
                > np.linalg.norm(nh2_coords - cd_coords)
            ):
                atoms[nh1_idx] = replace(nh1, coords=nh2.coords)
                atoms[nh2_idx] = replace(nh2, coords=nh1.coords)

        # Add residue to parsed list
        if res is not None:
            orig_idx = res.seqid
            orig_idx = str(orig_idx.num) + str(orig_idx.icode).strip()
        else:
            orig_idx = None

        atom_center = const.res_to_center_atom_id[res_name]
        atom_disto = const.res_to_disto_atom_id[res_name]
        parsed.append(
            ParsedResidue(
                name=res_name,
                type=const.token_ids[res_name],
                atoms=atoms,
                bonds=[],
                idx=j,
                atom_center=atom_center,
                atom_disto=atom_disto,
                is_standard=True,
                is_present=res is not None,
                orig_idx=orig_idx,
            )
        )

    # Return polymer object
    return ParsedChain(
        name=chain_id,
        entity=entity,
        residues=parsed,
        type=chain_type,
        sequence=gemmi.one_letter_code(sequence),
    )
724
+
725
+
726
def parse_connection(
    connection: gemmi.Connection,
    chains: list[ParsedChain],
    subchain_map: dict[tuple[str, int], str],
) -> ParsedConnection:
    """Translate a gemmi Connection into indices over the parsed chains.

    Parameters
    ----------
    connection : gemmi.Connection
        The connection to parse.
    chains : list[ParsedChain]
        The parsed chains.
    subchain_map : dict[tuple[str, int], str]
        The mapping from (chain name, residue id) to subchain name. The
        residue id used for the lookup is the string formed by the sequence
        number followed by the stripped insertion code.

    Returns
    -------
    ParsedConnection
        The parsed connection.

    Raises
    ------
    KeyError
        If a partner is not found in ``subchain_map``.
    StopIteration
        If a partner's chain, residue or atom is not among the parsed chains.

    """
    first = connection.partner1
    second = connection.partner2

    # Residue ids in the same string form used by the parsed residues
    seqid_1 = first.res_id.seqid
    res_1_id = str(seqid_1.num) + str(seqid_1.icode).strip()
    seqid_2 = second.res_id.seqid
    res_2_id = str(seqid_2.num) + str(seqid_2.icode).strip()

    # Connections reference chains, the parsed data is keyed by subchain
    subchain_1 = subchain_map[(first.chain_name, res_1_id)]
    subchain_2 = subchain_map[(second.chain_name, res_2_id)]

    # Locate the parsed chains
    chain_1 = next(c for c in chains if c.name == subchain_1)
    chain_2 = next(c for c in chains if c.name == subchain_2)

    # Locate the residues through their original numbering
    res_1_idx, res_1 = next(
        pair for pair in enumerate(chain_1.residues) if pair[1].orig_idx == res_1_id
    )
    res_2_idx, res_2 = next(
        pair for pair in enumerate(chain_2.residues) if pair[1].orig_idx == res_2_id
    )

    # Locate the atoms by name
    atom_index_1 = next(
        pos for pos, atom in enumerate(res_1.atoms) if atom.name == first.atom_name
    )
    atom_index_2 = next(
        pos for pos, atom in enumerate(res_2.atoms) if atom.name == second.atom_name
    )

    return ParsedConnection(
        chain_1=subchain_1,
        chain_2=subchain_2,
        residue_index_1=res_1_idx,
        residue_index_2=res_2_idx,
        atom_index_1=atom_index_1,
        atom_index_2=atom_index_2,
    )
799
+
800
+
801
+ def parse_mmcif( # noqa: C901, PLR0915, PLR0912
802
+ path: str,
803
+ mols: Optional[dict[str, Mol]] = None,
804
+ moldir: Optional[str] = None,
805
+ use_assembly: bool = True,
806
+ compute_interfaces: bool = True,
807
+ ) -> ParsedStructure:
808
+ """Parse a structure in MMCIF format.
809
+
810
+ Parameters
811
+ ----------
812
+ mmcif_file : PathLike
813
+ Path to the MMCIF file.
814
+ components: Mapping[str, Mol]
815
+ The preprocessed PDB components dictionary.
816
+
817
+ Returns
818
+ -------
819
+ ParsedStructure
820
+ The parsed structure.
821
+
822
+ """
823
+ # Disable rdkit warnings
824
+ blocker = rdBase.BlockLogs() # noqa: F841
825
+
826
+ # set mols
827
+ mols = {} if mols is None else mols
828
+
829
+ # Parse MMCIF input file
830
+ block = gemmi.cif.read(str(path))[0]
831
+
832
+ # Extract metadata
833
+ deposit_date, release_date, revision_date = get_dates(block)
834
+ resolution = get_resolution(block)
835
+ method = get_method(block)
836
+ temperature, ph = get_experiment_conditions(block)
837
+
838
+ # Load structure object
839
+ structure = gemmi.make_structure_from_block(block)
840
+
841
+ # Clean up the structure
842
+ structure.merge_chain_parts()
843
+ structure.remove_waters()
844
+ structure.remove_hydrogens()
845
+ structure.remove_alternative_conformations()
846
+ structure.remove_empty_chains()
847
+
848
+ # Expand assembly 1
849
+ if use_assembly and structure.assemblies:
850
+ how = gemmi.HowToNameCopiedChain.AddNumber
851
+ assembly_name = structure.assemblies[0].name
852
+ structure.transform_to_assembly(assembly_name, how=how)
853
+
854
+ # Parse entities
855
+ # Create mapping from subchain id to entity
856
+ entities: dict[str, gemmi.Entity] = {}
857
+ entity_ids: dict[str, int] = {}
858
+ for entity_id, entity in enumerate(structure.entities):
859
+ entity: gemmi.Entity
860
+ if entity.entity_type.name == "Water":
861
+ continue
862
+ for subchain_id in entity.subchains:
863
+ entities[subchain_id] = entity
864
+ entity_ids[subchain_id] = entity_id
865
+
866
+ # Create mapping from chain, residue to subchains
867
+ # since a Connection uses the chains and not subchains
868
+ subchain_map = {}
869
+ for chain in structure[0]:
870
+ for residue in chain:
871
+ seq_id = residue.seqid
872
+ seq_id = str(seq_id.num) + str(seq_id.icode).strip()
873
+ subchain_map[(chain.name, seq_id)] = residue.subchain
874
+
875
+ # Find covalent ligands
876
+ covalent_chain_ids = compute_covalent_ligands(
877
+ connections=structure.connections,
878
+ subchain_map=subchain_map,
879
+ entities=entities,
880
+ )
881
+
882
+ # Parse chains
883
+ chains: list[ParsedChain] = []
884
+ for raw_chain in structure[0].subchains():
885
+ # Check chain type
886
+ subchain_id = raw_chain.subchain_id()
887
+ entity: gemmi.Entity = entities[subchain_id]
888
+ entity_type = entity.entity_type.name
889
+
890
+ # Parse a polymer
891
+ if entity_type == "Polymer":
892
+ # Skip PeptideD, DnaRnaHybrid, Pna, Other
893
+ if entity.polymer_type.name not in {
894
+ "PeptideL",
895
+ "Dna",
896
+ "Rna",
897
+ }:
898
+ continue
899
+
900
+ # Add polymer if successful
901
+ parsed_polymer = parse_polymer(
902
+ polymer=raw_chain,
903
+ polymer_type=entity.polymer_type,
904
+ sequence=entity.full_sequence,
905
+ chain_id=subchain_id,
906
+ entity=entity.name,
907
+ mols=mols,
908
+ moldir=moldir,
909
+ )
910
+ if parsed_polymer is not None:
911
+ chains.append(parsed_polymer)
912
+
913
+ # Parse a non-polymer
914
+ elif entity_type in {"NonPolymer", "Branched"}:
915
+ # Skip UNL
916
+ if any(lig.name == "UNL" for lig in raw_chain):
917
+ continue
918
+
919
+ residues = []
920
+ for lig_idx, ligand in enumerate(raw_chain):
921
+ # Check if ligand is covalent
922
+ if entity_type == "Branched":
923
+ is_covalent = True
924
+ else:
925
+ is_covalent = subchain_id in covalent_chain_ids
926
+
927
+ ligand: gemmi.Residue
928
+ ligand_mol = get_mol(ligand.name, mols, moldir)
929
+
930
+ residue = parse_ccd_residue(
931
+ name=ligand.name,
932
+ ref_mol=ligand_mol,
933
+ res_idx=lig_idx,
934
+ gemmi_mol=ligand,
935
+ is_covalent=is_covalent,
936
+ )
937
+ residues.append(residue)
938
+
939
+ if residues:
940
+ chains.append(
941
+ ParsedChain(
942
+ name=subchain_id,
943
+ entity=entity.name,
944
+ residues=residues,
945
+ type=const.chain_type_ids["NONPOLYMER"],
946
+ )
947
+ )
948
+
949
+ # Fail if no chains were parsed
950
+ if not chains:
951
+ msg = "No chains parsed!"
952
+ raise ValueError(msg)
953
+
954
+ # Want to traverse subchains in same order as reference structure
955
+ ref_chain_map = {ref_chain.name: i for i, ref_chain in enumerate(chains)}
956
+ all_ensembles = [chains]
957
+
958
+ # Loop through the remaining models in the structure
959
+ for struct in list(structure)[1:]:
960
+ struct: gemmi.Model
961
+ ensemble_chains = {}
962
+
963
+ for raw_chain in struct.subchains():
964
+ # Check chain type
965
+ subchain_id = raw_chain.subchain_id()
966
+ entity: gemmi.Entity = entities[subchain_id]
967
+ entity_type = entity.entity_type.name
968
+
969
+ # Parse a polymer
970
+ if entity_type == "Polymer":
971
+ # Skip PeptideD, DnaRnaHybrid, Pna, Other
972
+ if entity.polymer_type.name not in {
973
+ "PeptideL",
974
+ "Dna",
975
+ "Rna",
976
+ }:
977
+ continue
978
+
979
+ # Add polymer if successful
980
+ parsed_polymer = parse_polymer(
981
+ polymer=raw_chain,
982
+ polymer_type=entity.polymer_type,
983
+ sequence=entity.full_sequence,
984
+ chain_id=subchain_id,
985
+ entity=entity.name,
986
+ mols=mols,
987
+ moldir=moldir,
988
+ )
989
+ if parsed_polymer is not None:
990
+ ensemble_chains[ref_chain_map[subchain_id]] = parsed_polymer
991
+
992
+ # Parse a non-polymer
993
+ elif entity_type in {"NonPolymer", "Branched"}:
994
+ # Skip UNL
995
+ if any(lig.name == "UNL" for lig in raw_chain):
996
+ continue
997
+
998
+ residues = []
999
+ for lig_idx, ligand in enumerate(raw_chain):
1000
+ # Check if ligand is covalent
1001
+ if entity_type == "Branched":
1002
+ is_covalent = True
1003
+ else:
1004
+ is_covalent = subchain_id in covalent_chain_ids
1005
+
1006
+ ligand: gemmi.Residue
1007
+ ligand_mol = get_mol(ligand.name, mols, moldir)
1008
+
1009
+ residue = parse_ccd_residue(
1010
+ name=ligand.name,
1011
+ ref_mol=ligand_mol,
1012
+ res_idx=lig_idx,
1013
+ gemmi_mol=ligand,
1014
+ is_covalent=is_covalent,
1015
+ )
1016
+ residues.append(residue)
1017
+
1018
+ if residues:
1019
+ parsed_non_polymer = ParsedChain(
1020
+ name=subchain_id,
1021
+ entity=entity.name,
1022
+ residues=residues,
1023
+ type=const.chain_type_ids["NONPOLYMER"],
1024
+ )
1025
+ ensemble_chains[ref_chain_map[subchain_id]] = parsed_non_polymer
1026
+
1027
+ # Ensure ensemble chains are in the same order as reference structure
1028
+ ensemble_chains = [ensemble_chains[idx] for idx in range(len(ensemble_chains))]
1029
+ all_ensembles.append(ensemble_chains)
1030
+
1031
+ # Parse covalent connections
1032
+ connections: list[ParsedConnection] = []
1033
+ for connection in structure.connections:
1034
+ # Skip non-covalent connections
1035
+ connection: gemmi.Connection
1036
+ if connection.type.name != "Covale":
1037
+ continue
1038
+ try:
1039
+ parsed_connection = parse_connection(
1040
+ connection=connection,
1041
+ chains=chains,
1042
+ subchain_map=subchain_map,
1043
+ )
1044
+ except Exception: # noqa: S112, BLE001
1045
+ continue
1046
+ connections.append(parsed_connection)
1047
+
1048
+ # Create tables
1049
+ atom_data = []
1050
+ bond_data = []
1051
+ res_data = []
1052
+ chain_data = []
1053
+ ensemble_data = []
1054
+ coords_data = defaultdict(list)
1055
+
1056
+ # Convert parsed chains to tables
1057
+ atom_idx = 0
1058
+ res_idx = 0
1059
+ sym_count = {}
1060
+ chain_to_idx = {}
1061
+ res_to_idx = {}
1062
+ chain_to_seq = {}
1063
+
1064
+ for asym_id, chain in enumerate(chains):
1065
+ # Compute number of atoms and residues
1066
+ res_num = len(chain.residues)
1067
+ atom_num = sum(len(res.atoms) for res in chain.residues)
1068
+
1069
+ # Get same chain across models in ensemble
1070
+ ensemble_chains = [ensemble[asym_id] for ensemble in all_ensembles]
1071
+ assert len(ensemble_chains) == len(all_ensembles)
1072
+ for ensemble_chain in ensemble_chains:
1073
+ assert len(ensemble_chain.residues) == res_num
1074
+ assert sum(len(res.atoms) for res in ensemble_chain.residues) == atom_num
1075
+
1076
+ # Find all copies of this chain in the assembly
1077
+ entity_id = entity_ids[chain.name]
1078
+ sym_id = sym_count.get(entity_id, 0)
1079
+ chain_data.append(
1080
+ (
1081
+ chain.name,
1082
+ chain.type,
1083
+ entity_id,
1084
+ sym_id,
1085
+ asym_id,
1086
+ atom_idx,
1087
+ atom_num,
1088
+ res_idx,
1089
+ res_num,
1090
+ 0,
1091
+ )
1092
+ )
1093
+ chain_to_idx[chain.name] = asym_id
1094
+ sym_count[entity_id] = sym_id + 1
1095
+ if chain.sequence is not None:
1096
+ chain_to_seq[chain.name] = chain.sequence
1097
+
1098
+ # Add residue, atom, and bond data
1099
+ for i, res in enumerate(chain.residues):
1100
+ # Get same residue across models in ensemble
1101
+ ensemble_residues = [
1102
+ ensemble_chain.residues[i] for ensemble_chain in ensemble_chains
1103
+ ]
1104
+ assert len(ensemble_residues) == len(all_ensembles)
1105
+ for ensemble_res in ensemble_residues:
1106
+ assert ensemble_res.name == res.name
1107
+
1108
+ atom_center = atom_idx + res.atom_center
1109
+ atom_disto = atom_idx + res.atom_disto
1110
+ res_data.append(
1111
+ (
1112
+ res.name,
1113
+ res.type,
1114
+ res.idx,
1115
+ atom_idx,
1116
+ len(res.atoms),
1117
+ atom_center,
1118
+ atom_disto,
1119
+ res.is_standard,
1120
+ res.is_present,
1121
+ )
1122
+ )
1123
+ res_to_idx[(chain.name, i)] = (res_idx, atom_idx)
1124
+
1125
+ for bond in res.bonds:
1126
+ chain_1 = asym_id
1127
+ chain_2 = asym_id
1128
+ res_1 = res_idx
1129
+ res_2 = res_idx
1130
+ atom_1 = atom_idx + bond.atom_1
1131
+ atom_2 = atom_idx + bond.atom_2
1132
+ bond_data.append(
1133
+ (
1134
+ chain_1,
1135
+ chain_2,
1136
+ res_1,
1137
+ res_2,
1138
+ atom_1,
1139
+ atom_2,
1140
+ bond.type,
1141
+ )
1142
+ )
1143
+
1144
+ for a_idx, atom in enumerate(res.atoms):
1145
+ # Get same atom across models in ensemble
1146
+ ensemble_atoms = [
1147
+ ensemble_res.atoms[a_idx] for ensemble_res in ensemble_residues
1148
+ ]
1149
+ assert len(ensemble_atoms) == len(all_ensembles)
1150
+ for e_idx, ensemble_atom in enumerate(ensemble_atoms):
1151
+ assert ensemble_atom.name == atom.name
1152
+ assert atom.is_present == ensemble_atom.is_present
1153
+
1154
+ coords_data[e_idx].append(ensemble_atom.coords)
1155
+
1156
+ atom_data.append(
1157
+ (
1158
+ atom.name,
1159
+ atom.coords,
1160
+ atom.is_present,
1161
+ atom.bfactor,
1162
+ 1.0, # plddt is 1 for real data
1163
+ )
1164
+ )
1165
+ atom_idx += 1
1166
+
1167
+ res_idx += 1
1168
+
1169
+ # Create coordinates table
1170
+ coords_data_ = []
1171
+ for e_idx in range(len(coords_data)):
1172
+ ensemble_data.append((e_idx * atom_idx, atom_idx))
1173
+ coords_data_.append(coords_data[e_idx])
1174
+ coords_data = [(x,) for xs in coords_data_ for x in xs]
1175
+
1176
+ # Convert connections to tables
1177
+ for conn in connections:
1178
+ chain_1_idx = chain_to_idx[conn.chain_1]
1179
+ chain_2_idx = chain_to_idx[conn.chain_2]
1180
+ res_1_idx, atom_1_offset = res_to_idx[(conn.chain_1, conn.residue_index_1)]
1181
+ res_2_idx, atom_2_offset = res_to_idx[(conn.chain_2, conn.residue_index_2)]
1182
+ atom_1_idx = atom_1_offset + conn.atom_index_1
1183
+ atom_2_idx = atom_2_offset + conn.atom_index_2
1184
+ bond_data.append(
1185
+ (
1186
+ chain_1_idx,
1187
+ chain_2_idx,
1188
+ res_1_idx,
1189
+ res_2_idx,
1190
+ atom_1_idx,
1191
+ atom_2_idx,
1192
+ const.bond_type_ids["COVALENT"],
1193
+ )
1194
+ )
1195
+
1196
+ # Convert into datatypes
1197
+ atoms = np.array(atom_data, dtype=AtomV2)
1198
+ bonds = np.array(bond_data, dtype=BondV2)
1199
+ residues = np.array(res_data, dtype=Residue)
1200
+ chains = np.array(chain_data, dtype=Chain)
1201
+ mask = np.ones(len(chain_data), dtype=bool)
1202
+ ensemble = np.array(ensemble_data, dtype=Ensemble)
1203
+ coords = np.array(coords_data, dtype=Coords)
1204
+
1205
+ # Compute interface chains (find chains with a heavy atom within 5A)
1206
+ if compute_interfaces:
1207
+ interfaces = compute_interfaces(atoms, chains)
1208
+ else:
1209
+ interfaces = np.array([], dtype=Interface)
1210
+
1211
+ # Return parsed structure
1212
+ info = StructureInfo(
1213
+ deposited=deposit_date,
1214
+ revised=revision_date,
1215
+ released=release_date,
1216
+ resolution=resolution,
1217
+ method=method,
1218
+ num_chains=len(chains),
1219
+ num_interfaces=len(interfaces),
1220
+ temperature=temperature,
1221
+ pH=ph,
1222
+ )
1223
+
1224
+ data = StructureV2(
1225
+ atoms=atoms,
1226
+ bonds=bonds,
1227
+ residues=residues,
1228
+ chains=chains,
1229
+ interfaces=interfaces,
1230
+ mask=mask,
1231
+ ensemble=ensemble,
1232
+ coords=coords,
1233
+ )
1234
+
1235
+ return ParsedStructure(
1236
+ data=data,
1237
+ info=info,
1238
+ sequences=chain_to_seq,
1239
+ )