biotite 1.1.0__cp311-cp311-win_amd64.whl → 1.3.0__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (160) hide show
  1. biotite/application/application.py +3 -3
  2. biotite/application/autodock/app.py +1 -1
  3. biotite/application/blast/webapp.py +1 -1
  4. biotite/application/clustalo/app.py +1 -1
  5. biotite/application/localapp.py +2 -2
  6. biotite/application/msaapp.py +10 -10
  7. biotite/application/muscle/app3.py +3 -3
  8. biotite/application/muscle/app5.py +3 -3
  9. biotite/application/sra/app.py +0 -5
  10. biotite/application/util.py +21 -1
  11. biotite/application/viennarna/rnaalifold.py +8 -8
  12. biotite/application/viennarna/rnaplot.py +10 -8
  13. biotite/application/viennarna/util.py +1 -1
  14. biotite/application/webapp.py +1 -1
  15. biotite/database/afdb/__init__.py +12 -0
  16. biotite/database/afdb/download.py +191 -0
  17. biotite/database/entrez/dbnames.py +10 -0
  18. biotite/database/entrez/download.py +9 -10
  19. biotite/database/entrez/key.py +1 -1
  20. biotite/database/entrez/query.py +5 -4
  21. biotite/database/pubchem/download.py +6 -6
  22. biotite/database/pubchem/error.py +10 -0
  23. biotite/database/pubchem/query.py +12 -23
  24. biotite/database/rcsb/download.py +3 -2
  25. biotite/database/rcsb/query.py +2 -3
  26. biotite/database/uniprot/check.py +2 -2
  27. biotite/database/uniprot/download.py +2 -5
  28. biotite/database/uniprot/query.py +3 -4
  29. biotite/file.py +14 -2
  30. biotite/interface/__init__.py +19 -0
  31. biotite/interface/openmm/__init__.py +20 -0
  32. biotite/interface/openmm/state.py +93 -0
  33. biotite/interface/openmm/system.py +227 -0
  34. biotite/interface/pymol/__init__.py +201 -0
  35. biotite/interface/pymol/cgo.py +346 -0
  36. biotite/interface/pymol/convert.py +185 -0
  37. biotite/interface/pymol/display.py +267 -0
  38. biotite/interface/pymol/object.py +1226 -0
  39. biotite/interface/pymol/shapes.py +178 -0
  40. biotite/interface/pymol/startup.py +169 -0
  41. biotite/interface/rdkit/__init__.py +19 -0
  42. biotite/interface/rdkit/mol.py +490 -0
  43. biotite/interface/version.py +94 -0
  44. biotite/interface/warning.py +19 -0
  45. biotite/sequence/align/__init__.py +0 -4
  46. biotite/sequence/align/alignment.py +33 -11
  47. biotite/sequence/align/banded.cp311-win_amd64.pyd +0 -0
  48. biotite/sequence/align/banded.pyx +22 -22
  49. biotite/sequence/align/cigar.py +2 -2
  50. biotite/sequence/align/kmeralphabet.cp311-win_amd64.pyd +0 -0
  51. biotite/sequence/align/kmeralphabet.pyx +2 -2
  52. biotite/sequence/align/kmersimilarity.cp311-win_amd64.pyd +0 -0
  53. biotite/sequence/align/kmertable.cp311-win_amd64.pyd +0 -0
  54. biotite/sequence/align/kmertable.pyx +6 -6
  55. biotite/sequence/align/localgapped.cp311-win_amd64.pyd +0 -0
  56. biotite/sequence/align/localgapped.pyx +47 -47
  57. biotite/sequence/align/localungapped.cp311-win_amd64.pyd +0 -0
  58. biotite/sequence/align/localungapped.pyx +10 -10
  59. biotite/sequence/align/matrix.py +12 -3
  60. biotite/sequence/align/multiple.cp311-win_amd64.pyd +0 -0
  61. biotite/sequence/align/multiple.pyx +1 -2
  62. biotite/sequence/align/pairwise.cp311-win_amd64.pyd +0 -0
  63. biotite/sequence/align/pairwise.pyx +37 -39
  64. biotite/sequence/align/permutation.cp311-win_amd64.pyd +0 -0
  65. biotite/sequence/align/selector.cp311-win_amd64.pyd +0 -0
  66. biotite/sequence/align/selector.pyx +2 -2
  67. biotite/sequence/align/statistics.py +1 -1
  68. biotite/sequence/align/tracetable.cp311-win_amd64.pyd +0 -0
  69. biotite/sequence/alphabet.py +2 -2
  70. biotite/sequence/annotation.py +19 -13
  71. biotite/sequence/codec.cp311-win_amd64.pyd +0 -0
  72. biotite/sequence/codon.py +1 -2
  73. biotite/sequence/graphics/alignment.py +25 -39
  74. biotite/sequence/graphics/dendrogram.py +4 -2
  75. biotite/sequence/graphics/features.py +2 -2
  76. biotite/sequence/graphics/logo.py +10 -12
  77. biotite/sequence/io/fasta/convert.py +1 -2
  78. biotite/sequence/io/fasta/file.py +1 -1
  79. biotite/sequence/io/fastq/file.py +3 -3
  80. biotite/sequence/io/genbank/file.py +3 -3
  81. biotite/sequence/io/genbank/sequence.py +2 -0
  82. biotite/sequence/io/gff/convert.py +1 -1
  83. biotite/sequence/io/gff/file.py +1 -2
  84. biotite/sequence/phylo/nj.cp311-win_amd64.pyd +0 -0
  85. biotite/sequence/phylo/tree.cp311-win_amd64.pyd +0 -0
  86. biotite/sequence/phylo/upgma.cp311-win_amd64.pyd +0 -0
  87. biotite/sequence/profile.py +19 -25
  88. biotite/sequence/search.py +0 -1
  89. biotite/sequence/seqtypes.py +12 -5
  90. biotite/sequence/sequence.py +1 -2
  91. biotite/structure/__init__.py +2 -0
  92. biotite/structure/alphabet/i3d.py +1 -2
  93. biotite/structure/alphabet/pb.py +1 -2
  94. biotite/structure/alphabet/unkerasify.py +8 -2
  95. biotite/structure/atoms.py +35 -27
  96. biotite/structure/basepairs.py +39 -40
  97. biotite/structure/bonds.cp311-win_amd64.pyd +0 -0
  98. biotite/structure/bonds.pyx +8 -5
  99. biotite/structure/box.py +159 -23
  100. biotite/structure/celllist.cp311-win_amd64.pyd +0 -0
  101. biotite/structure/celllist.pyx +83 -68
  102. biotite/structure/chains.py +17 -55
  103. biotite/structure/charges.cp311-win_amd64.pyd +0 -0
  104. biotite/structure/compare.py +420 -13
  105. biotite/structure/density.py +1 -1
  106. biotite/structure/dotbracket.py +31 -32
  107. biotite/structure/filter.py +8 -8
  108. biotite/structure/geometry.py +15 -15
  109. biotite/structure/graphics/rna.py +19 -16
  110. biotite/structure/hbond.py +18 -21
  111. biotite/structure/info/atoms.py +11 -2
  112. biotite/structure/info/ccd.py +0 -2
  113. biotite/structure/info/components.bcif +0 -0
  114. biotite/structure/info/groups.py +0 -3
  115. biotite/structure/info/misc.py +0 -1
  116. biotite/structure/info/radii.py +92 -22
  117. biotite/structure/info/standardize.py +1 -2
  118. biotite/structure/integrity.py +4 -6
  119. biotite/structure/io/general.py +2 -2
  120. biotite/structure/io/gro/file.py +8 -9
  121. biotite/structure/io/mol/convert.py +1 -1
  122. biotite/structure/io/mol/ctab.py +33 -28
  123. biotite/structure/io/mol/mol.py +1 -1
  124. biotite/structure/io/mol/sdf.py +39 -13
  125. biotite/structure/io/pdb/convert.py +86 -5
  126. biotite/structure/io/pdb/file.py +90 -24
  127. biotite/structure/io/pdb/hybrid36.cp311-win_amd64.pyd +0 -0
  128. biotite/structure/io/pdbqt/file.py +4 -4
  129. biotite/structure/io/pdbx/bcif.py +22 -7
  130. biotite/structure/io/pdbx/cif.py +20 -7
  131. biotite/structure/io/pdbx/component.py +6 -0
  132. biotite/structure/io/pdbx/compress.py +71 -34
  133. biotite/structure/io/pdbx/convert.py +429 -77
  134. biotite/structure/io/pdbx/encoding.cp311-win_amd64.pyd +0 -0
  135. biotite/structure/io/pdbx/encoding.pyx +39 -23
  136. biotite/structure/io/trajfile.py +9 -6
  137. biotite/structure/io/util.py +38 -0
  138. biotite/structure/mechanics.py +0 -1
  139. biotite/structure/molecules.py +0 -15
  140. biotite/structure/pseudoknots.py +13 -19
  141. biotite/structure/repair.py +2 -4
  142. biotite/structure/residues.py +20 -48
  143. biotite/structure/rings.py +335 -0
  144. biotite/structure/sasa.cp311-win_amd64.pyd +0 -0
  145. biotite/structure/sasa.pyx +30 -30
  146. biotite/structure/segments.py +123 -9
  147. biotite/structure/sequence.py +0 -1
  148. biotite/structure/spacegroups.json +1567 -0
  149. biotite/structure/spacegroups.license +26 -0
  150. biotite/structure/sse.py +0 -2
  151. biotite/structure/superimpose.py +75 -253
  152. biotite/structure/tm.py +581 -0
  153. biotite/structure/transform.py +232 -26
  154. biotite/structure/util.py +3 -3
  155. biotite/version.py +9 -4
  156. biotite/visualize.py +111 -1
  157. {biotite-1.1.0.dist-info → biotite-1.3.0.dist-info}/METADATA +8 -36
  158. {biotite-1.1.0.dist-info → biotite-1.3.0.dist-info}/RECORD +160 -138
  159. {biotite-1.1.0.dist-info → biotite-1.3.0.dist-info}/WHEEL +1 -1
  160. {biotite-1.1.0.dist-info → biotite-1.3.0.dist-info}/licenses/LICENSE.rst +0 -0
@@ -3,7 +3,7 @@
3
3
  # information.
4
4
 
5
5
  __name__ = "biotite.structure.io.pdbx"
6
- __author__ = "Fabrice Allain, Patrick Kunzmann"
6
+ __author__ = "Fabrice Allain, Patrick Kunzmann, Cheyenne Ziegler"
7
7
  __all__ = [
8
8
  "get_sequence",
9
9
  "get_model_count",
@@ -13,16 +13,30 @@ __all__ = [
13
13
  "set_component",
14
14
  "list_assemblies",
15
15
  "get_assembly",
16
+ "get_unit_cell",
17
+ "get_sse",
16
18
  ]
17
19
 
18
20
  import itertools
19
21
  import warnings
22
+ from collections import defaultdict
20
23
  import numpy as np
21
24
  from biotite.file import InvalidFileError
22
25
  from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
23
- from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat
26
+ from biotite.structure.atoms import (
27
+ AtomArray,
28
+ AtomArrayStack,
29
+ concatenate,
30
+ repeat,
31
+ )
24
32
  from biotite.structure.bonds import BondList, BondType, connect_via_residue_names
25
- from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
33
+ from biotite.structure.box import (
34
+ coord_to_fraction,
35
+ fraction_to_coord,
36
+ space_group_transforms,
37
+ unitcell_from_vectors,
38
+ vectors_from_unitcell,
39
+ )
26
40
  from biotite.structure.error import BadStructureError
27
41
  from biotite.structure.filter import _canonical_aa_list as canonical_aa_list
28
42
  from biotite.structure.filter import (
@@ -32,6 +46,7 @@ from biotite.structure.filter import (
32
46
  filter_first_altloc,
33
47
  filter_highest_occupancy_altloc,
34
48
  )
49
+ from biotite.structure.geometry import centroid
35
50
  from biotite.structure.io.pdbx.bcif import (
36
51
  BinaryCIFBlock,
37
52
  BinaryCIFColumn,
@@ -45,7 +60,7 @@ from biotite.structure.residues import (
45
60
  get_residue_positions,
46
61
  get_residue_starts_for,
47
62
  )
48
- from biotite.structure.util import matrix_rotate
63
+ from biotite.structure.transform import AffineTransformation
49
64
 
50
65
  # Bond types in `struct_conn` category that refer to covalent bonds
51
66
  PDBX_BOND_TYPE_ID_TO_TYPE = {
@@ -81,6 +96,7 @@ PDBX_BOND_TYPE_TO_ORDER = {
81
96
  BondType.AROMATIC_TRIPLE: "trip",
82
97
  # These are masked later, it is merely added here to avoid a KeyError
83
98
  BondType.ANY: "",
99
+ BondType.AROMATIC: "",
84
100
  BondType.COORDINATION: "",
85
101
  }
86
102
  # Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'...
@@ -92,12 +108,19 @@ COMP_BOND_ORDER_TO_TYPE = {
92
108
  ("SING", "Y"): BondType.AROMATIC_SINGLE,
93
109
  ("DOUB", "Y"): BondType.AROMATIC_DOUBLE,
94
110
  ("TRIP", "Y"): BondType.AROMATIC_TRIPLE,
111
+ ("AROM", "Y"): BondType.AROMATIC,
95
112
  }
96
113
  # ...and vice versa
97
114
  COMP_BOND_TYPE_TO_ORDER = {
98
115
  bond_type: order for order, bond_type in COMP_BOND_ORDER_TO_TYPE.items()
99
116
  }
100
117
  CANONICAL_RESIDUE_LIST = canonical_aa_list + canonical_nucleotide_list
118
+ # it was observed that when the number of rows in `atom_site` and `struct_conn`
119
+ # exceed a certain threshold,
120
+ # a dictionary approach is less computation and memory intensive than the dense
121
+ # vectorized approach.
122
+ # https://github.com/biotite-dev/biotite/pull/765#issuecomment-2708867357
123
+ FIND_MATCHES_SWITCH_THRESHOLD = 4000000
101
124
 
102
125
  _proteinseq_type_list = ["polypeptide(D)", "polypeptide(L)"]
103
126
  _nucleotideseq_type_list = [
@@ -116,8 +139,7 @@ _other_type_list = [
116
139
 
117
140
  def _filter(category, index):
118
141
  """
119
- Reduce the ``atom_site`` category to the values for the given
120
- model.
142
+ Reduce the given category to the values selected by the given index.
121
143
  """
122
144
  Category = type(category)
123
145
  Column = Category.subcomponent_class()
@@ -160,8 +182,8 @@ def get_sequence(pdbx_file, data_block=None):
160
182
  -------
161
183
  sequence_dict : Dictionary of Sequences
162
184
  Dictionary keys are derived from ``entity_poly.pdbx_strand_id``
163
- (often equivalent to chain_id and atom_site.auth_asym_id
164
- in most cases). Dictionary values are sequences.
185
+ (equivalent to ``atom_site.auth_asym_id``).
186
+ Dictionary values are sequences.
165
187
 
166
188
  Notes
167
189
  -----
@@ -217,9 +239,7 @@ def get_model_count(pdbx_file, data_block=None):
217
239
  The number of models.
218
240
  """
219
241
  block = _get_block(pdbx_file, data_block)
220
- return len(
221
- _get_model_starts(block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32))
222
- )
242
+ return len(np.unique((block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32))))
223
243
 
224
244
 
225
245
  def get_structure(
@@ -310,7 +330,6 @@ def get_structure(
310
330
  >>> arr = get_structure(file, model=1)
311
331
  >>> print(len(arr))
312
332
  304
313
-
314
333
  """
315
334
  block = _get_block(pdbx_file, data_block)
316
335
 
@@ -321,13 +340,12 @@ def get_structure(
321
340
  raise InvalidFileError("Missing 'atom_site' category in file")
322
341
 
323
342
  models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
324
- model_starts = _get_model_starts(models)
325
- model_count = len(model_starts)
343
+ model_count = len(np.unique(models))
326
344
  atom_count = len(models)
327
345
 
328
346
  if model is None:
329
347
  # For a stack, the annotations are derived from the first model
330
- model_atom_site = _filter_model(atom_site, model_starts, 1)
348
+ model_atom_site = _filter_model(atom_site, 1)
331
349
  # Any field of the category would work here to get the length
332
350
  model_length = model_atom_site.row_count
333
351
  atoms = AtomArrayStack(model_count, model_length)
@@ -373,7 +391,7 @@ def get_structure(
373
391
  f"the given model {model} does not exist"
374
392
  )
375
393
 
376
- model_atom_site = _filter_model(atom_site, model_starts, model)
394
+ model_atom_site = _filter_model(atom_site, model)
377
395
  # Any field of the category would work here to get the length
378
396
  model_length = model_atom_site.row_count
379
397
  atoms = AtomArray(model_length)
@@ -386,7 +404,16 @@ def get_structure(
386
404
 
387
405
  # The below part is the same for both, AtomArray and AtomArrayStack
388
406
  _fill_annotations(atoms, model_atom_site, extra_fields, use_author_fields)
407
+
408
+ atoms, altloc_filtered_atom_site = _filter_altloc(atoms, model_atom_site, altloc)
409
+
389
410
  if include_bonds:
411
+ if altloc == "all":
412
+ raise ValueError(
413
+ "Bond computation is not supported with `altloc='all', consider using "
414
+ "'connect_via_residue_names()' afterwards"
415
+ )
416
+
390
417
  if "chem_comp_bond" in block:
391
418
  try:
392
419
  custom_bond_dict = _parse_intra_residue_bonds(block["chem_comp_bond"])
@@ -402,10 +429,13 @@ def get_structure(
402
429
  bonds = connect_via_residue_names(atoms)
403
430
  if "struct_conn" in block:
404
431
  bonds = bonds.merge(
405
- _parse_inter_residue_bonds(model_atom_site, block["struct_conn"])
432
+ _parse_inter_residue_bonds(
433
+ altloc_filtered_atom_site,
434
+ block["struct_conn"],
435
+ atom_count=atoms.array_length(),
436
+ )
406
437
  )
407
438
  atoms.bonds = bonds
408
- atoms = _filter_altloc(atoms, model_atom_site, altloc)
409
439
 
410
440
  return atoms
411
441
 
@@ -565,11 +595,12 @@ def _parse_intra_residue_bonds(chem_comp_bond):
565
595
  return custom_bond_dict
566
596
 
567
597
 
568
- def _parse_inter_residue_bonds(atom_site, struct_conn):
598
+ def _parse_inter_residue_bonds(atom_site, struct_conn, atom_count=None):
569
599
  """
570
600
  Create inter-residue bonds by parsing the ``struct_conn`` category.
571
601
  The atom indices of each bond are found by matching the bond labels
572
602
  to the ``atom_site`` category.
603
+ If atom_count is None, it will be inferred from the ``atom_site`` category.
573
604
  """
574
605
  # Identity symmetry operation
575
606
  IDENTITY = "1_555"
@@ -638,7 +669,7 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
638
669
  bond_types = [PDBX_BOND_TYPE_ID_TO_TYPE[type_id] for type_id in bond_type_id]
639
670
 
640
671
  return BondList(
641
- atom_site.row_count,
672
+ atom_count if atom_count is not None else atom_site.row_count,
642
673
  np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1),
643
674
  )
644
675
 
@@ -649,6 +680,17 @@ def _find_matches(query_arrays, reference_arrays):
649
680
  `reference_arrays` where all query values match the reference counterpart.
650
681
  If no match is found for a query, the corresponding index is -1.
651
682
  """
683
+ if (
684
+ query_arrays[0].shape[0] * reference_arrays[0].shape[0]
685
+ <= FIND_MATCHES_SWITCH_THRESHOLD
686
+ ):
687
+ match_indices = _find_matches_by_dense_array(query_arrays, reference_arrays)
688
+ else:
689
+ match_indices = _find_matches_by_dict(query_arrays, reference_arrays)
690
+ return match_indices
691
+
692
+
693
+ def _find_matches_by_dense_array(query_arrays, reference_arrays):
652
694
  match_masks_for_all_columns = np.stack(
653
695
  [
654
696
  query[:, np.newaxis] == reference[np.newaxis, :]
@@ -676,6 +718,38 @@ def _find_matches(query_arrays, reference_arrays):
676
718
  return match_indices
677
719
 
678
720
 
721
+ def _find_matches_by_dict(query_arrays, reference_arrays):
722
+ # Convert reference arrays to a dictionary for O(1) lookups
723
+ reference_dict = {}
724
+ ambiguous_keys = set()
725
+ for ref_idx, ref_row in enumerate(zip(*reference_arrays)):
726
+ ref_key = tuple(ref_row)
727
+ if ref_key in reference_dict:
728
+ ambiguous_keys.add(ref_key)
729
+ continue
730
+ reference_dict[ref_key] = ref_idx
731
+
732
+ match_indices = []
733
+ for query_idx, query_row in enumerate(zip(*query_arrays)):
734
+ query_key = tuple(query_row)
735
+ occurrence = reference_dict.get(query_key)
736
+
737
+ if occurrence is None:
738
+ # -1 indicates that no match was found in the reference
739
+ match_indices.append(-1)
740
+ elif query_key in ambiguous_keys:
741
+ # The query cannot be uniquely matched to an atom in the reference
742
+ raise InvalidFileError(
743
+ f"The covalent bond in the 'struct_conn' category at index "
744
+ f"{query_idx} cannot be unambiguously assigned to atoms in "
745
+ f"the 'atom_site' category"
746
+ )
747
+ else:
748
+ match_indices.append(occurrence)
749
+
750
+ return np.array(match_indices)
751
+
752
+
679
753
  def _get_struct_conn_col_name(col_name, partner):
680
754
  """
681
755
  For a column name in ``atom_site`` get the corresponding column name
@@ -691,44 +765,52 @@ def _get_struct_conn_col_name(col_name, partner):
691
765
 
692
766
 
693
767
  def _filter_altloc(array, atom_site, altloc):
768
+ """
769
+ Filter the given :class:`AtomArray` and ``atom_site`` category to the rows
770
+ specified by the given *altloc* identifier.
771
+ """
694
772
  altloc_ids = atom_site.get("label_alt_id")
695
773
  occupancy = atom_site.get("occupancy")
696
774
 
697
- # Filter altloc IDs and return
698
- if altloc_ids is None:
699
- return array
775
+ if altloc == "all":
776
+ array.set_annotation("altloc_id", altloc_ids.as_array(str))
777
+ return array, atom_site
778
+ elif altloc_ids is None or (altloc_ids.mask.array != MaskValue.PRESENT).all():
779
+ # No altlocs in atom_site category
780
+ return array, atom_site
700
781
  elif altloc == "occupancy" and occupancy is not None:
701
- return array[
702
- ...,
703
- filter_highest_occupancy_altloc(
704
- array, altloc_ids.as_array(str), occupancy.as_array(float)
705
- ),
706
- ]
782
+ mask = filter_highest_occupancy_altloc(
783
+ array, altloc_ids.as_array(str), occupancy.as_array(float)
784
+ )
785
+ return array[..., mask], _filter(atom_site, mask)
707
786
  # 'first' is also fallback if file has no occupancy information
708
787
  elif altloc == "first":
709
- return array[..., filter_first_altloc(array, altloc_ids.as_array(str))]
710
- elif altloc == "all":
711
- array.set_annotation("altloc_id", altloc_ids.as_array(str))
712
- return array
788
+ mask = filter_first_altloc(array, altloc_ids.as_array(str))
789
+ return array[..., mask], _filter(atom_site, mask)
713
790
  else:
714
791
  raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
715
792
 
716
793
 
717
- def _get_model_starts(model_array):
718
- """
719
- Get the start index for each model in the arrays of the
720
- ``atom_site`` category.
721
- """
722
- _, indices = np.unique(model_array, return_index=True)
723
- indices.sort()
724
- return indices
725
-
726
-
727
- def _filter_model(atom_site, model_starts, model):
794
+ def _filter_model(atom_site, model):
728
795
  """
729
796
  Reduce the ``atom_site`` category to the values for the given
730
797
  model.
798
+
799
+ Parameters
800
+ ----------
801
+ atom_site : CIFCategory or BinaryCIFCategory
802
+ ``atom_site`` category containing all models.
803
+ model : int
804
+ The model to be selected.
805
+
806
+ Returns
807
+ -------
808
+ atom_site : CIFCategory or BinaryCIFCategory
809
+ The ``atom_site`` category containing only the selected model.
731
810
  """
811
+ models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
812
+ _, model_starts = np.unique(models, return_index=True)
813
+ model_starts.sort()
732
814
  # Append exclusive stop
733
815
  model_starts = np.append(model_starts, [atom_site.row_count])
734
816
  # Indexing starts at 0, but model number starts at 1
@@ -815,7 +897,6 @@ def set_structure(
815
897
  >>> file = CIFFile()
816
898
  >>> set_structure(file, atom_array)
817
899
  >>> file.write(os.path.join(path_to_directory, "structure.cif"))
818
-
819
900
  """
820
901
  _check_non_empty(array)
821
902
 
@@ -836,7 +917,11 @@ def set_structure(
836
917
  )
837
918
  atom_site["label_comp_id"] = np.copy(array.res_name)
838
919
  atom_site["label_asym_id"] = np.copy(array.chain_id)
839
- atom_site["label_entity_id"] = _determine_entity_id(array.chain_id)
920
+ atom_site["label_entity_id"] = (
921
+ np.copy(array.label_entity_id)
922
+ if "label_entity_id" in array.get_annotation_categories()
923
+ else _determine_entity_id(array.chain_id)
924
+ )
840
925
  atom_site["label_seq_id"] = np.copy(array.res_id)
841
926
  atom_site["pdbx_PDB_ins_code"] = Column(
842
927
  np.copy(array.ins_code),
@@ -1181,7 +1266,13 @@ def _filter_canonical_links(array, bond_array):
1181
1266
  ) # fmt: skip
1182
1267
 
1183
1268
 
1184
- def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=None):
1269
+ def get_component(
1270
+ pdbx_file,
1271
+ data_block=None,
1272
+ use_ideal_coord=True,
1273
+ res_name=None,
1274
+ allow_missing_coord=False,
1275
+ ):
1185
1276
  """
1186
1277
  Create an :class:`AtomArray` for a chemical component from the
1187
1278
  ``chem_comp_atom`` and, if available, the ``chem_comp_bond``
@@ -1209,6 +1300,11 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
1209
1300
  In this case, the component with the given residue name is
1210
1301
  read.
1211
1302
  By default, all rows would be read in this case.
1303
+ allow_missing_coord : bool, optional
1304
+ Whether to allow missing coordinate values in components.
1305
+ If ``True``, these will be represented as ``nan`` values.
1306
+ If ``False``, a ``ValueError`` is raised when missing coordinates
1307
+ are encountered.
1212
1308
 
1213
1309
  Returns
1214
1310
  -------
@@ -1299,7 +1395,8 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
1299
1395
  else:
1300
1396
  raise
1301
1397
  array.coord = _parse_component_coordinates(
1302
- [atom_category[field] for field in alt_coord_fields]
1398
+ [atom_category[field] for field in alt_coord_fields],
1399
+ allow_missing=allow_missing_coord,
1303
1400
  )
1304
1401
 
1305
1402
  try:
@@ -1310,7 +1407,7 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
1310
1407
  )
1311
1408
  except KeyError:
1312
1409
  warnings.warn(
1313
- "Category 'chem_comp_bond' not found. " "No bonds will be parsed",
1410
+ "Category 'chem_comp_bond' not found. No bonds will be parsed",
1314
1411
  UserWarning,
1315
1412
  )
1316
1413
  else:
@@ -1330,14 +1427,20 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
1330
1427
  return array
1331
1428
 
1332
1429
 
1333
- def _parse_component_coordinates(coord_columns):
1430
+ def _parse_component_coordinates(coord_columns, allow_missing=False):
1334
1431
  coord = np.zeros((len(coord_columns[0]), 3), dtype=np.float32)
1335
1432
  for i, column in enumerate(coord_columns):
1336
1433
  if column.mask is not None and column.mask.array.any():
1337
- raise ValueError(
1338
- "Missing coordinates for some atoms",
1339
- )
1340
- coord[:, i] = column.as_array(np.float32)
1434
+ if allow_missing:
1435
+ warnings.warn(
1436
+ "Missing coordinates for some atoms. Those will be set to nan",
1437
+ UserWarning,
1438
+ )
1439
+ else:
1440
+ raise ValueError(
1441
+ "Missing coordinates for some atoms",
1442
+ )
1443
+ coord[:, i] = column.as_array(np.float32, masked_value=np.nan)
1341
1444
  return coord
1342
1445
 
1343
1446
 
@@ -1445,6 +1548,7 @@ def list_assemblies(pdbx_file, data_block=None):
1445
1548
 
1446
1549
  Examples
1447
1550
  --------
1551
+
1448
1552
  >>> import os.path
1449
1553
  >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
1450
1554
  >>> assembly_ids = list_assemblies(file)
@@ -1611,7 +1715,7 @@ def get_assembly(
1611
1715
  )
1612
1716
 
1613
1717
  ### Get transformations and apply them to the affected asym IDs
1614
- assembly = None
1718
+ chain_ops = defaultdict(list)
1615
1719
  for id, op_expr, asym_id_expr in zip(
1616
1720
  assembly_gen_category["assembly_id"].as_array(str),
1617
1721
  assembly_gen_category["oper_expression"].as_array(str),
@@ -1620,19 +1724,22 @@ def get_assembly(
1620
1724
  # Find the operation expressions for given assembly ID
1621
1725
  # We already asserted that the ID is actually present
1622
1726
  if id == assembly_id:
1623
- operations = _parse_operation_expression(op_expr)
1624
- asym_ids = asym_id_expr.split(",")
1625
- # Filter affected asym IDs
1626
- sub_structure = structure[..., np.isin(structure.label_asym_id, asym_ids)]
1627
- sub_assembly = _apply_transformations(
1628
- sub_structure, transformations, operations
1629
- )
1630
- # Merge the chains with asym IDs for this operation
1631
- # with chains from other operations
1632
- if assembly is None:
1633
- assembly = sub_assembly
1634
- else:
1635
- assembly += sub_assembly
1727
+ for chain_id in asym_id_expr.split(","):
1728
+ chain_ops[chain_id].extend(_parse_operation_expression(op_expr))
1729
+
1730
+ sub_assemblies = []
1731
+ for asym_id, op_list in chain_ops.items():
1732
+ sub_struct = structure[..., structure.label_asym_id == asym_id]
1733
+ sub_assembly = _apply_transformations(sub_struct, transformations, op_list)
1734
+ # Merge the chain's sub_assembly into the rest of the assembly
1735
+ sub_assemblies.append(sub_assembly)
1736
+ assembly = concatenate(sub_assemblies)
1737
+
1738
+ # Sort AtomArray or AtomArrayStack by 'sym_id'
1739
+ max_sym_id = assembly.sym_id.max()
1740
+ assembly = concatenate(
1741
+ [assembly[..., assembly.sym_id == sym_id] for sym_id in range(max_sym_id + 1)]
1742
+ )
1636
1743
 
1637
1744
  # Remove 'label_asym_id', if it was not included in the original
1638
1745
  # user-supplied 'extra_fields'
@@ -1655,11 +1762,7 @@ def _apply_transformations(structure, transformation_dict, operations):
1655
1762
  # Execute for each transformation step
1656
1763
  # in the operation expression
1657
1764
  for op_step in operation:
1658
- rotation_matrix, translation_vector = transformation_dict[op_step]
1659
- # Rotate
1660
- coord = matrix_rotate(coord, rotation_matrix)
1661
- # Translate
1662
- coord += translation_vector
1765
+ coord = transformation_dict[op_step].apply(coord)
1663
1766
  assembly_coord[i] = coord
1664
1767
 
1665
1768
  assembly = repeat(structure, assembly_coord)
@@ -1671,8 +1774,7 @@ def _apply_transformations(structure, transformation_dict, operations):
1671
1774
 
1672
1775
  def _get_transformations(struct_oper):
1673
1776
  """
1674
- Get transformation operation in terms of rotation matrix and
1675
- translation for each operation ID in ``pdbx_struct_oper_list``.
1777
+ Get affine transformation for each operation ID in ``pdbx_struct_oper_list``.
1676
1778
  """
1677
1779
  transformation_dict = {}
1678
1780
  for index, id in enumerate(struct_oper["id"].as_array(str)):
@@ -1688,7 +1790,9 @@ def _get_transformations(struct_oper):
1688
1790
  translation_vector = np.array(
1689
1791
  [struct_oper[f"vector[{i}]"].as_array(float)[index] for i in (1, 2, 3)]
1690
1792
  )
1691
- transformation_dict[id] = (rotation_matrix, translation_vector)
1793
+ transformation_dict[id] = AffineTransformation(
1794
+ np.zeros(3), rotation_matrix, translation_vector
1795
+ )
1692
1796
  return transformation_dict
1693
1797
 
1694
1798
 
@@ -1742,4 +1846,252 @@ def _convert_string_to_sequence(string, stype):
1742
1846
  elif stype in _other_type_list:
1743
1847
  return None
1744
1848
  else:
1745
- raise InvalidFileError("mmCIF _entity_poly.type unsupported" " type: " + stype)
1849
+ raise InvalidFileError("mmCIF _entity_poly.type unsupported type: " + stype)
1850
+
1851
+
1852
+ def get_unit_cell(
1853
+ pdbx_file,
1854
+ center=True,
1855
+ model=None,
1856
+ data_block=None,
1857
+ altloc="first",
1858
+ extra_fields=None,
1859
+ use_author_fields=True,
1860
+ include_bonds=False,
1861
+ ):
1862
+ """
1863
+ Build a structure model containing all symmetric copies of the structure within a
1864
+ single unit cell.
1865
+
1866
+ This function receives the data from the ``symmetry`` and ``atom_site`` categories
1867
+ in the file.
1868
+ Consequently, these categories must be present in the file.
1869
+
1870
+ Parameters
1871
+ ----------
1872
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
1873
+ The file object.
1874
+ center : bool, optional
1875
+ If set to true, each symmetric copy will be moved inside the unit cell
1876
+ dimensions, if its centroid is outside.
1877
+ Otherwise, the copies are created using the raw space group
1878
+ transformations, which may put them one unit cell length further away.
1879
+ model : int, optional
1880
+ If this parameter is given, the function will return an
1881
+ :class:`AtomArray` from the atoms corresponding to the given
1882
+ model number (starting at 1).
1883
+ Negative values are used to index models starting from the last
1884
+ model instead of the first model.
1885
+ If this parameter is omitted, an :class:`AtomArrayStack`
1886
+ containing all models will be returned, even if the structure
1887
+ contains only one model.
1888
+ data_block : str, optional
1889
+ The name of the data block.
1890
+ Default is the first (and most times only) data block of the
1891
+ file.
1892
+ If the data block object is passed directly to `pdbx_file`,
1893
+ this parameter is ignored.
1894
+ altloc : {'first', 'occupancy', 'all'}
1895
+ This parameter defines how *altloc* IDs are handled:
1896
+ - ``'first'`` - Use atoms that have the first *altloc* ID
1897
+ appearing in a residue.
1898
+ - ``'occupancy'`` - Use atoms that have the *altloc* ID
1899
+ with the highest occupancy for a residue.
1900
+ - ``'all'`` - Use all atoms.
1901
+ Note that this leads to duplicate atoms.
1902
+ When this option is chosen, the ``altloc_id`` annotation
1903
+ array is added to the returned structure.
1904
+ extra_fields : list of str, optional
1905
+ The strings in the list are entry names, that are
1906
+ additionally added as annotation arrays.
1907
+ The annotation category name will be the same as the PDBx
1908
+ subcategory name.
1909
+ The array type is always `str`.
1910
+ An exception are the special field identifiers:
1911
+ ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
1912
+ These will convert the fitting subcategory into an
1913
+ annotation array with reasonable type.
1914
+ use_author_fields : bool, optional
1915
+ Some fields can be read from two alternative sources,
1916
+ for example both, ``label_seq_id`` and ``auth_seq_id`` describe
1917
+ the ID of the residue.
1918
+ While the ``label_xxx`` fields can be used as official pointers
1919
+ to other categories in the file, the ``auth_xxx``
1920
+ fields are set by the author(s) of the structure and are
1921
+ consistent with the corresponding values in PDB files.
1922
+ If `use_author_fields` is true, the annotation arrays will be
1923
+ read from the ``auth_xxx`` fields (if applicable),
1924
+ otherwise from the ``label_xxx`` fields.
1925
+ include_bonds : bool, optional
1926
+ If set to true, a :class:`BondList` will be created for the
1927
+ resulting :class:`AtomArray` containing the bond information
1928
+ from the file.
1929
+ Bonds, whose order could not be determined from the
1930
+ *Chemical Component Dictionary*
1931
+ (e.g. especially inter-residue bonds),
1932
+ have :attr:`BondType.ANY`, since the PDB format itself does
1933
+ not support bond orders.
1934
+
1935
+ Returns
1936
+ -------
1937
+ unit_cell : AtomArray or AtomArrayStack
1938
+ The structure representing the unit cell.
1939
+ The return type depends on the `model` parameter.
1940
+ Contains the `sym_id` annotation, which enumerates the copies of the asymmetric
1941
+ unit in the unit cell.
1942
+
1943
+ Examples
1944
+ --------
1945
+
1946
+ >>> import os.path
1947
+ >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
1948
+ >>> unit_cell = get_unit_cell(file, model=1)
1949
+ """
1950
+ block = _get_block(pdbx_file, data_block)
1951
+
1952
+ try:
1953
+ space_group = block["symmetry"]["space_group_name_H-M"].as_item()
1954
+ except KeyError:
1955
+ raise InvalidFileError("File has no 'symmetry.space_group_name_H-M' field")
1956
+ transforms = space_group_transforms(space_group)
1957
+
1958
+ asym = get_structure(
1959
+ pdbx_file,
1960
+ model,
1961
+ data_block,
1962
+ altloc,
1963
+ extra_fields,
1964
+ use_author_fields,
1965
+ include_bonds,
1966
+ )
1967
+
1968
+ fractional_asym_coord = coord_to_fraction(asym.coord, asym.box)
1969
+ unit_cell_copies = []
1970
+ for transform in transforms:
1971
+ fractional_coord = transform.apply(fractional_asym_coord)
1972
+ if center:
1973
+ # If the centroid is outside the box, move the copy inside the box
1974
+ orig_centroid = centroid(fractional_coord)
1975
+ new_centroid = orig_centroid % 1
1976
+ fractional_coord += (new_centroid - orig_centroid)[..., np.newaxis, :]
1977
+ unit_cell_copies.append(fraction_to_coord(fractional_coord, asym.box))
1978
+
1979
+ unit_cell = repeat(asym, np.stack(unit_cell_copies, axis=0))
1980
+ unit_cell.set_annotation(
1981
+ "sym_id", np.repeat(np.arange(len(transforms)), asym.array_length())
1982
+ )
1983
+ return unit_cell
1984
+
1985
+
1986
+ def get_sse(pdbx_file, data_block=None, match_model=None):
1987
+ """
1988
+ Get the secondary structure from a PDBx file.
1989
+
1990
+ Parameters
1991
+ ----------
1992
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
1993
+ The file object.
1994
+ The following categories are required:
1995
+
1996
+ - ``entity_poly``
1997
+ - ``struct_conf`` (if alpha-helices are present)
1998
+ - ``struct_sheet_range`` (if beta-strands are present)
1999
+ - ``atom_site`` (if `match_model` is set)
2000
+
2001
+ data_block : str, optional
2002
+ The name of the data block.
2003
+ Default is the first (and most times only) data block of the
2004
+ file.
2005
+ If the data block object is passed directly to `pdbx_file`,
2006
+ this parameter is ignored.
2007
+ match_model : int, optional
2008
+ If a model number is given, only secondary structure elements for residues are
2009
+ kept, that are resolved in the given model.
2010
+ This means secondary structure elements for residues that would not appear
2011
+ in a corresponding :class:`AtomArray` from :func:`get_structure()` are removed.
2012
+ By default, all residues in the sequence are kept.
2013
+
2014
+ Returns
2015
+ -------
2016
+ sse_dict : dict of str -> ndarray, dtype=str
2017
+ The dictionary maps the chain ID (derived from ``auth_asym_id``) to the
2018
+ secondary structure of the respective chain.
2019
+
2020
+ - ``"a"``: alpha-helix
2021
+ - ``"b"``: beta-strand
2022
+ - ``"c"``: coil or not an amino acid
2023
+
2024
+ Each secondary structure element corresponds to the ``label_seq_id`` of the
2025
+ ``atom_site`` category.
2026
+ This means that the 0-th position of the array corresponds to the residue
2027
+ in ``atom_site`` with ``label_seq_id`` ``1``.
2028
+
2029
+ Examples
2030
+ --------
2031
+
2032
+ >>> import os.path
2033
+ >>> file = CIFFile.read(os.path.join(path_to_structures, "1aki.cif"))
2034
+ >>> sse = get_sse(file, match_model=1)
2035
+ >>> print(sse)
2036
+ {'A': array(['c', 'c', 'c', 'c', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a',
2037
+ 'a', 'c', 'c', 'c', 'c', 'c', 'a', 'a', 'a', 'c', 'c', 'a', 'a',
2038
+ 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c',
2039
+ 'c', 'c', 'c', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'c', 'b', 'b',
2040
+ 'b', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c',
2041
+ 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c',
2042
+ 'c', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'a', 'a', 'a',
2043
+ 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'a',
2044
+ 'a', 'a', 'a', 'c', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c',
2045
+ 'c', 'c', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'c', 'c'],
2046
+ dtype='<U1')}
2047
+
2048
+ If only secondary structure elements for resolved residues are requested, the length
2049
+ of the returned array matches the number of peptide residues in the structure.
2050
+
2051
+ >>> file = CIFFile.read(os.path.join(path_to_structures, "3o5r.cif"))
2052
+ >>> print(len(get_sse(file, match_model=1)["A"]))
2053
+ 128
2054
+ >>> atoms = get_structure(file, model=1)
2055
+ >>> atoms = atoms[filter_amino_acids(atoms) & (atoms.chain_id == "A")]
2056
+ >>> print(get_residue_count(atoms))
2057
+ 128
2058
+ """
2059
+ block = _get_block(pdbx_file, data_block)
2060
+
2061
+ # Init all chains with "c" for coil
2062
+ sse_dict = {
2063
+ chain_id: np.repeat("c", len(sequence))
2064
+ for chain_id, sequence in get_sequence(block).items()
2065
+ }
2066
+
2067
+ # Populate SSE arrays with helices and strands
2068
+ for sse_symbol, category_name in [
2069
+ ("a", "struct_conf"),
2070
+ ("b", "struct_sheet_range"),
2071
+ ]:
2072
+ if category_name in block:
2073
+ category = block[category_name]
2074
+ chains = category["beg_auth_asym_id"].as_array(str)
2075
+ start_positions = category["beg_label_seq_id"].as_array(int)
2076
+ end_positions = category["end_label_seq_id"].as_array(int)
2077
+
2078
+ # Set the positions of the current element type (helix or strand)
2079
+ for chain, start, end in zip(chains, start_positions, end_positions):
2080
+ # Translate the 1-based positions from PDBx into 0-based array indices
2081
+ sse_dict[chain][start - 1 : end] = sse_symbol
2082
+
2083
+ if match_model is not None:
2084
+ model_atom_site = _filter_model(block["atom_site"], match_model)
2085
+ chain_ids = model_atom_site["auth_asym_id"].as_array(str)
2086
+ res_ids = model_atom_site["label_seq_id"].as_array(int, masked_value=-1)
2087
+ # Filter out masked residues, i.e. residues not part of a chain
2088
+ mask = res_ids != -1
2089
+ chain_ids = chain_ids[mask]
2090
+ res_ids = res_ids[mask]
2091
+ for chain_id, sse in sse_dict.items():
2092
+ res_ids_in_chain = res_ids[chain_ids == chain_id]
2093
+ # Transform from 1-based residue ID to 0-based index
2094
+ indices = np.unique(res_ids_in_chain) - 1
2095
+ sse_dict[chain_id] = sse[indices]
2096
+
2097
+ return sse_dict