biotite-1.2.0-cp311-cp311-macosx_11_0_arm64.whl → biotite-1.4.0-cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of biotite might be problematic.

Files changed (62)
  1. biotite/application/viennarna/rnaplot.py +7 -7
  2. biotite/interface/openmm/__init__.py +4 -0
  3. biotite/interface/pymol/__init__.py +3 -0
  4. biotite/interface/pymol/object.py +3 -1
  5. biotite/interface/rdkit/__init__.py +4 -0
  6. biotite/interface/rdkit/mol.py +5 -5
  7. biotite/interface/version.py +23 -0
  8. biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
  9. biotite/sequence/align/banded.pyx +1 -1
  10. biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
  11. biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
  12. biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
  13. biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
  14. biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
  15. biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
  16. biotite/sequence/align/multiple.pyx +1 -2
  17. biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
  18. biotite/sequence/align/pairwise.pyx +2 -4
  19. biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
  20. biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
  21. biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
  22. biotite/sequence/codec.cpython-311-darwin.so +0 -0
  23. biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
  24. biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
  25. biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
  26. biotite/structure/basepairs.py +13 -14
  27. biotite/structure/bonds.cpython-311-darwin.so +0 -0
  28. biotite/structure/bonds.pyx +67 -6
  29. biotite/structure/box.py +141 -3
  30. biotite/structure/celllist.cpython-311-darwin.so +0 -0
  31. biotite/structure/celllist.pyx +0 -1
  32. biotite/structure/chains.py +15 -21
  33. biotite/structure/charges.cpython-311-darwin.so +0 -0
  34. biotite/structure/compare.py +2 -0
  35. biotite/structure/dotbracket.py +4 -4
  36. biotite/structure/graphics/rna.py +19 -16
  37. biotite/structure/hbond.py +1 -2
  38. biotite/structure/info/components.bcif +0 -0
  39. biotite/structure/io/pdb/convert.py +84 -2
  40. biotite/structure/io/pdb/file.py +94 -7
  41. biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
  42. biotite/structure/io/pdbx/bcif.py +6 -3
  43. biotite/structure/io/pdbx/cif.py +5 -2
  44. biotite/structure/io/pdbx/compress.py +71 -34
  45. biotite/structure/io/pdbx/convert.py +226 -58
  46. biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
  47. biotite/structure/io/pdbx/encoding.pyx +39 -23
  48. biotite/structure/pseudoknots.py +6 -6
  49. biotite/structure/residues.py +10 -27
  50. biotite/structure/rings.py +118 -2
  51. biotite/structure/sasa.cpython-311-darwin.so +0 -0
  52. biotite/structure/sasa.pyx +28 -29
  53. biotite/structure/segments.py +55 -0
  54. biotite/structure/spacegroups.json +1567 -0
  55. biotite/structure/spacegroups.license +26 -0
  56. biotite/structure/superimpose.py +1 -191
  57. biotite/structure/transform.py +220 -1
  58. biotite/version.py +2 -2
  59. {biotite-1.2.0.dist-info → biotite-1.4.0.dist-info}/METADATA +4 -34
  60. {biotite-1.2.0.dist-info → biotite-1.4.0.dist-info}/RECORD +62 -60
  61. {biotite-1.2.0.dist-info → biotite-1.4.0.dist-info}/WHEEL +3 -1
  62. {biotite-1.2.0.dist-info → biotite-1.4.0.dist-info}/licenses/LICENSE.rst +0 -0
biotite/structure/io/pdbx/compress.py

@@ -3,6 +3,7 @@ __name__ = "biotite.structure.io.pdbx"
 __author__ = "Patrick Kunzmann"
 
 import itertools
+import warnings
 import msgpack
 import numpy as np
 import biotite.structure.io.pdbx.bcif as bcif
@@ -17,7 +18,7 @@ from biotite.structure.io.pdbx.encoding import (
 )
 
 
-def compress(data, float_tolerance=1e-6):
+def compress(data, float_tolerance=None, rtol=1e-6, atol=1e-4):
     """
     Try to reduce the size of a *BinaryCIF* file (or block, category, etc.) by testing
     different data encodings for each data array and selecting the one, which results in
@@ -29,6 +30,12 @@ def compress(data, float_tolerance=1e-6):
         The data to compress.
     float_tolerance : float, optional
         The relative error that is accepted when compressing floating point numbers.
+        DEPRECATED: Use `rtol` instead.
+    rtol, atol : float, optional
+        The compression factor of floating point numbers is chosen such that
+        either the relative (`rtol`) or absolute (`atol`) tolerance is fulfilled
+        for each value, i.e. the difference between the compressed and uncompressed
+        value is smaller than the tolerance.
 
     Returns
     -------
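For orientation, a brief usage sketch of the new tolerance parameters (the file paths are placeholders; `compress` and `BinaryCIFFile` are public `biotite.structure.io.pdbx` API):

```python
import biotite.structure.io.pdbx as pdbx

bcif_file = pdbx.BinaryCIFFile.read("structure.bcif")  # placeholder path
# Old style, now emits a DeprecationWarning:
#   pdbx.compress(bcif_file, float_tolerance=1e-6)
# New style: a float survives compression if it deviates by less than
# 0.1 % relatively OR by less than 1e-3 absolutely
compressed = pdbx.compress(bcif_file, rtol=1e-3, atol=1e-3)
compressed.write("structure_compressed.bcif")
```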
@@ -49,64 +56,79 @@ def compress(data, float_tolerance=1e-6):
     >>> pdbx_file.write(uncompressed_file)
     >>> _ = uncompressed_file.seek(0)
     >>> print(f"{len(uncompressed_file.read()) // 1000} KB")
-    927 KB
+    937 KB
     >>> # Write compressed file
     >>> pdbx_file = compress(pdbx_file)
     >>> compressed_file = BytesIO()
     >>> pdbx_file.write(compressed_file)
     >>> _ = compressed_file.seek(0)
     >>> print(f"{len(compressed_file.read()) // 1000} KB")
-    111 KB
+    114 KB
     """
+    if float_tolerance is not None:
+        warnings.warn(
+            "The 'float_tolerance' parameter is deprecated, use 'rtol' instead",
+            DeprecationWarning,
+        )
+
     match type(data):
         case bcif.BinaryCIFFile:
-            return _compress_file(data, float_tolerance)
+            return _compress_file(data, rtol, atol)
         case bcif.BinaryCIFBlock:
-            return _compress_block(data, float_tolerance)
+            return _compress_block(data, rtol, atol)
         case bcif.BinaryCIFCategory:
-            return _compress_category(data, float_tolerance)
+            return _compress_category(data, rtol, atol)
         case bcif.BinaryCIFColumn:
-            return _compress_column(data, float_tolerance)
+            return _compress_column(data, rtol, atol)
         case bcif.BinaryCIFData:
-            return _compress_data(data, float_tolerance)
+            return _compress_data(data, rtol, atol)
         case _:
             raise TypeError(f"Unsupported type {type(data).__name__}")
 
 
-def _compress_file(bcif_file, float_tolerance):
+def _compress_file(bcif_file, rtol, atol):
     compressed_file = bcif.BinaryCIFFile()
     for block_name, bcif_block in bcif_file.items():
-        compressed_block = _compress_block(bcif_block, float_tolerance)
+        try:
+            compressed_block = _compress_block(bcif_block, rtol, atol)
+        except Exception:
+            raise ValueError(f"Failed to compress block '{block_name}'")
         compressed_file[block_name] = compressed_block
     return compressed_file
 
 
-def _compress_block(bcif_block, float_tolerance):
+def _compress_block(bcif_block, rtol, atol):
     compressed_block = bcif.BinaryCIFBlock()
     for category_name, bcif_category in bcif_block.items():
-        compressed_category = _compress_category(bcif_category, float_tolerance)
+        try:
+            compressed_category = _compress_category(bcif_category, rtol, atol)
+        except Exception:
+            raise ValueError(f"Failed to compress category '{category_name}'")
        compressed_block[category_name] = compressed_category
     return compressed_block
 
 
-def _compress_category(bcif_category, float_tolerance):
+def _compress_category(bcif_category, rtol, atol):
     compressed_category = bcif.BinaryCIFCategory()
     for column_name, bcif_column in bcif_category.items():
-        compressed_column = _compress_column(bcif_column, float_tolerance)
+        try:
+            compressed_column = _compress_column(bcif_column, rtol, atol)
+        except Exception:
+            raise ValueError(f"Failed to compress column '{column_name}'")
         compressed_category[column_name] = compressed_column
     return compressed_category
 
 
-def _compress_column(bcif_column, float_tolerance):
-    data = _compress_data(bcif_column.data, float_tolerance)
+def _compress_column(bcif_column, rtol, atol):
+    data = _compress_data(bcif_column.data, rtol, atol)
     if bcif_column.mask is not None:
-        mask = _compress_data(bcif_column.mask, float_tolerance)
+        mask = _compress_data(bcif_column.mask, rtol, atol)
     else:
         mask = None
     return bcif.BinaryCIFColumn(data, mask)
 
 
-def _compress_data(bcif_data, float_tolerance):
+def _compress_data(bcif_data, rtol, atol):
     array = bcif_data.array
     if len(array) == 1:
         # No need to compress a single value -> Use default uncompressed encoding
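The new `try`/`except` wrappers in the helpers above mean that a compression failure now names the offending part of the file; a sketch of what calling code can expect:

```python
try:
    compressed = pdbx.compress(bcif_file)
except ValueError as e:
    # e.g. "Failed to compress category 'atom_site'"
    print(e)
```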
@@ -123,16 +145,28 @@ def _compress_data(bcif_data, float_tolerance):
         return bcif.BinaryCIFData(array, [encoding])
 
     elif np.issubdtype(array.dtype, np.floating):
+        if not np.isfinite(array).all():
+            # NaN/inf values cannot be represented by integers
+            # -> do not use integer encoding
+            return bcif.BinaryCIFData(array, [ByteArrayEncoding()])
         to_integer_encoding = FixedPointEncoding(
-            10 ** _get_decimal_places(array, float_tolerance)
+            10 ** _get_decimal_places(array, rtol, atol)
         )
-        integer_array = to_integer_encoding.encode(array)
-        best_encoding, size_compressed = _find_best_integer_compression(integer_array)
-        if size_compressed < _data_size_in_file(bcif.BinaryCIFData(array)):
-            return bcif.BinaryCIFData(array, [to_integer_encoding] + best_encoding)
-        else:
-            # The float array is smaller -> encode it directly as bytes
+        try:
+            integer_array = to_integer_encoding.encode(array)
+        except ValueError:
+            # With the given tolerances integer underflow/overflow would occur
+            # -> do not use integer encoding
             return bcif.BinaryCIFData(array, [ByteArrayEncoding()])
+        else:
+            best_encoding, size_compressed = _find_best_integer_compression(
+                integer_array
+            )
+            if size_compressed < _data_size_in_file(bcif.BinaryCIFData(array)):
+                return bcif.BinaryCIFData(array, [to_integer_encoding] + best_encoding)
+            else:
+                # The float array is smaller -> encode it directly as bytes
+                return bcif.BinaryCIFData(array, [ByteArrayEncoding()])
 
     elif np.issubdtype(array.dtype, np.integer):
         array = _to_smallest_integer_type(array)
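A sketch of the new fallback path, assuming the public `BinaryCIFData` class and the `encoding` attribute shown above: arrays containing NaN or inf (and arrays whose fixed-point conversion would overflow) are now kept as plain bytes instead of failing:

```python
import numpy as np
from biotite.structure.io.pdbx import BinaryCIFData, compress

# NaN values (e.g. from masked columns) previously broke fixed-point encoding
data = BinaryCIFData(np.array([1.0, np.nan, 2.5]))
compressed = compress(data)
print(compressed.encoding)  # expected to stay a plain ByteArrayEncoding
```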
@@ -273,7 +307,7 @@ def _data_size_in_file(data):
     return len(bytes_in_file)
 
 
-def _get_decimal_places(array, tol):
+def _get_decimal_places(array, rtol, atol):
     """
     Get the number of decimal places in a floating point array.
 
@@ -281,21 +315,24 @@ def _get_decimal_places(array, tol):
     ----------
     array : numpy.ndarray
         The array to analyze.
-    tol : float, optional
-        The relative tolerance allowed when the values are cut off after the returned
-        number of decimal places.
+    rtol, atol : float, optional
+        The relative and absolute tolerance allowed when the values are cut off after
+        the returned number of decimal places.
 
     Returns
     -------
     decimals : int
         The number of decimal places.
     """
-    # Decimals of NaN or infinite values do not make sense
-    # and 0 would give NaN when rounding on decimals
-    array = array[np.isfinite(array) & (array != 0)]
-    for decimals in itertools.count(start=-_order_magnitude(array)):
+    if rtol <= 0 and atol <= 0:
+        raise ValueError("At least one of 'rtol' and 'atol' must be greater than 0")
+    # 0 would give NaN when rounding on decimals
+    array = array[array != 0]
+    for decimals in itertools.count(start=min(0, -_order_magnitude(array))):
         error = np.abs(np.round(array, decimals) - array)
-        if np.all(error < tol * np.abs(array)):
+        if decimals == 100:
+            raise
+        if np.all((error < rtol * np.abs(array)) | (error < atol)):
             return decimals
 
 
biotite/structure/io/pdbx/convert.py

@@ -13,17 +13,30 @@ __all__ = [
     "set_component",
     "list_assemblies",
     "get_assembly",
+    "get_unit_cell",
     "get_sse",
 ]
 
 import itertools
 import warnings
+from collections import defaultdict
 import numpy as np
 from biotite.file import InvalidFileError
 from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
-from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat
+from biotite.structure.atoms import (
+    AtomArray,
+    AtomArrayStack,
+    concatenate,
+    repeat,
+)
 from biotite.structure.bonds import BondList, BondType, connect_via_residue_names
-from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
+from biotite.structure.box import (
+    coord_to_fraction,
+    fraction_to_coord,
+    space_group_transforms,
+    unitcell_from_vectors,
+    vectors_from_unitcell,
+)
 from biotite.structure.error import BadStructureError
 from biotite.structure.filter import _canonical_aa_list as canonical_aa_list
 from biotite.structure.filter import (
@@ -33,6 +46,7 @@ from biotite.structure.filter import (
     filter_first_altloc,
     filter_highest_occupancy_altloc,
 )
+from biotite.structure.geometry import centroid
 from biotite.structure.io.pdbx.bcif import (
     BinaryCIFBlock,
     BinaryCIFColumn,
@@ -46,7 +60,7 @@ from biotite.structure.residues import (
     get_residue_positions,
     get_residue_starts_for,
 )
-from biotite.structure.util import matrix_rotate
+from biotite.structure.transform import AffineTransformation
 
 # Bond types in `struct_conn` category that refer to covalent bonds
 PDBX_BOND_TYPE_ID_TO_TYPE = {
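`space_group_transforms()` from `biotite.structure.box` is the new helper backing `get_unit_cell()`; a sketch, assuming it yields one transformation per symmetry operation (the count of 4 follows from the general-position multiplicity of this space group):

```python
from biotite.structure.box import space_group_transforms

# One fractional-coordinate AffineTransformation per symmetry operation
transforms = space_group_transforms("P 21 21 21")
print(len(transforms))  # 4 for this space group
```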
@@ -125,8 +139,7 @@ _other_type_list = [
 
 def _filter(category, index):
     """
-    Reduce the ``atom_site`` category to the values for the given
-    model.
+    Reduce the given category to the values selected by the given index.
     """
     Category = type(category)
     Column = Category.subcomponent_class()
@@ -391,7 +404,16 @@ def get_structure(
 
     # The below part is the same for both, AtomArray and AtomArrayStack
     _fill_annotations(atoms, model_atom_site, extra_fields, use_author_fields)
+
+    atoms, altloc_filtered_atom_site = _filter_altloc(atoms, model_atom_site, altloc)
+
     if include_bonds:
+        if altloc == "all":
+            raise ValueError(
+                "Bond computation is not supported with `altloc='all'`, consider "
+                "using 'connect_via_residue_names()' afterwards"
+            )
+
         if "chem_comp_bond" in block:
             try:
                 custom_bond_dict = _parse_intra_residue_bonds(block["chem_comp_bond"])
@@ -407,10 +429,13 @@ def get_structure(
             bonds = connect_via_residue_names(atoms)
         if "struct_conn" in block:
             bonds = bonds.merge(
-                _parse_inter_residue_bonds(model_atom_site, block["struct_conn"])
+                _parse_inter_residue_bonds(
+                    altloc_filtered_atom_site,
+                    block["struct_conn"],
+                    atom_count=atoms.array_length(),
+                )
             )
         atoms.bonds = bonds
-    atoms = _filter_altloc(atoms, model_atom_site, altloc)
 
     return atoms
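The error message above points to the documented workaround; as a sketch (assuming a parsed `pdbx_file` and that bonds are still wanted):

```python
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx

atoms = pdbx.get_structure(pdbx_file, model=1, altloc="all")
# Bonds cannot be read from the file in "all" mode, but they can be
# derived from residue names afterwards
atoms.bonds = struc.connect_via_residue_names(atoms)
```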
@@ -570,11 +595,12 @@ def _parse_intra_residue_bonds(chem_comp_bond):
     return custom_bond_dict
 
 
-def _parse_inter_residue_bonds(atom_site, struct_conn):
+def _parse_inter_residue_bonds(atom_site, struct_conn, atom_count=None):
     """
     Create inter-residue bonds by parsing the ``struct_conn`` category.
     The atom indices of each bond are found by matching the bond labels
     to the ``atom_site`` category.
+    If `atom_count` is None, it will be inferred from the ``atom_site`` category.
     """
     # Identity symmetry operation
     IDENTITY = "1_555"
@@ -643,7 +669,7 @@ def _parse_inter_residue_bonds(atom_site, struct_conn, atom_count=None):
     bond_types = [PDBX_BOND_TYPE_ID_TO_TYPE[type_id] for type_id in bond_type_id]
 
     return BondList(
-        atom_site.row_count,
+        atom_count if atom_count is not None else atom_site.row_count,
         np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1),
     )
 
@@ -739,25 +765,31 @@ def _get_struct_conn_col_name(col_name, partner):
 
 
 def _filter_altloc(array, atom_site, altloc):
+    """
+    Filter the given :class:`AtomArray` and ``atom_site`` category to the rows
+    specified by the given *altloc* identifier.
+    """
     altloc_ids = atom_site.get("label_alt_id")
     occupancy = atom_site.get("occupancy")
 
-    # Filter altloc IDs and return
-    if altloc_ids is None:
-        return array
+    if altloc == "all":
+        array.set_annotation("altloc_id", altloc_ids.as_array(str))
+        return array, atom_site
+    elif altloc_ids is None or (
+        altloc_ids.mask is not None
+        and (altloc_ids.mask.array != MaskValue.PRESENT).all()
+    ):
+        # No altlocs in atom_site category
+        return array, atom_site
     elif altloc == "occupancy" and occupancy is not None:
-        return array[
-            ...,
-            filter_highest_occupancy_altloc(
-                array, altloc_ids.as_array(str), occupancy.as_array(float)
-            ),
-        ]
+        mask = filter_highest_occupancy_altloc(
+            array, altloc_ids.as_array(str), occupancy.as_array(float)
+        )
+        return array[..., mask], _filter(atom_site, mask)
     # 'first' is also fallback if file has no occupancy information
     elif altloc == "first":
-        return array[..., filter_first_altloc(array, altloc_ids.as_array(str))]
-    elif altloc == "all":
-        array.set_annotation("altloc_id", altloc_ids.as_array(str))
-        return array
+        mask = filter_first_altloc(array, altloc_ids.as_array(str))
+        return array[..., mask], _filter(atom_site, mask)
     else:
         raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
 
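Since `_filter_altloc()` now returns the filtered `atom_site` category alongside the atoms, the three caller-visible modes behave as follows (sketch, assuming a parsed `pdbx_file`):

```python
atoms_first = pdbx.get_structure(pdbx_file, model=1, altloc="first")
atoms_occupancy = pdbx.get_structure(pdbx_file, model=1, altloc="occupancy")
atoms_all = pdbx.get_structure(pdbx_file, model=1, altloc="all")
# Only the "all" mode carries the altloc letters as an annotation
print(atoms_all.altloc_id[:5])
```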
@@ -844,11 +876,7 @@ def set_structure(
         this parameter is ignored.
         If the file is empty, a new data block will be created.
     include_bonds : bool, optional
-        If set to true and `array` has associated ``bonds``, the
-        intra-residue bonds will be written into the ``chem_comp_bond``
-        category.
-        Inter-residue bonds will be written into the ``struct_conn``
-        independent of this parameter.
+        DEPRECATED: Has no effect anymore.
     extra_fields : list of str, optional
         List of additional fields from the ``atom_site`` category
         that should be written into the file.
@@ -869,6 +897,13 @@ def set_structure(
     >>> set_structure(file, atom_array)
     >>> file.write(os.path.join(path_to_directory, "structure.cif"))
     """
+    if include_bonds:
+        warnings.warn(
+            "`include_bonds` parameter is deprecated, "
+            "intra-residue bonds are always written, if available",
+            DeprecationWarning,
+        )
+
     _check_non_empty(array)
 
     block = _get_or_create_block(pdbx_file, data_block)
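On the writing side the new contract is unconditional; a sketch, assuming `atom_array` is an `AtomArray` with an associated `BondList`:

```python
import biotite.structure.io.pdbx as pdbx

cif_file = pdbx.CIFFile()
# chem_comp_bond and struct_conn are filled automatically from atom_array.bonds
pdbx.set_structure(cif_file, atom_array)
# The old flag still parses, but only triggers the DeprecationWarning above
pdbx.set_structure(cif_file, atom_array, include_bonds=True)
```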
@@ -946,10 +981,9 @@ def set_structure(
     struct_conn = _set_inter_residue_bonds(array, atom_site)
     if struct_conn is not None:
         block["struct_conn"] = struct_conn
-    if include_bonds:
-        chem_comp_bond = _set_intra_residue_bonds(array, atom_site)
-        if chem_comp_bond is not None:
-            block["chem_comp_bond"] = chem_comp_bond
+    chem_comp_bond = _set_intra_residue_bonds(array, atom_site)
+    if chem_comp_bond is not None:
+        block["chem_comp_bond"] = chem_comp_bond
 
     # In case of a single model handle each coordinate
     # simply like a flattened array
@@ -1623,11 +1657,11 @@ def get_assembly(
         If set to true, a :class:`BondList` will be created for the
         resulting :class:`AtomArray` containing the bond information
         from the file.
-        Bonds, whose order could not be determined from the
-        *Chemical Component Dictionary*
-        (e.g. especially inter-residue bonds),
-        have :attr:`BondType.ANY`, since the PDB format itself does
-        not support bond orders.
+        Inter-residue bonds will be read from the ``struct_conn``
+        category.
+        Intra-residue bonds will be read from the ``chem_comp_bond``
+        category, if available, otherwise they will be derived from the
+        Chemical Component Dictionary.
 
     Returns
     -------
1686
1720
  )
1687
1721
 
1688
1722
  ### Get transformations and apply them to the affected asym IDs
1689
- assembly = None
1723
+ chain_ops = defaultdict(list)
1690
1724
  for id, op_expr, asym_id_expr in zip(
1691
1725
  assembly_gen_category["assembly_id"].as_array(str),
1692
1726
  assembly_gen_category["oper_expression"].as_array(str),
@@ -1695,19 +1729,22 @@ def get_assembly(
1695
1729
  # Find the operation expressions for given assembly ID
1696
1730
  # We already asserted that the ID is actually present
1697
1731
  if id == assembly_id:
1698
- operations = _parse_operation_expression(op_expr)
1699
- asym_ids = asym_id_expr.split(",")
1700
- # Filter affected asym IDs
1701
- sub_structure = structure[..., np.isin(structure.label_asym_id, asym_ids)]
1702
- sub_assembly = _apply_transformations(
1703
- sub_structure, transformations, operations
1704
- )
1705
- # Merge the chains with asym IDs for this operation
1706
- # with chains from other operations
1707
- if assembly is None:
1708
- assembly = sub_assembly
1709
- else:
1710
- assembly += sub_assembly
1732
+ for chain_id in asym_id_expr.split(","):
1733
+ chain_ops[chain_id].extend(_parse_operation_expression(op_expr))
1734
+
1735
+ sub_assemblies = []
1736
+ for asym_id, op_list in chain_ops.items():
1737
+ sub_struct = structure[..., structure.label_asym_id == asym_id]
1738
+ sub_assembly = _apply_transformations(sub_struct, transformations, op_list)
1739
+ # Merge the chain's sub_assembly into the rest of the assembly
1740
+ sub_assemblies.append(sub_assembly)
1741
+ assembly = concatenate(sub_assemblies)
1742
+
1743
+ # Sort AtomArray or AtomArrayStack by 'sym_id'
1744
+ max_sym_id = assembly.sym_id.max()
1745
+ assembly = concatenate(
1746
+ [assembly[..., assembly.sym_id == sym_id] for sym_id in range(max_sym_id + 1)]
1747
+ )
1711
1748
 
1712
1749
  # Remove 'label_asym_id', if it was not included in the original
1713
1750
  # user-supplied 'extra_fields'
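One observable consequence of this rework: assembly atoms now arrive grouped by symmetry copy via the `sym_id` annotation (sketch, assuming a file that defines the requested assembly):

```python
import numpy as np
import biotite.structure.io.pdbx as pdbx

assembly = pdbx.get_assembly(pdbx_file, model=1)
# All atoms of sym_id 0 come first, then sym_id 1, and so on
print(np.unique(assembly.sym_id))
```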
@@ -1730,11 +1767,7 @@ def _apply_transformations(structure, transformation_dict, operations):
         # Execute for each transformation step
         # in the operation expression
         for op_step in operation:
-            rotation_matrix, translation_vector = transformation_dict[op_step]
-            # Rotate
-            coord = matrix_rotate(coord, rotation_matrix)
-            # Translate
-            coord += translation_vector
+            coord = transformation_dict[op_step].apply(coord)
         assembly_coord[i] = coord
 
     assembly = repeat(structure, assembly_coord)
@@ -1746,8 +1779,7 @@ def _apply_transformations(structure, transformation_dict, operations):
 
 def _get_transformations(struct_oper):
     """
-    Get transformation operation in terms of rotation matrix and
-    translation for each operation ID in ``pdbx_struct_oper_list``.
+    Get affine transformation for each operation ID in ``pdbx_struct_oper_list``.
     """
     transformation_dict = {}
     for index, id in enumerate(struct_oper["id"].as_array(str)):
@@ -1763,7 +1795,9 @@ def _get_transformations(struct_oper):
         translation_vector = np.array(
             [struct_oper[f"vector[{i}]"].as_array(float)[index] for i in (1, 2, 3)]
         )
-        transformation_dict[id] = (rotation_matrix, translation_vector)
+        transformation_dict[id] = AffineTransformation(
+            np.zeros(3), rotation_matrix, translation_vector
+        )
     return transformation_dict
 
 
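`AffineTransformation` from `biotite.structure` bundles the former rotate-then-translate pair into a single object; a minimal equivalence sketch of the replaced code path:

```python
import numpy as np
from biotite.structure import AffineTransformation

rotation_matrix = np.eye(3)
translation_vector = np.array([1.0, 2.0, 3.0])
transform = AffineTransformation(np.zeros(3), rotation_matrix, translation_vector)

coord = np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])
# Same result as the removed matrix_rotate(coord, ...) followed by "+="
print(transform.apply(coord))
```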
@@ -1820,6 +1854,140 @@ def _convert_string_to_sequence(string, stype):
         raise InvalidFileError("mmCIF _entity_poly.type unsupported type: " + stype)
 
 
+def get_unit_cell(
+    pdbx_file,
+    center=True,
+    model=None,
+    data_block=None,
+    altloc="first",
+    extra_fields=None,
+    use_author_fields=True,
+    include_bonds=False,
+):
+    """
+    Build a structure model containing all symmetric copies of the structure within a
+    single unit cell.
+
+    This function receives the data from the ``symmetry`` and ``atom_site`` categories
+    in the file.
+    Consequently, these categories must be present in the file.
+
+    Parameters
+    ----------
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
+        The file object.
+    center : bool, optional
+        If set to true, each symmetric copy will be moved inside the unit cell
+        dimensions, if its centroid is outside.
+        By default, the copies are created using the raw space group
+        transformations, which may put them one unit cell length further away.
+    model : int, optional
+        If this parameter is given, the function will return an
+        :class:`AtomArray` from the atoms corresponding to the given
+        model number (starting at 1).
+        Negative values are used to index models starting from the last
+        model instead of the first model.
+        If this parameter is omitted, an :class:`AtomArrayStack`
+        containing all models will be returned, even if the structure
+        contains only one model.
+    data_block : str, optional
+        The name of the data block.
+        Default is the first (and most times only) data block of the
+        file.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.
+    altloc : {'first', 'occupancy', 'all'}
+        This parameter defines how *altloc* IDs are handled:
+        - ``'first'`` - Use atoms that have the first *altloc* ID
+          appearing in a residue.
+        - ``'occupancy'`` - Use atoms that have the *altloc* ID
+          with the highest occupancy for a residue.
+        - ``'all'`` - Use all atoms.
+          Note that this leads to duplicate atoms.
+          When this option is chosen, the ``altloc_id`` annotation
+          array is added to the returned structure.
+    extra_fields : list of str, optional
+        The strings in the list are entry names that are
+        additionally added as annotation arrays.
+        The annotation category name will be the same as the PDBx
+        subcategory name.
+        The array type is always `str`.
+        Exceptions are the special field identifiers:
+        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
+        These will convert the fitting subcategory into an
+        annotation array with reasonable type.
+    use_author_fields : bool, optional
+        Some fields can be read from two alternative sources,
+        for example both ``label_seq_id`` and ``auth_seq_id`` describe
+        the ID of the residue.
+        While the ``label_xxx`` fields can be used as official pointers
+        to other categories in the file, the ``auth_xxx``
+        fields are set by the author(s) of the structure and are
+        consistent with the corresponding values in PDB files.
+        If `use_author_fields` is true, the annotation arrays will be
+        read from the ``auth_xxx`` fields (if applicable),
+        otherwise from the ``label_xxx`` fields.
+    include_bonds : bool, optional
+        If set to true, a :class:`BondList` will be created for the
+        resulting :class:`AtomArray` containing the bond information
+        from the file.
+        Inter-residue bonds will be read from the ``struct_conn``
+        category.
+        Intra-residue bonds will be read from the ``chem_comp_bond``
+        category, if available, otherwise they will be derived from the
+        Chemical Component Dictionary.
+
+    Returns
+    -------
+    unit_cell : AtomArray or AtomArrayStack
+        The structure representing the unit cell.
+        The return type depends on the `model` parameter.
+        Contains the `sym_id` annotation, which enumerates the copies of the asymmetric
+        unit in the unit cell.
+
+    Examples
+    --------
+
+    >>> import os.path
+    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
+    >>> unit_cell = get_unit_cell(file, model=1)
+    """
+    block = _get_block(pdbx_file, data_block)
+
+    try:
+        space_group = block["symmetry"]["space_group_name_H-M"].as_item()
+    except KeyError:
+        raise InvalidFileError("File has no 'symmetry.space_group_name_H-M' field")
+    transforms = space_group_transforms(space_group)
+
+    asym = get_structure(
+        pdbx_file,
+        model,
+        data_block,
+        altloc,
+        extra_fields,
+        use_author_fields,
+        include_bonds,
+    )
+
+    fractional_asym_coord = coord_to_fraction(asym.coord, asym.box)
+    unit_cell_copies = []
+    for transform in transforms:
+        fractional_coord = transform.apply(fractional_asym_coord)
+        if center:
+            # If the centroid is outside the box, move the copy inside the box
+            orig_centroid = centroid(fractional_coord)
+            new_centroid = orig_centroid % 1
+            fractional_coord += (new_centroid - orig_centroid)[..., np.newaxis, :]
+        unit_cell_copies.append(fraction_to_coord(fractional_coord, asym.box))
+
+    unit_cell = repeat(asym, np.stack(unit_cell_copies, axis=0))
+    unit_cell.set_annotation(
+        "sym_id", np.repeat(np.arange(len(transforms)), asym.array_length())
+    )
+    return unit_cell
+
+
 def get_sse(pdbx_file, data_block=None, match_model=None):
     """
     Get the secondary structure from a PDBx file.
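Finally, a usage sketch for the new `get_unit_cell()` introduced above (PDB ID and path are placeholders):

```python
import biotite.structure.io.pdbx as pdbx

cif_file = pdbx.CIFFile.read("1f2n.cif")
unit_cell = pdbx.get_unit_cell(cif_file, model=1)
# One copy of the asymmetric unit per space group operation
print(unit_cell.sym_id.max() + 1)
```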