biotite 1.3.0__cp313-cp313-macosx_11_0_arm64.whl → 1.5.0__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (52)
  1. biotite/application/dssp/app.py +63 -6
  2. biotite/database/afdb/download.py +12 -6
  3. biotite/database/rcsb/download.py +1 -0
  4. biotite/database/rcsb/query.py +2 -2
  5. biotite/interface/pymol/object.py +3 -1
  6. biotite/interface/rdkit/mol.py +5 -5
  7. biotite/sequence/align/banded.cpython-313-darwin.so +0 -0
  8. biotite/sequence/align/kmeralphabet.cpython-313-darwin.so +0 -0
  9. biotite/sequence/align/kmersimilarity.cpython-313-darwin.so +0 -0
  10. biotite/sequence/align/kmertable.cpython-313-darwin.so +0 -0
  11. biotite/sequence/align/localgapped.cpython-313-darwin.so +0 -0
  12. biotite/sequence/align/localungapped.cpython-313-darwin.so +0 -0
  13. biotite/sequence/align/multiple.cpython-313-darwin.so +0 -0
  14. biotite/sequence/align/pairwise.cpython-313-darwin.so +0 -0
  15. biotite/sequence/align/permutation.cpython-313-darwin.so +0 -0
  16. biotite/sequence/align/selector.cpython-313-darwin.so +0 -0
  17. biotite/sequence/align/tracetable.cpython-313-darwin.so +0 -0
  18. biotite/sequence/codec.cpython-313-darwin.so +0 -0
  19. biotite/sequence/phylo/nj.cpython-313-darwin.so +0 -0
  20. biotite/sequence/phylo/tree.cpython-313-darwin.so +0 -0
  21. biotite/sequence/phylo/upgma.cpython-313-darwin.so +0 -0
  22. biotite/structure/atoms.py +1 -1
  23. biotite/structure/bonds.cpython-313-darwin.so +0 -0
  24. biotite/structure/bonds.pyx +67 -6
  25. biotite/structure/box.py +1 -1
  26. biotite/structure/celllist.cpython-313-darwin.so +0 -0
  27. biotite/structure/chains.py +34 -0
  28. biotite/structure/charges.cpython-313-darwin.so +0 -0
  29. biotite/structure/compare.py +2 -0
  30. biotite/structure/filter.py +2 -1
  31. biotite/structure/geometry.py +164 -2
  32. biotite/structure/info/atoms.py +8 -0
  33. biotite/structure/info/components.bcif +0 -0
  34. biotite/structure/io/pdb/convert.py +1 -0
  35. biotite/structure/io/pdb/file.py +31 -7
  36. biotite/structure/io/pdb/hybrid36.cpython-313-darwin.so +0 -0
  37. biotite/structure/io/pdbx/bcif.py +7 -4
  38. biotite/structure/io/pdbx/cif.py +6 -3
  39. biotite/structure/io/pdbx/compress.py +15 -11
  40. biotite/structure/io/pdbx/convert.py +42 -26
  41. biotite/structure/io/pdbx/encoding.cpython-313-darwin.so +0 -0
  42. biotite/structure/io/pdbx/encoding.pyx +39 -8
  43. biotite/structure/residues.py +173 -1
  44. biotite/structure/rings.py +117 -1
  45. biotite/structure/sasa.cpython-313-darwin.so +0 -0
  46. biotite/structure/segments.py +39 -3
  47. biotite/structure/util.py +14 -22
  48. biotite/version.py +16 -3
  49. {biotite-1.3.0.dist-info → biotite-1.5.0.dist-info}/METADATA +1 -1
  50. {biotite-1.3.0.dist-info → biotite-1.5.0.dist-info}/RECORD +52 -52
  51. {biotite-1.3.0.dist-info → biotite-1.5.0.dist-info}/WHEEL +0 -0
  52. {biotite-1.3.0.dist-info → biotite-1.5.0.dist-info}/licenses/LICENSE.rst +0 -0
@@ -19,19 +19,79 @@ __all__ = [
19
19
  "dihedral",
20
20
  "index_dihedral",
21
21
  "dihedral_backbone",
22
+ "dihedral_side_chain",
22
23
  "centroid",
23
24
  ]
24
25
 
26
+ import functools
25
27
  import numpy as np
26
28
  from biotite.structure.atoms import AtomArray, AtomArrayStack, coord
27
29
  from biotite.structure.box import coord_to_fraction, fraction_to_coord, is_orthogonal
28
- from biotite.structure.filter import filter_amino_acids
30
+ from biotite.structure.filter import filter_amino_acids, filter_canonical_amino_acids
31
+ from biotite.structure.residues import get_residue_starts
29
32
  from biotite.structure.util import (
30
33
  coord_for_atom_name_per_residue,
31
34
  norm_vector,
32
35
  vector_dot,
33
36
  )
34
37
 
38
+ # The names of the four atoms defining each chi angle, keyed by residue name
39
+ _CHI_ATOMS = {
40
+ "ARG": [
41
+ ("N", "CA", "CB", "CG"),
42
+ ("CA", "CB", "CG", "CD"),
43
+ ("CB", "CG", "CD", "NE"),
44
+ ("CG", "CD", "NE", "CZ"),
45
+ ],
46
+ "LEU": [
47
+ ("N", "CA", "CB", "CG"),
48
+ # By convention chi2 is defined using CD1 instead of CD2
49
+ ("CA", "CB", "CG", "CD1"),
50
+ ],
51
+ "VAL": [("N", "CA", "CB", "CG1")],
52
+ "ILE": [("N", "CA", "CB", "CG1"), ("CA", "CB", "CG1", "CD1")],
53
+ "MET": [
54
+ ("N", "CA", "CB", "CG"),
55
+ ("CA", "CB", "CG", "SD"),
56
+ ("CB", "CG", "SD", "CE"),
57
+ ],
58
+ "LYS": [
59
+ ("N", "CA", "CB", "CG"),
60
+ ("CA", "CB", "CG", "CD"),
61
+ ("CB", "CG", "CD", "CE"),
62
+ ("CG", "CD", "CE", "NZ"),
63
+ ],
64
+ "PHE": [
65
+ ("N", "CA", "CB", "CG"),
66
+ ("CA", "CB", "CG", "CD1"),
67
+ ],
68
+ "TRP": [
69
+ ("N", "CA", "CB", "CG"),
70
+ ("CA", "CB", "CG", "CD1"),
71
+ ],
72
+ "TYR": [
73
+ ("N", "CA", "CB", "CG"),
74
+ ("CA", "CB", "CG", "CD1"),
75
+ ],
76
+ "ASN": [("N", "CA", "CB", "CG"), ("CA", "CB", "CG", "OD1")],
77
+ "GLN": [
78
+ ("N", "CA", "CB", "CG"),
79
+ ("CA", "CB", "CG", "CD"),
80
+ ("CB", "CG", "CD", "OE1"),
81
+ ],
82
+ "ASP": [("N", "CA", "CB", "CG"), ("CA", "CB", "CG", "OD1")],
83
+ "GLU": [
84
+ ("N", "CA", "CB", "CG"),
85
+ ("CA", "CB", "CG", "CD"),
86
+ ("CB", "CG", "CD", "OE1"),
87
+ ],
88
+ "CYS": [("N", "CA", "CB", "SG")],
89
+ "HIS": [("N", "CA", "CB", "CG"), ("CA", "CB", "CG", "ND1")],
90
+ "PRO": [("N", "CA", "CB", "CG"), ("CA", "CB", "CG", "CD")],
91
+ "SER": [("N", "CA", "CB", "OG")],
92
+ "THR": [("N", "CA", "CB", "OG1")],
93
+ }
94
+
35
95
 
36
96
  def displacement(atoms1, atoms2, box=None):
37
97
  """
@@ -492,7 +552,7 @@ def dihedral_backbone(atom_array):
492
552
 
493
553
  Returns
494
554
  -------
495
- phi, psi, omega : ndarray
555
+ phi, psi, omega : ndarray, shape=(m,n) or shape=(n,), dtype=float
496
556
  An array containing the 3 backbone dihedral angles for every CA atom.
497
557
  `phi` is not defined at the N-terminus, `psi` and `omega` are not defined at the
498
558
  C-terminus.
@@ -562,6 +622,96 @@ def dihedral_backbone(atom_array):
562
622
  return phi, psi, omg
563
623
 
564
624
 
625
+ def dihedral_side_chain(atoms):
626
+ r"""
627
+ Measure the side chain :math:`\chi` dihedral angles of amino acid residues.
628
+
629
+ Parameters
630
+ ----------
631
+ atoms : AtomArray or AtomArrayStack
632
+ The protein structure to measure the side chain dihedral angles for.
633
+
634
+ Returns
635
+ -------
636
+ chi : ndarray, shape=(m, n, 4) or shape=(n, 4), dtype=float
637
+ An array containing the up to four side chain dihedral angles for every
638
+ amino acid residue.
639
+ Trailing :math:`\chi` angles that are not defined for an amino acid are filled
640
+ with :math:`NaN` values.
641
+ The same is true for all residues that are not canonical amino acids.
642
+
643
+ Notes
644
+ -----
645
+ By convention, the :math:`\chi_2` angle of leucine is defined using ``CD1``
646
+ instead of ``CD2``.
647
+
648
+ Examples
649
+ --------
650
+
651
+ >>> res_ids, res_names = get_residues(atom_array)
652
+ >>> dihedrals = dihedral_side_chain(atom_array)
653
+ >>> for res_id, res_name, dihedrals in zip(res_ids, res_names, dihedrals):
654
+ ... print(f"{res_name.capitalize()}{res_id:<2d}:", dihedrals)
655
+ Asn1 : [-1.180 -0.066 nan nan]
656
+ Leu2 : [0.923 1.866 nan nan]
657
+ Tyr3 : [-2.593 -1.487 nan nan]
658
+ Ile4 : [-0.781 -0.972 nan nan]
659
+ Gln5 : [-2.557 1.410 -1.776 nan]
660
+ Trp6 : [3.117 1.372 nan nan]
661
+ Leu7 : [-1.33 3.08 nan nan]
662
+ Lys8 : [ 1.320 1.734 3.076 -2.022]
663
+ Asp9 : [-1.623 0.909 nan nan]
664
+ Gly10: [nan nan nan nan]
665
+ Gly11: [nan nan nan nan]
666
+ Pro12: [-0.331 0.539 nan nan]
667
+ Ser13: [-1.067 nan nan nan]
668
+ Ser14: [-2.514 nan nan nan]
669
+ Gly15: [nan nan nan nan]
670
+ Arg16: [ 1.032 -3.063 1.541 -1.568]
671
+ Pro17: [ 0.522 -0.601 nan nan]
672
+ Pro18: [ 0.475 -0.577 nan nan]
673
+ Pro19: [ 0.561 -0.602 nan nan]
674
+ Ser20: [-1.055 nan nan nan]
675
+ """
676
+ is_multi_model = isinstance(atoms, AtomArrayStack)
677
+
678
+ chi_atoms = _all_chi_atoms()
679
+ res_names = atoms.res_name[get_residue_starts(atoms)]
680
+ chi_atom_coord = coord_for_atom_name_per_residue(
681
+ atoms, chi_atoms, filter_canonical_amino_acids(atoms)
682
+ )
683
+ chi_atoms_to_coord_index = {atom_name: i for i, atom_name in enumerate(chi_atoms)}
684
+
685
+ if is_multi_model:
686
+ shape = (atoms.stack_depth(), len(res_names), 4)
687
+ else:
688
+ shape = (len(res_names), 4)
689
+ chi_angles = np.full(shape, np.nan, dtype=np.float32)
690
+ for res_name, chi_atom_names_for_all_angles in _CHI_ATOMS.items():
691
+ res_mask = res_names == res_name
692
+ for chi_i, chi_atom_names in enumerate(chi_atom_names_for_all_angles):
693
+ dihedrals = dihedral(
694
+ chi_atom_coord[
695
+ chi_atoms_to_coord_index[chi_atom_names[0]], ..., res_mask, :
696
+ ],
697
+ chi_atom_coord[
698
+ chi_atoms_to_coord_index[chi_atom_names[1]], ..., res_mask, :
699
+ ],
700
+ chi_atom_coord[
701
+ chi_atoms_to_coord_index[chi_atom_names[2]], ..., res_mask, :
702
+ ],
703
+ chi_atom_coord[
704
+ chi_atoms_to_coord_index[chi_atom_names[3]], ..., res_mask, :
705
+ ],
706
+ )
707
+ if is_multi_model:
708
+ # Swap dimensions due to NumPy's behavior when using advanced indexing
709
+ # (https://numpy.org/devdocs/user/basics.indexing.html#combining-advanced-and-basic-indexing)
710
+ dihedrals = dihedrals.T
711
+ chi_angles[..., res_mask, chi_i] = dihedrals
712
+ return chi_angles
713
+
714
+
565
715
  def centroid(atoms):
566
716
  """
567
717
  Measure the centroid of a structure.
@@ -653,3 +803,15 @@ def _displacement_triclinic_box(fractions, box, disp):
653
803
  disp[:] = shifted_diffs[
654
804
  np.arange(len(shifted_diffs)), np.argmin(sq_distance, axis=1)
655
805
  ]
806
+
807
+
808
+ @functools.cache
809
+ def _all_chi_atoms():
810
+ """
811
+ Get the names of the atoms participating in any chi angle.
812
+ """
813
+ atom_names = set()
814
+ for angles in _CHI_ATOMS.values():
815
+ for angle in angles:
816
+ atom_names.update(angle)
817
+ return sorted(atom_names)
@@ -6,6 +6,7 @@ __name__ = "biotite.structure.info"
6
6
  __author__ = "Patrick Kunzmann"
7
7
  __all__ = ["residue"]
8
8
 
9
+ import functools
9
10
  from biotite.structure.info.ccd import get_ccd
10
11
 
11
12
  # fmt: off
@@ -75,6 +76,13 @@ def residue(res_name, allow_missing_coord=False):
75
76
  ['CB' 'HB3']
76
77
  ['OXT' 'HXT']]
77
78
  """
79
+ # Use a cache internally, but always return a copy,
80
+ # as the returned AtomArray is mutable
81
+ return _residue(res_name, allow_missing_coord).copy()
82
+
83
+
84
+ @functools.lru_cache(maxsize=100)
85
+ def _residue(res_name, allow_missing_coord=False):
78
86
  # Avoid circular import
79
87
  from biotite.structure.io.pdbx import get_component
80
88
 
Binary file
@@ -16,6 +16,7 @@ __all__ = [
16
16
  "list_assemblies",
17
17
  "get_assembly",
18
18
  "get_unit_cell",
19
+ "get_symmetry_mates",
19
20
  ]
20
21
 
21
22
  import warnings
@@ -6,12 +6,16 @@ __name__ = "biotite.structure.io.pdb"
6
6
  __author__ = "Patrick Kunzmann, Daniel Bauer, Claude J. Rogers"
7
7
  __all__ = ["PDBFile"]
8
8
 
9
+ import itertools
9
10
  import warnings
10
11
  from collections import namedtuple
11
12
  import numpy as np
12
13
  from biotite.file import InvalidFileError, TextFile
13
14
  from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat
14
- from biotite.structure.bonds import BondList, connect_via_residue_names
15
+ from biotite.structure.bonds import (
16
+ BondList,
17
+ connect_via_residue_names,
18
+ )
15
19
  from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
16
20
  from biotite.structure.error import BadStructureError
17
21
  from biotite.structure.filter import (
@@ -19,6 +23,7 @@ from biotite.structure.filter import (
19
23
  filter_highest_occupancy_altloc,
20
24
  filter_solvent,
21
25
  )
26
+ from biotite.structure.info.bonds import bonds_in_residue
22
27
  from biotite.structure.io.pdb.hybrid36 import (
23
28
  decode_hybrid36,
24
29
  encode_hybrid36,
@@ -544,7 +549,16 @@ class PDBFile(TextFile):
544
549
  # Read bonds
545
550
  if include_bonds:
546
551
  bond_list = self._get_bonds(atom_id)
547
- bond_list = bond_list.merge(connect_via_residue_names(array))
552
+ # Create bond dict containing only non-hetero residues (+ water)
553
+ custom_bond_dict = {
554
+ res_name: bonds_in_residue(res_name)
555
+ for res_name in itertools.chain(
556
+ np.unique(array[..., ~array.hetero].res_name), ["HOH"]
557
+ )
558
+ }
559
+ bond_list = bond_list.merge(
560
+ connect_via_residue_names(array, custom_bond_dict=custom_bond_dict)
561
+ )
548
562
  array.bonds = bond_list
549
563
 
550
564
  return array
@@ -936,7 +950,11 @@ class PDBFile(TextFile):
936
950
  if transform_start is None:
937
951
  raise InvalidFileError("No 'BIOMT' records found for chosen assembly")
938
952
  rotations, translations = _parse_transformations(
939
- assembly_lines[transform_start:stop]
953
+ [
954
+ line
955
+ for line in assembly_lines[transform_start:stop]
956
+ if len(line.strip()) > 0
957
+ ]
940
958
  )
941
959
  # Filter affected chains
942
960
  sub_structure = structure[
@@ -1193,7 +1211,7 @@ class PDBFile(TextFile):
1193
1211
  conect_lines = [line for line in self.lines if line.startswith("CONECT")]
1194
1212
 
1195
1213
  # Mapping from atom ids to indices in an AtomArray
1196
- atom_id_to_index = np.zeros(atom_ids[-1] + 1, dtype=int)
1214
+ atom_id_to_index = np.full(atom_ids[-1] + 1, -1, dtype=int)
1197
1215
  try:
1198
1216
  for i, id in enumerate(atom_ids):
1199
1217
  atom_id_to_index[id] = i
@@ -1202,15 +1220,21 @@ class PDBFile(TextFile):
1202
1220
 
1203
1221
  bonds = []
1204
1222
  for line in conect_lines:
1205
- center_id = atom_id_to_index[decode_hybrid36(line[6:11])]
1223
+ center_index = atom_id_to_index[decode_hybrid36(line[6:11])]
1224
+ if center_index == -1:
1225
+ # Atom ID is not in the AtomArray (probably removed altloc)
1226
+ continue
1206
1227
  for i in range(11, 31, 5):
1207
1228
  id_string = line[i : i + 5]
1208
1229
  try:
1209
- id = atom_id_to_index[decode_hybrid36(id_string)]
1230
+ contact_index = atom_id_to_index[decode_hybrid36(id_string)]
1231
+ if contact_index == -1:
1232
+ # Atom ID is not in the AtomArray (probably removed altloc)
1233
+ continue
1210
1234
  except ValueError:
1211
1235
  # String is empty -> no further IDs
1212
1236
  break
1213
- bonds.append((center_id, id))
1237
+ bonds.append((center_index, contact_index))
1214
1238
 
1215
1239
  # The length of the 'atom_ids' array
1216
1240
  # is equal to the length of the AtomArray
@@ -292,7 +292,7 @@ class BinaryCIFColumn(_Component):
292
292
  else:
293
293
  # Array needs to be converted, but masked values are
294
294
  # not necessarily convertible
295
- # (e.g. '' cannot be converted to int)
295
+ # (e.g. '.' cannot be converted to int)
296
296
  if masked_value is None:
297
297
  array = np.zeros(len(self._data), dtype=dtype)
298
298
  else:
@@ -511,7 +511,7 @@ class BinaryCIFBlock(_HierarchicalContainer):
511
511
 
512
512
  def __delitem__(self, key):
513
513
  try:
514
- return super().__setitem__("_" + key)
514
+ return super().__delitem__("_" + key)
515
515
  except KeyError:
516
516
  raise KeyError(key)
517
517
 
@@ -581,9 +581,12 @@ class BinaryCIFFile(File, _HierarchicalContainer):
581
581
 
582
582
  @property
583
583
  def block(self):
584
- if len(self) != 1:
584
+ if len(self) == 0:
585
+ raise ValueError("There are no blocks in the file")
586
+ elif len(self) > 1:
585
587
  raise ValueError("There are multiple blocks in the file")
586
- return self[next(iter(self))]
588
+ else:
589
+ return self[next(iter(self))]
587
590
 
588
591
  @staticmethod
589
592
  def subcomponent_class():
@@ -243,7 +243,7 @@ class CIFColumn:
243
243
  else:
244
244
  # Array needs to be converted, but masked values are
245
245
  # not necessarily convertible
246
- # (e.g. '' cannot be converted to int)
246
+ # (e.g. '.' cannot be converted to int)
247
247
  if masked_value is None:
248
248
  array = np.zeros(len(self._data), dtype=dtype)
249
249
  else:
@@ -799,9 +799,12 @@ class CIFFile(_Component, File, MutableMapping):
799
799
 
800
800
  @property
801
801
  def block(self):
802
- if len(self) != 1:
802
+ if len(self) == 0:
803
+ raise ValueError("There are no blocks in the file")
804
+ elif len(self) > 1:
803
805
  raise ValueError("There are multiple blocks in the file")
804
- return self[next(iter(self))]
806
+ else:
807
+ return self[next(iter(self))]
805
808
 
806
809
  @staticmethod
807
810
  def subcomponent_class():
@@ -56,14 +56,14 @@ def compress(data, float_tolerance=None, rtol=1e-6, atol=1e-4):
56
56
  >>> pdbx_file.write(uncompressed_file)
57
57
  >>> _ = uncompressed_file.seek(0)
58
58
  >>> print(f"{len(uncompressed_file.read()) // 1000} KB")
59
- 927 KB
59
+ 937 KB
60
60
  >>> # Write compressed file
61
61
  >>> pdbx_file = compress(pdbx_file)
62
62
  >>> compressed_file = BytesIO()
63
63
  >>> pdbx_file.write(compressed_file)
64
64
  >>> _ = compressed_file.seek(0)
65
65
  >>> print(f"{len(compressed_file.read()) // 1000} KB")
66
- 111 KB
66
+ 114 KB
67
67
  """
68
68
  if float_tolerance is not None:
69
69
  warnings.warn(
@@ -140,8 +140,8 @@ def _compress_data(bcif_data, rtol, atol):
140
140
  # Run encode to initialize the data and offset arrays
141
141
  indices = encoding.encode(array)
142
142
  offsets = np.cumsum([0] + [len(s) for s in encoding.strings])
143
- encoding.data_encoding, _ = _find_best_integer_compression(indices)
144
- encoding.offset_encoding, _ = _find_best_integer_compression(offsets)
143
+ encoding.data_encoding = _find_best_integer_compression(indices)
144
+ encoding.offset_encoding = _find_best_integer_compression(offsets)
145
145
  return bcif.BinaryCIFData(array, [encoding])
146
146
 
147
147
  elif np.issubdtype(array.dtype, np.floating):
@@ -159,18 +159,22 @@ def _compress_data(bcif_data, rtol, atol):
159
159
  # -> do not use integer encoding
160
160
  return bcif.BinaryCIFData(array, [ByteArrayEncoding()])
161
161
  else:
162
- best_encoding, size_compressed = _find_best_integer_compression(
163
- integer_array
162
+ best_encoding = _find_best_integer_compression(integer_array)
163
+ compressed_data = bcif.BinaryCIFData(
164
+ array, [to_integer_encoding] + best_encoding
164
165
  )
165
- if size_compressed < _data_size_in_file(bcif.BinaryCIFData(array)):
166
- return bcif.BinaryCIFData(array, [to_integer_encoding] + best_encoding)
166
+ uncompressed_data = bcif.BinaryCIFData(array, [ByteArrayEncoding()])
167
+ if _data_size_in_file(compressed_data) < _data_size_in_file(
168
+ uncompressed_data
169
+ ):
170
+ return compressed_data
167
171
  else:
168
172
  # The float array is smaller -> encode it directly as bytes
169
- return bcif.BinaryCIFData(array, [ByteArrayEncoding()])
173
+ return uncompressed_data
170
174
 
171
175
  elif np.issubdtype(array.dtype, np.integer):
172
176
  array = _to_smallest_integer_type(array)
173
- encodings, _ = _find_best_integer_compression(array)
177
+ encodings = _find_best_integer_compression(array)
174
178
  return bcif.BinaryCIFData(array, encodings)
175
179
 
176
180
  else:
@@ -233,7 +237,7 @@ def _find_best_integer_compression(array):
233
237
  if size < smallest_size:
234
238
  best_encoding_sequence = encodings
235
239
  smallest_size = size
236
- return best_encoding_sequence, smallest_size
240
+ return best_encoding_sequence
237
241
 
238
242
 
239
243
  def _estimate_packed_length(array, packed_byte_count):
@@ -55,6 +55,7 @@ from biotite.structure.io.pdbx.bcif import (
55
55
  from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
56
56
  from biotite.structure.io.pdbx.component import MaskValue
57
57
  from biotite.structure.io.pdbx.encoding import StringArrayEncoding
58
+ from biotite.structure.repair import create_continuous_res_ids
58
59
  from biotite.structure.residues import (
59
60
  get_residue_count,
60
61
  get_residue_positions,
@@ -496,12 +497,6 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
496
497
  atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id"
497
498
  ).as_array(str),
498
499
  )
499
- array.set_annotation(
500
- "res_id",
501
- _get_or_fallback(
502
- atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
503
- ).as_array(int, -1),
504
- )
505
500
  array.set_annotation("ins_code", atom_site["pdbx_PDB_ins_code"].as_array(str, ""))
506
501
  array.set_annotation(
507
502
  "res_name",
@@ -518,6 +513,22 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
518
513
  )
519
514
  array.set_annotation("element", atom_site["type_symbol"].as_array(str))
520
515
 
516
+ # Special handling for `res_id`, as the `label_seq_id` is equal (`.`) for all
517
+ # hetero residues, which makes distinguishing subsequent residues from one another
518
+ # difficult (https://github.com/biotite-dev/biotite/issues/553)
519
+ res_id = _get_or_fallback(
520
+ atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
521
+ ).as_array(int, -1)
522
+ if not use_author_fields and "auth_seq_id" in atom_site:
523
+ # Therefore, the `auth_seq_id` is still used to determine residue starts
524
+ # in `create_continuous_res_ids()`, even if `use_author_fields = False`.
525
+ res_id_for_residue_starts = atom_site["auth_seq_id"].as_array(int, -1)
526
+ array.set_annotation("res_id", res_id_for_residue_starts)
527
+ fallback_res_ids = create_continuous_res_ids(array)
528
+ array.set_annotation("res_id", np.where(res_id == -1, fallback_res_ids, res_id))
529
+ else:
530
+ array.set_annotation("res_id", res_id)
531
+
521
532
  if "atom_id" in extra_fields:
522
533
  if "id" in atom_site:
523
534
  array.set_annotation("atom_id", atom_site["id"].as_array(int))
@@ -775,7 +786,10 @@ def _filter_altloc(array, atom_site, altloc):
775
786
  if altloc == "all":
776
787
  array.set_annotation("altloc_id", altloc_ids.as_array(str))
777
788
  return array, atom_site
778
- elif altloc_ids is None or (altloc_ids.mask.array != MaskValue.PRESENT).all():
789
+ elif altloc_ids is None or (
790
+ altloc_ids.mask is not None
791
+ and (altloc_ids.mask.array != MaskValue.PRESENT).all()
792
+ ):
779
793
  # No altlocs in atom_site category
780
794
  return array, atom_site
781
795
  elif altloc == "occupancy" and occupancy is not None:
@@ -873,11 +887,7 @@ def set_structure(
873
887
  this parameter is ignored.
874
888
  If the file is empty, a new data block will be created.
875
889
  include_bonds : bool, optional
876
- If set to true and `array` has associated ``bonds`` , the
877
- intra-residue bonds will be written into the ``chem_comp_bond``
878
- category.
879
- Inter-residue bonds will be written into the ``struct_conn``
880
- independent of this parameter.
890
+ DEPRECATED: Has no effect anymore.
881
891
  extra_fields : list of str, optional
882
892
  List of additional fields from the ``atom_site`` category
883
893
  that should be written into the file.
@@ -898,6 +908,13 @@ def set_structure(
898
908
  >>> set_structure(file, atom_array)
899
909
  >>> file.write(os.path.join(path_to_directory, "structure.cif"))
900
910
  """
911
+ if include_bonds:
912
+ warnings.warn(
913
+ "`include_bonds` parameter is deprecated, "
914
+ "intra-residue are always written, if available",
915
+ DeprecationWarning,
916
+ )
917
+
901
918
  _check_non_empty(array)
902
919
 
903
920
  block = _get_or_create_block(pdbx_file, data_block)
@@ -975,10 +992,9 @@ def set_structure(
975
992
  struct_conn = _set_inter_residue_bonds(array, atom_site)
976
993
  if struct_conn is not None:
977
994
  block["struct_conn"] = struct_conn
978
- if include_bonds:
979
- chem_comp_bond = _set_intra_residue_bonds(array, atom_site)
980
- if chem_comp_bond is not None:
981
- block["chem_comp_bond"] = chem_comp_bond
995
+ chem_comp_bond = _set_intra_residue_bonds(array, atom_site)
996
+ if chem_comp_bond is not None:
997
+ block["chem_comp_bond"] = chem_comp_bond
982
998
 
983
999
  # In case of a single model handle each coordinate
984
1000
  # simply like a flattened array
@@ -1652,11 +1668,11 @@ def get_assembly(
1652
1668
  If set to true, a :class:`BondList` will be created for the
1653
1669
  resulting :class:`AtomArray` containing the bond information
1654
1670
  from the file.
1655
- Bonds, whose order could not be determined from the
1656
- *Chemical Component Dictionary*
1657
- (e.g. especially inter-residue bonds),
1658
- have :attr:`BondType.ANY`, since the PDB format itself does
1659
- not support bond orders.
1671
+ Inter-residue bonds will be read from the ``struct_conn``
1672
+ category.
1673
+ Intra-residue bonds will be read from the ``chem_comp_bond`` category, if
1674
+ available, otherwise they will be derived from the Chemical
1675
+ Component Dictionary.
1660
1676
 
1661
1677
  Returns
1662
1678
  -------
@@ -1926,11 +1942,11 @@ def get_unit_cell(
1926
1942
  If set to true, a :class:`BondList` will be created for the
1927
1943
  resulting :class:`AtomArray` containing the bond information
1928
1944
  from the file.
1929
- Bonds, whose order could not be determined from the
1930
- *Chemical Component Dictionary*
1931
- (e.g. especially inter-residue bonds),
1932
- have :attr:`BondType.ANY`, since the PDB format itself does
1933
- not support bond orders.
1945
+ Inter-residue bonds will be read from the ``struct_conn``
1946
+ category.
1947
+ Intra-residue bonds will be read from the ``chem_comp_bond`` category, if
1948
+ available, otherwise they will be derived from the Chemical
1949
+ Component Dictionary.
1934
1950
 
1935
1951
  Returns
1936
1952
  -------
@@ -225,9 +225,13 @@ class Encoding(_Component, metaclass=ABCMeta):
225
225
  -------
226
226
  decoded_data : ndarray
227
227
  The decoded data.
228
+
229
+ Warnings
230
+ --------
231
+ When overriding this method, do not omit bound checks with
232
+ ``@cython.boundscheck(False)`` or ``@cython.wraparound(False)``,
233
+ since the file content may be invalid/malicious.
228
234
  """
229
- # Important: Do not omit bound checks for decoding,
230
- # since the file content may be invalid/malicious.
231
235
  raise NotImplementedError()
232
236
 
233
237
  def __str__(self):
@@ -883,17 +887,39 @@ class StringArrayEncoding(Encoding):
883
887
  else:
884
888
  check_present = True
885
889
 
886
- string_order = _safe_cast(np.argsort(self.strings), np.int32)
887
- sorted_strings = self.strings[string_order]
888
- sorted_indices = np.searchsorted(sorted_strings, data)
889
- indices = string_order[sorted_indices]
890
- if check_present and not np.all(self.strings[indices] == data):
890
+ if len(self.strings) > 0:
891
+ string_order = _safe_cast(np.argsort(self.strings), np.int32)
892
+ sorted_strings = self.strings[string_order]
893
+ sorted_indices = np.searchsorted(sorted_strings, data)
894
+ indices = string_order[sorted_indices]
895
+ # `"" not in self.strings` can be quite costly and is only necessary,
896
+ # if the `strings` were given by the user, as otherwise we always
897
+ # include an empty string explicitly when we compute them in this function
898
+ # -> Only run if `check_present` is True
899
+ if check_present and "" not in self.strings:
900
+ # Represent empty strings as -1
901
+ indices[data == ""] = -1
902
+ else:
903
+ # There are no strings -> The indices can only ever be -1 to indicate
904
+ # missing values
905
+ # The check if this is correct is done below
906
+ indices = np.full(data.shape[0], -1, dtype=np.int32)
907
+
908
+ valid_indices_mask = indices != -1
909
+ if check_present and not np.all(
910
+ self.strings[indices[valid_indices_mask]] == data[valid_indices_mask]
911
+ ):
891
912
  raise ValueError("Data contains strings not present in 'strings'")
892
913
  return encode_stepwise(indices, self.data_encoding)
893
914
 
894
915
  def decode(self, data):
895
916
  indices = decode_stepwise(data, self.data_encoding)
896
- return self.strings[indices]
917
+ # Initialize with empty strings
918
+ strings = np.zeros(indices.shape[0], dtype=self.strings.dtype)
919
+ # `-1` indices indicate missing values
920
+ valid_indices_mask = indices != -1
921
+ strings[valid_indices_mask] = self.strings[indices[valid_indices_mask]]
922
+ return strings
897
923
 
898
924
  def __eq__(self, other):
899
925
  if not isinstance(other, type(self)):
@@ -1009,6 +1035,11 @@ def decode_stepwise(data, encoding):
1009
1035
  """
1010
1036
  for enc in reversed(encoding):
1011
1037
  data = enc.decode(data)
1038
+ # ByteArrayEncoding may decode into a non-writable array,
1039
+ # as it creates the ndarray cheaply from buffer
1040
+ if not data.flags.writeable:
1041
+ # Make the resulting ndarray writable, by copying the underlying buffer
1042
+ data = data.copy()
1012
1043
  return data
1013
1044
 
1014
1045