PyPI - biotite - Versions diffs - 1.3.0__cp312-cp312-macosx_11_0_arm64.whl → 1.5.0__cp312-cp312-macosx_11_0_arm64.whl - Mend

biotite 1.3.0__cp312-cp312-macosx_11_0_arm64.whl → 1.5.0__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

biotite/application/dssp/app.py +63 -6
biotite/database/afdb/download.py +12 -6
biotite/database/rcsb/download.py +1 -0
biotite/database/rcsb/query.py +2 -2
biotite/interface/pymol/object.py +3 -1
biotite/interface/rdkit/mol.py +5 -5
biotite/sequence/align/banded.cpython-312-darwin.so +0 -0
biotite/sequence/align/kmeralphabet.cpython-312-darwin.so +0 -0
biotite/sequence/align/kmersimilarity.cpython-312-darwin.so +0 -0
biotite/sequence/align/kmertable.cpython-312-darwin.so +0 -0
biotite/sequence/align/localgapped.cpython-312-darwin.so +0 -0
biotite/sequence/align/localungapped.cpython-312-darwin.so +0 -0
biotite/sequence/align/multiple.cpython-312-darwin.so +0 -0
biotite/sequence/align/pairwise.cpython-312-darwin.so +0 -0
biotite/sequence/align/permutation.cpython-312-darwin.so +0 -0
biotite/sequence/align/selector.cpython-312-darwin.so +0 -0
biotite/sequence/align/tracetable.cpython-312-darwin.so +0 -0
biotite/sequence/codec.cpython-312-darwin.so +0 -0
biotite/sequence/phylo/nj.cpython-312-darwin.so +0 -0
biotite/sequence/phylo/tree.cpython-312-darwin.so +0 -0
biotite/sequence/phylo/upgma.cpython-312-darwin.so +0 -0
biotite/structure/atoms.py +1 -1
biotite/structure/bonds.cpython-312-darwin.so +0 -0
biotite/structure/bonds.pyx +67 -6
biotite/structure/box.py +1 -1
biotite/structure/celllist.cpython-312-darwin.so +0 -0
biotite/structure/chains.py +34 -0
biotite/structure/charges.cpython-312-darwin.so +0 -0
biotite/structure/compare.py +2 -0
biotite/structure/filter.py +2 -1
biotite/structure/geometry.py +164 -2
biotite/structure/info/atoms.py +8 -0
biotite/structure/info/components.bcif +0 -0
biotite/structure/io/pdb/convert.py +1 -0
biotite/structure/io/pdb/file.py +31 -7
biotite/structure/io/pdb/hybrid36.cpython-312-darwin.so +0 -0
biotite/structure/io/pdbx/bcif.py +7 -4
biotite/structure/io/pdbx/cif.py +6 -3
biotite/structure/io/pdbx/compress.py +15 -11
biotite/structure/io/pdbx/convert.py +42 -26
biotite/structure/io/pdbx/encoding.cpython-312-darwin.so +0 -0
biotite/structure/io/pdbx/encoding.pyx +39 -8
biotite/structure/residues.py +173 -1
biotite/structure/rings.py +117 -1
biotite/structure/sasa.cpython-312-darwin.so +0 -0
biotite/structure/segments.py +39 -3
biotite/structure/util.py +14 -22
biotite/version.py +16 -3
{biotite-1.3.0.dist-info → biotite-1.5.0.dist-info}/METADATA +1 -1
{biotite-1.3.0.dist-info → biotite-1.5.0.dist-info}/RECORD +52 -52
{biotite-1.3.0.dist-info → biotite-1.5.0.dist-info}/WHEEL +0 -0
{biotite-1.3.0.dist-info → biotite-1.5.0.dist-info}/licenses/LICENSE.rst +0 -0

biotite/structure/geometry.py CHANGED Viewed

@@ -19,19 +19,79 @@ __all__ = [
     "dihedral",
     "index_dihedral",
     "dihedral_backbone",
+    "dihedral_side_chain",
     "centroid",
 ]
+import functools
 import numpy as np
 from biotite.structure.atoms import AtomArray, AtomArrayStack, coord
 from biotite.structure.box import coord_to_fraction, fraction_to_coord, is_orthogonal
-from biotite.structure.filter import filter_amino_acids
+from biotite.structure.filter import filter_amino_acids, filter_canonical_amino_acids
+from biotite.structure.residues import get_residue_starts
 from biotite.structure.util import (
     coord_for_atom_name_per_residue,
     norm_vector,
     vector_dot,
 )
+# Atom names defining the side chain chi dihedral angles, keyed by residue name.
+# Each value holds one 4-tuple of atom names per chi angle, ordered chi1, chi2, ...;
+# the position in the list is the column `dihedral_side_chain()` fills for it.
+# Residue names that are absent here (e.g. GLY, ALA) have no chi angle defined.
+_CHI_ATOMS = {
+    "ARG": [
+        ("N", "CA", "CB", "CG"),
+        ("CA", "CB", "CG", "CD"),
+        ("CB", "CG", "CD", "NE"),
+        ("CG", "CD", "NE", "CZ"),
+    ],
+    "LEU": [
+        ("N", "CA", "CB", "CG"),
+        # By convention chi2 is defined using CD1 instead of CD2
+        ("CA", "CB", "CG", "CD1"),
+    ],
+    "VAL": [("N", "CA", "CB", "CG1")],
+    "ILE": [("N", "CA", "CB", "CG1"), ("CA", "CB", "CG1", "CD1")],
+    "MET": [
+        ("N", "CA", "CB", "CG"),
+        ("CA", "CB", "CG", "SD"),
+        ("CB", "CG", "SD", "CE"),
+    ],
+    "LYS": [
+        ("N", "CA", "CB", "CG"),
+        ("CA", "CB", "CG", "CD"),
+        ("CB", "CG", "CD", "CE"),
+        ("CG", "CD", "CE", "NZ"),
+    ],
+    "PHE": [
+        ("N", "CA", "CB", "CG"),
+        ("CA", "CB", "CG", "CD1"),
+    ],
+    "TRP": [
+        ("N", "CA", "CB", "CG"),
+        ("CA", "CB", "CG", "CD1"),
+    ],
+    "TYR": [
+        ("N", "CA", "CB", "CG"),
+        ("CA", "CB", "CG", "CD1"),
+    ],
+    "ASN": [("N", "CA", "CB", "CG"), ("CA", "CB", "CG", "OD1")],
+    "GLN": [
+        ("N", "CA", "CB", "CG"),
+        ("CA", "CB", "CG", "CD"),
+        ("CB", "CG", "CD", "OE1"),
+    ],
+    "ASP": [("N", "CA", "CB", "CG"), ("CA", "CB", "CG", "OD1")],
+    "GLU": [
+        ("N", "CA", "CB", "CG"),
+        ("CA", "CB", "CG", "CD"),
+        ("CB", "CG", "CD", "OE1"),
+    ],
+    "CYS": [("N", "CA", "CB", "SG")],
+    "HIS": [("N", "CA", "CB", "CG"), ("CA", "CB", "CG", "ND1")],
+    "PRO": [("N", "CA", "CB", "CG"), ("CA", "CB", "CG", "CD")],
+    "SER": [("N", "CA", "CB", "OG")],
+    "THR": [("N", "CA", "CB", "OG1")],
+}
 def displacement(atoms1, atoms2, box=None):
     """
@@ -492,7 +552,7 @@ def dihedral_backbone(atom_array):
     Returns
     -------
-    phi, psi, omega : ndarray
+    phi, psi, omega : ndarray, shape=(m,n) or shape=(n,), dtype=float
         An array containing the 3 backbone dihedral angles for every CA atom.
         `phi` is not defined at the N-terminus, `psi` and `omega` are not defined at the
         C-terminus.
@@ -562,6 +622,96 @@ def dihedral_backbone(atom_array):
     return phi, psi, omg
+def dihedral_side_chain(atoms):
+    r"""
+    Compute the side chain :math:`\chi` dihedral angles for each residue.
+    Parameters
+    ----------
+    atoms : AtomArray or AtomArrayStack
+        The protein structure whose side chain dihedral angles are measured.
+    Returns
+    -------
+    chi : ndarray, shape=(m, n, 4) or shape=(n, 4), dtype=float
+        The (at most four) side chain dihedral angles of each residue.
+        Columns belonging to :math:`\chi` angles an amino acid does not have
+        are set to :math:`NaN`.
+        Residues that are not canonical amino acids contain only :math:`NaN`.
+    Notes
+    -----
+    For leucine the :math:`\chi_2` angle is, by convention, measured to
+    ``CD1`` and not to ``CD2``.
+    Examples
+    --------
+    >>> res_ids, res_names = get_residues(atom_array)
+    >>> dihedrals = dihedral_side_chain(atom_array)
+    >>> for res_id, res_name, dihedrals in zip(res_ids, res_names, dihedrals):
+    ...     print(f"{res_name.capitalize()}{res_id:<2d}:", dihedrals)
+    Asn1 : [-1.180 -0.066    nan    nan]
+    Leu2 : [0.923 1.866   nan   nan]
+    Tyr3 : [-2.593 -1.487    nan    nan]
+    Ile4 : [-0.781 -0.972    nan    nan]
+    Gln5 : [-2.557  1.410 -1.776    nan]
+    Trp6 : [3.117 1.372   nan   nan]
+    Leu7 : [-1.33  3.08   nan   nan]
+    Lys8 : [ 1.320  1.734  3.076 -2.022]
+    Asp9 : [-1.623  0.909    nan    nan]
+    Gly10: [nan nan nan nan]
+    Gly11: [nan nan nan nan]
+    Pro12: [-0.331  0.539    nan    nan]
+    Ser13: [-1.067    nan    nan    nan]
+    Ser14: [-2.514    nan    nan    nan]
+    Gly15: [nan nan nan nan]
+    Arg16: [ 1.032 -3.063  1.541 -1.568]
+    Pro17: [ 0.522 -0.601    nan    nan]
+    Pro18: [ 0.475 -0.577    nan    nan]
+    Pro19: [ 0.561 -0.602    nan    nan]
+    Ser20: [-1.055    nan    nan    nan]
+    """
+    stacked = isinstance(atoms, AtomArrayStack)
+    atom_names = _all_chi_atoms()
+    residue_names = atoms.res_name[get_residue_starts(atoms)]
+    # Coordinates of every chi atom name for every residue;
+    # the first axis runs over `atom_names`
+    coord_per_atom_name = coord_for_atom_name_per_residue(
+        atoms, atom_names, filter_canonical_amino_acids(atoms)
+    )
+    coord_index = {name: i for i, name in enumerate(atom_names)}
+    n_residues = len(residue_names)
+    shape = (atoms.stack_depth(), n_residues, 4) if stacked else (n_residues, 4)
+    # Every angle that is not computed below stays NaN
+    chi = np.full(shape, np.nan, dtype=np.float32)
+    for name, angle_definitions in _CHI_ATOMS.items():
+        selected = residue_names == name
+        for column, definition in enumerate(angle_definitions):
+            # One coordinate array per atom of the dihedral, in definition order
+            angles = dihedral(
+                *(
+                    coord_per_atom_name[coord_index[atom_name], ..., selected, :]
+                    for atom_name in definition
+                )
+            )
+            # For a stack the dimensions need to be swapped due to NumPy's behavior
+            # when advanced and basic indexing are combined
+            # (https://numpy.org/devdocs/user/basics.indexing.html#combining-advanced-and-basic-indexing)
+            chi[..., selected, column] = angles.T if stacked else angles
+    return chi
 def centroid(atoms):
     """
     Measure the centroid of a structure.
@@ -653,3 +803,15 @@ def _displacement_triclinic_box(fractions, box, disp):
     disp[:] = shifted_diffs[
         np.arange(len(shifted_diffs)), np.argmin(sq_distance, axis=1)
     ]
+@functools.cache
+def _all_chi_atoms():
+    """
+    Collect the name of every atom that is part of at least one chi angle.
+    The names are deduplicated and returned as a sorted list.
+    """
+    return sorted(
+        {
+            atom_name
+            for angle_definitions in _CHI_ATOMS.values()
+            for angle_definition in angle_definitions
+            for atom_name in angle_definition
+        }
+    )

biotite/structure/info/atoms.py CHANGED Viewed

@@ -6,6 +6,7 @@ __name__ = "biotite.structure.info"
 __author__ = "Patrick Kunzmann"
 __all__ = ["residue"]
+import functools
 from biotite.structure.info.ccd import get_ccd
 # fmt: off
@@ -75,6 +76,13 @@ def residue(res_name, allow_missing_coord=False):
      ['CB' 'HB3']
      ['OXT' 'HXT']]
     """
+    # Use a cache internally, but always return a copy,
+    # as the returned AtomArray is mutable
+    return _residue(res_name, allow_missing_coord).copy()
+@functools.lru_cache(maxsize=100)
+def _residue(res_name, allow_missing_coord=False):
     # Avoid circular import
     from biotite.structure.io.pdbx import get_component

biotite/structure/info/components.bcif CHANGED Viewed

Binary file

biotite/structure/io/pdb/convert.py CHANGED Viewed

@@ -16,6 +16,7 @@ __all__ = [
     "list_assemblies",
     "get_assembly",
     "get_unit_cell",
+    "get_symmetry_mates",
 ]
 import warnings

biotite/structure/io/pdb/file.py CHANGED Viewed

@@ -6,12 +6,16 @@ __name__ = "biotite.structure.io.pdb"
 __author__ = "Patrick Kunzmann, Daniel Bauer, Claude J. Rogers"
 __all__ = ["PDBFile"]
+import itertools
 import warnings
 from collections import namedtuple
 import numpy as np
 from biotite.file import InvalidFileError, TextFile
 from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat
-from biotite.structure.bonds import BondList, connect_via_residue_names
+from biotite.structure.bonds import (
+    BondList,
+    connect_via_residue_names,
+)
 from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
 from biotite.structure.error import BadStructureError
 from biotite.structure.filter import (
@@ -19,6 +23,7 @@ from biotite.structure.filter import (
     filter_highest_occupancy_altloc,
     filter_solvent,
 )
+from biotite.structure.info.bonds import bonds_in_residue
 from biotite.structure.io.pdb.hybrid36 import (
     decode_hybrid36,
     encode_hybrid36,
@@ -544,7 +549,16 @@ class PDBFile(TextFile):
         # Read bonds
         if include_bonds:
             bond_list = self._get_bonds(atom_id)
-            bond_list = bond_list.merge(connect_via_residue_names(array))
+            # Create bond dict containing only non-hetero residues (+ water)
+            custom_bond_dict = {
+                res_name: bonds_in_residue(res_name)
+                for res_name in itertools.chain(
+                    np.unique(array[..., ~array.hetero].res_name), ["HOH"]
+                )
+            }
+            bond_list = bond_list.merge(
+                connect_via_residue_names(array, custom_bond_dict=custom_bond_dict)
+            )
             array.bonds = bond_list
         return array
@@ -936,7 +950,11 @@ class PDBFile(TextFile):
             if transform_start is None:
                 raise InvalidFileError("No 'BIOMT' records found for chosen assembly")
             rotations, translations = _parse_transformations(
-                assembly_lines[transform_start:stop]
+                [
+                    line
+                    for line in assembly_lines[transform_start:stop]
+                    if len(line.strip()) > 0
+                ]
             )
             # Filter affected chains
             sub_structure = structure[
@@ -1193,7 +1211,7 @@ class PDBFile(TextFile):
         conect_lines = [line for line in self.lines if line.startswith("CONECT")]
         # Mapping from atom ids to indices in an AtomArray
-        atom_id_to_index = np.zeros(atom_ids[-1] + 1, dtype=int)
+        atom_id_to_index = np.full(atom_ids[-1] + 1, -1, dtype=int)
         try:
             for i, id in enumerate(atom_ids):
                 atom_id_to_index[id] = i
@@ -1202,15 +1220,21 @@ class PDBFile(TextFile):
         bonds = []
         for line in conect_lines:
-            center_id = atom_id_to_index[decode_hybrid36(line[6:11])]
+            center_index = atom_id_to_index[decode_hybrid36(line[6:11])]
+            if center_index == -1:
+                # Atom ID is not in the AtomArray (probably removed altloc)
+                continue
             for i in range(11, 31, 5):
                 id_string = line[i : i + 5]
                 try:
-                    id = atom_id_to_index[decode_hybrid36(id_string)]
+                    contact_index = atom_id_to_index[decode_hybrid36(id_string)]
+                    if contact_index == -1:
+                        # Atom ID is not in the AtomArray (probably removed altloc)
+                        continue
                 except ValueError:
                     # String is empty -> no further IDs
                     break
-                bonds.append((center_id, id))
+                bonds.append((center_index, contact_index))
         # The length of the 'atom_ids' array
         # is equal to the length of the AtomArray

biotite/structure/io/pdb/hybrid36.cpython-312-darwin.so CHANGED Viewed

Binary file

biotite/structure/io/pdbx/bcif.py CHANGED Viewed

@@ -292,7 +292,7 @@ class BinaryCIFColumn(_Component):
         else:
             # Array needs to be converted, but masked values are
             # not necessarily convertible
-            # (e.g. '' cannot be converted to int)
+            # (e.g. '.' cannot be converted to int)
             if masked_value is None:
                 array = np.zeros(len(self._data), dtype=dtype)
             else:
@@ -511,7 +511,7 @@ class BinaryCIFBlock(_HierarchicalContainer):
     def __delitem__(self, key):
         try:
-            return super().__setitem__("_" + key)
+            return super().__delitem__("_" + key)
         except KeyError:
             raise KeyError(key)
@@ -581,9 +581,12 @@ class BinaryCIFFile(File, _HierarchicalContainer):
     @property
     def block(self):
-        if len(self) != 1:
+        if len(self) == 0:
+            raise ValueError("There are no blocks in the file")
+        elif len(self) > 1:
             raise ValueError("There are multiple blocks in the file")
-        return self[next(iter(self))]
+        else:
+            return self[next(iter(self))]
     @staticmethod
     def subcomponent_class():

biotite/structure/io/pdbx/cif.py CHANGED Viewed

@@ -243,7 +243,7 @@ class CIFColumn:
         else:
             # Array needs to be converted, but masked values are
             # not necessarily convertible
-            # (e.g. '' cannot be converted to int)
+            # (e.g. '.' cannot be converted to int)
             if masked_value is None:
                 array = np.zeros(len(self._data), dtype=dtype)
             else:
@@ -799,9 +799,12 @@ class CIFFile(_Component, File, MutableMapping):
     @property
     def block(self):
-        if len(self) != 1:
+        if len(self) == 0:
+            raise ValueError("There are no blocks in the file")
+        elif len(self) > 1:
             raise ValueError("There are multiple blocks in the file")
-        return self[next(iter(self))]
+        else:
+            return self[next(iter(self))]
     @staticmethod
     def subcomponent_class():

biotite/structure/io/pdbx/compress.py CHANGED Viewed

@@ -56,14 +56,14 @@ def compress(data, float_tolerance=None, rtol=1e-6, atol=1e-4):
     >>> pdbx_file.write(uncompressed_file)
     >>> _ = uncompressed_file.seek(0)
     >>> print(f"{len(uncompressed_file.read()) // 1000} KB")
-    927 KB
+    937 KB
     >>> # Write compressed file
     >>> pdbx_file = compress(pdbx_file)
     >>> compressed_file = BytesIO()
     >>> pdbx_file.write(compressed_file)
     >>> _ = compressed_file.seek(0)
     >>> print(f"{len(compressed_file.read()) // 1000} KB")
-    111 KB
+    114 KB
     """
     if float_tolerance is not None:
         warnings.warn(
@@ -140,8 +140,8 @@ def _compress_data(bcif_data, rtol, atol):
         # Run encode to initialize the data and offset arrays
         indices = encoding.encode(array)
         offsets = np.cumsum([0] + [len(s) for s in encoding.strings])
-        encoding.data_encoding, _ = _find_best_integer_compression(indices)
-        encoding.offset_encoding, _ = _find_best_integer_compression(offsets)
+        encoding.data_encoding = _find_best_integer_compression(indices)
+        encoding.offset_encoding = _find_best_integer_compression(offsets)
         return bcif.BinaryCIFData(array, [encoding])
     elif np.issubdtype(array.dtype, np.floating):
@@ -159,18 +159,22 @@ def _compress_data(bcif_data, rtol, atol):
             # -> do not use integer encoding
             return bcif.BinaryCIFData(array, [ByteArrayEncoding()])
         else:
-            best_encoding, size_compressed = _find_best_integer_compression(
-                integer_array
+            best_encoding = _find_best_integer_compression(integer_array)
+            compressed_data = bcif.BinaryCIFData(
+                array, [to_integer_encoding] + best_encoding
             )
-            if size_compressed < _data_size_in_file(bcif.BinaryCIFData(array)):
-                return bcif.BinaryCIFData(array, [to_integer_encoding] + best_encoding)
+            uncompressed_data = bcif.BinaryCIFData(array, [ByteArrayEncoding()])
+            if _data_size_in_file(compressed_data) < _data_size_in_file(
+                uncompressed_data
+            ):
+                return compressed_data
             else:
                 # The float array is smaller -> encode it directly as bytes
-                return bcif.BinaryCIFData(array, [ByteArrayEncoding()])
+                return uncompressed_data
     elif np.issubdtype(array.dtype, np.integer):
         array = _to_smallest_integer_type(array)
-        encodings, _ = _find_best_integer_compression(array)
+        encodings = _find_best_integer_compression(array)
         return bcif.BinaryCIFData(array, encodings)
     else:
@@ -233,7 +237,7 @@ def _find_best_integer_compression(array):
                 if size < smallest_size:
                     best_encoding_sequence = encodings
                     smallest_size = size
-    return best_encoding_sequence, smallest_size
+    return best_encoding_sequence
 def _estimate_packed_length(array, packed_byte_count):

biotite/structure/io/pdbx/convert.py CHANGED Viewed

@@ -55,6 +55,7 @@ from biotite.structure.io.pdbx.bcif import (
 from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
 from biotite.structure.io.pdbx.component import MaskValue
 from biotite.structure.io.pdbx.encoding import StringArrayEncoding
+from biotite.structure.repair import create_continuous_res_ids
 from biotite.structure.residues import (
     get_residue_count,
     get_residue_positions,
@@ -496,12 +497,6 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
             atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id"
         ).as_array(str),
     )
-    array.set_annotation(
-        "res_id",
-        _get_or_fallback(
-            atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
-        ).as_array(int, -1),
-    )
     array.set_annotation("ins_code", atom_site["pdbx_PDB_ins_code"].as_array(str, ""))
     array.set_annotation(
         "res_name",
@@ -518,6 +513,22 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
     )
     array.set_annotation("element", atom_site["type_symbol"].as_array(str))
+    # Special handling for `res_id`, as the `label_seq_id` is equal (`.`) for all
+    # hetero residues, which makes distinguishing subsequent residues from one
+    # another difficult (https://github.com/biotite-dev/biotite/issues/553)
+    res_id = _get_or_fallback(
+        atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
+    ).as_array(int, -1)
+    if not use_author_fields and "auth_seq_id" in atom_site:
+        # Therefore, the `auth_seq_id` is still used to determine residue starts
+        # in `create_continuous_res_ids()`, even if `use_author_fields = False`.
+        res_id_for_residue_starts = atom_site["auth_seq_id"].as_array(int, -1)
+        array.set_annotation("res_id", res_id_for_residue_starts)
+        fallback_res_ids = create_continuous_res_ids(array)
+        array.set_annotation("res_id", np.where(res_id == -1, fallback_res_ids, res_id))
+    else:
+        array.set_annotation("res_id", res_id)
     if "atom_id" in extra_fields:
         if "id" in atom_site:
             array.set_annotation("atom_id", atom_site["id"].as_array(int))
@@ -775,7 +786,10 @@ def _filter_altloc(array, atom_site, altloc):
     if altloc == "all":
         array.set_annotation("altloc_id", altloc_ids.as_array(str))
         return array, atom_site
-    elif altloc_ids is None or (altloc_ids.mask.array != MaskValue.PRESENT).all():
+    elif altloc_ids is None or (
+        altloc_ids.mask is not None
+        and (altloc_ids.mask.array != MaskValue.PRESENT).all()
+    ):
         # No altlocs in atom_site category
         return array, atom_site
     elif altloc == "occupancy" and occupancy is not None:
@@ -873,11 +887,7 @@ def set_structure(
         this parameter is ignored.
         If the file is empty, a new data block will be created.
     include_bonds : bool, optional
-        If set to true and `array` has associated ``bonds`` , the
-        intra-residue bonds will be written into the ``chem_comp_bond``
-        category.
-        Inter-residue bonds will be written into the ``struct_conn``
-        independent of this parameter.
+        DEPRECATED: Has no effect anymore.
     extra_fields : list of str, optional
         List of additional fields from the ``atom_site`` category
         that should be written into the file.
@@ -898,6 +908,13 @@ def set_structure(
     >>> set_structure(file, atom_array)
     >>> file.write(os.path.join(path_to_directory, "structure.cif"))
     """
+    if include_bonds:
+        warnings.warn(
+            "`include_bonds` parameter is deprecated, "
+            "intra-residue are always written, if available",
+            DeprecationWarning,
+        )
     _check_non_empty(array)
     block = _get_or_create_block(pdbx_file, data_block)
@@ -975,10 +992,9 @@ def set_structure(
         struct_conn = _set_inter_residue_bonds(array, atom_site)
         if struct_conn is not None:
             block["struct_conn"] = struct_conn
-        if include_bonds:
-            chem_comp_bond = _set_intra_residue_bonds(array, atom_site)
-            if chem_comp_bond is not None:
-                block["chem_comp_bond"] = chem_comp_bond
+        chem_comp_bond = _set_intra_residue_bonds(array, atom_site)
+        if chem_comp_bond is not None:
+            block["chem_comp_bond"] = chem_comp_bond
     # In case of a single model handle each coordinate
     # simply like a flattened array
@@ -1652,11 +1668,11 @@ def get_assembly(
         If set to true, a :class:`BondList` will be created for the
         resulting :class:`AtomArray` containing the bond information
         from the file.
-        Bonds, whose order could not be determined from the
-        *Chemical Component Dictionary*
-        (e.g. especially inter-residue bonds),
-        have :attr:`BondType.ANY`, since the PDB format itself does
-        not support bond orders.
+        Inter-residue bonds will be read from the ``struct_conn``
+        category.
+        Intra-residue bonds will be read from the ``chem_comp_bond``
+        category, if available, otherwise they will be derived from the
+        Chemical Component Dictionary.
     Returns
     -------
@@ -1926,11 +1942,11 @@ def get_unit_cell(
         If set to true, a :class:`BondList` will be created for the
         resulting :class:`AtomArray` containing the bond information
         from the file.
-        Bonds, whose order could not be determined from the
-        *Chemical Component Dictionary*
-        (e.g. especially inter-residue bonds),
-        have :attr:`BondType.ANY`, since the PDB format itself does
-        not support bond orders.
+        Inter-residue bonds will be read from the ``struct_conn``
+        category.
+        Intra-residue bonds will be read from the ``chem_comp_bond``
+        category, if available, otherwise they will be derived from the
+        Chemical Component Dictionary.
     Returns
     -------

biotite/structure/io/pdbx/encoding.cpython-312-darwin.so CHANGED Viewed

Binary file

biotite/structure/io/pdbx/encoding.pyx CHANGED Viewed

@@ -225,9 +225,13 @@ class Encoding(_Component, metaclass=ABCMeta):
         -------
         decoded_data : ndarray
             The decoded data.
+        Warnings
+        --------
+        When overriding this method, do not omit bound checks with
+        ``@cython.boundscheck(False)`` or ``@cython.wraparound(False)``,
+        since the file content may be invalid/malicious.
         """
-        # Important: Do not omit bound checks for decoding,
-        # since the file content may be invalid/malicious.
         raise NotImplementedError()
     def __str__(self):
@@ -883,17 +887,39 @@ class StringArrayEncoding(Encoding):
         else:
             check_present = True
-        string_order = _safe_cast(np.argsort(self.strings), np.int32)
-        sorted_strings = self.strings[string_order]
-        sorted_indices = np.searchsorted(sorted_strings, data)
-        indices = string_order[sorted_indices]
-        if check_present and not np.all(self.strings[indices] == data):
+        if len(self.strings) > 0:
+            string_order = _safe_cast(np.argsort(self.strings), np.int32)
+            sorted_strings = self.strings[string_order]
+            sorted_indices = np.searchsorted(sorted_strings, data)
+            indices = string_order[sorted_indices]
+            # `"" not in self.strings` can be quite costly and is only necessary,
+            # if the `strings` were given by the user, as otherwise we always
+            # include an empty string explicitly when we compute them in this function
+            # -> Only run if `check_present` is True
+            if check_present and "" not in self.strings:
+                # Represent empty strings as -1
+                indices[data == ""] = -1
+        else:
+            # There are no strings -> The indices can only ever be -1 to indicate
+            # missing values
+            # The check if this is correct is done below
+            indices = np.full(data.shape[0], -1, dtype=np.int32)
+        valid_indices_mask = indices != -1
+        if check_present and not np.all(
+            self.strings[indices[valid_indices_mask]] == data[valid_indices_mask]
+        ):
             raise ValueError("Data contains strings not present in 'strings'")
         return encode_stepwise(indices, self.data_encoding)
     def decode(self, data):
         indices = decode_stepwise(data, self.data_encoding)
-        return self.strings[indices]
+        # Initialize with empty strings
+        strings = np.zeros(indices.shape[0], dtype=self.strings.dtype)
+        # ``-1`` indices indicate missing values
+        valid_indices_mask = indices != -1
+        strings[valid_indices_mask] = self.strings[indices[valid_indices_mask]]
+        return strings
     def __eq__(self, other):
         if not isinstance(other, type(self)):
@@ -1009,6 +1035,11 @@ def decode_stepwise(data, encoding):
     """
     for enc in reversed(encoding):
         data = enc.decode(data)
+    # ByteArrayEncoding may decode into a non-writable array,
+    # as it creates the ndarray cheaply from buffer
+    if not data.flags.writeable:
+        # Make the resulting ndarray writable, by copying the underlying buffer
+        data = data.copy()
     return data