PyPI - biotite - Versions diffs - 1.0.1__cp311-cp311-macosx_11_0_arm64.whl → 1.1.0__cp311-cp311-macosx_11_0_arm64.whl - Mend

biotite 1.0.1__cp311-cp311-macosx_11_0_arm64.whl → 1.1.0__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biotite might be problematic. Click here for more details.

Files changed (90) hide show

biotite/application/dssp/app.py +13 -3
biotite/application/localapp.py +34 -0
biotite/application/muscle/app3.py +2 -15
biotite/application/muscle/app5.py +2 -2
biotite/application/util.py +1 -1
biotite/application/viennarna/rnaplot.py +6 -2
biotite/database/rcsb/query.py +6 -6
biotite/database/uniprot/check.py +20 -15
biotite/database/uniprot/download.py +1 -1
biotite/database/uniprot/query.py +1 -1
biotite/sequence/align/alignment.py +16 -3
biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
biotite/sequence/align/banded.pyx +5 -5
biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
biotite/sequence/align/kmeralphabet.pyx +17 -0
biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
biotite/sequence/align/kmertable.pyx +52 -42
biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
biotite/sequence/align/matrix.py +273 -55
biotite/sequence/align/matrix_data/3Di.mat +24 -0
biotite/sequence/align/matrix_data/PB.license +21 -0
biotite/sequence/align/matrix_data/PB.mat +18 -0
biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
biotite/sequence/alphabet.py +3 -0
biotite/sequence/codec.cpython-311-darwin.so +0 -0
biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
biotite/sequence/graphics/colorschemes.py +44 -11
biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
biotite/sequence/profile.py +86 -4
biotite/sequence/seqtypes.py +124 -3
biotite/setup_ccd.py +197 -0
biotite/structure/__init__.py +4 -3
biotite/structure/alphabet/__init__.py +25 -0
biotite/structure/alphabet/encoder.py +332 -0
biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
biotite/structure/alphabet/i3d.py +110 -0
biotite/structure/alphabet/layers.py +86 -0
biotite/structure/alphabet/pb.license +21 -0
biotite/structure/alphabet/pb.py +171 -0
biotite/structure/alphabet/unkerasify.py +122 -0
biotite/structure/atoms.py +129 -40
biotite/structure/bonds.cpython-311-darwin.so +0 -0
biotite/structure/bonds.pyx +72 -21
biotite/structure/celllist.cpython-311-darwin.so +0 -0
biotite/structure/charges.cpython-311-darwin.so +0 -0
biotite/structure/geometry.py +60 -113
biotite/structure/info/__init__.py +1 -0
biotite/structure/info/atoms.py +13 -13
biotite/structure/info/bonds.py +12 -6
biotite/structure/info/ccd.py +125 -32
biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
biotite/structure/info/groups.py +63 -17
biotite/structure/info/masses.py +9 -6
biotite/structure/info/misc.py +15 -21
biotite/structure/info/standardize.py +3 -2
biotite/structure/io/mol/sdf.py +41 -40
biotite/structure/io/pdb/convert.py +2 -0
biotite/structure/io/pdb/file.py +74 -3
biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
biotite/structure/io/pdbqt/file.py +32 -32
biotite/structure/io/pdbx/__init__.py +1 -0
biotite/structure/io/pdbx/bcif.py +32 -8
biotite/structure/io/pdbx/cif.py +72 -59
biotite/structure/io/pdbx/component.py +9 -4
biotite/structure/io/pdbx/compress.py +321 -0
biotite/structure/io/pdbx/convert.py +194 -48
biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
biotite/structure/io/pdbx/encoding.pyx +98 -17
biotite/structure/molecules.py +141 -141
biotite/structure/sasa.cpython-311-darwin.so +0 -0
biotite/structure/segments.py +1 -2
biotite/structure/util.py +73 -1
biotite/version.py +2 -2
{biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/METADATA +3 -1
{biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/RECORD +86 -76
biotite/structure/info/ccd/README.rst +0 -8
biotite/structure/info/ccd/amino_acids.txt +0 -1663
biotite/structure/info/ccd/carbohydrates.txt +0 -1135
biotite/structure/info/ccd/nucleotides.txt +0 -798
{biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/WHEEL +0 -0
{biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/licenses/LICENSE.rst +0 -0

biotite/structure/io/pdbx/convert.py CHANGED Viewed

@@ -24,6 +24,10 @@ from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat
 from biotite.structure.bonds import BondList, BondType, connect_via_residue_names
 from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
 from biotite.structure.error import BadStructureError
+from biotite.structure.filter import _canonical_aa_list as canonical_aa_list
+from biotite.structure.filter import (
+    _canonical_nucleotide_list as canonical_nucleotide_list,
+)
 from biotite.structure.filter import (
     filter_first_altloc,
     filter_highest_occupancy_altloc,
@@ -36,32 +40,38 @@ from biotite.structure.io.pdbx.bcif import (
 from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
 from biotite.structure.io.pdbx.component import MaskValue
 from biotite.structure.io.pdbx.encoding import StringArrayEncoding
-from biotite.structure.residues import get_residue_count, get_residue_starts_for
+from biotite.structure.residues import (
+    get_residue_count,
+    get_residue_positions,
+    get_residue_starts_for,
+)
 from biotite.structure.util import matrix_rotate
-# Cond types in `struct_conn` category that refer to covalent bonds
-PDBX_COVALENT_TYPES = [
-    "covale",
-    "covale_base",
-    "covale_phosphate",
-    "covale_sugar",
-    "disulf",
-    "modres",
-    "modres_link",
-    "metalc",
-]
-# Map 'struct_conn' bond orders to 'BondType'...
-PDBX_BOND_ORDER_TO_TYPE = {
-    "": BondType.ANY,
-    "sing": BondType.SINGLE,
-    "doub": BondType.DOUBLE,
-    "trip": BondType.TRIPLE,
-    "quad": BondType.QUADRUPLE,
+# Bond types in `struct_conn` category that refer to covalent bonds
+PDBX_BOND_TYPE_ID_TO_TYPE = {
+    # Although a covalent bond could in theory have a higher bond order,
+    # in practice inter-residue bonds are always single
+    "covale": BondType.SINGLE,
+    "covale_base": BondType.SINGLE,
+    "covale_phosphate": BondType.SINGLE,
+    "covale_sugar": BondType.SINGLE,
+    "disulf": BondType.SINGLE,
+    "modres": BondType.SINGLE,
+    "modres_link": BondType.SINGLE,
+    "metalc": BondType.COORDINATION,
+}
+PDBX_BOND_TYPE_TO_TYPE_ID = {
+    BondType.ANY: "covale",
+    BondType.SINGLE: "covale",
+    BondType.DOUBLE: "covale",
+    BondType.TRIPLE: "covale",
+    BondType.QUADRUPLE: "covale",
+    BondType.AROMATIC_SINGLE: "covale",
+    BondType.AROMATIC_DOUBLE: "covale",
+    BondType.AROMATIC_TRIPLE: "covale",
+    BondType.COORDINATION: "metalc",
 }
-# ...and vice versa
 PDBX_BOND_TYPE_TO_ORDER = {
-    # 'ANY' is masked later, it is merely added here to avoid a KeyError
-    BondType.ANY: "",
     BondType.SINGLE: "sing",
     BondType.DOUBLE: "doub",
     BondType.TRIPLE: "trip",
@@ -69,6 +79,9 @@ PDBX_BOND_TYPE_TO_ORDER = {
     BondType.AROMATIC_SINGLE: "sing",
     BondType.AROMATIC_DOUBLE: "doub",
     BondType.AROMATIC_TRIPLE: "trip",
+    # These are masked later; they are merely added here to avoid a KeyError
+    BondType.ANY: "",
+    BondType.COORDINATION: "",
 }
 # Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'...
 COMP_BOND_ORDER_TO_TYPE = {
@@ -84,6 +97,7 @@ COMP_BOND_ORDER_TO_TYPE = {
 COMP_BOND_TYPE_TO_ORDER = {
     bond_type: order for order, bond_type in COMP_BOND_ORDER_TO_TYPE.items()
 }
+CANONICAL_RESIDUE_LIST = canonical_aa_list + canonical_nucleotide_list
 _proteinseq_type_list = ["polypeptide(D)", "polypeptide(L)"]
 _nucleotideseq_type_list = [
@@ -475,16 +489,53 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
     array.set_annotation("element", atom_site["type_symbol"].as_array(str))
     if "atom_id" in extra_fields:
-        array.set_annotation("atom_id", atom_site["id"].as_array(int))
+        if "id" in atom_site:
+            array.set_annotation("atom_id", atom_site["id"].as_array(int))
+        else:
+            warnings.warn(
+                "Missing 'id' in 'atom_site' category. 'atom_id' generated automatically.",
+                UserWarning,
+            )
+            array.set_annotation("atom_id", np.arange(array.array_length()))
         extra_fields.remove("atom_id")
     if "b_factor" in extra_fields:
-        array.set_annotation("b_factor", atom_site["B_iso_or_equiv"].as_array(float))
+        if "B_iso_or_equiv" in atom_site:
+            array.set_annotation(
+                "b_factor", atom_site["B_iso_or_equiv"].as_array(float)
+            )
+        else:
+            warnings.warn(
+                "Missing 'B_iso_or_equiv' in 'atom_site' category. 'b_factor' will be set to `nan`.",
+                UserWarning,
+            )
+            array.set_annotation("b_factor", np.full(array.array_length(), np.nan))
         extra_fields.remove("b_factor")
     if "occupancy" in extra_fields:
-        array.set_annotation("occupancy", atom_site["occupancy"].as_array(float))
+        if "occupancy" in atom_site:
+            array.set_annotation("occupancy", atom_site["occupancy"].as_array(float))
+        else:
+            warnings.warn(
+                "Missing 'occupancy' in 'atom_site' category. 'occupancy' will be assumed to be 1.0",
+                UserWarning,
+            )
+            array.set_annotation(
+                "occupancy", np.ones(array.array_length(), dtype=float)
+            )
         extra_fields.remove("occupancy")
     if "charge" in extra_fields:
-        array.set_annotation("charge", atom_site["pdbx_formal_charge"].as_array(int, 0))
+        if "pdbx_formal_charge" in atom_site:
+            array.set_annotation(
+                "charge",
+                atom_site["pdbx_formal_charge"].as_array(
+                    int, 0
+                ),  # masked values are set to 0
+            )
+        else:
+            warnings.warn(
+                "Missing 'pdbx_formal_charge' in 'atom_site' category. 'charge' will be set to 0",
+                UserWarning,
+            )
+            array.set_annotation("charge", np.zeros(array.array_length(), dtype=int))
         extra_fields.remove("charge")
     # Handle all remaining custom fields
@@ -536,7 +587,8 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
     ]
     covale_mask = np.isin(
-        struct_conn["conn_type_id"].as_array(str), PDBX_COVALENT_TYPES
+        struct_conn["conn_type_id"].as_array(str),
+        list(PDBX_BOND_TYPE_ID_TO_TYPE.keys()),
     )
     if "ptnr1_symmetry" in struct_conn:
         covale_mask &= struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
@@ -576,13 +628,14 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
     atoms_indices_1 = atoms_indices_1[mapping_exists_mask]
     atoms_indices_2 = atoms_indices_2[mapping_exists_mask]
-    # Interpret missing values as ANY bonds
-    bond_order = struct_conn["pdbx_value_order"].as_array(str, "")
+    bond_type_id = struct_conn["conn_type_id"].as_array()
     # Consecutively apply the same masks as applied to the atom indices
     # Logical combination does not work here,
     # as the second mask was created based on already filtered data
-    bond_order = bond_order[covale_mask][mapping_exists_mask]
-    bond_types = [PDBX_BOND_ORDER_TO_TYPE[order] for order in bond_order]
+    bond_type_id = bond_type_id[covale_mask][mapping_exists_mask]
+    # The type ID is always present in the dictionary,
+    # as it was used to filter the applicable bonds
+    bond_types = [PDBX_BOND_TYPE_ID_TO_TYPE[type_id] for type_id in bond_type_id]
     return BondList(
         atom_site.row_count,
@@ -593,7 +646,7 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
 def _find_matches(query_arrays, reference_arrays):
     """
     For each index in the `query_arrays` find the indices in the
-    `reference_arrays` where all query values the reference counterpart.
+    `reference_arrays` where all query values match the reference counterpart.
     If no match is found for a query, the corresponding index is -1.
     """
     match_masks_for_all_columns = np.stack(
@@ -703,7 +756,13 @@ def _get_box(block):
     return vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
-def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
+def set_structure(
+    pdbx_file,
+    array,
+    data_block=None,
+    include_bonds=False,
+    extra_fields=[],
+):
     """
     Set the ``atom_site`` category with atom information from an
     :class:`AtomArray` or :class:`AtomArrayStack`.
@@ -737,6 +796,10 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
         category.
         Inter-residue bonds will be written into the ``struct_conn``
         independent of this parameter.
+    extra_fields : list of str, optional
+        List of additional fields from the ``atom_site`` category
+        that should be written into the file.
+        Default is an empty list.
     Notes
     -----
@@ -797,6 +860,32 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
             np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT),
         )
+    # Handle all remaining custom fields
+    if len(extra_fields) > 0:
+        # ... check to avoid clashes with standard annotations
+        _standard_annotations = [
+            "hetero",
+            "element",
+            "atom_name",
+            "res_name",
+            "chain_id",
+            "res_id",
+            "ins_code",
+            "atom_id",
+            "b_factor",
+            "occupancy",
+            "charge",
+        ]
+        _reserved_annotation_names = list(atom_site.keys()) + _standard_annotations
+        for annot in extra_fields:
+            if annot in _reserved_annotation_names:
+                raise ValueError(
+                    f"Annotation name '{annot}' is reserved and cannot be written to as extra field. "
+                    "Please choose another name."
+                )
+            atom_site[annot] = np.copy(array.get_annotation(annot))
     if array.bonds is not None:
         struct_conn = _set_inter_residue_bonds(array, atom_site)
         if struct_conn is not None:
@@ -1021,13 +1110,21 @@ def _set_inter_residue_bonds(array, atom_site):
     if len(bond_array) == 0:
         return None
+    # Filter out 'standard' links, i.e. backbone bonds between adjacent canonical
+    # nucleotide/amino acid residues
+    bond_array = bond_array[~_filter_canonical_links(array, bond_array)]
+    if len(bond_array) == 0:
+        return None
     struct_conn = Category()
     struct_conn["id"] = np.arange(1, len(bond_array) + 1)
-    struct_conn["conn_type_id"] = np.full(len(bond_array), "covale")
+    struct_conn["conn_type_id"] = [
+        PDBX_BOND_TYPE_TO_TYPE_ID[btype] for btype in bond_array[:, 2]
+    ]
     struct_conn["pdbx_value_order"] = Column(
         np.array([PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_array[:, 2]]),
         np.where(
-            bond_array[:, 2] == BondType.ANY,
+            np.isin(bond_array[:, 2], (BondType.ANY, BondType.COORDINATION)),
             MaskValue.MISSING,
             MaskValue.PRESENT,
         ),
@@ -1063,6 +1160,27 @@ def _filter_bonds(array, connection):
         raise ValueError("Invalid 'connection' option")
+def _filter_canonical_links(array, bond_array):
+    """
+    Filter out backbone bonds between adjacent canonical amino acid or nucleotide residues.
+    """
+    # Get the residue index for each bonded atom
+    residue_indices = get_residue_positions(array, bond_array[:, :2].flatten()).reshape(
+        -1, 2
+    )
+    return (
+        # Must be canonical residues
+        np.isin(array.res_name[bond_array[:, 0]], CANONICAL_RESIDUE_LIST) &
+        np.isin(array.res_name[bond_array[:, 1]], CANONICAL_RESIDUE_LIST) &
+        # Must be backbone bond
+        np.isin(array.atom_name[bond_array[:, 0]], ("C", "O3'")) &
+        np.isin(array.atom_name[bond_array[:, 1]], ("N", "P")) &
+        # Must connect adjacent residues
+        residue_indices[:, 1] - residue_indices[:, 0] == 1
+    )  # fmt: skip
 def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=None):
     """
     Create an :class:`AtomArray` for a chemical component from the
@@ -1161,17 +1279,28 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
         # Swap with the fallback option
         coord_fields, alt_coord_fields = alt_coord_fields, coord_fields
     try:
-        for i, field in enumerate(coord_fields):
-            array.coord[:, i] = atom_category[field].as_array(np.float32)
-    except KeyError as err:
-        key = err.args[0]
-        warnings.warn(
-            f"Attribute '{key}' not found within 'chem_comp_atom' category. "
-            f"The fallback coordinates will be used instead",
-            UserWarning,
+        array.coord = _parse_component_coordinates(
+            [atom_category[field] for field in coord_fields]
+        )
+    except Exception as err:
+        if isinstance(err, KeyError):
+            key = err.args[0]
+            warnings.warn(
+                f"Attribute '{key}' not found within 'chem_comp_atom' category. "
+                f"The fallback coordinates will be used instead",
+                UserWarning,
+            )
+        elif isinstance(err, ValueError):
+            warnings.warn(
+                "The coordinates are missing for some atoms. "
+                "The fallback coordinates will be used instead",
+                UserWarning,
+            )
+        else:
+            raise
+        array.coord = _parse_component_coordinates(
+            [atom_category[field] for field in alt_coord_fields]
         )
-        for i, field in enumerate(alt_coord_fields):
-            array.coord[:, i] = atom_category[field].as_array(np.float32)
     try:
         bond_category = block["chem_comp_bond"]
@@ -1201,6 +1330,17 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
     return array
+def _parse_component_coordinates(coord_columns):
+    coord = np.zeros((len(coord_columns[0]), 3), dtype=np.float32)
+    for i, column in enumerate(coord_columns):
+        if column.mask is not None and column.mask.array.any():
+            raise ValueError(
+                "Missing coordinates for some atoms",
+            )
+        coord[:, i] = column.as_array(np.float32)
+    return coord
 def set_component(pdbx_file, array, data_block=None):
     """
     Set the ``chem_comp_atom`` and, if bonds are available,
@@ -1417,7 +1557,10 @@ def get_assembly(
     Returns
     -------
     assembly : AtomArray or AtomArrayStack
-        The assembly. The return type depends on the `model` parameter.
+        The assembly.
+        The return type depends on the `model` parameter.
+        Contains the `sym_id` annotation, which enumerates the copies of the asymmetric
+        unit in the assembly.
     Examples
     --------
@@ -1506,7 +1649,6 @@ def _apply_transformations(structure, transformation_dict, operations):
     """
     # Additional first dimension for 'structure.repeat()'
     assembly_coord = np.zeros((len(operations),) + structure.coord.shape)
     # Apply corresponding transformation for each copy in the assembly
     for i, operation in enumerate(operations):
         coord = structure.coord
@@ -1520,7 +1662,11 @@ def _apply_transformations(structure, transformation_dict, operations):
             coord += translation_vector
         assembly_coord[i] = coord
-    return repeat(structure, assembly_coord)
+    assembly = repeat(structure, assembly_coord)
+    assembly.set_annotation(
+        "sym_id", np.repeat(np.arange(len(operations)), structure.array_length())
+    )
+    return assembly
 def _get_transformations(struct_oper):

biotite/structure/io/pdbx/encoding.cpython-311-darwin.so CHANGED Viewed

Binary file

biotite/structure/io/pdbx/encoding.pyx CHANGED Viewed

@@ -287,7 +287,8 @@ class FixedPointEncoding(Encoding):
         The data type of the array to be encoded.
         Either a NumPy dtype or a *BinaryCIF* type code is accepted.
         The dtype must be a float type.
-        If omitted, 32-bit floats are assumed.
+        If omitted, the data type is taken from the data the
+        first time :meth:`encode()` is called.
     Attributes
     ----------
@@ -304,7 +305,7 @@ class FixedPointEncoding(Encoding):
     [987 654]
     """
     factor: ...
-    src_type: ... = TypeCode.FLOAT32
+    src_type: ... = None
     def __post_init__(self):
         if self.src_type is not None:
@@ -315,6 +316,14 @@ class FixedPointEncoding(Encoding):
                 )
     def encode(self, data):
+        # If not given in constructor, it is determined from the data
+        if self.src_type is None:
+            self.src_type = TypeCode.from_dtype(data.dtype)
+            if self.src_type not in (TypeCode.FLOAT32, TypeCode.FLOAT64):
+                raise ValueError(
+                    "Only floating point types are supported"
+                )
         # Round to avoid wrong values due to floating point inaccuracies
         return np.round(data * self.factor).astype(np.int32)
@@ -340,7 +349,8 @@ class IntervalQuantizationEncoding(Encoding):
         The data type of the array to be encoded.
         Either a NumPy dtype or a *BinaryCIF* type code is accepted.
         The dtype must be a float type.
-        If omitted, 32-bit floats are assumed.
+        If omitted, the data type is taken from the data the
+        first time :meth:`encode()` is called.
     Attributes
     ----------
@@ -367,13 +377,17 @@ class IntervalQuantizationEncoding(Encoding):
     min: ...
     max: ...
     num_steps: ...
-    src_type: ... = TypeCode.FLOAT32
+    src_type: ... = None
     def __post_init__(self):
         if self.src_type is not None:
             self.src_type = TypeCode.from_dtype(self.src_type)
     def encode(self, data):
+        # If not given in constructor, it is determined from the data
+        if self.src_type is None:
+            self.src_type = TypeCode.from_dtype(data.dtype)
         steps = np.linspace(
             self.min, self.max, self.num_steps, dtype=data.dtype
         )
@@ -524,7 +538,8 @@ class DeltaEncoding(Encoding):
         first time :meth:`encode()` is called.
     origin : int, optional
         The starting value from which the differences are calculated.
-        If omitted, the origin is set to 0.
+        If omitted, the value is taken from the first array element the
+        first time :meth:`encode()` is called.
     Attributes
     ----------
@@ -535,11 +550,14 @@ class DeltaEncoding(Encoding):
     --------
     >>> data = np.array([1, 1, 2, 3, 5, 8])
-    >>> print(DeltaEncoding().encode(data))
-    [1 0 1 1 2 3]
+    >>> encoding = DeltaEncoding()
+    >>> print(encoding.encode(data))
+    [0 0 1 1 2 3]
+    >>> print(encoding.origin)
+    1
     """
     src_type: ... = None
-    origin: ... = 0
+    origin: ... = None
     def __post_init__(self):
         if self.src_type is not None:
@@ -549,6 +567,8 @@ class DeltaEncoding(Encoding):
         # If not given in constructor, it is determined from the data
         if self.src_type is None:
             self.src_type = TypeCode.from_dtype(data.dtype)
+        if self.origin is None:
+            self.origin = data[0]
         data = data - self.origin
         return np.diff(data, prepend=0).astype(np.int32, copy=False)
@@ -582,7 +602,8 @@ class IntegerPackingEncoding(Encoding):
     is_unsigned : bool, optional
         Whether the values should be packed into signed or unsigned
         integers.
-        If omitted, the values are packed into signed integers.
+        If omitted, unsigned integers are used if the data contains no negative
+        values; this is determined the first time :meth:`encode()` is called.
     Attributes
     ----------
@@ -601,7 +622,7 @@ class IntegerPackingEncoding(Encoding):
     """
     byte_count: ...
     src_size: ... = None
-    is_unsigned: ... = False
+    is_unsigned: ... = None
     def encode(self, data):
         if self.src_size is None:
@@ -610,6 +631,9 @@ class IntegerPackingEncoding(Encoding):
             raise IndexError(
                 "Given source size does not match actual data size"
             )
+        if self.is_unsigned is None:
+            # Only positive values -> use unsigned integers
+            self.is_unsigned = data.min().item() >= 0
         data = data.astype(np.int32, copy=False)
         return self._encode(
@@ -672,7 +696,7 @@ class IntegerPackingEncoding(Encoding):
         # Get length of output array
         # by summing up required length of each element
         cdef int number
-        cdef int length = 0
+        cdef long length = 0
         for i in range(data.shape[0]):
             number = data[i]
             if number < 0:
@@ -750,7 +774,7 @@ class StringArrayEncoding(Encoding):
         If omitted, the unique strings are determined from the data the
         first time :meth:`encode()` is called.
     data_encoding : list of Encoding, optional
-        The encodings that are applied to the indiy array.
+        The encodings that are applied to the index array.
         If omitted, the array is directly encoded into bytes without
         further compression.
     offset_encoding : list of Encoding, optional
@@ -837,8 +861,11 @@ class StringArrayEncoding(Encoding):
             raise TypeError("Data must be of string type")
         if self.strings is None:
-            # 'unique()' already sorts the strings
-            self.strings = np.unique(data)
+            # 'unique()' already sorts the strings, but this is not necessarily
+            # desired, as this makes efficient encoding of the indices more difficult
+            # -> Bring into the original order
+            _, unique_indices = np.unique(data, return_index=True)
+            self.strings = data[np.sort(unique_indices)]
             check_present = False
         else:
             check_present = True
@@ -888,6 +915,19 @@ _encoding_classes_kinds = {
 def deserialize_encoding(content):
+    """
+    Create a :class:`Encoding` by deserializing the given *BinaryCIF* content.
+    Parameters
+    ----------
+    content : dict
+        The encoding represented as *BinaryCIF* dictionary.
+    Returns
+    -------
+    encoding : Encoding
+        The deserialized encoding.
+    """
     try:
         encoding_class = _encoding_classes[content["kind"]]
     except KeyError:
@@ -898,28 +938,69 @@ def deserialize_encoding(content):
 def create_uncompressed_encoding(array):
-    dtype = array.dtype
+    """
+    Create a simple encoding for the given array that does not compress the data.
-    if np.issubdtype(dtype, np.str_):
+    Parameters
+    ----------
+    array : ndarray
+        The array to create the encoding for.
+    Returns
+    -------
+    encoding : list of Encoding
+        The encoding for the data.
+    """
+    if np.issubdtype(array.dtype, np.str_):
         return [StringArrayEncoding()]
     else:
         return [ByteArrayEncoding()]
 def encode_stepwise(data, encoding):
+    """
+    Apply a list of encodings stepwise to the given data.
+    Parameters
+    ----------
+    data : ndarray
+        The data to be encoded.
+    encoding : list of Encoding
+        The encodings to be applied.
+    Returns
+    -------
+    encoded_data : ndarray or bytes
+        The encoded data.
+    """
     for encoding in encoding:
         data = encoding.encode(data)
     return data
 def decode_stepwise(data, encoding):
+    """
+    Apply a list of encodings stepwise in reverse order to decode the given data.
+    Parameters
+    ----------
+    data : ndarray or bytes
+        The data to be decoded.
+    encoding : list of Encoding
+        The encodings to be applied.
+    Returns
+    -------
+    decoded_data : ndarray
+        The decoded data.
+    """
     for enc in reversed(encoding):
         data = enc.decode(data)
     return data
 def _camel_to_snake_case(attribute_name):
-    return re.sub(CAMEL_CASE_PATTERN, "_", attribute_name).lower()
+    return CAMEL_CASE_PATTERN.sub("_", attribute_name).lower()
 def _snake_to_camel_case(attribute_name):