biotite 0.39.0__cp311-cp311-win_amd64.whl → 0.40.0__cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biotite/__init__.py +3 -3
- biotite/application/dssp/app.py +18 -18
- biotite/database/rcsb/download.py +19 -14
- biotite/sequence/align/banded.c +258 -237
- biotite/sequence/align/banded.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.c +243 -222
- biotite/sequence/align/kmeralphabet.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmersimilarity.c +215 -196
- biotite/sequence/align/kmersimilarity.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.cpp +233 -205
- biotite/sequence/align/localgapped.c +258 -237
- biotite/sequence/align/localgapped.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.c +235 -214
- biotite/sequence/align/localungapped.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.c +255 -234
- biotite/sequence/align/multiple.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/pairwise.c +274 -253
- biotite/sequence/align/pairwise.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.c +215 -196
- biotite/sequence/align/permutation.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.c +217 -197
- biotite/sequence/align/selector.cp311-win_amd64.pyd +0 -0
- biotite/sequence/align/tracetable.c +215 -195
- biotite/sequence/align/tracetable.cp311-win_amd64.pyd +0 -0
- biotite/sequence/codec.c +235 -214
- biotite/sequence/codec.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/nj.c +215 -196
- biotite/sequence/phylo/nj.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.c +227 -202
- biotite/sequence/phylo/tree.cp311-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.c +215 -196
- biotite/sequence/phylo/upgma.cp311-win_amd64.pyd +0 -0
- biotite/structure/basepairs.py +7 -12
- biotite/structure/bonds.c +1175 -1226
- biotite/structure/bonds.cp311-win_amd64.pyd +0 -0
- biotite/structure/celllist.c +217 -197
- biotite/structure/celllist.cp311-win_amd64.pyd +0 -0
- biotite/structure/charges.c +1052 -1101
- biotite/structure/charges.cp311-win_amd64.pyd +0 -0
- biotite/structure/filter.py +30 -37
- biotite/structure/info/__init__.py +5 -8
- biotite/structure/info/atoms.py +25 -67
- biotite/structure/info/bonds.py +46 -100
- biotite/structure/info/ccd/README.rst +8 -0
- biotite/structure/info/ccd/amino_acids.txt +1646 -0
- biotite/structure/info/ccd/carbohydrates.txt +1133 -0
- biotite/structure/info/ccd/components.bcif +0 -0
- biotite/structure/info/ccd/nucleotides.txt +797 -0
- biotite/structure/info/ccd.py +95 -0
- biotite/structure/info/groups.py +90 -0
- biotite/structure/info/masses.py +21 -20
- biotite/structure/info/misc.py +11 -22
- biotite/structure/info/standardize.py +17 -12
- biotite/structure/io/__init__.py +2 -4
- biotite/structure/io/ctab.py +1 -1
- biotite/structure/io/general.py +37 -43
- biotite/structure/io/mmtf/__init__.py +3 -0
- biotite/structure/io/mmtf/convertarray.c +219 -198
- biotite/structure/io/mmtf/convertarray.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/convertfile.c +217 -197
- biotite/structure/io/mmtf/convertfile.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/decode.c +225 -204
- biotite/structure/io/mmtf/decode.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/encode.c +215 -196
- biotite/structure/io/mmtf/encode.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/mmtf/file.py +34 -26
- biotite/structure/io/npz/__init__.py +3 -0
- biotite/structure/io/npz/file.py +21 -18
- biotite/structure/io/pdb/__init__.py +3 -3
- biotite/structure/io/pdb/file.py +5 -3
- biotite/structure/io/pdb/hybrid36.c +63 -43
- biotite/structure/io/pdb/hybrid36.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/pdbqt/file.py +32 -32
- biotite/structure/io/pdbx/__init__.py +13 -6
- biotite/structure/io/pdbx/bcif.py +649 -0
- biotite/structure/io/pdbx/cif.py +1028 -0
- biotite/structure/io/pdbx/component.py +243 -0
- biotite/structure/io/pdbx/convert.py +707 -359
- biotite/structure/io/pdbx/encoding.c +112813 -0
- biotite/structure/io/pdbx/encoding.cp311-win_amd64.pyd +0 -0
- biotite/structure/io/pdbx/error.py +14 -0
- biotite/structure/io/pdbx/legacy.py +267 -0
- biotite/structure/molecules.py +151 -151
- biotite/structure/sasa.c +215 -196
- biotite/structure/sasa.cp311-win_amd64.pyd +0 -0
- biotite/structure/superimpose.py +158 -115
- {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/METADATA +2 -2
- {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/RECORD +92 -90
- {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/WHEEL +1 -1
- biotite/structure/info/amino_acids.json +0 -1556
- biotite/structure/info/amino_acids.py +0 -42
- biotite/structure/info/carbohydrates.json +0 -1122
- biotite/structure/info/carbohydrates.py +0 -39
- biotite/structure/info/intra_bonds.msgpack +0 -0
- biotite/structure/info/link_types.msgpack +0 -1
- biotite/structure/info/nucleotides.json +0 -772
- biotite/structure/info/nucleotides.py +0 -39
- biotite/structure/info/residue_masses.msgpack +0 -0
- biotite/structure/info/residue_names.msgpack +0 -3
- biotite/structure/info/residues.msgpack +0 -0
- biotite/structure/io/pdbx/file.py +0 -652
- {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/LICENSE.rst +0 -0
- {biotite-0.39.0.dist-info → biotite-0.40.0.dist-info}/top_level.txt +0 -0
biotite/structure/io/pdbx/convert.py

@@ -17,21 +17,50 @@ __all__ = [
 
 import itertools
 import warnings
-from collections import OrderedDict
 import numpy as np
 from ....file import InvalidFileError
 from ....sequence.seqtypes import NucleotideSequence, ProteinSequence
 from ...atoms import AtomArray, AtomArrayStack, repeat
-from ...bonds import BondList, BondType
+from ...bonds import BondList, BondType, connect_via_residue_names
 from ...box import unitcell_from_vectors, vectors_from_unitcell
 from ...filter import filter_first_altloc, filter_highest_occupancy_altloc
-from ...residues import get_residue_count
+from ...residues import get_residue_count, get_residue_starts_for
 from ...error import BadStructureError
 from ...util import matrix_rotate
+from .legacy import PDBxFile
+from .component import MaskValue
+from .cif import CIFFile, CIFBlock
+from .bcif import BinaryCIFFile, BinaryCIFBlock, BinaryCIFColumn
+from .encoding import StringArrayEncoding
 
 
-#
-
+# Cond types in `struct_conn` category that refer to covalent bonds
+PDBX_COVALENT_TYPES = [
+    "covale", "covale_base", "covale_phosphate", "covale_sugar",
+    "disulf", "modres", "modres_link", "metalc"
+]
+# Map 'struct_conn' bond orders to 'BondType'...
+PDBX_BOND_ORDER_TO_TYPE = {
+    "": BondType.ANY,
+    "sing": BondType.SINGLE,
+    "doub": BondType.DOUBLE,
+    "trip": BondType.TRIPLE,
+    "quad": BondType.QUADRUPLE,
+}
+# ...and vice versa
+PDBX_BOND_TYPE_TO_ORDER = {
+    # 'ANY' is masked later, it is merely added here to avoid a KeyError
+    BondType.ANY: "",
+    BondType.SINGLE: "sing",
+    BondType.DOUBLE: "doub",
+    BondType.TRIPLE: "trip",
+    BondType.QUADRUPLE: "quad",
+    BondType.AROMATIC_SINGLE: "sing",
+    BondType.AROMATIC_DOUBLE: "doub",
+    BondType.AROMATIC_TRIPLE: "trip",
+}
+# Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'...
+COMP_BOND_ORDER_TO_TYPE = {
     ("SING", "N") : BondType.SINGLE,
     ("DOUB", "N") : BondType.DOUBLE,
     ("TRIP", "N") : BondType.TRIPLE,
@@ -41,11 +70,10 @@ BOND_ORDER_TO_BOND_TYPE = {
     ("TRIP", "Y") : BondType.AROMATIC_TRIPLE,
 }
 # ...and vice versa
-
-    bond_type: order for order, bond_type in
+COMP_BOND_TYPE_TO_ORDER = {
+    bond_type: order for order, bond_type in COMP_BOND_ORDER_TO_TYPE.items()
 }
 
-
 _proteinseq_type_list = ["polypeptide(D)", "polypeptide(L)"]
 _nucleotideseq_type_list = [
     "polydeoxyribonucleotide",
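The two `PDBX_*` lookup tables introduced above drive the new covalent-bond handling: order strings from `struct_conn.pdbx_value_order` become `BondType` values on reading, and bond types collapse back to plain order strings (aromaticity is dropped, `ANY` is masked) on writing. A minimal illustration, re-declaring the mappings from the diff; nothing here beyond the dictionaries shown above is part of the package API:

```python
# Illustration only - the dictionaries are copied from the diff above.
from biotite.structure import BondType

PDBX_BOND_ORDER_TO_TYPE = {
    "": BondType.ANY, "sing": BondType.SINGLE, "doub": BondType.DOUBLE,
    "trip": BondType.TRIPLE, "quad": BondType.QUADRUPLE,
}
PDBX_BOND_TYPE_TO_ORDER = {
    BondType.ANY: "", BondType.SINGLE: "sing", BondType.DOUBLE: "doub",
    BondType.TRIPLE: "trip", BondType.QUADRUPLE: "quad",
    BondType.AROMATIC_SINGLE: "sing", BondType.AROMATIC_DOUBLE: "doub",
    BondType.AROMATIC_TRIPLE: "trip",
}

# Reading: 'pdbx_value_order' strings -> BondType values
print([PDBX_BOND_ORDER_TO_TYPE[order] for order in ["sing", "", "doub"]])
# Writing: aromatic orders lose their flag, e.g. AROMATIC_DOUBLE -> "doub"
print(PDBX_BOND_TYPE_TO_ORDER[BondType.AROMATIC_DOUBLE])
```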
@@ -61,6 +89,27 @@ _other_type_list = [
 ]
 
 
+def _filter(category, index):
+    """
+    Reduce the ``atom_site`` category to the values for the given
+    model.
+    """
+    Category = type(category)
+    Column = Category.subcomponent_class()
+    Data = Column.subcomponent_class()
+
+    return Category({
+        key: Column(
+            Data(column.data.array[index]),
+            (
+                Data(column.mask.array[index])
+                if column.mask is not None else None
+            )
+        )
+        for key, column in category.items()
+    })
+
+
 def get_sequence(pdbx_file, data_block=None):
     """
     Get the protein and nucleotide sequences from the
@@ -74,11 +123,14 @@ def get_sequence(pdbx_file, data_block=None):
 
     Parameters
     ----------
-    pdbx_file :
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
         The file object.
-    data_block :
-        The name of the data block.
-        (and most times only) data block of the
+    data_block : str, optional
+        The name of the data block.
+        Default is the first (and most times only) data block of the
+        file.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.
 
     Returns
     -------
@@ -86,50 +138,55 @@ def get_sequence(pdbx_file, data_block=None):
         The protein and nucleotide sequences for each entity
         (equivalent to chains in most cases).
     """
-
-
-
+    block = _get_block(pdbx_file, data_block)
+
+    poly_category= block["entity_poly"]
+    seq_string = poly_category["pdbx_seq_one_letter_code_can"].as_array(str)
+    seq_type = poly_category["type"].as_array(str)
     sequences = []
-
-
-
-
-            sequences.append(sequence)
-        else:
-            sequences.append(_convert_string_to_sequence(seq_string, seq_type))
+    for string, stype in zip(seq_string, seq_type):
+        sequence = _convert_string_to_sequence(string, stype)
+        if sequence is not None:
+            sequences.append(sequence)
     return sequences
 
 
-def get_model_count(
+def get_model_count(pdbx_file, data_block=None):
     """
     Get the number of models contained in a :class:`PDBxFile`.
 
     Parameters
     ----------
-
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
         The file object.
     data_block : str, optional
-        The name of the data block.
-        (and most times only) data block of the
+        The name of the data block.
+        Default is the first (and most times only) data block of the
+        file.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.
 
     Returns
     -------
     model_count : int
         The number of models.
     """
-
-    return len(_get_model_starts(
+    block = _get_block(pdbx_file, data_block)
+    return len(_get_model_starts(
+        block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32)
+    ))
 
 
 def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
-                  extra_fields=None, use_author_fields=True
+                  extra_fields=None, use_author_fields=True,
+                  include_bonds=False):
     """
     Create an :class:`AtomArray` or :class:`AtomArrayStack` from the
     ``atom_site`` category in a :class:`PDBxFile`.
 
     Parameters
     ----------
-    pdbx_file :
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
         The file object.
     model : int, optional
         If this parameter is given, the function will return an
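A hedged usage sketch of the rewritten `get_sequence()` and `get_model_count()` with the new `CIFFile` class; the file name `1l2y.cif` is a placeholder, not taken from the diff:

```python
# Illustration only: reading through the new CIF classes.
from biotite.structure.io.pdbx import CIFFile, get_sequence, get_model_count

cif = CIFFile.read("1l2y.cif")   # placeholder path to any mmCIF file
print(get_sequence(cif))         # sequences parsed from 'entity_poly'
print(get_model_count(cif))      # counted via 'atom_site.pdbx_PDB_model_num'
```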
@@ -141,8 +198,11 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
         containing all models will be returned, even if the structure
         contains only one model.
     data_block : str, optional
-        The name of the data block.
-        (and most times only) data block of the
+        The name of the data block.
+        Default is the first (and most times only) data block of the
+        file.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.
     altloc : {'first', 'occupancy', 'all'}
         This parameter defines how *altloc* IDs are handled:
         - ``'first'`` - Use atoms that have the first *altloc* ID
@@ -176,6 +236,15 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
         otherwise from the the ``label_xxx`` fields.
         If the requested field is not available, the respective other
         field is taken as fallback.
+    include_bonds : bool, optional
+        If set to true, a :class:`BondList` will be created for the
+        resulting :class:`AtomArray` containing the bond information
+        from the file.
+        Bonds, whose order could not be determined from the
+        *Chemical Component Dictionary*
+        (e.g. especially inter-residue bonds),
+        have :attr:`BondType.ANY`, since the PDB format itself does
+        not support bond orders.
 
     Returns
     -------
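A usage sketch of the new `include_bonds` option described above; `1l2y.bcif` is a placeholder file name and the call is an assumption based on the signature shown in this diff:

```python
# Illustration only: requesting bonds while parsing the structure.
from biotite.structure.io.pdbx import BinaryCIFFile, get_structure

bcif = BinaryCIFFile.read("1l2y.bcif")    # placeholder path
atoms = get_structure(bcif, model=1, include_bonds=True)
# Intra-residue bonds come from the Chemical Component Dictionary,
# inter-residue bonds from the file's 'struct_conn' category.
print(atoms.bonds.get_bond_count())
```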
@@ -186,31 +255,35 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
     --------
 
     >>> import os.path
-    >>> file =
+    >>> file = CIFFile.read(os.path.join(path_to_structures, "1l2y.cif"))
     >>> arr = get_structure(file, model=1)
     >>> print(len(arr))
     304
 
     """
-
+    block = _get_block(pdbx_file, data_block)
+
+    extra_fields = set() if extra_fields is None else set(extra_fields)
 
-
-    if
+    atom_site = block.get("atom_site")
+    if atom_site is None:
         raise InvalidFileError("Missing 'atom_site' category in file")
-
-    models =
+
+    models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
     model_starts = _get_model_starts(models)
     model_count = len(model_starts)
     atom_count = len(models)
 
     if model is None:
         # For a stack, the annotations are derived from the first model
-
+        model_atom_site = _filter_model(atom_site, model_starts, 1)
         # Any field of the category would work here to get the length
-        model_length =
+        model_length = model_atom_site.row_count
         stack = AtomArrayStack(model_count, model_length)
 
-        _fill_annotations(
+        _fill_annotations(
+            stack, model_atom_site, extra_fields, use_author_fields
+        )
 
         # Check if each model has the same amount of atoms
         # If not, raise exception
@@ -221,22 +294,24 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
                 "instead"
             )
 
-        stack.coord = np.
-
-        )
-
-
-
-
-
-
-
-
-
+        stack.coord[:, :, 0] = atom_site["Cartn_x"].as_array(np.float32) \
+            .reshape((model_count, model_length))
+        stack.coord[:, :, 1] = atom_site["Cartn_y"].as_array(np.float32) \
+            .reshape((model_count, model_length))
+        stack.coord[:, :, 2] = atom_site["Cartn_z"].as_array(np.float32) \
+            .reshape((model_count, model_length))
+
+        if include_bonds:
+            bonds = connect_via_residue_names(stack)
+            if "struct_conn" in block:
+                bonds = bonds.merge(_parse_inter_residue_bonds(
+                    model_atom_site, block["struct_conn"]
+                ))
+            stack.bonds = bonds
 
-        stack = _filter_altloc(stack,
+        stack = _filter_altloc(stack, model_atom_site, altloc)
 
-        box = _get_box(
+        box = _get_box(block)
         if box is not None:
             # Duplicate same box for each model
             stack.box = np.repeat(box[np.newaxis, ...], model_count, axis=0)
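The coordinate assignment above relies on `atom_site` storing all models back to back, so each Cartesian column can simply be reshaped; a tiny NumPy sketch of that layout (values are made up):

```python
# Illustration only: flat per-file column -> (model_count, model_length) view.
import numpy as np

model_count, model_length = 2, 3
cartn_x = np.array([1.0, 2.0, 3.0, 11.0, 12.0, 13.0])  # all models concatenated
print(cartn_x.reshape(model_count, model_length))
# [[ 1.  2.  3.]
#  [11. 12. 13.]]
```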
@@ -254,169 +329,284 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
             f"the given model {model} does not exist"
         )
 
-
+        model_atom_site = _filter_model(atom_site, model_starts, model)
         # Any field of the category would work here to get the length
-        model_length =
+        model_length = model_atom_site.row_count
         array = AtomArray(model_length)
 
-        _fill_annotations(
-
-        # Append exclusive stop
-        model_starts = np.append(
-            model_starts, [len(atom_site_dict["group_PDB"])]
-        )
-        # Indexing starts at 0, but model number starts at 1
-        model_index = model - 1
-        start, stop = model_starts[model_index], model_starts[model_index + 1]
-        array.coord = np.zeros((model_length, 3), dtype=np.float32)
-        array.coord[:, 0] = atom_site_dict["Cartn_x"][start:stop].astype(
-            np.float32
-        )
-        array.coord[:, 1] = atom_site_dict["Cartn_y"][start:stop].astype(
-            np.float32
-        )
-        array.coord[:, 2] = atom_site_dict["Cartn_z"][start:stop].astype(
-            np.float32
+        _fill_annotations(
+            array, model_atom_site, extra_fields, use_author_fields
         )
 
-        array =
+        array.coord[:, 0] = model_atom_site["Cartn_x"].as_array(np.float32)
+        array.coord[:, 1] = model_atom_site["Cartn_y"].as_array(np.float32)
+        array.coord[:, 2] = model_atom_site["Cartn_z"].as_array(np.float32)
+
+        if include_bonds:
+            bonds = connect_via_residue_names(array)
+            if "struct_conn" in block:
+                bonds = bonds.merge(_parse_inter_residue_bonds(
+                    model_atom_site, block["struct_conn"]
+                ))
+            array.bonds = bonds
+
+        array = _filter_altloc(array, model_atom_site, altloc)
 
-        array.box = _get_box(
+        array.box = _get_box(block)
 
     return array
 
 
-def
-
+def _get_block(pdbx_component, block_name):
+    if isinstance(pdbx_component, PDBxFile):
+        # The deprecated 'PDBxFile' is a thin wrapper around 'CIFFile'
+        pdbx_component = pdbx_component.cif_file
+
+    if not isinstance(pdbx_component, (CIFBlock, BinaryCIFBlock)):
+        # Determine block
+        if block_name is None:
+            return pdbx_component.block
+        else:
+            return pdbx_component[block_name]
+    else:
+        return pdbx_component
 
-    Parameters
-    ----------
-    array : AtomArray or AtomArrayStack
-        Atom array or stack which will be annotated.
-    model_dict : dict(str, ndarray)
-        ``atom_site`` dictionary with values for one model.
-    extra_fields : list of str
-        Entry names, that are additionally added as annotation arrays.
-    use_author_fields : bool
-        Define if alternate fields prefixed with ``auth_`` should be used
-        instead of ``label_``.
-    """
 
-
-                     dict_name="input"):
+def _get_or_fallback(category, key, fallback_key, cat_name="input"):
     """
-    Return
-    otherwise try to get the
-
+    Return column related to key in category if it exists,
+    otherwise try to get the column related to fallback key.
+    """
+    if key not in category:
        warnings.warn(
-            f"Attribute '{key}' not found within '{
+            f"Attribute '{key}' not found within '{cat_name}' category. "
            f"The fallback attribute '{fallback_key}' will be used instead",
            UserWarning
        )
        try:
-            return
+            return category[fallback_key]
        except KeyError as key_exc:
            raise InvalidFileError(
                f"Fallback attribute '{fallback_key}' not found in "
                "'{dict_name}' category"
            ) from key_exc
-    return
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    if as_type is not None:
-        array = array.astype(as_type)
-    return formatter(array) if formatter is not None else array
+    return category[key]
+
+
+def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
+    """Fill atom_site annotations in atom array or atom array stack.
+
+    Parameters
+    ----------
+    array : AtomArray or AtomArrayStack
+        Atom array or stack which will be annotated.
+    atom_site : CIFCategory or BinaryCIFCategory
+        ``atom_site`` category with values for one model.
+    extra_fields : list of str
+        Entry names, that are additionally added as annotation arrays.
+    use_author_fields : bool
+        Define if alternate fields prefixed with ``auth_`` should be used
+        instead of ``label_``.
+    """
 
     prefix, alt_prefix = (
         ("auth", "label") if use_author_fields else ("label", "auth")
     )
 
-
-        "chain_id": (f"{prefix}_asym_id", f"{alt_prefix}_asym_id", "U4", None),
-        "res_id": (
-            f"{prefix}_seq_id",
-            f"{alt_prefix}_seq_id",
-            None,
-            lambda annot: np.array(
-                [-1 if elt in [".", "?"] else int(elt) for elt in annot]
-            ),
-        ),
-        "ins_code": (
-            "pdbx_PDB_ins_code",
-            None,
-            "U1",
-            lambda annot: np.array(
-                ["" if elt in [".", "?"] else elt for elt in annot]
-            ),
-        ),
-        "res_name": (f"{prefix}_comp_id", f"{alt_prefix}_comp_id", "U5", None),
-        "hetero": ("group_PDB", None, None, lambda annot: annot == "HETATM"),
-        "atom_name": (
-            f"{prefix}_atom_id",
-            f"{alt_prefix}_atom_id",
-            "U6",
-            None,
-        ),
-        "element": ("type_symbol", None, "U2", None),
-        "atom_id": ("id", None, int, None),
-        "b_factor": ("B_iso_or_equiv", None, float, None),
-        "occupancy": ("occupancy", None, float, None),
-        "charge": (
-            "pdbx_formal_charge",
-            None,
-            None,
-            lambda annot: np.array(
-                [
-                    0 if charge in ["?", "."] else int(charge)
-                    for charge in annot
-                ],
-                dtype=int,
-            ),
-        ),
-    }
-
-    mandatory_annotations = [
+    array.set_annotation(
         "chain_id",
+        _get_or_fallback(
+            atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id"
+        ).as_array("U4")
+    )
+    array.set_annotation(
         "res_id",
+        _get_or_fallback(
+            atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
+        ).as_array(int, -1)
+    )
+    array.set_annotation(
         "ins_code",
+        atom_site["pdbx_PDB_ins_code"].as_array("U1", "")
+    )
+    array.set_annotation(
         "res_name",
+        _get_or_fallback(
+            atom_site, f"{prefix}_comp_id", f"{alt_prefix}_comp_id"
+        ).as_array("U5")
+    )
+    array.set_annotation(
         "hetero",
+        atom_site["group_PDB"].as_array(str) == "HETATM"
+    )
+    array.set_annotation(
         "atom_name",
+        _get_or_fallback(
+            atom_site, f"{prefix}_atom_id", f"{alt_prefix}_atom_id"
+        ).as_array("U6")
+    )
+    array.set_annotation(
         "element",
-
+        atom_site["type_symbol"].as_array("U2")
+    )
 
-
-    for annotation_name in mandatory_annotations + extra_fields:
+    if "atom_id" in extra_fields:
         array.set_annotation(
-
-
-
-
-
-
-
-        )
+            "atom_id",
+            atom_site["id"].as_array(int)
+        )
+        extra_fields.remove("atom_id")
+    if "b_factor" in extra_fields:
+        array.set_annotation(
+            "b_factor",
+            atom_site["B_iso_or_equiv"].as_array(float)
+        )
+        extra_fields.remove("b_factor")
+    if "occupancy" in extra_fields:
+        array.set_annotation(
+            "occupancy",
+            atom_site["occupancy"].as_array(float)
+        )
+        extra_fields.remove("occupancy")
+    if "charge" in extra_fields:
+        array.set_annotation(
+            "charge",
+            atom_site["pdbx_formal_charge"].as_array(int, 0)
+        )
+        extra_fields.remove("charge")
+
+    # Handle all remaining custom fields
+    for field in extra_fields:
+        array.set_annotation(
+            field,
+            atom_site[field].as_array(str)
+        )
+
+
+def _parse_inter_residue_bonds(atom_site, struct_conn):
+    """
+    Create inter-residue bonds by parsing the ``struct_conn`` category.
+    The atom indices of each bond are found by matching the bond labels
+    to the ``atom_site`` category.
+    """
+    # Identity symmetry operation
+    IDENTITY = "1_555"
+    # Columns in 'atom_site' that should be matched by 'struct_conn'
+    COLUMNS = [
+        "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id",
+        "label_alt_id", "auth_asym_id", "auth_comp_id", "auth_seq_id",
+        "pdbx_PDB_ins_code"
+    ]
+
+    covale_mask = np.isin(
+        struct_conn["conn_type_id"].as_array(str), PDBX_COVALENT_TYPES
+    )
+    if "ptnr1_symmetry" in struct_conn:
+        covale_mask &= (
+            struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
+        )
+    if "ptnr2_symmetry" in struct_conn:
+        covale_mask &= (
+            struct_conn["ptnr2_symmetry"].as_array(str, IDENTITY) == IDENTITY
+        )
+
+    atom_indices = [None] * 2
+    for i in range(2):
+        reference_arrays = []
+        query_arrays = []
+        for col_name in COLUMNS:
+            struct_conn_col_name = _get_struct_conn_col_name(col_name, i+1)
+            if (
+                col_name not in atom_site
+                or struct_conn_col_name not in struct_conn
+            ):
+                continue
+            # Ensure both arrays have the same dtype to allow comparison
+            reference = atom_site[col_name].as_array()
+            dtype = reference.dtype
+            query = struct_conn[struct_conn_col_name].as_array(dtype)
+            if np.issubdtype(reference.dtype, str):
+                # The mask value is not necessarily consistent
+                # between query and reference
+                # -> make it consistent
+                reference[reference == "?"] = "."
+                query[query == "?"] = "."
+            reference_arrays.append(reference)
+            query_arrays.append(query[covale_mask])
+        # Match the combination of 'label_asym_id', 'label_comp_id', etc.
+        # in 'atom_site' and 'struct_conn'
+        atom_indices[i] = _find_matches(query_arrays, reference_arrays)
+    atoms_indices_1 = atom_indices[0]
+    atoms_indices_2 = atom_indices[1]
+
+    # Some bonds in 'struct_conn' may not be found in 'atom_site'
+    # This is okay,
+    # as 'atom_site' might already be reduced to a single model
+    mapping_exists_mask = (atoms_indices_1 != -1) & (atoms_indices_2 != -1)
+    atoms_indices_1 = atoms_indices_1[mapping_exists_mask]
+    atoms_indices_2 = atoms_indices_2[mapping_exists_mask]
+
+    # Interpret missing values as ANY bonds
+    bond_order = struct_conn["pdbx_value_order"].as_array("U4", "")
+    # Consecutively apply the same masks as applied to the atom indices
+    # Logical combination does not work here,
+    # as the second mask was created based on already filtered data
+    bond_order = bond_order[covale_mask][mapping_exists_mask]
+    bond_types = [PDBX_BOND_ORDER_TO_TYPE[order] for order in bond_order]
+
+    return BondList(
+        atom_site.row_count,
+        np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1)
+    )
+
+
+def _find_matches(query_arrays, reference_arrays):
+    """
+    For each index in the `query_arrays` find the indices in the
+    `reference_arrays` where all query values the reference counterpart.
+    If no match is found for a query, the corresponding index is -1.
+    """
+    match_masks_for_all_columns = np.stack([
+        query[:, np.newaxis] == reference[np.newaxis, :]
+        for query, reference in zip(query_arrays, reference_arrays)
+    ], axis=-1)
+    match_masks = np.all(match_masks_for_all_columns, axis=-1)
+    query_matches, reference_matches = np.where(match_masks)
+
+    # Duplicate matches indicate that an atom from the query cannot
+    # be uniquely matched to an atom in the reference
+    unique_query_matches, counts = np.unique(query_matches, return_counts=True)
+    if np.any(counts > 1):
+        ambiguous_query = unique_query_matches[np.where(counts > 1)[0][0]]
+        raise InvalidFileError(
+            f"The covalent bond in the 'struct_conn' category at index "
+            f"{ambiguous_query} cannot be unambiguously assigned to atoms in "
+            f"the 'atom_site' category"
         )
 
+    # -1 indicates that no match was found in the reference
+    match_indices = np.full(len(query_arrays[0]), -1, dtype=int)
+    match_indices[query_matches] = reference_matches
+    return match_indices
 
-
-
-
+
+def _get_struct_conn_col_name(col_name, partner):
+    """
+    For a column name in ``atom_site`` get the corresponding column name
+    in ``struct_conn``.
+    """
+    if col_name == "label_alt_id":
+        return f"pdbx_ptnr{partner}_label_alt_id"
+    elif col_name.startswith("pdbx_"):
+        # Move 'pdbx_' to front
+        return f"pdbx_ptnr{partner}_{col_name[5:]}"
+    else:
+        return f"ptnr{partner}_{col_name}"
+
+
+def _filter_altloc(array, atom_site, altloc):
+    altloc_ids = atom_site.get("label_alt_id")
+    occupancy = atom_site.get("occupancy")
 
     # Filter altloc IDs and return
     if altloc_ids is None:
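The helpers added above match each `struct_conn` bond partner against `atom_site` by comparing several label columns at once. A self-contained sketch of the two ideas, re-implemented here only for illustration (the helper name `struct_conn_col_name` and the toy arrays are not part of the package):

```python
# Illustration only: column renaming and broadcast matching as used above.
import numpy as np

def struct_conn_col_name(col_name, partner):
    # Same renaming scheme as '_get_struct_conn_col_name' (partner is 1 or 2)
    if col_name == "label_alt_id":
        return f"pdbx_ptnr{partner}_label_alt_id"
    elif col_name.startswith("pdbx_"):
        return f"pdbx_ptnr{partner}_{col_name[5:]}"
    return f"ptnr{partner}_{col_name}"

print(struct_conn_col_name("label_asym_id", 1))      # ptnr1_label_asym_id
print(struct_conn_col_name("pdbx_PDB_ins_code", 2))  # pdbx_ptnr2_PDB_ins_code

# For each query row, find the reference row where *all* columns agree;
# -1 marks a bond partner that has no matching atom.
reference = [np.array(["A", "A", "B"]), np.array([1, 2, 1])]  # toy asym_id, seq_id
query     = [np.array(["B", "A"]),      np.array([1, 5])]
masks = np.stack(
    [q[:, np.newaxis] == r[np.newaxis, :] for q, r in zip(query, reference)],
    axis=-1
)
q_idx, r_idx = np.where(np.all(masks, axis=-1))
indices = np.full(len(query[0]), -1)
indices[q_idx] = r_idx
print(indices)  # [ 2 -1]: first partner is atom 2, second partner is unmatched
```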
@@ -425,14 +615,14 @@ def _filter_altloc(array, model_dict, altloc):
         return array[
             ...,
             filter_highest_occupancy_altloc(
-                array, altloc_ids, occupancy.
+                array, altloc_ids.as_array(str), occupancy.as_array(float)
             ),
         ]
     # 'first' is also fallback if file has no occupancy information
     elif altloc == "first":
-        return array[..., filter_first_altloc(array, altloc_ids)]
+        return array[..., filter_first_altloc(array, altloc_ids.as_array(str))]
     elif altloc == "all":
-        array.set_annotation("altloc_id", altloc_ids)
+        array.set_annotation("altloc_id", altloc_ids.as_array(str))
         return array
     else:
         raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
@@ -443,49 +633,46 @@ def _get_model_starts(model_array):
     Get the start index for each model in the arrays of the
     ``atom_site`` category.
     """
-
+    _, indices = np.unique(model_array, return_index=True)
     indices.sort()
     return indices
 
 
-def
+def _filter_model(atom_site, model_starts, model):
     """
-    Reduce the ``atom_site``
+    Reduce the ``atom_site`` category to the values for the given
     model.
     """
+    Category = type(atom_site)
+    Column = Category.subcomponent_class()
+    Data = Column.subcomponent_class()
+
     # Append exclusive stop
     model_starts = np.append(
-        model_starts, [
+        model_starts, [atom_site.row_count]
     )
-    model_dict = {}
     # Indexing starts at 0, but model number starts at 1
     model_index = model - 1
-
-
-        model_starts[model_index] : model_starts[model_index + 1]
-    ]
-    return model_dict
+    index = slice(model_starts[model_index], model_starts[model_index + 1])
+    return _filter(atom_site, index)
 
 
-def _get_box(
-
-
-    else:
-        cell_dict = pdbx_file.get((data_block, "cell"))
-        if cell_dict is None:
+def _get_box(block):
+    cell = block.get("cell")
+    if cell is None:
         return None
     try:
         len_a, len_b, len_c = [
-            float(
+            float(cell[length].as_item())
             for length in ["length_a", "length_b", "length_c"]
         ]
+        alpha, beta, gamma = [
+            np.deg2rad(float(cell[angle].as_item()))
+            for angle in ["angle_alpha", "angle_beta", "angle_gamma"]
+        ]
     except ValueError:
         # 'cell_dict' has no proper unit cell values, e.g. '?'
         return None
-    alpha, beta, gamma = [
-        np.deg2rad(float(cell_dict[angle]))
-        for angle in ["angle_alpha", "angle_beta", "angle_gamma"]
-    ]
     return vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
 
 
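`_get_box()` above converts the `cell` lengths and angles (degrees in the file) into box vectors via biotite's existing helper; a small worked example with made-up cell parameters:

```python
# Illustration only: unit cell -> 3x3 box vectors.
import numpy as np
from biotite.structure import vectors_from_unitcell

len_a, len_b, len_c = 25.0, 30.0, 40.0                 # Angstrom
alpha, beta, gamma = np.deg2rad([90.0, 90.0, 120.0])   # file stores degrees
box = vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
print(box.shape)  # (3, 3) - one row per box vector
```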
@@ -496,69 +683,90 @@ def set_structure(pdbx_file, array, data_block=None):
 
     This will save the coordinates, the mandatory annotation categories
     and the optional annotation categories
-    ``
+    ``atom_id``, ``b_factor``, ``occupancy`` and ``charge``.
     If the atom array (stack) contains the annotation ``'atom_id'``,
     these values will be used for atom numbering instead of continuous
     numbering.
+    Furthermore, inter-residue bonds will be written into the
+    ``struct_conn`` category.
 
     Parameters
     ----------
-    pdbx_file :
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
         The file object.
     array : AtomArray or AtomArrayStack
         The structure to be written. If a stack is given, each array in
         the stack will be in a separate model.
     data_block : str, optional
-        The name of the data block.
-        (and most times only) data block of the
+        The name of the data block.
+        Default is the first (and most times only) data block of the
+        file.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.
+        If the file is empty, a new data will be created.
+
+    Notes
+    -----
+    In some cases, the written inter-residue bonds cannot be read again
+    due to ambiguity to which atoms the bond refers.
+    This is the case, when two equal residues in the same chain have
+    the same (or a masked) `res_id`.
 
     Examples
     --------
 
     >>> import os.path
-    >>> file =
-    >>> set_structure(file, atom_array
+    >>> file = CIFFile()
+    >>> set_structure(file, atom_array)
     >>> file.write(os.path.join(path_to_directory, "structure.cif"))
 
     """
+    block = _get_or_create_block(pdbx_file, data_block)
+    Category = block.subcomponent_class()
+    Column = Category.subcomponent_class()
+
     # Fill PDBx columns from information
     # in structures' attribute arrays as good as possible
-
-
-
-
-
-
-
+    atom_site = Category()
+    atom_site["group_PDB"] = np.where(
+        array.hetero, "HETATM", "ATOM"
+    )
+    atom_site["type_symbol"] = np.copy(array.element)
+    atom_site["label_atom_id"] = np.copy(array.atom_name)
+    atom_site["label_alt_id"] = Column(
+        # AtomArrays do not store altloc atoms
+        np.full(array.array_length(), "."),
+        np.full(array.array_length(), MaskValue.INAPPLICABLE),
+    )
+    atom_site["label_comp_id"] = np.copy(array.res_name)
+    atom_site["label_asym_id"] = np.copy(array.chain_id)
+    atom_site["label_entity_id"] = _determine_entity_id(array.chain_id)
+    atom_site["label_seq_id"] = np.copy(array.res_id)
+    atom_site["pdbx_PDB_ins_code"] = Column(
+        np.copy(array.ins_code),
+        np.where(array.ins_code == "", MaskValue.INAPPLICABLE, MaskValue.PRESENT)
     )
-
-
-
-
-    atom_site_dict["label_asym_id"] = np.copy(array.chain_id)
-    atom_site_dict["label_entity_id"] = _determine_entity_id(array.chain_id)
-    atom_site_dict["label_seq_id"] = np.array([str(e) for e in array.res_id])
-    atom_site_dict["pdbx_PDB_ins_code"] = array.ins_code
-    atom_site_dict["auth_seq_id"] = atom_site_dict["label_seq_id"]
-    atom_site_dict["auth_comp_id"] = atom_site_dict["label_comp_id"]
-    atom_site_dict["auth_asym_id"] = atom_site_dict["label_asym_id"]
-    atom_site_dict["auth_atom_id"] = atom_site_dict["label_atom_id"]
+    atom_site["auth_seq_id"] = atom_site["label_seq_id"]
+    atom_site["auth_comp_id"] = atom_site["label_comp_id"]
+    atom_site["auth_asym_id"] = atom_site["label_asym_id"]
+    atom_site["auth_atom_id"] = atom_site["label_atom_id"]
 
+    annot_categories = array.get_annotation_categories()
     if "atom_id" in annot_categories:
-
+        atom_site["id"] = np.copy(array.atom_id)
     if "b_factor" in annot_categories:
-
-            [f"{b:.2f}" for b in array.b_factor]
-        )
+        atom_site["B_iso_or_equiv"] = np.copy(array.b_factor)
     if "occupancy" in annot_categories:
-
-            [f"{occ:.2f}" for occ in array.occupancy]
-        )
+        atom_site["occupancy"] = np.copy(array.occupancy)
     if "charge" in annot_categories:
-
-            [f"{c:+d}" if c != 0 else "?" for c in array.charge]
+        atom_site["pdbx_formal_charge"] = Column(
+            np.array([f"{c:+d}" if c != 0 else "?" for c in array.charge]),
+            np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT)
         )
 
+    if array.bonds is not None:
+        block["struct_conn"] = _set_inter_residue_bonds(array, atom_site)
+
     # In case of a single model handle each coordinate
     # simply like a flattened array
     if type(array) == AtomArray or (
@@ -566,42 +774,34 @@ def set_structure(pdbx_file, array, data_block=None):
     ):
         # 'ravel' flattens coord without copy
         # in case of stack with stack_depth = 1
-
-
-        )
-
-
-        )
-        atom_site_dict["Cartn_z"] = np.array(
-            [f"{c:.3f}" for c in np.ravel(array.coord[..., 2])]
-        )
-        atom_site_dict["pdbx_PDB_model_num"] = np.full(
-            array.array_length(), "1"
+        atom_site["Cartn_x"] = np.copy(np.ravel(array.coord[..., 0]))
+        atom_site["Cartn_y"] = np.copy(np.ravel(array.coord[..., 1]))
+        atom_site["Cartn_z"] = np.copy(np.ravel(array.coord[..., 2]))
+        atom_site["pdbx_PDB_model_num"] = np.ones(
+            array.array_length(), dtype=np.int32
         )
     # In case of multiple models repeat annotations
     # and use model specific coordinates
     elif type(array) == AtomArrayStack:
-
-            atom_site_dict[key] = np.tile(value, reps=array.stack_depth())
+        atom_site = _repeat(atom_site, array.stack_depth())
         coord = np.reshape(
             array.coord, (array.stack_depth() * array.array_length(), 3)
         )
-
-
-
-
-            np.arange(1, array.stack_depth() + 1
+        atom_site["Cartn_x"] = np.copy(coord[:, 0])
+        atom_site["Cartn_y"] = np.copy(coord[:, 1])
+        atom_site["Cartn_z"] = np.copy(coord[:, 2])
+        atom_site["pdbx_PDB_model_num"] = np.repeat(
+            np.arange(1, array.stack_depth() + 1, dtype=np.int32),
             repeats=array.array_length(),
         )
-        atom_site_dict["pdbx_PDB_model_num"] = models
     else:
         raise ValueError("Structure must be AtomArray or AtomArrayStack")
     if not "atom_id" in annot_categories:
         # Count from 1
-
-            1, len(
-        )
-
+        atom_site["id"] = np.arange(
+            1, len(atom_site["group_PDB"]) + 1
+        )
+    block["atom_site"] = atom_site
 
     # Write box into file
     if array.box is not None:
@@ -612,14 +812,38 @@ def set_structure(pdbx_file, array, data_block=None):
         else:
             box = array.box
         len_a, len_b, len_c, alpha, beta, gamma = unitcell_from_vectors(box)
-
-
-
-
-
-
-
-
+        cell = Category()
+        cell["length_a"] = len_a
+        cell["length_b"] = len_b
+        cell["length_c"] = len_c
+        cell["angle_alpha"] = np.rad2deg(alpha)
+        cell["angle_beta"] = np.rad2deg(beta)
+        cell["angle_gamma"] = np.rad2deg(gamma)
+        block["cell"] = cell
+
+
+def _get_or_create_block(pdbx_component, block_name):
+    if isinstance(pdbx_component, PDBxFile):
+        # The deprecated 'PDBxFile' is a thin wrapper around 'CIFFile'
+        pdbx_component = pdbx_component.cif_file
+
+    Block = pdbx_component.subcomponent_class()
+
+    if isinstance(pdbx_component, (CIFFile, BinaryCIFFile)):
+        if block_name is None:
+            if len(pdbx_component) > 0:
+                block_name = next(iter(pdbx_component.keys()))
+            else:
+                # File is empty -> invent a new block name
+                block_name = "structure"
+
+        if block_name not in pdbx_component:
+            block = Block()
+            pdbx_component[block_name] = block
+        return pdbx_component[block_name]
+    else:
+        # Already a block
+        return pdbx_component
 
 
 def _determine_entity_id(chain_id):
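A hedged usage sketch of `set_structure()` after this rewrite, writing the same structure once as text CIF and once as BinaryCIF; the component `TYR` and the output file names are placeholders chosen for the example:

```python
# Illustration only: writing with the new file classes.
from biotite.structure.info import residue
from biotite.structure.io.pdbx import CIFFile, BinaryCIFFile, set_structure

atom_array = residue("TYR")      # any AtomArray works

cif = CIFFile()
set_structure(cif, atom_array)   # creates the data block if the file is empty
cif.write("out.cif")

bcif = BinaryCIFFile()
set_structure(bcif, atom_array)  # same call, binary encoding instead
bcif.write("out.bcif")
```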
@@ -635,10 +859,81 @@ def _determine_entity_id(chain_id):
             id_translation[chain_id[i]] = id
             entity_id[i] = id_translation[chain_id[i]]
             id += 1
-    return entity_id
+    return entity_id
+
+
+def _repeat(category, repetitions):
+    Category = type(category)
+    Column = Category.subcomponent_class()
+    Data = Column.subcomponent_class()
+
+    category_dict = {}
+    for key, column in category.items():
+        if isinstance(column, BinaryCIFColumn):
+            data_encoding = column.data.encoding
+            # Optimization: The repeated string array has the same
+            # unique values, as the original string array
+            # -> Use same unique values (faster due to shorter array)
+            if isinstance(data_encoding[0], StringArrayEncoding):
+                data_encoding[0].strings = np.unique(column.data.array)
+            data = Data(np.tile(column.data.array, repetitions), data_encoding)
+        else:
+            data = Data(np.tile(column.data.array, repetitions))
+        mask = Data(np.tile(column.mask.array, repetitions)) \
+               if column.mask is not None else None
+        category_dict[key] = Column(data, mask)
+    return Category(category_dict)
 
 
-def
+def _set_inter_residue_bonds(array, atom_site):
+    """
+    Create the ``struct_conn`` category containing the inter-residue
+    bonds.
+    The involved atoms are identified by annotations from the
+    ``atom_site`` category.
+    """
+    COLUMNS = [
+        "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id",
+        "pdbx_PDB_ins_code"
+    ]
+
+    Category = type(atom_site)
+    Column = Category.subcomponent_class()
+
+    bond_array = array.bonds.as_array()
+    # To save computation time call 'get_residue_starts_for()' only once
+    # with indices of the first and second atom of each bond
+    residue_starts_1, residue_starts_2 = get_residue_starts_for(
+        array, bond_array[:, :2].flatten()
+    ).reshape(-1, 2).T
+    # Filter out all intra-residue bonds
+    bond_array = bond_array[residue_starts_1 != residue_starts_2]
+
+    struct_conn = Category()
+    struct_conn["id"] = np.arange(1, len(bond_array) + 1)
+    struct_conn["conn_type_id"] = np.full(len(bond_array), "covale")
+    struct_conn["pdbx_value_order"] = Column(
+        np.array(
+            [PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_array[:, 2]]
+        ),
+        np.where(
+            bond_array[:, 2] == BondType.ANY,
+            MaskValue.MISSING, MaskValue.PRESENT,
+        )
+    )
+    # Write the identifying annotation...
+    for col_name in COLUMNS:
+        annot = atom_site[col_name].as_array()
+        # ...for each bond partner
+        for i in range(2):
+            atom_indices = bond_array[:, i]
+            struct_conn[_get_struct_conn_col_name(col_name, i+1)] \
+                = annot[atom_indices]
+    return struct_conn
+
+
+def get_component(pdbx_file, data_block=None, use_ideal_coord=True,
+                  res_name=None):
     """
     Create an :class:`AtomArray` for a chemical component from the
     ``chem_comp_atom`` and, if available, the ``chem_comp_bond``
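`_repeat()` above tiles every per-atom column once per model, while the model number column is built with `np.repeat`; a tiny NumPy sketch of the two layouts (toy values):

```python
# Illustration only: per-atom columns vs. the model number column.
import numpy as np

n_models, n_atoms = 3, 4
atom_names = np.array(["N", "CA", "C", "O"])

print(np.tile(atom_names, n_models))                   # N CA C O | N CA C O | N CA C O
print(np.repeat(np.arange(1, n_models + 1), n_atoms))  # 1 1 1 1 2 2 2 2 3 3 3 3
```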
@@ -646,26 +941,37 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
 
     Parameters
     ----------
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
+        The file object.
     data_block : str, optional
-        The name of the data block.
-        (and most times only) data block of the
+        The name of the data block.
+        Default is the first (and most times only) data block of the
+        file.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.
     use_ideal_coord : bool, optional
         If true, the *ideal* coordinates are read from the file
         (``pdbx_model_Cartn_<dim>_ideal`` fields), typically
         originating from computations.
         If set to false, alternative coordinates are read
         (``model_Cartn_<dim>_`` fields).
-
+    res_name : str
+        In rare cases the categories may contain rows for multiple
+        components.
+        In this case, the component with the given residue name is
+        read.
+        By default, all rows would be read in this case.
+
     Returns
     -------
     array : AtomArray
         The parsed chemical component.
-
+
     Examples
     --------
 
     >>> import os.path
-    >>> file =
+    >>> file = CIFFile.read(
     ...     os.path.join(path_to_structures, "molecules", "TYR.cif")
     ... )
     >>> comp = get_component(file)
@@ -695,26 +1001,31 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
     HET 0 TYR HH H -0.123 -0.399 -5.059
     HET 0 TYR HXT H -1.333 -0.030 4.784
     """
-
-
-
-
+    block = _get_block(pdbx_file, data_block)
+
+    try:
+        atom_category = block["chem_comp_atom"]
+    except KeyError:
         raise InvalidFileError("Missing 'chem_comp_atom' category in file")
-
-
-
+    if res_name is not None:
+        atom_category = _filter(
+            atom_category, atom_category["comp_id"].as_array() == res_name
+        )
+        if len(atom_category) == 0:
+            raise KeyError(
+                f"No rows with residue name '{res_name}' found in "
+                f"'chem_comp_atom' category"
+            )
 
-    array = AtomArray(
+    array = AtomArray(atom_category.row_count)
 
     array.hetero[:] = True
-    array.res_name =
-    array.atom_name =
-    array.element =
+    array.res_name = atom_category["comp_id"].as_array("U5")
+    array.atom_name = atom_category["atom_id"].as_array("U6")
+    array.element = atom_category["type_symbol"].as_array("U2")
     array.add_annotation("charge", int)
-    array.charge =
-
-    )
-
+    array.charge = atom_category["charge"].as_array(int, 0)
+
     coord_fields = [f"pdbx_model_Cartn_{dim}_ideal" for dim in ("x", "y", "z")]
     alt_coord_fields = [f"model_Cartn_{dim}" for dim in ("x", "y", "z")]
     if not use_ideal_coord:
@@ -722,7 +1033,7 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
         coord_fields, alt_coord_fields = alt_coord_fields, coord_fields
     try:
         for i, field in enumerate(coord_fields):
-            array.coord[:,i] =
+            array.coord[:,i] = atom_category[field].as_array(np.float32)
     except KeyError as err:
         key = err.args[0]
         warnings.warn(
@@ -731,9 +1042,15 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
             UserWarning
         )
         for i, field in enumerate(alt_coord_fields):
-            array.coord[:,i] =
-
-
+            array.coord[:,i] = atom_category[field].as_array(np.float32)
+
+    try:
+        bond_category = block["chem_comp_bond"]
+        if res_name is not None:
+            bond_category = _filter(
+                bond_category, bond_category["comp_id"].as_array() == res_name
+            )
+    except KeyError:
         warnings.warn(
             f"Category 'chem_comp_bond' not found. "
             f"No bonds will be parsed",
@@ -742,12 +1059,14 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True):
     else:
         bonds = BondList(array.array_length())
         for atom1, atom2, order, aromatic_flag in zip(
-
-
+            bond_category["atom_id_1"].as_array(str),
+            bond_category["atom_id_2"].as_array(str),
+            bond_category["value_order"].as_array(str),
+            bond_category["pdbx_aromatic_flag"].as_array(str)
         ):
             atom_i = np.where(array.atom_name == atom1)[0][0]
             atom_j = np.where(array.atom_name == atom2)[0][0]
-            bond_type =
+            bond_type = COMP_BOND_ORDER_TO_TYPE[order, aromatic_flag]
             bonds.add_bond(atom_i, atom_j, bond_type)
         array.bonds = bonds
 
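A usage sketch of `get_component()` with the new `res_name` filter; `components.cif` is a placeholder for any CCD-style file that lists several components:

```python
# Illustration only: picking one component out of a multi-component file.
from biotite.structure.io.pdbx import CIFFile, get_component

ccd = CIFFile.read("components.cif")       # placeholder path
tyr = get_component(ccd, res_name="TYR")   # only rows with comp_id == "TYR"
print(tyr.array_length(), tyr.bonds.get_bond_count())
```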
@@ -766,15 +1085,22 @@ def set_component(pdbx_file, array, data_block=None):
 
     Parameters
     ----------
-    pdbx_file :
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
         The file object.
     array : AtomArray
         The chemical component to be written.
         Must contain only a single residue.
     data_block : str, optional
-        The name of the data block.
-        (and most times only) data block of the
+        The name of the data block.
+        Default is the first (and most times only) data block of the
+        file.
+        If the file is empty, a new data will be created.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.
     """
+    block = _get_or_create_block(pdbx_file, data_block)
+    Category = block.subcomponent_class()
+
     if get_residue_count(array) > 1:
         raise BadStructureError(
             "The input atom array must comprise only one residue"
@@ -787,45 +1113,44 @@ def set_component(pdbx_file, array, data_block=None):
     else:
         charge = np.full(array.array_length(), "?", dtype="U2")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    atom_cat = Category()
+    atom_cat["comp_id"] = np.full(array.array_length(), res_name)
+    atom_cat["atom_id"] = np.copy(array.atom_name)
+    atom_cat["alt_atom_id"] = atom_cat["atom_id"]
+    atom_cat["type_symbol"] = np.copy(array.element)
+    atom_cat["charge"] = charge
+    atom_cat["model_Cartn_x"] = np.copy(array.coord[:, 0])
+    atom_cat["model_Cartn_y"] = np.copy(array.coord[:, 1])
+    atom_cat["model_Cartn_z"] = np.copy(array.coord[:, 2])
+    atom_cat["pdbx_model_Cartn_x_ideal"] = atom_cat["model_Cartn_x"]
+    atom_cat["pdbx_model_Cartn_y_ideal"] = atom_cat["model_Cartn_y"]
+    atom_cat["pdbx_model_Cartn_z_ideal"] = atom_cat["model_Cartn_z"]
+    atom_cat["pdbx_component_atom_id"] = atom_cat["atom_id"]
+    atom_cat["pdbx_component_comp_id"] = atom_cat["comp_id"]
+    atom_cat["pdbx_ordinal"] = np.arange(
         1, array.array_length() + 1
     ).astype(str)
-
+    block["chem_comp_atom"] = atom_cat

     if array.bonds is not None:
         bond_array = array.bonds.as_array()
         order_flags = []
         aromatic_flags = []
         for bond_type in bond_array[:,2]:
-            order_flag, aromatic_flag =
+            order_flag, aromatic_flag = COMP_BOND_TYPE_TO_ORDER[bond_type]
             order_flags.append(order_flag)
             aromatic_flags.append(aromatic_flag)

-
-
-
-
-
-
-
+        bond_cat = Category()
+        bond_cat["comp_id"] = np.full(len(bond_array), res_name)
+        bond_cat["atom_id_1"] = array.atom_name[bond_array[:,0]]
+        bond_cat["atom_id_2"] = array.atom_name[bond_array[:,1]]
+        bond_cat["value_order"] = np.array(order_flags)
+        bond_cat["pdbx_aromatic_flag"] = np.array(aromatic_flags)
+        bond_cat["pdbx_ordinal"] = np.arange(
             1, len(bond_array) + 1
         ).astype(str)
-
-
+        block["chem_comp_bond"] = bond_cat

 def list_assemblies(pdbx_file, data_block=None):
     """
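
Note: with the category objects above, writing a component is symmetric to reading one. A
short usage sketch, assuming `biotite.structure.info.residue()` is available to obtain a
single-residue `AtomArray` with bonds and that `CIFFile.write()` serializes the file:

    import biotite.structure.info as info
    from biotite.structure.io.pdbx import CIFFile, set_component

    component = info.residue("ALA")      # single-residue AtomArray with a BondList
    cif = CIFFile()                      # empty file: a new data block is created
    set_component(cif, component, data_block="ALA")
    cif.write("ALA.cif")                 # writes chem_comp_atom and chem_comp_bond
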
@@ -838,23 +1163,25 @@ def list_assemblies(pdbx_file, data_block=None):

     Parameters
     ----------
-    pdbx_file :
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
         The file object.
     data_block : str, optional
         The name of the data block.
-
+        Default is the first (and most times only) data block of the
         file.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.

     Returns
     -------
     assemblies : dict of str -> str
         A dictionary that maps an assembly ID to a description of the
         corresponding assembly.
-
+
     Examples
     --------
     >>> import os.path
-    >>> file =
+    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
     >>> assembly_ids = list_assemblies(file)
     >>> for key, val in assembly_ids.items():
     ...     print(f"'{key}' : '{val}'")
@@ -865,21 +1192,24 @@ def list_assemblies(pdbx_file, data_block=None):
     '5' : 'icosahedral asymmetric unit, std point frame'
     '6' : 'crystal asymmetric unit, crystal frame'
     """
-
-
-
-
+    block = _get_block(pdbx_file, data_block)
+
+    try:
+        assembly_category = block["pdbx_struct_assembly"]
+    except KeyError:
         raise InvalidFileError("File has no 'pdbx_struct_assembly' category")
     return {
         id: details
         for id, details in zip(
-            assembly_category["id"],
+            assembly_category["id"].as_array(str),
+            assembly_category["details"].as_array(str)
         )
     }


 def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
-                 altloc="first", extra_fields=None, use_author_fields=True
+                 altloc="first", extra_fields=None, use_author_fields=True,
+                 include_bonds=False):
     """
     Build the given biological assembly.

@@ -890,7 +1220,7 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,

     Parameters
     ----------
-    pdbx_file :
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
         The file object.
     assembly_id : str
         The assembly to build.
@@ -907,8 +1237,10 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
         contains only one model.
     data_block : str, optional
         The name of the data block.
-
+        Default is the first (and most times only) data block of the
         file.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.
     altloc : {'first', 'occupancy', 'all'}
         This parameter defines how *altloc* IDs are handled:
         - ``'first'`` - Use atoms that have the first *altloc* ID
@@ -940,36 +1272,46 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
         If `use_author_fields` is true, the annotation arrays will be
         read from the ``auth_xxx`` fields (if applicable),
         otherwise from the ``label_xxx`` fields.
+    include_bonds : bool, optional
+        If set to true, a :class:`BondList` will be created for the
+        resulting :class:`AtomArray` containing the bond information
+        from the file.
+        Bonds whose order could not be determined from the
+        *Chemical Component Dictionary*
+        (especially inter-residue bonds)
+        have :attr:`BondType.ANY`, since the PDB format itself does
+        not support bond orders.

     Returns
     -------
     assembly : AtomArray or AtomArrayStack
         The assembly. The return type depends on the `model` parameter.
-
+
     Examples
     --------

     >>> import os.path
-    >>> file =
+    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
     >>> assembly = get_assembly(file, model=1)
     """
-
-
-
-
+    block = _get_block(pdbx_file, data_block)
+
+    try:
+        assembly_gen_category = block["pdbx_struct_assembly_gen"]
+    except KeyError:
         raise InvalidFileError(
             "File has no 'pdbx_struct_assembly_gen' category"
         )

-
-        "pdbx_struct_oper_list"
-
-    if struct_oper_category is None:
+    try:
+        struct_oper_category = block["pdbx_struct_oper_list"]
+    except KeyError:
         raise InvalidFileError("File has no 'pdbx_struct_oper_list' category")

+    assembly_ids = assembly_gen_category["assembly_id"].as_array(str)
     if assembly_id is None:
-        assembly_id =
-    elif assembly_id not in
+        assembly_id = assembly_ids[0]
+    elif assembly_id not in assembly_ids:
         raise KeyError(f"File has no Assembly ID '{assembly_id}'")

     ### Calculate all possible transformations
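
Note: the new `include_bonds` flag is simply forwarded to `get_structure()` (see the hunk
adding new line 1337 below), so assemblies can now carry a `BondList`. A usage sketch based
on the docstring example above; the `1f2n.cif` path is the same placeholder used there:

    from biotite.structure.io.pdbx import CIFFile, get_assembly, list_assemblies

    file = CIFFile.read("1f2n.cif")          # placeholder path
    print(list_assemblies(file))             # maps assembly IDs to descriptions
    assembly = get_assembly(file, model=1, include_bonds=True)
    print(assembly.bonds is not None)        # True: bonds were parsed
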
@@ -982,6 +1324,8 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
     if "label_asym_id" in extra_fields:
         extra_fields_and_asym = extra_fields
     else:
+        # The operations apply on asym IDs
+        # -> they need to be included to select the correct atoms
         extra_fields_and_asym = extra_fields + ["label_asym_id"]
     structure = get_structure(
         pdbx_file,
@@ -990,14 +1334,15 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
         altloc,
         extra_fields_and_asym,
         use_author_fields,
+        include_bonds
     )

     ### Get transformations and apply them to the affected asym IDs
     assembly = None
     for id, op_expr, asym_id_expr in zip(
-        assembly_gen_category["assembly_id"],
-        assembly_gen_category["oper_expression"],
-        assembly_gen_category["asym_id_list"],
+        assembly_gen_category["assembly_id"].as_array(str),
+        assembly_gen_category["oper_expression"].as_array(str),
+        assembly_gen_category["asym_id_list"].as_array(str),
     ):
         # Find the operation expressions for given assembly ID
         # We already asserted that the ID is actually present
@@ -1017,12 +1362,12 @@ def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
             assembly = sub_assembly
         else:
             assembly += sub_assembly
-
+
     # Remove 'label_asym_id', if it was not included in the original
     # user-supplied 'extra_fields'
     if "label_asym_id" not in extra_fields:
         assembly.del_annotation("label_asym_id")
-
+
     return assembly


@@ -1056,19 +1401,20 @@ def _get_transformations(struct_oper):
     translation for each operation ID in ``pdbx_struct_oper_list``.
     """
     transformation_dict = {}
-    for index, id in enumerate(struct_oper["id"]):
+    for index, id in enumerate(struct_oper["id"].as_array(str)):
         rotation_matrix = np.array(
             [
                 [
-
+                    struct_oper[f"matrix[{i}][{j}]"].as_array(float)[index]
                     for j in (1, 2, 3)
                 ]
                 for i in (1, 2, 3)
             ]
         )
-        translation_vector = np.array(
-
-
+        translation_vector = np.array([
+            struct_oper[f"vector[{i}]"].as_array(float)[index]
+            for i in (1, 2, 3)
+        ])
         transformation_dict[id] = (rotation_matrix, translation_vector)
     return transformation_dict

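
Note: `_get_transformations()` returns one `(rotation_matrix, translation_vector)` pair per
operation ID. Applying such a pair to an `(n, 3)` coordinate array is the usual affine step;
a self-contained sketch (not biotite's internal helper, which lies outside these hunks):

    import numpy as np

    def apply_transformation(coord, rotation, translation):
        """Rotate row-vector coordinates, then translate them."""
        return coord @ rotation.T + translation

    coord = np.array([[1.0, 2.0, 3.0]])
    identity = (np.eye(3), np.zeros(3))            # a no-op operation
    print(apply_transformation(coord, *identity))  # [[1. 2. 3.]]
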
@@ -1112,6 +1458,8 @@ def _convert_string_to_sequence(string, stype):
     ``_proteinseq_type_list`` or to ``NucleotideSequence`` if `stype` is
     contained in ``_nucleotideseq_type_list``.
     """
+    # sequence may be stored as multiline string
+    string = string.replace("\n", "")
     if stype in _proteinseq_type_list:
         return ProteinSequence(string)
     elif stype in _nucleotideseq_type_list: