PyPI - biotite - Versions diffs - 1.0.1__cp310-cp310-win_amd64.whl → 1.2.0__cp310-cp310-win_amd64.whl - Mend

biotite 1.0.1__cp310-cp310-win_amd64.whl → 1.2.0__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biotite might be problematic. Click here for more details.

Files changed (177) hide show

biotite/application/application.py +3 -3
biotite/application/autodock/app.py +1 -1
biotite/application/blast/webapp.py +1 -1
biotite/application/clustalo/app.py +1 -1
biotite/application/dssp/app.py +13 -3
biotite/application/localapp.py +36 -2
biotite/application/msaapp.py +10 -10
biotite/application/muscle/app3.py +5 -18
biotite/application/muscle/app5.py +5 -5
biotite/application/sra/app.py +0 -5
biotite/application/util.py +22 -2
biotite/application/viennarna/rnaalifold.py +8 -8
biotite/application/viennarna/rnaplot.py +9 -3
biotite/application/viennarna/util.py +1 -1
biotite/application/webapp.py +1 -1
biotite/database/afdb/__init__.py +12 -0
biotite/database/afdb/download.py +191 -0
biotite/database/entrez/dbnames.py +10 -0
biotite/database/entrez/download.py +9 -10
biotite/database/entrez/key.py +1 -1
biotite/database/entrez/query.py +5 -4
biotite/database/pubchem/download.py +6 -6
biotite/database/pubchem/error.py +10 -0
biotite/database/pubchem/query.py +12 -23
biotite/database/rcsb/download.py +3 -2
biotite/database/rcsb/query.py +8 -9
biotite/database/uniprot/check.py +22 -17
biotite/database/uniprot/download.py +3 -6
biotite/database/uniprot/query.py +4 -5
biotite/file.py +14 -2
biotite/interface/__init__.py +19 -0
biotite/interface/openmm/__init__.py +16 -0
biotite/interface/openmm/state.py +93 -0
biotite/interface/openmm/system.py +227 -0
biotite/interface/pymol/__init__.py +198 -0
biotite/interface/pymol/cgo.py +346 -0
biotite/interface/pymol/convert.py +185 -0
biotite/interface/pymol/display.py +267 -0
biotite/interface/pymol/object.py +1226 -0
biotite/interface/pymol/shapes.py +178 -0
biotite/interface/pymol/startup.py +169 -0
biotite/interface/rdkit/__init__.py +15 -0
biotite/interface/rdkit/mol.py +490 -0
biotite/interface/version.py +71 -0
biotite/interface/warning.py +19 -0
biotite/sequence/align/__init__.py +0 -4
biotite/sequence/align/alignment.py +49 -14
biotite/sequence/align/banded.cp310-win_amd64.pyd +0 -0
biotite/sequence/align/banded.pyx +26 -26
biotite/sequence/align/cigar.py +2 -2
biotite/sequence/align/kmeralphabet.cp310-win_amd64.pyd +0 -0
biotite/sequence/align/kmeralphabet.pyx +19 -2
biotite/sequence/align/kmersimilarity.cp310-win_amd64.pyd +0 -0
biotite/sequence/align/kmertable.cp310-win_amd64.pyd +0 -0
biotite/sequence/align/kmertable.pyx +58 -48
biotite/sequence/align/localgapped.cp310-win_amd64.pyd +0 -0
biotite/sequence/align/localgapped.pyx +47 -47
biotite/sequence/align/localungapped.cp310-win_amd64.pyd +0 -0
biotite/sequence/align/localungapped.pyx +10 -10
biotite/sequence/align/matrix.py +284 -57
biotite/sequence/align/matrix_data/3Di.mat +24 -0
biotite/sequence/align/matrix_data/PB.license +21 -0
biotite/sequence/align/matrix_data/PB.mat +18 -0
biotite/sequence/align/multiple.cp310-win_amd64.pyd +0 -0
biotite/sequence/align/pairwise.cp310-win_amd64.pyd +0 -0
biotite/sequence/align/pairwise.pyx +35 -35
biotite/sequence/align/permutation.cp310-win_amd64.pyd +0 -0
biotite/sequence/align/selector.cp310-win_amd64.pyd +0 -0
biotite/sequence/align/selector.pyx +2 -2
biotite/sequence/align/statistics.py +1 -1
biotite/sequence/align/tracetable.cp310-win_amd64.pyd +0 -0
biotite/sequence/alphabet.py +5 -2
biotite/sequence/annotation.py +19 -13
biotite/sequence/codec.cp310-win_amd64.pyd +0 -0
biotite/sequence/codon.py +1 -2
biotite/sequence/graphics/alignment.py +25 -39
biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
biotite/sequence/graphics/colorschemes.py +44 -11
biotite/sequence/graphics/dendrogram.py +4 -2
biotite/sequence/graphics/features.py +2 -2
biotite/sequence/graphics/logo.py +10 -12
biotite/sequence/io/fasta/convert.py +1 -2
biotite/sequence/io/fasta/file.py +1 -1
biotite/sequence/io/fastq/file.py +3 -3
biotite/sequence/io/genbank/file.py +3 -3
biotite/sequence/io/genbank/sequence.py +2 -0
biotite/sequence/io/gff/convert.py +1 -1
biotite/sequence/io/gff/file.py +1 -2
biotite/sequence/phylo/nj.cp310-win_amd64.pyd +0 -0
biotite/sequence/phylo/tree.cp310-win_amd64.pyd +0 -0
biotite/sequence/phylo/upgma.cp310-win_amd64.pyd +0 -0
biotite/sequence/profile.py +105 -29
biotite/sequence/search.py +0 -1
biotite/sequence/seqtypes.py +136 -8
biotite/sequence/sequence.py +1 -2
biotite/setup_ccd.py +197 -0
biotite/structure/__init__.py +6 -3
biotite/structure/alphabet/__init__.py +25 -0
biotite/structure/alphabet/encoder.py +332 -0
biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
biotite/structure/alphabet/i3d.py +109 -0
biotite/structure/alphabet/layers.py +86 -0
biotite/structure/alphabet/pb.license +21 -0
biotite/structure/alphabet/pb.py +170 -0
biotite/structure/alphabet/unkerasify.py +128 -0
biotite/structure/atoms.py +163 -66
biotite/structure/basepairs.py +26 -26
biotite/structure/bonds.cp310-win_amd64.pyd +0 -0
biotite/structure/bonds.pyx +79 -25
biotite/structure/box.py +19 -21
biotite/structure/celllist.cp310-win_amd64.pyd +0 -0
biotite/structure/celllist.pyx +83 -67
biotite/structure/chains.py +5 -37
biotite/structure/charges.cp310-win_amd64.pyd +0 -0
biotite/structure/compare.py +420 -13
biotite/structure/density.py +1 -1
biotite/structure/dotbracket.py +27 -28
biotite/structure/filter.py +8 -8
biotite/structure/geometry.py +74 -127
biotite/structure/hbond.py +17 -19
biotite/structure/info/__init__.py +1 -0
biotite/structure/info/atoms.py +24 -15
biotite/structure/info/bonds.py +12 -6
biotite/structure/info/ccd.py +125 -34
biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
biotite/structure/info/groups.py +62 -19
biotite/structure/info/masses.py +9 -6
biotite/structure/info/misc.py +15 -22
biotite/structure/info/radii.py +92 -22
biotite/structure/info/standardize.py +4 -4
biotite/structure/integrity.py +4 -6
biotite/structure/io/general.py +2 -2
biotite/structure/io/gro/file.py +8 -9
biotite/structure/io/mol/convert.py +1 -1
biotite/structure/io/mol/ctab.py +33 -28
biotite/structure/io/mol/mol.py +1 -1
biotite/structure/io/mol/sdf.py +80 -53
biotite/structure/io/pdb/convert.py +4 -3
biotite/structure/io/pdb/file.py +85 -25
biotite/structure/io/pdb/hybrid36.cp310-win_amd64.pyd +0 -0
biotite/structure/io/pdbqt/file.py +36 -36
biotite/structure/io/pdbx/__init__.py +1 -0
biotite/structure/io/pdbx/bcif.py +54 -15
biotite/structure/io/pdbx/cif.py +92 -66
biotite/structure/io/pdbx/component.py +15 -4
biotite/structure/io/pdbx/compress.py +321 -0
biotite/structure/io/pdbx/convert.py +410 -75
biotite/structure/io/pdbx/encoding.cp310-win_amd64.pyd +0 -0
biotite/structure/io/pdbx/encoding.pyx +98 -17
biotite/structure/io/trajfile.py +9 -6
biotite/structure/io/util.py +38 -0
biotite/structure/mechanics.py +0 -1
biotite/structure/molecules.py +141 -156
biotite/structure/pseudoknots.py +7 -13
biotite/structure/repair.py +2 -4
biotite/structure/residues.py +13 -24
biotite/structure/rings.py +335 -0
biotite/structure/sasa.cp310-win_amd64.pyd +0 -0
biotite/structure/sasa.pyx +2 -1
biotite/structure/segments.py +69 -11
biotite/structure/sequence.py +0 -1
biotite/structure/sse.py +0 -2
biotite/structure/superimpose.py +74 -62
biotite/structure/tm.py +581 -0
biotite/structure/transform.py +12 -25
biotite/structure/util.py +76 -4
biotite/version.py +9 -4
biotite/visualize.py +111 -1
{biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/METADATA +6 -2
{biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/RECORD +173 -143
biotite/structure/info/ccd/README.rst +0 -8
biotite/structure/info/ccd/amino_acids.txt +0 -1663
biotite/structure/info/ccd/carbohydrates.txt +0 -1135
biotite/structure/info/ccd/nucleotides.txt +0 -798
{biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/WHEEL +0 -0
{biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/licenses/LICENSE.rst +0 -0

biotite/structure/io/pdbx/convert.py CHANGED Viewed

@@ -3,7 +3,7 @@
 # information.
 __name__ = "biotite.structure.io.pdbx"
-__author__ = "Fabrice Allain, Patrick Kunzmann"
+__author__ = "Fabrice Allain, Patrick Kunzmann, Cheyenne Ziegler"
 __all__ = [
     "get_sequence",
     "get_model_count",
@@ -13,6 +13,7 @@ __all__ = [
     "set_component",
     "list_assemblies",
     "get_assembly",
+    "get_sse",
 ]
 import itertools
@@ -24,6 +25,10 @@ from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat
 from biotite.structure.bonds import BondList, BondType, connect_via_residue_names
 from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
 from biotite.structure.error import BadStructureError
+from biotite.structure.filter import _canonical_aa_list as canonical_aa_list
+from biotite.structure.filter import (
+    _canonical_nucleotide_list as canonical_nucleotide_list,
+)
 from biotite.structure.filter import (
     filter_first_altloc,
     filter_highest_occupancy_altloc,
@@ -36,32 +41,38 @@ from biotite.structure.io.pdbx.bcif import (
 from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
 from biotite.structure.io.pdbx.component import MaskValue
 from biotite.structure.io.pdbx.encoding import StringArrayEncoding
-from biotite.structure.residues import get_residue_count, get_residue_starts_for
+from biotite.structure.residues import (
+    get_residue_count,
+    get_residue_positions,
+    get_residue_starts_for,
+)
 from biotite.structure.util import matrix_rotate
-# Cond types in `struct_conn` category that refer to covalent bonds
-PDBX_COVALENT_TYPES = [
-    "covale",
-    "covale_base",
-    "covale_phosphate",
-    "covale_sugar",
-    "disulf",
-    "modres",
-    "modres_link",
-    "metalc",
-]
-# Map 'struct_conn' bond orders to 'BondType'...
-PDBX_BOND_ORDER_TO_TYPE = {
-    "": BondType.ANY,
-    "sing": BondType.SINGLE,
-    "doub": BondType.DOUBLE,
-    "trip": BondType.TRIPLE,
-    "quad": BondType.QUADRUPLE,
+# Bond types in `struct_conn` category that refer to covalent bonds
+PDBX_BOND_TYPE_ID_TO_TYPE = {
+    # Although a covalent bond could in theory have a higher bond order,
+    # in practice inter-residue bonds are always single
+    "covale": BondType.SINGLE,
+    "covale_base": BondType.SINGLE,
+    "covale_phosphate": BondType.SINGLE,
+    "covale_sugar": BondType.SINGLE,
+    "disulf": BondType.SINGLE,
+    "modres": BondType.SINGLE,
+    "modres_link": BondType.SINGLE,
+    "metalc": BondType.COORDINATION,
+}
+PDBX_BOND_TYPE_TO_TYPE_ID = {
+    BondType.ANY: "covale",
+    BondType.SINGLE: "covale",
+    BondType.DOUBLE: "covale",
+    BondType.TRIPLE: "covale",
+    BondType.QUADRUPLE: "covale",
+    BondType.AROMATIC_SINGLE: "covale",
+    BondType.AROMATIC_DOUBLE: "covale",
+    BondType.AROMATIC_TRIPLE: "covale",
+    BondType.COORDINATION: "metalc",
 }
-# ...and vice versa
 PDBX_BOND_TYPE_TO_ORDER = {
-    # 'ANY' is masked later, it is merely added here to avoid a KeyError
-    BondType.ANY: "",
     BondType.SINGLE: "sing",
     BondType.DOUBLE: "doub",
     BondType.TRIPLE: "trip",
@@ -69,6 +80,10 @@ PDBX_BOND_TYPE_TO_ORDER = {
     BondType.AROMATIC_SINGLE: "sing",
     BondType.AROMATIC_DOUBLE: "doub",
     BondType.AROMATIC_TRIPLE: "trip",
+    # These are masked later; they are merely added here to avoid a KeyError
+    BondType.ANY: "",
+    BondType.AROMATIC: "",
+    BondType.COORDINATION: "",
 }
 # Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'...
 COMP_BOND_ORDER_TO_TYPE = {
@@ -79,11 +94,19 @@ COMP_BOND_ORDER_TO_TYPE = {
     ("SING", "Y"): BondType.AROMATIC_SINGLE,
     ("DOUB", "Y"): BondType.AROMATIC_DOUBLE,
     ("TRIP", "Y"): BondType.AROMATIC_TRIPLE,
+    ("AROM", "Y"): BondType.AROMATIC,
 }
 # ...and vice versa
 COMP_BOND_TYPE_TO_ORDER = {
     bond_type: order for order, bond_type in COMP_BOND_ORDER_TO_TYPE.items()
 }
+CANONICAL_RESIDUE_LIST = canonical_aa_list + canonical_nucleotide_list
+# It was observed that when the number of rows in `atom_site` and `struct_conn`
+# exceeds a certain threshold,
+# a dictionary approach is less computation- and memory-intensive than the dense
+# vectorized approach.
+# https://github.com/biotite-dev/biotite/pull/765#issuecomment-2708867357
+FIND_MATCHES_SWITCH_THRESHOLD = 4000000
 _proteinseq_type_list = ["polypeptide(D)", "polypeptide(L)"]
 _nucleotideseq_type_list = [
@@ -146,8 +169,8 @@ def get_sequence(pdbx_file, data_block=None):
     -------
     sequence_dict : Dictionary of Sequences
         Dictionary keys are derived from ``entity_poly.pdbx_strand_id``
-        (often equivalent to chain_id and atom_site.auth_asym_id
-        in most cases). Dictionary values are sequences.
+        (equivalent to ``atom_site.auth_asym_id``).
+        Dictionary values are sequences.
     Notes
     -----
@@ -203,9 +226,7 @@ def get_model_count(pdbx_file, data_block=None):
         The number of models.
     """
     block = _get_block(pdbx_file, data_block)
-    return len(
-        _get_model_starts(block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32))
-    )
+    return len(np.unique((block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32))))
 def get_structure(
@@ -296,7 +317,6 @@ def get_structure(
     >>> arr = get_structure(file, model=1)
     >>> print(len(arr))
     304
     """
     block = _get_block(pdbx_file, data_block)
@@ -307,13 +327,12 @@ def get_structure(
         raise InvalidFileError("Missing 'atom_site' category in file")
     models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
-    model_starts = _get_model_starts(models)
-    model_count = len(model_starts)
+    model_count = len(np.unique(models))
     atom_count = len(models)
     if model is None:
         # For a stack, the annotations are derived from the first model
-        model_atom_site = _filter_model(atom_site, model_starts, 1)
+        model_atom_site = _filter_model(atom_site, 1)
         # Any field of the category would work here to get the length
         model_length = model_atom_site.row_count
         atoms = AtomArrayStack(model_count, model_length)
@@ -359,7 +378,7 @@ def get_structure(
                 f"the given model {model} does not exist"
             )
-        model_atom_site = _filter_model(atom_site, model_starts, model)
+        model_atom_site = _filter_model(atom_site, model)
         # Any field of the category would work here to get the length
         model_length = model_atom_site.row_count
         atoms = AtomArray(model_length)
@@ -475,16 +494,53 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
     array.set_annotation("element", atom_site["type_symbol"].as_array(str))
     if "atom_id" in extra_fields:
-        array.set_annotation("atom_id", atom_site["id"].as_array(int))
+        if "id" in atom_site:
+            array.set_annotation("atom_id", atom_site["id"].as_array(int))
+        else:
+            warnings.warn(
+                "Missing 'id' in 'atom_site' category. 'atom_id' generated automatically.",
+                UserWarning,
+            )
+            array.set_annotation("atom_id", np.arange(array.array_length()))
         extra_fields.remove("atom_id")
     if "b_factor" in extra_fields:
-        array.set_annotation("b_factor", atom_site["B_iso_or_equiv"].as_array(float))
+        if "B_iso_or_equiv" in atom_site:
+            array.set_annotation(
+                "b_factor", atom_site["B_iso_or_equiv"].as_array(float)
+            )
+        else:
+            warnings.warn(
+                "Missing 'B_iso_or_equiv' in 'atom_site' category. 'b_factor' will be set to `nan`.",
+                UserWarning,
+            )
+            array.set_annotation("b_factor", np.full(array.array_length(), np.nan))
         extra_fields.remove("b_factor")
     if "occupancy" in extra_fields:
-        array.set_annotation("occupancy", atom_site["occupancy"].as_array(float))
+        if "occupancy" in atom_site:
+            array.set_annotation("occupancy", atom_site["occupancy"].as_array(float))
+        else:
+            warnings.warn(
+                "Missing 'occupancy' in 'atom_site' category. 'occupancy' will be assumed to be 1.0",
+                UserWarning,
+            )
+            array.set_annotation(
+                "occupancy", np.ones(array.array_length(), dtype=float)
+            )
         extra_fields.remove("occupancy")
     if "charge" in extra_fields:
-        array.set_annotation("charge", atom_site["pdbx_formal_charge"].as_array(int, 0))
+        if "pdbx_formal_charge" in atom_site:
+            array.set_annotation(
+                "charge",
+                atom_site["pdbx_formal_charge"].as_array(
+                    int, 0
+                ),  # masked values are set to 0
+            )
+        else:
+            warnings.warn(
+                "Missing 'pdbx_formal_charge' in 'atom_site' category. 'charge' will be set to 0",
+                UserWarning,
+            )
+            array.set_annotation("charge", np.zeros(array.array_length(), dtype=int))
         extra_fields.remove("charge")
     # Handle all remaining custom fields
@@ -536,7 +592,8 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
     ]
     covale_mask = np.isin(
-        struct_conn["conn_type_id"].as_array(str), PDBX_COVALENT_TYPES
+        struct_conn["conn_type_id"].as_array(str),
+        list(PDBX_BOND_TYPE_ID_TO_TYPE.keys()),
     )
     if "ptnr1_symmetry" in struct_conn:
         covale_mask &= struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
@@ -576,13 +633,14 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
     atoms_indices_1 = atoms_indices_1[mapping_exists_mask]
     atoms_indices_2 = atoms_indices_2[mapping_exists_mask]
-    # Interpret missing values as ANY bonds
-    bond_order = struct_conn["pdbx_value_order"].as_array(str, "")
+    bond_type_id = struct_conn["conn_type_id"].as_array()
     # Consecutively apply the same masks as applied to the atom indices
     # Logical combination does not work here,
     # as the second mask was created based on already filtered data
-    bond_order = bond_order[covale_mask][mapping_exists_mask]
-    bond_types = [PDBX_BOND_ORDER_TO_TYPE[order] for order in bond_order]
+    bond_type_id = bond_type_id[covale_mask][mapping_exists_mask]
+    # The type ID is always present in the dictionary,
+    # as it was used to filter the applicable bonds
+    bond_types = [PDBX_BOND_TYPE_ID_TO_TYPE[type_id] for type_id in bond_type_id]
     return BondList(
         atom_site.row_count,
@@ -593,9 +651,20 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
 def _find_matches(query_arrays, reference_arrays):
     """
     For each index in the `query_arrays` find the indices in the
-    `reference_arrays` where all query values the reference counterpart.
+    `reference_arrays` where all query values match the reference counterpart.
     If no match is found for a query, the corresponding index is -1.
     """
+    if (
+        query_arrays[0].shape[0] * reference_arrays[0].shape[0]
+        <= FIND_MATCHES_SWITCH_THRESHOLD
+    ):
+        match_indices = _find_matches_by_dense_array(query_arrays, reference_arrays)
+    else:
+        match_indices = _find_matches_by_dict(query_arrays, reference_arrays)
+    return match_indices
+def _find_matches_by_dense_array(query_arrays, reference_arrays):
     match_masks_for_all_columns = np.stack(
         [
             query[:, np.newaxis] == reference[np.newaxis, :]
@@ -623,6 +692,38 @@ def _find_matches(query_arrays, reference_arrays):
     return match_indices
+def _find_matches_by_dict(query_arrays, reference_arrays):
+    # Convert reference arrays to a dictionary for O(1) lookups
+    reference_dict = {}
+    ambiguous_keys = set()
+    for ref_idx, ref_row in enumerate(zip(*reference_arrays)):
+        ref_key = tuple(ref_row)
+        if ref_key in reference_dict:
+            ambiguous_keys.add(ref_key)
+            continue
+        reference_dict[ref_key] = ref_idx
+    match_indices = []
+    for query_idx, query_row in enumerate(zip(*query_arrays)):
+        query_key = tuple(query_row)
+        occurrence = reference_dict.get(query_key)
+        if occurrence is None:
+            # -1 indicates that no match was found in the reference
+            match_indices.append(-1)
+        elif query_key in ambiguous_keys:
+            # The query cannot be uniquely matched to an atom in the reference
+            raise InvalidFileError(
+                f"The covalent bond in the 'struct_conn' category at index "
+                f"{query_idx} cannot be unambiguously assigned to atoms in "
+                f"the 'atom_site' category"
+            )
+        else:
+            match_indices.append(occurrence)
+    return np.array(match_indices)
 def _get_struct_conn_col_name(col_name, partner):
     """
     For a column name in ``atom_site`` get the corresponding column name
@@ -661,21 +762,26 @@ def _filter_altloc(array, atom_site, altloc):
         raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
-def _get_model_starts(model_array):
-    """
-    Get the start index for each model in the arrays of the
-    ``atom_site`` category.
-    """
-    _, indices = np.unique(model_array, return_index=True)
-    indices.sort()
-    return indices
-def _filter_model(atom_site, model_starts, model):
+def _filter_model(atom_site, model):
     """
     Reduce the ``atom_site`` category to the values for the given
     model.
+    Parameters
+    ----------
+    atom_site : CIFCategory or BinaryCIFCategory
+        ``atom_site`` category containing all models.
+    model : int
+        The model to be selected.
+    Returns
+    -------
+    atom_site : CIFCategory or BinaryCIFCategory
+        The ``atom_site`` category containing only the selected model.
     """
+    models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
+    _, model_starts = np.unique(models, return_index=True)
+    model_starts.sort()
     # Append exclusive stop
     model_starts = np.append(model_starts, [atom_site.row_count])
     # Indexing starts at 0, but model number starts at 1
@@ -703,7 +809,13 @@ def _get_box(block):
     return vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
-def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
+def set_structure(
+    pdbx_file,
+    array,
+    data_block=None,
+    include_bonds=False,
+    extra_fields=[],
+):
     """
     Set the ``atom_site`` category with atom information from an
     :class:`AtomArray` or :class:`AtomArrayStack`.
@@ -737,6 +849,10 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
         category.
         Inter-residue bonds will be written into the ``struct_conn``
         independent of this parameter.
+    extra_fields : list of str, optional
+        List of additional annotation categories of `array`
+        that should be written into the ``atom_site`` category of the file.
+        Default is an empty list.
     Notes
     -----
@@ -752,7 +868,6 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
     >>> file = CIFFile()
     >>> set_structure(file, atom_array)
     >>> file.write(os.path.join(path_to_directory, "structure.cif"))
     """
     _check_non_empty(array)
@@ -773,7 +888,11 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
     )
     atom_site["label_comp_id"] = np.copy(array.res_name)
     atom_site["label_asym_id"] = np.copy(array.chain_id)
-    atom_site["label_entity_id"] = _determine_entity_id(array.chain_id)
+    atom_site["label_entity_id"] = (
+        np.copy(array.label_entity_id)
+        if "label_entity_id" in array.get_annotation_categories()
+        else _determine_entity_id(array.chain_id)
+    )
     atom_site["label_seq_id"] = np.copy(array.res_id)
     atom_site["pdbx_PDB_ins_code"] = Column(
         np.copy(array.ins_code),
@@ -797,6 +916,32 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
             np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT),
         )
+    # Handle all remaining custom fields
+    if len(extra_fields) > 0:
+        # ... check to avoid clashes with standard annotations
+        _standard_annotations = [
+            "hetero",
+            "element",
+            "atom_name",
+            "res_name",
+            "chain_id",
+            "res_id",
+            "ins_code",
+            "atom_id",
+            "b_factor",
+            "occupancy",
+            "charge",
+        ]
+        _reserved_annotation_names = list(atom_site.keys()) + _standard_annotations
+        for annot in extra_fields:
+            if annot in _reserved_annotation_names:
+                raise ValueError(
+                    f"Annotation name '{annot}' is reserved and cannot be written to as extra field. "
+                    "Please choose another name."
+                )
+            atom_site[annot] = np.copy(array.get_annotation(annot))
     if array.bonds is not None:
         struct_conn = _set_inter_residue_bonds(array, atom_site)
         if struct_conn is not None:
@@ -1021,13 +1166,21 @@ def _set_inter_residue_bonds(array, atom_site):
     if len(bond_array) == 0:
         return None
+    # Filter out 'standard' links, i.e. backbone bonds between adjacent canonical
+    # nucleotide/amino acid residues
+    bond_array = bond_array[~_filter_canonical_links(array, bond_array)]
+    if len(bond_array) == 0:
+        return None
     struct_conn = Category()
     struct_conn["id"] = np.arange(1, len(bond_array) + 1)
-    struct_conn["conn_type_id"] = np.full(len(bond_array), "covale")
+    struct_conn["conn_type_id"] = [
+        PDBX_BOND_TYPE_TO_TYPE_ID[btype] for btype in bond_array[:, 2]
+    ]
     struct_conn["pdbx_value_order"] = Column(
         np.array([PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_array[:, 2]]),
         np.where(
-            bond_array[:, 2] == BondType.ANY,
+            np.isin(bond_array[:, 2], (BondType.ANY, BondType.COORDINATION)),
             MaskValue.MISSING,
             MaskValue.PRESENT,
         ),
@@ -1063,7 +1216,34 @@ def _filter_bonds(array, connection):
         raise ValueError("Invalid 'connection' option")
-def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=None):
+def _filter_canonical_links(array, bond_array):
+    """
+    Get a boolean mask that marks backbone bonds between adjacent canonical
+    amino acid or nucleotide residues.
+    """
+    # Get the residue index for each bonded atom
+    residue_indices = get_residue_positions(array, bond_array[:, :2].flatten()).reshape(
+        -1, 2
+    )
+    return (
+        # Must be canonical residues
+        np.isin(array.res_name[bond_array[:, 0]], CANONICAL_RESIDUE_LIST) &
+        np.isin(array.res_name[bond_array[:, 1]], CANONICAL_RESIDUE_LIST) &
+        # Must be backbone bond
+        np.isin(array.atom_name[bond_array[:, 0]], ("C", "O3'")) &
+        np.isin(array.atom_name[bond_array[:, 1]], ("N", "P")) &
+        # Must connect adjacent residues
+        residue_indices[:, 1] - residue_indices[:, 0] == 1
+    )  # fmt: skip
+def get_component(
+    pdbx_file,
+    data_block=None,
+    use_ideal_coord=True,
+    res_name=None,
+    allow_missing_coord=False,
+):
     """
     Create an :class:`AtomArray` for a chemical component from the
     ``chem_comp_atom`` and, if available, the ``chem_comp_bond``
@@ -1091,6 +1271,11 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
         In this case, the component with the given residue name is
         read.
         By default, all rows would be read in this case.
+    allow_missing_coord : bool, optional
+        Whether to allow missing coordinate values in components.
+        If ``True``, these will be represented as ``nan`` values.
+        If ``False``, a ``ValueError`` is raised when missing coordinates
+        are encountered.
     Returns
     -------
@@ -1161,17 +1346,29 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
         # Swap with the fallback option
         coord_fields, alt_coord_fields = alt_coord_fields, coord_fields
     try:
-        for i, field in enumerate(coord_fields):
-            array.coord[:, i] = atom_category[field].as_array(np.float32)
-    except KeyError as err:
-        key = err.args[0]
-        warnings.warn(
-            f"Attribute '{key}' not found within 'chem_comp_atom' category. "
-            f"The fallback coordinates will be used instead",
-            UserWarning,
+        array.coord = _parse_component_coordinates(
+            [atom_category[field] for field in coord_fields]
+        )
+    except Exception as err:
+        if isinstance(err, KeyError):
+            key = err.args[0]
+            warnings.warn(
+                f"Attribute '{key}' not found within 'chem_comp_atom' category. "
+                f"The fallback coordinates will be used instead",
+                UserWarning,
+            )
+        elif isinstance(err, ValueError):
+            warnings.warn(
+                "The coordinates are missing for some atoms. "
+                "The fallback coordinates will be used instead",
+                UserWarning,
+            )
+        else:
+            raise
+        array.coord = _parse_component_coordinates(
+            [atom_category[field] for field in alt_coord_fields],
+            allow_missing=allow_missing_coord,
         )
-        for i, field in enumerate(alt_coord_fields):
-            array.coord[:, i] = atom_category[field].as_array(np.float32)
     try:
         bond_category = block["chem_comp_bond"]
@@ -1181,7 +1378,7 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
             )
     except KeyError:
         warnings.warn(
-            "Category 'chem_comp_bond' not found. " "No bonds will be parsed",
+            "Category 'chem_comp_bond' not found. No bonds will be parsed",
             UserWarning,
         )
     else:
@@ -1201,6 +1398,23 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
     return array
+def _parse_component_coordinates(coord_columns, allow_missing=False):
+    coord = np.zeros((len(coord_columns[0]), 3), dtype=np.float32)
+    for i, column in enumerate(coord_columns):
+        if column.mask is not None and column.mask.array.any():
+            if allow_missing:
+                warnings.warn(
+                    "Missing coordinates for some atoms. Those will be set to nan",
+                    UserWarning,
+                )
+            else:
+                raise ValueError(
+                    "Missing coordinates for some atoms",
+                )
+        coord[:, i] = column.as_array(np.float32, masked_value=np.nan)
+    return coord
 def set_component(pdbx_file, array, data_block=None):
     """
     Set the ``chem_comp_atom`` and, if bonds are available,
@@ -1305,6 +1519,7 @@ def list_assemblies(pdbx_file, data_block=None):
     Examples
     --------
     >>> import os.path
     >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
     >>> assembly_ids = list_assemblies(file)
@@ -1417,7 +1632,10 @@ def get_assembly(
     Returns
     -------
     assembly : AtomArray or AtomArrayStack
-        The assembly. The return type depends on the `model` parameter.
+        The assembly.
+        The return type depends on the `model` parameter.
+        Contains the `sym_id` annotation, which enumerates the copies of the asymmetric
+        unit in the assembly.
     Examples
     --------
@@ -1506,7 +1724,6 @@ def _apply_transformations(structure, transformation_dict, operations):
     """
     # Additional first dimension for 'structure.repeat()'
     assembly_coord = np.zeros((len(operations),) + structure.coord.shape)
     # Apply corresponding transformation for each copy in the assembly
     for i, operation in enumerate(operations):
         coord = structure.coord
@@ -1520,7 +1737,11 @@ def _apply_transformations(structure, transformation_dict, operations):
             coord += translation_vector
         assembly_coord[i] = coord
-    return repeat(structure, assembly_coord)
+    assembly = repeat(structure, assembly_coord)
+    assembly.set_annotation(
+        "sym_id", np.repeat(np.arange(len(operations)), structure.array_length())
+    )
+    return assembly
 def _get_transformations(struct_oper):
@@ -1596,4 +1817,118 @@ def _convert_string_to_sequence(string, stype):
     elif stype in _other_type_list:
         return None
     else:
-        raise InvalidFileError("mmCIF _entity_poly.type unsupported" " type: " + stype)
+        raise InvalidFileError("mmCIF _entity_poly.type unsupported type: " + stype)
+def get_sse(pdbx_file, data_block=None, match_model=None):
+    """
+    Get the secondary structure from a PDBx file.
+    Parameters
+    ----------
+    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
+        The file object.
+        The following categories are required:
+        - ``entity_poly``
+        - ``struct_conf`` (if alpha-helices are present)
+        - ``struct_sheet_range`` (if beta-strands are present)
+        - ``atom_site`` (if `match_model` is set)
+    data_block : str, optional
+        The name of the data block.
+        Default is the first (and most times only) data block of the
+        file.
+        If the data block object is passed directly to `pdbx_file`,
+        this parameter is ignored.
+    match_model : None, optional
+        If a model number is given, only secondary structure elements for residues are
+        kept, that are resolved in the given model.
+        This means secondary structure elements for residues that would not appear
+        in a corresponding :class:`AtomArray` from :func:`get_structure()` are removed.
+        By default, all residues in the sequence are kept.
+    Returns
+    -------
+    sse_dict : dict of str -> ndarray, dtype=str
+        The dictionary maps the chain ID (derived from ``auth_asym_id``) to the
+        secondary structure of the respective chain.
+        - ``"a"``: alpha-helix
+        - ``"b"``: beta-strand
+        - ``"c"``: coil or not an amino acid
+        Each secondary structure element corresponds to the ``label_seq_id`` of the
+        ``atom_site`` category.
+        This means that the 0-th position of the array corresponds to the residue
+        in ``atom_site`` with ``label_seq_id`` ``1``.
+    Examples
+    --------
+    >>> import os.path
+    >>> file = CIFFile.read(os.path.join(path_to_structures, "1aki.cif"))
+    >>> sse = get_sse(file, match_model=1)
+    >>> print(sse)
+    {'A': array(['c', 'c', 'c', 'c', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a',
+                 'a', 'c', 'c', 'c', 'c', 'c', 'a', 'a', 'a', 'c', 'c', 'a', 'a',
+                 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c',
+                 'c', 'c', 'c', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'c', 'b', 'b',
+                 'b', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c',
+                 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c',
+                 'c', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'a', 'a', 'a',
+                 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'a',
+                 'a', 'a', 'a', 'c', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c',
+                 'c', 'c', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'c', 'c'],
+                 dtype='<U1')}
+    If only secondary structure elements for resolved residues are requested, the length
+    of the returned array matches the number of peptide residues in the structure.
+    >>> file = CIFFile.read(os.path.join(path_to_structures, "3o5r.cif"))
+    >>> print(len(get_sse(file, match_model=1)["A"]))
+    128
+    >>> atoms = get_structure(file, model=1)
+    >>> atoms = atoms[filter_amino_acids(atoms) & (atoms.chain_id == "A")]
+    >>> print(get_residue_count(atoms))
+    128
+    """
+    block = _get_block(pdbx_file, data_block)
+    # Init all chains with "c" for coil
+    sse_dict = {
+        chain_id: np.repeat("c", len(sequence))
+        for chain_id, sequence in get_sequence(block).items()
+    }
+    # Populate SSE arrays with helices and strands
+    for sse_symbol, category_name in [
+        ("a", "struct_conf"),
+        ("b", "struct_sheet_range"),
+    ]:
+        if category_name in block:
+            category = block[category_name]
+            chains = category["beg_auth_asym_id"].as_array(str)
+            start_positions = category["beg_label_seq_id"].as_array(int)
+            end_positions = category["end_label_seq_id"].as_array(int)
+            # set alpha helix positions
+            for chain, start, end in zip(chains, start_positions, end_positions):
+                # Translate the 1-based positions from PDBx into 0-based array indices
+                sse_dict[chain][start - 1 : end] = sse_symbol
+    if match_model is not None:
+        model_atom_site = _filter_model(block["atom_site"], match_model)
+        chain_ids = model_atom_site["auth_asym_id"].as_array(str)
+        res_ids = model_atom_site["label_seq_id"].as_array(int, masked_value=-1)
+        # Filter out masked residues, i.e. residues not part of a chain
+        mask = res_ids != -1
+        chain_ids = chain_ids[mask]
+        res_ids = res_ids[mask]
+        for chain_id, sse in sse_dict.items():
+            res_ids_in_chain = res_ids[chain_ids == chain_id]
+            # Transform from 1-based residue ID to 0-based index
+            indices = np.unique(res_ids_in_chain) - 1
+            sse_dict[chain_id] = sse[indices]
+    return sse_dict