PyPI - biotite - Versions diffs - 0.40.0__cp311-cp311-macosx_11_0_arm64.whl → 0.41.0__cp311-cp311-macosx_11_0_arm64.whl - Mend

biotite 0.40.0__cp311-cp311-macosx_11_0_arm64.whl → 0.41.0__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biotite might be problematic. Click here for more details.

Files changed (90) hide show

biotite/__init__.py +1 -1
biotite/database/pubchem/download.py +23 -23
biotite/database/pubchem/query.py +7 -7
biotite/file.py +17 -9
biotite/sequence/align/banded.c +119 -119
biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
biotite/sequence/align/cigar.py +60 -15
biotite/sequence/align/kmeralphabet.c +119 -119
biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
biotite/sequence/align/kmersimilarity.c +119 -119
biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
biotite/sequence/align/kmertable.cpp +119 -119
biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
biotite/sequence/align/localgapped.c +119 -119
biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
biotite/sequence/align/localungapped.c +119 -119
biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
biotite/sequence/align/multiple.c +119 -119
biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
biotite/sequence/align/pairwise.c +119 -119
biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
biotite/sequence/align/permutation.c +119 -119
biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
biotite/sequence/align/selector.c +119 -119
biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
biotite/sequence/align/tracetable.c +119 -119
biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
biotite/sequence/annotation.py +2 -2
biotite/sequence/codec.c +119 -119
biotite/sequence/codec.cpython-311-darwin.so +0 -0
biotite/sequence/io/fasta/convert.py +27 -24
biotite/sequence/phylo/nj.c +119 -119
biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
biotite/sequence/phylo/tree.c +119 -119
biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
biotite/sequence/phylo/upgma.c +119 -119
biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
biotite/structure/__init__.py +2 -0
biotite/structure/bonds.c +1124 -915
biotite/structure/bonds.cpython-311-darwin.so +0 -0
biotite/structure/celllist.c +119 -119
biotite/structure/celllist.cpython-311-darwin.so +0 -0
biotite/structure/charges.c +119 -119
biotite/structure/charges.cpython-311-darwin.so +0 -0
biotite/structure/dotbracket.py +2 -0
biotite/structure/info/atoms.py +6 -1
biotite/structure/info/bonds.py +1 -1
biotite/structure/info/ccd/amino_acids.txt +17 -0
biotite/structure/info/ccd/carbohydrates.txt +2 -0
biotite/structure/info/ccd/components.bcif +0 -0
biotite/structure/info/ccd/nucleotides.txt +1 -0
biotite/structure/info/misc.py +69 -5
biotite/structure/integrity.py +19 -70
biotite/structure/io/ctab.py +12 -106
biotite/structure/io/general.py +157 -165
biotite/structure/io/gro/file.py +16 -16
biotite/structure/io/mmtf/convertarray.c +119 -119
biotite/structure/io/mmtf/convertarray.cpython-311-darwin.so +0 -0
biotite/structure/io/mmtf/convertfile.c +119 -119
biotite/structure/io/mmtf/convertfile.cpython-311-darwin.so +0 -0
biotite/structure/io/mmtf/decode.c +119 -119
biotite/structure/io/mmtf/decode.cpython-311-darwin.so +0 -0
biotite/structure/io/mmtf/encode.c +119 -119
biotite/structure/io/mmtf/encode.cpython-311-darwin.so +0 -0
biotite/structure/io/mol/__init__.py +4 -2
biotite/structure/io/mol/convert.py +71 -7
biotite/structure/io/mol/ctab.py +414 -0
biotite/structure/io/mol/header.py +116 -0
biotite/structure/io/mol/{file.py → mol.py} +69 -82
biotite/structure/io/mol/sdf.py +909 -0
biotite/structure/io/pdb/file.py +84 -31
biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
biotite/structure/io/pdbx/__init__.py +0 -1
biotite/structure/io/pdbx/bcif.py +2 -3
biotite/structure/io/pdbx/cif.py +9 -5
biotite/structure/io/pdbx/component.py +4 -1
biotite/structure/io/pdbx/convert.py +203 -79
biotite/structure/io/pdbx/encoding.c +119 -119
biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
biotite/structure/repair.py +253 -0
biotite/structure/sasa.c +119 -119
biotite/structure/sasa.cpython-311-darwin.so +0 -0
biotite/structure/sequence.py +112 -0
biotite/structure/superimpose.py +472 -13
{biotite-0.40.0.dist-info → biotite-0.41.0.dist-info}/METADATA +2 -2
{biotite-0.40.0.dist-info → biotite-0.41.0.dist-info}/RECORD +89 -85
biotite/structure/io/pdbx/error.py +0 -14
{biotite-0.40.0.dist-info → biotite-0.41.0.dist-info}/LICENSE.rst +0 -0
{biotite-0.40.0.dist-info → biotite-0.41.0.dist-info}/WHEEL +0 -0
{biotite-0.40.0.dist-info → biotite-0.41.0.dist-info}/top_level.txt +0 -0

biotite/structure/io/pdbx/convert.py CHANGED Viewed

@@ -240,11 +240,11 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
         If set to true, a :class:`BondList` will be created for the
         resulting :class:`AtomArray` containing the bond information
         from the file.
-        Bonds, whose order could not be determined from the
-        *Chemical Component Dictionary*
-        (e.g. especially inter-residue bonds),
-        have :attr:`BondType.ANY`, since the PDB format itself does
-        not support bond orders.
+        Inter-residue bonds will be read from the ``struct_conn``
+        category.
+        Intra-residue bonds will be read from the ``chem_comp_bond``
+        category, if available, otherwise they will be derived from the
+        Chemical Component Dictionary.
     Returns
     -------
@@ -279,11 +279,7 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
         model_atom_site = _filter_model(atom_site, model_starts, 1)
         # Any field of the category would work here to get the length
         model_length = model_atom_site.row_count
-        stack = AtomArrayStack(model_count, model_length)
-        _fill_annotations(
-            stack, model_atom_site, extra_fields, use_author_fields
-        )
+        atoms = AtomArrayStack(model_count, model_length)
         # Check if each model has the same amount of atoms
         # If not, raise exception
@@ -294,29 +290,17 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
                 "instead"
             )
-        stack.coord[:, :, 0] = atom_site["Cartn_x"].as_array(np.float32) \
+        atoms.coord[:, :, 0] = atom_site["Cartn_x"].as_array(np.float32) \
                               .reshape((model_count, model_length))
-        stack.coord[:, :, 1] = atom_site["Cartn_y"].as_array(np.float32) \
+        atoms.coord[:, :, 1] = atom_site["Cartn_y"].as_array(np.float32) \
                               .reshape((model_count, model_length))
-        stack.coord[:, :, 2] = atom_site["Cartn_z"].as_array(np.float32) \
+        atoms.coord[:, :, 2] = atom_site["Cartn_z"].as_array(np.float32) \
                               .reshape((model_count, model_length))
-        if include_bonds:
-            bonds = connect_via_residue_names(stack)
-            if "struct_conn" in block:
-                bonds = bonds.merge(_parse_inter_residue_bonds(
-                    model_atom_site, block["struct_conn"]
-                ))
-            stack.bonds = bonds
-        stack = _filter_altloc(stack, model_atom_site, altloc)
         box = _get_box(block)
         if box is not None:
             # Duplicate same box for each model
-            stack.box = np.repeat(box[np.newaxis, ...], model_count, axis=0)
-        return stack
+            atoms.box = np.repeat(box[np.newaxis, ...], model_count, axis=0)
     else:
         if model == 0:
@@ -332,29 +316,44 @@ def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
         model_atom_site = _filter_model(atom_site, model_starts, model)
         # Any field of the category would work here to get the length
         model_length = model_atom_site.row_count
-        array = AtomArray(model_length)
-        _fill_annotations(
-            array, model_atom_site, extra_fields, use_author_fields
-        )
+        atoms = AtomArray(model_length)
-        array.coord[:, 0] = model_atom_site["Cartn_x"].as_array(np.float32)
-        array.coord[:, 1] = model_atom_site["Cartn_y"].as_array(np.float32)
-        array.coord[:, 2] = model_atom_site["Cartn_z"].as_array(np.float32)
-        if include_bonds:
-            bonds = connect_via_residue_names(array)
-            if "struct_conn" in block:
-                bonds = bonds.merge(_parse_inter_residue_bonds(
-                    model_atom_site, block["struct_conn"]
-                ))
-            array.bonds = bonds
+        atoms.coord[:, 0] = model_atom_site["Cartn_x"].as_array(np.float32)
+        atoms.coord[:, 1] = model_atom_site["Cartn_y"].as_array(np.float32)
+        atoms.coord[:, 2] = model_atom_site["Cartn_z"].as_array(np.float32)
-        array = _filter_altloc(array, model_atom_site, altloc)
+        atoms.box = _get_box(block)
-        array.box = _get_box(block)
+    # The part below is the same for both AtomArray and AtomArrayStack
+    _fill_annotations(
+        atoms, model_atom_site, extra_fields, use_author_fields
+    )
+    if include_bonds:
+        if "chem_comp_bond" in block:
+            try:
+                custom_bond_dict = _parse_intra_residue_bonds(
+                    block["chem_comp_bond"]
+                )
+            except KeyError:
+                warnings.warn(
+                    "The 'chem_comp_bond' category has missing columns, "
+                    "falling back to using Chemical Component Dictionary",
+                    UserWarning
+                )
+                custom_bond_dict = None
+            bonds = connect_via_residue_names(
+                atoms, custom_bond_dict=custom_bond_dict
+            )
+        else:
+            bonds = connect_via_residue_names(atoms)
+        if "struct_conn" in block:
+            bonds = bonds.merge(_parse_inter_residue_bonds(
+                model_atom_site, block["struct_conn"]
+            ))
+        atoms.bonds = bonds
+    atoms = _filter_altloc(atoms, model_atom_site, altloc)
-        return array
+    return atoms
 def _get_block(pdbx_component, block_name):
@@ -372,14 +371,14 @@ def _get_block(pdbx_component, block_name):
         return pdbx_component
-def _get_or_fallback(category, key, fallback_key, cat_name="input"):
+def _get_or_fallback(category, key, fallback_key):
         """
         Return column related to key in category if it exists,
         otherwise try to get the column related to fallback key.
         """
         if key not in category:
             warnings.warn(
-                f"Attribute '{key}' not found within '{cat_name}' category. "
+                f"Attribute '{key}' not found within 'atom_site' category. "
                 f"The fallback attribute '{fallback_key}' will be used instead",
                 UserWarning
             )
@@ -387,8 +386,8 @@ def _get_or_fallback(category, key, fallback_key, cat_name="input"):
                 return category[fallback_key]
             except KeyError as key_exc:
                 raise InvalidFileError(
-                    f"Fallback attribute '{fallback_key}' not found in "
-                    "'{dict_name}' category"
+                    f"Fallback attribute '{fallback_key}' not found within "
+                    "'atom_site' category"
                 ) from key_exc
         return category[key]
@@ -483,6 +482,28 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
         )
+def _parse_intra_residue_bonds(chem_comp_bond):
+    """
+    Create a :func:`connect_via_residue_names()` compatible
+    `custom_bond_dict` from the ``chem_comp_bond`` category.
+    """
+    custom_bond_dict = {}
+    for res_name, atom_1, atom_2, order, aromatic_flag in zip(
+        chem_comp_bond["comp_id"].as_array(str),
+        chem_comp_bond["atom_id_1"].as_array(str),
+        chem_comp_bond["atom_id_2"].as_array(str),
+        chem_comp_bond["value_order"].as_array(str),
+        chem_comp_bond["pdbx_aromatic_flag"].as_array(str)
+    ):
+        if res_name not in custom_bond_dict:
+            custom_bond_dict[res_name] = {}
+        bond_type = COMP_BOND_ORDER_TO_TYPE.get(
+            (order.upper(), aromatic_flag), BondType.ANY
+        )
+        custom_bond_dict[res_name][atom_1.item(), atom_2.item()] = bond_type
+    return custom_bond_dict
 def _parse_inter_residue_bonds(atom_site, struct_conn):
     """
     Create inter-residue bonds by parsing the ``struct_conn`` category.
@@ -676,7 +697,7 @@ def _get_box(block):
     return vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
-def set_structure(pdbx_file, array, data_block=None):
+def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
     """
     Set the ``atom_site`` category with atom information from an
     :class:`AtomArray` or :class:`AtomArrayStack`.
@@ -703,7 +724,13 @@ def set_structure(pdbx_file, array, data_block=None):
         file.
         If the data block object is passed directly to `pdbx_file`,
         this parameter is ignored.
-        If the file is empty, a new data will be created.
+        If the file is empty, a new data block will be created.
+    include_bonds : bool, optional
+        If set to true and `array` has associated ``bonds``, the
+        intra-residue bonds will be written into the ``chem_comp_bond``
+        category.
+        Inter-residue bonds will be written into the ``struct_conn``
+        category independent of this parameter.
     Notes
     -----
@@ -721,6 +748,8 @@ def set_structure(pdbx_file, array, data_block=None):
     >>> file.write(os.path.join(path_to_directory, "structure.cif"))
     """
+    _check_non_empty(array)
     block = _get_or_create_block(pdbx_file, data_block)
     Category = block.subcomponent_class()
     Column = Category.subcomponent_class()
@@ -765,7 +794,13 @@ def set_structure(pdbx_file, array, data_block=None):
         )
     if array.bonds is not None:
-        block["struct_conn"] = _set_inter_residue_bonds(array, atom_site)
+        struct_conn =  _set_inter_residue_bonds(array, atom_site)
+        if struct_conn is not None:
+            block["struct_conn"] = struct_conn
+        if include_bonds:
+            chem_comp_bond = _set_intra_residue_bonds(array, atom_site)
+            if chem_comp_bond is not None:
+                block["chem_comp_bond"] = chem_comp_bond
     # In case of a single model handle each coordinate
     # simply like a flattened array
@@ -782,7 +817,7 @@ def set_structure(pdbx_file, array, data_block=None):
         )
     # In case of multiple models repeat annotations
     # and use model specific coordinates
-    elif type(array) == AtomArrayStack:
+    else:
         atom_site = _repeat(atom_site, array.stack_depth())
         coord = np.reshape(
             array.coord, (array.stack_depth() * array.array_length(), 3)
@@ -794,8 +829,6 @@ def set_structure(pdbx_file, array, data_block=None):
             np.arange(1, array.stack_depth() + 1, dtype=np.int32),
             repeats=array.array_length(),
         )
-    else:
-        raise ValueError("Structure must be AtomArray or AtomArrayStack")
     if not "atom_id" in annot_categories:
         # Count from 1
         atom_site["id"] = np.arange(
@@ -822,6 +855,20 @@ def set_structure(pdbx_file, array, data_block=None):
         block["cell"] = cell
+def _check_non_empty(array):
+    if isinstance(array, AtomArray):
+        if array.array_length() == 0:
+            raise BadStructureError("Structure must not be empty")
+    elif isinstance(array, AtomArrayStack):
+        if array.array_length() == 0 or array.stack_depth() == 0:
+            raise BadStructureError("Structure must not be empty")
+    else:
+        raise ValueError(
+            "Structure must be AtomArray or AtomArrayStack, "
+            f"but got {type(array).__name__}"
+        )
 def _get_or_create_block(pdbx_component, block_name):
     if isinstance(pdbx_component, PDBxFile):
         # The deprecated 'PDBxFile' is a thin wrapper around 'CIFFile'
@@ -885,6 +932,67 @@ def _repeat(category, repetitions):
     return Category(category_dict)
+def _set_intra_residue_bonds(array, atom_site):
+    """
+    Create the ``chem_comp_bond`` category containing the intra-residue
+    bonds.
+    ``atom_site`` is only used to infer the right :class:`Category` type
+    (either :class:`CIFCategory` or :class:`BinaryCIFCategory`).
+    """
+    if (array.res_name == "").any():
+        raise BadStructureError(
+            "Structure contains atoms with empty residue name, "
+            "but it is required to write intra-residue bonds"
+        )
+    if (array.atom_name == "").any():
+        raise BadStructureError(
+            "Structure contains atoms with empty atom name, "
+            "but it is required to write intra-residue bonds"
+        )
+    Category = type(atom_site)
+    Column = Category.subcomponent_class()
+    bond_array = _filter_bonds(array, "intra")
+    if len(bond_array) == 0:
+        return None
+    value_order = np.zeros(len(bond_array), dtype="U4")
+    aromatic_flag = np.zeros(len(bond_array), dtype="U1")
+    for i, bond_type in enumerate(bond_array[:, 2]):
+        if bond_type == BondType.ANY:
+            # ANY bonds will be masked anyway, no need to set the value
+            continue
+        order, aromatic = COMP_BOND_TYPE_TO_ORDER[bond_type]
+        value_order[i] = order
+        aromatic_flag[i] = aromatic
+    any_mask = bond_array[:, 2] == BondType.ANY
+    chem_comp_bond = Category()
+    # Take the residue name from the first atom index, as the residue
+    # name is the same for both atoms, since we have only intra bonds
+    chem_comp_bond["comp_id"] = array.res_name[bond_array[:, 0]]
+    chem_comp_bond["atom_id_1"] = array.atom_name[bond_array[:, 0]]
+    chem_comp_bond["atom_id_2"] = array.atom_name[bond_array[:, 1]]
+    chem_comp_bond["value_order"] = Column(
+        value_order,
+        np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT)
+    )
+    chem_comp_bond["pdbx_aromatic_flag"] = Column(
+        aromatic_flag,
+        np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT)
+    )
+    # BondList does not contain stereo information
+    # -> all values are missing
+    chem_comp_bond["pdbx_stereo_config"] = Column(
+        np.zeros(len(bond_array), dtype="U1"),
+        np.full(len(bond_array), MaskValue.MISSING)
+    )
+    chem_comp_bond["pdbx_ordinal"] = np.arange(
+        1, len(bond_array) + 1, dtype=np.int32
+    )
+    return chem_comp_bond
 def _set_inter_residue_bonds(array, atom_site):
     """
     Create the ``struct_conn`` category containing the inter-residue
@@ -900,15 +1008,9 @@ def _set_inter_residue_bonds(array, atom_site):
     Category = type(atom_site)
     Column = Category.subcomponent_class()
-    bond_array = array.bonds.as_array()
-    # To save computation time call 'get_residue_starts_for()' only once
-    # with indices of the first and second atom of each bond
-    residue_starts_1, residue_starts_2 = get_residue_starts_for(
-        array, bond_array[:, :2].flatten()
-    ).reshape(-1, 2).T
-    # Filter out all intra-residue bonds
-    bond_array = bond_array[residue_starts_1 != residue_starts_2]
+    bond_array = _filter_bonds(array, "inter")
+    if len(bond_array) == 0:
+        return None
     struct_conn = Category()
     struct_conn["id"] = np.arange(1, len(bond_array) + 1)
     struct_conn["conn_type_id"] = np.full(len(bond_array), "covale")
@@ -932,6 +1034,25 @@ def _set_inter_residue_bonds(array, atom_site):
     return struct_conn
+def _filter_bonds(array, connection):
+    """
+    Get a bond array that contains either only intra-residue or
+    only inter-residue bonds.
+    """
+    bond_array = array.bonds.as_array()
+    # To save computation time call 'get_residue_starts_for()' only once
+    # with indices of the first and second atom of each bond
+    residue_starts_1, residue_starts_2 = get_residue_starts_for(
+        array, bond_array[:, :2].flatten()
+    ).reshape(-1, 2).T
+    if connection == "intra":
+        return bond_array[residue_starts_1 == residue_starts_2]
+    elif connection == "inter":
+        return bond_array[residue_starts_1 != residue_starts_2]
+    else:
+        raise ValueError("Invalid 'connection' option")
 def get_component(pdbx_file, data_block=None, use_ideal_coord=True,
                   res_name=None):
     """
@@ -1011,7 +1132,7 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True,
         atom_category = _filter(
             atom_category, atom_category["comp_id"].as_array() == res_name
         )
-        if len(atom_category) == 0:
+        if atom_category.row_count == 0:
             raise KeyError(
                 f"No rows with residue name '{res_name}' found in "
                 f"'chem_comp_atom' category"
@@ -1098,6 +1219,8 @@ def set_component(pdbx_file, array, data_block=None):
         If the data block object is passed directly to `pdbx_file`,
         this parameter is ignored.
     """
+    _check_non_empty(array)
     block = _get_or_create_block(pdbx_file, data_block)
     Category = block.subcomponent_class()
@@ -1132,7 +1255,7 @@ def set_component(pdbx_file, array, data_block=None):
     ).astype(str)
     block["chem_comp_atom"] = atom_cat
-    if array.bonds is not None:
+    if array.bonds is not None and array.bonds.get_bond_count() > 0:
         bond_array = array.bonds.as_array()
         order_flags = []
         aromatic_flags = []
@@ -1428,25 +1551,26 @@ def _parse_operation_expression(expression):
     # Split groups by parentheses:
     # use the opening parenthesis as delimiter
     # and just remove the closing parenthesis
+    # example: '(X0)(1-10,21-25)' from 1a34
     expressions_per_step = expression.replace(")", "").split("(")
     expressions_per_step = [e for e in expressions_per_step if len(e) > 0]
     # Important: Operations are applied from right to left
     expressions_per_step.reverse()
     operations = []
-    for expr in expressions_per_step:
-        if "-" in expr:
-            # Range of operation IDs, they must be integers
-            first, last = expr.split("-")
-            operations.append(
-                [str(id) for id in range(int(first), int(last) + 1)]
-            )
-        elif "," in expr:
-            # List of operation IDs
-            operations.append(expr.split(","))
-        else:
-            # Single operation ID
-            operations.append([expr])
+    for one_step_expr in expressions_per_step:
+        one_step_op_ids = []
+        for expr in one_step_expr.split(","):
+            if "-" in expr:
+                # Range of operation IDs, they must be integers
+                first, last = expr.split("-")
+                one_step_op_ids.extend(
+                    [str(id) for id in range(int(first), int(last) + 1)]
+                )
+            else:
+                # Single operation ID
+                one_step_op_ids.append(expr)
+        operations.append(one_step_op_ids)
     # Cartesian product of operations
     return list(itertools.product(*operations))