PyPI - biotite - Versions diffs - 1.0.0__cp311-cp311-macosx_11_0_arm64.whl → 1.1.0__cp311-cp311-macosx_11_0_arm64.whl - Mend

biotite 1.0.0__cp311-cp311-macosx_11_0_arm64.whl → 1.1.0__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biotite might be problematic. Click here for more details.

Files changed (92) hide show

biotite/application/dssp/app.py +13 -3
biotite/application/localapp.py +34 -0
biotite/application/muscle/app3.py +2 -15
biotite/application/muscle/app5.py +2 -2
biotite/application/util.py +1 -1
biotite/application/viennarna/rnaplot.py +6 -2
biotite/database/rcsb/query.py +6 -6
biotite/database/uniprot/check.py +20 -15
biotite/database/uniprot/download.py +1 -1
biotite/database/uniprot/query.py +1 -1
biotite/sequence/align/alignment.py +16 -3
biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
biotite/sequence/align/banded.pyx +5 -5
biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
biotite/sequence/align/kmeralphabet.pyx +17 -0
biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
biotite/sequence/align/kmertable.pyx +52 -42
biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
biotite/sequence/align/matrix.py +273 -55
biotite/sequence/align/matrix_data/3Di.mat +24 -0
biotite/sequence/align/matrix_data/PB.license +21 -0
biotite/sequence/align/matrix_data/PB.mat +18 -0
biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
biotite/sequence/alphabet.py +3 -0
biotite/sequence/codec.cpython-311-darwin.so +0 -0
biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
biotite/sequence/graphics/colorschemes.py +44 -11
biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
biotite/sequence/profile.py +86 -4
biotite/sequence/seqtypes.py +124 -3
biotite/setup_ccd.py +197 -0
biotite/structure/__init__.py +4 -3
biotite/structure/alphabet/__init__.py +25 -0
biotite/structure/alphabet/encoder.py +332 -0
biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
biotite/structure/alphabet/i3d.py +110 -0
biotite/structure/alphabet/layers.py +86 -0
biotite/structure/alphabet/pb.license +21 -0
biotite/structure/alphabet/pb.py +171 -0
biotite/structure/alphabet/unkerasify.py +122 -0
biotite/structure/atoms.py +156 -43
biotite/structure/bonds.cpython-311-darwin.so +0 -0
biotite/structure/bonds.pyx +72 -21
biotite/structure/celllist.cpython-311-darwin.so +0 -0
biotite/structure/charges.cpython-311-darwin.so +0 -0
biotite/structure/filter.py +1 -1
biotite/structure/geometry.py +60 -113
biotite/structure/info/__init__.py +1 -0
biotite/structure/info/atoms.py +13 -13
biotite/structure/info/bonds.py +12 -6
biotite/structure/info/ccd.py +125 -32
biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
biotite/structure/info/groups.py +63 -17
biotite/structure/info/masses.py +9 -6
biotite/structure/info/misc.py +15 -21
biotite/structure/info/standardize.py +3 -2
biotite/structure/io/mol/sdf.py +41 -40
biotite/structure/io/pdb/convert.py +2 -0
biotite/structure/io/pdb/file.py +74 -3
biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
biotite/structure/io/pdbqt/file.py +32 -32
biotite/structure/io/pdbx/__init__.py +1 -0
biotite/structure/io/pdbx/bcif.py +32 -8
biotite/structure/io/pdbx/cif.py +148 -107
biotite/structure/io/pdbx/component.py +9 -4
biotite/structure/io/pdbx/compress.py +321 -0
biotite/structure/io/pdbx/convert.py +227 -68
biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
biotite/structure/io/pdbx/encoding.pyx +98 -17
biotite/structure/io/trajfile.py +16 -16
biotite/structure/molecules.py +141 -141
biotite/structure/sasa.cpython-311-darwin.so +0 -0
biotite/structure/segments.py +1 -2
biotite/structure/util.py +73 -1
biotite/version.py +2 -2
{biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/METADATA +4 -1
{biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/RECORD +88 -78
biotite/structure/info/ccd/README.rst +0 -8
biotite/structure/info/ccd/amino_acids.txt +0 -1663
biotite/structure/info/ccd/carbohydrates.txt +0 -1135
biotite/structure/info/ccd/nucleotides.txt +0 -798
{biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/WHEEL +0 -0
{biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/licenses/LICENSE.rst +0 -0

biotite/sequence/profile.py CHANGED Viewed

@@ -3,6 +3,7 @@
 # information.
 import warnings
+from numbers import Integral
 import numpy as np
 from biotite.sequence.align.alignment import get_codes
 from biotite.sequence.alphabet import LetterAlphabet
@@ -66,6 +67,9 @@ class SequenceProfile(object):
     It also saves the number of gaps at each position in the array
     'gaps'.
+    With :meth:`from_alignment()` a :class:`SequenceProfile` object can
+    be created from an indefinite number of aligned sequences.
     With :meth:`probability_matrix()` the position probability matrix
     can be created based on 'symbols' and a pseudocount.
@@ -73,9 +77,6 @@ class SequenceProfile(object):
     be created based on the previously calculated position probability
     matrix and the background frequencies.
-    With :meth:`from_alignment()` a :class:`SequenceProfile` object can
-    be created from an indefinite number of aligned sequences.
     With :meth:`sequence_probability_from_matrix()` the probability of a
     sequence can be calculated based on the previously calculated position
     probability matrix of this :class:`SequenceProfile` instance.
@@ -105,6 +106,63 @@ class SequenceProfile(object):
         Array which indicates the number of gaps at each position.
     alphabet : Alphabet, length=k
         Alphabet of sequences of sequence profile
+    Examples
+    --------
+    Create a profile from a multiple sequence alignment:
+    >>> sequences = [
+    ...     NucleotideSequence("CGCTCATTC"),
+    ...     NucleotideSequence("CGCTATTC"),
+    ...     NucleotideSequence("CCCTCAATC"),
+    ... ]
+    >>> msa, _, _, _ = align_multiple(
+    ...     sequences, SubstitutionMatrix.std_nucleotide_matrix(), gap_penalty=-5
+    ... )
+    >>> print(msa)
+    CGCTCATTC
+    CGCT-ATTC
+    CCCTCAATC
+    >>> profile = SequenceProfile.from_alignment(msa)
+    >>> print(profile)
+      A C G T
+    0 0 3 0 0
+    1 0 1 2 0
+    2 0 3 0 0
+    3 0 0 0 3
+    4 0 2 0 0
+    5 3 0 0 0
+    6 1 0 0 2
+    7 0 0 0 3
+    8 0 3 0 0
+    >>> print(profile.gaps)
+    [0 0 0 0 1 0 0 0 0]
+    Slice the profile (masks and index arrays are also supported):
+    >>> print(profile[2:])
+      A C G T
+    0 0 3 0 0
+    1 0 0 0 3
+    2 0 2 0 0
+    3 3 0 0 0
+    4 1 0 0 2
+    5 0 0 0 3
+    6 0 3 0 0
+    Use the profile to compute the position probability matrix:
+    >>> print(profile.probability_matrix())
+    [[0.000 1.000 0.000 0.000]
+     [0.000 0.333 0.667 0.000]
+     [0.000 1.000 0.000 0.000]
+     [0.000 0.000 0.000 1.000]
+     [0.000 1.000 0.000 0.000]
+     [1.000 0.000 0.000 0.000]
+     [0.333 0.000 0.000 0.667]
+     [0.000 0.000 0.000 1.000]
+     [0.000 1.000 0.000 0.000]]
     """
     def __init__(self, symbols, gaps, alphabet):
@@ -156,8 +214,23 @@ class SequenceProfile(object):
             )
         self._gaps = new_gaps
+    def __str__(self):
+        # Add an additional row and column for the position and symbol indicators
+        print_matrix = np.full(
+            (self.symbols.shape[0] + 1, self.symbols.shape[1] + 1), "", dtype=object
+        )
+        print_matrix[1:, 1:] = self.symbols.astype(str)
+        print_matrix[0, 1:] = [str(sym) for sym in self.alphabet]
+        print_matrix[1:, 0] = [str(i) for i in range(self.symbols.shape[0])]
+        max_len = len(max(print_matrix.flatten(), key=len))
+        return "\n".join(
+            [
+                " ".join([str(cell).rjust(max_len) for cell in row])
+                for row in print_matrix
+            ]
+        )
     def __repr__(self):
-        """Represent SequenceProfile as a string for debugging."""
         return (
             f"SequenceProfile(np.{np.array_repr(self.symbols)}, "
             f"np.{np.array_repr(self.gaps)}, Alphabet({self.alphabet}))"
@@ -483,3 +556,12 @@ class SequenceProfile(object):
                 f"as 'symbols' {self.symbols.shape}"
             )
         return np.sum(pwm[np.arange(len(sequence)), sequence.code])
+    def __getitem__(self, index):
+        if isinstance(index, Integral):
+            # Do not allow to collapse dimensions
+            index = slice(index, index + 1)
+        return SequenceProfile(self.symbols[index], self.gaps[index], self.alphabet)
+    def __len__(self):
+        return len(self.symbols)

biotite/sequence/seqtypes.py CHANGED Viewed

@@ -4,10 +4,22 @@
 __name__ = "biotite.sequence"
 __author__ = "Patrick Kunzmann", "Thomas Nevolianis"
-__all__ = ["GeneralSequence", "NucleotideSequence", "ProteinSequence"]
+__all__ = [
+    "GeneralSequence",
+    "NucleotideSequence",
+    "ProteinSequence",
+    "PositionalSequence",
+    "PurePositionalSequence",
+]
+from dataclasses import dataclass, field
 import numpy as np
-from biotite.sequence.alphabet import AlphabetError, AlphabetMapper, LetterAlphabet
+from biotite.sequence.alphabet import (
+    Alphabet,
+    AlphabetError,
+    AlphabetMapper,
+    LetterAlphabet,
+)
 from biotite.sequence.sequence import Sequence
@@ -590,3 +602,112 @@ class ProteinSequence(Sequence):
                 "Sequence contains ambiguous amino acids, " "cannot calculate weight"
             )
         return weight
+class PositionalSequence(Sequence):
+    """
+    A sequence where each symbol is associated with a position.
+    For each individual position the sequence contains a separate
+    :class:`PositionalSequence.Symbol`, encoded by a custom alphabet for this sequence.
+    In consequence the symbol code is the position in the sequence itself.
+    This is useful for aligning sequences based on a position-specific
+    substitution matrix.
+    Parameters
+    ----------
+    original_sequence : seq.Sequence
+        The original sequence to create the positional sequence from.
+    """
+    @dataclass(frozen=True)
+    class Symbol:
+        """
+        Combination of a symbol and its position in a sequence.
+        Attributes
+        ----------
+        original_alphabet : Alphabet
+            The original alphabet, where the symbol stems from.
+        original_code : int
+            The code of the original symbol in the original alphabet.
+        position : int
+            The 0-based position of the symbol in the sequence.
+        symbol : object
+            The symbol from the original alphabet.
+        See Also
+        --------
+        PositionalSequence
+            The sequence type containing :class:`PositionalSequence.Symbol` objects.
+        """
+        original_alphabet: ...
+        original_code: ...
+        position: ...
+        symbol: ... = field(init=False)
+        def __post_init__(self):
+            sym = self.original_alphabet.decode(self.original_code)
+            super().__setattr__("symbol", sym)
+        def __str__(self):
+            return str(self.symbol)
+    def __init__(self, original_sequence):
+        self._orig_alphabet = original_sequence.get_alphabet()
+        self._alphabet = Alphabet(
+            [
+                PositionalSequence.Symbol(self._orig_alphabet, code, pos)
+                for pos, code in enumerate(original_sequence.code)
+            ]
+        )
+        self.code = np.arange(
+            len(original_sequence), dtype=Sequence.dtype(len(self._alphabet))
+        )
+    def reconstruct(self):
+        """
+        Reconstruct the original sequence from the positional sequence.
+        Returns
+        -------
+        original_sequence : GeneralSequence
+            The original sequence.
+            Although the actual type of the returned sequence is always a
+            :class:`GeneralSequence`, the alphabet and the symbols of the returned
+            sequence are equal to the original sequence.
+        """
+        original_sequence = GeneralSequence(self._orig_alphabet)
+        original_sequence.code = np.array([sym.original_code for sym in self._alphabet])
+        return original_sequence
+    def get_alphabet(self):
+        return self._alphabet
+    def __str__(self) -> str:
+        return "".join([str(sym) for sym in self.symbols])
+    def __repr__(self):
+        return f"PositionalSequence({self.reconstruct()!r})"
+class PurePositionalSequence(Sequence):
+    """
+    An object of this class is a 'placeholder' sequence, where each symbol is the
+    position in the sequence itself.
+    This class is similar to :class:`PositionalSequence`, but the symbols are not
+    derived from an original sequence, but are the pure position.
+    Hence, there is no meaningful string representation of the sequence and its symbols.
+    """
+    def __init__(self, length):
+        self._alphabet = Alphabet(range(length))
+        self.code = np.arange(length, dtype=Sequence.dtype(length))
+    def get_alphabet(self):
+        return self._alphabet
+    def __repr__(self):
+        return f"PurePositionalSequence({len(self)})"

biotite/setup_ccd.py ADDED Viewed

@@ -0,0 +1,197 @@
+__author__ = "Patrick Kunzmann"
+__all__ = []
+import gzip
+import logging
+from collections import defaultdict
+from io import StringIO
+from pathlib import Path
+import numpy as np
+import requests
+from biotite.structure.io.pdbx import *
+OUTPUT_CCD = Path(__file__).parent / "structure" / "info" / "components.bcif"
+CCD_URL = "https://files.wwpdb.org/pub/pdb/data/monomers/components.cif.gz"
+def concatenate_ccd(categories=None):
+    """
+    Create the CCD in BinaryCIF format, where each category contains the
+    data of all blocks.
+    Parameters
+    ----------
+    categories : list of str, optional
+        The names of the categories to include.
+        By default, all categories from the CCD are included.
+    Returns
+    -------
+    compressed_file : BinaryCIFFile
+        The compressed CCD in BinaryCIF format.
+    """
+    logging.info("Download and read CCD...")
+    ccd_cif_text = gzip.decompress(requests.get(CCD_URL).content).decode()
+    ccd_file = CIFFile.read(StringIO(ccd_cif_text))
+    compressed_block = BinaryCIFBlock()
+    if categories is None:
+        categories = _list_all_category_names(ccd_file)
+    for category_name in categories:
+        logging.info(f"Concatenate and compress '{category_name}' category...")
+        compressed_block[category_name] = compress(
+            _concatenate_blocks_into_category(ccd_file, category_name)
+        )
+    logging.info("Write concatenated CCD into BinaryCIF...")
+    compressed_file = BinaryCIFFile()
+    compressed_file["components"] = compressed_block
+    return compressed_file
+def _concatenate_blocks_into_category(pdbx_file, category_name):
+    """
+    Concatenate the given category from all blocks into a single
+    category.
+    Parameters
+    ----------
+    pdbx_file : PDBxFile
+        The PDBx file, whose blocks should be concatenated.
+    category_name : str
+        The name of the category to concatenate.
+    Returns
+    -------
+    category : BinaryCIFCategory
+        The concatenated category.
+    """
+    columns_names = _list_all_column_names(pdbx_file, category_name)
+    data_chunks = defaultdict(list)
+    mask_chunks = defaultdict(list)
+    for block in pdbx_file.values():
+        if category_name not in block:
+            continue
+        category = block[category_name]
+        for column_name in columns_names:
+            if column_name in category:
+                column = category[column_name]
+                data_chunks[column_name].append(column.data.array)
+                if column.mask is not None:
+                    mask_chunks[column_name].append(column.mask.array)
+                else:
+                    mask_chunks[column_name].append(
+                        np.full(category.row_count, MaskValue.PRESENT, dtype=np.uint8)
+                    )
+            else:
+                # Column is missing in this block
+                # -> handle it as data masked as 'missing'
+                data_chunks[column_name].append(
+                    # For now all arrays are of type string anyway,
+                    # as they are read from a CIF file
+                    np.full(category.row_count, "", dtype="U1")
+                )
+                mask_chunks[column_name].append(
+                    np.full(category.row_count, MaskValue.MISSING, dtype=np.uint8)
+                )
+    bcif_columns = {}
+    for col_name in columns_names:
+        data = np.concatenate(data_chunks[col_name])
+        mask = np.concatenate(mask_chunks[col_name])
+        data = _into_fitting_type(data, mask)
+        if np.all(mask == MaskValue.PRESENT):
+            mask = None
+        bcif_columns[col_name] = BinaryCIFColumn(data, mask)
+    return BinaryCIFCategory(bcif_columns)
+def _list_all_column_names(pdbx_file, category_name):
+    """
+    Get all columns that exist in any block for a given category.
+    Parameters
+    ----------
+    pdbx_file : PDBxFile
+        The PDBx file to search in for the columns.
+    category_name : str
+        The name of the category to search in.
+    Returns
+    -------
+    columns_names : list of str
+        The names of the columns.
+    """
+    columns_names = set()
+    for block in pdbx_file.values():
+        if category_name in block:
+            columns_names.update(block[category_name].keys())
+    return sorted(columns_names)
+def _list_all_category_names(pdbx_file):
+    """
+    Get all categories that exist in any block.
+    Parameters
+    ----------
+    pdbx_file : PDBxFile
+        The PDBx file to search in for the categories.
+    Returns
+    -------
+    category_names : list of str
+        The names of the categories.
+    """
+    category_names = set()
+    for block in pdbx_file.values():
+        category_names.update(block.keys())
+    return sorted(category_names)
+def _into_fitting_type(string_array, mask):
+    """
+    Try to find a numeric type for a string ndarray, if possible.
+    Parameters
+    ----------
+    string_array : ndarray, dtype=string
+        The array to convert.
+    mask : ndarray, dtype=uint8
+        Only values in `string_array` where the mask is ``MaskValue.PRESENT`` are
+        considered for type conversion.
+    Returns
+    -------
+    array : ndarray
+        The array converted into an appropriate dtype.
+    """
+    mask = mask == MaskValue.PRESENT
+    # Only try to find an appropriate dtype for unmasked values
+    values = string_array[mask]
+    try:
+        # Try to fit into integer type
+        values = values.astype(int)
+    except ValueError:
+        try:
+            # Try to fit into float type
+            values = values.astype(float)
+        except ValueError:
+            # Keep string type
+            pass
+    array = np.zeros(string_array.shape, dtype=values.dtype)
+    array[mask] = values
+    return array
+def main():
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(message)s")
+    OUTPUT_CCD.parent.mkdir(parents=True, exist_ok=True)
+    compressed_ccd = concatenate_ccd(["chem_comp", "chem_comp_atom", "chem_comp_bond"])
+    compressed_ccd.write(OUTPUT_CCD)
+if __name__ == "__main__":
+    main()

biotite/structure/__init__.py CHANGED Viewed

@@ -57,14 +57,15 @@ The annotation arrays can be accessed either via the method
 The following annotation categories are optionally used by some
 functions:
-=========  ===========  =================   ============================
+=========  ===========  =================   =========================================
 Category   Type         Examples            Description
-=========  ===========  =================   ============================
+=========  ===========  =================   =========================================
 atom_id    int          1,2,3, ...          Atom serial number
 b_factor   float        0.9, 12.3, ...      Temperature factor
 occupancy  float        .1, .3, .9, ...     Occupancy
 charge     int          -2,-1,0,1,2, ...    Electric charge of the atom
-=========  ===========  =================   ============================
+sym_id     string       '1','2','3', ...    Symmetry ID for assemblies/symmetry mates
+=========  ===========  =================   =========================================
 For each type, the attributes can be accessed directly.
 Both :class:`AtomArray` and :class:`AtomArrayStack` support

biotite/structure/alphabet/__init__.py ADDED Viewed

@@ -0,0 +1,25 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+"""
+A subpackage for converting structures to structural alphabet sequences.
+Structural alphabets represent the local geometry of each residue in a structure as
+a symbol in a sequence.
+This allows using sequence-based functionality from :mod:`biotite.sequence` on
+structural data.
+For each supported structural alphabet, this subpackage provides a conversion function
+that converts each chain of a given structure into a :class:`Sequence` object from the
+respective structural alphabet.
+Note that the structural alphabets use lower-case letters as symbols, in order to
+distinguish them better from the nucleotide and amino acid alphabets.
+"""
+__name__ = "biotite.structure.alphabet"
+__author__ = "Martin Larralde, Patrick Kunzmann"
+from .i3d import *
+from .pb import *