PyPI - biotite - Versions diffs - 0.38.0__cp311-cp311-macosx_11_0_arm64.whl → 0.40.0__cp311-cp311-macosx_11_0_arm64.whl - Mend

biotite 0.38.0__cp311-cp311-macosx_11_0_arm64.whl → 0.40.0__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biotite might be problematic. Click here for more details.

Files changed (124) hide show

biotite/__init__.py +3 -3
biotite/application/application.py +33 -28
biotite/application/dssp/app.py +18 -18
biotite/application/sra/__init__.py +5 -0
biotite/application/sra/app.py +337 -55
biotite/database/entrez/__init__.py +2 -1
biotite/database/entrez/check.py +14 -3
biotite/database/entrez/download.py +20 -13
biotite/database/entrez/key.py +44 -0
biotite/database/entrez/query.py +38 -34
biotite/database/pubchem/query.py +44 -44
biotite/database/rcsb/download.py +19 -14
biotite/database/rcsb/query.py +46 -46
biotite/sequence/align/__init__.py +5 -1
biotite/sequence/align/banded.c +1408 -1025
biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
biotite/sequence/align/buckets.py +69 -0
biotite/sequence/align/cigar.py +389 -0
biotite/sequence/align/kmeralphabet.c +3220 -2850
biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
biotite/sequence/align/kmersimilarity.c +713 -663
biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
biotite/sequence/align/kmertable.cpp +68398 -0
biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
biotite/sequence/align/localgapped.c +1507 -1074
biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
biotite/sequence/align/localungapped.c +1143 -833
biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
biotite/sequence/align/multiple.c +1569 -1092
biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
biotite/sequence/align/pairwise.c +1612 -1212
biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
biotite/sequence/align/permutation.c +33259 -0
biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
biotite/sequence/align/primes.txt +821 -0
biotite/sequence/align/{kmertable.c → selector.c} +9129 -16497
biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
biotite/sequence/align/tracetable.c +685 -646
biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
biotite/sequence/codec.c +1159 -841
biotite/sequence/codec.cpython-311-darwin.so +0 -0
biotite/sequence/graphics/alignment.py +212 -2
biotite/sequence/io/genbank/annotation.py +11 -11
biotite/sequence/phylo/nj.c +684 -636
biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
biotite/sequence/phylo/tree.c +970 -673
biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
biotite/sequence/phylo/upgma.c +672 -626
biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
biotite/structure/__init__.py +1 -1
biotite/structure/atoms.py +1 -1
biotite/structure/basepairs.py +7 -12
biotite/structure/bonds.c +3861 -3749
biotite/structure/bonds.cpython-311-darwin.so +0 -0
biotite/structure/celllist.c +727 -707
biotite/structure/celllist.cpython-311-darwin.so +0 -0
biotite/structure/charges.c +1561 -1560
biotite/structure/charges.cpython-311-darwin.so +0 -0
biotite/structure/filter.py +30 -37
biotite/structure/info/__init__.py +5 -8
biotite/structure/info/atoms.py +25 -67
biotite/structure/info/bonds.py +46 -100
biotite/structure/info/ccd/README.rst +8 -0
biotite/structure/info/ccd/amino_acids.txt +1646 -0
biotite/structure/info/ccd/carbohydrates.txt +1133 -0
biotite/structure/info/ccd/components.bcif +0 -0
biotite/structure/info/ccd/nucleotides.txt +797 -0
biotite/structure/info/ccd.py +95 -0
biotite/structure/info/groups.py +90 -0
biotite/structure/info/masses.py +21 -20
biotite/structure/info/misc.py +11 -22
biotite/structure/info/standardize.py +17 -12
biotite/structure/io/__init__.py +2 -4
biotite/structure/io/ctab.py +1 -1
biotite/structure/io/general.py +37 -43
biotite/structure/io/mmtf/__init__.py +3 -0
biotite/structure/io/mmtf/convertarray.c +528 -365
biotite/structure/io/mmtf/convertarray.cpython-311-darwin.so +0 -0
biotite/structure/io/mmtf/convertfile.c +725 -676
biotite/structure/io/mmtf/convertfile.cpython-311-darwin.so +0 -0
biotite/structure/io/mmtf/decode.c +1070 -754
biotite/structure/io/mmtf/decode.cpython-311-darwin.so +0 -0
biotite/structure/io/mmtf/encode.c +727 -677
biotite/structure/io/mmtf/encode.cpython-311-darwin.so +0 -0
biotite/structure/io/mmtf/file.py +34 -26
biotite/structure/io/npz/__init__.py +3 -0
biotite/structure/io/npz/file.py +21 -18
biotite/structure/io/pdb/__init__.py +3 -3
biotite/structure/io/pdb/file.py +72 -70
biotite/structure/io/pdb/hybrid36.c +540 -478
biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
biotite/structure/io/pdbqt/file.py +82 -68
biotite/structure/io/pdbx/__init__.py +13 -6
biotite/structure/io/pdbx/bcif.py +649 -0
biotite/structure/io/pdbx/cif.py +1028 -0
biotite/structure/io/pdbx/component.py +243 -0
biotite/structure/io/pdbx/convert.py +707 -359
biotite/structure/io/pdbx/encoding.c +112813 -0
biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
biotite/structure/io/pdbx/error.py +14 -0
biotite/structure/io/pdbx/legacy.py +267 -0
biotite/structure/molecules.py +151 -151
biotite/structure/residues.py +40 -40
biotite/structure/sasa.c +713 -644
biotite/structure/sasa.cpython-311-darwin.so +0 -0
biotite/structure/superimpose.py +158 -115
biotite/visualize.py +9 -11
{biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/METADATA +2 -2
{biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/RECORD +112 -102
{biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/WHEEL +1 -1
biotite/structure/info/amino_acids.json +0 -1556
biotite/structure/info/amino_acids.py +0 -42
biotite/structure/info/carbohydrates.json +0 -1122
biotite/structure/info/carbohydrates.py +0 -39
biotite/structure/info/intra_bonds.msgpack +0 -0
biotite/structure/info/link_types.msgpack +0 -1
biotite/structure/info/nucleotides.json +0 -772
biotite/structure/info/nucleotides.py +0 -39
biotite/structure/info/residue_masses.msgpack +0 -0
biotite/structure/info/residue_names.msgpack +0 -3
biotite/structure/info/residues.msgpack +0 -0
biotite/structure/io/pdbx/file.py +0 -652
{biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/LICENSE.rst +0 -0
{biotite-0.38.0.dist-info → biotite-0.40.0.dist-info}/top_level.txt +0 -0

biotite/sequence/align/banded.cpython-311-darwin.so CHANGED Viewed

Binary file

biotite/sequence/align/buckets.py ADDED Viewed

@@ -0,0 +1,69 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+# Override the module name with the name of the enclosing subpackage.
+# NOTE(review): presumably done so that objects defined here are
+# presented as members of `biotite.sequence.align` - confirm
+__name__ = "biotite.sequence.align"
+__author__ = "Patrick Kunzmann"
+# Names exported by a wildcard import of this module
+__all__ = ["bucket_number"]
+from os.path import realpath, dirname, join
+import numpy as np
+# Cache for the precomputed primes: stays `None` until the first call
+# of `bucket_number()` reads them from 'primes.txt'
+_primes = None
+def bucket_number(n_kmers, load_factor=0.8):
+    """
+    Find an appropriate number of buckets for a :class:`BucketKmerTable`
+    based on the number of elements (i.e. *k-mers*) that should be
+    stored in the table.
+
+    Parameters
+    ----------
+    n_kmers : int
+        The expected number of *k-mers* that will be stored in the
+        :class:`BucketKmerTable`.
+        If this number deviates from the actual number of *k-mers* that
+        will be stored, the load factor of the table will deviate
+        by the same percentage.
+    load_factor : float, optional
+        The desired ratio of *k-mer* number to bucket number, i.e. the
+        minimum number of buckets is ``n_kmers / load_factor``.
+        The actual load factor will be lower, as the closest greater
+        prime is returned (see *Notes*).
+
+    Returns
+    -------
+    n_buckets : int
+        The recommended number of buckets to use for a
+        :class:`BucketKmerTable`, that stores `n_kmers` at the given
+        `load_factor`.
+
+    Raises
+    ------
+    ValueError
+        If the required number of buckets exceeds the largest
+        precomputed prime.
+
+    Notes
+    -----
+    The function returns the closest greater prime number from a
+    precomputed list of primes to use as the number of buckets.
+    The reason is that prime numbers have proven to be good hash table
+    sizes, if the hash function is not randomized.
+
+    Let's take unambiguous nucleotide *k-mers* as example.
+    If powers of two would be used as table size (another common scheme),
+    taking the modulo operation on the *k-mer* code would simply erase
+    the upper bits corresponding to the first nucleotide(s) in a
+    *k-mer*.
+    Hence, all *k-mers* with the same suffix would be stored in the same
+    bin.
+    """
+    global _primes
+    if _primes is None:
+        # The primes are shipped as a text file next to this module;
+        # empty lines and lines starting with '#' are skipped
+        primes_path = join(dirname(realpath(__file__)), "primes.txt")
+        with open(primes_path, encoding="utf-8") as file:
+            stripped_lines = (line.strip() for line in file)
+            _primes = np.array([
+                int(line) for line in stripped_lines
+                if len(line) != 0 and not line.startswith("#")
+            ])
+    number = int(n_kmers / load_factor)
+    # The binary search assumes that the primes are in ascending order
+    index = np.searchsorted(_primes, number, side="left")
+    if index == len(_primes):
+        raise ValueError("Number of buckets too large")
+    # Return a builtin `int`, as documented, instead of a NumPy scalar
+    return int(_primes[index])

biotite/sequence/align/cigar.py ADDED Viewed

@@ -0,0 +1,389 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+# Override the module name with the name of the enclosing subpackage.
+# NOTE(review): presumably done so that objects defined here are
+# presented as members of `biotite.sequence.align` - confirm
+__name__ = "biotite.sequence.align"
+__author__ = "Patrick Kunzmann"
+# Names exported by a wildcard import of this module
+__all__ = ["CigarOp", "read_alignment_from_cigar", "write_alignment_to_cigar"]
+import enum
+import numpy as np
+from .alignment import Alignment, get_codes
+class CigarOp(enum.IntEnum):
+    """
+    Enumeration of the operations that may appear in a CIGAR string.
+
+    Each member can be converted to and from its one-character CIGAR
+    symbol via :meth:`to_cigar_symbol()` and
+    :meth:`from_cigar_symbol()`.
+    """
+    MATCH = 0
+    INSERTION = 1
+    DELETION = 2
+    INTRON = 3
+    SOFT_CLIP = 4
+    HARD_CLIP = 5
+    PADDING = 6
+    EQUAL = 7
+    DIFFERENT = 8
+    BACK = 9
+    @staticmethod
+    def from_cigar_symbol(symbol):
+        """
+        Look up the operation that belongs to a CIGAR symbol.
+
+        Parameters
+        ----------
+        symbol : str
+            The one-character CIGAR symbol, e.g. ``'M'``.
+
+        Returns
+        -------
+        op : CigarOp
+            The corresponding enum member.
+
+        Raises
+        ------
+        KeyError
+            If `symbol` is not a known CIGAR symbol.
+        """
+        return _str_to_op[symbol]
+    def to_cigar_symbol(self):
+        """
+        Get the CIGAR symbol of this operation.
+
+        Returns
+        -------
+        symbol : str
+            The one-character CIGAR symbol, e.g. ``'M'``.
+        """
+        return _op_to_str[self]
+# The n-th character is the CIGAR symbol of the member with value n,
+# as iterating over the enum yields the members in definition order
+_op_to_str = {op: symbol for op, symbol in zip(CigarOp, "MIDNSHP=XB")}
+_str_to_op = {symbol: op for op, symbol in _op_to_str.items()}
+def read_alignment_from_cigar(cigar, position,
+                              reference_sequence, segment_sequence):
+    """
+    Build an :class:`Alignment` from CIGAR operations.
+
+    Parameters
+    ----------
+    cigar : str or array-like, shape=(n,2), dtype=int
+        Either the CIGAR string or the operations given as
+        ``(operation, length)`` pairs, where the operation is a
+        :class:`CigarOp` (or its integer value).
+    position : int
+        0-based position of the first aligned base in the reference.
+        0-based equivalent to the ``POS`` field in the SAM/BAM file.
+    reference_sequence : Sequence
+        The reference sequence.
+    segment_sequence : Sequence
+        The segment, read or query sequence.
+
+    Returns
+    -------
+    alignment : Alignment
+        The alignment of the reference sequence (first) and the
+        segment sequence (second).
+
+    Raises
+    ------
+    ValueError
+        If the given operation pairs do not have the shape *(n,2)* or
+        if an operation other than match, insertion, deletion, intron
+        or clipping is encountered.
+
+    See Also
+    --------
+    write_alignment_to_cigar
+
+    Notes
+    -----
+    This function expects that the `segment_sequence` was taken from the
+    SAM/BAM file, hence hard-clipped bases are not part of the sequence.
+    Therefore, hard clipped bases are simply ignored in the CIGAR
+    string.
+
+    Examples
+    --------
+
+    >>> ref = NucleotideSequence("TATAAAAGGTTTCCGACCGTAGGTAGCTGA")
+    >>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
+    >>> print(read_alignment_from_cigar("9M2D12M", 3, ref, seg))
+    AAAAGGTTTCCGACCGTAGGTAG
+    CCCCGGTTT--GACCGTATGTAG
+    >>> print(read_alignment_from_cigar("4X5=2D7=1X4=", 3, ref, seg))
+    AAAAGGTTTCCGACCGTAGGTAG
+    CCCCGGTTT--GACCGTATGTAG
+
+    If bases in the segment sequence are soft-clipped, they do not
+    appear in the alignment.
+    Furthermore, the start of the reference sequence must be adapted.
+
+    >>> print(read_alignment_from_cigar("4S5M2D12M", 7, ref, seg))
+    GGTTTCCGACCGTAGGTAG
+    GGTTT--GACCGTATGTAG
+
+    Hard-clipped bases are not part of the segment sequence.
+    Hence `H` operations are completely ignored.
+
+    >>> seg = NucleotideSequence("GGTTTGACCGTATGTAG")
+    >>> print(read_alignment_from_cigar("4H5M2D12M", 7, ref, seg))
+    GGTTTCCGACCGTAGGTAG
+    GGTTT--GACCGTATGTAG
+
+    Reading from BAM codes is also possible:
+
+    >>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
+    >>> op_tuples = [
+    ...     (CigarOp.MATCH, 9),
+    ...     (CigarOp.DELETION, 2),
+    ...     (CigarOp.MATCH, 12)
+    ... ]
+    >>> print(read_alignment_from_cigar(op_tuples, 3, ref, seg))
+    AAAAGGTTTCCGACCGTAGGTAG
+    CCCCGGTTT--GACCGTATGTAG
+    """
+    if isinstance(cigar, str):
+        operations = _op_tuples_from_cigar(cigar)
+    else:
+        operations = np.asarray(cigar, dtype=int)
+        if operations.ndim != 2:
+            raise ValueError("Expected array with shape (n,2)")
+        if operations.shape[1] != 2:
+            raise ValueError("Expected (operation, length) pairs")
+    sequences = [reference_sequence, segment_sequence]
+    if len(operations) == 0:
+        # Without operations there is nothing to align
+        return Alignment(sequences, np.zeros((0, 2), dtype=int))
+    # Reserve one trace row per operation repetition;
+    # rows that belong to clipped bases are discarded at the end
+    trace = np.zeros((np.sum(operations[:, 1]), 2), dtype=int)
+    keep_mask = np.ones(trace.shape[0], dtype=bool)
+    aligned_ops = (CigarOp.MATCH, CigarOp.EQUAL, CigarOp.DIFFERENT)
+    segment_gap_ops = (CigarOp.DELETION, CigarOp.INTRON)
+    row = 0
+    ref_cursor = position
+    seg_cursor = 0
+    for op_code, length in operations:
+        op = CigarOp(op_code)
+        rows = slice(row, row + length)
+        if op in aligned_ops:
+            # Both sequences advance
+            trace[rows, 0] = np.arange(ref_cursor, ref_cursor + length)
+            trace[rows, 1] = np.arange(seg_cursor, seg_cursor + length)
+            ref_cursor += length
+            seg_cursor += length
+        elif op == CigarOp.INSERTION:
+            # Only the segment advances -> gap in the reference
+            trace[rows, 0] = -1
+            trace[rows, 1] = np.arange(seg_cursor, seg_cursor + length)
+            seg_cursor += length
+        elif op in segment_gap_ops:
+            # Only the reference advances -> gap in the segment
+            trace[rows, 0] = np.arange(ref_cursor, ref_cursor + length)
+            trace[rows, 1] = -1
+            ref_cursor += length
+        elif op == CigarOp.SOFT_CLIP:
+            # Soft-clipped bases are part of the segment sequence,
+            # but not of the alignment
+            keep_mask[rows] = False
+            seg_cursor += length
+        elif op == CigarOp.HARD_CLIP:
+            # Hard-clipped bases are not even part of the segment
+            # sequence, so the segment position stays untouched
+            keep_mask[rows] = False
+        else:
+            raise ValueError(
+                f"CIGAR operation {op} is not implemented"
+            )
+        row += length
+    return Alignment(sequences, trace[keep_mask])
+def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
+                             introns=(), distinguish_matches=False,
+                             hard_clip=False, as_string=True):
+    """
+    Encode an :class:`Alignment` as CIGAR operations.
+
+    Parameters
+    ----------
+    alignment : Alignment
+        The alignment to be converted.
+    reference_index : int, optional
+        The index of the reference sequence in the alignment.
+        By default the first sequence is used.
+    segment_index : int, optional
+        The index of the segment, read or query sequence in the
+        alignment.
+        By default the second sequence is used.
+    introns : iterable object of tuple(int, int), optional
+        The introns in the reference sequence.
+        The introns are given as tuples of start and exclusive stop
+        index.
+        Alignment columns, whose reference position lies within such a
+        region, are written as `'N'` instead of `'D'`.
+        By default no introns are assumed.
+    distinguish_matches : bool, optional
+        If true, matches (`'='`) are distinguished from mismatches
+        (`'X'`).
+        Otherwise, matches and mismatches are reflected equally by an
+        `'M'` in the CIGAR string.
+    hard_clip : bool, optional
+        If true, clipped bases are hard-clipped.
+        Otherwise, clipped bases are soft-clipped.
+    as_string : bool, optional
+        If true, the CIGAR string is returned.
+        Otherwise, an array of ``(operation, length)`` pairs is
+        returned.
+
+    Returns
+    -------
+    cigar : str or ndarray, shape=(n,2) dtype=int
+        If `as_string` is true, the CIGAR string is returned.
+        Otherwise, an array is returned, where the first column
+        specifies the :class:`CigarOp` and the second column specifies
+        the number of repetitions of that operation.
+
+    Raises
+    ------
+    ValueError
+        If an alignment column is a gap in both sequences, if an
+        intron is malformed or if an intron covers an alignment column
+        that is not a gap in the segment sequence.
+
+    See Also
+    --------
+    read_alignment_from_cigar
+
+    Examples
+    --------
+
+    >>> ref = NucleotideSequence("TATAAAAGGTTTCCGACCGTAGGTAGCTGA")
+    >>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
+    >>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
+    >>> semiglobal_alignment = align_optimal(
+    ...     ref, seg, matrix, local=False, terminal_penalty=False
+    ... )[0]
+    >>> print(semiglobal_alignment)
+    TATAAAAGGTTTCCGACCGTAGGTAGCTGA
+    ---CCCCGGTTT--GACCGTATGTAG----
+    >>> print(write_alignment_to_cigar(semiglobal_alignment))
+    9M2D12M
+    >>> print(write_alignment_to_cigar(semiglobal_alignment, introns=[(12, 14)]))
+    9M2N12M
+    >>> print(write_alignment_to_cigar(semiglobal_alignment, distinguish_matches=True))
+    4X5=2D7=1X4=
+    >>> local_alignment = align_optimal(ref, seg, matrix, local=True)[0]
+    >>> print(local_alignment)
+    GGTTTCCGACCGTAGGTAG
+    GGTTT--GACCGTATGTAG
+    >>> print(write_alignment_to_cigar(local_alignment, hard_clip=False))
+    4S5M2D12M
+    >>> print(write_alignment_to_cigar(local_alignment, hard_clip=True))
+    4H5M2D12M
+
+    Writing operations as BAM codes is also possible:
+
+    >>> op_tuples = write_alignment_to_cigar(semiglobal_alignment, as_string=False)
+    >>> for op, length in op_tuples:
+    ...     print(CigarOp(op), length)
+    CigarOp.MATCH 9
+    CigarOp.DELETION 2
+    CigarOp.MATCH 12
+    """
+    # Terminal gaps in the segment sequence are not part of the CIGAR:
+    # Restrict the alignment to the columns from the first to the last
+    # aligned segment symbol
+    seg_columns = np.where(alignment.trace[:, segment_index] != -1)[0]
+    first_column, last_column = seg_columns[0], seg_columns[-1]
+    alignment = alignment[first_column : last_column + 1]
+    trace = alignment.trace
+    ref_trace = trace[:, reference_index]
+    seg_trace = trace[:, segment_index]
+    is_insertion = (ref_trace == -1)
+    is_deletion = (seg_trace == -1)
+    if np.any(is_insertion & is_deletion):
+        raise ValueError(
+            "Alignment contains insertion and deletion at the same position"
+        )
+    # Every column is a match, unless one of the sequences has a gap
+    operations = np.full(trace.shape[0], CigarOp.MATCH, dtype=int)
+    operations[is_insertion] = CigarOp.INSERTION
+    operations[is_deletion] = CigarOp.DELETION
+    if introns is not None:
+        is_intron = np.zeros(operations.shape[0], dtype=bool)
+        for start, stop in introns:
+            if start >= stop:
+                raise ValueError(
+                    "Intron start must be smaller than intron stop"
+                )
+            if start < 0:
+                raise ValueError(
+                    "Intron start must not be negative"
+                )
+            is_intron |= (ref_trace >= start) & (ref_trace < stop)
+        # Only columns already classified as deletion may be introns
+        if np.any(is_intron & ~is_deletion):
+            raise ValueError(
+                "Introns must be within gaps in the reference sequence"
+            )
+        operations[is_intron] = CigarOp.INTRON
+    if distinguish_matches:
+        codes = get_codes(alignment)
+        is_identical = (
+            codes[reference_index, :] == codes[segment_index, :]
+        )
+        # Determine the match columns before any of them is relabeled
+        is_match = (operations == CigarOp.MATCH)
+        operations[is_match & is_identical] = CigarOp.EQUAL
+        operations[is_match & ~is_identical] = CigarOp.DIFFERENT
+    # Segment bases before the first and after the last aligned base
+    # are interpreted as clipped
+    clip_op = CigarOp.HARD_CLIP if hard_clip else CigarOp.SOFT_CLIP
+    no_clip = np.zeros((0, 2), dtype=int)
+    n_clipped_start = seg_trace[0]
+    n_clipped_end = (
+        len(alignment.sequences[segment_index]) - seg_trace[-1] - 1
+    )
+    start_clip = (
+        [(clip_op, n_clipped_start)] if n_clipped_start != 0 else no_clip
+    )
+    end_clip = (
+        [(clip_op, n_clipped_end)] if n_clipped_end != 0 else no_clip
+    )
+    op_tuples = np.concatenate(
+        (start_clip, _aggregate_consecutive(operations), end_clip)
+    )
+    if not as_string:
+        return op_tuples
+    return _cigar_from_op_tuples(op_tuples)
+def _aggregate_consecutive(operations):
+    """
+    Run-length encode an array of operations.
+
+    Each run of identical consecutive operations is merged into one
+    ``(operation, length)`` row of the returned *(n,2)* array.
+    """
+    # Indices where an operation differs from its predecessor,
+    # i.e. where a new run begins
+    change_points = np.flatnonzero(operations[1:] != operations[:-1]) + 1
+    # The first run always begins at index 0...
+    run_starts = np.concatenate(([0], change_points))
+    # ...and each run ends where the next one begins
+    run_stops = np.concatenate((change_points, [len(operations)]))
+    return np.stack(
+        (operations[run_starts], run_stops - run_starts), axis=-1
+    )
+def _cigar_from_op_tuples(op_tuples):
+    """
+    Assemble a CIGAR string from ``(operation, length)`` pairs.
+
+    The first element of each pair is the integer value of a
+    :class:`CigarOp`, the second one is the number of repetitions of
+    that operation.
+    """
+    return "".join(
+        str(count) + CigarOp(op).to_cigar_symbol()
+        for op, count in op_tuples
+    )
+def _op_tuples_from_cigar(cigar):
+    """
+    Parse a CIGAR string into ``(operation, length)`` pairs.
+
+    Parameters
+    ----------
+    cigar : str
+        The CIGAR string, i.e. a series of lengths, each one followed
+        by an operation symbol.
+
+    Returns
+    -------
+    op_tuples : ndarray, shape=(n,2), dtype=int
+        The first column contains the integer value of the
+        :class:`CigarOp`, the second column contains the length.
+        For an empty string the array has the shape *(0,2)*.
+
+    Raises
+    ------
+    KeyError
+        If the string contains an unknown operation symbol.
+    ValueError
+        If an operation symbol is not preceded by a length or if the
+        string ends with a length that has no operation symbol.
+    """
+    op_tuples = []
+    digits = []
+    for char in cigar:
+        if char.isdigit():
+            digits.append(char)
+            continue
+        # Every non-digit character must be an operation symbol
+        op = CigarOp.from_cigar_symbol(char)
+        if len(digits) == 0:
+            raise ValueError(
+                f"CIGAR operation '{char}' is not preceded by a length"
+            )
+        op_tuples.append((op, int("".join(digits))))
+        digits = []
+    if len(digits) != 0:
+        # Do not silently drop a dangling length
+        raise ValueError(
+            "CIGAR string ends with a length without an operation"
+        )
+    # The reshape keeps the (n,2) shape, even if there are no operations
+    return np.array(op_tuples, dtype=int).reshape(-1, 2)