PyPI - biotite - Versions diffs - 1.1.0__cp313-cp313-macosx_10_13_x86_64.whl - Mend

biotite 1.1.0__cp313-cp313-macosx_10_13_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biotite might be problematic. Click here for more details.

Files changed (332) hide show

biotite/__init__.py +18 -0
biotite/application/__init__.py +69 -0
biotite/application/application.py +276 -0
biotite/application/autodock/__init__.py +12 -0
biotite/application/autodock/app.py +500 -0
biotite/application/blast/__init__.py +14 -0
biotite/application/blast/alignment.py +92 -0
biotite/application/blast/webapp.py +428 -0
biotite/application/clustalo/__init__.py +12 -0
biotite/application/clustalo/app.py +223 -0
biotite/application/dssp/__init__.py +12 -0
biotite/application/dssp/app.py +159 -0
biotite/application/localapp.py +342 -0
biotite/application/mafft/__init__.py +12 -0
biotite/application/mafft/app.py +116 -0
biotite/application/msaapp.py +363 -0
biotite/application/muscle/__init__.py +13 -0
biotite/application/muscle/app3.py +227 -0
biotite/application/muscle/app5.py +163 -0
biotite/application/sra/__init__.py +18 -0
biotite/application/sra/app.py +452 -0
biotite/application/tantan/__init__.py +12 -0
biotite/application/tantan/app.py +199 -0
biotite/application/util.py +57 -0
biotite/application/viennarna/__init__.py +18 -0
biotite/application/viennarna/rnaalifold.py +310 -0
biotite/application/viennarna/rnafold.py +254 -0
biotite/application/viennarna/rnaplot.py +206 -0
biotite/application/viennarna/util.py +77 -0
biotite/application/webapp.py +76 -0
biotite/copyable.py +71 -0
biotite/database/__init__.py +23 -0
biotite/database/entrez/__init__.py +15 -0
biotite/database/entrez/check.py +60 -0
biotite/database/entrez/dbnames.py +91 -0
biotite/database/entrez/download.py +229 -0
biotite/database/entrez/key.py +44 -0
biotite/database/entrez/query.py +262 -0
biotite/database/error.py +16 -0
biotite/database/pubchem/__init__.py +21 -0
biotite/database/pubchem/download.py +258 -0
biotite/database/pubchem/error.py +20 -0
biotite/database/pubchem/query.py +830 -0
biotite/database/pubchem/throttle.py +98 -0
biotite/database/rcsb/__init__.py +13 -0
biotite/database/rcsb/download.py +159 -0
biotite/database/rcsb/query.py +964 -0
biotite/database/uniprot/__init__.py +13 -0
biotite/database/uniprot/check.py +40 -0
biotite/database/uniprot/download.py +129 -0
biotite/database/uniprot/query.py +293 -0
biotite/file.py +232 -0
biotite/sequence/__init__.py +84 -0
biotite/sequence/align/__init__.py +203 -0
biotite/sequence/align/alignment.py +680 -0
biotite/sequence/align/banded.cpython-313-darwin.so +0 -0
biotite/sequence/align/banded.pyx +652 -0
biotite/sequence/align/buckets.py +71 -0
biotite/sequence/align/cigar.py +425 -0
biotite/sequence/align/kmeralphabet.cpython-313-darwin.so +0 -0
biotite/sequence/align/kmeralphabet.pyx +595 -0
biotite/sequence/align/kmersimilarity.cpython-313-darwin.so +0 -0
biotite/sequence/align/kmersimilarity.pyx +233 -0
biotite/sequence/align/kmertable.cpython-313-darwin.so +0 -0
biotite/sequence/align/kmertable.pyx +3411 -0
biotite/sequence/align/localgapped.cpython-313-darwin.so +0 -0
biotite/sequence/align/localgapped.pyx +892 -0
biotite/sequence/align/localungapped.cpython-313-darwin.so +0 -0
biotite/sequence/align/localungapped.pyx +279 -0
biotite/sequence/align/matrix.py +622 -0
biotite/sequence/align/matrix_data/3Di.mat +24 -0
biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
biotite/sequence/align/matrix_data/GONNET.mat +26 -0
biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
biotite/sequence/align/matrix_data/MATCH.mat +25 -0
biotite/sequence/align/matrix_data/NUC.mat +25 -0
biotite/sequence/align/matrix_data/PAM10.mat +34 -0
biotite/sequence/align/matrix_data/PAM100.mat +34 -0
biotite/sequence/align/matrix_data/PAM110.mat +34 -0
biotite/sequence/align/matrix_data/PAM120.mat +34 -0
biotite/sequence/align/matrix_data/PAM130.mat +34 -0
biotite/sequence/align/matrix_data/PAM140.mat +34 -0
biotite/sequence/align/matrix_data/PAM150.mat +34 -0
biotite/sequence/align/matrix_data/PAM160.mat +34 -0
biotite/sequence/align/matrix_data/PAM170.mat +34 -0
biotite/sequence/align/matrix_data/PAM180.mat +34 -0
biotite/sequence/align/matrix_data/PAM190.mat +34 -0
biotite/sequence/align/matrix_data/PAM20.mat +34 -0
biotite/sequence/align/matrix_data/PAM200.mat +34 -0
biotite/sequence/align/matrix_data/PAM210.mat +34 -0
biotite/sequence/align/matrix_data/PAM220.mat +34 -0
biotite/sequence/align/matrix_data/PAM230.mat +34 -0
biotite/sequence/align/matrix_data/PAM240.mat +34 -0
biotite/sequence/align/matrix_data/PAM250.mat +34 -0
biotite/sequence/align/matrix_data/PAM260.mat +34 -0
biotite/sequence/align/matrix_data/PAM270.mat +34 -0
biotite/sequence/align/matrix_data/PAM280.mat +34 -0
biotite/sequence/align/matrix_data/PAM290.mat +34 -0
biotite/sequence/align/matrix_data/PAM30.mat +34 -0
biotite/sequence/align/matrix_data/PAM300.mat +34 -0
biotite/sequence/align/matrix_data/PAM310.mat +34 -0
biotite/sequence/align/matrix_data/PAM320.mat +34 -0
biotite/sequence/align/matrix_data/PAM330.mat +34 -0
biotite/sequence/align/matrix_data/PAM340.mat +34 -0
biotite/sequence/align/matrix_data/PAM350.mat +34 -0
biotite/sequence/align/matrix_data/PAM360.mat +34 -0
biotite/sequence/align/matrix_data/PAM370.mat +34 -0
biotite/sequence/align/matrix_data/PAM380.mat +34 -0
biotite/sequence/align/matrix_data/PAM390.mat +34 -0
biotite/sequence/align/matrix_data/PAM40.mat +34 -0
biotite/sequence/align/matrix_data/PAM400.mat +34 -0
biotite/sequence/align/matrix_data/PAM410.mat +34 -0
biotite/sequence/align/matrix_data/PAM420.mat +34 -0
biotite/sequence/align/matrix_data/PAM430.mat +34 -0
biotite/sequence/align/matrix_data/PAM440.mat +34 -0
biotite/sequence/align/matrix_data/PAM450.mat +34 -0
biotite/sequence/align/matrix_data/PAM460.mat +34 -0
biotite/sequence/align/matrix_data/PAM470.mat +34 -0
biotite/sequence/align/matrix_data/PAM480.mat +34 -0
biotite/sequence/align/matrix_data/PAM490.mat +34 -0
biotite/sequence/align/matrix_data/PAM50.mat +34 -0
biotite/sequence/align/matrix_data/PAM500.mat +34 -0
biotite/sequence/align/matrix_data/PAM60.mat +34 -0
biotite/sequence/align/matrix_data/PAM70.mat +34 -0
biotite/sequence/align/matrix_data/PAM80.mat +34 -0
biotite/sequence/align/matrix_data/PAM90.mat +34 -0
biotite/sequence/align/matrix_data/PB.license +21 -0
biotite/sequence/align/matrix_data/PB.mat +18 -0
biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
biotite/sequence/align/multiple.cpython-313-darwin.so +0 -0
biotite/sequence/align/multiple.pyx +620 -0
biotite/sequence/align/pairwise.cpython-313-darwin.so +0 -0
biotite/sequence/align/pairwise.pyx +587 -0
biotite/sequence/align/permutation.cpython-313-darwin.so +0 -0
biotite/sequence/align/permutation.pyx +313 -0
biotite/sequence/align/primes.txt +821 -0
biotite/sequence/align/selector.cpython-313-darwin.so +0 -0
biotite/sequence/align/selector.pyx +954 -0
biotite/sequence/align/statistics.py +264 -0
biotite/sequence/align/tracetable.cpython-313-darwin.so +0 -0
biotite/sequence/align/tracetable.pxd +64 -0
biotite/sequence/align/tracetable.pyx +370 -0
biotite/sequence/alphabet.py +555 -0
biotite/sequence/annotation.py +830 -0
biotite/sequence/codec.cpython-313-darwin.so +0 -0
biotite/sequence/codec.pyx +155 -0
biotite/sequence/codon.py +477 -0
biotite/sequence/codon_tables.txt +202 -0
biotite/sequence/graphics/__init__.py +33 -0
biotite/sequence/graphics/alignment.py +1115 -0
biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
biotite/sequence/graphics/color_schemes/autumn.json +51 -0
biotite/sequence/graphics/color_schemes/blossom.json +51 -0
biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
biotite/sequence/graphics/color_schemes/flower.json +51 -0
biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
biotite/sequence/graphics/color_schemes/ocean.json +51 -0
biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
biotite/sequence/graphics/color_schemes/spring.json +51 -0
biotite/sequence/graphics/color_schemes/sunset.json +51 -0
biotite/sequence/graphics/color_schemes/wither.json +51 -0
biotite/sequence/graphics/colorschemes.py +170 -0
biotite/sequence/graphics/dendrogram.py +229 -0
biotite/sequence/graphics/features.py +544 -0
biotite/sequence/graphics/logo.py +104 -0
biotite/sequence/graphics/plasmid.py +712 -0
biotite/sequence/io/__init__.py +12 -0
biotite/sequence/io/fasta/__init__.py +22 -0
biotite/sequence/io/fasta/convert.py +284 -0
biotite/sequence/io/fasta/file.py +265 -0
biotite/sequence/io/fastq/__init__.py +19 -0
biotite/sequence/io/fastq/convert.py +117 -0
biotite/sequence/io/fastq/file.py +507 -0
biotite/sequence/io/genbank/__init__.py +17 -0
biotite/sequence/io/genbank/annotation.py +269 -0
biotite/sequence/io/genbank/file.py +573 -0
biotite/sequence/io/genbank/metadata.py +336 -0
biotite/sequence/io/genbank/sequence.py +171 -0
biotite/sequence/io/general.py +201 -0
biotite/sequence/io/gff/__init__.py +26 -0
biotite/sequence/io/gff/convert.py +128 -0
biotite/sequence/io/gff/file.py +450 -0
biotite/sequence/phylo/__init__.py +36 -0
biotite/sequence/phylo/nj.cpython-313-darwin.so +0 -0
biotite/sequence/phylo/nj.pyx +221 -0
biotite/sequence/phylo/tree.cpython-313-darwin.so +0 -0
biotite/sequence/phylo/tree.pyx +1169 -0
biotite/sequence/phylo/upgma.cpython-313-darwin.so +0 -0
biotite/sequence/phylo/upgma.pyx +164 -0
biotite/sequence/profile.py +567 -0
biotite/sequence/search.py +118 -0
biotite/sequence/seqtypes.py +713 -0
biotite/sequence/sequence.py +374 -0
biotite/setup_ccd.py +197 -0
biotite/structure/__init__.py +133 -0
biotite/structure/alphabet/__init__.py +25 -0
biotite/structure/alphabet/encoder.py +332 -0
biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
biotite/structure/alphabet/i3d.py +110 -0
biotite/structure/alphabet/layers.py +86 -0
biotite/structure/alphabet/pb.license +21 -0
biotite/structure/alphabet/pb.py +171 -0
biotite/structure/alphabet/unkerasify.py +122 -0
biotite/structure/atoms.py +1554 -0
biotite/structure/basepairs.py +1404 -0
biotite/structure/bonds.cpython-313-darwin.so +0 -0
biotite/structure/bonds.pyx +1972 -0
biotite/structure/box.py +588 -0
biotite/structure/celllist.cpython-313-darwin.so +0 -0
biotite/structure/celllist.pyx +849 -0
biotite/structure/chains.py +314 -0
biotite/structure/charges.cpython-313-darwin.so +0 -0
biotite/structure/charges.pyx +520 -0
biotite/structure/compare.py +274 -0
biotite/structure/density.py +109 -0
biotite/structure/dotbracket.py +214 -0
biotite/structure/error.py +39 -0
biotite/structure/filter.py +590 -0
biotite/structure/geometry.py +655 -0
biotite/structure/graphics/__init__.py +13 -0
biotite/structure/graphics/atoms.py +243 -0
biotite/structure/graphics/rna.py +295 -0
biotite/structure/hbond.py +428 -0
biotite/structure/info/__init__.py +24 -0
biotite/structure/info/atom_masses.json +121 -0
biotite/structure/info/atoms.py +81 -0
biotite/structure/info/bonds.py +149 -0
biotite/structure/info/ccd.py +202 -0
biotite/structure/info/components.bcif +0 -0
biotite/structure/info/groups.py +131 -0
biotite/structure/info/masses.py +121 -0
biotite/structure/info/misc.py +138 -0
biotite/structure/info/radii.py +197 -0
biotite/structure/info/standardize.py +186 -0
biotite/structure/integrity.py +215 -0
biotite/structure/io/__init__.py +29 -0
biotite/structure/io/dcd/__init__.py +13 -0
biotite/structure/io/dcd/file.py +67 -0
biotite/structure/io/general.py +243 -0
biotite/structure/io/gro/__init__.py +14 -0
biotite/structure/io/gro/file.py +344 -0
biotite/structure/io/mol/__init__.py +20 -0
biotite/structure/io/mol/convert.py +112 -0
biotite/structure/io/mol/ctab.py +415 -0
biotite/structure/io/mol/header.py +120 -0
biotite/structure/io/mol/mol.py +149 -0
biotite/structure/io/mol/sdf.py +914 -0
biotite/structure/io/netcdf/__init__.py +13 -0
biotite/structure/io/netcdf/file.py +64 -0
biotite/structure/io/pdb/__init__.py +20 -0
biotite/structure/io/pdb/convert.py +307 -0
biotite/structure/io/pdb/file.py +1290 -0
biotite/structure/io/pdb/hybrid36.cpython-313-darwin.so +0 -0
biotite/structure/io/pdb/hybrid36.pyx +242 -0
biotite/structure/io/pdbqt/__init__.py +15 -0
biotite/structure/io/pdbqt/convert.py +113 -0
biotite/structure/io/pdbqt/file.py +688 -0
biotite/structure/io/pdbx/__init__.py +23 -0
biotite/structure/io/pdbx/bcif.py +656 -0
biotite/structure/io/pdbx/cif.py +1075 -0
biotite/structure/io/pdbx/component.py +245 -0
biotite/structure/io/pdbx/compress.py +321 -0
biotite/structure/io/pdbx/convert.py +1745 -0
biotite/structure/io/pdbx/encoding.cpython-313-darwin.so +0 -0
biotite/structure/io/pdbx/encoding.pyx +1031 -0
biotite/structure/io/trajfile.py +693 -0
biotite/structure/io/trr/__init__.py +13 -0
biotite/structure/io/trr/file.py +43 -0
biotite/structure/io/xtc/__init__.py +13 -0
biotite/structure/io/xtc/file.py +43 -0
biotite/structure/mechanics.py +73 -0
biotite/structure/molecules.py +352 -0
biotite/structure/pseudoknots.py +628 -0
biotite/structure/rdf.py +245 -0
biotite/structure/repair.py +304 -0
biotite/structure/residues.py +572 -0
biotite/structure/sasa.cpython-313-darwin.so +0 -0
biotite/structure/sasa.pyx +322 -0
biotite/structure/segments.py +178 -0
biotite/structure/sequence.py +111 -0
biotite/structure/sse.py +308 -0
biotite/structure/superimpose.py +689 -0
biotite/structure/transform.py +530 -0
biotite/structure/util.py +168 -0
biotite/version.py +16 -0
biotite/visualize.py +265 -0
biotite-1.1.0.dist-info/METADATA +190 -0
biotite-1.1.0.dist-info/RECORD +332 -0
biotite-1.1.0.dist-info/WHEEL +4 -0
biotite-1.1.0.dist-info/licenses/LICENSE.rst +30 -0

biotite/sequence/profile.py ADDED Viewed

@@ -0,0 +1,567 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+import warnings
+from numbers import Integral
+import numpy as np
+from biotite.sequence.align.alignment import get_codes
+from biotite.sequence.alphabet import LetterAlphabet
+from biotite.sequence.seqtypes import (
+    GeneralSequence,
+    NucleotideSequence,
+    ProteinSequence,
+)
+__name__ = "biotite.sequence"
+__author__ = "Maximilian Greil"
+__all__ = ["SequenceProfile"]
+# Abbreviations
+_NUC_DNA_ALPH = NucleotideSequence.alphabet_unamb
+_NUC_RNA_ALPH = LetterAlphabet(["A", "C", "G", "U"])
+_PROT_ALPH = ProteinSequence.alphabet
+def _determine_common_alphabet(alphabets):
+    """
+    Determine the common alphabet from a list of alphabets, that
+    extends all alphabets.
+    """
+    common_alphabet = alphabets[0]
+    for alphabet in alphabets[1:]:
+        if not common_alphabet.extends(alphabet):
+            if alphabet.extends(common_alphabet):
+                common_alphabet = alphabet
+            else:
+                raise ValueError(
+                    "There is no common alphabet that extends all alphabets"
+                )
+    return common_alphabet
+def _codes_to_iupac(frequency, codes, maxes, row):
+    """
+    Returns IUPAC code for a row of 'symbols' with none, one or
+    multiple maximum positions.
+    """
+    if np.sum(frequency) == 0:
+        raise ValueError(
+            f"There is an empty column in the 'symbols' frequency table. "
+            f"This doesn't make sense in context of an alignment. "
+            f"Please check the 'symbols' frequency table in row {row}."
+        )
+    key = tuple(np.where(frequency == maxes)[0])
+    return codes[key]
+class SequenceProfile(object):
+    """
+    A :class:`SequenceProfile` object stores information about a
+    sequence profile of aligned sequences.
+    It is possible to calculate and return its consensus sequence.
+    This class saves the position frequency matrix
+    (position count matrix) 'symbols' of the occurrences of each
+    alphabet symbol at each position.
+    It also saves the number of gaps at each position in the array
+    'gaps'.
+    With :meth:`from_alignment()` a :class:`SequenceProfile` object can
+    be created from an indefinite number of aligned sequences.
+    With :meth:`probability_matrix()` the position probability matrix
+    can be created based on 'symbols' and a pseudocount.
+    With :meth:`log_odds_matrix()` the position weight matrix can
+    be created based on the before calculated position probability
+    matrix and the background frequencies.
+    With :meth:`sequence_probability()` the probability of a
+    sequence can be calculated based on the before calculated position
+    probability matrix of this instance of object SequenceProfile.
+    With :meth:`sequence_score_from_matrix()` the score of a sequence
+    can be calculated based on the before calculated position weight
+    matrix of this instance of object SequenceProfile.
+    All attributes of this class are publicly accessible.
+    Parameters
+    ----------
+    symbols : ndarray, dtype=int, shape=(n,k)
+        This matrix simply saves for each position how often absolutely
+        each symbol is present.
+    gaps : ndarray, dtype=int, shape=n
+        Array which indicates the number of gaps at each position.
+    alphabet : Alphabet, length=k
+        Alphabet of sequences of sequence profile
+    Attributes
+    ----------
+    symbols : ndarray, dtype=int, shape=(n,k)
+        This matrix simply saves for each position how often absolutely
+        each symbol is present.
+    gaps : ndarray, dtype=int, shape=n
+        Array which indicates the number of gaps at each position.
+    alphabet : Alphabet, length=k
+        Alphabet of sequences of sequence profile
+    Examples
+    --------
+    Create a profile from a multiple sequence alignment:
+    >>> sequences = [
+    ...     NucleotideSequence("CGCTCATTC"),
+    ...     NucleotideSequence("CGCTATTC"),
+    ...     NucleotideSequence("CCCTCAATC"),
+    ... ]
+    >>> msa, _, _, _ = align_multiple(
+    ...     sequences, SubstitutionMatrix.std_nucleotide_matrix(), gap_penalty=-5
+    ... )
+    >>> print(msa)
+    CGCTCATTC
+    CGCT-ATTC
+    CCCTCAATC
+    >>> profile = SequenceProfile.from_alignment(msa)
+    >>> print(profile)
+      A C G T
+    0 0 3 0 0
+    1 0 1 2 0
+    2 0 3 0 0
+    3 0 0 0 3
+    4 0 2 0 0
+    5 3 0 0 0
+    6 1 0 0 2
+    7 0 0 0 3
+    8 0 3 0 0
+    >>> print(profile.gaps)
+    [0 0 0 0 1 0 0 0 0]
+    Slice the profile (masks and index arrays are also supported):
+    >>> print(profile[2:])
+      A C G T
+    0 0 3 0 0
+    1 0 0 0 3
+    2 0 2 0 0
+    3 3 0 0 0
+    4 1 0 0 2
+    5 0 0 0 3
+    6 0 3 0 0
+    Use the profile to compute the position probability matrix:
+    >>> print(profile.probability_matrix())
+    [[0.000 1.000 0.000 0.000]
+     [0.000 0.333 0.667 0.000]
+     [0.000 1.000 0.000 0.000]
+     [0.000 0.000 0.000 1.000]
+     [0.000 1.000 0.000 0.000]
+     [1.000 0.000 0.000 0.000]
+     [0.333 0.000 0.000 0.667]
+     [0.000 0.000 0.000 1.000]
+     [0.000 1.000 0.000 0.000]]
+    """
+    def __init__(self, symbols, gaps, alphabet):
+        self._symbols = symbols
+        self._gaps = gaps
+        self._alphabet = alphabet
+        if len(alphabet) != symbols.shape[1]:
+            raise ValueError(
+                f"The given alphabet doesn't have the same length "
+                f"({len(alphabet)}) as the number of columns "
+                f"({symbols.shape[1]}) in the 'symbols' frequency table."
+            )
+        if gaps.shape[0] != symbols.shape[0]:
+            raise ValueError(
+                f"The given 'gaps' position matrix doesn't have the same "
+                f"length ({gaps.shape[0]}) as the 'symbols' "
+                f"frequency table ({symbols.shape[0]})"
+            )
+    @property
+    def symbols(self):
+        return self._symbols
+    @property
+    def gaps(self):
+        return self._gaps
+    @property
+    def alphabet(self):
+        return self._alphabet
+    @symbols.setter
+    def symbols(self, new_symbols):
+        if not new_symbols.shape == self.symbols.shape:
+            raise ValueError(
+                f"New ndarray 'symbols' must be of same shape "
+                f"{self.symbols.shape} as the old one"
+            )
+        self._symbols = new_symbols
+    @gaps.setter
+    def gaps(self, new_gaps):
+        if not new_gaps.shape == self.gaps.shape:
+            raise ValueError(
+                f"New ndarray 'gaps' must be of same shape "
+                f"{self.gaps.shape} as the old one"
+            )
+        self._gaps = new_gaps
+    def __str__(self):
+        # Add an additional row and column for the position and symbol indicators
+        print_matrix = np.full(
+            (self.symbols.shape[0] + 1, self.symbols.shape[1] + 1), "", dtype=object
+        )
+        print_matrix[1:, 1:] = self.symbols.astype(str)
+        print_matrix[0, 1:] = [str(sym) for sym in self.alphabet]
+        print_matrix[1:, 0] = [str(i) for i in range(self.symbols.shape[0])]
+        max_len = len(max(print_matrix.flatten(), key=len))
+        return "\n".join(
+            [
+                " ".join([str(cell).rjust(max_len) for cell in row])
+                for row in print_matrix
+            ]
+        )
+    def __repr__(self):
+        return (
+            f"SequenceProfile(np.{np.array_repr(self.symbols)}, "
+            f"np.{np.array_repr(self.gaps)}, Alphabet({self.alphabet}))"
+        )
+    def __eq__(self, item):
+        if not isinstance(item, SequenceProfile):
+            return False
+        if not np.array_equal(self.symbols, item.symbols):
+            return False
+        if not np.array_equal(self.gaps, item.gaps):
+            return False
+        if not self.alphabet == item.alphabet:
+            return False
+        return True
+    @staticmethod
+    def from_alignment(alignment, alphabet=None):
+        """
+        Get an object of :class:`SequenceProfile` from an object of
+        :class:`Alignment`.
+        Based on the sequences of the alignment, the SequenceProfile
+        parameters symbols and gaps are calculated.
+        Parameters
+        ----------
+        alignment : Alignment
+            An Alignment object to create the SequenceProfile object
+            from.
+        alphabet : bool
+            This alphabet will be used when creating the SequenceProfile
+            object. If no alphabet is selected, the alphabet for this
+            SequenceProfile
+            object will be calculated from the sequences of object
+            Alignment.
+            (Default: None).
+        Returns
+        -------
+        profile: SequenceProfile
+            The created SequenceProfile object
+        """
+        sequences = get_codes(alignment)
+        if alphabet is None:
+            alphabet = _determine_common_alphabet(
+                [seq.alphabet for seq in alignment.sequences]
+            )
+        else:
+            for alph in (seq.alphabet for seq in alignment.sequences):
+                if not alphabet.extends(alph):
+                    raise ValueError(
+                        "The given alphabet is incompatible with a least one "
+                        "alphabet of the given sequences"
+                    )
+        symbols = np.zeros((len(sequences[0]), len(alphabet)), dtype=int)
+        gaps = np.zeros(len(sequences[0]), dtype=int)
+        sequences = np.transpose(sequences)
+        for i in range(len(sequences)):
+            row = np.where(sequences[i,] == -1, len(alphabet), sequences[i,])
+            count = np.bincount(row, minlength=len(alphabet) + 1)
+            symbols[i,] = count[0 : len(alphabet)]
+            gaps[i] = count[-1]
+        return SequenceProfile(symbols, gaps, alphabet)
+    def to_consensus(self, as_general=False):
+        """
+        Get the consensus sequence for this SequenceProfile object.
+        Parameters
+        ----------
+        as_general : bool
+            If true, returns consensus sequence as GeneralSequence
+            object.
+            Otherwise, the consensus sequence object type is chosen
+            based on the alphabet of this SequenceProfile object
+            (Default: False).
+        Returns
+        -------
+        consensus: Sequence
+            The calculated consensus sequence
+        """
+        # https://en.wikipedia.org/wiki/International_Union_of_Pure_and_Applied_Chemistry#Amino_acid_and_nucleotide_base_codes
+        if as_general:
+            return self._general_to_consensus()
+        elif self.alphabet == _NUC_DNA_ALPH:
+            return NucleotideSequence(self._dna_to_consensus())
+        elif self.alphabet == _NUC_RNA_ALPH:
+            return NucleotideSequence(self._rna_to_consensus())
+        elif self.alphabet == _PROT_ALPH:
+            return self._prot_to_consensus()
+        return self._general_to_consensus()
+    def _dna_to_consensus(self):
+        codes = {
+            (0,): "A",
+            (1,): "C",
+            (2,): "G",
+            (3,): "T",
+            (0, 2): "R",
+            (1, 3): "Y",
+            (1, 2): "S",
+            (0, 3): "W",
+            (2, 3): "K",
+            (0, 1): "M",
+            (1, 2, 3): "B",
+            (0, 2, 3): "D",
+            (0, 1, 3): "H",
+            (0, 1, 2): "V",
+            (0, 1, 2, 3): "N",
+        }
+        consensus = ""
+        maxes = np.max(self.symbols, axis=1)
+        for i in range(len(self.symbols)):
+            consensus += _codes_to_iupac(self.symbols[i, :], codes, maxes[i], i)
+        return consensus
+    def _rna_to_consensus(self):
+        codes = {
+            (0,): "A",
+            (1,): "C",
+            (2,): "G",
+            (3,): "U",
+            (0, 2): "R",
+            (1, 3): "Y",
+            (1, 2): "S",
+            (0, 3): "W",
+            (2, 3): "K",
+            (0, 1): "M",
+            (1, 2, 3): "B",
+            (0, 2, 3): "D",
+            (0, 1, 3): "H",
+            (0, 1, 2): "V",
+            (0, 1, 2, 3): "N",
+        }
+        consensus = ""
+        maxes = np.max(self.symbols, axis=1)
+        for i in range(len(self.symbols)):
+            consensus += _codes_to_iupac(self.symbols[i, :], codes, maxes[i], i)
+        return consensus
+    def _prot_to_consensus(self):
+        """
+        Build the consensus for a profile over the protein alphabet.
+        For each position the symbol code with the highest count is
+        chosen.
+        In case there is more than one symbol with the same maximal
+        occurrences, the one with the lowest symbol code is taken
+        (``numpy.argmax()`` returns the first maximum).
+        Positions without any counted symbol get the symbol code
+        ``23``.
+        Returns
+        -------
+        consensus : ProteinSequence
+            The consensus sequence.
+        """
+        consensus = ProteinSequence()
+        # First pass: most frequent symbol code per position
+        consensus.code = np.argmax(self.symbols, axis=1)
+        # Second pass: overwrite positions whose counts are all zero
+        consensus.code = np.where(
+            np.sum(self.symbols, axis=1) == 0, 23, consensus.code
+        )  # _PROT_ALPH[23] = 'X'
+        return consensus
+    def _general_to_consensus(self):
+        """
+        Build the consensus as :class:`GeneralSequence` over the
+        alphabet of this profile.
+        For each position the symbol code with the highest count is
+        chosen.
+        In case there is more than one symbol with the same maximal
+        occurrences, the one with the lowest symbol code is taken
+        (``numpy.argmax()`` returns the first maximum).
+        This also applies, if the sum of occurrences of all symbols
+        at a position is zero: then the symbol code ``0`` is taken.
+        Returns
+        -------
+        consensus : GeneralSequence
+            The consensus sequence.
+        """
+        consensus = GeneralSequence(self.alphabet)
+        # argmax over the symbol axis yields one symbol code per position
+        consensus.code = np.argmax(self.symbols, axis=1)
+        return consensus
+    def probability_matrix(self, pseudocount=0):
+        r"""
+        Calculate the position probability matrix (PPM) based on
+        'symbols' and the given pseudocount.
+        This new matrix has the same shape as 'symbols'.
+        .. math::
+            P(S) = \frac {C_S + \frac{c_p}{k}} {\sum_{i} C_i + c_p}
+        :math:`S`: The symbol.
+        :math:`C_S`: The count of symbol :math:`S` at the sequence
+        position.
+        :math:`c_p`: The pseudocount.
+        :math:`k`: Length of the alphabet.
+        Parameters
+        ----------
+        pseudocount: int, optional
+            Amount added to the number of observed cases in order to
+            change the expected probability of the PPM.
+            (Default: 0)
+        Returns
+        -------
+        ppm: ndarray, dtype=float, shape=(n,k)
+            The calculated the position probability matrix.
+        """
+        if pseudocount < 0:
+            raise ValueError("Pseudocount can not be smaller than zero.")
+        return (self.symbols + pseudocount / self.symbols.shape[1]) / (
+            np.sum(self.symbols, axis=1)[:, np.newaxis] + pseudocount
+        )
+    def log_odds_matrix(self, background_frequencies=None, pseudocount=0):
+        r"""
+        Calculate the position weight matrix (PWM) based on the
+        position probability matrix (PPM) (with given pseudocount) and
+        background_frequencies.
+        This new matrix has the same shape as 'symbols'.
+        .. math::
+            W(S) = \log_2 \left( \frac{P(S)}{B_S} \right)
+        :math:`S`: The symbol.
+        :math:`P(S)`: The probability of symbol :math:`S` at the
+        sequence position.
+        :math:`c_p`: The background frequency of symbol :math:`S`.
+        Parameters
+        ----------
+        pseudocount: int, optional
+            Amount added to the number of observed cases in order to change
+            the expected probability of the PPM.
+            (Default: 0)
+        background_frequencies: ndarray, shape=(k,), dtype=float, optional
+            The background frequencies for each symbol in the alphabet.
+            By default, a uniform distribution is assumed.
+        Returns
+        -------
+        pwm: ndarray, dtype=float, shape=(n,k)
+            The calculated the position weight matrix.
+        """
+        if background_frequencies is None:
+            background_frequencies = 1 / len(self.alphabet)
+        ppm = self.probability_matrix(pseudocount=pseudocount)
+        # Catch warning that appears, if a symbol is missing at any
+        # position in the profile
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", category=RuntimeWarning)
+            return np.log2(ppm / background_frequencies)
+    def sequence_probability(self, sequence, pseudocount=0):
+        r"""
+        Calculate probability of a sequence based on the
+        position probability matrix (PPM).
+        The sequence probability is the product of the probability of
+        the respective symbol over all sequence positions.
+        Parameters
+        ----------
+        sequence : Sequence
+           The input sequence.
+        pseudocount: int, optional
+            Amount added to the number of observed cases in order to change
+            the expected probability of the PPM.
+            (Default: 0)
+        Returns
+        -------
+        probability: float
+           The calculated probability for the input sequence based on
+           the PPM.
+        """
+        ppm = self.probability_matrix(pseudocount=pseudocount)
+        if len(sequence) != len(ppm):
+            raise ValueError(
+                f"The given sequence has a different length ({len(sequence)}) than "
+                f"the position probability matrix ({len(ppm)})."
+            )
+        if not ppm.shape == self.symbols.shape:
+            raise ValueError(
+                f"Position probability matrix {ppm.shape} must be of same shape "
+                f"as 'symbols' {self.symbols.shape}"
+            )
+        return np.prod(ppm[np.arange(len(sequence)), sequence.code])
+    def sequence_score(self, sequence, background_frequencies=None, pseudocount=0):
+        """
+        Calculate score of a sequence based on the
+        position weight matrix (PWM).
+        The score is the sum of weights (log-odds scores) of
+        the respective symbol over all sequence positions.
+        Parameters
+        ----------
+        sequence : Sequence
+           The input sequence.
+        pseudocount: int, optional
+            Amount added to the number of observed cases in order to change
+            the expected probability of the PPM.
+            (Default: 0)
+        background_frequencies: ndarray, shape=(k,), dtype=float, optional
+            The background frequencies for each symbol in the alphabet.
+            By default a uniform distribution is assumed.
+        Returns
+        -------
+        score: float
+           The calculated score for the input sequence based on
+           the PWM.
+        """
+        if background_frequencies is None:
+            background_frequencies = 1 / len(self.alphabet)
+        pwm = self.log_odds_matrix(
+            background_frequencies=background_frequencies, pseudocount=pseudocount
+        )
+        if len(sequence) != len(pwm):
+            raise ValueError(
+                f"The given sequence has a different length ({len(sequence)}) than "
+                f"the position weight matrix ({len(pwm)})."
+            )
+        if not pwm.shape == self.symbols.shape:
+            raise ValueError(
+                f"Position weight matrix {pwm.shape} must be of same shape "
+                f"as 'symbols' {self.symbols.shape}"
+            )
+        return np.sum(pwm[np.arange(len(sequence)), sequence.code])
+    def __getitem__(self, index):
+        if isinstance(index, Integral):
+            # Do not allow to collapse dimensions
+            index = slice(index, index + 1)
+        return SequenceProfile(self.symbols[index], self.gaps[index], self.alphabet)
+    def __len__(self):
+        return len(self.symbols)

biotite/sequence/search.py ADDED Viewed

@@ -0,0 +1,118 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+# Overwrite the module name with the name of the 'biotite.sequence' package
+__name__ = "biotite.sequence"
+__author__ = "Patrick Kunzmann"
+# The functions of this module that are exported via a star import
+__all__ = ["find_subsequence", "find_symbol", "find_symbol_first", "find_symbol_last"]
+import numpy as np
+def find_subsequence(sequence, query):
+    """
+    Find a subsequence in a sequence.
+    Parameters
+    ----------
+    sequence : Sequence
+        The sequence to find the subsequence in.
+    query : Sequence
+        The potential subsequence. Its alphabet must extend the
+        `sequence` alphabet.
+    Returns
+    -------
+    match_indices : ndarray
+        The starting indices in `sequence`, where `query` has been
+        found. The array is empty if no match has been found.
+    Raises
+    ------
+    ValueError
+        If the `query` alphabet does not extend the `sequence` alphabet.
+    Examples
+    --------
+    >>> main_seq = NucleotideSequence("ACTGAATGA")
+    >>> sub_seq = NucleotideSequence("TGA")
+    >>> print(find_subsequence(main_seq, sub_seq))
+    [2 6]
+    """
+    if not sequence.get_alphabet().extends(query.get_alphabet()):
+        raise ValueError("The sequences alphabets are not equal")
+    match_indices = []
+    frame_size = len(query)
+    for i in range(len(sequence) - frame_size + 1):
+        sub_seq_code = sequence.code[i : i + frame_size]
+        if np.array_equal(query.code, sub_seq_code):
+            match_indices.append(i)
+    return np.array(match_indices)
+def find_symbol(sequence, symbol):
+    """
+    Find a symbol in a sequence.
+    Parameters
+    ----------
+    sequence : Sequence
+        The sequence to find the symbol in.
+    symbol : object
+        The symbol to be found in `sequence`.
+    Returns
+    -------
+    match_indices : ndarray
+        The indices in `sequence`, where `symbol` has been found.
+    """
+    code = sequence.get_alphabet().encode(symbol)
+    return np.where(sequence.code == code)[0]
+def find_symbol_first(sequence, symbol):
+    """
+    Find first occurence of a symbol in a sequence.
+    Parameters
+    ----------
+    sequence : Sequence
+        The sequence to find the symbol in.
+    symbol : object
+        The symbol to be found in `sequence`.
+    Returns
+    -------
+    first_index : int
+        The first index of `symbol` in `sequence`. If `symbol` is not in
+        `sequence`, -1 is returned.
+    """
+    match_i = find_symbol(sequence, symbol)
+    if len(match_i) == 0:
+        return -1
+    return np.min(match_i)
+def find_symbol_last(sequence, symbol):
+    """
+    Find last occurence of a symbol in a sequence.
+    Parameters
+    ----------
+    sequence : Sequence
+        The sequence to find the symbol in.
+    symbol : object
+        The symbol to be found in `sequence`.
+    Returns
+    -------
+    flast_index : int
+        The last index of `symbol` in `sequence`. If `symbol` is not in
+        `sequence`, -1 is returned.
+    """
+    match_i = find_symbol(sequence, symbol)
+    if len(match_i) == 0:
+        return -1
+    return np.max(match_i)