PyPI - biotite - Versions diffs - 0.41.1__cp310-cp310-macosx_10_16_x86_64.whl - Mend

biotite 0.41.1__cp310-cp310-macosx_10_16_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biotite might be problematic. Click here for more details.

Files changed (340) hide show

biotite/__init__.py +19 -0
biotite/application/__init__.py +43 -0
biotite/application/application.py +265 -0
biotite/application/autodock/__init__.py +12 -0
biotite/application/autodock/app.py +505 -0
biotite/application/blast/__init__.py +14 -0
biotite/application/blast/alignment.py +83 -0
biotite/application/blast/webapp.py +421 -0
biotite/application/clustalo/__init__.py +12 -0
biotite/application/clustalo/app.py +238 -0
biotite/application/dssp/__init__.py +12 -0
biotite/application/dssp/app.py +152 -0
biotite/application/localapp.py +306 -0
biotite/application/mafft/__init__.py +12 -0
biotite/application/mafft/app.py +122 -0
biotite/application/msaapp.py +374 -0
biotite/application/muscle/__init__.py +13 -0
biotite/application/muscle/app3.py +254 -0
biotite/application/muscle/app5.py +171 -0
biotite/application/sra/__init__.py +18 -0
biotite/application/sra/app.py +456 -0
biotite/application/tantan/__init__.py +12 -0
biotite/application/tantan/app.py +222 -0
biotite/application/util.py +59 -0
biotite/application/viennarna/__init__.py +18 -0
biotite/application/viennarna/rnaalifold.py +304 -0
biotite/application/viennarna/rnafold.py +269 -0
biotite/application/viennarna/rnaplot.py +187 -0
biotite/application/viennarna/util.py +72 -0
biotite/application/webapp.py +77 -0
biotite/copyable.py +71 -0
biotite/database/__init__.py +23 -0
biotite/database/entrez/__init__.py +15 -0
biotite/database/entrez/check.py +61 -0
biotite/database/entrez/dbnames.py +89 -0
biotite/database/entrez/download.py +223 -0
biotite/database/entrez/key.py +44 -0
biotite/database/entrez/query.py +223 -0
biotite/database/error.py +15 -0
biotite/database/pubchem/__init__.py +21 -0
biotite/database/pubchem/download.py +260 -0
biotite/database/pubchem/error.py +20 -0
biotite/database/pubchem/query.py +827 -0
biotite/database/pubchem/throttle.py +99 -0
biotite/database/rcsb/__init__.py +13 -0
biotite/database/rcsb/download.py +167 -0
biotite/database/rcsb/query.py +959 -0
biotite/database/uniprot/__init__.py +13 -0
biotite/database/uniprot/check.py +32 -0
biotite/database/uniprot/download.py +134 -0
biotite/database/uniprot/query.py +209 -0
biotite/file.py +251 -0
biotite/sequence/__init__.py +73 -0
biotite/sequence/align/__init__.py +49 -0
biotite/sequence/align/alignment.py +658 -0
biotite/sequence/align/banded.cpython-310-darwin.so +0 -0
biotite/sequence/align/banded.pyx +652 -0
biotite/sequence/align/buckets.py +69 -0
biotite/sequence/align/cigar.py +434 -0
biotite/sequence/align/kmeralphabet.cpython-310-darwin.so +0 -0
biotite/sequence/align/kmeralphabet.pyx +574 -0
biotite/sequence/align/kmersimilarity.cpython-310-darwin.so +0 -0
biotite/sequence/align/kmersimilarity.pyx +233 -0
biotite/sequence/align/kmertable.cpython-310-darwin.so +0 -0
biotite/sequence/align/kmertable.pyx +3400 -0
biotite/sequence/align/localgapped.cpython-310-darwin.so +0 -0
biotite/sequence/align/localgapped.pyx +892 -0
biotite/sequence/align/localungapped.cpython-310-darwin.so +0 -0
biotite/sequence/align/localungapped.pyx +279 -0
biotite/sequence/align/matrix.py +405 -0
biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
biotite/sequence/align/matrix_data/GONNET.mat +26 -0
biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
biotite/sequence/align/matrix_data/MATCH.mat +25 -0
biotite/sequence/align/matrix_data/NUC.mat +25 -0
biotite/sequence/align/matrix_data/PAM10.mat +34 -0
biotite/sequence/align/matrix_data/PAM100.mat +34 -0
biotite/sequence/align/matrix_data/PAM110.mat +34 -0
biotite/sequence/align/matrix_data/PAM120.mat +34 -0
biotite/sequence/align/matrix_data/PAM130.mat +34 -0
biotite/sequence/align/matrix_data/PAM140.mat +34 -0
biotite/sequence/align/matrix_data/PAM150.mat +34 -0
biotite/sequence/align/matrix_data/PAM160.mat +34 -0
biotite/sequence/align/matrix_data/PAM170.mat +34 -0
biotite/sequence/align/matrix_data/PAM180.mat +34 -0
biotite/sequence/align/matrix_data/PAM190.mat +34 -0
biotite/sequence/align/matrix_data/PAM20.mat +34 -0
biotite/sequence/align/matrix_data/PAM200.mat +34 -0
biotite/sequence/align/matrix_data/PAM210.mat +34 -0
biotite/sequence/align/matrix_data/PAM220.mat +34 -0
biotite/sequence/align/matrix_data/PAM230.mat +34 -0
biotite/sequence/align/matrix_data/PAM240.mat +34 -0
biotite/sequence/align/matrix_data/PAM250.mat +34 -0
biotite/sequence/align/matrix_data/PAM260.mat +34 -0
biotite/sequence/align/matrix_data/PAM270.mat +34 -0
biotite/sequence/align/matrix_data/PAM280.mat +34 -0
biotite/sequence/align/matrix_data/PAM290.mat +34 -0
biotite/sequence/align/matrix_data/PAM30.mat +34 -0
biotite/sequence/align/matrix_data/PAM300.mat +34 -0
biotite/sequence/align/matrix_data/PAM310.mat +34 -0
biotite/sequence/align/matrix_data/PAM320.mat +34 -0
biotite/sequence/align/matrix_data/PAM330.mat +34 -0
biotite/sequence/align/matrix_data/PAM340.mat +34 -0
biotite/sequence/align/matrix_data/PAM350.mat +34 -0
biotite/sequence/align/matrix_data/PAM360.mat +34 -0
biotite/sequence/align/matrix_data/PAM370.mat +34 -0
biotite/sequence/align/matrix_data/PAM380.mat +34 -0
biotite/sequence/align/matrix_data/PAM390.mat +34 -0
biotite/sequence/align/matrix_data/PAM40.mat +34 -0
biotite/sequence/align/matrix_data/PAM400.mat +34 -0
biotite/sequence/align/matrix_data/PAM410.mat +34 -0
biotite/sequence/align/matrix_data/PAM420.mat +34 -0
biotite/sequence/align/matrix_data/PAM430.mat +34 -0
biotite/sequence/align/matrix_data/PAM440.mat +34 -0
biotite/sequence/align/matrix_data/PAM450.mat +34 -0
biotite/sequence/align/matrix_data/PAM460.mat +34 -0
biotite/sequence/align/matrix_data/PAM470.mat +34 -0
biotite/sequence/align/matrix_data/PAM480.mat +34 -0
biotite/sequence/align/matrix_data/PAM490.mat +34 -0
biotite/sequence/align/matrix_data/PAM50.mat +34 -0
biotite/sequence/align/matrix_data/PAM500.mat +34 -0
biotite/sequence/align/matrix_data/PAM60.mat +34 -0
biotite/sequence/align/matrix_data/PAM70.mat +34 -0
biotite/sequence/align/matrix_data/PAM80.mat +34 -0
biotite/sequence/align/matrix_data/PAM90.mat +34 -0
biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
biotite/sequence/align/multiple.cpython-310-darwin.so +0 -0
biotite/sequence/align/multiple.pyx +620 -0
biotite/sequence/align/pairwise.cpython-310-darwin.so +0 -0
biotite/sequence/align/pairwise.pyx +587 -0
biotite/sequence/align/permutation.cpython-310-darwin.so +0 -0
biotite/sequence/align/permutation.pyx +305 -0
biotite/sequence/align/primes.txt +821 -0
biotite/sequence/align/selector.cpython-310-darwin.so +0 -0
biotite/sequence/align/selector.pyx +956 -0
biotite/sequence/align/statistics.py +265 -0
biotite/sequence/align/tracetable.cpython-310-darwin.so +0 -0
biotite/sequence/align/tracetable.pxd +64 -0
biotite/sequence/align/tracetable.pyx +370 -0
biotite/sequence/alphabet.py +566 -0
biotite/sequence/annotation.py +829 -0
biotite/sequence/codec.cpython-310-darwin.so +0 -0
biotite/sequence/codec.pyx +155 -0
biotite/sequence/codon.py +466 -0
biotite/sequence/codon_tables.txt +202 -0
biotite/sequence/graphics/__init__.py +33 -0
biotite/sequence/graphics/alignment.py +1034 -0
biotite/sequence/graphics/color_schemes/autumn.json +51 -0
biotite/sequence/graphics/color_schemes/blossom.json +51 -0
biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
biotite/sequence/graphics/color_schemes/flower.json +51 -0
biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
biotite/sequence/graphics/color_schemes/ocean.json +51 -0
biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
biotite/sequence/graphics/color_schemes/spring.json +51 -0
biotite/sequence/graphics/color_schemes/sunset.json +51 -0
biotite/sequence/graphics/color_schemes/wither.json +51 -0
biotite/sequence/graphics/colorschemes.py +139 -0
biotite/sequence/graphics/dendrogram.py +184 -0
biotite/sequence/graphics/features.py +510 -0
biotite/sequence/graphics/logo.py +110 -0
biotite/sequence/graphics/plasmid.py +661 -0
biotite/sequence/io/__init__.py +12 -0
biotite/sequence/io/fasta/__init__.py +22 -0
biotite/sequence/io/fasta/convert.py +273 -0
biotite/sequence/io/fasta/file.py +278 -0
biotite/sequence/io/fastq/__init__.py +19 -0
biotite/sequence/io/fastq/convert.py +120 -0
biotite/sequence/io/fastq/file.py +551 -0
biotite/sequence/io/genbank/__init__.py +17 -0
biotite/sequence/io/genbank/annotation.py +277 -0
biotite/sequence/io/genbank/file.py +575 -0
biotite/sequence/io/genbank/metadata.py +324 -0
biotite/sequence/io/genbank/sequence.py +172 -0
biotite/sequence/io/general.py +192 -0
biotite/sequence/io/gff/__init__.py +26 -0
biotite/sequence/io/gff/convert.py +133 -0
biotite/sequence/io/gff/file.py +434 -0
biotite/sequence/phylo/__init__.py +36 -0
biotite/sequence/phylo/nj.cpython-310-darwin.so +0 -0
biotite/sequence/phylo/nj.pyx +221 -0
biotite/sequence/phylo/tree.cpython-310-darwin.so +0 -0
biotite/sequence/phylo/tree.pyx +1169 -0
biotite/sequence/phylo/upgma.cpython-310-darwin.so +0 -0
biotite/sequence/phylo/upgma.pyx +164 -0
biotite/sequence/profile.py +456 -0
biotite/sequence/search.py +116 -0
biotite/sequence/seqtypes.py +556 -0
biotite/sequence/sequence.py +374 -0
biotite/structure/__init__.py +132 -0
biotite/structure/atoms.py +1455 -0
biotite/structure/basepairs.py +1415 -0
biotite/structure/bonds.cpython-310-darwin.so +0 -0
biotite/structure/bonds.pyx +1933 -0
biotite/structure/box.py +592 -0
biotite/structure/celllist.cpython-310-darwin.so +0 -0
biotite/structure/celllist.pyx +849 -0
biotite/structure/chains.py +298 -0
biotite/structure/charges.cpython-310-darwin.so +0 -0
biotite/structure/charges.pyx +520 -0
biotite/structure/compare.py +274 -0
biotite/structure/density.py +114 -0
biotite/structure/dotbracket.py +216 -0
biotite/structure/error.py +31 -0
biotite/structure/filter.py +585 -0
biotite/structure/geometry.py +697 -0
biotite/structure/graphics/__init__.py +13 -0
biotite/structure/graphics/atoms.py +226 -0
biotite/structure/graphics/rna.py +282 -0
biotite/structure/hbond.py +409 -0
biotite/structure/info/__init__.py +25 -0
biotite/structure/info/atom_masses.json +121 -0
biotite/structure/info/atoms.py +82 -0
biotite/structure/info/bonds.py +145 -0
biotite/structure/info/ccd/README.rst +8 -0
biotite/structure/info/ccd/amino_acids.txt +1663 -0
biotite/structure/info/ccd/carbohydrates.txt +1135 -0
biotite/structure/info/ccd/components.bcif +0 -0
biotite/structure/info/ccd/nucleotides.txt +798 -0
biotite/structure/info/ccd.py +95 -0
biotite/structure/info/groups.py +90 -0
biotite/structure/info/masses.py +123 -0
biotite/structure/info/misc.py +144 -0
biotite/structure/info/radii.py +197 -0
biotite/structure/info/standardize.py +196 -0
biotite/structure/integrity.py +268 -0
biotite/structure/io/__init__.py +30 -0
biotite/structure/io/ctab.py +72 -0
biotite/structure/io/dcd/__init__.py +13 -0
biotite/structure/io/dcd/file.py +65 -0
biotite/structure/io/general.py +257 -0
biotite/structure/io/gro/__init__.py +14 -0
biotite/structure/io/gro/file.py +343 -0
biotite/structure/io/mmtf/__init__.py +21 -0
biotite/structure/io/mmtf/assembly.py +214 -0
biotite/structure/io/mmtf/convertarray.cpython-310-darwin.so +0 -0
biotite/structure/io/mmtf/convertarray.pyx +341 -0
biotite/structure/io/mmtf/convertfile.cpython-310-darwin.so +0 -0
biotite/structure/io/mmtf/convertfile.pyx +501 -0
biotite/structure/io/mmtf/decode.cpython-310-darwin.so +0 -0
biotite/structure/io/mmtf/decode.pyx +152 -0
biotite/structure/io/mmtf/encode.cpython-310-darwin.so +0 -0
biotite/structure/io/mmtf/encode.pyx +183 -0
biotite/structure/io/mmtf/file.py +233 -0
biotite/structure/io/mol/__init__.py +20 -0
biotite/structure/io/mol/convert.py +115 -0
biotite/structure/io/mol/ctab.py +414 -0
biotite/structure/io/mol/header.py +116 -0
biotite/structure/io/mol/mol.py +193 -0
biotite/structure/io/mol/sdf.py +916 -0
biotite/structure/io/netcdf/__init__.py +13 -0
biotite/structure/io/netcdf/file.py +63 -0
biotite/structure/io/npz/__init__.py +20 -0
biotite/structure/io/npz/file.py +152 -0
biotite/structure/io/pdb/__init__.py +20 -0
biotite/structure/io/pdb/convert.py +293 -0
biotite/structure/io/pdb/file.py +1240 -0
biotite/structure/io/pdb/hybrid36.cpython-310-darwin.so +0 -0
biotite/structure/io/pdb/hybrid36.pyx +242 -0
biotite/structure/io/pdbqt/__init__.py +15 -0
biotite/structure/io/pdbqt/convert.py +107 -0
biotite/structure/io/pdbqt/file.py +640 -0
biotite/structure/io/pdbx/__init__.py +23 -0
biotite/structure/io/pdbx/bcif.py +648 -0
biotite/structure/io/pdbx/cif.py +1032 -0
biotite/structure/io/pdbx/component.py +246 -0
biotite/structure/io/pdbx/convert.py +1597 -0
biotite/structure/io/pdbx/encoding.cpython-310-darwin.so +0 -0
biotite/structure/io/pdbx/encoding.pyx +950 -0
biotite/structure/io/pdbx/legacy.py +267 -0
biotite/structure/io/tng/__init__.py +13 -0
biotite/structure/io/tng/file.py +46 -0
biotite/structure/io/trajfile.py +710 -0
biotite/structure/io/trr/__init__.py +13 -0
biotite/structure/io/trr/file.py +46 -0
biotite/structure/io/xtc/__init__.py +13 -0
biotite/structure/io/xtc/file.py +46 -0
biotite/structure/mechanics.py +75 -0
biotite/structure/molecules.py +353 -0
biotite/structure/pseudoknots.py +642 -0
biotite/structure/rdf.py +243 -0
biotite/structure/repair.py +253 -0
biotite/structure/residues.py +562 -0
biotite/structure/resutil.py +178 -0
biotite/structure/sasa.cpython-310-darwin.so +0 -0
biotite/structure/sasa.pyx +322 -0
biotite/structure/sequence.py +112 -0
biotite/structure/sse.py +327 -0
biotite/structure/superimpose.py +727 -0
biotite/structure/transform.py +504 -0
biotite/structure/util.py +98 -0
biotite/temp.py +86 -0
biotite/version.py +16 -0
biotite/visualize.py +251 -0
biotite-0.41.1.dist-info/METADATA +187 -0
biotite-0.41.1.dist-info/RECORD +340 -0
biotite-0.41.1.dist-info/WHEEL +4 -0
biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0

biotite/sequence/align/buckets.py ADDED Viewed

@@ -0,0 +1,69 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+__name__ = "biotite.sequence.align"
+__author__ = "Patrick Kunzmann"
+__all__ = ["bucket_number"]
+from os.path import realpath, dirname, join
+import numpy as np
# Lazily filled cache of the primes listed in 'primes.txt';
# populated on the first call of `bucket_number()`
_primes = None


def bucket_number(n_kmers, load_factor=0.8):
    """
    Find an appropriate number of buckets for a :class:`BucketKmerTable`
    based on the number of elements (i.e. *k-mers*) that should be
    stored in the table.

    Parameters
    ----------
    n_kmers : int
        The expected number of *k-mers* that will be stored in the
        :class:`BucketKmerTable`.
        If this number deviates from the actual number of *k-mers* that
        will be stored, the load factor of the table will deviate
        by the same percentage.
    load_factor : float, optional
        The ratio of *k-mer* number to bucket number.
        Must be positive.
        The actual load factor will not exceed this value, as the
        closest greater prime is returned (see *Notes*).

    Returns
    -------
    n_buckets : int
        The recommended number of buckets to use for a
        :class:`BucketKmerTable`, that stores `n_kmers` at the given
        `load_factor`.

    Raises
    ------
    ValueError
        If `load_factor` is not positive or if the required number of
        buckets exceeds the largest precomputed prime.

    Notes
    -----
    The function returns the closest greater prime number from a
    precomputed list of primes to use as the number of buckets.
    The reason is that prime numbers have proven to be good hash table
    sizes, if the hash function is not randomized.

    Let's take unambiguous nucleotide *k-mers* as example.
    If powers of two would be used as table size (another common scheme),
    taking the modulo operation on the *k-mer* code would simply erase
    the upper bits corresponding to the first nucleotide(s) in a
    *k-mer*.
    Hence, all *k-mers* with the same suffix would be stored in the same
    bin.
    """
    global _primes

    # A zero or negative load factor has no meaning and would otherwise
    # end in a division error or silently select the smallest prime
    if load_factor <= 0:
        raise ValueError("Load factor must be positive")

    if _primes is None:
        # The list of primes is shipped next to this module
        primes_path = join(dirname(realpath(__file__)), "primes.txt")
        with open(primes_path, encoding="utf-8") as file:
            _primes = np.array([
                int(line) for line in file.read().splitlines()
                # Skip blank lines and '#' comment lines
                if len(line) != 0 and line[0] != "#"
            ])

    number = int(n_kmers / load_factor)
    # First prime that is greater than or equal to the required number
    index = np.searchsorted(_primes, number, side="left")
    if index == len(_primes):
        raise ValueError("Number of buckets too large")
    # Return a builtin int, as documented, instead of a NumPy scalar
    return int(_primes[index])

biotite/sequence/align/cigar.py ADDED Viewed

@@ -0,0 +1,434 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+__name__ = "biotite.sequence.align"
+__author__ = "Patrick Kunzmann"
+__all__ = ["CigarOp", "read_alignment_from_cigar", "write_alignment_to_cigar"]
+import enum
+import numpy as np
+from .alignment import Alignment, get_codes
class CigarOp(enum.IntEnum):
    """
    Enumeration of the operations that can appear in a CIGAR string.

    Each member can be converted from and to its one-character CIGAR
    symbol via :meth:`from_cigar_symbol()` and
    :meth:`to_cigar_symbol()`.
    """
    MATCH = 0
    INSERTION = 1
    DELETION = 2
    INTRON = 3
    SOFT_CLIP = 4
    HARD_CLIP = 5
    PADDING = 6
    EQUAL = 7
    DIFFERENT = 8
    BACK = 9

    @staticmethod
    def from_cigar_symbol(symbol):
        """
        Look up the operation that belongs to a CIGAR symbol.

        Parameters
        ----------
        symbol : str
            The one-character CIGAR symbol.

        Returns
        -------
        op : CigarOp
            The corresponding enum member.
        """
        return _str_to_op[symbol]

    def to_cigar_symbol(self):
        """
        Get the CIGAR symbol of this operation.

        Returns
        -------
        symbol : str
            The one-character CIGAR symbol.
        """
        return _op_to_str[self]


# Lookup tables between enum members and their CIGAR symbols
_op_to_str = {
    CigarOp.MATCH: "M",
    CigarOp.INSERTION: "I",
    CigarOp.DELETION: "D",
    CigarOp.INTRON: "N",
    CigarOp.SOFT_CLIP: "S",
    CigarOp.HARD_CLIP: "H",
    CigarOp.PADDING: "P",
    CigarOp.EQUAL: "=",
    CigarOp.DIFFERENT: "X",
    CigarOp.BACK: "B",
}
_str_to_op = {symbol: op for op, symbol in _op_to_str.items()}
def read_alignment_from_cigar(cigar, position,
                              reference_sequence, segment_sequence):
    """
    Create an :class:`Alignment` from a CIGAR string.

    Parameters
    ----------
    cigar : str or array-like, shape=(n,2), dtype=int
        The CIGAR string.
        Alternatively, the operations can be given as
        ``(operation, length)`` pairs, where the operation is a
        :class:`CigarOp` or its integer value.
    position : int
        0-based position of the first aligned base in the reference.
        0-based equivalent to the ``POS`` field in the SAM/BAM file.
    reference_sequence : Sequence
        The reference sequence.
    segment_sequence : Sequence
        The segment, read or query sequence.

    Returns
    -------
    alignment : Alignment
        The alignment.

    Raises
    ------
    ValueError
        If the given operation pairs do not have the shape *(n,2)* or
        if the CIGAR contains an operation that is not implemented.

    See Also
    --------
    write_alignment_to_cigar

    Notes
    -----
    This function expects that the `segment_sequence` was taken from the
    SAM/BAM file, hence hard-clipped bases are not part of the sequence.
    Therefore, hard clipped bases are simply ignored in the CIGAR
    string.

    Examples
    --------

    >>> ref = NucleotideSequence("TATAAAAGGTTTCCGACCGTAGGTAGCTGA")
    >>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
    >>> print(read_alignment_from_cigar("9M2D12M", 3, ref, seg))
    AAAAGGTTTCCGACCGTAGGTAG
    CCCCGGTTT--GACCGTATGTAG
    >>> print(read_alignment_from_cigar("4X5=2D7=1X4=", 3, ref, seg))
    AAAAGGTTTCCGACCGTAGGTAG
    CCCCGGTTT--GACCGTATGTAG

    Explicit terminal deletions are also possible.
    Note that in this case the deleted positions count as aligned bases
    with respect to the `position` parameter.

    >>> print(read_alignment_from_cigar("3D9M2D12M4D", 0, ref, seg))
    TATAAAAGGTTTCCGACCGTAGGTAGCTGA
    ---CCCCGGTTT--GACCGTATGTAG----

    If bases in the segment sequence are soft-clipped, they do not
    appear in the alignment.
    Furthermore, the start of the reference sequence must be adapted.

    >>> print(read_alignment_from_cigar("4S5M2D12M", 7, ref, seg))
    GGTTTCCGACCGTAGGTAG
    GGTTT--GACCGTATGTAG

    Hard-clipped bases are not part of the segment sequence.
    Hence `H` operations are completely ignored.

    >>> seg = NucleotideSequence("GGTTTGACCGTATGTAG")
    >>> print(read_alignment_from_cigar("4H5M2D12M", 7, ref, seg))
    GGTTTCCGACCGTAGGTAG
    GGTTT--GACCGTATGTAG

    Reading from BAM codes is also possible.

    >>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
    >>> op_tuples = [
    ...     (CigarOp.MATCH, 9),
    ...     (CigarOp.DELETION, 2),
    ...     (CigarOp.MATCH, 12)
    ... ]
    >>> print(read_alignment_from_cigar(op_tuples, 3, ref, seg))
    AAAAGGTTTCCGACCGTAGGTAG
    CCCCGGTTT--GACCGTATGTAG
    """
    if isinstance(cigar, str):
        operations = _op_tuples_from_cigar(cigar)
    else:
        operations = np.asarray(cigar, dtype=int)
        if operations.shape == (0,):
            # An empty list of operations carries no second dimension;
            # treat it like an empty CIGAR string instead of rejecting it
            operations = operations.reshape((0, 2))
        if operations.ndim != 2:
            raise ValueError(
                "Expected array with shape (n,2)"
            )
        if operations.shape[1] != 2:
            raise ValueError(
                "Expected (operation, length) pairs"
            )

    if len(operations) == 0:
        return Alignment(
            [reference_sequence, segment_sequence], np.zeros((0, 2), dtype=int)
        )

    # Reserve one trace row per operation repetition;
    # rows belonging to clipped bases are filtered out at the end
    trace = np.zeros((np.sum(operations[:,1]), 2), dtype=int)
    clip_mask = np.ones(trace.shape[0], dtype=bool)

    # Current row in the trace
    i = 0
    # Current positions in the reference and segment sequence
    ref_pos = position
    seg_pos = 0
    for op, length in operations:
        op = CigarOp(op)
        if op in (CigarOp.MATCH, CigarOp.EQUAL, CigarOp.DIFFERENT):
            # Both sequences advance
            trace[i : i + length, 0] = np.arange(ref_pos, ref_pos + length)
            trace[i : i + length, 1] = np.arange(seg_pos, seg_pos + length)
            ref_pos += length
            seg_pos += length
        elif op == CigarOp.INSERTION:
            # Gap in the reference, only the segment advances
            trace[i : i + length, 0] = -1
            trace[i : i + length, 1] = np.arange(seg_pos, seg_pos + length)
            seg_pos += length
        elif op in (CigarOp.DELETION, CigarOp.INTRON):
            # Gap in the segment, only the reference advances
            trace[i : i + length, 0] = np.arange(ref_pos, ref_pos + length)
            trace[i : i + length, 1] = -1
            ref_pos += length
        elif op == CigarOp.SOFT_CLIP:
            # Soft-clipped bases are part of the segment sequence,
            # but they are not part of the alignment
            clip_mask[i : i + length] = False
            seg_pos += length
        elif op == CigarOp.HARD_CLIP:
            # Hard-clipped bases are not even part of the segment
            # sequence, so the segment position does not advance
            clip_mask[i : i + length] = False
        else:
            raise ValueError(
                f"CIGAR operation {op} is not implemented"
            )
        i += length

    # Remove clipped positions
    trace = trace[clip_mask]
    return Alignment([reference_sequence, segment_sequence], trace)
def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
                             introns=(), distinguish_matches=False,
                             hard_clip=False, include_terminal_gaps=False,
                             as_string=True):
    """
    Convert an :class:`Alignment` into a CIGAR string.

    Parameters
    ----------
    alignment : Alignment
        The alignment to be converted.
    reference_index : int, optional
        The index of the reference sequence in the alignment.
        By default the first sequence is used.
    segment_index : int, optional
        The index of the segment, read or query sequence in the
        alignment.
        By default the second sequence is used.
    introns : iterable object of tuple(int, int), optional
        The introns in the reference sequence.
        The introns are given as tuples of start and exclusive stop
        index.
        Deletions located within these reference regions are written
        as `'N'` instead of `'D'` in the CIGAR string.
        By default no introns are assumed.
    distinguish_matches : bool, optional
        If true, matches (`'='`) are distinguished from mismatches
        (`'X'`).
        Otherwise, matches and mismatches are reflected equally by an
        `'M'` in the CIGAR string.
    hard_clip : bool, optional
        If true, clipped bases are hard-clipped.
        Otherwise, clipped bases are soft-clipped.
    include_terminal_gaps : bool, optional
        If true, terminal gaps in the segment sequence are included in
        the CIGAR string.
        These are represented by ``D`` operations at the start and/or
        end of the string.
        By default, those terminal gaps are omitted in the CIGAR, which
        is the way SAM/BAM expects a CIGAR to be.
    as_string : bool, optional
        If true, the CIGAR string is returned.
        Otherwise, the operations are returned as array of
        ``(operation, length)`` pairs, where the first element
        of each pair specifies the :class:`CigarOp` and the second
        element specifies the number of repetitions.

    Returns
    -------
    cigar : str or ndarray, shape=(n,2) dtype=int
        If `as_string` is true, the CIGAR string is returned.
        Otherwise, an array is returned, where the first column
        specifies the :class:`CigarOp` and the second column specifies
        the number of repetitions of that operation.

    See Also
    --------
    read_alignment_from_cigar

    Notes
    -----
    If `include_terminal_gaps` is set to true, you usually want to set
    ``position=0`` in :func:`read_alignment_from_cigar` to get the
    correct alignment.

    Examples
    --------

    >>> ref = NucleotideSequence("TATAAAAGGTTTCCGACCGTAGGTAGCTGA")
    >>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
    >>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
    >>> semiglobal_alignment = align_optimal(
    ...     ref, seg, matrix, local=False, terminal_penalty=False
    ... )[0]
    >>> print(semiglobal_alignment)
    TATAAAAGGTTTCCGACCGTAGGTAGCTGA
    ---CCCCGGTTT--GACCGTATGTAG----
    >>> print(write_alignment_to_cigar(semiglobal_alignment))
    9M2D12M
    >>> print(write_alignment_to_cigar(semiglobal_alignment, introns=[(12, 14)]))
    9M2N12M
    >>> print(write_alignment_to_cigar(semiglobal_alignment, distinguish_matches=True))
    4X5=2D7=1X4=
    >>> print(write_alignment_to_cigar(semiglobal_alignment, include_terminal_gaps=True))
    3D9M2D12M4D
    >>> local_alignment = align_optimal(ref, seg, matrix, local=True)[0]
    >>> print(local_alignment)
    GGTTTCCGACCGTAGGTAG
    GGTTT--GACCGTATGTAG
    >>> print(write_alignment_to_cigar(local_alignment, hard_clip=False))
    4S5M2D12M
    >>> print(write_alignment_to_cigar(local_alignment, hard_clip=True))
    4H5M2D12M

    Writing operations as BAM codes is also possible:

    >>> op_tuples = write_alignment_to_cigar(semiglobal_alignment, as_string=False)
    >>> for op, length in op_tuples:
    ...     print(CigarOp(op), length)
    CigarOp.MATCH 9
    CigarOp.DELETION 2
    CigarOp.MATCH 12
    """
    if not include_terminal_gaps:
        alignment = _remove_terminal_segment_gaps(alignment, segment_index)

    trace = alignment.trace
    ref_column = trace[:, reference_index]
    seg_column = trace[:, segment_index]

    # A gap in the reference is an insertion,
    # a gap in the segment is a deletion
    is_insertion = ref_column == -1
    is_deletion = seg_column == -1
    if (is_insertion & is_deletion).any():
        raise ValueError(
            "Alignment contains insertion and deletion at the same position"
        )

    # Every column is a match, unless one of the sequences has a gap
    ops = np.full(trace.shape[0], CigarOp.MATCH, dtype=int)
    ops[is_insertion] = CigarOp.INSERTION
    ops[is_deletion] = CigarOp.DELETION

    if introns is not None:
        in_intron = np.zeros(ops.shape[0], dtype=bool)
        for start, stop in introns:
            if start >= stop:
                raise ValueError(
                    "Intron start must be smaller than intron stop"
                )
            if start < 0:
                raise ValueError(
                    "Intron start must not be negative"
                )
            in_intron |= (ref_column >= start) & (ref_column < stop)
        # Intron columns are only valid where the segment has a gap
        if (in_intron & ~is_deletion).any():
            raise ValueError(
                "Introns must be within gaps in the reference sequence"
            )
        ops[in_intron] = CigarOp.INTRON

    if distinguish_matches:
        codes = get_codes(alignment)
        is_equal = codes[reference_index, :] == codes[segment_index, :]
        # Evaluate before any 'MATCH' is overwritten below
        is_aligned = ops == CigarOp.MATCH
        ops[is_aligned & is_equal] = CigarOp.EQUAL
        ops[is_aligned & ~is_equal] = CigarOp.DIFFERENT

    core_tuples = _aggregate_consecutive(ops)

    # Segment bases outside of the alignment are reported as clipped
    if hard_clip:
        clip_op = CigarOp.HARD_CLIP
    else:
        clip_op = CigarOp.SOFT_CLIP
    n_clipped_start, n_clipped_end = _find_clipped_bases(
        alignment, segment_index
    )
    no_clip = np.zeros((0, 2), dtype=int)
    leading = [(clip_op, n_clipped_start)] if n_clipped_start != 0 else no_clip
    trailing = [(clip_op, n_clipped_end)] if n_clipped_end != 0 else no_clip
    op_tuples = np.concatenate((leading, core_tuples, trailing))

    if not as_string:
        return op_tuples
    return _cigar_from_op_tuples(op_tuples)
+def _remove_terminal_segment_gaps(alignment, segment_index):
+    """
+    Remove terminal gaps in the segment sequence.
+    """
+    no_gap_pos = np.where(alignment.trace[:, segment_index] != -1)[0]
+    return alignment[no_gap_pos[0] : no_gap_pos[-1] + 1]
def _find_clipped_bases(alignment, segment_index):
    """
    Count the segment bases that lie before the first and after the
    last aligned base of the segment.

    Returns the tuple ``(start_clip_length, end_clip_length)``.
    """
    # With the terminal segment gaps stripped (if not already done),
    # the first and last trace entries of the segment are guaranteed to
    # point to actual bases
    trimmed = _remove_terminal_segment_gaps(alignment, segment_index)
    aligned_positions = trimmed.trace[:, segment_index]
    segment_length = len(trimmed.sequences[segment_index])

    # Every base in front of the first aligned base counts as clipped...
    first_aligned = aligned_positions[0]
    # ...and so does every base behind the last aligned base
    last_aligned = aligned_positions[-1]
    return first_aligned, segment_length - last_aligned - 1
+def _aggregate_consecutive(operations):
+    """
+    Aggregate consecutive operations of the same type.
+    """
+    op_start_indices = np.where(operations[:-1] != operations[1:])[0]
+    # Also include the first operation
+    op_start_indices += 1
+    op_start_indices = np.concatenate(([0], op_start_indices))
+    ops = operations[op_start_indices]
+    length = np.diff(np.append(op_start_indices, len(operations)))
+    return np.stack((ops, length), axis=-1)
def _cigar_from_op_tuples(op_tuples):
    """
    Assemble a CIGAR string from ``(operation, count)`` pairs.

    Each pair contributes the repetition count followed by the CIGAR
    symbol of the operation.
    """
    return "".join(
        str(repetitions) + CigarOp(operation).to_cigar_symbol()
        for operation, repetitions in op_tuples
    )
def _op_tuples_from_cigar(cigar):
    """
    Create an array of ``(operation, length)`` pairs from a CIGAR
    string.

    Parameters
    ----------
    cigar : str
        The CIGAR string.

    Returns
    -------
    op_tuples : ndarray, shape=(n,2), dtype=int
        The first column contains the :class:`CigarOp` values and the
        second column the corresponding lengths.
        For an empty CIGAR string an array with shape *(0,2)* is
        returned.

    Raises
    ------
    ValueError
        If an operation symbol is not preceded by a length or if the
        string ends with a length that is not followed by an operation.
    """
    op_tuples = []
    # Accumulates the digits of the length preceding an operation symbol
    count = ""
    for char in cigar:
        if char.isdigit():
            count += char
        else:
            op = CigarOp.from_cigar_symbol(char)
            if not count:
                raise ValueError(
                    f"CIGAR operation '{char}' is not preceded by a length"
                )
            op_tuples.append((op, int(count)))
            count = ""
    if count:
        # Do not silently drop a trailing length without an operation
        raise ValueError(
            f"CIGAR string ends with length '{count}' without an operation"
        )
    if not op_tuples:
        # Keep the two-column shape for an empty CIGAR string
        return np.zeros((0, 2), dtype=int)
    return np.array(op_tuples, dtype=int)

biotite/sequence/align/kmeralphabet.cpython-310-darwin.so ADDED Viewed

Binary file