biotite 1.5.0__cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +18 -0
- biotite/application/__init__.py +69 -0
- biotite/application/application.py +276 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +500 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +92 -0
- biotite/application/blast/webapp.py +428 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +223 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +216 -0
- biotite/application/localapp.py +342 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +116 -0
- biotite/application/msaapp.py +363 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +227 -0
- biotite/application/muscle/app5.py +163 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +447 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +199 -0
- biotite/application/util.py +77 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +310 -0
- biotite/application/viennarna/rnafold.py +254 -0
- biotite/application/viennarna/rnaplot.py +208 -0
- biotite/application/viennarna/util.py +77 -0
- biotite/application/webapp.py +76 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +197 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +60 -0
- biotite/database/entrez/dbnames.py +101 -0
- biotite/database/entrez/download.py +228 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +263 -0
- biotite/database/error.py +16 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +258 -0
- biotite/database/pubchem/error.py +30 -0
- biotite/database/pubchem/query.py +819 -0
- biotite/database/pubchem/throttle.py +98 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +161 -0
- biotite/database/rcsb/query.py +963 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +40 -0
- biotite/database/uniprot/download.py +126 -0
- biotite/database/uniprot/query.py +292 -0
- biotite/file.py +244 -0
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1228 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/__init__.py +84 -0
- biotite/sequence/align/__init__.py +199 -0
- biotite/sequence/align/alignment.py +702 -0
- biotite/sequence/align/banded.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +71 -0
- biotite/sequence/align/cigar.py +425 -0
- biotite/sequence/align/kmeralphabet.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +595 -0
- biotite/sequence/align/kmersimilarity.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmertable.pyx +3411 -0
- biotite/sequence/align/localgapped.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +631 -0
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/multiple.pyx +619 -0
- biotite/sequence/align/pairwise.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/pairwise.pyx +585 -0
- biotite/sequence/align/permutation.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/permutation.pyx +313 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/selector.pyx +954 -0
- biotite/sequence/align/statistics.py +264 -0
- biotite/sequence/align/tracetable.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +555 -0
- biotite/sequence/annotation.py +836 -0
- biotite/sequence/codec.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +476 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1101 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +170 -0
- biotite/sequence/graphics/dendrogram.py +231 -0
- biotite/sequence/graphics/features.py +544 -0
- biotite/sequence/graphics/logo.py +102 -0
- biotite/sequence/graphics/plasmid.py +712 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +283 -0
- biotite/sequence/io/fasta/file.py +265 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +117 -0
- biotite/sequence/io/fastq/file.py +507 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +269 -0
- biotite/sequence/io/genbank/file.py +573 -0
- biotite/sequence/io/genbank/metadata.py +336 -0
- biotite/sequence/io/genbank/sequence.py +173 -0
- biotite/sequence/io/general.py +201 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +128 -0
- biotite/sequence/io/gff/file.py +449 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +561 -0
- biotite/sequence/search.py +117 -0
- biotite/sequence/seqtypes.py +720 -0
- biotite/sequence/sequence.py +373 -0
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +135 -0
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +1562 -0
- biotite/structure/basepairs.py +1403 -0
- biotite/structure/bonds.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/structure/bonds.pyx +2036 -0
- biotite/structure/box.py +724 -0
- biotite/structure/celllist.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/structure/celllist.pyx +864 -0
- biotite/structure/chains.py +310 -0
- biotite/structure/charges.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +683 -0
- biotite/structure/density.py +109 -0
- biotite/structure/dotbracket.py +213 -0
- biotite/structure/error.py +39 -0
- biotite/structure/filter.py +591 -0
- biotite/structure/geometry.py +817 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +243 -0
- biotite/structure/graphics/rna.py +298 -0
- biotite/structure/hbond.py +425 -0
- biotite/structure/info/__init__.py +24 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +98 -0
- biotite/structure/info/bonds.py +149 -0
- biotite/structure/info/ccd.py +200 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +128 -0
- biotite/structure/info/masses.py +121 -0
- biotite/structure/info/misc.py +137 -0
- biotite/structure/info/radii.py +267 -0
- biotite/structure/info/standardize.py +185 -0
- biotite/structure/integrity.py +213 -0
- biotite/structure/io/__init__.py +29 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +67 -0
- biotite/structure/io/general.py +243 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +112 -0
- biotite/structure/io/mol/ctab.py +420 -0
- biotite/structure/io/mol/header.py +120 -0
- biotite/structure/io/mol/mol.py +149 -0
- biotite/structure/io/mol/sdf.py +940 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +64 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +389 -0
- biotite/structure/io/pdb/file.py +1380 -0
- biotite/structure/io/pdb/hybrid36.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +113 -0
- biotite/structure/io/pdbqt/file.py +688 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +674 -0
- biotite/structure/io/pdbx/cif.py +1091 -0
- biotite/structure/io/pdbx/component.py +251 -0
- biotite/structure/io/pdbx/compress.py +362 -0
- biotite/structure/io/pdbx/convert.py +2113 -0
- biotite/structure/io/pdbx/encoding.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +1078 -0
- biotite/structure/io/trajfile.py +696 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +43 -0
- biotite/structure/io/util.py +38 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +43 -0
- biotite/structure/mechanics.py +72 -0
- biotite/structure/molecules.py +337 -0
- biotite/structure/pseudoknots.py +622 -0
- biotite/structure/rdf.py +245 -0
- biotite/structure/repair.py +302 -0
- biotite/structure/residues.py +716 -0
- biotite/structure/rings.py +451 -0
- biotite/structure/sasa.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/segments.py +328 -0
- biotite/structure/sequence.py +110 -0
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +306 -0
- biotite/structure/superimpose.py +511 -0
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +736 -0
- biotite/structure/util.py +160 -0
- biotite/version.py +34 -0
- biotite/visualize.py +375 -0
- biotite-1.5.0.dist-info/METADATA +162 -0
- biotite-1.5.0.dist-info/RECORD +354 -0
- biotite-1.5.0.dist-info/WHEEL +6 -0
- biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence.align"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
__all__ = ["EValueEstimator"]
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
from biotite.sequence.align.pairwise import align_optimal
|
|
11
|
+
from biotite.sequence.seqtypes import GeneralSequence
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class EValueEstimator:
    r"""
    This class is used to calculate *expect values* (E-values) for local
    pairwise sequence alignments.

    The E-value is a measure to quantify the significance of a found
    homology.
    It is the number of alignments, that would result from aligning
    random sequences of a given length, with a score at least as high as
    the score from an alignment of interest.

    The calculation of the E-value from score and sequence lengths
    depends on the two parameters :math:`\lambda` and :math:`K`
    :footcite:`Altschul1996`.
    These parameters are estimated from sampling a large number
    of random sequence alignments in :meth:`from_samples()`
    :footcite:`Altschul1986`, which may be time consuming.
    If these parameters are known, the constructor can be used instead.

    Based on the sampled parameters, the decadic logarithm of the
    E-value can be quickly calculated via :meth:`log_evalue()`.

    Parameters
    ----------
    lam : float
        The :math:`\lambda` parameter.
    k : float
        The :math:`K` parameter.

    Notes
    -----
    The calculated E-value is a rough estimation that gets more
    accurate the more sequences are used in the sampling process.
    Note that the accuracy is reduced for alignments of short
    sequences, where the average length of a sampled alignment makes up
    a significant part of the complete sampled sequence
    :footcite:`Altschul1996`.

    References
    ----------

    .. footbibliography::

    Examples
    --------

    Create an alignment, whose significance should be evaluated.

    >>> query = NucleotideSequence("CGACGGCGTCTACGAGTCAACATCATTC")
    >>> hit = NucleotideSequence("GCTTTATTACGGGTTTACGAGTTCAACATCACGAAAACAA")
    >>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
    >>> gap_penalty = (-12, -2)
    >>> alignment = align_optimal(query, hit, matrix, gap_penalty, local=True)[0]
    >>> print(alignment)
    ACGGCGTCTACGAGT-CAACATCA
    ACGG-GTTTACGAGTTCAACATCA
    >>> print(alignment.score)
    77

    Create an estimator based on the same scoring scheme as the
    alignment.
    Use background symbol frequencies from the hypothetical reference
    database.

    >>> # Ensure deterministic results
    >>> np.random.seed(0)
    >>> # Sequences in database have a GC content of 0.6
    >>> background = np.array([0.2, 0.3, 0.3, 0.2])
    >>> estimator = EValueEstimator.from_samples(
    ...     query.alphabet, matrix, gap_penalty, background, sample_length=100
    ... )

    Approach 1: Calculate E-value based on number of sequences in the
    hypothetical database (*100*).

    >>> log_e = estimator.log_evalue(alignment.score, len(query), 100 * len(hit))
    >>> print(f"E-value = {10**log_e:.2e}")
    E-value = 3.36e-01

    Approach 2: Calculate E-value based on total length of all sequences
    in the hypothetical database combined (*10000*).

    >>> log_e = estimator.log_evalue(alignment.score, len(query), 10000)
    >>> print(f"E-value = {10**log_e:.2e}")
    E-value = 8.41e-01
    """

    def __init__(self, lam, k):
        self._lam = lam
        self._k = k

    @staticmethod
    def from_samples(
        alphabet, matrix, gap_penalty, frequencies, sample_length=1000, sample_size=1000
    ):
        r"""
        Create an :class:`EValueEstimator` with :math:`\lambda` and
        :math:`K` estimated via sampling alignments of random sequences
        based on a given scoring scheme.

        The parameters are estimated from the sampled alignment scores
        using the method of moments :footcite:`Altschul1986`.

        Parameters
        ----------
        alphabet : Alphabet, length=k
            The alphabet for the sampled sequences.
        matrix : SubstitutionMatrix
            The substitution matrix.
            It must be compatible with the given `alphabet` and the
            expected similarity score between two random symbols must be
            negative.
        gap_penalty : int or tuple(int,int)
            Either a linear (``int``) or affine (``tuple``) gap penalty.
            Integers must be negative.
        frequencies : array-like, shape=k, dtype=float
            The background frequencies for each symbol in the
            `alphabet`.
            The random sequences are created based on these frequencies.
            The values are normalized internally, so they do not need to
            sum up to 1.
        sample_length : int
            The length of the sampled sequences.
            It should be much larger than the average length of a local
            alignment of two sequences.
            The runtime scales quadratically with this parameter.
        sample_size : int
            The number of sampled sequences.
            The accuracy of the estimated parameters and E-values,
            but also the runtime increases with the sample size.

        Returns
        -------
        estimator : EValueEstimator
            A :class:`EValueEstimator` with sampled :math:`\lambda` and
            :math:`K` parameters.

        Raises
        ------
        IndexError
            If the number of given `frequencies` does not match the
            number of symbols in the `alphabet`.
        ValueError
            If any frequency is negative, if all frequencies are zero,
            if the `matrix` is not symmetric or not compatible with the
            `alphabet`, or if the expected similarity score between two
            random symbols is not negative.

        Notes
        -----
        The sampling process generates random sequences based on
        ``numpy.random``.
        To ensure reproducible results you could call
        :func:`numpy.random.seed()` before running
        :meth:`from_samples()`.
        """
        # Accept any array-like (e.g. a plain list), as the elementwise
        # comparison and the normalization below require an ndarray
        frequencies = np.asarray(frequencies, dtype=float)
        if len(frequencies) != len(alphabet):
            raise IndexError(
                f"Background frequencies for {len(frequencies)} symbols were "
                f"given, but the alphabet has {len(alphabet)} symbols"
            )
        if np.any(frequencies < 0):
            raise ValueError("Background frequencies must be positive")
        # Normalize background frequencies
        # An all-zero input would turn every probability into NaN,
        # hence it is rejected explicitly before the division
        frequency_sum = np.sum(frequencies)
        if frequency_sum == 0:
            raise ValueError(
                "At least one background frequency must be greater than zero"
            )
        frequencies = frequencies / frequency_sum

        # Check matrix
        if not matrix.is_symmetric():
            raise ValueError("A symmetric substitution matrix is required")
        if not matrix.get_alphabet1().extends(alphabet):
            raise ValueError(
                "The substitution matrix is not compatible with the given alphabet"
            )
        score_matrix = matrix.score_matrix()[: len(alphabet), : len(alphabet)]
        # Expected score of aligning two symbols that are independently
        # drawn from the background distribution
        if (
            np.sum(
                score_matrix * frequencies[np.newaxis, :] * frequencies[:, np.newaxis]
            )
            >= 0
        ):
            raise ValueError(
                "Invalid substitution matrix, the expected similarity "
                "score between two random symbols is not negative"
            )

        # Generate the sequence code for the random sequences
        # Axis 0: sample, axis 1: the two sequences of a pair,
        # axis 2: position in the sequence
        random_sequence_code = np.random.choice(
            len(alphabet), size=(sample_size, 2, sample_length), p=frequencies
        )

        # Sample the alignments of random sequences
        sample_scores = np.zeros(sample_size, dtype=int)
        for i in range(sample_size):
            seq1 = GeneralSequence(alphabet)
            seq2 = GeneralSequence(alphabet)
            seq1.code = random_sequence_code[i, 0]
            seq2.code = random_sequence_code[i, 1]
            # Only the score is required -> a single alignment suffices
            sample_scores[i] = align_optimal(
                seq1, seq2, matrix, local=True, gap_penalty=gap_penalty, max_number=1
            )[0].score

        # Use method of moments to estimate parameters
        lam = np.pi / np.sqrt(6 * np.var(sample_scores))
        u = np.mean(sample_scores) - np.euler_gamma / lam
        k = np.exp(lam * u) / sample_length**2

        return EValueEstimator(lam, k)

    @property
    def lam(self):
        r"""
        The :math:`\lambda` parameter of this estimator.
        """
        return self._lam

    @property
    def k(self):
        r"""
        The :math:`K` parameter of this estimator.
        """
        return self._k

    def log_evalue(self, score, seq1_length, seq2_length):
        r"""
        Calculate the decadic logarithm of the E-value for a given
        score.

        The E-value and the logarithm of the E-value are calculated as

        .. math::

            E = Kmn e^{-\lambda s}

            \log_{10} E = (\log_{10} Kmn) - \frac{\lambda s}{\ln 10},

        where :math:`s` is the similarity score and :math:`m` and
        :math:`n` are the lengths of the aligned sequences.

        Parameters
        ----------
        score : int or ndarray, dtype=int
            The score to evaluate.
        seq1_length : int or ndarray, dtype=int
            The length of the first sequence.
            In the context of a homology search in a sequence database,
            this is usually the length of the query sequence.
        seq2_length : int or ndarray, dtype=int
            The length of the second sequence.
            In the context of a homology search in a sequence database,
            this is usually either the combined length of all sequences
            in the database or the length of the hit sequence multiplied
            by the number of sequences in the database.

        Returns
        -------
        log_e : float
            The decadic logarithm of the E-value.

        Notes
        -----
        This method returns the logarithm of the E-value instead of
        the E-value, as low E-values indicating a highly significant
        homology cannot be accurately represented by a ``float``.
        """
        score = np.asarray(score)
        seq1_length = np.asarray(seq1_length)
        seq2_length = np.asarray(seq2_length)

        # The exponential term is never evaluated directly:
        # its logarithm is subtracted instead
        return np.log10(
            self._k * seq1_length * seq2_length
        ) - self._lam * score / np.log(10)
|
|
Binary file
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
cimport cython
|
|
6
|
+
cimport numpy as np
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# A trace table saves the directions a cell came from
|
|
10
|
+
# A "1" in the corresponding bit in the trace table means
|
|
11
|
+
# the cell came from this direction
|
|
12
|
+
|
|
13
|
+
cdef enum TraceDirectionLinear:
|
|
14
|
+
# Values for linear gap penalty (one score table)
|
|
15
|
+
MATCH = 1 # bit 1 -> diagonal -> alignment of symbols
|
|
16
|
+
GAP_LEFT = 2 # bit 2 -> left -> gap in first sequence
|
|
17
|
+
GAP_TOP = 4 # bit 3 -> top -> gap in second sequence
|
|
18
|
+
|
|
19
|
+
cdef enum TraceDirectionAffine:
|
|
20
|
+
# Values for affine gap penalty (three score tables)
|
|
21
|
+
MATCH_TO_MATCH = 1 # bit 1 -> match - match transition
|
|
22
|
+
GAP_LEFT_TO_MATCH = 2 # bit 2 -> seq 1 gap - match transition
|
|
23
|
+
GAP_TOP_TO_MATCH = 4 # bit 3 -> seq 2 gap - match transition
|
|
24
|
+
MATCH_TO_GAP_LEFT = 8 # bit 4 -> match - seq 1 gap transition
|
|
25
|
+
GAP_LEFT_TO_GAP_LEFT = 16 # bit 5 -> seq 1 gap - seq 1 gap transition
|
|
26
|
+
MATCH_TO_GAP_TOP = 32 # bit 6 -> match - seq 2 gap transition
|
|
27
|
+
GAP_TOP_TO_GAP_TOP = 64 # bit 7 -> seq 2 gap - seq 2 gap transition
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
cdef enum TraceState:
|
|
31
|
+
# The state specifies the table the traceback is currently in
|
|
32
|
+
# For linear gap penalty (only one table/state exists):
|
|
33
|
+
NO_STATE = 0
|
|
34
|
+
# For affine gap penalty (three tables/states exist):
|
|
35
|
+
MATCH_STATE = 1
|
|
36
|
+
GAP_LEFT_STATE = 2
|
|
37
|
+
GAP_TOP_STATE = 3
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
cdef np.uint8_t get_trace_linear(np.int32_t match_score,
|
|
41
|
+
np.int32_t gap_left_score,
|
|
42
|
+
np.int32_t gap_top_score,
|
|
43
|
+
np.int32_t *max_score)
|
|
44
|
+
|
|
45
|
+
cdef np.uint8_t get_trace_affine(np.int32_t match_to_match_score,
|
|
46
|
+
np.int32_t gap_left_to_match_score,
|
|
47
|
+
np.int32_t gap_top_to_match_score,
|
|
48
|
+
np.int32_t match_to_gap_left_score,
|
|
49
|
+
np.int32_t gap_left_to_gap_left_score,
|
|
50
|
+
np.int32_t match_to_gap_top_score,
|
|
51
|
+
np.int32_t gap_top_to_gap_top_score,
|
|
52
|
+
np.int32_t *max_match_score,
|
|
53
|
+
np.int32_t *max_gap_left_score,
|
|
54
|
+
np.int32_t *max_gap_top_score)
|
|
55
|
+
|
|
56
|
+
# Follow the trace(s) encoded in `trace_table` starting at cell (i, j)
# and append each finished trace to `trace_list`.
# Returns 0 on success; -1 is the error return value used to propagate
# Python exceptions.
cdef int follow_trace(
    np.uint8_t[:,:] trace_table,
    bint banded,
    int i, int j, int pos,
    np.int64_t[:,:] trace,
    list trace_list,
    int state,
    int* curr_trace_count,
    int max_trace_count,
    int lower_diag, int upper_diag
) except -1
|
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
A module for Biotite's internal use only.
|
|
7
|
+
Contains C-functions for handling trace tables in a reusable way for
|
|
8
|
+
different alignment functions.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
__name__ = "biotite.sequence.align"
|
|
12
|
+
__author__ = "Patrick Kunzmann"
|
|
13
|
+
__all__ = []
|
|
14
|
+
|
|
15
|
+
cimport cython
|
|
16
|
+
cimport numpy as np
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
cdef inline np.uint8_t get_trace_linear(np.int32_t match_score,
                                        np.int32_t gap_left_score,
                                        np.int32_t gap_top_score,
                                        np.int32_t *max_score):
    """
    Determine the trace direction for a cell when a linear gap penalty
    is used.

    The highest of the three given scores is written to `max_score`.
    The returned value has the :class:`TraceDirectionLinear` flag set
    for every score that is equal to this maximum, i.e. in case of a
    tie multiple flags are combined.
    """
    cdef np.int32_t best = match_score
    # Explicitly typed, so the bit operations are performed on a C value
    cdef np.uint8_t direction = 0

    # Step 1: find the maximum score
    if gap_left_score > best:
        best = gap_left_score
    if gap_top_score > best:
        best = gap_top_score

    # Step 2: flag every origin that reaches the maximum score
    if match_score == best:
        direction |= TraceDirectionLinear.MATCH
    if gap_left_score == best:
        direction |= TraceDirectionLinear.GAP_LEFT
    if gap_top_score == best:
        direction |= TraceDirectionLinear.GAP_TOP

    max_score[0] = best
    return direction
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
cdef inline np.uint8_t get_trace_affine(np.int32_t match_to_match_score,
                                        np.int32_t gap_left_to_match_score,
                                        np.int32_t gap_top_to_match_score,
                                        np.int32_t match_to_gap_left_score,
                                        np.int32_t gap_left_to_gap_left_score,
                                        np.int32_t match_to_gap_top_score,
                                        np.int32_t gap_top_to_gap_top_score,
                                        np.int32_t *max_match_score,
                                        np.int32_t *max_gap_left_score,
                                        np.int32_t *max_gap_top_score):
    """
    Determine the trace direction for a cell when an affine gap penalty
    is used.

    For each of the three score tables (match, 'gap left', 'gap top')
    the highest candidate score is written to the respective output
    pointer.
    The returned value has the :class:`TraceDirectionAffine` flag set
    for every transition whose score is equal to the maximum of its
    table, i.e. in case of a tie multiple flags of the same table are
    combined.
    """
    cdef np.int32_t best_match = match_to_match_score
    # Explicitly typed, so the bit operations are performed on a C value
    cdef np.uint8_t direction = 0

    # Match table: three candidate transitions
    if gap_left_to_match_score > best_match:
        best_match = gap_left_to_match_score
    if gap_top_to_match_score > best_match:
        best_match = gap_top_to_match_score
    if match_to_match_score == best_match:
        direction |= TraceDirectionAffine.MATCH_TO_MATCH
    if gap_left_to_match_score == best_match:
        direction |= TraceDirectionAffine.GAP_LEFT_TO_MATCH
    if gap_top_to_match_score == best_match:
        direction |= TraceDirectionAffine.GAP_TOP_TO_MATCH
    max_match_score[0] = best_match

    # 'Gap left' table: two candidate transitions
    # On a tie both conditions hold and both flags are set
    if match_to_gap_left_score >= gap_left_to_gap_left_score:
        direction |= TraceDirectionAffine.MATCH_TO_GAP_LEFT
        max_gap_left_score[0] = match_to_gap_left_score
    if gap_left_to_gap_left_score >= match_to_gap_left_score:
        direction |= TraceDirectionAffine.GAP_LEFT_TO_GAP_LEFT
        max_gap_left_score[0] = gap_left_to_gap_left_score

    # 'Gap top' table: two candidate transitions
    # On a tie both conditions hold and both flags are set
    if match_to_gap_top_score >= gap_top_to_gap_top_score:
        direction |= TraceDirectionAffine.MATCH_TO_GAP_TOP
        max_gap_top_score[0] = match_to_gap_top_score
    if gap_top_to_gap_top_score >= match_to_gap_top_score:
        direction |= TraceDirectionAffine.GAP_TOP_TO_GAP_TOP
        max_gap_top_score[0] = gap_top_to_gap_top_score

    return direction
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
cdef int follow_trace(np.uint8_t[:,:] trace_table,
                      bint banded,
                      int i, int j, int pos,
                      np.int64_t[:,:] trace,
                      list trace_list,
                      int state,
                      int* curr_trace_count,
                      int max_trace_count,
                      int lower_diag, int upper_diag) except -1:
    """
    Follow the traces encoded in a trace table and collect them.

    Parameters
    ----------
    trace_table
        The matrix of trace direction flags to be followed.
    banded
        Whether the trace table belongs to a banded alignment.
    i, j
        The position in the trace table to start from.
        For the first branch, this is the start of the traceback.
        For each additional branch, this is the start of that branch.
    pos
        The next row to be written in the `trace` array.
        For the first branch, this is 0.
        For each additional branch, the value of the parent branch is
        taken.
    trace
        The alignment trace array to be filled.
    trace_list
        Each finished trace is appended to this list.
    state
        The score table (*match*, *gap left*, *gap top*) the traceback
        is currently in, taken from the parent branch.
        Always ``TraceState.NO_STATE`` (``0``) for a linear gap penalty.
    curr_trace_count
        The current number of branches.
        It is passed as pointer, so an update is visible in all
        branches.
    max_trace_count
        The maximum number of branches.
        As soon as `curr_trace_count` reaches this value, no further
        branches are created.
    lower_diag, upper_diag
        The lower and upper diagonal of a banded alignment.
        Unused, if `banded` is false.

    Returns
    -------
    int
        ``0``, if no exception is raised, otherwise ``-1``.
    """
    cdef list branches
    cdef int flags
    cdef int mask
    cdef int seq_j
    cdef int i_prev
    cdef int j_diag, j_up

    if state == TraceState.NO_STATE:
        # Linear gap penalty
        # A cell without any flag terminates the trace,
        # the cell itself is not part of the trace anymore
        while True:
            flags = trace_table[i, j]
            if flags == 0:
                break

            # The row of the preceding cell is the same for a match
            # and a 'gap top' step
            i_prev = i - 1
            if banded:
                seq_j = j + (i - 1) + lower_diag - 1
                j_diag = j
                j_up = j + 1
            else:
                # The sequences are shifted by one to the bottom/right
                # in the table -> subtract 1 to get sequence positions
                seq_j = j - 1
                j_diag = j - 1
                j_up = j
            trace[pos, 0] = i - 1
            trace[pos, 1] = seq_j
            pos += 1

            # Collect all cells the trace may continue in
            branches = []
            if flags & TraceDirectionLinear.MATCH:
                branches.append((i_prev, j_diag))
            if flags & TraceDirectionLinear.GAP_LEFT:
                branches.append((i, j - 1))
            if flags & TraceDirectionLinear.GAP_TOP:
                branches.append((i_prev, j_up))

            # Every continuation but the first one is a new branch,
            # which is followed recursively on a copy of the trace
            for branch_i, branch_j in branches[1:]:
                if curr_trace_count[0] < max_trace_count:
                    curr_trace_count[0] += 1
                    follow_trace(
                        trace_table, banded, branch_i, branch_j, pos,
                        np.copy(trace), trace_list, TraceState.NO_STATE,
                        curr_trace_count, max_trace_count,
                        lower_diag, upper_diag
                    )
            # The first continuation is followed in this call
            i, j = branches[0]
    else:
        # Affine gap penalty
        while True:
            # Only the flags of the table the trace is currently in
            # are relevant
            if state == TraceState.MATCH_STATE:
                mask = (
                    TraceDirectionAffine.MATCH_TO_MATCH |
                    TraceDirectionAffine.GAP_LEFT_TO_MATCH |
                    TraceDirectionAffine.GAP_TOP_TO_MATCH
                )
            elif state == TraceState.GAP_LEFT_STATE:
                mask = (
                    TraceDirectionAffine.MATCH_TO_GAP_LEFT |
                    TraceDirectionAffine.GAP_LEFT_TO_GAP_LEFT
                )
            elif state == TraceState.GAP_TOP_STATE:
                mask = (
                    TraceDirectionAffine.MATCH_TO_GAP_TOP |
                    TraceDirectionAffine.GAP_TOP_TO_GAP_TOP
                )
            else:
                break
            flags = trace_table[i, j] & mask
            if flags == 0:
                # No flag for the current state -> the trace ends
                break

            i_prev = i - 1
            if banded:
                seq_j = j + (i - 1) + lower_diag - 1
                j_diag = j
                j_up = j + 1
            else:
                # The sequences are shifted by one to the bottom/right
                # in the table -> subtract 1 to get sequence positions
                seq_j = j - 1
                j_diag = j - 1
                j_up = j
            trace[pos, 0] = i - 1
            trace[pos, 1] = seq_j
            pos += 1

            # Collect all cells and states the trace may continue in
            branches = []
            if flags & TraceDirectionAffine.MATCH_TO_MATCH:
                branches.append(
                    (i_prev, j_diag, TraceState.MATCH_STATE)
                )
            if flags & TraceDirectionAffine.GAP_LEFT_TO_MATCH:
                branches.append(
                    (i_prev, j_diag, TraceState.GAP_LEFT_STATE)
                )
            if flags & TraceDirectionAffine.GAP_TOP_TO_MATCH:
                branches.append(
                    (i_prev, j_diag, TraceState.GAP_TOP_STATE)
                )
            if flags & TraceDirectionAffine.MATCH_TO_GAP_LEFT:
                branches.append(
                    (i, j - 1, TraceState.MATCH_STATE)
                )
            if flags & TraceDirectionAffine.GAP_LEFT_TO_GAP_LEFT:
                branches.append(
                    (i, j - 1, TraceState.GAP_LEFT_STATE)
                )
            if flags & TraceDirectionAffine.MATCH_TO_GAP_TOP:
                branches.append(
                    (i_prev, j_up, TraceState.MATCH_STATE)
                )
            if flags & TraceDirectionAffine.GAP_TOP_TO_GAP_TOP:
                branches.append(
                    (i_prev, j_up, TraceState.GAP_TOP_STATE)
                )

            # Every continuation but the first one is a new branch,
            # which is followed recursively on a copy of the trace
            for branch_i, branch_j, branch_state in branches[1:]:
                if curr_trace_count[0] < max_trace_count:
                    curr_trace_count[0] += 1
                    follow_trace(
                        trace_table, banded, branch_i, branch_j, pos,
                        np.copy(trace), trace_list, branch_state,
                        curr_trace_count, max_trace_count,
                        lower_diag, upper_diag
                    )
            # The first continuation is followed in this call
            i, j, state = branches[0]

    # Keep only the rows of the trace where at least one column is
    # not -1 and add the trimmed trace to the list
    # (presumably the caller initializes `trace` with -1)
    filled = np.asarray(trace)
    is_used = (filled[:, 0] != -1) | (filled[:, 1] != -1)
    trace_list.append(filled[is_used])
    return 0
|