biotite 1.5.0__cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +18 -0
- biotite/application/__init__.py +69 -0
- biotite/application/application.py +276 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +500 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +92 -0
- biotite/application/blast/webapp.py +428 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +223 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +216 -0
- biotite/application/localapp.py +342 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +116 -0
- biotite/application/msaapp.py +363 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +227 -0
- biotite/application/muscle/app5.py +163 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +447 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +199 -0
- biotite/application/util.py +77 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +310 -0
- biotite/application/viennarna/rnafold.py +254 -0
- biotite/application/viennarna/rnaplot.py +208 -0
- biotite/application/viennarna/util.py +77 -0
- biotite/application/webapp.py +76 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +197 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +60 -0
- biotite/database/entrez/dbnames.py +101 -0
- biotite/database/entrez/download.py +228 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +263 -0
- biotite/database/error.py +16 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +258 -0
- biotite/database/pubchem/error.py +30 -0
- biotite/database/pubchem/query.py +819 -0
- biotite/database/pubchem/throttle.py +98 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +161 -0
- biotite/database/rcsb/query.py +963 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +40 -0
- biotite/database/uniprot/download.py +126 -0
- biotite/database/uniprot/query.py +292 -0
- biotite/file.py +244 -0
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1228 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/__init__.py +84 -0
- biotite/sequence/align/__init__.py +199 -0
- biotite/sequence/align/alignment.py +702 -0
- biotite/sequence/align/banded.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +71 -0
- biotite/sequence/align/cigar.py +425 -0
- biotite/sequence/align/kmeralphabet.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +595 -0
- biotite/sequence/align/kmersimilarity.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmertable.pyx +3411 -0
- biotite/sequence/align/localgapped.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +631 -0
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/multiple.pyx +619 -0
- biotite/sequence/align/pairwise.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/pairwise.pyx +585 -0
- biotite/sequence/align/permutation.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/permutation.pyx +313 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/selector.pyx +954 -0
- biotite/sequence/align/statistics.py +264 -0
- biotite/sequence/align/tracetable.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +555 -0
- biotite/sequence/annotation.py +836 -0
- biotite/sequence/codec.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +476 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1101 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +170 -0
- biotite/sequence/graphics/dendrogram.py +231 -0
- biotite/sequence/graphics/features.py +544 -0
- biotite/sequence/graphics/logo.py +102 -0
- biotite/sequence/graphics/plasmid.py +712 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +283 -0
- biotite/sequence/io/fasta/file.py +265 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +117 -0
- biotite/sequence/io/fastq/file.py +507 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +269 -0
- biotite/sequence/io/genbank/file.py +573 -0
- biotite/sequence/io/genbank/metadata.py +336 -0
- biotite/sequence/io/genbank/sequence.py +173 -0
- biotite/sequence/io/general.py +201 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +128 -0
- biotite/sequence/io/gff/file.py +449 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +561 -0
- biotite/sequence/search.py +117 -0
- biotite/sequence/seqtypes.py +720 -0
- biotite/sequence/sequence.py +373 -0
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +135 -0
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +1562 -0
- biotite/structure/basepairs.py +1403 -0
- biotite/structure/bonds.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/structure/bonds.pyx +2036 -0
- biotite/structure/box.py +724 -0
- biotite/structure/celllist.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/structure/celllist.pyx +864 -0
- biotite/structure/chains.py +310 -0
- biotite/structure/charges.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +683 -0
- biotite/structure/density.py +109 -0
- biotite/structure/dotbracket.py +213 -0
- biotite/structure/error.py +39 -0
- biotite/structure/filter.py +591 -0
- biotite/structure/geometry.py +817 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +243 -0
- biotite/structure/graphics/rna.py +298 -0
- biotite/structure/hbond.py +425 -0
- biotite/structure/info/__init__.py +24 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +98 -0
- biotite/structure/info/bonds.py +149 -0
- biotite/structure/info/ccd.py +200 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +128 -0
- biotite/structure/info/masses.py +121 -0
- biotite/structure/info/misc.py +137 -0
- biotite/structure/info/radii.py +267 -0
- biotite/structure/info/standardize.py +185 -0
- biotite/structure/integrity.py +213 -0
- biotite/structure/io/__init__.py +29 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +67 -0
- biotite/structure/io/general.py +243 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +112 -0
- biotite/structure/io/mol/ctab.py +420 -0
- biotite/structure/io/mol/header.py +120 -0
- biotite/structure/io/mol/mol.py +149 -0
- biotite/structure/io/mol/sdf.py +940 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +64 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +389 -0
- biotite/structure/io/pdb/file.py +1380 -0
- biotite/structure/io/pdb/hybrid36.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +113 -0
- biotite/structure/io/pdbqt/file.py +688 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +674 -0
- biotite/structure/io/pdbx/cif.py +1091 -0
- biotite/structure/io/pdbx/component.py +251 -0
- biotite/structure/io/pdbx/compress.py +362 -0
- biotite/structure/io/pdbx/convert.py +2113 -0
- biotite/structure/io/pdbx/encoding.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +1078 -0
- biotite/structure/io/trajfile.py +696 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +43 -0
- biotite/structure/io/util.py +38 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +43 -0
- biotite/structure/mechanics.py +72 -0
- biotite/structure/molecules.py +337 -0
- biotite/structure/pseudoknots.py +622 -0
- biotite/structure/rdf.py +245 -0
- biotite/structure/repair.py +302 -0
- biotite/structure/residues.py +716 -0
- biotite/structure/rings.py +451 -0
- biotite/structure/sasa.cpython-314-x86_64-linux-gnu.so +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/segments.py +328 -0
- biotite/structure/sequence.py +110 -0
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +306 -0
- biotite/structure/superimpose.py +511 -0
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +736 -0
- biotite/structure/util.py +160 -0
- biotite/version.py +34 -0
- biotite/visualize.py +375 -0
- biotite-1.5.0.dist-info/METADATA +162 -0
- biotite-1.5.0.dist-info/RECORD +354 -0
- biotite-1.5.0.dist-info/WHEEL +6 -0
- biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,652 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence.align"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
__all__ = ["align_banded"]
|
|
8
|
+
|
|
9
|
+
cimport cython
|
|
10
|
+
cimport numpy as np
|
|
11
|
+
from .tracetable cimport follow_trace, get_trace_linear, get_trace_affine, \
|
|
12
|
+
TraceDirectionAffine, TraceState
|
|
13
|
+
|
|
14
|
+
from .matrix import SubstitutionMatrix
|
|
15
|
+
from ..sequence import Sequence
|
|
16
|
+
from .alignment import Alignment
|
|
17
|
+
import numpy as np
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
ctypedef np.int32_t int32
|
|
21
|
+
ctypedef np.int64_t int64
|
|
22
|
+
ctypedef np.uint8_t uint8
|
|
23
|
+
ctypedef np.uint16_t uint16
|
|
24
|
+
ctypedef np.uint32_t uint32
|
|
25
|
+
ctypedef np.uint64_t uint64
|
|
26
|
+
|
|
27
|
+
ctypedef fused CodeType1:
|
|
28
|
+
uint8
|
|
29
|
+
uint16
|
|
30
|
+
uint32
|
|
31
|
+
uint64
|
|
32
|
+
ctypedef fused CodeType2:
|
|
33
|
+
uint8
|
|
34
|
+
uint16
|
|
35
|
+
uint32
|
|
36
|
+
uint64
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def align_banded(seq1, seq2, matrix, band, gap_penalty=-10, local=False,
|
|
40
|
+
max_number=1000):
|
|
41
|
+
"""
|
|
42
|
+
align_banded(seq1, seq2, matrix, band, gap_penalty=-10, local=False,
|
|
43
|
+
max_number=1000)
|
|
44
|
+
|
|
45
|
+
Perform a local or semi-global alignment within a defined diagonal
|
|
46
|
+
band. :footcite:`Pearson1988`
|
|
47
|
+
|
|
48
|
+
The function requires two diagonals that define the lower
|
|
49
|
+
and upper limit of the alignment band.
|
|
50
|
+
A diagonal is an integer defined as :math:`D = j - i`, where *i* and
|
|
51
|
+
*j* are sequence positions in the first and second sequence,
|
|
52
|
+
respectively.
|
|
53
|
+
This means that two symbols at position *i* and *j* can only be
|
|
54
|
+
aligned to each other, if :math:`D_L \leq j - i \leq D_U`.
|
|
55
|
+
With increasing width of the diagonal band, the probability to find
|
|
56
|
+
the optimal alignment, but also the computation time increases.
|
|
57
|
+
|
|
58
|
+
Parameters
|
|
59
|
+
----------
|
|
60
|
+
seq1, seq2 : Sequence
|
|
61
|
+
The sequences to be aligned.
|
|
62
|
+
matrix : SubstitutionMatrix
|
|
63
|
+
The substitution matrix used for scoring.
|
|
64
|
+
band : tuple(int, int)
|
|
65
|
+
The diagonals that represent the lower and upper limit of the
|
|
66
|
+
search space.
|
|
67
|
+
A diagonal :math:`D` is defined as :math:`D = j-i`, where
|
|
68
|
+
:math:`i` and :math:`j` are positions in `seq1` and `seq2`,
|
|
69
|
+
respectively.
|
|
70
|
+
An alignment of sequence positions where :math:`D` is lower than
|
|
71
|
+
the lower limit or greater than the upper limit is not explored
|
|
72
|
+
by the algorithm.
|
|
73
|
+
gap_penalty : int or tuple(int, int), optional
|
|
74
|
+
If an integer is provided, the value will be interpreted as
|
|
75
|
+
linear gap penalty.
|
|
76
|
+
If a tuple is provided, an affine gap penalty is used.
|
|
77
|
+
The first integer in the tuple is the gap opening penalty,
|
|
78
|
+
the second integer is the gap extension penalty.
|
|
79
|
+
The values need to be negative.
|
|
80
|
+
local : bool, optional
|
|
81
|
+
If set to true, a local alignment is performed.
|
|
82
|
+
Otherwise (default) a semi-global alignment is performed.
|
|
83
|
+
max_number : int, optional
|
|
84
|
+
The maximum number of alignments returned.
|
|
85
|
+
When the number of branches exceeds this value in the traceback
|
|
86
|
+
step, no further branches are created.
|
|
87
|
+
|
|
88
|
+
Returns
|
|
89
|
+
-------
|
|
90
|
+
alignments : list of Alignment
|
|
91
|
+
The generated alignments.
|
|
92
|
+
Each alignment in the list has the same similarity score,
|
|
93
|
+
which is the maximum score possible within the defined band.
|
|
94
|
+
|
|
95
|
+
See Also
|
|
96
|
+
--------
|
|
97
|
+
align_optimal
|
|
98
|
+
Guarantees to find the optimal alignment at the cost of greater
|
|
99
|
+
computation time and memory requirements.
|
|
100
|
+
|
|
101
|
+
Notes
|
|
102
|
+
-----
|
|
103
|
+
The diagonals give the maximum difference between the
|
|
104
|
+
number of inserted gaps.
|
|
105
|
+
This means for any position in the alignment, the algorithm
|
|
106
|
+
will not consider inserting a gap into a sequence, if the first
|
|
107
|
+
sequence has already ``-band[0]`` more gaps than the second
|
|
108
|
+
sequence or if the second sequence has already ``band[1]`` more gaps
|
|
109
|
+
than the first sequence, even if inserting additional gaps would
|
|
110
|
+
yield a more optimal alignment.
|
|
111
|
+
Considerations on how to find a suitable band width are discussed in
|
|
112
|
+
:footcite:`Gibrat2018`.
|
|
113
|
+
|
|
114
|
+
The restriction to a limited band is the central difference between
|
|
115
|
+
the banded alignment heuristic and the optimal alignment
|
|
116
|
+
algorithms :footcite:`Needleman1970, Smith1981`.
|
|
117
|
+
Those classical algorithms require :math:`O(m \cdot n)`
|
|
118
|
+
memory space and computation time for aligning two sequences with
|
|
119
|
+
lengths :math:`m` and :math:`n`, respectively.
|
|
120
|
+
The banded alignment algorithm reduces both requirements to
|
|
121
|
+
:math:`O(\min(m,n) \cdot (D_U - D_L))`.
|
|
122
|
+
|
|
123
|
+
*Implementation details*
|
|
124
|
+
|
|
125
|
+
The implementation is very similar to :func:`align_optimal()`.
|
|
126
|
+
The most significant difference is that not the complete alignment
|
|
127
|
+
table is filled, but only the cells that lie within the diagonal
|
|
128
|
+
band.
|
|
129
|
+
Furthermore, to also reduce the space requirements, the diagonal band
|
|
130
|
+
is 'straightened', i.e. the table's rows are indented to the left.
|
|
131
|
+
Hence, this table
|
|
132
|
+
|
|
133
|
+
= = = = = = = = = =
|
|
134
|
+
. . x x x . . . . .
|
|
135
|
+
. . . x x x . . . .
|
|
136
|
+
. . . . x x x . . .
|
|
137
|
+
. . . . . x x x . .
|
|
138
|
+
. . . . . . x x x .
|
|
139
|
+
= = = = = = = = = =
|
|
140
|
+
|
|
141
|
+
is transformed into this table:
|
|
142
|
+
|
|
143
|
+
= = =
|
|
144
|
+
x x x
|
|
145
|
+
x x x
|
|
146
|
+
x x x
|
|
147
|
+
x x x
|
|
148
|
+
x x x
|
|
149
|
+
= = =
|
|
150
|
+
|
|
151
|
+
Filled cells, i.e. cells within the band, are indicated by ``x``.
|
|
152
|
+
The shorter sequence is always represented by the first dimension
|
|
153
|
+
of the table in this implementation.
|
|
154
|
+
|
|
155
|
+
References
|
|
156
|
+
----------
|
|
157
|
+
|
|
158
|
+
.. footbibliography::
|
|
159
|
+
|
|
160
|
+
Examples
|
|
161
|
+
--------
|
|
162
|
+
|
|
163
|
+
Find a matching diagonal for two sequences:
|
|
164
|
+
|
|
165
|
+
>>> sequence1 = NucleotideSequence("GCGCGCTATATTATGCGCGC")
|
|
166
|
+
>>> sequence2 = NucleotideSequence("TATAAT")
|
|
167
|
+
>>> table = KmerTable.from_sequences(k=4, sequences=[sequence1])
|
|
168
|
+
>>> match = table.match(sequence2)[0]
|
|
169
|
+
>>> diagonal = match[0] - match[2]
|
|
170
|
+
>>> print(diagonal)
|
|
171
|
+
-6
|
|
172
|
+
|
|
173
|
+
Align the sequences centered on the diagonal with buffer in both
|
|
174
|
+
directions:
|
|
175
|
+
|
|
176
|
+
>>> BUFFER = 5
|
|
177
|
+
>>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
|
|
178
|
+
>>> alignment = align_banded(
|
|
179
|
+
... sequence1, sequence2, matrix,
|
|
180
|
+
... band=(diagonal - BUFFER, diagonal + BUFFER), gap_penalty=(-6, -1)
|
|
181
|
+
... )[0]
|
|
182
|
+
>>> print(alignment)
|
|
183
|
+
TATATTAT
|
|
184
|
+
TATA--AT
|
|
185
|
+
"""
|
|
186
|
+
# Check matrix alphabets
|
|
187
|
+
if not matrix.get_alphabet1().extends(seq1.get_alphabet()) \
|
|
188
|
+
or not matrix.get_alphabet2().extends(seq2.get_alphabet()):
|
|
189
|
+
raise ValueError("The sequences' alphabets do not fit the matrix")
|
|
190
|
+
# Check if gap penalty is linear or affine
|
|
191
|
+
if type(gap_penalty) == int:
|
|
192
|
+
if gap_penalty > 0:
|
|
193
|
+
raise ValueError("Gap penalty must be negative")
|
|
194
|
+
affine_penalty = False
|
|
195
|
+
elif type(gap_penalty) == tuple:
|
|
196
|
+
if gap_penalty[0] > 0 or gap_penalty[1] > 0:
|
|
197
|
+
raise ValueError("Gap penalty must be negative")
|
|
198
|
+
affine_penalty = True
|
|
199
|
+
else:
|
|
200
|
+
raise TypeError("Gap penalty must be either integer or tuple")
|
|
201
|
+
# Check if max_number is reasonable
|
|
202
|
+
if max_number < 1:
|
|
203
|
+
raise ValueError(
|
|
204
|
+
"Maximum number of returned alignments must be at least 1"
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
# The shorter sequence is the one on the left of the matrix
|
|
208
|
+
# -> shorter sequence is 'seq1'
|
|
209
|
+
if len(seq2) < len(seq1):
|
|
210
|
+
seq1, seq2 = seq2, seq1
|
|
211
|
+
band = [-diag for diag in band]
|
|
212
|
+
matrix = matrix.transpose()
|
|
213
|
+
is_swapped = True
|
|
214
|
+
else:
|
|
215
|
+
is_swapped = False
|
|
216
|
+
lower_diag, upper_diag = min(band), max(band)
|
|
217
|
+
if len(seq1) + upper_diag <= 0 or lower_diag >= len(seq2):
|
|
218
|
+
raise ValueError(
|
|
219
|
+
"Alignment band is out of range, the band allows no overlap "
|
|
220
|
+
"between both sequences"
|
|
221
|
+
)
|
|
222
|
+
# Crop band diagonals to reasonable size, so that it at maximum
|
|
223
|
+
# covers the search space of an unbanded alignment
|
|
224
|
+
lower_diag = max(lower_diag, -len(seq1)+1)
|
|
225
|
+
upper_diag = min(upper_diag, len(seq2)-1)
|
|
226
|
+
band_width = upper_diag - lower_diag + 1
|
|
227
|
+
if band_width < 1:
|
|
228
|
+
raise ValueError("The width of the band is 0")
|
|
229
|
+
|
|
230
|
+
# This implementation uses transposed tables in comparison
|
|
231
|
+
# to the common visualization
|
|
232
|
+
# This means, the first sequence is on the left
|
|
233
|
+
# and the second sequence is at the top
|
|
234
|
+
|
|
235
|
+
# Terminal gap column on the left can be omitted in this algorithm,
|
|
236
|
+
# as terminal gaps are not part of the alignment
|
|
237
|
+
# This is not possible for the top row, as the dynamic programming
|
|
238
|
+
# algorithm requires these initial values
|
|
239
|
+
# On the left and right side an additional column is inserted
|
|
240
|
+
# representing the invalid boundaries of the band
|
|
241
|
+
# This prevents unnecessary bound checks when filling the dynamic
|
|
242
|
+
# programming matrix (score and trace)
|
|
243
|
+
trace_table = np.zeros((len(seq1)+1, band_width+2), dtype=np.uint8)
|
|
244
|
+
code1 = seq1.code
|
|
245
|
+
code2 = seq2.code
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
# Table filling
|
|
249
|
+
###############
|
|
250
|
+
|
|
251
|
+
# A score value that signals that the respective direction in the
|
|
252
|
+
# dynamic programming matrix should not be used, since it would be
|
|
253
|
+
# outside the band
|
|
254
|
+
# It is the 'worst' score available, so the trace table will never
|
|
255
|
+
# include such a direction
|
|
256
|
+
neg_inf = np.iinfo(np.int32).min
|
|
257
|
+
# Correct the 'negative infinity' integer, by making it more positive
|
|
258
|
+
# This prevents an integer underflow when the gap penalty or
|
|
259
|
+
# match score is added to this value
|
|
260
|
+
neg_inf -= min(gap_penalty) if affine_penalty else gap_penalty
|
|
261
|
+
min_score = np.min(matrix.score_matrix())
|
|
262
|
+
if min_score < 0:
|
|
263
|
+
neg_inf -= min_score
|
|
264
|
+
|
|
265
|
+
if affine_penalty:
|
|
266
|
+
# Affine gap penalty
|
|
267
|
+
gap_open = gap_penalty[0]
|
|
268
|
+
gap_ext = gap_penalty[1]
|
|
269
|
+
# m_table, g1_table and g2_table are the 3 score tables
|
|
270
|
+
m_table = np.zeros((len(seq1)+1, band_width+2), dtype=np.int32)
|
|
271
|
+
# Fill with negative infinity values to prevent that an
|
|
272
|
+
# alignment trace starts with a gap extension
|
|
273
|
+
# instead of a gap opening
|
|
274
|
+
g1_table = np.full((len(seq1)+1, band_width+2), neg_inf, np.int32)
|
|
275
|
+
g2_table = np.full((len(seq1)+1, band_width+2), neg_inf, np.int32)
|
|
276
|
+
# As explained for the trace table (see above),
|
|
277
|
+
# the score table is filled with negative infinity values
|
|
278
|
+
# on the left and right column to prevent the trace leaving the
|
|
279
|
+
# alignment band
|
|
280
|
+
m_table[:, 0] = neg_inf
|
|
281
|
+
m_table[:, -1] = neg_inf
|
|
282
|
+
# Initialize first row and column for global alignments
|
|
283
|
+
_fill_align_table_affine(code1, code2,
|
|
284
|
+
matrix.score_matrix(), trace_table,
|
|
285
|
+
m_table, g1_table, g2_table,
|
|
286
|
+
lower_diag, upper_diag,
|
|
287
|
+
gap_open, gap_ext, local)
|
|
288
|
+
else:
|
|
289
|
+
# Linear gap penalty
|
|
290
|
+
score_table = np.zeros((len(seq1)+1, band_width+2), dtype=np.int32)
|
|
291
|
+
score_table[:, 0] = neg_inf
|
|
292
|
+
score_table[:, -1] = neg_inf
|
|
293
|
+
_fill_align_table(
|
|
294
|
+
code1, code2, matrix.score_matrix(), trace_table, score_table,
|
|
295
|
+
lower_diag, upper_diag, gap_penalty, local
|
|
296
|
+
)
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
# Traceback
|
|
300
|
+
###########
|
|
301
|
+
|
|
302
|
+
# Stores all possible traces (= possible alignments)
|
|
303
|
+
# A trace stores the indices of the aligned symbols
|
|
304
|
+
# in both sequences
|
|
305
|
+
trace_list = []
|
|
306
|
+
# Lists of trace starting indices
|
|
307
|
+
i_list = np.zeros(0, dtype=int)
|
|
308
|
+
j_list = np.zeros(0, dtype=int)
|
|
309
|
+
# `state_list` lists of start states
|
|
310
|
+
# State specifies the table the trace starts in
|
|
311
|
+
if local:
|
|
312
|
+
# The start point is the maximal score in the table
|
|
313
|
+
# Multiple starting points possible,
|
|
314
|
+
# when duplicates of maximum score exist
|
|
315
|
+
if affine_penalty:
|
|
316
|
+
# The maximum score in the gap score tables does not need to
|
|
317
|
+
# be considered, as these starting positions would indicate
|
|
318
|
+
# that the local alignment starts with a gap
|
|
319
|
+
# Hence the maximum score value in these tables is always
|
|
320
|
+
# less than in the match table
|
|
321
|
+
max_score = np.max(m_table)
|
|
322
|
+
i_list, j_list = np.where((m_table == max_score))
|
|
323
|
+
state_list = np.full(
|
|
324
|
+
len(i_list), TraceState.MATCH_STATE, dtype=int
|
|
325
|
+
)
|
|
326
|
+
else:
|
|
327
|
+
max_score = np.max(score_table)
|
|
328
|
+
i_list, j_list = np.where((score_table == max_score))
|
|
329
|
+
# State is always 0 for linear gap penalty
|
|
330
|
+
# since there is only one table
|
|
331
|
+
state_list = np.full(
|
|
332
|
+
len(i_list), TraceState.NO_STATE, dtype=int
|
|
333
|
+
)
|
|
334
|
+
else:
|
|
335
|
+
# Get all allowed trace start indices
|
|
336
|
+
possible_i_start, possible_j_start = get_global_trace_starts(
|
|
337
|
+
len(seq1), len(seq2), lower_diag, upper_diag
|
|
338
|
+
)
|
|
339
|
+
if affine_penalty:
|
|
340
|
+
state_list = np.zeros(0, dtype=int)
|
|
341
|
+
m_scores = m_table[possible_i_start, possible_j_start]
|
|
342
|
+
g1_scores = g1_table[possible_i_start, possible_j_start]
|
|
343
|
+
g2_scores = g2_table[possible_i_start, possible_j_start]
|
|
344
|
+
m_max_score = np.max(m_scores)
|
|
345
|
+
g1_max_score = np.max(g1_scores)
|
|
346
|
+
g2_max_score = np.max(g2_scores)
|
|
347
|
+
max_score = max(m_max_score, g1_max_score, g2_max_score)
|
|
348
|
+
if m_max_score == max_score:
|
|
349
|
+
best_indices = np.where(m_scores == max_score)[0]
|
|
350
|
+
i_list = np.append(i_list, possible_i_start[best_indices])
|
|
351
|
+
j_list = np.append(j_list, possible_j_start[best_indices])
|
|
352
|
+
state_list = np.append(
|
|
353
|
+
state_list,
|
|
354
|
+
np.full(len(best_indices),
|
|
355
|
+
TraceState.MATCH_STATE, dtype=int)
|
|
356
|
+
)
|
|
357
|
+
if g1_max_score == max_score:
|
|
358
|
+
best_indices = np.where(g1_scores == max_score)[0]
|
|
359
|
+
i_list = np.append(i_list, possible_i_start[best_indices])
|
|
360
|
+
j_list = np.append(j_list, possible_j_start[best_indices])
|
|
361
|
+
state_list = np.append(
|
|
362
|
+
state_list,
|
|
363
|
+
np.full(len(best_indices),
|
|
364
|
+
TraceState.GAP_LEFT_STATE, dtype=int)
|
|
365
|
+
)
|
|
366
|
+
if g2_max_score == max_score:
|
|
367
|
+
best_indices = np.where(g2_scores == max_score)[0]
|
|
368
|
+
i_list = np.append(i_list, possible_i_start[best_indices])
|
|
369
|
+
j_list = np.append(j_list, possible_j_start[best_indices])
|
|
370
|
+
state_list = np.append(
|
|
371
|
+
state_list,
|
|
372
|
+
np.full(len(best_indices),
|
|
373
|
+
TraceState.GAP_TOP_STATE, dtype=int)
|
|
374
|
+
)
|
|
375
|
+
else:
|
|
376
|
+
# Choose the trace start index with the highest score
|
|
377
|
+
# in the score table
|
|
378
|
+
scores = score_table[possible_i_start, possible_j_start]
|
|
379
|
+
max_score = np.max(scores)
|
|
380
|
+
best_indices = np.where(scores == max_score)
|
|
381
|
+
i_list = possible_i_start[best_indices]
|
|
382
|
+
j_list = possible_j_start[best_indices]
|
|
383
|
+
state_list = np.full(
|
|
384
|
+
len(i_list), TraceState.NO_STATE, dtype=int
|
|
385
|
+
)
|
|
386
|
+
|
|
387
|
+
# Follow the traces specified in state and indices lists
|
|
388
|
+
cdef int curr_trace_count
|
|
389
|
+
for k in range(len(i_list)):
|
|
390
|
+
i_start = i_list[k]
|
|
391
|
+
j_start = j_list[k]
|
|
392
|
+
state_start = state_list[k]
|
|
393
|
+
# Pessimistic array allocation:
|
|
394
|
+
# The maximum trace length arises from an alignment, where each
|
|
395
|
+
# symbol is aligned to a gap
|
|
396
|
+
trace = np.full((len(seq1) + len(seq2), 2), -1, dtype=np.int64)
|
|
397
|
+
curr_trace_count = 1
|
|
398
|
+
follow_trace(
|
|
399
|
+
trace_table, True, i_start, j_start, 0,
|
|
400
|
+
trace, trace_list, state=state_start,
|
|
401
|
+
curr_trace_count=&curr_trace_count, max_trace_count=max_number,
|
|
402
|
+
lower_diag=lower_diag, upper_diag=upper_diag
|
|
403
|
+
)
|
|
404
|
+
|
|
405
|
+
# Replace gap entries in trace with -1
|
|
406
|
+
for i, trace in enumerate(trace_list):
|
|
407
|
+
trace = np.flip(trace, axis=0)
|
|
408
|
+
gap_filter = np.zeros(trace.shape, dtype=bool)
|
|
409
|
+
gap_filter[np.unique(trace[:,0], return_index=True)[1], 0] = True
|
|
410
|
+
gap_filter[np.unique(trace[:,1], return_index=True)[1], 1] = True
|
|
411
|
+
trace[~gap_filter] = -1
|
|
412
|
+
trace_list[i] = trace
|
|
413
|
+
|
|
414
|
+
# Limit the number of generated alignments to `max_number`:
|
|
415
|
+
# In most cases this is achieved by discarding branches in
|
|
416
|
+
# 'follow_trace()', however, if multiple alignment starts
|
|
417
|
+
# are used, the number of created traces are the number of
|
|
418
|
+
# starts times `max_number`
|
|
419
|
+
trace_list = trace_list[:max_number]
|
|
420
|
+
if is_swapped:
|
|
421
|
+
return [Alignment([seq2, seq1], np.flip(trace, axis=1), max_score)
|
|
422
|
+
for trace in trace_list]
|
|
423
|
+
else:
|
|
424
|
+
return [Alignment([seq1, seq2], trace, max_score)
|
|
425
|
+
for trace in trace_list]
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def _fill_align_table(CodeType1[:] code1 not None,
                      CodeType2[:] code2 not None,
                      const int32[:,:] mat not None,
                      uint8[:,:] trace_table not None,
                      int32[:,:] score_table not None,
                      int lower_diag,
                      int upper_diag,
                      int gap_penalty,
                      bint local):
    """
    Fill an alignment table with linear gap penalty using dynamic
    programming.

    Only the cells inside the diagonal band are visited, i.e. those
    where ``seq_i + lower_diag <= seq_j <= seq_i + upper_diag``.

    Parameters
    ----------
    code1, code2
        The sequence code of each sequence to be aligned.
    mat
        The score matrix obtained from the :class:`SubstitutionMatrix`
        object.
    trace_table
        A matrix containing values indicating the direction for the
        traceback step.
        The matrix is filled in this function.
    score_table
        The alignment table.
        The matrix is filled in this function.
    lower_diag, upper_diag
        The lowest and highest diagonal of the band, expressed as the
        offset ``seq_j - seq_i`` (both inclusive).
    gap_penalty
        The linear gap penalty.
    local
        Indicates, whether a local alignment should be performed.
    """

    cdef int i, j
    cdef int seq_i, seq_j
    cdef int32 from_diag, from_left, from_top
    cdef uint8 trace
    cdef int32 score

    # Iterate over sequence positions:
    # The table row index is shifted by 1 with respect to the
    # sequence index, so row 0 is never written in this loop
    for seq_i in range(0, code1.shape[0]):
        # Transform sequence index into table index
        i = seq_i + 1
        for seq_j in range(
            max(0, seq_i + lower_diag),
            min(code2.shape[0], seq_i + upper_diag+1)
        ):
            # Transform sequence index into table index
            # Due to the diagonal band and its 'straightening'
            # seq_j must be transformed to obtain the table index
            j = seq_j - seq_i - lower_diag + 1

            # Evaluate score from available directions:
            # Due to the 'straightening' of the diagonal band,
            # the 'upper left' and 'upper' direction from the classic
            # matrix become 'upper' and 'upper right', respectively
            from_diag = score_table[i-1, j  ] + mat[code1[seq_i], code2[seq_j]]
            from_left = score_table[i,   j-1] + gap_penalty
            from_top  = score_table[i-1, j+1] + gap_penalty

            trace = get_trace_linear(from_diag, from_left, from_top, &score)

            # Local alignment specialty:
            # If score is less than or equal to 0,
            # then 0 is saved on the field and the trace ends here
            # (the trace table cell is left untouched)
            if local and score <= 0:
                score_table[i,j] = 0
            else:
                score_table[i,j] = score
                trace_table[i,j] = trace
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def _fill_align_table_affine(CodeType1[:] code1 not None,
                             CodeType2[:] code2 not None,
                             const int32[:,:] mat not None,
                             uint8[:,:] trace_table not None,
                             int32[:,:] m_table not None,
                             int32[:,:] g1_table not None,
                             int32[:,:] g2_table not None,
                             int lower_diag,
                             int upper_diag,
                             int gap_open,
                             int gap_ext,
                             bint local):
    """
    Fill an alignment table with affine gap penalty using dynamic
    programming.

    Only the cells inside the diagonal band are visited, i.e. those
    where ``seq_i + lower_diag <= seq_j <= seq_i + upper_diag``.

    Parameters
    ----------
    code1, code2
        The sequence code of each sequence to be aligned.
    mat
        The score matrix obtained from the :class:`SubstitutionMatrix`
        object.
    trace_table
        A matrix containing values indicating the direction for the
        traceback step.
        The matrix is filled in this function.
    m_table, g1_table, g2_table
        The alignment tables containing the scores.
        `m_table` contains values for matches.
        `g1_table` contains values for gaps in the first sequence.
        `g2_table` contains values for gaps in the second sequence.
        The matrix is filled in this function.
    lower_diag, upper_diag
        The lowest and highest diagonal of the band, expressed as the
        offset ``seq_j - seq_i`` (both inclusive).
    gap_open
        The gap opening penalty.
    gap_ext
        The gap extension penalty.
    local
        Indicates, whether a local alignment should be performed.
    """

    cdef int i, j
    cdef int seq_i, seq_j
    cdef int32 mm_score, g1m_score, g2m_score
    cdef int32 mg1_score, g1g1_score
    cdef int32 mg2_score, g2g2_score
    cdef uint8 trace
    cdef int32 m_score, g1_score, g2_score
    cdef int32 similarity_score

    # Iterate over sequence positions:
    # The table row index is shifted by 1 with respect to the
    # sequence index, so row 0 is never written in this loop
    for seq_i in range(0, code1.shape[0]):
        # Transform sequence index into table index
        i = seq_i + 1
        for seq_j in range(
            max(0, seq_i + lower_diag),
            min(code2.shape[0], seq_i + upper_diag+1)
        ):
            # Column index in the 'straightened' band table
            j = seq_j - seq_i - lower_diag + 1
            # Calculate the scores for possible transitions
            # into the current cell
            similarity_score = mat[code1[seq_i], code2[seq_j]]
            mm_score  =  m_table[i-1, j] + similarity_score
            g1m_score = g1_table[i-1, j] + similarity_score
            g2m_score = g2_table[i-1, j] + similarity_score
            # No transition from g1_table to g2_table and vice versa
            # Since this would mean adjacent gaps in both sequences
            # A substitution makes more sense in this case
            mg1_score  =  m_table[i,   j-1] + gap_open
            g1g1_score = g1_table[i,   j-1] + gap_ext
            mg2_score  =  m_table[i-1, j+1] + gap_open
            g2g2_score = g2_table[i-1, j+1] + gap_ext

            trace = get_trace_affine(
                mm_score, g1m_score, g2m_score,
                mg1_score, g1g1_score,
                mg2_score, g2g2_score,
                # The max score values to be written
                &m_score, &g1_score, &g2_score
            )

            # Fill values into tables
            # Local alignment specialty:
            # If score is less than or equal to 0,
            # then the score of the cell remains 0
            # and the trace ends here
            if local:
                if m_score <= 0:
                    # End trace in specific table
                    # by filtering out the respective bits
                    trace &= ~(
                        TraceDirectionAffine.MATCH_TO_MATCH |
                        TraceDirectionAffine.GAP_LEFT_TO_MATCH |
                        TraceDirectionAffine.GAP_TOP_TO_MATCH
                    )
                    # m_table[i,j] remains 0
                else:
                    m_table[i,j] = m_score
                if g1_score <= 0:
                    trace &= ~(
                        TraceDirectionAffine.MATCH_TO_GAP_LEFT |
                        TraceDirectionAffine.GAP_LEFT_TO_GAP_LEFT
                    )
                    # g1_table[i,j] remains negative infinity
                else:
                    g1_table[i,j] = g1_score
                if g2_score <= 0:
                    trace &= ~(
                        TraceDirectionAffine.MATCH_TO_GAP_TOP |
                        TraceDirectionAffine.GAP_TOP_TO_GAP_TOP
                    )
                    # g2_table[i,j] remains negative infinity
                else:
                    g2_table[i,j] = g2_score
            else:
                m_table[i,j] = m_score
                g1_table[i,j] = g1_score
                g2_table[i,j] = g2_score
            # The (possibly filtered) trace is always stored
            trace_table[i,j] = trace
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
def get_global_trace_starts(seq1_len, seq2_len, lower_diag, upper_diag):
    """
    Get the table cells where the traceback of a global banded
    alignment may start.

    One start cell is returned for each diagonal of the band.

    Parameters
    ----------
    seq1_len, seq2_len
        The lengths of the first and second sequence.
    lower_diag, upper_diag
        The lowest and highest diagonal of the band (both inclusive).

    Returns
    -------
    i, j
        The row and column indices of the start cells in the
        'straightened' band table.
    """
    n_diagonals = upper_diag - lower_diag + 1
    # One column per diagonal, counted from 1
    columns = np.arange(1, n_diagonals + 1)

    # Position in the second sequence that each column corresponds to
    # in the row belonging to the last symbol of the first sequence
    seq2_pos_in_last_row = columns + (seq1_len - 1) + lower_diag - 1

    # Candidate 1: the table row of the last symbol of the first
    # sequence
    last_row = np.full(len(columns), (seq1_len - 1) + 1, dtype=int)

    # Candidate 2: the row in which the column reaches the last symbol
    # of the second sequence.
    # It follows from
    #
    #     (seq2_len-1) = j + (i - 1) + lower_diag - 1
    #
    # resolved to i
    row_at_seq2_end = (seq2_len - 1) - columns - lower_diag + 2

    # Start from the end of the first sequence, if the table cell is
    # in bounds of the second sequence, otherwise start from the end
    # of the second sequence
    within_seq2 = seq2_pos_in_last_row < seq2_len
    rows = np.where(within_seq2, last_row, row_at_seq2_end)
    return rows, columns
|