biotite 1.5.0__cp314-cp314-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +18 -0
- biotite/application/__init__.py +69 -0
- biotite/application/application.py +276 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +500 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +92 -0
- biotite/application/blast/webapp.py +428 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +223 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +216 -0
- biotite/application/localapp.py +342 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +116 -0
- biotite/application/msaapp.py +363 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +227 -0
- biotite/application/muscle/app5.py +163 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +447 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +199 -0
- biotite/application/util.py +77 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +310 -0
- biotite/application/viennarna/rnafold.py +254 -0
- biotite/application/viennarna/rnaplot.py +208 -0
- biotite/application/viennarna/util.py +77 -0
- biotite/application/webapp.py +76 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +197 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +60 -0
- biotite/database/entrez/dbnames.py +101 -0
- biotite/database/entrez/download.py +228 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +263 -0
- biotite/database/error.py +16 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +258 -0
- biotite/database/pubchem/error.py +30 -0
- biotite/database/pubchem/query.py +819 -0
- biotite/database/pubchem/throttle.py +98 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +161 -0
- biotite/database/rcsb/query.py +963 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +40 -0
- biotite/database/uniprot/download.py +126 -0
- biotite/database/uniprot/query.py +292 -0
- biotite/file.py +244 -0
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1228 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/__init__.py +84 -0
- biotite/sequence/align/__init__.py +199 -0
- biotite/sequence/align/alignment.py +702 -0
- biotite/sequence/align/banded.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +71 -0
- biotite/sequence/align/cigar.py +425 -0
- biotite/sequence/align/kmeralphabet.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/kmeralphabet.pyx +595 -0
- biotite/sequence/align/kmersimilarity.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/kmertable.pyx +3411 -0
- biotite/sequence/align/localgapped.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +631 -0
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/multiple.pyx +619 -0
- biotite/sequence/align/pairwise.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/pairwise.pyx +585 -0
- biotite/sequence/align/permutation.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/permutation.pyx +313 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/selector.pyx +954 -0
- biotite/sequence/align/statistics.py +264 -0
- biotite/sequence/align/tracetable.cp314-win_amd64.pyd +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +555 -0
- biotite/sequence/annotation.py +836 -0
- biotite/sequence/codec.cp314-win_amd64.pyd +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +476 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1101 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +170 -0
- biotite/sequence/graphics/dendrogram.py +231 -0
- biotite/sequence/graphics/features.py +544 -0
- biotite/sequence/graphics/logo.py +102 -0
- biotite/sequence/graphics/plasmid.py +712 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +283 -0
- biotite/sequence/io/fasta/file.py +265 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +117 -0
- biotite/sequence/io/fastq/file.py +507 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +269 -0
- biotite/sequence/io/genbank/file.py +573 -0
- biotite/sequence/io/genbank/metadata.py +336 -0
- biotite/sequence/io/genbank/sequence.py +173 -0
- biotite/sequence/io/general.py +201 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +128 -0
- biotite/sequence/io/gff/file.py +449 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cp314-win_amd64.pyd +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cp314-win_amd64.pyd +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cp314-win_amd64.pyd +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +561 -0
- biotite/sequence/search.py +117 -0
- biotite/sequence/seqtypes.py +720 -0
- biotite/sequence/sequence.py +373 -0
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +135 -0
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +1562 -0
- biotite/structure/basepairs.py +1403 -0
- biotite/structure/bonds.cp314-win_amd64.pyd +0 -0
- biotite/structure/bonds.pyx +2036 -0
- biotite/structure/box.py +724 -0
- biotite/structure/celllist.cp314-win_amd64.pyd +0 -0
- biotite/structure/celllist.pyx +864 -0
- biotite/structure/chains.py +310 -0
- biotite/structure/charges.cp314-win_amd64.pyd +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +683 -0
- biotite/structure/density.py +109 -0
- biotite/structure/dotbracket.py +213 -0
- biotite/structure/error.py +39 -0
- biotite/structure/filter.py +591 -0
- biotite/structure/geometry.py +817 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +243 -0
- biotite/structure/graphics/rna.py +298 -0
- biotite/structure/hbond.py +425 -0
- biotite/structure/info/__init__.py +24 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +98 -0
- biotite/structure/info/bonds.py +149 -0
- biotite/structure/info/ccd.py +200 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +128 -0
- biotite/structure/info/masses.py +121 -0
- biotite/structure/info/misc.py +137 -0
- biotite/structure/info/radii.py +267 -0
- biotite/structure/info/standardize.py +185 -0
- biotite/structure/integrity.py +213 -0
- biotite/structure/io/__init__.py +29 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +67 -0
- biotite/structure/io/general.py +243 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +112 -0
- biotite/structure/io/mol/ctab.py +420 -0
- biotite/structure/io/mol/header.py +120 -0
- biotite/structure/io/mol/mol.py +149 -0
- biotite/structure/io/mol/sdf.py +940 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +64 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +389 -0
- biotite/structure/io/pdb/file.py +1380 -0
- biotite/structure/io/pdb/hybrid36.cp314-win_amd64.pyd +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +113 -0
- biotite/structure/io/pdbqt/file.py +688 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +674 -0
- biotite/structure/io/pdbx/cif.py +1091 -0
- biotite/structure/io/pdbx/component.py +251 -0
- biotite/structure/io/pdbx/compress.py +362 -0
- biotite/structure/io/pdbx/convert.py +2113 -0
- biotite/structure/io/pdbx/encoding.cp314-win_amd64.pyd +0 -0
- biotite/structure/io/pdbx/encoding.pyx +1078 -0
- biotite/structure/io/trajfile.py +696 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +43 -0
- biotite/structure/io/util.py +38 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +43 -0
- biotite/structure/mechanics.py +72 -0
- biotite/structure/molecules.py +337 -0
- biotite/structure/pseudoknots.py +622 -0
- biotite/structure/rdf.py +245 -0
- biotite/structure/repair.py +302 -0
- biotite/structure/residues.py +716 -0
- biotite/structure/rings.py +451 -0
- biotite/structure/sasa.cp314-win_amd64.pyd +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/segments.py +328 -0
- biotite/structure/sequence.py +110 -0
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +306 -0
- biotite/structure/superimpose.py +511 -0
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +736 -0
- biotite/structure/util.py +160 -0
- biotite/version.py +34 -0
- biotite/visualize.py +375 -0
- biotite-1.5.0.dist-info/METADATA +162 -0
- biotite-1.5.0.dist-info/RECORD +354 -0
- biotite-1.5.0.dist-info/WHEEL +4 -0
- biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,892 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence.align"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
__all__ = ["align_local_gapped"]
|
|
8
|
+
|
|
9
|
+
cimport cython
|
|
10
|
+
cimport numpy as np
|
|
11
|
+
from .tracetable cimport follow_trace, get_trace_linear, get_trace_affine
|
|
12
|
+
|
|
13
|
+
import itertools
|
|
14
|
+
import numpy as np
|
|
15
|
+
from .alignment import Alignment
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Short aliases for the fixed-width NumPy integer C types
ctypedef np.int32_t int32
ctypedef np.int64_t int64
ctypedef np.uint8_t uint8
ctypedef np.uint16_t uint16
ctypedef np.uint32_t uint32
ctypedef np.uint64_t uint64

# Fused type covering all unsigned integer widths declared above
# NOTE(review): presumably these are the possible dtypes of a sequence
# code array -- confirm against the functions using this type
ctypedef fused CodeType1:
    uint8
    uint16
    uint32
    uint64
# Same members as 'CodeType1', but declared as a separate fused type:
# Cython specializes distinct fused types independently of each other,
# so two arguments typed with 'CodeType1' and 'CodeType2' are not forced
# to share the same integer width
ctypedef fused CodeType2:
    uint8
    uint16
    uint32
    uint64


# NOTE(review): not referenced in 'align_local_gapped()';
# presumably the initial size of a dynamically growing table in the
# alignment routine -- confirm against its usage further below
cdef int INIT_SIZE = 100
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def align_local_gapped(seq1, seq2, matrix, seed, int32 threshold,
                       gap_penalty=-10, max_number=1,
                       direction="both", score_only=False,
                       max_table_size=None):
    """
    align_local_gapped(seq1, seq2, matrix, seed, threshold,
                       gap_penalty=-10, max_number=1,
                       direction="both", score_only=False,
                       max_table_size=None)

    Perform a local gapped alignment extending from a given `seed`
    position.

    The alignment extends into one or both directions (controlled by
    `direction`) until the total alignment score falls more than
    `threshold` below the maximum score found (*X-Drop*).
    :footcite:`Zhang2000`
    The returned alignment contains the range that yielded the maximum
    score.

    Parameters
    ----------
    seq1, seq2 : Sequence
        The sequences to be aligned.
    matrix : SubstitutionMatrix
        The substitution matrix used for scoring.
    seed : tuple(int, int)
        The indices in `seq1` and `seq2` where the local alignment
        starts.
        The indices must be non-negative.
    threshold : int
        If the current score falls this value below the maximum score
        found, the alignment terminates.
    gap_penalty : int or tuple(int, int), optional
        If an integer is provided, the value will be interpreted as
        linear gap penalty.
        If a tuple is provided, an affine gap penalty is used
        :footcite:`Gotoh1982`.
        The first integer in the tuple is the gap opening penalty,
        the second integer is the gap extension penalty.
    max_number : int, optional
        The maximum number of alignments returned.
        When the number of branches exceeds this value in the traceback
        step, no further branches are created.
        By default, only a single alignment is returned.
    direction : {'both', 'upstream', 'downstream'}, optional
        Controls in which direction the alignment extends starting
        from the seed.
        If ``'upstream'``, the alignment starts before the `seed` and
        ends at the `seed`.
        If ``'downstream'``, the alignment starts at the `seed` and
        ends behind the `seed`.
        If ``'both'`` (default) the alignment starts before the `seed`
        and ends behind the `seed`.
        The `seed` position itself is always included in the alignment.
    score_only : bool, optional
        If set to ``True``, only the similarity score is returned
        instead of the :class:`Alignment`, decreasing the runtime
        substantially.
    max_table_size : int, optional
        A :class:`MemoryError` is raised, if the number of cells
        in the internal dynamic programming table, i.e. approximately
        the product of the lengths of the aligned regions, would exceed
        the given value.

    Returns
    -------
    alignments : list of Alignment
        A list of found alignments.
        Each alignment in the list has the same similarity
        score.
        Only returned, if `score_only` is ``False``.
    score : int
        The alignment similarity score.
        Only returned, if `score_only` is ``True``.

    Raises
    ------
    ValueError
        If the alphabets of the sequences do not fit the `matrix`,
        if a gap penalty is not negative, if `max_number` is less
        than 1, if `max_table_size` is not positive, if `direction`
        is invalid or if `threshold` is negative.
    TypeError
        If `gap_penalty` is neither an integer nor a tuple.
    IndexError
        If `seed` contains a negative index or an index that is out
        of bounds for the respective sequence.

    See Also
    --------
    align_ungapped
        For ungapped local alignments with the same *X-Drop* technique.

    Notes
    -----
    Unlike :func:`align_optimal()`, this function does not allocate
    memory proportional to the length of both sequences, but only
    approximately proportional to lengths of the aligned regions.
    In principle, this makes this function viable for local alignments
    of sequences of any length.
    However, if the product of the lengths of the homologous regions
    is too large to fit into memory, a :class:`MemoryError` or even a
    crash may occur.
    This may also happen in spurious long alignments due to poor choice
    of substitution matrix or gap penalty.
    You may set `max_table_size` to avoid excessive memory use and
    crashes.

    References
    ----------

    .. footbibliography::

    Examples
    --------

    >>> seq1 = NucleotideSequence("CGTAGCTATCGCCTGTACGGTT")
    >>> seq2 = NucleotideSequence("TATATGCCTTACGGAATTGCTTTTT")
    >>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
    >>> alignment = align_local_gapped(
    ...     seq1, seq2, matrix, seed=(16, 10), threshold=20
    ... )[0]
    >>> print(alignment)
    TATCGCCTGTACGG
    TAT-GCCT-TACGG
    >>> alignment = align_local_gapped(
    ...     seq1, seq2, matrix, seed=(16, 10), threshold=20, direction="upstream"
    ... )[0]
    >>> print(alignment)
    TATCGCCTGTA
    TAT-GCCT-TA
    >>> alignment = align_local_gapped(
    ...     seq1, seq2, matrix, seed=(16, 10), threshold=20, direction="downstream"
    ... )[0]
    >>> print(alignment)
    ACGG
    ACGG
    >>> score = align_local_gapped(
    ...     seq1, seq2, matrix, seed=(16, 10), threshold=20, score_only=True
    ... )
    >>> print(score)
    40
    """
    # Check matrix alphabets
    if not matrix.get_alphabet1().extends(seq1.get_alphabet()) \
        or not matrix.get_alphabet2().extends(seq2.get_alphabet()):
            raise ValueError("The sequences' alphabets do not fit the matrix")
    score_matrix = matrix.score_matrix()

    # Check if gap penalty is linear or affine
    # The exact type is checked on purpose (no 'isinstance()'),
    # so the accepted input types stay the same as before
    if type(gap_penalty) is int:
        if gap_penalty >= 0:
            raise ValueError("Gap penalty must be negative")
    elif type(gap_penalty) is tuple:
        if gap_penalty[0] >= 0 or gap_penalty[1] >= 0:
            raise ValueError("Gap penalty must be negative")
    else:
        raise TypeError("Gap penalty must be either integer or tuple")

    # Check if max_number is reasonable
    if max_number < 1:
        raise ValueError(
            "Maximum number of returned alignments must be at least 1"
        )

    # Check maximum table size
    if max_table_size is None:
        # No limit given -> use the largest representable value
        max_table_size = np.iinfo(np.int64).max
    elif max_table_size <= 0:
        raise ValueError("Maximum table size must be a positive value")


    code1 = seq1.code
    code2 = seq2.code

    cdef int seq1_start, seq2_start
    seq1_start, seq2_start = seed
    # An index of 0 is a valid seed position, only negative ones are not
    if seq1_start < 0 or seq2_start < 0:
        raise IndexError("Seed must contain non-negative indices")
    if seq1_start >= len(code1) or seq2_start >= len(code2):
        raise IndexError(
            f"Seed {(seq1_start, seq2_start)} is out of bounds "
            f"for the sequences of length {len(code1)} and {len(code2)}"
        )


    cdef bint upstream
    cdef bint downstream
    if direction == "both":
        upstream = True
        downstream = True
    elif direction == "upstream":
        upstream = True
        downstream = False
    elif direction == "downstream":
        upstream = False
        downstream = True
    else:
        raise ValueError(f"Direction '{direction}' is invalid")
    # Range check to avoid negative indices:
    # if the seed lies at the start of a sequence, there is nothing
    # to align upstream of it
    if seq1_start == 0 or seq2_start == 0:
        upstream = False

    if threshold < 0:
        raise ValueError("The threshold value must be a non-negative integer")


    cdef int32 score
    cdef int32 total_score = 0
    # Separate alignment into two parts:
    # the regions upstream and downstream from the seed position
    if upstream:
        # For the upstream region the respective part of the sequence
        # must be reversed
        score, upstream_traces = _align_region(
            code1[seq1_start-1::-1], code2[seq2_start-1::-1],
            score_matrix, threshold, gap_penalty,
            max_number, score_only, max_table_size
        )
        total_score += score
        if upstream_traces is not None:
            # Undo the sequence reversing
            upstream_traces = [trace[::-1] for trace in upstream_traces]
            offset = np.array(seed) - 1
            for trace in upstream_traces:
                # Gap values (-1) are not transformed,
                # as gaps are not indices
                non_gap_mask = (trace != -1)
                # Second part of sequence reversing
                trace[non_gap_mask] *= -1
                # Add seed offset to trace indices
                trace[non_gap_mask[:, 0], 0] += offset[0]
                trace[non_gap_mask[:, 1], 1] += offset[1]

    if downstream:
        score, downstream_traces = _align_region(
            code1[seq1_start+1:], code2[seq2_start+1:],
            score_matrix, threshold, gap_penalty,
            max_number, score_only, max_table_size
        )
        total_score += score
        if downstream_traces is not None:
            # The aligned region starts one position behind the seed
            offset = np.array(seed) + 1
            for trace in downstream_traces:
                # Again, gap values (-1) are left untouched
                trace[trace[:, 0] != -1, 0] += offset[0]
                trace[trace[:, 1] != -1, 1] += offset[1]

    # The seed position itself is not part of the aligned regions,
    # hence its score is added separately
    total_score += score_matrix[code1[seq1_start], code2[seq2_start]]


    if score_only:
        return total_score

    if upstream and downstream:
        # Create cartesian product of upstream and downstream traces
        # Only consider max_number alignments
        traces = [
            np.concatenate([upstream_trace, [seed], downstream_trace])
            for upstream_trace, downstream_trace in itertools.islice(
                itertools.product(upstream_traces, downstream_traces),
                max_number
            )
        ]
    elif upstream:
        traces = [
            np.concatenate([trace, [seed]]) for trace in upstream_traces
        ]
    elif downstream:
        traces = [
            np.concatenate([[seed], trace]) for trace in downstream_traces
        ]
    else:
        # 'direction == "upstream"', but the start index is 0 so no
        # upstream alignment is performed
        # -> the trace includes only the seed
        traces = [np.array(seed)[np.newaxis, :]]

    return [Alignment([seq1, seq2], trace, total_score)
            for trace in traces]
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def _align_region(code1, code2, matrix, threshold, gap_penalty,
|
|
313
|
+
max_number, score_only, max_table_size):
|
|
314
|
+
"""
|
|
315
|
+
Perform a local *X-Drop* alignment extending from the start of the
|
|
316
|
+
given sequences
|
|
317
|
+
|
|
318
|
+
Parameters
|
|
319
|
+
----------
|
|
320
|
+
code1, code2 : ndarray, dtype={np.uint8, np.uint16, np.uint32, np.uint64}
|
|
321
|
+
The code of the sequences to be aligned.
|
|
322
|
+
matrix : ndarray, shape(k, k), dtype=np.int32
|
|
323
|
+
The score matrix.
|
|
324
|
+
threshold : int
|
|
325
|
+
If the current score falls this value below the maximum score
|
|
326
|
+
found, the alignment terminates.
|
|
327
|
+
gap_penalty : int or tuple(int, int)
|
|
328
|
+
If an integer is provided, the value will be interpreted as
|
|
329
|
+
linear gap penalty.
|
|
330
|
+
If a tuple is provided, an affine gap penalty is used [2]_.
|
|
331
|
+
The first integer in the tuple is the gap opening penalty,
|
|
332
|
+
the second integer is the gap extension penalty.
|
|
333
|
+
threshold : int
|
|
334
|
+
If the current score falls this value below the maximum score
|
|
335
|
+
found, the alignment terminates.
|
|
336
|
+
max_number : int
|
|
337
|
+
The maximum number of alignments returned.
|
|
338
|
+
When the number of branches exceeds this value in the traceback
|
|
339
|
+
step, no further branches are created.
|
|
340
|
+
score_only : bool
|
|
341
|
+
If set to ``True``, only the similarity score is calculated and
|
|
342
|
+
the traceback is not conducted.
|
|
343
|
+
max_table_size : int
|
|
344
|
+
Raise a :class:`MemoryError`, if a dynamic programming table
|
|
345
|
+
exceeds this size.
|
|
346
|
+
|
|
347
|
+
Returns
|
|
348
|
+
-------
|
|
349
|
+
score : int
|
|
350
|
+
The alignment similarity score.
|
|
351
|
+
trace : list of (ndarray, shape=(n,2), dtype=int) or None
|
|
352
|
+
A list of alignment traces, where each trace corresponds to an
|
|
353
|
+
alignment with the maximum similarity score found.
|
|
354
|
+
This list has only multiple elements if there are multiple
|
|
355
|
+
traces, that correspond to the same maximum similarity score.
|
|
356
|
+
``None``, if `score_only` is ``True``.
|
|
357
|
+
"""
|
|
358
|
+
if type(gap_penalty) == int:
|
|
359
|
+
affine_penalty = False
|
|
360
|
+
else:
|
|
361
|
+
affine_penalty = True
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
init_size = (
|
|
366
|
+
_min(len(code1)+1, INIT_SIZE),
|
|
367
|
+
_min(len(code2)+1, INIT_SIZE)
|
|
368
|
+
)
|
|
369
|
+
trace_table = np.zeros(init_size, dtype=np.uint8)
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
# Table filling
|
|
373
|
+
###############
|
|
374
|
+
# Set the initial (upper left) score value to 'threshold + 1',
|
|
375
|
+
# to be able to use '0' as minus infinity value
|
|
376
|
+
init_score = threshold + 1
|
|
377
|
+
if affine_penalty:
|
|
378
|
+
m_table = np.zeros(init_size, dtype=np.int32)
|
|
379
|
+
g1_table = np.zeros(init_size, dtype=np.int32)
|
|
380
|
+
g2_table = np.zeros(init_size, dtype=np.int32)
|
|
381
|
+
# This implementation does not initialize the entire first
|
|
382
|
+
# row/column to avoid issues with premature pruning in the table
|
|
383
|
+
# filling process
|
|
384
|
+
m_table[0,0] = init_score
|
|
385
|
+
trace_table, m_table, g1_table, g2_table = _fill_align_table_affine(
|
|
386
|
+
code1, code2, matrix, trace_table, m_table, g1_table, g2_table,
|
|
387
|
+
threshold, gap_penalty[0], gap_penalty[1], score_only,
|
|
388
|
+
max_table_size
|
|
389
|
+
)
|
|
390
|
+
else:
|
|
391
|
+
score_table = np.zeros(init_size, dtype=np.int32)
|
|
392
|
+
score_table[0,0] = init_score
|
|
393
|
+
trace_table, score_table = _fill_align_table(
|
|
394
|
+
code1, code2, matrix, trace_table, score_table, threshold,
|
|
395
|
+
gap_penalty, score_only, max_table_size
|
|
396
|
+
)
|
|
397
|
+
|
|
398
|
+
# If only the score is desired, the traceback is not necessary
|
|
399
|
+
if score_only:
|
|
400
|
+
if affine_penalty:
|
|
401
|
+
# The maximum scores in the gap score tables do not need to
|
|
402
|
+
# be considered, as these starting positions would indicate
|
|
403
|
+
# that the alignment starts with a gap
|
|
404
|
+
# Hence the maximum score value in these tables is always
|
|
405
|
+
# less than in the match table
|
|
406
|
+
max_score = np.max(m_table)
|
|
407
|
+
else:
|
|
408
|
+
max_score = np.max(score_table)
|
|
409
|
+
# The initial score needs to be subtracted again,
|
|
410
|
+
# since it was artificially added for convenience reasons
|
|
411
|
+
return max_score - init_score, None
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
# Traceback
|
|
415
|
+
###########
|
|
416
|
+
# Stores all possible traces (= possible alignments)
|
|
417
|
+
# A trace stores the indices of the aligned symbols
|
|
418
|
+
# in both sequences
|
|
419
|
+
trace_list = []
|
|
420
|
+
# Lists of trace starting indices
|
|
421
|
+
i_list = np.zeros(0, dtype=int)
|
|
422
|
+
j_list = np.zeros(0, dtype=int)
|
|
423
|
+
# List of start states
|
|
424
|
+
# State specifies the table the trace starts in
|
|
425
|
+
state_list = np.zeros(0, dtype=int)
|
|
426
|
+
# The start point is the maximal score in the table
|
|
427
|
+
# Multiple starting points possible,
|
|
428
|
+
# when duplicates of maximal score exist
|
|
429
|
+
if affine_penalty:
|
|
430
|
+
# Only consider the match table (see reason above)
|
|
431
|
+
max_score = np.max(m_table)
|
|
432
|
+
i_list, j_list = np.where((m_table == max_score))
|
|
433
|
+
state_list = np.append(state_list, np.full(len(i_list), 1))
|
|
434
|
+
else:
|
|
435
|
+
max_score = np.max(score_table)
|
|
436
|
+
i_list, j_list = np.where((score_table == max_score))
|
|
437
|
+
# State is always 0 for linear gap penalty
|
|
438
|
+
# since there is only one table
|
|
439
|
+
state_list = np.zeros(len(i_list), dtype=int)
|
|
440
|
+
|
|
441
|
+
# Follow the traces specified in state and indices lists
|
|
442
|
+
cdef int curr_trace_count
|
|
443
|
+
for k in range(len(i_list)):
|
|
444
|
+
i_start = i_list[k]
|
|
445
|
+
j_start = j_list[k]
|
|
446
|
+
state_start = state_list[k]
|
|
447
|
+
# Pessimistic array allocation:
|
|
448
|
+
# The maximum trace length arises from an alignment, where each
|
|
449
|
+
# symbol is aligned to a gap
|
|
450
|
+
trace = np.full(( i_start+1 + j_start+1, 2 ), -1, dtype=np.int64)
|
|
451
|
+
curr_trace_count = 1
|
|
452
|
+
follow_trace(
|
|
453
|
+
trace_table, False, i_start, j_start, 0, trace, trace_list,
|
|
454
|
+
state=state_start, curr_trace_count=&curr_trace_count,
|
|
455
|
+
max_trace_count=max_number,
|
|
456
|
+
# Diagonals are only needed for banded alignments
|
|
457
|
+
lower_diag=0, upper_diag=0
|
|
458
|
+
)
|
|
459
|
+
|
|
460
|
+
# Replace gap entries in trace with -1
|
|
461
|
+
for i, trace in enumerate(trace_list):
|
|
462
|
+
trace = np.flip(trace, axis=0)
|
|
463
|
+
gap_filter = np.zeros(trace.shape, dtype=bool)
|
|
464
|
+
gap_filter[np.unique(trace[:,0], return_index=True)[1], 0] = True
|
|
465
|
+
gap_filter[np.unique(trace[:,1], return_index=True)[1], 1] = True
|
|
466
|
+
trace[~gap_filter] = -1
|
|
467
|
+
trace_list[i] = trace
|
|
468
|
+
|
|
469
|
+
# Limit the number of generated alignments to `max_number`:
|
|
470
|
+
# In most cases this is achieved by discarding branches in
|
|
471
|
+
# 'follow_trace()', however, if multiple local alignment starts
|
|
472
|
+
# are used, the number of created traces is the number of
|
|
473
|
+
# starts times `max_number`
|
|
474
|
+
trace_list = trace_list[:max_number]
|
|
475
|
+
|
|
476
|
+
return max_score - init_score, trace_list
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
@cython.boundscheck(False)
|
|
480
|
+
@cython.wraparound(False)
|
|
481
|
+
def _fill_align_table(CodeType1[:] code1 not None,
|
|
482
|
+
CodeType2[:] code2 not None,
|
|
483
|
+
const int32[:,:] matrix not None,
|
|
484
|
+
uint8[:,:] trace_table not None,
|
|
485
|
+
int32[:,:] score_table not None,
|
|
486
|
+
int32 threshold,
|
|
487
|
+
int32 gap_penalty,
|
|
488
|
+
bint score_only,
|
|
489
|
+
int64 max_table_size):
|
|
490
|
+
"""
|
|
491
|
+
Fill an alignment table with linear gap penalty using dynamic
|
|
492
|
+
programming.
|
|
493
|
+
|
|
494
|
+
Parameters
|
|
495
|
+
----------
|
|
496
|
+
code1, code2
|
|
497
|
+
The sequence code of each sequence to be aligned.
|
|
498
|
+
matrix
|
|
499
|
+
The score matrix obtained from the :class:`SubstitutionMatrix`
|
|
500
|
+
object.
|
|
501
|
+
trace_table
|
|
502
|
+
The initial matrix containing values indicating the direction
|
|
503
|
+
for the traceback step.
|
|
504
|
+
score_table
|
|
505
|
+
The initial score table.
|
|
506
|
+
threshold
|
|
507
|
+
An alignment cell is invalidated if the total similarity score
|
|
508
|
+
is this threshold below the maximum similarity score found so
|
|
509
|
+
far.
|
|
510
|
+
gap_penalty
|
|
511
|
+
The linear gap penalty.
|
|
512
|
+
score_only
|
|
513
|
+
If true, the trace table is not filled.
|
|
514
|
+
max_table_size : int64
|
|
515
|
+
Raise a :class:`MemoryError`, if a dynamic programming table
|
|
516
|
+
exceeds this size.
|
|
517
|
+
|
|
518
|
+
Returns
|
|
519
|
+
-------
|
|
520
|
+
trace_table
|
|
521
|
+
The filled trace table.
|
|
522
|
+
score_table
|
|
523
|
+
The filled score table.
|
|
524
|
+
"""
|
|
525
|
+
cdef int i, j, k=0
|
|
526
|
+
# The ranges for i in the current (k=0)
|
|
527
|
+
# and previous (k=1, k=2) antidiagonals, that point to valid cells
|
|
528
|
+
cdef int i_min_k_0=0, i_max_k_0=0
|
|
529
|
+
cdef int i_min_k_1=0, i_max_k_1=0
|
|
530
|
+
cdef int i_min_k_2=0, i_max_k_2=0
|
|
531
|
+
# The pruned range for i and j in the current antidiagonal,
|
|
532
|
+
# calculated from the previous antidiagonals
|
|
533
|
+
cdef int i_min, i_max
|
|
534
|
+
cdef int j_max
|
|
535
|
+
# The maximum values for i and j ever encountered while iterating
|
|
536
|
+
# over the antidiagonals -> used for final trimming of tables
|
|
537
|
+
cdef int i_max_total=0, j_max_total=0
|
|
538
|
+
|
|
539
|
+
cdef int32 from_diag, from_left, from_top
|
|
540
|
+
cdef uint8 trace = 0
|
|
541
|
+
cdef int32 score = 0
|
|
542
|
+
cdef int32 max_score = score_table[0, 0]
|
|
543
|
+
cdef int32 req_score = max_score - threshold
|
|
544
|
+
|
|
545
|
+
# Instead of iteration over row and column,
|
|
546
|
+
# iterate over antidiagonals and diagonals to achieve symmetric
|
|
547
|
+
# treatment of both sequences
|
|
548
|
+
for k in range(1, code1.shape[0] + code2.shape[0] + 1):
|
|
549
|
+
# Prepare values for iteration
|
|
550
|
+
i_min_k_2 = i_min_k_1
|
|
551
|
+
i_max_k_2 = i_max_k_1
|
|
552
|
+
i_min_k_1 = i_min_k_0
|
|
553
|
+
i_max_k_1 = i_max_k_0
|
|
554
|
+
# Reset values for iteration to most 'restrictive' values
|
|
555
|
+
# These restrictive values are overwritten in the next iteration
|
|
556
|
+
# if valid cells are present
|
|
557
|
+
i_min_k_0 = k
|
|
558
|
+
i_max_k_0 = 0
|
|
559
|
+
|
|
560
|
+
# Prune index range for antidiagonal
|
|
561
|
+
# to range where valid cells exist
|
|
562
|
+
i_min = _min(i_min_k_1, i_min_k_2 + 1)
|
|
563
|
+
i_max = _max(i_max_k_1 + 1, i_max_k_2 + 1)
|
|
564
|
+
# The index must also not be out of sequence range
|
|
565
|
+
i_min = _max(i_min, k - code2.shape[0])
|
|
566
|
+
i_max = _min(i_max, code1.shape[0])
|
|
567
|
+
# The algorithm has finished,
|
|
568
|
+
# if the calculated antidiagonal has no range of valid cells
|
|
569
|
+
if i_min > i_max:
|
|
570
|
+
break
|
|
571
|
+
|
|
572
|
+
j_max = k - i_min
|
|
573
|
+
# Expand ndarrays
|
|
574
|
+
# if their size would be exceeded in the following iteration
|
|
575
|
+
if i_max >= score_table.shape[0]:
|
|
576
|
+
score_table = _extend_table(
|
|
577
|
+
np.asarray(score_table), 0, max_table_size
|
|
578
|
+
)
|
|
579
|
+
if not score_only:
|
|
580
|
+
trace_table = _extend_table(
|
|
581
|
+
np.asarray(trace_table), 0, max_table_size
|
|
582
|
+
)
|
|
583
|
+
if j_max >= score_table.shape[1]:
|
|
584
|
+
score_table = _extend_table(
|
|
585
|
+
np.asarray(score_table), 1, max_table_size
|
|
586
|
+
)
|
|
587
|
+
if not score_only:
|
|
588
|
+
trace_table = _extend_table(
|
|
589
|
+
np.asarray(trace_table), 1, max_table_size
|
|
590
|
+
)
|
|
591
|
+
i_max_total = _max(i_max_total, i_max)
|
|
592
|
+
j_max_total = _max(j_max_total, j_max)
|
|
593
|
+
|
|
594
|
+
for i in range(i_min, i_max+1):
|
|
595
|
+
j = k - i
|
|
596
|
+
|
|
597
|
+
# Evaluate score from diagonal direction
|
|
598
|
+
if i != 0 and j != 0:
|
|
599
|
+
from_diag = score_table[i-1, j-1]
|
|
600
|
+
# Check if score stems from cells that are valid
|
|
601
|
+
if from_diag != 0:
|
|
602
|
+
# -1 in sequence index is necessary
|
|
603
|
+
# due to the shift of the sequences
|
|
604
|
+
# to the bottom/right in the table
|
|
605
|
+
from_diag += matrix[code1[i-1], code2[j-1]]
|
|
606
|
+
else:
|
|
607
|
+
from_diag = 0
|
|
608
|
+
else:
|
|
609
|
+
from_diag = 0
|
|
610
|
+
# Evaluate score through gap insertion
|
|
611
|
+
if i != 0:
|
|
612
|
+
from_top = score_table[i-1, j] + gap_penalty
|
|
613
|
+
else:
|
|
614
|
+
from_top = 0
|
|
615
|
+
if j != 0:
|
|
616
|
+
from_left = score_table[i, j-1] + gap_penalty
|
|
617
|
+
else:
|
|
618
|
+
from_left = 0
|
|
619
|
+
|
|
620
|
+
if score_only:
|
|
621
|
+
score = _max(from_diag, _max(from_left, from_top))
|
|
622
|
+
else:
|
|
623
|
+
trace = get_trace_linear(
|
|
624
|
+
from_diag, from_left, from_top, &score
|
|
625
|
+
)
|
|
626
|
+
|
|
627
|
+
# Check if the obtained score reaches the required threshold
|
|
628
|
+
# and if it even exceeds the maximum score
|
|
629
|
+
if score >= req_score:
|
|
630
|
+
if i_min_k_0 == k:
|
|
631
|
+
# 'i_min_k_0 == k'
|
|
632
|
+
# -> i_min_k_0 has not been set in this iteration, yet
|
|
633
|
+
i_min_k_0 = i
|
|
634
|
+
i_max_k_0 = i
|
|
635
|
+
score_table[i,j] = score
|
|
636
|
+
if not score_only:
|
|
637
|
+
trace_table[i,j] = trace
|
|
638
|
+
if score > max_score:
|
|
639
|
+
max_score = score
|
|
640
|
+
req_score = max_score - threshold
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
return np.asarray(trace_table)[:i_max_total+1, :j_max_total+1], \
|
|
644
|
+
np.asarray(score_table)[:i_max_total+1, :j_max_total+1]
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
@cython.boundscheck(False)
|
|
648
|
+
@cython.wraparound(False)
|
|
649
|
+
def _fill_align_table_affine(CodeType1[:] code1 not None,
|
|
650
|
+
CodeType2[:] code2 not None,
|
|
651
|
+
const int32[:,:] matrix not None,
|
|
652
|
+
uint8[:,:] trace_table not None,
|
|
653
|
+
int32[:,:] m_table not None,
|
|
654
|
+
int32[:,:] g1_table not None,
|
|
655
|
+
int32[:,:] g2_table not None,
|
|
656
|
+
int32 threshold,
|
|
657
|
+
int32 gap_open,
|
|
658
|
+
int32 gap_ext,
|
|
659
|
+
bint score_only,
|
|
660
|
+
int64 max_table_size):
|
|
661
|
+
"""
|
|
662
|
+
Fill an alignment table with affine gap penalty using dynamic
|
|
663
|
+
programming.
|
|
664
|
+
|
|
665
|
+
Parameters
|
|
666
|
+
----------
|
|
667
|
+
code1, code2
|
|
668
|
+
The sequence code of each sequence to be aligned.
|
|
669
|
+
matrix
|
|
670
|
+
The score matrix obtained from the :class:`SubstitutionMatrix`
|
|
671
|
+
object.
|
|
672
|
+
trace_table
|
|
673
|
+
The initial matrix containing values indicating the direction
|
|
674
|
+
for the traceback step.
|
|
675
|
+
m_table, g1_table, g2_table
|
|
676
|
+
The alignment tables containing the scores.
|
|
677
|
+
`m_table` contains values for matches.
|
|
678
|
+
`g1_table` contains values for gaps in the first sequence.
|
|
679
|
+
`g2_table` contains values for gaps in the second sequence.
|
|
680
|
+
threshold
|
|
681
|
+
An alignment cell is invalidated if the total similarity score
|
|
682
|
+
is this threshold below the maximum similarity score found so
|
|
683
|
+
far.
|
|
684
|
+
gap_open
|
|
685
|
+
The gap opening penalty.
|
|
686
|
+
gap_ext
|
|
687
|
+
The gap extension penalty.
|
|
688
|
+
score_only
|
|
689
|
+
If true, the trace table is not filled.
|
|
690
|
+
max_table_size : int64
|
|
691
|
+
Raise a :class:`MemoryError`, if a dynamic programming table
|
|
692
|
+
exceeds this size.
|
|
693
|
+
|
|
694
|
+
Returns
|
|
695
|
+
-------
|
|
696
|
+
trace_table
|
|
697
|
+
The filled trace table.
|
|
698
|
+
m_table, g1_table, g2_table
|
|
699
|
+
The filled score tables.
|
|
700
|
+
"""
|
|
701
|
+
cdef int i, j, k=0
|
|
702
|
+
# The ranges for i in the current (k=0)
|
|
703
|
+
# and previous (k=1, k=2) antidiagonals, that point to valid cells
|
|
704
|
+
cdef int i_min_k_0=0, i_max_k_0=0
|
|
705
|
+
cdef int i_min_k_1=0, i_max_k_1=0
|
|
706
|
+
cdef int i_min_k_2=0, i_max_k_2=0
|
|
707
|
+
# The pruned range for i and j in the current antidiagonal,
|
|
708
|
+
# calculated from the previous antidiagonals
|
|
709
|
+
cdef int i_min, i_max
|
|
710
|
+
cdef int j_max
|
|
711
|
+
# The maximum values for i and j ever encountered while iterating
|
|
712
|
+
# over the antidiagonals -> used for final trimming of tables
|
|
713
|
+
cdef int i_max_total=0, j_max_total=0
|
|
714
|
+
|
|
715
|
+
cdef uint8 trace = 0
|
|
716
|
+
cdef bint is_valid_cell
|
|
717
|
+
|
|
718
|
+
cdef int32 mm_score, g1m_score, g2m_score
|
|
719
|
+
cdef int32 mg1_score, g1g1_score
|
|
720
|
+
cdef int32 mg2_score, g2g2_score
|
|
721
|
+
cdef int32 m_score, g1_score, g2_score
|
|
722
|
+
cdef int32 similarity_score
|
|
723
|
+
cdef int32 max_score = m_table[0, 0]
|
|
724
|
+
cdef int32 req_score = max_score - threshold
|
|
725
|
+
|
|
726
|
+
# Instead of iteration over row and column,
|
|
727
|
+
# iterate over antidiagonals and diagonals to achieve symmetric
|
|
728
|
+
# treatment of both sequences
|
|
729
|
+
for k in range(1, code1.shape[0] + code2.shape[0] + 1):
|
|
730
|
+
# Prepare values for iteration
|
|
731
|
+
i_min_k_2 = i_min_k_1
|
|
732
|
+
i_max_k_2 = i_max_k_1
|
|
733
|
+
i_min_k_1 = i_min_k_0
|
|
734
|
+
i_max_k_1 = i_max_k_0
|
|
735
|
+
# Reset values for iteration to most 'restrictive' values
|
|
736
|
+
# These restrictive values are overwritten in the next iteration
|
|
737
|
+
# if valid cells are present
|
|
738
|
+
i_min_k_0 = k
|
|
739
|
+
i_max_k_0 = 0
|
|
740
|
+
|
|
741
|
+
# Prune index range for antidiagonal
|
|
742
|
+
# to range where valid cells exist
|
|
743
|
+
i_min = _min(i_min_k_1, i_min_k_2 + 1)
|
|
744
|
+
i_max = _max(i_max_k_1 + 1, i_max_k_2 + 1)
|
|
745
|
+
# The index must also not be out of sequence range
|
|
746
|
+
i_min = _max(i_min, k - code2.shape[0])
|
|
747
|
+
i_max = _min(i_max, code1.shape[0])
|
|
748
|
+
# The algorithm has finished,
|
|
749
|
+
# if the calculated antidiagonal has no range of valid cells
|
|
750
|
+
if i_min > i_max:
|
|
751
|
+
break
|
|
752
|
+
|
|
753
|
+
j_max = k - i_min
|
|
754
|
+
# Expand ndarrays
|
|
755
|
+
# if their size would be exceeded in the following iteration
|
|
756
|
+
if i_max >= m_table.shape[0]:
|
|
757
|
+
m_table = _extend_table(np.asarray(m_table), 0, max_table_size)
|
|
758
|
+
g1_table = _extend_table(np.asarray(g1_table), 0, max_table_size)
|
|
759
|
+
g2_table = _extend_table(np.asarray(g2_table), 0, max_table_size)
|
|
760
|
+
if not score_only:
|
|
761
|
+
trace_table = _extend_table(
|
|
762
|
+
np.asarray(trace_table), 0, max_table_size
|
|
763
|
+
)
|
|
764
|
+
if j_max >= m_table.shape[1]:
|
|
765
|
+
m_table = _extend_table(np.asarray(m_table), 1, max_table_size)
|
|
766
|
+
g1_table = _extend_table(np.asarray(g1_table), 1, max_table_size)
|
|
767
|
+
g2_table = _extend_table(np.asarray(g2_table), 1, max_table_size)
|
|
768
|
+
if not score_only:
|
|
769
|
+
trace_table = _extend_table(
|
|
770
|
+
np.asarray(trace_table), 1, max_table_size
|
|
771
|
+
)
|
|
772
|
+
i_max_total = _max(i_max_total, i_max)
|
|
773
|
+
j_max_total = _max(j_max_total, j_max)
|
|
774
|
+
|
|
775
|
+
for i in range(i_min, i_max+1):
|
|
776
|
+
j = k - i
|
|
777
|
+
|
|
778
|
+
# Evaluate score from diagonal direction
|
|
779
|
+
if i != 0 and j != 0:
|
|
780
|
+
# -1 in sequence index is necessary
|
|
781
|
+
# due to the shift of the sequences
|
|
782
|
+
# to the bottom/right in the table
|
|
783
|
+
similarity_score = matrix[code1[i-1], code2[j-1]]
|
|
784
|
+
mm_score = m_table[i-1,j-1]
|
|
785
|
+
g1m_score = g1_table[i-1,j-1]
|
|
786
|
+
g2m_score = g2_table[i-1,j-1]
|
|
787
|
+
# Check if scores stem from cells that are valid
|
|
788
|
+
if mm_score != 0:
|
|
789
|
+
mm_score += similarity_score
|
|
790
|
+
if g1m_score != 0:
|
|
791
|
+
g1m_score += similarity_score
|
|
792
|
+
if g2m_score != 0:
|
|
793
|
+
g2m_score += similarity_score
|
|
794
|
+
else:
|
|
795
|
+
mm_score = 0
|
|
796
|
+
g1m_score = 0
|
|
797
|
+
g2m_score = 0
|
|
798
|
+
# Evaluate score through gap insertion
|
|
799
|
+
# No transition from g1_table to g2_table and vice versa,
|
|
800
|
+
# since this would mean adjacent gaps in both sequences;
|
|
801
|
+
# a substitution makes more sense in this case
|
|
802
|
+
if j != 0:
|
|
803
|
+
mg1_score = m_table[i,j-1] + gap_open
|
|
804
|
+
g1g1_score = g1_table[i,j-1] + gap_ext
|
|
805
|
+
else:
|
|
806
|
+
mg1_score = 0
|
|
807
|
+
g1g1_score = 0
|
|
808
|
+
if i != 0:
|
|
809
|
+
mg2_score = m_table[i-1,j] + gap_open
|
|
810
|
+
g2g2_score = g2_table[i-1,j] + gap_ext
|
|
811
|
+
else:
|
|
812
|
+
mg2_score = 0
|
|
813
|
+
g2g2_score = 0
|
|
814
|
+
|
|
815
|
+
|
|
816
|
+
|
|
817
|
+
if score_only:
|
|
818
|
+
m_score = _max(mm_score, _max(g1m_score, g2m_score))
|
|
819
|
+
g1_score = _max(mg1_score, g1g1_score)
|
|
820
|
+
g2_score = _max(mg2_score, g2g2_score)
|
|
821
|
+
else:
|
|
822
|
+
trace = get_trace_affine(
|
|
823
|
+
mm_score, g1m_score, g2m_score,
|
|
824
|
+
mg1_score, g1g1_score,
|
|
825
|
+
mg2_score, g2g2_score,
|
|
826
|
+
# The max score values to be written
|
|
827
|
+
&m_score, &g1_score, &g2_score
|
|
828
|
+
)
|
|
829
|
+
|
|
830
|
+
|
|
831
|
+
# Check if the obtained scores reach the required threshold
|
|
832
|
+
# and if they even exceed the maximum score
|
|
833
|
+
is_valid_cell = False
|
|
834
|
+
|
|
835
|
+
if m_score >= req_score:
|
|
836
|
+
if i_min_k_0 == k:
|
|
837
|
+
i_min_k_0 = i
|
|
838
|
+
i_max_k_0 = i
|
|
839
|
+
m_table[i,j] = m_score
|
|
840
|
+
is_valid_cell = True
|
|
841
|
+
if m_score > max_score:
|
|
842
|
+
max_score = m_score
|
|
843
|
+
req_score = max_score - threshold
|
|
844
|
+
|
|
845
|
+
if g1_score >= req_score:
|
|
846
|
+
if i_min_k_0 == k:
|
|
847
|
+
i_min_k_0 = i
|
|
848
|
+
i_max_k_0 = i
|
|
849
|
+
g1_table[i,j] = g1_score
|
|
850
|
+
is_valid_cell = True
|
|
851
|
+
if g1_score > max_score:
|
|
852
|
+
max_score = g1_score
|
|
853
|
+
req_score = max_score - threshold
|
|
854
|
+
|
|
855
|
+
if g2_score >= req_score:
|
|
856
|
+
if i_min_k_0 == k:
|
|
857
|
+
i_min_k_0 = i
|
|
858
|
+
i_max_k_0 = i
|
|
859
|
+
g2_table[i,j] = g2_score
|
|
860
|
+
is_valid_cell = True
|
|
861
|
+
if g2_score > max_score:
|
|
862
|
+
max_score = g2_score
|
|
863
|
+
req_score = max_score - threshold
|
|
864
|
+
|
|
865
|
+
if is_valid_cell and not score_only:
|
|
866
|
+
trace_table[i,j] = trace
|
|
867
|
+
|
|
868
|
+
|
|
869
|
+
return np.asarray(trace_table)[:i_max_total+1, :j_max_total+1], \
|
|
870
|
+
np.asarray(m_table )[:i_max_total+1, :j_max_total+1], \
|
|
871
|
+
np.asarray(g1_table )[:i_max_total+1, :j_max_total+1], \
|
|
872
|
+
np.asarray(g2_table )[:i_max_total+1, :j_max_total+1]
|
|
873
|
+
|
|
874
|
+
|
|
875
|
+
def _extend_table(table, int dimension, int64 max_size):
|
|
876
|
+
if dimension == 0:
|
|
877
|
+
new_shape = (table.shape[0] * 2, table.shape[1])
|
|
878
|
+
else:
|
|
879
|
+
new_shape = (table.shape[0], table.shape[1] * 2)
|
|
880
|
+
if new_shape[0] * new_shape[1] > max_size:
|
|
881
|
+
raise MemoryError("Maximum table size exceeded")
|
|
882
|
+
new_table = np.zeros(new_shape, dtype=table.dtype)
|
|
883
|
+
# Fill in existing data
|
|
884
|
+
new_table[:table.shape[0], :table.shape[1]] = table
|
|
885
|
+
return new_table
|
|
886
|
+
|
|
887
|
+
|
|
888
|
+
cdef inline int _min(int32 a, int32 b):
|
|
889
|
+
return a if a < b else b
|
|
890
|
+
|
|
891
|
+
cdef inline int _max(int32 a, int32 b):
|
|
892
|
+
return a if a > b else b
|