biotite 1.5.0__cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +18 -0
- biotite/application/__init__.py +69 -0
- biotite/application/application.py +276 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +500 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +92 -0
- biotite/application/blast/webapp.py +428 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +223 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +216 -0
- biotite/application/localapp.py +342 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +116 -0
- biotite/application/msaapp.py +363 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +227 -0
- biotite/application/muscle/app5.py +163 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +447 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +199 -0
- biotite/application/util.py +77 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +310 -0
- biotite/application/viennarna/rnafold.py +254 -0
- biotite/application/viennarna/rnaplot.py +208 -0
- biotite/application/viennarna/util.py +77 -0
- biotite/application/webapp.py +76 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +197 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +60 -0
- biotite/database/entrez/dbnames.py +101 -0
- biotite/database/entrez/download.py +228 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +263 -0
- biotite/database/error.py +16 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +258 -0
- biotite/database/pubchem/error.py +30 -0
- biotite/database/pubchem/query.py +819 -0
- biotite/database/pubchem/throttle.py +98 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +161 -0
- biotite/database/rcsb/query.py +963 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +40 -0
- biotite/database/uniprot/download.py +126 -0
- biotite/database/uniprot/query.py +292 -0
- biotite/file.py +244 -0
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1228 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/__init__.py +84 -0
- biotite/sequence/align/__init__.py +199 -0
- biotite/sequence/align/alignment.py +702 -0
- biotite/sequence/align/banded.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +71 -0
- biotite/sequence/align/cigar.py +425 -0
- biotite/sequence/align/kmeralphabet.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +595 -0
- biotite/sequence/align/kmersimilarity.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmertable.pyx +3411 -0
- biotite/sequence/align/localgapped.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +631 -0
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/multiple.pyx +619 -0
- biotite/sequence/align/pairwise.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/pairwise.pyx +585 -0
- biotite/sequence/align/permutation.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/permutation.pyx +313 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/selector.pyx +954 -0
- biotite/sequence/align/statistics.py +264 -0
- biotite/sequence/align/tracetable.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +555 -0
- biotite/sequence/annotation.py +836 -0
- biotite/sequence/codec.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +476 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1101 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +170 -0
- biotite/sequence/graphics/dendrogram.py +231 -0
- biotite/sequence/graphics/features.py +544 -0
- biotite/sequence/graphics/logo.py +102 -0
- biotite/sequence/graphics/plasmid.py +712 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +283 -0
- biotite/sequence/io/fasta/file.py +265 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +117 -0
- biotite/sequence/io/fastq/file.py +507 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +269 -0
- biotite/sequence/io/genbank/file.py +573 -0
- biotite/sequence/io/genbank/metadata.py +336 -0
- biotite/sequence/io/genbank/sequence.py +173 -0
- biotite/sequence/io/general.py +201 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +128 -0
- biotite/sequence/io/gff/file.py +449 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +561 -0
- biotite/sequence/search.py +117 -0
- biotite/sequence/seqtypes.py +720 -0
- biotite/sequence/sequence.py +373 -0
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +135 -0
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +1562 -0
- biotite/structure/basepairs.py +1403 -0
- biotite/structure/bonds.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/bonds.pyx +2036 -0
- biotite/structure/box.py +724 -0
- biotite/structure/celllist.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/celllist.pyx +864 -0
- biotite/structure/chains.py +310 -0
- biotite/structure/charges.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +683 -0
- biotite/structure/density.py +109 -0
- biotite/structure/dotbracket.py +213 -0
- biotite/structure/error.py +39 -0
- biotite/structure/filter.py +591 -0
- biotite/structure/geometry.py +817 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +243 -0
- biotite/structure/graphics/rna.py +298 -0
- biotite/structure/hbond.py +425 -0
- biotite/structure/info/__init__.py +24 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +98 -0
- biotite/structure/info/bonds.py +149 -0
- biotite/structure/info/ccd.py +200 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +128 -0
- biotite/structure/info/masses.py +121 -0
- biotite/structure/info/misc.py +137 -0
- biotite/structure/info/radii.py +267 -0
- biotite/structure/info/standardize.py +185 -0
- biotite/structure/integrity.py +213 -0
- biotite/structure/io/__init__.py +29 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +67 -0
- biotite/structure/io/general.py +243 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +112 -0
- biotite/structure/io/mol/ctab.py +420 -0
- biotite/structure/io/mol/header.py +120 -0
- biotite/structure/io/mol/mol.py +149 -0
- biotite/structure/io/mol/sdf.py +940 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +64 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +389 -0
- biotite/structure/io/pdb/file.py +1380 -0
- biotite/structure/io/pdb/hybrid36.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +113 -0
- biotite/structure/io/pdbqt/file.py +688 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +674 -0
- biotite/structure/io/pdbx/cif.py +1091 -0
- biotite/structure/io/pdbx/component.py +251 -0
- biotite/structure/io/pdbx/compress.py +362 -0
- biotite/structure/io/pdbx/convert.py +2113 -0
- biotite/structure/io/pdbx/encoding.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +1078 -0
- biotite/structure/io/trajfile.py +696 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +43 -0
- biotite/structure/io/util.py +38 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +43 -0
- biotite/structure/mechanics.py +72 -0
- biotite/structure/molecules.py +337 -0
- biotite/structure/pseudoknots.py +622 -0
- biotite/structure/rdf.py +245 -0
- biotite/structure/repair.py +302 -0
- biotite/structure/residues.py +716 -0
- biotite/structure/rings.py +451 -0
- biotite/structure/sasa.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/segments.py +328 -0
- biotite/structure/sequence.py +110 -0
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +306 -0
- biotite/structure/superimpose.py +511 -0
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +736 -0
- biotite/structure/util.py +160 -0
- biotite/version.py +34 -0
- biotite/visualize.py +375 -0
- biotite-1.5.0.dist-info/METADATA +162 -0
- biotite-1.5.0.dist-info/RECORD +354 -0
- biotite-1.5.0.dist-info/WHEEL +6 -0
- biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,619 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence.align"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
__all__ = ["align_multiple"]
|
|
8
|
+
|
|
9
|
+
cimport cython
|
|
10
|
+
cimport numpy as np
|
|
11
|
+
from libc.math cimport log
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
from .matrix import SubstitutionMatrix
|
|
15
|
+
from .alignment import Alignment
|
|
16
|
+
from .pairwise import align_optimal
|
|
17
|
+
from ..sequence import Sequence
|
|
18
|
+
from ..alphabet import Alphabet
|
|
19
|
+
from ..phylo.upgma import upgma
|
|
20
|
+
from ..phylo.tree import Tree, TreeNode, as_binary
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
ctypedef np.int32_t int32
|
|
24
|
+
ctypedef np.int64_t int64
|
|
25
|
+
ctypedef np.uint8_t uint8
|
|
26
|
+
ctypedef np.uint16_t uint16
|
|
27
|
+
ctypedef np.uint32_t uint32
|
|
28
|
+
ctypedef np.uint64_t uint64
|
|
29
|
+
ctypedef np.float32_t float32
|
|
30
|
+
|
|
31
|
+
ctypedef fused CodeType:
|
|
32
|
+
uint8
|
|
33
|
+
uint16
|
|
34
|
+
uint32
|
|
35
|
+
uint64
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
cdef float32 MAX_FLOAT = np.finfo(np.float32).max
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class GapSymbol:
    """
    Singleton placeholder symbol that is rendered as ``-``.

    The constructor stores the first created object in the class
    attribute ``_instance`` and refuses to create a second one.
    Use :meth:`instance()` to obtain the shared object.
    """

    # The one and only instance; `None` until it is first created
    _instance = None

    def __init__(self):
        if GapSymbol._instance is not None:
            raise ValueError(
                "Cannot instantiate this singleton more than one time"
            )
        GapSymbol._instance = self

    @staticmethod
    def instance():
        """
        Get the shared instance, creating it on first use.

        Returns
        -------
        gap_symbol : GapSymbol
            The singleton instance.
        """
        if GapSymbol._instance is None:
            # The constructor registers the new object as `_instance`
            GapSymbol()
        return GapSymbol._instance

    def __str__(self):
        return "-"

    def __hash__(self):
        # At most one instance exists, so a constant hash is sufficient
        return 0
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def align_multiple(sequences, matrix, gap_penalty=-10, terminal_penalty=True,
                   distances=None, guide_tree=None):
    r"""
    align_multiple(sequences, matrix, gap_penalty=-10,
                   terminal_penalty=True, distances=None,
                   guide_tree=None)

    Perform a multiple sequence alignment using a progressive
    alignment algorithm. :footcite:`Feng1987`

    Based on pairwise sequence distances a guide tree is constructed.
    The sequences are progressively aligned according to the tree,
    following the rule 'Once a gap, always a gap'.

    Parameters
    ----------
    sequences : list of Sequence
        The sequences to be aligned.
        The alphabet of the substitution matrix must be equal or
        extend the alphabet of each sequence.
    matrix : SubstitutionMatrix
        The substitution matrix used for scoring.
        Must be symmetric.
    gap_penalty : int or tuple(int, int), optional
        If an integer is provided, the value will be interpreted as
        general gap penalty. If a tuple is provided, an affine gap
        penalty is used. The first integer in the tuple is the gap
        opening penalty, the second integer is the gap extension
        penalty.
        The values need to be negative.
    terminal_penalty : bool, optional
        If true, gap penalties are applied to terminal gaps.
    distances : ndarray, shape=(n,n), optional
        Pairwise distances of the sequences.
        The matrix must be symmetric and all entries must be larger
        than 0.
        By default the pairwise distances are calculated from
        similarities obtained from optimal global pairwise alignments
        (:func:`align_optimal()`).
        The similarities are converted into distances using the method
        proposed by Feng & Doolittle :footcite:`Feng1996`.
    guide_tree : Tree, optional
        The guide tree to be used for the progressive alignment.
        By default the guide tree is constructed from `distances`
        via the UPGMA clustering method.

    Returns
    -------
    alignment : Alignment
        The global multiple sequence alignment of the input sequences.
    order : ndarray, dtype=int
        The sequence order represented by the guide tree.
        When this order is applied to alignment sequence order,
        similar sequences are adjacent to each other.
    tree : Tree
        The guide tree used for progressive alignment.
        Equal to `guide_tree` if provided.
    distance_matrix : ndarray, shape=(n,n), dtype=float32
        The pairwise distance matrix used to construct the guide tree.
        Equal to `distances` if provided.

    Raises
    ------
    ValueError
        If the substitution matrix is not symmetric, if the code of a
        sequence is ``None`` or if the alphabet of the substitution
        matrix does not extend the alphabet of a sequence.

    Notes
    -----
    The similarity to distance conversion is performed according to the
    following formula:

    .. math:: D_{a,b} = -\ln\left(
            \frac
                { S_{a,b} - S_{a,b}^{rand} }
                { S_{a,b}^{max} - S_{a,b}^{rand} }
        \right)

    .. math:: S_{a,b}^{max} = \frac{ S_{a,a} + S_{b,b} }{ 2 }

    .. math:: S_{a,b}^{rand} = \frac{1}{L_{a,b}}
        \left(
            \sum_{x \in \Omega} \sum_{y \in \Omega}
                s_{x,y} \cdot N_a(x) \cdot N_b(y)
        \right)
        + N_{a,b}^{open} \cdot p^{open} + N_{a,b}^{ext} \cdot p^{ext}

    :math:`D_{a,b}` - The distance between the sequences *a* and *b*.

    :math:`S_{a,b}` - The similarity score between the sequences *a* and *b*.

    :math:`s_{x,y}` - The similarity score between the symbols *x* and *y*.

    :math:`\Omega` - The sequence alphabet.

    :math:`N_a(x)` - Number of occurrences of symbol *x* in sequence *a*.

    :math:`N_{a,b}^{open}, N_{a,b}^{ext}` - Number of gap openings/
    extensions, in the alignment of *a* and *b*.

    :math:`p^{open}, p^{ext}` - The penalty for a gap opening/extension.

    :math:`L_{a,b}` - Number of columns in the alignment of *a* and *b*.

    In rare cases of extremely unrelated sequences, :math:`S_{a,b}`
    can be lower than :math:`S_{a,b}^{rand}`.
    In this case the logarithm cannot be calculated and a
    :class:`ValueError` is raised.

    References
    ----------

    .. footbibliography::

    Examples
    --------

    >>> seq1 = ProteinSequence("BIQTITE")
    >>> seq2 = ProteinSequence("TITANITE")
    >>> seq3 = ProteinSequence("BISMITE")
    >>> seq4 = ProteinSequence("IQLITE")
    >>> matrix = SubstitutionMatrix.std_protein_matrix()
    >>>
    >>> alignment, order, tree, distances = align_multiple(
    ...     [seq1, seq2, seq3, seq4], matrix
    ... )
    >>>
    >>> print(alignment)
    BIQT-ITE
    TITANITE
    BISM-ITE
    -IQL-ITE
    >>> print(alignment[:, order.tolist()])
    -IQL-ITE
    BISM-ITE
    BIQT-ITE
    TITANITE
    >>> print(distances)
    [[0.000 1.034 0.382 0.560]
     [1.034 0.000 0.923 1.132]
     [0.382 0.923 0.000 0.632]
     [0.560 1.132 0.632 0.000]]
    >>>
    >>> print(tree.to_newick(
    ...     labels=["seq1", "seq2", "seq3", "seq4"], include_distance=False
    ... ))
    ((seq4,(seq3,seq1)),seq2);
    """
    if not matrix.is_symmetric():
        raise ValueError("A symmetric substitution matrix is required")
    alphabet = matrix.get_alphabet1()
    for i, seq in enumerate(sequences):
        if seq.code is None:
            raise ValueError(f"Code of sequence {i} is 'None'")
        if not alphabet.extends(seq.get_alphabet()):
            raise ValueError(
                f"The substitution matrix and sequence {i} have "
                f"incompatible alphabets"
            )

    # Create guide tree
    # Template parameter workaround:
    # the code array is only passed to select the fused type
    # specialization of the helper functions
    _T = sequences[0].code
    if distances is None:
        distances = _get_distance_matrix(
            _T, sequences, matrix, gap_penalty, terminal_penalty
        )
    else:
        distances = distances.astype(np.float32, copy=True)
    if guide_tree is None:
        guide_tree = upgma(distances)
    else:
        # Assure that every node in the guide tree is binary
        guide_tree = as_binary(guide_tree)

    # Create new matrix with neutral gap symbol
    gap_symbol = GapSymbol.instance()
    new_alphabet = Alphabet(
        matrix.get_alphabet1().get_symbols() + (gap_symbol,)
    )
    new_score_matrix = np.zeros(
        (len(new_alphabet), len(new_alphabet)), dtype=np.int32
    )
    # New substitution matrix is the same as the old one,
    # except the neutral gap symbol,
    # that scores 0 with all other symbols
    new_score_matrix[:-1,:-1] = matrix.score_matrix()
    new_matrix = SubstitutionMatrix(
        new_alphabet, new_alphabet, new_score_matrix
    )

    # Progressive alignment
    gap_symbol_code = new_alphabet.encode(gap_symbol)
    order, aligned_seqs = _progressive_align(
        _T, sequences, guide_tree.root, distances, new_matrix,
        gap_symbol_code, gap_penalty, terminal_penalty
    )
    aligned_seq_codes = [seq.code for seq in aligned_seqs]

    # Remove neutral gap symbols and create actual trace:
    # Gap positions keep the initial -1, every other position gets the
    # running index of the symbol in the ungapped sequence
    trace = np.full(
        (len(aligned_seqs[0]), len(aligned_seqs)), -1, dtype=np.int64
    )
    for j, seq_code in enumerate(aligned_seq_codes):
        is_symbol = seq_code != gap_symbol_code
        trace[is_symbol, j] = np.arange(np.count_nonzero(is_symbol))
    for seq, seq_code in zip(aligned_seqs, aligned_seq_codes):
        seq.code = seq_code[seq_code != gap_symbol_code]

    # Reorder alignments into original alignment
    new_order = np.argsort(order)
    aligned_seqs = [aligned_seqs[pos] for pos in new_order]
    trace = trace[:, new_order]

    return Alignment(aligned_seqs, trace), order, guide_tree, distances
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def _get_distance_matrix(CodeType[:] _T, sequences, matrix,
                         gap_penalty, terminal_penalty):
    """
    Create all pairwise alignments for the given sequences and use the
    method proposed by Feng & Doolittle to calculate the pairwise
    distance matrix

    Parameters
    ----------
    _T : ndarray, dtype=VARIABLE
        A little bit hacky workaround to get the correct dtype for the
        sequence code of the sequences in a static way
        (important for Cython).
    sequences : list of Sequence, length=n
        The sequences to get the distance matrix for.
    matrix : SubstitutionMatrix
        The substitution matrix used for the alignments.
    gap_penalty : int or tuple(int, int)
        A linear or affine gap penalty for the alignments.
    terminal_penalty : bool
        Whether to or not count terminal gap penalties for the
        alignments.

    Returns
    -------
    distances : ndarray, shape=(n,n), dtype=float32
        The pairwise distance matrix.

    Raises
    ------
    TypeError
        If `gap_penalty` is neither an integer nor a tuple.
    ValueError
        If the score of a pairwise alignment is lower than the score
        expected for the randomized alignment.
    """
    cdef int i, j

    cdef np.ndarray scores = np.zeros(
        (len(sequences), len(sequences)), dtype=np.int32
    )
    cdef np.ndarray alignments = np.full(
        (len(sequences), len(sequences)), None, dtype=object
    )
    for i in range(len(sequences)):
        # Inclusive range
        for j in range(i+1):
            # For this method we only consider one alignment:
            # Score is equal for all alignments
            # Alignment length is equal for most alignments
            alignment = align_optimal(
                sequences[i], sequences[j], matrix,
                gap_penalty, terminal_penalty, max_number=1
            )[0]
            scores[i,j] = alignment.score
            alignments[i,j] = alignment

    ### Distance calculation from similarity scores ###
    # Calculate the occurrences of each symbol code in each sequence
    # This is used later for the random score
    # Both alphabets are the same
    # The size is deliberately a plain C integer instead of 'CodeType':
    # the number of symbols may exceed the value range of the code dtype
    cdef int alphabet_size = len(matrix.get_alphabet1())
    cdef np.ndarray code_count = np.zeros(
        (len(sequences), alphabet_size), dtype=np.int32
    )
    cdef int32[:,:] code_count_v = code_count
    for i in range(len(sequences)):
        code_count[i] = np.bincount(sequences[i].code, minlength=alphabet_size)

    cdef int gap_open=0, gap_ext=0
    if isinstance(gap_penalty, int):
        gap_open = gap_penalty
        gap_ext = gap_penalty
    elif isinstance(gap_penalty, tuple):
        gap_open = gap_penalty[0]
        gap_ext = gap_penalty[1]
    else:
        raise TypeError("Gap penalty must be either integer or tuple")

    cdef const int32[:,:] score_matrix = matrix.score_matrix()
    cdef int32[:,:] scores_v = scores
    cdef np.ndarray distances = np.zeros(
        (scores.shape[0], scores.shape[1]), dtype=np.float32
    )
    cdef float32[:,:] distances_v = distances
    cdef int code1, code2
    # 64 bit accumulator: the product of a score and two symbol counts
    # may exceed the 32 bit range
    cdef int64 score_rand_sum
    cdef float32 score_rand, score_max

    # Calculate distance
    # i and j are indicating the alignment between the sequences i and j
    for i in range(scores_v.shape[0]):
        for j in range(i):
            score_max = (scores_v[i,i] + scores_v[j,j]) / 2.0
            score_rand_sum = 0
            for code1 in range(alphabet_size):
                for code2 in range(alphabet_size):
                    score_rand_sum += <int64>score_matrix[code1,code2] \
                                      * code_count_v[i,code1] \
                                      * code_count_v[j,code2]
            score_rand = score_rand_sum
            score_rand /= alignments[i,j].trace.shape[0]
            gap_open_count, gap_ext_count = _count_gaps(
                alignments[i,j].trace.astype(np.int64, copy=False),
                terminal_penalty
            )
            score_rand += gap_open_count * gap_open
            score_rand += gap_ext_count * gap_ext
            if scores_v[i,j] < score_rand:
                # Randomized alignment is better than actual alignment
                # -> the logarithm argument would become negative
                # resulting in an NaN distance
                raise ValueError(
                    f"The randomized alignment of sequences {j} and {i} "
                    f"scores better than the real pairwise alignment, "
                    f"cannot calculate proper pairwise distance"
                )
            else:
                # NOTE(review): if the score is exactly equal to the
                # random score, the logarithm argument is 0 and the
                # distance becomes infinite - confirm this is intended
                distances_v[i,j] = -log(
                    (scores_v[i,j] - score_rand) / (score_max - score_rand)
                )
                # Pairwise distance matrix is symmetric
                distances_v[j,i] = distances_v[i,j]
    return distances
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
def _count_gaps(int64[:,:] trace_v, bint terminal_penalty):
    """
    Count the number of gap openings and gap extensions in a pairwise
    alignment trace.

    Parameters
    ----------
    trace_v : ndarray, shape=(n,2), dtype=int
        The alignment trace.
        A value of ``-1`` marks a gap in the respective sequence.
    terminal_penalty : bool
        Whether or not to count terminal gaps.
        If false, all columns before the first and after the last
        column without any gap are ignored.

    Returns
    -------
    gap_open_count, gap_ext_count : int
        The number of trace positions that open a gap and that extend
        a gap, respectively.
        Both are 0 if the trace is empty or, when terminal gaps are
        ignored, if no column is free of gaps.
    """
    cdef int i, j
    cdef int gap_open_count=0, gap_ext_count=0
    cdef int start_index=-1, stop_index=-1

    if trace_v.shape[0] == 0:
        # An empty trace contains no gaps.
        # Returning early also prevents the access to the
        # non-existing first column further below
        return 0, 0

    if not terminal_penalty:
        # Ignore terminal gaps
        # -> get start and exclusive stop column of the trace
        #    excluding terminal gaps
        for i in range(trace_v.shape[0]):
            # Check if both sequences have no gap at the given position
            if trace_v[i,0] != -1 and trace_v[i,1] != -1:
                start_index = i
                break
        # Reverse iteration
        for i in range(trace_v.shape[0]-1, -1, -1):
            # Check if both sequences have no gap at the given position
            if trace_v[i,0] != -1 and trace_v[i,1] != -1:
                stop_index = i+1
                break
        if start_index == -1 or stop_index == -1:
            # Every column contains a gap
            # -> there is no non-terminal region to count gaps in
            return 0, 0
        trace_v = trace_v[start_index : stop_index]

    # A gap in the first column has no predecessor
    # -> it is always a gap opening
    if trace_v[0,0] == -1:
        gap_open_count += 1
    if trace_v[0,1] == -1:
        gap_open_count += 1
    for i in range(1, trace_v.shape[0]):
        # trace_v.shape[1] = 2 due to pairwise alignment
        for j in range(trace_v.shape[1]):
            if trace_v[i,j] == -1:
                if trace_v[i-1,j] == -1:
                    # The previous position in the same sequence
                    # is also a gap -> extension
                    gap_ext_count += 1
                else:
                    gap_open_count += 1
    return gap_open_count, gap_ext_count
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def _progressive_align(CodeType[:] _T, sequences, tree_node,
                       float32[:,:]distances_v, matrix,
                       int gap_symbol_code, gap_penalty, terminal_penalty):
    """
    Recursively build the multiple sequence alignment for all sequences
    below the given guide tree node.

    For a leaf node a copy of the corresponding sequence is returned.
    For any other node the sub-MSAs of both child nodes are computed
    first.
    Then the pair of sequences (one from each sub-MSA) with the lowest
    value in the pairwise distance matrix is aligned.
    Every gap introduced by this pairwise alignment is inserted at the
    same position into all sequences of the respective sub-MSA.

    Parameters
    ----------
    _T : ndarray, dtype=VARIABLE
        Only used to select the sequence code dtype in a static way
        (workaround required by Cython).
    sequences : list of Sequence, length=n
        All sequences that should be aligned in the MSA.
    tree_node : TreeNode
        The guide tree node that selects, which of the `sequences`
        are aligned in this call.
        This is the only parameter that changes throughout the
        recursive calls of this function.
    distances_v : ndarray, shape=(n,n)
        The pairwise distance matrix.
    matrix : SubstitutionMatrix
        The substitution matrix used for the alignments.
    gap_symbol_code : int
        The symbol code for the gap symbol.
    gap_penalty : int or tuple(int, int)
        A linear or affine gap penalty for the alignments.
    terminal_penalty : bool
        Whether or not to count terminal gap penalties for the
        alignments.

    Returns
    -------
    order : ndarray, shape=(m,), dtype=int
        The index of each element in `aligned_sequences` in the
        original `sequences` parameter.
    aligned_sequences : list of Sequence, length=m
        A list of the sequences that were aligned.
        Instead of an :class:`Alignment` object that represents the gaps
        as ``-1`` in the trace, the gaps are represented as dedicated
        gap symbols in this case.
        This allows for the pairwise alignment of gapped sequences.
    """
    cdef int p=0, q=0
    cdef int p_min=0, q_min=0
    cdef float32 dist_min, dist
    cdef int32[:] indices1_v, indices2_v
    cdef np.ndarray indices1, indices2
    cdef list aligned_seqs1, aligned_seqs2

    if tree_node.is_leaf():
        # Leaf node -> nothing to align
        # -> return only the sequence the leaf refers to
        # The sequence is copied, as gap symbols are inserted into it
        # later and the input sequences must stay untouched
        leaf_order = np.array([tree_node.index], dtype=np.int32)
        leaf_sequence = sequences[tree_node.index].copy()
        return leaf_order, [leaf_sequence]

    # Inner node -> first align the sequences below each child node
    child1, child2 = tree_node.children
    indices1, aligned_seqs1 = _progressive_align(
        _T, sequences, child1, distances_v, matrix,
        gap_symbol_code, gap_penalty, terminal_penalty
    )
    indices1_v = indices1
    indices2, aligned_seqs2 = _progressive_align(
        _T, sequences, child2, distances_v, matrix,
        gap_symbol_code, gap_penalty, terminal_penalty
    )
    indices2_v = indices2

    # Search for the closest pair of sequences between both sub-MSAs
    dist_min = MAX_FLOAT
    for p in range(indices1_v.shape[0]):
        for q in range(indices2_v.shape[0]):
            dist = distances_v[indices1_v[p], indices2_v[q]]
            if dist < dist_min:
                dist_min = dist
                p_min = p
                q_min = q

    # Align the closest pair;
    # only a single optimal alignment is taken into account
    alignment = align_optimal(
        aligned_seqs1[p_min], aligned_seqs2[q_min], matrix,
        gap_penalty, terminal_penalty, max_number=1
    )[0]

    # Propagate the newly introduced gaps as neutral gap symbols
    # to all sequences of the respective sub-MSA
    trace1 = alignment.trace[:,0]
    for seq in aligned_seqs1:
        seq.code = _replace_gaps(_T, trace1, seq.code, gap_symbol_code)
    trace2 = alignment.trace[:,1]
    for seq in aligned_seqs2:
        seq.code = _replace_gaps(_T, trace2, seq.code, gap_symbol_code)

    merged_order = np.append(indices1, indices2)
    return merged_order, aligned_seqs1 + aligned_seqs2
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def _replace_gaps(CodeType[:] _T,
                  int64[:] partial_trace_v,
                  np.ndarray seq_code,
                  int gap_symbol_code):
    """
    Replace gaps in a sequence in an :class:`Alignment` with a dedicated
    gap symbol.

    The replacement is required by the progressive alignment algorithm
    to be able to align gapped sequences with each other.

    Parameters
    ----------
    _T : ndarray, dtype=VARIABLE
        Only used to select the sequence code dtype in a static way
        (workaround required by Cython).
        Its content is not read.
    partial_trace_v : ndarray, shape=(m,), dtype=int
        The column of the alignment trace referring to the given
        sequence.
        A value of ``-1`` marks a gap, every other value is an index
        into `seq_code`.
    seq_code : ndarray, shape=(n,)
        The sequence code representing the given sequence.
    gap_symbol_code : int
        The symbol code for the gap symbol.

    Returns
    -------
    new_seq_code : ndarray, shape=(m,)
        The sequence code representing a new sequence, that is the given
        sequence with inserted gap symbols.
        It has the same dtype as `seq_code`.
    """
    cdef int i
    cdef int64 index

    cdef CodeType[:] seq_code_v = seq_code
    cdef np.ndarray new_seq_code = np.zeros(
        partial_trace_v.shape[0], dtype=seq_code.dtype
    )
    cdef CodeType[:] new_seq_code_v = new_seq_code

    for i in range(partial_trace_v.shape[0]):
        index = partial_trace_v[i]
        if index == -1:
            new_seq_code_v[i] = gap_symbol_code
        else:
            # Read via the typed memoryview instead of the generic
            # ndarray object to avoid Python-level indexing in the loop
            new_seq_code_v[i] = seq_code_v[index]

    return new_seq_code
|
|
Binary file
|