biotite 1.5.0__cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +18 -0
- biotite/application/__init__.py +69 -0
- biotite/application/application.py +276 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +500 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +92 -0
- biotite/application/blast/webapp.py +428 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +223 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +216 -0
- biotite/application/localapp.py +342 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +116 -0
- biotite/application/msaapp.py +363 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +227 -0
- biotite/application/muscle/app5.py +163 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +447 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +199 -0
- biotite/application/util.py +77 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +310 -0
- biotite/application/viennarna/rnafold.py +254 -0
- biotite/application/viennarna/rnaplot.py +208 -0
- biotite/application/viennarna/util.py +77 -0
- biotite/application/webapp.py +76 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +197 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +60 -0
- biotite/database/entrez/dbnames.py +101 -0
- biotite/database/entrez/download.py +228 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +263 -0
- biotite/database/error.py +16 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +258 -0
- biotite/database/pubchem/error.py +30 -0
- biotite/database/pubchem/query.py +819 -0
- biotite/database/pubchem/throttle.py +98 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +161 -0
- biotite/database/rcsb/query.py +963 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +40 -0
- biotite/database/uniprot/download.py +126 -0
- biotite/database/uniprot/query.py +292 -0
- biotite/file.py +244 -0
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1228 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/__init__.py +84 -0
- biotite/sequence/align/__init__.py +199 -0
- biotite/sequence/align/alignment.py +702 -0
- biotite/sequence/align/banded.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +71 -0
- biotite/sequence/align/cigar.py +425 -0
- biotite/sequence/align/kmeralphabet.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +595 -0
- biotite/sequence/align/kmersimilarity.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmertable.pyx +3411 -0
- biotite/sequence/align/localgapped.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +631 -0
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/multiple.pyx +619 -0
- biotite/sequence/align/pairwise.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/pairwise.pyx +585 -0
- biotite/sequence/align/permutation.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/permutation.pyx +313 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/selector.pyx +954 -0
- biotite/sequence/align/statistics.py +264 -0
- biotite/sequence/align/tracetable.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +555 -0
- biotite/sequence/annotation.py +836 -0
- biotite/sequence/codec.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +476 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1101 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +170 -0
- biotite/sequence/graphics/dendrogram.py +231 -0
- biotite/sequence/graphics/features.py +544 -0
- biotite/sequence/graphics/logo.py +102 -0
- biotite/sequence/graphics/plasmid.py +712 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +283 -0
- biotite/sequence/io/fasta/file.py +265 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +117 -0
- biotite/sequence/io/fastq/file.py +507 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +269 -0
- biotite/sequence/io/genbank/file.py +573 -0
- biotite/sequence/io/genbank/metadata.py +336 -0
- biotite/sequence/io/genbank/sequence.py +173 -0
- biotite/sequence/io/general.py +201 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +128 -0
- biotite/sequence/io/gff/file.py +449 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +561 -0
- biotite/sequence/search.py +117 -0
- biotite/sequence/seqtypes.py +720 -0
- biotite/sequence/sequence.py +373 -0
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +135 -0
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +1562 -0
- biotite/structure/basepairs.py +1403 -0
- biotite/structure/bonds.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/bonds.pyx +2036 -0
- biotite/structure/box.py +724 -0
- biotite/structure/celllist.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/celllist.pyx +864 -0
- biotite/structure/chains.py +310 -0
- biotite/structure/charges.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +683 -0
- biotite/structure/density.py +109 -0
- biotite/structure/dotbracket.py +213 -0
- biotite/structure/error.py +39 -0
- biotite/structure/filter.py +591 -0
- biotite/structure/geometry.py +817 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +243 -0
- biotite/structure/graphics/rna.py +298 -0
- biotite/structure/hbond.py +425 -0
- biotite/structure/info/__init__.py +24 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +98 -0
- biotite/structure/info/bonds.py +149 -0
- biotite/structure/info/ccd.py +200 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +128 -0
- biotite/structure/info/masses.py +121 -0
- biotite/structure/info/misc.py +137 -0
- biotite/structure/info/radii.py +267 -0
- biotite/structure/info/standardize.py +185 -0
- biotite/structure/integrity.py +213 -0
- biotite/structure/io/__init__.py +29 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +67 -0
- biotite/structure/io/general.py +243 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +112 -0
- biotite/structure/io/mol/ctab.py +420 -0
- biotite/structure/io/mol/header.py +120 -0
- biotite/structure/io/mol/mol.py +149 -0
- biotite/structure/io/mol/sdf.py +940 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +64 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +389 -0
- biotite/structure/io/pdb/file.py +1380 -0
- biotite/structure/io/pdb/hybrid36.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +113 -0
- biotite/structure/io/pdbqt/file.py +688 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +674 -0
- biotite/structure/io/pdbx/cif.py +1091 -0
- biotite/structure/io/pdbx/component.py +251 -0
- biotite/structure/io/pdbx/compress.py +362 -0
- biotite/structure/io/pdbx/convert.py +2113 -0
- biotite/structure/io/pdbx/encoding.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +1078 -0
- biotite/structure/io/trajfile.py +696 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +43 -0
- biotite/structure/io/util.py +38 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +43 -0
- biotite/structure/mechanics.py +72 -0
- biotite/structure/molecules.py +337 -0
- biotite/structure/pseudoknots.py +622 -0
- biotite/structure/rdf.py +245 -0
- biotite/structure/repair.py +302 -0
- biotite/structure/residues.py +716 -0
- biotite/structure/rings.py +451 -0
- biotite/structure/sasa.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/segments.py +328 -0
- biotite/structure/sequence.py +110 -0
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +306 -0
- biotite/structure/superimpose.py +511 -0
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +736 -0
- biotite/structure/util.py +160 -0
- biotite/version.py +34 -0
- biotite/visualize.py +375 -0
- biotite-1.5.0.dist-info/METADATA +162 -0
- biotite-1.5.0.dist-info/RECORD +354 -0
- biotite-1.5.0.dist-info/WHEEL +6 -0
- biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,702 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence.align"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
|
|
8
|
+
import numbers
|
|
9
|
+
import textwrap
|
|
10
|
+
from collections.abc import Sequence
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"Alignment",
|
|
15
|
+
"get_codes",
|
|
16
|
+
"get_symbols",
|
|
17
|
+
"get_sequence_identity",
|
|
18
|
+
"get_pairwise_sequence_identity",
|
|
19
|
+
"score",
|
|
20
|
+
"find_terminal_gaps",
|
|
21
|
+
"remove_terminal_gaps",
|
|
22
|
+
"remove_gaps",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class Alignment(object):
    """
    An :class:`Alignment` object stores information about which symbols
    of *n* sequences are aligned to each other and it stores the
    corresponding alignment score.

    Instead of saving a list of aligned symbols, this class saves the
    original *n* sequences, that were aligned, and a so called *trace*,
    which indicates the aligned symbols of these sequences.
    The trace is a *(m x n)* :class:`ndarray` with alignment length
    *m* and sequence count *n*.
    Each element of the trace is the index in the corresponding
    sequence.
    A gap is represented by the value -1.

    Furthermore this class provides multiple utility functions for
    conversion into strings in order to make the alignment human
    readable.

    Unless an :class:`Alignment` object is the result of a multiple
    sequence alignment, the object will contain only two sequences.

    All attributes of this class are publicly accessible.

    Parameters
    ----------
    sequences : list
        A list of aligned sequences.
    trace : ndarray, dtype=int, shape=(m,n)
        The alignment trace.
    score : int, optional
        Alignment score.

    Attributes
    ----------
    sequences : list
        A list of aligned sequences.
    trace : ndarray, dtype=int, shape=(m,n)
        The alignment trace.
    score : int
        Alignment score.

    Examples
    --------

    >>> seq1 = NucleotideSequence("CGTCAT")
    >>> seq2 = NucleotideSequence("TCATGC")
    >>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
    >>> ali = align_optimal(seq1, seq2, matrix)[0]
    >>> print(ali)
    CGTCAT--
    --TCATGC
    >>> print(ali.trace)
    [[ 0 -1]
     [ 1 -1]
     [ 2  0]
     [ 3  1]
     [ 4  2]
     [ 5  3]
     [-1  4]
     [-1  5]]
    >>> print(ali[1:4].trace)
    [[ 1 -1]
     [ 2  0]
     [ 3  1]]
    >>> print(ali[1:4, 0:1].trace)
    [[1]
     [2]
     [3]]
    """

    def __init__(self, sequences, trace, score=None):
        # Shallow copy: the list is independent of the caller's list,
        # but the sequence objects themselves are shared
        self.sequences = sequences.copy()
        self.trace = trace
        self.score = score

    def __repr__(self):
        """Represent Alignment as a string for debugging."""
        seq_reprs = ", ".join([repr(seq) for seq in self.sequences])
        return (
            f"Alignment([{seq_reprs}], "
            f"np.{np.array_repr(self.trace)}, score={self.score})"
        )

    def _gapped_str(self, seq_index):
        # Render one sequence of the alignment as a string,
        # where each gap (trace value -1) becomes a '-'
        return "".join(
            "-"
            if row[seq_index] == -1
            else str(self.sequences[seq_index][row[seq_index]])
            for row in self.trace
        )

    def get_gapped_sequences(self):
        """
        Get the string representation of the gapped sequences.

        Returns
        -------
        sequences : list of str
            The list of gapped sequence strings. The order is the same
            as in `Alignment.sequences`.
        """
        return [self._gapped_str(i) for i in range(len(self.sequences))]

    def __str__(self):
        # The pretty representation is only possible, if the symbols of
        # all sequences are single letters
        if not all(_is_single_letter(seq.alphabet) for seq in self.sequences):
            return super().__str__()

        # First dimension: sequence number,
        # second dimension: line number
        wrapper = textwrap.TextWrapper(break_on_hyphens=False)
        seq_str_lines_list = [
            wrapper.wrap(self._gapped_str(i)) for i in range(len(self.sequences))
        ]
        # Each text block contains the same line of every sequence,
        # consecutive blocks are separated by an empty line
        return "\n\n".join(
            "\n".join(lines[row_i] for lines in seq_str_lines_list)
            for row_i in range(len(seq_str_lines_list[0]))
        )

    def __getitem__(self, index):
        """
        Select a subset of alignment columns and optionally sequences.

        Parameters
        ----------
        index : slice or list or ndarray or tuple
            A one-dimensional index selects alignment columns
            (rows of the trace).
            In a two-element tuple the first element selects alignment
            columns and the second element selects the sequences.

        Returns
        -------
        alignment : Alignment
            The selected part of the alignment.
            The score is taken over unchanged.

        Raises
        ------
        IndexError
            If the index has more than two dimensions, if any element
            of a tuple index is an integer or if the type of the
            sequence index is not supported.
        """
        if isinstance(index, tuple):
            if len(index) > 2:
                raise IndexError("Only 1D or 2D indices are allowed")
            # Neither the column index nor the sequence index may be an
            # integer, as this would reduce the dimensionality of the trace
            if any(isinstance(element, numbers.Integral) for element in index):
                raise IndexError(
                    "Integers are invalid indices for alignments, "
                    "a single sequence or alignment column cannot be "
                    "selected"
                )
            return Alignment(
                Alignment._index_sequences(self.sequences, index[1]),
                self.trace[index],
                self.score,
            )
        else:
            return Alignment(self.sequences, self.trace[index], self.score)

    def __iter__(self):
        # Explicitly forbid iteration: otherwise Python would fall back
        # to the '__getitem__()' based iteration protocol
        raise TypeError("'Alignment' object is not iterable")

    def __len__(self):
        return len(self.trace)

    def __eq__(self, item):
        if not isinstance(item, Alignment):
            return False
        if self.sequences != item.sequences:
            return False
        if not np.array_equal(self.trace, item.trace):
            return False
        if self.score != item.score:
            return False
        return True

    @staticmethod
    def _index_sequences(sequences, index):
        # Apply the sequence-dimension index to the list of sequences,
        # in the same way NumPy applies it to the columns of the trace
        if isinstance(index, slice):
            return sequences[index]
        if isinstance(index, (list, tuple)):
            # NumPy interprets a list of booleans as mask:
            # convert to an array so the sequences are selected
            # consistently with the trace
            index = np.asarray(index)
        if isinstance(index, np.ndarray):
            if index.dtype == bool:
                return [seq for seq, mask in zip(sequences, index) if mask]
            return [sequences[i] for i in index]
        raise IndexError(f"Invalid alignment index type '{type(index).__name__}'")

    @staticmethod
    def trace_from_strings(seq_str_list):
        """
        Create a trace from strings that represent aligned sequences.

        Parameters
        ----------
        seq_str_list : list of str
            The strings, where each one represents a sequence
            (with gaps) in an alignment.
            A ``-`` is interpreted as gap.
            All strings are expected to have the same length.

        Returns
        -------
        trace : ndarray, dtype=int, shape=(m,n)
            The created trace for *n* strings with length *m*.

        Raises
        ------
        ValueError
            If less than two strings are given.
        """
        if len(seq_str_list) < 2:
            raise ValueError("An alignment must contain at least two sequences")
        # Get length of string (same length for all strings)
        # rather than length of list
        n_columns = len(seq_str_list[0])
        # Positions are gaps, unless a symbol is found
        trace = np.full((n_columns, len(seq_str_list)), -1, dtype=int)
        # For each string the sequence index of the next non-gap symbol
        next_seq_index = [0] * len(seq_str_list)
        for pos_i in range(n_columns):
            for str_j, seq_str in enumerate(seq_str_list):
                if seq_str[pos_i] != "-":
                    trace[pos_i, str_j] = next_seq_index[str_j]
                    next_seq_index[str_j] += 1
        return trace
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def get_codes(alignment):
    """
    Get the sequence codes of the sequences in the alignment.

    The codes are built from the trace:
    Instead of the indices of the aligned symbols (trace), the return
    value contains the corresponding symbol codes for each index.
    Gaps are still represented by *-1*.

    Parameters
    ----------
    alignment : Alignment
        The alignment to get the sequence codes for.

    Returns
    -------
    codes : ndarray, dtype=int, shape=(n,m)
        The sequence codes for the alignment.
        The shape is *(n,m)* for *n* sequences and *m* alignment columns.
        The array uses *-1* values for gaps.

    Examples
    --------

    >>> seq1 = NucleotideSequence("CGTCAT")
    >>> seq2 = NucleotideSequence("TCATGC")
    >>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
    >>> ali = align_optimal(seq1, seq2, matrix)[0]
    >>> print(ali)
    CGTCAT--
    --TCATGC
    >>> print(get_codes(ali))
    [[ 1  2  3  1  0  3 -1 -1]
     [-1 -1  3  1  0  3  2  1]]
    """
    trace = alignment.trace

    # The result is transposed with respect to the trace:
    # one row per sequence, one column per alignment position
    n_sequences = trace.shape[1]
    n_columns = trace.shape[0]
    codes = np.zeros((n_sequences, n_columns), dtype=np.int64)

    for seq_index, sequence in enumerate(alignment.sequences):
        seq_positions = trace[:, seq_index]
        is_symbol = seq_positions != -1
        # Gap positions (-1) temporarily pick up the last code of the
        # sequence, but these values are masked out by 'np.where()'.
        # The fill value is an explicit int64 to keep the unsigned dtype
        # of the sequence code from being used for the result
        # (https://numpy.org/neps/nep-0050-scalar-promotion.html)
        codes[seq_index] = np.where(
            is_symbol, sequence.code[seq_positions], np.int64(-1)
        )

    return np.stack(codes)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def get_symbols(alignment):
    """
    Get the symbols of the sequences in the alignment.

    This works like :func:`get_codes()`, but the result contains the
    decoded symbols instead of the codes.
    Gaps are represented by *None* values.

    Parameters
    ----------
    alignment : Alignment
        The alignment to get the symbols for.

    Returns
    -------
    symbols : list of list
        The nested list of symbols.

    See Also
    --------
    get_codes : Get the sequence codes of the sequences in the alignment.

    Examples
    --------

    >>> seq1 = NucleotideSequence("CGTCAT")
    >>> seq2 = NucleotideSequence("TCATGC")
    >>> matrix = SubstitutionMatrix.std_nucleotide_matrix()
    >>> ali = align_optimal(seq1, seq2, matrix)[0]
    >>> print(ali)
    CGTCAT--
    --TCATGC
    >>> print(get_symbols(ali))
    [['C', 'G', 'T', 'C', 'A', 'T', None, None], [None, None, 'T', 'C', 'A', 'T', 'G', 'C']]
    """
    symbols = []
    for seq_index, seq_codes in enumerate(get_codes(alignment)):
        alphabet = alignment.sequences[seq_index].get_alphabet()
        # Only actual symbol codes can be decoded, gaps (-1) cannot
        is_symbol = seq_codes != -1
        decoded = alphabet.decode_multiple(seq_codes[is_symbol])
        if isinstance(decoded, np.ndarray):
            decoded = decoded.tolist()
        # Start from a row consisting only of gaps
        # and fill in the decoded symbols at the non-gap positions
        row = np.full(len(seq_codes), None, dtype=object)
        row[is_symbol] = decoded
        symbols.append(row.tolist())
    return symbols
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def get_sequence_identity(alignment, mode="not_terminal"):
    """
    Calculate the sequence identity for an alignment.

    The identity is the number of matching alignment columns divided
    by a measure for the length of the alignment, which is chosen via
    the `mode` parameter.
    A column counts as match, if all sequences have the same symbol
    in this column and this symbol is not a gap.

    Parameters
    ----------
    alignment : Alignment
        The alignment to calculate the identity for.
    mode : {'all', 'not_terminal', 'shortest'}, optional
        The calculation mode for alignment length.

        - **all** - The number of matches divided by the number of
          all alignment columns.
        - **not_terminal** - The number of matches divided by the
          number of alignment columns that are not terminal gaps in
          any of the sequences.
        - **shortest** - The number of matches divided by the
          length of the shortest sequence.

        Default is *not_terminal*.

    Returns
    -------
    identity : float
        The sequence identity, ranging between 0 and 1.

    Raises
    ------
    ValueError
        If `mode` is unknown or if `mode` is ``'not_terminal'`` and
        the sequences have no common non-terminal region.

    See Also
    --------
    get_pairwise_sequence_identity : Get sequence identity for each pair of alignment rows.
    """
    codes = get_codes(alignment)

    # Count matches:
    # A column matches if every row is equal to the first row
    # and the shared value is not a gap
    first_row = codes[0]
    is_match = np.all(codes == first_row, axis=0) & (first_row != -1)
    matches = np.count_nonzero(is_match)

    # Calculate length
    if mode == "all":
        length = len(alignment)
    elif mode == "not_terminal":
        start, stop = find_terminal_gaps(alignment)
        if stop <= start:
            raise ValueError(
                "Cannot calculate non-terminal identity, "
                "at least two sequences have no overlap"
            )
        length = stop - start
    elif mode == "shortest":
        length = min(len(seq) for seq in alignment.sequences)
    else:
        raise ValueError(f"'{mode}' is an invalid calculation mode")

    return matches / length
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
def get_pairwise_sequence_identity(alignment, mode="not_terminal"):
    """
    Compute the sequence identity for every pair of aligned sequences.

    For each pair of sequences, the number of alignment columns in which
    both sequences carry the same non-gap symbol is divided by a length
    measure that is selected via the `mode` parameter.

    Parameters
    ----------
    alignment : Alignment, length=n
        The alignment whose sequences are compared pairwise.
    mode : {'all', 'not_terminal', 'shortest'}, optional
        Selects the length the number of matches is divided by.

        - **all** - The number of all alignment columns.
        - **not_terminal** - The number of alignment columns that are
          not terminal gaps in any of the two considered sequences.
        - **shortest** - The length of the shortest one of the two
          sequences.

        Default is *not_terminal*.

    Returns
    -------
    identity : ndarray, dtype=float, shape=(n,n)
        The pairwise sequence identity, ranging between 0 and 1.

    Raises
    ------
    ValueError
        If `mode` is none of the supported values, or if two sequences
        have no overlap in *not_terminal* mode.

    See Also
    --------
    get_sequence_identity : Get sequence identity over all alignment rows.
    """
    codes = get_codes(alignment)
    n_seq = len(codes)

    # Broadcast the codes against themselves:
    # entry (i, j, k) is true, if sequences i and j have the same
    # symbol in column k and neither of them has a gap (-1) there
    row_codes = codes[:, np.newaxis, :]
    col_codes = codes[np.newaxis, :, :]
    is_match = (row_codes == col_codes) & (row_codes != -1) & (col_codes != -1)
    # Number of matching columns for each sequence pair
    matches = np.count_nonzero(is_match, axis=-1)

    if mode == "all":
        length = len(alignment)
    elif mode == "not_terminal":
        length = np.zeros((n_seq, n_seq))
        # Visit all (i, j) combinations in row-major order
        for i, j in np.ndindex(n_seq, n_seq):
            # Column range where both sequences have already started
            # and none of them has ended yet
            start, stop = find_terminal_gaps(alignment[:, [i, j]])
            if stop <= start:
                raise ValueError(
                    "Cannot calculate non-terminal identity, "
                    "as the two sequences have no overlap"
                )
            length[i, j] = stop - start
    elif mode == "shortest":
        seq_lengths = [len(alignment.sequences[i]) for i in range(n_seq)]
        # Entry (i, j) is the length of the shorter one of sequences i and j
        length = np.minimum.outer(seq_lengths, seq_lengths).astype(float)
    else:
        raise ValueError(f"'{mode}' is an invalid calculation mode")

    return matches / length
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
def score(alignment, matrix, gap_penalty=-10, terminal_penalty=True):
    """
    Calculate the similarity score of an alignment.

    If the alignment contains more than two sequences,
    all pairwise scores are counted.

    Parameters
    ----------
    alignment : Alignment
        The alignment to calculate the score for.
    matrix : SubstitutionMatrix
        The substitution matrix used for scoring.
    gap_penalty : int or (tuple, dtype=int), optional
        If an integer is provided, the value will be interpreted as
        general gap penalty. If a tuple is provided, an affine gap
        penalty is used. The first integer in the tuple is the gap
        opening penalty, the second integer is the gap extension
        penalty.
        The values need to be negative.
    terminal_penalty : bool, optional
        If true, gap penalties are applied to terminal gaps.
        If false, only gaps within the column range returned by
        :func:`find_terminal_gaps()` are penalized.

    Returns
    -------
    score : int
        The similarity score.

    Raises
    ------
    TypeError
        If `gap_penalty` is neither a number nor a sequence.
    """
    codes = get_codes(alignment)
    matrix = matrix.score_matrix()
    n_seq = codes.shape[0]
    n_col = codes.shape[1]

    # Sum similarity scores (without gaps)
    total = 0
    for pos in range(n_col):
        column = codes[:, pos]
        # Iterate over all possible pairs:
        # Do not count self-similarity
        # and do not count similarity twice (not S(i,j) and S(j,i))
        for i in range(n_seq):
            for j in range(i + 1, n_seq):
                code_i = column[i]
                code_j = column[j]
                # Ignore gaps
                if code_i != -1 and code_j != -1:
                    total += matrix[code_i, code_j]

    # Determine gap penalties
    if isinstance(gap_penalty, numbers.Real):
        # Linear penalty: opening and extending a gap cost the same
        gap_open = gap_penalty
        gap_ext = gap_penalty
    elif isinstance(gap_penalty, Sequence):
        gap_open = gap_penalty[0]
        gap_ext = gap_penalty[1]
    else:
        raise TypeError("Gap penalty must be either integer or tuple")

    # The penalized column range is the same for every sequence,
    # hence it is determined only once instead of once per sequence.
    # Without any sequence there is nothing to penalize and the
    # terminal gap search is skipped, as it requires at least one sequence
    if terminal_penalty or n_seq == 0:
        start_index = 0
        stop_index = n_col
    else:
        # Find a start and stop index excluding terminal gaps
        start_index, stop_index = find_terminal_gaps(alignment)

    # Sum gap penalties
    for seq_code in codes:
        in_gap = False
        for i in range(start_index, stop_index):
            if seq_code[i] == -1:
                if in_gap:
                    total += gap_ext
                else:
                    total += gap_open
                in_gap = True
            else:
                in_gap = False
    return total
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
def find_terminal_gaps(alignment):
    """
    Find the slice indices that would remove terminal gaps from an
    alignment.

    The returned range starts at the first alignment column where all
    sequences have started and ends after the last column before any
    sequence has ended.

    Parameters
    ----------
    alignment : Alignment
        The alignment, where the slice indices should be found in.

    Returns
    -------
    start, stop : int
        Indices that point to the start and exclusive stop of the
        alignment columns without terminal gaps.
        When these indices are used as slice index for an alignment or
        trace, the index would remove terminal gaps.

    See Also
    --------
    remove_terminal_gaps : Remove terminal gap columns directly.

    Examples
    --------

    >>> sequences = [
    ...     NucleotideSequence(seq_string) for seq_string in (
    ...         "AAAAACTGATTC",
    ...         "AAACTGTTCA",
    ...         "CTGATTCAAA"
    ...     )
    ... ]
    >>> trace = np.transpose([
    ...     ( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, -1, -1, -1),
    ...     (-1, -1,  0,  1,  2,  3,  4,  5, -1,  6,  7,  8,  9, -1, -1),
    ...     (-1, -1, -1, -1, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9),
    ... ])
    >>> alignment = Alignment(sequences, trace)
    >>> print(alignment)
    AAAAACTGATTC---
    --AAACTG-TTCA--
    -----CTGATTCAAA
    >>> print(find_terminal_gaps(alignment))
    (5, 12)
    """
    trace = alignment.trace
    starts = []
    ends = []
    for seq_index in range(trace.shape[1]):
        # Alignment columns in which this sequence has a symbol, i.e. no gap
        occupied = np.flatnonzero(trace[:, seq_index] != -1)
        # The first and last occupied column mark where the sequence
        # starts and ends within the alignment
        starts.append(occupied[0])
        ends.append(occupied[-1])
    # The range without terminal gaps begins where the last sequence
    # starts and ends where the first sequence ends
    latest_start = np.max(starts).item()
    earliest_end = np.min(ends).item()
    # The end is an inclusive index -> add 1 to obtain an exclusive stop
    return latest_start, earliest_end + 1
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
def remove_terminal_gaps(alignment):
    """
    Remove terminal gaps from an alignment.

    Terminal gaps are gaps that appear before all sequences start and
    after any sequence ends.

    Parameters
    ----------
    alignment : Alignment
        The alignment, where the terminal gaps should be removed from.

    Returns
    -------
    truncated_alignment : Alignment
        A shallow copy of the input `alignment` with a truncated trace,
        that does not contain alignment columns with terminal gaps.

    Raises
    ------
    ValueError
        If at least two sequences have no overlap, so that no alignment
        column would remain.

    See Also
    --------
    find_terminal_gaps : Only find terminal gap columns.

    Examples
    --------

    >>> sequences = [
    ...     NucleotideSequence(seq_string) for seq_string in (
    ...         "AAAAACTGATTC",
    ...         "AAACTGTTCA",
    ...         "CTGATTCAAA"
    ...     )
    ... ]
    >>> trace = np.transpose([
    ...     ( 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, -1, -1, -1),
    ...     (-1, -1,  0,  1,  2,  3,  4,  5, -1,  6,  7,  8,  9, -1, -1),
    ...     (-1, -1, -1, -1, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9),
    ... ])
    >>> alignment = Alignment(sequences, trace)
    >>> print(alignment)
    AAAAACTGATTC---
    --AAACTG-TTCA--
    -----CTGATTCAAA
    >>> truncated_alignment = remove_terminal_gaps(alignment)
    >>> print(truncated_alignment)
    CTGATTC
    CTG-TTC
    CTGATTC
    """
    start, stop = find_terminal_gaps(alignment)
    # 'stop' is exclusive: 'stop == start' also selects no column at all,
    # e.g. if one sequence ends directly before another one starts
    if stop <= start:
        raise ValueError(
            "Cannot remove terminal gaps, since at least two sequences have "
            "no overlap and the resulting alignment would be empty"
        )
    return alignment[start:stop]
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
def remove_gaps(alignment):
    """
    Remove all alignment columns that contain a gap.

    Parameters
    ----------
    alignment : Alignment
        The alignment to remove the gap columns from.

    Returns
    -------
    truncated_alignment : Alignment
        The alignment restricted to those columns, in which no
        sequence has a gap.

    See Also
    --------
    remove_terminal_gaps : Remove only terminal gap columns.
    """
    # A column is discarded as soon as a single sequence has a gap (-1) in it
    has_gap = (alignment.trace == -1).any(axis=1)
    return alignment[~has_gap]
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
def _is_single_letter(alphabet):
    """
    More relaxed version of :func:`biotite.sequence.alphabet.is_letter_alphabet()`:
    It is sufficient that the string representation of each symbol is
    a single character.
    """
    if alphabet.is_letter_alphabet():
        return True
    # Fall back to checking the string representation of every symbol
    return all(len(str(symbol)) == 1 for symbol in alphabet)
|
|
Binary file
|