biotite 0.41.1__cp311-cp311-macosx_10_16_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +19 -0
- biotite/application/__init__.py +43 -0
- biotite/application/application.py +265 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +505 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +83 -0
- biotite/application/blast/webapp.py +421 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +238 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +152 -0
- biotite/application/localapp.py +306 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +122 -0
- biotite/application/msaapp.py +374 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +254 -0
- biotite/application/muscle/app5.py +171 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +456 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +222 -0
- biotite/application/util.py +59 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +304 -0
- biotite/application/viennarna/rnafold.py +269 -0
- biotite/application/viennarna/rnaplot.py +187 -0
- biotite/application/viennarna/util.py +72 -0
- biotite/application/webapp.py +77 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +61 -0
- biotite/database/entrez/dbnames.py +89 -0
- biotite/database/entrez/download.py +223 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +223 -0
- biotite/database/error.py +15 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +260 -0
- biotite/database/pubchem/error.py +20 -0
- biotite/database/pubchem/query.py +827 -0
- biotite/database/pubchem/throttle.py +99 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +167 -0
- biotite/database/rcsb/query.py +959 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +32 -0
- biotite/database/uniprot/download.py +134 -0
- biotite/database/uniprot/query.py +209 -0
- biotite/file.py +251 -0
- biotite/sequence/__init__.py +73 -0
- biotite/sequence/align/__init__.py +49 -0
- biotite/sequence/align/alignment.py +658 -0
- biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +69 -0
- biotite/sequence/align/cigar.py +434 -0
- biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +574 -0
- biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
- biotite/sequence/align/kmertable.pyx +3400 -0
- biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +405 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
- biotite/sequence/align/multiple.pyx +620 -0
- biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
- biotite/sequence/align/pairwise.pyx +587 -0
- biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
- biotite/sequence/align/permutation.pyx +305 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
- biotite/sequence/align/selector.pyx +956 -0
- biotite/sequence/align/statistics.py +265 -0
- biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +566 -0
- biotite/sequence/annotation.py +829 -0
- biotite/sequence/codec.cpython-311-darwin.so +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +466 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1034 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +139 -0
- biotite/sequence/graphics/dendrogram.py +184 -0
- biotite/sequence/graphics/features.py +510 -0
- biotite/sequence/graphics/logo.py +110 -0
- biotite/sequence/graphics/plasmid.py +661 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +273 -0
- biotite/sequence/io/fasta/file.py +278 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +120 -0
- biotite/sequence/io/fastq/file.py +551 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +277 -0
- biotite/sequence/io/genbank/file.py +575 -0
- biotite/sequence/io/genbank/metadata.py +324 -0
- biotite/sequence/io/genbank/sequence.py +172 -0
- biotite/sequence/io/general.py +192 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +133 -0
- biotite/sequence/io/gff/file.py +434 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +456 -0
- biotite/sequence/search.py +116 -0
- biotite/sequence/seqtypes.py +556 -0
- biotite/sequence/sequence.py +374 -0
- biotite/structure/__init__.py +132 -0
- biotite/structure/atoms.py +1455 -0
- biotite/structure/basepairs.py +1415 -0
- biotite/structure/bonds.cpython-311-darwin.so +0 -0
- biotite/structure/bonds.pyx +1933 -0
- biotite/structure/box.py +592 -0
- biotite/structure/celllist.cpython-311-darwin.so +0 -0
- biotite/structure/celllist.pyx +849 -0
- biotite/structure/chains.py +298 -0
- biotite/structure/charges.cpython-311-darwin.so +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +274 -0
- biotite/structure/density.py +114 -0
- biotite/structure/dotbracket.py +216 -0
- biotite/structure/error.py +31 -0
- biotite/structure/filter.py +585 -0
- biotite/structure/geometry.py +697 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +226 -0
- biotite/structure/graphics/rna.py +282 -0
- biotite/structure/hbond.py +409 -0
- biotite/structure/info/__init__.py +25 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +82 -0
- biotite/structure/info/bonds.py +145 -0
- biotite/structure/info/ccd/README.rst +8 -0
- biotite/structure/info/ccd/amino_acids.txt +1663 -0
- biotite/structure/info/ccd/carbohydrates.txt +1135 -0
- biotite/structure/info/ccd/components.bcif +0 -0
- biotite/structure/info/ccd/nucleotides.txt +798 -0
- biotite/structure/info/ccd.py +95 -0
- biotite/structure/info/groups.py +90 -0
- biotite/structure/info/masses.py +123 -0
- biotite/structure/info/misc.py +144 -0
- biotite/structure/info/radii.py +197 -0
- biotite/structure/info/standardize.py +196 -0
- biotite/structure/integrity.py +268 -0
- biotite/structure/io/__init__.py +30 -0
- biotite/structure/io/ctab.py +72 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +65 -0
- biotite/structure/io/general.py +257 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mmtf/__init__.py +21 -0
- biotite/structure/io/mmtf/assembly.py +214 -0
- biotite/structure/io/mmtf/convertarray.cpython-311-darwin.so +0 -0
- biotite/structure/io/mmtf/convertarray.pyx +341 -0
- biotite/structure/io/mmtf/convertfile.cpython-311-darwin.so +0 -0
- biotite/structure/io/mmtf/convertfile.pyx +501 -0
- biotite/structure/io/mmtf/decode.cpython-311-darwin.so +0 -0
- biotite/structure/io/mmtf/decode.pyx +152 -0
- biotite/structure/io/mmtf/encode.cpython-311-darwin.so +0 -0
- biotite/structure/io/mmtf/encode.pyx +183 -0
- biotite/structure/io/mmtf/file.py +233 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +115 -0
- biotite/structure/io/mol/ctab.py +414 -0
- biotite/structure/io/mol/header.py +116 -0
- biotite/structure/io/mol/mol.py +193 -0
- biotite/structure/io/mol/sdf.py +916 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +63 -0
- biotite/structure/io/npz/__init__.py +20 -0
- biotite/structure/io/npz/file.py +152 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +293 -0
- biotite/structure/io/pdb/file.py +1240 -0
- biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +107 -0
- biotite/structure/io/pdbqt/file.py +640 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +648 -0
- biotite/structure/io/pdbx/cif.py +1032 -0
- biotite/structure/io/pdbx/component.py +246 -0
- biotite/structure/io/pdbx/convert.py +1597 -0
- biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +950 -0
- biotite/structure/io/pdbx/legacy.py +267 -0
- biotite/structure/io/tng/__init__.py +13 -0
- biotite/structure/io/tng/file.py +46 -0
- biotite/structure/io/trajfile.py +710 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +46 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +46 -0
- biotite/structure/mechanics.py +75 -0
- biotite/structure/molecules.py +353 -0
- biotite/structure/pseudoknots.py +642 -0
- biotite/structure/rdf.py +243 -0
- biotite/structure/repair.py +253 -0
- biotite/structure/residues.py +562 -0
- biotite/structure/resutil.py +178 -0
- biotite/structure/sasa.cpython-311-darwin.so +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/sequence.py +112 -0
- biotite/structure/sse.py +327 -0
- biotite/structure/superimpose.py +727 -0
- biotite/structure/transform.py +504 -0
- biotite/structure/util.py +98 -0
- biotite/temp.py +86 -0
- biotite/version.py +16 -0
- biotite/visualize.py +251 -0
- biotite-0.41.1.dist-info/METADATA +187 -0
- biotite-0.41.1.dist-info/RECORD +340 -0
- biotite-0.41.1.dist-info/WHEEL +4 -0
- biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
|
Binary file
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence.phylo"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
__all__ = ["upgma"]
|
|
8
|
+
|
|
9
|
+
cimport cython
|
|
10
|
+
cimport numpy as np
|
|
11
|
+
|
|
12
|
+
from .tree import Tree, TreeNode
|
|
13
|
+
import numpy as np
|
|
14
|
+
|
|
15
|
+
# Short aliases for the fixed-width NumPy C types that are used for the
# typed memoryviews in upgma()
ctypedef np.float32_t float32
ctypedef np.uint8_t uint8
ctypedef np.uint32_t uint32


# Largest finite float32 value:
# upgma() rejects distances at or above this value and uses it as
# initial value when searching for the minimum distance
cdef float32 MAX_FLOAT = np.finfo(np.float32).max
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def upgma(np.ndarray distances):
    """
    upgma(distances)

    Perform hierarchical clustering using the
    *unweighted pair group method with arithmetic mean* (UPGMA).

    This algorithm produces leaf nodes with the same distance to the
    root node.
    In the context of evolution this means a constant evolution rate
    (molecular clock).

    Parameters
    ----------
    distances : ndarray, shape=(n,n)
        Pairwise distance matrix.
        The matrix is not modified, the clustering works on a
        ``float32`` copy.

    Returns
    -------
    tree : Tree
        A rooted binary tree. The `index` attribute in the leaf
        :class:`TreeNode` objects refers to the indices of `distances`.

    Raises
    ------
    ValueError
        If the distance matrix is not symmetric,
        if it contains NaN values,
        if any matrix entry is greater than or equal to the maximum
        ``float32`` value (reported as infinity)
        or if any matrix entry is below 0.

    Examples
    --------

    >>> distances = np.array([
    ...     [0, 1, 7, 7, 9],
    ...     [1, 0, 7, 6, 8],
    ...     [7, 7, 0, 2, 4],
    ...     [7, 6, 2, 0, 3],
    ...     [9, 8, 4, 3, 0],
    ... ])
    >>> tree = upgma(distances)
    >>> print(tree.to_newick(include_distance=False))
    ((4,(3,2)),(1,0));
    """
    cdef int i=0, j=0, k=0
    cdef int i_min=0, j_min=0
    cdef float32 dist, dist_min
    cdef float mean
    cdef float height

    # Input validation:
    # The matrix must be square and equal to its transpose
    if distances.shape[0] != distances.shape[1] \
        or not np.allclose(distances.T, distances):
            raise ValueError("Distance matrix must be symmetric")
    if np.isnan(distances).any():
        raise ValueError("Distance matrix contains NaN values")
    # MAX_FLOAT is the start value of the minimum search below,
    # hence a distance must be strictly smaller to be ever selected
    if (distances >= MAX_FLOAT).any():
        raise ValueError("Distance matrix contains infinity")
    if (distances < 0).any():
        raise ValueError("Distances must be positive")


    # Keep track on clustered indices:
    # Initially each position holds the leaf node for that index
    cdef np.ndarray nodes = np.array(
        [TreeNode(index=i) for i in range(distances.shape[0])]
    )
    # Indicates whether an index in the distance matrix has already been
    # clustered and the respective rows and columns can be ignored
    cdef uint8[:] is_clustered_v = np.full(
        distances.shape[0], False, dtype=np.uint8
    )
    # Number of indices in the current node (cardinality)
    # (required for proportional averaging)
    cdef uint32[:] cluster_size_v = np.ones(
        distances.shape[0], dtype=np.uint32
    )
    # Distance of each node from leaf nodes,
    # used for calculation of distance to child nodes
    cdef float32[:] node_heights = np.zeros(
        distances.shape[0], dtype=np.float32
    )


    # Cluster indices
    # The working copy of the matrix is overwritten during clustering
    cdef float32[:,:] distances_v = distances.astype(np.float32, copy=True)
    # Exit loop via 'break'
    while True:

        # Find minimum distance
        dist_min = MAX_FLOAT
        i_min = -1
        j_min = -1
        for i in range(distances_v.shape[0]):
            if is_clustered_v[i]:
                continue
            # Only j < i is inspected,
            # as the matrix is symmetric
            for j in range(i):
                if is_clustered_v[j]:
                    continue
                dist = distances_v[i,j]
                if dist < dist_min:
                    dist_min = dist
                    i_min = i
                    j_min = j

        if i_min == -1 or j_min == -1:
            # No distance found -> all leaf nodes are clustered
            # -> exit loop
            break

        # Cluster the nodes with minimum distance
        # replacing the node at position i_min
        # leaving the node at position j_min empty
        # (is_clustered_v -> True)
        # The new node is placed at half of the distance between both
        # children; the branch length to each child is the difference
        # between this height and the height of the child
        height = dist_min/2
        nodes[i_min] = TreeNode(
            (nodes[i_min], nodes[j_min]),
            (height-node_heights[i_min], height-node_heights[j_min])
        )
        node_heights[i_min] = height
        # Mark position j_min as clustered
        nodes[j_min] = None
        is_clustered_v[j_min] = True
        # Calculate arithmetic mean distances of child nodes
        # as distances for new node and update matrix
        for k in range(distances_v.shape[0]):
            if not is_clustered_v[k] and k != i_min:
                # Mean of both child distances,
                # weighted by the size of each child cluster
                mean = (
                    (
                          distances_v[i_min,k] * cluster_size_v[i_min]
                        + distances_v[j_min,k] * cluster_size_v[j_min]
                    ) / (cluster_size_v[i_min] + cluster_size_v[j_min])
                )
                # Write both entries to keep the matrix symmetric
                distances_v[i_min,k] = mean
                distances_v[k,i_min] = mean
        # Updating cluster size of new node
        cluster_size_v[i_min] = cluster_size_v[i_min] + cluster_size_v[j_min]


    # As each higher level node is always created on position i_min
    # and i is always higher than j in minimum distance calculation,
    # the root node must be at the last index
    return Tree(nodes[len(nodes)-1])
|
|
@@ -0,0 +1,456 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
import warnings
|
|
6
|
+
import numpy as np
|
|
7
|
+
from .seqtypes import NucleotideSequence, ProteinSequence, GeneralSequence
|
|
8
|
+
from .alphabet import LetterAlphabet
|
|
9
|
+
from .align.alignment import get_codes
|
|
10
|
+
|
|
11
|
+
__name__ = "biotite.sequence"
|
|
12
|
+
__author__ = "Maximilian Greil"
|
|
13
|
+
__all__ = ["SequenceProfile"]
|
|
14
|
+
|
|
15
|
+
# Abbreviations
# SequenceProfile.to_consensus() compares the alphabet of a profile
# against these alphabets to select the consensus sequence type
_NUC_DNA_ALPH = NucleotideSequence.alphabet_unamb
_NUC_RNA_ALPH = LetterAlphabet(["A", "C", "G", "U"])
_PROT_ALPH = ProteinSequence.alphabet
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _determine_common_alphabet(alphabets):
    """
    Find the alphabet in `alphabets` that extends all other given
    alphabets.

    Parameters
    ----------
    alphabets : list
        The alphabets to choose from.
        Each element must provide an ``extends()`` method.
        At least one element is required.

    Returns
    -------
    common_alphabet : object
        The element of `alphabets` that extends the other ones.

    Raises
    ------
    ValueError
        If an alphabet is found, that neither is extended by the
        current candidate nor extends it itself.
    """
    candidate = alphabets[0]
    for other in alphabets[1:]:
        if candidate.extends(other):
            # The current candidate already covers this alphabet
            continue
        if not other.extends(candidate):
            raise ValueError(
                "There is no common alphabet that extends all alphabets"
            )
        # The other alphabet is the more comprehensive one
        # -> it becomes the new candidate
        candidate = other
    return candidate
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _codes_to_iupac(frequency, codes, maxes, row):
    """
    Look up the code for one row of the 'symbols' frequency table.

    Parameters
    ----------
    frequency : ndarray
        The symbol counts of the row.
    codes : dict
        Maps a tuple of symbol indices to the code that represents
        this combination of symbols.
    maxes : int
        The maximum count in `frequency`.
    row : int
        The index of the row.
        Only used for the error message.

    Returns
    -------
    code
        The value of `codes` for the tuple of all indices, where
        `frequency` is equal to `maxes`.

    Raises
    ------
    ValueError
        If all counts in `frequency` sum up to 0.
    """
    is_empty = np.sum(frequency) == 0
    if is_empty:
        raise ValueError(
            "There is an empty column in the 'symbols' frequency table. "
            "This doesn't make sense in context of an alignment. "
            f"Please check the 'symbols' frequency table in row {row}."
        )
    # Indices of all symbols that share the maximum count
    is_maximum = frequency == maxes
    max_indices = np.where(is_maximum)[0]
    return codes[tuple(max_indices)]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class SequenceProfile(object):
|
|
54
|
+
"""
|
|
55
|
+
A :class:`SequenceProfile` object stores information about a
|
|
56
|
+
sequence profile of aligned sequences.
|
|
57
|
+
It is possible to calculate and return its consensus sequence.
|
|
58
|
+
|
|
59
|
+
This class saves the position frequency matrix
|
|
60
|
+
(position count matrix) 'symbols' of the occurrences of each
|
|
61
|
+
alphabet symbol at each position.
|
|
62
|
+
It also saves the number of gaps at each position in the array
|
|
63
|
+
'gaps'.
|
|
64
|
+
|
|
65
|
+
With :meth:`probability_matrix()` the position probability matrix
|
|
66
|
+
can be created based on 'symbols' and a pseudocount.
|
|
67
|
+
|
|
68
|
+
With :meth:`log_odds_matrix()` the position weight matrix can
|
|
69
|
+
be created based on the previously calculated position probability
|
|
70
|
+
matrix and the background frequencies.
|
|
71
|
+
|
|
72
|
+
With :meth:`from_alignment()` a :class:`SequenceProfile` object can
|
|
73
|
+
be created from an indefinite number of aligned sequences.
|
|
74
|
+
|
|
75
|
+
With :meth:`sequence_probability_from_matrix()` the probability of a
|
|
76
|
+
sequence can be calculated based on the previously calculated position
|
|
77
|
+
probability matrix of this instance of object SequenceProfile.
|
|
78
|
+
|
|
79
|
+
With :meth:`sequence_score_from_matrix()` the score of a sequence
|
|
80
|
+
can be calculated based on the previously calculated position weight
|
|
81
|
+
matrix of this instance of object SequenceProfile.
|
|
82
|
+
|
|
83
|
+
All attributes of this class are publicly accessible.
|
|
84
|
+
|
|
85
|
+
Parameters
|
|
86
|
+
----------
|
|
87
|
+
symbols : ndarray, dtype=int, shape=(n,k)
|
|
88
|
+
This matrix simply saves for each position how often absolutely
|
|
89
|
+
each symbol is present.
|
|
90
|
+
gaps : ndarray, dtype=int, shape=n
|
|
91
|
+
Array which indicates the number of gaps at each position.
|
|
92
|
+
alphabet : Alphabet, length=k
|
|
93
|
+
Alphabet of sequences of sequence profile
|
|
94
|
+
|
|
95
|
+
Attributes
|
|
96
|
+
----------
|
|
97
|
+
symbols : ndarray, dtype=int, shape=(n,k)
|
|
98
|
+
This matrix simply saves for each position how often absolutely
|
|
99
|
+
each symbol is present.
|
|
100
|
+
gaps : ndarray, dtype=int, shape=n
|
|
101
|
+
Array which indicates the number of gaps at each position.
|
|
102
|
+
alphabet : Alphabet, length=k
|
|
103
|
+
Alphabet of sequences of sequence profile
|
|
104
|
+
"""
|
|
105
|
+
|
|
106
|
+
def __init__(self, symbols, gaps, alphabet):
|
|
107
|
+
self._symbols = symbols
|
|
108
|
+
self._gaps = gaps
|
|
109
|
+
self._alphabet = alphabet
|
|
110
|
+
|
|
111
|
+
if len(alphabet) != symbols.shape[1]:
|
|
112
|
+
raise ValueError(
|
|
113
|
+
f"The given alphabet doesn't have the same length "
|
|
114
|
+
f"({len(alphabet)}) as the number of columns "
|
|
115
|
+
f"({symbols.shape[1]}) in the 'symbols' frequency table."
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
if gaps.shape[0] != symbols.shape[0]:
|
|
119
|
+
raise ValueError(
|
|
120
|
+
f"The given 'gaps' position matrix doesn't have the same "
|
|
121
|
+
f"length ({gaps.shape[0]}) as the 'symbols' "
|
|
122
|
+
f"frequency table ({symbols.shape[0]})"
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
@property
def symbols(self):
    # Read access to the stored 'symbols' frequency table
    return self._symbols
|
|
128
|
+
|
|
129
|
+
@property
def gaps(self):
    # Read access to the stored 'gaps' array
    return self._gaps
|
|
132
|
+
|
|
133
|
+
@property
def alphabet(self):
    # Read access to the stored alphabet
    return self._alphabet
|
|
136
|
+
|
|
137
|
+
@symbols.setter
|
|
138
|
+
def symbols(self, new_symbols):
|
|
139
|
+
if not new_symbols.shape == self.symbols.shape:
|
|
140
|
+
raise ValueError(
|
|
141
|
+
f"New ndarray 'symbols' must be of same shape "
|
|
142
|
+
f"{self.symbols.shape} as the old one"
|
|
143
|
+
)
|
|
144
|
+
self._symbols = new_symbols
|
|
145
|
+
|
|
146
|
+
@gaps.setter
|
|
147
|
+
def gaps(self, new_gaps):
|
|
148
|
+
if not new_gaps.shape == self.gaps.shape:
|
|
149
|
+
raise ValueError(
|
|
150
|
+
f"New ndarray 'gaps' must be of same shape "
|
|
151
|
+
f"{self.gaps.shape} as the old one"
|
|
152
|
+
)
|
|
153
|
+
self._gaps = new_gaps
|
|
154
|
+
|
|
155
|
+
def __repr__(self):
|
|
156
|
+
"""Represent SequenceProfile as a string for debugging."""
|
|
157
|
+
return f"SequenceProfile(np.{np.array_repr(self.symbols)}, " \
|
|
158
|
+
f"np.{np.array_repr(self.gaps)}, Alphabet({self.alphabet}))"
|
|
159
|
+
|
|
160
|
+
def __eq__(self, item):
|
|
161
|
+
if not isinstance(item, SequenceProfile):
|
|
162
|
+
return False
|
|
163
|
+
if not np.array_equal(self.symbols, item.symbols):
|
|
164
|
+
return False
|
|
165
|
+
if not np.array_equal(self.gaps, item.gaps):
|
|
166
|
+
return False
|
|
167
|
+
if not self.alphabet == item.alphabet:
|
|
168
|
+
return False
|
|
169
|
+
return True
|
|
170
|
+
|
|
171
|
+
@staticmethod
def from_alignment(alignment, alphabet=None):
    """
    Get an object of :class:`SequenceProfile` from an object of
    :class:`Alignment`.

    Based on the sequences of the alignment, the SequenceProfile
    parameters symbols and gaps are calculated.

    Parameters
    ----------
    alignment : Alignment
        An Alignment object to create the SequenceProfile object
        from.
    alphabet : Alphabet, optional
        This alphabet will be used when creating the SequenceProfile
        object.
        It must extend the alphabets of all sequences in the
        alignment.
        If no alphabet is selected, the alphabet for this
        SequenceProfile object will be calculated from the sequences
        of object Alignment.
        (Default: None).

    Returns
    -------
    profile: SequenceProfile
        The created SequenceProfile object

    Raises
    ------
    ValueError
        If the given `alphabet` does not extend the alphabet of at
        least one sequence in the alignment.
    """
    # NOTE(review): presumably one row of codes per sequence,
    # confirm against get_codes()
    codes = get_codes(alignment)
    sequence_alphabets = [seq.alphabet for seq in alignment.sequences]
    if alphabet is None:
        alphabet = _determine_common_alphabet(sequence_alphabets)
    elif not all(alphabet.extends(alph) for alph in sequence_alphabets):
        raise ValueError(
            "The given alphabet is incompatible with at least one "
            "alphabet of the given sequences"
        )

    symbol_count = len(alphabet)
    position_count = len(codes[0])
    symbols = np.zeros((position_count, symbol_count), dtype=int)
    gaps = np.zeros(position_count, dtype=int)
    # After transposing, each row contains the codes of one position
    for i, position_codes in enumerate(np.transpose(codes)):
        # Move the code -1 into an additional bin behind the
        # symbol codes, so it is counted as gap
        binned_codes = np.where(
            position_codes == -1, symbol_count, position_codes
        )
        count = np.bincount(binned_codes, minlength=symbol_count + 1)
        symbols[i] = count[:symbol_count]
        gaps[i] = count[-1]
    return SequenceProfile(symbols, gaps, alphabet)
|
|
219
|
+
|
|
220
|
+
def to_consensus(self, as_general=False):
    """
    Build the consensus sequence of this SequenceProfile object.

    Parameters
    ----------
    as_general : bool
        If true, the consensus sequence is always created by the
        general (alphabet independent) consensus method.
        Otherwise, the consensus method is selected based on the
        alphabet of this SequenceProfile object: the DNA and RNA
        alphabets give a NucleotideSequence, the protein alphabet
        uses the protein consensus method and any other alphabet
        falls back to the general consensus method.
        (Default: False).

    Returns
    -------
    consensus : Sequence
        The calculated consensus sequence.
    """
    # IUPAC codes:
    # https://en.wikipedia.org/wiki/International_Union_of_Pure_and_Applied_Chemistry#Amino_acid_and_nucleotide_base_codes
    if not as_general:
        if self.alphabet == _NUC_DNA_ALPH:
            return NucleotideSequence(self._dna_to_consensus())
        if self.alphabet == _NUC_RNA_ALPH:
            return NucleotideSequence(self._rna_to_consensus())
        if self.alphabet == _PROT_ALPH:
            return self._prot_to_consensus()
    # Explicitly requested or no specialized alphabet matched
    return self._general_to_consensus()
|
|
248
|
+
|
|
249
|
+
def _dna_to_consensus(self):
    """
    Build the consensus of this profile as string of DNA IUPAC
    letters.

    For every position the symbol counts, the table of IUPAC
    letters, the maximum count at that position and the position
    index are handed to ``_codes_to_iupac()`` (defined elsewhere in
    this module; presumably it selects the letter for the symbols
    that reach the maximum count - confirm against its
    implementation).

    Returns
    -------
    consensus : str
        The concatenated IUPAC letters, one per profile position.
    """
    # Maps tuples of symbol codes to the corresponding IUPAC letter
    codes = {
        (0,): 'A', (1,): 'C', (2,): 'G', (3,): 'T',
        (0, 2): 'R', (1, 3): 'Y', (1, 2): 'S', (0, 3): 'W', (2, 3): 'K', (0, 1): 'M',
        (1, 2, 3): 'B', (0, 2, 3): 'D', (0, 1, 3): 'H', (0, 1, 2): 'V',
        (0, 1, 2, 3): 'N'
    }
    maxes = np.max(self.symbols, axis=1)
    # Join once instead of repeatedly growing a string in the loop
    return "".join([
        _codes_to_iupac(counts, codes, max_count, i)
        for i, (counts, max_count) in enumerate(zip(self.symbols, maxes))
    ])
|
|
261
|
+
|
|
262
|
+
def _rna_to_consensus(self):
    """
    Build the consensus of this profile as string of RNA IUPAC
    letters.

    For every position the symbol counts, the table of IUPAC
    letters, the maximum count at that position and the position
    index are handed to ``_codes_to_iupac()`` (defined elsewhere in
    this module; presumably it selects the letter for the symbols
    that reach the maximum count - confirm against its
    implementation).

    Returns
    -------
    consensus : str
        The concatenated IUPAC letters, one per profile position.
    """
    # Maps tuples of symbol codes to the corresponding IUPAC letter
    codes = {
        (0,): 'A', (1,): 'C', (2,): 'G', (3,): 'U',
        (0, 2): 'R', (1, 3): 'Y', (1, 2): 'S', (0, 3): 'W', (2, 3): 'K', (0, 1): 'M',
        (1, 2, 3): 'B', (0, 2, 3): 'D', (0, 1, 3): 'H', (0, 1, 2): 'V',
        (0, 1, 2, 3): 'N'
    }
    maxes = np.max(self.symbols, axis=1)
    # Join once instead of repeatedly growing a string in the loop
    return "".join([
        _codes_to_iupac(counts, codes, max_count, i)
        for i, (counts, max_count) in enumerate(zip(self.symbols, maxes))
    ])
|
|
274
|
+
|
|
275
|
+
def _prot_to_consensus(self):
    """
    Build the consensus of this profile as ProteinSequence.

    Each position gets the symbol code with the highest count.
    If several symbols share the highest count, the one with the
    lowest symbol code is taken.
    Positions, where the counts of all symbols sum up to zero, get
    the symbol code 23 instead.
    """
    unknown_code = 23  # _PROT_ALPH[23] = 'X'
    no_symbols_mask = np.sum(self.symbols, axis=1) == 0
    consensus = ProteinSequence()
    consensus.code = np.argmax(self.symbols, axis=1)
    consensus.code = np.where(no_symbols_mask, unknown_code, consensus.code)
    return consensus
|
|
287
|
+
|
|
288
|
+
def _general_to_consensus(self):
    """
    Build the consensus of this profile as GeneralSequence that
    uses the alphabet of the profile.

    Each position gets the symbol code with the highest count.
    If several symbols share the highest count, the one with the
    lowest symbol code is taken.
    This also applies to positions where all counts are zero:
    the first symbol of the alphabet is taken there.
    """
    most_frequent_codes = np.argmax(self.symbols, axis=1)
    consensus = GeneralSequence(self.alphabet)
    consensus.code = most_frequent_codes
    return consensus
|
|
300
|
+
|
|
301
|
+
def probability_matrix(self, pseudocount=0):
    r"""
    Calculate the position probability matrix (PPM) based on
    'symbols' and the given pseudocount.
    This new matrix has the same shape as 'symbols'.

    .. math::

        P(S) = \frac {C_S + \frac{c_p}{k}} {\sum_{i} C_i + c_p}

    :math:`S`: The symbol.

    :math:`C_S`: The count of symbol :math:`S` at the sequence
    position.

    :math:`c_p`: The pseudocount.

    :math:`k`: Length of the alphabet.

    Parameters
    ----------
    pseudocount : int or float, optional
        Amount added to the number of observed cases in order to
        change the expected probability of the PPM.
        (Default: 0)

    Returns
    -------
    ppm : ndarray, dtype=float, shape=(n,k)
        The calculated position probability matrix.
        A position without any counted symbol results in a row of
        NaN values, if the pseudocount is zero.

    Raises
    ------
    ValueError
        If `pseudocount` is negative.
    """
    if pseudocount < 0:
        raise ValueError("Pseudocount can not be smaller than zero.")
    n_symbols = self.symbols.shape[1]
    # Keep the summed axis, so the totals broadcast against each row
    totals = np.sum(self.symbols, axis=1, keepdims=True)
    return (self.symbols + pseudocount / n_symbols) / (totals + pseudocount)
|
|
338
|
+
|
|
339
|
+
def log_odds_matrix(self, background_frequencies=None, pseudocount=0):
    r"""
    Calculate the position weight matrix (PWM) from the
    position probability matrix (PPM) of this profile and the
    background frequencies of the symbols.
    The PWM has the same shape as 'symbols'.

    .. math::

        W(S) = \log_2 \left( \frac{P(S)}{B_S} \right)

    :math:`S`: The symbol.

    :math:`P(S)`: The probability of symbol :math:`S` at the
    sequence position.

    :math:`B_S`: The background frequency of symbol :math:`S`.

    Parameters
    ----------
    background_frequencies : ndarray, shape=(k,), dtype=float, optional
        The background frequencies for each symbol in the alphabet.
        By default, a uniform distribution is assumed.
    pseudocount : int, optional
        Amount added to the number of observed cases in order to
        change the expected probability of the PPM.
        (Default: 0)

    Returns
    -------
    pwm : ndarray, dtype=float, shape=(n,k)
        The calculated position weight matrix.
    """
    background = (
        1 / len(self.alphabet)
        if background_frequencies is None
        else background_frequencies
    )
    probabilities = self.probability_matrix(pseudocount=pseudocount)
    with warnings.catch_warnings():
        # A symbol that is absent at a position has a probability of
        # zero, whose logarithm would trigger a warning
        warnings.simplefilter("ignore", RuntimeWarning)
        pwm = np.log2(probabilities / background)
    return pwm
|
|
380
|
+
|
|
381
|
+
def sequence_probability(self, sequence, pseudocount=0):
    r"""
    Calculate the probability of a sequence according to the
    position probability matrix (PPM) of this profile.

    The probability of the sequence is obtained by multiplying the
    PPM entries of the symbols found at each sequence position.

    Parameters
    ----------
    sequence : Sequence
        The input sequence.
        Its symbol codes are used to index the PPM.
    pseudocount : int, optional
        Amount added to the number of observed cases in order to
        change the expected probability of the PPM.
        (Default: 0)

    Returns
    -------
    probability : float
        The calculated probability for the input sequence based on
        the PPM.

    Raises
    ------
    ValueError
        If the sequence length differs from the number of PPM
        positions or the PPM shape differs from the shape of
        'symbols'.
    """
    ppm = self.probability_matrix(pseudocount=pseudocount)
    if len(sequence) != len(ppm):
        raise ValueError(
            f"The given sequence has a different length ({len(sequence)}) than "
            f"the position probability matrix ({len(ppm)})."
        )
    if ppm.shape != self.symbols.shape:
        raise ValueError(
            f"Position probability matrix {ppm.shape} must be of same shape "
            f"as 'symbols' {self.symbols.shape}"
        )
    # Pick for each position the probability of the observed symbol
    positions = np.arange(len(sequence))
    symbol_probabilities = ppm[positions, sequence.code]
    return np.prod(symbol_probabilities)
|
|
416
|
+
|
|
417
|
+
def sequence_score(self, sequence, background_frequencies=None, pseudocount=0):
    """
    Calculate the score of a sequence according to the
    position weight matrix (PWM) of this profile.

    The score is obtained by summing up the PWM entries
    (log-odds scores) of the symbols found at each sequence
    position.

    Parameters
    ----------
    sequence : Sequence
        The input sequence.
        Its symbol codes are used to index the PWM.
    background_frequencies : ndarray, shape=(k,), dtype=float, optional
        The background frequencies for each symbol in the alphabet.
        By default a uniform distribution is assumed.
    pseudocount : int, optional
        Amount added to the number of observed cases in order to
        change the expected probability of the PPM.
        (Default: 0)

    Returns
    -------
    score : float
        The calculated score for the input sequence based on
        the PWM.

    Raises
    ------
    ValueError
        If the sequence length differs from the number of PWM
        positions or the PWM shape differs from the shape of
        'symbols'.
    """
    background = (
        1 / len(self.alphabet)
        if background_frequencies is None
        else background_frequencies
    )
    pwm = self.log_odds_matrix(
        background_frequencies=background, pseudocount=pseudocount
    )
    if len(sequence) != len(pwm):
        raise ValueError(
            f"The given sequence has a different length ({len(sequence)}) than "
            f"the position weight matrix ({len(pwm)})."
        )
    if pwm.shape != self.symbols.shape:
        raise ValueError(
            f"Position weight matrix {pwm.shape} must be of same shape "
            f"as 'symbols' {self.symbols.shape}"
        )
    # Pick for each position the weight of the observed symbol
    positions = np.arange(len(sequence))
    symbol_weights = pwm[positions, sequence.code]
    return np.sum(symbol_weights)
|