biotite 1.5.0__cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +18 -0
- biotite/application/__init__.py +69 -0
- biotite/application/application.py +276 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +500 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +92 -0
- biotite/application/blast/webapp.py +428 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +223 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +216 -0
- biotite/application/localapp.py +342 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +116 -0
- biotite/application/msaapp.py +363 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +227 -0
- biotite/application/muscle/app5.py +163 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +447 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +199 -0
- biotite/application/util.py +77 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +310 -0
- biotite/application/viennarna/rnafold.py +254 -0
- biotite/application/viennarna/rnaplot.py +208 -0
- biotite/application/viennarna/util.py +77 -0
- biotite/application/webapp.py +76 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +197 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +60 -0
- biotite/database/entrez/dbnames.py +101 -0
- biotite/database/entrez/download.py +228 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +263 -0
- biotite/database/error.py +16 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +258 -0
- biotite/database/pubchem/error.py +30 -0
- biotite/database/pubchem/query.py +819 -0
- biotite/database/pubchem/throttle.py +98 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +161 -0
- biotite/database/rcsb/query.py +963 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +40 -0
- biotite/database/uniprot/download.py +126 -0
- biotite/database/uniprot/query.py +292 -0
- biotite/file.py +244 -0
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1228 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/__init__.py +84 -0
- biotite/sequence/align/__init__.py +199 -0
- biotite/sequence/align/alignment.py +702 -0
- biotite/sequence/align/banded.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +71 -0
- biotite/sequence/align/cigar.py +425 -0
- biotite/sequence/align/kmeralphabet.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +595 -0
- biotite/sequence/align/kmersimilarity.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmertable.pyx +3411 -0
- biotite/sequence/align/localgapped.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +631 -0
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/multiple.pyx +619 -0
- biotite/sequence/align/pairwise.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/pairwise.pyx +585 -0
- biotite/sequence/align/permutation.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/permutation.pyx +313 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/selector.pyx +954 -0
- biotite/sequence/align/statistics.py +264 -0
- biotite/sequence/align/tracetable.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +555 -0
- biotite/sequence/annotation.py +836 -0
- biotite/sequence/codec.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +476 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1101 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +170 -0
- biotite/sequence/graphics/dendrogram.py +231 -0
- biotite/sequence/graphics/features.py +544 -0
- biotite/sequence/graphics/logo.py +102 -0
- biotite/sequence/graphics/plasmid.py +712 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +283 -0
- biotite/sequence/io/fasta/file.py +265 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +117 -0
- biotite/sequence/io/fastq/file.py +507 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +269 -0
- biotite/sequence/io/genbank/file.py +573 -0
- biotite/sequence/io/genbank/metadata.py +336 -0
- biotite/sequence/io/genbank/sequence.py +173 -0
- biotite/sequence/io/general.py +201 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +128 -0
- biotite/sequence/io/gff/file.py +449 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +561 -0
- biotite/sequence/search.py +117 -0
- biotite/sequence/seqtypes.py +720 -0
- biotite/sequence/sequence.py +373 -0
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +135 -0
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +1562 -0
- biotite/structure/basepairs.py +1403 -0
- biotite/structure/bonds.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/bonds.pyx +2036 -0
- biotite/structure/box.py +724 -0
- biotite/structure/celllist.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/celllist.pyx +864 -0
- biotite/structure/chains.py +310 -0
- biotite/structure/charges.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +683 -0
- biotite/structure/density.py +109 -0
- biotite/structure/dotbracket.py +213 -0
- biotite/structure/error.py +39 -0
- biotite/structure/filter.py +591 -0
- biotite/structure/geometry.py +817 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +243 -0
- biotite/structure/graphics/rna.py +298 -0
- biotite/structure/hbond.py +425 -0
- biotite/structure/info/__init__.py +24 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +98 -0
- biotite/structure/info/bonds.py +149 -0
- biotite/structure/info/ccd.py +200 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +128 -0
- biotite/structure/info/masses.py +121 -0
- biotite/structure/info/misc.py +137 -0
- biotite/structure/info/radii.py +267 -0
- biotite/structure/info/standardize.py +185 -0
- biotite/structure/integrity.py +213 -0
- biotite/structure/io/__init__.py +29 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +67 -0
- biotite/structure/io/general.py +243 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +112 -0
- biotite/structure/io/mol/ctab.py +420 -0
- biotite/structure/io/mol/header.py +120 -0
- biotite/structure/io/mol/mol.py +149 -0
- biotite/structure/io/mol/sdf.py +940 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +64 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +389 -0
- biotite/structure/io/pdb/file.py +1380 -0
- biotite/structure/io/pdb/hybrid36.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +113 -0
- biotite/structure/io/pdbqt/file.py +688 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +674 -0
- biotite/structure/io/pdbx/cif.py +1091 -0
- biotite/structure/io/pdbx/component.py +251 -0
- biotite/structure/io/pdbx/compress.py +362 -0
- biotite/structure/io/pdbx/convert.py +2113 -0
- biotite/structure/io/pdbx/encoding.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +1078 -0
- biotite/structure/io/trajfile.py +696 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +43 -0
- biotite/structure/io/util.py +38 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +43 -0
- biotite/structure/mechanics.py +72 -0
- biotite/structure/molecules.py +337 -0
- biotite/structure/pseudoknots.py +622 -0
- biotite/structure/rdf.py +245 -0
- biotite/structure/repair.py +302 -0
- biotite/structure/residues.py +716 -0
- biotite/structure/rings.py +451 -0
- biotite/structure/sasa.cpython-312-x86_64-linux-gnu.so +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/segments.py +328 -0
- biotite/structure/sequence.py +110 -0
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +306 -0
- biotite/structure/superimpose.py +511 -0
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +736 -0
- biotite/structure/util.py +160 -0
- biotite/version.py +34 -0
- biotite/visualize.py +375 -0
- biotite-1.5.0.dist-info/METADATA +162 -0
- biotite-1.5.0.dist-info/RECORD +354 -0
- biotite-1.5.0.dist-info/WHEEL +6 -0
- biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,595 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
__name__ = "biotite.sequence.align"
|
|
6
|
+
__author__ = "Patrick Kunzmann"
|
|
7
|
+
__all__ = ["KmerAlphabet"]
|
|
8
|
+
|
|
9
|
+
cimport cython
|
|
10
|
+
cimport numpy as np
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
from ..alphabet import Alphabet, LetterAlphabet, AlphabetError
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
ctypedef np.uint8_t uint8
|
|
17
|
+
ctypedef np.uint16_t uint16
|
|
18
|
+
ctypedef np.uint32_t uint32
|
|
19
|
+
ctypedef np.uint64_t uint64
|
|
20
|
+
ctypedef np.int64_t int64
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
ctypedef fused CodeType:
|
|
24
|
+
uint8
|
|
25
|
+
uint16
|
|
26
|
+
uint32
|
|
27
|
+
uint64
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class KmerAlphabet(Alphabet):
|
|
31
|
+
"""
|
|
32
|
+
__init__(base_alphabet, k, spacing=None)
|
|
33
|
+
|
|
34
|
+
This type of alphabet uses *k-mers* as symbols, i.e. all
|
|
35
|
+
combinations of *k* symbols from its *base alphabet*.
|
|
36
|
+
|
|
37
|
+
Its primary use is its :meth:`create_kmers()` method, which iterates
|
|
38
|
+
over all overlapping *k-mers* in a :class:`Sequence` and encodes
|
|
39
|
+
each one into its corresponding *k-mer* symbol code
|
|
40
|
+
(*k-mer* code in short).
|
|
41
|
+
This functionality is prominently used by a :class:`KmerTable` to
|
|
42
|
+
find *k-mer* matches between two sequences.
|
|
43
|
+
|
|
44
|
+
A :class:`KmerAlphabet` has :math:`n^k` different symbols, where
|
|
45
|
+
:math:`n` is the number of symbols in the base alphabet.
|
|
46
|
+
|
|
47
|
+
Parameters
|
|
48
|
+
----------
|
|
49
|
+
base_alphabet : Alphabet
|
|
50
|
+
The base alphabet.
|
|
51
|
+
The created :class:`KmerAlphabet` contains all combinations of
|
|
52
|
+
*k* symbols from this alphabet.
|
|
53
|
+
k : int
|
|
54
|
+
An integer greater than 1 that defines the length of the
|
|
55
|
+
*k-mers*.
|
|
56
|
+
spacing : None or str or list or ndarray, dtype=int, shape=(k,)
|
|
57
|
+
If provided, spaced *k-mers* are used instead of continuous
|
|
58
|
+
ones :footcite:`Ma2002`.
|
|
59
|
+
The value contains the *informative* positions relative to the
|
|
60
|
+
start of the *k-mer*, also called the *model*.
|
|
61
|
+
The number of *informative* positions must equal *k*.
|
|
62
|
+
|
|
63
|
+
If a string is given, each ``'1'`` in the string indicates an
|
|
64
|
+
*informative* position.
|
|
65
|
+
For a continuous *k-mer* the `spacing` would be ``'111...'``.
|
|
66
|
+
|
|
67
|
+
If a list or array is given, it must contain unique non-negative
|
|
68
|
+
integers, that indicate the *informative* positions.
|
|
69
|
+
For a continuous *k-mer* the `spacing` would be
|
|
70
|
+
``[0, 1, 2,...]``.
|
|
71
|
+
|
|
72
|
+
Attributes
|
|
73
|
+
----------
|
|
74
|
+
base_alphabet : Alphabet
|
|
75
|
+
The base alphabet, from which the :class:`KmerAlphabet` was
|
|
76
|
+
created.
|
|
77
|
+
k : int
|
|
78
|
+
The length of the *k-mers*.
|
|
79
|
+
spacing : None or ndarray, dtype=int
|
|
80
|
+
The *k-mer* model in array form, if spaced *k-mers* are used,
|
|
81
|
+
``None`` otherwise.
|
|
82
|
+
|
|
83
|
+
Notes
|
|
84
|
+
-----
|
|
85
|
+
The symbol code for a *k-mer* :math:`s` calculates as
|
|
86
|
+
|
|
87
|
+
.. math:: \mathrm{code}(s) = \sum_{i=0}^{k-1} n^{k-i-1} s_i
|
|
88
|
+
|
|
89
|
+
where :math:`n` is the length of the base alphabet.
|
|
90
|
+
|
|
91
|
+
Hence the :class:`KmerAlphabet` sorts *k-mers* in the order of the
|
|
92
|
+
base alphabet, where leading positions within the *k-mer* take
|
|
93
|
+
precedence.
|
|
94
|
+
|
|
95
|
+
References
|
|
96
|
+
----------
|
|
97
|
+
|
|
98
|
+
.. footbibliography::
|
|
99
|
+
|
|
100
|
+
Examples
|
|
101
|
+
--------
|
|
102
|
+
Create an alphabet of nucleobase *2-mers*:
|
|
103
|
+
|
|
104
|
+
>>> base_alphabet = NucleotideSequence.unambiguous_alphabet()
|
|
105
|
+
>>> print(base_alphabet.get_symbols())
|
|
106
|
+
('A', 'C', 'G', 'T')
|
|
107
|
+
>>> kmer_alphabet = KmerAlphabet(base_alphabet, 2)
|
|
108
|
+
>>> print(kmer_alphabet.get_symbols())
|
|
109
|
+
('AA', 'AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT')
|
|
110
|
+
|
|
111
|
+
Encode and decode *k-mers*:
|
|
112
|
+
|
|
113
|
+
>>> print(kmer_alphabet.encode("TC"))
|
|
114
|
+
13
|
|
115
|
+
>>> print(kmer_alphabet.decode(13))
|
|
116
|
+
['T' 'C']
|
|
117
|
+
|
|
118
|
+
Fuse symbol codes from the base alphabet into a *k-mer* code
|
|
119
|
+
and split the *k-mer* code back into the original symbol codes:
|
|
120
|
+
|
|
121
|
+
>>> symbol_codes = base_alphabet.encode_multiple("TC")
|
|
122
|
+
>>> print(symbol_codes)
|
|
123
|
+
[3 1]
|
|
124
|
+
>>> print(kmer_alphabet.fuse(symbol_codes))
|
|
125
|
+
13
|
|
126
|
+
>>> print(kmer_alphabet.split(13))
|
|
127
|
+
[3 1]
|
|
128
|
+
|
|
129
|
+
Encode all overlapping continuous k-mers of a sequence:
|
|
130
|
+
|
|
131
|
+
>>> sequence = NucleotideSequence("ATTGCT")
|
|
132
|
+
>>> kmer_codes = kmer_alphabet.create_kmers(sequence.code)
|
|
133
|
+
>>> print(kmer_codes)
|
|
134
|
+
[ 3 15 14 9 7]
|
|
135
|
+
>>> print(["".join(kmer) for kmer in kmer_alphabet.decode_multiple(kmer_codes)])
|
|
136
|
+
['AT', 'TT', 'TG', 'GC', 'CT']
|
|
137
|
+
|
|
138
|
+
Encode all overlapping k-mers using spacing:
|
|
139
|
+
|
|
140
|
+
>>> base_alphabet = ProteinSequence.alphabet
|
|
141
|
+
>>> kmer_alphabet = KmerAlphabet(base_alphabet, 3, spacing="1101")
|
|
142
|
+
>>> sequence = ProteinSequence("BIQTITE")
|
|
143
|
+
>>> kmer_codes = kmer_alphabet.create_kmers(sequence.code)
|
|
144
|
+
>>> # Pretty print k-mers
|
|
145
|
+
>>> strings = ["".join(kmer) for kmer in kmer_alphabet.decode_multiple(kmer_codes)]
|
|
146
|
+
>>> print([s[0] + s[1] + "_" + s[2] for s in strings])
|
|
147
|
+
['BI_T', 'IQ_I', 'QT_T', 'TI_E']
|
|
148
|
+
"""
|
|
149
|
+
|
|
150
|
+
def __init__(self, base_alphabet, k, spacing=None):
    # Reject unusable arguments before any state is stored
    if not isinstance(base_alphabet, Alphabet):
        raise TypeError(
            f"Got {type(base_alphabet).__name__}, "
            f"but Alphabet was expected"
        )
    if k < 2:
        raise ValueError("k must be at least 2")
    self._base_alph = base_alphabet
    self._k = k

    # Place values of the k positions in descending order:
    # n^(k-1), ..., n^1, n^0 with n being the base alphabet length
    n_symbols = len(self._base_alph)
    self._radix_multiplier = np.array(
        [n_symbols**exponent for exponent in range(self._k - 1, -1, -1)],
        dtype=np.int64
    )

    if spacing is None:
        # Continuous k-mers: there is no model to validate
        self._spacing = None
        return

    if isinstance(spacing, str):
        # String models are converted by the module-level helper
        self._spacing = _to_array_form(spacing)
    else:
        # Explicit informative positions: store them sorted and
        # make sure they are non-negative and unique
        self._spacing = np.array(spacing, dtype=np.int64)
        self._spacing.sort()
        if (self._spacing < 0).any():
            raise ValueError(
                "Only non-negative integers are allowed for spacing"
            )
        if len(np.unique(self._spacing)) != len(self._spacing):
            raise ValueError(
                "Spacing model contains duplicate values"
            )

    # The model must contain exactly one informative position per
    # symbol of the k-mer
    n_informative = len(self._spacing)
    if n_informative != self._k:
        raise ValueError(
            f"Expected {self._k} informative positions, "
            f"but got {n_informative} positions in spacing"
        )
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
@property
def base_alphabet(self):
    """
    The base alphabet this k-mer alphabet was created from.
    """
    base = self._base_alph
    return base
|
|
195
|
+
|
|
196
|
+
@property
def k(self):
    """
    The length of the *k-mers*.
    """
    kmer_length = self._k
    return kmer_length
|
|
199
|
+
|
|
200
|
+
@property
def spacing(self):
    """
    A copy of the stored spacing model, or ``None`` if no spacing
    model is stored.
    """
    if self._spacing is None:
        return None
    # Hand out a copy, so callers cannot alter the internal model
    return self._spacing.copy()
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def get_symbols(self):
    """
    get_symbols()

    Get the symbols in the alphabet.

    Returns
    -------
    symbols : tuple
        A tuple of all *k-mer* symbols, i.e. all possible
        combinations of *k* symbols from its *base alphabet*.

    Notes
    -----
    In contrast to the base :class:`Alphabet` and
    :class:`LetterAlphabet` class, :class:`KmerAlphabet` does not
    hold a list of its symbols internally for performance reasons.
    Hence calling :meth:`get_symbols()` may be quite time consuming
    for large base alphabets or large *k* values, as the list needs
    to be created first.
    """
    # For letter-based alphabets each k-mer is joined into a string,
    # otherwise it is kept as list of its base symbols
    join_letters = isinstance(self._base_alph, LetterAlphabet)
    symbols = []
    for code in range(len(self)):
        kmer = self.decode(code)
        symbols.append("".join(kmer) if join_letters else list(kmer))
    return tuple(symbols)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def extends(self, alphabet):
    """
    Check whether this alphabet 'extends' the given alphabet.

    Returns the result of comparing `alphabet` with this alphabet
    for equality.
    """
    # Extending another KmerAlphabet in the usual sense is not
    # possible:
    # - a different k means that no symbol is shared
    # - additional symbols in the base alphabet break the order of
    #   the k-mer codes
    # Therefore, the only alphabet that is 'extended' is an equal one
    is_equal = alphabet == self
    return is_equal
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def encode(self, symbol):
    """
    Encode a single *k-mer* symbol into its *k-mer* code.

    The symbol is first encoded with the base alphabet and the
    resulting symbol codes are fused into the *k-mer* code.
    """
    base_codes = self._base_alph.encode_multiple(symbol)
    return self.fuse(base_codes)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def decode(self, code):
    """
    Decode a *k-mer* code into the symbols of its *k-mer*.

    The code is first split into symbol codes of the base alphabet,
    which are then decoded by the base alphabet.
    """
    base_codes = self.split(code)
    return self._base_alph.decode_multiple(base_codes)
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def fuse(self, codes):
    """
    fuse(codes)

    Get the *k-mer* code for *k* symbol codes from the base
    alphabet.

    This method can be used in a vectorized manner to obtain
    *n* *k-mer* codes from an *(n,k)* integer array.

    Parameters
    ----------
    codes : ndarray or array-like, dtype=int, shape=(k,) or shape=(n,k)
        The symbol codes from the base alphabet to be fused.

    Returns
    -------
    kmer_codes : int or ndarray, dtype=np.int64, shape=(n,)
        The fused *k-mer* code(s).

    Raises
    ------
    AlphabetError
        If the last dimension of `codes` does not have length *k*
        or if any symbol code is negative or not smaller than the
        length of the base alphabet.

    See Also
    --------
    split
        The reverse operation.

    Examples
    --------

    >>> base_alphabet = NucleotideSequence.unambiguous_alphabet()
    >>> kmer_alphabet = KmerAlphabet(base_alphabet, 2)
    >>> symbol_codes = base_alphabet.encode_multiple("TC")
    >>> print(symbol_codes)
    [3 1]
    >>> print(kmer_alphabet.fuse(symbol_codes))
    13
    >>> print(kmer_alphabet.split(13))
    [3 1]
    """
    codes = np.asarray(codes)
    if codes.shape[-1] != self._k:
        raise AlphabetError(
            f"Given k-mer(s) has {codes.shape[-1]} symbols, "
            f"but alphabet expects {self._k}-mers"
        )
    # Valid symbol codes are 0 ... len(base alphabet) - 1,
    # hence a code equal to the alphabet length is already invalid
    if np.any(codes >= len(self._base_alph)) or np.any(codes < 0):
        raise AlphabetError("Given k-mer(s) contains invalid symbol code")

    orig_shape = codes.shape
    # Multiplying uint64 codes (e.g. the output of 'split()') with the
    # int64 radix multiplier would promote the result to float64
    # -> use int64 codes to keep the k-mer codes exact integers
    codes = np.atleast_2d(codes.astype(np.int64, copy=False))
    kmer_code = np.sum(self._radix_multiplier * codes, axis=-1)
    # The last dimension is removed since it collapsed in np.sum
    return kmer_code.reshape(orig_shape[:-1])
|
|
301
|
+
|
|
302
|
+
def split(self, kmer_code):
    """
    split(kmer_code)

    Convert a *k-mer* code back into *k* symbol codes from the base
    alphabet.

    This method can be used in a vectorized manner to split
    *n* *k-mer* codes into an *(n,k)* integer array.

    Parameters
    ----------
    kmer_code : int or ndarray, dtype=int, shape=(n,)
        The *k-mer* code(s).

    Returns
    -------
    codes : ndarray, dtype=np.uint64, shape=(k,) or shape=(n,k)
        The split symbol codes from the base alphabet.

    Raises
    ------
    AlphabetError
        If any given code is negative or not smaller than the
        length of this alphabet.

    See Also
    --------
    fuse
        The reverse operation.

    Examples
    --------

    >>> base_alphabet = NucleotideSequence.unambiguous_alphabet()
    >>> kmer_alphabet = KmerAlphabet(base_alphabet, 2)
    >>> symbol_codes = base_alphabet.encode_multiple("TC")
    >>> print(symbol_codes)
    [3 1]
    >>> print(kmer_alphabet.fuse(symbol_codes))
    13
    >>> print(kmer_alphabet.split(13))
    [3 1]
    """
    n_kmers = len(self)
    is_valid = not (np.any(kmer_code >= n_kmers) or np.any(kmer_code < 0))
    if not is_valid:
        raise AlphabetError(
            "Given k-mer symbol code is invalid for this alphabet"
        )

    # The compiled helper works on a 1D int64 array:
    # wrap scalars and restore the input shape afterwards
    flat_codes = np.atleast_1d(kmer_code).astype(np.int64, copy=False)
    symbol_codes = self._split(flat_codes)
    target_shape = np.shape(kmer_code) + (self._k,)
    return symbol_codes.reshape(target_shape)
|
|
350
|
+
|
|
351
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
def _split(self, int64[:] codes not None):
    """
    Decompose each *k-mer* code into its *k* symbol codes.

    Parameters
    ----------
    codes : ndarray, dtype=np.int64, shape=(n,)
        The *k-mer* codes.
        No validation is performed here; bounds checking is disabled
        for this function.

    Returns
    -------
    split_codes : ndarray, dtype=np.uint64, shape=(n,k)
        The symbol codes for each *k-mer* code.
    """
    cdef int kmer_idx, pos
    cdef int64 remainder, place_value, digit

    cdef int k = self._k
    cdef int64[:] radix_multiplier = self._radix_multiplier
    cdef uint64[:,:] split_codes = np.empty(
        (codes.shape[0], k), dtype=np.uint64
    )

    for kmer_idx in range(codes.shape[0]):
        remainder = codes[kmer_idx]
        # Peel off one symbol code per position using the place
        # values in the order they are stored in the radix multiplier
        for pos in range(k):
            place_value = radix_multiplier[pos]
            digit = remainder // place_value
            split_codes[kmer_idx, pos] = digit
            remainder -= digit * place_value

    return np.asarray(split_codes)
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def kmer_array_length(self, int64 length):
    """
    kmer_array_length(length)

    Get the length of the *k-mer* array that
    :meth:`create_kmers()` would return for a sequence of size
    `length`.

    Without a spacing model a *k-mer* covers `k` sequence positions,
    with a spacing model it covers the positions up to the last
    offset of the model.

    Parameters
    ----------
    length : int
        The length of the hypothetical sequence.

    Returns
    -------
    kmer_length : int
        The length of created *k-mer* array.
        No check is performed that the sequence is long enough:
        the value is not positive, if `length` is smaller than
        the number of positions covered by a *k-mer*.
    """
    cdef int64 span
    cdef int64[:] spacing

    if self._spacing is None:
        # Continuous k-mers: each one covers exactly k positions
        return length - self._k + 1

    spacing = self._spacing
    # The last offset of the spacing model defines
    # the number of positions covered by a k-mer
    span = spacing[len(spacing)-1] + 1
    return length - span + 1
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def create_kmers(self, seq_code):
    """
    create_kmers(seq_code)

    Create *k-mer* codes for all overlapping *k-mers* in the given
    sequence code.

    If the alphabet has a spacing model, the *k-mers* are assembled
    from the sequence positions given by that model, otherwise from
    `k` consecutive positions.

    Parameters
    ----------
    seq_code : ndarray, dtype={np.uint8, np.uint16, np.uint32, np.uint64}
        The sequence code to be converted into *k-mers*.

    Returns
    -------
    kmer_codes : ndarray, dtype=int64
        The symbol codes for the *k-mers*.

    Raises
    ------
    ValueError
        If the sequence code is shorter than `k` or, for a spaced
        alphabet, shorter than the span of the spacing model.
    AlphabetError
        If the sequence code contains a symbol code that is out of
        range for the base alphabet.

    Examples
    --------

    >>> base_alphabet = NucleotideSequence.unambiguous_alphabet()
    >>> kmer_alphabet = KmerAlphabet(base_alphabet, 2)
    >>> sequence = NucleotideSequence("ATTGCT")
    >>> kmer_codes = kmer_alphabet.create_kmers(sequence.code)
    >>> print(kmer_codes)
    [ 3 15 14  9  7]
    >>> print(["".join(kmer) for kmer in kmer_alphabet.decode_multiple(kmer_codes)])
    ['AT', 'TT', 'TG', 'GC', 'CT']
    """
    if self._spacing is not None:
        return self._create_spaced_kmers(seq_code)
    return self._create_continuous_kmers(seq_code)
|
|
438
|
+
|
|
439
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def _create_continuous_kmers(self, CodeType[:] seq_code not None):
    """
    Fast implementation of *k-mer* decomposition for *k-mers*
    consisting of consecutive sequence positions.

    Only the first *k-mer* is assembled symbol by symbol.
    Each following *k-mer* is computed from the previous one by
    removing its first symbol, shifting the remaining symbols by
    one position and adding the new symbol.
    Hence, only a single loop over the sequence is required.

    Parameters
    ----------
    seq_code : ndarray
        The sequence code to be converted into *k-mers*.

    Returns
    -------
    kmer_codes : ndarray, dtype=int64
        The *k-mer* codes, one for each *k-mer* start position.

    Raises
    ------
    ValueError
        If `seq_code` contains less than `k` symbols.
    AlphabetError
        If a symbol code is not smaller than the length of the base
        alphabet.
    """
    cdef int64 i

    cdef int k = self._k
    cdef uint64 alphabet_length = len(self._base_alph)
    cdef int64[:] radix_multiplier = self._radix_multiplier
    # Weight of the first symbol of a k-mer,
    # required to remove that symbol in the rolling update below
    cdef int64 end_radix_multiplier = alphabet_length**(k-1)

    # Bounds checks are disabled for this function:
    # This check guards the unchecked indexing below
    if len(seq_code) < <unsigned int>k:
        raise ValueError(
            "The length of the sequence code is shorter than k"
        )

    cdef int64[:] kmers = np.empty(
        self.kmer_array_length(len(seq_code)), dtype=np.int64
    )

    cdef CodeType code
    cdef int64 kmer, prev_kmer
    # Compute first k-mer using naive approach
    kmer = 0
    for i in range(k):
        code = seq_code[i]
        if code >= alphabet_length:
            raise AlphabetError(f"Symbol code {code} is out of range")
        kmer += radix_multiplier[i] * code
    kmers[0] = kmer

    # Compute all following k-mers from the previous one
    prev_kmer = kmer
    for i in range(1, kmers.shape[0]):
        # The symbol that newly enters the k-mer window
        code = seq_code[i + k - 1]
        if code >= alphabet_length:
            raise AlphabetError(f"Symbol code {code} is out of range")
        kmer = (
            (
                # Remove first symbol
                # (already validated in a previous iteration)
                (prev_kmer - seq_code[i - 1] * end_radix_multiplier)
                # Shift k-mer to left
                * alphabet_length
            )
            # Add new symbol
            + code
        )
        kmers[i] = kmer
        prev_kmer = kmer

    return np.asarray(kmers)
|
|
495
|
+
|
|
496
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def _create_spaced_kmers(self, CodeType[:] seq_code not None):
    """
    Implementation of *k-mer* decomposition for alphabets with a
    spacing model.

    Each *k-mer* is assembled from scratch from the sequence
    positions selected by the spacing model, relative to the
    *k-mer* start position.

    Parameters
    ----------
    seq_code : ndarray
        The sequence code to be converted into *k-mers*.

    Returns
    -------
    kmer_codes : ndarray, dtype=int64
        The *k-mer* codes, one for each *k-mer* start position.

    Raises
    ------
    ValueError
        If `seq_code` is shorter than the span of the spacing model.
    AlphabetError
        If a symbol code is not smaller than the length of the base
        alphabet.
    """
    cdef int64 start, pos

    cdef int k = self._k
    cdef int64[:] spacing = self._spacing
    # The span of a k-mer, i.e. the number of sequence positions it
    # covers, is given by the last offset in the spacing model
    cdef int64 span = spacing[len(spacing)-1] + 1
    cdef uint64 alphabet_length = len(self._base_alph)
    cdef int64[:] radix_multiplier = self._radix_multiplier

    # Bounds checks are disabled for this function:
    # This check guards the unchecked indexing below
    if len(seq_code) < <unsigned int>span:
        raise ValueError(
            "The length of the sequence code is shorter "
            "than the k-mer span"
        )

    cdef int64[:] kmers = np.empty(
        self.kmer_array_length(len(seq_code)), dtype=np.int64
    )

    cdef CodeType code
    cdef int64 kmer_code
    for start in range(kmers.shape[0]):
        kmer_code = 0
        for pos in range(k):
            # Pick the symbol at the offset given by the model
            code = seq_code[start + spacing[pos]]
            if code >= alphabet_length:
                raise AlphabetError(f"Symbol code {code} is out of range")
            kmer_code += radix_multiplier[pos] * code
        kmers[start] = kmer_code

    return np.asarray(kmers)
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
def __str__(self):
    """
    Get the string representation of the symbols returned by
    :meth:`get_symbols()`.
    """
    symbols = self.get_symbols()
    return str(symbols)
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
def __repr__(self):
    """
    Represent the alphabet by its base alphabet, `k` and spacing
    model, in the form of a ``KmerAlphabet(...)`` call.
    """
    return (
        f"KmerAlphabet({self._base_alph!r}, "
        f"{self._k}, {self._spacing!r})"
    )
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
def __eq__(self, item):
    """
    Check whether `item` is a :class:`KmerAlphabet` with equal base
    alphabet, `k` and spacing model.
    """
    if item is self:
        return True
    if not isinstance(item, KmerAlphabet):
        return False
    if self._base_alph != item._base_alph or self._k != item._k:
        return False

    if self._spacing is None:
        # A continuous alphabet only equals another continuous one
        return item._spacing is None
    # Spaced alphabet: every offset of the model must be equal
    return not np.any(self._spacing != item._spacing)
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def __hash__(self):
    """
    Compute the hash from the base alphabet, `k` and the spacing
    model, i.e. from the same properties that are compared in
    :meth:`__eq__()`.
    """
    # Alphabets without a spacing model store ``None`` instead of an
    # array: calling ``tolist()`` unconditionally would raise an
    # ``AttributeError`` for them
    if self._spacing is None:
        spacing = None
    else:
        # Arrays are not hashable -> use a tuple of the offsets
        spacing = tuple(self._spacing.tolist())
    return hash((self._base_alph, self._k, spacing))
|
|
565
|
+
|
|
566
|
+
|
|
567
|
+
def __len__(self):
    """
    Get the number of *k-mers* in this alphabet, i.e. the length of
    the base alphabet raised to the power of `k`.
    """
    base_length = len(self._base_alph)
    return int(base_length ** self._k)
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
def __iter__(self):
    """
    Iterate over all *k-mer* symbols in the order of their codes.

    The symbols are decoded lazily, as creating all symbols at once
    is expensive.
    For a :class:`LetterAlphabet` as base alphabet each *k-mer* is
    yielded as string, otherwise as list of symbols.
    """
    if isinstance(self._base_alph, LetterAlphabet):
        to_symbol = "".join
    else:
        to_symbol = list
    return (to_symbol(self.decode(code)) for code in range(len(self)))
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
def __contains__(self, symbol):
    """
    Check whether `symbol` is a valid *k-mer* of this alphabet.

    The symbol is considered valid, if it can be encoded with the
    base alphabet and fused into a *k-mer* code without raising an
    :class:`AlphabetError`.
    """
    try:
        self.fuse(self._base_alph.encode_multiple(symbol))
    except AlphabetError:
        return False
    return True
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
def _to_array_form(model_string):
    """
    Convert the common string representation of a *k-mer* spacing
    model into an array, e.g. ``'1*11'`` into ``[0, 2, 3]``.

    The array contains the index of each ``'1'`` character in the
    string, all other characters are ignored.
    """
    informative_positions = [
        pos for pos, char in enumerate(model_string) if char == "1"
    ]
    return np.array(informative_positions, dtype=np.int64)
|
|
Binary file
|