biotite 1.5.0__cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biotite might be problematic. Click here for more details.
- biotite/__init__.py +18 -0
- biotite/application/__init__.py +69 -0
- biotite/application/application.py +276 -0
- biotite/application/autodock/__init__.py +12 -0
- biotite/application/autodock/app.py +500 -0
- biotite/application/blast/__init__.py +14 -0
- biotite/application/blast/alignment.py +92 -0
- biotite/application/blast/webapp.py +428 -0
- biotite/application/clustalo/__init__.py +12 -0
- biotite/application/clustalo/app.py +223 -0
- biotite/application/dssp/__init__.py +12 -0
- biotite/application/dssp/app.py +216 -0
- biotite/application/localapp.py +342 -0
- biotite/application/mafft/__init__.py +12 -0
- biotite/application/mafft/app.py +116 -0
- biotite/application/msaapp.py +363 -0
- biotite/application/muscle/__init__.py +13 -0
- biotite/application/muscle/app3.py +227 -0
- biotite/application/muscle/app5.py +163 -0
- biotite/application/sra/__init__.py +18 -0
- biotite/application/sra/app.py +447 -0
- biotite/application/tantan/__init__.py +12 -0
- biotite/application/tantan/app.py +199 -0
- biotite/application/util.py +77 -0
- biotite/application/viennarna/__init__.py +18 -0
- biotite/application/viennarna/rnaalifold.py +310 -0
- biotite/application/viennarna/rnafold.py +254 -0
- biotite/application/viennarna/rnaplot.py +208 -0
- biotite/application/viennarna/util.py +77 -0
- biotite/application/webapp.py +76 -0
- biotite/copyable.py +71 -0
- biotite/database/__init__.py +23 -0
- biotite/database/afdb/__init__.py +12 -0
- biotite/database/afdb/download.py +197 -0
- biotite/database/entrez/__init__.py +15 -0
- biotite/database/entrez/check.py +60 -0
- biotite/database/entrez/dbnames.py +101 -0
- biotite/database/entrez/download.py +228 -0
- biotite/database/entrez/key.py +44 -0
- biotite/database/entrez/query.py +263 -0
- biotite/database/error.py +16 -0
- biotite/database/pubchem/__init__.py +21 -0
- biotite/database/pubchem/download.py +258 -0
- biotite/database/pubchem/error.py +30 -0
- biotite/database/pubchem/query.py +819 -0
- biotite/database/pubchem/throttle.py +98 -0
- biotite/database/rcsb/__init__.py +13 -0
- biotite/database/rcsb/download.py +161 -0
- biotite/database/rcsb/query.py +963 -0
- biotite/database/uniprot/__init__.py +13 -0
- biotite/database/uniprot/check.py +40 -0
- biotite/database/uniprot/download.py +126 -0
- biotite/database/uniprot/query.py +292 -0
- biotite/file.py +244 -0
- biotite/interface/__init__.py +19 -0
- biotite/interface/openmm/__init__.py +20 -0
- biotite/interface/openmm/state.py +93 -0
- biotite/interface/openmm/system.py +227 -0
- biotite/interface/pymol/__init__.py +201 -0
- biotite/interface/pymol/cgo.py +346 -0
- biotite/interface/pymol/convert.py +185 -0
- biotite/interface/pymol/display.py +267 -0
- biotite/interface/pymol/object.py +1228 -0
- biotite/interface/pymol/shapes.py +178 -0
- biotite/interface/pymol/startup.py +169 -0
- biotite/interface/rdkit/__init__.py +19 -0
- biotite/interface/rdkit/mol.py +490 -0
- biotite/interface/version.py +94 -0
- biotite/interface/warning.py +19 -0
- biotite/sequence/__init__.py +84 -0
- biotite/sequence/align/__init__.py +199 -0
- biotite/sequence/align/alignment.py +702 -0
- biotite/sequence/align/banded.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/banded.pyx +652 -0
- biotite/sequence/align/buckets.py +71 -0
- biotite/sequence/align/cigar.py +425 -0
- biotite/sequence/align/kmeralphabet.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmeralphabet.pyx +595 -0
- biotite/sequence/align/kmersimilarity.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmersimilarity.pyx +233 -0
- biotite/sequence/align/kmertable.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/kmertable.pyx +3411 -0
- biotite/sequence/align/localgapped.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localgapped.pyx +892 -0
- biotite/sequence/align/localungapped.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/localungapped.pyx +279 -0
- biotite/sequence/align/matrix.py +631 -0
- biotite/sequence/align/matrix_data/3Di.mat +24 -0
- biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
- biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
- biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
- biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
- biotite/sequence/align/matrix_data/GONNET.mat +26 -0
- biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
- biotite/sequence/align/matrix_data/MATCH.mat +25 -0
- biotite/sequence/align/matrix_data/NUC.mat +25 -0
- biotite/sequence/align/matrix_data/PAM10.mat +34 -0
- biotite/sequence/align/matrix_data/PAM100.mat +34 -0
- biotite/sequence/align/matrix_data/PAM110.mat +34 -0
- biotite/sequence/align/matrix_data/PAM120.mat +34 -0
- biotite/sequence/align/matrix_data/PAM130.mat +34 -0
- biotite/sequence/align/matrix_data/PAM140.mat +34 -0
- biotite/sequence/align/matrix_data/PAM150.mat +34 -0
- biotite/sequence/align/matrix_data/PAM160.mat +34 -0
- biotite/sequence/align/matrix_data/PAM170.mat +34 -0
- biotite/sequence/align/matrix_data/PAM180.mat +34 -0
- biotite/sequence/align/matrix_data/PAM190.mat +34 -0
- biotite/sequence/align/matrix_data/PAM20.mat +34 -0
- biotite/sequence/align/matrix_data/PAM200.mat +34 -0
- biotite/sequence/align/matrix_data/PAM210.mat +34 -0
- biotite/sequence/align/matrix_data/PAM220.mat +34 -0
- biotite/sequence/align/matrix_data/PAM230.mat +34 -0
- biotite/sequence/align/matrix_data/PAM240.mat +34 -0
- biotite/sequence/align/matrix_data/PAM250.mat +34 -0
- biotite/sequence/align/matrix_data/PAM260.mat +34 -0
- biotite/sequence/align/matrix_data/PAM270.mat +34 -0
- biotite/sequence/align/matrix_data/PAM280.mat +34 -0
- biotite/sequence/align/matrix_data/PAM290.mat +34 -0
- biotite/sequence/align/matrix_data/PAM30.mat +34 -0
- biotite/sequence/align/matrix_data/PAM300.mat +34 -0
- biotite/sequence/align/matrix_data/PAM310.mat +34 -0
- biotite/sequence/align/matrix_data/PAM320.mat +34 -0
- biotite/sequence/align/matrix_data/PAM330.mat +34 -0
- biotite/sequence/align/matrix_data/PAM340.mat +34 -0
- biotite/sequence/align/matrix_data/PAM350.mat +34 -0
- biotite/sequence/align/matrix_data/PAM360.mat +34 -0
- biotite/sequence/align/matrix_data/PAM370.mat +34 -0
- biotite/sequence/align/matrix_data/PAM380.mat +34 -0
- biotite/sequence/align/matrix_data/PAM390.mat +34 -0
- biotite/sequence/align/matrix_data/PAM40.mat +34 -0
- biotite/sequence/align/matrix_data/PAM400.mat +34 -0
- biotite/sequence/align/matrix_data/PAM410.mat +34 -0
- biotite/sequence/align/matrix_data/PAM420.mat +34 -0
- biotite/sequence/align/matrix_data/PAM430.mat +34 -0
- biotite/sequence/align/matrix_data/PAM440.mat +34 -0
- biotite/sequence/align/matrix_data/PAM450.mat +34 -0
- biotite/sequence/align/matrix_data/PAM460.mat +34 -0
- biotite/sequence/align/matrix_data/PAM470.mat +34 -0
- biotite/sequence/align/matrix_data/PAM480.mat +34 -0
- biotite/sequence/align/matrix_data/PAM490.mat +34 -0
- biotite/sequence/align/matrix_data/PAM50.mat +34 -0
- biotite/sequence/align/matrix_data/PAM500.mat +34 -0
- biotite/sequence/align/matrix_data/PAM60.mat +34 -0
- biotite/sequence/align/matrix_data/PAM70.mat +34 -0
- biotite/sequence/align/matrix_data/PAM80.mat +34 -0
- biotite/sequence/align/matrix_data/PAM90.mat +34 -0
- biotite/sequence/align/matrix_data/PB.license +21 -0
- biotite/sequence/align/matrix_data/PB.mat +18 -0
- biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
- biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
- biotite/sequence/align/multiple.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/multiple.pyx +619 -0
- biotite/sequence/align/pairwise.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/pairwise.pyx +585 -0
- biotite/sequence/align/permutation.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/permutation.pyx +313 -0
- biotite/sequence/align/primes.txt +821 -0
- biotite/sequence/align/selector.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/selector.pyx +954 -0
- biotite/sequence/align/statistics.py +264 -0
- biotite/sequence/align/tracetable.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/align/tracetable.pxd +64 -0
- biotite/sequence/align/tracetable.pyx +370 -0
- biotite/sequence/alphabet.py +555 -0
- biotite/sequence/annotation.py +836 -0
- biotite/sequence/codec.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/codec.pyx +155 -0
- biotite/sequence/codon.py +476 -0
- biotite/sequence/codon_tables.txt +202 -0
- biotite/sequence/graphics/__init__.py +33 -0
- biotite/sequence/graphics/alignment.py +1101 -0
- biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
- biotite/sequence/graphics/color_schemes/autumn.json +51 -0
- biotite/sequence/graphics/color_schemes/blossom.json +51 -0
- biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
- biotite/sequence/graphics/color_schemes/flower.json +51 -0
- biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
- biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
- biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
- biotite/sequence/graphics/color_schemes/ocean.json +51 -0
- biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
- biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
- biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
- biotite/sequence/graphics/color_schemes/spring.json +51 -0
- biotite/sequence/graphics/color_schemes/sunset.json +51 -0
- biotite/sequence/graphics/color_schemes/wither.json +51 -0
- biotite/sequence/graphics/colorschemes.py +170 -0
- biotite/sequence/graphics/dendrogram.py +231 -0
- biotite/sequence/graphics/features.py +544 -0
- biotite/sequence/graphics/logo.py +102 -0
- biotite/sequence/graphics/plasmid.py +712 -0
- biotite/sequence/io/__init__.py +12 -0
- biotite/sequence/io/fasta/__init__.py +22 -0
- biotite/sequence/io/fasta/convert.py +283 -0
- biotite/sequence/io/fasta/file.py +265 -0
- biotite/sequence/io/fastq/__init__.py +19 -0
- biotite/sequence/io/fastq/convert.py +117 -0
- biotite/sequence/io/fastq/file.py +507 -0
- biotite/sequence/io/genbank/__init__.py +17 -0
- biotite/sequence/io/genbank/annotation.py +269 -0
- biotite/sequence/io/genbank/file.py +573 -0
- biotite/sequence/io/genbank/metadata.py +336 -0
- biotite/sequence/io/genbank/sequence.py +173 -0
- biotite/sequence/io/general.py +201 -0
- biotite/sequence/io/gff/__init__.py +26 -0
- biotite/sequence/io/gff/convert.py +128 -0
- biotite/sequence/io/gff/file.py +449 -0
- biotite/sequence/phylo/__init__.py +36 -0
- biotite/sequence/phylo/nj.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/nj.pyx +221 -0
- biotite/sequence/phylo/tree.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/tree.pyx +1169 -0
- biotite/sequence/phylo/upgma.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/sequence/phylo/upgma.pyx +164 -0
- biotite/sequence/profile.py +561 -0
- biotite/sequence/search.py +117 -0
- biotite/sequence/seqtypes.py +720 -0
- biotite/sequence/sequence.py +373 -0
- biotite/setup_ccd.py +197 -0
- biotite/structure/__init__.py +135 -0
- biotite/structure/alphabet/__init__.py +25 -0
- biotite/structure/alphabet/encoder.py +332 -0
- biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
- biotite/structure/alphabet/i3d.py +109 -0
- biotite/structure/alphabet/layers.py +86 -0
- biotite/structure/alphabet/pb.license +21 -0
- biotite/structure/alphabet/pb.py +170 -0
- biotite/structure/alphabet/unkerasify.py +128 -0
- biotite/structure/atoms.py +1562 -0
- biotite/structure/basepairs.py +1403 -0
- biotite/structure/bonds.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/structure/bonds.pyx +2036 -0
- biotite/structure/box.py +724 -0
- biotite/structure/celllist.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/structure/celllist.pyx +864 -0
- biotite/structure/chains.py +310 -0
- biotite/structure/charges.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/structure/charges.pyx +520 -0
- biotite/structure/compare.py +683 -0
- biotite/structure/density.py +109 -0
- biotite/structure/dotbracket.py +213 -0
- biotite/structure/error.py +39 -0
- biotite/structure/filter.py +591 -0
- biotite/structure/geometry.py +817 -0
- biotite/structure/graphics/__init__.py +13 -0
- biotite/structure/graphics/atoms.py +243 -0
- biotite/structure/graphics/rna.py +298 -0
- biotite/structure/hbond.py +425 -0
- biotite/structure/info/__init__.py +24 -0
- biotite/structure/info/atom_masses.json +121 -0
- biotite/structure/info/atoms.py +98 -0
- biotite/structure/info/bonds.py +149 -0
- biotite/structure/info/ccd.py +200 -0
- biotite/structure/info/components.bcif +0 -0
- biotite/structure/info/groups.py +128 -0
- biotite/structure/info/masses.py +121 -0
- biotite/structure/info/misc.py +137 -0
- biotite/structure/info/radii.py +267 -0
- biotite/structure/info/standardize.py +185 -0
- biotite/structure/integrity.py +213 -0
- biotite/structure/io/__init__.py +29 -0
- biotite/structure/io/dcd/__init__.py +13 -0
- biotite/structure/io/dcd/file.py +67 -0
- biotite/structure/io/general.py +243 -0
- biotite/structure/io/gro/__init__.py +14 -0
- biotite/structure/io/gro/file.py +343 -0
- biotite/structure/io/mol/__init__.py +20 -0
- biotite/structure/io/mol/convert.py +112 -0
- biotite/structure/io/mol/ctab.py +420 -0
- biotite/structure/io/mol/header.py +120 -0
- biotite/structure/io/mol/mol.py +149 -0
- biotite/structure/io/mol/sdf.py +940 -0
- biotite/structure/io/netcdf/__init__.py +13 -0
- biotite/structure/io/netcdf/file.py +64 -0
- biotite/structure/io/pdb/__init__.py +20 -0
- biotite/structure/io/pdb/convert.py +389 -0
- biotite/structure/io/pdb/file.py +1380 -0
- biotite/structure/io/pdb/hybrid36.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdb/hybrid36.pyx +242 -0
- biotite/structure/io/pdbqt/__init__.py +15 -0
- biotite/structure/io/pdbqt/convert.py +113 -0
- biotite/structure/io/pdbqt/file.py +688 -0
- biotite/structure/io/pdbx/__init__.py +23 -0
- biotite/structure/io/pdbx/bcif.py +674 -0
- biotite/structure/io/pdbx/cif.py +1091 -0
- biotite/structure/io/pdbx/component.py +251 -0
- biotite/structure/io/pdbx/compress.py +362 -0
- biotite/structure/io/pdbx/convert.py +2113 -0
- biotite/structure/io/pdbx/encoding.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/structure/io/pdbx/encoding.pyx +1078 -0
- biotite/structure/io/trajfile.py +696 -0
- biotite/structure/io/trr/__init__.py +13 -0
- biotite/structure/io/trr/file.py +43 -0
- biotite/structure/io/util.py +38 -0
- biotite/structure/io/xtc/__init__.py +13 -0
- biotite/structure/io/xtc/file.py +43 -0
- biotite/structure/mechanics.py +72 -0
- biotite/structure/molecules.py +337 -0
- biotite/structure/pseudoknots.py +622 -0
- biotite/structure/rdf.py +245 -0
- biotite/structure/repair.py +302 -0
- biotite/structure/residues.py +716 -0
- biotite/structure/rings.py +451 -0
- biotite/structure/sasa.cpython-311-x86_64-linux-gnu.so +0 -0
- biotite/structure/sasa.pyx +322 -0
- biotite/structure/segments.py +328 -0
- biotite/structure/sequence.py +110 -0
- biotite/structure/spacegroups.json +1567 -0
- biotite/structure/spacegroups.license +26 -0
- biotite/structure/sse.py +306 -0
- biotite/structure/superimpose.py +511 -0
- biotite/structure/tm.py +581 -0
- biotite/structure/transform.py +736 -0
- biotite/structure/util.py +160 -0
- biotite/version.py +34 -0
- biotite/visualize.py +375 -0
- biotite-1.5.0.dist-info/METADATA +162 -0
- biotite-1.5.0.dist-info/RECORD +354 -0
- biotite-1.5.0.dist-info/WHEEL +6 -0
- biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
|
@@ -0,0 +1,3411 @@
|
|
|
1
|
+
# This source code is part of the Biotite package and is distributed
|
|
2
|
+
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
|
|
3
|
+
# information.
|
|
4
|
+
|
|
5
|
+
# distutils: language = c++
|
|
6
|
+
|
|
7
|
+
__name__ = "biotite.sequence.align"
|
|
8
|
+
__author__ = "Patrick Kunzmann"
|
|
9
|
+
__all__ = ["KmerTable", "BucketKmerTable"]
|
|
10
|
+
|
|
11
|
+
cimport cython
|
|
12
|
+
cimport numpy as np
|
|
13
|
+
from cpython.mem cimport PyMem_Malloc as malloc
|
|
14
|
+
from cpython.mem cimport PyMem_Free as free
|
|
15
|
+
from libc.string cimport memcpy
|
|
16
|
+
from libcpp.set cimport set as cpp_set
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
from ..alphabet import LetterAlphabet, common_alphabet, AlphabetError
|
|
20
|
+
from .kmeralphabet import KmerAlphabet
|
|
21
|
+
from .buckets import bucket_number
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Short aliases for the fixed-width NumPy C types used in this module
ctypedef np.int32_t int32
ctypedef np.int64_t int64
ctypedef np.uint8_t uint8
ctypedef np.uint32_t uint32
# Unsigned 64 bit integer; this is the element type of the pointer
# array of a table (see ``KmerTable._ptr_array``)
ctypedef np.uint64_t ptr
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
cdef enum EntrySize:
    # The size (number of 32 bit elements) for each entry in C-arrays
    # of KmerTable and BucketKmerTable, respectively.
    # An 'entry' is the group of values stored for a single k-mer
    # position.
    #
    # Size: reference ID (int32) + sequence pos (int32)
    # -> 1 + 1 elements of 32 bit
    NO_BUCKETS = 2
    # Size: k-mer (int64) + reference ID (int32) + sequence pos (int32)
    # -> 2 + 1 + 1 elements of 32 bit
    BUCKETS = 4
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
cdef class KmerTable:
|
|
42
|
+
"""
|
|
43
|
+
This class represents a *k-mer* index table.
|
|
44
|
+
It maps *k-mers* (subsequences with length *k*) to the sequence
|
|
45
|
+
positions, where the *k-mer* appears.
|
|
46
|
+
It is primarily used to find *k-mer* matches between two sequences.
|
|
47
|
+
A match is defined as a *k-mer* that appears in both sequences.
|
|
48
|
+
Instances of this class are immutable.
|
|
49
|
+
|
|
50
|
+
There are multiple ways to create a :class:`KmerTable`:
|
|
51
|
+
|
|
52
|
+
- :meth:`from_sequences()` iterates through all overlapping
|
|
53
|
+
*k-mers* in a sequence and stores the sequence position of
|
|
54
|
+
each *k-mer* in the table.
|
|
55
|
+
- :meth:`from_kmers()` is similar to :meth:`from_sequences()`
|
|
56
|
+
but directly accepts *k-mers* as input instead of sequences.
|
|
57
|
+
- :meth:`from_kmer_selection()` takes a combination of *k-mers*
|
|
58
|
+
and their positions in a sequence, which can be used to
|
|
59
|
+
apply subset selectors, such as :class:`MinimizerSelector`.
|
|
60
|
+
- :meth:`from_tables()` merges the entries from multiple
|
|
61
|
+
:class:`KmerTable` objects into a new table.
|
|
62
|
+
- :meth:`from_positions()` lets the user provide manual
|
|
63
|
+
*k-mer* positions, which can be useful for loading a
|
|
64
|
+
:class:`KmerTable` from file.
|
|
65
|
+
|
|
66
|
+
The standard constructor merely returns an empty table and is
|
|
67
|
+
reserved for internal use.
|
|
68
|
+
|
|
69
|
+
Each indexed *k-mer* position is represented by a tuple of
|
|
70
|
+
|
|
71
|
+
1. a unique reference ID that identifies which sequence a
|
|
72
|
+
position refers to and
|
|
73
|
+
2. the zero-based sequence position of the first symbol in the
|
|
74
|
+
*k-mer*.
|
|
75
|
+
|
|
76
|
+
The :meth:`match()` method iterates through all overlapping *k-mers*
|
|
77
|
+
in another sequence and, for each *k-mer*, looks up the reference
|
|
78
|
+
IDs and positions of this *k-mer* in the table.
|
|
79
|
+
For each matching position, it adds the *k-mer* position in this
|
|
80
|
+
sequence, the matching reference ID and the matching sequence
|
|
81
|
+
position to the array of matches.
|
|
82
|
+
Finally these matches are returned to the user.
|
|
83
|
+
Optionally, a :class:`SimilarityRule` can be supplied, to find
|
|
84
|
+
also matches for similar *k-mers*.
|
|
85
|
+
This is especially useful for protein sequences to match two
|
|
86
|
+
*k-mers* with a high substitution probability.
|
|
87
|
+
|
|
88
|
+
The positions for a given *k-mer* code can be obtained via indexing.
|
|
89
|
+
Iteration over a :class:`KmerTable` yields the *k-mers* that have at
|
|
90
|
+
least one associated position.
|
|
91
|
+
The *k-mer* code for a *k-mer* can be calculated with
|
|
92
|
+
``table.kmer_alphabet.encode()`` (see :class:`KmerAlphabet`).
|
|
93
|
+
|
|
94
|
+
Attributes
|
|
95
|
+
----------
|
|
96
|
+
kmer_alphabet : KmerAlphabet
|
|
97
|
+
The internal :class:`KmerAlphabet`, that is used to
|
|
98
|
+
encode all overlapping *k-mers* of an input sequence.
|
|
99
|
+
alphabet : Alphabet
|
|
100
|
+
The base alphabet, from which this :class:`KmerTable` was
|
|
101
|
+
created.
|
|
102
|
+
k : int
|
|
103
|
+
The length of the *k-mers*.
|
|
104
|
+
|
|
105
|
+
See Also
|
|
106
|
+
--------
|
|
107
|
+
BucketKmerTable
|
|
108
|
+
|
|
109
|
+
Notes
|
|
110
|
+
-----
|
|
111
|
+
|
|
112
|
+
The design of the :class:`KmerTable` is inspired by the *MMseqs2*
|
|
113
|
+
software :footcite:`Steinegger2017`.
|
|
114
|
+
|
|
115
|
+
*Memory consumption*
|
|
116
|
+
|
|
117
|
+
For efficient mapping, a :class:`KmerTable` contains a pointer
|
|
118
|
+
array, that contains one 64-bit pointer for each possible *k-mer*.
|
|
119
|
+
If there is at least one position for a *k-mer*, the corresponding
|
|
120
|
+
pointer points to a C-array that contains
|
|
121
|
+
|
|
122
|
+
1. The length of the C-array *(int64)*
|
|
123
|
+
2. The reference ID for each position of this *k-mer* *(uint32)*
|
|
124
|
+
3. The sequence position for each position of this *k-mer* *(uint32)*
|
|
125
|
+
|
|
126
|
+
Hence, the memory requirements can be quite large for long *k-mers*
|
|
127
|
+
or large alphabets.
|
|
128
|
+
The required memory space :math:`S` in byte is within the bounds of
|
|
129
|
+
|
|
130
|
+
.. math::
|
|
131
|
+
|
|
132
|
+
8 n^k + 8L \leq S \leq 16 n^k + 8L,
|
|
133
|
+
|
|
134
|
+
where :math:`n` is the number of symbols in the alphabet and
|
|
135
|
+
:math:`L` is the summed length of all sequences added to the table.
|
|
136
|
+
|
|
137
|
+
*Multiprocessing*
|
|
138
|
+
|
|
139
|
+
:class:`KmerTable` objects can be used in multi-processed setups:
|
|
140
|
+
Adding a large database of sequences to a table can be sped up by
|
|
141
|
+
splitting the database into smaller chunks and creating a separate
|
|
142
|
+
table for each chunk in separate processes.
|
|
143
|
+
Eventually, the tables can be merged to one large table using
|
|
144
|
+
:meth:`from_tables()`.
|
|
145
|
+
|
|
146
|
+
Since :class:`KmerTable` supports the *pickle* protocol,
|
|
147
|
+
the matching step can also be divided into multiple processes, if
|
|
148
|
+
multiple sequences need to be matched.
|
|
149
|
+
|
|
150
|
+
*Storage on hard drive*
|
|
151
|
+
|
|
152
|
+
The most time efficient way to read/write a :class:`KmerTable` is
|
|
153
|
+
the *pickle* format.
|
|
154
|
+
If a custom format is desired, the user needs to extract the
|
|
155
|
+
reference IDs and position for each *k-mer*.
|
|
156
|
+
To restrict this task to all *k-mers* that have at least one match
|
|
157
|
+
:meth:`get_kmers()` can be used.
|
|
158
|
+
Conversely, the reference IDs and positions can be restored via
|
|
159
|
+
:meth:`from_positions()`.
|
|
160
|
+
|
|
161
|
+
References
|
|
162
|
+
----------
|
|
163
|
+
|
|
164
|
+
.. footbibliography::
|
|
165
|
+
|
|
166
|
+
Examples
|
|
167
|
+
--------
|
|
168
|
+
|
|
169
|
+
Create a *2-mer* index table for some nucleotide sequences:
|
|
170
|
+
|
|
171
|
+
>>> table = KmerTable.from_sequences(
|
|
172
|
+
... k = 2,
|
|
173
|
+
... sequences = [NucleotideSequence("TTATA"), NucleotideSequence("CTAG")],
|
|
174
|
+
... ref_ids = [0, 1]
|
|
175
|
+
... )
|
|
176
|
+
|
|
177
|
+
Display the contents of the table as
|
|
178
|
+
(reference ID, sequence position) tuples:
|
|
179
|
+
|
|
180
|
+
>>> print(table)
|
|
181
|
+
AG: (1, 2)
|
|
182
|
+
AT: (0, 2)
|
|
183
|
+
CT: (1, 0)
|
|
184
|
+
TA: (0, 1), (0, 3), (1, 1)
|
|
185
|
+
TT: (0, 0)
|
|
186
|
+
|
|
187
|
+
Find matches of the table with a sequence:
|
|
188
|
+
|
|
189
|
+
>>> query = NucleotideSequence("TAG")
|
|
190
|
+
>>> matches = table.match(query)
|
|
191
|
+
>>> for query_pos, table_ref_id, table_pos in matches:
|
|
192
|
+
... print("Query sequence position:", query_pos)
|
|
193
|
+
... print("Table reference ID: ", table_ref_id)
|
|
194
|
+
... print("Table sequence position:", table_pos)
|
|
195
|
+
... print()
|
|
196
|
+
Query sequence position: 0
|
|
197
|
+
Table reference ID: 0
|
|
198
|
+
Table sequence position: 1
|
|
199
|
+
<BLANKLINE>
|
|
200
|
+
Query sequence position: 0
|
|
201
|
+
Table reference ID: 0
|
|
202
|
+
Table sequence position: 3
|
|
203
|
+
<BLANKLINE>
|
|
204
|
+
Query sequence position: 0
|
|
205
|
+
Table reference ID: 1
|
|
206
|
+
Table sequence position: 1
|
|
207
|
+
<BLANKLINE>
|
|
208
|
+
Query sequence position: 1
|
|
209
|
+
Table reference ID: 1
|
|
210
|
+
Table sequence position: 2
|
|
211
|
+
<BLANKLINE>
|
|
212
|
+
|
|
213
|
+
Get all reference IDs and positions for a given *k-mer*:
|
|
214
|
+
|
|
215
|
+
>>> kmer_code = table.kmer_alphabet.encode("TA")
|
|
216
|
+
>>> print(table[kmer_code])
|
|
217
|
+
[[0 1]
|
|
218
|
+
[0 3]
|
|
219
|
+
[1 1]]
|
|
220
|
+
"""
|
|
221
|
+
|
|
222
|
+
cdef object _kmer_alph
|
|
223
|
+
cdef int _k
|
|
224
|
+
|
|
225
|
+
# The pointer array is the core of the index table:
|
|
226
|
+
# It maps each possible k-mer (represented by its code) to a
|
|
227
|
+
# C-array of indices.
|
|
228
|
+
# Each entry in a C-array points to a reference ID and the
|
|
229
|
+
# location in that sequence where the respective k-mer appears
|
|
230
|
+
# The memory layout of each C-array is as following:
|
|
231
|
+
#
|
|
232
|
+
# (Array length) (RefID 0) (Position 0) (RefID 1) (Position 1) ...
|
|
233
|
+
# -----int64----|---uint32---|---uint32---|---uint32---|---uint32---
|
|
234
|
+
#
|
|
235
|
+
# The array length is based on 32 bit units.
|
|
236
|
+
# If there is no entry for a k-mer, the respective pointer is NULL.
|
|
237
|
+
cdef ptr[:] _ptr_array
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def __cinit__(self, kmer_alphabet):
    # Create an empty table for the given KmerAlphabet.
    #
    # Raises an Exception if the object reports that it is already
    # initialized.

    # This check is necessary for proper memory management
    # of the allocated arrays
    if self._is_initialized():
        raise Exception("Duplicate call of constructor")

    self._kmer_alph = kmer_alphabet
    self._k = kmer_alphabet.k
    # One zero-initialized 64 bit element for each code in the
    # k-mer alphabet; according to the class-level comment a zero
    # (NULL) pointer means that the k-mer has no entry
    self._ptr_array = np.zeros(len(self._kmer_alph), dtype=np.uint64)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
@property
def kmer_alphabet(self):
    # Read-only access to the KmerAlphabet given at construction
    return self._kmer_alph
|
|
254
|
+
|
|
255
|
+
@property
def alphabet(self):
    # The base alphabet is not stored separately, but is taken from
    # the internal KmerAlphabet
    return self._kmer_alph.base_alphabet
|
|
258
|
+
|
|
259
|
+
@property
def k(self):
    # The k-mer length, copied from the KmerAlphabet at construction
    return self._k
|
|
262
|
+
|
|
263
|
+
@staticmethod
def from_sequences(k, sequences, ref_ids=None, ignore_masks=None,
                   alphabet=None, spacing=None):
    """
    from_sequences(k, sequences, ref_ids=None, ignore_masks=None,
                   alphabet=None, spacing=None)

    Create a :class:`KmerTable` by storing the positions of all
    overlapping *k-mers* from the input `sequences`.

    Parameters
    ----------
    k : int
        The length of the *k-mers*.
    sequences : sized iterable object of Sequence, length=m
        The sequences to get the *k-mer* positions from.
        These sequences must have equal alphabets, or one of these
        sequences must have an alphabet that extends the alphabets
        of all other sequences.
    ref_ids : sized iterable object of int, length=m, optional
        The reference IDs for the given sequences.
        These are used to identify the corresponding sequence for a
        *k-mer* match.
        By default the IDs are counted from *0* to *m*.
    ignore_masks : sized iterable object of (ndarray, dtype=bool), length=m, optional
        Sequence positions to ignore.
        *k-mers* that involve these sequence positions are not added
        to the table.
        This is used e.g. to skip repeat regions.
        If provided, the list must contain one boolean mask
        (or ``None``) for each sequence, and each boolean mask must
        have the same length as the sequence.
        By default, no sequence position is ignored.
    alphabet : Alphabet, optional
        The alphabet to use for this table.
        It must extend the alphabets of the input `sequences`.
        By default, an appropriate alphabet is inferred from the
        input `sequences`.
        This option is usually used for compatibility with another
        sequence/table in the matching step.
    spacing : None or str or list or ndarray, dtype=int, shape=(k,)
        If provided, spaced *k-mers* are used instead of continuous
        ones.
        The value contains the *informative* positions relative to
        the start of the *k-mer*, also called the *model*.
        The number of *informative* positions must equal *k*.
        Refer to :class:`KmerAlphabet` for more details.

    See Also
    --------
    from_kmers : The same functionality based on already created *k-mers*

    Returns
    -------
    table : KmerTable
        The newly created table.

    Examples
    --------

    >>> sequences = [NucleotideSequence("TTATA"), NucleotideSequence("CTAG")]
    >>> table = KmerTable.from_sequences(
    ...     2, sequences, ref_ids=[100, 101]
    ... )
    >>> print(table)
    AG: (101, 2)
    AT: (100, 2)
    CT: (101, 0)
    TA: (100, 1), (100, 3), (101, 1)
    TT: (100, 0)

    Give an explicit compatible alphabet:

    >>> table = KmerTable.from_sequences(
    ...     2, sequences, ref_ids=[100, 101],
    ...     alphabet=NucleotideSequence.ambiguous_alphabet()
    ... )

    Ignore all ``N`` in a sequence:

    >>> sequence = NucleotideSequence("ACCNTANNG")
    >>> table = KmerTable.from_sequences(
    ...     2, [sequence], ignore_masks=[sequence.symbols == "N"]
    ... )
    >>> print(table)
    AC: (0, 0)
    CC: (0, 1)
    TA: (0, 4)
    """
    # Process the optional arguments with the module-level helpers
    # (defined outside this method)
    ref_ids = _compute_ref_ids(ref_ids, sequences)
    ignore_masks = _compute_masks(ignore_masks, sequences)
    alphabet = _compute_alphabet(
        alphabet, (sequence.alphabet for sequence in sequences)
    )

    table = KmerTable(KmerAlphabet(alphabet, k, spacing))

    # Calculate k-mers
    # The k-mers (and the masks below) are computed once and reused
    # in both the counting and the filling pass
    kmers_list = [
        table._kmer_alph.create_kmers(sequence.code)
        for sequence in sequences
    ]

    masks = [
        _prepare_mask(table._kmer_alph, ignore_mask, len(sequence))
        for sequence, ignore_mask in zip(sequences, ignore_masks)
    ]

    # Count the number of appearances of each k-mer and store the
    # result in the pointer array, that is now used as count array
    for kmers, mask in zip(kmers_list, masks):
        table._count_masked_kmers(kmers, mask)

    # Transform count array into pointer array with C-array of
    # appropriate size
    _init_c_arrays(table._ptr_array, EntrySize.NO_BUCKETS)

    # Fill the C-arrays with the k-mer positions
    for kmers, ref_id, mask in zip(kmers_list, ref_ids, masks):
        table._add_kmers(kmers, ref_id, mask)

    return table
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
@staticmethod
def from_kmers(kmer_alphabet, kmers, ref_ids=None, masks=None):
    """
    from_kmers(kmer_alphabet, kmers, ref_ids=None, masks=None)

    Create a :class:`KmerTable` by storing the positions of all
    input *k-mers*.

    Parameters
    ----------
    kmer_alphabet : KmerAlphabet
        The :class:`KmerAlphabet` to use for the new table.
        Should be the same alphabet that was used to calculate the
        input *kmers*.
    kmers : sized iterable object of (ndarray, dtype=np.int64), length=m
        List where each array contains the *k-mer* codes from a
        sequence.
        For each array the index of the *k-mer* code in the array
        is stored in the table as sequence position.
    ref_ids : sized iterable object of int, length=m, optional
        The reference IDs for the sequences.
        These are used to identify the corresponding sequence for a
        *k-mer* match.
        By default the IDs are counted from *0* to *m*.
    masks : sized iterable object of (ndarray, dtype=bool), length=m, optional
        A *k-mer* code at a position, where the corresponding mask
        is false, is not added to the table.
        A mask does not need to be contiguous in memory, e.g. it may
        be a strided view of another array.
        By default, all positions are added.

    Returns
    -------
    table : KmerTable
        The newly created table.

    See Also
    --------
    from_sequences : The same functionality based on undecomposed sequences

    Examples
    --------

    >>> sequences = [ProteinSequence("BIQTITE"), ProteinSequence("NIQBITE")]
    >>> kmer_alphabet = KmerAlphabet(ProteinSequence.alphabet, 3)
    >>> kmer_codes = [kmer_alphabet.create_kmers(s.code) for s in sequences]
    >>> for code in kmer_codes:
    ...     print(code)
    [11701  4360  7879  9400  4419]
    [ 6517  4364  7975 11704  4419]
    >>> table = KmerTable.from_kmers(
    ...     kmer_alphabet, kmer_codes
    ... )
    >>> print(table)
    IQT: (0, 1)
    IQB: (1, 1)
    ITE: (0, 4), (1, 4)
    NIQ: (1, 0)
    QTI: (0, 2)
    QBI: (1, 2)
    TIT: (0, 3)
    BIQ: (0, 0)
    BIT: (1, 3)
    """
    _check_kmer_alphabet(kmer_alphabet)
    _check_multiple_kmer_bounds(kmers, kmer_alphabet)

    ref_ids = _compute_ref_ids(ref_ids, kmers)
    masks = _compute_masks(masks, kmers)

    table = KmerTable(kmer_alphabet)

    masks = [
        # A missing mask means that every position is added
        np.ones(len(arr), dtype=np.uint8) if mask is None
        # Convert boolean mask into uint8 array to be able
        # to handle it as memory view.
        # 'np.frombuffer()' only accepts contiguous buffers,
        # hence a non-contiguous mask (e.g. a strided view)
        # is copied into a contiguous array beforehand
        else np.frombuffer(
            np.ascontiguousarray(mask, dtype=bool), dtype=np.uint8
        )
        for mask, arr in zip(masks, kmers)
    ]

    # Count the unmasked k-mers of every input array
    for arr, mask in zip(kmers, masks):
        table._count_masked_kmers(arr, mask)

    _init_c_arrays(table._ptr_array, EntrySize.NO_BUCKETS)

    # Store the reference ID and position of every unmasked k-mer
    for arr, ref_id, mask in zip(kmers, ref_ids, masks):
        table._add_kmers(arr, ref_id, mask)

    return table
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
@staticmethod
def from_kmer_selection(kmer_alphabet, positions, kmers, ref_ids=None):
    """
    from_kmer_selection(kmer_alphabet, positions, kmers, ref_ids=None)

    Create a :class:`KmerTable` by storing the positions of a
    filtered subset of input *k-mers*.

    This can be used to reduce the number of stored *k-mers* using
    a *k-mer* subset selector such as :class:`MinimizerSelector`.

    Parameters
    ----------
    kmer_alphabet : KmerAlphabet
        The :class:`KmerAlphabet` to use for the new table.
        Should be the same alphabet that was used to calculate the
        input *kmers*.
    positions : sized iterable object of (ndarray, shape=(n,), dtype=uint32), length=m
        List where each array contains the sequence positions of
        the filtered subset of *k-mers* given in `kmers`.
        The list may contain multiple elements for multiple
        sequences.
    kmers : sized iterable object of (ndarray, shape=(n,), dtype=np.int64), length=m
        List where each array contains the filtered subset of
        *k-mer* codes from a sequence.
        The sequence position stored for each *k-mer* code is taken
        from the corresponding element in `positions`.
        The list may contain multiple elements for multiple
        sequences.
    ref_ids : sized iterable object of int, length=m, optional
        The reference IDs for the sequences.
        These are used to identify the corresponding sequence for a
        *k-mer* match.
        By default the IDs are counted from *0* to *m*.

    Returns
    -------
    table : KmerTable
        The newly created table.

    Examples
    --------

    Reduce the size of sequence data in the table using minimizers:

    >>> sequence1 = ProteinSequence("THIS*IS*A*SEQVENCE")
    >>> kmer_alph = KmerAlphabet(sequence1.alphabet, k=3)
    >>> minimizer = MinimizerSelector(kmer_alph, window=4)
    >>> minimizer_pos, minimizers = minimizer.select(sequence1)
    >>> kmer_table = KmerTable.from_kmer_selection(
    ...     kmer_alph, [minimizer_pos], [minimizers]
    ... )

    Use the same :class:`MinimizerSelector` to select the minimizers
    from the query sequence and match them against the table.
    Although the amount of *k-mers* is reduced, matching is still
    guaranteed to work, if the two sequences share identity in the
    given window:

    >>> sequence2 = ProteinSequence("ANQTHER*SEQVENCE")
    >>> minimizer_pos, minimizers = minimizer.select(sequence2)
    >>> matches = kmer_table.match_kmer_selection(minimizer_pos, minimizers)
    >>> print(matches)
    [[ 9  0 11]
     [12  0 14]]
    >>> for query_pos, _, db_pos in matches:
    ...     print(sequence1)
    ...     print(" " * (db_pos-1) + "^" * kmer_table.k)
    ...     print(sequence2)
    ...     print(" " * (query_pos-1) + "^" * kmer_table.k)
    ...     print()
    THIS*IS*A*SEQVENCE
              ^^^
    ANQTHER*SEQVENCE
            ^^^
    <BLANKLINE>
    THIS*IS*A*SEQVENCE
                 ^^^
    ANQTHER*SEQVENCE
               ^^^
    <BLANKLINE>
    """
    # Validate the input first
    _check_kmer_alphabet(kmer_alphabet)
    _check_multiple_kmer_bounds(kmers, kmer_alphabet)
    _check_position_shape(positions, kmers)

    ref_ids = _compute_ref_ids(ref_ids, kmers)
    table = KmerTable(kmer_alphabet)

    # First pass: register the selected k-mers of each sequence
    # in the (not yet initialized) pointer array
    for selected_kmers in kmers:
        table._count_kmers(selected_kmers)

    _init_c_arrays(table._ptr_array, EntrySize.NO_BUCKETS)

    # Second pass: store reference ID and position for each
    # selected k-mer
    for selected_pos, selected_kmers, ref_id in zip(positions, kmers, ref_ids):
        pos_as_uint32 = selected_pos.astype(np.uint32, copy=False)
        table._add_kmer_selection(pos_as_uint32, selected_kmers, ref_id)

    return table
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
@staticmethod
def from_tables(tables):
    """
    from_tables(tables)

    Create a :class:`KmerTable` by merging the *k-mer* positions
    from existing `tables`.

    Parameters
    ----------
    tables : iterable object of KmerTable
        The tables to be merged.
        All tables must have equal :class:`KmerAlphabet` objects,
        i.e. the same *k* and equal base alphabets.
        Any iterable is accepted, including one-shot iterables
        such as generators.

    Returns
    -------
    table : KmerTable
        The newly created table.

    Examples
    --------

    >>> table1 = KmerTable.from_sequences(
    ...     2, [NucleotideSequence("TTATA")], ref_ids=[100]
    ... )
    >>> table2 = KmerTable.from_sequences(
    ...     2, [NucleotideSequence("CTAG")], ref_ids=[101]
    ... )
    >>> merged_table = KmerTable.from_tables([table1, table2])
    >>> print(merged_table)
    AG: (101, 2)
    AT: (100, 2)
    CT: (101, 0)
    TA: (100, 1), (100, 3), (101, 1)
    TT: (100, 0)
    """
    cdef KmerTable table

    # The input is indexed and traversed multiple times below,
    # so it is materialized once to support arbitrary iterables
    tables = list(tables)

    _check_same_kmer_alphabet(tables)

    merged_table = KmerTable(tables[0].kmer_alphabet)

    # Sum the number of appearances of each k-mer from the tables
    for table in tables:
        # 'merged_table._ptr_array' is repurposed as count array,
        # This can be safely done, because in this step the pointers
        # are not initialized yet.
        # This may save a lot of memory because no extra array is
        # required to count the number of positions for each *k-mer*
        _count_table_entries(
            merged_table._ptr_array, table._ptr_array,
            EntrySize.NO_BUCKETS
        )

    # Turn the counts into pointers to C-arrays
    _init_c_arrays(merged_table._ptr_array, EntrySize.NO_BUCKETS)

    # Copy the entries of each input table into the merged table
    for table in tables:
        _append_entries(merged_table._ptr_array, table._ptr_array)

    return merged_table
|
|
642
|
+
|
|
643
|
+
|
|
644
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
@staticmethod
def from_positions(kmer_alphabet, dict kmer_positions):
    """
    from_positions(kmer_alphabet, kmer_positions)

    Create a :class:`KmerTable` from *k-mer* reference IDs and
    positions.
    This constructor is especially useful for restoring a table
    from previously serialized data.

    Parameters
    ----------
    kmer_alphabet : KmerAlphabet
        The :class:`KmerAlphabet` to use for the new table
    kmer_positions : dict of (int -> ndarray, shape=(n,2), dtype=int)
        A dictionary representing the *k-mer* reference IDs and
        positions to be stored in the newly created table.
        It maps a *k-mer* code to a :class:`ndarray`.
        Each row of such an array contains the reference ID in the
        first column and the sequence position in the second
        column.
        Arrays without any row are skipped.
        To achieve a high performance the data type ``uint32``
        is preferred for the arrays.

    Returns
    -------
    table : KmerTable
        The newly created table.

    Raises
    ------
    AlphabetError
        If a key of `kmer_positions` is not a valid *k-mer* code
        of `kmer_alphabet`.
    IndexError
        If the second dimension of a position array does not have
        the length 2.
    MemoryError
        If the memory for the positions of a *k-mer* cannot be
        allocated.

    Examples
    --------

    >>> sequence = ProteinSequence("BIQTITE")
    >>> table = KmerTable.from_sequences(3, [sequence], ref_ids=[100])
    >>> print(table)
    IQT: (100, 1)
    ITE: (100, 4)
    QTI: (100, 2)
    TIT: (100, 3)
    BIQ: (100, 0)
    >>> data = {kmer: table[kmer] for kmer in table}
    >>> print(data)
    {4360: array([[100,   1]], dtype=uint32), 4419: array([[100,   4]], dtype=uint32), 7879: array([[100,   2]], dtype=uint32), 9400: array([[100,   3]], dtype=uint32), 11701: array([[100,   0]], dtype=uint32)}
    >>> restored_table = KmerTable.from_positions(table.kmer_alphabet, data)
    >>> print(restored_table)
    IQT: (100, 1)
    ITE: (100, 4)
    QTI: (100, 2)
    TIT: (100, 3)
    BIQ: (100, 0)
    """
    cdef int64 length
    cdef uint32* kmer_ptr
    cdef int64 i
    cdef int64 kmer
    cdef uint32[:,:] positions

    table = KmerTable(kmer_alphabet)

    cdef ptr[:] ptr_array = table._ptr_array
    cdef int64 alph_length = len(kmer_alphabet)

    for kmer, position_array in kmer_positions.items():
        # Bounds checking is disabled for this function,
        # hence the k-mer code is validated explicitly before it
        # is used as index into the pointer array
        if kmer < 0 or kmer >= alph_length:
            raise AlphabetError(
                f"k-mer code {kmer} does not represent a valid k-mer"
            )
        positions = position_array.astype(np.uint32, copy=False)
        if positions.shape[0] == 0:
            # No position to add -> jump to the next k-mer
            continue
        if positions.shape[1] != 2:
            raise IndexError(
                f"Each entry in position array has {positions.shape[1]} "
                f"values, but 2 were expected"
            )

        # Two uint32 values (reference ID and position) per entry
        # plus two uint32 values, that together hold the array
        # length as a single int64
        length = 2 * positions.shape[0] + 2
        kmer_ptr = <uint32*>malloc(length * sizeof(uint32))
        if not kmer_ptr:
            raise MemoryError
        ptr_array[kmer] = <ptr>kmer_ptr
        # The array length is stored at the start of the C-array
        (<int64*> kmer_ptr)[0] = length
        # Jump behind the length value
        kmer_ptr += 2

        # Add entries
        for i in range(positions.shape[0]):
            kmer_ptr[0] = positions[i,0]
            kmer_ptr += 1
            kmer_ptr[0] = positions[i,1]
            kmer_ptr += 1

    return table
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def match_table(self, KmerTable table, similarity_rule=None):
    """
    match_table(table, similarity_rule=None)

    Find matches between the *k-mers* in this table with the
    *k-mers* in another `table`.

    This means that for each *k-mer* the cartesian product between
    the positions in both tables is added to the matches.

    Parameters
    ----------
    table : KmerTable
        The table to be matched.
        Both tables must have equal :class:`KmerAlphabet` objects,
        i.e. the same *k* and equal base alphabets.
    similarity_rule : SimilarityRule, optional
        If this parameter is given, not only exact *k-mer* matches
        are considered, but also similar ones according to the given
        :class:`SimilarityRule`.

    Returns
    -------
    matches : ndarray, shape=(n,4), dtype=np.int64
        The *k-mer* matches.
        Each row contains one match. Each match has the following
        columns:

            0. The reference ID of the matched sequence in the other
               table
            1. The sequence position of the matched sequence in the
               other table
            2. The reference ID of the matched sequence in this
               table
            3. The sequence position of the matched sequence in this
               table

    Notes
    -----

    There is no guaranteed order of the reference IDs or
    sequence positions in the returned matches.

    Examples
    --------

    >>> sequence1 = ProteinSequence("BIQTITE")
    >>> table1 = KmerTable.from_sequences(3, [sequence1], ref_ids=[100])
    >>> print(table1)
    IQT: (100, 1)
    ITE: (100, 4)
    QTI: (100, 2)
    TIT: (100, 3)
    BIQ: (100, 0)
    >>> sequence2 = ProteinSequence("TITANITE")
    >>> table2 = KmerTable.from_sequences(3, [sequence2], ref_ids=[101])
    >>> print(table2)
    ANI: (101, 3)
    ITA: (101, 1)
    ITE: (101, 5)
    NIT: (101, 4)
    TAN: (101, 2)
    TIT: (101, 0)
    >>> print(table1.match_table(table2))
    [[101   5 100   4]
     [101   0 100   3]]
    """
    cdef int INIT_SIZE = 1

    cdef int64 kmer, sim_kmer
    cdef int64 match_i
    cdef int64 i, j, l
    cdef int64 self_length, other_length
    cdef uint32* self_kmer_ptr
    cdef uint32* other_kmer_ptr

    # This variable will only be used if a similarity rule exists
    cdef int64[:] similar_kmers

    # Store in new variables
    # to disable repetitive initialization checks
    cdef ptr[:] self_ptr_array = self._ptr_array
    cdef ptr[:] other_ptr_array = table._ptr_array

    _check_same_kmer_alphabet((self, table))

    # This array will store the match positions
    # As the final number of matches is unknown, a list-like
    # approach is used:
    # The array is initialized with a relatively small initial size
    # and every time the limit would be exceeded its size is doubled
    cdef int64[:,:] matches = np.empty((INIT_SIZE, 4), dtype=np.int64)
    match_i = 0
    if similarity_rule is None:
        for kmer in range(self_ptr_array.shape[0]):
            self_kmer_ptr = <uint32*>self_ptr_array[kmer]
            other_kmer_ptr = <uint32*>other_ptr_array[kmer]
            # For each k-mer create the cartesian product
            if self_kmer_ptr != NULL and other_kmer_ptr != NULL:
                # This kmer exists for both tables
                # The array length is read as int64 from the start
                # of each C-array
                other_length = (<int64*>other_kmer_ptr)[0]
                self_length = (<int64*>self_kmer_ptr )[0]
                # The entries start at index 2, behind the length
                # value, and each entry occupies two uint32 values
                # (reference ID and sequence position)
                for i in range(2, other_length, 2):
                    for j in range(2, self_length, 2):
                        if match_i >= matches.shape[0]:
                            # The 'matches' array is full
                            # -> double its size
                            matches = expand(np.asarray(matches))
                        matches[match_i, 0] = other_kmer_ptr[i]
                        matches[match_i, 1] = other_kmer_ptr[i+1]
                        matches[match_i, 2] = self_kmer_ptr[j]
                        matches[match_i, 3] = self_kmer_ptr[j+1]
                        match_i += 1

    else:
        for kmer in range(self_ptr_array.shape[0]):
            other_kmer_ptr = <uint32*>other_ptr_array[kmer]
            if other_kmer_ptr != NULL:
                # If a similarity rule exists, iterate not only over
                # the exact k-mer, but over all k-mers similar to
                # the current k-mer
                similar_kmers = similarity_rule.similar_kmers(
                    self._kmer_alph, kmer
                )
                for l in range(similar_kmers.shape[0]):
                    sim_kmer = similar_kmers[l]
                    # Actual copy of the code from the other
                    # if-branch:
                    # It cannot be put properly in a cdef-function,
                    # as every function call would perform reference
                    # count changes and would decrease performance
                    # NOTE(review): 'sim_kmer' is used as index
                    # without bounds check - this assumes the
                    # similarity rule only returns valid k-mer codes
                    self_kmer_ptr = <uint32*>self_ptr_array[sim_kmer]
                    if self_kmer_ptr != NULL:
                        other_length = (<int64*>other_kmer_ptr)[0]
                        self_length = (<int64*>self_kmer_ptr )[0]
                        for i in range(2, other_length, 2):
                            for j in range(2, self_length, 2):
                                if match_i >= matches.shape[0]:
                                    matches = expand(np.asarray(matches))
                                matches[match_i, 0] = other_kmer_ptr[i]
                                matches[match_i, 1] = other_kmer_ptr[i+1]
                                matches[match_i, 2] = self_kmer_ptr[j]
                                matches[match_i, 3] = self_kmer_ptr[j+1]
                                match_i += 1

    # Trim to correct size and return
    return np.asarray(matches[:match_i])
|
|
889
|
+
|
|
890
|
+
|
|
891
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def match(self, sequence, similarity_rule=None, ignore_mask=None):
    """
    match(sequence, similarity_rule=None, ignore_mask=None)

    Find matches between the *k-mers* in this table with all
    overlapping *k-mers* in the given `sequence`.
    *k* is determined by the table.

    Parameters
    ----------
    sequence : Sequence
        The sequence to be matched.
        The table's base alphabet must extend the alphabet of the
        sequence.
    similarity_rule : SimilarityRule, optional
        If this parameter is given, not only exact *k-mer* matches
        are considered, but also similar ones according to the given
        :class:`SimilarityRule`.
    ignore_mask : ndarray, dtype=bool, optional
        Boolean mask of sequence positions to ignore.
        *k-mers* that involve these sequence positions are not
        matched against the table.
        This is used e.g. to skip repeat regions.
        By default, no sequence position is ignored.

    Returns
    -------
    matches : ndarray, shape=(n,3), dtype=np.int64
        The *k-mer* matches.
        Each row contains one match. Each match has the following
        columns:

            0. The sequence position in the input sequence
            1. The reference ID of the matched sequence in the table
            2. The sequence position of the matched sequence in the
               table

    Raises
    ------
    ValueError
        If the sequence code is shorter than *k* or if the base
        alphabet of the table does not extend the alphabet of the
        sequence.

    Notes
    -----

    The matches are ordered by the first column.

    Examples
    --------

    >>> sequence1 = ProteinSequence("BIQTITE")
    >>> table = KmerTable.from_sequences(3, [sequence1], ref_ids=[100])
    >>> print(table)
    IQT: (100, 1)
    ITE: (100, 4)
    QTI: (100, 2)
    TIT: (100, 3)
    BIQ: (100, 0)
    >>> sequence2 = ProteinSequence("TITANITE")
    >>> print(table.match(sequence2))
    [[  0 100   3]
     [  5 100   4]]
    """
    cdef int INIT_SIZE = 1

    cdef int64 kmer, sim_kmer
    cdef int64 match_i
    cdef int64 i, j, l
    cdef int64 length
    cdef uint32* kmer_ptr

    # This variable will only be used if a similarity rule exists
    cdef int64[:] similar_kmers

    # Store in new variable
    # to disable repetitive initialization checks
    cdef ptr[:] ptr_array = self._ptr_array

    if len(sequence.code) < self._k:
        raise ValueError("Sequence code is shorter than k")
    # The alphabets do not need to be identical:
    # It is sufficient that the table's base alphabet extends the
    # alphabet of the sequence
    if not self._kmer_alph.base_alphabet.extends(sequence.alphabet):
        raise ValueError(
            "The alphabet used for the k-mer index table is not equal to "
            "the alphabet of the sequence"
        )

    cdef int64[:] kmers = self._kmer_alph.create_kmers(sequence.code)
    # Only k-mers whose mask value is nonzero are matched below
    cdef uint8[:] kmer_mask = _prepare_mask(
        self._kmer_alph, ignore_mask, len(sequence.code)
    )

    # This array will store the match positions
    # As the final number of matches is unknown, a list-like
    # approach is used:
    # The array is initialized with a relatively small initial size
    # and every time the limit would be exceeded its size is doubled
    cdef int64[:,:] matches = np.empty((INIT_SIZE, 3), dtype=np.int64)
    match_i = 0
    if similarity_rule is None:
        for i in range(kmers.shape[0]):
            if kmer_mask[i]:
                kmer = kmers[i]
                kmer_ptr = <uint32*>ptr_array[kmer]
                if kmer_ptr != NULL:
                    # There is at least one entry for the k-mer
                    # The array length is read as int64 from the
                    # start of the C-array
                    length = (<int64*>kmer_ptr)[0]
                    # The entries start at index 2, behind the
                    # length value, and each entry occupies two
                    # uint32 values (reference ID and position)
                    for j in range(2, length, 2):
                        if match_i >= matches.shape[0]:
                            # The 'matches' array is full
                            # -> double its size
                            matches = expand(np.asarray(matches))
                        matches[match_i, 0] = i
                        matches[match_i, 1] = kmer_ptr[j]
                        matches[match_i, 2] = kmer_ptr[j+1]
                        match_i += 1

    else:
        for i in range(kmers.shape[0]):
            if kmer_mask[i]:
                kmer = kmers[i]
                # If a similarity rule exists, iterate not only over
                # the exact k-mer, but over all k-mers similar to
                # the current k-mer
                similar_kmers = similarity_rule.similar_kmers(
                    self._kmer_alph, kmer
                )
                for l in range(similar_kmers.shape[0]):
                    sim_kmer = similar_kmers[l]
                    # Actual copy of the code from the other
                    # if-branch:
                    # It cannot be put properly in a cdef-function,
                    # as every function call would perform reference
                    # count changes and would decrease performance
                    # NOTE(review): 'sim_kmer' is used as index
                    # without bounds check - this assumes the
                    # similarity rule only returns valid k-mer codes
                    kmer_ptr = <uint32*>ptr_array[sim_kmer]
                    if kmer_ptr != NULL:
                        # There is at least one entry for the k-mer
                        length = (<int64*>kmer_ptr)[0]
                        for j in range(2, length, 2):
                            if match_i >= matches.shape[0]:
                                # The 'matches' array is full
                                # -> double its size
                                matches = expand(np.asarray(matches))
                            matches[match_i, 0] = i
                            matches[match_i, 1] = kmer_ptr[j]
                            matches[match_i, 2] = kmer_ptr[j+1]
                            match_i += 1

    # Trim to correct size and return
    return np.asarray(matches[:match_i])
|
|
1037
|
+
|
|
1038
|
+
|
|
1039
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def match_kmer_selection(self, positions, kmers):
    """
    match_kmer_selection(positions, kmers)

    Find matches between the *k-mers* in this table with the given
    *k-mer* selection.

    It is intended to use this method to find matches in a table
    that was created using :meth:`from_kmer_selection()`.

    Parameters
    ----------
    positions : ndarray, shape=(n,), dtype=uint32
        Sequence positions of the filtered subset of *k-mers* given
        in `kmers`.
    kmers : ndarray, shape=(n,), dtype=np.int64
        Filtered subset of *k-mer* codes to match against.

    Returns
    -------
    matches : ndarray, shape=(n,3), dtype=np.int64
        The *k-mer* matches.
        Each row contains one *k-mer* match.
        Each match has the following columns:

            0. The sequence position of the input *k-mer*, taken
               from `positions`
            1. The reference ID of the matched sequence in the table
            2. The sequence position of the matched *k-mer* in the
               table

    Raises
    ------
    IndexError
        If the number of given `positions` is not equal to the
        number of given `kmers`.

    Examples
    --------

    Reduce the size of sequence data in the table using minimizers:

    >>> sequence1 = ProteinSequence("THIS*IS*A*SEQVENCE")
    >>> kmer_alph = KmerAlphabet(sequence1.alphabet, k=3)
    >>> minimizer = MinimizerSelector(kmer_alph, window=4)
    >>> minimizer_pos, minimizers = minimizer.select(sequence1)
    >>> kmer_table = KmerTable.from_kmer_selection(
    ...     kmer_alph, [minimizer_pos], [minimizers]
    ... )

    Use the same :class:`MinimizerSelector` to select the minimizers
    from the query sequence and match them against the table.
    Although the amount of *k-mers* is reduced, matching is still
    guaranteed to work, if the two sequences share identity in the
    given window:

    >>> sequence2 = ProteinSequence("ANQTHER*SEQVENCE")
    >>> minimizer_pos, minimizers = minimizer.select(sequence2)
    >>> matches = kmer_table.match_kmer_selection(minimizer_pos, minimizers)
    >>> print(matches)
    [[ 9  0 11]
     [12  0 14]]
    >>> for query_pos, _, db_pos in matches:
    ...     print(sequence1)
    ...     print(" " * (db_pos-1) + "^" * kmer_table.k)
    ...     print(sequence2)
    ...     print(" " * (query_pos-1) + "^" * kmer_table.k)
    ...     print()
    THIS*IS*A*SEQVENCE
              ^^^
    ANQTHER*SEQVENCE
            ^^^
    <BLANKLINE>
    THIS*IS*A*SEQVENCE
                 ^^^
    ANQTHER*SEQVENCE
               ^^^
    <BLANKLINE>
    """
    cdef int INIT_SIZE = 1

    cdef int64 i, j

    cdef int64 kmer
    cdef int64 match_i
    cdef int64 seq_pos
    cdef int64 length
    cdef uint32* kmer_ptr

    # Store in new variable
    # to disable repetitive initialization checks
    cdef ptr[:] ptr_array = self._ptr_array

    # Bounds checking is disabled for this function,
    # hence the k-mer codes are validated before they are used
    # as index into the pointer array
    _check_kmer_bounds(kmers, self._kmer_alph)
    if positions.shape[0] != kmers.shape[0]:
        raise IndexError(
            f"{positions.shape[0]} positions were given "
            f"for {kmers.shape[0]} k-mers"
        )

    cdef uint32[:] pos_array = positions.astype(np.uint32, copy=False)
    cdef int64[:] kmer_array = kmers.astype(np.int64, copy=False)

    # This array will store the match positions
    # As the final number of matches is unknown, a list-like
    # approach is used:
    # The array is initialized with a relatively small initial size
    # and every time the limit would be exceeded its size is doubled
    cdef int64[:,:] matches = np.empty((INIT_SIZE, 3), dtype=np.int64)
    match_i = 0
    for i in range(kmer_array.shape[0]):
        kmer = kmer_array[i]
        seq_pos = pos_array[i]
        kmer_ptr = <uint32*>ptr_array[kmer]
        if kmer_ptr != NULL:
            # There is at least one entry for the k-mer
            # The array length is read as int64 from the start of
            # the C-array
            length = (<int64*>kmer_ptr)[0]
            # The entries start at index 2, behind the length
            # value, and each entry occupies two uint32 values
            # (reference ID and sequence position)
            for j in range(2, length, 2):
                if match_i >= matches.shape[0]:
                    # The 'matches' array is full
                    # -> double its size
                    matches = expand(np.asarray(matches))
                matches[match_i, 0] = seq_pos
                matches[match_i, 1] = kmer_ptr[j]
                matches[match_i, 2] = kmer_ptr[j+1]
                match_i += 1

    # Trim to correct size and return
    return np.asarray(matches[:match_i])
|
|
1164
|
+
|
|
1165
|
+
|
|
1166
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def count(self, kmers=None):
    """
    count(kmers=None)

    Determine how often each *k-mer* occurs in the table.

    Parameters
    ----------
    kmers : ndarray, dtype=np.int64, optional
        The *k-mer* codes the count is returned for.
        If omitted, the counts of all *k-mers* are returned in
        ascending code order, i.e.
        ``count_for_kmer = counts[kmer]``.

    Returns
    -------
    counts : ndarray, dtype=np.int64
        The number of stored positions for each requested *k-mer*.

    Examples
    --------
    >>> table = KmerTable.from_sequences(
    ...     k = 2,
    ...     sequences = [NucleotideSequence("TTATA"), NucleotideSequence("CTAG")],
    ...     ref_ids = [0, 1]
    ... )
    >>> print(table)
    AG: (1, 2)
    AT: (0, 2)
    CT: (1, 0)
    TA: (0, 1), (0, 3), (1, 1)
    TT: (0, 0)

    Count two selected *k-mers*:

    >>> print(table.count(table.kmer_alphabet.encode_multiple(["TA", "AG"])))
    [3 1]

    Count all *k-mers*:

    >>> counts = table.count()
    >>> print(counts)
    [0 0 1 1 0 0 0 1 0 0 0 0 3 0 0 1]
    >>> for kmer, count in zip(table.kmer_alphabet.get_symbols(), counts):
    ...     print(kmer, count)
    AA 0
    AC 0
    AG 1
    AT 1
    CA 0
    CC 0
    CG 0
    CT 1
    GA 0
    GC 0
    GG 0
    GT 0
    TA 3
    TC 0
    TG 0
    TT 1
    """
    cdef int64 idx
    cdef int64 code
    cdef int64 n_units
    cdef int64* header
    cdef ptr[:] table = self._ptr_array
    cdef int64[:] codes
    cdef int64[:] result

    if kmers is not None:
        # Only the requested k-mers are counted
        _check_kmer_bounds(kmers, self._kmer_alph)

        codes = kmers.astype(np.int64, copy=False)
        result = np.zeros(codes.shape[0], dtype=np.int64)
        for idx in range(codes.shape[0]):
            code = codes[idx]
            header = <int64*> (table[code])
            if header == NULL:
                # No entry for this k-mer -> count stays 0
                continue
            n_units = header[0]
            result[idx] = (n_units - 2) // 2
        return np.asarray(result)

    # No selection given -> count every k-mer of the alphabet
    result = np.zeros(table.shape[0], dtype=np.int64)
    for code in range(table.shape[0]):
        header = <int64*> (table[code])
        if header == NULL:
            continue
        # The first 64 bits of a C-array hold its length,
        # which is measured in uint32 units:
        # length = 2 * count + 2 -> rearrange formula
        n_units = header[0]
        result[code] = (n_units - 2) // 2
    return np.asarray(result)
|
|
1262
|
+
|
|
1263
|
+
|
|
1264
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def get_kmers(self):
    """
    Collect the codes of all *k-mers* for which the table holds at
    least one position.

    Returns
    -------
    kmers : ndarray, shape=(n,), dtype=np.int64
        The *k-mer* codes in ascending order.

    Examples
    --------

    >>> sequence = ProteinSequence("BIQTITE")
    >>> table = KmerTable.from_sequences(3, [sequence], ref_ids=[100])
    >>> print(table)
    IQT: (100, 1)
    ITE: (100, 4)
    QTI: (100, 2)
    TIT: (100, 3)
    BIQ: (100, 0)
    >>> kmer_codes = table.get_kmers()
    >>> print(kmer_codes)
    [ 4360  4419  7879  9400 11701]
    >>> for code in kmer_codes:
    ...     print(table[code])
    [[100   1]]
    [[100   4]]
    [[100   2]]
    [[100   3]]
    [[100   0]]
    """
    cdef int64 code
    cdef int64 n_found = 0
    cdef ptr[:] table = self._ptr_array

    # Worst case: every possible k-mer is present,
    # hence reserve one slot per k-mer code
    cdef int64[:] found = np.zeros(table.shape[0], dtype=np.int64)

    for code in range(table.shape[0]):
        if <uint32*> (table[code]) == NULL:
            # NULL pointer -> the k-mer has no entry
            continue
        found[n_found] = code
        n_found += 1

    # Cut off the unused slots at the end
    return np.asarray(found)[:n_found]
|
|
1313
|
+
|
|
1314
|
+
|
|
1315
|
+
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def __getitem__(self, int64 kmer):
    # Return all (reference ID, sequence position) pairs stored for
    # the given k-mer code as (n, 2) shaped uint32 array.
    # An AlphabetError is raised, if the code is not a valid index
    # into the pointer array.
    cdef int64 i, j
    cdef int64 length
    cdef uint32* kmer_ptr
    cdef uint32[:,:] positions

    # Bounds checking and wraparound are disabled for this method:
    # Hence, negative codes need to be rejected explicitly as well,
    # as they would otherwise read memory in front of the pointer
    # array
    if kmer < 0 or kmer >= len(self):
        raise AlphabetError(
            f"k-mer code {kmer} is out of bounds "
            f"for the given KmerAlphabet"
        )

    kmer_ptr = <uint32*>self._ptr_array[kmer]
    if kmer_ptr == NULL:
        # No entry for this k-mer -> empty position array
        return np.zeros((0, 2), dtype=np.uint32)
    else:
        # The leading int64 stores the C-array length in uint32 units
        length = (<int64*>kmer_ptr)[0]
        positions = np.empty(((length - 2) // 2, 2), dtype=np.uint32)
        i = 0
        # The entries start behind the length field (2 x uint32)
        # and each entry comprises two uint32 values
        for j in range(2, length, 2):
            positions[i,0] = kmer_ptr[j]
            positions[i,1] = kmer_ptr[j+1]
            i += 1
        return np.asarray(positions)
|
|
1342
|
+
|
|
1343
|
+
|
|
1344
|
+
def __len__(self):
    # The table length is the length of the underlying k-mer
    # alphabet, not the number of k-mers that actually have entries
    n_codes = len(self._kmer_alph)
    return n_codes
|
|
1346
|
+
|
|
1347
|
+
|
|
1348
|
+
def __contains__(self, int64 kmer):
    # A k-mer is part of the table, if at least one entry exists
    # for it, i.e. its pointer differs from NULL
    cdef ptr address = self._ptr_array[kmer]
    return address != 0
|
|
1352
|
+
|
|
1353
|
+
|
|
1354
|
+
def __iter__(self):
    # Iterate over the codes of all k-mers that have an entry;
    # each NumPy scalar is converted via its item() method
    present_codes = self.get_kmers()
    for code in present_codes:
        yield code.item()
|
|
1357
|
+
|
|
1358
|
+
|
|
1359
|
+
def __reversed__(self):
    # Reverse iteration over the array returned by get_kmers();
    # unlike in __iter__(), the elements are not converted
    # via item()
    present_codes = self.get_kmers()
    return reversed(present_codes)
|
|
1361
|
+
|
|
1362
|
+
|
|
1363
|
+
def __eq__(self, item):
    # Two tables are equal, if they share base alphabet and k
    # and if their C-arrays compare equal

    # Identity shortcut
    if item is self:
        return True
    # Only exact KmerTable instances are comparable
    if type(item) != KmerTable:
        return False

    # Static typing is required to reach the statically typed fields
    cdef KmerTable other = item
    if (
        self._kmer_alph.base_alphabet != other._kmer_alph.base_alphabet
        or self._k != other._k
    ):
        return False
    return _equal_c_arrays(self._ptr_array, other._ptr_array)
|
|
1376
|
+
|
|
1377
|
+
|
|
1378
|
+
def __str__(self):
    # The string representation is created by the
    # module-level helper
    text = _to_string(self)
    return text
|
|
1380
|
+
|
|
1381
|
+
|
|
1382
|
+
def __getnewargs_ex__(self):
    # Pickle support: the k-mer alphabet is handed over as the only
    # positional constructor argument, no keyword arguments are used
    positional = (self._kmer_alph,)
    keywords = {}
    return positional, keywords
|
|
1384
|
+
|
|
1385
|
+
|
|
1386
|
+
def __getstate__(self):
    # Pickle support: the state is whatever the helper creates
    # from the pointer array
    state = _pickle_c_arrays(self._ptr_array)
    return state
|
|
1388
|
+
|
|
1389
|
+
|
|
1390
|
+
def __setstate__(self, state):
    # Pickle support: counterpart of __getstate__(), the helper
    # restores the pointer array from the pickled state
    _unpickle_c_arrays(self._ptr_array, state)
|
|
1392
|
+
|
|
1393
|
+
|
|
1394
|
+
def __dealloc__(self):
    # Without an initialized pointer array there is nothing to free
    if not self._is_initialized():
        return
    _deallocate_ptrs(self._ptr_array)
|
|
1397
|
+
|
|
1398
|
+
|
|
1399
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def _count_kmers(self, int64[:] kmers):
    """
    Use the pointer array as count array: For each *k-mer* code in
    `kmers` the value at the corresponding array position is
    incremented.

    This is safe at this stage, as the pointers have not been
    initialized yet.
    Reusing the array avoids the allocation of a separate array
    for counting the number of positions for each *k-mer*,
    which may save a lot of memory.
    """
    cdef uint32 idx
    cdef int64 code

    cdef ptr[:] counters = self._ptr_array

    for idx in range(kmers.shape[0]):
        code = kmers[idx]
        counters[code] += 1
|
|
1420
|
+
|
|
1421
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def _count_masked_kmers(self, int64[:] kmers, uint8[:] mask):
    """
    Use the pointer array as count array: For each *k-mer* code in
    `kmers`, whose `mask` element is nonzero, the value at the
    corresponding array position is incremented.
    """
    cdef uint32 idx
    cdef int64 code

    cdef ptr[:] counters = self._ptr_array

    for idx in range(kmers.shape[0]):
        if not mask[idx]:
            # Masked out -> this k-mer is not counted
            continue
        code = kmers[idx]
        counters[code] += 1
|
|
1436
|
+
|
|
1437
|
+
|
|
1438
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def _add_kmers(self, int64[:] kmers, uint32 ref_id, uint8[:] mask):
    """
    Append the reference ID and the array index of each unmasked
    *k-mer* in `kmers` to the C-array of that *k-mer* and increase
    the length stored in the C-array accordingly.
    """
    cdef uint32 seq_pos
    cdef int64 n_units
    cdef int64 code
    cdef uint32* entries

    # A local memoryview variable avoids
    # repetitive initialization checks
    cdef ptr[:] table = self._ptr_array

    if mask.shape[0] != kmers.shape[0]:
        raise IndexError(
            f"Mask has length {mask.shape[0]}, "
            f"but there are {kmers.shape[0]} k-mers"
        )

    for seq_pos in range(kmers.shape[0]):
        if not mask[seq_pos]:
            continue
        code = kmers[seq_pos]
        entries = <uint32*> table[code]

        # The leading int64 holds the current length of the C-array,
        # i.e. the index where the new entry is written to
        n_units = (<int64*> entries)[0]
        entries[n_units    ] = ref_id
        entries[n_units + 1] = seq_pos
        (<int64*> entries)[0] = n_units + EntrySize.NO_BUCKETS
|
|
1471
|
+
|
|
1472
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def _add_kmer_selection(self, uint32[:] positions, int64[:] kmers,
                        uint32 ref_id):
    """
    Append the reference ID and the corresponding element of
    `positions` for each *k-mer* in `kmers` to the C-array of that
    *k-mer* and increase the length stored in the C-array
    accordingly.
    """
    cdef uint32 idx
    cdef uint32 seq_pos
    cdef int64 n_units
    cdef int64 code
    cdef uint32* entries

    if positions.shape[0] != kmers.shape[0]:
        raise IndexError(
            f"{positions.shape[0]} positions were given "
            f"for {kmers.shape[0]} k-mers"
        )

    # A local memoryview variable avoids
    # repetitive initialization checks
    cdef ptr[:] table = self._ptr_array

    for idx in range(positions.shape[0]):
        code = kmers[idx]
        seq_pos = positions[idx]
        entries = <uint32*> table[code]

        # The leading int64 holds the current length of the C-array,
        # i.e. the index where the new entry is written to
        n_units = (<int64*> entries)[0]
        entries[n_units    ] = ref_id
        entries[n_units + 1] = seq_pos
        (<int64*> entries)[0] = n_units + EntrySize.NO_BUCKETS
|
|
1507
|
+
|
|
1508
|
+
|
|
1509
|
+
cdef inline bint _is_initialized(self):
    # Memoryviews are not initialized when the object is created.
    # Report whether the '_ptr_array' memoryview has been set
    # and is not None; an AttributeError raised by the access
    # is treated as 'not initialized'
    try:
        return self._ptr_array is not None
    except AttributeError:
        return False
|
|
1520
|
+
|
|
1521
|
+
|
|
1522
|
+
|
|
1523
|
+
|
|
1524
|
+
cdef class BucketKmerTable:
|
|
1525
|
+
"""
|
|
1526
|
+
This class represents a *k-mer* index table.
|
|
1527
|
+
In contrast to :class:`KmerTable` it does not store each unique *k-mer*
|
|
1528
|
+
in a separate C-array, but limits the number of C-arrays instead
|
|
1529
|
+
to a number of buckets.
|
|
1530
|
+
Hence, different *k-mer* may be stored in the same bucket, like in a
|
|
1531
|
+
hash table.
|
|
1532
|
+
This approach makes *k-mer* indices with large *k-mer* alphabets
|
|
1533
|
+
fit into memory.
|
|
1534
|
+
|
|
1535
|
+
Otherwise, the API for creating a :class:`BucketKmerTable` and
|
|
1536
|
+
matching to it is analogous to :class:`KmerTable`.
|
|
1537
|
+
|
|
1538
|
+
Attributes
|
|
1539
|
+
----------
|
|
1540
|
+
kmer_alphabet : KmerAlphabet
|
|
1541
|
+
The internal :class:`KmerAlphabet`, that is used to
|
|
1542
|
+
encode all overlapping *k-mers* of an input sequence.
|
|
1543
|
+
alphabet : Alphabet
|
|
1544
|
+
The base alphabet, from which this :class:`BucketKmerTable` was
|
|
1545
|
+
created.
|
|
1546
|
+
k : int
|
|
1547
|
+
The length of the *k-mers*.
|
|
1548
|
+
n_buckets : int
|
|
1549
|
+
The number of buckets, the *k-mers* are divided into.
|
|
1550
|
+
|
|
1551
|
+
See Also
|
|
1552
|
+
--------
|
|
1553
|
+
KmerTable
|
|
1554
|
+
|
|
1555
|
+
Notes
|
|
1556
|
+
-----
|
|
1557
|
+
|
|
1558
|
+
*Memory consumption*
|
|
1559
|
+
|
|
1560
|
+
For efficient mapping, a :class:`BucketKmerTable` contains a pointer
|
|
1561
|
+
array, that contains one 64-bit pointer for each bucket.
|
|
1562
|
+
If there is at least one position for a bucket, the corresponding
|
|
1563
|
+
pointer points to a C-array that contains
|
|
1564
|
+
|
|
1565
|
+
1. The length of the C-array *(int64)*
|
|
1566
|
+
2. The *k-mers* *(int64)*
|
|
1567
|
+
3. The reference ID for each *k-mer* *(uint32)*
|
|
1568
|
+
4. The sequence position for each *k-mer* *(uint32)*
|
|
1569
|
+
|
|
1570
|
+
As buckets are used, the memory requirements are limited to the number
|
|
1571
|
+
of buckets instead of scaling with the :class:`KmerAlphabet` size.
|
|
1572
|
+
If each bucket is used, the required memory space :math:`S` in byte
|
|
1573
|
+
is
|
|
1574
|
+
|
|
1575
|
+
.. math::
|
|
1576
|
+
|
|
1577
|
+
S = 16B + 16L
|
|
1578
|
+
|
|
1579
|
+
where :math:`B` is the number of buckets and :math:`L` is the summed
|
|
1580
|
+
length of all sequences added to the table.
|
|
1581
|
+
|
|
1582
|
+
*Buckets*
|
|
1583
|
+
|
|
1584
|
+
The ratio :math:`L/B` is called *load_factor*.
|
|
1585
|
+
By default :class:`BucketKmerTable` uses a load factor of
|
|
1586
|
+
approximately 0.8 to ensure efficient *k-mer* matching.
|
|
1587
|
+
The number of buckets can be adjusted by setting the
|
|
1588
|
+
`n_buckets` parameter on :class:`BucketKmerTable` creation.
|
|
1589
|
+
It is recommended to use :func:`bucket_number()` to compute an
|
|
1590
|
+
appropriate number of buckets.
|
|
1591
|
+
|
|
1592
|
+
*Multiprocessing*
|
|
1593
|
+
|
|
1594
|
+
:class:`BucketKmerTable` objects can be used in multi-processed
|
|
1595
|
+
setups:
|
|
1596
|
+
Adding a large database of sequences to a table can be sped up by
|
|
1597
|
+
splitting the database into smaller chunks and create a separate
|
|
1598
|
+
table for each chunk in separate processes.
|
|
1599
|
+
Eventually, the tables can be merged to one large table using
|
|
1600
|
+
:meth:`from_tables()`.
|
|
1601
|
+
|
|
1602
|
+
Since :class:`BucketKmerTable` supports the *pickle* protocol,
|
|
1603
|
+
the matching step can also be divided into multiple processes, if
|
|
1604
|
+
multiple sequences need to be matched.
|
|
1605
|
+
|
|
1606
|
+
*Storage on hard drive*
|
|
1607
|
+
|
|
1608
|
+
The most time efficient way to read/write a :class:`BucketKmerTable`
|
|
1609
|
+
is the *pickle* format.
|
|
1610
|
+
|
|
1611
|
+
*Indexing and iteration*
|
|
1612
|
+
|
|
1613
|
+
Due to the higher complexity in the *k-mer* lookup compared to
|
|
1614
|
+
:class:`KmerTable`, this class is still indexable but not iterable.
|
|
1615
|
+
|
|
1616
|
+
Examples
|
|
1617
|
+
--------
|
|
1618
|
+
|
|
1619
|
+
Create a *2-mer* index table for some nucleotide sequences:
|
|
1620
|
+
|
|
1621
|
+
>>> table = BucketKmerTable.from_sequences(
|
|
1622
|
+
... k = 2,
|
|
1623
|
+
... sequences = [NucleotideSequence("TTATA"), NucleotideSequence("CTAG")],
|
|
1624
|
+
... ref_ids = [0, 1]
|
|
1625
|
+
... )
|
|
1626
|
+
|
|
1627
|
+
Display the contents of the table as
|
|
1628
|
+
(reference ID, sequence position) tuples:
|
|
1629
|
+
|
|
1630
|
+
>>> print(table)
|
|
1631
|
+
AG: (1, 2)
|
|
1632
|
+
AT: (0, 2)
|
|
1633
|
+
CT: (1, 0)
|
|
1634
|
+
TA: (0, 1), (0, 3), (1, 1)
|
|
1635
|
+
TT: (0, 0)
|
|
1636
|
+
|
|
1637
|
+
Find matches of the table with a sequence:
|
|
1638
|
+
|
|
1639
|
+
>>> query = NucleotideSequence("TAG")
|
|
1640
|
+
>>> matches = table.match(query)
|
|
1641
|
+
>>> for query_pos, table_ref_id, table_pos in matches:
|
|
1642
|
+
... print("Query sequence position:", query_pos)
|
|
1643
|
+
... print("Table reference ID: ", table_ref_id)
|
|
1644
|
+
... print("Table sequence position:", table_pos)
|
|
1645
|
+
... print()
|
|
1646
|
+
Query sequence position: 0
|
|
1647
|
+
Table reference ID: 0
|
|
1648
|
+
Table sequence position: 1
|
|
1649
|
+
<BLANKLINE>
|
|
1650
|
+
Query sequence position: 0
|
|
1651
|
+
Table reference ID: 0
|
|
1652
|
+
Table sequence position: 3
|
|
1653
|
+
<BLANKLINE>
|
|
1654
|
+
Query sequence position: 0
|
|
1655
|
+
Table reference ID: 1
|
|
1656
|
+
Table sequence position: 1
|
|
1657
|
+
<BLANKLINE>
|
|
1658
|
+
Query sequence position: 1
|
|
1659
|
+
Table reference ID: 1
|
|
1660
|
+
Table sequence position: 2
|
|
1661
|
+
<BLANKLINE>
|
|
1662
|
+
|
|
1663
|
+
Get all reference IDs and positions for a given *k-mer*:
|
|
1664
|
+
|
|
1665
|
+
>>> kmer_code = table.kmer_alphabet.encode("TA")
|
|
1666
|
+
>>> print(table[kmer_code])
|
|
1667
|
+
[[0 1]
|
|
1668
|
+
[0 3]
|
|
1669
|
+
[1 1]]
|
|
1670
|
+
"""
|
|
1671
|
+
|
|
1672
|
+
cdef object _kmer_alph
|
|
1673
|
+
cdef int _k
|
|
1674
|
+
cdef int64 _n_buckets
|
|
1675
|
+
|
|
1676
|
+
# The pointer array is the core of the index table:
|
|
1677
|
+
# It maps each possible k-mer bucket (represented by its code) to a
|
|
1678
|
+
# C-array of indices.
|
|
1679
|
+
# Each entry in a C-array contains the k-mer code, a reference ID
|
|
1680
|
+
# and the location in that sequence where that k-mer appears
|
|
1681
|
+
# The memory layout of each C-array is as following:
|
|
1682
|
+
#
|
|
1683
|
+
# (Array length) (k-mer 0) (RefID 0) (Position 0) (k-mer 1) ...
|
|
1684
|
+
# -----int64----|--int64--|---uint32---|---uint32---|--int64--
|
|
1685
|
+
#
|
|
1686
|
+
# The array length is based on 32 bit units.
|
|
1687
|
+
# If there is no entry for a k-mer bucket, the respective pointer is
|
|
1688
|
+
# NULL.
|
|
1689
|
+
cdef ptr[:] _ptr_array
|
|
1690
|
+
|
|
1691
|
+
|
|
1692
|
+
def __cinit__(self, n_buckets, kmer_alphabet):
    # Guard against a second constructor call:
    # this is required for proper memory management
    # of the allocated arrays
    if self._is_initialized():
        raise Exception("Duplicate call of constructor")

    self._kmer_alph = kmer_alphabet
    self._k = kmer_alphabet.k

    # The number of buckets is capped at the number of k-mers
    # in the alphabet
    alphabet_size = len(self._kmer_alph)
    self._n_buckets = (
        alphabet_size if alphabet_size < n_buckets else n_buckets
    )

    # One zero-initialized element per bucket
    self._ptr_array = np.zeros(self._n_buckets, dtype=np.uint64)
|
|
1705
|
+
|
|
1706
|
+
|
|
1707
|
+
@property
def kmer_alphabet(self):
    # The KmerAlphabet object this table was constructed with
    kmer_alph = self._kmer_alph
    return kmer_alph
|
|
1710
|
+
|
|
1711
|
+
@property
def alphabet(self):
    # The base alphabet of the internal k-mer alphabet
    kmer_alph = self._kmer_alph
    return kmer_alph.base_alphabet
|
|
1714
|
+
|
|
1715
|
+
@property
def k(self):
    # The value of 'k' taken from the k-mer alphabet on construction
    k_value = self._k
    return k_value
|
|
1718
|
+
|
|
1719
|
+
@property
def n_buckets(self):
    # The number of buckets that was set on construction
    bucket_count = self._n_buckets
    return bucket_count
|
|
1722
|
+
|
|
1723
|
+
@staticmethod
def from_sequences(k, sequences, ref_ids=None, ignore_masks=None,
                   alphabet=None, spacing=None, n_buckets=None):
    """
    from_sequences(k, sequences, ref_ids=None, ignore_masks=None,
                   alphabet=None, spacing=None, n_buckets=None)

    Build a :class:`BucketKmerTable` from the positions of all
    overlapping *k-mers* in the input `sequences`.

    Parameters
    ----------
    k : int
        The length of the *k-mers*.
    sequences : sized iterable object of Sequence, length=m
        The sequences to get the *k-mer* positions from.
        These sequences must have equal alphabets, or one of these
        sequences must have an alphabet that extends the alphabets
        of all other sequences.
    ref_ids : sized iterable object of int, length=m, optional
        The reference IDs for the given sequences.
        These are used to identify the corresponding sequence for a
        *k-mer* match.
        By default the IDs are counted from *0* to *m*.
    ignore_masks : sized iterable object of (ndarray, dtype=bool), length=m, optional
        Sequence positions to ignore.
        *k-mers* that involve these sequence positions are not added
        to the table.
        This is used e.g. to skip repeat regions.
        If provided, the list must contain one boolean mask
        (or ``None``) for each sequence, and each boolean mask must
        have the same length as the sequence.
        By default, no sequence position is ignored.
    alphabet : Alphabet, optional
        The alphabet to use for this table.
        It must extend the alphabets of the input `sequences`.
        By default, an appropriate alphabet is inferred from the
        input `sequences`.
        This option is usually used for compatibility with another
        sequence/table in the matching step.
    spacing : None or str or list or ndarray, dtype=int, shape=(k,)
        If provided, spaced *k-mers* are used instead of continuous
        ones.
        The value contains the *informative* positions relative to
        the start of the *k-mer*, also called the *model*.
        The number of *informative* positions must equal *k*.
        Refer to :class:`KmerAlphabet` for more details.
    n_buckets : int, optional
        Set the number of buckets in the table, e.g. to use a
        different load factor.
        It is recommended to use :func:`bucket_number()` for this
        purpose.
        By default, a load factor of approximately 0.8 is used.

    See Also
    --------
    from_kmers : The same functionality based on already created *k-mers*

    Returns
    -------
    table : BucketKmerTable
        The newly created table.

    Examples
    --------

    >>> sequences = [NucleotideSequence("TTATA"), NucleotideSequence("CTAG")]
    >>> table = BucketKmerTable.from_sequences(
    ...     2, sequences, ref_ids=[100, 101]
    ... )
    >>> print(table)
    AG: (101, 2)
    AT: (100, 2)
    CT: (101, 0)
    TA: (100, 1), (100, 3), (101, 1)
    TT: (100, 0)

    Give an explicit compatible alphabet:

    >>> table = BucketKmerTable.from_sequences(
    ...     2, sequences, ref_ids=[100, 101],
    ...     alphabet=NucleotideSequence.ambiguous_alphabet()
    ... )

    Ignore all ``N`` in a sequence:

    >>> sequence = NucleotideSequence("ACCNTANNG")
    >>> table = BucketKmerTable.from_sequences(
    ...     2, [sequence], ignore_masks=[sequence.symbols == "N"]
    ... )
    >>> print(table)
    AC: (0, 0)
    CC: (0, 1)
    TA: (0, 4)
    """
    ref_ids = _compute_ref_ids(ref_ids, sequences)
    ignore_masks = _compute_masks(ignore_masks, sequences)
    alphabet = _compute_alphabet(
        alphabet, (sequence.alphabet for sequence in sequences)
    )
    kmer_alphabet = KmerAlphabet(alphabet, k, spacing)

    # Decompose each sequence into its k-mer codes
    kmers_list = []
    for sequence in sequences:
        kmers_list.append(kmer_alphabet.create_kmers(sequence.code))

    if n_buckets is None:
        # Derive the number of buckets from the total number of k-mers
        n_kmers = np.sum([len(kmers) for kmers in kmers_list])
        n_buckets = bucket_number(n_kmers)

    table = BucketKmerTable(n_buckets, kmer_alphabet)

    masks = []
    for sequence, ignore_mask in zip(sequences, ignore_masks):
        masks.append(
            _prepare_mask(kmer_alphabet, ignore_mask, len(sequence))
        )

    # First pass: count the number of appearances of each k-mer
    # The result is stored in the pointer array,
    # that is temporarily used as count array
    for kmers, mask in zip(kmers_list, masks):
        table._count_masked_kmers(kmers, mask)

    # Transform the count array into a pointer array
    # with C-arrays of appropriate size
    _init_c_arrays(table._ptr_array, EntrySize.BUCKETS)

    # Second pass: fill the C-arrays with the k-mer positions
    for kmers, ref_id, mask in zip(kmers_list, ref_ids, masks):
        table._add_kmers(kmers, ref_id, mask)

    return table
|
|
1856
|
+
|
|
1857
|
+
|
|
1858
|
+
@staticmethod
def from_kmers(kmer_alphabet, kmers, ref_ids=None, masks=None,
               n_buckets=None):
    """
    from_kmers(kmer_alphabet, kmers, ref_ids=None, masks=None,
               n_buckets=None)

    Build a :class:`BucketKmerTable` from the positions of all
    input *k-mers*.

    Parameters
    ----------
    kmer_alphabet : KmerAlphabet
        The :class:`KmerAlphabet` to use for the new table.
        Should be the same alphabet that was used to calculate the
        input *kmers*.
    kmers : sized iterable object of (ndarray, dtype=np.int64), length=m
        List where each array contains the *k-mer* codes from a
        sequence.
        For each array the index of the *k-mer* code in the array
        is stored in the table as sequence position.
    ref_ids : sized iterable object of int, length=m, optional
        The reference IDs for the sequences.
        These are used to identify the corresponding sequence for a
        *k-mer* match.
        By default the IDs are counted from *0* to *m*.
    masks : sized iterable object of (ndarray, dtype=bool), length=m, optional
        A *k-mer* code at a position, where the corresponding mask
        is false, is not added to the table.
        By default, all positions are added.
    n_buckets : int, optional
        Set the number of buckets in the table, e.g. to use a
        different load factor.
        It is recommended to use :func:`bucket_number()` for this
        purpose.
        By default, a load factor of approximately 0.8 is used.

    See Also
    --------
    from_sequences : The same functionality based on undecomposed sequences

    Returns
    -------
    table : BucketKmerTable
        The newly created table.

    Examples
    --------

    >>> sequences = [ProteinSequence("BIQTITE"), ProteinSequence("NIQBITE")]
    >>> kmer_alphabet = KmerAlphabet(ProteinSequence.alphabet, 3)
    >>> kmer_codes = [kmer_alphabet.create_kmers(s.code) for s in sequences]
    >>> for code in kmer_codes:
    ...     print(code)
    [11701  4360  7879  9400  4419]
    [ 6517  4364  7975 11704  4419]
    >>> table = BucketKmerTable.from_kmers(kmer_alphabet, kmer_codes)
    >>> print(table)
    IQT: (0, 1)
    IQB: (1, 1)
    ITE: (0, 4), (1, 4)
    NIQ: (1, 0)
    QTI: (0, 2)
    QBI: (1, 2)
    TIT: (0, 3)
    BIQ: (0, 0)
    BIT: (1, 3)
    """
    _check_kmer_alphabet(kmer_alphabet)
    _check_multiple_kmer_bounds(kmers, kmer_alphabet)

    ref_ids = _compute_ref_ids(ref_ids, kmers)
    masks = _compute_masks(masks, kmers)

    if n_buckets is None:
        # Derive the number of buckets from the total number of k-mers
        n_kmers = np.sum([len(e) for e in kmers])
        n_buckets = bucket_number(n_kmers)

    table = BucketKmerTable(n_buckets, kmer_alphabet)

    uint8_masks = []
    for mask, arr in zip(masks, kmers):
        if mask is None:
            # No mask for this array -> all positions are added
            uint8_mask = np.ones(len(arr), dtype=np.uint8)
        else:
            # Reinterpret the boolean mask as uint8 array
            # to be able to handle it as memoryview
            uint8_mask = np.frombuffer(
                mask.astype(bool, copy=False), dtype=np.uint8
            )
        uint8_masks.append(uint8_mask)
    masks = uint8_masks

    # First pass: count the k-mers,
    # the pointer array is used as count array here
    for arr, mask in zip(kmers, masks):
        table._count_masked_kmers(arr, mask)

    _init_c_arrays(table._ptr_array, EntrySize.BUCKETS)

    # Second pass: write the entries into the C-arrays
    for arr, ref_id, mask in zip(kmers, ref_ids, masks):
        table._add_kmers(arr, ref_id, mask)

    return table
|
|
1957
|
+
|
|
1958
|
+
|
|
1959
|
+
@staticmethod
|
|
1960
|
+
def from_kmer_selection(kmer_alphabet, positions, kmers, ref_ids=None,
|
|
1961
|
+
n_buckets=None):
|
|
1962
|
+
"""
|
|
1963
|
+
from_kmer_selection(kmer_alphabet, positions, kmers, ref_ids=None,
|
|
1964
|
+
n_buckets=None)
|
|
1965
|
+
|
|
1966
|
+
Create a :class:`BucketKmerTable` by storing the positions of a
|
|
1967
|
+
filtered subset of input *k-mers*.
|
|
1968
|
+
|
|
1969
|
+
This can be used to reduce the number of stored *k-mers* using
|
|
1970
|
+
a *k-mer* subset selector such as :class:`MinimizerSelector`.
|
|
1971
|
+
|
|
1972
|
+
Parameters
|
|
1973
|
+
----------
|
|
1974
|
+
kmer_alphabet : KmerAlphabet
|
|
1975
|
+
The :class:`KmerAlphabet` to use for the new table.
|
|
1976
|
+
Should be the same alphabet that was used to calculate the
|
|
1977
|
+
input *kmers*.
|
|
1978
|
+
positions : sized iterable object of (ndarray, shape=(n,), dtype=uint32), length=m
|
|
1979
|
+
List where each array contains the sequence positions of
|
|
1980
|
+
the filtered subset of *k-mers* given in `kmers`.
|
|
1981
|
+
The list may contain multiple elements for multiple
|
|
1982
|
+
sequences.
|
|
1983
|
+
kmers : sized iterable object of (ndarray, shape=(n,), dtype=np.int64), length=m
|
|
1984
|
+
List where each array contains the filtered subset of
|
|
1985
|
+
*k-mer* codes from a sequence.
|
|
1986
|
+
For each array the index of the *k-mer* code in the array,
|
|
1987
|
+
is stored in the table as sequence position.
|
|
1988
|
+
The list may contain multiple elements for multiple
|
|
1989
|
+
sequences.
|
|
1990
|
+
ref_ids : sized iterable object of int, length=m, optional
|
|
1991
|
+
The reference IDs for the sequences.
|
|
1992
|
+
These are used to identify the corresponding sequence for a
|
|
1993
|
+
*k-mer* match.
|
|
1994
|
+
By default the IDs are counted from *0* to *m*.
|
|
1995
|
+
n_buckets : int, optional
|
|
1996
|
+
Set the number of buckets in the table, e.g. to use a
|
|
1997
|
+
different load factor.
|
|
1998
|
+
It is recommended to use :func:`bucket_number()` for this
|
|
1999
|
+
purpose.
|
|
2000
|
+
By default, a load factor of approximately 0.8 is used.
|
|
2001
|
+
|
|
2002
|
+
Returns
|
|
2003
|
+
-------
|
|
2004
|
+
table : BucketKmerTable
|
|
2005
|
+
The newly created table.
|
|
2006
|
+
|
|
2007
|
+
Examples
|
|
2008
|
+
--------
|
|
2009
|
+
|
|
2010
|
+
Reduce the size of sequence data in the table using minimizers:
|
|
2011
|
+
|
|
2012
|
+
>>> sequence1 = ProteinSequence("THIS*IS*A*SEQVENCE")
|
|
2013
|
+
>>> kmer_alph = KmerAlphabet(sequence1.alphabet, k=3)
|
|
2014
|
+
>>> minimizer = MinimizerSelector(kmer_alph, window=4)
|
|
2015
|
+
>>> minimizer_pos, minimizers = minimizer.select(sequence1)
|
|
2016
|
+
>>> kmer_table = BucketKmerTable.from_kmer_selection(
|
|
2017
|
+
... kmer_alph, [minimizer_pos], [minimizers]
|
|
2018
|
+
... )
|
|
2019
|
+
|
|
2020
|
+
Use the same :class:`MinimizerSelector` to select the minimizers
|
|
2021
|
+
from the query sequence and match them against the table.
|
|
2022
|
+
Although the amount of *k-mers* is reduced, matching is still
|
|
2023
|
+
guaranteed to work, if the two sequences share identity in the
|
|
2024
|
+
given window:
|
|
2025
|
+
|
|
2026
|
+
>>> sequence2 = ProteinSequence("ANQTHER*SEQVENCE")
|
|
2027
|
+
>>> minimizer_pos, minimizers = minimizer.select(sequence2)
|
|
2028
|
+
>>> matches = kmer_table.match_kmer_selection(minimizer_pos, minimizers)
|
|
2029
|
+
>>> print(matches)
|
|
2030
|
+
[[ 9 0 11]
|
|
2031
|
+
[12 0 14]]
|
|
2032
|
+
>>> for query_pos, _, db_pos in matches:
|
|
2033
|
+
... print(sequence1)
|
|
2034
|
+
... print(" " * (db_pos-1) + "^" * kmer_table.k)
|
|
2035
|
+
... print(sequence2)
|
|
2036
|
+
... print(" " * (query_pos-1) + "^" * kmer_table.k)
|
|
2037
|
+
... print()
|
|
2038
|
+
THIS*IS*A*SEQVENCE
|
|
2039
|
+
^^^
|
|
2040
|
+
ANQTHER*SEQVENCE
|
|
2041
|
+
^^^
|
|
2042
|
+
<BLANKLINE>
|
|
2043
|
+
THIS*IS*A*SEQVENCE
|
|
2044
|
+
^^^
|
|
2045
|
+
ANQTHER*SEQVENCE
|
|
2046
|
+
^^^
|
|
2047
|
+
<BLANKLINE>
|
|
2048
|
+
"""
|
|
2049
|
+
_check_kmer_alphabet(kmer_alphabet)
|
|
2050
|
+
_check_multiple_kmer_bounds(kmers, kmer_alphabet)
|
|
2051
|
+
_check_position_shape(positions, kmers)
|
|
2052
|
+
|
|
2053
|
+
ref_ids = _compute_ref_ids(ref_ids, kmers)
|
|
2054
|
+
|
|
2055
|
+
if n_buckets is None:
|
|
2056
|
+
n_kmers = np.sum([len(e) for e in kmers])
|
|
2057
|
+
n_buckets = bucket_number(n_kmers)
|
|
2058
|
+
|
|
2059
|
+
table = BucketKmerTable(n_buckets, kmer_alphabet)
|
|
2060
|
+
|
|
2061
|
+
for arr in kmers:
|
|
2062
|
+
table._count_kmers(arr)
|
|
2063
|
+
|
|
2064
|
+
_init_c_arrays(table._ptr_array, EntrySize.BUCKETS)
|
|
2065
|
+
|
|
2066
|
+
for pos, arr, ref_id in zip(positions, kmers, ref_ids):
|
|
2067
|
+
table._add_kmer_selection(
|
|
2068
|
+
pos.astype(np.uint32, copy=False), arr, ref_id
|
|
2069
|
+
)
|
|
2070
|
+
|
|
2071
|
+
return table
|
|
2072
|
+
|
|
2073
|
+
|
|
2074
|
+
@staticmethod
def from_tables(tables):
    """
    from_tables(tables)

    Create a :class:`BucketKmerTable` by merging the *k-mer*
    positions from existing `tables`.

    Parameters
    ----------
    tables : iterable object of BucketKmerTable
        The tables to be merged.
        All tables must have equal number of buckets and equal
        :class:`KmerAlphabet` objects, i.e. the same *k* and equal
        base alphabets.

    Returns
    -------
    table : BucketKmerTable
        The newly created table.

    Examples
    --------
    To ensure that all tables have the same number of buckets,
    `n_buckets` needs to be set on table creation.

    >>> # The sequence length is not exactly the length of resulting k-mers,
    >>> # but it is close enough for bucket computation
    >>> n_buckets = bucket_number(len("TTATA") + len("CTAG"))
    >>> table1 = BucketKmerTable.from_sequences(
    ...     2, [NucleotideSequence("TTATA")], ref_ids=[100],
    ...     n_buckets=n_buckets
    ... )
    >>> table2 = BucketKmerTable.from_sequences(
    ...     2, [NucleotideSequence("CTAG")], ref_ids=[101],
    ...     n_buckets=n_buckets
    ... )
    >>> merged_table = BucketKmerTable.from_tables([table1, table2])
    >>> print(merged_table)
    AG: (101, 2)
    AT: (100, 2)
    CT: (101, 0)
    TA: (100, 1), (100, 3), (101, 1)
    TT: (100, 0)
    """
    cdef BucketKmerTable table

    # 'tables' is indexed and iterated several times below:
    # materialize it once, so that any iterable (e.g. a generator)
    # is supported as documented
    tables = list(tables)

    _check_same_kmer_alphabet(tables)
    _check_same_buckets(tables)

    # The merged table adopts the bucket number and alphabet of the
    # first table; the checks above are expected to ensure that the
    # other tables agree with it
    merged_table = BucketKmerTable(
        tables[0].n_buckets,
        tables[0].kmer_alphabet
    )

    # Sum the number of appearances of each k-mer from the tables
    for table in tables:
        _count_table_entries(
            merged_table._ptr_array, table._ptr_array,
            EntrySize.BUCKETS
        )

    _init_c_arrays(merged_table._ptr_array, EntrySize.BUCKETS)

    # Second pass: copy the actual entries into the merged table
    for table in tables:
        _append_entries(merged_table._ptr_array, table._ptr_array)

    return merged_table
|
|
2142
|
+
|
|
2143
|
+
|
|
2144
|
+
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def match_table(self, BucketKmerTable table, similarity_rule=None):
    """
    match_table(table, similarity_rule=None)

    Find matches between the *k-mers* in this table with the
    *k-mers* in another `table`.

    This means that for each *k-mer* the cartesian product between
    the positions in both tables is added to the matches.

    Parameters
    ----------
    table : BucketKmerTable
        The table to be matched.
        Both tables must have equal number of buckets and equal
        :class:`KmerAlphabet` objects, i.e. the same *k* and equal
        base alphabets.
    similarity_rule : SimilarityRule, optional
        If this parameter is given, not only exact *k-mer* matches
        are considered, but also similar ones according to the given
        :class:`SimilarityRule`.

    Returns
    -------
    matches : ndarray, shape=(n,4), dtype=np.int64
        The *k-mer* matches.
        Each row contains one match. Each match has the following
        columns:

            0. The reference ID of the matched sequence in the other
               table
            1. The sequence position of the matched sequence in the
               other table
            2. The reference ID of the matched sequence in this
               table
            3. The sequence position of the matched sequence in this
               table

    Notes
    -----

    There is no guaranteed order of the reference IDs or
    sequence positions in the returned matches.

    Examples
    --------
    To ensure that both tables have the same number of buckets,
    `n_buckets` needs to be set on table creation.

    >>> # The sequence length is not exactly the length of resulting k-mers,
    >>> # but it is close enough for bucket computation
    >>> n_buckets = bucket_number(max(len("BIQTITE"), len("TITANITE")))
    >>> sequence1 = ProteinSequence("BIQTITE")
    >>> table1 = BucketKmerTable.from_sequences(3, [sequence1], ref_ids=[100])
    >>> print(table1)
    IQT: (100, 1)
    ITE: (100, 4)
    QTI: (100, 2)
    TIT: (100, 3)
    BIQ: (100, 0)
    >>> sequence2 = ProteinSequence("TITANITE")
    >>> table2 = BucketKmerTable.from_sequences(3, [sequence2], ref_ids=[101])
    >>> print(table2)
    ANI: (101, 3)
    ITA: (101, 1)
    ITE: (101, 5)
    NIT: (101, 4)
    TAN: (101, 2)
    TIT: (101, 0)
    >>> print(table1.match_table(table2))
    [[101   0 100   3]
     [101   5 100   4]]
    """
    cdef int INIT_SIZE = 1

    cdef int64 bucket, sim_bucket
    cdef int64 self_kmer, other_kmer, sim_kmer
    cdef int64 match_i
    cdef int64 i, j, l
    cdef int64 self_length, other_length
    cdef uint32* self_bucket_ptr
    cdef uint32* other_bucket_ptr

    # This variable will only be used if a similarity rule exists
    cdef int64[:] similar_kmers

    # Store in new variables
    # to disable repetitive initialization checks
    cdef ptr[:] self_ptr_array = self._ptr_array
    cdef ptr[:] other_ptr_array = table._ptr_array

    _check_same_kmer_alphabet((self, table))
    _check_same_buckets((self, table))

    # This array will store the match positions
    # As the final number of matches is unknown, a list-like
    # approach is used:
    # The array is initialized with a relatively small initial size
    # and every time the limit would be exceeded its size is doubled
    cdef int64[:,:] matches = np.empty((INIT_SIZE, 4), dtype=np.int64)
    match_i = 0
    # Bucket layout as read below (in units of uint32 words):
    # - words 0-1: total length of the bucket array as int64
    # - then one entry per 4 words:
    #   k-mer code as int64 (2 words), reference ID, sequence position
    if similarity_rule is None:
        # As both tables have the same number of buckets, equal k-mers
        # are found in buckets with the same index
        for bucket in range(self_ptr_array.shape[0]):
            self_bucket_ptr = <uint32*>self_ptr_array[bucket]
            other_bucket_ptr = <uint32*>other_ptr_array[bucket]
            if self_bucket_ptr != NULL and other_bucket_ptr != NULL:
                # This bucket exists for both tables
                other_length = (<int64*>other_bucket_ptr)[0]
                self_length = (<int64*>self_bucket_ptr )[0]
                for i in range(2, other_length, 4):
                    # Hacky syntax to achieve casting to int64*
                    # after offset is applied
                    other_kmer = (<int64*>(other_bucket_ptr + i))[0]
                    for j in range(2, self_length, 4):
                        self_kmer = (<int64*>(self_bucket_ptr + j))[0]
                        if self_kmer == other_kmer:
                            # The k-mers are not only in the same
                            # bucket, but they are actually equal
                            if match_i >= matches.shape[0]:
                                # The 'matches' array is full
                                # -> double its size
                                matches = expand(np.asarray(matches))
                            matches[match_i, 0] = other_bucket_ptr[i+2]
                            matches[match_i, 1] = other_bucket_ptr[i+3]
                            matches[match_i, 2] = self_bucket_ptr[j+2]
                            matches[match_i, 3] = self_bucket_ptr[j+3]
                            match_i += 1

    else:
        for bucket in range(self_ptr_array.shape[0]):
            other_bucket_ptr = <uint32*>other_ptr_array[bucket]
            if other_bucket_ptr != NULL:
                other_length = (<int64*>other_bucket_ptr)[0]
                for i in range(2, other_length, 4):
                    other_kmer = (<int64*>(other_bucket_ptr + i))[0]
                    # If a similarity rule exists, iterate not only over
                    # the exact k-mer, but over all k-mers similar to
                    # the current k-mer
                    similar_kmers = similarity_rule.similar_kmers(
                        self._kmer_alph, other_kmer
                    )
                    for l in range(similar_kmers.shape[0]):
                        sim_kmer = similar_kmers[l]
                        # A similar k-mer may reside in a different
                        # bucket than the k-mer from the other table
                        sim_bucket = sim_kmer % self._n_buckets
                        self_bucket_ptr = <uint32*>self_ptr_array[sim_bucket]
                        if self_bucket_ptr != NULL:
                            self_length = (<int64*>self_bucket_ptr)[0]
                            for j in range(2, self_length, 4):
                                self_kmer = (<int64*>(self_bucket_ptr + j))[0]
                                if self_kmer == sim_kmer:
                                    if match_i >= matches.shape[0]:
                                        # The 'matches' array is full
                                        # -> double its size
                                        matches = expand(np.asarray(matches))
                                    matches[match_i, 0] = other_bucket_ptr[i+2]
                                    matches[match_i, 1] = other_bucket_ptr[i+3]
                                    matches[match_i, 2] = self_bucket_ptr[j+2]
                                    matches[match_i, 3] = self_bucket_ptr[j+3]
                                    match_i += 1

    # Trim to correct size and return
    return np.asarray(matches[:match_i])
|
|
2310
|
+
|
|
2311
|
+
|
|
2312
|
+
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def match(self, sequence, similarity_rule=None, ignore_mask=None):
    """
    match(sequence, similarity_rule=None, ignore_mask=None)

    Find matches between the *k-mers* in this table with all
    overlapping *k-mers* in the given `sequence`.
    *k* is determined by the table.

    Parameters
    ----------
    sequence : Sequence
        The sequence to be matched.
        The table's base alphabet must extend the alphabet of the
        sequence.
    similarity_rule : SimilarityRule, optional
        If this parameter is given, not only exact *k-mer* matches
        are considered, but also similar ones according to the given
        :class:`SimilarityRule`.
    ignore_mask : ndarray, dtype=bool, optional
        Boolean mask of sequence positions to ignore.
        *k-mers* that involve these sequence positions are not
        matched against the table.
        This is used e.g. to skip repeat regions.
        By default, no sequence position is ignored.

    Returns
    -------
    matches : ndarray, shape=(n,3), dtype=np.int64
        The *k-mer* matches.
        Each row contains one match. Each match has the following
        columns:

            0. The sequence position in the input sequence
            1. The reference ID of the matched sequence in the table
            2. The sequence position of the matched sequence in the
               table

    Raises
    ------
    ValueError
        If the sequence is shorter than *k* or if the table's base
        alphabet does not extend the alphabet of the sequence.

    Notes
    -----

    The matches are ordered by the first column.

    Examples
    --------

    >>> sequence1 = ProteinSequence("BIQTITE")
    >>> table = BucketKmerTable.from_sequences(3, [sequence1], ref_ids=[100])
    >>> print(table)
    IQT: (100, 1)
    ITE: (100, 4)
    QTI: (100, 2)
    TIT: (100, 3)
    BIQ: (100, 0)
    >>> sequence2 = ProteinSequence("TITANITE")
    >>> print(table.match(sequence2))
    [[  0 100   3]
     [  5 100   4]]
    """
    cdef int INIT_SIZE = 1

    cdef int64 bucket
    cdef int64 self_kmer, other_kmer, sim_kmer
    cdef int64 match_i
    cdef int64 i, l
    cdef int64 length
    cdef uint32* bucket_ptr
    cdef uint32* array_stop

    # This variable will only be used if a similarity rule exists
    cdef int64[:] similar_kmers

    # Store in new variable
    # to disable repetitive initialization checks
    cdef ptr[:] ptr_array = self._ptr_array

    if len(sequence.code) < self._k:
        raise ValueError("Sequence code is shorter than k")
    # NOTE(review): the message says 'not equal', but the actual
    # requirement checked here is that the base alphabet *extends*
    # the sequence alphabet
    if not self._kmer_alph.base_alphabet.extends(sequence.alphabet):
        raise ValueError(
            "The alphabet used for the k-mer index table is not equal to "
            "the alphabet of the sequence"
        )

    cdef int64[:] kmers = self._kmer_alph.create_kmers(sequence.code)
    # Only k-mers with a truthy mask value are matched below
    cdef uint8[:] kmer_mask = _prepare_mask(
        self._kmer_alph, ignore_mask, len(sequence.code)
    )

    # This array will store the match positions
    # As the final number of matches is unknown, a list-like
    # approach is used:
    # The array is initialized with a relatively small initial size
    # and every time the limit would be exceeded its size is doubled
    cdef int64[:,:] matches = np.empty((INIT_SIZE, 3), dtype=np.int64)
    match_i = 0
    # Bucket layout as read below (in units of uint32 words):
    # - words 0-1: total length of the bucket array as int64
    # - then one entry per 'EntrySize.BUCKETS' words:
    #   k-mer code as int64 (2 words), reference ID, sequence position
    if similarity_rule is None:
        for i in range(kmers.shape[0]):
            if kmer_mask[i]:
                other_kmer = kmers[i]
                bucket = other_kmer % self._n_buckets
                bucket_ptr = <uint32*>ptr_array[bucket]
                if bucket_ptr != NULL:
                    # There is at least one entry in this bucket
                    length = (<int64*>bucket_ptr)[0]
                    array_stop = bucket_ptr + length
                    # Skip the length header
                    bucket_ptr += 2
                    while bucket_ptr < array_stop:
                        self_kmer = (<int64*>bucket_ptr)[0]
                        if self_kmer == other_kmer:
                            # The k-mers are not only in the same
                            # bucket, but they are actually equal
                            if match_i >= matches.shape[0]:
                                # The 'matches' array is full
                                # -> double its size
                                matches = expand(np.asarray(matches))
                            matches[match_i, 0] = i
                            # Advance over the k-mer code (2 words),
                            # the reference ID and the position
                            # -> pointer ends at the next entry
                            bucket_ptr += 2
                            matches[match_i, 1] = bucket_ptr[0]
                            bucket_ptr += 1
                            matches[match_i, 2] = bucket_ptr[0]
                            bucket_ptr += 1
                            match_i += 1
                        else:
                            bucket_ptr += EntrySize.BUCKETS

    else:
        for i in range(kmers.shape[0]):
            if kmer_mask[i]:
                other_kmer = kmers[i]
                # If a similarity rule exists, iterate not only over
                # the exact k-mer, but over all k-mers similar to
                # the current k-mer
                similar_kmers = similarity_rule.similar_kmers(
                    self._kmer_alph, other_kmer
                )
                for l in range(similar_kmers.shape[0]):
                    sim_kmer = similar_kmers[l]
                    bucket = sim_kmer % self._n_buckets
                    # Actual copy of the code from the other
                    # if-branch:
                    # It cannot be put properly in a cdef-function,
                    # as every function call would perform reference
                    # count changes and would decrease performance
                    bucket_ptr = <uint32*>ptr_array[bucket]
                    if bucket_ptr != NULL:
                        # There is at least one entry in this bucket
                        length = (<int64*>bucket_ptr)[0]
                        array_stop = bucket_ptr + length
                        bucket_ptr += 2
                        while bucket_ptr < array_stop:
                            self_kmer = (<int64*>bucket_ptr)[0]
                            if self_kmer == sim_kmer:
                                # The k-mers are not only in the same
                                # bucket, but they are actually equal
                                if match_i >= matches.shape[0]:
                                    # The 'matches' array is full
                                    # -> double its size
                                    matches = expand(np.asarray(matches))
                                matches[match_i, 0] = i
                                bucket_ptr += 2
                                matches[match_i, 1] = bucket_ptr[0]
                                bucket_ptr += 1
                                matches[match_i, 2] = bucket_ptr[0]
                                bucket_ptr += 1
                                match_i += 1
                            else:
                                bucket_ptr += EntrySize.BUCKETS

    # Trim to correct size and return
    return np.asarray(matches[:match_i])
|
|
2485
|
+
|
|
2486
|
+
|
|
2487
|
+
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def match_kmer_selection(self, positions, kmers):
    """
    match_kmer_selection(positions, kmers)

    Find matches between the *k-mers* in this table with the given
    *k-mer* selection.

    It is intended to use this method to find matches in a table
    that was created using :meth:`from_kmer_selection()`.

    Parameters
    ----------
    positions : ndarray, shape=(n,), dtype=uint32
        Sequence positions of the filtered subset of *k-mers* given
        in `kmers`.
    kmers : ndarray, shape=(n,), dtype=np.int64
        Filtered subset of *k-mer* codes to match against.

    Returns
    -------
    matches : ndarray, shape=(n,3), dtype=np.int64
        The *k-mer* matches.
        Each row contains one *k-mer* match.
        Each match has the following columns:

            0. The sequence position of the input *k-mer*, taken
               from `positions`
            1. The reference ID of the matched sequence in the table
            2. The sequence position of the matched *k-mer* in the
               table

    Raises
    ------
    IndexError
        If `positions` and `kmers` have different lengths.

    Examples
    --------

    Reduce the size of sequence data in the table using minimizers:

    >>> sequence1 = ProteinSequence("THIS*IS*A*SEQVENCE")
    >>> kmer_alph = KmerAlphabet(sequence1.alphabet, k=3)
    >>> minimizer = MinimizerSelector(kmer_alph, window=4)
    >>> minimizer_pos, minimizers = minimizer.select(sequence1)
    >>> kmer_table = BucketKmerTable.from_kmer_selection(
    ...     kmer_alph, [minimizer_pos], [minimizers]
    ... )

    Use the same :class:`MinimizerSelector` to select the minimizers
    from the query sequence and match them against the table.
    Although the amount of *k-mers* is reduced, matching is still
    guaranteed to work, if the two sequences share identity in the
    given window:

    >>> sequence2 = ProteinSequence("ANQTHER*SEQVENCE")
    >>> minimizer_pos, minimizers = minimizer.select(sequence2)
    >>> matches = kmer_table.match_kmer_selection(minimizer_pos, minimizers)
    >>> print(matches)
    [[ 9  0 11]
     [12  0 14]]
    >>> for query_pos, _, db_pos in matches:
    ...     print(sequence1)
    ...     print(" " * (db_pos-1) + "^" * kmer_table.k)
    ...     print(sequence2)
    ...     print(" " * (query_pos-1) + "^" * kmer_table.k)
    ...     print()
    THIS*IS*A*SEQVENCE
              ^^^
    ANQTHER*SEQVENCE
            ^^^
    <BLANKLINE>
    THIS*IS*A*SEQVENCE
                 ^^^
    ANQTHER*SEQVENCE
               ^^^
    <BLANKLINE>
    """
    cdef int INIT_SIZE = 1

    cdef int64 i

    cdef int64 bucket
    cdef int64 self_kmer, other_kmer
    cdef int64 match_i
    cdef int64 seq_pos
    cdef int64 length
    cdef uint32* bucket_ptr
    cdef uint32* array_stop

    # Store in new variable
    # to disable repetitive initialization checks
    cdef ptr[:] ptr_array = self._ptr_array

    _check_kmer_bounds(kmers, self._kmer_alph)
    if positions.shape[0] != kmers.shape[0]:
        raise IndexError(
            f"{positions.shape[0]} positions were given "
            f"for {kmers.shape[0]} k-mers"
        )

    # Convert to the dtypes required by the typed memoryviews
    cdef uint32[:] pos_array = positions.astype(np.uint32, copy=False)
    cdef int64[:] kmer_array = kmers.astype(np.int64, copy=False)

    # This array will store the match positions
    # As the final number of matches is unknown, a list-like
    # approach is used:
    # The array is initialized with a relatively small initial size
    # and every time the limit would be exceeded its size is doubled
    cdef int64[:,:] matches = np.empty((INIT_SIZE, 3), dtype=np.int64)
    match_i = 0
    for i in range(kmer_array.shape[0]):
        other_kmer = kmer_array[i]
        seq_pos = pos_array[i]
        bucket = other_kmer % self._n_buckets
        bucket_ptr = <uint32*>ptr_array[bucket]
        if bucket_ptr != NULL:
            # There is at least one entry in this bucket
            length = (<int64*>bucket_ptr)[0]
            array_stop = bucket_ptr + length
            # Skip the length header
            bucket_ptr += 2
            while bucket_ptr < array_stop:
                self_kmer = (<int64*>bucket_ptr)[0]
                if self_kmer == other_kmer:
                    # The k-mers are not only in the same
                    # bucket, but they are actually equal
                    if match_i >= matches.shape[0]:
                        # The 'matches' array is full
                        # -> double its size
                        matches = expand(np.asarray(matches))
                    matches[match_i, 0] = seq_pos
                    # Advance over the k-mer code (2 words),
                    # the reference ID and the position
                    # -> pointer ends at the next entry
                    bucket_ptr += 2
                    matches[match_i, 1] = bucket_ptr[0]
                    bucket_ptr += 1
                    matches[match_i, 2] = bucket_ptr[0]
                    bucket_ptr += 1
                    match_i += 1
                else:
                    bucket_ptr += EntrySize.BUCKETS

    # Trim to correct size and return
    return np.asarray(matches[:match_i])
|
|
2627
|
+
|
|
2628
|
+
|
|
2629
|
+
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def count(self, kmers):
    """
    count(kmers)

    Count the number of occurrences for each given *k-mer* in the
    table.

    Parameters
    ----------
    kmers : ndarray, dtype=np.int64
        The count is returned for these *k-mer* codes.

    Returns
    -------
    counts : ndarray, dtype=np.int64
        The counts for each given *k-mer*.

    Notes
    -----
    As each bucket needs to be inspected for the actual *k-mer*
    entries, this method requires far more computation time than its
    :class:`KmerTable` equivalent.

    Examples
    --------
    >>> table = BucketKmerTable.from_sequences(
    ...     k = 2,
    ...     sequences = [NucleotideSequence("TTATA"), NucleotideSequence("CTAG")],
    ...     ref_ids = [0, 1]
    ... )
    >>> print(table)
    AG: (1, 2)
    AT: (0, 2)
    CT: (1, 0)
    TA: (0, 1), (0, 3), (1, 1)
    TT: (0, 0)

    Count two selected *k-mers*:

    >>> print(table.count(table.kmer_alphabet.encode_multiple(["TA", "AG"])))
    [3 1]
    """
    cdef int64 i

    cdef int64 bucket
    cdef int64 kmer, self_kmer
    cdef int64 length
    cdef uint32* bucket_ptr
    cdef uint32* array_stop
    cdef ptr[:] ptr_array = self._ptr_array

    _check_kmer_bounds(kmers, self._kmer_alph)
    cdef int64[:] kmer_array = kmers.astype(np.int64, copy=False)
    cdef int64[:] counts = np.zeros(kmer_array.shape[0], dtype=np.int64)

    for i in range(kmer_array.shape[0]):
        kmer = kmer_array[i]
        bucket = kmer % self._n_buckets
        bucket_ptr = <uint32*> (ptr_array[bucket])
        # An empty bucket leaves the count at 0
        if bucket_ptr != NULL:
            # The first int64 of the bucket is its total length
            length = (<int64*>bucket_ptr)[0]
            array_stop = bucket_ptr + length
            # Skip the length header
            bucket_ptr += 2
            while bucket_ptr < array_stop:
                self_kmer = (<int64*>bucket_ptr)[0]
                # The bucket may also contain other k-mers
                # -> count only entries for the requested k-mer
                if self_kmer == kmer:
                    counts[i] += 1
                bucket_ptr += EntrySize.BUCKETS

    return np.asarray(counts)
|
|
2703
|
+
|
|
2704
|
+
|
|
2705
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def get_kmers(self):
    """
    Get the *k-mer* codes for all *k-mers* that have at least one
    position in the table.

    Returns
    -------
    kmers : ndarray, shape=(n,), dtype=np.int64
        The *k-mer* codes, sorted in ascending order.

    Notes
    -----
    As each bucket needs to be inspected for the actual *k-mer*
    entries, this method requires far more computation time than its
    :class:`KmerTable` equivalent.

    Examples
    --------

    >>> sequence = ProteinSequence("BIQTITE")
    >>> table = BucketKmerTable.from_sequences(3, [sequence], ref_ids=[100])
    >>> print(table)
    IQT: (100, 1)
    ITE: (100, 4)
    QTI: (100, 2)
    TIT: (100, 3)
    BIQ: (100, 0)
    >>> kmer_codes = table.get_kmers()
    >>> print(kmer_codes)
    [ 4360  4419  7879  9400 11701]
    >>> for code in kmer_codes:
    ...     print(table[code])
    [[100   1]]
    [[100   4]]
    [[100   2]]
    [[100   3]]
    [[100   0]]
    """
    cdef int64 bucket
    cdef int64 kmer
    cdef int64 length
    cdef uint32* bucket_ptr
    cdef uint32* array_stop
    cdef ptr[:] ptr_array = self._ptr_array

    # A set is used, as the same k-mer appears once per position
    # in a bucket
    cdef cpp_set[int64] kmer_set

    for bucket in range(ptr_array.shape[0]):
        bucket_ptr = <uint32*> (ptr_array[bucket])
        if bucket_ptr != NULL:
            # The first int64 of the bucket is its total length
            length = (<int64*>bucket_ptr)[0]
            array_stop = bucket_ptr + length
            # Skip the length header
            bucket_ptr += 2
            while bucket_ptr < array_stop:
                kmer = (<int64*>bucket_ptr)[0]
                kmer_set.insert(kmer)
                bucket_ptr += EntrySize.BUCKETS

    # Copy the unique k-mers into an array
    cdef int64[:] kmers = np.zeros(kmer_set.size(), dtype=np.int64)
    cdef int64 i = 0
    for kmer in kmer_set:
        kmers[i] = kmer
        i += 1
    return np.sort(np.asarray(kmers))
|
|
2771
|
+
|
|
2772
|
+
|
|
2773
|
+
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def __getitem__(self, int64 kmer):
    # Return the (reference ID, sequence position) pairs stored for
    # the given k-mer code as uint32 array with shape (n, 2).
    # An array with shape (0, 2) is returned, if the k-mer has no
    # entry in the table.
    # Raises an AlphabetError, if the k-mer code is out of bounds.
    cdef int64 i, j
    cdef int64 self_kmer
    cdef int64 length
    cdef uint32* bucket_ptr
    cdef uint32[:,:] positions

    # Negative codes must be rejected as well:
    # With C division semantics they would give a negative bucket
    # index, which is not caught as bounds checking is disabled
    if kmer < 0 or kmer >= len(self):
        raise AlphabetError(
            f"k-mer code {kmer} is out of bounds "
            f"for the given KmerAlphabet"
        )

    bucket_ptr = <uint32*>self._ptr_array[kmer % self._n_buckets]
    if bucket_ptr == NULL:
        return np.zeros((0, 2), dtype=np.uint32)
    else:
        # The first int64 of the bucket is its total length
        length = (<int64*>bucket_ptr)[0]
        # Pessimistic array allocation:
        # All k-mer positions in bucket belong to the requested k-mer
        positions = np.empty(((length - 2) // 4, 2), dtype=np.uint32)
        i = 0
        for j in range(2, length, 4):
            # The k-mer code is stored as int64 spanning two uint32
            # words: read it as a whole instead of only one word,
            # otherwise codes >= 2**32 would be truncated
            self_kmer = (<int64*>(bucket_ptr + j))[0]
            if self_kmer == kmer:
                positions[i,0] = bucket_ptr[j+2]
                positions[i,1] = bucket_ptr[j+3]
                i += 1
        # Trim to correct size
        return np.asarray(positions)[:i]
|
|
2806
|
+
|
|
2807
|
+
|
|
2808
|
+
def __len__(self):
    # The length of the table is the length of its k-mer alphabet,
    # not the number of entries stored in the table
    return len(self._kmer_alph)
|
|
2810
|
+
|
|
2811
|
+
|
|
2812
|
+
def __eq__(self, item):
    # A table is always equal to itself
    if item is self:
        return True
    # Only objects of exactly this class can be equal
    if type(item) != BucketKmerTable:
        return False

    # Static typing is required to access the statically typed fields
    cdef BucketKmerTable other = item
    # Compare the cheap attributes first:
    # base alphabet, k and the number of buckets must all agree
    if (
        self._kmer_alph.base_alphabet != other._kmer_alph.base_alphabet
        or self._k != other._k
        or self._n_buckets != other._n_buckets
    ):
        return False
    # Finally compare the table content itself
    return _equal_c_arrays(self._ptr_array, other._ptr_array)
|
|
2827
|
+
|
|
2828
|
+
|
|
2829
|
+
def __str__(self):
    # The string representation is created by the module-level helper
    text = _to_string(self)
    return text
|
|
2831
|
+
|
|
2832
|
+
|
|
2833
|
+
def __getnewargs_ex__(self):
    # Pickle protocol: positional and keyword arguments used for
    # creating the new object on unpickling
    args = (self._n_buckets, self._kmer_alph)
    kwargs = {}
    return args, kwargs
|
|
2835
|
+
|
|
2836
|
+
|
|
2837
|
+
def __getstate__(self):
    # The state comprises the C-arrays converted into ndarrays
    state = _pickle_c_arrays(self._ptr_array)
    return state
|
|
2839
|
+
|
|
2840
|
+
def __setstate__(self, state):
    # Restore the C-arrays of this table from the pickled state
    ptr_array = self._ptr_array
    _unpickle_c_arrays(ptr_array, state)
|
|
2842
|
+
|
|
2843
|
+
|
|
2844
|
+
def __dealloc__(self):
    # Without an initialized pointer array there is nothing to free
    if not self._is_initialized():
        return
    _deallocate_ptrs(self._ptr_array)
|
|
2847
|
+
|
|
2848
|
+
|
|
2849
|
+
## These private methods work analogously to those of KmerTable
|
|
2850
|
+
|
|
2851
|
+
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def _count_kmers(self, int64[:] kmers):
    """
    For each *k-mer* code in `kmers` increment the element of the
    pointer array that belongs to the bucket of this *k-mer*.
    """
    cdef uint32 idx
    cdef int64 code

    # The elements of the pointer array are used as counters here
    cdef ptr[:] counters = self._ptr_array

    for idx in range(kmers.shape[0]):
        code = kmers[idx]
        # All k-mers with the same remainder share the same bucket
        counters[code % self._n_buckets] += 1
|
|
2864
|
+
|
|
2865
|
+
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def _count_masked_kmers(self, int64[:] kmers, uint8[:] mask):
    """
    For each *k-mer* code in `kmers`, whose corresponding element in
    `mask` is nonzero, increment the element of the pointer array
    that belongs to the bucket of this *k-mer*.

    Raises
    ------
    IndexError
        If `mask` and `kmers` have different lengths.
    """
    cdef uint32 seq_pos
    cdef int64 kmer

    cdef ptr[:] count_array = self._ptr_array

    # Bounds checking is disabled for this function:
    # Ensure that the mask is not indexed beyond its end
    if mask.shape[0] != kmers.shape[0]:
        raise IndexError(
            f"Mask has length {mask.shape[0]}, "
            f"but there are {kmers.shape[0]} k-mers"
        )

    for seq_pos in range(kmers.shape[0]):
        if mask[seq_pos]:
            kmer = kmers[seq_pos]
            # Pool all k-mers that should go into the same bucket
            count_array[kmer % self._n_buckets] += 1
|
|
2879
|
+
|
|
2880
|
+
|
|
2881
|
+
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def _add_kmers(self, int64[:] kmers, uint32 ref_id, uint8[:] mask):
    """
    Append an entry to the corresponding bucket for each *k-mer* code
    in `kmers`, whose element in `mask` is nonzero.
    An entry consists of the *k-mer* code (``int64``), the `ref_id`
    and the index of the *k-mer* in `kmers`.

    Raises
    ------
    IndexError
        If `mask` and `kmers` have different lengths.
    """
    cdef uint32 seq_pos
    cdef int64 fill_size
    cdef int64 code
    cdef uint32* c_array

    # A local variable avoids repetitive initialization checks
    cdef ptr[:] buckets = self._ptr_array

    if mask.shape[0] != kmers.shape[0]:
        raise IndexError(
            f"Mask has length {mask.shape[0]}, "
            f"but there are {kmers.shape[0]} k-mers"
        )

    for seq_pos in range(kmers.shape[0]):
        if not mask[seq_pos]:
            continue
        code = kmers[seq_pos]
        c_array = <uint32*> buckets[code % self._n_buckets]

        # The first 64 bits contain the currently filled size
        fill_size = (<int64*> c_array)[0]
        # Write the new entry behind the last one:
        # the k-mer code takes the space of two uint32 elements
        (<int64*> &c_array[fill_size])[0] = code
        c_array[fill_size + 2] = ref_id
        c_array[fill_size + 3] = seq_pos
        (<int64*> c_array)[0] = fill_size + EntrySize.BUCKETS
|
|
2913
|
+
|
|
2914
|
+
|
|
2915
|
+
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def _add_kmer_selection(self, uint32[:] positions, int64[:] kmers,
                        uint32 ref_id):
    """
    Append an entry to the corresponding bucket for each *k-mer* code
    in `kmers`.
    An entry consists of the *k-mer* code (``int64``), the `ref_id`
    and the respective element of `positions`.

    Raises
    ------
    IndexError
        If `positions` and `kmers` have different lengths.
    """
    cdef uint32 idx
    cdef uint32 pos
    cdef int64 fill_size
    cdef int64 code
    cdef uint32* c_array

    if positions.shape[0] != kmers.shape[0]:
        raise IndexError(
            f"{positions.shape[0]} positions were given "
            f"for {kmers.shape[0]} k-mers"
        )

    # A local variable avoids repetitive initialization checks
    cdef ptr[:] buckets = self._ptr_array

    for idx in range(positions.shape[0]):
        code = kmers[idx]
        pos = positions[idx]
        c_array = <uint32*> buckets[code % self._n_buckets]

        # The first 64 bits contain the currently filled size
        fill_size = (<int64*> c_array)[0]
        # Write the new entry behind the last one:
        # the k-mer code takes the space of two uint32 elements
        (<int64*> &c_array[fill_size])[0] = code
        c_array[fill_size + 2] = ref_id
        c_array[fill_size + 3] = pos
        (<int64*> c_array)[0] = fill_size + EntrySize.BUCKETS
|
|
2949
|
+
|
|
2950
|
+
|
|
2951
|
+
cdef inline bint _is_initialized(self):
    # The table counts as initialized, if the pointer array attribute
    # can be accessed and is not None
    try:
        return self._ptr_array is not None
    except AttributeError:
        return False
|
|
2959
|
+
|
|
2960
|
+
|
|
2961
|
+
|
|
2962
|
+
|
|
2963
|
+
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def _count_table_entries(ptr[:] count_array, ptr[:] ptr_array,
                         int64 element_size):
    """
    Add the number of entries in each C-array of `ptr_array` to the
    respective element of `count_array`.
    `element_size` is the number of 32 bit elements one entry consists
    of.
    """
    cdef int64 idx
    cdef int64 filled_size
    cdef uint32* c_array

    for idx in range(count_array.shape[0]):
        c_array = <uint32*> (ptr_array[idx])
        if c_array == NULL:
            # No C-array, no entries
            continue
        # The C-array starts with its filled size as 64 bit value,
        # which occupies two 32 bit elements itself
        filled_size = (<int64*>c_array)[0]
        count_array[idx] += (filled_size - 2) // element_size
|
|
2985
|
+
|
|
2986
|
+
|
|
2987
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def _init_c_arrays(ptr[:] ptr_array, int64 element_size):
    """
    Convert an array of counts into a pointer array:
    Each nonzero count is replaced by a pointer to a newly allocated
    C-array of 32 bit elements.
    Each C-array has room for the count multiplied by `element_size`
    elements, plus two leading elements holding the currently filled
    size of the C-array (an ``int64``), measured in number of 32 bit
    elements.
    Elements with a count of zero are left untouched.
    """
    cdef int64 idx
    cdef int64 n_entries
    cdef uint32* c_array

    for idx in range(ptr_array.shape[0]):
        # As long as the C-array is not created, the element in the
        # pointer array is the number of entries the C-array should
        # be able to hold
        n_entries = ptr_array[idx]
        if n_entries == 0:
            continue
        # Size value + n x element size
        c_array = <uint32*>malloc(
            (2 + n_entries * element_size) * sizeof(uint32)
        )
        if c_array == NULL:
            raise MemoryError()
        # Only the size value (int64 -> 2 elements) is filled initially
        (<int64*> c_array)[0] = 2
        ptr_array[idx] = <ptr>c_array
|
|
3020
|
+
|
|
3021
|
+
|
|
3022
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def _equal_c_arrays(ptr[:] self_ptr_array, ptr[:] other_ptr_array):
    """
    Compare two pointer arrays:
    They are equal, if the C-arrays they point to have the same filled
    size and contain equal elements.
    """
    cdef int64 idx
    cdef int64 pos
    cdef int64 size_a, size_b
    cdef uint32* c_array_a
    cdef uint32* c_array_b

    for idx in range(self_ptr_array.shape[0]):
        c_array_a = <uint32*>self_ptr_array[idx]
        c_array_b = <uint32*>other_ptr_array[idx]
        if c_array_a == NULL and c_array_b == NULL:
            # Both tables have no entries here
            continue
        if c_array_a == NULL or c_array_b == NULL:
            # Only one of the tables has entries here
            return False
        # Both tables have entries here
        size_a = (<int64*>c_array_a)[0]
        size_b = (<int64*>c_array_b)[0]
        if size_a != size_b:
            return False
        # Skip the size value in the element-wise comparison
        for pos in range(2, size_a):
            if c_array_a[pos] != c_array_b[pos]:
                return False

    # No difference was found
    return True
|
|
3054
|
+
|
|
3055
|
+
|
|
3056
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def _append_entries(ptr[:] trg_ptr_array, ptr[:] src_ptr_array):
    """
    Copy the entries from each C-array of the source pointer array to
    the end of the respective C-array of the target pointer array.

    The target C-arrays are expected to be already allocated with
    sufficient capacity.
    """
    cdef int64 idx
    cdef int64 trg_size, src_size
    cdef uint32* trg_c_array
    cdef uint32* src_c_array

    for idx in range(trg_ptr_array.shape[0]):
        trg_c_array = <uint32*>trg_ptr_array[idx]
        src_c_array = <uint32*>src_ptr_array[idx]
        if src_c_array == NULL:
            # Nothing to append
            continue
        trg_size = (<int64*>trg_c_array)[0]
        src_size = (<int64*>src_c_array)[0]
        # The new size is the combined size of both C-arrays,
        # but the size value is only required once
        (<int64*>trg_c_array)[0] = trg_size + src_size - 2

        # Copy the source entries (without the size value)
        # behind the last target entry
        memcpy(
            &trg_c_array[trg_size], &src_c_array[2],
            (src_size - 2) * sizeof(uint32)
        )
|
|
3090
|
+
|
|
3091
|
+
|
|
3092
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def _pickle_c_arrays(ptr[:] ptr_array):
    """
    Convert the C-arrays into a picklable state.

    The state is a tuple of two :class:`ndarray` objects:
    the concatenation of all C-arrays and the length of each C-array
    within this concatenation (``0`` for ``NULL`` pointers).
    """
    cdef int64 idx, offset
    cdef int64 size
    cdef uint32* c_array

    # Pass 1: Determine the size of the concatenated array
    cdef int64 total_size = 0
    for idx in range(ptr_array.shape[0]):
        c_array = <uint32*>ptr_array[idx]
        if c_array != NULL:
            # Each C-array starts with its own length
            total_size += (<int64*>c_array)[0]

    # Pass 2: Copy each C-array into the concatenated array
    # and remember its length
    cdef uint32[:] concatenated = np.empty(total_size, dtype=np.uint32)
    cdef int64[:] sizes = np.empty(ptr_array.shape[0], dtype=np.int64)
    offset = 0
    for idx in range(ptr_array.shape[0]):
        c_array = <uint32*>ptr_array[idx]
        if c_array == NULL:
            sizes[idx] = 0
            continue
        size = (<int64*>c_array)[0]
        sizes[idx] = size
        memcpy(&concatenated[offset], c_array, size * sizeof(uint32))
        offset += size

    return np.asarray(concatenated), np.asarray(sizes)
|
|
3132
|
+
|
|
3133
|
+
|
|
3134
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def _unpickle_c_arrays(ptr[:] ptr_array, state):
    """
    Restore the C-arrays from the pickled `state` and store the
    pointers to them in the given `ptr_array`.

    `state` contains the concatenated C-arrays as first element and
    the length of each C-array as second element.
    Elements of `ptr_array` with a length of zero are not modified.
    """
    cdef int64 idx, offset
    cdef int64 size
    cdef uint32* c_array

    cdef uint32[:] concatenated = state[0]
    cdef int64[:] sizes = state[1]

    offset = 0
    for idx in range(ptr_array.shape[0]):
        size = sizes[idx]
        if size == 0:
            continue
        c_array = <uint32*>malloc(size * sizeof(uint32))
        if c_array == NULL:
            raise MemoryError
        memcpy(c_array, &concatenated[offset], size * sizeof(uint32))
        offset += size
        ptr_array[idx] = <ptr>c_array
|
|
3161
|
+
|
|
3162
|
+
|
|
3163
|
+
cdef inline void _deallocate_ptrs(ptr[:] ptrs):
    # Free the memory each element of the pointer array points to
    cdef int64 idx
    for idx in range(ptrs.shape[0]):
        free(<uint32*>ptrs[idx])
|
|
3167
|
+
|
|
3168
|
+
|
|
3169
|
+
cdef np.ndarray expand(np.ndarray array):
    """
    Create a new 2D array, whose first dimension is twice as large as
    the one of the given 2D array.
    The existing content is copied to the start of the new array,
    the remaining part is uninitialized.
    """
    n_rows = array.shape[0]
    n_cols = array.shape[1]
    doubled = np.empty((n_rows * 2, n_cols), dtype=array.dtype)
    doubled[:n_rows, :] = array
    return doubled
|
|
3178
|
+
|
|
3179
|
+
|
|
3180
|
+
def _prepare_mask(kmer_alphabet, ignore_mask, seq_length):
    """
    Convert an ignore mask into a positive *k-mer* mask.

    Parameters
    ----------
    kmer_alphabet : KmerAlphabet
        Used to obtain the length of the *k-mer* mask.
    ignore_mask : ndarray, dtype=bool or None
        The sequence positions to ignore.
        If ``None``, no *k-mer* is masked out.
    seq_length : int
        The length of the sequence the mask refers to.

    Returns
    -------
    kmer_mask : ndarray, dtype=uint8
        The *k-mer* mask.

    Raises
    ------
    TypeError
        If `ignore_mask` is neither ``None`` nor an :class:`ndarray`.
    ValueError
        If `ignore_mask` is not a boolean array.
    IndexError
        If the length of `ignore_mask` is not `seq_length`.
    """
    if ignore_mask is None:
        return np.ones(
            kmer_alphabet.kmer_array_length(seq_length), dtype=np.uint8
        )

    if not isinstance(ignore_mask, np.ndarray):
        raise TypeError(
            f"The given mask is a '{type(ignore_mask).__name__}', "
            f"but an ndarray was expected"
        )
    if ignore_mask.dtype != np.dtype(bool):
        raise ValueError("Expected a boolean mask")
    if len(ignore_mask) != seq_length:
        raise IndexError(
            f"ignore mask has length {len(ignore_mask)}, "
            f"but the length of the sequence is {seq_length}"
        )
    # 'np.frombuffer()' requires a contiguous buffer:
    # Without this conversion masks that are e.g. strided views
    # would be rejected; contiguous masks are not copied
    contiguous_mask = np.ascontiguousarray(ignore_mask)
    # Reinterpret the boolean values as uint8 without copying
    return _to_kmer_mask(
        np.frombuffer(contiguous_mask, dtype=np.uint8),
        kmer_alphabet
    )
|
|
3210
|
+
|
|
3211
|
+
|
|
3212
|
+
@cython.boundscheck(False)
@cython.wraparound(False)
def _to_kmer_mask(uint8[:] mask not None, kmer_alphabet):
    """
    Transform a sequence ignore mask into a *k-mer* mask.

    The difference between those masks is that

        1. the *k-mer* mask is shorter and
        2. a position *i* in the *k-mer* mask is false, if any
           informative position of *k-mer[i]* is true in the ignore
           mask.

    Parameters
    ----------
    mask : ndarray, dtype=uint8
        The sequence ignore mask.
    kmer_alphabet : KmerAlphabet
        Provides *k*, the optional spacing and the length of the
        *k-mer* mask.

    Returns
    -------
    kmer_mask : ndarray, dtype=uint8
        The *k-mer* mask.
    """
    cdef int64 i, j
    cdef bint is_retained

    cdef uint8[:] kmer_mask = np.empty(
        kmer_alphabet.kmer_array_length(mask.shape[0]), dtype=np.uint8
    )
    cdef int64 offset
    cdef int64 k = kmer_alphabet.k
    cdef int64[:] spacing

    if kmer_alphabet.spacing is None:
        # Continuous k-mers
        for i in range(kmer_mask.shape[0]):
            is_retained = True
            # If any sequence position of this k-mer is removed,
            # discard this k-mer position
            for j in range(i, i + k):
                if mask[j]:
                    is_retained = False
                    break
            kmer_mask[i] = is_retained

    else:
        # Spaced k-mers
        spacing = kmer_alphabet.spacing
        for i in range(kmer_mask.shape[0]):
            is_retained = True
            # If any informative sequence position of this k-mer is
            # removed, discard this k-mer position
            for j in range(spacing.shape[0]):
                offset = spacing[j]
                # The offset is applied to the start position 'i'
                # of the k-mer, not to the index 'j' of the offset
                if mask[i + offset]:
                    is_retained = False
                    break
            kmer_mask[i] = is_retained

    return np.asarray(kmer_mask)
|
|
3260
|
+
|
|
3261
|
+
|
|
3262
|
+
|
|
3263
|
+
def _check_position_shape(position_arrays, kmer_arrays):
    """
    Check if the given lists and each pair of their elements have the
    same length and raise an exception, if this is not the case.

    Parameters
    ----------
    position_arrays, kmer_arrays : sequence of sized objects
        The position arrays and the corresponding *k-mer* arrays.

    Raises
    ------
    IndexError
        If the number of arrays is different or if any position array
        has a different length than its corresponding *k-mer* array.
    """
    if len(position_arrays) != len(kmer_arrays):
        raise IndexError(
            f"{len(position_arrays)} position arrays "
            f"for {len(kmer_arrays)} k-mer arrays were given"
        )
    for i, (positions, kmers) in enumerate(
        zip(position_arrays, kmer_arrays)
    ):
        if len(positions) != len(kmers):
            # The trailing space keeps the two parts of the message
            # separated
            raise IndexError(
                f"{len(positions)} positions "
                f"for {len(kmers)} k-mers were given at index {i}"
            )
|
|
3281
|
+
|
|
3282
|
+
|
|
3283
|
+
def _check_same_kmer_alphabet(tables):
    """
    Raise a :class:`ValueError`, if the ``kmer_alphabet`` of any table
    is not equal to the one of the first table.
    """
    reference = tables[0].kmer_alphabet
    if not all(table.kmer_alphabet == reference for table in tables):
        raise ValueError(
            "The *k-mer* alphabets of the tables are not equal "
            "to each other"
        )
|
|
3294
|
+
|
|
3295
|
+
|
|
3296
|
+
def _check_same_buckets(tables):
    """
    Raise a :class:`ValueError`, if the ``n_buckets`` of any table
    is not equal to the one of the first table.
    """
    reference = tables[0].n_buckets
    if not all(table.n_buckets == reference for table in tables):
        raise ValueError(
            "The number of buckets of the tables are not equal "
            "to each other"
        )
|
|
3307
|
+
|
|
3308
|
+
|
|
3309
|
+
def _check_kmer_bounds(kmers, kmer_alphabet):
    """
    Raise an :class:`AlphabetError`, if any of the given *k-mer* codes
    is negative or not smaller than the length of the `kmer_alphabet`.
    """
    has_negative_code = np.any(kmers < 0)
    if has_negative_code or np.any(kmers >= len(kmer_alphabet)):
        raise AlphabetError(
            "Given k-mer codes do not represent valid k-mers"
        )
|
|
3317
|
+
|
|
3318
|
+
|
|
3319
|
+
def _check_multiple_kmer_bounds(kmer_arrays, kmer_alphabet):
    """
    Raise an :class:`AlphabetError`, if any *k-mer* code in any of the
    given arrays is negative or not smaller than the length of the
    `kmer_alphabet`.
    """
    for codes in kmer_arrays:
        has_negative_code = np.any(codes < 0)
        if has_negative_code or np.any(codes >= len(kmer_alphabet)):
            raise AlphabetError(
                "Given k-mer codes do not represent valid k-mers"
            )
|
|
3328
|
+
|
|
3329
|
+
|
|
3330
|
+
def _check_kmer_alphabet(kmer_alph):
    """
    Raise a :class:`TypeError`, if the given object is not a
    :class:`KmerAlphabet`.
    """
    if isinstance(kmer_alph, KmerAlphabet):
        return
    raise TypeError(
        f"Got {type(kmer_alph).__name__}, but KmerAlphabet was expected"
    )
|
|
3339
|
+
|
|
3340
|
+
|
|
3341
|
+
def _compute_masks(masks, sequences):
    """
    Return the given `masks`, if there is one for each sequence,
    otherwise raise an :class:`IndexError`.
    If `masks` is ``None``, a list with one ``None`` per sequence is
    returned instead.
    """
    n_sequences = len(sequences)
    if masks is None:
        return [None] * n_sequences
    if len(masks) != n_sequences:
        raise IndexError(
            f"{len(masks)} masks were given, "
            f"but there are {len(sequences)} sequences"
        )
    return masks
|
|
3356
|
+
|
|
3357
|
+
|
|
3358
|
+
def _compute_ref_ids(ref_ids, sequences):
    """
    Return the given `ref_ids`, if there is one for each sequence,
    otherwise raise an :class:`IndexError`.
    If `ref_ids` is ``None``, an array counting up from zero with one
    element per sequence is returned instead.
    """
    n_sequences = len(sequences)
    if ref_ids is None:
        return np.arange(n_sequences)
    if len(ref_ids) != n_sequences:
        raise IndexError(
            f"{len(ref_ids)} reference IDs were given, "
            f"but there are {len(sequences)} sequences"
        )
    return ref_ids
|
|
3374
|
+
|
|
3375
|
+
|
|
3376
|
+
def _compute_alphabet(given_alphabet, sequence_alphabets):
    """
    Determine the alphabet to use for the given sequence alphabets.

    If a `given_alphabet` is provided, it is returned, if it extends
    each alphabet in `sequence_alphabets`.
    Otherwise the common alphabet of `sequence_alphabets` is returned.
    A :class:`ValueError` is raised, if the `given_alphabet` does not
    extend all alphabets or if there is no common alphabet,
    respectively.
    """
    if given_alphabet is not None:
        if not all(
            given_alphabet.extends(alph) for alph in sequence_alphabets
        ):
            raise ValueError(
                "The given alphabet is incompatible with a least one "
                "alphabet of the given sequences"
            )
        return given_alphabet

    alphabet = common_alphabet(sequence_alphabets)
    if alphabet is None:
        raise ValueError(
            "There is no common alphabet that extends all alphabets"
        )
    return alphabet
|
|
3397
|
+
|
|
3398
|
+
|
|
3399
|
+
def _to_string(table):
    """
    Create the string representation of a table:
    One line per *k-mer* from ``table.get_kmers()``, comprising the
    decoded *k-mer* followed by the ``(reference ID, position)``
    tuples stored for it.
    """
    rendered = []
    for kmer in table.get_kmers():
        decoded = table.kmer_alphabet.decode(kmer)
        # Symbols of a letter alphabet are shown as a continuous string,
        # other symbols as tuple
        label = (
            "".join(decoded)
            if isinstance(table.alphabet, LetterAlphabet)
            else str(tuple(decoded))
        )
        entries = [
            str((ref_id.item(), pos.item())) for ref_id, pos in table[kmer]
        ]
        rendered.append(label + ": " + ", ".join(entries))
    return "\n".join(rendered)
|